|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 372580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 9.375e-06, |
|
"loss": 6.8502, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.875e-05, |
|
"loss": 5.3458, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 2.8125e-05, |
|
"loss": 5.0282, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 3.75e-05, |
|
"loss": 4.8074, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 4.6874999999999994e-05, |
|
"loss": 4.6301, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 5.625e-05, |
|
"loss": 4.488, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 6.5625e-05, |
|
"loss": 4.3738, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 7.5e-05, |
|
"loss": 4.2832, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 8.437499999999999e-05, |
|
"loss": 4.194, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.374999999999999e-05, |
|
"loss": 4.1227, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00010312499999999999, |
|
"loss": 4.0613, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.0001125, |
|
"loss": 3.996, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.00012185624999999998, |
|
"loss": 3.9415, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.000131221875, |
|
"loss": 3.8838, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 0.000140596875, |
|
"loss": 3.8372, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.0001499625, |
|
"loss": 3.7999, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.00015933749999999996, |
|
"loss": 3.7624, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.00016871249999999996, |
|
"loss": 3.7291, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.34637140776693487, |
|
"eval_loss": 3.9229166507720947, |
|
"eval_runtime": 146.7032, |
|
"eval_samples_per_second": 394.886, |
|
"eval_steps_per_second": 6.176, |
|
"step": 18629 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.000178078125, |
|
"loss": 3.6906, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"learning_rate": 0.00018745312499999998, |
|
"loss": 3.6596, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.00019681874999999998, |
|
"loss": 3.6393, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.00020618437499999995, |
|
"loss": 3.618, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 0.00021555937499999998, |
|
"loss": 3.6044, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.00022493437499999998, |
|
"loss": 3.5839, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 0.00023429999999999998, |
|
"loss": 3.5764, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 0.00024367499999999997, |
|
"loss": 3.5571, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 0.00025305, |
|
"loss": 3.5493, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"learning_rate": 0.000262415625, |
|
"loss": 3.5324, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.000271790625, |
|
"loss": 3.5234, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.00028115624999999994, |
|
"loss": 3.5032, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 0.00029053124999999994, |
|
"loss": 3.4949, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.00029990624999999993, |
|
"loss": 3.4876, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 0.00029912795818897174, |
|
"loss": 3.4725, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 0.00029824798872511596, |
|
"loss": 3.4631, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.000297367138410946, |
|
"loss": 3.4478, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"learning_rate": 0.0002964871689470902, |
|
"loss": 3.4351, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.0002956063186329203, |
|
"loss": 3.4237, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.37469545649604485, |
|
"eval_loss": 3.654151439666748, |
|
"eval_runtime": 146.6726, |
|
"eval_samples_per_second": 394.968, |
|
"eval_steps_per_second": 6.177, |
|
"step": 37258 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 0.00029472546831875034, |
|
"loss": 3.3817, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 0.00029384549885489456, |
|
"loss": 3.3638, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.00029296464854072465, |
|
"loss": 3.3632, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 0.00029208467907686886, |
|
"loss": 3.3531, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"learning_rate": 0.00029120470961301303, |
|
"loss": 3.3486, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.0002903238592988431, |
|
"loss": 3.3489, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"learning_rate": 0.00028944300898467315, |
|
"loss": 3.34, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 0.00028856215867050324, |
|
"loss": 3.3353, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 0.0002876813083563333, |
|
"loss": 3.328, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 0.0002868013388924775, |
|
"loss": 3.3191, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.0002859204885783076, |
|
"loss": 3.3164, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"learning_rate": 0.0002850396382641376, |
|
"loss": 3.3104, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"learning_rate": 0.00028415966880028184, |
|
"loss": 3.3026, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.0002832788184861119, |
|
"loss": 3.3037, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"learning_rate": 0.00028239884902225615, |
|
"loss": 3.295, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"learning_rate": 0.0002815179987080862, |
|
"loss": 3.2928, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 0.0002806371483939162, |
|
"loss": 3.2876, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"learning_rate": 0.0002797562980797463, |
|
"loss": 3.2842, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3880011689741996, |
|
"eval_loss": 3.5183119773864746, |
|
"eval_runtime": 146.4806, |
|
"eval_samples_per_second": 395.486, |
|
"eval_steps_per_second": 6.185, |
|
"step": 55887 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.0002788772094662047, |
|
"loss": 3.2741, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 0.00027799635915203474, |
|
"loss": 3.213, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"learning_rate": 0.00027711638968817896, |
|
"loss": 3.2153, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.000276235539374009, |
|
"loss": 3.2143, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"learning_rate": 0.0002753546890598391, |
|
"loss": 3.2183, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 0.0002744747195959833, |
|
"loss": 3.2182, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.00027359386928181334, |
|
"loss": 3.21, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"learning_rate": 0.00027271389981795756, |
|
"loss": 3.2111, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 0.0002718339303541018, |
|
"loss": 3.212, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"learning_rate": 0.00027095308003993187, |
|
"loss": 3.2113, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"learning_rate": 0.0002700722297257619, |
|
"loss": 3.2115, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.00026919137941159194, |
|
"loss": 3.2122, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"learning_rate": 0.0002683114099477362, |
|
"loss": 3.2081, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"learning_rate": 0.00026743055963356625, |
|
"loss": 3.2045, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.00026654970931939634, |
|
"loss": 3.2016, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"learning_rate": 0.00026566885900522637, |
|
"loss": 3.2019, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"learning_rate": 0.0002647888895413706, |
|
"loss": 3.1989, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"learning_rate": 0.0002639080392272006, |
|
"loss": 3.1971, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 0.00026302718891303066, |
|
"loss": 3.1952, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.39376396191712576, |
|
"eval_loss": 3.474755048751831, |
|
"eval_runtime": 147.5232, |
|
"eval_samples_per_second": 392.691, |
|
"eval_steps_per_second": 6.141, |
|
"step": 74516 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.00026214721944917493, |
|
"loss": 3.1576, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"learning_rate": 0.00026126636913500497, |
|
"loss": 3.1329, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"learning_rate": 0.00026038551882083506, |
|
"loss": 3.1287, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"learning_rate": 0.0002595046685066651, |
|
"loss": 3.1305, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"learning_rate": 0.00025862557989312344, |
|
"loss": 3.1394, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"learning_rate": 0.00025774472957895353, |
|
"loss": 3.1349, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"learning_rate": 0.00025686387926478356, |
|
"loss": 3.1386, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 0.00025598390980092784, |
|
"loss": 3.1376, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"learning_rate": 0.00025510305948675787, |
|
"loss": 3.1401, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"learning_rate": 0.0002542222091725879, |
|
"loss": 3.134, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"learning_rate": 0.0002533422397087321, |
|
"loss": 3.1311, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 0.00025246138939456216, |
|
"loss": 3.1384, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"learning_rate": 0.00025158141993070643, |
|
"loss": 3.1377, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"learning_rate": 0.00025070056961653647, |
|
"loss": 3.1346, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"learning_rate": 0.0002498206001526807, |
|
"loss": 3.1381, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"learning_rate": 0.0002489397498385107, |
|
"loss": 3.1292, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"learning_rate": 0.0002480588995243408, |
|
"loss": 3.1317, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"learning_rate": 0.00024717804921017085, |
|
"loss": 3.1313, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"learning_rate": 0.0002462980797463151, |
|
"loss": 3.1351, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.39715434979773, |
|
"eval_loss": 3.4493298530578613, |
|
"eval_runtime": 146.562, |
|
"eval_samples_per_second": 395.266, |
|
"eval_steps_per_second": 6.182, |
|
"step": 93145 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"learning_rate": 0.0002454181102824593, |
|
"loss": 3.0699, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"learning_rate": 0.0002445372599682894, |
|
"loss": 3.0649, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"learning_rate": 0.0002436564096541194, |
|
"loss": 3.0682, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"learning_rate": 0.00024277555933994947, |
|
"loss": 3.0778, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"learning_rate": 0.0002418955898760937, |
|
"loss": 3.0749, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"learning_rate": 0.00024101473956192375, |
|
"loss": 3.0774, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"learning_rate": 0.0002401338892477538, |
|
"loss": 3.079, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"learning_rate": 0.00023925391978389803, |
|
"loss": 3.078, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"learning_rate": 0.0002383730694697281, |
|
"loss": 3.0779, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"learning_rate": 0.00023749310000587231, |
|
"loss": 3.0822, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"learning_rate": 0.00023661224969170235, |
|
"loss": 3.0886, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"learning_rate": 0.0002357322802278466, |
|
"loss": 3.0826, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"learning_rate": 0.00023485142991367666, |
|
"loss": 3.083, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"learning_rate": 0.00023397057959950672, |
|
"loss": 3.0808, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"learning_rate": 0.00023308972928533675, |
|
"loss": 3.0832, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.85, |
|
"learning_rate": 0.00023220887897116682, |
|
"loss": 3.0856, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"learning_rate": 0.00023132890950731104, |
|
"loss": 3.0828, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"learning_rate": 0.0002304480591931411, |
|
"loss": 3.0844, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.4004582266021962, |
|
"eval_loss": 3.415574312210083, |
|
"eval_runtime": 147.2002, |
|
"eval_samples_per_second": 393.552, |
|
"eval_steps_per_second": 6.155, |
|
"step": 111774 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"learning_rate": 0.00022956808972928534, |
|
"loss": 3.068, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"learning_rate": 0.00022868723941511538, |
|
"loss": 3.0137, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"learning_rate": 0.0002278072699512596, |
|
"loss": 3.0237, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"learning_rate": 0.00022692641963708966, |
|
"loss": 3.0212, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"learning_rate": 0.00022604645017323385, |
|
"loss": 3.0283, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"learning_rate": 0.00022516559985906391, |
|
"loss": 3.0323, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"learning_rate": 0.00022428474954489398, |
|
"loss": 3.0307, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"learning_rate": 0.00022340389923072404, |
|
"loss": 3.0335, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"learning_rate": 0.00022252392976686826, |
|
"loss": 3.0377, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"learning_rate": 0.00022164396030301248, |
|
"loss": 3.0392, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"learning_rate": 0.00022076310998884254, |
|
"loss": 3.0342, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"learning_rate": 0.0002198822596746726, |
|
"loss": 3.0388, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"learning_rate": 0.00021900140936050263, |
|
"loss": 3.0405, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"learning_rate": 0.0002181205590463327, |
|
"loss": 3.0445, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"learning_rate": 0.00021724058958247694, |
|
"loss": 3.0388, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"learning_rate": 0.000216359739268307, |
|
"loss": 3.0434, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"learning_rate": 0.0002154797698044512, |
|
"loss": 3.0398, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"learning_rate": 0.00021459891949028126, |
|
"loss": 3.0393, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"learning_rate": 0.00021371806917611132, |
|
"loss": 3.0442, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.40321569998711065, |
|
"eval_loss": 3.3853769302368164, |
|
"eval_runtime": 146.6813, |
|
"eval_samples_per_second": 394.945, |
|
"eval_steps_per_second": 6.177, |
|
"step": 130403 |
|
}, |
|
{ |
|
"epoch": 7.03, |
|
"learning_rate": 0.00021283809971225557, |
|
"loss": 2.9992, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"learning_rate": 0.00021195724939808563, |
|
"loss": 2.9735, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"learning_rate": 0.00021107727993422982, |
|
"loss": 2.9852, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"learning_rate": 0.00021019642962005988, |
|
"loss": 2.987, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"learning_rate": 0.0002093164601562041, |
|
"loss": 2.9856, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"learning_rate": 0.00020843560984203414, |
|
"loss": 2.9901, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"learning_rate": 0.0002075547595278642, |
|
"loss": 2.9944, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"learning_rate": 0.00020667390921369426, |
|
"loss": 2.9964, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"learning_rate": 0.0002057939397498385, |
|
"loss": 2.9989, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"learning_rate": 0.00020491308943566854, |
|
"loss": 2.9983, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.57, |
|
"learning_rate": 0.00020403311997181276, |
|
"loss": 3.0016, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"learning_rate": 0.00020315226965764282, |
|
"loss": 3.0024, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"learning_rate": 0.00020227141934347288, |
|
"loss": 3.0029, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"learning_rate": 0.0002013914498796171, |
|
"loss": 3.0057, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"learning_rate": 0.00020051059956544717, |
|
"loss": 3.0074, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"learning_rate": 0.00019962974925127723, |
|
"loss": 3.0036, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.89, |
|
"learning_rate": 0.0001987488989371073, |
|
"loss": 3.0127, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"learning_rate": 0.00019786892947325148, |
|
"loss": 3.0076, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 0.00019698896000939573, |
|
"loss": 3.008, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4029720181670573, |
|
"eval_loss": 3.406196117401123, |
|
"eval_runtime": 147.0273, |
|
"eval_samples_per_second": 394.015, |
|
"eval_steps_per_second": 6.162, |
|
"step": 149032 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"learning_rate": 0.0001961081096952258, |
|
"loss": 2.9374, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.11, |
|
"learning_rate": 0.00019522725938105585, |
|
"loss": 2.9476, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"learning_rate": 0.0001943464090668859, |
|
"loss": 2.9477, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"learning_rate": 0.0001934664396030301, |
|
"loss": 2.9527, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.27, |
|
"learning_rate": 0.00019258647013917432, |
|
"loss": 2.9556, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"learning_rate": 0.0001917056198250044, |
|
"loss": 2.9656, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"learning_rate": 0.0001908256503611486, |
|
"loss": 2.9603, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.43, |
|
"learning_rate": 0.00018994480004697867, |
|
"loss": 2.9675, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"learning_rate": 0.0001890648305831229, |
|
"loss": 2.9637, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"learning_rate": 0.00018818398026895295, |
|
"loss": 2.9655, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"learning_rate": 0.0001873040108050972, |
|
"loss": 2.9693, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"learning_rate": 0.00018642316049092723, |
|
"loss": 2.9702, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"learning_rate": 0.0001855423101767573, |
|
"loss": 2.9723, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"learning_rate": 0.00018466145986258735, |
|
"loss": 2.9766, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"learning_rate": 0.00018378149039873155, |
|
"loss": 2.9716, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.86, |
|
"learning_rate": 0.0001829006400845616, |
|
"loss": 2.9744, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.91, |
|
"learning_rate": 0.00018202067062070585, |
|
"loss": 2.9746, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"learning_rate": 0.00018113982030653592, |
|
"loss": 2.9768, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.40467940291506055, |
|
"eval_loss": 3.3969688415527344, |
|
"eval_runtime": 146.7783, |
|
"eval_samples_per_second": 394.684, |
|
"eval_steps_per_second": 6.173, |
|
"step": 167661 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"learning_rate": 0.0001802598508426801, |
|
"loss": 2.9532, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.07, |
|
"learning_rate": 0.00017937900052851017, |
|
"loss": 2.9113, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.13, |
|
"learning_rate": 0.00017849815021434023, |
|
"loss": 2.9146, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"learning_rate": 0.00017761818075048445, |
|
"loss": 2.9201, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"learning_rate": 0.00017673733043631449, |
|
"loss": 2.9252, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.29, |
|
"learning_rate": 0.00017585736097245873, |
|
"loss": 2.933, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.34, |
|
"learning_rate": 0.0001749765106582888, |
|
"loss": 2.9311, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.39, |
|
"learning_rate": 0.00017409566034411886, |
|
"loss": 2.932, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.45, |
|
"learning_rate": 0.0001732148100299489, |
|
"loss": 2.9385, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"learning_rate": 0.00017233395971577895, |
|
"loss": 2.9352, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"learning_rate": 0.00017145310940160901, |
|
"loss": 2.9421, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.61, |
|
"learning_rate": 0.00017057313993775323, |
|
"loss": 2.9382, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.66, |
|
"learning_rate": 0.00016969317047389745, |
|
"loss": 2.9397, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.72, |
|
"learning_rate": 0.00016881232015972751, |
|
"loss": 2.9424, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.77, |
|
"learning_rate": 0.00016793235069587173, |
|
"loss": 2.9413, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.82, |
|
"learning_rate": 0.0001670515003817018, |
|
"loss": 2.9477, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"learning_rate": 0.00016617065006753183, |
|
"loss": 2.9463, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.93, |
|
"learning_rate": 0.00016529068060367605, |
|
"loss": 2.9485, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 9.98, |
|
"learning_rate": 0.0001644098302895061, |
|
"loss": 2.9498, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.40466066332636297, |
|
"eval_loss": 3.4024200439453125, |
|
"eval_runtime": 146.7679, |
|
"eval_samples_per_second": 394.712, |
|
"eval_steps_per_second": 6.173, |
|
"step": 186290 |
|
}, |
|
{ |
|
"epoch": 10.04, |
|
"learning_rate": 0.00016352897997533617, |
|
"loss": 2.8971, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.09, |
|
"learning_rate": 0.00016264989136179455, |
|
"loss": 2.8859, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.15, |
|
"learning_rate": 0.0001617690410476246, |
|
"loss": 2.8923, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"learning_rate": 0.00016088819073345467, |
|
"loss": 2.896, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"learning_rate": 0.00016000734041928474, |
|
"loss": 2.8957, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.31, |
|
"learning_rate": 0.00015912737095542895, |
|
"loss": 2.9042, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.36, |
|
"learning_rate": 0.00015824652064125902, |
|
"loss": 2.9053, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.41, |
|
"learning_rate": 0.00015736655117740324, |
|
"loss": 2.9073, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.47, |
|
"learning_rate": 0.0001564857008632333, |
|
"loss": 2.909, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.52, |
|
"learning_rate": 0.00015560485054906333, |
|
"loss": 2.9105, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.57, |
|
"learning_rate": 0.0001547240002348934, |
|
"loss": 2.9139, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.63, |
|
"learning_rate": 0.00015384314992072346, |
|
"loss": 2.9178, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.68, |
|
"learning_rate": 0.00015296229960655352, |
|
"loss": 2.9157, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.74, |
|
"learning_rate": 0.00015208233014269774, |
|
"loss": 2.9183, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.79, |
|
"learning_rate": 0.00015120236067884196, |
|
"loss": 2.9167, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.84, |
|
"learning_rate": 0.00015032151036467202, |
|
"loss": 2.9226, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.9, |
|
"learning_rate": 0.00014944154090081624, |
|
"loss": 2.9192, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"learning_rate": 0.0001485606905866463, |
|
"loss": 2.917, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.4038836756487508, |
|
"eval_loss": 3.424196243286133, |
|
"eval_runtime": 146.9603, |
|
"eval_samples_per_second": 394.195, |
|
"eval_steps_per_second": 6.165, |
|
"step": 204919 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"learning_rate": 0.00014767984027247636, |
|
"loss": 2.9203, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.06, |
|
"learning_rate": 0.00014679987080862058, |
|
"loss": 2.8546, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.11, |
|
"learning_rate": 0.00014591902049445064, |
|
"loss": 2.8642, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.17, |
|
"learning_rate": 0.00014503817018028068, |
|
"loss": 2.8687, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.22, |
|
"learning_rate": 0.00014415820071642492, |
|
"loss": 2.8739, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"learning_rate": 0.00014327735040225496, |
|
"loss": 2.8749, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.33, |
|
"learning_rate": 0.0001423973809383992, |
|
"loss": 2.8796, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.38, |
|
"learning_rate": 0.00014151653062422924, |
|
"loss": 2.8808, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.43, |
|
"learning_rate": 0.00014063656116037346, |
|
"loss": 2.8825, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.49, |
|
"learning_rate": 0.00013975659169651768, |
|
"loss": 2.8886, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.54, |
|
"learning_rate": 0.00013887574138234774, |
|
"loss": 2.8905, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.59, |
|
"learning_rate": 0.0001379948910681778, |
|
"loss": 2.8877, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.65, |
|
"learning_rate": 0.00013711404075400786, |
|
"loss": 2.8924, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.7, |
|
"learning_rate": 0.00013623407129015208, |
|
"loss": 2.8915, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.76, |
|
"learning_rate": 0.00013535322097598214, |
|
"loss": 2.8901, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.81, |
|
"learning_rate": 0.00013447325151212636, |
|
"loss": 2.8894, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.86, |
|
"learning_rate": 0.00013359240119795643, |
|
"loss": 2.8924, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.92, |
|
"learning_rate": 0.00013271155088378646, |
|
"loss": 2.8998, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.97, |
|
"learning_rate": 0.0001318315814199307, |
|
"loss": 2.9005, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.40485423857592023, |
|
"eval_loss": 3.409266710281372, |
|
"eval_runtime": 147.6369, |
|
"eval_samples_per_second": 392.388, |
|
"eval_steps_per_second": 6.137, |
|
"step": 223548 |
|
}, |
|
{ |
|
"epoch": 12.02, |
|
"learning_rate": 0.00013095073110576074, |
|
"loss": 2.8672, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.08, |
|
"learning_rate": 0.0001300698807915908, |
|
"loss": 2.8401, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.13, |
|
"learning_rate": 0.00012918991132773502, |
|
"loss": 2.838, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.19, |
|
"learning_rate": 0.00012830906101356508, |
|
"loss": 2.8478, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"learning_rate": 0.00012742821069939515, |
|
"loss": 2.8567, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.29, |
|
"learning_rate": 0.00012654824123553937, |
|
"loss": 2.8554, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.35, |
|
"learning_rate": 0.0001256673909213694, |
|
"loss": 2.8574, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"learning_rate": 0.0001247883023078278, |
|
"loss": 2.8588, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.45, |
|
"learning_rate": 0.00012390745199365787, |
|
"loss": 2.857, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.51, |
|
"learning_rate": 0.00012302660167948793, |
|
"loss": 2.8649, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.56, |
|
"learning_rate": 0.00012214575136531796, |
|
"loss": 2.862, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.61, |
|
"learning_rate": 0.00012126666275177637, |
|
"loss": 2.8702, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.67, |
|
"learning_rate": 0.00012038581243760643, |
|
"loss": 2.8654, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.72, |
|
"learning_rate": 0.00011950496212343648, |
|
"loss": 2.8664, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.78, |
|
"learning_rate": 0.00011862499265958071, |
|
"loss": 2.8714, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.83, |
|
"learning_rate": 0.00011774414234541076, |
|
"loss": 2.8719, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.88, |
|
"learning_rate": 0.00011686329203124082, |
|
"loss": 2.8718, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"learning_rate": 0.00011598244171707087, |
|
"loss": 2.8766, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 12.99, |
|
"learning_rate": 0.00011510159140290093, |
|
"loss": 2.8747, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.40508253915650494, |
|
"eval_loss": 3.419220209121704, |
|
"eval_runtime": 147.1595, |
|
"eval_samples_per_second": 393.661, |
|
"eval_steps_per_second": 6.157, |
|
"step": 242177 |
|
}, |
|
{ |
|
"epoch": 13.04, |
|
"learning_rate": 0.00011422162193904515, |
|
"loss": 2.8227, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.1, |
|
"learning_rate": 0.00011334165247518937, |
|
"loss": 2.8177, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.15, |
|
"learning_rate": 0.00011246080216101942, |
|
"loss": 2.8252, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.21, |
|
"learning_rate": 0.00011157995184684948, |
|
"loss": 2.8268, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.26, |
|
"learning_rate": 0.00011069910153267953, |
|
"loss": 2.8324, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.31, |
|
"learning_rate": 0.00010981825121850959, |
|
"loss": 2.8336, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.37, |
|
"learning_rate": 0.00010893828175465381, |
|
"loss": 2.8315, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.42, |
|
"learning_rate": 0.00010805743144048387, |
|
"loss": 2.8384, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.47, |
|
"learning_rate": 0.00010717746197662809, |
|
"loss": 2.8431, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.53, |
|
"learning_rate": 0.00010629661166245815, |
|
"loss": 2.8403, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.58, |
|
"learning_rate": 0.0001054157613482882, |
|
"loss": 2.8461, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.63, |
|
"learning_rate": 0.00010453579188443243, |
|
"loss": 2.8443, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.69, |
|
"learning_rate": 0.00010365494157026248, |
|
"loss": 2.8493, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.74, |
|
"learning_rate": 0.00010277497210640671, |
|
"loss": 2.8451, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.8, |
|
"learning_rate": 0.00010189412179223676, |
|
"loss": 2.8502, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.85, |
|
"learning_rate": 0.00010101327147806682, |
|
"loss": 2.8475, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.9, |
|
"learning_rate": 0.00010013242116389687, |
|
"loss": 2.8509, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.96, |
|
"learning_rate": 9.92524517000411e-05, |
|
"loss": 2.8542, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.40528363710833504, |
|
"eval_loss": 3.4232749938964844, |
|
"eval_runtime": 146.7575, |
|
"eval_samples_per_second": 394.74, |
|
"eval_steps_per_second": 6.173, |
|
"step": 260806 |
|
}, |
|
{ |
|
"epoch": 14.01, |
|
"learning_rate": 9.837160138587115e-05, |
|
"loss": 2.8396, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.06, |
|
"learning_rate": 9.749075107170121e-05, |
|
"loss": 2.7946, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.12, |
|
"learning_rate": 9.660990075753126e-05, |
|
"loss": 2.7988, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.17, |
|
"learning_rate": 9.573081214398965e-05, |
|
"loss": 2.8067, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.23, |
|
"learning_rate": 9.48499618298197e-05, |
|
"loss": 2.8113, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.28, |
|
"learning_rate": 9.396911151564976e-05, |
|
"loss": 2.8143, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.33, |
|
"learning_rate": 9.308826120147981e-05, |
|
"loss": 2.8156, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.39, |
|
"learning_rate": 9.220829173762404e-05, |
|
"loss": 2.8157, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.44, |
|
"learning_rate": 9.132744142345409e-05, |
|
"loss": 2.8176, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.49, |
|
"learning_rate": 9.044659110928415e-05, |
|
"loss": 2.8195, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.55, |
|
"learning_rate": 8.956662164542837e-05, |
|
"loss": 2.823, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.6, |
|
"learning_rate": 8.868577133125844e-05, |
|
"loss": 2.8243, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.65, |
|
"learning_rate": 8.780492101708848e-05, |
|
"loss": 2.8264, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.71, |
|
"learning_rate": 8.692495155323272e-05, |
|
"loss": 2.8238, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.76, |
|
"learning_rate": 8.604410123906277e-05, |
|
"loss": 2.8249, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.82, |
|
"learning_rate": 8.5164131775207e-05, |
|
"loss": 2.8285, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.87, |
|
"learning_rate": 8.428328146103705e-05, |
|
"loss": 2.8252, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.92, |
|
"learning_rate": 8.340243114686711e-05, |
|
"loss": 2.8293, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 14.98, |
|
"learning_rate": 8.252246168301133e-05, |
|
"loss": 2.8326, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.40544732304975456, |
|
"eval_loss": 3.4313971996307373, |
|
"eval_runtime": 146.9557, |
|
"eval_samples_per_second": 394.207, |
|
"eval_steps_per_second": 6.165, |
|
"step": 279435 |
|
}, |
|
{ |
|
"epoch": 15.03, |
|
"learning_rate": 8.164161136884139e-05, |
|
"loss": 2.8031, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.08, |
|
"learning_rate": 8.076164190498561e-05, |
|
"loss": 2.7829, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.14, |
|
"learning_rate": 7.988079159081567e-05, |
|
"loss": 2.7866, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.19, |
|
"learning_rate": 7.899994127664572e-05, |
|
"loss": 2.7878, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.25, |
|
"learning_rate": 7.811997181278994e-05, |
|
"loss": 2.7902, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.3, |
|
"learning_rate": 7.723912149861999e-05, |
|
"loss": 2.7923, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.35, |
|
"learning_rate": 7.635915203476422e-05, |
|
"loss": 2.7957, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.41, |
|
"learning_rate": 7.547830172059427e-05, |
|
"loss": 2.7959, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.46, |
|
"learning_rate": 7.459745140642433e-05, |
|
"loss": 2.7977, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.51, |
|
"learning_rate": 7.371748194256855e-05, |
|
"loss": 2.8023, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.57, |
|
"learning_rate": 7.283663162839861e-05, |
|
"loss": 2.8038, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.62, |
|
"learning_rate": 7.195666216454283e-05, |
|
"loss": 2.8004, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.67, |
|
"learning_rate": 7.107581185037289e-05, |
|
"loss": 2.8042, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.73, |
|
"learning_rate": 7.019496153620294e-05, |
|
"loss": 2.8065, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.78, |
|
"learning_rate": 6.9314111222033e-05, |
|
"loss": 2.8046, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.84, |
|
"learning_rate": 6.843414175817722e-05, |
|
"loss": 2.8082, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.89, |
|
"learning_rate": 6.755329144400728e-05, |
|
"loss": 2.8096, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.94, |
|
"learning_rate": 6.667244112983733e-05, |
|
"loss": 2.8063, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 6.579247166598155e-05, |
|
"loss": 2.8125, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.40515031064394535, |
|
"eval_loss": 3.4404408931732178, |
|
"eval_runtime": 146.6919, |
|
"eval_samples_per_second": 394.916, |
|
"eval_steps_per_second": 6.176, |
|
"step": 298064 |
|
}, |
|
{ |
|
"epoch": 16.05, |
|
"learning_rate": 6.491338305243995e-05, |
|
"loss": 2.7653, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.1, |
|
"learning_rate": 6.403253273827e-05, |
|
"loss": 2.7653, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.16, |
|
"learning_rate": 6.315168242410006e-05, |
|
"loss": 2.7705, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.21, |
|
"learning_rate": 6.227083210993011e-05, |
|
"loss": 2.7698, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.26, |
|
"learning_rate": 6.138998179576017e-05, |
|
"loss": 2.7756, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.32, |
|
"learning_rate": 6.050913148159022e-05, |
|
"loss": 2.7763, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.37, |
|
"learning_rate": 5.962828116742028e-05, |
|
"loss": 2.7787, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.43, |
|
"learning_rate": 5.87483117035645e-05, |
|
"loss": 2.7797, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.48, |
|
"learning_rate": 5.786746138939455e-05, |
|
"loss": 2.7807, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.53, |
|
"learning_rate": 5.698749192553878e-05, |
|
"loss": 2.7803, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.59, |
|
"learning_rate": 5.610664161136883e-05, |
|
"loss": 2.7844, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.64, |
|
"learning_rate": 5.522579129719889e-05, |
|
"loss": 2.7835, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.69, |
|
"learning_rate": 5.4345821833343114e-05, |
|
"loss": 2.7858, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.75, |
|
"learning_rate": 5.346497151917317e-05, |
|
"loss": 2.787, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"learning_rate": 5.25850020553174e-05, |
|
"loss": 2.7899, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.86, |
|
"learning_rate": 5.170415174114745e-05, |
|
"loss": 2.7873, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.91, |
|
"learning_rate": 5.0823301426977506e-05, |
|
"loss": 2.7879, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.96, |
|
"learning_rate": 4.994245111280756e-05, |
|
"loss": 2.7911, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.40542401610610557, |
|
"eval_loss": 3.4449925422668457, |
|
"eval_runtime": 146.7539, |
|
"eval_samples_per_second": 394.749, |
|
"eval_steps_per_second": 6.174, |
|
"step": 316693 |
|
}, |
|
{ |
|
"epoch": 17.02, |
|
"learning_rate": 4.906248164895178e-05, |
|
"loss": 2.7767, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.07, |
|
"learning_rate": 4.8182512185096006e-05, |
|
"loss": 2.7487, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.12, |
|
"learning_rate": 4.730166187092606e-05, |
|
"loss": 2.7542, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.18, |
|
"learning_rate": 4.6420811556756116e-05, |
|
"loss": 2.759, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.23, |
|
"learning_rate": 4.554084209290035e-05, |
|
"loss": 2.7571, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.28, |
|
"learning_rate": 4.46599917787304e-05, |
|
"loss": 2.7582, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.34, |
|
"learning_rate": 4.3780022314874616e-05, |
|
"loss": 2.7629, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.39, |
|
"learning_rate": 4.289917200070467e-05, |
|
"loss": 2.76, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.45, |
|
"learning_rate": 4.2018321686534727e-05, |
|
"loss": 2.7596, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"learning_rate": 4.113835222267895e-05, |
|
"loss": 2.762, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.55, |
|
"learning_rate": 4.0258382758823185e-05, |
|
"loss": 2.7645, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.61, |
|
"learning_rate": 3.937753244465324e-05, |
|
"loss": 2.7674, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.66, |
|
"learning_rate": 3.849668213048329e-05, |
|
"loss": 2.7634, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.71, |
|
"learning_rate": 3.7615831816313344e-05, |
|
"loss": 2.7657, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.77, |
|
"learning_rate": 3.67349815021434e-05, |
|
"loss": 2.7677, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.82, |
|
"learning_rate": 3.5854131187973454e-05, |
|
"loss": 2.7711, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.88, |
|
"learning_rate": 3.497416172411768e-05, |
|
"loss": 2.7684, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.93, |
|
"learning_rate": 3.40941922602619e-05, |
|
"loss": 2.7698, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 17.98, |
|
"learning_rate": 3.321422279640613e-05, |
|
"loss": 2.7682, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.4054381211728672, |
|
"eval_loss": 3.4487550258636475, |
|
"eval_runtime": 146.6725, |
|
"eval_samples_per_second": 394.968, |
|
"eval_steps_per_second": 6.177, |
|
"step": 335322 |
|
}, |
|
{ |
|
"epoch": 18.04, |
|
"learning_rate": 3.233337248223618e-05, |
|
"loss": 2.7483, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.09, |
|
"learning_rate": 3.1452522168066235e-05, |
|
"loss": 2.7436, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.14, |
|
"learning_rate": 3.057167185389629e-05, |
|
"loss": 2.7403, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.2, |
|
"learning_rate": 2.9691702390040516e-05, |
|
"loss": 2.7426, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.25, |
|
"learning_rate": 2.881085207587057e-05, |
|
"loss": 2.7424, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.3, |
|
"learning_rate": 2.7930001761700627e-05, |
|
"loss": 2.7414, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.36, |
|
"learning_rate": 2.704915144753068e-05, |
|
"loss": 2.7417, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.41, |
|
"learning_rate": 2.6170062833989075e-05, |
|
"loss": 2.7445, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.47, |
|
"learning_rate": 2.528921251981913e-05, |
|
"loss": 2.7418, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.52, |
|
"learning_rate": 2.4408362205649185e-05, |
|
"loss": 2.7499, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.57, |
|
"learning_rate": 2.352751189147924e-05, |
|
"loss": 2.7496, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.63, |
|
"learning_rate": 2.2648423277937634e-05, |
|
"loss": 2.749, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.68, |
|
"learning_rate": 2.176757296376769e-05, |
|
"loss": 2.7483, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.73, |
|
"learning_rate": 2.0886722649597744e-05, |
|
"loss": 2.7452, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.79, |
|
"learning_rate": 2.00058723354278e-05, |
|
"loss": 2.7509, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.84, |
|
"learning_rate": 1.9125022021257855e-05, |
|
"loss": 2.7539, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.9, |
|
"learning_rate": 1.8245052557402077e-05, |
|
"loss": 2.7494, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.95, |
|
"learning_rate": 1.7364202243232132e-05, |
|
"loss": 2.7512, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4053620881463235, |
|
"eval_loss": 3.4580540657043457, |
|
"eval_runtime": 146.8189, |
|
"eval_samples_per_second": 394.574, |
|
"eval_steps_per_second": 6.171, |
|
"step": 353951 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"learning_rate": 1.6484232779376355e-05, |
|
"loss": 2.7469, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.06, |
|
"learning_rate": 1.560338246520641e-05, |
|
"loss": 2.7291, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.11, |
|
"learning_rate": 1.4723413001350636e-05, |
|
"loss": 2.7264, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.16, |
|
"learning_rate": 1.384344353749486e-05, |
|
"loss": 2.7311, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.22, |
|
"learning_rate": 1.2962593223324915e-05, |
|
"loss": 2.7316, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.27, |
|
"learning_rate": 1.208174290915497e-05, |
|
"loss": 2.7296, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.32, |
|
"learning_rate": 1.1200892594985025e-05, |
|
"loss": 2.7333, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.38, |
|
"learning_rate": 1.032092313112925e-05, |
|
"loss": 2.7345, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.43, |
|
"learning_rate": 9.440953667273474e-06, |
|
"loss": 2.7319, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.49, |
|
"learning_rate": 8.560103353103529e-06, |
|
"loss": 2.7309, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.54, |
|
"learning_rate": 7.679253038933582e-06, |
|
"loss": 2.7327, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.59, |
|
"learning_rate": 6.798402724763638e-06, |
|
"loss": 2.7342, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.65, |
|
"learning_rate": 5.917552410593693e-06, |
|
"loss": 2.7296, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.7, |
|
"learning_rate": 5.037582946737917e-06, |
|
"loss": 2.7344, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.75, |
|
"learning_rate": 4.156732632567972e-06, |
|
"loss": 2.7325, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.81, |
|
"learning_rate": 3.275882318398027e-06, |
|
"loss": 2.7282, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.86, |
|
"learning_rate": 2.395912854542251e-06, |
|
"loss": 2.7303, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.92, |
|
"learning_rate": 1.5150625403723059e-06, |
|
"loss": 2.7292, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 19.97, |
|
"learning_rate": 6.342122262023606e-07, |
|
"loss": 2.7331, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.40517314741870225, |
|
"eval_loss": 3.466306209564209, |
|
"eval_runtime": 147.2197, |
|
"eval_samples_per_second": 393.5, |
|
"eval_steps_per_second": 6.154, |
|
"step": 372580 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 372580, |
|
"total_flos": 1.56977765434368e+18, |
|
"train_loss": 3.0355079814286947, |
|
"train_runtime": 47458.051, |
|
"train_samples_per_second": 251.219, |
|
"train_steps_per_second": 7.851 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 372580, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.56977765434368e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|