Toshiiiii1's picture
Upload 11 files
a9a104e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9938916950546224,
"eval_steps": 500,
"global_step": 34000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06,
"learning_rate": 1.4679976512037582e-06,
"loss": 0.412,
"step": 500
},
{
"epoch": 0.06,
"eval_loss": 0.06404077261686325,
"eval_runtime": 119.6505,
"eval_samples_per_second": 54.676,
"eval_steps_per_second": 6.837,
"step": 500
},
{
"epoch": 0.12,
"learning_rate": 2.9359953024075165e-06,
"loss": 0.0572,
"step": 1000
},
{
"epoch": 0.12,
"eval_loss": 0.033456169068813324,
"eval_runtime": 119.4711,
"eval_samples_per_second": 54.758,
"eval_steps_per_second": 6.847,
"step": 1000
},
{
"epoch": 0.18,
"learning_rate": 4.403992953611275e-06,
"loss": 0.0392,
"step": 1500
},
{
"epoch": 0.18,
"eval_loss": 0.027742423117160797,
"eval_runtime": 119.6338,
"eval_samples_per_second": 54.684,
"eval_steps_per_second": 6.838,
"step": 1500
},
{
"epoch": 0.23,
"learning_rate": 5.871990604815033e-06,
"loss": 0.0339,
"step": 2000
},
{
"epoch": 0.23,
"eval_loss": 0.024982118979096413,
"eval_runtime": 119.4896,
"eval_samples_per_second": 54.75,
"eval_steps_per_second": 6.846,
"step": 2000
},
{
"epoch": 0.29,
"learning_rate": 7.33998825601879e-06,
"loss": 0.0321,
"step": 2500
},
{
"epoch": 0.29,
"eval_loss": 0.02286355197429657,
"eval_runtime": 119.7537,
"eval_samples_per_second": 54.629,
"eval_steps_per_second": 6.831,
"step": 2500
},
{
"epoch": 0.35,
"learning_rate": 8.80798590722255e-06,
"loss": 0.0286,
"step": 3000
},
{
"epoch": 0.35,
"eval_loss": 0.02225133590400219,
"eval_runtime": 115.351,
"eval_samples_per_second": 56.714,
"eval_steps_per_second": 7.091,
"step": 3000
},
{
"epoch": 0.41,
"learning_rate": 9.969327155256804e-06,
"loss": 0.0265,
"step": 3500
},
{
"epoch": 0.41,
"eval_loss": 0.02050725743174553,
"eval_runtime": 115.364,
"eval_samples_per_second": 56.707,
"eval_steps_per_second": 7.091,
"step": 3500
},
{
"epoch": 0.47,
"learning_rate": 9.806173725771716e-06,
"loss": 0.026,
"step": 4000
},
{
"epoch": 0.47,
"eval_loss": 0.020657476037740707,
"eval_runtime": 118.0643,
"eval_samples_per_second": 55.41,
"eval_steps_per_second": 6.928,
"step": 4000
},
{
"epoch": 0.53,
"learning_rate": 9.64302029628663e-06,
"loss": 0.0238,
"step": 4500
},
{
"epoch": 0.53,
"eval_loss": 0.019942762330174446,
"eval_runtime": 119.4625,
"eval_samples_per_second": 54.762,
"eval_steps_per_second": 6.847,
"step": 4500
},
{
"epoch": 0.59,
"learning_rate": 9.479866866801542e-06,
"loss": 0.0251,
"step": 5000
},
{
"epoch": 0.59,
"eval_loss": 0.020365213975310326,
"eval_runtime": 119.5062,
"eval_samples_per_second": 54.742,
"eval_steps_per_second": 6.845,
"step": 5000
},
{
"epoch": 0.65,
"learning_rate": 9.316713437316454e-06,
"loss": 0.0244,
"step": 5500
},
{
"epoch": 0.65,
"eval_loss": 0.02077455259859562,
"eval_runtime": 119.2739,
"eval_samples_per_second": 54.849,
"eval_steps_per_second": 6.858,
"step": 5500
},
{
"epoch": 0.7,
"learning_rate": 9.153560007831366e-06,
"loss": 0.0235,
"step": 6000
},
{
"epoch": 0.7,
"eval_loss": 0.019606556743383408,
"eval_runtime": 119.4635,
"eval_samples_per_second": 54.761,
"eval_steps_per_second": 6.847,
"step": 6000
},
{
"epoch": 0.76,
"learning_rate": 8.990406578346278e-06,
"loss": 0.0232,
"step": 6500
},
{
"epoch": 0.76,
"eval_loss": 0.01969091035425663,
"eval_runtime": 119.7331,
"eval_samples_per_second": 54.638,
"eval_steps_per_second": 6.832,
"step": 6500
},
{
"epoch": 0.82,
"learning_rate": 8.82725314886119e-06,
"loss": 0.0225,
"step": 7000
},
{
"epoch": 0.82,
"eval_loss": 0.019132908433675766,
"eval_runtime": 119.5514,
"eval_samples_per_second": 54.721,
"eval_steps_per_second": 6.842,
"step": 7000
},
{
"epoch": 0.88,
"learning_rate": 8.664099719376103e-06,
"loss": 0.0212,
"step": 7500
},
{
"epoch": 0.88,
"eval_loss": 0.018609512597322464,
"eval_runtime": 119.4541,
"eval_samples_per_second": 54.766,
"eval_steps_per_second": 6.848,
"step": 7500
},
{
"epoch": 0.94,
"learning_rate": 8.500946289891015e-06,
"loss": 0.0225,
"step": 8000
},
{
"epoch": 0.94,
"eval_loss": 0.018011104315519333,
"eval_runtime": 116.0368,
"eval_samples_per_second": 56.379,
"eval_steps_per_second": 7.049,
"step": 8000
},
{
"epoch": 1.0,
"learning_rate": 8.337792860405927e-06,
"loss": 0.0231,
"step": 8500
},
{
"epoch": 1.0,
"eval_loss": 0.01797027327120304,
"eval_runtime": 115.2093,
"eval_samples_per_second": 56.784,
"eval_steps_per_second": 7.1,
"step": 8500
},
{
"epoch": 1.06,
"learning_rate": 8.174639430920839e-06,
"loss": 0.018,
"step": 9000
},
{
"epoch": 1.06,
"eval_loss": 0.018307719379663467,
"eval_runtime": 116.9177,
"eval_samples_per_second": 55.954,
"eval_steps_per_second": 6.996,
"step": 9000
},
{
"epoch": 1.12,
"learning_rate": 8.01148600143575e-06,
"loss": 0.0178,
"step": 9500
},
{
"epoch": 1.12,
"eval_loss": 0.018220532685518265,
"eval_runtime": 119.4643,
"eval_samples_per_second": 54.761,
"eval_steps_per_second": 6.847,
"step": 9500
},
{
"epoch": 1.17,
"learning_rate": 7.848332571950663e-06,
"loss": 0.0173,
"step": 10000
},
{
"epoch": 1.17,
"eval_loss": 0.018342604860663414,
"eval_runtime": 119.7353,
"eval_samples_per_second": 54.637,
"eval_steps_per_second": 6.832,
"step": 10000
},
{
"epoch": 1.23,
"learning_rate": 7.685179142465575e-06,
"loss": 0.0176,
"step": 10500
},
{
"epoch": 1.23,
"eval_loss": 0.01870131492614746,
"eval_runtime": 119.9453,
"eval_samples_per_second": 54.542,
"eval_steps_per_second": 6.82,
"step": 10500
},
{
"epoch": 1.29,
"learning_rate": 7.5220257129804875e-06,
"loss": 0.0177,
"step": 11000
},
{
"epoch": 1.29,
"eval_loss": 0.018137916922569275,
"eval_runtime": 119.817,
"eval_samples_per_second": 54.6,
"eval_steps_per_second": 6.827,
"step": 11000
},
{
"epoch": 1.35,
"learning_rate": 7.358872283495399e-06,
"loss": 0.0171,
"step": 11500
},
{
"epoch": 1.35,
"eval_loss": 0.018740132451057434,
"eval_runtime": 119.4709,
"eval_samples_per_second": 54.758,
"eval_steps_per_second": 6.847,
"step": 11500
},
{
"epoch": 1.41,
"learning_rate": 7.195718854010312e-06,
"loss": 0.019,
"step": 12000
},
{
"epoch": 1.41,
"eval_loss": 0.018057728186249733,
"eval_runtime": 119.9707,
"eval_samples_per_second": 54.53,
"eval_steps_per_second": 6.818,
"step": 12000
},
{
"epoch": 1.47,
"learning_rate": 7.032565424525224e-06,
"loss": 0.0174,
"step": 12500
},
{
"epoch": 1.47,
"eval_loss": 0.018135011196136475,
"eval_runtime": 119.6311,
"eval_samples_per_second": 54.685,
"eval_steps_per_second": 6.838,
"step": 12500
},
{
"epoch": 1.53,
"learning_rate": 6.869411995040136e-06,
"loss": 0.0179,
"step": 13000
},
{
"epoch": 1.53,
"eval_loss": 0.017942175269126892,
"eval_runtime": 117.6519,
"eval_samples_per_second": 55.605,
"eval_steps_per_second": 6.953,
"step": 13000
},
{
"epoch": 1.59,
"learning_rate": 6.706258565555048e-06,
"loss": 0.0166,
"step": 13500
},
{
"epoch": 1.59,
"eval_loss": 0.01796996220946312,
"eval_runtime": 115.4295,
"eval_samples_per_second": 56.675,
"eval_steps_per_second": 7.087,
"step": 13500
},
{
"epoch": 1.64,
"learning_rate": 6.543105136069961e-06,
"loss": 0.0174,
"step": 14000
},
{
"epoch": 1.64,
"eval_loss": 0.018622903153300285,
"eval_runtime": 116.043,
"eval_samples_per_second": 56.376,
"eval_steps_per_second": 7.049,
"step": 14000
},
{
"epoch": 1.7,
"learning_rate": 6.379951706584873e-06,
"loss": 0.0162,
"step": 14500
},
{
"epoch": 1.7,
"eval_loss": 0.017875785008072853,
"eval_runtime": 119.775,
"eval_samples_per_second": 54.619,
"eval_steps_per_second": 6.829,
"step": 14500
},
{
"epoch": 1.76,
"learning_rate": 6.216798277099785e-06,
"loss": 0.0163,
"step": 15000
},
{
"epoch": 1.76,
"eval_loss": 0.018203964456915855,
"eval_runtime": 119.7603,
"eval_samples_per_second": 54.626,
"eval_steps_per_second": 6.83,
"step": 15000
},
{
"epoch": 1.82,
"learning_rate": 6.0536448476146966e-06,
"loss": 0.0168,
"step": 15500
},
{
"epoch": 1.82,
"eval_loss": 0.017764363437891006,
"eval_runtime": 119.5774,
"eval_samples_per_second": 54.709,
"eval_steps_per_second": 6.841,
"step": 15500
},
{
"epoch": 1.88,
"learning_rate": 5.890491418129609e-06,
"loss": 0.0178,
"step": 16000
},
{
"epoch": 1.88,
"eval_loss": 0.017852840945124626,
"eval_runtime": 119.5232,
"eval_samples_per_second": 54.734,
"eval_steps_per_second": 6.844,
"step": 16000
},
{
"epoch": 1.94,
"learning_rate": 5.727337988644521e-06,
"loss": 0.0168,
"step": 16500
},
{
"epoch": 1.94,
"eval_loss": 0.017764879390597343,
"eval_runtime": 119.6082,
"eval_samples_per_second": 54.695,
"eval_steps_per_second": 6.839,
"step": 16500
},
{
"epoch": 2.0,
"learning_rate": 5.564184559159433e-06,
"loss": 0.0168,
"step": 17000
},
{
"epoch": 2.0,
"eval_loss": 0.017377818003296852,
"eval_runtime": 119.6291,
"eval_samples_per_second": 54.686,
"eval_steps_per_second": 6.838,
"step": 17000
},
{
"epoch": 2.06,
"learning_rate": 5.401031129674347e-06,
"loss": 0.0143,
"step": 17500
},
{
"epoch": 2.06,
"eval_loss": 0.017800554633140564,
"eval_runtime": 119.9539,
"eval_samples_per_second": 54.538,
"eval_steps_per_second": 6.819,
"step": 17500
},
{
"epoch": 2.11,
"learning_rate": 5.237877700189259e-06,
"loss": 0.014,
"step": 18000
},
{
"epoch": 2.11,
"eval_loss": 0.0179632306098938,
"eval_runtime": 118.0782,
"eval_samples_per_second": 55.404,
"eval_steps_per_second": 6.928,
"step": 18000
},
{
"epoch": 2.17,
"learning_rate": 5.074724270704171e-06,
"loss": 0.0143,
"step": 18500
},
{
"epoch": 2.17,
"eval_loss": 0.018571963533759117,
"eval_runtime": 115.3792,
"eval_samples_per_second": 56.7,
"eval_steps_per_second": 7.09,
"step": 18500
},
{
"epoch": 2.23,
"learning_rate": 4.911570841219083e-06,
"loss": 0.0137,
"step": 19000
},
{
"epoch": 2.23,
"eval_loss": 0.018732914701104164,
"eval_runtime": 116.2594,
"eval_samples_per_second": 56.271,
"eval_steps_per_second": 7.036,
"step": 19000
},
{
"epoch": 2.29,
"learning_rate": 4.748417411733995e-06,
"loss": 0.0131,
"step": 19500
},
{
"epoch": 2.29,
"eval_loss": 0.018157465383410454,
"eval_runtime": 119.6325,
"eval_samples_per_second": 54.684,
"eval_steps_per_second": 6.838,
"step": 19500
},
{
"epoch": 2.35,
"learning_rate": 4.585263982248907e-06,
"loss": 0.0134,
"step": 20000
},
{
"epoch": 2.35,
"eval_loss": 0.01858236826956272,
"eval_runtime": 119.626,
"eval_samples_per_second": 54.687,
"eval_steps_per_second": 6.838,
"step": 20000
},
{
"epoch": 2.41,
"learning_rate": 4.42211055276382e-06,
"loss": 0.0131,
"step": 20500
},
{
"epoch": 2.41,
"eval_loss": 0.01760929264128208,
"eval_runtime": 119.8276,
"eval_samples_per_second": 54.595,
"eval_steps_per_second": 6.826,
"step": 20500
},
{
"epoch": 2.47,
"learning_rate": 4.258957123278732e-06,
"loss": 0.0138,
"step": 21000
},
{
"epoch": 2.47,
"eval_loss": 0.01776733435690403,
"eval_runtime": 119.5072,
"eval_samples_per_second": 54.741,
"eval_steps_per_second": 6.845,
"step": 21000
},
{
"epoch": 2.53,
"learning_rate": 4.095803693793644e-06,
"loss": 0.0131,
"step": 21500
},
{
"epoch": 2.53,
"eval_loss": 0.018140822649002075,
"eval_runtime": 119.8335,
"eval_samples_per_second": 54.592,
"eval_steps_per_second": 6.826,
"step": 21500
},
{
"epoch": 2.58,
"learning_rate": 3.932650264308556e-06,
"loss": 0.0139,
"step": 22000
},
{
"epoch": 2.58,
"eval_loss": 0.018083902075886726,
"eval_runtime": 120.1704,
"eval_samples_per_second": 54.439,
"eval_steps_per_second": 6.807,
"step": 22000
},
{
"epoch": 2.64,
"learning_rate": 3.7694968348234683e-06,
"loss": 0.0139,
"step": 22500
},
{
"epoch": 2.64,
"eval_loss": 0.018096571788191795,
"eval_runtime": 119.7812,
"eval_samples_per_second": 54.616,
"eval_steps_per_second": 6.829,
"step": 22500
},
{
"epoch": 2.7,
"learning_rate": 3.6063434053383807e-06,
"loss": 0.0133,
"step": 23000
},
{
"epoch": 2.7,
"eval_loss": 0.01770329661667347,
"eval_runtime": 118.2737,
"eval_samples_per_second": 55.312,
"eval_steps_per_second": 6.916,
"step": 23000
},
{
"epoch": 2.76,
"learning_rate": 3.4431899758532926e-06,
"loss": 0.0135,
"step": 23500
},
{
"epoch": 2.76,
"eval_loss": 0.01808938756585121,
"eval_runtime": 115.7874,
"eval_samples_per_second": 56.5,
"eval_steps_per_second": 7.065,
"step": 23500
},
{
"epoch": 2.82,
"learning_rate": 3.280036546368205e-06,
"loss": 0.0131,
"step": 24000
},
{
"epoch": 2.82,
"eval_loss": 0.017787907272577286,
"eval_runtime": 115.8567,
"eval_samples_per_second": 56.466,
"eval_steps_per_second": 7.06,
"step": 24000
},
{
"epoch": 2.88,
"learning_rate": 3.116883116883117e-06,
"loss": 0.0137,
"step": 24500
},
{
"epoch": 2.88,
"eval_loss": 0.017733994871377945,
"eval_runtime": 120.2603,
"eval_samples_per_second": 54.399,
"eval_steps_per_second": 6.802,
"step": 24500
},
{
"epoch": 2.94,
"learning_rate": 2.9537296873980292e-06,
"loss": 0.0133,
"step": 25000
},
{
"epoch": 2.94,
"eval_loss": 0.017949102446436882,
"eval_runtime": 119.971,
"eval_samples_per_second": 54.53,
"eval_steps_per_second": 6.818,
"step": 25000
},
{
"epoch": 3.0,
"learning_rate": 2.7905762579129416e-06,
"loss": 0.0136,
"step": 25500
},
{
"epoch": 3.0,
"eval_loss": 0.017474107444286346,
"eval_runtime": 119.96,
"eval_samples_per_second": 54.535,
"eval_steps_per_second": 6.819,
"step": 25500
},
{
"epoch": 3.05,
"learning_rate": 2.6274228284278535e-06,
"loss": 0.0124,
"step": 26000
},
{
"epoch": 3.05,
"eval_loss": 0.018201593309640884,
"eval_runtime": 119.9656,
"eval_samples_per_second": 54.532,
"eval_steps_per_second": 6.819,
"step": 26000
},
{
"epoch": 3.11,
"learning_rate": 2.464269398942766e-06,
"loss": 0.0121,
"step": 26500
},
{
"epoch": 3.11,
"eval_loss": 0.01811986044049263,
"eval_runtime": 119.7914,
"eval_samples_per_second": 54.612,
"eval_steps_per_second": 6.829,
"step": 26500
},
{
"epoch": 3.17,
"learning_rate": 2.3011159694576783e-06,
"loss": 0.012,
"step": 27000
},
{
"epoch": 3.17,
"eval_loss": 0.018191542476415634,
"eval_runtime": 119.8265,
"eval_samples_per_second": 54.596,
"eval_steps_per_second": 6.827,
"step": 27000
},
{
"epoch": 3.23,
"learning_rate": 2.13796253997259e-06,
"loss": 0.0115,
"step": 27500
},
{
"epoch": 3.23,
"eval_loss": 0.018120231106877327,
"eval_runtime": 119.6169,
"eval_samples_per_second": 54.691,
"eval_steps_per_second": 6.839,
"step": 27500
},
{
"epoch": 3.29,
"learning_rate": 1.9748091104875025e-06,
"loss": 0.0117,
"step": 28000
},
{
"epoch": 3.29,
"eval_loss": 0.017889145761728287,
"eval_runtime": 118.9939,
"eval_samples_per_second": 54.978,
"eval_steps_per_second": 6.874,
"step": 28000
},
{
"epoch": 3.35,
"learning_rate": 1.811655681002415e-06,
"loss": 0.0113,
"step": 28500
},
{
"epoch": 3.35,
"eval_loss": 0.017741482704877853,
"eval_runtime": 115.6814,
"eval_samples_per_second": 56.552,
"eval_steps_per_second": 7.071,
"step": 28500
},
{
"epoch": 3.41,
"learning_rate": 1.648502251517327e-06,
"loss": 0.0124,
"step": 29000
},
{
"epoch": 3.41,
"eval_loss": 0.017794128507375717,
"eval_runtime": 115.7328,
"eval_samples_per_second": 56.527,
"eval_steps_per_second": 7.068,
"step": 29000
},
{
"epoch": 3.47,
"learning_rate": 1.4853488220322392e-06,
"loss": 0.012,
"step": 29500
},
{
"epoch": 3.47,
"eval_loss": 0.018301891162991524,
"eval_runtime": 119.5898,
"eval_samples_per_second": 54.704,
"eval_steps_per_second": 6.84,
"step": 29500
},
{
"epoch": 3.52,
"learning_rate": 1.3221953925471516e-06,
"loss": 0.0119,
"step": 30000
},
{
"epoch": 3.52,
"eval_loss": 0.01817336678504944,
"eval_runtime": 120.0384,
"eval_samples_per_second": 54.499,
"eval_steps_per_second": 6.814,
"step": 30000
},
{
"epoch": 3.58,
"learning_rate": 1.1590419630620637e-06,
"loss": 0.0115,
"step": 30500
},
{
"epoch": 3.58,
"eval_loss": 0.018085774034261703,
"eval_runtime": 119.7931,
"eval_samples_per_second": 54.611,
"eval_steps_per_second": 6.828,
"step": 30500
},
{
"epoch": 3.64,
"learning_rate": 9.958885335769758e-07,
"loss": 0.012,
"step": 31000
},
{
"epoch": 3.64,
"eval_loss": 0.017980104312300682,
"eval_runtime": 119.8348,
"eval_samples_per_second": 54.592,
"eval_steps_per_second": 6.826,
"step": 31000
},
{
"epoch": 3.7,
"learning_rate": 8.327351040918881e-07,
"loss": 0.0116,
"step": 31500
},
{
"epoch": 3.7,
"eval_loss": 0.0181511789560318,
"eval_runtime": 119.9569,
"eval_samples_per_second": 54.536,
"eval_steps_per_second": 6.819,
"step": 31500
},
{
"epoch": 3.76,
"learning_rate": 6.695816746068002e-07,
"loss": 0.0108,
"step": 32000
},
{
"epoch": 3.76,
"eval_loss": 0.018154002726078033,
"eval_runtime": 119.8401,
"eval_samples_per_second": 54.589,
"eval_steps_per_second": 6.826,
"step": 32000
},
{
"epoch": 3.82,
"learning_rate": 5.064282451217125e-07,
"loss": 0.0118,
"step": 32500
},
{
"epoch": 3.82,
"eval_loss": 0.01812034100294113,
"eval_runtime": 120.1728,
"eval_samples_per_second": 54.438,
"eval_steps_per_second": 6.807,
"step": 32500
},
{
"epoch": 3.88,
"learning_rate": 3.4327481563662475e-07,
"loss": 0.0114,
"step": 33000
},
{
"epoch": 3.88,
"eval_loss": 0.018135515972971916,
"eval_runtime": 119.8212,
"eval_samples_per_second": 54.598,
"eval_steps_per_second": 6.827,
"step": 33000
},
{
"epoch": 3.94,
"learning_rate": 1.801213861515369e-07,
"loss": 0.0121,
"step": 33500
},
{
"epoch": 3.94,
"eval_loss": 0.01807536743581295,
"eval_runtime": 115.6623,
"eval_samples_per_second": 56.561,
"eval_steps_per_second": 7.072,
"step": 33500
},
{
"epoch": 3.99,
"learning_rate": 1.6967956666449132e-08,
"loss": 0.0112,
"step": 34000
},
{
"epoch": 3.99,
"eval_loss": 0.01807805709540844,
"eval_runtime": 115.5079,
"eval_samples_per_second": 56.637,
"eval_steps_per_second": 7.082,
"step": 34000
}
],
"logging_steps": 500,
"max_steps": 34052,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"total_flos": 3.237991334295552e+16,
"train_batch_size": 18,
"trial_name": null,
"trial_params": null
}