smolm-autoreg-bpe-counterfactual-babylm-indef_articles_with_pl_nouns-removal-3e-4
/
trainer_state.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 20.0, | |
"eval_steps": 500, | |
"global_step": 372020, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.05, | |
"learning_rate": 9.375e-06, | |
"loss": 6.8474, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.11, | |
"learning_rate": 1.875e-05, | |
"loss": 5.3572, | |
"step": 2000 | |
}, | |
{ | |
"epoch": 0.16, | |
"learning_rate": 2.8125e-05, | |
"loss": 5.0383, | |
"step": 3000 | |
}, | |
{ | |
"epoch": 0.22, | |
"learning_rate": 3.75e-05, | |
"loss": 4.8113, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 0.27, | |
"learning_rate": 4.6874999999999994e-05, | |
"loss": 4.6399, | |
"step": 5000 | |
}, | |
{ | |
"epoch": 0.32, | |
"learning_rate": 5.625e-05, | |
"loss": 4.5038, | |
"step": 6000 | |
}, | |
{ | |
"epoch": 0.38, | |
"learning_rate": 6.5625e-05, | |
"loss": 4.3831, | |
"step": 7000 | |
}, | |
{ | |
"epoch": 0.43, | |
"learning_rate": 7.5e-05, | |
"loss": 4.2865, | |
"step": 8000 | |
}, | |
{ | |
"epoch": 0.48, | |
"learning_rate": 8.437499999999999e-05, | |
"loss": 4.1961, | |
"step": 9000 | |
}, | |
{ | |
"epoch": 0.54, | |
"learning_rate": 9.374999999999999e-05, | |
"loss": 4.1295, | |
"step": 10000 | |
}, | |
{ | |
"epoch": 0.59, | |
"learning_rate": 0.000103115625, | |
"loss": 4.057, | |
"step": 11000 | |
}, | |
{ | |
"epoch": 0.65, | |
"learning_rate": 0.000112490625, | |
"loss": 3.9959, | |
"step": 12000 | |
}, | |
{ | |
"epoch": 0.7, | |
"learning_rate": 0.000121865625, | |
"loss": 3.9361, | |
"step": 13000 | |
}, | |
{ | |
"epoch": 0.75, | |
"learning_rate": 0.000131221875, | |
"loss": 3.8845, | |
"step": 14000 | |
}, | |
{ | |
"epoch": 0.81, | |
"learning_rate": 0.000140596875, | |
"loss": 3.844, | |
"step": 15000 | |
}, | |
{ | |
"epoch": 0.86, | |
"learning_rate": 0.0001499625, | |
"loss": 3.8116, | |
"step": 16000 | |
}, | |
{ | |
"epoch": 0.91, | |
"learning_rate": 0.000159328125, | |
"loss": 3.7724, | |
"step": 17000 | |
}, | |
{ | |
"epoch": 0.97, | |
"learning_rate": 0.000168703125, | |
"loss": 3.7446, | |
"step": 18000 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.3461573860094159, | |
"eval_loss": 3.9129602909088135, | |
"eval_runtime": 148.9938, | |
"eval_samples_per_second": 388.741, | |
"eval_steps_per_second": 6.074, | |
"step": 18601 | |
}, | |
{ | |
"epoch": 1.02, | |
"learning_rate": 0.00017806875, | |
"loss": 3.7109, | |
"step": 19000 | |
}, | |
{ | |
"epoch": 1.08, | |
"learning_rate": 0.00018744374999999999, | |
"loss": 3.6733, | |
"step": 20000 | |
}, | |
{ | |
"epoch": 1.13, | |
"learning_rate": 0.00019680937499999996, | |
"loss": 3.649, | |
"step": 21000 | |
}, | |
{ | |
"epoch": 1.18, | |
"learning_rate": 0.00020618437499999995, | |
"loss": 3.6311, | |
"step": 22000 | |
}, | |
{ | |
"epoch": 1.24, | |
"learning_rate": 0.00021555937499999998, | |
"loss": 3.6133, | |
"step": 23000 | |
}, | |
{ | |
"epoch": 1.29, | |
"learning_rate": 0.00022492499999999998, | |
"loss": 3.6013, | |
"step": 24000 | |
}, | |
{ | |
"epoch": 1.34, | |
"learning_rate": 0.00023429999999999998, | |
"loss": 3.5853, | |
"step": 25000 | |
}, | |
{ | |
"epoch": 1.4, | |
"learning_rate": 0.00024366562499999997, | |
"loss": 3.5709, | |
"step": 26000 | |
}, | |
{ | |
"epoch": 1.45, | |
"learning_rate": 0.000253040625, | |
"loss": 3.566, | |
"step": 27000 | |
}, | |
{ | |
"epoch": 1.51, | |
"learning_rate": 0.000262415625, | |
"loss": 3.5454, | |
"step": 28000 | |
}, | |
{ | |
"epoch": 1.56, | |
"learning_rate": 0.000271790625, | |
"loss": 3.5315, | |
"step": 29000 | |
}, | |
{ | |
"epoch": 1.61, | |
"learning_rate": 0.00028115624999999994, | |
"loss": 3.5155, | |
"step": 30000 | |
}, | |
{ | |
"epoch": 1.67, | |
"learning_rate": 0.00029053124999999994, | |
"loss": 3.5075, | |
"step": 31000 | |
}, | |
{ | |
"epoch": 1.72, | |
"learning_rate": 0.00029990624999999993, | |
"loss": 3.499, | |
"step": 32000 | |
}, | |
{ | |
"epoch": 1.77, | |
"learning_rate": 0.00029912740427033704, | |
"loss": 3.4882, | |
"step": 33000 | |
}, | |
{ | |
"epoch": 1.83, | |
"learning_rate": 0.00029824510322922176, | |
"loss": 3.4677, | |
"step": 34000 | |
}, | |
{ | |
"epoch": 1.88, | |
"learning_rate": 0.00029736280218810654, | |
"loss": 3.4639, | |
"step": 35000 | |
}, | |
{ | |
"epoch": 1.94, | |
"learning_rate": 0.0002964805011469913, | |
"loss": 3.4498, | |
"step": 36000 | |
}, | |
{ | |
"epoch": 1.99, | |
"learning_rate": 0.0002955990824069172, | |
"loss": 3.4295, | |
"step": 37000 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.37515525238085, | |
"eval_loss": 3.6268346309661865, | |
"eval_runtime": 150.4919, | |
"eval_samples_per_second": 384.871, | |
"eval_steps_per_second": 6.014, | |
"step": 37202 | |
}, | |
{ | |
"epoch": 2.04, | |
"learning_rate": 0.00029471678136580196, | |
"loss": 3.3882, | |
"step": 38000 | |
}, | |
{ | |
"epoch": 2.1, | |
"learning_rate": 0.0002938353626257279, | |
"loss": 3.3783, | |
"step": 39000 | |
}, | |
{ | |
"epoch": 2.15, | |
"learning_rate": 0.00029295306158461267, | |
"loss": 3.3682, | |
"step": 40000 | |
}, | |
{ | |
"epoch": 2.2, | |
"learning_rate": 0.00029207076054349744, | |
"loss": 3.3669, | |
"step": 41000 | |
}, | |
{ | |
"epoch": 2.26, | |
"learning_rate": 0.0002911893418034233, | |
"loss": 3.3568, | |
"step": 42000 | |
}, | |
{ | |
"epoch": 2.31, | |
"learning_rate": 0.0002903070407623081, | |
"loss": 3.3567, | |
"step": 43000 | |
}, | |
{ | |
"epoch": 2.37, | |
"learning_rate": 0.00028942562202223396, | |
"loss": 3.3465, | |
"step": 44000 | |
}, | |
{ | |
"epoch": 2.42, | |
"learning_rate": 0.00028854332098111874, | |
"loss": 3.3394, | |
"step": 45000 | |
}, | |
{ | |
"epoch": 2.47, | |
"learning_rate": 0.0002876610199400035, | |
"loss": 3.3411, | |
"step": 46000 | |
}, | |
{ | |
"epoch": 2.53, | |
"learning_rate": 0.0002867787188988883, | |
"loss": 3.3295, | |
"step": 47000 | |
}, | |
{ | |
"epoch": 2.58, | |
"learning_rate": 0.00028589730015881416, | |
"loss": 3.3291, | |
"step": 48000 | |
}, | |
{ | |
"epoch": 2.63, | |
"learning_rate": 0.00028501588141874003, | |
"loss": 3.3172, | |
"step": 49000 | |
}, | |
{ | |
"epoch": 2.69, | |
"learning_rate": 0.0002841335803776248, | |
"loss": 3.3157, | |
"step": 50000 | |
}, | |
{ | |
"epoch": 2.74, | |
"learning_rate": 0.00028325216163755073, | |
"loss": 3.3098, | |
"step": 51000 | |
}, | |
{ | |
"epoch": 2.8, | |
"learning_rate": 0.0002823698605964355, | |
"loss": 3.3071, | |
"step": 52000 | |
}, | |
{ | |
"epoch": 2.85, | |
"learning_rate": 0.00028148755955532023, | |
"loss": 3.2994, | |
"step": 53000 | |
}, | |
{ | |
"epoch": 2.9, | |
"learning_rate": 0.00028060614081524616, | |
"loss": 3.2919, | |
"step": 54000 | |
}, | |
{ | |
"epoch": 2.96, | |
"learning_rate": 0.0002797238397741309, | |
"loss": 3.2878, | |
"step": 55000 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.38779592675795943, | |
"eval_loss": 3.5073165893554688, | |
"eval_runtime": 150.2329, | |
"eval_samples_per_second": 385.535, | |
"eval_steps_per_second": 6.024, | |
"step": 55803 | |
}, | |
{ | |
"epoch": 3.01, | |
"learning_rate": 0.00027884153873301566, | |
"loss": 3.278, | |
"step": 56000 | |
}, | |
{ | |
"epoch": 3.06, | |
"learning_rate": 0.0002779610022939827, | |
"loss": 3.2281, | |
"step": 57000 | |
}, | |
{ | |
"epoch": 3.12, | |
"learning_rate": 0.00027707870125286745, | |
"loss": 3.2237, | |
"step": 58000 | |
}, | |
{ | |
"epoch": 3.17, | |
"learning_rate": 0.0002761972825127933, | |
"loss": 3.2292, | |
"step": 59000 | |
}, | |
{ | |
"epoch": 3.23, | |
"learning_rate": 0.0002753149814716781, | |
"loss": 3.2293, | |
"step": 60000 | |
}, | |
{ | |
"epoch": 3.28, | |
"learning_rate": 0.0002744326804305629, | |
"loss": 3.2232, | |
"step": 61000 | |
}, | |
{ | |
"epoch": 3.33, | |
"learning_rate": 0.00027355037938944765, | |
"loss": 3.2279, | |
"step": 62000 | |
}, | |
{ | |
"epoch": 3.39, | |
"learning_rate": 0.00027266807834833243, | |
"loss": 3.2248, | |
"step": 63000 | |
}, | |
{ | |
"epoch": 3.44, | |
"learning_rate": 0.0002717857773072172, | |
"loss": 3.2182, | |
"step": 64000 | |
}, | |
{ | |
"epoch": 3.49, | |
"learning_rate": 0.00027090347626610193, | |
"loss": 3.2216, | |
"step": 65000 | |
}, | |
{ | |
"epoch": 3.55, | |
"learning_rate": 0.00027002205752602786, | |
"loss": 3.2212, | |
"step": 66000 | |
}, | |
{ | |
"epoch": 3.6, | |
"learning_rate": 0.0002691406387859537, | |
"loss": 3.2125, | |
"step": 67000 | |
}, | |
{ | |
"epoch": 3.66, | |
"learning_rate": 0.0002682583377448385, | |
"loss": 3.2149, | |
"step": 68000 | |
}, | |
{ | |
"epoch": 3.71, | |
"learning_rate": 0.0002673760367037233, | |
"loss": 3.2145, | |
"step": 69000 | |
}, | |
{ | |
"epoch": 3.76, | |
"learning_rate": 0.00026649373566260806, | |
"loss": 3.2114, | |
"step": 70000 | |
}, | |
{ | |
"epoch": 3.82, | |
"learning_rate": 0.00026561231692253393, | |
"loss": 3.2074, | |
"step": 71000 | |
}, | |
{ | |
"epoch": 3.87, | |
"learning_rate": 0.0002647300158814187, | |
"loss": 3.2033, | |
"step": 72000 | |
}, | |
{ | |
"epoch": 3.92, | |
"learning_rate": 0.00026384859714134463, | |
"loss": 3.2111, | |
"step": 73000 | |
}, | |
{ | |
"epoch": 3.98, | |
"learning_rate": 0.00026296629610022935, | |
"loss": 3.2031, | |
"step": 74000 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.39482574918846874, | |
"eval_loss": 3.46301007270813, | |
"eval_runtime": 149.3064, | |
"eval_samples_per_second": 387.927, | |
"eval_steps_per_second": 6.061, | |
"step": 74404 | |
}, | |
{ | |
"epoch": 4.03, | |
"learning_rate": 0.00026208399505911413, | |
"loss": 3.16, | |
"step": 75000 | |
}, | |
{ | |
"epoch": 4.09, | |
"learning_rate": 0.0002612016940179989, | |
"loss": 3.1399, | |
"step": 76000 | |
}, | |
{ | |
"epoch": 4.14, | |
"learning_rate": 0.00026032027527792483, | |
"loss": 3.1365, | |
"step": 77000 | |
}, | |
{ | |
"epoch": 4.19, | |
"learning_rate": 0.0002594379742368096, | |
"loss": 3.1442, | |
"step": 78000 | |
}, | |
{ | |
"epoch": 4.25, | |
"learning_rate": 0.0002585565554967355, | |
"loss": 3.1495, | |
"step": 79000 | |
}, | |
{ | |
"epoch": 4.3, | |
"learning_rate": 0.00025767425445562026, | |
"loss": 3.1482, | |
"step": 80000 | |
}, | |
{ | |
"epoch": 4.35, | |
"learning_rate": 0.000256791953414505, | |
"loss": 3.1491, | |
"step": 81000 | |
}, | |
{ | |
"epoch": 4.41, | |
"learning_rate": 0.0002559105346744309, | |
"loss": 3.1457, | |
"step": 82000 | |
}, | |
{ | |
"epoch": 4.46, | |
"learning_rate": 0.0002550291159343568, | |
"loss": 3.1516, | |
"step": 83000 | |
}, | |
{ | |
"epoch": 4.52, | |
"learning_rate": 0.00025414681489324155, | |
"loss": 3.1469, | |
"step": 84000 | |
}, | |
{ | |
"epoch": 4.57, | |
"learning_rate": 0.00025326451385212633, | |
"loss": 3.1491, | |
"step": 85000 | |
}, | |
{ | |
"epoch": 4.62, | |
"learning_rate": 0.0002523822128110111, | |
"loss": 3.1484, | |
"step": 86000 | |
}, | |
{ | |
"epoch": 4.68, | |
"learning_rate": 0.0002514999117698959, | |
"loss": 3.1452, | |
"step": 87000 | |
}, | |
{ | |
"epoch": 4.73, | |
"learning_rate": 0.00025061849302982175, | |
"loss": 3.1454, | |
"step": 88000 | |
}, | |
{ | |
"epoch": 4.78, | |
"learning_rate": 0.00024973619198870653, | |
"loss": 3.1461, | |
"step": 89000 | |
}, | |
{ | |
"epoch": 4.84, | |
"learning_rate": 0.0002488547732486324, | |
"loss": 3.1483, | |
"step": 90000 | |
}, | |
{ | |
"epoch": 4.89, | |
"learning_rate": 0.0002479724722075172, | |
"loss": 3.1441, | |
"step": 91000 | |
}, | |
{ | |
"epoch": 4.95, | |
"learning_rate": 0.00024709017116640196, | |
"loss": 3.1448, | |
"step": 92000 | |
}, | |
{ | |
"epoch": 5.0, | |
"learning_rate": 0.00024620787012528673, | |
"loss": 3.1443, | |
"step": 93000 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.39941869370337724, | |
"eval_loss": 3.407740354537964, | |
"eval_runtime": 149.7411, | |
"eval_samples_per_second": 386.801, | |
"eval_steps_per_second": 6.044, | |
"step": 93005 | |
}, | |
{ | |
"epoch": 5.05, | |
"learning_rate": 0.0002453264513852126, | |
"loss": 3.0734, | |
"step": 94000 | |
}, | |
{ | |
"epoch": 5.11, | |
"learning_rate": 0.0002444441503440974, | |
"loss": 3.0793, | |
"step": 95000 | |
}, | |
{ | |
"epoch": 5.16, | |
"learning_rate": 0.00024356273160402328, | |
"loss": 3.0807, | |
"step": 96000 | |
}, | |
{ | |
"epoch": 5.21, | |
"learning_rate": 0.00024268043056290805, | |
"loss": 3.0863, | |
"step": 97000 | |
}, | |
{ | |
"epoch": 5.27, | |
"learning_rate": 0.0002417981295217928, | |
"loss": 3.0939, | |
"step": 98000 | |
}, | |
{ | |
"epoch": 5.32, | |
"learning_rate": 0.0002409167107817187, | |
"loss": 3.0949, | |
"step": 99000 | |
}, | |
{ | |
"epoch": 5.38, | |
"learning_rate": 0.00024003440974060345, | |
"loss": 3.0931, | |
"step": 100000 | |
}, | |
{ | |
"epoch": 5.43, | |
"learning_rate": 0.00023915299100052938, | |
"loss": 3.0941, | |
"step": 101000 | |
}, | |
{ | |
"epoch": 5.48, | |
"learning_rate": 0.00023827068995941413, | |
"loss": 3.0931, | |
"step": 102000 | |
}, | |
{ | |
"epoch": 5.54, | |
"learning_rate": 0.0002373883889182989, | |
"loss": 3.094, | |
"step": 103000 | |
}, | |
{ | |
"epoch": 5.59, | |
"learning_rate": 0.00023650697017822477, | |
"loss": 3.0968, | |
"step": 104000 | |
}, | |
{ | |
"epoch": 5.64, | |
"learning_rate": 0.00023562466913710955, | |
"loss": 3.098, | |
"step": 105000 | |
}, | |
{ | |
"epoch": 5.7, | |
"learning_rate": 0.00023474325039703542, | |
"loss": 3.0946, | |
"step": 106000 | |
}, | |
{ | |
"epoch": 5.75, | |
"learning_rate": 0.00023386094935592023, | |
"loss": 3.0951, | |
"step": 107000 | |
}, | |
{ | |
"epoch": 5.81, | |
"learning_rate": 0.00023297953061584612, | |
"loss": 3.0984, | |
"step": 108000 | |
}, | |
{ | |
"epoch": 5.86, | |
"learning_rate": 0.00023209722957473087, | |
"loss": 3.094, | |
"step": 109000 | |
}, | |
{ | |
"epoch": 5.91, | |
"learning_rate": 0.00023121492853361565, | |
"loss": 3.099, | |
"step": 110000 | |
}, | |
{ | |
"epoch": 5.97, | |
"learning_rate": 0.00023033350979354152, | |
"loss": 3.0973, | |
"step": 111000 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.402754033471634, | |
"eval_loss": 3.372433662414551, | |
"eval_runtime": 149.8825, | |
"eval_samples_per_second": 386.436, | |
"eval_steps_per_second": 6.038, | |
"step": 111606 | |
}, | |
{ | |
"epoch": 6.02, | |
"learning_rate": 0.0002294512087524263, | |
"loss": 3.0661, | |
"step": 112000 | |
}, | |
{ | |
"epoch": 6.07, | |
"learning_rate": 0.0002285697900123522, | |
"loss": 3.0288, | |
"step": 113000 | |
}, | |
{ | |
"epoch": 6.13, | |
"learning_rate": 0.00022768748897123697, | |
"loss": 3.0356, | |
"step": 114000 | |
}, | |
{ | |
"epoch": 6.18, | |
"learning_rate": 0.00022680518793012175, | |
"loss": 3.0368, | |
"step": 115000 | |
}, | |
{ | |
"epoch": 6.24, | |
"learning_rate": 0.0002259228868890065, | |
"loss": 3.0423, | |
"step": 116000 | |
}, | |
{ | |
"epoch": 6.29, | |
"learning_rate": 0.0002250414681489324, | |
"loss": 3.0489, | |
"step": 117000 | |
}, | |
{ | |
"epoch": 6.34, | |
"learning_rate": 0.00022415916710781717, | |
"loss": 3.0487, | |
"step": 118000 | |
}, | |
{ | |
"epoch": 6.4, | |
"learning_rate": 0.00022327774836774307, | |
"loss": 3.0484, | |
"step": 119000 | |
}, | |
{ | |
"epoch": 6.45, | |
"learning_rate": 0.00022239544732662785, | |
"loss": 3.0518, | |
"step": 120000 | |
}, | |
{ | |
"epoch": 6.51, | |
"learning_rate": 0.0002215131462855126, | |
"loss": 3.0504, | |
"step": 121000 | |
}, | |
{ | |
"epoch": 6.56, | |
"learning_rate": 0.00022063084524439738, | |
"loss": 3.0549, | |
"step": 122000 | |
}, | |
{ | |
"epoch": 6.61, | |
"learning_rate": 0.00021974942650432325, | |
"loss": 3.0526, | |
"step": 123000 | |
}, | |
{ | |
"epoch": 6.67, | |
"learning_rate": 0.00021886889006529023, | |
"loss": 3.0552, | |
"step": 124000 | |
}, | |
{ | |
"epoch": 6.72, | |
"learning_rate": 0.00021798658902417504, | |
"loss": 3.0549, | |
"step": 125000 | |
}, | |
{ | |
"epoch": 6.77, | |
"learning_rate": 0.00021710428798305982, | |
"loss": 3.0557, | |
"step": 126000 | |
}, | |
{ | |
"epoch": 6.83, | |
"learning_rate": 0.00021622198694194457, | |
"loss": 3.0567, | |
"step": 127000 | |
}, | |
{ | |
"epoch": 6.88, | |
"learning_rate": 0.00021534056820187046, | |
"loss": 3.0581, | |
"step": 128000 | |
}, | |
{ | |
"epoch": 6.94, | |
"learning_rate": 0.00021445826716075521, | |
"loss": 3.0577, | |
"step": 129000 | |
}, | |
{ | |
"epoch": 6.99, | |
"learning_rate": 0.0002135768484206811, | |
"loss": 3.0617, | |
"step": 130000 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.40619524851129696, | |
"eval_loss": 3.3562216758728027, | |
"eval_runtime": 149.8137, | |
"eval_samples_per_second": 386.613, | |
"eval_steps_per_second": 6.041, | |
"step": 130207 | |
}, | |
{ | |
"epoch": 7.04, | |
"learning_rate": 0.00021269454737956592, | |
"loss": 2.9986, | |
"step": 131000 | |
}, | |
{ | |
"epoch": 7.1, | |
"learning_rate": 0.00021181224633845067, | |
"loss": 2.9938, | |
"step": 132000 | |
}, | |
{ | |
"epoch": 7.15, | |
"learning_rate": 0.00021092994529733544, | |
"loss": 2.997, | |
"step": 133000 | |
}, | |
{ | |
"epoch": 7.2, | |
"learning_rate": 0.00021004764425622022, | |
"loss": 3.0043, | |
"step": 134000 | |
}, | |
{ | |
"epoch": 7.26, | |
"learning_rate": 0.0002091662255161461, | |
"loss": 3.0065, | |
"step": 135000 | |
}, | |
{ | |
"epoch": 7.31, | |
"learning_rate": 0.00020828392447503087, | |
"loss": 3.0121, | |
"step": 136000 | |
}, | |
{ | |
"epoch": 7.37, | |
"learning_rate": 0.00020740162343391562, | |
"loss": 3.011, | |
"step": 137000 | |
}, | |
{ | |
"epoch": 7.42, | |
"learning_rate": 0.00020652020469384152, | |
"loss": 3.0123, | |
"step": 138000 | |
}, | |
{ | |
"epoch": 7.47, | |
"learning_rate": 0.00020563790365272627, | |
"loss": 3.014, | |
"step": 139000 | |
}, | |
{ | |
"epoch": 7.53, | |
"learning_rate": 0.0002047564849126522, | |
"loss": 3.0197, | |
"step": 140000 | |
}, | |
{ | |
"epoch": 7.58, | |
"learning_rate": 0.00020387418387153697, | |
"loss": 3.019, | |
"step": 141000 | |
}, | |
{ | |
"epoch": 7.63, | |
"learning_rate": 0.00020299188283042172, | |
"loss": 3.0203, | |
"step": 142000 | |
}, | |
{ | |
"epoch": 7.69, | |
"learning_rate": 0.00020211046409034761, | |
"loss": 3.0176, | |
"step": 143000 | |
}, | |
{ | |
"epoch": 7.74, | |
"learning_rate": 0.00020122816304923236, | |
"loss": 3.0197, | |
"step": 144000 | |
}, | |
{ | |
"epoch": 7.8, | |
"learning_rate": 0.0002003467443091583, | |
"loss": 3.021, | |
"step": 145000 | |
}, | |
{ | |
"epoch": 7.85, | |
"learning_rate": 0.00019946444326804304, | |
"loss": 3.028, | |
"step": 146000 | |
}, | |
{ | |
"epoch": 7.9, | |
"learning_rate": 0.00019858302452796894, | |
"loss": 3.0219, | |
"step": 147000 | |
}, | |
{ | |
"epoch": 7.96, | |
"learning_rate": 0.00019770072348685369, | |
"loss": 3.0252, | |
"step": 148000 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.40585283337274547, | |
"eval_loss": 3.364830255508423, | |
"eval_runtime": 150.706, | |
"eval_samples_per_second": 384.325, | |
"eval_steps_per_second": 6.005, | |
"step": 148808 | |
}, | |
{ | |
"epoch": 8.01, | |
"learning_rate": 0.00019681842244573846, | |
"loss": 3.0107, | |
"step": 149000 | |
}, | |
{ | |
"epoch": 8.06, | |
"learning_rate": 0.00019593700370566433, | |
"loss": 2.9562, | |
"step": 150000 | |
}, | |
{ | |
"epoch": 8.12, | |
"learning_rate": 0.0001950547026645491, | |
"loss": 2.9643, | |
"step": 151000 | |
}, | |
{ | |
"epoch": 8.17, | |
"learning_rate": 0.000194173283924475, | |
"loss": 2.9706, | |
"step": 152000 | |
}, | |
{ | |
"epoch": 8.23, | |
"learning_rate": 0.00019329098288335979, | |
"loss": 2.974, | |
"step": 153000 | |
}, | |
{ | |
"epoch": 8.28, | |
"learning_rate": 0.00019240868184224456, | |
"loss": 2.9751, | |
"step": 154000 | |
}, | |
{ | |
"epoch": 8.33, | |
"learning_rate": 0.00019152638080112934, | |
"loss": 2.9802, | |
"step": 155000 | |
}, | |
{ | |
"epoch": 8.39, | |
"learning_rate": 0.0001906449620610552, | |
"loss": 2.9811, | |
"step": 156000 | |
}, | |
{ | |
"epoch": 8.44, | |
"learning_rate": 0.00018976354332098108, | |
"loss": 2.9821, | |
"step": 157000 | |
}, | |
{ | |
"epoch": 8.49, | |
"learning_rate": 0.000188882124580907, | |
"loss": 2.9879, | |
"step": 158000 | |
}, | |
{ | |
"epoch": 8.55, | |
"learning_rate": 0.00018799982353979175, | |
"loss": 2.9908, | |
"step": 159000 | |
}, | |
{ | |
"epoch": 8.6, | |
"learning_rate": 0.00018711840479971765, | |
"loss": 2.9866, | |
"step": 160000 | |
}, | |
{ | |
"epoch": 8.66, | |
"learning_rate": 0.0001862361037586024, | |
"loss": 2.9891, | |
"step": 161000 | |
}, | |
{ | |
"epoch": 8.71, | |
"learning_rate": 0.00018535380271748718, | |
"loss": 2.9896, | |
"step": 162000 | |
}, | |
{ | |
"epoch": 8.76, | |
"learning_rate": 0.00018447150167637196, | |
"loss": 2.9915, | |
"step": 163000 | |
}, | |
{ | |
"epoch": 8.82, | |
"learning_rate": 0.00018358920063525676, | |
"loss": 2.9924, | |
"step": 164000 | |
}, | |
{ | |
"epoch": 8.87, | |
"learning_rate": 0.0001827068995941415, | |
"loss": 2.9922, | |
"step": 165000 | |
}, | |
{ | |
"epoch": 8.92, | |
"learning_rate": 0.0001818254808540674, | |
"loss": 2.9917, | |
"step": 166000 | |
}, | |
{ | |
"epoch": 8.98, | |
"learning_rate": 0.00018094317981295216, | |
"loss": 2.994, | |
"step": 167000 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.407066838467657, | |
"eval_loss": 3.3581929206848145, | |
"eval_runtime": 150.1138, | |
"eval_samples_per_second": 385.841, | |
"eval_steps_per_second": 6.029, | |
"step": 167409 | |
}, | |
{ | |
"epoch": 9.03, | |
"learning_rate": 0.00018006087877183694, | |
"loss": 2.9573, | |
"step": 168000 | |
}, | |
{ | |
"epoch": 9.09, | |
"learning_rate": 0.0001791785777307217, | |
"loss": 2.9322, | |
"step": 169000 | |
}, | |
{ | |
"epoch": 9.14, | |
"learning_rate": 0.00017829715899064758, | |
"loss": 2.9376, | |
"step": 170000 | |
}, | |
{ | |
"epoch": 9.19, | |
"learning_rate": 0.0001774148579495324, | |
"loss": 2.9392, | |
"step": 171000 | |
}, | |
{ | |
"epoch": 9.25, | |
"learning_rate": 0.0001765325569084171, | |
"loss": 2.9452, | |
"step": 172000 | |
}, | |
{ | |
"epoch": 9.3, | |
"learning_rate": 0.00017565025586730192, | |
"loss": 2.9497, | |
"step": 173000 | |
}, | |
{ | |
"epoch": 9.35, | |
"learning_rate": 0.00017476883712722778, | |
"loss": 2.9484, | |
"step": 174000 | |
}, | |
{ | |
"epoch": 9.41, | |
"learning_rate": 0.00017388653608611256, | |
"loss": 2.9534, | |
"step": 175000 | |
}, | |
{ | |
"epoch": 9.46, | |
"learning_rate": 0.00017300423504499734, | |
"loss": 2.9566, | |
"step": 176000 | |
}, | |
{ | |
"epoch": 9.52, | |
"learning_rate": 0.00017212193400388212, | |
"loss": 2.9573, | |
"step": 177000 | |
}, | |
{ | |
"epoch": 9.57, | |
"learning_rate": 0.00017124139756484913, | |
"loss": 2.9621, | |
"step": 178000 | |
}, | |
{ | |
"epoch": 9.62, | |
"learning_rate": 0.00017035909652373388, | |
"loss": 2.9643, | |
"step": 179000 | |
}, | |
{ | |
"epoch": 9.68, | |
"learning_rate": 0.00016947679548261866, | |
"loss": 2.9616, | |
"step": 180000 | |
}, | |
{ | |
"epoch": 9.73, | |
"learning_rate": 0.00016859537674254453, | |
"loss": 2.9635, | |
"step": 181000 | |
}, | |
{ | |
"epoch": 9.78, | |
"learning_rate": 0.0001677130757014293, | |
"loss": 2.9642, | |
"step": 182000 | |
}, | |
{ | |
"epoch": 9.84, | |
"learning_rate": 0.00016683077466031409, | |
"loss": 2.9676, | |
"step": 183000 | |
}, | |
{ | |
"epoch": 9.89, | |
"learning_rate": 0.00016594847361919884, | |
"loss": 2.9648, | |
"step": 184000 | |
}, | |
{ | |
"epoch": 9.95, | |
"learning_rate": 0.00016506881948120697, | |
"loss": 2.9712, | |
"step": 185000 | |
}, | |
{ | |
"epoch": 10.0, | |
"learning_rate": 0.00016418651844009175, | |
"loss": 2.9693, | |
"step": 186000 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.407509351419911, | |
"eval_loss": 3.36879825592041, | |
"eval_runtime": 150.0406, | |
"eval_samples_per_second": 386.029, | |
"eval_steps_per_second": 6.032, | |
"step": 186010 | |
}, | |
{ | |
"epoch": 10.05, | |
"learning_rate": 0.0001633042173989765, | |
"loss": 2.9024, | |
"step": 187000 | |
}, | |
{ | |
"epoch": 10.11, | |
"learning_rate": 0.00016242191635786128, | |
"loss": 2.9062, | |
"step": 188000 | |
}, | |
{ | |
"epoch": 10.16, | |
"learning_rate": 0.00016153961531674605, | |
"loss": 2.9137, | |
"step": 189000 | |
}, | |
{ | |
"epoch": 10.21, | |
"learning_rate": 0.00016065731427563083, | |
"loss": 2.9198, | |
"step": 190000 | |
}, | |
{ | |
"epoch": 10.27, | |
"learning_rate": 0.00015977589553555673, | |
"loss": 2.9199, | |
"step": 191000 | |
}, | |
{ | |
"epoch": 10.32, | |
"learning_rate": 0.0001588935944944415, | |
"loss": 2.9239, | |
"step": 192000 | |
}, | |
{ | |
"epoch": 10.38, | |
"learning_rate": 0.00015801129345332626, | |
"loss": 2.9313, | |
"step": 193000 | |
}, | |
{ | |
"epoch": 10.43, | |
"learning_rate": 0.00015712899241221103, | |
"loss": 2.9295, | |
"step": 194000 | |
}, | |
{ | |
"epoch": 10.48, | |
"learning_rate": 0.0001562475736721369, | |
"loss": 2.9337, | |
"step": 195000 | |
}, | |
{ | |
"epoch": 10.54, | |
"learning_rate": 0.0001553661549320628, | |
"loss": 2.9304, | |
"step": 196000 | |
}, | |
{ | |
"epoch": 10.59, | |
"learning_rate": 0.00015448385389094755, | |
"loss": 2.9378, | |
"step": 197000 | |
}, | |
{ | |
"epoch": 10.64, | |
"learning_rate": 0.00015360155284983236, | |
"loss": 2.9367, | |
"step": 198000 | |
}, | |
{ | |
"epoch": 10.7, | |
"learning_rate": 0.00015272013410975825, | |
"loss": 2.9404, | |
"step": 199000 | |
}, | |
{ | |
"epoch": 10.75, | |
"learning_rate": 0.000151837833068643, | |
"loss": 2.9415, | |
"step": 200000 | |
}, | |
{ | |
"epoch": 10.81, | |
"learning_rate": 0.00015095553202752778, | |
"loss": 2.9428, | |
"step": 201000 | |
}, | |
{ | |
"epoch": 10.86, | |
"learning_rate": 0.00015007411328745365, | |
"loss": 2.9405, | |
"step": 202000 | |
}, | |
{ | |
"epoch": 10.91, | |
"learning_rate": 0.00014919181224633843, | |
"loss": 2.943, | |
"step": 203000 | |
}, | |
{ | |
"epoch": 10.97, | |
"learning_rate": 0.00014831039350626432, | |
"loss": 2.9383, | |
"step": 204000 | |
}, | |
{ | |
"epoch": 11.0, | |
"eval_accuracy": 0.4091652648494099, | |
"eval_loss": 3.3513429164886475, | |
"eval_runtime": 149.8996, | |
"eval_samples_per_second": 386.392, | |
"eval_steps_per_second": 6.037, | |
"step": 204611 | |
}, | |
{ | |
"epoch": 11.02, | |
"learning_rate": 0.0001474280924651491, | |
"loss": 2.9199, | |
"step": 205000 | |
}, | |
{ | |
"epoch": 11.07, | |
"learning_rate": 0.00014654579142403385, | |
"loss": 2.8839, | |
"step": 206000 | |
}, | |
{ | |
"epoch": 11.13, | |
"learning_rate": 0.00014566349038291863, | |
"loss": 2.8913, | |
"step": 207000 | |
}, | |
{ | |
"epoch": 11.18, | |
"learning_rate": 0.00014478207164284453, | |
"loss": 2.8901, | |
"step": 208000 | |
}, | |
{ | |
"epoch": 11.24, | |
"learning_rate": 0.0001439006529027704, | |
"loss": 2.8959, | |
"step": 209000 | |
}, | |
{ | |
"epoch": 11.29, | |
"learning_rate": 0.0001430183518616552, | |
"loss": 2.8992, | |
"step": 210000 | |
}, | |
{ | |
"epoch": 11.34, | |
"learning_rate": 0.00014213605082053995, | |
"loss": 2.9022, | |
"step": 211000 | |
}, | |
{ | |
"epoch": 11.4, | |
"learning_rate": 0.00014125374977942473, | |
"loss": 2.9044, | |
"step": 212000 | |
}, | |
{ | |
"epoch": 11.45, | |
"learning_rate": 0.0001403714487383095, | |
"loss": 2.9076, | |
"step": 213000 | |
}, | |
{ | |
"epoch": 11.5, | |
"learning_rate": 0.0001394900299982354, | |
"loss": 2.9106, | |
"step": 214000 | |
}, | |
{ | |
"epoch": 11.56, | |
"learning_rate": 0.00013860772895712015, | |
"loss": 2.9124, | |
"step": 215000 | |
}, | |
{ | |
"epoch": 11.61, | |
"learning_rate": 0.00013772631021704605, | |
"loss": 2.9112, | |
"step": 216000 | |
}, | |
{ | |
"epoch": 11.67, | |
"learning_rate": 0.00013684489147697192, | |
"loss": 2.9151, | |
"step": 217000 | |
}, | |
{ | |
"epoch": 11.72, | |
"learning_rate": 0.0001359625904358567, | |
"loss": 2.916, | |
"step": 218000 | |
}, | |
{ | |
"epoch": 11.77, | |
"learning_rate": 0.0001350811716957826, | |
"loss": 2.9181, | |
"step": 219000 | |
}, | |
{ | |
"epoch": 11.83, | |
"learning_rate": 0.00013419887065466737, | |
"loss": 2.9202, | |
"step": 220000 | |
}, | |
{ | |
"epoch": 11.88, | |
"learning_rate": 0.00013331656961355212, | |
"loss": 2.9172, | |
"step": 221000 | |
}, | |
{ | |
"epoch": 11.93, | |
"learning_rate": 0.0001324342685724369, | |
"loss": 2.9191, | |
"step": 222000 | |
}, | |
{ | |
"epoch": 11.99, | |
"learning_rate": 0.00013155196753132168, | |
"loss": 2.9188, | |
"step": 223000 | |
}, | |
{ | |
"epoch": 12.0, | |
"eval_accuracy": 0.4086410613324161, | |
"eval_loss": 3.3659284114837646, | |
"eval_runtime": 149.7389, | |
"eval_samples_per_second": 386.807, | |
"eval_steps_per_second": 6.044, | |
"step": 223212 | |
}, | |
{ | |
"epoch": 12.04, | |
"learning_rate": 0.00013067054879124757, | |
"loss": 2.8731, | |
"step": 224000 | |
}, | |
{ | |
"epoch": 12.1, | |
"learning_rate": 0.00012978913005117344, | |
"loss": 2.8633, | |
"step": 225000 | |
}, | |
{ | |
"epoch": 12.15, | |
"learning_rate": 0.00012890682901005822, | |
"loss": 2.8684, | |
"step": 226000 | |
}, | |
{ | |
"epoch": 12.2, | |
"learning_rate": 0.000128024527968943, | |
"loss": 2.8681, | |
"step": 227000 | |
}, | |
{ | |
"epoch": 12.26, | |
"learning_rate": 0.00012714222692782778, | |
"loss": 2.8766, | |
"step": 228000 | |
}, | |
{ | |
"epoch": 12.31, | |
"learning_rate": 0.00012625992588671253, | |
"loss": 2.8833, | |
"step": 229000 | |
}, | |
{ | |
"epoch": 12.36, | |
"learning_rate": 0.0001253776248455973, | |
"loss": 2.8843, | |
"step": 230000 | |
}, | |
{ | |
"epoch": 12.42, | |
"learning_rate": 0.0001244962061055232, | |
"loss": 2.8805, | |
"step": 231000 | |
}, | |
{ | |
"epoch": 12.47, | |
"learning_rate": 0.00012361478736544907, | |
"loss": 2.8888, | |
"step": 232000 | |
}, | |
{ | |
"epoch": 12.53, | |
"learning_rate": 0.00012273248632433385, | |
"loss": 2.8911, | |
"step": 233000 | |
}, | |
{ | |
"epoch": 12.58, | |
"learning_rate": 0.00012185018528321863, | |
"loss": 2.8895, | |
"step": 234000 | |
}, | |
{ | |
"epoch": 12.63, | |
"learning_rate": 0.00012096876654314452, | |
"loss": 2.8906, | |
"step": 235000 | |
}, | |
{ | |
"epoch": 12.69, | |
"learning_rate": 0.00012008646550202929, | |
"loss": 2.8921, | |
"step": 236000 | |
}, | |
{ | |
"epoch": 12.74, | |
"learning_rate": 0.00011920504676195517, | |
"loss": 2.8938, | |
"step": 237000 | |
}, | |
{ | |
"epoch": 12.8, | |
"learning_rate": 0.00011832274572083993, | |
"loss": 2.894, | |
"step": 238000 | |
}, | |
{ | |
"epoch": 12.85, | |
"learning_rate": 0.00011744044467972471, | |
"loss": 2.8977, | |
"step": 239000 | |
}, | |
{ | |
"epoch": 12.9, | |
"learning_rate": 0.00011655814363860949, | |
"loss": 2.8978, | |
"step": 240000 | |
}, | |
{ | |
"epoch": 12.96, | |
"learning_rate": 0.00011567672489853537, | |
"loss": 2.8978, | |
"step": 241000 | |
}, | |
{ | |
"epoch": 13.0, | |
"eval_accuracy": 0.4096840939871445, | |
"eval_loss": 3.3580610752105713, | |
"eval_runtime": 149.9409, | |
"eval_samples_per_second": 386.285, | |
"eval_steps_per_second": 6.036, | |
"step": 241813 | |
}, | |
{ | |
"epoch": 13.01, | |
"learning_rate": 0.00011479442385742015, | |
"loss": 2.8892, | |
"step": 242000 | |
}, | |
{ | |
"epoch": 13.06, | |
"learning_rate": 0.00011391212281630491, | |
"loss": 2.839, | |
"step": 243000 | |
}, | |
{ | |
"epoch": 13.12, | |
"learning_rate": 0.0001130307040762308, | |
"loss": 2.8503, | |
"step": 244000 | |
}, | |
{ | |
"epoch": 13.17, | |
"learning_rate": 0.00011214840303511556, | |
"loss": 2.8516, | |
"step": 245000 | |
}, | |
{ | |
"epoch": 13.23, | |
"learning_rate": 0.00011126610199400035, | |
"loss": 2.8554, | |
"step": 246000 | |
}, | |
{ | |
"epoch": 13.28, | |
"learning_rate": 0.00011038468325392623, | |
"loss": 2.8568, | |
"step": 247000 | |
}, | |
{ | |
"epoch": 13.33, | |
"learning_rate": 0.000109502382212811, | |
"loss": 2.862, | |
"step": 248000 | |
}, | |
{ | |
"epoch": 13.39, | |
"learning_rate": 0.00010862008117169576, | |
"loss": 2.8615, | |
"step": 249000 | |
}, | |
{ | |
"epoch": 13.44, | |
"learning_rate": 0.00010773778013058055, | |
"loss": 2.8704, | |
"step": 250000 | |
}, | |
{ | |
"epoch": 13.49, | |
"learning_rate": 0.00010685636139050644, | |
"loss": 2.8627, | |
"step": 251000 | |
}, | |
{ | |
"epoch": 13.55, | |
"learning_rate": 0.00010597494265043232, | |
"loss": 2.8684, | |
"step": 252000 | |
}, | |
{ | |
"epoch": 13.6, | |
"learning_rate": 0.0001050926416093171, | |
"loss": 2.8714, | |
"step": 253000 | |
}, | |
{ | |
"epoch": 13.66, | |
"learning_rate": 0.00010421034056820186, | |
"loss": 2.8749, | |
"step": 254000 | |
}, | |
{ | |
"epoch": 13.71, | |
"learning_rate": 0.00010332892182812776, | |
"loss": 2.87, | |
"step": 255000 | |
}, | |
{ | |
"epoch": 13.76, | |
"learning_rate": 0.00010244662078701252, | |
"loss": 2.8721, | |
"step": 256000 | |
}, | |
{ | |
"epoch": 13.82, | |
"learning_rate": 0.0001015643197458973, | |
"loss": 2.8775, | |
"step": 257000 | |
}, | |
{ | |
"epoch": 13.87, | |
"learning_rate": 0.00010068201870478206, | |
"loss": 2.8779, | |
"step": 258000 | |
}, | |
{ | |
"epoch": 13.92, | |
"learning_rate": 9.980148226574907e-05, | |
"loss": 2.8696, | |
"step": 259000 | |
}, | |
{ | |
"epoch": 13.98, | |
"learning_rate": 9.891918122463383e-05, | |
"loss": 2.8784, | |
"step": 260000 | |
}, | |
{ | |
"epoch": 14.0, | |
"eval_accuracy": 0.4103098060923963, | |
"eval_loss": 3.3657193183898926, | |
"eval_runtime": 154.6068, | |
"eval_samples_per_second": 374.628, | |
"eval_steps_per_second": 5.854, | |
"step": 260414 | |
}, | |
{ | |
"epoch": 14.03, | |
"learning_rate": 9.803776248455973e-05, | |
"loss": 2.8434, | |
"step": 261000 | |
}, | |
{ | |
"epoch": 14.09, | |
"learning_rate": 9.715546144344449e-05, | |
"loss": 2.824, | |
"step": 262000 | |
}, | |
{ | |
"epoch": 14.14, | |
"learning_rate": 9.627404270337037e-05, | |
"loss": 2.8286, | |
"step": 263000 | |
}, | |
{ | |
"epoch": 14.19, | |
"learning_rate": 9.539174166225517e-05, | |
"loss": 2.8351, | |
"step": 264000 | |
}, | |
{ | |
"epoch": 14.25, | |
"learning_rate": 9.450944062113993e-05, | |
"loss": 2.8403, | |
"step": 265000 | |
}, | |
{ | |
"epoch": 14.3, | |
"learning_rate": 9.362713958002469e-05, | |
"loss": 2.8419, | |
"step": 266000 | |
}, | |
{ | |
"epoch": 14.35, | |
"learning_rate": 9.274483853890947e-05, | |
"loss": 2.8428, | |
"step": 267000 | |
}, | |
{ | |
"epoch": 14.41, | |
"learning_rate": 9.186341979883535e-05, | |
"loss": 2.8455, | |
"step": 268000 | |
}, | |
{ | |
"epoch": 14.46, | |
"learning_rate": 9.098200105876124e-05, | |
"loss": 2.8471, | |
"step": 269000 | |
}, | |
{ | |
"epoch": 14.52, | |
"learning_rate": 9.009970001764601e-05, | |
"loss": 2.8474, | |
"step": 270000 | |
}, | |
{ | |
"epoch": 14.57, | |
"learning_rate": 8.92182812775719e-05, | |
"loss": 2.8482, | |
"step": 271000 | |
}, | |
{ | |
"epoch": 14.62, | |
"learning_rate": 8.833598023645668e-05, | |
"loss": 2.8493, | |
"step": 272000 | |
}, | |
{ | |
"epoch": 14.68, | |
"learning_rate": 8.745367919534144e-05, | |
"loss": 2.8536, | |
"step": 273000 | |
}, | |
{ | |
"epoch": 14.73, | |
"learning_rate": 8.65713781542262e-05, | |
"loss": 2.853, | |
"step": 274000 | |
}, | |
{ | |
"epoch": 14.78, | |
"learning_rate": 8.5689077113111e-05, | |
"loss": 2.8526, | |
"step": 275000 | |
}, | |
{ | |
"epoch": 14.84, | |
"learning_rate": 8.4808540674078e-05, | |
"loss": 2.8543, | |
"step": 276000 | |
}, | |
{ | |
"epoch": 14.89, | |
"learning_rate": 8.392623963296276e-05, | |
"loss": 2.8549, | |
"step": 277000 | |
}, | |
{ | |
"epoch": 14.95, | |
"learning_rate": 8.304393859184754e-05, | |
"loss": 2.8575, | |
"step": 278000 | |
}, | |
{ | |
"epoch": 15.0, | |
"learning_rate": 8.216251985177341e-05, | |
"loss": 2.8592, | |
"step": 279000 | |
}, | |
{ | |
"epoch": 15.0, | |
"eval_accuracy": 0.4102248237203603, | |
"eval_loss": 3.3692827224731445, | |
"eval_runtime": 155.1467, | |
"eval_samples_per_second": 373.324, | |
"eval_steps_per_second": 5.833, | |
"step": 279015 | |
}, | |
{ | |
"epoch": 15.05, | |
"learning_rate": 8.12802188106582e-05, | |
"loss": 2.806, | |
"step": 280000 | |
}, | |
{ | |
"epoch": 15.11, | |
"learning_rate": 8.039880007058407e-05, | |
"loss": 2.8122, | |
"step": 281000 | |
}, | |
{ | |
"epoch": 15.16, | |
"learning_rate": 7.951649902946885e-05, | |
"loss": 2.8176, | |
"step": 282000 | |
}, | |
{ | |
"epoch": 15.21, | |
"learning_rate": 7.863419798835361e-05, | |
"loss": 2.8156, | |
"step": 283000 | |
}, | |
{ | |
"epoch": 15.27, | |
"learning_rate": 7.775277924827951e-05, | |
"loss": 2.8164, | |
"step": 284000 | |
}, | |
{ | |
"epoch": 15.32, | |
"learning_rate": 7.687047820716427e-05, | |
"loss": 2.8215, | |
"step": 285000 | |
}, | |
{ | |
"epoch": 15.38, | |
"learning_rate": 7.598905946709017e-05, | |
"loss": 2.8236, | |
"step": 286000 | |
}, | |
{ | |
"epoch": 15.43, | |
"learning_rate": 7.510675842597493e-05, | |
"loss": 2.8233, | |
"step": 287000 | |
}, | |
{ | |
"epoch": 15.48, | |
"learning_rate": 7.422533968590081e-05, | |
"loss": 2.8257, | |
"step": 288000 | |
}, | |
{ | |
"epoch": 15.54, | |
"learning_rate": 7.334303864478559e-05, | |
"loss": 2.8313, | |
"step": 289000 | |
}, | |
{ | |
"epoch": 15.59, | |
"learning_rate": 7.246161990471149e-05, | |
"loss": 2.8306, | |
"step": 290000 | |
}, | |
{ | |
"epoch": 15.64, | |
"learning_rate": 7.157931886359625e-05, | |
"loss": 2.8365, | |
"step": 291000 | |
}, | |
{ | |
"epoch": 15.7, | |
"learning_rate": 7.069701782248102e-05, | |
"loss": 2.8369, | |
"step": 292000 | |
}, | |
{ | |
"epoch": 15.75, | |
"learning_rate": 6.981559908240691e-05, | |
"loss": 2.8358, | |
"step": 293000 | |
}, | |
{ | |
"epoch": 15.81, | |
"learning_rate": 6.893329804129168e-05, | |
"loss": 2.8321, | |
"step": 294000 | |
}, | |
{ | |
"epoch": 15.86, | |
"learning_rate": 6.805187930121757e-05, | |
"loss": 2.8366, | |
"step": 295000 | |
}, | |
{ | |
"epoch": 15.91, | |
"learning_rate": 6.716957826010234e-05, | |
"loss": 2.8378, | |
"step": 296000 | |
}, | |
{ | |
"epoch": 15.97, | |
"learning_rate": 6.62872772189871e-05, | |
"loss": 2.8415, | |
"step": 297000 | |
}, | |
{ | |
"epoch": 16.0, | |
"eval_accuracy": 0.40921652299159444, | |
"eval_loss": 3.386690616607666, | |
"eval_runtime": 150.4052, | |
"eval_samples_per_second": 385.093, | |
"eval_steps_per_second": 6.017, | |
"step": 297616 | |
}, | |
{ | |
"epoch": 16.02, | |
"learning_rate": 6.540497617787188e-05, | |
"loss": 2.8232, | |
"step": 298000 | |
}, | |
{ | |
"epoch": 16.07, | |
"learning_rate": 6.452267513675666e-05, | |
"loss": 2.7984, | |
"step": 299000 | |
}, | |
{ | |
"epoch": 16.13, | |
"learning_rate": 6.364213869772366e-05, | |
"loss": 2.795, | |
"step": 300000 | |
}, | |
{ | |
"epoch": 16.18, | |
"learning_rate": 6.275983765660842e-05, | |
"loss": 2.7994, | |
"step": 301000 | |
}, | |
{ | |
"epoch": 16.24, | |
"learning_rate": 6.187841891653431e-05, | |
"loss": 2.8011, | |
"step": 302000 | |
}, | |
{ | |
"epoch": 16.29, | |
"learning_rate": 6.0996117875419085e-05, | |
"loss": 2.8058, | |
"step": 303000 | |
}, | |
{ | |
"epoch": 16.34, | |
"learning_rate": 6.011469913534497e-05, | |
"loss": 2.8093, | |
"step": 304000 | |
}, | |
{ | |
"epoch": 16.4, | |
"learning_rate": 5.9232398094229746e-05, | |
"loss": 2.8096, | |
"step": 305000 | |
}, | |
{ | |
"epoch": 16.45, | |
"learning_rate": 5.8350979354155636e-05, | |
"loss": 2.8079, | |
"step": 306000 | |
}, | |
{ | |
"epoch": 16.5, | |
"learning_rate": 5.74686783130404e-05, | |
"loss": 2.8131, | |
"step": 307000 | |
}, | |
{ | |
"epoch": 16.56, | |
"learning_rate": 5.658637727192517e-05, | |
"loss": 2.8123, | |
"step": 308000 | |
}, | |
{ | |
"epoch": 16.61, | |
"learning_rate": 5.570495853185107e-05, | |
"loss": 2.8134, | |
"step": 309000 | |
}, | |
{ | |
"epoch": 16.67, | |
"learning_rate": 5.482265749073583e-05, | |
"loss": 2.817, | |
"step": 310000 | |
}, | |
{ | |
"epoch": 16.72, | |
"learning_rate": 5.39403564496206e-05, | |
"loss": 2.8147, | |
"step": 311000 | |
}, | |
{ | |
"epoch": 16.77, | |
"learning_rate": 5.305805540850538e-05, | |
"loss": 2.8145, | |
"step": 312000 | |
}, | |
{ | |
"epoch": 16.83, | |
"learning_rate": 5.217575436739015e-05, | |
"loss": 2.8123, | |
"step": 313000 | |
}, | |
{ | |
"epoch": 16.88, | |
"learning_rate": 5.129521792835715e-05, | |
"loss": 2.8173, | |
"step": 314000 | |
}, | |
{ | |
"epoch": 16.93, | |
"learning_rate": 5.041291688724192e-05, | |
"loss": 2.8142, | |
"step": 315000 | |
}, | |
{ | |
"epoch": 16.99, | |
"learning_rate": 4.95306158461267e-05, | |
"loss": 2.8198, | |
"step": 316000 | |
}, | |
{ | |
"epoch": 17.0, | |
"eval_accuracy": 0.41009368886643593, | |
"eval_loss": 3.3789703845977783, | |
"eval_runtime": 149.2821, | |
"eval_samples_per_second": 387.99, | |
"eval_steps_per_second": 6.062, | |
"step": 316217 | |
}, | |
{ | |
"epoch": 17.04, | |
"learning_rate": 4.8649197106052584e-05, | |
"loss": 2.7896, | |
"step": 317000 | |
}, | |
{ | |
"epoch": 17.1, | |
"learning_rate": 4.7766896064937355e-05, | |
"loss": 2.7827, | |
"step": 318000 | |
}, | |
{ | |
"epoch": 17.15, | |
"learning_rate": 4.6884595023822125e-05, | |
"loss": 2.7812, | |
"step": 319000 | |
}, | |
{ | |
"epoch": 17.2, | |
"learning_rate": 4.600317628374801e-05, | |
"loss": 2.7871, | |
"step": 320000 | |
}, | |
{ | |
"epoch": 17.26, | |
"learning_rate": 4.512175754367389e-05, | |
"loss": 2.7895, | |
"step": 321000 | |
}, | |
{ | |
"epoch": 17.31, | |
"learning_rate": 4.423945650255867e-05, | |
"loss": 2.7907, | |
"step": 322000 | |
}, | |
{ | |
"epoch": 17.36, | |
"learning_rate": 4.335715546144344e-05, | |
"loss": 2.788, | |
"step": 323000 | |
}, | |
{ | |
"epoch": 17.42, | |
"learning_rate": 4.247485442032821e-05, | |
"loss": 2.7862, | |
"step": 324000 | |
}, | |
{ | |
"epoch": 17.47, | |
"learning_rate": 4.159255337921299e-05, | |
"loss": 2.7948, | |
"step": 325000 | |
}, | |
{ | |
"epoch": 17.53, | |
"learning_rate": 4.071113463913887e-05, | |
"loss": 2.7939, | |
"step": 326000 | |
}, | |
{ | |
"epoch": 17.58, | |
"learning_rate": 3.982883359802364e-05, | |
"loss": 2.7953, | |
"step": 327000 | |
}, | |
{ | |
"epoch": 17.63, | |
"learning_rate": 3.894653255690842e-05, | |
"loss": 2.7983, | |
"step": 328000 | |
}, | |
{ | |
"epoch": 17.69, | |
"learning_rate": 3.80651138168343e-05, | |
"loss": 2.7977, | |
"step": 329000 | |
}, | |
{ | |
"epoch": 17.74, | |
"learning_rate": 3.7182812775719074e-05, | |
"loss": 2.7977, | |
"step": 330000 | |
}, | |
{ | |
"epoch": 17.79, | |
"learning_rate": 3.6300511734603844e-05, | |
"loss": 2.7985, | |
"step": 331000 | |
}, | |
{ | |
"epoch": 17.85, | |
"learning_rate": 3.541909299452973e-05, | |
"loss": 2.7956, | |
"step": 332000 | |
}, | |
{ | |
"epoch": 17.9, | |
"learning_rate": 3.4536791953414505e-05, | |
"loss": 2.8001, | |
"step": 333000 | |
}, | |
{ | |
"epoch": 17.96, | |
"learning_rate": 3.3654490912299276e-05, | |
"loss": 2.8013, | |
"step": 334000 | |
}, | |
{ | |
"epoch": 18.0, | |
"eval_accuracy": 0.40988771578132727, | |
"eval_loss": 3.392399311065674, | |
"eval_runtime": 149.3924, | |
"eval_samples_per_second": 387.704, | |
"eval_steps_per_second": 6.058, | |
"step": 334818 | |
}, | |
{ | |
"epoch": 18.01, | |
"learning_rate": 3.2772189871184046e-05, | |
"loss": 2.795, | |
"step": 335000 | |
}, | |
{ | |
"epoch": 18.06, | |
"learning_rate": 3.189077113110993e-05, | |
"loss": 2.7694, | |
"step": 336000 | |
}, | |
{ | |
"epoch": 18.12, | |
"learning_rate": 3.101023469207693e-05, | |
"loss": 2.7677, | |
"step": 337000 | |
}, | |
{ | |
"epoch": 18.17, | |
"learning_rate": 3.0127933650961703e-05, | |
"loss": 2.7737, | |
"step": 338000 | |
}, | |
{ | |
"epoch": 18.22, | |
"learning_rate": 2.9245632609846477e-05, | |
"loss": 2.772, | |
"step": 339000 | |
}, | |
{ | |
"epoch": 18.28, | |
"learning_rate": 2.836333156873125e-05, | |
"loss": 2.7774, | |
"step": 340000 | |
}, | |
{ | |
"epoch": 18.33, | |
"learning_rate": 2.748103052761602e-05, | |
"loss": 2.7737, | |
"step": 341000 | |
}, | |
{ | |
"epoch": 18.39, | |
"learning_rate": 2.6599611787541905e-05, | |
"loss": 2.7789, | |
"step": 342000 | |
}, | |
{ | |
"epoch": 18.44, | |
"learning_rate": 2.571731074642668e-05, | |
"loss": 2.779, | |
"step": 343000 | |
}, | |
{ | |
"epoch": 18.49, | |
"learning_rate": 2.483500970531145e-05, | |
"loss": 2.7798, | |
"step": 344000 | |
}, | |
{ | |
"epoch": 18.55, | |
"learning_rate": 2.3953590965237337e-05, | |
"loss": 2.7792, | |
"step": 345000 | |
}, | |
{ | |
"epoch": 18.6, | |
"learning_rate": 2.307128992412211e-05, | |
"loss": 2.7788, | |
"step": 346000 | |
}, | |
{ | |
"epoch": 18.65, | |
"learning_rate": 2.2189871184047998e-05, | |
"loss": 2.7776, | |
"step": 347000 | |
}, | |
{ | |
"epoch": 18.71, | |
"learning_rate": 2.1307570142932765e-05, | |
"loss": 2.779, | |
"step": 348000 | |
}, | |
{ | |
"epoch": 18.76, | |
"learning_rate": 2.042526910181754e-05, | |
"loss": 2.7764, | |
"step": 349000 | |
}, | |
{ | |
"epoch": 18.82, | |
"learning_rate": 1.9542968060702313e-05, | |
"loss": 2.7811, | |
"step": 350000 | |
}, | |
{ | |
"epoch": 18.87, | |
"learning_rate": 1.8661549320628196e-05, | |
"loss": 2.7809, | |
"step": 351000 | |
}, | |
{ | |
"epoch": 18.92, | |
"learning_rate": 1.7780130580554083e-05, | |
"loss": 2.7794, | |
"step": 352000 | |
}, | |
{ | |
"epoch": 18.98, | |
"learning_rate": 1.6897829539438854e-05, | |
"loss": 2.7836, | |
"step": 353000 | |
}, | |
{ | |
"epoch": 19.0, | |
"eval_accuracy": 0.4097668594277361, | |
"eval_loss": 3.401383399963379, | |
"eval_runtime": 149.562, | |
"eval_samples_per_second": 387.264, | |
"eval_steps_per_second": 6.051, | |
"step": 353419 | |
}, | |
{ | |
"epoch": 19.03, | |
"learning_rate": 1.6017293100405857e-05, | |
"loss": 2.7676, | |
"step": 354000 | |
}, | |
{ | |
"epoch": 19.08, | |
"learning_rate": 1.5134992059290627e-05, | |
"loss": 2.7617, | |
"step": 355000 | |
}, | |
{ | |
"epoch": 19.14, | |
"learning_rate": 1.4252691018175401e-05, | |
"loss": 2.7597, | |
"step": 356000 | |
}, | |
{ | |
"epoch": 19.19, | |
"learning_rate": 1.3371272278101286e-05, | |
"loss": 2.7629, | |
"step": 357000 | |
}, | |
{ | |
"epoch": 19.25, | |
"learning_rate": 1.2488971236986057e-05, | |
"loss": 2.7652, | |
"step": 358000 | |
}, | |
{ | |
"epoch": 19.3, | |
"learning_rate": 1.1606670195870831e-05, | |
"loss": 2.7624, | |
"step": 359000 | |
}, | |
{ | |
"epoch": 19.35, | |
"learning_rate": 1.0725251455796716e-05, | |
"loss": 2.7646, | |
"step": 360000 | |
}, | |
{ | |
"epoch": 19.41, | |
"learning_rate": 9.84295041468149e-06, | |
"loss": 2.7645, | |
"step": 361000 | |
}, | |
{ | |
"epoch": 19.46, | |
"learning_rate": 8.96064937356626e-06, | |
"loss": 2.7625, | |
"step": 362000 | |
}, | |
{ | |
"epoch": 19.52, | |
"learning_rate": 8.078348332451031e-06, | |
"loss": 2.7621, | |
"step": 363000 | |
}, | |
{ | |
"epoch": 19.57, | |
"learning_rate": 7.196047291335803e-06, | |
"loss": 2.7639, | |
"step": 364000 | |
}, | |
{ | |
"epoch": 19.62, | |
"learning_rate": 6.3146285512616895e-06, | |
"loss": 2.7626, | |
"step": 365000 | |
}, | |
{ | |
"epoch": 19.68, | |
"learning_rate": 5.432327510146462e-06, | |
"loss": 2.7637, | |
"step": 366000 | |
}, | |
{ | |
"epoch": 19.73, | |
"learning_rate": 4.550026469031233e-06, | |
"loss": 2.7605, | |
"step": 367000 | |
}, | |
{ | |
"epoch": 19.78, | |
"learning_rate": 3.66860772895712e-06, | |
"loss": 2.7683, | |
"step": 368000 | |
}, | |
{ | |
"epoch": 19.84, | |
"learning_rate": 2.7863066878418916e-06, | |
"loss": 2.7608, | |
"step": 369000 | |
}, | |
{ | |
"epoch": 19.89, | |
"learning_rate": 1.904005646726663e-06, | |
"loss": 2.765, | |
"step": 370000 | |
}, | |
{ | |
"epoch": 19.95, | |
"learning_rate": 1.0225869066525498e-06, | |
"loss": 2.7635, | |
"step": 371000 | |
}, | |
{ | |
"epoch": 20.0, | |
"learning_rate": 1.4028586553732134e-07, | |
"loss": 2.7626, | |
"step": 372000 | |
}, | |
{ | |
"epoch": 20.0, | |
"eval_accuracy": 0.40992788926628976, | |
"eval_loss": 3.4053595066070557, | |
"eval_runtime": 149.4807, | |
"eval_samples_per_second": 387.475, | |
"eval_steps_per_second": 6.054, | |
"step": 372020 | |
}, | |
{ | |
"epoch": 20.0, | |
"step": 372020, | |
"total_flos": 1.56740238729216e+18, | |
"train_loss": 3.0558329318429918, | |
"train_runtime": 48026.3336, | |
"train_samples_per_second": 247.871, | |
"train_steps_per_second": 7.746 | |
} | |
], | |
"logging_steps": 1000, | |
"max_steps": 372020, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 20, | |
"save_steps": 5000, | |
"total_flos": 1.56740238729216e+18, | |
"train_batch_size": 32, | |
"trial_name": null, | |
"trial_params": null | |
} | |