kanishka's picture
End of training
4689630
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 744020,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"learning_rate": 3.125e-05,
"loss": 6.2858,
"step": 1000
},
{
"epoch": 0.05,
"learning_rate": 6.25e-05,
"loss": 5.118,
"step": 2000
},
{
"epoch": 0.08,
"learning_rate": 9.375e-05,
"loss": 4.8257,
"step": 3000
},
{
"epoch": 0.11,
"learning_rate": 0.000125,
"loss": 4.62,
"step": 4000
},
{
"epoch": 0.13,
"learning_rate": 0.00015625,
"loss": 4.4689,
"step": 5000
},
{
"epoch": 0.16,
"learning_rate": 0.0001875,
"loss": 4.3516,
"step": 6000
},
{
"epoch": 0.19,
"learning_rate": 0.00021875,
"loss": 4.2636,
"step": 7000
},
{
"epoch": 0.22,
"learning_rate": 0.00025,
"loss": 4.1906,
"step": 8000
},
{
"epoch": 0.24,
"learning_rate": 0.00028125000000000003,
"loss": 4.1354,
"step": 9000
},
{
"epoch": 0.27,
"learning_rate": 0.0003125,
"loss": 4.0582,
"step": 10000
},
{
"epoch": 0.3,
"learning_rate": 0.00034365625,
"loss": 3.9986,
"step": 11000
},
{
"epoch": 0.32,
"learning_rate": 0.00037490625,
"loss": 3.9535,
"step": 12000
},
{
"epoch": 0.35,
"learning_rate": 0.00040615625,
"loss": 3.9158,
"step": 13000
},
{
"epoch": 0.38,
"learning_rate": 0.00043737500000000005,
"loss": 3.8747,
"step": 14000
},
{
"epoch": 0.4,
"learning_rate": 0.000468625,
"loss": 3.8471,
"step": 15000
},
{
"epoch": 0.43,
"learning_rate": 0.00049984375,
"loss": 3.8178,
"step": 16000
},
{
"epoch": 0.46,
"learning_rate": 0.00053109375,
"loss": 3.794,
"step": 17000
},
{
"epoch": 0.48,
"learning_rate": 0.0005623125,
"loss": 3.7705,
"step": 18000
},
{
"epoch": 0.51,
"learning_rate": 0.0005935625,
"loss": 3.757,
"step": 19000
},
{
"epoch": 0.54,
"learning_rate": 0.00062478125,
"loss": 3.731,
"step": 20000
},
{
"epoch": 0.56,
"learning_rate": 0.0006560312499999999,
"loss": 3.7135,
"step": 21000
},
{
"epoch": 0.59,
"learning_rate": 0.00068725,
"loss": 3.6993,
"step": 22000
},
{
"epoch": 0.62,
"learning_rate": 0.00071846875,
"loss": 3.6745,
"step": 23000
},
{
"epoch": 0.65,
"learning_rate": 0.00074971875,
"loss": 3.668,
"step": 24000
},
{
"epoch": 0.67,
"learning_rate": 0.0007809375,
"loss": 3.6522,
"step": 25000
},
{
"epoch": 0.7,
"learning_rate": 0.0008121875,
"loss": 3.6422,
"step": 26000
},
{
"epoch": 0.73,
"learning_rate": 0.0008434062500000001,
"loss": 3.6284,
"step": 27000
},
{
"epoch": 0.75,
"learning_rate": 0.00087465625,
"loss": 3.6213,
"step": 28000
},
{
"epoch": 0.78,
"learning_rate": 0.00090590625,
"loss": 3.6071,
"step": 29000
},
{
"epoch": 0.81,
"learning_rate": 0.000937125,
"loss": 3.5975,
"step": 30000
},
{
"epoch": 0.83,
"learning_rate": 0.000968375,
"loss": 3.5921,
"step": 31000
},
{
"epoch": 0.86,
"learning_rate": 0.00099959375,
"loss": 3.5862,
"step": 32000
},
{
"epoch": 0.89,
"learning_rate": 0.0009986138029830622,
"loss": 3.5652,
"step": 33000
},
{
"epoch": 0.91,
"learning_rate": 0.000997209348052021,
"loss": 3.5558,
"step": 34000
},
{
"epoch": 0.94,
"learning_rate": 0.0009958048931209798,
"loss": 3.5398,
"step": 35000
},
{
"epoch": 0.97,
"learning_rate": 0.0009944018426448695,
"loss": 3.527,
"step": 36000
},
{
"epoch": 0.99,
"learning_rate": 0.0009929973877138283,
"loss": 3.5148,
"step": 37000
},
{
"epoch": 1.0,
"eval_accuracy": 0.3670570705333534,
"eval_loss": 3.7269980907440186,
"eval_runtime": 147.4611,
"eval_samples_per_second": 392.781,
"eval_steps_per_second": 6.137,
"step": 37201
},
{
"epoch": 1.02,
"learning_rate": 0.0009915929327827871,
"loss": 3.4761,
"step": 38000
},
{
"epoch": 1.05,
"learning_rate": 0.0009901898823066768,
"loss": 3.4631,
"step": 39000
},
{
"epoch": 1.08,
"learning_rate": 0.0009887854273756356,
"loss": 3.4614,
"step": 40000
},
{
"epoch": 1.1,
"learning_rate": 0.0009873809724445942,
"loss": 3.4538,
"step": 41000
},
{
"epoch": 1.13,
"learning_rate": 0.000985976517513553,
"loss": 3.4587,
"step": 42000
},
{
"epoch": 1.16,
"learning_rate": 0.0009845720625825118,
"loss": 3.4417,
"step": 43000
},
{
"epoch": 1.18,
"learning_rate": 0.0009831690121064015,
"loss": 3.4351,
"step": 44000
},
{
"epoch": 1.21,
"learning_rate": 0.0009817645571753603,
"loss": 3.4312,
"step": 45000
},
{
"epoch": 1.24,
"learning_rate": 0.0009803601022443191,
"loss": 3.4235,
"step": 46000
},
{
"epoch": 1.26,
"learning_rate": 0.0009789570517682088,
"loss": 3.4191,
"step": 47000
},
{
"epoch": 1.29,
"learning_rate": 0.0009775525968371674,
"loss": 3.4077,
"step": 48000
},
{
"epoch": 1.32,
"learning_rate": 0.0009761495463610572,
"loss": 3.4119,
"step": 49000
},
{
"epoch": 1.34,
"learning_rate": 0.000974745091430016,
"loss": 3.392,
"step": 50000
},
{
"epoch": 1.37,
"learning_rate": 0.0009733406364989748,
"loss": 3.3985,
"step": 51000
},
{
"epoch": 1.4,
"learning_rate": 0.0009719361815679335,
"loss": 3.3884,
"step": 52000
},
{
"epoch": 1.42,
"learning_rate": 0.0009705317266368922,
"loss": 3.3792,
"step": 53000
},
{
"epoch": 1.45,
"learning_rate": 0.000969127271705851,
"loss": 3.3734,
"step": 54000
},
{
"epoch": 1.48,
"learning_rate": 0.0009677242212297408,
"loss": 3.3699,
"step": 55000
},
{
"epoch": 1.51,
"learning_rate": 0.0009663197662986994,
"loss": 3.3721,
"step": 56000
},
{
"epoch": 1.53,
"learning_rate": 0.0009649167158225893,
"loss": 3.3688,
"step": 57000
},
{
"epoch": 1.56,
"learning_rate": 0.000963512260891548,
"loss": 3.3598,
"step": 58000
},
{
"epoch": 1.59,
"learning_rate": 0.0009621078059605067,
"loss": 3.3608,
"step": 59000
},
{
"epoch": 1.61,
"learning_rate": 0.0009607047554843966,
"loss": 3.3529,
"step": 60000
},
{
"epoch": 1.64,
"learning_rate": 0.0009593003005533553,
"loss": 3.3448,
"step": 61000
},
{
"epoch": 1.67,
"learning_rate": 0.000957895845622314,
"loss": 3.3475,
"step": 62000
},
{
"epoch": 1.69,
"learning_rate": 0.0009564927951462038,
"loss": 3.3339,
"step": 63000
},
{
"epoch": 1.72,
"learning_rate": 0.0009550883402151626,
"loss": 3.3342,
"step": 64000
},
{
"epoch": 1.75,
"learning_rate": 0.0009536852897390523,
"loss": 3.3323,
"step": 65000
},
{
"epoch": 1.77,
"learning_rate": 0.0009522808348080109,
"loss": 3.3332,
"step": 66000
},
{
"epoch": 1.8,
"learning_rate": 0.0009508763798769697,
"loss": 3.32,
"step": 67000
},
{
"epoch": 1.83,
"learning_rate": 0.0009494733294008595,
"loss": 3.3214,
"step": 68000
},
{
"epoch": 1.85,
"learning_rate": 0.0009480688744698182,
"loss": 3.3226,
"step": 69000
},
{
"epoch": 1.88,
"learning_rate": 0.000946664419538777,
"loss": 3.3158,
"step": 70000
},
{
"epoch": 1.91,
"learning_rate": 0.0009452613690626668,
"loss": 3.3179,
"step": 71000
},
{
"epoch": 1.94,
"learning_rate": 0.0009438569141316255,
"loss": 3.3089,
"step": 72000
},
{
"epoch": 1.96,
"learning_rate": 0.0009424524592005843,
"loss": 3.3103,
"step": 73000
},
{
"epoch": 1.99,
"learning_rate": 0.0009410480042695429,
"loss": 3.3074,
"step": 74000
},
{
"epoch": 2.0,
"eval_accuracy": 0.38968938775071477,
"eval_loss": 3.484098196029663,
"eval_runtime": 146.602,
"eval_samples_per_second": 395.083,
"eval_steps_per_second": 6.173,
"step": 74402
},
{
"epoch": 2.02,
"learning_rate": 0.0009396435493385018,
"loss": 3.2586,
"step": 75000
},
{
"epoch": 2.04,
"learning_rate": 0.0009382419033173227,
"loss": 3.2401,
"step": 76000
},
{
"epoch": 2.07,
"learning_rate": 0.0009368374483862813,
"loss": 3.2462,
"step": 77000
},
{
"epoch": 2.1,
"learning_rate": 0.00093543299345524,
"loss": 3.2422,
"step": 78000
},
{
"epoch": 2.12,
"learning_rate": 0.0009340285385241988,
"loss": 3.2446,
"step": 79000
},
{
"epoch": 2.15,
"learning_rate": 0.0009326254880480886,
"loss": 3.2477,
"step": 80000
},
{
"epoch": 2.18,
"learning_rate": 0.0009312224375719783,
"loss": 3.2454,
"step": 81000
},
{
"epoch": 2.2,
"learning_rate": 0.000929817982640937,
"loss": 3.2402,
"step": 82000
},
{
"epoch": 2.23,
"learning_rate": 0.0009284135277098959,
"loss": 3.2414,
"step": 83000
},
{
"epoch": 2.26,
"learning_rate": 0.0009270104772337856,
"loss": 3.2409,
"step": 84000
},
{
"epoch": 2.28,
"learning_rate": 0.0009256060223027443,
"loss": 3.2413,
"step": 85000
},
{
"epoch": 2.31,
"learning_rate": 0.000924201567371703,
"loss": 3.2392,
"step": 86000
},
{
"epoch": 2.34,
"learning_rate": 0.0009227985168955928,
"loss": 3.2399,
"step": 87000
},
{
"epoch": 2.37,
"learning_rate": 0.0009213940619645515,
"loss": 3.2404,
"step": 88000
},
{
"epoch": 2.39,
"learning_rate": 0.0009199896070335103,
"loss": 3.2342,
"step": 89000
},
{
"epoch": 2.42,
"learning_rate": 0.000918585152102469,
"loss": 3.2307,
"step": 90000
},
{
"epoch": 2.45,
"learning_rate": 0.0009171821016263588,
"loss": 3.2342,
"step": 91000
},
{
"epoch": 2.47,
"learning_rate": 0.0009157790511502487,
"loss": 3.2372,
"step": 92000
},
{
"epoch": 2.5,
"learning_rate": 0.0009143745962192074,
"loss": 3.2274,
"step": 93000
},
{
"epoch": 2.53,
"learning_rate": 0.0009129715457430971,
"loss": 3.2337,
"step": 94000
},
{
"epoch": 2.55,
"learning_rate": 0.000911567090812056,
"loss": 3.2228,
"step": 95000
},
{
"epoch": 2.58,
"learning_rate": 0.0009101626358810146,
"loss": 3.2285,
"step": 96000
},
{
"epoch": 2.61,
"learning_rate": 0.0009087581809499733,
"loss": 3.2247,
"step": 97000
},
{
"epoch": 2.63,
"learning_rate": 0.000907355130473863,
"loss": 3.2241,
"step": 98000
},
{
"epoch": 2.66,
"learning_rate": 0.0009059520799977529,
"loss": 3.2243,
"step": 99000
},
{
"epoch": 2.69,
"learning_rate": 0.0009045476250667116,
"loss": 3.221,
"step": 100000
},
{
"epoch": 2.71,
"learning_rate": 0.0009031431701356703,
"loss": 3.2195,
"step": 101000
},
{
"epoch": 2.74,
"learning_rate": 0.0009017401196595602,
"loss": 3.2168,
"step": 102000
},
{
"epoch": 2.77,
"learning_rate": 0.0009003356647285189,
"loss": 3.2185,
"step": 103000
},
{
"epoch": 2.8,
"learning_rate": 0.0008989326142524087,
"loss": 3.2177,
"step": 104000
},
{
"epoch": 2.82,
"learning_rate": 0.0008975281593213675,
"loss": 3.2188,
"step": 105000
},
{
"epoch": 2.85,
"learning_rate": 0.0008961237043903261,
"loss": 3.2152,
"step": 106000
},
{
"epoch": 2.88,
"learning_rate": 0.0008947192494592848,
"loss": 3.2146,
"step": 107000
},
{
"epoch": 2.9,
"learning_rate": 0.0008933176034381056,
"loss": 3.2083,
"step": 108000
},
{
"epoch": 2.93,
"learning_rate": 0.0008919131485070644,
"loss": 3.2108,
"step": 109000
},
{
"epoch": 2.96,
"learning_rate": 0.0008905086935760231,
"loss": 3.2138,
"step": 110000
},
{
"epoch": 2.98,
"learning_rate": 0.0008891056430999129,
"loss": 3.1988,
"step": 111000
},
{
"epoch": 3.0,
"eval_accuracy": 0.3979050669647656,
"eval_loss": 3.4299747943878174,
"eval_runtime": 149.1667,
"eval_samples_per_second": 388.29,
"eval_steps_per_second": 6.067,
"step": 111603
},
{
"epoch": 3.01,
"learning_rate": 0.0008877011881688717,
"loss": 3.1786,
"step": 112000
},
{
"epoch": 3.04,
"learning_rate": 0.0008862967332378304,
"loss": 3.14,
"step": 113000
},
{
"epoch": 3.06,
"learning_rate": 0.0008848936827617202,
"loss": 3.1408,
"step": 114000
},
{
"epoch": 3.09,
"learning_rate": 0.000883489227830679,
"loss": 3.1491,
"step": 115000
},
{
"epoch": 3.12,
"learning_rate": 0.0008820847728996376,
"loss": 3.1428,
"step": 116000
},
{
"epoch": 3.15,
"learning_rate": 0.0008806803179685963,
"loss": 3.1515,
"step": 117000
},
{
"epoch": 3.17,
"learning_rate": 0.0008792772674924862,
"loss": 3.1396,
"step": 118000
},
{
"epoch": 3.2,
"learning_rate": 0.0008778728125614449,
"loss": 3.1518,
"step": 119000
},
{
"epoch": 3.23,
"learning_rate": 0.0008764683576304036,
"loss": 3.1464,
"step": 120000
},
{
"epoch": 3.25,
"learning_rate": 0.0008750653071542935,
"loss": 3.1562,
"step": 121000
},
{
"epoch": 3.28,
"learning_rate": 0.0008736608522232522,
"loss": 3.1454,
"step": 122000
},
{
"epoch": 3.31,
"learning_rate": 0.0008722563972922109,
"loss": 3.153,
"step": 123000
},
{
"epoch": 3.33,
"learning_rate": 0.0008708547512710317,
"loss": 3.1608,
"step": 124000
},
{
"epoch": 3.36,
"learning_rate": 0.0008694502963399905,
"loss": 3.1545,
"step": 125000
},
{
"epoch": 3.39,
"learning_rate": 0.0008680458414089491,
"loss": 3.1477,
"step": 126000
},
{
"epoch": 3.41,
"learning_rate": 0.0008666427909328389,
"loss": 3.1499,
"step": 127000
},
{
"epoch": 3.44,
"learning_rate": 0.0008652383360017977,
"loss": 3.1532,
"step": 128000
},
{
"epoch": 3.47,
"learning_rate": 0.0008638338810707564,
"loss": 3.1507,
"step": 129000
},
{
"epoch": 3.49,
"learning_rate": 0.0008624308305946462,
"loss": 3.1545,
"step": 130000
},
{
"epoch": 3.52,
"learning_rate": 0.000861026375663605,
"loss": 3.1478,
"step": 131000
},
{
"epoch": 3.55,
"learning_rate": 0.0008596233251874948,
"loss": 3.157,
"step": 132000
},
{
"epoch": 3.58,
"learning_rate": 0.0008582188702564535,
"loss": 3.1439,
"step": 133000
},
{
"epoch": 3.6,
"learning_rate": 0.0008568144153254123,
"loss": 3.1461,
"step": 134000
},
{
"epoch": 3.63,
"learning_rate": 0.0008554099603943709,
"loss": 3.1467,
"step": 135000
},
{
"epoch": 3.66,
"learning_rate": 0.0008540069099182607,
"loss": 3.1486,
"step": 136000
},
{
"epoch": 3.68,
"learning_rate": 0.0008526024549872195,
"loss": 3.1467,
"step": 137000
},
{
"epoch": 3.71,
"learning_rate": 0.0008511994045111093,
"loss": 3.1482,
"step": 138000
},
{
"epoch": 3.74,
"learning_rate": 0.000849794949580068,
"loss": 3.1508,
"step": 139000
},
{
"epoch": 3.76,
"learning_rate": 0.0008483904946490267,
"loss": 3.1574,
"step": 140000
},
{
"epoch": 3.79,
"learning_rate": 0.0008469860397179855,
"loss": 3.1437,
"step": 141000
},
{
"epoch": 3.82,
"learning_rate": 0.0008455815847869442,
"loss": 3.1427,
"step": 142000
},
{
"epoch": 3.84,
"learning_rate": 0.000844178534310834,
"loss": 3.15,
"step": 143000
},
{
"epoch": 3.87,
"learning_rate": 0.0008427740793797927,
"loss": 3.1489,
"step": 144000
},
{
"epoch": 3.9,
"learning_rate": 0.0008413710289036824,
"loss": 3.1449,
"step": 145000
},
{
"epoch": 3.92,
"learning_rate": 0.0008399665739726412,
"loss": 3.1465,
"step": 146000
},
{
"epoch": 3.95,
"learning_rate": 0.0008385621190416,
"loss": 3.1375,
"step": 147000
},
{
"epoch": 3.98,
"learning_rate": 0.0008371576641105587,
"loss": 3.152,
"step": 148000
},
{
"epoch": 4.0,
"eval_accuracy": 0.4049755331384225,
"eval_loss": 3.3773725032806396,
"eval_runtime": 149.0965,
"eval_samples_per_second": 388.473,
"eval_steps_per_second": 6.07,
"step": 148804
},
{
"epoch": 4.01,
"learning_rate": 0.0008357546136344485,
"loss": 3.1283,
"step": 149000
},
{
"epoch": 4.03,
"learning_rate": 0.0008343515631583383,
"loss": 3.0733,
"step": 150000
},
{
"epoch": 4.06,
"learning_rate": 0.000832947108227297,
"loss": 3.0775,
"step": 151000
},
{
"epoch": 4.09,
"learning_rate": 0.0008315426532962557,
"loss": 3.0746,
"step": 152000
},
{
"epoch": 4.11,
"learning_rate": 0.0008301381983652145,
"loss": 3.0835,
"step": 153000
},
{
"epoch": 4.14,
"learning_rate": 0.0008287351478891042,
"loss": 3.0848,
"step": 154000
},
{
"epoch": 4.17,
"learning_rate": 0.0008273306929580629,
"loss": 3.0851,
"step": 155000
},
{
"epoch": 4.19,
"learning_rate": 0.0008259276424819527,
"loss": 3.0789,
"step": 156000
},
{
"epoch": 4.22,
"learning_rate": 0.0008245231875509115,
"loss": 3.0933,
"step": 157000
},
{
"epoch": 4.25,
"learning_rate": 0.0008231201370748013,
"loss": 3.0864,
"step": 158000
},
{
"epoch": 4.27,
"learning_rate": 0.00082171568214376,
"loss": 3.0948,
"step": 159000
},
{
"epoch": 4.3,
"learning_rate": 0.0008203126316676498,
"loss": 3.0914,
"step": 160000
},
{
"epoch": 4.33,
"learning_rate": 0.0008189081767366086,
"loss": 3.0954,
"step": 161000
},
{
"epoch": 4.35,
"learning_rate": 0.0008175037218055673,
"loss": 3.0923,
"step": 162000
},
{
"epoch": 4.38,
"learning_rate": 0.000816099266874526,
"loss": 3.0964,
"step": 163000
},
{
"epoch": 4.41,
"learning_rate": 0.0008146962163984158,
"loss": 3.0941,
"step": 164000
},
{
"epoch": 4.44,
"learning_rate": 0.0008132917614673745,
"loss": 3.0902,
"step": 165000
},
{
"epoch": 4.46,
"learning_rate": 0.0008118887109912643,
"loss": 3.0969,
"step": 166000
},
{
"epoch": 4.49,
"learning_rate": 0.000810484256060223,
"loss": 3.0948,
"step": 167000
},
{
"epoch": 4.52,
"learning_rate": 0.0008090798011291817,
"loss": 3.0874,
"step": 168000
},
{
"epoch": 4.54,
"learning_rate": 0.0008076767506530715,
"loss": 3.0981,
"step": 169000
},
{
"epoch": 4.57,
"learning_rate": 0.0008062722957220303,
"loss": 3.0934,
"step": 170000
},
{
"epoch": 4.6,
"learning_rate": 0.000804867840790989,
"loss": 3.0974,
"step": 171000
},
{
"epoch": 4.62,
"learning_rate": 0.0008034633858599479,
"loss": 3.0942,
"step": 172000
},
{
"epoch": 4.65,
"learning_rate": 0.0008020603353838376,
"loss": 3.0917,
"step": 173000
},
{
"epoch": 4.68,
"learning_rate": 0.0008006558804527962,
"loss": 3.0974,
"step": 174000
},
{
"epoch": 4.7,
"learning_rate": 0.000799252829976686,
"loss": 3.0948,
"step": 175000
},
{
"epoch": 4.73,
"learning_rate": 0.0007978483750456448,
"loss": 3.099,
"step": 176000
},
{
"epoch": 4.76,
"learning_rate": 0.0007964439201146035,
"loss": 3.1001,
"step": 177000
},
{
"epoch": 4.78,
"learning_rate": 0.0007950394651835623,
"loss": 3.0987,
"step": 178000
},
{
"epoch": 4.81,
"learning_rate": 0.0007936364147074521,
"loss": 3.0925,
"step": 179000
},
{
"epoch": 4.84,
"learning_rate": 0.0007922319597764108,
"loss": 3.0969,
"step": 180000
},
{
"epoch": 4.87,
"learning_rate": 0.0007908289093003006,
"loss": 3.0986,
"step": 181000
},
{
"epoch": 4.89,
"learning_rate": 0.0007894244543692594,
"loss": 3.095,
"step": 182000
},
{
"epoch": 4.92,
"learning_rate": 0.000788019999438218,
"loss": 3.0932,
"step": 183000
},
{
"epoch": 4.95,
"learning_rate": 0.0007866169489621078,
"loss": 3.0919,
"step": 184000
},
{
"epoch": 4.97,
"learning_rate": 0.0007852124940310666,
"loss": 3.0978,
"step": 185000
},
{
"epoch": 5.0,
"learning_rate": 0.0007838080391000253,
"loss": 3.0973,
"step": 186000
},
{
"epoch": 5.0,
"eval_accuracy": 0.40901955199174495,
"eval_loss": 3.346210479736328,
"eval_runtime": 147.9489,
"eval_samples_per_second": 391.487,
"eval_steps_per_second": 6.117,
"step": 186005
},
{
"epoch": 5.03,
"learning_rate": 0.000782403584168984,
"loss": 3.021,
"step": 187000
},
{
"epoch": 5.05,
"learning_rate": 0.0007810005336928739,
"loss": 3.0218,
"step": 188000
},
{
"epoch": 5.08,
"learning_rate": 0.0007795960787618326,
"loss": 3.0321,
"step": 189000
},
{
"epoch": 5.11,
"learning_rate": 0.0007781930282857223,
"loss": 3.0359,
"step": 190000
},
{
"epoch": 5.13,
"learning_rate": 0.0007767885733546812,
"loss": 3.0365,
"step": 191000
},
{
"epoch": 5.16,
"learning_rate": 0.0007753855228785709,
"loss": 3.0411,
"step": 192000
},
{
"epoch": 5.19,
"learning_rate": 0.0007739810679475295,
"loss": 3.0414,
"step": 193000
},
{
"epoch": 5.21,
"learning_rate": 0.0007725766130164883,
"loss": 3.0395,
"step": 194000
},
{
"epoch": 5.24,
"learning_rate": 0.0007711721580854471,
"loss": 3.042,
"step": 195000
},
{
"epoch": 5.27,
"learning_rate": 0.0007697691076093368,
"loss": 3.0454,
"step": 196000
},
{
"epoch": 5.3,
"learning_rate": 0.0007683646526782956,
"loss": 3.0444,
"step": 197000
},
{
"epoch": 5.32,
"learning_rate": 0.0007669601977472544,
"loss": 3.0452,
"step": 198000
},
{
"epoch": 5.35,
"learning_rate": 0.0007655557428162131,
"loss": 3.0496,
"step": 199000
},
{
"epoch": 5.38,
"learning_rate": 0.0007641526923401028,
"loss": 3.0454,
"step": 200000
},
{
"epoch": 5.4,
"learning_rate": 0.0007627482374090615,
"loss": 3.048,
"step": 201000
},
{
"epoch": 5.43,
"learning_rate": 0.0007613451869329513,
"loss": 3.0478,
"step": 202000
},
{
"epoch": 5.46,
"learning_rate": 0.00075994073200191,
"loss": 3.0495,
"step": 203000
},
{
"epoch": 5.48,
"learning_rate": 0.0007585376815257999,
"loss": 3.0498,
"step": 204000
},
{
"epoch": 5.51,
"learning_rate": 0.0007571332265947586,
"loss": 3.0483,
"step": 205000
},
{
"epoch": 5.54,
"learning_rate": 0.0007557301761186483,
"loss": 3.0534,
"step": 206000
},
{
"epoch": 5.56,
"learning_rate": 0.0007543257211876072,
"loss": 3.0583,
"step": 207000
},
{
"epoch": 5.59,
"learning_rate": 0.0007529212662565659,
"loss": 3.0512,
"step": 208000
},
{
"epoch": 5.62,
"learning_rate": 0.0007515182157804556,
"loss": 3.0507,
"step": 209000
},
{
"epoch": 5.65,
"learning_rate": 0.0007501137608494145,
"loss": 3.0499,
"step": 210000
},
{
"epoch": 5.67,
"learning_rate": 0.0007487107103733042,
"loss": 3.0572,
"step": 211000
},
{
"epoch": 5.7,
"learning_rate": 0.0007473062554422628,
"loss": 3.0519,
"step": 212000
},
{
"epoch": 5.73,
"learning_rate": 0.0007459018005112215,
"loss": 3.0495,
"step": 213000
},
{
"epoch": 5.75,
"learning_rate": 0.0007444987500351114,
"loss": 3.0612,
"step": 214000
},
{
"epoch": 5.78,
"learning_rate": 0.0007430956995590012,
"loss": 3.0547,
"step": 215000
},
{
"epoch": 5.81,
"learning_rate": 0.0007416912446279599,
"loss": 3.054,
"step": 216000
},
{
"epoch": 5.83,
"learning_rate": 0.0007402867896969187,
"loss": 3.0547,
"step": 217000
},
{
"epoch": 5.86,
"learning_rate": 0.0007388823347658774,
"loss": 3.0588,
"step": 218000
},
{
"epoch": 5.89,
"learning_rate": 0.0007374792842897672,
"loss": 3.0531,
"step": 219000
},
{
"epoch": 5.91,
"learning_rate": 0.0007360762338136569,
"loss": 3.0585,
"step": 220000
},
{
"epoch": 5.94,
"learning_rate": 0.0007346717788826157,
"loss": 3.0522,
"step": 221000
},
{
"epoch": 5.97,
"learning_rate": 0.0007332673239515743,
"loss": 3.0604,
"step": 222000
},
{
"epoch": 5.99,
"learning_rate": 0.0007318628690205332,
"loss": 3.0543,
"step": 223000
},
{
"epoch": 6.0,
"eval_accuracy": 0.4064327960745534,
"eval_loss": 3.3686516284942627,
"eval_runtime": 149.1251,
"eval_samples_per_second": 388.399,
"eval_steps_per_second": 6.069,
"step": 223206
},
{
"epoch": 6.02,
"learning_rate": 0.0007304598185444229,
"loss": 3.0023,
"step": 224000
},
{
"epoch": 6.05,
"learning_rate": 0.0007290553636133816,
"loss": 2.9878,
"step": 225000
},
{
"epoch": 6.08,
"learning_rate": 0.0007276509086823405,
"loss": 2.9862,
"step": 226000
},
{
"epoch": 6.1,
"learning_rate": 0.0007262464537512992,
"loss": 2.993,
"step": 227000
},
{
"epoch": 6.13,
"learning_rate": 0.0007248434032751889,
"loss": 2.9986,
"step": 228000
},
{
"epoch": 6.16,
"learning_rate": 0.0007234389483441476,
"loss": 2.9984,
"step": 229000
},
{
"epoch": 6.18,
"learning_rate": 0.0007220344934131064,
"loss": 2.9975,
"step": 230000
},
{
"epoch": 6.21,
"learning_rate": 0.0007206314429369961,
"loss": 3.0058,
"step": 231000
},
{
"epoch": 6.24,
"learning_rate": 0.0007192269880059548,
"loss": 3.003,
"step": 232000
},
{
"epoch": 6.26,
"learning_rate": 0.0007178239375298447,
"loss": 3.0059,
"step": 233000
},
{
"epoch": 6.29,
"learning_rate": 0.0007164194825988034,
"loss": 3.0085,
"step": 234000
},
{
"epoch": 6.32,
"learning_rate": 0.0007150150276677621,
"loss": 3.0067,
"step": 235000
},
{
"epoch": 6.34,
"learning_rate": 0.000713611977191652,
"loss": 3.0048,
"step": 236000
},
{
"epoch": 6.37,
"learning_rate": 0.0007122075222606107,
"loss": 3.007,
"step": 237000
},
{
"epoch": 6.4,
"learning_rate": 0.0007108044717845005,
"loss": 3.0163,
"step": 238000
},
{
"epoch": 6.42,
"learning_rate": 0.0007094000168534593,
"loss": 3.0104,
"step": 239000
},
{
"epoch": 6.45,
"learning_rate": 0.0007079955619224179,
"loss": 3.0145,
"step": 240000
},
{
"epoch": 6.48,
"learning_rate": 0.0007065939159012388,
"loss": 3.0145,
"step": 241000
},
{
"epoch": 6.51,
"learning_rate": 0.0007051894609701974,
"loss": 3.0127,
"step": 242000
},
{
"epoch": 6.53,
"learning_rate": 0.0007037850060391562,
"loss": 3.0159,
"step": 243000
},
{
"epoch": 6.56,
"learning_rate": 0.0007023805511081149,
"loss": 3.0151,
"step": 244000
},
{
"epoch": 6.59,
"learning_rate": 0.0007009760961770736,
"loss": 3.02,
"step": 245000
},
{
"epoch": 6.61,
"learning_rate": 0.0006995730457009635,
"loss": 3.019,
"step": 246000
},
{
"epoch": 6.64,
"learning_rate": 0.0006981685907699222,
"loss": 3.0139,
"step": 247000
},
{
"epoch": 6.67,
"learning_rate": 0.000696765540293812,
"loss": 3.0183,
"step": 248000
},
{
"epoch": 6.69,
"learning_rate": 0.0006953610853627708,
"loss": 3.0203,
"step": 249000
},
{
"epoch": 6.72,
"learning_rate": 0.0006939566304317294,
"loss": 3.024,
"step": 250000
},
{
"epoch": 6.75,
"learning_rate": 0.0006925535799556192,
"loss": 3.0248,
"step": 251000
},
{
"epoch": 6.77,
"learning_rate": 0.000691149125024578,
"loss": 3.0249,
"step": 252000
},
{
"epoch": 6.8,
"learning_rate": 0.0006897460745484677,
"loss": 3.0181,
"step": 253000
},
{
"epoch": 6.83,
"learning_rate": 0.0006883416196174265,
"loss": 3.0196,
"step": 254000
},
{
"epoch": 6.85,
"learning_rate": 0.0006869371646863853,
"loss": 3.0244,
"step": 255000
},
{
"epoch": 6.88,
"learning_rate": 0.000685534114210275,
"loss": 3.0251,
"step": 256000
},
{
"epoch": 6.91,
"learning_rate": 0.0006841296592792338,
"loss": 3.0219,
"step": 257000
},
{
"epoch": 6.94,
"learning_rate": 0.0006827266088031235,
"loss": 3.0203,
"step": 258000
},
{
"epoch": 6.96,
"learning_rate": 0.0006813221538720823,
"loss": 3.0225,
"step": 259000
},
{
"epoch": 6.99,
"learning_rate": 0.0006799176989410409,
"loss": 3.0161,
"step": 260000
},
{
"epoch": 7.0,
"eval_accuracy": 0.4113539808027173,
"eval_loss": 3.339113473892212,
"eval_runtime": 148.8983,
"eval_samples_per_second": 388.99,
"eval_steps_per_second": 6.078,
"step": 260407
},
{
"epoch": 7.02,
"learning_rate": 0.0006785132440099997,
"loss": 2.9764,
"step": 261000
},
{
"epoch": 7.04,
"learning_rate": 0.0006771101935338895,
"loss": 2.9576,
"step": 262000
},
{
"epoch": 7.07,
"learning_rate": 0.0006757057386028482,
"loss": 2.9529,
"step": 263000
},
{
"epoch": 7.1,
"learning_rate": 0.0006743012836718069,
"loss": 2.9603,
"step": 264000
},
{
"epoch": 7.12,
"learning_rate": 0.0006728968287407658,
"loss": 2.9641,
"step": 265000
},
{
"epoch": 7.15,
"learning_rate": 0.0006714937782646555,
"loss": 2.9675,
"step": 266000
},
{
"epoch": 7.18,
"learning_rate": 0.0006700907277885453,
"loss": 2.9641,
"step": 267000
},
{
"epoch": 7.2,
"learning_rate": 0.0006686862728575041,
"loss": 2.9672,
"step": 268000
},
{
"epoch": 7.23,
"learning_rate": 0.0006672818179264628,
"loss": 2.9774,
"step": 269000
},
{
"epoch": 7.26,
"learning_rate": 0.0006658773629954214,
"loss": 2.9753,
"step": 270000
},
{
"epoch": 7.28,
"learning_rate": 0.0006644743125193112,
"loss": 2.9674,
"step": 271000
},
{
"epoch": 7.31,
"learning_rate": 0.00066306985758827,
"loss": 2.9716,
"step": 272000
},
{
"epoch": 7.34,
"learning_rate": 0.0006616654026572287,
"loss": 2.9788,
"step": 273000
},
{
"epoch": 7.37,
"learning_rate": 0.0006602623521811185,
"loss": 2.9791,
"step": 274000
},
{
"epoch": 7.39,
"learning_rate": 0.0006588593017050083,
"loss": 2.9821,
"step": 275000
},
{
"epoch": 7.42,
"learning_rate": 0.000657454846773967,
"loss": 2.9828,
"step": 276000
},
{
"epoch": 7.45,
"learning_rate": 0.0006560503918429258,
"loss": 2.9802,
"step": 277000
},
{
"epoch": 7.47,
"learning_rate": 0.0006546459369118846,
"loss": 2.9858,
"step": 278000
},
{
"epoch": 7.5,
"learning_rate": 0.0006532428864357743,
"loss": 2.9814,
"step": 279000
},
{
"epoch": 7.53,
"learning_rate": 0.0006518398359596641,
"loss": 2.9865,
"step": 280000
},
{
"epoch": 7.55,
"learning_rate": 0.0006504353810286228,
"loss": 2.9894,
"step": 281000
},
{
"epoch": 7.58,
"learning_rate": 0.0006490309260975815,
"loss": 2.9832,
"step": 282000
},
{
"epoch": 7.61,
"learning_rate": 0.0006476264711665402,
"loss": 2.986,
"step": 283000
},
{
"epoch": 7.63,
"learning_rate": 0.0006462234206904301,
"loss": 2.9924,
"step": 284000
},
{
"epoch": 7.66,
"learning_rate": 0.0006448189657593888,
"loss": 2.9838,
"step": 285000
},
{
"epoch": 7.69,
"learning_rate": 0.0006434145108283475,
"loss": 2.99,
"step": 286000
},
{
"epoch": 7.71,
"learning_rate": 0.0006420100558973064,
"loss": 2.9873,
"step": 287000
},
{
"epoch": 7.74,
"learning_rate": 0.0006406070054211961,
"loss": 2.9866,
"step": 288000
},
{
"epoch": 7.77,
"learning_rate": 0.0006392025504901547,
"loss": 2.9869,
"step": 289000
},
{
"epoch": 7.8,
"learning_rate": 0.0006377995000140445,
"loss": 2.9881,
"step": 290000
},
{
"epoch": 7.82,
"learning_rate": 0.0006363964495379343,
"loss": 2.9825,
"step": 291000
},
{
"epoch": 7.85,
"learning_rate": 0.000634991994606893,
"loss": 2.9951,
"step": 292000
},
{
"epoch": 7.88,
"learning_rate": 0.0006335875396758518,
"loss": 2.9958,
"step": 293000
},
{
"epoch": 7.9,
"learning_rate": 0.0006321830847448106,
"loss": 2.997,
"step": 294000
},
{
"epoch": 7.93,
"learning_rate": 0.0006307800342687003,
"loss": 2.9886,
"step": 295000
},
{
"epoch": 7.96,
"learning_rate": 0.0006293755793376591,
"loss": 3.0001,
"step": 296000
},
{
"epoch": 7.98,
"learning_rate": 0.0006279725288615489,
"loss": 2.9858,
"step": 297000
},
{
"epoch": 8.0,
"eval_accuracy": 0.4104764790291721,
"eval_loss": 3.347707748413086,
"eval_runtime": 148.9691,
"eval_samples_per_second": 388.806,
"eval_steps_per_second": 6.075,
"step": 297608
},
{
"epoch": 8.01,
"learning_rate": 0.0006265680739305076,
"loss": 2.9621,
"step": 298000
},
{
"epoch": 8.04,
"learning_rate": 0.0006251650234543974,
"loss": 2.9243,
"step": 299000
},
{
"epoch": 8.06,
"learning_rate": 0.000623760568523356,
"loss": 2.9237,
"step": 300000
},
{
"epoch": 8.09,
"learning_rate": 0.0006223575180472459,
"loss": 2.9296,
"step": 301000
},
{
"epoch": 8.12,
"learning_rate": 0.0006209530631162046,
"loss": 2.9321,
"step": 302000
},
{
"epoch": 8.14,
"learning_rate": 0.0006195486081851633,
"loss": 2.9411,
"step": 303000
},
{
"epoch": 8.17,
"learning_rate": 0.0006181441532541221,
"loss": 2.9375,
"step": 304000
},
{
"epoch": 8.2,
"learning_rate": 0.0006167396983230808,
"loss": 2.9463,
"step": 305000
},
{
"epoch": 8.23,
"learning_rate": 0.0006153366478469706,
"loss": 2.9421,
"step": 306000
},
{
"epoch": 8.25,
"learning_rate": 0.0006139321929159294,
"loss": 2.9412,
"step": 307000
},
{
"epoch": 8.28,
"learning_rate": 0.0006125291424398192,
"loss": 2.9451,
"step": 308000
},
{
"epoch": 8.31,
"learning_rate": 0.0006111246875087778,
"loss": 2.9487,
"step": 309000
},
{
"epoch": 8.33,
"learning_rate": 0.0006097216370326676,
"loss": 2.9474,
"step": 310000
},
{
"epoch": 8.36,
"learning_rate": 0.0006083171821016264,
"loss": 2.9504,
"step": 311000
},
{
"epoch": 8.39,
"learning_rate": 0.0006069127271705851,
"loss": 2.9526,
"step": 312000
},
{
"epoch": 8.41,
"learning_rate": 0.0006055096766944749,
"loss": 2.948,
"step": 313000
},
{
"epoch": 8.44,
"learning_rate": 0.0006041066262183647,
"loss": 2.954,
"step": 314000
},
{
"epoch": 8.47,
"learning_rate": 0.0006027021712873234,
"loss": 2.9529,
"step": 315000
},
{
"epoch": 8.49,
"learning_rate": 0.0006012977163562821,
"loss": 2.9531,
"step": 316000
},
{
"epoch": 8.52,
"learning_rate": 0.0005998932614252409,
"loss": 2.9572,
"step": 317000
},
{
"epoch": 8.55,
"learning_rate": 0.0005984902109491307,
"loss": 2.9579,
"step": 318000
},
{
"epoch": 8.58,
"learning_rate": 0.0005970857560180893,
"loss": 2.9599,
"step": 319000
},
{
"epoch": 8.6,
"learning_rate": 0.0005956827055419792,
"loss": 2.9537,
"step": 320000
},
{
"epoch": 8.63,
"learning_rate": 0.0005942782506109379,
"loss": 2.9554,
"step": 321000
},
{
"epoch": 8.66,
"learning_rate": 0.0005928737956798966,
"loss": 2.9589,
"step": 322000
},
{
"epoch": 8.68,
"learning_rate": 0.0005914707452037865,
"loss": 2.9607,
"step": 323000
},
{
"epoch": 8.71,
"learning_rate": 0.0005900662902727452,
"loss": 2.958,
"step": 324000
},
{
"epoch": 8.74,
"learning_rate": 0.0005886632397966349,
"loss": 2.9597,
"step": 325000
},
{
"epoch": 8.76,
"learning_rate": 0.0005872587848655937,
"loss": 2.9666,
"step": 326000
},
{
"epoch": 8.79,
"learning_rate": 0.0005858557343894835,
"loss": 2.9572,
"step": 327000
},
{
"epoch": 8.82,
"learning_rate": 0.0005844512794584422,
"loss": 2.9654,
"step": 328000
},
{
"epoch": 8.84,
"learning_rate": 0.000583048228982332,
"loss": 2.9631,
"step": 329000
},
{
"epoch": 8.87,
"learning_rate": 0.0005816437740512907,
"loss": 2.9666,
"step": 330000
},
{
"epoch": 8.9,
"learning_rate": 0.0005802393191202494,
"loss": 2.9719,
"step": 331000
},
{
"epoch": 8.92,
"learning_rate": 0.0005788348641892081,
"loss": 2.9649,
"step": 332000
},
{
"epoch": 8.95,
"learning_rate": 0.0005774332181680289,
"loss": 2.9659,
"step": 333000
},
{
"epoch": 8.98,
"learning_rate": 0.0005760287632369877,
"loss": 2.9718,
"step": 334000
},
{
"epoch": 9.0,
"eval_accuracy": 0.41122177107294106,
"eval_loss": 3.343648672103882,
"eval_runtime": 147.5276,
"eval_samples_per_second": 392.604,
"eval_steps_per_second": 6.134,
"step": 334809
},
{
"epoch": 9.01,
"learning_rate": 0.0005746243083059465,
"loss": 2.9542,
"step": 335000
},
{
"epoch": 9.03,
"learning_rate": 0.0005732212578298362,
"loss": 2.891,
"step": 336000
},
{
"epoch": 9.06,
"learning_rate": 0.000571816802898795,
"loss": 2.9009,
"step": 337000
},
{
"epoch": 9.09,
"learning_rate": 0.0005704123479677537,
"loss": 2.8991,
"step": 338000
},
{
"epoch": 9.11,
"learning_rate": 0.0005690078930367125,
"loss": 2.9084,
"step": 339000
},
{
"epoch": 9.14,
"learning_rate": 0.0005676048425606023,
"loss": 2.91,
"step": 340000
},
{
"epoch": 9.17,
"learning_rate": 0.0005662003876295609,
"loss": 2.9123,
"step": 341000
},
{
"epoch": 9.19,
"learning_rate": 0.0005647973371534507,
"loss": 2.9128,
"step": 342000
},
{
"epoch": 9.22,
"learning_rate": 0.0005633942866773404,
"loss": 2.9138,
"step": 343000
},
{
"epoch": 9.25,
"learning_rate": 0.0005619898317462993,
"loss": 2.9208,
"step": 344000
},
{
"epoch": 9.27,
"learning_rate": 0.000560585376815258,
"loss": 2.9197,
"step": 345000
},
{
"epoch": 9.3,
"learning_rate": 0.0005591809218842168,
"loss": 2.919,
"step": 346000
},
{
"epoch": 9.33,
"learning_rate": 0.0005577778714081066,
"loss": 2.9241,
"step": 347000
},
{
"epoch": 9.35,
"learning_rate": 0.0005563734164770653,
"loss": 2.9236,
"step": 348000
},
{
"epoch": 9.38,
"learning_rate": 0.000554970366000955,
"loss": 2.9224,
"step": 349000
},
{
"epoch": 9.41,
"learning_rate": 0.0005535673155248449,
"loss": 2.9247,
"step": 350000
},
{
"epoch": 9.44,
"learning_rate": 0.0005521628605938036,
"loss": 2.9262,
"step": 351000
},
{
"epoch": 9.46,
"learning_rate": 0.0005507584056627622,
"loss": 2.9309,
"step": 352000
},
{
"epoch": 9.49,
"learning_rate": 0.000549353950731721,
"loss": 2.9299,
"step": 353000
},
{
"epoch": 9.52,
"learning_rate": 0.0005479509002556108,
"loss": 2.9349,
"step": 354000
},
{
"epoch": 9.54,
"learning_rate": 0.0005465464453245695,
"loss": 2.9367,
"step": 355000
},
{
"epoch": 9.57,
"learning_rate": 0.0005451433948484594,
"loss": 2.933,
"step": 356000
},
{
"epoch": 9.6,
"learning_rate": 0.0005437389399174181,
"loss": 2.9336,
"step": 357000
},
{
"epoch": 9.62,
"learning_rate": 0.0005423344849863768,
"loss": 2.9349,
"step": 358000
},
{
"epoch": 9.65,
"learning_rate": 0.0005409314345102666,
"loss": 2.9415,
"step": 359000
},
{
"epoch": 9.68,
"learning_rate": 0.0005395269795792254,
"loss": 2.9328,
"step": 360000
},
{
"epoch": 9.7,
"learning_rate": 0.0005381239291031151,
"loss": 2.9346,
"step": 361000
},
{
"epoch": 9.73,
"learning_rate": 0.0005367194741720737,
"loss": 2.9391,
"step": 362000
},
{
"epoch": 9.76,
"learning_rate": 0.0005353150192410326,
"loss": 2.9393,
"step": 363000
},
{
"epoch": 9.78,
"learning_rate": 0.0005339119687649223,
"loss": 2.9419,
"step": 364000
},
{
"epoch": 9.81,
"learning_rate": 0.000532507513833881,
"loss": 2.9377,
"step": 365000
},
{
"epoch": 9.84,
"learning_rate": 0.0005311058678127018,
"loss": 2.9373,
"step": 366000
},
{
"epoch": 9.87,
"learning_rate": 0.0005297014128816607,
"loss": 2.9416,
"step": 367000
},
{
"epoch": 9.89,
"learning_rate": 0.0005282969579506194,
"loss": 2.9433,
"step": 368000
},
{
"epoch": 9.92,
"learning_rate": 0.0005268925030195782,
"loss": 2.9406,
"step": 369000
},
{
"epoch": 9.95,
"learning_rate": 0.000525489452543468,
"loss": 2.9419,
"step": 370000
},
{
"epoch": 9.97,
"learning_rate": 0.0005240849976124267,
"loss": 2.9411,
"step": 371000
},
{
"epoch": 10.0,
"learning_rate": 0.0005226805426813853,
"loss": 2.9399,
"step": 372000
},
{
"epoch": 10.0,
"eval_accuracy": 0.41210269901326396,
"eval_loss": 3.345149278640747,
"eval_runtime": 148.3396,
"eval_samples_per_second": 390.455,
"eval_steps_per_second": 6.101,
"step": 372010
},
{
"epoch": 10.03,
"learning_rate": 0.0005212774922052751,
"loss": 2.8712,
"step": 373000
},
{
"epoch": 10.05,
"learning_rate": 0.0005198730372742338,
"loss": 2.8767,
"step": 374000
},
{
"epoch": 10.08,
"learning_rate": 0.0005184699867981236,
"loss": 2.8784,
"step": 375000
},
{
"epoch": 10.11,
"learning_rate": 0.0005170655318670824,
"loss": 2.8827,
"step": 376000
},
{
"epoch": 10.13,
"learning_rate": 0.0005156624813909722,
"loss": 2.8869,
"step": 377000
},
{
"epoch": 10.16,
"learning_rate": 0.0005142594309148619,
"loss": 2.8899,
"step": 378000
},
{
"epoch": 10.19,
"learning_rate": 0.0005128549759838207,
"loss": 2.8921,
"step": 379000
},
{
"epoch": 10.21,
"learning_rate": 0.0005114505210527795,
"loss": 2.8914,
"step": 380000
},
{
"epoch": 10.24,
"learning_rate": 0.0005100460661217382,
"loss": 2.8919,
"step": 381000
},
{
"epoch": 10.27,
"learning_rate": 0.0005086430156456279,
"loss": 2.9003,
"step": 382000
},
{
"epoch": 10.3,
"learning_rate": 0.0005072385607145867,
"loss": 2.8972,
"step": 383000
},
{
"epoch": 10.32,
"learning_rate": 0.0005058355102384764,
"loss": 2.8954,
"step": 384000
},
{
"epoch": 10.35,
"learning_rate": 0.0005044310553074351,
"loss": 2.9014,
"step": 385000
},
{
"epoch": 10.38,
"learning_rate": 0.000503026600376394,
"loss": 2.903,
"step": 386000
},
{
"epoch": 10.4,
"learning_rate": 0.0005016235499002837,
"loss": 2.9039,
"step": 387000
},
{
"epoch": 10.43,
"learning_rate": 0.0005002190949692424,
"loss": 2.8998,
"step": 388000
},
{
"epoch": 10.46,
"learning_rate": 0.0004988160444931322,
"loss": 2.9079,
"step": 389000
},
{
"epoch": 10.48,
"learning_rate": 0.000497412994017022,
"loss": 2.9058,
"step": 390000
},
{
"epoch": 10.51,
"learning_rate": 0.0004960085390859808,
"loss": 2.9025,
"step": 391000
},
{
"epoch": 10.54,
"learning_rate": 0.0004946040841549395,
"loss": 2.9084,
"step": 392000
},
{
"epoch": 10.56,
"learning_rate": 0.0004931996292238982,
"loss": 2.9051,
"step": 393000
},
{
"epoch": 10.59,
"learning_rate": 0.0004917965787477879,
"loss": 2.9119,
"step": 394000
},
{
"epoch": 10.62,
"learning_rate": 0.0004903921238167468,
"loss": 2.909,
"step": 395000
},
{
"epoch": 10.64,
"learning_rate": 0.0004889876688857055,
"loss": 2.9127,
"step": 396000
},
{
"epoch": 10.67,
"learning_rate": 0.00048758461840959523,
"loss": 2.9129,
"step": 397000
},
{
"epoch": 10.7,
"learning_rate": 0.000486180163478554,
"loss": 2.9112,
"step": 398000
},
{
"epoch": 10.73,
"learning_rate": 0.00048477711300244376,
"loss": 2.9202,
"step": 399000
},
{
"epoch": 10.75,
"learning_rate": 0.0004833726580714025,
"loss": 2.9161,
"step": 400000
},
{
"epoch": 10.78,
"learning_rate": 0.00048196820314036124,
"loss": 2.9211,
"step": 401000
},
{
"epoch": 10.81,
"learning_rate": 0.00048056374820931995,
"loss": 2.9192,
"step": 402000
},
{
"epoch": 10.83,
"learning_rate": 0.00047916069773320976,
"loss": 2.9145,
"step": 403000
},
{
"epoch": 10.86,
"learning_rate": 0.00047775624280216853,
"loss": 2.9171,
"step": 404000
},
{
"epoch": 10.89,
"learning_rate": 0.0004763517878711272,
"loss": 2.9132,
"step": 405000
},
{
"epoch": 10.91,
"learning_rate": 0.00047495014184994805,
"loss": 2.9178,
"step": 406000
},
{
"epoch": 10.94,
"learning_rate": 0.00047354568691890676,
"loss": 2.9177,
"step": 407000
},
{
"epoch": 10.97,
"learning_rate": 0.00047214123198786553,
"loss": 2.9154,
"step": 408000
},
{
"epoch": 10.99,
"learning_rate": 0.0004707367770568243,
"loss": 2.9207,
"step": 409000
},
{
"epoch": 11.0,
"eval_accuracy": 0.4129671679171056,
"eval_loss": 3.358556032180786,
"eval_runtime": 148.219,
"eval_samples_per_second": 390.773,
"eval_steps_per_second": 6.106,
"step": 409211
},
{
"epoch": 11.02,
"learning_rate": 0.00046933232212578296,
"loss": 2.8685,
"step": 410000
},
{
"epoch": 11.05,
"learning_rate": 0.00046792927164967277,
"loss": 2.8573,
"step": 411000
},
{
"epoch": 11.07,
"learning_rate": 0.0004665248167186315,
"loss": 2.8612,
"step": 412000
},
{
"epoch": 11.1,
"learning_rate": 0.0004651217662425213,
"loss": 2.862,
"step": 413000
},
{
"epoch": 11.13,
"learning_rate": 0.00046371871576641105,
"loss": 2.8641,
"step": 414000
},
{
"epoch": 11.16,
"learning_rate": 0.0004623142608353698,
"loss": 2.8669,
"step": 415000
},
{
"epoch": 11.18,
"learning_rate": 0.00046090980590432853,
"loss": 2.8682,
"step": 416000
},
{
"epoch": 11.21,
"learning_rate": 0.0004595053509732873,
"loss": 2.8753,
"step": 417000
},
{
"epoch": 11.24,
"learning_rate": 0.00045810230049717705,
"loss": 2.8688,
"step": 418000
},
{
"epoch": 11.26,
"learning_rate": 0.0004566978455661358,
"loss": 2.8752,
"step": 419000
},
{
"epoch": 11.29,
"learning_rate": 0.0004552947950900256,
"loss": 2.8755,
"step": 420000
},
{
"epoch": 11.32,
"learning_rate": 0.0004538903401589843,
"loss": 2.8753,
"step": 421000
},
{
"epoch": 11.34,
"learning_rate": 0.00045248728968287405,
"loss": 2.8777,
"step": 422000
},
{
"epoch": 11.37,
"learning_rate": 0.0004510828347518328,
"loss": 2.8822,
"step": 423000
},
{
"epoch": 11.4,
"learning_rate": 0.00044968118873065367,
"loss": 2.8774,
"step": 424000
},
{
"epoch": 11.42,
"learning_rate": 0.0004482767337996124,
"loss": 2.8873,
"step": 425000
},
{
"epoch": 11.45,
"learning_rate": 0.0004468722788685711,
"loss": 2.8866,
"step": 426000
},
{
"epoch": 11.48,
"learning_rate": 0.00044546782393752987,
"loss": 2.8853,
"step": 427000
},
{
"epoch": 11.51,
"learning_rate": 0.0004440647734614197,
"loss": 2.8821,
"step": 428000
},
{
"epoch": 11.53,
"learning_rate": 0.00044266031853037834,
"loss": 2.8884,
"step": 429000
},
{
"epoch": 11.56,
"learning_rate": 0.0004412558635993371,
"loss": 2.8865,
"step": 430000
},
{
"epoch": 11.59,
"learning_rate": 0.0004398528131232269,
"loss": 2.8887,
"step": 431000
},
{
"epoch": 11.61,
"learning_rate": 0.00043844835819218563,
"loss": 2.8915,
"step": 432000
},
{
"epoch": 11.64,
"learning_rate": 0.00043704530771607544,
"loss": 2.8884,
"step": 433000
},
{
"epoch": 11.67,
"learning_rate": 0.0004356408527850341,
"loss": 2.8851,
"step": 434000
},
{
"epoch": 11.69,
"learning_rate": 0.0004342378023089239,
"loss": 2.8882,
"step": 435000
},
{
"epoch": 11.72,
"learning_rate": 0.0004328333473778827,
"loss": 2.8893,
"step": 436000
},
{
"epoch": 11.75,
"learning_rate": 0.00043143029690177244,
"loss": 2.8898,
"step": 437000
},
{
"epoch": 11.77,
"learning_rate": 0.0004300258419707312,
"loss": 2.8875,
"step": 438000
},
{
"epoch": 11.8,
"learning_rate": 0.0004286213870396899,
"loss": 2.8952,
"step": 439000
},
{
"epoch": 11.83,
"learning_rate": 0.0004272183365635797,
"loss": 2.8938,
"step": 440000
},
{
"epoch": 11.85,
"learning_rate": 0.00042581388163253844,
"loss": 2.8965,
"step": 441000
},
{
"epoch": 11.88,
"learning_rate": 0.00042440942670149716,
"loss": 2.8929,
"step": 442000
},
{
"epoch": 11.91,
"learning_rate": 0.00042300637622538697,
"loss": 2.9017,
"step": 443000
},
{
"epoch": 11.94,
"learning_rate": 0.0004216019212943457,
"loss": 2.895,
"step": 444000
},
{
"epoch": 11.96,
"learning_rate": 0.00042019887081823544,
"loss": 2.892,
"step": 445000
},
{
"epoch": 11.99,
"learning_rate": 0.0004187944158871942,
"loss": 2.8987,
"step": 446000
},
{
"epoch": 12.0,
"eval_accuracy": 0.4122773663391878,
"eval_loss": 3.355417013168335,
"eval_runtime": 147.56,
"eval_samples_per_second": 392.518,
"eval_steps_per_second": 6.133,
"step": 446412
},
{
"epoch": 12.02,
"learning_rate": 0.000417389960956153,
"loss": 2.8605,
"step": 447000
},
{
"epoch": 12.04,
"learning_rate": 0.0004159883149349737,
"loss": 2.8334,
"step": 448000
},
{
"epoch": 12.07,
"learning_rate": 0.0004145838600039325,
"loss": 2.8413,
"step": 449000
},
{
"epoch": 12.1,
"learning_rate": 0.0004131794050728912,
"loss": 2.8457,
"step": 450000
},
{
"epoch": 12.12,
"learning_rate": 0.00041177495014184997,
"loss": 2.8426,
"step": 451000
},
{
"epoch": 12.15,
"learning_rate": 0.0004103718996657397,
"loss": 2.8498,
"step": 452000
},
{
"epoch": 12.18,
"learning_rate": 0.0004089674447346985,
"loss": 2.8513,
"step": 453000
},
{
"epoch": 12.2,
"learning_rate": 0.0004075629898036572,
"loss": 2.8469,
"step": 454000
},
{
"epoch": 12.23,
"learning_rate": 0.0004061585348726159,
"loss": 2.8472,
"step": 455000
},
{
"epoch": 12.26,
"learning_rate": 0.00040475548439650573,
"loss": 2.8486,
"step": 456000
},
{
"epoch": 12.28,
"learning_rate": 0.0004033510294654645,
"loss": 2.8561,
"step": 457000
},
{
"epoch": 12.31,
"learning_rate": 0.00040194797898935426,
"loss": 2.8519,
"step": 458000
},
{
"epoch": 12.34,
"learning_rate": 0.00040054352405831297,
"loss": 2.8522,
"step": 459000
},
{
"epoch": 12.37,
"learning_rate": 0.00039913906912727174,
"loss": 2.8585,
"step": 460000
},
{
"epoch": 12.39,
"learning_rate": 0.00039773461419623045,
"loss": 2.8596,
"step": 461000
},
{
"epoch": 12.42,
"learning_rate": 0.00039633156372012026,
"loss": 2.8608,
"step": 462000
},
{
"epoch": 12.45,
"learning_rate": 0.00039492851324401,
"loss": 2.8572,
"step": 463000
},
{
"epoch": 12.47,
"learning_rate": 0.00039352405831296873,
"loss": 2.8622,
"step": 464000
},
{
"epoch": 12.5,
"learning_rate": 0.0003921196033819275,
"loss": 2.8632,
"step": 465000
},
{
"epoch": 12.53,
"learning_rate": 0.00039071655290581726,
"loss": 2.8635,
"step": 466000
},
{
"epoch": 12.55,
"learning_rate": 0.000389312097974776,
"loss": 2.8693,
"step": 467000
},
{
"epoch": 12.58,
"learning_rate": 0.00038790764304373474,
"loss": 2.8685,
"step": 468000
},
{
"epoch": 12.61,
"learning_rate": 0.00038650318811269345,
"loss": 2.863,
"step": 469000
},
{
"epoch": 12.63,
"learning_rate": 0.00038510013763658327,
"loss": 2.8722,
"step": 470000
},
{
"epoch": 12.66,
"learning_rate": 0.000383695682705542,
"loss": 2.8671,
"step": 471000
},
{
"epoch": 12.69,
"learning_rate": 0.0003822912277745007,
"loss": 2.8703,
"step": 472000
},
{
"epoch": 12.71,
"learning_rate": 0.0003808881772983905,
"loss": 2.8754,
"step": 473000
},
{
"epoch": 12.74,
"learning_rate": 0.0003794837223673492,
"loss": 2.8687,
"step": 474000
},
{
"epoch": 12.77,
"learning_rate": 0.00037808067189123903,
"loss": 2.8734,
"step": 475000
},
{
"epoch": 12.8,
"learning_rate": 0.0003766762169601978,
"loss": 2.8748,
"step": 476000
},
{
"epoch": 12.82,
"learning_rate": 0.00037527316648408755,
"loss": 2.8773,
"step": 477000
},
{
"epoch": 12.85,
"learning_rate": 0.00037386871155304627,
"loss": 2.8739,
"step": 478000
},
{
"epoch": 12.88,
"learning_rate": 0.000372465661076936,
"loss": 2.8759,
"step": 479000
},
{
"epoch": 12.9,
"learning_rate": 0.0003710612061458948,
"loss": 2.8762,
"step": 480000
},
{
"epoch": 12.93,
"learning_rate": 0.00036965675121485356,
"loss": 2.8766,
"step": 481000
},
{
"epoch": 12.96,
"learning_rate": 0.0003682522962838122,
"loss": 2.8768,
"step": 482000
},
{
"epoch": 12.98,
"learning_rate": 0.000366847841352771,
"loss": 2.8779,
"step": 483000
},
{
"epoch": 13.0,
"eval_accuracy": 0.41304986617795647,
"eval_loss": 3.3615658283233643,
"eval_runtime": 148.2115,
"eval_samples_per_second": 390.793,
"eval_steps_per_second": 6.106,
"step": 483613
},
{
"epoch": 13.01,
"learning_rate": 0.00036544479087666074,
"loss": 2.8498,
"step": 484000
},
{
"epoch": 13.04,
"learning_rate": 0.00036404174040055055,
"loss": 2.8192,
"step": 485000
},
{
"epoch": 13.06,
"learning_rate": 0.0003626372854695093,
"loss": 2.8181,
"step": 486000
},
{
"epoch": 13.09,
"learning_rate": 0.000361232830538468,
"loss": 2.8195,
"step": 487000
},
{
"epoch": 13.12,
"learning_rate": 0.00035982837560742675,
"loss": 2.8275,
"step": 488000
},
{
"epoch": 13.14,
"learning_rate": 0.0003584239206763855,
"loss": 2.8255,
"step": 489000
},
{
"epoch": 13.17,
"learning_rate": 0.0003570208702002753,
"loss": 2.8286,
"step": 490000
},
{
"epoch": 13.2,
"learning_rate": 0.000355616415269234,
"loss": 2.8343,
"step": 491000
},
{
"epoch": 13.23,
"learning_rate": 0.00035421336479312375,
"loss": 2.8334,
"step": 492000
},
{
"epoch": 13.25,
"learning_rate": 0.0003528089098620825,
"loss": 2.8299,
"step": 493000
},
{
"epoch": 13.28,
"learning_rate": 0.0003514044549310413,
"loss": 2.8365,
"step": 494000
},
{
"epoch": 13.31,
"learning_rate": 0.00035,
"loss": 2.8353,
"step": 495000
},
{
"epoch": 13.33,
"learning_rate": 0.00034859694952388975,
"loss": 2.8377,
"step": 496000
},
{
"epoch": 13.36,
"learning_rate": 0.0003471924945928485,
"loss": 2.8407,
"step": 497000
},
{
"epoch": 13.39,
"learning_rate": 0.0003457880396618073,
"loss": 2.8428,
"step": 498000
},
{
"epoch": 13.41,
"learning_rate": 0.00034438498918569705,
"loss": 2.8405,
"step": 499000
},
{
"epoch": 13.44,
"learning_rate": 0.00034298053425465576,
"loss": 2.8419,
"step": 500000
},
{
"epoch": 13.47,
"learning_rate": 0.0003415774837785455,
"loss": 2.8469,
"step": 501000
},
{
"epoch": 13.49,
"learning_rate": 0.0003401730288475043,
"loss": 2.8488,
"step": 502000
},
{
"epoch": 13.52,
"learning_rate": 0.00033876997837139404,
"loss": 2.8434,
"step": 503000
},
{
"epoch": 13.55,
"learning_rate": 0.0003373655234403528,
"loss": 2.8472,
"step": 504000
},
{
"epoch": 13.57,
"learning_rate": 0.0003359610685093115,
"loss": 2.8471,
"step": 505000
},
{
"epoch": 13.6,
"learning_rate": 0.0003345566135782703,
"loss": 2.8512,
"step": 506000
},
{
"epoch": 13.63,
"learning_rate": 0.00033315496755709114,
"loss": 2.8477,
"step": 507000
},
{
"epoch": 13.66,
"learning_rate": 0.0003317505126260498,
"loss": 2.8482,
"step": 508000
},
{
"epoch": 13.68,
"learning_rate": 0.00033034605769500857,
"loss": 2.8487,
"step": 509000
},
{
"epoch": 13.71,
"learning_rate": 0.0003289416027639673,
"loss": 2.8496,
"step": 510000
},
{
"epoch": 13.74,
"learning_rate": 0.0003275385522878571,
"loss": 2.8543,
"step": 511000
},
{
"epoch": 13.76,
"learning_rate": 0.0003261355018117469,
"loss": 2.851,
"step": 512000
},
{
"epoch": 13.79,
"learning_rate": 0.00032473104688070557,
"loss": 2.853,
"step": 513000
},
{
"epoch": 13.82,
"learning_rate": 0.0003233279964045954,
"loss": 2.8558,
"step": 514000
},
{
"epoch": 13.84,
"learning_rate": 0.00032192354147355415,
"loss": 2.8448,
"step": 515000
},
{
"epoch": 13.87,
"learning_rate": 0.00032051908654251286,
"loss": 2.8517,
"step": 516000
},
{
"epoch": 13.9,
"learning_rate": 0.0003191146316114716,
"loss": 2.8543,
"step": 517000
},
{
"epoch": 13.92,
"learning_rate": 0.0003177115811353614,
"loss": 2.8519,
"step": 518000
},
{
"epoch": 13.95,
"learning_rate": 0.0003163071262043201,
"loss": 2.8479,
"step": 519000
},
{
"epoch": 13.98,
"learning_rate": 0.00031490267127327887,
"loss": 2.8519,
"step": 520000
},
{
"epoch": 14.0,
"eval_accuracy": 0.41285833673710687,
"eval_loss": 3.369619846343994,
"eval_runtime": 147.964,
"eval_samples_per_second": 391.446,
"eval_steps_per_second": 6.116,
"step": 520814
},
{
"epoch": 14.0,
"learning_rate": 0.0003134982163422376,
"loss": 2.8511,
"step": 521000
},
{
"epoch": 14.03,
"learning_rate": 0.00031209376141119635,
"loss": 2.802,
"step": 522000
},
{
"epoch": 14.06,
"learning_rate": 0.0003106907109350861,
"loss": 2.7993,
"step": 523000
},
{
"epoch": 14.09,
"learning_rate": 0.0003092862560040449,
"loss": 2.8106,
"step": 524000
},
{
"epoch": 14.11,
"learning_rate": 0.0003078818010730036,
"loss": 2.8026,
"step": 525000
},
{
"epoch": 14.14,
"learning_rate": 0.00030647875059689334,
"loss": 2.8074,
"step": 526000
},
{
"epoch": 14.17,
"learning_rate": 0.0003050742956658521,
"loss": 2.8076,
"step": 527000
},
{
"epoch": 14.19,
"learning_rate": 0.00030367124518974187,
"loss": 2.8154,
"step": 528000
},
{
"epoch": 14.22,
"learning_rate": 0.00030226679025870064,
"loss": 2.8121,
"step": 529000
},
{
"epoch": 14.25,
"learning_rate": 0.0003008637397825904,
"loss": 2.816,
"step": 530000
},
{
"epoch": 14.27,
"learning_rate": 0.0002994592848515491,
"loss": 2.8158,
"step": 531000
},
{
"epoch": 14.3,
"learning_rate": 0.00029805623437543886,
"loss": 2.8182,
"step": 532000
},
{
"epoch": 14.33,
"learning_rate": 0.0002966531838993287,
"loss": 2.8169,
"step": 533000
},
{
"epoch": 14.35,
"learning_rate": 0.00029524872896828744,
"loss": 2.8197,
"step": 534000
},
{
"epoch": 14.38,
"learning_rate": 0.00029384427403724616,
"loss": 2.818,
"step": 535000
},
{
"epoch": 14.41,
"learning_rate": 0.00029243981910620487,
"loss": 2.821,
"step": 536000
},
{
"epoch": 14.44,
"learning_rate": 0.0002910367686300946,
"loss": 2.8227,
"step": 537000
},
{
"epoch": 14.46,
"learning_rate": 0.0002896323136990534,
"loss": 2.8222,
"step": 538000
},
{
"epoch": 14.49,
"learning_rate": 0.0002882292632229432,
"loss": 2.8308,
"step": 539000
},
{
"epoch": 14.52,
"learning_rate": 0.0002868248082919019,
"loss": 2.8315,
"step": 540000
},
{
"epoch": 14.54,
"learning_rate": 0.00028542035336086063,
"loss": 2.8244,
"step": 541000
},
{
"epoch": 14.57,
"learning_rate": 0.0002840173028847504,
"loss": 2.8245,
"step": 542000
},
{
"epoch": 14.6,
"learning_rate": 0.00028261284795370916,
"loss": 2.8289,
"step": 543000
},
{
"epoch": 14.62,
"learning_rate": 0.00028120979747759897,
"loss": 2.8252,
"step": 544000
},
{
"epoch": 14.65,
"learning_rate": 0.0002798053425465577,
"loss": 2.8265,
"step": 545000
},
{
"epoch": 14.68,
"learning_rate": 0.0002784022920704475,
"loss": 2.8309,
"step": 546000
},
{
"epoch": 14.7,
"learning_rate": 0.00027699924159433725,
"loss": 2.8286,
"step": 547000
},
{
"epoch": 14.73,
"learning_rate": 0.00027559478666329596,
"loss": 2.8289,
"step": 548000
},
{
"epoch": 14.76,
"learning_rate": 0.00027419033173225473,
"loss": 2.8297,
"step": 549000
},
{
"epoch": 14.78,
"learning_rate": 0.00027278587680121345,
"loss": 2.8295,
"step": 550000
},
{
"epoch": 14.81,
"learning_rate": 0.00027138282632510326,
"loss": 2.8369,
"step": 551000
},
{
"epoch": 14.84,
"learning_rate": 0.000269979775848993,
"loss": 2.8354,
"step": 552000
},
{
"epoch": 14.87,
"learning_rate": 0.00026857532091795173,
"loss": 2.8305,
"step": 553000
},
{
"epoch": 14.89,
"learning_rate": 0.0002671708659869105,
"loss": 2.8355,
"step": 554000
},
{
"epoch": 14.92,
"learning_rate": 0.00026576641105586926,
"loss": 2.8353,
"step": 555000
},
{
"epoch": 14.95,
"learning_rate": 0.000264363360579759,
"loss": 2.8427,
"step": 556000
},
{
"epoch": 14.97,
"learning_rate": 0.00026295890564871773,
"loss": 2.8361,
"step": 557000
},
{
"epoch": 15.0,
"learning_rate": 0.00026155445071767645,
"loss": 2.8395,
"step": 558000
},
{
"epoch": 15.0,
"eval_accuracy": 0.41281084066040374,
"eval_loss": 3.3729231357574463,
"eval_runtime": 147.9664,
"eval_samples_per_second": 391.44,
"eval_steps_per_second": 6.116,
"step": 558015
},
{
"epoch": 15.03,
"learning_rate": 0.00026015140024156626,
"loss": 2.7847,
"step": 559000
},
{
"epoch": 15.05,
"learning_rate": 0.000258746945310525,
"loss": 2.7891,
"step": 560000
},
{
"epoch": 15.08,
"learning_rate": 0.0002573424903794837,
"loss": 2.788,
"step": 561000
},
{
"epoch": 15.11,
"learning_rate": 0.0002559394399033735,
"loss": 2.7885,
"step": 562000
},
{
"epoch": 15.13,
"learning_rate": 0.00025453498497233227,
"loss": 2.7939,
"step": 563000
},
{
"epoch": 15.16,
"learning_rate": 0.000253133338951153,
"loss": 2.7933,
"step": 564000
},
{
"epoch": 15.19,
"learning_rate": 0.0002517288840201118,
"loss": 2.7946,
"step": 565000
},
{
"epoch": 15.21,
"learning_rate": 0.00025032442908907055,
"loss": 2.7977,
"step": 566000
},
{
"epoch": 15.24,
"learning_rate": 0.0002489213786129603,
"loss": 2.7946,
"step": 567000
},
{
"epoch": 15.27,
"learning_rate": 0.00024751692368191907,
"loss": 2.7985,
"step": 568000
},
{
"epoch": 15.3,
"learning_rate": 0.0002461124687508778,
"loss": 2.7984,
"step": 569000
},
{
"epoch": 15.32,
"learning_rate": 0.00024470801381983655,
"loss": 2.7972,
"step": 570000
},
{
"epoch": 15.35,
"learning_rate": 0.0002433049633437263,
"loss": 2.7978,
"step": 571000
},
{
"epoch": 15.38,
"learning_rate": 0.00024190050841268505,
"loss": 2.8039,
"step": 572000
},
{
"epoch": 15.4,
"learning_rate": 0.00024049605348164377,
"loss": 2.8002,
"step": 573000
},
{
"epoch": 15.43,
"learning_rate": 0.00023909300300553355,
"loss": 2.8051,
"step": 574000
},
{
"epoch": 15.46,
"learning_rate": 0.00023768854807449232,
"loss": 2.8069,
"step": 575000
},
{
"epoch": 15.48,
"learning_rate": 0.00023628409314345103,
"loss": 2.8039,
"step": 576000
},
{
"epoch": 15.51,
"learning_rate": 0.00023488104266734081,
"loss": 2.8068,
"step": 577000
},
{
"epoch": 15.54,
"learning_rate": 0.00023347658773629953,
"loss": 2.8093,
"step": 578000
},
{
"epoch": 15.56,
"learning_rate": 0.0002320735372601893,
"loss": 2.8067,
"step": 579000
},
{
"epoch": 15.59,
"learning_rate": 0.00023066908232914808,
"loss": 2.8073,
"step": 580000
},
{
"epoch": 15.62,
"learning_rate": 0.0002292646273981068,
"loss": 2.8129,
"step": 581000
},
{
"epoch": 15.64,
"learning_rate": 0.00022786017246706554,
"loss": 2.8102,
"step": 582000
},
{
"epoch": 15.67,
"learning_rate": 0.00022645712199095532,
"loss": 2.812,
"step": 583000
},
{
"epoch": 15.7,
"learning_rate": 0.00022505266705991406,
"loss": 2.8093,
"step": 584000
},
{
"epoch": 15.73,
"learning_rate": 0.00022364961658380384,
"loss": 2.8139,
"step": 585000
},
{
"epoch": 15.75,
"learning_rate": 0.00022224516165276256,
"loss": 2.8115,
"step": 586000
},
{
"epoch": 15.78,
"learning_rate": 0.0002208407067217213,
"loss": 2.8157,
"step": 587000
},
{
"epoch": 15.81,
"learning_rate": 0.00021943765624561108,
"loss": 2.8138,
"step": 588000
},
{
"epoch": 15.83,
"learning_rate": 0.00021803320131456982,
"loss": 2.8146,
"step": 589000
},
{
"epoch": 15.86,
"learning_rate": 0.0002166301508384596,
"loss": 2.8138,
"step": 590000
},
{
"epoch": 15.89,
"learning_rate": 0.00021522569590741835,
"loss": 2.8195,
"step": 591000
},
{
"epoch": 15.91,
"learning_rate": 0.0002138226454313081,
"loss": 2.8192,
"step": 592000
},
{
"epoch": 15.94,
"learning_rate": 0.00021241819050026685,
"loss": 2.8169,
"step": 593000
},
{
"epoch": 15.97,
"learning_rate": 0.00021101373556922559,
"loss": 2.8174,
"step": 594000
},
{
"epoch": 15.99,
"learning_rate": 0.00020961068509311537,
"loss": 2.8151,
"step": 595000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4140438576219447,
"eval_loss": 3.3717539310455322,
"eval_runtime": 148.2583,
"eval_samples_per_second": 390.669,
"eval_steps_per_second": 6.104,
"step": 595216
},
{
"epoch": 16.02,
"learning_rate": 0.0002082062301620741,
"loss": 2.7799,
"step": 596000
},
{
"epoch": 16.05,
"learning_rate": 0.00020680177523103282,
"loss": 2.7671,
"step": 597000
},
{
"epoch": 16.07,
"learning_rate": 0.0002053987247549226,
"loss": 2.772,
"step": 598000
},
{
"epoch": 16.1,
"learning_rate": 0.00020399426982388135,
"loss": 2.7732,
"step": 599000
},
{
"epoch": 16.13,
"learning_rate": 0.00020259121934777113,
"loss": 2.7791,
"step": 600000
},
{
"epoch": 16.16,
"learning_rate": 0.00020118676441672987,
"loss": 2.7742,
"step": 601000
},
{
"epoch": 16.18,
"learning_rate": 0.0001997823094856886,
"loss": 2.7786,
"step": 602000
},
{
"epoch": 16.21,
"learning_rate": 0.0001983792590095784,
"loss": 2.7834,
"step": 603000
},
{
"epoch": 16.24,
"learning_rate": 0.00019697620853346818,
"loss": 2.7824,
"step": 604000
},
{
"epoch": 16.26,
"learning_rate": 0.0001955717536024269,
"loss": 2.7857,
"step": 605000
},
{
"epoch": 16.29,
"learning_rate": 0.00019416729867138564,
"loss": 2.7824,
"step": 606000
},
{
"epoch": 16.32,
"learning_rate": 0.00019276284374034438,
"loss": 2.7849,
"step": 607000
},
{
"epoch": 16.34,
"learning_rate": 0.00019135838880930312,
"loss": 2.7853,
"step": 608000
},
{
"epoch": 16.37,
"learning_rate": 0.0001899553383331929,
"loss": 2.7886,
"step": 609000
},
{
"epoch": 16.4,
"learning_rate": 0.00018855088340215162,
"loss": 2.7843,
"step": 610000
},
{
"epoch": 16.42,
"learning_rate": 0.0001871478329260414,
"loss": 2.7929,
"step": 611000
},
{
"epoch": 16.45,
"learning_rate": 0.00018574337799500017,
"loss": 2.7879,
"step": 612000
},
{
"epoch": 16.48,
"learning_rate": 0.00018434032751888993,
"loss": 2.7893,
"step": 613000
},
{
"epoch": 16.5,
"learning_rate": 0.0001829372770427797,
"loss": 2.791,
"step": 614000
},
{
"epoch": 16.53,
"learning_rate": 0.00018153282211173845,
"loss": 2.7879,
"step": 615000
},
{
"epoch": 16.56,
"learning_rate": 0.0001801297716356282,
"loss": 2.7904,
"step": 616000
},
{
"epoch": 16.59,
"learning_rate": 0.00017872531670458695,
"loss": 2.7892,
"step": 617000
},
{
"epoch": 16.61,
"learning_rate": 0.0001773208617735457,
"loss": 2.7929,
"step": 618000
},
{
"epoch": 16.64,
"learning_rate": 0.00017591640684250443,
"loss": 2.7952,
"step": 619000
},
{
"epoch": 16.67,
"learning_rate": 0.00017451195191146317,
"loss": 2.7835,
"step": 620000
},
{
"epoch": 16.69,
"learning_rate": 0.00017311030589028397,
"loss": 2.793,
"step": 621000
},
{
"epoch": 16.72,
"learning_rate": 0.0001717058509592427,
"loss": 2.7948,
"step": 622000
},
{
"epoch": 16.75,
"learning_rate": 0.00017030139602820148,
"loss": 2.7917,
"step": 623000
},
{
"epoch": 16.77,
"learning_rate": 0.0001688969410971602,
"loss": 2.7938,
"step": 624000
},
{
"epoch": 16.8,
"learning_rate": 0.00016749248616611893,
"loss": 2.7978,
"step": 625000
},
{
"epoch": 16.83,
"learning_rate": 0.0001660894356900087,
"loss": 2.7945,
"step": 626000
},
{
"epoch": 16.85,
"learning_rate": 0.00016468498075896746,
"loss": 2.7943,
"step": 627000
},
{
"epoch": 16.88,
"learning_rate": 0.00016328193028285724,
"loss": 2.7918,
"step": 628000
},
{
"epoch": 16.91,
"learning_rate": 0.00016187747535181596,
"loss": 2.7988,
"step": 629000
},
{
"epoch": 16.94,
"learning_rate": 0.00016047442487570574,
"loss": 2.7968,
"step": 630000
},
{
"epoch": 16.96,
"learning_rate": 0.00015906996994466445,
"loss": 2.7929,
"step": 631000
},
{
"epoch": 16.99,
"learning_rate": 0.00015766551501362322,
"loss": 2.798,
"step": 632000
},
{
"epoch": 17.0,
"eval_accuracy": 0.41277355590429304,
"eval_loss": 3.385791063308716,
"eval_runtime": 147.8055,
"eval_samples_per_second": 391.866,
"eval_steps_per_second": 6.123,
"step": 632417
},
{
"epoch": 17.02,
"learning_rate": 0.000156262464537513,
"loss": 2.7792,
"step": 633000
},
{
"epoch": 17.04,
"learning_rate": 0.00015485800960647172,
"loss": 2.7608,
"step": 634000
},
{
"epoch": 17.07,
"learning_rate": 0.0001534549591303615,
"loss": 2.7645,
"step": 635000
},
{
"epoch": 17.1,
"learning_rate": 0.00015205050419932024,
"loss": 2.7656,
"step": 636000
},
{
"epoch": 17.12,
"learning_rate": 0.00015064604926827898,
"loss": 2.7604,
"step": 637000
},
{
"epoch": 17.15,
"learning_rate": 0.00014924299879216877,
"loss": 2.7683,
"step": 638000
},
{
"epoch": 17.18,
"learning_rate": 0.00014783854386112748,
"loss": 2.7652,
"step": 639000
},
{
"epoch": 17.2,
"learning_rate": 0.00014643408893008622,
"loss": 2.7589,
"step": 640000
},
{
"epoch": 17.23,
"learning_rate": 0.000145029633999045,
"loss": 2.7709,
"step": 641000
},
{
"epoch": 17.26,
"learning_rate": 0.00014362658352293475,
"loss": 2.7638,
"step": 642000
},
{
"epoch": 17.28,
"learning_rate": 0.00014222353304682453,
"loss": 2.7667,
"step": 643000
},
{
"epoch": 17.31,
"learning_rate": 0.00014081907811578327,
"loss": 2.7637,
"step": 644000
},
{
"epoch": 17.34,
"learning_rate": 0.00013941462318474201,
"loss": 2.7752,
"step": 645000
},
{
"epoch": 17.37,
"learning_rate": 0.0001380115727086318,
"loss": 2.7716,
"step": 646000
},
{
"epoch": 17.39,
"learning_rate": 0.0001366071177775905,
"loss": 2.7665,
"step": 647000
},
{
"epoch": 17.42,
"learning_rate": 0.00013520266284654925,
"loss": 2.7669,
"step": 648000
},
{
"epoch": 17.45,
"learning_rate": 0.00013379820791550802,
"loss": 2.7743,
"step": 649000
},
{
"epoch": 17.47,
"learning_rate": 0.00013239375298446673,
"loss": 2.7733,
"step": 650000
},
{
"epoch": 17.5,
"learning_rate": 0.00013099070250835652,
"loss": 2.7713,
"step": 651000
},
{
"epoch": 17.53,
"learning_rate": 0.0001295876520322463,
"loss": 2.7694,
"step": 652000
},
{
"epoch": 17.55,
"learning_rate": 0.00012818319710120502,
"loss": 2.767,
"step": 653000
},
{
"epoch": 17.58,
"learning_rate": 0.00012677874217016378,
"loss": 2.7736,
"step": 654000
},
{
"epoch": 17.61,
"learning_rate": 0.0001253742872391225,
"loss": 2.7743,
"step": 655000
},
{
"epoch": 17.63,
"learning_rate": 0.00012396983230808124,
"loss": 2.7739,
"step": 656000
},
{
"epoch": 17.66,
"learning_rate": 0.000122566781831971,
"loss": 2.7722,
"step": 657000
},
{
"epoch": 17.69,
"learning_rate": 0.00012116232690092975,
"loss": 2.776,
"step": 658000
},
{
"epoch": 17.71,
"learning_rate": 0.00011975787196988849,
"loss": 2.7807,
"step": 659000
},
{
"epoch": 17.74,
"learning_rate": 0.00011835482149377827,
"loss": 2.7719,
"step": 660000
},
{
"epoch": 17.77,
"learning_rate": 0.000116950366562737,
"loss": 2.7747,
"step": 661000
},
{
"epoch": 17.8,
"learning_rate": 0.00011554731608662679,
"loss": 2.7782,
"step": 662000
},
{
"epoch": 17.82,
"learning_rate": 0.00011414286115558551,
"loss": 2.7738,
"step": 663000
},
{
"epoch": 17.85,
"learning_rate": 0.0001127398106794753,
"loss": 2.7756,
"step": 664000
},
{
"epoch": 17.88,
"learning_rate": 0.00011133535574843404,
"loss": 2.7715,
"step": 665000
},
{
"epoch": 17.9,
"learning_rate": 0.00010993090081739278,
"loss": 2.7809,
"step": 666000
},
{
"epoch": 17.93,
"learning_rate": 0.00010852785034128255,
"loss": 2.7813,
"step": 667000
},
{
"epoch": 17.96,
"learning_rate": 0.00010712339541024129,
"loss": 2.7748,
"step": 668000
},
{
"epoch": 17.98,
"learning_rate": 0.00010571894047920003,
"loss": 2.7738,
"step": 669000
},
{
"epoch": 18.0,
"eval_accuracy": 0.41297422178988324,
"eval_loss": 3.407961368560791,
"eval_runtime": 148.1896,
"eval_samples_per_second": 390.851,
"eval_steps_per_second": 6.107,
"step": 669618
},
{
"epoch": 18.01,
"learning_rate": 0.0001043158900030898,
"loss": 2.7651,
"step": 670000
},
{
"epoch": 18.04,
"learning_rate": 0.00010291143507204854,
"loss": 2.7487,
"step": 671000
},
{
"epoch": 18.06,
"learning_rate": 0.00010150838459593833,
"loss": 2.7489,
"step": 672000
},
{
"epoch": 18.09,
"learning_rate": 0.00010010392966489705,
"loss": 2.7467,
"step": 673000
},
{
"epoch": 18.12,
"learning_rate": 9.870087918878684e-05,
"loss": 2.7491,
"step": 674000
},
{
"epoch": 18.14,
"learning_rate": 9.729642425774556e-05,
"loss": 2.7511,
"step": 675000
},
{
"epoch": 18.17,
"learning_rate": 9.58919693267043e-05,
"loss": 2.745,
"step": 676000
},
{
"epoch": 18.2,
"learning_rate": 9.448891885059409e-05,
"loss": 2.7478,
"step": 677000
},
{
"epoch": 18.23,
"learning_rate": 9.308446391955282e-05,
"loss": 2.755,
"step": 678000
},
{
"epoch": 18.25,
"learning_rate": 9.168000898851156e-05,
"loss": 2.7538,
"step": 679000
},
{
"epoch": 18.28,
"learning_rate": 9.027695851240134e-05,
"loss": 2.755,
"step": 680000
},
{
"epoch": 18.31,
"learning_rate": 8.887250358136008e-05,
"loss": 2.7577,
"step": 681000
},
{
"epoch": 18.33,
"learning_rate": 8.746945310524985e-05,
"loss": 2.7505,
"step": 682000
},
{
"epoch": 18.36,
"learning_rate": 8.60649981742086e-05,
"loss": 2.7591,
"step": 683000
},
{
"epoch": 18.39,
"learning_rate": 8.466194769809838e-05,
"loss": 2.7601,
"step": 684000
},
{
"epoch": 18.41,
"learning_rate": 8.32574927670571e-05,
"loss": 2.7567,
"step": 685000
},
{
"epoch": 18.44,
"learning_rate": 8.185444229094687e-05,
"loss": 2.7547,
"step": 686000
},
{
"epoch": 18.47,
"learning_rate": 8.044998735990562e-05,
"loss": 2.7584,
"step": 687000
},
{
"epoch": 18.49,
"learning_rate": 7.904553242886437e-05,
"loss": 2.7554,
"step": 688000
},
{
"epoch": 18.52,
"learning_rate": 7.764388640768517e-05,
"loss": 2.756,
"step": 689000
},
{
"epoch": 18.55,
"learning_rate": 7.623943147664391e-05,
"loss": 2.7581,
"step": 690000
},
{
"epoch": 18.57,
"learning_rate": 7.483497654560266e-05,
"loss": 2.7593,
"step": 691000
},
{
"epoch": 18.6,
"learning_rate": 7.343192606949243e-05,
"loss": 2.7549,
"step": 692000
},
{
"epoch": 18.63,
"learning_rate": 7.202747113845116e-05,
"loss": 2.761,
"step": 693000
},
{
"epoch": 18.66,
"learning_rate": 7.06230162074099e-05,
"loss": 2.7556,
"step": 694000
},
{
"epoch": 18.68,
"learning_rate": 6.921856127636864e-05,
"loss": 2.7513,
"step": 695000
},
{
"epoch": 18.71,
"learning_rate": 6.781410634532737e-05,
"loss": 2.7577,
"step": 696000
},
{
"epoch": 18.74,
"learning_rate": 6.641105586921716e-05,
"loss": 2.759,
"step": 697000
},
{
"epoch": 18.76,
"learning_rate": 6.50066009381759e-05,
"loss": 2.7603,
"step": 698000
},
{
"epoch": 18.79,
"learning_rate": 6.360355046206568e-05,
"loss": 2.7598,
"step": 699000
},
{
"epoch": 18.82,
"learning_rate": 6.219909553102441e-05,
"loss": 2.7545,
"step": 700000
},
{
"epoch": 18.84,
"learning_rate": 6.079464059998314e-05,
"loss": 2.7603,
"step": 701000
},
{
"epoch": 18.87,
"learning_rate": 5.939018566894189e-05,
"loss": 2.7558,
"step": 702000
},
{
"epoch": 18.9,
"learning_rate": 5.798713519283167e-05,
"loss": 2.7559,
"step": 703000
},
{
"epoch": 18.92,
"learning_rate": 5.65826802617904e-05,
"loss": 2.7584,
"step": 704000
},
{
"epoch": 18.95,
"learning_rate": 5.5178225330749135e-05,
"loss": 2.7614,
"step": 705000
},
{
"epoch": 18.98,
"learning_rate": 5.377517485463892e-05,
"loss": 2.7555,
"step": 706000
},
{
"epoch": 19.0,
"eval_accuracy": 0.41307418524410433,
"eval_loss": 3.4066617488861084,
"eval_runtime": 148.4411,
"eval_samples_per_second": 390.189,
"eval_steps_per_second": 6.097,
"step": 706819
},
{
"epoch": 19.0,
"learning_rate": 5.237071992359765e-05,
"loss": 2.7536,
"step": 707000
},
{
"epoch": 19.03,
"learning_rate": 5.096766944748743e-05,
"loss": 2.7341,
"step": 708000
},
{
"epoch": 19.06,
"learning_rate": 4.956461897137721e-05,
"loss": 2.7413,
"step": 709000
},
{
"epoch": 19.09,
"learning_rate": 4.816016404033595e-05,
"loss": 2.7406,
"step": 710000
},
{
"epoch": 19.11,
"learning_rate": 4.675570910929468e-05,
"loss": 2.7417,
"step": 711000
},
{
"epoch": 19.14,
"learning_rate": 4.535265863318446e-05,
"loss": 2.7401,
"step": 712000
},
{
"epoch": 19.17,
"learning_rate": 4.39482037021432e-05,
"loss": 2.741,
"step": 713000
},
{
"epoch": 19.19,
"learning_rate": 4.254374877110194e-05,
"loss": 2.7379,
"step": 714000
},
{
"epoch": 19.22,
"learning_rate": 4.1139293840060675e-05,
"loss": 2.7369,
"step": 715000
},
{
"epoch": 19.25,
"learning_rate": 3.973483890901941e-05,
"loss": 2.7397,
"step": 716000
},
{
"epoch": 19.27,
"learning_rate": 3.8330383977978144e-05,
"loss": 2.7431,
"step": 717000
},
{
"epoch": 19.3,
"learning_rate": 3.692873795679897e-05,
"loss": 2.7404,
"step": 718000
},
{
"epoch": 19.33,
"learning_rate": 3.5524283025757704e-05,
"loss": 2.7413,
"step": 719000
},
{
"epoch": 19.35,
"learning_rate": 3.411982809471644e-05,
"loss": 2.7379,
"step": 720000
},
{
"epoch": 19.38,
"learning_rate": 3.271537316367518e-05,
"loss": 2.7436,
"step": 721000
},
{
"epoch": 19.41,
"learning_rate": 3.1312322687564956e-05,
"loss": 2.7363,
"step": 722000
},
{
"epoch": 19.43,
"learning_rate": 2.9907867756523694e-05,
"loss": 2.7387,
"step": 723000
},
{
"epoch": 19.46,
"learning_rate": 2.850481728041347e-05,
"loss": 2.7431,
"step": 724000
},
{
"epoch": 19.49,
"learning_rate": 2.710036234937221e-05,
"loss": 2.7458,
"step": 725000
},
{
"epoch": 19.52,
"learning_rate": 2.569731187326199e-05,
"loss": 2.7419,
"step": 726000
},
{
"epoch": 19.54,
"learning_rate": 2.4292856942220723e-05,
"loss": 2.7415,
"step": 727000
},
{
"epoch": 19.57,
"learning_rate": 2.2888402011179464e-05,
"loss": 2.7392,
"step": 728000
},
{
"epoch": 19.6,
"learning_rate": 2.1485351535069238e-05,
"loss": 2.7467,
"step": 729000
},
{
"epoch": 19.62,
"learning_rate": 2.008089660402798e-05,
"loss": 2.7393,
"step": 730000
},
{
"epoch": 19.65,
"learning_rate": 1.8677846127917755e-05,
"loss": 2.7378,
"step": 731000
},
{
"epoch": 19.68,
"learning_rate": 1.7273391196876493e-05,
"loss": 2.747,
"step": 732000
},
{
"epoch": 19.7,
"learning_rate": 1.587034072076627e-05,
"loss": 2.7419,
"step": 733000
},
{
"epoch": 19.73,
"learning_rate": 1.4465885789725008e-05,
"loss": 2.7466,
"step": 734000
},
{
"epoch": 19.76,
"learning_rate": 1.3062835313614786e-05,
"loss": 2.7354,
"step": 735000
},
{
"epoch": 19.78,
"learning_rate": 1.1658380382573524e-05,
"loss": 2.7457,
"step": 736000
},
{
"epoch": 19.81,
"learning_rate": 1.025392545153226e-05,
"loss": 2.7398,
"step": 737000
},
{
"epoch": 19.84,
"learning_rate": 8.850874975422038e-06,
"loss": 2.7415,
"step": 738000
},
{
"epoch": 19.87,
"learning_rate": 7.446420044380776e-06,
"loss": 2.7381,
"step": 739000
},
{
"epoch": 19.89,
"learning_rate": 6.043369568270554e-06,
"loss": 2.7407,
"step": 740000
},
{
"epoch": 19.92,
"learning_rate": 4.638914637229291e-06,
"loss": 2.7397,
"step": 741000
},
{
"epoch": 19.95,
"learning_rate": 3.2344597061880285e-06,
"loss": 2.7412,
"step": 742000
},
{
"epoch": 19.97,
"learning_rate": 1.831409230077807e-06,
"loss": 2.7409,
"step": 743000
},
{
"epoch": 20.0,
"learning_rate": 4.2695429903654394e-07,
"loss": 2.7434,
"step": 744000
},
{
"epoch": 20.0,
"eval_accuracy": 0.41252109443859236,
"eval_loss": 3.417576313018799,
"eval_runtime": 148.878,
"eval_samples_per_second": 389.043,
"eval_steps_per_second": 6.079,
"step": 744020
},
{
"epoch": 20.0,
"step": 744020,
"total_flos": 1.56740238729216e+18,
"train_loss": 2.994195082282441,
"train_runtime": 55239.7487,
"train_samples_per_second": 215.503,
"train_steps_per_second": 13.469
}
],
"logging_steps": 1000,
"max_steps": 744020,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"total_flos": 1.56740238729216e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}