{
"best_metric": 1.305156946182251,
"best_model_checkpoint": "./results/models/checkpoint-230688",
"epoch": 24.0,
"eval_steps": 500,
"global_step": 230688,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05201831044527674,
"grad_norm": 0.34375,
"learning_rate": 0.001997919267582189,
"loss": 2.3383,
"step": 500
},
{
"epoch": 0.10403662089055347,
"grad_norm": 0.93359375,
"learning_rate": 0.001995838535164378,
"loss": 1.9394,
"step": 1000
},
{
"epoch": 0.1560549313358302,
"grad_norm": 0.5078125,
"learning_rate": 0.001993757802746567,
"loss": 1.8509,
"step": 1500
},
{
"epoch": 0.20807324178110695,
"grad_norm": 0.341796875,
"learning_rate": 0.0019916770703287557,
"loss": 1.8119,
"step": 2000
},
{
"epoch": 0.2600915522263837,
"grad_norm": 0.30859375,
"learning_rate": 0.0019895963379109446,
"loss": 1.746,
"step": 2500
},
{
"epoch": 0.3121098626716604,
"grad_norm": 0.4921875,
"learning_rate": 0.0019875156054931335,
"loss": 1.7113,
"step": 3000
},
{
"epoch": 0.3641281731169372,
"grad_norm": 0.37109375,
"learning_rate": 0.0019854348730753224,
"loss": 1.6861,
"step": 3500
},
{
"epoch": 0.4161464835622139,
"grad_norm": 0.2451171875,
"learning_rate": 0.0019833541406575114,
"loss": 1.6518,
"step": 4000
},
{
"epoch": 0.4681647940074906,
"grad_norm": 1.2578125,
"learning_rate": 0.0019812734082397003,
"loss": 1.6257,
"step": 4500
},
{
"epoch": 0.5201831044527674,
"grad_norm": 0.322265625,
"learning_rate": 0.0019791926758218896,
"loss": 1.6184,
"step": 5000
},
{
"epoch": 0.5722014148980441,
"grad_norm": 0.2177734375,
"learning_rate": 0.001977111943404078,
"loss": 1.6034,
"step": 5500
},
{
"epoch": 0.6242197253433208,
"grad_norm": 0.46875,
"learning_rate": 0.001975031210986267,
"loss": 1.5798,
"step": 6000
},
{
"epoch": 0.6762380357885975,
"grad_norm": 0.349609375,
"learning_rate": 0.0019729504785684564,
"loss": 1.6023,
"step": 6500
},
{
"epoch": 0.7282563462338744,
"grad_norm": 0.314453125,
"learning_rate": 0.0019708697461506453,
"loss": 1.6354,
"step": 7000
},
{
"epoch": 0.7802746566791511,
"grad_norm": 0.259765625,
"learning_rate": 0.0019687890137328337,
"loss": 1.6039,
"step": 7500
},
{
"epoch": 0.8322929671244278,
"grad_norm": 0.48046875,
"learning_rate": 0.001966708281315023,
"loss": 1.5863,
"step": 8000
},
{
"epoch": 0.8843112775697045,
"grad_norm": 0.99609375,
"learning_rate": 0.001964627548897212,
"loss": 1.5758,
"step": 8500
},
{
"epoch": 0.9363295880149812,
"grad_norm": 0.2333984375,
"learning_rate": 0.0019625468164794005,
"loss": 1.5658,
"step": 9000
},
{
"epoch": 0.9883478984602581,
"grad_norm": 0.296875,
"learning_rate": 0.00196046608406159,
"loss": 1.5664,
"step": 9500
},
{
"epoch": 1.0,
"eval_loss": 1.6215704679489136,
"eval_runtime": 1.5075,
"eval_samples_per_second": 663.37,
"eval_steps_per_second": 0.663,
"step": 9612
},
{
"epoch": 1.0403662089055348,
"grad_norm": 0.28515625,
"learning_rate": 0.0019583853516437788,
"loss": 1.5684,
"step": 10000
},
{
"epoch": 1.0923845193508115,
"grad_norm": 0.75,
"learning_rate": 0.0019563046192259677,
"loss": 1.5536,
"step": 10500
},
{
"epoch": 1.1444028297960882,
"grad_norm": 0.29296875,
"learning_rate": 0.0019542238868081566,
"loss": 1.5495,
"step": 11000
},
{
"epoch": 1.196421140241365,
"grad_norm": 0.25,
"learning_rate": 0.0019521431543903455,
"loss": 1.529,
"step": 11500
},
{
"epoch": 1.2484394506866416,
"grad_norm": 0.38671875,
"learning_rate": 0.0019500624219725344,
"loss": 1.5307,
"step": 12000
},
{
"epoch": 1.3004577611319184,
"grad_norm": 0.21875,
"learning_rate": 0.0019479816895547233,
"loss": 1.5422,
"step": 12500
},
{
"epoch": 1.352476071577195,
"grad_norm": 0.34375,
"learning_rate": 0.0019459009571369122,
"loss": 1.5281,
"step": 13000
},
{
"epoch": 1.404494382022472,
"grad_norm": 0.2421875,
"learning_rate": 0.0019438202247191011,
"loss": 1.5232,
"step": 13500
},
{
"epoch": 1.4565126924677487,
"grad_norm": 0.98828125,
"learning_rate": 0.00194173949230129,
"loss": 1.5286,
"step": 14000
},
{
"epoch": 1.5085310029130254,
"grad_norm": 0.72265625,
"learning_rate": 0.001939658759883479,
"loss": 1.5286,
"step": 14500
},
{
"epoch": 1.5605493133583022,
"grad_norm": 0.341796875,
"learning_rate": 0.001937578027465668,
"loss": 1.5173,
"step": 15000
},
{
"epoch": 1.6125676238035789,
"grad_norm": 0.2451171875,
"learning_rate": 0.0019354972950478568,
"loss": 1.5055,
"step": 15500
},
{
"epoch": 1.6645859342488556,
"grad_norm": 0.2255859375,
"learning_rate": 0.0019334165626300457,
"loss": 1.5071,
"step": 16000
},
{
"epoch": 1.7166042446941323,
"grad_norm": 0.37890625,
"learning_rate": 0.0019313358302122348,
"loss": 1.5071,
"step": 16500
},
{
"epoch": 1.768622555139409,
"grad_norm": 0.451171875,
"learning_rate": 0.0019292550977944235,
"loss": 1.5211,
"step": 17000
},
{
"epoch": 1.8206408655846857,
"grad_norm": 0.25,
"learning_rate": 0.0019271743653766125,
"loss": 1.5211,
"step": 17500
},
{
"epoch": 1.8726591760299627,
"grad_norm": 0.41796875,
"learning_rate": 0.0019250936329588016,
"loss": 1.5159,
"step": 18000
},
{
"epoch": 1.9246774864752392,
"grad_norm": 0.466796875,
"learning_rate": 0.0019230129005409905,
"loss": 1.5006,
"step": 18500
},
{
"epoch": 1.9766957969205161,
"grad_norm": 0.451171875,
"learning_rate": 0.0019209321681231794,
"loss": 1.5082,
"step": 19000
},
{
"epoch": 2.0,
"eval_loss": 1.5491766929626465,
"eval_runtime": 1.6608,
"eval_samples_per_second": 602.119,
"eval_steps_per_second": 0.602,
"step": 19224
},
{
"epoch": 2.0287141073657926,
"grad_norm": 0.296875,
"learning_rate": 0.0019188514357053683,
"loss": 1.5182,
"step": 19500
},
{
"epoch": 2.0807324178110695,
"grad_norm": 0.28125,
"learning_rate": 0.0019167707032875572,
"loss": 1.5172,
"step": 20000
},
{
"epoch": 2.132750728256346,
"grad_norm": 0.37109375,
"learning_rate": 0.0019146899708697464,
"loss": 1.5101,
"step": 20500
},
{
"epoch": 2.184769038701623,
"grad_norm": 0.2392578125,
"learning_rate": 0.001912609238451935,
"loss": 1.5086,
"step": 21000
},
{
"epoch": 2.2367873491468995,
"grad_norm": 0.2421875,
"learning_rate": 0.001910528506034124,
"loss": 1.4943,
"step": 21500
},
{
"epoch": 2.2888056595921764,
"grad_norm": 0.296875,
"learning_rate": 0.0019084477736163131,
"loss": 1.4848,
"step": 22000
},
{
"epoch": 2.3408239700374533,
"grad_norm": 0.306640625,
"learning_rate": 0.0019063670411985018,
"loss": 1.4823,
"step": 22500
},
{
"epoch": 2.39284228048273,
"grad_norm": 0.26953125,
"learning_rate": 0.0019042863087806907,
"loss": 1.4702,
"step": 23000
},
{
"epoch": 2.444860590928007,
"grad_norm": 1.296875,
"learning_rate": 0.0019022055763628799,
"loss": 1.4673,
"step": 23500
},
{
"epoch": 2.4968789013732833,
"grad_norm": 0.26953125,
"learning_rate": 0.0019001248439450688,
"loss": 1.4706,
"step": 24000
},
{
"epoch": 2.54889721181856,
"grad_norm": 0.75,
"learning_rate": 0.0018980441115272575,
"loss": 1.4635,
"step": 24500
},
{
"epoch": 2.6009155222638367,
"grad_norm": 0.201171875,
"learning_rate": 0.0018959633791094466,
"loss": 1.4499,
"step": 25000
},
{
"epoch": 2.6529338327091136,
"grad_norm": 0.3125,
"learning_rate": 0.0018938826466916355,
"loss": 1.4453,
"step": 25500
},
{
"epoch": 2.70495214315439,
"grad_norm": 0.27734375,
"learning_rate": 0.0018918019142738244,
"loss": 1.4463,
"step": 26000
},
{
"epoch": 2.756970453599667,
"grad_norm": 0.283203125,
"learning_rate": 0.0018897211818560133,
"loss": 1.452,
"step": 26500
},
{
"epoch": 2.808988764044944,
"grad_norm": 0.2236328125,
"learning_rate": 0.0018876404494382023,
"loss": 1.448,
"step": 27000
},
{
"epoch": 2.8610070744902205,
"grad_norm": 0.244140625,
"learning_rate": 0.0018855597170203914,
"loss": 1.4525,
"step": 27500
},
{
"epoch": 2.9130253849354975,
"grad_norm": 0.2412109375,
"learning_rate": 0.00188347898460258,
"loss": 1.4457,
"step": 28000
},
{
"epoch": 2.965043695380774,
"grad_norm": 0.60546875,
"learning_rate": 0.001881398252184769,
"loss": 1.4468,
"step": 28500
},
{
"epoch": 3.0,
"eval_loss": 1.4870332479476929,
"eval_runtime": 1.4668,
"eval_samples_per_second": 681.76,
"eval_steps_per_second": 0.682,
"step": 28836
},
{
"epoch": 3.017062005826051,
"grad_norm": 0.349609375,
"learning_rate": 0.0018793175197669581,
"loss": 1.4453,
"step": 29000
},
{
"epoch": 3.0690803162713274,
"grad_norm": 1.4453125,
"learning_rate": 0.001877236787349147,
"loss": 1.4455,
"step": 29500
},
{
"epoch": 3.1210986267166043,
"grad_norm": 0.2314453125,
"learning_rate": 0.0018751560549313357,
"loss": 1.4378,
"step": 30000
},
{
"epoch": 3.173116937161881,
"grad_norm": 0.326171875,
"learning_rate": 0.0018730753225135249,
"loss": 1.4342,
"step": 30500
},
{
"epoch": 3.2251352476071578,
"grad_norm": 5.09375,
"learning_rate": 0.0018709945900957138,
"loss": 1.4401,
"step": 31000
},
{
"epoch": 3.2771535580524347,
"grad_norm": 0.31640625,
"learning_rate": 0.0018689138576779025,
"loss": 1.4317,
"step": 31500
},
{
"epoch": 3.329171868497711,
"grad_norm": 0.291015625,
"learning_rate": 0.0018668331252600916,
"loss": 1.4252,
"step": 32000
},
{
"epoch": 3.381190178942988,
"grad_norm": 0.435546875,
"learning_rate": 0.0018647523928422805,
"loss": 1.427,
"step": 32500
},
{
"epoch": 3.4332084893882646,
"grad_norm": 0.56640625,
"learning_rate": 0.0018626716604244697,
"loss": 1.4207,
"step": 33000
},
{
"epoch": 3.4852267998335416,
"grad_norm": 0.31640625,
"learning_rate": 0.0018605909280066584,
"loss": 1.4209,
"step": 33500
},
{
"epoch": 3.537245110278818,
"grad_norm": 0.255859375,
"learning_rate": 0.0018585101955888473,
"loss": 1.418,
"step": 34000
},
{
"epoch": 3.589263420724095,
"grad_norm": 0.33984375,
"learning_rate": 0.0018564294631710364,
"loss": 1.4153,
"step": 34500
},
{
"epoch": 3.6412817311693715,
"grad_norm": 0.404296875,
"learning_rate": 0.001854348730753225,
"loss": 1.4171,
"step": 35000
},
{
"epoch": 3.6933000416146484,
"grad_norm": 0.7421875,
"learning_rate": 0.001852267998335414,
"loss": 1.4203,
"step": 35500
},
{
"epoch": 3.7453183520599254,
"grad_norm": 0.265625,
"learning_rate": 0.0018501872659176031,
"loss": 1.4189,
"step": 36000
},
{
"epoch": 3.797336662505202,
"grad_norm": 0.53125,
"learning_rate": 0.001848106533499792,
"loss": 1.4212,
"step": 36500
},
{
"epoch": 3.8493549729504783,
"grad_norm": 0.2158203125,
"learning_rate": 0.0018460258010819808,
"loss": 1.4151,
"step": 37000
},
{
"epoch": 3.9013732833957553,
"grad_norm": 0.349609375,
"learning_rate": 0.0018439450686641699,
"loss": 1.4087,
"step": 37500
},
{
"epoch": 3.9533915938410322,
"grad_norm": 0.228515625,
"learning_rate": 0.0018418643362463588,
"loss": 1.4038,
"step": 38000
},
{
"epoch": 4.0,
"eval_loss": 1.4296818971633911,
"eval_runtime": 1.3293,
"eval_samples_per_second": 752.251,
"eval_steps_per_second": 0.752,
"step": 38448
},
{
"epoch": 4.005409904286309,
"grad_norm": 2.53125,
"learning_rate": 0.0018397836038285475,
"loss": 1.4037,
"step": 38500
},
{
"epoch": 4.057428214731585,
"grad_norm": 1.703125,
"learning_rate": 0.0018377028714107366,
"loss": 1.402,
"step": 39000
},
{
"epoch": 4.109446525176862,
"grad_norm": 0.2138671875,
"learning_rate": 0.0018356221389929255,
"loss": 1.3972,
"step": 39500
},
{
"epoch": 4.161464835622139,
"grad_norm": 0.2001953125,
"learning_rate": 0.0018335414065751145,
"loss": 1.3996,
"step": 40000
},
{
"epoch": 4.213483146067416,
"grad_norm": 0.455078125,
"learning_rate": 0.0018314606741573034,
"loss": 1.3989,
"step": 40500
},
{
"epoch": 4.265501456512692,
"grad_norm": 0.2041015625,
"learning_rate": 0.0018293799417394923,
"loss": 1.3945,
"step": 41000
},
{
"epoch": 4.317519766957969,
"grad_norm": 0.267578125,
"learning_rate": 0.0018272992093216814,
"loss": 1.3936,
"step": 41500
},
{
"epoch": 4.369538077403246,
"grad_norm": 0.384765625,
"learning_rate": 0.0018252184769038703,
"loss": 1.3906,
"step": 42000
},
{
"epoch": 4.421556387848523,
"grad_norm": 0.380859375,
"learning_rate": 0.001823137744486059,
"loss": 1.3946,
"step": 42500
},
{
"epoch": 4.473574698293799,
"grad_norm": 0.353515625,
"learning_rate": 0.0018210570120682482,
"loss": 1.4069,
"step": 43000
},
{
"epoch": 4.525593008739076,
"grad_norm": 0.373046875,
"learning_rate": 0.001818976279650437,
"loss": 1.4049,
"step": 43500
},
{
"epoch": 4.577611319184353,
"grad_norm": 0.263671875,
"learning_rate": 0.0018168955472326258,
"loss": 1.3995,
"step": 44000
},
{
"epoch": 4.62962962962963,
"grad_norm": 0.392578125,
"learning_rate": 0.001814814814814815,
"loss": 1.4053,
"step": 44500
},
{
"epoch": 4.681647940074907,
"grad_norm": 0.5078125,
"learning_rate": 0.0018127340823970038,
"loss": 1.4,
"step": 45000
},
{
"epoch": 4.733666250520183,
"grad_norm": 0.255859375,
"learning_rate": 0.0018106533499791927,
"loss": 1.3946,
"step": 45500
},
{
"epoch": 4.78568456096546,
"grad_norm": 0.498046875,
"learning_rate": 0.0018085726175613816,
"loss": 1.3914,
"step": 46000
},
{
"epoch": 4.837702871410737,
"grad_norm": 1.0703125,
"learning_rate": 0.0018064918851435705,
"loss": 1.3882,
"step": 46500
},
{
"epoch": 4.889721181856014,
"grad_norm": 0.30859375,
"learning_rate": 0.0018044111527257595,
"loss": 1.3905,
"step": 47000
},
{
"epoch": 4.94173949230129,
"grad_norm": 0.2197265625,
"learning_rate": 0.0018023304203079484,
"loss": 1.3927,
"step": 47500
},
{
"epoch": 4.9937578027465666,
"grad_norm": 0.23828125,
"learning_rate": 0.0018002496878901373,
"loss": 1.3883,
"step": 48000
},
{
"epoch": 5.0,
"eval_loss": 1.4223600625991821,
"eval_runtime": 1.6852,
"eval_samples_per_second": 593.419,
"eval_steps_per_second": 0.593,
"step": 48060
},
{
"epoch": 5.0457761131918435,
"grad_norm": 0.29296875,
"learning_rate": 0.0017981689554723264,
"loss": 1.3873,
"step": 48500
},
{
"epoch": 5.09779442363712,
"grad_norm": 0.203125,
"learning_rate": 0.0017960882230545153,
"loss": 1.3831,
"step": 49000
},
{
"epoch": 5.149812734082397,
"grad_norm": 0.251953125,
"learning_rate": 0.001794007490636704,
"loss": 1.3784,
"step": 49500
},
{
"epoch": 5.201831044527673,
"grad_norm": 0.271484375,
"learning_rate": 0.0017919267582188932,
"loss": 1.3821,
"step": 50000
},
{
"epoch": 5.25384935497295,
"grad_norm": 0.451171875,
"learning_rate": 0.001789846025801082,
"loss": 1.3781,
"step": 50500
},
{
"epoch": 5.305867665418227,
"grad_norm": 0.33203125,
"learning_rate": 0.0017877652933832708,
"loss": 1.381,
"step": 51000
},
{
"epoch": 5.357885975863504,
"grad_norm": 0.40625,
"learning_rate": 0.00178568456096546,
"loss": 1.381,
"step": 51500
},
{
"epoch": 5.40990428630878,
"grad_norm": 0.263671875,
"learning_rate": 0.0017836038285476488,
"loss": 1.375,
"step": 52000
},
{
"epoch": 5.461922596754057,
"grad_norm": 0.314453125,
"learning_rate": 0.0017815230961298377,
"loss": 1.3776,
"step": 52500
},
{
"epoch": 5.513940907199334,
"grad_norm": 0.30078125,
"learning_rate": 0.0017794423637120266,
"loss": 1.3773,
"step": 53000
},
{
"epoch": 5.565959217644611,
"grad_norm": 0.228515625,
"learning_rate": 0.0017773616312942156,
"loss": 1.3809,
"step": 53500
},
{
"epoch": 5.617977528089888,
"grad_norm": 0.8671875,
"learning_rate": 0.0017752808988764045,
"loss": 1.3786,
"step": 54000
},
{
"epoch": 5.669995838535164,
"grad_norm": 0.275390625,
"learning_rate": 0.0017732001664585936,
"loss": 1.3762,
"step": 54500
},
{
"epoch": 5.722014148980441,
"grad_norm": 0.2451171875,
"learning_rate": 0.0017711194340407823,
"loss": 1.3741,
"step": 55000
},
{
"epoch": 5.774032459425718,
"grad_norm": 0.224609375,
"learning_rate": 0.0017690387016229714,
"loss": 1.3719,
"step": 55500
},
{
"epoch": 5.826050769870995,
"grad_norm": 0.208984375,
"learning_rate": 0.0017669579692051603,
"loss": 1.3712,
"step": 56000
},
{
"epoch": 5.878069080316271,
"grad_norm": 0.26171875,
"learning_rate": 0.001764877236787349,
"loss": 1.3716,
"step": 56500
},
{
"epoch": 5.930087390761548,
"grad_norm": 0.2373046875,
"learning_rate": 0.0017627965043695382,
"loss": 1.3739,
"step": 57000
},
{
"epoch": 5.982105701206825,
"grad_norm": 0.30859375,
"learning_rate": 0.001760715771951727,
"loss": 1.3744,
"step": 57500
},
{
"epoch": 6.0,
"eval_loss": 1.4039781093597412,
"eval_runtime": 1.6711,
"eval_samples_per_second": 598.397,
"eval_steps_per_second": 0.598,
"step": 57672
},
{
"epoch": 6.034124011652102,
"grad_norm": 0.2451171875,
"learning_rate": 0.001758635039533916,
"loss": 1.3701,
"step": 58000
},
{
"epoch": 6.086142322097379,
"grad_norm": 0.2197265625,
"learning_rate": 0.001756554307116105,
"loss": 1.3629,
"step": 58500
},
{
"epoch": 6.138160632542655,
"grad_norm": 0.2451171875,
"learning_rate": 0.0017544735746982938,
"loss": 1.3656,
"step": 59000
},
{
"epoch": 6.190178942987932,
"grad_norm": 0.23046875,
"learning_rate": 0.0017523928422804827,
"loss": 1.3673,
"step": 59500
},
{
"epoch": 6.242197253433209,
"grad_norm": 0.2080078125,
"learning_rate": 0.0017503121098626717,
"loss": 1.363,
"step": 60000
},
{
"epoch": 6.294215563878486,
"grad_norm": 1.046875,
"learning_rate": 0.0017482313774448606,
"loss": 1.3632,
"step": 60500
},
{
"epoch": 6.346233874323762,
"grad_norm": 0.359375,
"learning_rate": 0.0017461506450270495,
"loss": 1.36,
"step": 61000
},
{
"epoch": 6.398252184769039,
"grad_norm": 0.58203125,
"learning_rate": 0.0017440699126092386,
"loss": 1.3578,
"step": 61500
},
{
"epoch": 6.4502704952143155,
"grad_norm": 0.2421875,
"learning_rate": 0.0017419891801914273,
"loss": 1.3622,
"step": 62000
},
{
"epoch": 6.502288805659592,
"grad_norm": 0.21484375,
"learning_rate": 0.0017399084477736164,
"loss": 1.3607,
"step": 62500
},
{
"epoch": 6.554307116104869,
"grad_norm": 0.39453125,
"learning_rate": 0.0017378277153558054,
"loss": 1.3552,
"step": 63000
},
{
"epoch": 6.606325426550145,
"grad_norm": 0.322265625,
"learning_rate": 0.001735746982937994,
"loss": 1.3518,
"step": 63500
},
{
"epoch": 6.658343736995422,
"grad_norm": 0.310546875,
"learning_rate": 0.0017336662505201832,
"loss": 1.3498,
"step": 64000
},
{
"epoch": 6.710362047440699,
"grad_norm": 1.1484375,
"learning_rate": 0.001731585518102372,
"loss": 1.3528,
"step": 64500
},
{
"epoch": 6.762380357885976,
"grad_norm": 0.216796875,
"learning_rate": 0.001729504785684561,
"loss": 1.3528,
"step": 65000
},
{
"epoch": 6.814398668331252,
"grad_norm": 0.234375,
"learning_rate": 0.00172742405326675,
"loss": 1.3514,
"step": 65500
},
{
"epoch": 6.866416978776529,
"grad_norm": 0.44921875,
"learning_rate": 0.0017253433208489388,
"loss": 1.3519,
"step": 66000
},
{
"epoch": 6.918435289221806,
"grad_norm": 0.2412109375,
"learning_rate": 0.0017232625884311278,
"loss": 1.3475,
"step": 66500
},
{
"epoch": 6.970453599667083,
"grad_norm": 0.51171875,
"learning_rate": 0.0017211818560133169,
"loss": 1.3498,
"step": 67000
},
{
"epoch": 7.0,
"eval_loss": 1.3733755350112915,
"eval_runtime": 1.5013,
"eval_samples_per_second": 666.111,
"eval_steps_per_second": 0.666,
"step": 67284
},
{
"epoch": 7.022471910112359,
"grad_norm": 0.2373046875,
"learning_rate": 0.0017191011235955056,
"loss": 1.348,
"step": 67500
},
{
"epoch": 7.074490220557636,
"grad_norm": 0.2333984375,
"learning_rate": 0.0017170203911776945,
"loss": 1.3467,
"step": 68000
},
{
"epoch": 7.126508531002913,
"grad_norm": 0.26953125,
"learning_rate": 0.0017149396587598836,
"loss": 1.3484,
"step": 68500
},
{
"epoch": 7.17852684144819,
"grad_norm": 0.216796875,
"learning_rate": 0.0017128589263420723,
"loss": 1.3504,
"step": 69000
},
{
"epoch": 7.230545151893467,
"grad_norm": 0.27734375,
"learning_rate": 0.0017107781939242615,
"loss": 1.3474,
"step": 69500
},
{
"epoch": 7.282563462338743,
"grad_norm": 0.4296875,
"learning_rate": 0.0017086974615064504,
"loss": 1.3457,
"step": 70000
},
{
"epoch": 7.33458177278402,
"grad_norm": 0.494140625,
"learning_rate": 0.0017066167290886393,
"loss": 1.344,
"step": 70500
},
{
"epoch": 7.386600083229297,
"grad_norm": 0.259765625,
"learning_rate": 0.0017045359966708282,
"loss": 1.3409,
"step": 71000
},
{
"epoch": 7.438618393674574,
"grad_norm": 0.267578125,
"learning_rate": 0.001702455264253017,
"loss": 1.3427,
"step": 71500
},
{
"epoch": 7.49063670411985,
"grad_norm": 0.2236328125,
"learning_rate": 0.001700374531835206,
"loss": 1.3446,
"step": 72000
},
{
"epoch": 7.542655014565127,
"grad_norm": 0.26953125,
"learning_rate": 0.001698293799417395,
"loss": 1.3427,
"step": 72500
},
{
"epoch": 7.594673325010404,
"grad_norm": 0.251953125,
"learning_rate": 0.0016962130669995838,
"loss": 1.3451,
"step": 73000
},
{
"epoch": 7.646691635455681,
"grad_norm": 0.236328125,
"learning_rate": 0.0016941323345817728,
"loss": 1.3473,
"step": 73500
},
{
"epoch": 7.698709945900957,
"grad_norm": 0.2490234375,
"learning_rate": 0.001692051602163962,
"loss": 1.3487,
"step": 74000
},
{
"epoch": 7.750728256346234,
"grad_norm": 0.349609375,
"learning_rate": 0.0016899708697461506,
"loss": 1.3565,
"step": 74500
},
{
"epoch": 7.802746566791511,
"grad_norm": 0.291015625,
"learning_rate": 0.0016878901373283395,
"loss": 1.348,
"step": 75000
},
{
"epoch": 7.8547648772367875,
"grad_norm": 0.2021484375,
"learning_rate": 0.0016858094049105286,
"loss": 1.3478,
"step": 75500
},
{
"epoch": 7.9067831876820645,
"grad_norm": 0.259765625,
"learning_rate": 0.0016837286724927173,
"loss": 1.3484,
"step": 76000
},
{
"epoch": 7.9588014981273405,
"grad_norm": 0.1943359375,
"learning_rate": 0.0016816479400749065,
"loss": 1.3457,
"step": 76500
},
{
"epoch": 8.0,
"eval_loss": 1.3691484928131104,
"eval_runtime": 1.5204,
"eval_samples_per_second": 657.725,
"eval_steps_per_second": 0.658,
"step": 76896
},
{
"epoch": 8.010819808572618,
"grad_norm": 0.244140625,
"learning_rate": 0.0016795672076570954,
"loss": 1.3419,
"step": 77000
},
{
"epoch": 8.062838119017893,
"grad_norm": 0.271484375,
"learning_rate": 0.0016774864752392843,
"loss": 1.3375,
"step": 77500
},
{
"epoch": 8.11485642946317,
"grad_norm": 0.19921875,
"learning_rate": 0.0016754057428214732,
"loss": 1.3368,
"step": 78000
},
{
"epoch": 8.166874739908447,
"grad_norm": 0.349609375,
"learning_rate": 0.0016733250104036621,
"loss": 1.3385,
"step": 78500
},
{
"epoch": 8.218893050353724,
"grad_norm": 0.28125,
"learning_rate": 0.001671244277985851,
"loss": 1.3329,
"step": 79000
},
{
"epoch": 8.270911360799001,
"grad_norm": 0.462890625,
"learning_rate": 0.0016691635455680402,
"loss": 1.3346,
"step": 79500
},
{
"epoch": 8.322929671244278,
"grad_norm": 0.1943359375,
"learning_rate": 0.0016670828131502289,
"loss": 1.3342,
"step": 80000
},
{
"epoch": 8.374947981689555,
"grad_norm": 0.2099609375,
"learning_rate": 0.0016650020807324178,
"loss": 1.3313,
"step": 80500
},
{
"epoch": 8.426966292134832,
"grad_norm": 0.265625,
"learning_rate": 0.001662921348314607,
"loss": 1.33,
"step": 81000
},
{
"epoch": 8.478984602580109,
"grad_norm": 0.244140625,
"learning_rate": 0.0016608406158967956,
"loss": 1.3321,
"step": 81500
},
{
"epoch": 8.531002913025384,
"grad_norm": 0.2373046875,
"learning_rate": 0.0016587598834789845,
"loss": 1.3322,
"step": 82000
},
{
"epoch": 8.583021223470661,
"grad_norm": 0.2412109375,
"learning_rate": 0.0016566791510611736,
"loss": 1.3354,
"step": 82500
},
{
"epoch": 8.635039533915938,
"grad_norm": 0.26171875,
"learning_rate": 0.0016545984186433626,
"loss": 1.3358,
"step": 83000
},
{
"epoch": 8.687057844361215,
"grad_norm": 0.2109375,
"learning_rate": 0.0016525176862255513,
"loss": 1.3303,
"step": 83500
},
{
"epoch": 8.739076154806492,
"grad_norm": 0.1865234375,
"learning_rate": 0.0016504369538077404,
"loss": 1.3332,
"step": 84000
},
{
"epoch": 8.791094465251769,
"grad_norm": 0.201171875,
"learning_rate": 0.0016483562213899293,
"loss": 1.3337,
"step": 84500
},
{
"epoch": 8.843112775697046,
"grad_norm": 0.6015625,
"learning_rate": 0.0016462754889721182,
"loss": 1.3321,
"step": 85000
},
{
"epoch": 8.895131086142323,
"grad_norm": 0.205078125,
"learning_rate": 0.0016441947565543071,
"loss": 1.3283,
"step": 85500
},
{
"epoch": 8.947149396587598,
"grad_norm": 0.376953125,
"learning_rate": 0.001642114024136496,
"loss": 1.3306,
"step": 86000
},
{
"epoch": 8.999167707032875,
"grad_norm": 0.2099609375,
"learning_rate": 0.0016400332917186852,
"loss": 1.3315,
"step": 86500
},
{
"epoch": 9.0,
"eval_loss": 1.3568580150604248,
"eval_runtime": 1.6522,
"eval_samples_per_second": 605.266,
"eval_steps_per_second": 0.605,
"step": 86508
},
{
"epoch": 9.051186017478152,
"grad_norm": 0.77734375,
"learning_rate": 0.0016379525593008739,
"loss": 1.3232,
"step": 87000
},
{
"epoch": 9.103204327923429,
"grad_norm": 0.2265625,
"learning_rate": 0.0016358718268830628,
"loss": 1.3251,
"step": 87500
},
{
"epoch": 9.155222638368706,
"grad_norm": 0.21484375,
"learning_rate": 0.001633791094465252,
"loss": 1.3295,
"step": 88000
},
{
"epoch": 9.207240948813983,
"grad_norm": 0.25,
"learning_rate": 0.0016317103620474408,
"loss": 1.3261,
"step": 88500
},
{
"epoch": 9.25925925925926,
"grad_norm": 0.171875,
"learning_rate": 0.0016296296296296295,
"loss": 1.3288,
"step": 89000
},
{
"epoch": 9.311277569704536,
"grad_norm": 0.1962890625,
"learning_rate": 0.0016275488972118187,
"loss": 1.3276,
"step": 89500
},
{
"epoch": 9.363295880149813,
"grad_norm": 0.22265625,
"learning_rate": 0.0016254681647940076,
"loss": 1.3257,
"step": 90000
},
{
"epoch": 9.41531419059509,
"grad_norm": 0.26171875,
"learning_rate": 0.0016233874323761963,
"loss": 1.3219,
"step": 90500
},
{
"epoch": 9.467332501040365,
"grad_norm": 0.2470703125,
"learning_rate": 0.0016213066999583854,
"loss": 1.3219,
"step": 91000
},
{
"epoch": 9.519350811485642,
"grad_norm": 0.2109375,
"learning_rate": 0.0016192259675405743,
"loss": 1.3216,
"step": 91500
},
{
"epoch": 9.57136912193092,
"grad_norm": 0.189453125,
"learning_rate": 0.0016171452351227634,
"loss": 1.324,
"step": 92000
},
{
"epoch": 9.623387432376196,
"grad_norm": 0.1943359375,
"learning_rate": 0.0016150645027049521,
"loss": 1.3212,
"step": 92500
},
{
"epoch": 9.675405742821473,
"grad_norm": 0.2041015625,
"learning_rate": 0.001612983770287141,
"loss": 1.3217,
"step": 93000
},
{
"epoch": 9.72742405326675,
"grad_norm": 0.22265625,
"learning_rate": 0.0016109030378693302,
"loss": 1.3219,
"step": 93500
},
{
"epoch": 9.779442363712027,
"grad_norm": 0.2470703125,
"learning_rate": 0.0016088223054515189,
"loss": 1.3219,
"step": 94000
},
{
"epoch": 9.831460674157304,
"grad_norm": 0.2080078125,
"learning_rate": 0.0016067415730337078,
"loss": 1.3188,
"step": 94500
},
{
"epoch": 9.88347898460258,
"grad_norm": 0.2392578125,
"learning_rate": 0.001604660840615897,
"loss": 1.3205,
"step": 95000
},
{
"epoch": 9.935497295047856,
"grad_norm": 1.828125,
"learning_rate": 0.0016025801081980858,
"loss": 1.3238,
"step": 95500
},
{
"epoch": 9.987515605493133,
"grad_norm": 0.3984375,
"learning_rate": 0.0016004993757802745,
"loss": 1.3224,
"step": 96000
},
{
"epoch": 10.0,
"eval_loss": 1.3528562784194946,
"eval_runtime": 1.936,
"eval_samples_per_second": 516.533,
"eval_steps_per_second": 0.517,
"step": 96120
},
{
"epoch": 10.03953391593841,
"grad_norm": 0.2490234375,
"learning_rate": 0.0015984186433624637,
"loss": 1.3174,
"step": 96500
},
{
"epoch": 10.091552226383687,
"grad_norm": 0.404296875,
"learning_rate": 0.0015963379109446526,
"loss": 1.3199,
"step": 97000
},
{
"epoch": 10.143570536828964,
"grad_norm": 0.1982421875,
"learning_rate": 0.0015942571785268413,
"loss": 1.3187,
"step": 97500
},
{
"epoch": 10.19558884727424,
"grad_norm": 0.27734375,
"learning_rate": 0.0015921764461090304,
"loss": 1.3205,
"step": 98000
},
{
"epoch": 10.247607157719518,
"grad_norm": 0.259765625,
"learning_rate": 0.0015900957136912193,
"loss": 1.3201,
"step": 98500
},
{
"epoch": 10.299625468164795,
"grad_norm": 0.65625,
"learning_rate": 0.0015880149812734085,
"loss": 1.3193,
"step": 99000
},
{
"epoch": 10.35164377861007,
"grad_norm": 0.23828125,
"learning_rate": 0.0015859342488555972,
"loss": 1.3181,
"step": 99500
},
{
"epoch": 10.403662089055347,
"grad_norm": 0.23828125,
"learning_rate": 0.001583853516437786,
"loss": 1.3169,
"step": 100000
},
{
"epoch": 10.455680399500624,
"grad_norm": 0.298828125,
"learning_rate": 0.0015817727840199752,
"loss": 1.3162,
"step": 100500
},
{
"epoch": 10.5076987099459,
"grad_norm": 0.458984375,
"learning_rate": 0.0015796920516021641,
"loss": 1.3217,
"step": 101000
},
{
"epoch": 10.559717020391178,
"grad_norm": 0.337890625,
"learning_rate": 0.0015776113191843528,
"loss": 1.3245,
"step": 101500
},
{
"epoch": 10.611735330836455,
"grad_norm": 0.369140625,
"learning_rate": 0.001575530586766542,
"loss": 1.3233,
"step": 102000
},
{
"epoch": 10.663753641281732,
"grad_norm": 0.31640625,
"learning_rate": 0.0015734498543487309,
"loss": 1.3182,
"step": 102500
},
{
"epoch": 10.715771951727008,
"grad_norm": 0.2216796875,
"learning_rate": 0.0015713691219309195,
"loss": 1.3163,
"step": 103000
},
{
"epoch": 10.767790262172285,
"grad_norm": 0.2216796875,
"learning_rate": 0.0015692883895131087,
"loss": 1.3181,
"step": 103500
},
{
"epoch": 10.81980857261756,
"grad_norm": 0.25390625,
"learning_rate": 0.0015672076570952976,
"loss": 1.3171,
"step": 104000
},
{
"epoch": 10.871826883062838,
"grad_norm": 0.255859375,
"learning_rate": 0.0015651269246774865,
"loss": 1.3177,
"step": 104500
},
{
"epoch": 10.923845193508114,
"grad_norm": 0.306640625,
"learning_rate": 0.0015630461922596754,
"loss": 1.3211,
"step": 105000
},
{
"epoch": 10.975863503953391,
"grad_norm": 0.34765625,
"learning_rate": 0.0015609654598418643,
"loss": 1.3183,
"step": 105500
},
{
"epoch": 11.0,
"eval_loss": 1.347601056098938,
"eval_runtime": 1.5374,
"eval_samples_per_second": 650.453,
"eval_steps_per_second": 0.65,
"step": 105732
},
{
"epoch": 11.027881814398668,
"grad_norm": 0.21484375,
"learning_rate": 0.0015588847274240535,
"loss": 1.315,
"step": 106000
},
{
"epoch": 11.079900124843945,
"grad_norm": 0.2138671875,
"learning_rate": 0.0015568039950062422,
"loss": 1.3125,
"step": 106500
},
{
"epoch": 11.131918435289222,
"grad_norm": 0.1865234375,
"learning_rate": 0.001554723262588431,
"loss": 1.3114,
"step": 107000
},
{
"epoch": 11.1839367457345,
"grad_norm": 0.2412109375,
"learning_rate": 0.0015526425301706202,
"loss": 1.3104,
"step": 107500
},
{
"epoch": 11.235955056179776,
"grad_norm": 0.412109375,
"learning_rate": 0.0015505617977528091,
"loss": 1.3109,
"step": 108000
},
{
"epoch": 11.287973366625051,
"grad_norm": 0.2265625,
"learning_rate": 0.0015484810653349978,
"loss": 1.3119,
"step": 108500
},
{
"epoch": 11.339991677070328,
"grad_norm": 0.310546875,
"learning_rate": 0.001546400332917187,
"loss": 1.31,
"step": 109000
},
{
"epoch": 11.392009987515605,
"grad_norm": 0.50390625,
"learning_rate": 0.0015443196004993759,
"loss": 1.3114,
"step": 109500
},
{
"epoch": 11.444028297960882,
"grad_norm": 0.40234375,
"learning_rate": 0.0015422388680815646,
"loss": 1.3133,
"step": 110000
},
{
"epoch": 11.496046608406159,
"grad_norm": 0.2216796875,
"learning_rate": 0.0015401581356637537,
"loss": 1.3129,
"step": 110500
},
{
"epoch": 11.548064918851436,
"grad_norm": 0.267578125,
"learning_rate": 0.0015380774032459426,
"loss": 1.3125,
"step": 111000
},
{
"epoch": 11.600083229296713,
"grad_norm": 0.212890625,
"learning_rate": 0.0015359966708281315,
"loss": 1.312,
"step": 111500
},
{
"epoch": 11.65210153974199,
"grad_norm": 0.19140625,
"learning_rate": 0.0015339159384103204,
"loss": 1.3107,
"step": 112000
},
{
"epoch": 11.704119850187267,
"grad_norm": 0.251953125,
"learning_rate": 0.0015318352059925093,
"loss": 1.3116,
"step": 112500
},
{
"epoch": 11.756138160632542,
"grad_norm": 0.189453125,
"learning_rate": 0.0015297544735746985,
"loss": 1.3104,
"step": 113000
},
{
"epoch": 11.808156471077819,
"grad_norm": 0.353515625,
"learning_rate": 0.0015276737411568874,
"loss": 1.3102,
"step": 113500
},
{
"epoch": 11.860174781523096,
"grad_norm": 0.2314453125,
"learning_rate": 0.001525593008739076,
"loss": 1.309,
"step": 114000
},
{
"epoch": 11.912193091968373,
"grad_norm": 0.2431640625,
"learning_rate": 0.0015235122763212652,
"loss": 1.3096,
"step": 114500
},
{
"epoch": 11.96421140241365,
"grad_norm": 0.9296875,
"learning_rate": 0.0015214315439034541,
"loss": 1.307,
"step": 115000
},
{
"epoch": 12.0,
"eval_loss": 1.3371446132659912,
"eval_runtime": 1.4263,
"eval_samples_per_second": 701.11,
"eval_steps_per_second": 0.701,
"step": 115344
},
{
"epoch": 12.016229712858927,
"grad_norm": 0.185546875,
"learning_rate": 0.0015193508114856428,
"loss": 1.3041,
"step": 115500
},
{
"epoch": 12.068248023304204,
"grad_norm": 0.345703125,
"learning_rate": 0.001517270079067832,
"loss": 1.3025,
"step": 116000
},
{
"epoch": 12.12026633374948,
"grad_norm": 0.232421875,
"learning_rate": 0.0015151893466500209,
"loss": 1.3041,
"step": 116500
},
{
"epoch": 12.172284644194757,
"grad_norm": 0.20703125,
"learning_rate": 0.0015131086142322098,
"loss": 1.3063,
"step": 117000
},
{
"epoch": 12.224302954640033,
"grad_norm": 0.2119140625,
"learning_rate": 0.0015110278818143987,
"loss": 1.3037,
"step": 117500
},
{
"epoch": 12.27632126508531,
"grad_norm": 0.2099609375,
"learning_rate": 0.0015089471493965876,
"loss": 1.3049,
"step": 118000
},
{
"epoch": 12.328339575530586,
"grad_norm": 0.32421875,
"learning_rate": 0.0015068664169787765,
"loss": 1.3033,
"step": 118500
},
{
"epoch": 12.380357885975863,
"grad_norm": 0.1884765625,
"learning_rate": 0.0015047856845609654,
"loss": 1.3032,
"step": 119000
},
{
"epoch": 12.43237619642114,
"grad_norm": 0.3125,
"learning_rate": 0.0015027049521431544,
"loss": 1.3031,
"step": 119500
},
{
"epoch": 12.484394506866417,
"grad_norm": 0.291015625,
"learning_rate": 0.0015006242197253433,
"loss": 1.3031,
"step": 120000
},
{
"epoch": 12.536412817311694,
"grad_norm": 0.90625,
"learning_rate": 0.0014985434873075324,
"loss": 1.304,
"step": 120500
},
{
"epoch": 12.588431127756971,
"grad_norm": 0.2216796875,
"learning_rate": 0.001496462754889721,
"loss": 1.3042,
"step": 121000
},
{
"epoch": 12.640449438202246,
"grad_norm": 0.283203125,
"learning_rate": 0.0014943820224719102,
"loss": 1.3055,
"step": 121500
},
{
"epoch": 12.692467748647523,
"grad_norm": 0.20703125,
"learning_rate": 0.0014923012900540991,
"loss": 1.3061,
"step": 122000
},
{
"epoch": 12.7444860590928,
"grad_norm": 0.390625,
"learning_rate": 0.0014902205576362878,
"loss": 1.304,
"step": 122500
},
{
"epoch": 12.796504369538077,
"grad_norm": 0.408203125,
"learning_rate": 0.001488139825218477,
"loss": 1.3048,
"step": 123000
},
{
"epoch": 12.848522679983354,
"grad_norm": 0.2099609375,
"learning_rate": 0.0014860590928006659,
"loss": 1.3038,
"step": 123500
},
{
"epoch": 12.900540990428631,
"grad_norm": 0.24609375,
"learning_rate": 0.0014839783603828548,
"loss": 1.3029,
"step": 124000
},
{
"epoch": 12.952559300873908,
"grad_norm": 0.2021484375,
"learning_rate": 0.0014818976279650437,
"loss": 1.302,
"step": 124500
},
{
"epoch": 13.0,
"eval_loss": 1.336362600326538,
"eval_runtime": 1.5551,
"eval_samples_per_second": 643.05,
"eval_steps_per_second": 0.643,
"step": 124956
},
{
"epoch": 13.004577611319185,
"grad_norm": 0.27734375,
"learning_rate": 0.0014798168955472326,
"loss": 1.3067,
"step": 125000
},
{
"epoch": 13.056595921764462,
"grad_norm": 0.3125,
"learning_rate": 0.0014777361631294215,
"loss": 1.3017,
"step": 125500
},
{
"epoch": 13.108614232209737,
"grad_norm": 0.28515625,
"learning_rate": 0.0014756554307116107,
"loss": 1.3037,
"step": 126000
},
{
"epoch": 13.160632542655014,
"grad_norm": 0.28515625,
"learning_rate": 0.0014735746982937994,
"loss": 1.3047,
"step": 126500
},
{
"epoch": 13.21265085310029,
"grad_norm": 0.1982421875,
"learning_rate": 0.0014714939658759883,
"loss": 1.3034,
"step": 127000
},
{
"epoch": 13.264669163545568,
"grad_norm": 7.0,
"learning_rate": 0.0014694132334581774,
"loss": 1.303,
"step": 127500
},
{
"epoch": 13.316687473990845,
"grad_norm": 0.2275390625,
"learning_rate": 0.0014673325010403661,
"loss": 1.304,
"step": 128000
},
{
"epoch": 13.368705784436122,
"grad_norm": 0.240234375,
"learning_rate": 0.0014652517686225552,
"loss": 1.3033,
"step": 128500
},
{
"epoch": 13.420724094881399,
"grad_norm": 0.349609375,
"learning_rate": 0.0014631710362047442,
"loss": 1.3014,
"step": 129000
},
{
"epoch": 13.472742405326676,
"grad_norm": 0.2392578125,
"learning_rate": 0.001461090303786933,
"loss": 1.3004,
"step": 129500
},
{
"epoch": 13.524760715771952,
"grad_norm": 0.6875,
"learning_rate": 0.001459009571369122,
"loss": 1.3011,
"step": 130000
},
{
"epoch": 13.576779026217228,
"grad_norm": 0.3359375,
"learning_rate": 0.001456928838951311,
"loss": 1.3005,
"step": 130500
},
{
"epoch": 13.628797336662505,
"grad_norm": 0.1943359375,
"learning_rate": 0.0014548481065334998,
"loss": 1.3022,
"step": 131000
},
{
"epoch": 13.680815647107782,
"grad_norm": 0.2197265625,
"learning_rate": 0.0014527673741156887,
"loss": 1.3013,
"step": 131500
},
{
"epoch": 13.732833957553058,
"grad_norm": 0.1904296875,
"learning_rate": 0.0014506866416978776,
"loss": 1.3005,
"step": 132000
},
{
"epoch": 13.784852267998335,
"grad_norm": 0.2734375,
"learning_rate": 0.0014486059092800666,
"loss": 1.2997,
"step": 132500
},
{
"epoch": 13.836870578443612,
"grad_norm": 0.2412109375,
"learning_rate": 0.0014465251768622557,
"loss": 1.2998,
"step": 133000
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.2421875,
"learning_rate": 0.0014444444444444444,
"loss": 1.3001,
"step": 133500
},
{
"epoch": 13.940907199334166,
"grad_norm": 0.2265625,
"learning_rate": 0.0014423637120266333,
"loss": 1.2998,
"step": 134000
},
{
"epoch": 13.992925509779443,
"grad_norm": 0.455078125,
"learning_rate": 0.0014402829796088224,
"loss": 1.3013,
"step": 134500
},
{
"epoch": 14.0,
"eval_loss": 1.3362102508544922,
"eval_runtime": 1.3748,
"eval_samples_per_second": 727.372,
"eval_steps_per_second": 0.727,
"step": 134568
},
{
"epoch": 14.044943820224718,
"grad_norm": 0.220703125,
"learning_rate": 0.0014382022471910111,
"loss": 1.2953,
"step": 135000
},
{
"epoch": 14.096962130669995,
"grad_norm": 0.2197265625,
"learning_rate": 0.0014361215147732003,
"loss": 1.2956,
"step": 135500
},
{
"epoch": 14.148980441115272,
"grad_norm": 0.3359375,
"learning_rate": 0.0014340407823553892,
"loss": 1.2953,
"step": 136000
},
{
"epoch": 14.20099875156055,
"grad_norm": 0.337890625,
"learning_rate": 0.001431960049937578,
"loss": 1.2959,
"step": 136500
},
{
"epoch": 14.253017062005826,
"grad_norm": 0.2158203125,
"learning_rate": 0.001429879317519767,
"loss": 1.2951,
"step": 137000
},
{
"epoch": 14.305035372451103,
"grad_norm": 0.1953125,
"learning_rate": 0.001427798585101956,
"loss": 1.2948,
"step": 137500
},
{
"epoch": 14.35705368289638,
"grad_norm": 0.412109375,
"learning_rate": 0.0014257178526841448,
"loss": 1.2962,
"step": 138000
},
{
"epoch": 14.409071993341657,
"grad_norm": 0.21484375,
"learning_rate": 0.001423637120266334,
"loss": 1.2941,
"step": 138500
},
{
"epoch": 14.461090303786934,
"grad_norm": 0.205078125,
"learning_rate": 0.0014215563878485226,
"loss": 1.2958,
"step": 139000
},
{
"epoch": 14.513108614232209,
"grad_norm": 0.2255859375,
"learning_rate": 0.0014194756554307116,
"loss": 1.2949,
"step": 139500
},
{
"epoch": 14.565126924677486,
"grad_norm": 0.177734375,
"learning_rate": 0.0014173949230129007,
"loss": 1.2933,
"step": 140000
},
{
"epoch": 14.617145235122763,
"grad_norm": 0.291015625,
"learning_rate": 0.0014153141905950894,
"loss": 1.295,
"step": 140500
},
{
"epoch": 14.66916354556804,
"grad_norm": 0.33984375,
"learning_rate": 0.0014132334581772783,
"loss": 1.2944,
"step": 141000
},
{
"epoch": 14.721181856013317,
"grad_norm": 0.19140625,
"learning_rate": 0.0014111527257594674,
"loss": 1.2923,
"step": 141500
},
{
"epoch": 14.773200166458594,
"grad_norm": 0.271484375,
"learning_rate": 0.0014090719933416563,
"loss": 1.2925,
"step": 142000
},
{
"epoch": 14.82521847690387,
"grad_norm": 0.3828125,
"learning_rate": 0.0014069912609238453,
"loss": 1.2907,
"step": 142500
},
{
"epoch": 14.877236787349148,
"grad_norm": 0.21875,
"learning_rate": 0.0014049105285060342,
"loss": 1.2936,
"step": 143000
},
{
"epoch": 14.929255097794425,
"grad_norm": 0.25390625,
"learning_rate": 0.001402829796088223,
"loss": 1.2927,
"step": 143500
},
{
"epoch": 14.9812734082397,
"grad_norm": 0.419921875,
"learning_rate": 0.001400749063670412,
"loss": 1.2911,
"step": 144000
},
{
"epoch": 15.0,
"eval_loss": 1.3259565830230713,
"eval_runtime": 1.5089,
"eval_samples_per_second": 662.754,
"eval_steps_per_second": 0.663,
"step": 144180
},
{
"epoch": 15.033291718684977,
"grad_norm": 0.2216796875,
"learning_rate": 0.001398668331252601,
"loss": 1.2892,
"step": 144500
},
{
"epoch": 15.085310029130254,
"grad_norm": 0.474609375,
"learning_rate": 0.0013965875988347898,
"loss": 1.2898,
"step": 145000
},
{
"epoch": 15.13732833957553,
"grad_norm": 0.2119140625,
"learning_rate": 0.001394506866416979,
"loss": 1.2916,
"step": 145500
},
{
"epoch": 15.189346650020807,
"grad_norm": 0.2373046875,
"learning_rate": 0.0013924261339991677,
"loss": 1.2906,
"step": 146000
},
{
"epoch": 15.241364960466084,
"grad_norm": 0.294921875,
"learning_rate": 0.0013903454015813566,
"loss": 1.2905,
"step": 146500
},
{
"epoch": 15.293383270911361,
"grad_norm": 0.2734375,
"learning_rate": 0.0013882646691635457,
"loss": 1.2898,
"step": 147000
},
{
"epoch": 15.345401581356638,
"grad_norm": 0.2060546875,
"learning_rate": 0.0013861839367457346,
"loss": 1.2893,
"step": 147500
},
{
"epoch": 15.397419891801913,
"grad_norm": 0.2451171875,
"learning_rate": 0.0013841032043279233,
"loss": 1.2905,
"step": 148000
},
{
"epoch": 15.44943820224719,
"grad_norm": 0.19140625,
"learning_rate": 0.0013820224719101124,
"loss": 1.2899,
"step": 148500
},
{
"epoch": 15.501456512692467,
"grad_norm": 0.2138671875,
"learning_rate": 0.0013799417394923014,
"loss": 1.2913,
"step": 149000
},
{
"epoch": 15.553474823137744,
"grad_norm": 0.2109375,
"learning_rate": 0.0013778610070744903,
"loss": 1.29,
"step": 149500
},
{
"epoch": 15.605493133583021,
"grad_norm": 0.279296875,
"learning_rate": 0.0013757802746566792,
"loss": 1.2906,
"step": 150000
},
{
"epoch": 15.657511444028298,
"grad_norm": 0.224609375,
"learning_rate": 0.001373699542238868,
"loss": 1.289,
"step": 150500
},
{
"epoch": 15.709529754473575,
"grad_norm": 0.33984375,
"learning_rate": 0.0013716188098210572,
"loss": 1.2894,
"step": 151000
},
{
"epoch": 15.761548064918852,
"grad_norm": 0.271484375,
"learning_rate": 0.001369538077403246,
"loss": 1.2876,
"step": 151500
},
{
"epoch": 15.813566375364129,
"grad_norm": 0.201171875,
"learning_rate": 0.0013674573449854348,
"loss": 1.2886,
"step": 152000
},
{
"epoch": 15.865584685809406,
"grad_norm": 0.2158203125,
"learning_rate": 0.001365376612567624,
"loss": 1.2875,
"step": 152500
},
{
"epoch": 15.917602996254681,
"grad_norm": 0.1943359375,
"learning_rate": 0.0013632958801498127,
"loss": 1.2874,
"step": 153000
},
{
"epoch": 15.969621306699958,
"grad_norm": 0.2138671875,
"learning_rate": 0.0013612151477320016,
"loss": 1.2876,
"step": 153500
},
{
"epoch": 16.0,
"eval_loss": 1.3189575672149658,
"eval_runtime": 1.5607,
"eval_samples_per_second": 640.73,
"eval_steps_per_second": 0.641,
"step": 153792
},
{
"epoch": 16.021639617145237,
"grad_norm": 0.20703125,
"learning_rate": 0.0013591344153141907,
"loss": 1.285,
"step": 154000
},
{
"epoch": 16.073657927590514,
"grad_norm": 0.1845703125,
"learning_rate": 0.0013570536828963796,
"loss": 1.2834,
"step": 154500
},
{
"epoch": 16.125676238035787,
"grad_norm": 0.310546875,
"learning_rate": 0.0013549729504785683,
"loss": 1.2828,
"step": 155000
},
{
"epoch": 16.177694548481064,
"grad_norm": 0.291015625,
"learning_rate": 0.0013528922180607575,
"loss": 1.2826,
"step": 155500
},
{
"epoch": 16.22971285892634,
"grad_norm": 0.208984375,
"learning_rate": 0.0013508114856429464,
"loss": 1.2825,
"step": 156000
},
{
"epoch": 16.281731169371618,
"grad_norm": 0.2578125,
"learning_rate": 0.001348730753225135,
"loss": 1.2836,
"step": 156500
},
{
"epoch": 16.333749479816895,
"grad_norm": 0.291015625,
"learning_rate": 0.0013466500208073242,
"loss": 1.2853,
"step": 157000
},
{
"epoch": 16.38576779026217,
"grad_norm": 0.208984375,
"learning_rate": 0.0013445692883895131,
"loss": 1.2859,
"step": 157500
},
{
"epoch": 16.43778610070745,
"grad_norm": 0.267578125,
"learning_rate": 0.0013424885559717022,
"loss": 1.2841,
"step": 158000
},
{
"epoch": 16.489804411152726,
"grad_norm": 0.259765625,
"learning_rate": 0.001340407823553891,
"loss": 1.2834,
"step": 158500
},
{
"epoch": 16.541822721598002,
"grad_norm": 0.197265625,
"learning_rate": 0.0013383270911360799,
"loss": 1.2834,
"step": 159000
},
{
"epoch": 16.59384103204328,
"grad_norm": 0.19140625,
"learning_rate": 0.001336246358718269,
"loss": 1.2834,
"step": 159500
},
{
"epoch": 16.645859342488556,
"grad_norm": 0.29296875,
"learning_rate": 0.001334165626300458,
"loss": 1.2856,
"step": 160000
},
{
"epoch": 16.697877652933833,
"grad_norm": 0.50390625,
"learning_rate": 0.0013320848938826466,
"loss": 1.2829,
"step": 160500
},
{
"epoch": 16.74989596337911,
"grad_norm": 0.2119140625,
"learning_rate": 0.0013300041614648357,
"loss": 1.283,
"step": 161000
},
{
"epoch": 16.801914273824387,
"grad_norm": 0.208984375,
"learning_rate": 0.0013279234290470246,
"loss": 1.2848,
"step": 161500
},
{
"epoch": 16.853932584269664,
"grad_norm": 0.2216796875,
"learning_rate": 0.0013258426966292133,
"loss": 1.2837,
"step": 162000
},
{
"epoch": 16.90595089471494,
"grad_norm": 0.2109375,
"learning_rate": 0.0013237619642114025,
"loss": 1.2824,
"step": 162500
},
{
"epoch": 16.957969205160218,
"grad_norm": 0.2236328125,
"learning_rate": 0.0013216812317935914,
"loss": 1.284,
"step": 163000
},
{
"epoch": 17.0,
"eval_loss": 1.3203132152557373,
"eval_runtime": 1.432,
"eval_samples_per_second": 698.327,
"eval_steps_per_second": 0.698,
"step": 163404
},
{
"epoch": 17.00998751560549,
"grad_norm": 0.33984375,
"learning_rate": 0.0013196004993757803,
"loss": 1.2829,
"step": 163500
},
{
"epoch": 17.06200582605077,
"grad_norm": 0.21875,
"learning_rate": 0.0013175197669579692,
"loss": 1.2788,
"step": 164000
},
{
"epoch": 17.114024136496045,
"grad_norm": 0.25390625,
"learning_rate": 0.0013154390345401581,
"loss": 1.2805,
"step": 164500
},
{
"epoch": 17.166042446941322,
"grad_norm": 0.2275390625,
"learning_rate": 0.0013133583021223473,
"loss": 1.2815,
"step": 165000
},
{
"epoch": 17.2180607573866,
"grad_norm": 0.41015625,
"learning_rate": 0.001311277569704536,
"loss": 1.2817,
"step": 165500
},
{
"epoch": 17.270079067831876,
"grad_norm": 0.345703125,
"learning_rate": 0.0013091968372867249,
"loss": 1.2835,
"step": 166000
},
{
"epoch": 17.322097378277153,
"grad_norm": 0.2216796875,
"learning_rate": 0.001307116104868914,
"loss": 1.2823,
"step": 166500
},
{
"epoch": 17.37411568872243,
"grad_norm": 0.2158203125,
"learning_rate": 0.001305035372451103,
"loss": 1.2831,
"step": 167000
},
{
"epoch": 17.426133999167707,
"grad_norm": 0.2216796875,
"learning_rate": 0.0013029546400332916,
"loss": 1.283,
"step": 167500
},
{
"epoch": 17.478152309612984,
"grad_norm": 0.3984375,
"learning_rate": 0.0013008739076154807,
"loss": 1.2823,
"step": 168000
},
{
"epoch": 17.53017062005826,
"grad_norm": 0.2333984375,
"learning_rate": 0.0012987931751976696,
"loss": 1.2826,
"step": 168500
},
{
"epoch": 17.582188930503538,
"grad_norm": 0.255859375,
"learning_rate": 0.0012967124427798583,
"loss": 1.2823,
"step": 169000
},
{
"epoch": 17.634207240948815,
"grad_norm": 0.373046875,
"learning_rate": 0.0012946317103620475,
"loss": 1.2851,
"step": 169500
},
{
"epoch": 17.68622555139409,
"grad_norm": 0.2138671875,
"learning_rate": 0.0012925509779442364,
"loss": 1.2828,
"step": 170000
},
{
"epoch": 17.73824386183937,
"grad_norm": 0.21875,
"learning_rate": 0.0012904702455264253,
"loss": 1.2842,
"step": 170500
},
{
"epoch": 17.790262172284645,
"grad_norm": 0.2080078125,
"learning_rate": 0.0012883895131086142,
"loss": 1.2839,
"step": 171000
},
{
"epoch": 17.842280482729922,
"grad_norm": 0.25390625,
"learning_rate": 0.0012863087806908031,
"loss": 1.2848,
"step": 171500
},
{
"epoch": 17.8942987931752,
"grad_norm": 1.0,
"learning_rate": 0.0012842280482729923,
"loss": 1.2834,
"step": 172000
},
{
"epoch": 17.946317103620473,
"grad_norm": 0.2099609375,
"learning_rate": 0.0012821473158551812,
"loss": 1.2839,
"step": 172500
},
{
"epoch": 17.99833541406575,
"grad_norm": 0.283203125,
"learning_rate": 0.0012800665834373699,
"loss": 1.2837,
"step": 173000
},
{
"epoch": 18.0,
"eval_loss": 1.3176885843276978,
"eval_runtime": 1.6332,
"eval_samples_per_second": 612.278,
"eval_steps_per_second": 0.612,
"step": 173016
},
{
"epoch": 18.050353724511027,
"grad_norm": 0.7734375,
"learning_rate": 0.001277985851019559,
"loss": 1.2788,
"step": 173500
},
{
"epoch": 18.102372034956304,
"grad_norm": 0.224609375,
"learning_rate": 0.001275905118601748,
"loss": 1.28,
"step": 174000
},
{
"epoch": 18.15439034540158,
"grad_norm": 0.251953125,
"learning_rate": 0.0012738243861839366,
"loss": 1.2797,
"step": 174500
},
{
"epoch": 18.206408655846857,
"grad_norm": 0.2890625,
"learning_rate": 0.0012717436537661257,
"loss": 1.2813,
"step": 175000
},
{
"epoch": 18.258426966292134,
"grad_norm": 0.205078125,
"learning_rate": 0.0012696629213483147,
"loss": 1.2816,
"step": 175500
},
{
"epoch": 18.31044527673741,
"grad_norm": 0.21484375,
"learning_rate": 0.0012675821889305036,
"loss": 1.282,
"step": 176000
},
{
"epoch": 18.36246358718269,
"grad_norm": 0.62109375,
"learning_rate": 0.0012655014565126925,
"loss": 1.2821,
"step": 176500
},
{
"epoch": 18.414481897627965,
"grad_norm": 0.2353515625,
"learning_rate": 0.0012634207240948814,
"loss": 1.2827,
"step": 177000
},
{
"epoch": 18.466500208073242,
"grad_norm": 0.2197265625,
"learning_rate": 0.0012613399916770703,
"loss": 1.2802,
"step": 177500
},
{
"epoch": 18.51851851851852,
"grad_norm": 0.181640625,
"learning_rate": 0.0012592592592592592,
"loss": 1.2803,
"step": 178000
},
{
"epoch": 18.570536828963796,
"grad_norm": 0.353515625,
"learning_rate": 0.0012571785268414481,
"loss": 1.2808,
"step": 178500
},
{
"epoch": 18.622555139409073,
"grad_norm": 0.1865234375,
"learning_rate": 0.0012550977944236373,
"loss": 1.2795,
"step": 179000
},
{
"epoch": 18.67457344985435,
"grad_norm": 0.361328125,
"learning_rate": 0.0012530170620058262,
"loss": 1.2789,
"step": 179500
},
{
"epoch": 18.726591760299627,
"grad_norm": 0.2158203125,
"learning_rate": 0.0012509363295880149,
"loss": 1.2805,
"step": 180000
},
{
"epoch": 18.778610070744904,
"grad_norm": 1.640625,
"learning_rate": 0.001248855597170204,
"loss": 1.2808,
"step": 180500
},
{
"epoch": 18.83062838119018,
"grad_norm": 0.25390625,
"learning_rate": 0.001246774864752393,
"loss": 1.2798,
"step": 181000
},
{
"epoch": 18.882646691635454,
"grad_norm": 0.2490234375,
"learning_rate": 0.0012446941323345816,
"loss": 1.2794,
"step": 181500
},
{
"epoch": 18.93466500208073,
"grad_norm": 0.19140625,
"learning_rate": 0.0012426133999167708,
"loss": 1.2801,
"step": 182000
},
{
"epoch": 18.986683312526008,
"grad_norm": 0.181640625,
"learning_rate": 0.0012405326674989597,
"loss": 1.2823,
"step": 182500
},
{
"epoch": 19.0,
"eval_loss": 1.3176276683807373,
"eval_runtime": 1.3968,
"eval_samples_per_second": 715.946,
"eval_steps_per_second": 0.716,
"step": 182628
},
{
"epoch": 19.038701622971285,
"grad_norm": 0.203125,
"learning_rate": 0.0012384519350811486,
"loss": 1.2802,
"step": 183000
},
{
"epoch": 19.090719933416562,
"grad_norm": 0.326171875,
"learning_rate": 0.0012363712026633375,
"loss": 1.2782,
"step": 183500
},
{
"epoch": 19.14273824386184,
"grad_norm": 0.2216796875,
"learning_rate": 0.0012342904702455264,
"loss": 1.2769,
"step": 184000
},
{
"epoch": 19.194756554307116,
"grad_norm": 0.2060546875,
"learning_rate": 0.0012322097378277153,
"loss": 1.2789,
"step": 184500
},
{
"epoch": 19.246774864752393,
"grad_norm": 0.24609375,
"learning_rate": 0.0012301290054099045,
"loss": 1.279,
"step": 185000
},
{
"epoch": 19.29879317519767,
"grad_norm": 0.2080078125,
"learning_rate": 0.0012280482729920932,
"loss": 1.2813,
"step": 185500
},
{
"epoch": 19.350811485642947,
"grad_norm": 0.1923828125,
"learning_rate": 0.0012259675405742823,
"loss": 1.2811,
"step": 186000
},
{
"epoch": 19.402829796088223,
"grad_norm": 0.2060546875,
"learning_rate": 0.0012238868081564712,
"loss": 1.2826,
"step": 186500
},
{
"epoch": 19.4548481065335,
"grad_norm": 0.27734375,
"learning_rate": 0.00122180607573866,
"loss": 1.2811,
"step": 187000
},
{
"epoch": 19.506866416978777,
"grad_norm": 0.2236328125,
"learning_rate": 0.001219725343320849,
"loss": 1.2814,
"step": 187500
},
{
"epoch": 19.558884727424054,
"grad_norm": 0.41015625,
"learning_rate": 0.001217644610903038,
"loss": 1.2794,
"step": 188000
},
{
"epoch": 19.61090303786933,
"grad_norm": 0.2041015625,
"learning_rate": 0.0012155638784852269,
"loss": 1.2801,
"step": 188500
},
{
"epoch": 19.662921348314608,
"grad_norm": 0.435546875,
"learning_rate": 0.0012134831460674158,
"loss": 1.2796,
"step": 189000
},
{
"epoch": 19.714939658759885,
"grad_norm": 0.1962890625,
"learning_rate": 0.0012114024136496047,
"loss": 1.2785,
"step": 189500
},
{
"epoch": 19.76695796920516,
"grad_norm": 5.6875,
"learning_rate": 0.0012093216812317936,
"loss": 1.2783,
"step": 190000
},
{
"epoch": 19.818976279650435,
"grad_norm": 0.205078125,
"learning_rate": 0.0012072409488139825,
"loss": 1.2782,
"step": 190500
},
{
"epoch": 19.870994590095712,
"grad_norm": 3.21875,
"learning_rate": 0.0012051602163961714,
"loss": 1.2792,
"step": 191000
},
{
"epoch": 19.92301290054099,
"grad_norm": 0.2119140625,
"learning_rate": 0.0012030794839783603,
"loss": 1.2803,
"step": 191500
},
{
"epoch": 19.975031210986266,
"grad_norm": 0.419921875,
"learning_rate": 0.0012009987515605495,
"loss": 1.2778,
"step": 192000
},
{
"epoch": 20.0,
"eval_loss": 1.3075087070465088,
"eval_runtime": 1.6065,
"eval_samples_per_second": 622.457,
"eval_steps_per_second": 0.622,
"step": 192240
},
{
"epoch": 20.027049521431543,
"grad_norm": 0.296875,
"learning_rate": 0.0011989180191427382,
"loss": 1.2763,
"step": 192500
},
{
"epoch": 20.07906783187682,
"grad_norm": 0.26171875,
"learning_rate": 0.0011968372867249273,
"loss": 1.2755,
"step": 193000
},
{
"epoch": 20.131086142322097,
"grad_norm": 0.220703125,
"learning_rate": 0.0011947565543071162,
"loss": 1.2754,
"step": 193500
},
{
"epoch": 20.183104452767374,
"grad_norm": 0.1875,
"learning_rate": 0.001192675821889305,
"loss": 1.2756,
"step": 194000
},
{
"epoch": 20.23512276321265,
"grad_norm": 0.2734375,
"learning_rate": 0.001190595089471494,
"loss": 1.278,
"step": 194500
},
{
"epoch": 20.287141073657928,
"grad_norm": 0.1767578125,
"learning_rate": 0.001188514357053683,
"loss": 1.2766,
"step": 195000
},
{
"epoch": 20.339159384103205,
"grad_norm": 0.466796875,
"learning_rate": 0.0011864336246358719,
"loss": 1.2769,
"step": 195500
},
{
"epoch": 20.39117769454848,
"grad_norm": 0.189453125,
"learning_rate": 0.0011843528922180608,
"loss": 1.2788,
"step": 196000
},
{
"epoch": 20.44319600499376,
"grad_norm": 0.20703125,
"learning_rate": 0.0011822721598002497,
"loss": 1.279,
"step": 196500
},
{
"epoch": 20.495214315439036,
"grad_norm": 0.17578125,
"learning_rate": 0.0011801914273824386,
"loss": 1.2782,
"step": 197000
},
{
"epoch": 20.547232625884313,
"grad_norm": 0.25390625,
"learning_rate": 0.0011781106949646277,
"loss": 1.2773,
"step": 197500
},
{
"epoch": 20.59925093632959,
"grad_norm": 0.22265625,
"learning_rate": 0.0011760299625468164,
"loss": 1.279,
"step": 198000
},
{
"epoch": 20.651269246774866,
"grad_norm": 0.265625,
"learning_rate": 0.0011739492301290053,
"loss": 1.2786,
"step": 198500
},
{
"epoch": 20.70328755722014,
"grad_norm": 0.296875,
"learning_rate": 0.0011718684977111945,
"loss": 1.2788,
"step": 199000
},
{
"epoch": 20.755305867665417,
"grad_norm": 0.8046875,
"learning_rate": 0.0011697877652933832,
"loss": 1.2781,
"step": 199500
},
{
"epoch": 20.807324178110694,
"grad_norm": 0.2314453125,
"learning_rate": 0.001167707032875572,
"loss": 1.2789,
"step": 200000
},
{
"epoch": 20.85934248855597,
"grad_norm": 0.23046875,
"learning_rate": 0.0011656263004577612,
"loss": 1.2809,
"step": 200500
},
{
"epoch": 20.911360799001248,
"grad_norm": 0.21875,
"learning_rate": 0.0011635455680399501,
"loss": 1.2778,
"step": 201000
},
{
"epoch": 20.963379109446524,
"grad_norm": 0.265625,
"learning_rate": 0.001161464835622139,
"loss": 1.2777,
"step": 201500
},
{
"epoch": 21.0,
"eval_loss": 1.3104900121688843,
"eval_runtime": 1.3896,
"eval_samples_per_second": 719.624,
"eval_steps_per_second": 0.72,
"step": 201852
},
{
"epoch": 21.0153974198918,
"grad_norm": 0.2265625,
"learning_rate": 0.001159384103204328,
"loss": 1.2777,
"step": 202000
},
{
"epoch": 21.06741573033708,
"grad_norm": 0.173828125,
"learning_rate": 0.0011573033707865169,
"loss": 1.2763,
"step": 202500
},
{
"epoch": 21.119434040782355,
"grad_norm": 0.2353515625,
"learning_rate": 0.0011552226383687058,
"loss": 1.2753,
"step": 203000
},
{
"epoch": 21.171452351227632,
"grad_norm": 0.1875,
"learning_rate": 0.0011531419059508947,
"loss": 1.2752,
"step": 203500
},
{
"epoch": 21.22347066167291,
"grad_norm": 0.251953125,
"learning_rate": 0.0011510611735330836,
"loss": 1.2754,
"step": 204000
},
{
"epoch": 21.275488972118186,
"grad_norm": 0.251953125,
"learning_rate": 0.0011489804411152727,
"loss": 1.2756,
"step": 204500
},
{
"epoch": 21.327507282563463,
"grad_norm": 0.2412109375,
"learning_rate": 0.0011468997086974614,
"loss": 1.2754,
"step": 205000
},
{
"epoch": 21.37952559300874,
"grad_norm": 0.208984375,
"learning_rate": 0.0011448189762796504,
"loss": 1.2753,
"step": 205500
},
{
"epoch": 21.431543903454017,
"grad_norm": 0.361328125,
"learning_rate": 0.0011427382438618395,
"loss": 1.2757,
"step": 206000
},
{
"epoch": 21.483562213899294,
"grad_norm": 0.19140625,
"learning_rate": 0.0011406575114440284,
"loss": 1.2755,
"step": 206500
},
{
"epoch": 21.53558052434457,
"grad_norm": 0.248046875,
"learning_rate": 0.001138576779026217,
"loss": 1.2764,
"step": 207000
},
{
"epoch": 21.587598834789844,
"grad_norm": 0.1953125,
"learning_rate": 0.0011364960466084062,
"loss": 1.2765,
"step": 207500
},
{
"epoch": 21.63961714523512,
"grad_norm": 0.48828125,
"learning_rate": 0.0011344153141905951,
"loss": 1.2756,
"step": 208000
},
{
"epoch": 21.691635455680398,
"grad_norm": 0.1796875,
"learning_rate": 0.001132334581772784,
"loss": 1.2772,
"step": 208500
},
{
"epoch": 21.743653766125675,
"grad_norm": 0.30859375,
"learning_rate": 0.001130253849354973,
"loss": 1.2757,
"step": 209000
},
{
"epoch": 21.795672076570952,
"grad_norm": 2.46875,
"learning_rate": 0.0011281731169371619,
"loss": 1.2767,
"step": 209500
},
{
"epoch": 21.84769038701623,
"grad_norm": 0.2431640625,
"learning_rate": 0.001126092384519351,
"loss": 1.2741,
"step": 210000
},
{
"epoch": 21.899708697461506,
"grad_norm": 0.94921875,
"learning_rate": 0.0011240116521015397,
"loss": 1.277,
"step": 210500
},
{
"epoch": 21.951727007906783,
"grad_norm": 0.1923828125,
"learning_rate": 0.0011219309196837286,
"loss": 1.2761,
"step": 211000
},
{
"epoch": 22.0,
"eval_loss": 1.309814453125,
"eval_runtime": 1.7271,
"eval_samples_per_second": 578.995,
"eval_steps_per_second": 0.579,
"step": 211464
},
{
"epoch": 22.00374531835206,
"grad_norm": 0.212890625,
"learning_rate": 0.0011198501872659178,
"loss": 1.2756,
"step": 211500
},
{
"epoch": 22.055763628797337,
"grad_norm": 0.259765625,
"learning_rate": 0.0011177694548481065,
"loss": 1.2751,
"step": 212000
},
{
"epoch": 22.107781939242614,
"grad_norm": 0.181640625,
"learning_rate": 0.0011156887224302954,
"loss": 1.2746,
"step": 212500
},
{
"epoch": 22.15980024968789,
"grad_norm": 0.279296875,
"learning_rate": 0.0011136079900124845,
"loss": 1.2754,
"step": 213000
},
{
"epoch": 22.211818560133167,
"grad_norm": 0.259765625,
"learning_rate": 0.0011115272575946734,
"loss": 1.2767,
"step": 213500
},
{
"epoch": 22.263836870578444,
"grad_norm": 0.2392578125,
"learning_rate": 0.0011094465251768621,
"loss": 1.2737,
"step": 214000
},
{
"epoch": 22.31585518102372,
"grad_norm": 0.2734375,
"learning_rate": 0.0011073657927590512,
"loss": 1.2737,
"step": 214500
},
{
"epoch": 22.367873491469,
"grad_norm": 0.33984375,
"learning_rate": 0.0011052850603412402,
"loss": 1.2735,
"step": 215000
},
{
"epoch": 22.419891801914275,
"grad_norm": 0.251953125,
"learning_rate": 0.001103204327923429,
"loss": 1.2733,
"step": 215500
},
{
"epoch": 22.471910112359552,
"grad_norm": 0.373046875,
"learning_rate": 0.001101123595505618,
"loss": 1.2723,
"step": 216000
},
{
"epoch": 22.52392842280483,
"grad_norm": 0.2216796875,
"learning_rate": 0.001099042863087807,
"loss": 1.274,
"step": 216500
},
{
"epoch": 22.575946733250102,
"grad_norm": 0.267578125,
"learning_rate": 0.001096962130669996,
"loss": 1.2744,
"step": 217000
},
{
"epoch": 22.62796504369538,
"grad_norm": 0.1953125,
"learning_rate": 0.0010948813982521847,
"loss": 1.2758,
"step": 217500
},
{
"epoch": 22.679983354140656,
"grad_norm": 0.28125,
"learning_rate": 0.0010928006658343736,
"loss": 1.2755,
"step": 218000
},
{
"epoch": 22.732001664585933,
"grad_norm": 0.23828125,
"learning_rate": 0.0010907199334165628,
"loss": 1.2742,
"step": 218500
},
{
"epoch": 22.78401997503121,
"grad_norm": 0.2109375,
"learning_rate": 0.0010886392009987517,
"loss": 1.2741,
"step": 219000
},
{
"epoch": 22.836038285476487,
"grad_norm": 0.251953125,
"learning_rate": 0.0010865584685809404,
"loss": 1.2739,
"step": 219500
},
{
"epoch": 22.888056595921764,
"grad_norm": 0.166015625,
"learning_rate": 0.0010844777361631295,
"loss": 1.2747,
"step": 220000
},
{
"epoch": 22.94007490636704,
"grad_norm": 0.216796875,
"learning_rate": 0.0010823970037453184,
"loss": 1.2733,
"step": 220500
},
{
"epoch": 22.992093216812318,
"grad_norm": 0.197265625,
"learning_rate": 0.0010803162713275071,
"loss": 1.2734,
"step": 221000
},
{
"epoch": 23.0,
"eval_loss": 1.3059455156326294,
"eval_runtime": 1.6804,
"eval_samples_per_second": 595.106,
"eval_steps_per_second": 0.595,
"step": 221076
},
{
"epoch": 23.044111527257595,
"grad_norm": 0.2060546875,
"learning_rate": 0.0010782355389096963,
"loss": 1.2717,
"step": 221500
},
{
"epoch": 23.096129837702872,
"grad_norm": 0.2021484375,
"learning_rate": 0.0010761548064918852,
"loss": 1.2719,
"step": 222000
},
{
"epoch": 23.14814814814815,
"grad_norm": 0.1796875,
"learning_rate": 0.0010740740740740743,
"loss": 1.2732,
"step": 222500
},
{
"epoch": 23.200166458593426,
"grad_norm": 0.318359375,
"learning_rate": 0.001071993341656263,
"loss": 1.2743,
"step": 223000
},
{
"epoch": 23.252184769038703,
"grad_norm": 0.87890625,
"learning_rate": 0.001069912609238452,
"loss": 1.2724,
"step": 223500
},
{
"epoch": 23.30420307948398,
"grad_norm": 0.423828125,
"learning_rate": 0.001067831876820641,
"loss": 1.271,
"step": 224000
},
{
"epoch": 23.356221389929257,
"grad_norm": 0.25390625,
"learning_rate": 0.0010657511444028297,
"loss": 1.2721,
"step": 224500
},
{
"epoch": 23.408239700374533,
"grad_norm": 0.1962890625,
"learning_rate": 0.0010636704119850186,
"loss": 1.2716,
"step": 225000
},
{
"epoch": 23.460258010819807,
"grad_norm": 0.294921875,
"learning_rate": 0.0010615896795672078,
"loss": 1.2714,
"step": 225500
},
{
"epoch": 23.512276321265084,
"grad_norm": 0.29296875,
"learning_rate": 0.0010595089471493967,
"loss": 1.2719,
"step": 226000
},
{
"epoch": 23.56429463171036,
"grad_norm": 0.189453125,
"learning_rate": 0.0010574282147315854,
"loss": 1.2716,
"step": 226500
},
{
"epoch": 23.616312942155638,
"grad_norm": 0.2138671875,
"learning_rate": 0.0010553474823137745,
"loss": 1.272,
"step": 227000
},
{
"epoch": 23.668331252600915,
"grad_norm": 0.890625,
"learning_rate": 0.0010532667498959634,
"loss": 1.2729,
"step": 227500
},
{
"epoch": 23.72034956304619,
"grad_norm": 0.2470703125,
"learning_rate": 0.0010511860174781521,
"loss": 1.2728,
"step": 228000
},
{
"epoch": 23.77236787349147,
"grad_norm": 0.251953125,
"learning_rate": 0.0010491052850603413,
"loss": 1.2733,
"step": 228500
},
{
"epoch": 23.824386183936745,
"grad_norm": 0.1806640625,
"learning_rate": 0.0010470245526425302,
"loss": 1.2713,
"step": 229000
},
{
"epoch": 23.876404494382022,
"grad_norm": 0.40234375,
"learning_rate": 0.0010449438202247193,
"loss": 1.2714,
"step": 229500
},
{
"epoch": 23.9284228048273,
"grad_norm": 0.30078125,
"learning_rate": 0.001042863087806908,
"loss": 1.2724,
"step": 230000
},
{
"epoch": 23.980441115272576,
"grad_norm": 0.2197265625,
"learning_rate": 0.001040782355389097,
"loss": 1.2716,
"step": 230500
},
{
"epoch": 24.0,
"eval_loss": 1.305156946182251,
"eval_runtime": 2.3203,
"eval_samples_per_second": 430.983,
"eval_steps_per_second": 0.431,
"step": 230688
}
],
"logging_steps": 500,
"max_steps": 480600,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.304114629940989e+19,
"train_batch_size": 1024,
"trial_name": null,
"trial_params": null
}