swin-tiny-patch4_eurosat / trainer_state.json
suredream's picture
End of training
87d12c8 verified
raw
history blame contribute delete
No virus
178 kB
{
"best_metric": 0.060996126383543015,
"best_model_checkpoint": "./eurosat_outpus/checkpoint-10125",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 10125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0049382716049382715,
"grad_norm": 38.450260162353516,
"learning_rate": 1.9980246913580248e-05,
"loss": 0.1979,
"step": 10
},
{
"epoch": 0.009876543209876543,
"grad_norm": 22.966312408447266,
"learning_rate": 1.9960493827160498e-05,
"loss": 0.3363,
"step": 20
},
{
"epoch": 0.014814814814814815,
"grad_norm": 73.729248046875,
"learning_rate": 1.9940740740740744e-05,
"loss": 0.323,
"step": 30
},
{
"epoch": 0.019753086419753086,
"grad_norm": 58.143798828125,
"learning_rate": 1.992098765432099e-05,
"loss": 0.3155,
"step": 40
},
{
"epoch": 0.024691358024691357,
"grad_norm": 38.21614074707031,
"learning_rate": 1.9901234567901237e-05,
"loss": 0.1625,
"step": 50
},
{
"epoch": 0.02962962962962963,
"grad_norm": 9.119422912597656,
"learning_rate": 1.9881481481481483e-05,
"loss": 0.4599,
"step": 60
},
{
"epoch": 0.0345679012345679,
"grad_norm": 0.6812440156936646,
"learning_rate": 1.986172839506173e-05,
"loss": 0.1372,
"step": 70
},
{
"epoch": 0.03950617283950617,
"grad_norm": 17.003955841064453,
"learning_rate": 1.9841975308641976e-05,
"loss": 0.184,
"step": 80
},
{
"epoch": 0.044444444444444446,
"grad_norm": 4.515798568725586,
"learning_rate": 1.9822222222222226e-05,
"loss": 0.327,
"step": 90
},
{
"epoch": 0.04938271604938271,
"grad_norm": 134.1114959716797,
"learning_rate": 1.9802469135802472e-05,
"loss": 0.1908,
"step": 100
},
{
"epoch": 0.05432098765432099,
"grad_norm": 61.6785774230957,
"learning_rate": 1.978271604938272e-05,
"loss": 0.4477,
"step": 110
},
{
"epoch": 0.05925925925925926,
"grad_norm": 0.12833823263645172,
"learning_rate": 1.9762962962962965e-05,
"loss": 0.2536,
"step": 120
},
{
"epoch": 0.06419753086419754,
"grad_norm": 108.99272155761719,
"learning_rate": 1.974320987654321e-05,
"loss": 0.3968,
"step": 130
},
{
"epoch": 0.0691358024691358,
"grad_norm": 35.6202507019043,
"learning_rate": 1.9723456790123458e-05,
"loss": 0.268,
"step": 140
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.6642296314239502,
"learning_rate": 1.9703703703703704e-05,
"loss": 0.2327,
"step": 150
},
{
"epoch": 0.07901234567901234,
"grad_norm": 110.14276123046875,
"learning_rate": 1.968395061728395e-05,
"loss": 0.5562,
"step": 160
},
{
"epoch": 0.08395061728395062,
"grad_norm": 78.44914245605469,
"learning_rate": 1.96641975308642e-05,
"loss": 0.2922,
"step": 170
},
{
"epoch": 0.08888888888888889,
"grad_norm": 63.069766998291016,
"learning_rate": 1.9644444444444447e-05,
"loss": 0.3735,
"step": 180
},
{
"epoch": 0.09382716049382717,
"grad_norm": 246.51309204101562,
"learning_rate": 1.9624691358024693e-05,
"loss": 0.3787,
"step": 190
},
{
"epoch": 0.09876543209876543,
"grad_norm": 30.573638916015625,
"learning_rate": 1.960493827160494e-05,
"loss": 0.3193,
"step": 200
},
{
"epoch": 0.1037037037037037,
"grad_norm": 0.3898200988769531,
"learning_rate": 1.9585185185185186e-05,
"loss": 0.2929,
"step": 210
},
{
"epoch": 0.10864197530864197,
"grad_norm": 85.34624481201172,
"learning_rate": 1.9565432098765432e-05,
"loss": 0.3892,
"step": 220
},
{
"epoch": 0.11358024691358025,
"grad_norm": 2.596446990966797,
"learning_rate": 1.954567901234568e-05,
"loss": 0.2047,
"step": 230
},
{
"epoch": 0.11851851851851852,
"grad_norm": 16.085424423217773,
"learning_rate": 1.952592592592593e-05,
"loss": 0.3302,
"step": 240
},
{
"epoch": 0.12345679012345678,
"grad_norm": 49.19840621948242,
"learning_rate": 1.9506172839506175e-05,
"loss": 0.1766,
"step": 250
},
{
"epoch": 0.12839506172839507,
"grad_norm": 49.17105484008789,
"learning_rate": 1.948641975308642e-05,
"loss": 0.3533,
"step": 260
},
{
"epoch": 0.13333333333333333,
"grad_norm": 31.642642974853516,
"learning_rate": 1.9466666666666668e-05,
"loss": 0.262,
"step": 270
},
{
"epoch": 0.1382716049382716,
"grad_norm": 49.4565544128418,
"learning_rate": 1.9446913580246914e-05,
"loss": 0.3649,
"step": 280
},
{
"epoch": 0.14320987654320988,
"grad_norm": 33.3835563659668,
"learning_rate": 1.942716049382716e-05,
"loss": 0.2047,
"step": 290
},
{
"epoch": 0.14814814814814814,
"grad_norm": 30.190998077392578,
"learning_rate": 1.9407407407407407e-05,
"loss": 0.2901,
"step": 300
},
{
"epoch": 0.15308641975308643,
"grad_norm": 73.48704528808594,
"learning_rate": 1.9387654320987657e-05,
"loss": 0.399,
"step": 310
},
{
"epoch": 0.1580246913580247,
"grad_norm": 2.583846092224121,
"learning_rate": 1.9367901234567903e-05,
"loss": 0.3736,
"step": 320
},
{
"epoch": 0.16296296296296298,
"grad_norm": 100.03057861328125,
"learning_rate": 1.934814814814815e-05,
"loss": 0.2624,
"step": 330
},
{
"epoch": 0.16790123456790124,
"grad_norm": 0.5729751586914062,
"learning_rate": 1.93283950617284e-05,
"loss": 0.4662,
"step": 340
},
{
"epoch": 0.1728395061728395,
"grad_norm": 133.75845336914062,
"learning_rate": 1.9308641975308646e-05,
"loss": 0.4739,
"step": 350
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.10531154274940491,
"learning_rate": 1.928888888888889e-05,
"loss": 0.2592,
"step": 360
},
{
"epoch": 0.18271604938271604,
"grad_norm": 4.886446952819824,
"learning_rate": 1.9269135802469135e-05,
"loss": 0.2601,
"step": 370
},
{
"epoch": 0.18765432098765433,
"grad_norm": 20.537151336669922,
"learning_rate": 1.9249382716049385e-05,
"loss": 0.1298,
"step": 380
},
{
"epoch": 0.1925925925925926,
"grad_norm": 101.08270263671875,
"learning_rate": 1.922962962962963e-05,
"loss": 0.2034,
"step": 390
},
{
"epoch": 0.19753086419753085,
"grad_norm": 101.60489654541016,
"learning_rate": 1.9209876543209878e-05,
"loss": 0.5103,
"step": 400
},
{
"epoch": 0.20246913580246914,
"grad_norm": 4.052034854888916,
"learning_rate": 1.9190123456790124e-05,
"loss": 0.3028,
"step": 410
},
{
"epoch": 0.2074074074074074,
"grad_norm": 21.401437759399414,
"learning_rate": 1.9170370370370374e-05,
"loss": 0.4247,
"step": 420
},
{
"epoch": 0.2123456790123457,
"grad_norm": 24.329212188720703,
"learning_rate": 1.915061728395062e-05,
"loss": 0.3295,
"step": 430
},
{
"epoch": 0.21728395061728395,
"grad_norm": 6.972232341766357,
"learning_rate": 1.9130864197530867e-05,
"loss": 0.1283,
"step": 440
},
{
"epoch": 0.2222222222222222,
"grad_norm": 2.0192437171936035,
"learning_rate": 1.9111111111111113e-05,
"loss": 0.3414,
"step": 450
},
{
"epoch": 0.2271604938271605,
"grad_norm": 209.81227111816406,
"learning_rate": 1.909135802469136e-05,
"loss": 0.418,
"step": 460
},
{
"epoch": 0.23209876543209876,
"grad_norm": 64.07678985595703,
"learning_rate": 1.9071604938271606e-05,
"loss": 0.4821,
"step": 470
},
{
"epoch": 0.23703703703703705,
"grad_norm": 6.7498087882995605,
"learning_rate": 1.9051851851851852e-05,
"loss": 0.5004,
"step": 480
},
{
"epoch": 0.2419753086419753,
"grad_norm": 67.15694427490234,
"learning_rate": 1.9032098765432102e-05,
"loss": 0.2125,
"step": 490
},
{
"epoch": 0.24691358024691357,
"grad_norm": 49.803070068359375,
"learning_rate": 1.901234567901235e-05,
"loss": 0.2238,
"step": 500
},
{
"epoch": 0.2518518518518518,
"grad_norm": 76.97516632080078,
"learning_rate": 1.8992592592592595e-05,
"loss": 0.6817,
"step": 510
},
{
"epoch": 0.25679012345679014,
"grad_norm": 45.78963088989258,
"learning_rate": 1.897283950617284e-05,
"loss": 0.2176,
"step": 520
},
{
"epoch": 0.2617283950617284,
"grad_norm": 1.871187448501587,
"learning_rate": 1.8953086419753087e-05,
"loss": 0.0952,
"step": 530
},
{
"epoch": 0.26666666666666666,
"grad_norm": 52.718875885009766,
"learning_rate": 1.8933333333333334e-05,
"loss": 0.306,
"step": 540
},
{
"epoch": 0.2716049382716049,
"grad_norm": 23.83916473388672,
"learning_rate": 1.891358024691358e-05,
"loss": 0.306,
"step": 550
},
{
"epoch": 0.2765432098765432,
"grad_norm": 38.469512939453125,
"learning_rate": 1.889382716049383e-05,
"loss": 0.3775,
"step": 560
},
{
"epoch": 0.2814814814814815,
"grad_norm": 71.19271087646484,
"learning_rate": 1.8874074074074076e-05,
"loss": 0.2412,
"step": 570
},
{
"epoch": 0.28641975308641976,
"grad_norm": 10.515379905700684,
"learning_rate": 1.8854320987654323e-05,
"loss": 0.3623,
"step": 580
},
{
"epoch": 0.291358024691358,
"grad_norm": 56.489166259765625,
"learning_rate": 1.883456790123457e-05,
"loss": 0.2472,
"step": 590
},
{
"epoch": 0.2962962962962963,
"grad_norm": 11.128917694091797,
"learning_rate": 1.8814814814814816e-05,
"loss": 0.512,
"step": 600
},
{
"epoch": 0.3012345679012346,
"grad_norm": 2.7650094032287598,
"learning_rate": 1.8795061728395062e-05,
"loss": 0.518,
"step": 610
},
{
"epoch": 0.30617283950617286,
"grad_norm": 55.65047073364258,
"learning_rate": 1.877530864197531e-05,
"loss": 0.2817,
"step": 620
},
{
"epoch": 0.3111111111111111,
"grad_norm": 8.692935943603516,
"learning_rate": 1.8755555555555558e-05,
"loss": 0.2106,
"step": 630
},
{
"epoch": 0.3160493827160494,
"grad_norm": 2.446716785430908,
"learning_rate": 1.8735802469135805e-05,
"loss": 0.2604,
"step": 640
},
{
"epoch": 0.32098765432098764,
"grad_norm": 12.735766410827637,
"learning_rate": 1.871604938271605e-05,
"loss": 0.5116,
"step": 650
},
{
"epoch": 0.32592592592592595,
"grad_norm": 1.8498376607894897,
"learning_rate": 1.8696296296296297e-05,
"loss": 0.0587,
"step": 660
},
{
"epoch": 0.3308641975308642,
"grad_norm": 0.5433443188667297,
"learning_rate": 1.8676543209876544e-05,
"loss": 0.5793,
"step": 670
},
{
"epoch": 0.3358024691358025,
"grad_norm": 0.06385264545679092,
"learning_rate": 1.865679012345679e-05,
"loss": 0.2238,
"step": 680
},
{
"epoch": 0.34074074074074073,
"grad_norm": 129.50604248046875,
"learning_rate": 1.8637037037037037e-05,
"loss": 0.4826,
"step": 690
},
{
"epoch": 0.345679012345679,
"grad_norm": 20.5740909576416,
"learning_rate": 1.8617283950617286e-05,
"loss": 0.4072,
"step": 700
},
{
"epoch": 0.3506172839506173,
"grad_norm": 1.4352848529815674,
"learning_rate": 1.8597530864197533e-05,
"loss": 0.1077,
"step": 710
},
{
"epoch": 0.35555555555555557,
"grad_norm": 1.2378454208374023,
"learning_rate": 1.857777777777778e-05,
"loss": 0.2087,
"step": 720
},
{
"epoch": 0.36049382716049383,
"grad_norm": 54.489768981933594,
"learning_rate": 1.8558024691358025e-05,
"loss": 0.34,
"step": 730
},
{
"epoch": 0.3654320987654321,
"grad_norm": 85.84687042236328,
"learning_rate": 1.8538271604938275e-05,
"loss": 0.145,
"step": 740
},
{
"epoch": 0.37037037037037035,
"grad_norm": 20.322895050048828,
"learning_rate": 1.851851851851852e-05,
"loss": 0.4289,
"step": 750
},
{
"epoch": 0.37530864197530867,
"grad_norm": 1.6802163124084473,
"learning_rate": 1.8498765432098768e-05,
"loss": 0.4687,
"step": 760
},
{
"epoch": 0.3802469135802469,
"grad_norm": 125.9644546508789,
"learning_rate": 1.8479012345679014e-05,
"loss": 0.5029,
"step": 770
},
{
"epoch": 0.3851851851851852,
"grad_norm": 46.97697830200195,
"learning_rate": 1.845925925925926e-05,
"loss": 0.4326,
"step": 780
},
{
"epoch": 0.39012345679012345,
"grad_norm": 82.32715606689453,
"learning_rate": 1.8439506172839507e-05,
"loss": 0.3779,
"step": 790
},
{
"epoch": 0.3950617283950617,
"grad_norm": 48.87428665161133,
"learning_rate": 1.8419753086419754e-05,
"loss": 0.4167,
"step": 800
},
{
"epoch": 0.4,
"grad_norm": 0.4260449707508087,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.3357,
"step": 810
},
{
"epoch": 0.4049382716049383,
"grad_norm": 3.1416447162628174,
"learning_rate": 1.838024691358025e-05,
"loss": 0.0848,
"step": 820
},
{
"epoch": 0.40987654320987654,
"grad_norm": 0.17075039446353912,
"learning_rate": 1.8360493827160496e-05,
"loss": 0.5083,
"step": 830
},
{
"epoch": 0.4148148148148148,
"grad_norm": 78.5146713256836,
"learning_rate": 1.8340740740740743e-05,
"loss": 0.3346,
"step": 840
},
{
"epoch": 0.41975308641975306,
"grad_norm": 38.72228240966797,
"learning_rate": 1.832098765432099e-05,
"loss": 0.2796,
"step": 850
},
{
"epoch": 0.4246913580246914,
"grad_norm": 28.315433502197266,
"learning_rate": 1.8301234567901235e-05,
"loss": 0.4984,
"step": 860
},
{
"epoch": 0.42962962962962964,
"grad_norm": 1.3758037090301514,
"learning_rate": 1.8281481481481482e-05,
"loss": 0.2027,
"step": 870
},
{
"epoch": 0.4345679012345679,
"grad_norm": 33.141361236572266,
"learning_rate": 1.826172839506173e-05,
"loss": 0.1499,
"step": 880
},
{
"epoch": 0.43950617283950616,
"grad_norm": 48.69041442871094,
"learning_rate": 1.8241975308641978e-05,
"loss": 0.1608,
"step": 890
},
{
"epoch": 0.4444444444444444,
"grad_norm": 35.90753173828125,
"learning_rate": 1.8222222222222224e-05,
"loss": 0.2102,
"step": 900
},
{
"epoch": 0.44938271604938274,
"grad_norm": 27.275602340698242,
"learning_rate": 1.820246913580247e-05,
"loss": 0.3688,
"step": 910
},
{
"epoch": 0.454320987654321,
"grad_norm": 14.521764755249023,
"learning_rate": 1.8182716049382717e-05,
"loss": 0.3542,
"step": 920
},
{
"epoch": 0.45925925925925926,
"grad_norm": 22.390480041503906,
"learning_rate": 1.8162962962962963e-05,
"loss": 0.1098,
"step": 930
},
{
"epoch": 0.4641975308641975,
"grad_norm": 5.19728422164917,
"learning_rate": 1.814320987654321e-05,
"loss": 0.2809,
"step": 940
},
{
"epoch": 0.4691358024691358,
"grad_norm": 0.5096778869628906,
"learning_rate": 1.812345679012346e-05,
"loss": 0.1538,
"step": 950
},
{
"epoch": 0.4740740740740741,
"grad_norm": 107.34992980957031,
"learning_rate": 1.8103703703703706e-05,
"loss": 0.5366,
"step": 960
},
{
"epoch": 0.47901234567901235,
"grad_norm": 37.320709228515625,
"learning_rate": 1.8083950617283952e-05,
"loss": 0.1668,
"step": 970
},
{
"epoch": 0.4839506172839506,
"grad_norm": 20.405574798583984,
"learning_rate": 1.80641975308642e-05,
"loss": 0.3588,
"step": 980
},
{
"epoch": 0.4888888888888889,
"grad_norm": 1.3000644445419312,
"learning_rate": 1.8044444444444445e-05,
"loss": 0.088,
"step": 990
},
{
"epoch": 0.49382716049382713,
"grad_norm": 37.02173614501953,
"learning_rate": 1.802469135802469e-05,
"loss": 0.2648,
"step": 1000
},
{
"epoch": 0.49876543209876545,
"grad_norm": 48.47230529785156,
"learning_rate": 1.8004938271604938e-05,
"loss": 0.5109,
"step": 1010
},
{
"epoch": 0.5037037037037037,
"grad_norm": 51.70542907714844,
"learning_rate": 1.7985185185185188e-05,
"loss": 0.3476,
"step": 1020
},
{
"epoch": 0.508641975308642,
"grad_norm": 2.4657256603240967,
"learning_rate": 1.7965432098765434e-05,
"loss": 0.3445,
"step": 1030
},
{
"epoch": 0.5135802469135803,
"grad_norm": 96.39098358154297,
"learning_rate": 1.794567901234568e-05,
"loss": 0.2845,
"step": 1040
},
{
"epoch": 0.5185185185185185,
"grad_norm": 45.08651351928711,
"learning_rate": 1.7925925925925927e-05,
"loss": 0.2077,
"step": 1050
},
{
"epoch": 0.5234567901234568,
"grad_norm": 0.06106605753302574,
"learning_rate": 1.7906172839506177e-05,
"loss": 0.0399,
"step": 1060
},
{
"epoch": 0.528395061728395,
"grad_norm": 36.55531692504883,
"learning_rate": 1.788641975308642e-05,
"loss": 0.3436,
"step": 1070
},
{
"epoch": 0.5333333333333333,
"grad_norm": 2.2626407146453857,
"learning_rate": 1.7866666666666666e-05,
"loss": 0.6299,
"step": 1080
},
{
"epoch": 0.5382716049382716,
"grad_norm": 16.667465209960938,
"learning_rate": 1.7846913580246913e-05,
"loss": 0.2992,
"step": 1090
},
{
"epoch": 0.5432098765432098,
"grad_norm": 41.49295425415039,
"learning_rate": 1.7827160493827162e-05,
"loss": 0.2554,
"step": 1100
},
{
"epoch": 0.5481481481481482,
"grad_norm": 4.2133002281188965,
"learning_rate": 1.780740740740741e-05,
"loss": 0.2452,
"step": 1110
},
{
"epoch": 0.5530864197530864,
"grad_norm": 49.12704086303711,
"learning_rate": 1.7787654320987655e-05,
"loss": 0.3656,
"step": 1120
},
{
"epoch": 0.5580246913580247,
"grad_norm": 21.075599670410156,
"learning_rate": 1.7767901234567905e-05,
"loss": 0.0648,
"step": 1130
},
{
"epoch": 0.562962962962963,
"grad_norm": 0.5144210457801819,
"learning_rate": 1.774814814814815e-05,
"loss": 0.2793,
"step": 1140
},
{
"epoch": 0.5679012345679012,
"grad_norm": 53.27878189086914,
"learning_rate": 1.7728395061728398e-05,
"loss": 0.206,
"step": 1150
},
{
"epoch": 0.5728395061728395,
"grad_norm": 36.761356353759766,
"learning_rate": 1.7708641975308644e-05,
"loss": 0.3469,
"step": 1160
},
{
"epoch": 0.5777777777777777,
"grad_norm": 3.539717435836792,
"learning_rate": 1.768888888888889e-05,
"loss": 0.2327,
"step": 1170
},
{
"epoch": 0.582716049382716,
"grad_norm": 3.940678596496582,
"learning_rate": 1.7669135802469137e-05,
"loss": 0.2148,
"step": 1180
},
{
"epoch": 0.5876543209876544,
"grad_norm": 44.36384963989258,
"learning_rate": 1.7649382716049383e-05,
"loss": 0.3014,
"step": 1190
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.4438416063785553,
"learning_rate": 1.7629629629629633e-05,
"loss": 0.2904,
"step": 1200
},
{
"epoch": 0.5975308641975309,
"grad_norm": 0.08722967654466629,
"learning_rate": 1.760987654320988e-05,
"loss": 0.6003,
"step": 1210
},
{
"epoch": 0.6024691358024692,
"grad_norm": 3.851921319961548,
"learning_rate": 1.7590123456790126e-05,
"loss": 0.1097,
"step": 1220
},
{
"epoch": 0.6074074074074074,
"grad_norm": 2.105475425720215,
"learning_rate": 1.7570370370370372e-05,
"loss": 0.0446,
"step": 1230
},
{
"epoch": 0.6123456790123457,
"grad_norm": 1.8762763738632202,
"learning_rate": 1.755061728395062e-05,
"loss": 0.3621,
"step": 1240
},
{
"epoch": 0.6172839506172839,
"grad_norm": 0.8981475234031677,
"learning_rate": 1.7530864197530865e-05,
"loss": 0.6072,
"step": 1250
},
{
"epoch": 0.6222222222222222,
"grad_norm": 0.05930788442492485,
"learning_rate": 1.751111111111111e-05,
"loss": 0.2451,
"step": 1260
},
{
"epoch": 0.6271604938271605,
"grad_norm": 47.51054763793945,
"learning_rate": 1.7491358024691358e-05,
"loss": 0.2657,
"step": 1270
},
{
"epoch": 0.6320987654320988,
"grad_norm": 84.59910583496094,
"learning_rate": 1.7471604938271608e-05,
"loss": 0.2821,
"step": 1280
},
{
"epoch": 0.6370370370370371,
"grad_norm": 92.97787475585938,
"learning_rate": 1.7451851851851854e-05,
"loss": 0.3573,
"step": 1290
},
{
"epoch": 0.6419753086419753,
"grad_norm": 134.259033203125,
"learning_rate": 1.74320987654321e-05,
"loss": 0.2441,
"step": 1300
},
{
"epoch": 0.6469135802469136,
"grad_norm": 61.10758972167969,
"learning_rate": 1.7412345679012347e-05,
"loss": 0.3629,
"step": 1310
},
{
"epoch": 0.6518518518518519,
"grad_norm": 0.031939879059791565,
"learning_rate": 1.7392592592592593e-05,
"loss": 0.0489,
"step": 1320
},
{
"epoch": 0.6567901234567901,
"grad_norm": 52.49007034301758,
"learning_rate": 1.737283950617284e-05,
"loss": 0.2687,
"step": 1330
},
{
"epoch": 0.6617283950617284,
"grad_norm": 4.723176002502441,
"learning_rate": 1.7353086419753086e-05,
"loss": 0.2252,
"step": 1340
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.503265619277954,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.3459,
"step": 1350
},
{
"epoch": 0.671604938271605,
"grad_norm": 68.56127166748047,
"learning_rate": 1.7313580246913582e-05,
"loss": 0.4752,
"step": 1360
},
{
"epoch": 0.6765432098765433,
"grad_norm": 96.8653793334961,
"learning_rate": 1.729382716049383e-05,
"loss": 0.1921,
"step": 1370
},
{
"epoch": 0.6814814814814815,
"grad_norm": 139.44691467285156,
"learning_rate": 1.7274074074074075e-05,
"loss": 0.2266,
"step": 1380
},
{
"epoch": 0.6864197530864198,
"grad_norm": 7.88108491897583,
"learning_rate": 1.725432098765432e-05,
"loss": 0.1687,
"step": 1390
},
{
"epoch": 0.691358024691358,
"grad_norm": 61.542091369628906,
"learning_rate": 1.7234567901234568e-05,
"loss": 0.1411,
"step": 1400
},
{
"epoch": 0.6962962962962963,
"grad_norm": 0.7576116919517517,
"learning_rate": 1.7214814814814814e-05,
"loss": 0.4797,
"step": 1410
},
{
"epoch": 0.7012345679012346,
"grad_norm": 14.038137435913086,
"learning_rate": 1.7195061728395064e-05,
"loss": 0.0933,
"step": 1420
},
{
"epoch": 0.7061728395061728,
"grad_norm": 46.00447463989258,
"learning_rate": 1.717530864197531e-05,
"loss": 0.3333,
"step": 1430
},
{
"epoch": 0.7111111111111111,
"grad_norm": 180.21914672851562,
"learning_rate": 1.7155555555555557e-05,
"loss": 0.4032,
"step": 1440
},
{
"epoch": 0.7160493827160493,
"grad_norm": 211.60653686523438,
"learning_rate": 1.7135802469135806e-05,
"loss": 0.3602,
"step": 1450
},
{
"epoch": 0.7209876543209877,
"grad_norm": 10.442931175231934,
"learning_rate": 1.7116049382716053e-05,
"loss": 0.3413,
"step": 1460
},
{
"epoch": 0.725925925925926,
"grad_norm": 54.73400115966797,
"learning_rate": 1.70962962962963e-05,
"loss": 0.1263,
"step": 1470
},
{
"epoch": 0.7308641975308642,
"grad_norm": 7.259425163269043,
"learning_rate": 1.7076543209876542e-05,
"loss": 0.2431,
"step": 1480
},
{
"epoch": 0.7358024691358025,
"grad_norm": 96.37651824951172,
"learning_rate": 1.7056790123456792e-05,
"loss": 0.5599,
"step": 1490
},
{
"epoch": 0.7407407407407407,
"grad_norm": 9.702010154724121,
"learning_rate": 1.7037037037037038e-05,
"loss": 0.3493,
"step": 1500
},
{
"epoch": 0.745679012345679,
"grad_norm": 29.10769271850586,
"learning_rate": 1.7017283950617285e-05,
"loss": 0.3369,
"step": 1510
},
{
"epoch": 0.7506172839506173,
"grad_norm": 77.2637939453125,
"learning_rate": 1.699753086419753e-05,
"loss": 0.5669,
"step": 1520
},
{
"epoch": 0.7555555555555555,
"grad_norm": 0.2619607150554657,
"learning_rate": 1.697777777777778e-05,
"loss": 0.2248,
"step": 1530
},
{
"epoch": 0.7604938271604939,
"grad_norm": 49.25140380859375,
"learning_rate": 1.6958024691358027e-05,
"loss": 0.1465,
"step": 1540
},
{
"epoch": 0.7654320987654321,
"grad_norm": 1.6038424968719482,
"learning_rate": 1.6938271604938274e-05,
"loss": 0.2798,
"step": 1550
},
{
"epoch": 0.7703703703703704,
"grad_norm": 0.2095940262079239,
"learning_rate": 1.691851851851852e-05,
"loss": 0.359,
"step": 1560
},
{
"epoch": 0.7753086419753087,
"grad_norm": 53.154632568359375,
"learning_rate": 1.6898765432098766e-05,
"loss": 0.1937,
"step": 1570
},
{
"epoch": 0.7802469135802469,
"grad_norm": 6.8274006843566895,
"learning_rate": 1.6879012345679013e-05,
"loss": 0.2881,
"step": 1580
},
{
"epoch": 0.7851851851851852,
"grad_norm": 115.4723892211914,
"learning_rate": 1.685925925925926e-05,
"loss": 0.2592,
"step": 1590
},
{
"epoch": 0.7901234567901234,
"grad_norm": 0.015067143365740776,
"learning_rate": 1.683950617283951e-05,
"loss": 0.3331,
"step": 1600
},
{
"epoch": 0.7950617283950617,
"grad_norm": 28.81291961669922,
"learning_rate": 1.6819753086419755e-05,
"loss": 0.5361,
"step": 1610
},
{
"epoch": 0.8,
"grad_norm": 0.010893706232309341,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.1873,
"step": 1620
},
{
"epoch": 0.8049382716049382,
"grad_norm": 1.351131796836853,
"learning_rate": 1.6780246913580248e-05,
"loss": 0.2294,
"step": 1630
},
{
"epoch": 0.8098765432098766,
"grad_norm": 60.61597442626953,
"learning_rate": 1.6760493827160495e-05,
"loss": 0.2917,
"step": 1640
},
{
"epoch": 0.8148148148148148,
"grad_norm": 11.661639213562012,
"learning_rate": 1.674074074074074e-05,
"loss": 0.4087,
"step": 1650
},
{
"epoch": 0.8197530864197531,
"grad_norm": 251.9644012451172,
"learning_rate": 1.6720987654320987e-05,
"loss": 0.2226,
"step": 1660
},
{
"epoch": 0.8246913580246914,
"grad_norm": 7.840044975280762,
"learning_rate": 1.6701234567901237e-05,
"loss": 0.5515,
"step": 1670
},
{
"epoch": 0.8296296296296296,
"grad_norm": 0.08511721342802048,
"learning_rate": 1.6681481481481484e-05,
"loss": 0.2206,
"step": 1680
},
{
"epoch": 0.8345679012345679,
"grad_norm": 19.307905197143555,
"learning_rate": 1.666172839506173e-05,
"loss": 0.2081,
"step": 1690
},
{
"epoch": 0.8395061728395061,
"grad_norm": 1.045444130897522,
"learning_rate": 1.6641975308641976e-05,
"loss": 0.1815,
"step": 1700
},
{
"epoch": 0.8444444444444444,
"grad_norm": 5.953945636749268,
"learning_rate": 1.6622222222222223e-05,
"loss": 0.2312,
"step": 1710
},
{
"epoch": 0.8493827160493828,
"grad_norm": 5.905419826507568,
"learning_rate": 1.660246913580247e-05,
"loss": 0.1001,
"step": 1720
},
{
"epoch": 0.854320987654321,
"grad_norm": 118.84114837646484,
"learning_rate": 1.6582716049382715e-05,
"loss": 0.2355,
"step": 1730
},
{
"epoch": 0.8592592592592593,
"grad_norm": 27.624740600585938,
"learning_rate": 1.6562962962962965e-05,
"loss": 0.1446,
"step": 1740
},
{
"epoch": 0.8641975308641975,
"grad_norm": 126.23757934570312,
"learning_rate": 1.654320987654321e-05,
"loss": 0.2595,
"step": 1750
},
{
"epoch": 0.8691358024691358,
"grad_norm": 2.478506326675415,
"learning_rate": 1.6523456790123458e-05,
"loss": 0.0814,
"step": 1760
},
{
"epoch": 0.8740740740740741,
"grad_norm": 42.80133819580078,
"learning_rate": 1.6503703703703704e-05,
"loss": 0.1934,
"step": 1770
},
{
"epoch": 0.8790123456790123,
"grad_norm": 0.015840064734220505,
"learning_rate": 1.648395061728395e-05,
"loss": 0.2246,
"step": 1780
},
{
"epoch": 0.8839506172839506,
"grad_norm": 112.66703796386719,
"learning_rate": 1.6464197530864197e-05,
"loss": 0.2628,
"step": 1790
},
{
"epoch": 0.8888888888888888,
"grad_norm": 33.77766036987305,
"learning_rate": 1.6444444444444444e-05,
"loss": 0.3193,
"step": 1800
},
{
"epoch": 0.8938271604938272,
"grad_norm": 236.83761596679688,
"learning_rate": 1.6424691358024693e-05,
"loss": 0.3724,
"step": 1810
},
{
"epoch": 0.8987654320987655,
"grad_norm": 57.66241455078125,
"learning_rate": 1.640493827160494e-05,
"loss": 0.3194,
"step": 1820
},
{
"epoch": 0.9037037037037037,
"grad_norm": 142.6712646484375,
"learning_rate": 1.6385185185185186e-05,
"loss": 0.2389,
"step": 1830
},
{
"epoch": 0.908641975308642,
"grad_norm": 0.11197575181722641,
"learning_rate": 1.6365432098765433e-05,
"loss": 0.268,
"step": 1840
},
{
"epoch": 0.9135802469135802,
"grad_norm": 407.26885986328125,
"learning_rate": 1.6345679012345682e-05,
"loss": 0.2186,
"step": 1850
},
{
"epoch": 0.9185185185185185,
"grad_norm": 0.057163987308740616,
"learning_rate": 1.632592592592593e-05,
"loss": 0.4509,
"step": 1860
},
{
"epoch": 0.9234567901234568,
"grad_norm": 66.4487075805664,
"learning_rate": 1.6306172839506175e-05,
"loss": 0.3745,
"step": 1870
},
{
"epoch": 0.928395061728395,
"grad_norm": 115.2850570678711,
"learning_rate": 1.628641975308642e-05,
"loss": 0.4606,
"step": 1880
},
{
"epoch": 0.9333333333333333,
"grad_norm": 66.02615356445312,
"learning_rate": 1.6266666666666668e-05,
"loss": 0.2206,
"step": 1890
},
{
"epoch": 0.9382716049382716,
"grad_norm": 2.386338949203491,
"learning_rate": 1.6246913580246914e-05,
"loss": 0.364,
"step": 1900
},
{
"epoch": 0.9432098765432099,
"grad_norm": 57.060977935791016,
"learning_rate": 1.622716049382716e-05,
"loss": 0.2837,
"step": 1910
},
{
"epoch": 0.9481481481481482,
"grad_norm": 0.7722509503364563,
"learning_rate": 1.620740740740741e-05,
"loss": 0.6167,
"step": 1920
},
{
"epoch": 0.9530864197530864,
"grad_norm": 0.762596845626831,
"learning_rate": 1.6187654320987657e-05,
"loss": 0.1728,
"step": 1930
},
{
"epoch": 0.9580246913580247,
"grad_norm": 40.202091217041016,
"learning_rate": 1.6167901234567903e-05,
"loss": 0.2449,
"step": 1940
},
{
"epoch": 0.9629629629629629,
"grad_norm": 57.35947799682617,
"learning_rate": 1.614814814814815e-05,
"loss": 0.4488,
"step": 1950
},
{
"epoch": 0.9679012345679012,
"grad_norm": 68.08243560791016,
"learning_rate": 1.6128395061728396e-05,
"loss": 0.3488,
"step": 1960
},
{
"epoch": 0.9728395061728395,
"grad_norm": 1.9619942903518677,
"learning_rate": 1.6108641975308642e-05,
"loss": 0.2035,
"step": 1970
},
{
"epoch": 0.9777777777777777,
"grad_norm": 0.8691776990890503,
"learning_rate": 1.608888888888889e-05,
"loss": 0.2162,
"step": 1980
},
{
"epoch": 0.9827160493827161,
"grad_norm": 0.5446602702140808,
"learning_rate": 1.606913580246914e-05,
"loss": 0.4364,
"step": 1990
},
{
"epoch": 0.9876543209876543,
"grad_norm": 10.081711769104004,
"learning_rate": 1.6049382716049385e-05,
"loss": 0.2213,
"step": 2000
},
{
"epoch": 0.9925925925925926,
"grad_norm": 0.02493743598461151,
"learning_rate": 1.602962962962963e-05,
"loss": 0.0608,
"step": 2010
},
{
"epoch": 0.9975308641975309,
"grad_norm": 2.9489526748657227,
"learning_rate": 1.6009876543209878e-05,
"loss": 0.3004,
"step": 2020
},
{
"epoch": 1.0,
"eval_accuracy": 0.9677777777777777,
"eval_loss": 0.11802458763122559,
"eval_runtime": 32.902,
"eval_samples_per_second": 164.124,
"eval_steps_per_second": 20.515,
"step": 2025
},
{
"epoch": 1.002469135802469,
"grad_norm": 138.59349060058594,
"learning_rate": 1.5990123456790124e-05,
"loss": 0.2046,
"step": 2030
},
{
"epoch": 1.0074074074074073,
"grad_norm": 0.05510491877794266,
"learning_rate": 1.597037037037037e-05,
"loss": 0.1356,
"step": 2040
},
{
"epoch": 1.0123456790123457,
"grad_norm": 14.264396667480469,
"learning_rate": 1.5950617283950617e-05,
"loss": 0.1624,
"step": 2050
},
{
"epoch": 1.017283950617284,
"grad_norm": 0.9380566477775574,
"learning_rate": 1.5930864197530867e-05,
"loss": 0.2289,
"step": 2060
},
{
"epoch": 1.0222222222222221,
"grad_norm": 0.017738979309797287,
"learning_rate": 1.5911111111111113e-05,
"loss": 0.4301,
"step": 2070
},
{
"epoch": 1.0271604938271606,
"grad_norm": 0.030082279816269875,
"learning_rate": 1.589135802469136e-05,
"loss": 0.1049,
"step": 2080
},
{
"epoch": 1.0320987654320988,
"grad_norm": 0.28669413924217224,
"learning_rate": 1.5871604938271606e-05,
"loss": 0.1245,
"step": 2090
},
{
"epoch": 1.037037037037037,
"grad_norm": 7.697299003601074,
"learning_rate": 1.5851851851851852e-05,
"loss": 0.6147,
"step": 2100
},
{
"epoch": 1.0419753086419754,
"grad_norm": 99.23163604736328,
"learning_rate": 1.58320987654321e-05,
"loss": 0.1613,
"step": 2110
},
{
"epoch": 1.0469135802469136,
"grad_norm": 52.61363220214844,
"learning_rate": 1.5812345679012345e-05,
"loss": 0.3256,
"step": 2120
},
{
"epoch": 1.0518518518518518,
"grad_norm": 87.68861389160156,
"learning_rate": 1.5792592592592595e-05,
"loss": 0.2956,
"step": 2130
},
{
"epoch": 1.05679012345679,
"grad_norm": 30.490577697753906,
"learning_rate": 1.577283950617284e-05,
"loss": 0.2226,
"step": 2140
},
{
"epoch": 1.0617283950617284,
"grad_norm": 1.5879323482513428,
"learning_rate": 1.5753086419753088e-05,
"loss": 0.3573,
"step": 2150
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.36435502767562866,
"learning_rate": 1.5733333333333334e-05,
"loss": 0.167,
"step": 2160
},
{
"epoch": 1.0716049382716049,
"grad_norm": 0.3206441104412079,
"learning_rate": 1.5713580246913584e-05,
"loss": 0.2698,
"step": 2170
},
{
"epoch": 1.0765432098765433,
"grad_norm": 17.28899574279785,
"learning_rate": 1.569382716049383e-05,
"loss": 0.383,
"step": 2180
},
{
"epoch": 1.0814814814814815,
"grad_norm": 31.972209930419922,
"learning_rate": 1.5674074074074073e-05,
"loss": 0.2109,
"step": 2190
},
{
"epoch": 1.0864197530864197,
"grad_norm": 35.79594802856445,
"learning_rate": 1.565432098765432e-05,
"loss": 0.2666,
"step": 2200
},
{
"epoch": 1.0913580246913581,
"grad_norm": 3.0720813274383545,
"learning_rate": 1.563456790123457e-05,
"loss": 0.0663,
"step": 2210
},
{
"epoch": 1.0962962962962963,
"grad_norm": 46.16384506225586,
"learning_rate": 1.5614814814814816e-05,
"loss": 0.1775,
"step": 2220
},
{
"epoch": 1.1012345679012345,
"grad_norm": 3.8352577686309814,
"learning_rate": 1.5595061728395062e-05,
"loss": 0.1719,
"step": 2230
},
{
"epoch": 1.106172839506173,
"grad_norm": 24.50127601623535,
"learning_rate": 1.5575308641975312e-05,
"loss": 0.4285,
"step": 2240
},
{
"epoch": 1.1111111111111112,
"grad_norm": 45.77573776245117,
"learning_rate": 1.555555555555556e-05,
"loss": 0.3723,
"step": 2250
},
{
"epoch": 1.1160493827160494,
"grad_norm": 51.60211181640625,
"learning_rate": 1.5535802469135805e-05,
"loss": 0.1194,
"step": 2260
},
{
"epoch": 1.1209876543209876,
"grad_norm": 48.674163818359375,
"learning_rate": 1.551604938271605e-05,
"loss": 0.3845,
"step": 2270
},
{
"epoch": 1.125925925925926,
"grad_norm": 0.43790122866630554,
"learning_rate": 1.5496296296296298e-05,
"loss": 0.1622,
"step": 2280
},
{
"epoch": 1.1308641975308642,
"grad_norm": 0.4926997125148773,
"learning_rate": 1.5476543209876544e-05,
"loss": 0.1739,
"step": 2290
},
{
"epoch": 1.1358024691358024,
"grad_norm": 27.840295791625977,
"learning_rate": 1.545679012345679e-05,
"loss": 0.1265,
"step": 2300
},
{
"epoch": 1.1407407407407408,
"grad_norm": 148.9844207763672,
"learning_rate": 1.543703703703704e-05,
"loss": 0.2187,
"step": 2310
},
{
"epoch": 1.145679012345679,
"grad_norm": 63.56736373901367,
"learning_rate": 1.5417283950617286e-05,
"loss": 0.2227,
"step": 2320
},
{
"epoch": 1.1506172839506172,
"grad_norm": 32.42955780029297,
"learning_rate": 1.5397530864197533e-05,
"loss": 0.1863,
"step": 2330
},
{
"epoch": 1.1555555555555554,
"grad_norm": 72.6145248413086,
"learning_rate": 1.537777777777778e-05,
"loss": 0.3744,
"step": 2340
},
{
"epoch": 1.1604938271604939,
"grad_norm": 4.558436393737793,
"learning_rate": 1.5358024691358026e-05,
"loss": 0.2796,
"step": 2350
},
{
"epoch": 1.165432098765432,
"grad_norm": 0.5049192905426025,
"learning_rate": 1.5338271604938272e-05,
"loss": 0.1426,
"step": 2360
},
{
"epoch": 1.1703703703703703,
"grad_norm": 0.11132398992776871,
"learning_rate": 1.531851851851852e-05,
"loss": 0.1231,
"step": 2370
},
{
"epoch": 1.1753086419753087,
"grad_norm": 26.840200424194336,
"learning_rate": 1.5298765432098768e-05,
"loss": 0.2786,
"step": 2380
},
{
"epoch": 1.180246913580247,
"grad_norm": 0.15319669246673584,
"learning_rate": 1.5279012345679015e-05,
"loss": 0.5859,
"step": 2390
},
{
"epoch": 1.1851851851851851,
"grad_norm": 39.83156204223633,
"learning_rate": 1.525925925925926e-05,
"loss": 0.4391,
"step": 2400
},
{
"epoch": 1.1901234567901235,
"grad_norm": 0.38840270042419434,
"learning_rate": 1.5239506172839507e-05,
"loss": 0.1187,
"step": 2410
},
{
"epoch": 1.1950617283950618,
"grad_norm": 0.025911659002304077,
"learning_rate": 1.5219753086419755e-05,
"loss": 0.0865,
"step": 2420
},
{
"epoch": 1.2,
"grad_norm": 81.05162048339844,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.3289,
"step": 2430
},
{
"epoch": 1.2049382716049384,
"grad_norm": 72.2834701538086,
"learning_rate": 1.5180246913580248e-05,
"loss": 0.5105,
"step": 2440
},
{
"epoch": 1.2098765432098766,
"grad_norm": 0.06509275734424591,
"learning_rate": 1.5160493827160495e-05,
"loss": 0.2435,
"step": 2450
},
{
"epoch": 1.2148148148148148,
"grad_norm": 12.417915344238281,
"learning_rate": 1.5140740740740743e-05,
"loss": 0.3175,
"step": 2460
},
{
"epoch": 1.219753086419753,
"grad_norm": 64.59101104736328,
"learning_rate": 1.5120987654320989e-05,
"loss": 0.4517,
"step": 2470
},
{
"epoch": 1.2246913580246914,
"grad_norm": 43.42831802368164,
"learning_rate": 1.5101234567901236e-05,
"loss": 0.1514,
"step": 2480
},
{
"epoch": 1.2296296296296296,
"grad_norm": 0.5973836779594421,
"learning_rate": 1.5081481481481484e-05,
"loss": 0.1027,
"step": 2490
},
{
"epoch": 1.2345679012345678,
"grad_norm": 41.84488296508789,
"learning_rate": 1.506172839506173e-05,
"loss": 0.2706,
"step": 2500
},
{
"epoch": 1.2395061728395063,
"grad_norm": 135.85255432128906,
"learning_rate": 1.5041975308641976e-05,
"loss": 0.204,
"step": 2510
},
{
"epoch": 1.2444444444444445,
"grad_norm": 14.007678985595703,
"learning_rate": 1.5022222222222223e-05,
"loss": 0.4253,
"step": 2520
},
{
"epoch": 1.2493827160493827,
"grad_norm": 34.2636833190918,
"learning_rate": 1.5002469135802471e-05,
"loss": 0.21,
"step": 2530
},
{
"epoch": 1.2543209876543209,
"grad_norm": 19.363365173339844,
"learning_rate": 1.4982716049382717e-05,
"loss": 0.2031,
"step": 2540
},
{
"epoch": 1.2592592592592593,
"grad_norm": 0.3058103919029236,
"learning_rate": 1.4962962962962964e-05,
"loss": 0.2789,
"step": 2550
},
{
"epoch": 1.2641975308641975,
"grad_norm": 70.8534164428711,
"learning_rate": 1.4943209876543212e-05,
"loss": 0.4306,
"step": 2560
},
{
"epoch": 1.269135802469136,
"grad_norm": 0.1311403512954712,
"learning_rate": 1.4923456790123458e-05,
"loss": 0.4098,
"step": 2570
},
{
"epoch": 1.2740740740740741,
"grad_norm": 84.89444732666016,
"learning_rate": 1.4903703703703705e-05,
"loss": 0.2931,
"step": 2580
},
{
"epoch": 1.2790123456790123,
"grad_norm": 0.9064738154411316,
"learning_rate": 1.4883950617283951e-05,
"loss": 0.3069,
"step": 2590
},
{
"epoch": 1.2839506172839505,
"grad_norm": 0.491811603307724,
"learning_rate": 1.4864197530864199e-05,
"loss": 0.2636,
"step": 2600
},
{
"epoch": 1.2888888888888888,
"grad_norm": 35.797969818115234,
"learning_rate": 1.4844444444444445e-05,
"loss": 0.2673,
"step": 2610
},
{
"epoch": 1.2938271604938272,
"grad_norm": 0.0416533537209034,
"learning_rate": 1.4824691358024692e-05,
"loss": 0.0711,
"step": 2620
},
{
"epoch": 1.2987654320987654,
"grad_norm": 4.76767635345459,
"learning_rate": 1.480493827160494e-05,
"loss": 0.2506,
"step": 2630
},
{
"epoch": 1.3037037037037038,
"grad_norm": 32.206031799316406,
"learning_rate": 1.4785185185185186e-05,
"loss": 0.453,
"step": 2640
},
{
"epoch": 1.308641975308642,
"grad_norm": 131.6813201904297,
"learning_rate": 1.4765432098765433e-05,
"loss": 0.1793,
"step": 2650
},
{
"epoch": 1.3135802469135802,
"grad_norm": 7.119224548339844,
"learning_rate": 1.4745679012345679e-05,
"loss": 0.0779,
"step": 2660
},
{
"epoch": 1.3185185185185184,
"grad_norm": 139.8772735595703,
"learning_rate": 1.4725925925925927e-05,
"loss": 0.4545,
"step": 2670
},
{
"epoch": 1.3234567901234568,
"grad_norm": 0.4141978919506073,
"learning_rate": 1.4706172839506174e-05,
"loss": 0.2352,
"step": 2680
},
{
"epoch": 1.328395061728395,
"grad_norm": 42.8140869140625,
"learning_rate": 1.468641975308642e-05,
"loss": 0.1611,
"step": 2690
},
{
"epoch": 1.3333333333333333,
"grad_norm": 16.763948440551758,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.0735,
"step": 2700
},
{
"epoch": 1.3382716049382717,
"grad_norm": 140.94900512695312,
"learning_rate": 1.4646913580246916e-05,
"loss": 0.1474,
"step": 2710
},
{
"epoch": 1.34320987654321,
"grad_norm": 0.9029823541641235,
"learning_rate": 1.4627160493827162e-05,
"loss": 0.0437,
"step": 2720
},
{
"epoch": 1.348148148148148,
"grad_norm": 46.620086669921875,
"learning_rate": 1.4607407407407407e-05,
"loss": 0.1856,
"step": 2730
},
{
"epoch": 1.3530864197530863,
"grad_norm": 64.09046173095703,
"learning_rate": 1.4587654320987657e-05,
"loss": 0.1532,
"step": 2740
},
{
"epoch": 1.3580246913580247,
"grad_norm": 104.23167419433594,
"learning_rate": 1.4567901234567903e-05,
"loss": 0.2386,
"step": 2750
},
{
"epoch": 1.362962962962963,
"grad_norm": 0.36242911219596863,
"learning_rate": 1.454814814814815e-05,
"loss": 0.4831,
"step": 2760
},
{
"epoch": 1.3679012345679014,
"grad_norm": 0.5484885573387146,
"learning_rate": 1.4528395061728396e-05,
"loss": 0.0836,
"step": 2770
},
{
"epoch": 1.3728395061728396,
"grad_norm": 51.26658630371094,
"learning_rate": 1.4508641975308644e-05,
"loss": 0.1736,
"step": 2780
},
{
"epoch": 1.3777777777777778,
"grad_norm": 20.211082458496094,
"learning_rate": 1.448888888888889e-05,
"loss": 0.3063,
"step": 2790
},
{
"epoch": 1.382716049382716,
"grad_norm": 0.7425023913383484,
"learning_rate": 1.4469135802469137e-05,
"loss": 0.1025,
"step": 2800
},
{
"epoch": 1.3876543209876544,
"grad_norm": 159.22314453125,
"learning_rate": 1.4449382716049385e-05,
"loss": 0.2052,
"step": 2810
},
{
"epoch": 1.3925925925925926,
"grad_norm": 47.53805923461914,
"learning_rate": 1.4429629629629631e-05,
"loss": 0.1378,
"step": 2820
},
{
"epoch": 1.3975308641975308,
"grad_norm": 0.2027841955423355,
"learning_rate": 1.4409876543209878e-05,
"loss": 0.0507,
"step": 2830
},
{
"epoch": 1.4024691358024692,
"grad_norm": 0.18290477991104126,
"learning_rate": 1.4390123456790124e-05,
"loss": 0.2193,
"step": 2840
},
{
"epoch": 1.4074074074074074,
"grad_norm": 126.16277313232422,
"learning_rate": 1.4370370370370372e-05,
"loss": 0.3206,
"step": 2850
},
{
"epoch": 1.4123456790123456,
"grad_norm": 127.88780975341797,
"learning_rate": 1.4350617283950619e-05,
"loss": 0.4142,
"step": 2860
},
{
"epoch": 1.4172839506172838,
"grad_norm": 3.724766254425049,
"learning_rate": 1.4330864197530865e-05,
"loss": 0.0783,
"step": 2870
},
{
"epoch": 1.4222222222222223,
"grad_norm": 199.94883728027344,
"learning_rate": 1.4311111111111111e-05,
"loss": 0.3896,
"step": 2880
},
{
"epoch": 1.4271604938271605,
"grad_norm": 116.74020385742188,
"learning_rate": 1.429135802469136e-05,
"loss": 0.2982,
"step": 2890
},
{
"epoch": 1.4320987654320987,
"grad_norm": 2.576690673828125,
"learning_rate": 1.4271604938271606e-05,
"loss": 0.1678,
"step": 2900
},
{
"epoch": 1.4370370370370371,
"grad_norm": 95.74549865722656,
"learning_rate": 1.4251851851851852e-05,
"loss": 0.2808,
"step": 2910
},
{
"epoch": 1.4419753086419753,
"grad_norm": 43.24068069458008,
"learning_rate": 1.42320987654321e-05,
"loss": 0.3589,
"step": 2920
},
{
"epoch": 1.4469135802469135,
"grad_norm": 40.1359977722168,
"learning_rate": 1.4212345679012347e-05,
"loss": 0.1566,
"step": 2930
},
{
"epoch": 1.4518518518518517,
"grad_norm": 7.546663284301758,
"learning_rate": 1.4192592592592593e-05,
"loss": 0.1562,
"step": 2940
},
{
"epoch": 1.4567901234567902,
"grad_norm": 117.94816589355469,
"learning_rate": 1.417283950617284e-05,
"loss": 0.3526,
"step": 2950
},
{
"epoch": 1.4617283950617284,
"grad_norm": 107.50965881347656,
"learning_rate": 1.4153086419753088e-05,
"loss": 0.2148,
"step": 2960
},
{
"epoch": 1.4666666666666668,
"grad_norm": 16.908262252807617,
"learning_rate": 1.4133333333333334e-05,
"loss": 0.451,
"step": 2970
},
{
"epoch": 1.471604938271605,
"grad_norm": 53.356773376464844,
"learning_rate": 1.411358024691358e-05,
"loss": 0.3616,
"step": 2980
},
{
"epoch": 1.4765432098765432,
"grad_norm": 44.207054138183594,
"learning_rate": 1.4093827160493829e-05,
"loss": 0.0903,
"step": 2990
},
{
"epoch": 1.4814814814814814,
"grad_norm": 78.0193862915039,
"learning_rate": 1.4074074074074075e-05,
"loss": 0.2323,
"step": 3000
},
{
"epoch": 1.4864197530864198,
"grad_norm": 1.2068320512771606,
"learning_rate": 1.4054320987654321e-05,
"loss": 0.2748,
"step": 3010
},
{
"epoch": 1.491358024691358,
"grad_norm": 15.009058952331543,
"learning_rate": 1.4034567901234568e-05,
"loss": 0.2607,
"step": 3020
},
{
"epoch": 1.4962962962962962,
"grad_norm": 1.3016469478607178,
"learning_rate": 1.4014814814814816e-05,
"loss": 0.0402,
"step": 3030
},
{
"epoch": 1.5012345679012347,
"grad_norm": 64.81990814208984,
"learning_rate": 1.3995061728395062e-05,
"loss": 0.4051,
"step": 3040
},
{
"epoch": 1.5061728395061729,
"grad_norm": 18.911441802978516,
"learning_rate": 1.3975308641975309e-05,
"loss": 0.2663,
"step": 3050
},
{
"epoch": 1.511111111111111,
"grad_norm": 89.58609771728516,
"learning_rate": 1.3955555555555558e-05,
"loss": 0.2006,
"step": 3060
},
{
"epoch": 1.5160493827160493,
"grad_norm": 84.76557922363281,
"learning_rate": 1.3935802469135805e-05,
"loss": 0.1644,
"step": 3070
},
{
"epoch": 1.5209876543209877,
"grad_norm": 0.690521240234375,
"learning_rate": 1.391604938271605e-05,
"loss": 0.3695,
"step": 3080
},
{
"epoch": 1.525925925925926,
"grad_norm": 0.9079038500785828,
"learning_rate": 1.3896296296296296e-05,
"loss": 0.1316,
"step": 3090
},
{
"epoch": 1.5308641975308643,
"grad_norm": 0.0010949569987133145,
"learning_rate": 1.3876543209876546e-05,
"loss": 0.1599,
"step": 3100
},
{
"epoch": 1.5358024691358025,
"grad_norm": 0.017062200233340263,
"learning_rate": 1.3856790123456792e-05,
"loss": 0.2102,
"step": 3110
},
{
"epoch": 1.5407407407407407,
"grad_norm": 54.44521713256836,
"learning_rate": 1.3837037037037038e-05,
"loss": 0.2856,
"step": 3120
},
{
"epoch": 1.545679012345679,
"grad_norm": 124.57701873779297,
"learning_rate": 1.3817283950617285e-05,
"loss": 0.6973,
"step": 3130
},
{
"epoch": 1.5506172839506172,
"grad_norm": 73.95056915283203,
"learning_rate": 1.3797530864197533e-05,
"loss": 0.134,
"step": 3140
},
{
"epoch": 1.5555555555555556,
"grad_norm": 114.4755630493164,
"learning_rate": 1.377777777777778e-05,
"loss": 0.4007,
"step": 3150
},
{
"epoch": 1.5604938271604938,
"grad_norm": 5.708268165588379,
"learning_rate": 1.3758024691358026e-05,
"loss": 0.2191,
"step": 3160
},
{
"epoch": 1.5654320987654322,
"grad_norm": 39.35977554321289,
"learning_rate": 1.3738271604938274e-05,
"loss": 0.1217,
"step": 3170
},
{
"epoch": 1.5703703703703704,
"grad_norm": 1.868407130241394,
"learning_rate": 1.371851851851852e-05,
"loss": 0.1177,
"step": 3180
},
{
"epoch": 1.5753086419753086,
"grad_norm": 7.092827320098877,
"learning_rate": 1.3698765432098767e-05,
"loss": 0.1979,
"step": 3190
},
{
"epoch": 1.5802469135802468,
"grad_norm": 0.005435746628791094,
"learning_rate": 1.3679012345679013e-05,
"loss": 0.1564,
"step": 3200
},
{
"epoch": 1.585185185185185,
"grad_norm": 80.7311019897461,
"learning_rate": 1.3659259259259261e-05,
"loss": 0.2003,
"step": 3210
},
{
"epoch": 1.5901234567901235,
"grad_norm": 0.9620011448860168,
"learning_rate": 1.3639506172839507e-05,
"loss": 0.1261,
"step": 3220
},
{
"epoch": 1.5950617283950619,
"grad_norm": 95.69831085205078,
"learning_rate": 1.3619753086419754e-05,
"loss": 0.1809,
"step": 3230
},
{
"epoch": 1.6,
"grad_norm": 61.438812255859375,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.506,
"step": 3240
},
{
"epoch": 1.6049382716049383,
"grad_norm": 325.63250732421875,
"learning_rate": 1.3580246913580248e-05,
"loss": 0.3093,
"step": 3250
},
{
"epoch": 1.6098765432098765,
"grad_norm": 17.00379180908203,
"learning_rate": 1.3560493827160495e-05,
"loss": 0.2099,
"step": 3260
},
{
"epoch": 1.6148148148148147,
"grad_norm": 100.260498046875,
"learning_rate": 1.3540740740740741e-05,
"loss": 0.6063,
"step": 3270
},
{
"epoch": 1.6197530864197531,
"grad_norm": 0.09998781979084015,
"learning_rate": 1.352098765432099e-05,
"loss": 0.3561,
"step": 3280
},
{
"epoch": 1.6246913580246913,
"grad_norm": 0.34626302123069763,
"learning_rate": 1.3501234567901236e-05,
"loss": 0.0074,
"step": 3290
},
{
"epoch": 1.6296296296296298,
"grad_norm": 0.034202978014945984,
"learning_rate": 1.3481481481481482e-05,
"loss": 0.4788,
"step": 3300
},
{
"epoch": 1.634567901234568,
"grad_norm": 18.52402687072754,
"learning_rate": 1.346172839506173e-05,
"loss": 0.1834,
"step": 3310
},
{
"epoch": 1.6395061728395062,
"grad_norm": 1.5653138160705566,
"learning_rate": 1.3441975308641976e-05,
"loss": 0.354,
"step": 3320
},
{
"epoch": 1.6444444444444444,
"grad_norm": 69.99710845947266,
"learning_rate": 1.3422222222222223e-05,
"loss": 0.3642,
"step": 3330
},
{
"epoch": 1.6493827160493826,
"grad_norm": 50.67994689941406,
"learning_rate": 1.340246913580247e-05,
"loss": 0.1864,
"step": 3340
},
{
"epoch": 1.654320987654321,
"grad_norm": 0.31549400091171265,
"learning_rate": 1.3382716049382717e-05,
"loss": 0.3157,
"step": 3350
},
{
"epoch": 1.6592592592592592,
"grad_norm": 111.24998474121094,
"learning_rate": 1.3362962962962964e-05,
"loss": 0.6087,
"step": 3360
},
{
"epoch": 1.6641975308641976,
"grad_norm": 23.009380340576172,
"learning_rate": 1.334320987654321e-05,
"loss": 0.1866,
"step": 3370
},
{
"epoch": 1.6691358024691358,
"grad_norm": 88.22378540039062,
"learning_rate": 1.3323456790123456e-05,
"loss": 0.1728,
"step": 3380
},
{
"epoch": 1.674074074074074,
"grad_norm": 0.3229973316192627,
"learning_rate": 1.3303703703703705e-05,
"loss": 0.4118,
"step": 3390
},
{
"epoch": 1.6790123456790123,
"grad_norm": 5.422463893890381,
"learning_rate": 1.3283950617283951e-05,
"loss": 0.2223,
"step": 3400
},
{
"epoch": 1.6839506172839505,
"grad_norm": 0.07091034948825836,
"learning_rate": 1.3264197530864197e-05,
"loss": 0.5162,
"step": 3410
},
{
"epoch": 1.6888888888888889,
"grad_norm": 0.41538941860198975,
"learning_rate": 1.3244444444444447e-05,
"loss": 0.4052,
"step": 3420
},
{
"epoch": 1.6938271604938273,
"grad_norm": 7.8336181640625,
"learning_rate": 1.3224691358024694e-05,
"loss": 0.1862,
"step": 3430
},
{
"epoch": 1.6987654320987655,
"grad_norm": 7.325730800628662,
"learning_rate": 1.3204938271604938e-05,
"loss": 0.1988,
"step": 3440
},
{
"epoch": 1.7037037037037037,
"grad_norm": 39.67108154296875,
"learning_rate": 1.3185185185185185e-05,
"loss": 0.3016,
"step": 3450
},
{
"epoch": 1.708641975308642,
"grad_norm": 0.42901355028152466,
"learning_rate": 1.3165432098765434e-05,
"loss": 0.008,
"step": 3460
},
{
"epoch": 1.7135802469135801,
"grad_norm": 99.74118041992188,
"learning_rate": 1.314567901234568e-05,
"loss": 0.3562,
"step": 3470
},
{
"epoch": 1.7185185185185186,
"grad_norm": 41.35346221923828,
"learning_rate": 1.3125925925925927e-05,
"loss": 0.2514,
"step": 3480
},
{
"epoch": 1.7234567901234568,
"grad_norm": 59.84602355957031,
"learning_rate": 1.3106172839506175e-05,
"loss": 0.3048,
"step": 3490
},
{
"epoch": 1.7283950617283952,
"grad_norm": 2.039802312850952,
"learning_rate": 1.3086419753086422e-05,
"loss": 0.2926,
"step": 3500
},
{
"epoch": 1.7333333333333334,
"grad_norm": 66.14095306396484,
"learning_rate": 1.3066666666666668e-05,
"loss": 0.3515,
"step": 3510
},
{
"epoch": 1.7382716049382716,
"grad_norm": 5.856687068939209,
"learning_rate": 1.3046913580246914e-05,
"loss": 0.2199,
"step": 3520
},
{
"epoch": 1.7432098765432098,
"grad_norm": 89.60210418701172,
"learning_rate": 1.3027160493827163e-05,
"loss": 0.3104,
"step": 3530
},
{
"epoch": 1.748148148148148,
"grad_norm": 2.4179534912109375,
"learning_rate": 1.3007407407407409e-05,
"loss": 0.2304,
"step": 3540
},
{
"epoch": 1.7530864197530864,
"grad_norm": 39.764408111572266,
"learning_rate": 1.2987654320987655e-05,
"loss": 0.3049,
"step": 3550
},
{
"epoch": 1.7580246913580246,
"grad_norm": 66.1130599975586,
"learning_rate": 1.2967901234567903e-05,
"loss": 0.1726,
"step": 3560
},
{
"epoch": 1.762962962962963,
"grad_norm": 33.54975509643555,
"learning_rate": 1.294814814814815e-05,
"loss": 0.2627,
"step": 3570
},
{
"epoch": 1.7679012345679013,
"grad_norm": 0.5882616639137268,
"learning_rate": 1.2928395061728396e-05,
"loss": 0.1133,
"step": 3580
},
{
"epoch": 1.7728395061728395,
"grad_norm": 0.09102596342563629,
"learning_rate": 1.2908641975308643e-05,
"loss": 0.1391,
"step": 3590
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.2745858430862427,
"learning_rate": 1.288888888888889e-05,
"loss": 0.1178,
"step": 3600
},
{
"epoch": 1.7827160493827159,
"grad_norm": 0.22387881577014923,
"learning_rate": 1.2869135802469137e-05,
"loss": 0.2893,
"step": 3610
},
{
"epoch": 1.7876543209876543,
"grad_norm": 0.3061552047729492,
"learning_rate": 1.2849382716049383e-05,
"loss": 0.2718,
"step": 3620
},
{
"epoch": 1.7925925925925927,
"grad_norm": 40.53445053100586,
"learning_rate": 1.282962962962963e-05,
"loss": 0.0972,
"step": 3630
},
{
"epoch": 1.797530864197531,
"grad_norm": 0.2346036285161972,
"learning_rate": 1.2809876543209878e-05,
"loss": 0.1796,
"step": 3640
},
{
"epoch": 1.8024691358024691,
"grad_norm": 84.19086456298828,
"learning_rate": 1.2790123456790124e-05,
"loss": 0.2555,
"step": 3650
},
{
"epoch": 1.8074074074074074,
"grad_norm": 26.573976516723633,
"learning_rate": 1.277037037037037e-05,
"loss": 0.1533,
"step": 3660
},
{
"epoch": 1.8123456790123456,
"grad_norm": 0.0031530587002635,
"learning_rate": 1.2750617283950619e-05,
"loss": 0.1559,
"step": 3670
},
{
"epoch": 1.817283950617284,
"grad_norm": 72.7174072265625,
"learning_rate": 1.2730864197530865e-05,
"loss": 0.1383,
"step": 3680
},
{
"epoch": 1.8222222222222222,
"grad_norm": 0.07971396297216415,
"learning_rate": 1.2711111111111112e-05,
"loss": 0.3888,
"step": 3690
},
{
"epoch": 1.8271604938271606,
"grad_norm": 82.53282165527344,
"learning_rate": 1.2691358024691358e-05,
"loss": 0.113,
"step": 3700
},
{
"epoch": 1.8320987654320988,
"grad_norm": 0.34782519936561584,
"learning_rate": 1.2671604938271606e-05,
"loss": 0.2208,
"step": 3710
},
{
"epoch": 1.837037037037037,
"grad_norm": 6.04480504989624,
"learning_rate": 1.2651851851851852e-05,
"loss": 0.3451,
"step": 3720
},
{
"epoch": 1.8419753086419752,
"grad_norm": 15.001103401184082,
"learning_rate": 1.2632098765432099e-05,
"loss": 0.0905,
"step": 3730
},
{
"epoch": 1.8469135802469134,
"grad_norm": 47.090877532958984,
"learning_rate": 1.2612345679012347e-05,
"loss": 0.2327,
"step": 3740
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.032411132007837296,
"learning_rate": 1.2592592592592593e-05,
"loss": 0.268,
"step": 3750
},
{
"epoch": 1.8567901234567903,
"grad_norm": 54.430667877197266,
"learning_rate": 1.257283950617284e-05,
"loss": 0.213,
"step": 3760
},
{
"epoch": 1.8617283950617285,
"grad_norm": 0.37125247716903687,
"learning_rate": 1.2553086419753086e-05,
"loss": 0.1433,
"step": 3770
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.05495602637529373,
"learning_rate": 1.2533333333333336e-05,
"loss": 0.3747,
"step": 3780
},
{
"epoch": 1.871604938271605,
"grad_norm": 35.28487777709961,
"learning_rate": 1.2513580246913582e-05,
"loss": 0.503,
"step": 3790
},
{
"epoch": 1.876543209876543,
"grad_norm": 60.75400924682617,
"learning_rate": 1.2493827160493827e-05,
"loss": 0.1602,
"step": 3800
},
{
"epoch": 1.8814814814814815,
"grad_norm": 137.60702514648438,
"learning_rate": 1.2474074074074073e-05,
"loss": 0.2931,
"step": 3810
},
{
"epoch": 1.8864197530864197,
"grad_norm": 60.11787796020508,
"learning_rate": 1.2454320987654323e-05,
"loss": 0.354,
"step": 3820
},
{
"epoch": 1.8913580246913582,
"grad_norm": 19.017499923706055,
"learning_rate": 1.243456790123457e-05,
"loss": 0.1684,
"step": 3830
},
{
"epoch": 1.8962962962962964,
"grad_norm": 43.31821823120117,
"learning_rate": 1.2414814814814816e-05,
"loss": 0.1728,
"step": 3840
},
{
"epoch": 1.9012345679012346,
"grad_norm": 602.893798828125,
"learning_rate": 1.2395061728395064e-05,
"loss": 0.2077,
"step": 3850
},
{
"epoch": 1.9061728395061728,
"grad_norm": 12.869080543518066,
"learning_rate": 1.237530864197531e-05,
"loss": 0.2419,
"step": 3860
},
{
"epoch": 1.911111111111111,
"grad_norm": 0.9421246647834778,
"learning_rate": 1.2355555555555557e-05,
"loss": 0.3389,
"step": 3870
},
{
"epoch": 1.9160493827160494,
"grad_norm": 3.65885591506958,
"learning_rate": 1.2335802469135803e-05,
"loss": 0.5007,
"step": 3880
},
{
"epoch": 1.9209876543209876,
"grad_norm": 3.625490665435791,
"learning_rate": 1.2316049382716051e-05,
"loss": 0.1538,
"step": 3890
},
{
"epoch": 1.925925925925926,
"grad_norm": 92.34613800048828,
"learning_rate": 1.2296296296296298e-05,
"loss": 0.4137,
"step": 3900
},
{
"epoch": 1.9308641975308642,
"grad_norm": 0.5257686376571655,
"learning_rate": 1.2276543209876544e-05,
"loss": 0.2876,
"step": 3910
},
{
"epoch": 1.9358024691358025,
"grad_norm": 0.39652788639068604,
"learning_rate": 1.2256790123456792e-05,
"loss": 0.254,
"step": 3920
},
{
"epoch": 1.9407407407407407,
"grad_norm": 26.36481285095215,
"learning_rate": 1.2237037037037039e-05,
"loss": 0.271,
"step": 3930
},
{
"epoch": 1.9456790123456789,
"grad_norm": 0.03053528629243374,
"learning_rate": 1.2217283950617285e-05,
"loss": 0.0742,
"step": 3940
},
{
"epoch": 1.9506172839506173,
"grad_norm": 0.09434489160776138,
"learning_rate": 1.2197530864197531e-05,
"loss": 0.1895,
"step": 3950
},
{
"epoch": 1.9555555555555557,
"grad_norm": 69.78058624267578,
"learning_rate": 1.217777777777778e-05,
"loss": 0.4513,
"step": 3960
},
{
"epoch": 1.960493827160494,
"grad_norm": 0.07707086950540543,
"learning_rate": 1.2158024691358026e-05,
"loss": 0.054,
"step": 3970
},
{
"epoch": 1.9654320987654321,
"grad_norm": 37.1689453125,
"learning_rate": 1.2138271604938272e-05,
"loss": 0.0594,
"step": 3980
},
{
"epoch": 1.9703703703703703,
"grad_norm": 48.61039352416992,
"learning_rate": 1.211851851851852e-05,
"loss": 0.1572,
"step": 3990
},
{
"epoch": 1.9753086419753085,
"grad_norm": 163.54615783691406,
"learning_rate": 1.2098765432098767e-05,
"loss": 0.1604,
"step": 4000
},
{
"epoch": 1.980246913580247,
"grad_norm": 85.144775390625,
"learning_rate": 1.2079012345679013e-05,
"loss": 0.1157,
"step": 4010
},
{
"epoch": 1.9851851851851852,
"grad_norm": 15.836172103881836,
"learning_rate": 1.205925925925926e-05,
"loss": 0.1904,
"step": 4020
},
{
"epoch": 1.9901234567901236,
"grad_norm": 2.649322748184204,
"learning_rate": 1.2039506172839508e-05,
"loss": 0.2893,
"step": 4030
},
{
"epoch": 1.9950617283950618,
"grad_norm": 1.9400321245193481,
"learning_rate": 1.2019753086419754e-05,
"loss": 0.2295,
"step": 4040
},
{
"epoch": 2.0,
"grad_norm": 74.16377258300781,
"learning_rate": 1.2e-05,
"loss": 0.539,
"step": 4050
},
{
"epoch": 2.0,
"eval_accuracy": 0.9798148148148148,
"eval_loss": 0.08341296017169952,
"eval_runtime": 32.2756,
"eval_samples_per_second": 167.309,
"eval_steps_per_second": 20.914,
"step": 4050
},
{
"epoch": 2.004938271604938,
"grad_norm": 0.23466235399246216,
"learning_rate": 1.1980246913580247e-05,
"loss": 0.0651,
"step": 4060
},
{
"epoch": 2.0098765432098764,
"grad_norm": 10.602593421936035,
"learning_rate": 1.1960493827160495e-05,
"loss": 0.2293,
"step": 4070
},
{
"epoch": 2.0148148148148146,
"grad_norm": 101.87135314941406,
"learning_rate": 1.1940740740740741e-05,
"loss": 0.3077,
"step": 4080
},
{
"epoch": 2.0197530864197533,
"grad_norm": 27.52354621887207,
"learning_rate": 1.1920987654320988e-05,
"loss": 0.1848,
"step": 4090
},
{
"epoch": 2.0246913580246915,
"grad_norm": 90.54155731201172,
"learning_rate": 1.1901234567901236e-05,
"loss": 0.2108,
"step": 4100
},
{
"epoch": 2.0296296296296297,
"grad_norm": 0.018464339897036552,
"learning_rate": 1.1881481481481482e-05,
"loss": 0.1732,
"step": 4110
},
{
"epoch": 2.034567901234568,
"grad_norm": 0.21476837992668152,
"learning_rate": 1.1861728395061728e-05,
"loss": 0.5227,
"step": 4120
},
{
"epoch": 2.039506172839506,
"grad_norm": 95.82560729980469,
"learning_rate": 1.1841975308641975e-05,
"loss": 0.1769,
"step": 4130
},
{
"epoch": 2.0444444444444443,
"grad_norm": 6.9548468589782715,
"learning_rate": 1.1822222222222225e-05,
"loss": 0.2134,
"step": 4140
},
{
"epoch": 2.049382716049383,
"grad_norm": 80.2332763671875,
"learning_rate": 1.180246913580247e-05,
"loss": 0.2451,
"step": 4150
},
{
"epoch": 2.054320987654321,
"grad_norm": 19.164928436279297,
"learning_rate": 1.1782716049382716e-05,
"loss": 0.1896,
"step": 4160
},
{
"epoch": 2.0592592592592593,
"grad_norm": 0.12828746438026428,
"learning_rate": 1.1762962962962965e-05,
"loss": 0.077,
"step": 4170
},
{
"epoch": 2.0641975308641975,
"grad_norm": 3.3232741355895996,
"learning_rate": 1.1743209876543212e-05,
"loss": 0.0855,
"step": 4180
},
{
"epoch": 2.0691358024691358,
"grad_norm": 0.32502618432044983,
"learning_rate": 1.1723456790123458e-05,
"loss": 0.2269,
"step": 4190
},
{
"epoch": 2.074074074074074,
"grad_norm": 1.072849154472351,
"learning_rate": 1.1703703703703703e-05,
"loss": 0.2473,
"step": 4200
},
{
"epoch": 2.079012345679012,
"grad_norm": 3.3251664638519287,
"learning_rate": 1.1683950617283953e-05,
"loss": 0.2367,
"step": 4210
},
{
"epoch": 2.083950617283951,
"grad_norm": 0.1870512068271637,
"learning_rate": 1.1664197530864199e-05,
"loss": 0.2782,
"step": 4220
},
{
"epoch": 2.088888888888889,
"grad_norm": 3.8792381286621094,
"learning_rate": 1.1644444444444446e-05,
"loss": 0.1886,
"step": 4230
},
{
"epoch": 2.093827160493827,
"grad_norm": 47.594451904296875,
"learning_rate": 1.1624691358024694e-05,
"loss": 0.3145,
"step": 4240
},
{
"epoch": 2.0987654320987654,
"grad_norm": 158.525634765625,
"learning_rate": 1.160493827160494e-05,
"loss": 0.3143,
"step": 4250
},
{
"epoch": 2.1037037037037036,
"grad_norm": 74.01322174072266,
"learning_rate": 1.1585185185185186e-05,
"loss": 0.1924,
"step": 4260
},
{
"epoch": 2.108641975308642,
"grad_norm": 75.74314880371094,
"learning_rate": 1.1565432098765433e-05,
"loss": 0.3617,
"step": 4270
},
{
"epoch": 2.11358024691358,
"grad_norm": 22.196048736572266,
"learning_rate": 1.1545679012345681e-05,
"loss": 0.2283,
"step": 4280
},
{
"epoch": 2.1185185185185187,
"grad_norm": 0.7152767777442932,
"learning_rate": 1.1525925925925927e-05,
"loss": 0.3129,
"step": 4290
},
{
"epoch": 2.123456790123457,
"grad_norm": 0.11401913315057755,
"learning_rate": 1.1506172839506174e-05,
"loss": 0.2689,
"step": 4300
},
{
"epoch": 2.128395061728395,
"grad_norm": 52.53899002075195,
"learning_rate": 1.148641975308642e-05,
"loss": 0.0563,
"step": 4310
},
{
"epoch": 2.1333333333333333,
"grad_norm": 42.3081169128418,
"learning_rate": 1.1466666666666668e-05,
"loss": 0.2296,
"step": 4320
},
{
"epoch": 2.1382716049382715,
"grad_norm": 10.208148002624512,
"learning_rate": 1.1446913580246915e-05,
"loss": 0.3501,
"step": 4330
},
{
"epoch": 2.1432098765432097,
"grad_norm": 20.181745529174805,
"learning_rate": 1.1427160493827161e-05,
"loss": 0.0309,
"step": 4340
},
{
"epoch": 2.148148148148148,
"grad_norm": 0.01720772311091423,
"learning_rate": 1.1407407407407409e-05,
"loss": 0.1887,
"step": 4350
},
{
"epoch": 2.1530864197530866,
"grad_norm": 6.094252109527588,
"learning_rate": 1.1387654320987655e-05,
"loss": 0.0933,
"step": 4360
},
{
"epoch": 2.1580246913580248,
"grad_norm": 0.02691926248371601,
"learning_rate": 1.1367901234567902e-05,
"loss": 0.1443,
"step": 4370
},
{
"epoch": 2.162962962962963,
"grad_norm": 0.3429844081401825,
"learning_rate": 1.1348148148148148e-05,
"loss": 0.253,
"step": 4380
},
{
"epoch": 2.167901234567901,
"grad_norm": 36.565834045410156,
"learning_rate": 1.1328395061728396e-05,
"loss": 0.3124,
"step": 4390
},
{
"epoch": 2.1728395061728394,
"grad_norm": 0.1142088919878006,
"learning_rate": 1.1308641975308643e-05,
"loss": 0.2102,
"step": 4400
},
{
"epoch": 2.1777777777777776,
"grad_norm": 1.0915874242782593,
"learning_rate": 1.1288888888888889e-05,
"loss": 0.117,
"step": 4410
},
{
"epoch": 2.1827160493827162,
"grad_norm": 0.015154359862208366,
"learning_rate": 1.1269135802469137e-05,
"loss": 0.2591,
"step": 4420
},
{
"epoch": 2.1876543209876544,
"grad_norm": 0.0378662571310997,
"learning_rate": 1.1249382716049384e-05,
"loss": 0.4314,
"step": 4430
},
{
"epoch": 2.1925925925925926,
"grad_norm": 39.53334045410156,
"learning_rate": 1.122962962962963e-05,
"loss": 0.0796,
"step": 4440
},
{
"epoch": 2.197530864197531,
"grad_norm": 33.39299011230469,
"learning_rate": 1.1209876543209876e-05,
"loss": 0.0708,
"step": 4450
},
{
"epoch": 2.202469135802469,
"grad_norm": 32.73172378540039,
"learning_rate": 1.1190123456790124e-05,
"loss": 0.0602,
"step": 4460
},
{
"epoch": 2.2074074074074073,
"grad_norm": 27.3021297454834,
"learning_rate": 1.117037037037037e-05,
"loss": 0.0563,
"step": 4470
},
{
"epoch": 2.212345679012346,
"grad_norm": 33.85374450683594,
"learning_rate": 1.1150617283950617e-05,
"loss": 0.3346,
"step": 4480
},
{
"epoch": 2.217283950617284,
"grad_norm": 46.218204498291016,
"learning_rate": 1.1130864197530864e-05,
"loss": 0.2087,
"step": 4490
},
{
"epoch": 2.2222222222222223,
"grad_norm": 47.22572326660156,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.2552,
"step": 4500
},
{
"epoch": 2.2271604938271605,
"grad_norm": 0.1430201381444931,
"learning_rate": 1.1091358024691358e-05,
"loss": 0.2517,
"step": 4510
},
{
"epoch": 2.2320987654320987,
"grad_norm": 11.38235092163086,
"learning_rate": 1.1071604938271604e-05,
"loss": 0.1918,
"step": 4520
},
{
"epoch": 2.237037037037037,
"grad_norm": 37.20140838623047,
"learning_rate": 1.1051851851851854e-05,
"loss": 0.1549,
"step": 4530
},
{
"epoch": 2.241975308641975,
"grad_norm": 0.10535780340433121,
"learning_rate": 1.10320987654321e-05,
"loss": 0.0271,
"step": 4540
},
{
"epoch": 2.246913580246914,
"grad_norm": 0.6121019124984741,
"learning_rate": 1.1012345679012347e-05,
"loss": 0.389,
"step": 4550
},
{
"epoch": 2.251851851851852,
"grad_norm": 35.94973373413086,
"learning_rate": 1.0992592592592592e-05,
"loss": 0.3603,
"step": 4560
},
{
"epoch": 2.25679012345679,
"grad_norm": 95.45260620117188,
"learning_rate": 1.0972839506172841e-05,
"loss": 0.4025,
"step": 4570
},
{
"epoch": 2.2617283950617284,
"grad_norm": 0.17219342291355133,
"learning_rate": 1.0953086419753088e-05,
"loss": 0.2335,
"step": 4580
},
{
"epoch": 2.2666666666666666,
"grad_norm": 1.9040601253509521,
"learning_rate": 1.0933333333333334e-05,
"loss": 0.3124,
"step": 4590
},
{
"epoch": 2.271604938271605,
"grad_norm": 77.7896957397461,
"learning_rate": 1.0913580246913582e-05,
"loss": 0.2387,
"step": 4600
},
{
"epoch": 2.276543209876543,
"grad_norm": 0.5370518565177917,
"learning_rate": 1.0893827160493829e-05,
"loss": 0.1187,
"step": 4610
},
{
"epoch": 2.2814814814814817,
"grad_norm": 113.6650619506836,
"learning_rate": 1.0874074074074075e-05,
"loss": 0.3598,
"step": 4620
},
{
"epoch": 2.28641975308642,
"grad_norm": 0.025056390091776848,
"learning_rate": 1.0854320987654322e-05,
"loss": 0.1631,
"step": 4630
},
{
"epoch": 2.291358024691358,
"grad_norm": 0.0650627464056015,
"learning_rate": 1.083456790123457e-05,
"loss": 0.257,
"step": 4640
},
{
"epoch": 2.2962962962962963,
"grad_norm": 34.378414154052734,
"learning_rate": 1.0814814814814816e-05,
"loss": 0.2349,
"step": 4650
},
{
"epoch": 2.3012345679012345,
"grad_norm": 0.046463072299957275,
"learning_rate": 1.0795061728395062e-05,
"loss": 0.0695,
"step": 4660
},
{
"epoch": 2.3061728395061727,
"grad_norm": 81.86966705322266,
"learning_rate": 1.077530864197531e-05,
"loss": 0.2093,
"step": 4670
},
{
"epoch": 2.311111111111111,
"grad_norm": 0.004781852941960096,
"learning_rate": 1.0755555555555557e-05,
"loss": 0.1424,
"step": 4680
},
{
"epoch": 2.3160493827160495,
"grad_norm": 0.817314624786377,
"learning_rate": 1.0735802469135803e-05,
"loss": 0.0413,
"step": 4690
},
{
"epoch": 2.3209876543209877,
"grad_norm": 5.055154800415039,
"learning_rate": 1.071604938271605e-05,
"loss": 0.0046,
"step": 4700
},
{
"epoch": 2.325925925925926,
"grad_norm": 133.45437622070312,
"learning_rate": 1.0696296296296298e-05,
"loss": 0.3131,
"step": 4710
},
{
"epoch": 2.330864197530864,
"grad_norm": 0.014058091677725315,
"learning_rate": 1.0676543209876544e-05,
"loss": 0.1227,
"step": 4720
},
{
"epoch": 2.3358024691358024,
"grad_norm": 4.482833385467529,
"learning_rate": 1.065679012345679e-05,
"loss": 0.1694,
"step": 4730
},
{
"epoch": 2.3407407407407406,
"grad_norm": 0.8238074779510498,
"learning_rate": 1.0637037037037037e-05,
"loss": 0.1315,
"step": 4740
},
{
"epoch": 2.3456790123456788,
"grad_norm": 55.907318115234375,
"learning_rate": 1.0617283950617285e-05,
"loss": 0.0988,
"step": 4750
},
{
"epoch": 2.3506172839506174,
"grad_norm": 119.31465911865234,
"learning_rate": 1.0597530864197531e-05,
"loss": 0.2308,
"step": 4760
},
{
"epoch": 2.3555555555555556,
"grad_norm": 5.956635475158691,
"learning_rate": 1.0577777777777778e-05,
"loss": 0.1726,
"step": 4770
},
{
"epoch": 2.360493827160494,
"grad_norm": 1.8036092519760132,
"learning_rate": 1.0558024691358026e-05,
"loss": 0.2904,
"step": 4780
},
{
"epoch": 2.365432098765432,
"grad_norm": 16.762969970703125,
"learning_rate": 1.0538271604938272e-05,
"loss": 0.039,
"step": 4790
},
{
"epoch": 2.3703703703703702,
"grad_norm": 0.5352030992507935,
"learning_rate": 1.0518518518518519e-05,
"loss": 0.6986,
"step": 4800
},
{
"epoch": 2.375308641975309,
"grad_norm": 72.20184326171875,
"learning_rate": 1.0498765432098765e-05,
"loss": 0.1986,
"step": 4810
},
{
"epoch": 2.380246913580247,
"grad_norm": 39.09406661987305,
"learning_rate": 1.0479012345679013e-05,
"loss": 0.2384,
"step": 4820
},
{
"epoch": 2.3851851851851853,
"grad_norm": 101.78142547607422,
"learning_rate": 1.045925925925926e-05,
"loss": 0.1049,
"step": 4830
},
{
"epoch": 2.3901234567901235,
"grad_norm": 31.242937088012695,
"learning_rate": 1.0439506172839506e-05,
"loss": 0.3993,
"step": 4840
},
{
"epoch": 2.3950617283950617,
"grad_norm": 107.1478271484375,
"learning_rate": 1.0419753086419756e-05,
"loss": 0.1895,
"step": 4850
},
{
"epoch": 2.4,
"grad_norm": 0.6550659537315369,
"learning_rate": 1.04e-05,
"loss": 0.2174,
"step": 4860
},
{
"epoch": 2.404938271604938,
"grad_norm": 37.14043045043945,
"learning_rate": 1.0380246913580247e-05,
"loss": 0.2233,
"step": 4870
},
{
"epoch": 2.4098765432098768,
"grad_norm": 10.13899040222168,
"learning_rate": 1.0360493827160493e-05,
"loss": 0.4372,
"step": 4880
},
{
"epoch": 2.414814814814815,
"grad_norm": 0.8044024705886841,
"learning_rate": 1.0340740740740743e-05,
"loss": 0.3235,
"step": 4890
},
{
"epoch": 2.419753086419753,
"grad_norm": 0.08543165773153305,
"learning_rate": 1.032098765432099e-05,
"loss": 0.0319,
"step": 4900
},
{
"epoch": 2.4246913580246914,
"grad_norm": 25.276649475097656,
"learning_rate": 1.0301234567901236e-05,
"loss": 0.2608,
"step": 4910
},
{
"epoch": 2.4296296296296296,
"grad_norm": 53.250003814697266,
"learning_rate": 1.0281481481481484e-05,
"loss": 0.2555,
"step": 4920
},
{
"epoch": 2.434567901234568,
"grad_norm": 0.0675877258181572,
"learning_rate": 1.026172839506173e-05,
"loss": 0.1439,
"step": 4930
},
{
"epoch": 2.439506172839506,
"grad_norm": 0.07533666491508484,
"learning_rate": 1.0241975308641977e-05,
"loss": 0.2685,
"step": 4940
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.0232541486620903,
"learning_rate": 1.0222222222222223e-05,
"loss": 0.1896,
"step": 4950
},
{
"epoch": 2.449382716049383,
"grad_norm": 0.4157695770263672,
"learning_rate": 1.0202469135802471e-05,
"loss": 0.4117,
"step": 4960
},
{
"epoch": 2.454320987654321,
"grad_norm": 6.473262786865234,
"learning_rate": 1.0182716049382717e-05,
"loss": 0.1608,
"step": 4970
},
{
"epoch": 2.4592592592592593,
"grad_norm": 47.35124588012695,
"learning_rate": 1.0162962962962964e-05,
"loss": 0.1861,
"step": 4980
},
{
"epoch": 2.4641975308641975,
"grad_norm": 0.0442415289580822,
"learning_rate": 1.014320987654321e-05,
"loss": 0.2317,
"step": 4990
},
{
"epoch": 2.4691358024691357,
"grad_norm": 0.02038310095667839,
"learning_rate": 1.0123456790123458e-05,
"loss": 0.5267,
"step": 5000
},
{
"epoch": 2.474074074074074,
"grad_norm": 166.4259033203125,
"learning_rate": 1.0103703703703705e-05,
"loss": 0.2363,
"step": 5010
},
{
"epoch": 2.4790123456790125,
"grad_norm": 68.62043762207031,
"learning_rate": 1.0083950617283951e-05,
"loss": 0.2097,
"step": 5020
},
{
"epoch": 2.4839506172839507,
"grad_norm": 2.836273431777954,
"learning_rate": 1.00641975308642e-05,
"loss": 0.2101,
"step": 5030
},
{
"epoch": 2.488888888888889,
"grad_norm": 4.900826930999756,
"learning_rate": 1.0044444444444446e-05,
"loss": 0.191,
"step": 5040
},
{
"epoch": 2.493827160493827,
"grad_norm": 22.4804744720459,
"learning_rate": 1.0024691358024692e-05,
"loss": 0.182,
"step": 5050
},
{
"epoch": 2.4987654320987653,
"grad_norm": 0.00806320272386074,
"learning_rate": 1.0004938271604938e-05,
"loss": 0.0106,
"step": 5060
},
{
"epoch": 2.5037037037037035,
"grad_norm": 0.13981568813323975,
"learning_rate": 9.985185185185185e-06,
"loss": 0.2085,
"step": 5070
},
{
"epoch": 2.5086419753086417,
"grad_norm": 115.363037109375,
"learning_rate": 9.965432098765433e-06,
"loss": 0.3881,
"step": 5080
},
{
"epoch": 2.5135802469135804,
"grad_norm": 0.5273131132125854,
"learning_rate": 9.945679012345681e-06,
"loss": 0.3149,
"step": 5090
},
{
"epoch": 2.5185185185185186,
"grad_norm": 0.044860485941171646,
"learning_rate": 9.925925925925927e-06,
"loss": 0.1487,
"step": 5100
},
{
"epoch": 2.523456790123457,
"grad_norm": 0.0039957864210009575,
"learning_rate": 9.906172839506174e-06,
"loss": 0.1385,
"step": 5110
},
{
"epoch": 2.528395061728395,
"grad_norm": 0.014863072894513607,
"learning_rate": 9.88641975308642e-06,
"loss": 0.1111,
"step": 5120
},
{
"epoch": 2.533333333333333,
"grad_norm": 75.10174560546875,
"learning_rate": 9.866666666666668e-06,
"loss": 0.1741,
"step": 5130
},
{
"epoch": 2.538271604938272,
"grad_norm": 0.048640429973602295,
"learning_rate": 9.846913580246915e-06,
"loss": 0.1827,
"step": 5140
},
{
"epoch": 2.5432098765432096,
"grad_norm": 0.25287771224975586,
"learning_rate": 9.827160493827161e-06,
"loss": 0.2889,
"step": 5150
},
{
"epoch": 2.5481481481481483,
"grad_norm": 3.0355021953582764,
"learning_rate": 9.807407407407407e-06,
"loss": 0.0549,
"step": 5160
},
{
"epoch": 2.5530864197530865,
"grad_norm": 0.008490847423672676,
"learning_rate": 9.787654320987655e-06,
"loss": 0.1945,
"step": 5170
},
{
"epoch": 2.5580246913580247,
"grad_norm": 0.055667582899332047,
"learning_rate": 9.767901234567902e-06,
"loss": 0.178,
"step": 5180
},
{
"epoch": 2.562962962962963,
"grad_norm": 2.11090350151062,
"learning_rate": 9.748148148148148e-06,
"loss": 0.1497,
"step": 5190
},
{
"epoch": 2.567901234567901,
"grad_norm": 48.44843292236328,
"learning_rate": 9.728395061728396e-06,
"loss": 0.3233,
"step": 5200
},
{
"epoch": 2.5728395061728397,
"grad_norm": 16.53707504272461,
"learning_rate": 9.708641975308643e-06,
"loss": 0.0269,
"step": 5210
},
{
"epoch": 2.5777777777777775,
"grad_norm": 85.8476791381836,
"learning_rate": 9.688888888888889e-06,
"loss": 0.4162,
"step": 5220
},
{
"epoch": 2.582716049382716,
"grad_norm": 333.21466064453125,
"learning_rate": 9.669135802469136e-06,
"loss": 0.161,
"step": 5230
},
{
"epoch": 2.5876543209876544,
"grad_norm": 46.150047302246094,
"learning_rate": 9.649382716049384e-06,
"loss": 0.1367,
"step": 5240
},
{
"epoch": 2.5925925925925926,
"grad_norm": 23.23380470275879,
"learning_rate": 9.62962962962963e-06,
"loss": 0.049,
"step": 5250
},
{
"epoch": 2.5975308641975308,
"grad_norm": 0.01312983874231577,
"learning_rate": 9.609876543209878e-06,
"loss": 0.4376,
"step": 5260
},
{
"epoch": 2.602469135802469,
"grad_norm": 0.1367645114660263,
"learning_rate": 9.590123456790124e-06,
"loss": 0.0646,
"step": 5270
},
{
"epoch": 2.6074074074074076,
"grad_norm": 0.16247719526290894,
"learning_rate": 9.570370370370371e-06,
"loss": 0.3247,
"step": 5280
},
{
"epoch": 2.612345679012346,
"grad_norm": 140.21865844726562,
"learning_rate": 9.550617283950619e-06,
"loss": 0.4135,
"step": 5290
},
{
"epoch": 2.617283950617284,
"grad_norm": 60.00096893310547,
"learning_rate": 9.530864197530865e-06,
"loss": 0.1836,
"step": 5300
},
{
"epoch": 2.6222222222222222,
"grad_norm": 0.0217946358025074,
"learning_rate": 9.511111111111112e-06,
"loss": 0.3697,
"step": 5310
},
{
"epoch": 2.6271604938271604,
"grad_norm": 75.67610931396484,
"learning_rate": 9.491358024691358e-06,
"loss": 0.0953,
"step": 5320
},
{
"epoch": 2.6320987654320986,
"grad_norm": 19.351255416870117,
"learning_rate": 9.471604938271606e-06,
"loss": 0.0855,
"step": 5330
},
{
"epoch": 2.637037037037037,
"grad_norm": 7.155949115753174,
"learning_rate": 9.451851851851853e-06,
"loss": 0.5717,
"step": 5340
},
{
"epoch": 2.6419753086419755,
"grad_norm": 143.97991943359375,
"learning_rate": 9.432098765432099e-06,
"loss": 0.0894,
"step": 5350
},
{
"epoch": 2.6469135802469137,
"grad_norm": 66.95204162597656,
"learning_rate": 9.412345679012347e-06,
"loss": 0.1136,
"step": 5360
},
{
"epoch": 2.651851851851852,
"grad_norm": 5.1548590660095215,
"learning_rate": 9.392592592592593e-06,
"loss": 0.066,
"step": 5370
},
{
"epoch": 2.65679012345679,
"grad_norm": 164.66404724121094,
"learning_rate": 9.37283950617284e-06,
"loss": 0.2865,
"step": 5380
},
{
"epoch": 2.6617283950617283,
"grad_norm": 200.15574645996094,
"learning_rate": 9.353086419753086e-06,
"loss": 0.2895,
"step": 5390
},
{
"epoch": 2.6666666666666665,
"grad_norm": 233.70343017578125,
"learning_rate": 9.333333333333334e-06,
"loss": 0.052,
"step": 5400
},
{
"epoch": 2.6716049382716047,
"grad_norm": 140.56007385253906,
"learning_rate": 9.31358024691358e-06,
"loss": 0.2882,
"step": 5410
},
{
"epoch": 2.6765432098765434,
"grad_norm": 281.7587585449219,
"learning_rate": 9.293827160493827e-06,
"loss": 0.1121,
"step": 5420
},
{
"epoch": 2.6814814814814816,
"grad_norm": 0.00958334095776081,
"learning_rate": 9.274074074074075e-06,
"loss": 0.0447,
"step": 5430
},
{
"epoch": 2.68641975308642,
"grad_norm": 0.6552028059959412,
"learning_rate": 9.254320987654322e-06,
"loss": 0.0727,
"step": 5440
},
{
"epoch": 2.691358024691358,
"grad_norm": 0.01010242011398077,
"learning_rate": 9.23456790123457e-06,
"loss": 0.1756,
"step": 5450
},
{
"epoch": 2.696296296296296,
"grad_norm": 0.013218900188803673,
"learning_rate": 9.214814814814816e-06,
"loss": 0.2895,
"step": 5460
},
{
"epoch": 2.701234567901235,
"grad_norm": 44.7857780456543,
"learning_rate": 9.195061728395062e-06,
"loss": 0.323,
"step": 5470
},
{
"epoch": 2.7061728395061726,
"grad_norm": 2.435910701751709,
"learning_rate": 9.175308641975309e-06,
"loss": 0.473,
"step": 5480
},
{
"epoch": 2.7111111111111112,
"grad_norm": 5.467461585998535,
"learning_rate": 9.155555555555557e-06,
"loss": 0.4263,
"step": 5490
},
{
"epoch": 2.7160493827160495,
"grad_norm": 0.020925594493746758,
"learning_rate": 9.135802469135803e-06,
"loss": 0.1927,
"step": 5500
},
{
"epoch": 2.7209876543209877,
"grad_norm": 0.850062906742096,
"learning_rate": 9.11604938271605e-06,
"loss": 0.2724,
"step": 5510
},
{
"epoch": 2.725925925925926,
"grad_norm": 0.8104738593101501,
"learning_rate": 9.096296296296298e-06,
"loss": 0.0688,
"step": 5520
},
{
"epoch": 2.730864197530864,
"grad_norm": 183.3977813720703,
"learning_rate": 9.076543209876544e-06,
"loss": 0.403,
"step": 5530
},
{
"epoch": 2.7358024691358027,
"grad_norm": 0.39399421215057373,
"learning_rate": 9.05679012345679e-06,
"loss": 0.2956,
"step": 5540
},
{
"epoch": 2.7407407407407405,
"grad_norm": 17.86000633239746,
"learning_rate": 9.037037037037037e-06,
"loss": 0.2467,
"step": 5550
},
{
"epoch": 2.745679012345679,
"grad_norm": 0.007520174607634544,
"learning_rate": 9.017283950617285e-06,
"loss": 0.0734,
"step": 5560
},
{
"epoch": 2.7506172839506173,
"grad_norm": 42.2265739440918,
"learning_rate": 8.997530864197531e-06,
"loss": 0.1445,
"step": 5570
},
{
"epoch": 2.7555555555555555,
"grad_norm": 55.289222717285156,
"learning_rate": 8.977777777777778e-06,
"loss": 0.1346,
"step": 5580
},
{
"epoch": 2.7604938271604937,
"grad_norm": 1.1563366651535034,
"learning_rate": 8.958024691358024e-06,
"loss": 0.1427,
"step": 5590
},
{
"epoch": 2.765432098765432,
"grad_norm": 31.966625213623047,
"learning_rate": 8.938271604938272e-06,
"loss": 0.1432,
"step": 5600
},
{
"epoch": 2.7703703703703706,
"grad_norm": 26.22989273071289,
"learning_rate": 8.91851851851852e-06,
"loss": 0.1465,
"step": 5610
},
{
"epoch": 2.775308641975309,
"grad_norm": 2.2528607845306396,
"learning_rate": 8.898765432098767e-06,
"loss": 0.1046,
"step": 5620
},
{
"epoch": 2.780246913580247,
"grad_norm": 41.7017707824707,
"learning_rate": 8.879012345679013e-06,
"loss": 0.3095,
"step": 5630
},
{
"epoch": 2.785185185185185,
"grad_norm": 80.6755142211914,
"learning_rate": 8.85925925925926e-06,
"loss": 0.1785,
"step": 5640
},
{
"epoch": 2.7901234567901234,
"grad_norm": 49.54252624511719,
"learning_rate": 8.839506172839508e-06,
"loss": 0.1924,
"step": 5650
},
{
"epoch": 2.7950617283950616,
"grad_norm": 0.05363411456346512,
"learning_rate": 8.819753086419754e-06,
"loss": 0.1327,
"step": 5660
},
{
"epoch": 2.8,
"grad_norm": 8.126516342163086,
"learning_rate": 8.8e-06,
"loss": 0.121,
"step": 5670
},
{
"epoch": 2.8049382716049385,
"grad_norm": 0.02661011926829815,
"learning_rate": 8.780246913580249e-06,
"loss": 0.0073,
"step": 5680
},
{
"epoch": 2.8098765432098767,
"grad_norm": 8.132286071777344,
"learning_rate": 8.760493827160495e-06,
"loss": 0.1296,
"step": 5690
},
{
"epoch": 2.814814814814815,
"grad_norm": 62.083099365234375,
"learning_rate": 8.740740740740741e-06,
"loss": 0.2036,
"step": 5700
},
{
"epoch": 2.819753086419753,
"grad_norm": 17.057275772094727,
"learning_rate": 8.720987654320988e-06,
"loss": 0.236,
"step": 5710
},
{
"epoch": 2.8246913580246913,
"grad_norm": 0.07913421094417572,
"learning_rate": 8.701234567901236e-06,
"loss": 0.0186,
"step": 5720
},
{
"epoch": 2.8296296296296295,
"grad_norm": 59.11501693725586,
"learning_rate": 8.681481481481482e-06,
"loss": 0.2352,
"step": 5730
},
{
"epoch": 2.8345679012345677,
"grad_norm": 0.05783538892865181,
"learning_rate": 8.661728395061729e-06,
"loss": 0.325,
"step": 5740
},
{
"epoch": 2.8395061728395063,
"grad_norm": 0.07834266871213913,
"learning_rate": 8.641975308641975e-06,
"loss": 0.0508,
"step": 5750
},
{
"epoch": 2.8444444444444446,
"grad_norm": 2.788255214691162,
"learning_rate": 8.622222222222223e-06,
"loss": 0.0728,
"step": 5760
},
{
"epoch": 2.8493827160493828,
"grad_norm": 41.630611419677734,
"learning_rate": 8.602469135802471e-06,
"loss": 0.2255,
"step": 5770
},
{
"epoch": 2.854320987654321,
"grad_norm": 0.47825512290000916,
"learning_rate": 8.582716049382716e-06,
"loss": 0.1858,
"step": 5780
},
{
"epoch": 2.859259259259259,
"grad_norm": 0.4730166494846344,
"learning_rate": 8.562962962962964e-06,
"loss": 0.0417,
"step": 5790
},
{
"epoch": 2.8641975308641974,
"grad_norm": 0.00964848231524229,
"learning_rate": 8.54320987654321e-06,
"loss": 0.2487,
"step": 5800
},
{
"epoch": 2.8691358024691356,
"grad_norm": 4.990635395050049,
"learning_rate": 8.523456790123458e-06,
"loss": 0.1967,
"step": 5810
},
{
"epoch": 2.8740740740740742,
"grad_norm": 0.06853197515010834,
"learning_rate": 8.503703703703705e-06,
"loss": 0.1847,
"step": 5820
},
{
"epoch": 2.8790123456790124,
"grad_norm": 14.369994163513184,
"learning_rate": 8.483950617283951e-06,
"loss": 0.4819,
"step": 5830
},
{
"epoch": 2.8839506172839506,
"grad_norm": 1.4478572607040405,
"learning_rate": 8.464197530864198e-06,
"loss": 0.2011,
"step": 5840
},
{
"epoch": 2.888888888888889,
"grad_norm": 197.60943603515625,
"learning_rate": 8.444444444444446e-06,
"loss": 0.2301,
"step": 5850
},
{
"epoch": 2.893827160493827,
"grad_norm": 0.3465060293674469,
"learning_rate": 8.424691358024692e-06,
"loss": 0.0814,
"step": 5860
},
{
"epoch": 2.8987654320987657,
"grad_norm": 0.22260437905788422,
"learning_rate": 8.404938271604938e-06,
"loss": 0.1913,
"step": 5870
},
{
"epoch": 2.9037037037037035,
"grad_norm": 3.2895030975341797,
"learning_rate": 8.385185185185187e-06,
"loss": 0.161,
"step": 5880
},
{
"epoch": 2.908641975308642,
"grad_norm": 75.78804016113281,
"learning_rate": 8.365432098765433e-06,
"loss": 0.2146,
"step": 5890
},
{
"epoch": 2.9135802469135803,
"grad_norm": 37.905670166015625,
"learning_rate": 8.34567901234568e-06,
"loss": 0.0171,
"step": 5900
},
{
"epoch": 2.9185185185185185,
"grad_norm": 1.2207163572311401,
"learning_rate": 8.325925925925926e-06,
"loss": 0.0008,
"step": 5910
},
{
"epoch": 2.9234567901234567,
"grad_norm": 0.26251447200775146,
"learning_rate": 8.306172839506174e-06,
"loss": 0.2391,
"step": 5920
},
{
"epoch": 2.928395061728395,
"grad_norm": 184.48342895507812,
"learning_rate": 8.28641975308642e-06,
"loss": 0.4721,
"step": 5930
},
{
"epoch": 2.9333333333333336,
"grad_norm": 2.430443048477173,
"learning_rate": 8.266666666666667e-06,
"loss": 0.3217,
"step": 5940
},
{
"epoch": 2.9382716049382713,
"grad_norm": 167.15850830078125,
"learning_rate": 8.246913580246915e-06,
"loss": 0.1661,
"step": 5950
},
{
"epoch": 2.94320987654321,
"grad_norm": 3.9648666381835938,
"learning_rate": 8.227160493827161e-06,
"loss": 0.0846,
"step": 5960
},
{
"epoch": 2.948148148148148,
"grad_norm": 0.18866649270057678,
"learning_rate": 8.207407407407409e-06,
"loss": 0.0355,
"step": 5970
},
{
"epoch": 2.9530864197530864,
"grad_norm": 0.19261124730110168,
"learning_rate": 8.187654320987654e-06,
"loss": 0.1842,
"step": 5980
},
{
"epoch": 2.9580246913580246,
"grad_norm": 0.13655029237270355,
"learning_rate": 8.167901234567902e-06,
"loss": 0.0244,
"step": 5990
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.24857792258262634,
"learning_rate": 8.148148148148148e-06,
"loss": 0.2052,
"step": 6000
},
{
"epoch": 2.9679012345679014,
"grad_norm": 85.19855499267578,
"learning_rate": 8.128395061728396e-06,
"loss": 0.187,
"step": 6010
},
{
"epoch": 2.9728395061728397,
"grad_norm": 190.1832733154297,
"learning_rate": 8.108641975308643e-06,
"loss": 0.5081,
"step": 6020
},
{
"epoch": 2.977777777777778,
"grad_norm": 0.0004998709191568196,
"learning_rate": 8.08888888888889e-06,
"loss": 0.4187,
"step": 6030
},
{
"epoch": 2.982716049382716,
"grad_norm": 0.019353624433279037,
"learning_rate": 8.069135802469137e-06,
"loss": 0.0796,
"step": 6040
},
{
"epoch": 2.9876543209876543,
"grad_norm": 0.00627252459526062,
"learning_rate": 8.049382716049384e-06,
"loss": 0.4005,
"step": 6050
},
{
"epoch": 2.9925925925925925,
"grad_norm": 159.71725463867188,
"learning_rate": 8.02962962962963e-06,
"loss": 0.028,
"step": 6060
},
{
"epoch": 2.9975308641975307,
"grad_norm": 2.6106536388397217,
"learning_rate": 8.009876543209876e-06,
"loss": 0.024,
"step": 6070
},
{
"epoch": 3.0,
"eval_accuracy": 0.9831481481481481,
"eval_loss": 0.07001630961894989,
"eval_runtime": 32.6621,
"eval_samples_per_second": 165.329,
"eval_steps_per_second": 20.666,
"step": 6075
},
{
"epoch": 3.0024691358024693,
"grad_norm": 1.3359025716781616,
"learning_rate": 7.990123456790125e-06,
"loss": 0.0996,
"step": 6080
},
{
"epoch": 3.0074074074074075,
"grad_norm": 0.05273491516709328,
"learning_rate": 7.970370370370371e-06,
"loss": 0.012,
"step": 6090
},
{
"epoch": 3.0123456790123457,
"grad_norm": 0.23167039453983307,
"learning_rate": 7.950617283950617e-06,
"loss": 0.1334,
"step": 6100
},
{
"epoch": 3.017283950617284,
"grad_norm": 0.03928215801715851,
"learning_rate": 7.930864197530865e-06,
"loss": 0.1258,
"step": 6110
},
{
"epoch": 3.022222222222222,
"grad_norm": 109.73241424560547,
"learning_rate": 7.911111111111112e-06,
"loss": 0.1747,
"step": 6120
},
{
"epoch": 3.0271604938271603,
"grad_norm": 2.945659637451172,
"learning_rate": 7.89135802469136e-06,
"loss": 0.1064,
"step": 6130
},
{
"epoch": 3.0320987654320986,
"grad_norm": 19.941844940185547,
"learning_rate": 7.871604938271605e-06,
"loss": 0.1372,
"step": 6140
},
{
"epoch": 3.037037037037037,
"grad_norm": 0.11880356073379517,
"learning_rate": 7.851851851851853e-06,
"loss": 0.0212,
"step": 6150
},
{
"epoch": 3.0419753086419754,
"grad_norm": 1.0245414972305298,
"learning_rate": 7.832098765432099e-06,
"loss": 0.2392,
"step": 6160
},
{
"epoch": 3.0469135802469136,
"grad_norm": 0.23312650620937347,
"learning_rate": 7.812345679012347e-06,
"loss": 0.0706,
"step": 6170
},
{
"epoch": 3.051851851851852,
"grad_norm": 63.500797271728516,
"learning_rate": 7.792592592592594e-06,
"loss": 0.2912,
"step": 6180
},
{
"epoch": 3.05679012345679,
"grad_norm": 4.3201727867126465,
"learning_rate": 7.77283950617284e-06,
"loss": 0.1027,
"step": 6190
},
{
"epoch": 3.0617283950617282,
"grad_norm": 0.009072243236005306,
"learning_rate": 7.753086419753088e-06,
"loss": 0.0177,
"step": 6200
},
{
"epoch": 3.066666666666667,
"grad_norm": 7.860177993774414,
"learning_rate": 7.733333333333334e-06,
"loss": 0.1266,
"step": 6210
},
{
"epoch": 3.071604938271605,
"grad_norm": 125.65026092529297,
"learning_rate": 7.71358024691358e-06,
"loss": 0.1102,
"step": 6220
},
{
"epoch": 3.0765432098765433,
"grad_norm": 64.10157012939453,
"learning_rate": 7.693827160493827e-06,
"loss": 0.2813,
"step": 6230
},
{
"epoch": 3.0814814814814815,
"grad_norm": 0.023331521078944206,
"learning_rate": 7.674074074074075e-06,
"loss": 0.4718,
"step": 6240
},
{
"epoch": 3.0864197530864197,
"grad_norm": 0.9373367428779602,
"learning_rate": 7.654320987654322e-06,
"loss": 0.1784,
"step": 6250
},
{
"epoch": 3.091358024691358,
"grad_norm": 0.09618625789880753,
"learning_rate": 7.634567901234568e-06,
"loss": 0.1675,
"step": 6260
},
{
"epoch": 3.096296296296296,
"grad_norm": 53.146034240722656,
"learning_rate": 7.614814814814816e-06,
"loss": 0.3012,
"step": 6270
},
{
"epoch": 3.1012345679012348,
"grad_norm": 0.9176463484764099,
"learning_rate": 7.5950617283950625e-06,
"loss": 0.0438,
"step": 6280
},
{
"epoch": 3.106172839506173,
"grad_norm": 0.6210525035858154,
"learning_rate": 7.57530864197531e-06,
"loss": 0.2127,
"step": 6290
},
{
"epoch": 3.111111111111111,
"grad_norm": 171.12738037109375,
"learning_rate": 7.555555555555556e-06,
"loss": 0.3021,
"step": 6300
},
{
"epoch": 3.1160493827160494,
"grad_norm": 0.15432004630565643,
"learning_rate": 7.535802469135803e-06,
"loss": 0.2258,
"step": 6310
},
{
"epoch": 3.1209876543209876,
"grad_norm": 6.785965919494629,
"learning_rate": 7.51604938271605e-06,
"loss": 0.0524,
"step": 6320
},
{
"epoch": 3.1259259259259258,
"grad_norm": 14.042142868041992,
"learning_rate": 7.496296296296297e-06,
"loss": 0.1315,
"step": 6330
},
{
"epoch": 3.1308641975308644,
"grad_norm": 0.005698219407349825,
"learning_rate": 7.476543209876543e-06,
"loss": 0.59,
"step": 6340
},
{
"epoch": 3.1358024691358026,
"grad_norm": 0.2984008193016052,
"learning_rate": 7.456790123456791e-06,
"loss": 0.1643,
"step": 6350
},
{
"epoch": 3.140740740740741,
"grad_norm": 33.20651626586914,
"learning_rate": 7.437037037037038e-06,
"loss": 0.0757,
"step": 6360
},
{
"epoch": 3.145679012345679,
"grad_norm": 39.41627883911133,
"learning_rate": 7.417283950617284e-06,
"loss": 0.2282,
"step": 6370
},
{
"epoch": 3.1506172839506172,
"grad_norm": 0.06810309737920761,
"learning_rate": 7.3975308641975315e-06,
"loss": 0.0338,
"step": 6380
},
{
"epoch": 3.1555555555555554,
"grad_norm": 0.4489476680755615,
"learning_rate": 7.377777777777778e-06,
"loss": 0.258,
"step": 6390
},
{
"epoch": 3.1604938271604937,
"grad_norm": 3.387746572494507,
"learning_rate": 7.358024691358025e-06,
"loss": 0.0842,
"step": 6400
},
{
"epoch": 3.1654320987654323,
"grad_norm": 2.4589788913726807,
"learning_rate": 7.3382716049382715e-06,
"loss": 0.1025,
"step": 6410
},
{
"epoch": 3.1703703703703705,
"grad_norm": 11.912010192871094,
"learning_rate": 7.31851851851852e-06,
"loss": 0.1159,
"step": 6420
},
{
"epoch": 3.1753086419753087,
"grad_norm": 0.0014852778986096382,
"learning_rate": 7.298765432098765e-06,
"loss": 0.1174,
"step": 6430
},
{
"epoch": 3.180246913580247,
"grad_norm": 0.23326246440410614,
"learning_rate": 7.279012345679013e-06,
"loss": 0.1595,
"step": 6440
},
{
"epoch": 3.185185185185185,
"grad_norm": 0.023275885730981827,
"learning_rate": 7.2592592592592605e-06,
"loss": 0.3177,
"step": 6450
},
{
"epoch": 3.1901234567901233,
"grad_norm": 0.0346212200820446,
"learning_rate": 7.239506172839507e-06,
"loss": 0.1329,
"step": 6460
},
{
"epoch": 3.1950617283950615,
"grad_norm": 0.14802587032318115,
"learning_rate": 7.219753086419754e-06,
"loss": 0.0812,
"step": 6470
},
{
"epoch": 3.2,
"grad_norm": 0.2590476870536804,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.0625,
"step": 6480
},
{
"epoch": 3.2049382716049384,
"grad_norm": 0.7991506457328796,
"learning_rate": 7.180246913580248e-06,
"loss": 0.0811,
"step": 6490
},
{
"epoch": 3.2098765432098766,
"grad_norm": 76.12113189697266,
"learning_rate": 7.160493827160494e-06,
"loss": 0.0727,
"step": 6500
},
{
"epoch": 3.214814814814815,
"grad_norm": 24.764394760131836,
"learning_rate": 7.140740740740741e-06,
"loss": 0.3267,
"step": 6510
},
{
"epoch": 3.219753086419753,
"grad_norm": 59.69222640991211,
"learning_rate": 7.120987654320988e-06,
"loss": 0.1661,
"step": 6520
},
{
"epoch": 3.224691358024691,
"grad_norm": 0.007727318909019232,
"learning_rate": 7.101234567901235e-06,
"loss": 0.1388,
"step": 6530
},
{
"epoch": 3.2296296296296294,
"grad_norm": 1.3282524347305298,
"learning_rate": 7.081481481481482e-06,
"loss": 0.0129,
"step": 6540
},
{
"epoch": 3.234567901234568,
"grad_norm": 58.830318450927734,
"learning_rate": 7.061728395061729e-06,
"loss": 0.075,
"step": 6550
},
{
"epoch": 3.2395061728395063,
"grad_norm": 0.0027803820557892323,
"learning_rate": 7.041975308641976e-06,
"loss": 0.0688,
"step": 6560
},
{
"epoch": 3.2444444444444445,
"grad_norm": 7.03369140625,
"learning_rate": 7.022222222222222e-06,
"loss": 0.2156,
"step": 6570
},
{
"epoch": 3.2493827160493827,
"grad_norm": 0.3327115476131439,
"learning_rate": 7.0024691358024695e-06,
"loss": 0.2499,
"step": 6580
},
{
"epoch": 3.254320987654321,
"grad_norm": 0.007271229289472103,
"learning_rate": 6.982716049382716e-06,
"loss": 0.1749,
"step": 6590
},
{
"epoch": 3.259259259259259,
"grad_norm": 0.011601006612181664,
"learning_rate": 6.962962962962964e-06,
"loss": 0.4342,
"step": 6600
},
{
"epoch": 3.2641975308641973,
"grad_norm": 1.5765591859817505,
"learning_rate": 6.943209876543211e-06,
"loss": 0.02,
"step": 6610
},
{
"epoch": 3.269135802469136,
"grad_norm": 10.005110740661621,
"learning_rate": 6.923456790123458e-06,
"loss": 0.1143,
"step": 6620
},
{
"epoch": 3.274074074074074,
"grad_norm": 0.1242939829826355,
"learning_rate": 6.903703703703705e-06,
"loss": 0.3571,
"step": 6630
},
{
"epoch": 3.2790123456790123,
"grad_norm": 57.85032272338867,
"learning_rate": 6.883950617283951e-06,
"loss": 0.3811,
"step": 6640
},
{
"epoch": 3.2839506172839505,
"grad_norm": 1.068203091621399,
"learning_rate": 6.8641975308641985e-06,
"loss": 0.1045,
"step": 6650
},
{
"epoch": 3.2888888888888888,
"grad_norm": 0.03020775318145752,
"learning_rate": 6.844444444444445e-06,
"loss": 0.0945,
"step": 6660
},
{
"epoch": 3.2938271604938274,
"grad_norm": 18.36736297607422,
"learning_rate": 6.824691358024692e-06,
"loss": 0.2015,
"step": 6670
},
{
"epoch": 3.2987654320987656,
"grad_norm": 0.0009854953968897462,
"learning_rate": 6.8049382716049385e-06,
"loss": 0.2278,
"step": 6680
},
{
"epoch": 3.303703703703704,
"grad_norm": 0.02513027749955654,
"learning_rate": 6.785185185185186e-06,
"loss": 0.2392,
"step": 6690
},
{
"epoch": 3.308641975308642,
"grad_norm": 29.72653579711914,
"learning_rate": 6.765432098765433e-06,
"loss": 0.2054,
"step": 6700
},
{
"epoch": 3.31358024691358,
"grad_norm": 0.006469042040407658,
"learning_rate": 6.745679012345679e-06,
"loss": 0.0055,
"step": 6710
},
{
"epoch": 3.3185185185185184,
"grad_norm": 129.7929229736328,
"learning_rate": 6.725925925925927e-06,
"loss": 0.0842,
"step": 6720
},
{
"epoch": 3.3234567901234566,
"grad_norm": 0.4482802748680115,
"learning_rate": 6.706172839506173e-06,
"loss": 0.1121,
"step": 6730
},
{
"epoch": 3.3283950617283953,
"grad_norm": 10.919482231140137,
"learning_rate": 6.68641975308642e-06,
"loss": 0.2323,
"step": 6740
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.10504257678985596,
"learning_rate": 6.666666666666667e-06,
"loss": 0.1925,
"step": 6750
},
{
"epoch": 3.3382716049382717,
"grad_norm": 26.70441436767578,
"learning_rate": 6.646913580246914e-06,
"loss": 0.3749,
"step": 6760
},
{
"epoch": 3.34320987654321,
"grad_norm": 1.2347007989883423,
"learning_rate": 6.62716049382716e-06,
"loss": 0.1701,
"step": 6770
},
{
"epoch": 3.348148148148148,
"grad_norm": 6.345317840576172,
"learning_rate": 6.6074074074074075e-06,
"loss": 0.0607,
"step": 6780
},
{
"epoch": 3.3530864197530863,
"grad_norm": 13.622949600219727,
"learning_rate": 6.587654320987656e-06,
"loss": 0.1763,
"step": 6790
},
{
"epoch": 3.3580246913580245,
"grad_norm": 16.68195152282715,
"learning_rate": 6.567901234567902e-06,
"loss": 0.2754,
"step": 6800
},
{
"epoch": 3.362962962962963,
"grad_norm": 0.2912677526473999,
"learning_rate": 6.548148148148149e-06,
"loss": 0.2011,
"step": 6810
},
{
"epoch": 3.3679012345679014,
"grad_norm": 76.45751953125,
"learning_rate": 6.528395061728396e-06,
"loss": 0.3157,
"step": 6820
},
{
"epoch": 3.3728395061728396,
"grad_norm": 0.0012998235179111362,
"learning_rate": 6.508641975308643e-06,
"loss": 0.1313,
"step": 6830
},
{
"epoch": 3.3777777777777778,
"grad_norm": 170.02474975585938,
"learning_rate": 6.488888888888889e-06,
"loss": 0.1319,
"step": 6840
},
{
"epoch": 3.382716049382716,
"grad_norm": 87.3119888305664,
"learning_rate": 6.4691358024691365e-06,
"loss": 0.2838,
"step": 6850
},
{
"epoch": 3.387654320987654,
"grad_norm": 25.350370407104492,
"learning_rate": 6.449382716049383e-06,
"loss": 0.1525,
"step": 6860
},
{
"epoch": 3.3925925925925924,
"grad_norm": 0.22812433540821075,
"learning_rate": 6.42962962962963e-06,
"loss": 0.0099,
"step": 6870
},
{
"epoch": 3.397530864197531,
"grad_norm": 0.06566119194030762,
"learning_rate": 6.409876543209877e-06,
"loss": 0.0049,
"step": 6880
},
{
"epoch": 3.4024691358024692,
"grad_norm": 0.003955530468374491,
"learning_rate": 6.390123456790124e-06,
"loss": 0.3611,
"step": 6890
},
{
"epoch": 3.4074074074074074,
"grad_norm": 46.40278244018555,
"learning_rate": 6.370370370370371e-06,
"loss": 0.2929,
"step": 6900
},
{
"epoch": 3.4123456790123456,
"grad_norm": 0.0017953283386304975,
"learning_rate": 6.350617283950617e-06,
"loss": 0.0162,
"step": 6910
},
{
"epoch": 3.417283950617284,
"grad_norm": 0.001457493519410491,
"learning_rate": 6.330864197530865e-06,
"loss": 0.1854,
"step": 6920
},
{
"epoch": 3.422222222222222,
"grad_norm": 0.0005978038534522057,
"learning_rate": 6.311111111111111e-06,
"loss": 0.2118,
"step": 6930
},
{
"epoch": 3.4271604938271603,
"grad_norm": 3.947251558303833,
"learning_rate": 6.291358024691358e-06,
"loss": 0.0656,
"step": 6940
},
{
"epoch": 3.432098765432099,
"grad_norm": 13.78681755065918,
"learning_rate": 6.271604938271606e-06,
"loss": 0.0259,
"step": 6950
},
{
"epoch": 3.437037037037037,
"grad_norm": 0.04035714268684387,
"learning_rate": 6.251851851851852e-06,
"loss": 0.0107,
"step": 6960
},
{
"epoch": 3.4419753086419753,
"grad_norm": 0.024245211854577065,
"learning_rate": 6.2320987654321e-06,
"loss": 0.1175,
"step": 6970
},
{
"epoch": 3.4469135802469135,
"grad_norm": 0.04458506777882576,
"learning_rate": 6.212345679012346e-06,
"loss": 0.2044,
"step": 6980
},
{
"epoch": 3.4518518518518517,
"grad_norm": 161.80392456054688,
"learning_rate": 6.192592592592594e-06,
"loss": 0.2394,
"step": 6990
},
{
"epoch": 3.45679012345679,
"grad_norm": 0.04583211988210678,
"learning_rate": 6.17283950617284e-06,
"loss": 0.0821,
"step": 7000
},
{
"epoch": 3.4617283950617286,
"grad_norm": 0.14376536011695862,
"learning_rate": 6.153086419753087e-06,
"loss": 0.3085,
"step": 7010
},
{
"epoch": 3.466666666666667,
"grad_norm": 92.59646606445312,
"learning_rate": 6.133333333333334e-06,
"loss": 0.2538,
"step": 7020
},
{
"epoch": 3.471604938271605,
"grad_norm": 83.26078033447266,
"learning_rate": 6.113580246913581e-06,
"loss": 0.3145,
"step": 7030
},
{
"epoch": 3.476543209876543,
"grad_norm": 74.77570343017578,
"learning_rate": 6.093827160493828e-06,
"loss": 0.1775,
"step": 7040
},
{
"epoch": 3.4814814814814814,
"grad_norm": 0.038955166935920715,
"learning_rate": 6.0740740740740745e-06,
"loss": 0.1509,
"step": 7050
},
{
"epoch": 3.4864197530864196,
"grad_norm": 97.1812973022461,
"learning_rate": 6.054320987654322e-06,
"loss": 0.2155,
"step": 7060
},
{
"epoch": 3.4913580246913583,
"grad_norm": 73.86189270019531,
"learning_rate": 6.034567901234568e-06,
"loss": 0.2615,
"step": 7070
},
{
"epoch": 3.4962962962962965,
"grad_norm": 0.0055229514837265015,
"learning_rate": 6.014814814814815e-06,
"loss": 0.2428,
"step": 7080
},
{
"epoch": 3.5012345679012347,
"grad_norm": 0.0022700978443026543,
"learning_rate": 5.995061728395062e-06,
"loss": 0.2049,
"step": 7090
},
{
"epoch": 3.506172839506173,
"grad_norm": 1.260072946548462,
"learning_rate": 5.975308641975309e-06,
"loss": 0.181,
"step": 7100
},
{
"epoch": 3.511111111111111,
"grad_norm": 1.283315896987915,
"learning_rate": 5.955555555555555e-06,
"loss": 0.0807,
"step": 7110
},
{
"epoch": 3.5160493827160493,
"grad_norm": 82.1073989868164,
"learning_rate": 5.935802469135803e-06,
"loss": 0.1029,
"step": 7120
},
{
"epoch": 3.5209876543209875,
"grad_norm": 8.620868682861328,
"learning_rate": 5.916049382716051e-06,
"loss": 0.2403,
"step": 7130
},
{
"epoch": 3.525925925925926,
"grad_norm": 6.648277282714844,
"learning_rate": 5.896296296296296e-06,
"loss": 0.0663,
"step": 7140
},
{
"epoch": 3.5308641975308643,
"grad_norm": 0.3625084459781647,
"learning_rate": 5.876543209876544e-06,
"loss": 0.1895,
"step": 7150
},
{
"epoch": 3.5358024691358025,
"grad_norm": 25.613967895507812,
"learning_rate": 5.856790123456791e-06,
"loss": 0.0466,
"step": 7160
},
{
"epoch": 3.5407407407407407,
"grad_norm": 0.6308773756027222,
"learning_rate": 5.837037037037038e-06,
"loss": 0.0887,
"step": 7170
},
{
"epoch": 3.545679012345679,
"grad_norm": 28.219980239868164,
"learning_rate": 5.817283950617284e-06,
"loss": 0.071,
"step": 7180
},
{
"epoch": 3.550617283950617,
"grad_norm": 42.56242752075195,
"learning_rate": 5.797530864197532e-06,
"loss": 0.345,
"step": 7190
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.07085005193948746,
"learning_rate": 5.777777777777778e-06,
"loss": 0.3513,
"step": 7200
},
{
"epoch": 3.560493827160494,
"grad_norm": 0.4435485005378723,
"learning_rate": 5.758024691358025e-06,
"loss": 0.0908,
"step": 7210
},
{
"epoch": 3.565432098765432,
"grad_norm": 0.009900487028062344,
"learning_rate": 5.7382716049382725e-06,
"loss": 0.1456,
"step": 7220
},
{
"epoch": 3.5703703703703704,
"grad_norm": 0.001979109598323703,
"learning_rate": 5.718518518518519e-06,
"loss": 0.0493,
"step": 7230
},
{
"epoch": 3.5753086419753086,
"grad_norm": 0.20845463871955872,
"learning_rate": 5.698765432098766e-06,
"loss": 0.264,
"step": 7240
},
{
"epoch": 3.580246913580247,
"grad_norm": 0.7934794425964355,
"learning_rate": 5.6790123456790125e-06,
"loss": 0.0509,
"step": 7250
},
{
"epoch": 3.585185185185185,
"grad_norm": 0.045501917600631714,
"learning_rate": 5.65925925925926e-06,
"loss": 0.0933,
"step": 7260
},
{
"epoch": 3.5901234567901232,
"grad_norm": 0.040048014372587204,
"learning_rate": 5.639506172839506e-06,
"loss": 0.0733,
"step": 7270
},
{
"epoch": 3.595061728395062,
"grad_norm": 197.66177368164062,
"learning_rate": 5.619753086419753e-06,
"loss": 0.2655,
"step": 7280
},
{
"epoch": 3.6,
"grad_norm": 0.03324214369058609,
"learning_rate": 5.600000000000001e-06,
"loss": 0.0812,
"step": 7290
},
{
"epoch": 3.6049382716049383,
"grad_norm": 124.81009674072266,
"learning_rate": 5.580246913580247e-06,
"loss": 0.1874,
"step": 7300
},
{
"epoch": 3.6098765432098765,
"grad_norm": 14.227179527282715,
"learning_rate": 5.560493827160495e-06,
"loss": 0.1483,
"step": 7310
},
{
"epoch": 3.6148148148148147,
"grad_norm": 28.93998146057129,
"learning_rate": 5.540740740740741e-06,
"loss": 0.2179,
"step": 7320
},
{
"epoch": 3.6197530864197534,
"grad_norm": 109.27143096923828,
"learning_rate": 5.520987654320989e-06,
"loss": 0.2175,
"step": 7330
},
{
"epoch": 3.624691358024691,
"grad_norm": 3.306696653366089,
"learning_rate": 5.501234567901234e-06,
"loss": 0.1275,
"step": 7340
},
{
"epoch": 3.6296296296296298,
"grad_norm": 53.0710563659668,
"learning_rate": 5.481481481481482e-06,
"loss": 0.2602,
"step": 7350
},
{
"epoch": 3.634567901234568,
"grad_norm": 0.00018215861928183585,
"learning_rate": 5.461728395061729e-06,
"loss": 0.1973,
"step": 7360
},
{
"epoch": 3.639506172839506,
"grad_norm": 14.688875198364258,
"learning_rate": 5.441975308641976e-06,
"loss": 0.1937,
"step": 7370
},
{
"epoch": 3.6444444444444444,
"grad_norm": 121.82637023925781,
"learning_rate": 5.422222222222223e-06,
"loss": 0.1325,
"step": 7380
},
{
"epoch": 3.6493827160493826,
"grad_norm": 0.004047624301165342,
"learning_rate": 5.40246913580247e-06,
"loss": 0.1085,
"step": 7390
},
{
"epoch": 3.6543209876543212,
"grad_norm": 108.3661880493164,
"learning_rate": 5.382716049382717e-06,
"loss": 0.2458,
"step": 7400
},
{
"epoch": 3.659259259259259,
"grad_norm": 0.029978841543197632,
"learning_rate": 5.362962962962963e-06,
"loss": 0.2308,
"step": 7410
},
{
"epoch": 3.6641975308641976,
"grad_norm": 32.663150787353516,
"learning_rate": 5.3432098765432105e-06,
"loss": 0.192,
"step": 7420
},
{
"epoch": 3.669135802469136,
"grad_norm": 0.000704328587744385,
"learning_rate": 5.323456790123457e-06,
"loss": 0.2431,
"step": 7430
},
{
"epoch": 3.674074074074074,
"grad_norm": 81.13653564453125,
"learning_rate": 5.303703703703704e-06,
"loss": 0.1404,
"step": 7440
},
{
"epoch": 3.6790123456790123,
"grad_norm": 0.0007958766655065119,
"learning_rate": 5.2839506172839505e-06,
"loss": 0.0767,
"step": 7450
},
{
"epoch": 3.6839506172839505,
"grad_norm": 112.87112426757812,
"learning_rate": 5.264197530864198e-06,
"loss": 0.1402,
"step": 7460
},
{
"epoch": 3.688888888888889,
"grad_norm": 41.893638610839844,
"learning_rate": 5.244444444444445e-06,
"loss": 0.1472,
"step": 7470
},
{
"epoch": 3.6938271604938273,
"grad_norm": 3.585242748260498,
"learning_rate": 5.224691358024691e-06,
"loss": 0.0414,
"step": 7480
},
{
"epoch": 3.6987654320987655,
"grad_norm": 69.6523208618164,
"learning_rate": 5.2049382716049394e-06,
"loss": 0.1479,
"step": 7490
},
{
"epoch": 3.7037037037037037,
"grad_norm": 0.9416589736938477,
"learning_rate": 5.185185185185185e-06,
"loss": 0.1811,
"step": 7500
},
{
"epoch": 3.708641975308642,
"grad_norm": 193.36740112304688,
"learning_rate": 5.165432098765433e-06,
"loss": 0.1734,
"step": 7510
},
{
"epoch": 3.71358024691358,
"grad_norm": 83.2663803100586,
"learning_rate": 5.145679012345679e-06,
"loss": 0.3664,
"step": 7520
},
{
"epoch": 3.7185185185185183,
"grad_norm": 1.504310131072998,
"learning_rate": 5.125925925925927e-06,
"loss": 0.3391,
"step": 7530
},
{
"epoch": 3.723456790123457,
"grad_norm": 63.3848876953125,
"learning_rate": 5.106172839506173e-06,
"loss": 0.2681,
"step": 7540
},
{
"epoch": 3.728395061728395,
"grad_norm": 0.005675642751157284,
"learning_rate": 5.08641975308642e-06,
"loss": 0.193,
"step": 7550
},
{
"epoch": 3.7333333333333334,
"grad_norm": 0.013251741416752338,
"learning_rate": 5.0666666666666676e-06,
"loss": 0.1873,
"step": 7560
},
{
"epoch": 3.7382716049382716,
"grad_norm": 0.0012360225664451718,
"learning_rate": 5.046913580246914e-06,
"loss": 0.2134,
"step": 7570
},
{
"epoch": 3.74320987654321,
"grad_norm": 127.34367370605469,
"learning_rate": 5.027160493827161e-06,
"loss": 0.4686,
"step": 7580
},
{
"epoch": 3.748148148148148,
"grad_norm": 0.01218173187226057,
"learning_rate": 5.007407407407408e-06,
"loss": 0.0103,
"step": 7590
},
{
"epoch": 3.753086419753086,
"grad_norm": 0.03588619455695152,
"learning_rate": 4.987654320987655e-06,
"loss": 0.0478,
"step": 7600
},
{
"epoch": 3.758024691358025,
"grad_norm": 126.76322937011719,
"learning_rate": 4.967901234567902e-06,
"loss": 0.1531,
"step": 7610
},
{
"epoch": 3.762962962962963,
"grad_norm": 39.57160949707031,
"learning_rate": 4.9481481481481485e-06,
"loss": 0.0445,
"step": 7620
},
{
"epoch": 3.7679012345679013,
"grad_norm": 0.4843272566795349,
"learning_rate": 4.928395061728396e-06,
"loss": 0.0298,
"step": 7630
},
{
"epoch": 3.7728395061728395,
"grad_norm": 33.181583404541016,
"learning_rate": 4.908641975308642e-06,
"loss": 0.3563,
"step": 7640
},
{
"epoch": 3.7777777777777777,
"grad_norm": 27.694658279418945,
"learning_rate": 4.888888888888889e-06,
"loss": 0.142,
"step": 7650
},
{
"epoch": 3.782716049382716,
"grad_norm": 0.008271468803286552,
"learning_rate": 4.869135802469136e-06,
"loss": 0.1445,
"step": 7660
},
{
"epoch": 3.787654320987654,
"grad_norm": 180.6202850341797,
"learning_rate": 4.849382716049383e-06,
"loss": 0.3204,
"step": 7670
},
{
"epoch": 3.7925925925925927,
"grad_norm": 58.78599548339844,
"learning_rate": 4.82962962962963e-06,
"loss": 0.2717,
"step": 7680
},
{
"epoch": 3.797530864197531,
"grad_norm": 48.85298538208008,
"learning_rate": 4.8098765432098774e-06,
"loss": 0.3752,
"step": 7690
},
{
"epoch": 3.802469135802469,
"grad_norm": 119.5743637084961,
"learning_rate": 4.790123456790124e-06,
"loss": 0.266,
"step": 7700
},
{
"epoch": 3.8074074074074074,
"grad_norm": 38.25589370727539,
"learning_rate": 4.770370370370371e-06,
"loss": 0.1581,
"step": 7710
},
{
"epoch": 3.8123456790123456,
"grad_norm": 4.294593811035156,
"learning_rate": 4.7506172839506175e-06,
"loss": 0.0615,
"step": 7720
},
{
"epoch": 3.817283950617284,
"grad_norm": 0.23868466913700104,
"learning_rate": 4.730864197530865e-06,
"loss": 0.2377,
"step": 7730
},
{
"epoch": 3.822222222222222,
"grad_norm": 1.3772286176681519,
"learning_rate": 4.711111111111111e-06,
"loss": 0.1767,
"step": 7740
},
{
"epoch": 3.8271604938271606,
"grad_norm": 0.004857083782553673,
"learning_rate": 4.691358024691358e-06,
"loss": 0.081,
"step": 7750
},
{
"epoch": 3.832098765432099,
"grad_norm": 62.059326171875,
"learning_rate": 4.6716049382716056e-06,
"loss": 0.1362,
"step": 7760
},
{
"epoch": 3.837037037037037,
"grad_norm": 0.022881271317601204,
"learning_rate": 4.651851851851853e-06,
"loss": 0.0713,
"step": 7770
},
{
"epoch": 3.8419753086419752,
"grad_norm": 39.1450309753418,
"learning_rate": 4.632098765432099e-06,
"loss": 0.0745,
"step": 7780
},
{
"epoch": 3.8469135802469134,
"grad_norm": 4.154773712158203,
"learning_rate": 4.6123456790123464e-06,
"loss": 0.1029,
"step": 7790
},
{
"epoch": 3.851851851851852,
"grad_norm": 68.09147644042969,
"learning_rate": 4.592592592592593e-06,
"loss": 0.0558,
"step": 7800
},
{
"epoch": 3.8567901234567903,
"grad_norm": 0.14514310657978058,
"learning_rate": 4.57283950617284e-06,
"loss": 0.0501,
"step": 7810
},
{
"epoch": 3.8617283950617285,
"grad_norm": 1.0181536674499512,
"learning_rate": 4.5530864197530865e-06,
"loss": 0.0579,
"step": 7820
},
{
"epoch": 3.8666666666666667,
"grad_norm": 141.15499877929688,
"learning_rate": 4.533333333333334e-06,
"loss": 0.2657,
"step": 7830
},
{
"epoch": 3.871604938271605,
"grad_norm": 0.6955594420433044,
"learning_rate": 4.513580246913581e-06,
"loss": 0.2284,
"step": 7840
},
{
"epoch": 3.876543209876543,
"grad_norm": 125.45293426513672,
"learning_rate": 4.493827160493827e-06,
"loss": 0.446,
"step": 7850
},
{
"epoch": 3.8814814814814813,
"grad_norm": 0.0857425257563591,
"learning_rate": 4.4740740740740746e-06,
"loss": 0.0597,
"step": 7860
},
{
"epoch": 3.88641975308642,
"grad_norm": 44.19774627685547,
"learning_rate": 4.454320987654322e-06,
"loss": 0.2066,
"step": 7870
},
{
"epoch": 3.891358024691358,
"grad_norm": 61.00041580200195,
"learning_rate": 4.434567901234568e-06,
"loss": 0.1439,
"step": 7880
},
{
"epoch": 3.8962962962962964,
"grad_norm": 0.8123835325241089,
"learning_rate": 4.4148148148148154e-06,
"loss": 0.2655,
"step": 7890
},
{
"epoch": 3.9012345679012346,
"grad_norm": 0.0009880654979497194,
"learning_rate": 4.395061728395062e-06,
"loss": 0.1153,
"step": 7900
},
{
"epoch": 3.906172839506173,
"grad_norm": 0.0027614731807261705,
"learning_rate": 4.375308641975309e-06,
"loss": 0.0693,
"step": 7910
},
{
"epoch": 3.911111111111111,
"grad_norm": 99.65026092529297,
"learning_rate": 4.3555555555555555e-06,
"loss": 0.3998,
"step": 7920
},
{
"epoch": 3.916049382716049,
"grad_norm": 17.23603057861328,
"learning_rate": 4.335802469135803e-06,
"loss": 0.2811,
"step": 7930
},
{
"epoch": 3.920987654320988,
"grad_norm": 13.379603385925293,
"learning_rate": 4.31604938271605e-06,
"loss": 0.0989,
"step": 7940
},
{
"epoch": 3.925925925925926,
"grad_norm": 0.12741827964782715,
"learning_rate": 4.296296296296296e-06,
"loss": 0.0431,
"step": 7950
},
{
"epoch": 3.9308641975308642,
"grad_norm": 164.3784637451172,
"learning_rate": 4.2765432098765436e-06,
"loss": 0.3376,
"step": 7960
},
{
"epoch": 3.9358024691358025,
"grad_norm": 0.002450704574584961,
"learning_rate": 4.256790123456791e-06,
"loss": 0.039,
"step": 7970
},
{
"epoch": 3.9407407407407407,
"grad_norm": 9.37784194946289,
"learning_rate": 4.237037037037037e-06,
"loss": 0.3296,
"step": 7980
},
{
"epoch": 3.945679012345679,
"grad_norm": 0.9755693078041077,
"learning_rate": 4.2172839506172844e-06,
"loss": 0.2798,
"step": 7990
},
{
"epoch": 3.950617283950617,
"grad_norm": 17.373695373535156,
"learning_rate": 4.197530864197531e-06,
"loss": 0.044,
"step": 8000
},
{
"epoch": 3.9555555555555557,
"grad_norm": 40.896148681640625,
"learning_rate": 4.177777777777778e-06,
"loss": 0.1641,
"step": 8010
},
{
"epoch": 3.960493827160494,
"grad_norm": 7.210272312164307,
"learning_rate": 4.158024691358025e-06,
"loss": 0.0641,
"step": 8020
},
{
"epoch": 3.965432098765432,
"grad_norm": 0.3746698498725891,
"learning_rate": 4.138271604938272e-06,
"loss": 0.1236,
"step": 8030
},
{
"epoch": 3.9703703703703703,
"grad_norm": 2.9503226280212402,
"learning_rate": 4.118518518518519e-06,
"loss": 0.0634,
"step": 8040
},
{
"epoch": 3.9753086419753085,
"grad_norm": 1.2919955253601074,
"learning_rate": 4.098765432098766e-06,
"loss": 0.0069,
"step": 8050
},
{
"epoch": 3.980246913580247,
"grad_norm": 0.08173320442438126,
"learning_rate": 4.0790123456790126e-06,
"loss": 0.1177,
"step": 8060
},
{
"epoch": 3.985185185185185,
"grad_norm": 0.10468322783708572,
"learning_rate": 4.05925925925926e-06,
"loss": 0.0602,
"step": 8070
},
{
"epoch": 3.9901234567901236,
"grad_norm": 0.1967976987361908,
"learning_rate": 4.039506172839506e-06,
"loss": 0.1996,
"step": 8080
},
{
"epoch": 3.995061728395062,
"grad_norm": 16.828914642333984,
"learning_rate": 4.0197530864197534e-06,
"loss": 0.0063,
"step": 8090
},
{
"epoch": 4.0,
"grad_norm": 142.36537170410156,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2674,
"step": 8100
},
{
"epoch": 4.0,
"eval_accuracy": 0.9833333333333333,
"eval_loss": 0.06853805482387543,
"eval_runtime": 32.7103,
"eval_samples_per_second": 165.086,
"eval_steps_per_second": 20.636,
"step": 8100
},
{
"epoch": 4.004938271604939,
"grad_norm": 0.028582552447915077,
"learning_rate": 3.980246913580247e-06,
"loss": 0.3409,
"step": 8110
},
{
"epoch": 4.009876543209876,
"grad_norm": 0.12553012371063232,
"learning_rate": 3.960493827160494e-06,
"loss": 0.1076,
"step": 8120
},
{
"epoch": 4.014814814814815,
"grad_norm": 0.08727646619081497,
"learning_rate": 3.940740740740741e-06,
"loss": 0.2658,
"step": 8130
},
{
"epoch": 4.019753086419753,
"grad_norm": 40.70219802856445,
"learning_rate": 3.920987654320988e-06,
"loss": 0.1109,
"step": 8140
},
{
"epoch": 4.0246913580246915,
"grad_norm": 0.04967527464032173,
"learning_rate": 3.901234567901235e-06,
"loss": 0.2816,
"step": 8150
},
{
"epoch": 4.029629629629629,
"grad_norm": 4.632954120635986,
"learning_rate": 3.8814814814814816e-06,
"loss": 0.0101,
"step": 8160
},
{
"epoch": 4.034567901234568,
"grad_norm": 11.988831520080566,
"learning_rate": 3.861728395061729e-06,
"loss": 0.1071,
"step": 8170
},
{
"epoch": 4.0395061728395065,
"grad_norm": 0.002083718776702881,
"learning_rate": 3.841975308641976e-06,
"loss": 0.3421,
"step": 8180
},
{
"epoch": 4.044444444444444,
"grad_norm": 7.259564399719238,
"learning_rate": 3.8222222222222224e-06,
"loss": 0.0545,
"step": 8190
},
{
"epoch": 4.049382716049383,
"grad_norm": 0.12477586418390274,
"learning_rate": 3.8024691358024697e-06,
"loss": 0.056,
"step": 8200
},
{
"epoch": 4.054320987654321,
"grad_norm": 131.77743530273438,
"learning_rate": 3.7827160493827165e-06,
"loss": 0.1617,
"step": 8210
},
{
"epoch": 4.059259259259259,
"grad_norm": 0.1798364818096161,
"learning_rate": 3.7629629629629633e-06,
"loss": 0.0063,
"step": 8220
},
{
"epoch": 4.064197530864197,
"grad_norm": 111.68184661865234,
"learning_rate": 3.74320987654321e-06,
"loss": 0.071,
"step": 8230
},
{
"epoch": 4.069135802469136,
"grad_norm": 75.00855255126953,
"learning_rate": 3.723456790123457e-06,
"loss": 0.4207,
"step": 8240
},
{
"epoch": 4.074074074074074,
"grad_norm": 0.0791148990392685,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.0377,
"step": 8250
},
{
"epoch": 4.079012345679012,
"grad_norm": 123.85789489746094,
"learning_rate": 3.6839506172839506e-06,
"loss": 0.282,
"step": 8260
},
{
"epoch": 4.083950617283951,
"grad_norm": 0.0917818695306778,
"learning_rate": 3.6641975308641982e-06,
"loss": 0.2107,
"step": 8270
},
{
"epoch": 4.088888888888889,
"grad_norm": 93.7401123046875,
"learning_rate": 3.644444444444445e-06,
"loss": 0.4766,
"step": 8280
},
{
"epoch": 4.093827160493827,
"grad_norm": 4.973775863647461,
"learning_rate": 3.624691358024692e-06,
"loss": 0.1966,
"step": 8290
},
{
"epoch": 4.098765432098766,
"grad_norm": 13.099119186401367,
"learning_rate": 3.6049382716049387e-06,
"loss": 0.0284,
"step": 8300
},
{
"epoch": 4.103703703703704,
"grad_norm": 0.14128296077251434,
"learning_rate": 3.5851851851851855e-06,
"loss": 0.3451,
"step": 8310
},
{
"epoch": 4.108641975308642,
"grad_norm": 19.09874153137207,
"learning_rate": 3.5654320987654323e-06,
"loss": 0.3137,
"step": 8320
},
{
"epoch": 4.11358024691358,
"grad_norm": 33.85554504394531,
"learning_rate": 3.545679012345679e-06,
"loss": 0.1776,
"step": 8330
},
{
"epoch": 4.118518518518519,
"grad_norm": 0.02345465123653412,
"learning_rate": 3.525925925925926e-06,
"loss": 0.2006,
"step": 8340
},
{
"epoch": 4.1234567901234565,
"grad_norm": 90.08519744873047,
"learning_rate": 3.5061728395061736e-06,
"loss": 0.2977,
"step": 8350
},
{
"epoch": 4.128395061728395,
"grad_norm": 41.20042037963867,
"learning_rate": 3.4864197530864204e-06,
"loss": 0.2238,
"step": 8360
},
{
"epoch": 4.133333333333334,
"grad_norm": 1.0883228778839111,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.0693,
"step": 8370
},
{
"epoch": 4.1382716049382715,
"grad_norm": 0.03349454700946808,
"learning_rate": 3.446913580246914e-06,
"loss": 0.1569,
"step": 8380
},
{
"epoch": 4.14320987654321,
"grad_norm": 18.927202224731445,
"learning_rate": 3.427160493827161e-06,
"loss": 0.2259,
"step": 8390
},
{
"epoch": 4.148148148148148,
"grad_norm": 41.818538665771484,
"learning_rate": 3.4074074074074077e-06,
"loss": 0.2041,
"step": 8400
},
{
"epoch": 4.153086419753087,
"grad_norm": 0.26372233033180237,
"learning_rate": 3.3876543209876545e-06,
"loss": 0.1225,
"step": 8410
},
{
"epoch": 4.158024691358024,
"grad_norm": 45.54108810424805,
"learning_rate": 3.3679012345679013e-06,
"loss": 0.2084,
"step": 8420
},
{
"epoch": 4.162962962962963,
"grad_norm": 0.014255751855671406,
"learning_rate": 3.348148148148148e-06,
"loss": 0.0153,
"step": 8430
},
{
"epoch": 4.167901234567902,
"grad_norm": 0.8963614106178284,
"learning_rate": 3.3283950617283953e-06,
"loss": 0.0802,
"step": 8440
},
{
"epoch": 4.172839506172839,
"grad_norm": 32.044166564941406,
"learning_rate": 3.3086419753086426e-06,
"loss": 0.1971,
"step": 8450
},
{
"epoch": 4.177777777777778,
"grad_norm": 0.006651794072240591,
"learning_rate": 3.2888888888888894e-06,
"loss": 0.0366,
"step": 8460
},
{
"epoch": 4.182716049382716,
"grad_norm": 1.5995298624038696,
"learning_rate": 3.2691358024691362e-06,
"loss": 0.2041,
"step": 8470
},
{
"epoch": 4.187654320987654,
"grad_norm": 0.07189402729272842,
"learning_rate": 3.249382716049383e-06,
"loss": 0.1008,
"step": 8480
},
{
"epoch": 4.192592592592592,
"grad_norm": 0.014369451440870762,
"learning_rate": 3.22962962962963e-06,
"loss": 0.1384,
"step": 8490
},
{
"epoch": 4.197530864197531,
"grad_norm": 2.7586021423339844,
"learning_rate": 3.2098765432098767e-06,
"loss": 0.1149,
"step": 8500
},
{
"epoch": 4.2024691358024695,
"grad_norm": 0.25027868151664734,
"learning_rate": 3.1901234567901235e-06,
"loss": 0.1085,
"step": 8510
},
{
"epoch": 4.207407407407407,
"grad_norm": 21.993419647216797,
"learning_rate": 3.1703703703703707e-06,
"loss": 0.1086,
"step": 8520
},
{
"epoch": 4.212345679012346,
"grad_norm": 108.14185333251953,
"learning_rate": 3.1506172839506175e-06,
"loss": 0.274,
"step": 8530
},
{
"epoch": 4.217283950617284,
"grad_norm": 0.006499402225017548,
"learning_rate": 3.1308641975308648e-06,
"loss": 0.1101,
"step": 8540
},
{
"epoch": 4.222222222222222,
"grad_norm": 25.40144920349121,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.3034,
"step": 8550
},
{
"epoch": 4.22716049382716,
"grad_norm": 0.04093475639820099,
"learning_rate": 3.0913580246913584e-06,
"loss": 0.1373,
"step": 8560
},
{
"epoch": 4.232098765432099,
"grad_norm": 0.3943523168563843,
"learning_rate": 3.0716049382716052e-06,
"loss": 0.1059,
"step": 8570
},
{
"epoch": 4.237037037037037,
"grad_norm": 34.58479309082031,
"learning_rate": 3.051851851851852e-06,
"loss": 0.1032,
"step": 8580
},
{
"epoch": 4.241975308641975,
"grad_norm": 79.955810546875,
"learning_rate": 3.032098765432099e-06,
"loss": 0.1232,
"step": 8590
},
{
"epoch": 4.246913580246914,
"grad_norm": 47.233482360839844,
"learning_rate": 3.012345679012346e-06,
"loss": 0.1098,
"step": 8600
},
{
"epoch": 4.2518518518518515,
"grad_norm": 138.7650909423828,
"learning_rate": 2.992592592592593e-06,
"loss": 0.1554,
"step": 8610
},
{
"epoch": 4.25679012345679,
"grad_norm": 34.47438430786133,
"learning_rate": 2.9728395061728397e-06,
"loss": 0.1909,
"step": 8620
},
{
"epoch": 4.261728395061729,
"grad_norm": 0.10936783254146576,
"learning_rate": 2.953086419753087e-06,
"loss": 0.1279,
"step": 8630
},
{
"epoch": 4.266666666666667,
"grad_norm": 66.3951416015625,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.4878,
"step": 8640
},
{
"epoch": 4.271604938271605,
"grad_norm": 0.7240855097770691,
"learning_rate": 2.9135802469135806e-06,
"loss": 0.171,
"step": 8650
},
{
"epoch": 4.276543209876543,
"grad_norm": 84.10567474365234,
"learning_rate": 2.8938271604938274e-06,
"loss": 0.265,
"step": 8660
},
{
"epoch": 4.281481481481482,
"grad_norm": 0.03191656991839409,
"learning_rate": 2.874074074074074e-06,
"loss": 0.3997,
"step": 8670
},
{
"epoch": 4.286419753086419,
"grad_norm": 0.05699067562818527,
"learning_rate": 2.854320987654321e-06,
"loss": 0.0334,
"step": 8680
},
{
"epoch": 4.291358024691358,
"grad_norm": 0.03787963092327118,
"learning_rate": 2.8345679012345683e-06,
"loss": 0.0026,
"step": 8690
},
{
"epoch": 4.296296296296296,
"grad_norm": 0.32715028524398804,
"learning_rate": 2.814814814814815e-06,
"loss": 0.0851,
"step": 8700
},
{
"epoch": 4.3012345679012345,
"grad_norm": 1.704313039779663,
"learning_rate": 2.795061728395062e-06,
"loss": 0.2827,
"step": 8710
},
{
"epoch": 4.306172839506173,
"grad_norm": 35.010746002197266,
"learning_rate": 2.7753086419753087e-06,
"loss": 0.307,
"step": 8720
},
{
"epoch": 4.311111111111111,
"grad_norm": 50.50590133666992,
"learning_rate": 2.755555555555556e-06,
"loss": 0.1594,
"step": 8730
},
{
"epoch": 4.3160493827160495,
"grad_norm": 31.76420783996582,
"learning_rate": 2.7358024691358028e-06,
"loss": 0.1536,
"step": 8740
},
{
"epoch": 4.320987654320987,
"grad_norm": 0.11124283820390701,
"learning_rate": 2.7160493827160496e-06,
"loss": 0.1278,
"step": 8750
},
{
"epoch": 4.325925925925926,
"grad_norm": 29.00436019897461,
"learning_rate": 2.6962962962962964e-06,
"loss": 0.0417,
"step": 8760
},
{
"epoch": 4.330864197530865,
"grad_norm": 0.002402759389951825,
"learning_rate": 2.6765432098765436e-06,
"loss": 0.077,
"step": 8770
},
{
"epoch": 4.335802469135802,
"grad_norm": 5.55736780166626,
"learning_rate": 2.6567901234567904e-06,
"loss": 0.1247,
"step": 8780
},
{
"epoch": 4.340740740740741,
"grad_norm": 0.024351775646209717,
"learning_rate": 2.6370370370370373e-06,
"loss": 0.1003,
"step": 8790
},
{
"epoch": 4.345679012345679,
"grad_norm": 0.009600900113582611,
"learning_rate": 2.617283950617284e-06,
"loss": 0.1143,
"step": 8800
},
{
"epoch": 4.350617283950617,
"grad_norm": 0.001896082772873342,
"learning_rate": 2.597530864197531e-06,
"loss": 0.0972,
"step": 8810
},
{
"epoch": 4.355555555555555,
"grad_norm": 0.0376252606511116,
"learning_rate": 2.577777777777778e-06,
"loss": 0.1537,
"step": 8820
},
{
"epoch": 4.360493827160494,
"grad_norm": 0.010516272857785225,
"learning_rate": 2.558024691358025e-06,
"loss": 0.0149,
"step": 8830
},
{
"epoch": 4.3654320987654325,
"grad_norm": 30.120134353637695,
"learning_rate": 2.5382716049382718e-06,
"loss": 0.0042,
"step": 8840
},
{
"epoch": 4.37037037037037,
"grad_norm": 0.48482951521873474,
"learning_rate": 2.5185185185185186e-06,
"loss": 0.1258,
"step": 8850
},
{
"epoch": 4.375308641975309,
"grad_norm": 9.926421165466309,
"learning_rate": 2.4987654320987654e-06,
"loss": 0.1866,
"step": 8860
},
{
"epoch": 4.380246913580247,
"grad_norm": 0.024937864392995834,
"learning_rate": 2.4790123456790126e-06,
"loss": 0.0231,
"step": 8870
},
{
"epoch": 4.385185185185185,
"grad_norm": 0.40552499890327454,
"learning_rate": 2.4592592592592594e-06,
"loss": 0.0423,
"step": 8880
},
{
"epoch": 4.390123456790123,
"grad_norm": 1.134421944618225,
"learning_rate": 2.4395061728395063e-06,
"loss": 0.1767,
"step": 8890
},
{
"epoch": 4.395061728395062,
"grad_norm": 0.06691499054431915,
"learning_rate": 2.419753086419753e-06,
"loss": 0.2377,
"step": 8900
},
{
"epoch": 4.4,
"grad_norm": 1.1887983083724976,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.1737,
"step": 8910
},
{
"epoch": 4.404938271604938,
"grad_norm": 1.4004325866699219,
"learning_rate": 2.380246913580247e-06,
"loss": 0.162,
"step": 8920
},
{
"epoch": 4.409876543209877,
"grad_norm": 5.580018520355225,
"learning_rate": 2.360493827160494e-06,
"loss": 0.251,
"step": 8930
},
{
"epoch": 4.4148148148148145,
"grad_norm": 0.007224132306873798,
"learning_rate": 2.3407407407407408e-06,
"loss": 0.1454,
"step": 8940
},
{
"epoch": 4.419753086419753,
"grad_norm": 154.13819885253906,
"learning_rate": 2.3209876543209876e-06,
"loss": 0.3889,
"step": 8950
},
{
"epoch": 4.424691358024692,
"grad_norm": 32.98945236206055,
"learning_rate": 2.301234567901235e-06,
"loss": 0.2466,
"step": 8960
},
{
"epoch": 4.42962962962963,
"grad_norm": 0.0013707876205444336,
"learning_rate": 2.2814814814814816e-06,
"loss": 0.2529,
"step": 8970
},
{
"epoch": 4.434567901234568,
"grad_norm": 80.57937622070312,
"learning_rate": 2.2617283950617284e-06,
"loss": 0.1712,
"step": 8980
},
{
"epoch": 4.439506172839506,
"grad_norm": 129.87698364257812,
"learning_rate": 2.2419753086419753e-06,
"loss": 0.1409,
"step": 8990
},
{
"epoch": 4.444444444444445,
"grad_norm": 61.0521354675293,
"learning_rate": 2.222222222222222e-06,
"loss": 0.1277,
"step": 9000
},
{
"epoch": 4.449382716049382,
"grad_norm": 0.05561920627951622,
"learning_rate": 2.2024691358024693e-06,
"loss": 0.1921,
"step": 9010
},
{
"epoch": 4.454320987654321,
"grad_norm": 0.02089673839509487,
"learning_rate": 2.182716049382716e-06,
"loss": 0.0877,
"step": 9020
},
{
"epoch": 4.459259259259259,
"grad_norm": 0.0033945185132324696,
"learning_rate": 2.162962962962963e-06,
"loss": 0.1127,
"step": 9030
},
{
"epoch": 4.4641975308641975,
"grad_norm": 0.00884201843291521,
"learning_rate": 2.1432098765432098e-06,
"loss": 0.1677,
"step": 9040
},
{
"epoch": 4.469135802469136,
"grad_norm": 16.309391021728516,
"learning_rate": 2.123456790123457e-06,
"loss": 0.1119,
"step": 9050
},
{
"epoch": 4.474074074074074,
"grad_norm": 0.035716574639081955,
"learning_rate": 2.103703703703704e-06,
"loss": 0.068,
"step": 9060
},
{
"epoch": 4.4790123456790125,
"grad_norm": 0.009720105677843094,
"learning_rate": 2.0839506172839506e-06,
"loss": 0.0933,
"step": 9070
},
{
"epoch": 4.48395061728395,
"grad_norm": 0.2953310012817383,
"learning_rate": 2.0641975308641974e-06,
"loss": 0.0775,
"step": 9080
},
{
"epoch": 4.488888888888889,
"grad_norm": 4.523210525512695,
"learning_rate": 2.0444444444444447e-06,
"loss": 0.2808,
"step": 9090
},
{
"epoch": 4.493827160493828,
"grad_norm": 2.265265464782715,
"learning_rate": 2.0246913580246915e-06,
"loss": 0.0274,
"step": 9100
},
{
"epoch": 4.498765432098765,
"grad_norm": 2.9944541454315186,
"learning_rate": 2.0049382716049383e-06,
"loss": 0.1563,
"step": 9110
},
{
"epoch": 4.503703703703704,
"grad_norm": 15.32995891571045,
"learning_rate": 1.985185185185185e-06,
"loss": 0.0304,
"step": 9120
},
{
"epoch": 4.508641975308642,
"grad_norm": 124.7613754272461,
"learning_rate": 1.9654320987654324e-06,
"loss": 0.2997,
"step": 9130
},
{
"epoch": 4.51358024691358,
"grad_norm": 0.20713317394256592,
"learning_rate": 1.945679012345679e-06,
"loss": 0.1026,
"step": 9140
},
{
"epoch": 4.518518518518518,
"grad_norm": 38.10224533081055,
"learning_rate": 1.925925925925926e-06,
"loss": 0.0983,
"step": 9150
},
{
"epoch": 4.523456790123457,
"grad_norm": 0.042433250695466995,
"learning_rate": 1.906172839506173e-06,
"loss": 0.0291,
"step": 9160
},
{
"epoch": 4.528395061728395,
"grad_norm": 3.1156327724456787,
"learning_rate": 1.8864197530864198e-06,
"loss": 0.0577,
"step": 9170
},
{
"epoch": 4.533333333333333,
"grad_norm": 0.026819046586751938,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.1211,
"step": 9180
},
{
"epoch": 4.538271604938272,
"grad_norm": 0.4800088107585907,
"learning_rate": 1.8469135802469137e-06,
"loss": 0.0023,
"step": 9190
},
{
"epoch": 4.54320987654321,
"grad_norm": 0.050341859459877014,
"learning_rate": 1.8271604938271605e-06,
"loss": 0.036,
"step": 9200
},
{
"epoch": 4.548148148148148,
"grad_norm": 0.11272630095481873,
"learning_rate": 1.8074074074074075e-06,
"loss": 0.0335,
"step": 9210
},
{
"epoch": 4.553086419753086,
"grad_norm": 44.774688720703125,
"learning_rate": 1.7876543209876545e-06,
"loss": 0.1142,
"step": 9220
},
{
"epoch": 4.558024691358025,
"grad_norm": 0.0022994689643383026,
"learning_rate": 1.7679012345679014e-06,
"loss": 0.0641,
"step": 9230
},
{
"epoch": 4.562962962962963,
"grad_norm": 0.9468904733657837,
"learning_rate": 1.7481481481481482e-06,
"loss": 0.1574,
"step": 9240
},
{
"epoch": 4.567901234567901,
"grad_norm": 0.022345565259456635,
"learning_rate": 1.7283950617283952e-06,
"loss": 0.1025,
"step": 9250
},
{
"epoch": 4.57283950617284,
"grad_norm": 12.888065338134766,
"learning_rate": 1.7086419753086422e-06,
"loss": 0.1864,
"step": 9260
},
{
"epoch": 4.5777777777777775,
"grad_norm": 94.58697509765625,
"learning_rate": 1.688888888888889e-06,
"loss": 0.1861,
"step": 9270
},
{
"epoch": 4.582716049382716,
"grad_norm": 66.434326171875,
"learning_rate": 1.6691358024691359e-06,
"loss": 0.0646,
"step": 9280
},
{
"epoch": 4.587654320987655,
"grad_norm": 0.005768710281699896,
"learning_rate": 1.6493827160493827e-06,
"loss": 0.1047,
"step": 9290
},
{
"epoch": 4.592592592592593,
"grad_norm": 0.08475484699010849,
"learning_rate": 1.62962962962963e-06,
"loss": 0.1706,
"step": 9300
},
{
"epoch": 4.597530864197531,
"grad_norm": 0.871222972869873,
"learning_rate": 1.6098765432098767e-06,
"loss": 0.0384,
"step": 9310
},
{
"epoch": 4.602469135802469,
"grad_norm": 35.023040771484375,
"learning_rate": 1.5901234567901235e-06,
"loss": 0.1562,
"step": 9320
},
{
"epoch": 4.607407407407408,
"grad_norm": 0.08310205489397049,
"learning_rate": 1.5703703703703704e-06,
"loss": 0.1636,
"step": 9330
},
{
"epoch": 4.612345679012345,
"grad_norm": 0.008625690825283527,
"learning_rate": 1.5506172839506172e-06,
"loss": 0.1299,
"step": 9340
},
{
"epoch": 4.617283950617284,
"grad_norm": 0.07079397141933441,
"learning_rate": 1.5308641975308644e-06,
"loss": 0.2401,
"step": 9350
},
{
"epoch": 4.622222222222222,
"grad_norm": 0.002696413081139326,
"learning_rate": 1.5111111111111112e-06,
"loss": 0.1377,
"step": 9360
},
{
"epoch": 4.62716049382716,
"grad_norm": 52.69441604614258,
"learning_rate": 1.491358024691358e-06,
"loss": 0.3121,
"step": 9370
},
{
"epoch": 4.632098765432099,
"grad_norm": 192.6532745361328,
"learning_rate": 1.4716049382716049e-06,
"loss": 0.0441,
"step": 9380
},
{
"epoch": 4.637037037037037,
"grad_norm": 249.43846130371094,
"learning_rate": 1.451851851851852e-06,
"loss": 0.299,
"step": 9390
},
{
"epoch": 4.6419753086419755,
"grad_norm": 0.05828845128417015,
"learning_rate": 1.432098765432099e-06,
"loss": 0.0683,
"step": 9400
},
{
"epoch": 4.646913580246913,
"grad_norm": 176.3085174560547,
"learning_rate": 1.4123456790123457e-06,
"loss": 0.0396,
"step": 9410
},
{
"epoch": 4.651851851851852,
"grad_norm": 3.0951056480407715,
"learning_rate": 1.3925925925925925e-06,
"loss": 0.0874,
"step": 9420
},
{
"epoch": 4.6567901234567906,
"grad_norm": 1.2149375677108765,
"learning_rate": 1.3728395061728398e-06,
"loss": 0.1504,
"step": 9430
},
{
"epoch": 4.661728395061728,
"grad_norm": 0.05385606735944748,
"learning_rate": 1.3530864197530866e-06,
"loss": 0.0918,
"step": 9440
},
{
"epoch": 4.666666666666667,
"grad_norm": 11.512873649597168,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0947,
"step": 9450
},
{
"epoch": 4.671604938271605,
"grad_norm": 0.024780087172985077,
"learning_rate": 1.3135802469135802e-06,
"loss": 0.0753,
"step": 9460
},
{
"epoch": 4.676543209876543,
"grad_norm": 0.2996337115764618,
"learning_rate": 1.2938271604938275e-06,
"loss": 0.171,
"step": 9470
},
{
"epoch": 4.681481481481481,
"grad_norm": 0.09016973525285721,
"learning_rate": 1.2740740740740743e-06,
"loss": 0.0803,
"step": 9480
},
{
"epoch": 4.68641975308642,
"grad_norm": 0.24141840636730194,
"learning_rate": 1.254320987654321e-06,
"loss": 0.1636,
"step": 9490
},
{
"epoch": 4.6913580246913575,
"grad_norm": 0.0026981073897331953,
"learning_rate": 1.234567901234568e-06,
"loss": 0.1209,
"step": 9500
},
{
"epoch": 4.696296296296296,
"grad_norm": 0.0028422910254448652,
"learning_rate": 1.214814814814815e-06,
"loss": 0.0334,
"step": 9510
},
{
"epoch": 4.701234567901235,
"grad_norm": 100.68513488769531,
"learning_rate": 1.1950617283950618e-06,
"loss": 0.3581,
"step": 9520
},
{
"epoch": 4.706172839506173,
"grad_norm": 0.001111358986236155,
"learning_rate": 1.1753086419753088e-06,
"loss": 0.0474,
"step": 9530
},
{
"epoch": 4.711111111111111,
"grad_norm": 60.36039733886719,
"learning_rate": 1.1555555555555556e-06,
"loss": 0.4299,
"step": 9540
},
{
"epoch": 4.716049382716049,
"grad_norm": 0.0019079376943409443,
"learning_rate": 1.1358024691358026e-06,
"loss": 0.0945,
"step": 9550
},
{
"epoch": 4.720987654320988,
"grad_norm": 0.46460771560668945,
"learning_rate": 1.1160493827160494e-06,
"loss": 0.312,
"step": 9560
},
{
"epoch": 4.725925925925926,
"grad_norm": 1.906554937362671,
"learning_rate": 1.0962962962962965e-06,
"loss": 0.0951,
"step": 9570
},
{
"epoch": 4.730864197530864,
"grad_norm": 1.5617965459823608,
"learning_rate": 1.0765432098765433e-06,
"loss": 0.1714,
"step": 9580
},
{
"epoch": 4.735802469135803,
"grad_norm": 5.5619893074035645,
"learning_rate": 1.0567901234567903e-06,
"loss": 0.008,
"step": 9590
},
{
"epoch": 4.7407407407407405,
"grad_norm": 0.01501123234629631,
"learning_rate": 1.0370370370370371e-06,
"loss": 0.4485,
"step": 9600
},
{
"epoch": 4.745679012345679,
"grad_norm": 22.644359588623047,
"learning_rate": 1.0172839506172842e-06,
"loss": 0.0708,
"step": 9610
},
{
"epoch": 4.750617283950618,
"grad_norm": 0.0668986439704895,
"learning_rate": 9.97530864197531e-07,
"loss": 0.2169,
"step": 9620
},
{
"epoch": 4.7555555555555555,
"grad_norm": 0.5103172063827515,
"learning_rate": 9.77777777777778e-07,
"loss": 0.1709,
"step": 9630
},
{
"epoch": 4.760493827160494,
"grad_norm": 63.763214111328125,
"learning_rate": 9.580246913580248e-07,
"loss": 0.3668,
"step": 9640
},
{
"epoch": 4.765432098765432,
"grad_norm": 0.013139153830707073,
"learning_rate": 9.382716049382717e-07,
"loss": 0.0545,
"step": 9650
},
{
"epoch": 4.770370370370371,
"grad_norm": 0.009220450185239315,
"learning_rate": 9.185185185185185e-07,
"loss": 0.1341,
"step": 9660
},
{
"epoch": 4.775308641975308,
"grad_norm": 0.03191829100251198,
"learning_rate": 8.987654320987656e-07,
"loss": 0.1266,
"step": 9670
},
{
"epoch": 4.780246913580247,
"grad_norm": 37.74824523925781,
"learning_rate": 8.790123456790124e-07,
"loss": 0.1043,
"step": 9680
},
{
"epoch": 4.785185185185185,
"grad_norm": 0.002283359644934535,
"learning_rate": 8.592592592592593e-07,
"loss": 0.033,
"step": 9690
},
{
"epoch": 4.790123456790123,
"grad_norm": 0.457742840051651,
"learning_rate": 8.395061728395062e-07,
"loss": 0.1186,
"step": 9700
},
{
"epoch": 4.795061728395062,
"grad_norm": 0.031063128262758255,
"learning_rate": 8.197530864197531e-07,
"loss": 0.1125,
"step": 9710
},
{
"epoch": 4.8,
"grad_norm": 0.012924841605126858,
"learning_rate": 8.000000000000001e-07,
"loss": 0.0156,
"step": 9720
},
{
"epoch": 4.8049382716049385,
"grad_norm": 0.11566291004419327,
"learning_rate": 7.802469135802469e-07,
"loss": 0.1286,
"step": 9730
},
{
"epoch": 4.809876543209876,
"grad_norm": 0.0004868748364970088,
"learning_rate": 7.604938271604939e-07,
"loss": 0.0012,
"step": 9740
},
{
"epoch": 4.814814814814815,
"grad_norm": 81.78207397460938,
"learning_rate": 7.407407407407407e-07,
"loss": 0.1942,
"step": 9750
},
{
"epoch": 4.8197530864197535,
"grad_norm": 35.868988037109375,
"learning_rate": 7.209876543209878e-07,
"loss": 0.0298,
"step": 9760
},
{
"epoch": 4.824691358024691,
"grad_norm": 147.98873901367188,
"learning_rate": 7.012345679012346e-07,
"loss": 0.2483,
"step": 9770
},
{
"epoch": 4.82962962962963,
"grad_norm": 0.013545212335884571,
"learning_rate": 6.814814814814816e-07,
"loss": 0.1292,
"step": 9780
},
{
"epoch": 4.834567901234568,
"grad_norm": 0.09124937653541565,
"learning_rate": 6.617283950617284e-07,
"loss": 0.0697,
"step": 9790
},
{
"epoch": 4.839506172839506,
"grad_norm": 0.005743750836700201,
"learning_rate": 6.419753086419754e-07,
"loss": 0.0975,
"step": 9800
},
{
"epoch": 4.844444444444444,
"grad_norm": 60.90267562866211,
"learning_rate": 6.222222222222223e-07,
"loss": 0.0291,
"step": 9810
},
{
"epoch": 4.849382716049383,
"grad_norm": 0.005148892290890217,
"learning_rate": 6.024691358024692e-07,
"loss": 0.0886,
"step": 9820
},
{
"epoch": 4.8543209876543205,
"grad_norm": 134.9575958251953,
"learning_rate": 5.827160493827161e-07,
"loss": 0.2035,
"step": 9830
},
{
"epoch": 4.859259259259259,
"grad_norm": 3.4503517150878906,
"learning_rate": 5.62962962962963e-07,
"loss": 0.1403,
"step": 9840
},
{
"epoch": 4.864197530864198,
"grad_norm": 0.5870628356933594,
"learning_rate": 5.432098765432099e-07,
"loss": 0.0645,
"step": 9850
},
{
"epoch": 4.869135802469136,
"grad_norm": 2.804311513900757,
"learning_rate": 5.234567901234569e-07,
"loss": 0.0234,
"step": 9860
},
{
"epoch": 4.874074074074074,
"grad_norm": 0.07958123087882996,
"learning_rate": 5.037037037037038e-07,
"loss": 0.0569,
"step": 9870
},
{
"epoch": 4.879012345679012,
"grad_norm": 0.013184885494410992,
"learning_rate": 4.839506172839507e-07,
"loss": 0.0514,
"step": 9880
},
{
"epoch": 4.883950617283951,
"grad_norm": 0.04747697710990906,
"learning_rate": 4.6419753086419757e-07,
"loss": 0.0002,
"step": 9890
},
{
"epoch": 4.888888888888889,
"grad_norm": 3.1284358501434326,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0066,
"step": 9900
},
{
"epoch": 4.893827160493827,
"grad_norm": 0.6298085451126099,
"learning_rate": 4.246913580246914e-07,
"loss": 0.0583,
"step": 9910
},
{
"epoch": 4.898765432098766,
"grad_norm": 0.012326150201261044,
"learning_rate": 4.0493827160493833e-07,
"loss": 0.0099,
"step": 9920
},
{
"epoch": 4.9037037037037035,
"grad_norm": 2.6905531883239746,
"learning_rate": 3.8518518518518525e-07,
"loss": 0.1259,
"step": 9930
},
{
"epoch": 4.908641975308642,
"grad_norm": 120.47846221923828,
"learning_rate": 3.6543209876543217e-07,
"loss": 0.1349,
"step": 9940
},
{
"epoch": 4.91358024691358,
"grad_norm": 0.0025870108511298895,
"learning_rate": 3.45679012345679e-07,
"loss": 0.1368,
"step": 9950
},
{
"epoch": 4.9185185185185185,
"grad_norm": 0.8233745694160461,
"learning_rate": 3.259259259259259e-07,
"loss": 0.1123,
"step": 9960
},
{
"epoch": 4.923456790123457,
"grad_norm": 0.0019518863409757614,
"learning_rate": 3.061728395061729e-07,
"loss": 0.285,
"step": 9970
},
{
"epoch": 4.928395061728395,
"grad_norm": 0.3376046121120453,
"learning_rate": 2.864197530864198e-07,
"loss": 0.4414,
"step": 9980
},
{
"epoch": 4.933333333333334,
"grad_norm": 0.006334675010293722,
"learning_rate": 2.666666666666667e-07,
"loss": 0.0841,
"step": 9990
},
{
"epoch": 4.938271604938271,
"grad_norm": 0.002394834766164422,
"learning_rate": 2.469135802469136e-07,
"loss": 0.0574,
"step": 10000
},
{
"epoch": 4.94320987654321,
"grad_norm": 0.0032636672258377075,
"learning_rate": 2.2716049382716051e-07,
"loss": 0.2294,
"step": 10010
},
{
"epoch": 4.948148148148148,
"grad_norm": 113.65235137939453,
"learning_rate": 2.074074074074074e-07,
"loss": 0.1907,
"step": 10020
},
{
"epoch": 4.953086419753086,
"grad_norm": 0.006610922981053591,
"learning_rate": 1.8765432098765433e-07,
"loss": 0.2999,
"step": 10030
},
{
"epoch": 4.958024691358025,
"grad_norm": 21.574785232543945,
"learning_rate": 1.6790123456790125e-07,
"loss": 0.1753,
"step": 10040
},
{
"epoch": 4.962962962962963,
"grad_norm": 0.019113583490252495,
"learning_rate": 1.4814814814814817e-07,
"loss": 0.1539,
"step": 10050
},
{
"epoch": 4.9679012345679014,
"grad_norm": 142.03480529785156,
"learning_rate": 1.2839506172839507e-07,
"loss": 0.201,
"step": 10060
},
{
"epoch": 4.972839506172839,
"grad_norm": 0.005930395796895027,
"learning_rate": 1.0864197530864197e-07,
"loss": 0.3736,
"step": 10070
},
{
"epoch": 4.977777777777778,
"grad_norm": 0.011048276908695698,
"learning_rate": 8.88888888888889e-08,
"loss": 0.1982,
"step": 10080
},
{
"epoch": 4.9827160493827165,
"grad_norm": 0.11679836362600327,
"learning_rate": 6.913580246913582e-08,
"loss": 0.2382,
"step": 10090
},
{
"epoch": 4.987654320987654,
"grad_norm": 114.29679870605469,
"learning_rate": 4.938271604938272e-08,
"loss": 0.5543,
"step": 10100
},
{
"epoch": 4.992592592592593,
"grad_norm": 0.07527362555265427,
"learning_rate": 2.9629629629629632e-08,
"loss": 0.0568,
"step": 10110
},
{
"epoch": 4.997530864197531,
"grad_norm": 1.4482346773147583,
"learning_rate": 9.876543209876544e-09,
"loss": 0.2086,
"step": 10120
},
{
"epoch": 5.0,
"eval_accuracy": 0.9862962962962963,
"eval_loss": 0.060996126383543015,
"eval_runtime": 32.7337,
"eval_samples_per_second": 164.968,
"eval_steps_per_second": 20.621,
"step": 10125
},
{
"epoch": 5.0,
"step": 10125,
"total_flos": 2.013785167306752e+18,
"train_loss": 0.2160879238260289,
"train_runtime": 1485.1852,
"train_samples_per_second": 54.539,
"train_steps_per_second": 6.817
}
],
"logging_steps": 10,
"max_steps": 10125,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.013785167306752e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}