100rab25's picture
End of training
2e3d8d2 verified
raw
history blame contribute delete
No virus
112 kB
{
"best_metric": 0.9518218623481781,
"best_model_checkpoint": "swin-tiny-patch4-window7-224-hotel_images_classifier_v2/checkpoint-3470",
"epoch": 4.9946023749550195,
"eval_steps": 500,
"global_step": 3470,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 4.747580528259277,
"learning_rate": 7.204610951008646e-07,
"loss": 1.9842,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 5.086691379547119,
"learning_rate": 1.4409221902017292e-06,
"loss": 1.9811,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 4.6829681396484375,
"learning_rate": 2.161383285302594e-06,
"loss": 1.9832,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 4.578265190124512,
"learning_rate": 2.8818443804034585e-06,
"loss": 1.9501,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 4.904880523681641,
"learning_rate": 3.602305475504323e-06,
"loss": 1.917,
"step": 25
},
{
"epoch": 0.04,
"grad_norm": 5.561208248138428,
"learning_rate": 4.322766570605188e-06,
"loss": 1.868,
"step": 30
},
{
"epoch": 0.05,
"grad_norm": 4.837254047393799,
"learning_rate": 5.043227665706052e-06,
"loss": 1.8244,
"step": 35
},
{
"epoch": 0.06,
"grad_norm": 4.515142440795898,
"learning_rate": 5.763688760806917e-06,
"loss": 1.7756,
"step": 40
},
{
"epoch": 0.06,
"grad_norm": 6.021132946014404,
"learning_rate": 6.484149855907781e-06,
"loss": 1.7231,
"step": 45
},
{
"epoch": 0.07,
"grad_norm": 6.325433254241943,
"learning_rate": 7.204610951008646e-06,
"loss": 1.6389,
"step": 50
},
{
"epoch": 0.08,
"grad_norm": 6.28499698638916,
"learning_rate": 7.92507204610951e-06,
"loss": 1.5651,
"step": 55
},
{
"epoch": 0.09,
"grad_norm": 4.940927982330322,
"learning_rate": 8.645533141210376e-06,
"loss": 1.5001,
"step": 60
},
{
"epoch": 0.09,
"grad_norm": 4.624394416809082,
"learning_rate": 9.36599423631124e-06,
"loss": 1.3998,
"step": 65
},
{
"epoch": 0.1,
"grad_norm": 4.9989118576049805,
"learning_rate": 1.0086455331412104e-05,
"loss": 1.2913,
"step": 70
},
{
"epoch": 0.11,
"grad_norm": 6.203399658203125,
"learning_rate": 1.0806916426512968e-05,
"loss": 1.1994,
"step": 75
},
{
"epoch": 0.12,
"grad_norm": 4.825283050537109,
"learning_rate": 1.1527377521613834e-05,
"loss": 1.0557,
"step": 80
},
{
"epoch": 0.12,
"grad_norm": 6.639811992645264,
"learning_rate": 1.2247838616714698e-05,
"loss": 1.0077,
"step": 85
},
{
"epoch": 0.13,
"grad_norm": 5.167383670806885,
"learning_rate": 1.2968299711815562e-05,
"loss": 0.9303,
"step": 90
},
{
"epoch": 0.14,
"grad_norm": 6.177196979522705,
"learning_rate": 1.3688760806916426e-05,
"loss": 0.7967,
"step": 95
},
{
"epoch": 0.14,
"grad_norm": 5.489429950714111,
"learning_rate": 1.4409221902017291e-05,
"loss": 0.7269,
"step": 100
},
{
"epoch": 0.15,
"grad_norm": 5.555374622344971,
"learning_rate": 1.5129682997118155e-05,
"loss": 0.7176,
"step": 105
},
{
"epoch": 0.16,
"grad_norm": 11.141295433044434,
"learning_rate": 1.585014409221902e-05,
"loss": 0.6796,
"step": 110
},
{
"epoch": 0.17,
"grad_norm": 7.412641525268555,
"learning_rate": 1.6570605187319883e-05,
"loss": 0.6028,
"step": 115
},
{
"epoch": 0.17,
"grad_norm": 6.904923439025879,
"learning_rate": 1.7291066282420752e-05,
"loss": 0.6348,
"step": 120
},
{
"epoch": 0.18,
"grad_norm": 11.165042877197266,
"learning_rate": 1.8011527377521615e-05,
"loss": 0.5814,
"step": 125
},
{
"epoch": 0.19,
"grad_norm": 7.367648124694824,
"learning_rate": 1.873198847262248e-05,
"loss": 0.5858,
"step": 130
},
{
"epoch": 0.19,
"grad_norm": 7.115988254547119,
"learning_rate": 1.9452449567723343e-05,
"loss": 0.5316,
"step": 135
},
{
"epoch": 0.2,
"grad_norm": 6.44365119934082,
"learning_rate": 2.017291066282421e-05,
"loss": 0.5049,
"step": 140
},
{
"epoch": 0.21,
"grad_norm": 7.195384502410889,
"learning_rate": 2.0893371757925074e-05,
"loss": 0.5511,
"step": 145
},
{
"epoch": 0.22,
"grad_norm": 17.6825008392334,
"learning_rate": 2.1613832853025936e-05,
"loss": 0.5124,
"step": 150
},
{
"epoch": 0.22,
"grad_norm": 7.656848907470703,
"learning_rate": 2.2334293948126802e-05,
"loss": 0.4794,
"step": 155
},
{
"epoch": 0.23,
"grad_norm": 7.221956729888916,
"learning_rate": 2.3054755043227668e-05,
"loss": 0.4773,
"step": 160
},
{
"epoch": 0.24,
"grad_norm": 16.787612915039062,
"learning_rate": 2.3775216138328533e-05,
"loss": 0.4746,
"step": 165
},
{
"epoch": 0.24,
"grad_norm": 7.123960494995117,
"learning_rate": 2.4495677233429396e-05,
"loss": 0.4734,
"step": 170
},
{
"epoch": 0.25,
"grad_norm": 7.737701416015625,
"learning_rate": 2.5216138328530258e-05,
"loss": 0.4613,
"step": 175
},
{
"epoch": 0.26,
"grad_norm": 7.011651515960693,
"learning_rate": 2.5936599423631124e-05,
"loss": 0.4886,
"step": 180
},
{
"epoch": 0.27,
"grad_norm": 8.571374893188477,
"learning_rate": 2.6657060518731993e-05,
"loss": 0.4702,
"step": 185
},
{
"epoch": 0.27,
"grad_norm": 7.675159454345703,
"learning_rate": 2.737752161383285e-05,
"loss": 0.5365,
"step": 190
},
{
"epoch": 0.28,
"grad_norm": 5.9088239669799805,
"learning_rate": 2.8097982708933717e-05,
"loss": 0.3823,
"step": 195
},
{
"epoch": 0.29,
"grad_norm": 5.840087413787842,
"learning_rate": 2.8818443804034583e-05,
"loss": 0.408,
"step": 200
},
{
"epoch": 0.3,
"grad_norm": 6.880429267883301,
"learning_rate": 2.953890489913545e-05,
"loss": 0.4787,
"step": 205
},
{
"epoch": 0.3,
"grad_norm": 5.355893611907959,
"learning_rate": 3.025936599423631e-05,
"loss": 0.377,
"step": 210
},
{
"epoch": 0.31,
"grad_norm": 7.921416759490967,
"learning_rate": 3.097982708933718e-05,
"loss": 0.4504,
"step": 215
},
{
"epoch": 0.32,
"grad_norm": 5.329736232757568,
"learning_rate": 3.170028818443804e-05,
"loss": 0.4056,
"step": 220
},
{
"epoch": 0.32,
"grad_norm": 5.699007034301758,
"learning_rate": 3.242074927953891e-05,
"loss": 0.395,
"step": 225
},
{
"epoch": 0.33,
"grad_norm": 10.29712963104248,
"learning_rate": 3.314121037463977e-05,
"loss": 0.433,
"step": 230
},
{
"epoch": 0.34,
"grad_norm": 8.653733253479004,
"learning_rate": 3.3861671469740636e-05,
"loss": 0.3733,
"step": 235
},
{
"epoch": 0.35,
"grad_norm": 4.476428508758545,
"learning_rate": 3.4582132564841505e-05,
"loss": 0.3758,
"step": 240
},
{
"epoch": 0.35,
"grad_norm": 7.4768571853637695,
"learning_rate": 3.530259365994236e-05,
"loss": 0.4628,
"step": 245
},
{
"epoch": 0.36,
"grad_norm": 7.058348655700684,
"learning_rate": 3.602305475504323e-05,
"loss": 0.3855,
"step": 250
},
{
"epoch": 0.37,
"grad_norm": 7.238952159881592,
"learning_rate": 3.674351585014409e-05,
"loss": 0.389,
"step": 255
},
{
"epoch": 0.37,
"grad_norm": 7.494441032409668,
"learning_rate": 3.746397694524496e-05,
"loss": 0.4285,
"step": 260
},
{
"epoch": 0.38,
"grad_norm": 6.927433490753174,
"learning_rate": 3.818443804034582e-05,
"loss": 0.357,
"step": 265
},
{
"epoch": 0.39,
"grad_norm": 8.478387832641602,
"learning_rate": 3.8904899135446685e-05,
"loss": 0.3612,
"step": 270
},
{
"epoch": 0.4,
"grad_norm": 9.04246997833252,
"learning_rate": 3.9625360230547554e-05,
"loss": 0.3068,
"step": 275
},
{
"epoch": 0.4,
"grad_norm": 7.052452087402344,
"learning_rate": 4.034582132564842e-05,
"loss": 0.3388,
"step": 280
},
{
"epoch": 0.41,
"grad_norm": 6.3666510581970215,
"learning_rate": 4.106628242074928e-05,
"loss": 0.398,
"step": 285
},
{
"epoch": 0.42,
"grad_norm": 7.982662200927734,
"learning_rate": 4.178674351585015e-05,
"loss": 0.3796,
"step": 290
},
{
"epoch": 0.42,
"grad_norm": 6.020977973937988,
"learning_rate": 4.250720461095101e-05,
"loss": 0.4266,
"step": 295
},
{
"epoch": 0.43,
"grad_norm": 7.010791778564453,
"learning_rate": 4.322766570605187e-05,
"loss": 0.4219,
"step": 300
},
{
"epoch": 0.44,
"grad_norm": 5.0191216468811035,
"learning_rate": 4.394812680115274e-05,
"loss": 0.3489,
"step": 305
},
{
"epoch": 0.45,
"grad_norm": 5.907705307006836,
"learning_rate": 4.4668587896253604e-05,
"loss": 0.3821,
"step": 310
},
{
"epoch": 0.45,
"grad_norm": 6.560094356536865,
"learning_rate": 4.538904899135447e-05,
"loss": 0.3874,
"step": 315
},
{
"epoch": 0.46,
"grad_norm": 6.429476737976074,
"learning_rate": 4.6109510086455335e-05,
"loss": 0.4096,
"step": 320
},
{
"epoch": 0.47,
"grad_norm": 7.065363883972168,
"learning_rate": 4.68299711815562e-05,
"loss": 0.3778,
"step": 325
},
{
"epoch": 0.47,
"grad_norm": 7.916449069976807,
"learning_rate": 4.7550432276657067e-05,
"loss": 0.3491,
"step": 330
},
{
"epoch": 0.48,
"grad_norm": 5.4434709548950195,
"learning_rate": 4.827089337175792e-05,
"loss": 0.3645,
"step": 335
},
{
"epoch": 0.49,
"grad_norm": 6.34391975402832,
"learning_rate": 4.899135446685879e-05,
"loss": 0.3778,
"step": 340
},
{
"epoch": 0.5,
"grad_norm": 6.070534706115723,
"learning_rate": 4.971181556195966e-05,
"loss": 0.4307,
"step": 345
},
{
"epoch": 0.5,
"grad_norm": 8.251782417297363,
"learning_rate": 4.995196926032661e-05,
"loss": 0.3964,
"step": 350
},
{
"epoch": 0.51,
"grad_norm": 5.293612957000732,
"learning_rate": 4.9871918027537626e-05,
"loss": 0.3379,
"step": 355
},
{
"epoch": 0.52,
"grad_norm": 7.164644241333008,
"learning_rate": 4.979186679474864e-05,
"loss": 0.3969,
"step": 360
},
{
"epoch": 0.53,
"grad_norm": 4.961303234100342,
"learning_rate": 4.971181556195966e-05,
"loss": 0.3697,
"step": 365
},
{
"epoch": 0.53,
"grad_norm": 6.196359157562256,
"learning_rate": 4.9631764329170674e-05,
"loss": 0.3448,
"step": 370
},
{
"epoch": 0.54,
"grad_norm": 5.836663722991943,
"learning_rate": 4.955171309638169e-05,
"loss": 0.3939,
"step": 375
},
{
"epoch": 0.55,
"grad_norm": 5.845285892486572,
"learning_rate": 4.94716618635927e-05,
"loss": 0.356,
"step": 380
},
{
"epoch": 0.55,
"grad_norm": 3.937917947769165,
"learning_rate": 4.9391610630803715e-05,
"loss": 0.3033,
"step": 385
},
{
"epoch": 0.56,
"grad_norm": 6.883370399475098,
"learning_rate": 4.9311559398014736e-05,
"loss": 0.415,
"step": 390
},
{
"epoch": 0.57,
"grad_norm": 6.164604663848877,
"learning_rate": 4.923150816522575e-05,
"loss": 0.3794,
"step": 395
},
{
"epoch": 0.58,
"grad_norm": 10.906937599182129,
"learning_rate": 4.9151456932436764e-05,
"loss": 0.3967,
"step": 400
},
{
"epoch": 0.58,
"grad_norm": 3.428271532058716,
"learning_rate": 4.907140569964778e-05,
"loss": 0.3312,
"step": 405
},
{
"epoch": 0.59,
"grad_norm": 7.288811206817627,
"learning_rate": 4.899135446685879e-05,
"loss": 0.3681,
"step": 410
},
{
"epoch": 0.6,
"grad_norm": 8.319820404052734,
"learning_rate": 4.8911303234069805e-05,
"loss": 0.3265,
"step": 415
},
{
"epoch": 0.6,
"grad_norm": 6.9813232421875,
"learning_rate": 4.883125200128082e-05,
"loss": 0.3233,
"step": 420
},
{
"epoch": 0.61,
"grad_norm": 5.874197959899902,
"learning_rate": 4.875120076849184e-05,
"loss": 0.3689,
"step": 425
},
{
"epoch": 0.62,
"grad_norm": 5.609955787658691,
"learning_rate": 4.867114953570285e-05,
"loss": 0.4213,
"step": 430
},
{
"epoch": 0.63,
"grad_norm": 5.877446174621582,
"learning_rate": 4.859109830291387e-05,
"loss": 0.419,
"step": 435
},
{
"epoch": 0.63,
"grad_norm": 6.771636962890625,
"learning_rate": 4.851104707012488e-05,
"loss": 0.3529,
"step": 440
},
{
"epoch": 0.64,
"grad_norm": 9.461392402648926,
"learning_rate": 4.8430995837335894e-05,
"loss": 0.3725,
"step": 445
},
{
"epoch": 0.65,
"grad_norm": 5.563230991363525,
"learning_rate": 4.835094460454691e-05,
"loss": 0.3675,
"step": 450
},
{
"epoch": 0.65,
"grad_norm": 6.4672465324401855,
"learning_rate": 4.827089337175792e-05,
"loss": 0.3156,
"step": 455
},
{
"epoch": 0.66,
"grad_norm": 3.8499579429626465,
"learning_rate": 4.819084213896894e-05,
"loss": 0.322,
"step": 460
},
{
"epoch": 0.67,
"grad_norm": 5.031641960144043,
"learning_rate": 4.8110790906179956e-05,
"loss": 0.3936,
"step": 465
},
{
"epoch": 0.68,
"grad_norm": 5.684152603149414,
"learning_rate": 4.803073967339097e-05,
"loss": 0.3898,
"step": 470
},
{
"epoch": 0.68,
"grad_norm": 5.913132190704346,
"learning_rate": 4.7950688440601984e-05,
"loss": 0.3331,
"step": 475
},
{
"epoch": 0.69,
"grad_norm": 5.199942588806152,
"learning_rate": 4.7870637207813005e-05,
"loss": 0.3303,
"step": 480
},
{
"epoch": 0.7,
"grad_norm": 3.992769956588745,
"learning_rate": 4.779058597502402e-05,
"loss": 0.3422,
"step": 485
},
{
"epoch": 0.71,
"grad_norm": 6.158402919769287,
"learning_rate": 4.771053474223503e-05,
"loss": 0.3152,
"step": 490
},
{
"epoch": 0.71,
"grad_norm": 4.361845016479492,
"learning_rate": 4.763048350944605e-05,
"loss": 0.3057,
"step": 495
},
{
"epoch": 0.72,
"grad_norm": 4.663881301879883,
"learning_rate": 4.7550432276657067e-05,
"loss": 0.3461,
"step": 500
},
{
"epoch": 0.73,
"grad_norm": 7.09819221496582,
"learning_rate": 4.747038104386808e-05,
"loss": 0.3675,
"step": 505
},
{
"epoch": 0.73,
"grad_norm": 5.0237956047058105,
"learning_rate": 4.7390329811079094e-05,
"loss": 0.3274,
"step": 510
},
{
"epoch": 0.74,
"grad_norm": 5.483020782470703,
"learning_rate": 4.731027857829011e-05,
"loss": 0.3055,
"step": 515
},
{
"epoch": 0.75,
"grad_norm": 4.972677707672119,
"learning_rate": 4.723022734550112e-05,
"loss": 0.3236,
"step": 520
},
{
"epoch": 0.76,
"grad_norm": 7.017973899841309,
"learning_rate": 4.7150176112712136e-05,
"loss": 0.3543,
"step": 525
},
{
"epoch": 0.76,
"grad_norm": 8.219503402709961,
"learning_rate": 4.7070124879923156e-05,
"loss": 0.3791,
"step": 530
},
{
"epoch": 0.77,
"grad_norm": 5.836394309997559,
"learning_rate": 4.699007364713417e-05,
"loss": 0.2882,
"step": 535
},
{
"epoch": 0.78,
"grad_norm": 6.394532680511475,
"learning_rate": 4.6910022414345184e-05,
"loss": 0.3741,
"step": 540
},
{
"epoch": 0.78,
"grad_norm": 5.4533843994140625,
"learning_rate": 4.68299711815562e-05,
"loss": 0.3852,
"step": 545
},
{
"epoch": 0.79,
"grad_norm": 6.065195083618164,
"learning_rate": 4.674991994876721e-05,
"loss": 0.3589,
"step": 550
},
{
"epoch": 0.8,
"grad_norm": 4.000141620635986,
"learning_rate": 4.6669868715978225e-05,
"loss": 0.2865,
"step": 555
},
{
"epoch": 0.81,
"grad_norm": 6.05587100982666,
"learning_rate": 4.658981748318924e-05,
"loss": 0.316,
"step": 560
},
{
"epoch": 0.81,
"grad_norm": 5.1732892990112305,
"learning_rate": 4.650976625040026e-05,
"loss": 0.2768,
"step": 565
},
{
"epoch": 0.82,
"grad_norm": 4.745729446411133,
"learning_rate": 4.642971501761127e-05,
"loss": 0.2796,
"step": 570
},
{
"epoch": 0.83,
"grad_norm": 4.964130878448486,
"learning_rate": 4.634966378482229e-05,
"loss": 0.3268,
"step": 575
},
{
"epoch": 0.83,
"grad_norm": 5.333953857421875,
"learning_rate": 4.62696125520333e-05,
"loss": 0.321,
"step": 580
},
{
"epoch": 0.84,
"grad_norm": 4.004300117492676,
"learning_rate": 4.6189561319244315e-05,
"loss": 0.3371,
"step": 585
},
{
"epoch": 0.85,
"grad_norm": 6.5950751304626465,
"learning_rate": 4.6109510086455335e-05,
"loss": 0.3028,
"step": 590
},
{
"epoch": 0.86,
"grad_norm": 4.516002655029297,
"learning_rate": 4.602945885366635e-05,
"loss": 0.3539,
"step": 595
},
{
"epoch": 0.86,
"grad_norm": 5.180628776550293,
"learning_rate": 4.594940762087736e-05,
"loss": 0.35,
"step": 600
},
{
"epoch": 0.87,
"grad_norm": 3.2567028999328613,
"learning_rate": 4.586935638808838e-05,
"loss": 0.323,
"step": 605
},
{
"epoch": 0.88,
"grad_norm": 3.9456095695495605,
"learning_rate": 4.57893051552994e-05,
"loss": 0.3378,
"step": 610
},
{
"epoch": 0.89,
"grad_norm": 3.6121273040771484,
"learning_rate": 4.570925392251041e-05,
"loss": 0.2565,
"step": 615
},
{
"epoch": 0.89,
"grad_norm": 4.358009338378906,
"learning_rate": 4.5629202689721425e-05,
"loss": 0.3147,
"step": 620
},
{
"epoch": 0.9,
"grad_norm": 7.531122207641602,
"learning_rate": 4.554915145693244e-05,
"loss": 0.3346,
"step": 625
},
{
"epoch": 0.91,
"grad_norm": 5.810347557067871,
"learning_rate": 4.546910022414345e-05,
"loss": 0.3196,
"step": 630
},
{
"epoch": 0.91,
"grad_norm": 6.805031776428223,
"learning_rate": 4.538904899135447e-05,
"loss": 0.2952,
"step": 635
},
{
"epoch": 0.92,
"grad_norm": 4.857294082641602,
"learning_rate": 4.530899775856549e-05,
"loss": 0.315,
"step": 640
},
{
"epoch": 0.93,
"grad_norm": 4.595619201660156,
"learning_rate": 4.52289465257765e-05,
"loss": 0.3231,
"step": 645
},
{
"epoch": 0.94,
"grad_norm": 5.075206279754639,
"learning_rate": 4.5148895292987514e-05,
"loss": 0.3019,
"step": 650
},
{
"epoch": 0.94,
"grad_norm": 4.71131706237793,
"learning_rate": 4.506884406019853e-05,
"loss": 0.3249,
"step": 655
},
{
"epoch": 0.95,
"grad_norm": 5.032394886016846,
"learning_rate": 4.498879282740954e-05,
"loss": 0.2653,
"step": 660
},
{
"epoch": 0.96,
"grad_norm": 6.4502997398376465,
"learning_rate": 4.4908741594620556e-05,
"loss": 0.309,
"step": 665
},
{
"epoch": 0.96,
"grad_norm": 5.608312129974365,
"learning_rate": 4.4828690361831576e-05,
"loss": 0.2943,
"step": 670
},
{
"epoch": 0.97,
"grad_norm": 5.454727649688721,
"learning_rate": 4.474863912904259e-05,
"loss": 0.3037,
"step": 675
},
{
"epoch": 0.98,
"grad_norm": 4.60232400894165,
"learning_rate": 4.4668587896253604e-05,
"loss": 0.2739,
"step": 680
},
{
"epoch": 0.99,
"grad_norm": 5.319153308868408,
"learning_rate": 4.458853666346462e-05,
"loss": 0.2811,
"step": 685
},
{
"epoch": 0.99,
"grad_norm": 4.6785054206848145,
"learning_rate": 4.450848543067563e-05,
"loss": 0.2929,
"step": 690
},
{
"epoch": 1.0,
"eval_accuracy": 0.9385627530364372,
"eval_loss": 0.16880780458450317,
"eval_runtime": 32.041,
"eval_samples_per_second": 308.355,
"eval_steps_per_second": 9.644,
"step": 694
},
{
"epoch": 1.0,
"grad_norm": 6.902053356170654,
"learning_rate": 4.4428434197886645e-05,
"loss": 0.3839,
"step": 695
},
{
"epoch": 1.01,
"grad_norm": 4.171269416809082,
"learning_rate": 4.434838296509766e-05,
"loss": 0.3153,
"step": 700
},
{
"epoch": 1.01,
"grad_norm": 3.1970090866088867,
"learning_rate": 4.426833173230868e-05,
"loss": 0.2399,
"step": 705
},
{
"epoch": 1.02,
"grad_norm": 5.519264221191406,
"learning_rate": 4.4188280499519693e-05,
"loss": 0.2775,
"step": 710
},
{
"epoch": 1.03,
"grad_norm": 4.797208786010742,
"learning_rate": 4.4108229266730714e-05,
"loss": 0.2805,
"step": 715
},
{
"epoch": 1.04,
"grad_norm": 9.114941596984863,
"learning_rate": 4.402817803394173e-05,
"loss": 0.2846,
"step": 720
},
{
"epoch": 1.04,
"grad_norm": 4.987404823303223,
"learning_rate": 4.394812680115274e-05,
"loss": 0.2849,
"step": 725
},
{
"epoch": 1.05,
"grad_norm": 6.2959136962890625,
"learning_rate": 4.3868075568363755e-05,
"loss": 0.3129,
"step": 730
},
{
"epoch": 1.06,
"grad_norm": 4.492276668548584,
"learning_rate": 4.378802433557477e-05,
"loss": 0.2384,
"step": 735
},
{
"epoch": 1.07,
"grad_norm": 3.5424952507019043,
"learning_rate": 4.370797310278579e-05,
"loss": 0.2183,
"step": 740
},
{
"epoch": 1.07,
"grad_norm": 7.594015598297119,
"learning_rate": 4.3627921869996804e-05,
"loss": 0.2657,
"step": 745
},
{
"epoch": 1.08,
"grad_norm": 6.9036431312561035,
"learning_rate": 4.354787063720782e-05,
"loss": 0.2678,
"step": 750
},
{
"epoch": 1.09,
"grad_norm": 7.780063629150391,
"learning_rate": 4.346781940441883e-05,
"loss": 0.3054,
"step": 755
},
{
"epoch": 1.09,
"grad_norm": 5.562774181365967,
"learning_rate": 4.3387768171629845e-05,
"loss": 0.272,
"step": 760
},
{
"epoch": 1.1,
"grad_norm": 7.2162861824035645,
"learning_rate": 4.330771693884086e-05,
"loss": 0.2678,
"step": 765
},
{
"epoch": 1.11,
"grad_norm": 5.875248432159424,
"learning_rate": 4.322766570605187e-05,
"loss": 0.2691,
"step": 770
},
{
"epoch": 1.12,
"grad_norm": 4.324618339538574,
"learning_rate": 4.314761447326289e-05,
"loss": 0.3025,
"step": 775
},
{
"epoch": 1.12,
"grad_norm": 4.129276275634766,
"learning_rate": 4.306756324047391e-05,
"loss": 0.2596,
"step": 780
},
{
"epoch": 1.13,
"grad_norm": 3.086761713027954,
"learning_rate": 4.298751200768492e-05,
"loss": 0.2528,
"step": 785
},
{
"epoch": 1.14,
"grad_norm": 4.340246200561523,
"learning_rate": 4.2907460774895934e-05,
"loss": 0.223,
"step": 790
},
{
"epoch": 1.14,
"grad_norm": 3.6360461711883545,
"learning_rate": 4.282740954210695e-05,
"loss": 0.2547,
"step": 795
},
{
"epoch": 1.15,
"grad_norm": 4.182173252105713,
"learning_rate": 4.274735830931796e-05,
"loss": 0.287,
"step": 800
},
{
"epoch": 1.16,
"grad_norm": 4.418725490570068,
"learning_rate": 4.2667307076528976e-05,
"loss": 0.2888,
"step": 805
},
{
"epoch": 1.17,
"grad_norm": 4.325172424316406,
"learning_rate": 4.2587255843739996e-05,
"loss": 0.2634,
"step": 810
},
{
"epoch": 1.17,
"grad_norm": 5.551906585693359,
"learning_rate": 4.250720461095101e-05,
"loss": 0.2651,
"step": 815
},
{
"epoch": 1.18,
"grad_norm": 3.631472110748291,
"learning_rate": 4.2427153378162024e-05,
"loss": 0.2745,
"step": 820
},
{
"epoch": 1.19,
"grad_norm": 3.5533196926116943,
"learning_rate": 4.234710214537304e-05,
"loss": 0.2936,
"step": 825
},
{
"epoch": 1.19,
"grad_norm": 4.504055023193359,
"learning_rate": 4.226705091258406e-05,
"loss": 0.3025,
"step": 830
},
{
"epoch": 1.2,
"grad_norm": 4.739752292633057,
"learning_rate": 4.218699967979507e-05,
"loss": 0.303,
"step": 835
},
{
"epoch": 1.21,
"grad_norm": 5.039779186248779,
"learning_rate": 4.2106948447006086e-05,
"loss": 0.2663,
"step": 840
},
{
"epoch": 1.22,
"grad_norm": 3.7070090770721436,
"learning_rate": 4.2026897214217107e-05,
"loss": 0.2663,
"step": 845
},
{
"epoch": 1.22,
"grad_norm": 4.351013660430908,
"learning_rate": 4.194684598142812e-05,
"loss": 0.2454,
"step": 850
},
{
"epoch": 1.23,
"grad_norm": 5.0032830238342285,
"learning_rate": 4.1866794748639134e-05,
"loss": 0.245,
"step": 855
},
{
"epoch": 1.24,
"grad_norm": 3.203274965286255,
"learning_rate": 4.178674351585015e-05,
"loss": 0.3036,
"step": 860
},
{
"epoch": 1.25,
"grad_norm": 4.47341775894165,
"learning_rate": 4.170669228306116e-05,
"loss": 0.3336,
"step": 865
},
{
"epoch": 1.25,
"grad_norm": 4.188334941864014,
"learning_rate": 4.1626641050272176e-05,
"loss": 0.2529,
"step": 870
},
{
"epoch": 1.26,
"grad_norm": 3.3264882564544678,
"learning_rate": 4.154658981748319e-05,
"loss": 0.2258,
"step": 875
},
{
"epoch": 1.27,
"grad_norm": 4.058962821960449,
"learning_rate": 4.146653858469421e-05,
"loss": 0.3129,
"step": 880
},
{
"epoch": 1.27,
"grad_norm": 4.271402359008789,
"learning_rate": 4.1386487351905224e-05,
"loss": 0.2605,
"step": 885
},
{
"epoch": 1.28,
"grad_norm": 8.134669303894043,
"learning_rate": 4.130643611911624e-05,
"loss": 0.3072,
"step": 890
},
{
"epoch": 1.29,
"grad_norm": 5.065728664398193,
"learning_rate": 4.122638488632725e-05,
"loss": 0.2557,
"step": 895
},
{
"epoch": 1.3,
"grad_norm": 4.518153190612793,
"learning_rate": 4.1146333653538265e-05,
"loss": 0.2591,
"step": 900
},
{
"epoch": 1.3,
"grad_norm": 6.0956926345825195,
"learning_rate": 4.106628242074928e-05,
"loss": 0.3001,
"step": 905
},
{
"epoch": 1.31,
"grad_norm": 4.715207099914551,
"learning_rate": 4.098623118796029e-05,
"loss": 0.2882,
"step": 910
},
{
"epoch": 1.32,
"grad_norm": 6.3927435874938965,
"learning_rate": 4.090617995517131e-05,
"loss": 0.2733,
"step": 915
},
{
"epoch": 1.32,
"grad_norm": 3.886277198791504,
"learning_rate": 4.082612872238233e-05,
"loss": 0.2558,
"step": 920
},
{
"epoch": 1.33,
"grad_norm": 6.690213203430176,
"learning_rate": 4.074607748959334e-05,
"loss": 0.2411,
"step": 925
},
{
"epoch": 1.34,
"grad_norm": 5.04226016998291,
"learning_rate": 4.0666026256804355e-05,
"loss": 0.2814,
"step": 930
},
{
"epoch": 1.35,
"grad_norm": 6.361902236938477,
"learning_rate": 4.058597502401537e-05,
"loss": 0.1918,
"step": 935
},
{
"epoch": 1.35,
"grad_norm": 6.6365227699279785,
"learning_rate": 4.050592379122638e-05,
"loss": 0.2714,
"step": 940
},
{
"epoch": 1.36,
"grad_norm": 4.794340133666992,
"learning_rate": 4.04258725584374e-05,
"loss": 0.269,
"step": 945
},
{
"epoch": 1.37,
"grad_norm": 5.207016468048096,
"learning_rate": 4.034582132564842e-05,
"loss": 0.2955,
"step": 950
},
{
"epoch": 1.37,
"grad_norm": 5.347695350646973,
"learning_rate": 4.026577009285944e-05,
"loss": 0.2341,
"step": 955
},
{
"epoch": 1.38,
"grad_norm": 7.788352966308594,
"learning_rate": 4.018571886007045e-05,
"loss": 0.2228,
"step": 960
},
{
"epoch": 1.39,
"grad_norm": 4.078495025634766,
"learning_rate": 4.0105667627281465e-05,
"loss": 0.2408,
"step": 965
},
{
"epoch": 1.4,
"grad_norm": 5.237365245819092,
"learning_rate": 4.002561639449248e-05,
"loss": 0.2891,
"step": 970
},
{
"epoch": 1.4,
"grad_norm": 5.711833953857422,
"learning_rate": 3.994556516170349e-05,
"loss": 0.323,
"step": 975
},
{
"epoch": 1.41,
"grad_norm": 3.250711679458618,
"learning_rate": 3.9865513928914506e-05,
"loss": 0.2945,
"step": 980
},
{
"epoch": 1.42,
"grad_norm": 6.933974266052246,
"learning_rate": 3.978546269612553e-05,
"loss": 0.2507,
"step": 985
},
{
"epoch": 1.42,
"grad_norm": 4.515052795410156,
"learning_rate": 3.970541146333654e-05,
"loss": 0.265,
"step": 990
},
{
"epoch": 1.43,
"grad_norm": 4.89296293258667,
"learning_rate": 3.9625360230547554e-05,
"loss": 0.2868,
"step": 995
},
{
"epoch": 1.44,
"grad_norm": 4.629034996032715,
"learning_rate": 3.954530899775857e-05,
"loss": 0.2773,
"step": 1000
},
{
"epoch": 1.45,
"grad_norm": 3.881559371948242,
"learning_rate": 3.946525776496958e-05,
"loss": 0.3336,
"step": 1005
},
{
"epoch": 1.45,
"grad_norm": 3.4768316745758057,
"learning_rate": 3.9385206532180596e-05,
"loss": 0.2212,
"step": 1010
},
{
"epoch": 1.46,
"grad_norm": 5.582344055175781,
"learning_rate": 3.930515529939161e-05,
"loss": 0.3031,
"step": 1015
},
{
"epoch": 1.47,
"grad_norm": 3.73008131980896,
"learning_rate": 3.922510406660262e-05,
"loss": 0.2557,
"step": 1020
},
{
"epoch": 1.48,
"grad_norm": 5.319180011749268,
"learning_rate": 3.9145052833813644e-05,
"loss": 0.2679,
"step": 1025
},
{
"epoch": 1.48,
"grad_norm": 6.709672451019287,
"learning_rate": 3.906500160102466e-05,
"loss": 0.2471,
"step": 1030
},
{
"epoch": 1.49,
"grad_norm": 5.294819355010986,
"learning_rate": 3.898495036823567e-05,
"loss": 0.2661,
"step": 1035
},
{
"epoch": 1.5,
"grad_norm": 3.2995288372039795,
"learning_rate": 3.8904899135446685e-05,
"loss": 0.2789,
"step": 1040
},
{
"epoch": 1.5,
"grad_norm": 4.34086799621582,
"learning_rate": 3.88248479026577e-05,
"loss": 0.2789,
"step": 1045
},
{
"epoch": 1.51,
"grad_norm": 5.209534168243408,
"learning_rate": 3.874479666986871e-05,
"loss": 0.3002,
"step": 1050
},
{
"epoch": 1.52,
"grad_norm": 5.175271034240723,
"learning_rate": 3.8664745437079733e-05,
"loss": 0.2631,
"step": 1055
},
{
"epoch": 1.53,
"grad_norm": 4.909916400909424,
"learning_rate": 3.858469420429075e-05,
"loss": 0.25,
"step": 1060
},
{
"epoch": 1.53,
"grad_norm": 3.8786613941192627,
"learning_rate": 3.850464297150176e-05,
"loss": 0.226,
"step": 1065
},
{
"epoch": 1.54,
"grad_norm": 4.349425315856934,
"learning_rate": 3.842459173871278e-05,
"loss": 0.2635,
"step": 1070
},
{
"epoch": 1.55,
"grad_norm": 5.107605934143066,
"learning_rate": 3.8344540505923795e-05,
"loss": 0.2536,
"step": 1075
},
{
"epoch": 1.55,
"grad_norm": 5.436495780944824,
"learning_rate": 3.826448927313481e-05,
"loss": 0.2911,
"step": 1080
},
{
"epoch": 1.56,
"grad_norm": 5.1116156578063965,
"learning_rate": 3.818443804034582e-05,
"loss": 0.3064,
"step": 1085
},
{
"epoch": 1.57,
"grad_norm": 4.1365742683410645,
"learning_rate": 3.810438680755684e-05,
"loss": 0.2003,
"step": 1090
},
{
"epoch": 1.58,
"grad_norm": 5.43222188949585,
"learning_rate": 3.802433557476786e-05,
"loss": 0.291,
"step": 1095
},
{
"epoch": 1.58,
"grad_norm": 6.062341690063477,
"learning_rate": 3.794428434197887e-05,
"loss": 0.2325,
"step": 1100
},
{
"epoch": 1.59,
"grad_norm": 4.5507097244262695,
"learning_rate": 3.7864233109189885e-05,
"loss": 0.2493,
"step": 1105
},
{
"epoch": 1.6,
"grad_norm": 3.3975865840911865,
"learning_rate": 3.77841818764009e-05,
"loss": 0.2349,
"step": 1110
},
{
"epoch": 1.6,
"grad_norm": 3.967979907989502,
"learning_rate": 3.770413064361191e-05,
"loss": 0.2364,
"step": 1115
},
{
"epoch": 1.61,
"grad_norm": 4.541342735290527,
"learning_rate": 3.7624079410822926e-05,
"loss": 0.2285,
"step": 1120
},
{
"epoch": 1.62,
"grad_norm": 4.848491668701172,
"learning_rate": 3.754402817803394e-05,
"loss": 0.235,
"step": 1125
},
{
"epoch": 1.63,
"grad_norm": 5.879725933074951,
"learning_rate": 3.746397694524496e-05,
"loss": 0.2759,
"step": 1130
},
{
"epoch": 1.63,
"grad_norm": 6.01210880279541,
"learning_rate": 3.7383925712455975e-05,
"loss": 0.3345,
"step": 1135
},
{
"epoch": 1.64,
"grad_norm": 4.760444641113281,
"learning_rate": 3.730387447966699e-05,
"loss": 0.2708,
"step": 1140
},
{
"epoch": 1.65,
"grad_norm": 4.630128860473633,
"learning_rate": 3.7223823246878e-05,
"loss": 0.3049,
"step": 1145
},
{
"epoch": 1.66,
"grad_norm": 4.3284101486206055,
"learning_rate": 3.7143772014089016e-05,
"loss": 0.2822,
"step": 1150
},
{
"epoch": 1.66,
"grad_norm": 6.679904937744141,
"learning_rate": 3.706372078130003e-05,
"loss": 0.2764,
"step": 1155
},
{
"epoch": 1.67,
"grad_norm": 5.192065238952637,
"learning_rate": 3.6983669548511043e-05,
"loss": 0.2479,
"step": 1160
},
{
"epoch": 1.68,
"grad_norm": 4.901111125946045,
"learning_rate": 3.6903618315722064e-05,
"loss": 0.2849,
"step": 1165
},
{
"epoch": 1.68,
"grad_norm": 6.2184977531433105,
"learning_rate": 3.682356708293308e-05,
"loss": 0.2667,
"step": 1170
},
{
"epoch": 1.69,
"grad_norm": 5.900247573852539,
"learning_rate": 3.674351585014409e-05,
"loss": 0.2992,
"step": 1175
},
{
"epoch": 1.7,
"grad_norm": 3.7004477977752686,
"learning_rate": 3.666346461735511e-05,
"loss": 0.2791,
"step": 1180
},
{
"epoch": 1.71,
"grad_norm": 4.646676063537598,
"learning_rate": 3.6583413384566126e-05,
"loss": 0.2525,
"step": 1185
},
{
"epoch": 1.71,
"grad_norm": 4.426496982574463,
"learning_rate": 3.650336215177714e-05,
"loss": 0.2624,
"step": 1190
},
{
"epoch": 1.72,
"grad_norm": 4.333110809326172,
"learning_rate": 3.6423310918988154e-05,
"loss": 0.2777,
"step": 1195
},
{
"epoch": 1.73,
"grad_norm": 3.7483744621276855,
"learning_rate": 3.6343259686199174e-05,
"loss": 0.2897,
"step": 1200
},
{
"epoch": 1.73,
"grad_norm": 5.556215286254883,
"learning_rate": 3.626320845341019e-05,
"loss": 0.3432,
"step": 1205
},
{
"epoch": 1.74,
"grad_norm": 4.707242965698242,
"learning_rate": 3.61831572206212e-05,
"loss": 0.2439,
"step": 1210
},
{
"epoch": 1.75,
"grad_norm": 4.767390251159668,
"learning_rate": 3.6103105987832216e-05,
"loss": 0.2744,
"step": 1215
},
{
"epoch": 1.76,
"grad_norm": 4.1662492752075195,
"learning_rate": 3.602305475504323e-05,
"loss": 0.267,
"step": 1220
},
{
"epoch": 1.76,
"grad_norm": 4.437891006469727,
"learning_rate": 3.594300352225424e-05,
"loss": 0.2354,
"step": 1225
},
{
"epoch": 1.77,
"grad_norm": 5.63749361038208,
"learning_rate": 3.586295228946526e-05,
"loss": 0.2557,
"step": 1230
},
{
"epoch": 1.78,
"grad_norm": 6.398256778717041,
"learning_rate": 3.578290105667628e-05,
"loss": 0.2697,
"step": 1235
},
{
"epoch": 1.78,
"grad_norm": 4.15376091003418,
"learning_rate": 3.570284982388729e-05,
"loss": 0.2672,
"step": 1240
},
{
"epoch": 1.79,
"grad_norm": 8.952369689941406,
"learning_rate": 3.5622798591098305e-05,
"loss": 0.2992,
"step": 1245
},
{
"epoch": 1.8,
"grad_norm": 7.161625385284424,
"learning_rate": 3.554274735830932e-05,
"loss": 0.3067,
"step": 1250
},
{
"epoch": 1.81,
"grad_norm": 3.848027467727661,
"learning_rate": 3.546269612552033e-05,
"loss": 0.3165,
"step": 1255
},
{
"epoch": 1.81,
"grad_norm": 5.625514507293701,
"learning_rate": 3.5382644892731347e-05,
"loss": 0.2792,
"step": 1260
},
{
"epoch": 1.82,
"grad_norm": 3.829505681991577,
"learning_rate": 3.530259365994236e-05,
"loss": 0.258,
"step": 1265
},
{
"epoch": 1.83,
"grad_norm": 4.038649559020996,
"learning_rate": 3.522254242715338e-05,
"loss": 0.2668,
"step": 1270
},
{
"epoch": 1.84,
"grad_norm": 3.746533155441284,
"learning_rate": 3.5142491194364395e-05,
"loss": 0.2571,
"step": 1275
},
{
"epoch": 1.84,
"grad_norm": 3.9205687046051025,
"learning_rate": 3.506243996157541e-05,
"loss": 0.2148,
"step": 1280
},
{
"epoch": 1.85,
"grad_norm": 5.464355945587158,
"learning_rate": 3.498238872878642e-05,
"loss": 0.2707,
"step": 1285
},
{
"epoch": 1.86,
"grad_norm": 4.321130752563477,
"learning_rate": 3.4902337495997436e-05,
"loss": 0.2434,
"step": 1290
},
{
"epoch": 1.86,
"grad_norm": 6.3836588859558105,
"learning_rate": 3.482228626320846e-05,
"loss": 0.2601,
"step": 1295
},
{
"epoch": 1.87,
"grad_norm": 2.9065053462982178,
"learning_rate": 3.474223503041947e-05,
"loss": 0.2033,
"step": 1300
},
{
"epoch": 1.88,
"grad_norm": 4.280132293701172,
"learning_rate": 3.4662183797630484e-05,
"loss": 0.2708,
"step": 1305
},
{
"epoch": 1.89,
"grad_norm": 5.5674262046813965,
"learning_rate": 3.4582132564841505e-05,
"loss": 0.2899,
"step": 1310
},
{
"epoch": 1.89,
"grad_norm": 4.071995735168457,
"learning_rate": 3.450208133205252e-05,
"loss": 0.2714,
"step": 1315
},
{
"epoch": 1.9,
"grad_norm": 6.83046817779541,
"learning_rate": 3.442203009926353e-05,
"loss": 0.2563,
"step": 1320
},
{
"epoch": 1.91,
"grad_norm": 4.866962432861328,
"learning_rate": 3.4341978866474546e-05,
"loss": 0.2898,
"step": 1325
},
{
"epoch": 1.91,
"grad_norm": 6.10991096496582,
"learning_rate": 3.426192763368556e-05,
"loss": 0.2927,
"step": 1330
},
{
"epoch": 1.92,
"grad_norm": 8.084212303161621,
"learning_rate": 3.4181876400896574e-05,
"loss": 0.2668,
"step": 1335
},
{
"epoch": 1.93,
"grad_norm": 2.702385902404785,
"learning_rate": 3.4101825168107594e-05,
"loss": 0.2617,
"step": 1340
},
{
"epoch": 1.94,
"grad_norm": 5.180947303771973,
"learning_rate": 3.402177393531861e-05,
"loss": 0.2411,
"step": 1345
},
{
"epoch": 1.94,
"grad_norm": 3.0766685009002686,
"learning_rate": 3.394172270252962e-05,
"loss": 0.2723,
"step": 1350
},
{
"epoch": 1.95,
"grad_norm": 3.833108901977539,
"learning_rate": 3.3861671469740636e-05,
"loss": 0.2237,
"step": 1355
},
{
"epoch": 1.96,
"grad_norm": 4.505425930023193,
"learning_rate": 3.378162023695165e-05,
"loss": 0.2685,
"step": 1360
},
{
"epoch": 1.96,
"grad_norm": 3.9498701095581055,
"learning_rate": 3.370156900416266e-05,
"loss": 0.2637,
"step": 1365
},
{
"epoch": 1.97,
"grad_norm": 6.345920562744141,
"learning_rate": 3.362151777137368e-05,
"loss": 0.2745,
"step": 1370
},
{
"epoch": 1.98,
"grad_norm": 4.702010154724121,
"learning_rate": 3.35414665385847e-05,
"loss": 0.2837,
"step": 1375
},
{
"epoch": 1.99,
"grad_norm": 4.943043231964111,
"learning_rate": 3.346141530579571e-05,
"loss": 0.2525,
"step": 1380
},
{
"epoch": 1.99,
"grad_norm": 3.8749611377716064,
"learning_rate": 3.3381364073006725e-05,
"loss": 0.2499,
"step": 1385
},
{
"epoch": 2.0,
"eval_accuracy": 0.9425101214574899,
"eval_loss": 0.1516382098197937,
"eval_runtime": 31.6984,
"eval_samples_per_second": 311.687,
"eval_steps_per_second": 9.748,
"step": 1389
},
{
"epoch": 2.0,
"grad_norm": 6.219438076019287,
"learning_rate": 3.330131284021774e-05,
"loss": 0.3071,
"step": 1390
},
{
"epoch": 2.01,
"grad_norm": 4.6552629470825195,
"learning_rate": 3.322126160742875e-05,
"loss": 0.2364,
"step": 1395
},
{
"epoch": 2.02,
"grad_norm": 3.997241497039795,
"learning_rate": 3.314121037463977e-05,
"loss": 0.249,
"step": 1400
},
{
"epoch": 2.02,
"grad_norm": 3.6796419620513916,
"learning_rate": 3.306115914185078e-05,
"loss": 0.2261,
"step": 1405
},
{
"epoch": 2.03,
"grad_norm": 3.0016541481018066,
"learning_rate": 3.29811079090618e-05,
"loss": 0.2127,
"step": 1410
},
{
"epoch": 2.04,
"grad_norm": 4.60055685043335,
"learning_rate": 3.2901056676272815e-05,
"loss": 0.2207,
"step": 1415
},
{
"epoch": 2.04,
"grad_norm": 6.432025909423828,
"learning_rate": 3.2821005443483835e-05,
"loss": 0.2088,
"step": 1420
},
{
"epoch": 2.05,
"grad_norm": 4.049763202667236,
"learning_rate": 3.274095421069485e-05,
"loss": 0.2193,
"step": 1425
},
{
"epoch": 2.06,
"grad_norm": 4.77670955657959,
"learning_rate": 3.266090297790586e-05,
"loss": 0.2666,
"step": 1430
},
{
"epoch": 2.07,
"grad_norm": 3.876225709915161,
"learning_rate": 3.258085174511688e-05,
"loss": 0.222,
"step": 1435
},
{
"epoch": 2.07,
"grad_norm": 2.917393207550049,
"learning_rate": 3.250080051232789e-05,
"loss": 0.2481,
"step": 1440
},
{
"epoch": 2.08,
"grad_norm": 3.858349084854126,
"learning_rate": 3.242074927953891e-05,
"loss": 0.2929,
"step": 1445
},
{
"epoch": 2.09,
"grad_norm": 4.08052921295166,
"learning_rate": 3.2340698046749925e-05,
"loss": 0.2081,
"step": 1450
},
{
"epoch": 2.09,
"grad_norm": 3.8843398094177246,
"learning_rate": 3.226064681396094e-05,
"loss": 0.1917,
"step": 1455
},
{
"epoch": 2.1,
"grad_norm": 4.356058597564697,
"learning_rate": 3.218059558117195e-05,
"loss": 0.2211,
"step": 1460
},
{
"epoch": 2.11,
"grad_norm": 5.629312038421631,
"learning_rate": 3.2100544348382966e-05,
"loss": 0.2704,
"step": 1465
},
{
"epoch": 2.12,
"grad_norm": 3.8312325477600098,
"learning_rate": 3.202049311559398e-05,
"loss": 0.2418,
"step": 1470
},
{
"epoch": 2.12,
"grad_norm": 3.1079790592193604,
"learning_rate": 3.1940441882804994e-05,
"loss": 0.1948,
"step": 1475
},
{
"epoch": 2.13,
"grad_norm": 4.682496547698975,
"learning_rate": 3.1860390650016015e-05,
"loss": 0.2023,
"step": 1480
},
{
"epoch": 2.14,
"grad_norm": 4.4082489013671875,
"learning_rate": 3.178033941722703e-05,
"loss": 0.2346,
"step": 1485
},
{
"epoch": 2.14,
"grad_norm": 5.721102714538574,
"learning_rate": 3.170028818443804e-05,
"loss": 0.2294,
"step": 1490
},
{
"epoch": 2.15,
"grad_norm": 3.2310311794281006,
"learning_rate": 3.1620236951649056e-05,
"loss": 0.2074,
"step": 1495
},
{
"epoch": 2.16,
"grad_norm": 5.734870433807373,
"learning_rate": 3.154018571886007e-05,
"loss": 0.2244,
"step": 1500
},
{
"epoch": 2.17,
"grad_norm": 4.256961822509766,
"learning_rate": 3.1460134486071084e-05,
"loss": 0.2208,
"step": 1505
},
{
"epoch": 2.17,
"grad_norm": 6.9470696449279785,
"learning_rate": 3.13800832532821e-05,
"loss": 0.2736,
"step": 1510
},
{
"epoch": 2.18,
"grad_norm": 2.8514010906219482,
"learning_rate": 3.130003202049312e-05,
"loss": 0.1989,
"step": 1515
},
{
"epoch": 2.19,
"grad_norm": 4.2279744148254395,
"learning_rate": 3.121998078770413e-05,
"loss": 0.2753,
"step": 1520
},
{
"epoch": 2.2,
"grad_norm": 3.349268674850464,
"learning_rate": 3.1139929554915145e-05,
"loss": 0.181,
"step": 1525
},
{
"epoch": 2.2,
"grad_norm": 4.550454616546631,
"learning_rate": 3.105987832212616e-05,
"loss": 0.2536,
"step": 1530
},
{
"epoch": 2.21,
"grad_norm": 3.7860782146453857,
"learning_rate": 3.097982708933718e-05,
"loss": 0.2331,
"step": 1535
},
{
"epoch": 2.22,
"grad_norm": 4.5719170570373535,
"learning_rate": 3.0899775856548194e-05,
"loss": 0.2408,
"step": 1540
},
{
"epoch": 2.22,
"grad_norm": 4.448012828826904,
"learning_rate": 3.081972462375921e-05,
"loss": 0.2219,
"step": 1545
},
{
"epoch": 2.23,
"grad_norm": 3.7972702980041504,
"learning_rate": 3.073967339097023e-05,
"loss": 0.2691,
"step": 1550
},
{
"epoch": 2.24,
"grad_norm": 4.268452167510986,
"learning_rate": 3.065962215818124e-05,
"loss": 0.2215,
"step": 1555
},
{
"epoch": 2.25,
"grad_norm": 4.145329475402832,
"learning_rate": 3.0579570925392256e-05,
"loss": 0.2488,
"step": 1560
},
{
"epoch": 2.25,
"grad_norm": 5.501221656799316,
"learning_rate": 3.049951969260327e-05,
"loss": 0.2441,
"step": 1565
},
{
"epoch": 2.26,
"grad_norm": 4.3408203125,
"learning_rate": 3.0419468459814283e-05,
"loss": 0.2308,
"step": 1570
},
{
"epoch": 2.27,
"grad_norm": 4.104162216186523,
"learning_rate": 3.0339417227025297e-05,
"loss": 0.2538,
"step": 1575
},
{
"epoch": 2.27,
"grad_norm": 5.441348075866699,
"learning_rate": 3.025936599423631e-05,
"loss": 0.2742,
"step": 1580
},
{
"epoch": 2.28,
"grad_norm": 3.3526971340179443,
"learning_rate": 3.017931476144733e-05,
"loss": 0.1934,
"step": 1585
},
{
"epoch": 2.29,
"grad_norm": 3.5918030738830566,
"learning_rate": 3.0099263528658345e-05,
"loss": 0.256,
"step": 1590
},
{
"epoch": 2.3,
"grad_norm": 3.9758517742156982,
"learning_rate": 3.001921229586936e-05,
"loss": 0.2096,
"step": 1595
},
{
"epoch": 2.3,
"grad_norm": 2.7759931087493896,
"learning_rate": 2.9939161063080373e-05,
"loss": 0.2545,
"step": 1600
},
{
"epoch": 2.31,
"grad_norm": 6.958917140960693,
"learning_rate": 2.9859109830291387e-05,
"loss": 0.2293,
"step": 1605
},
{
"epoch": 2.32,
"grad_norm": 4.162193775177002,
"learning_rate": 2.97790585975024e-05,
"loss": 0.2095,
"step": 1610
},
{
"epoch": 2.32,
"grad_norm": 3.701801061630249,
"learning_rate": 2.9699007364713418e-05,
"loss": 0.2339,
"step": 1615
},
{
"epoch": 2.33,
"grad_norm": 3.290947437286377,
"learning_rate": 2.9618956131924435e-05,
"loss": 0.209,
"step": 1620
},
{
"epoch": 2.34,
"grad_norm": 4.3231024742126465,
"learning_rate": 2.953890489913545e-05,
"loss": 0.2791,
"step": 1625
},
{
"epoch": 2.35,
"grad_norm": 3.6642446517944336,
"learning_rate": 2.9458853666346466e-05,
"loss": 0.2382,
"step": 1630
},
{
"epoch": 2.35,
"grad_norm": 6.942342281341553,
"learning_rate": 2.937880243355748e-05,
"loss": 0.2406,
"step": 1635
},
{
"epoch": 2.36,
"grad_norm": 3.886199712753296,
"learning_rate": 2.9298751200768493e-05,
"loss": 0.218,
"step": 1640
},
{
"epoch": 2.37,
"grad_norm": 3.8468515872955322,
"learning_rate": 2.9218699967979507e-05,
"loss": 0.2449,
"step": 1645
},
{
"epoch": 2.37,
"grad_norm": 3.2598648071289062,
"learning_rate": 2.913864873519052e-05,
"loss": 0.2276,
"step": 1650
},
{
"epoch": 2.38,
"grad_norm": 3.9356770515441895,
"learning_rate": 2.905859750240154e-05,
"loss": 0.2481,
"step": 1655
},
{
"epoch": 2.39,
"grad_norm": 5.803495407104492,
"learning_rate": 2.8978546269612555e-05,
"loss": 0.2699,
"step": 1660
},
{
"epoch": 2.4,
"grad_norm": 3.3325111865997314,
"learning_rate": 2.889849503682357e-05,
"loss": 0.2206,
"step": 1665
},
{
"epoch": 2.4,
"grad_norm": 5.40475606918335,
"learning_rate": 2.8818443804034583e-05,
"loss": 0.2295,
"step": 1670
},
{
"epoch": 2.41,
"grad_norm": 4.207846164703369,
"learning_rate": 2.8738392571245597e-05,
"loss": 0.2268,
"step": 1675
},
{
"epoch": 2.42,
"grad_norm": 3.405880928039551,
"learning_rate": 2.8658341338456614e-05,
"loss": 0.2773,
"step": 1680
},
{
"epoch": 2.43,
"grad_norm": 4.502201557159424,
"learning_rate": 2.8578290105667628e-05,
"loss": 0.2459,
"step": 1685
},
{
"epoch": 2.43,
"grad_norm": 2.8585033416748047,
"learning_rate": 2.8498238872878645e-05,
"loss": 0.2626,
"step": 1690
},
{
"epoch": 2.44,
"grad_norm": 4.774590015411377,
"learning_rate": 2.8418187640089662e-05,
"loss": 0.2242,
"step": 1695
},
{
"epoch": 2.45,
"grad_norm": 6.423954010009766,
"learning_rate": 2.8338136407300676e-05,
"loss": 0.2711,
"step": 1700
},
{
"epoch": 2.45,
"grad_norm": 5.023673057556152,
"learning_rate": 2.825808517451169e-05,
"loss": 0.2191,
"step": 1705
},
{
"epoch": 2.46,
"grad_norm": 3.246953010559082,
"learning_rate": 2.8178033941722703e-05,
"loss": 0.2032,
"step": 1710
},
{
"epoch": 2.47,
"grad_norm": 4.740121364593506,
"learning_rate": 2.8097982708933717e-05,
"loss": 0.2257,
"step": 1715
},
{
"epoch": 2.48,
"grad_norm": 4.652435302734375,
"learning_rate": 2.801793147614473e-05,
"loss": 0.2441,
"step": 1720
},
{
"epoch": 2.48,
"grad_norm": 3.7246835231781006,
"learning_rate": 2.7937880243355745e-05,
"loss": 0.2064,
"step": 1725
},
{
"epoch": 2.49,
"grad_norm": 2.8556969165802,
"learning_rate": 2.7857829010566765e-05,
"loss": 0.2002,
"step": 1730
},
{
"epoch": 2.5,
"grad_norm": 3.9338796138763428,
"learning_rate": 2.777777777777778e-05,
"loss": 0.2608,
"step": 1735
},
{
"epoch": 2.5,
"grad_norm": 3.847045660018921,
"learning_rate": 2.7697726544988796e-05,
"loss": 0.2167,
"step": 1740
},
{
"epoch": 2.51,
"grad_norm": 3.5335538387298584,
"learning_rate": 2.761767531219981e-05,
"loss": 0.1966,
"step": 1745
},
{
"epoch": 2.52,
"grad_norm": 3.702679395675659,
"learning_rate": 2.7537624079410824e-05,
"loss": 0.1865,
"step": 1750
},
{
"epoch": 2.53,
"grad_norm": 3.013113498687744,
"learning_rate": 2.7457572846621838e-05,
"loss": 0.199,
"step": 1755
},
{
"epoch": 2.53,
"grad_norm": 3.300877809524536,
"learning_rate": 2.737752161383285e-05,
"loss": 0.2504,
"step": 1760
},
{
"epoch": 2.54,
"grad_norm": 5.806422233581543,
"learning_rate": 2.7297470381043872e-05,
"loss": 0.2362,
"step": 1765
},
{
"epoch": 2.55,
"grad_norm": 6.372203826904297,
"learning_rate": 2.7217419148254886e-05,
"loss": 0.2298,
"step": 1770
},
{
"epoch": 2.55,
"grad_norm": 6.462773323059082,
"learning_rate": 2.71373679154659e-05,
"loss": 0.2367,
"step": 1775
},
{
"epoch": 2.56,
"grad_norm": 5.330246448516846,
"learning_rate": 2.7057316682676913e-05,
"loss": 0.2543,
"step": 1780
},
{
"epoch": 2.57,
"grad_norm": 4.1171956062316895,
"learning_rate": 2.6977265449887927e-05,
"loss": 0.2057,
"step": 1785
},
{
"epoch": 2.58,
"grad_norm": 3.247389316558838,
"learning_rate": 2.6897214217098944e-05,
"loss": 0.1965,
"step": 1790
},
{
"epoch": 2.58,
"grad_norm": 2.7912063598632812,
"learning_rate": 2.6817162984309958e-05,
"loss": 0.2103,
"step": 1795
},
{
"epoch": 2.59,
"grad_norm": 2.85927152633667,
"learning_rate": 2.6737111751520975e-05,
"loss": 0.2226,
"step": 1800
},
{
"epoch": 2.6,
"grad_norm": 3.5677337646484375,
"learning_rate": 2.6657060518731993e-05,
"loss": 0.2193,
"step": 1805
},
{
"epoch": 2.61,
"grad_norm": 5.31620979309082,
"learning_rate": 2.6577009285943006e-05,
"loss": 0.2569,
"step": 1810
},
{
"epoch": 2.61,
"grad_norm": 5.1970038414001465,
"learning_rate": 2.649695805315402e-05,
"loss": 0.2235,
"step": 1815
},
{
"epoch": 2.62,
"grad_norm": 3.6116130352020264,
"learning_rate": 2.6416906820365034e-05,
"loss": 0.2353,
"step": 1820
},
{
"epoch": 2.63,
"grad_norm": 4.2939043045043945,
"learning_rate": 2.6336855587576048e-05,
"loss": 0.2448,
"step": 1825
},
{
"epoch": 2.63,
"grad_norm": 3.7755072116851807,
"learning_rate": 2.625680435478706e-05,
"loss": 0.2131,
"step": 1830
},
{
"epoch": 2.64,
"grad_norm": 4.578812122344971,
"learning_rate": 2.6176753121998082e-05,
"loss": 0.2167,
"step": 1835
},
{
"epoch": 2.65,
"grad_norm": 4.904923439025879,
"learning_rate": 2.6096701889209096e-05,
"loss": 0.2228,
"step": 1840
},
{
"epoch": 2.66,
"grad_norm": 5.128912448883057,
"learning_rate": 2.601665065642011e-05,
"loss": 0.2888,
"step": 1845
},
{
"epoch": 2.66,
"grad_norm": 5.788363933563232,
"learning_rate": 2.5936599423631124e-05,
"loss": 0.2421,
"step": 1850
},
{
"epoch": 2.67,
"grad_norm": 4.001156806945801,
"learning_rate": 2.585654819084214e-05,
"loss": 0.1997,
"step": 1855
},
{
"epoch": 2.68,
"grad_norm": 4.3057475090026855,
"learning_rate": 2.5776496958053155e-05,
"loss": 0.2434,
"step": 1860
},
{
"epoch": 2.68,
"grad_norm": 3.524348258972168,
"learning_rate": 2.5696445725264168e-05,
"loss": 0.2188,
"step": 1865
},
{
"epoch": 2.69,
"grad_norm": 6.004559516906738,
"learning_rate": 2.561639449247519e-05,
"loss": 0.2426,
"step": 1870
},
{
"epoch": 2.7,
"grad_norm": 4.429930686950684,
"learning_rate": 2.5536343259686203e-05,
"loss": 0.2306,
"step": 1875
},
{
"epoch": 2.71,
"grad_norm": 5.706151008605957,
"learning_rate": 2.5456292026897216e-05,
"loss": 0.2194,
"step": 1880
},
{
"epoch": 2.71,
"grad_norm": 4.148650169372559,
"learning_rate": 2.537624079410823e-05,
"loss": 0.2683,
"step": 1885
},
{
"epoch": 2.72,
"grad_norm": 3.2449026107788086,
"learning_rate": 2.5296189561319244e-05,
"loss": 0.2539,
"step": 1890
},
{
"epoch": 2.73,
"grad_norm": 3.6404850482940674,
"learning_rate": 2.5216138328530258e-05,
"loss": 0.2221,
"step": 1895
},
{
"epoch": 2.73,
"grad_norm": 3.1382288932800293,
"learning_rate": 2.513608709574127e-05,
"loss": 0.2266,
"step": 1900
},
{
"epoch": 2.74,
"grad_norm": 8.027711868286133,
"learning_rate": 2.5056035862952292e-05,
"loss": 0.2944,
"step": 1905
},
{
"epoch": 2.75,
"grad_norm": 7.140124797821045,
"learning_rate": 2.4975984630163306e-05,
"loss": 0.2036,
"step": 1910
},
{
"epoch": 2.76,
"grad_norm": 3.4655325412750244,
"learning_rate": 2.489593339737432e-05,
"loss": 0.1955,
"step": 1915
},
{
"epoch": 2.76,
"grad_norm": 3.295433759689331,
"learning_rate": 2.4815882164585337e-05,
"loss": 0.2114,
"step": 1920
},
{
"epoch": 2.77,
"grad_norm": 3.806304931640625,
"learning_rate": 2.473583093179635e-05,
"loss": 0.206,
"step": 1925
},
{
"epoch": 2.78,
"grad_norm": 4.674000263214111,
"learning_rate": 2.4655779699007368e-05,
"loss": 0.2215,
"step": 1930
},
{
"epoch": 2.79,
"grad_norm": 3.5063233375549316,
"learning_rate": 2.4575728466218382e-05,
"loss": 0.2583,
"step": 1935
},
{
"epoch": 2.79,
"grad_norm": 3.4132816791534424,
"learning_rate": 2.4495677233429396e-05,
"loss": 0.2388,
"step": 1940
},
{
"epoch": 2.8,
"grad_norm": 3.2140300273895264,
"learning_rate": 2.441562600064041e-05,
"loss": 0.2395,
"step": 1945
},
{
"epoch": 2.81,
"grad_norm": 4.795976638793945,
"learning_rate": 2.4335574767851427e-05,
"loss": 0.2206,
"step": 1950
},
{
"epoch": 2.81,
"grad_norm": 3.491682767868042,
"learning_rate": 2.425552353506244e-05,
"loss": 0.2553,
"step": 1955
},
{
"epoch": 2.82,
"grad_norm": 4.174879550933838,
"learning_rate": 2.4175472302273454e-05,
"loss": 0.1969,
"step": 1960
},
{
"epoch": 2.83,
"grad_norm": 3.776137590408325,
"learning_rate": 2.409542106948447e-05,
"loss": 0.2276,
"step": 1965
},
{
"epoch": 2.84,
"grad_norm": 3.7050764560699463,
"learning_rate": 2.4015369836695485e-05,
"loss": 0.2001,
"step": 1970
},
{
"epoch": 2.84,
"grad_norm": 3.4648373126983643,
"learning_rate": 2.3935318603906502e-05,
"loss": 0.2538,
"step": 1975
},
{
"epoch": 2.85,
"grad_norm": 4.3064727783203125,
"learning_rate": 2.3855267371117516e-05,
"loss": 0.2579,
"step": 1980
},
{
"epoch": 2.86,
"grad_norm": 2.671032428741455,
"learning_rate": 2.3775216138328533e-05,
"loss": 0.2443,
"step": 1985
},
{
"epoch": 2.86,
"grad_norm": 4.2159013748168945,
"learning_rate": 2.3695164905539547e-05,
"loss": 0.2373,
"step": 1990
},
{
"epoch": 2.87,
"grad_norm": 3.787076711654663,
"learning_rate": 2.361511367275056e-05,
"loss": 0.2179,
"step": 1995
},
{
"epoch": 2.88,
"grad_norm": 3.971762180328369,
"learning_rate": 2.3535062439961578e-05,
"loss": 0.2356,
"step": 2000
},
{
"epoch": 2.89,
"grad_norm": 5.022749900817871,
"learning_rate": 2.3455011207172592e-05,
"loss": 0.2167,
"step": 2005
},
{
"epoch": 2.89,
"grad_norm": 4.616547107696533,
"learning_rate": 2.3374959974383606e-05,
"loss": 0.2266,
"step": 2010
},
{
"epoch": 2.9,
"grad_norm": 4.522019386291504,
"learning_rate": 2.329490874159462e-05,
"loss": 0.247,
"step": 2015
},
{
"epoch": 2.91,
"grad_norm": 5.141051292419434,
"learning_rate": 2.3214857508805637e-05,
"loss": 0.2028,
"step": 2020
},
{
"epoch": 2.91,
"grad_norm": 3.577793836593628,
"learning_rate": 2.313480627601665e-05,
"loss": 0.1924,
"step": 2025
},
{
"epoch": 2.92,
"grad_norm": 5.1364665031433105,
"learning_rate": 2.3054755043227668e-05,
"loss": 0.226,
"step": 2030
},
{
"epoch": 2.93,
"grad_norm": 3.8625662326812744,
"learning_rate": 2.297470381043868e-05,
"loss": 0.2329,
"step": 2035
},
{
"epoch": 2.94,
"grad_norm": 4.119937419891357,
"learning_rate": 2.28946525776497e-05,
"loss": 0.2037,
"step": 2040
},
{
"epoch": 2.94,
"grad_norm": 3.1188371181488037,
"learning_rate": 2.2814601344860712e-05,
"loss": 0.231,
"step": 2045
},
{
"epoch": 2.95,
"grad_norm": 4.263334274291992,
"learning_rate": 2.2734550112071726e-05,
"loss": 0.219,
"step": 2050
},
{
"epoch": 2.96,
"grad_norm": 4.002464771270752,
"learning_rate": 2.2654498879282743e-05,
"loss": 0.1927,
"step": 2055
},
{
"epoch": 2.97,
"grad_norm": 3.5694775581359863,
"learning_rate": 2.2574447646493757e-05,
"loss": 0.1803,
"step": 2060
},
{
"epoch": 2.97,
"grad_norm": 4.048843860626221,
"learning_rate": 2.249439641370477e-05,
"loss": 0.1837,
"step": 2065
},
{
"epoch": 2.98,
"grad_norm": 4.335817337036133,
"learning_rate": 2.2414345180915788e-05,
"loss": 0.227,
"step": 2070
},
{
"epoch": 2.99,
"grad_norm": 4.292420864105225,
"learning_rate": 2.2334293948126802e-05,
"loss": 0.2535,
"step": 2075
},
{
"epoch": 2.99,
"grad_norm": 3.625598430633545,
"learning_rate": 2.2254242715337816e-05,
"loss": 0.1633,
"step": 2080
},
{
"epoch": 3.0,
"eval_accuracy": 0.9487854251012146,
"eval_loss": 0.1372506320476532,
"eval_runtime": 31.8832,
"eval_samples_per_second": 309.881,
"eval_steps_per_second": 9.692,
"step": 2084
},
{
"epoch": 3.0,
"grad_norm": 4.9075140953063965,
"learning_rate": 2.217419148254883e-05,
"loss": 0.19,
"step": 2085
},
{
"epoch": 3.01,
"grad_norm": 4.76453971862793,
"learning_rate": 2.2094140249759847e-05,
"loss": 0.214,
"step": 2090
},
{
"epoch": 3.02,
"grad_norm": 3.710191011428833,
"learning_rate": 2.2014089016970864e-05,
"loss": 0.2197,
"step": 2095
},
{
"epoch": 3.02,
"grad_norm": 3.287574529647827,
"learning_rate": 2.1934037784181878e-05,
"loss": 0.1939,
"step": 2100
},
{
"epoch": 3.03,
"grad_norm": 3.7616758346557617,
"learning_rate": 2.1853986551392895e-05,
"loss": 0.209,
"step": 2105
},
{
"epoch": 3.04,
"grad_norm": 3.6096699237823486,
"learning_rate": 2.177393531860391e-05,
"loss": 0.2195,
"step": 2110
},
{
"epoch": 3.04,
"grad_norm": 4.259820461273193,
"learning_rate": 2.1693884085814922e-05,
"loss": 0.1813,
"step": 2115
},
{
"epoch": 3.05,
"grad_norm": 4.710832118988037,
"learning_rate": 2.1613832853025936e-05,
"loss": 0.2054,
"step": 2120
},
{
"epoch": 3.06,
"grad_norm": 2.757356882095337,
"learning_rate": 2.1533781620236953e-05,
"loss": 0.2276,
"step": 2125
},
{
"epoch": 3.07,
"grad_norm": 4.743321418762207,
"learning_rate": 2.1453730387447967e-05,
"loss": 0.1603,
"step": 2130
},
{
"epoch": 3.07,
"grad_norm": 3.536240339279175,
"learning_rate": 2.137367915465898e-05,
"loss": 0.1888,
"step": 2135
},
{
"epoch": 3.08,
"grad_norm": 3.635094404220581,
"learning_rate": 2.1293627921869998e-05,
"loss": 0.1841,
"step": 2140
},
{
"epoch": 3.09,
"grad_norm": 4.491457939147949,
"learning_rate": 2.1213576689081012e-05,
"loss": 0.2013,
"step": 2145
},
{
"epoch": 3.09,
"grad_norm": 5.20548152923584,
"learning_rate": 2.113352545629203e-05,
"loss": 0.1618,
"step": 2150
},
{
"epoch": 3.1,
"grad_norm": 3.6702117919921875,
"learning_rate": 2.1053474223503043e-05,
"loss": 0.2106,
"step": 2155
},
{
"epoch": 3.11,
"grad_norm": 3.9622325897216797,
"learning_rate": 2.097342299071406e-05,
"loss": 0.2488,
"step": 2160
},
{
"epoch": 3.12,
"grad_norm": 7.823854923248291,
"learning_rate": 2.0893371757925074e-05,
"loss": 0.2107,
"step": 2165
},
{
"epoch": 3.12,
"grad_norm": 5.4744791984558105,
"learning_rate": 2.0813320525136088e-05,
"loss": 0.1888,
"step": 2170
},
{
"epoch": 3.13,
"grad_norm": 3.024887800216675,
"learning_rate": 2.0733269292347105e-05,
"loss": 0.2051,
"step": 2175
},
{
"epoch": 3.14,
"grad_norm": 3.444693088531494,
"learning_rate": 2.065321805955812e-05,
"loss": 0.2404,
"step": 2180
},
{
"epoch": 3.15,
"grad_norm": 4.3029656410217285,
"learning_rate": 2.0573166826769133e-05,
"loss": 0.215,
"step": 2185
},
{
"epoch": 3.15,
"grad_norm": 4.038111209869385,
"learning_rate": 2.0493115593980146e-05,
"loss": 0.2003,
"step": 2190
},
{
"epoch": 3.16,
"grad_norm": 4.064023494720459,
"learning_rate": 2.0413064361191164e-05,
"loss": 0.1961,
"step": 2195
},
{
"epoch": 3.17,
"grad_norm": 5.2245707511901855,
"learning_rate": 2.0333013128402177e-05,
"loss": 0.2172,
"step": 2200
},
{
"epoch": 3.17,
"grad_norm": 4.670438289642334,
"learning_rate": 2.025296189561319e-05,
"loss": 0.1992,
"step": 2205
},
{
"epoch": 3.18,
"grad_norm": 4.39680290222168,
"learning_rate": 2.017291066282421e-05,
"loss": 0.2174,
"step": 2210
},
{
"epoch": 3.19,
"grad_norm": 6.914219379425049,
"learning_rate": 2.0092859430035225e-05,
"loss": 0.1968,
"step": 2215
},
{
"epoch": 3.2,
"grad_norm": 3.2190115451812744,
"learning_rate": 2.001280819724624e-05,
"loss": 0.1939,
"step": 2220
},
{
"epoch": 3.2,
"grad_norm": 3.638925075531006,
"learning_rate": 1.9932756964457253e-05,
"loss": 0.2431,
"step": 2225
},
{
"epoch": 3.21,
"grad_norm": 5.030416965484619,
"learning_rate": 1.985270573166827e-05,
"loss": 0.2094,
"step": 2230
},
{
"epoch": 3.22,
"grad_norm": 5.105839729309082,
"learning_rate": 1.9772654498879284e-05,
"loss": 0.2165,
"step": 2235
},
{
"epoch": 3.22,
"grad_norm": 4.913294315338135,
"learning_rate": 1.9692603266090298e-05,
"loss": 0.2171,
"step": 2240
},
{
"epoch": 3.23,
"grad_norm": 4.230659008026123,
"learning_rate": 1.961255203330131e-05,
"loss": 0.2088,
"step": 2245
},
{
"epoch": 3.24,
"grad_norm": 4.271526336669922,
"learning_rate": 1.953250080051233e-05,
"loss": 0.215,
"step": 2250
},
{
"epoch": 3.25,
"grad_norm": 6.460733413696289,
"learning_rate": 1.9452449567723343e-05,
"loss": 0.2241,
"step": 2255
},
{
"epoch": 3.25,
"grad_norm": 2.8896567821502686,
"learning_rate": 1.9372398334934356e-05,
"loss": 0.1587,
"step": 2260
},
{
"epoch": 3.26,
"grad_norm": 3.2169876098632812,
"learning_rate": 1.9292347102145374e-05,
"loss": 0.1587,
"step": 2265
},
{
"epoch": 3.27,
"grad_norm": 4.299535274505615,
"learning_rate": 1.921229586935639e-05,
"loss": 0.1819,
"step": 2270
},
{
"epoch": 3.27,
"grad_norm": 3.9862189292907715,
"learning_rate": 1.9132244636567405e-05,
"loss": 0.2099,
"step": 2275
},
{
"epoch": 3.28,
"grad_norm": 5.323502540588379,
"learning_rate": 1.905219340377842e-05,
"loss": 0.222,
"step": 2280
},
{
"epoch": 3.29,
"grad_norm": 3.4311234951019287,
"learning_rate": 1.8972142170989436e-05,
"loss": 0.1956,
"step": 2285
},
{
"epoch": 3.3,
"grad_norm": 4.878343105316162,
"learning_rate": 1.889209093820045e-05,
"loss": 0.1814,
"step": 2290
},
{
"epoch": 3.3,
"grad_norm": 2.903064489364624,
"learning_rate": 1.8812039705411463e-05,
"loss": 0.2397,
"step": 2295
},
{
"epoch": 3.31,
"grad_norm": 5.286783695220947,
"learning_rate": 1.873198847262248e-05,
"loss": 0.2362,
"step": 2300
},
{
"epoch": 3.32,
"grad_norm": 4.201813220977783,
"learning_rate": 1.8651937239833494e-05,
"loss": 0.2235,
"step": 2305
},
{
"epoch": 3.32,
"grad_norm": 3.4148082733154297,
"learning_rate": 1.8571886007044508e-05,
"loss": 0.1922,
"step": 2310
},
{
"epoch": 3.33,
"grad_norm": 4.562300682067871,
"learning_rate": 1.8491834774255522e-05,
"loss": 0.2013,
"step": 2315
},
{
"epoch": 3.34,
"grad_norm": 6.004905700683594,
"learning_rate": 1.841178354146654e-05,
"loss": 0.2215,
"step": 2320
},
{
"epoch": 3.35,
"grad_norm": 4.642991065979004,
"learning_rate": 1.8331732308677556e-05,
"loss": 0.2085,
"step": 2325
},
{
"epoch": 3.35,
"grad_norm": 2.796497344970703,
"learning_rate": 1.825168107588857e-05,
"loss": 0.2126,
"step": 2330
},
{
"epoch": 3.36,
"grad_norm": 6.009349346160889,
"learning_rate": 1.8171629843099587e-05,
"loss": 0.1906,
"step": 2335
},
{
"epoch": 3.37,
"grad_norm": 4.415472507476807,
"learning_rate": 1.80915786103106e-05,
"loss": 0.2013,
"step": 2340
},
{
"epoch": 3.38,
"grad_norm": 2.890207529067993,
"learning_rate": 1.8011527377521615e-05,
"loss": 0.2017,
"step": 2345
},
{
"epoch": 3.38,
"grad_norm": 3.2712149620056152,
"learning_rate": 1.793147614473263e-05,
"loss": 0.1997,
"step": 2350
},
{
"epoch": 3.39,
"grad_norm": 4.87721061706543,
"learning_rate": 1.7851424911943646e-05,
"loss": 0.1944,
"step": 2355
},
{
"epoch": 3.4,
"grad_norm": 5.590481281280518,
"learning_rate": 1.777137367915466e-05,
"loss": 0.1749,
"step": 2360
},
{
"epoch": 3.4,
"grad_norm": 3.1477975845336914,
"learning_rate": 1.7691322446365673e-05,
"loss": 0.1734,
"step": 2365
},
{
"epoch": 3.41,
"grad_norm": 4.50333309173584,
"learning_rate": 1.761127121357669e-05,
"loss": 0.244,
"step": 2370
},
{
"epoch": 3.42,
"grad_norm": 4.189910411834717,
"learning_rate": 1.7531219980787704e-05,
"loss": 0.2015,
"step": 2375
},
{
"epoch": 3.43,
"grad_norm": 4.48671817779541,
"learning_rate": 1.7451168747998718e-05,
"loss": 0.1994,
"step": 2380
},
{
"epoch": 3.43,
"grad_norm": 3.9251739978790283,
"learning_rate": 1.7371117515209735e-05,
"loss": 0.1798,
"step": 2385
},
{
"epoch": 3.44,
"grad_norm": 2.792525291442871,
"learning_rate": 1.7291066282420752e-05,
"loss": 0.1628,
"step": 2390
},
{
"epoch": 3.45,
"grad_norm": 3.325592041015625,
"learning_rate": 1.7211015049631766e-05,
"loss": 0.2069,
"step": 2395
},
{
"epoch": 3.45,
"grad_norm": 3.9942626953125,
"learning_rate": 1.713096381684278e-05,
"loss": 0.1866,
"step": 2400
},
{
"epoch": 3.46,
"grad_norm": 5.486047267913818,
"learning_rate": 1.7050912584053797e-05,
"loss": 0.2185,
"step": 2405
},
{
"epoch": 3.47,
"grad_norm": 3.5321319103240967,
"learning_rate": 1.697086135126481e-05,
"loss": 0.2068,
"step": 2410
},
{
"epoch": 3.48,
"grad_norm": 4.118142127990723,
"learning_rate": 1.6890810118475825e-05,
"loss": 0.2076,
"step": 2415
},
{
"epoch": 3.48,
"grad_norm": 4.678371906280518,
"learning_rate": 1.681075888568684e-05,
"loss": 0.1948,
"step": 2420
},
{
"epoch": 3.49,
"grad_norm": 5.298951148986816,
"learning_rate": 1.6730707652897856e-05,
"loss": 0.2142,
"step": 2425
},
{
"epoch": 3.5,
"grad_norm": 4.5779900550842285,
"learning_rate": 1.665065642010887e-05,
"loss": 0.1994,
"step": 2430
},
{
"epoch": 3.5,
"grad_norm": 4.762623310089111,
"learning_rate": 1.6570605187319883e-05,
"loss": 0.2213,
"step": 2435
},
{
"epoch": 3.51,
"grad_norm": 4.956728458404541,
"learning_rate": 1.64905539545309e-05,
"loss": 0.1818,
"step": 2440
},
{
"epoch": 3.52,
"grad_norm": 3.7195310592651367,
"learning_rate": 1.6410502721741918e-05,
"loss": 0.2171,
"step": 2445
},
{
"epoch": 3.53,
"grad_norm": 3.115422010421753,
"learning_rate": 1.633045148895293e-05,
"loss": 0.1873,
"step": 2450
},
{
"epoch": 3.53,
"grad_norm": 2.4611568450927734,
"learning_rate": 1.6250400256163945e-05,
"loss": 0.1999,
"step": 2455
},
{
"epoch": 3.54,
"grad_norm": 7.129974842071533,
"learning_rate": 1.6170349023374962e-05,
"loss": 0.2039,
"step": 2460
},
{
"epoch": 3.55,
"grad_norm": 3.4364309310913086,
"learning_rate": 1.6090297790585976e-05,
"loss": 0.2019,
"step": 2465
},
{
"epoch": 3.56,
"grad_norm": 7.869508266448975,
"learning_rate": 1.601024655779699e-05,
"loss": 0.1678,
"step": 2470
},
{
"epoch": 3.56,
"grad_norm": 4.7185378074646,
"learning_rate": 1.5930195325008007e-05,
"loss": 0.1934,
"step": 2475
},
{
"epoch": 3.57,
"grad_norm": 7.357175350189209,
"learning_rate": 1.585014409221902e-05,
"loss": 0.1998,
"step": 2480
},
{
"epoch": 3.58,
"grad_norm": 3.6080660820007324,
"learning_rate": 1.5770092859430035e-05,
"loss": 0.1949,
"step": 2485
},
{
"epoch": 3.58,
"grad_norm": 2.9534220695495605,
"learning_rate": 1.569004162664105e-05,
"loss": 0.1772,
"step": 2490
},
{
"epoch": 3.59,
"grad_norm": 4.7188401222229,
"learning_rate": 1.5609990393852066e-05,
"loss": 0.2164,
"step": 2495
},
{
"epoch": 3.6,
"grad_norm": 5.8504180908203125,
"learning_rate": 1.552993916106308e-05,
"loss": 0.2283,
"step": 2500
},
{
"epoch": 3.61,
"grad_norm": 4.23643684387207,
"learning_rate": 1.5449887928274097e-05,
"loss": 0.2003,
"step": 2505
},
{
"epoch": 3.61,
"grad_norm": 2.19675350189209,
"learning_rate": 1.5369836695485114e-05,
"loss": 0.1997,
"step": 2510
},
{
"epoch": 3.62,
"grad_norm": 5.1381330490112305,
"learning_rate": 1.5289785462696128e-05,
"loss": 0.195,
"step": 2515
},
{
"epoch": 3.63,
"grad_norm": 3.739199161529541,
"learning_rate": 1.5209734229907142e-05,
"loss": 0.1596,
"step": 2520
},
{
"epoch": 3.63,
"grad_norm": 4.581226348876953,
"learning_rate": 1.5129682997118155e-05,
"loss": 0.2086,
"step": 2525
},
{
"epoch": 3.64,
"grad_norm": 5.416107177734375,
"learning_rate": 1.5049631764329173e-05,
"loss": 0.2517,
"step": 2530
},
{
"epoch": 3.65,
"grad_norm": 6.070262908935547,
"learning_rate": 1.4969580531540186e-05,
"loss": 0.1801,
"step": 2535
},
{
"epoch": 3.66,
"grad_norm": 4.063976764678955,
"learning_rate": 1.48895292987512e-05,
"loss": 0.2302,
"step": 2540
},
{
"epoch": 3.66,
"grad_norm": 3.717087745666504,
"learning_rate": 1.4809478065962217e-05,
"loss": 0.2185,
"step": 2545
},
{
"epoch": 3.67,
"grad_norm": 3.2319772243499756,
"learning_rate": 1.4729426833173233e-05,
"loss": 0.2609,
"step": 2550
},
{
"epoch": 3.68,
"grad_norm": 3.7224340438842773,
"learning_rate": 1.4649375600384247e-05,
"loss": 0.1906,
"step": 2555
},
{
"epoch": 3.68,
"grad_norm": 6.972284317016602,
"learning_rate": 1.456932436759526e-05,
"loss": 0.2232,
"step": 2560
},
{
"epoch": 3.69,
"grad_norm": 3.514923095703125,
"learning_rate": 1.4489273134806278e-05,
"loss": 0.2081,
"step": 2565
},
{
"epoch": 3.7,
"grad_norm": 5.140145301818848,
"learning_rate": 1.4409221902017291e-05,
"loss": 0.2099,
"step": 2570
},
{
"epoch": 3.71,
"grad_norm": 2.977041482925415,
"learning_rate": 1.4329170669228307e-05,
"loss": 0.1689,
"step": 2575
},
{
"epoch": 3.71,
"grad_norm": 2.9438095092773438,
"learning_rate": 1.4249119436439322e-05,
"loss": 0.1788,
"step": 2580
},
{
"epoch": 3.72,
"grad_norm": 3.311598777770996,
"learning_rate": 1.4169068203650338e-05,
"loss": 0.1787,
"step": 2585
},
{
"epoch": 3.73,
"grad_norm": 4.066298961639404,
"learning_rate": 1.4089016970861352e-05,
"loss": 0.2049,
"step": 2590
},
{
"epoch": 3.74,
"grad_norm": 3.8641276359558105,
"learning_rate": 1.4008965738072365e-05,
"loss": 0.2064,
"step": 2595
},
{
"epoch": 3.74,
"grad_norm": 4.785098075866699,
"learning_rate": 1.3928914505283383e-05,
"loss": 0.213,
"step": 2600
},
{
"epoch": 3.75,
"grad_norm": 3.3832712173461914,
"learning_rate": 1.3848863272494398e-05,
"loss": 0.203,
"step": 2605
},
{
"epoch": 3.76,
"grad_norm": 3.8471434116363525,
"learning_rate": 1.3768812039705412e-05,
"loss": 0.2192,
"step": 2610
},
{
"epoch": 3.76,
"grad_norm": 4.769313335418701,
"learning_rate": 1.3688760806916426e-05,
"loss": 0.2191,
"step": 2615
},
{
"epoch": 3.77,
"grad_norm": 3.5882818698883057,
"learning_rate": 1.3608709574127443e-05,
"loss": 0.1952,
"step": 2620
},
{
"epoch": 3.78,
"grad_norm": 4.177798271179199,
"learning_rate": 1.3528658341338457e-05,
"loss": 0.2209,
"step": 2625
},
{
"epoch": 3.79,
"grad_norm": 5.218222618103027,
"learning_rate": 1.3448607108549472e-05,
"loss": 0.1953,
"step": 2630
},
{
"epoch": 3.79,
"grad_norm": 4.669002056121826,
"learning_rate": 1.3368555875760488e-05,
"loss": 0.2017,
"step": 2635
},
{
"epoch": 3.8,
"grad_norm": 4.992402076721191,
"learning_rate": 1.3288504642971503e-05,
"loss": 0.2702,
"step": 2640
},
{
"epoch": 3.81,
"grad_norm": 3.818152666091919,
"learning_rate": 1.3208453410182517e-05,
"loss": 0.2195,
"step": 2645
},
{
"epoch": 3.81,
"grad_norm": 3.825201988220215,
"learning_rate": 1.312840217739353e-05,
"loss": 0.2086,
"step": 2650
},
{
"epoch": 3.82,
"grad_norm": 3.2888553142547607,
"learning_rate": 1.3048350944604548e-05,
"loss": 0.1899,
"step": 2655
},
{
"epoch": 3.83,
"grad_norm": 4.896663665771484,
"learning_rate": 1.2968299711815562e-05,
"loss": 0.2154,
"step": 2660
},
{
"epoch": 3.84,
"grad_norm": 3.9895691871643066,
"learning_rate": 1.2888248479026577e-05,
"loss": 0.2251,
"step": 2665
},
{
"epoch": 3.84,
"grad_norm": 3.9652981758117676,
"learning_rate": 1.2808197246237594e-05,
"loss": 0.2116,
"step": 2670
},
{
"epoch": 3.85,
"grad_norm": 4.93154764175415,
"learning_rate": 1.2728146013448608e-05,
"loss": 0.2597,
"step": 2675
},
{
"epoch": 3.86,
"grad_norm": 4.236401081085205,
"learning_rate": 1.2648094780659622e-05,
"loss": 0.2312,
"step": 2680
},
{
"epoch": 3.86,
"grad_norm": 3.95443058013916,
"learning_rate": 1.2568043547870636e-05,
"loss": 0.1696,
"step": 2685
},
{
"epoch": 3.87,
"grad_norm": 2.7311601638793945,
"learning_rate": 1.2487992315081653e-05,
"loss": 0.1625,
"step": 2690
},
{
"epoch": 3.88,
"grad_norm": 3.6803927421569824,
"learning_rate": 1.2407941082292668e-05,
"loss": 0.2069,
"step": 2695
},
{
"epoch": 3.89,
"grad_norm": 3.391956329345703,
"learning_rate": 1.2327889849503684e-05,
"loss": 0.1779,
"step": 2700
},
{
"epoch": 3.89,
"grad_norm": 3.478215456008911,
"learning_rate": 1.2247838616714698e-05,
"loss": 0.1874,
"step": 2705
},
{
"epoch": 3.9,
"grad_norm": 2.4775846004486084,
"learning_rate": 1.2167787383925713e-05,
"loss": 0.1953,
"step": 2710
},
{
"epoch": 3.91,
"grad_norm": 4.715533256530762,
"learning_rate": 1.2087736151136727e-05,
"loss": 0.1863,
"step": 2715
},
{
"epoch": 3.92,
"grad_norm": 4.083915710449219,
"learning_rate": 1.2007684918347743e-05,
"loss": 0.1871,
"step": 2720
},
{
"epoch": 3.92,
"grad_norm": 2.535428285598755,
"learning_rate": 1.1927633685558758e-05,
"loss": 0.2084,
"step": 2725
},
{
"epoch": 3.93,
"grad_norm": 5.987590789794922,
"learning_rate": 1.1847582452769774e-05,
"loss": 0.172,
"step": 2730
},
{
"epoch": 3.94,
"grad_norm": 4.185674667358398,
"learning_rate": 1.1767531219980789e-05,
"loss": 0.2106,
"step": 2735
},
{
"epoch": 3.94,
"grad_norm": 3.0659992694854736,
"learning_rate": 1.1687479987191803e-05,
"loss": 0.1839,
"step": 2740
},
{
"epoch": 3.95,
"grad_norm": 8.405370712280273,
"learning_rate": 1.1607428754402818e-05,
"loss": 0.2449,
"step": 2745
},
{
"epoch": 3.96,
"grad_norm": 5.262624740600586,
"learning_rate": 1.1527377521613834e-05,
"loss": 0.1982,
"step": 2750
},
{
"epoch": 3.97,
"grad_norm": 3.3970797061920166,
"learning_rate": 1.144732628882485e-05,
"loss": 0.2383,
"step": 2755
},
{
"epoch": 3.97,
"grad_norm": 4.604133129119873,
"learning_rate": 1.1367275056035863e-05,
"loss": 0.211,
"step": 2760
},
{
"epoch": 3.98,
"grad_norm": 4.767920970916748,
"learning_rate": 1.1287223823246879e-05,
"loss": 0.2111,
"step": 2765
},
{
"epoch": 3.99,
"grad_norm": 4.075857162475586,
"learning_rate": 1.1207172590457894e-05,
"loss": 0.2011,
"step": 2770
},
{
"epoch": 3.99,
"grad_norm": 3.293419599533081,
"learning_rate": 1.1127121357668908e-05,
"loss": 0.1943,
"step": 2775
},
{
"epoch": 4.0,
"eval_accuracy": 0.9510121457489878,
"eval_loss": 0.131936714053154,
"eval_runtime": 31.7023,
"eval_samples_per_second": 311.649,
"eval_steps_per_second": 9.747,
"step": 2779
},
{
"epoch": 4.0,
"grad_norm": 2.756840229034424,
"learning_rate": 1.1047070124879923e-05,
"loss": 0.2012,
"step": 2780
},
{
"epoch": 4.01,
"grad_norm": 4.239038467407227,
"learning_rate": 1.0967018892090939e-05,
"loss": 0.1637,
"step": 2785
},
{
"epoch": 4.02,
"grad_norm": 3.6597139835357666,
"learning_rate": 1.0886967659301954e-05,
"loss": 0.1848,
"step": 2790
},
{
"epoch": 4.02,
"grad_norm": 3.050875425338745,
"learning_rate": 1.0806916426512968e-05,
"loss": 0.1565,
"step": 2795
},
{
"epoch": 4.03,
"grad_norm": 4.3006463050842285,
"learning_rate": 1.0726865193723984e-05,
"loss": 0.2116,
"step": 2800
},
{
"epoch": 4.04,
"grad_norm": 4.682863712310791,
"learning_rate": 1.0646813960934999e-05,
"loss": 0.1974,
"step": 2805
},
{
"epoch": 4.04,
"grad_norm": 3.8604190349578857,
"learning_rate": 1.0566762728146015e-05,
"loss": 0.1972,
"step": 2810
},
{
"epoch": 4.05,
"grad_norm": 4.325167655944824,
"learning_rate": 1.048671149535703e-05,
"loss": 0.1732,
"step": 2815
},
{
"epoch": 4.06,
"grad_norm": 6.881094932556152,
"learning_rate": 1.0406660262568044e-05,
"loss": 0.2527,
"step": 2820
},
{
"epoch": 4.07,
"grad_norm": 6.374682426452637,
"learning_rate": 1.032660902977906e-05,
"loss": 0.224,
"step": 2825
},
{
"epoch": 4.07,
"grad_norm": 3.154886245727539,
"learning_rate": 1.0246557796990073e-05,
"loss": 0.1613,
"step": 2830
},
{
"epoch": 4.08,
"grad_norm": 5.165164470672607,
"learning_rate": 1.0166506564201089e-05,
"loss": 0.225,
"step": 2835
},
{
"epoch": 4.09,
"grad_norm": 3.388165235519409,
"learning_rate": 1.0086455331412104e-05,
"loss": 0.2189,
"step": 2840
},
{
"epoch": 4.09,
"grad_norm": 4.795779705047607,
"learning_rate": 1.000640409862312e-05,
"loss": 0.2027,
"step": 2845
},
{
"epoch": 4.1,
"grad_norm": 3.341182231903076,
"learning_rate": 9.926352865834135e-06,
"loss": 0.1931,
"step": 2850
},
{
"epoch": 4.11,
"grad_norm": 1.956528902053833,
"learning_rate": 9.846301633045149e-06,
"loss": 0.2208,
"step": 2855
},
{
"epoch": 4.12,
"grad_norm": 6.8234076499938965,
"learning_rate": 9.766250400256164e-06,
"loss": 0.2388,
"step": 2860
},
{
"epoch": 4.12,
"grad_norm": 2.924370527267456,
"learning_rate": 9.686199167467178e-06,
"loss": 0.2046,
"step": 2865
},
{
"epoch": 4.13,
"grad_norm": 5.049492359161377,
"learning_rate": 9.606147934678195e-06,
"loss": 0.1876,
"step": 2870
},
{
"epoch": 4.14,
"grad_norm": 4.749929428100586,
"learning_rate": 9.52609670188921e-06,
"loss": 0.1649,
"step": 2875
},
{
"epoch": 4.15,
"grad_norm": 3.702878475189209,
"learning_rate": 9.446045469100225e-06,
"loss": 0.2309,
"step": 2880
},
{
"epoch": 4.15,
"grad_norm": 6.8818745613098145,
"learning_rate": 9.36599423631124e-06,
"loss": 0.2012,
"step": 2885
},
{
"epoch": 4.16,
"grad_norm": 3.418677568435669,
"learning_rate": 9.285943003522254e-06,
"loss": 0.2209,
"step": 2890
},
{
"epoch": 4.17,
"grad_norm": 3.8437540531158447,
"learning_rate": 9.20589177073327e-06,
"loss": 0.1668,
"step": 2895
},
{
"epoch": 4.17,
"grad_norm": 3.2534446716308594,
"learning_rate": 9.125840537944285e-06,
"loss": 0.2346,
"step": 2900
},
{
"epoch": 4.18,
"grad_norm": 4.049452781677246,
"learning_rate": 9.0457893051553e-06,
"loss": 0.1752,
"step": 2905
},
{
"epoch": 4.19,
"grad_norm": 4.121111869812012,
"learning_rate": 8.965738072366314e-06,
"loss": 0.2057,
"step": 2910
},
{
"epoch": 4.2,
"grad_norm": 5.423705577850342,
"learning_rate": 8.88568683957733e-06,
"loss": 0.1958,
"step": 2915
},
{
"epoch": 4.2,
"grad_norm": 3.153987407684326,
"learning_rate": 8.805635606788345e-06,
"loss": 0.1547,
"step": 2920
},
{
"epoch": 4.21,
"grad_norm": 3.7586491107940674,
"learning_rate": 8.725584373999359e-06,
"loss": 0.224,
"step": 2925
},
{
"epoch": 4.22,
"grad_norm": 4.077225208282471,
"learning_rate": 8.645533141210376e-06,
"loss": 0.2113,
"step": 2930
},
{
"epoch": 4.22,
"grad_norm": 6.970191478729248,
"learning_rate": 8.56548190842139e-06,
"loss": 0.2032,
"step": 2935
},
{
"epoch": 4.23,
"grad_norm": 4.3456926345825195,
"learning_rate": 8.485430675632405e-06,
"loss": 0.2111,
"step": 2940
},
{
"epoch": 4.24,
"grad_norm": 3.5162301063537598,
"learning_rate": 8.40537944284342e-06,
"loss": 0.1873,
"step": 2945
},
{
"epoch": 4.25,
"grad_norm": 5.653372764587402,
"learning_rate": 8.325328210054435e-06,
"loss": 0.178,
"step": 2950
},
{
"epoch": 4.25,
"grad_norm": 2.084319829940796,
"learning_rate": 8.24527697726545e-06,
"loss": 0.185,
"step": 2955
},
{
"epoch": 4.26,
"grad_norm": 3.9863054752349854,
"learning_rate": 8.165225744476466e-06,
"loss": 0.1945,
"step": 2960
},
{
"epoch": 4.27,
"grad_norm": 6.000556468963623,
"learning_rate": 8.085174511687481e-06,
"loss": 0.1823,
"step": 2965
},
{
"epoch": 4.27,
"grad_norm": 3.515742778778076,
"learning_rate": 8.005123278898495e-06,
"loss": 0.1957,
"step": 2970
},
{
"epoch": 4.28,
"grad_norm": 2.8108863830566406,
"learning_rate": 7.92507204610951e-06,
"loss": 0.1838,
"step": 2975
},
{
"epoch": 4.29,
"grad_norm": 5.262875556945801,
"learning_rate": 7.845020813320524e-06,
"loss": 0.2389,
"step": 2980
},
{
"epoch": 4.3,
"grad_norm": 5.4690752029418945,
"learning_rate": 7.76496958053154e-06,
"loss": 0.1823,
"step": 2985
},
{
"epoch": 4.3,
"grad_norm": 2.1274213790893555,
"learning_rate": 7.684918347742557e-06,
"loss": 0.1233,
"step": 2990
},
{
"epoch": 4.31,
"grad_norm": 6.855415344238281,
"learning_rate": 7.604867114953571e-06,
"loss": 0.2284,
"step": 2995
},
{
"epoch": 4.32,
"grad_norm": 5.152151584625244,
"learning_rate": 7.524815882164586e-06,
"loss": 0.1856,
"step": 3000
},
{
"epoch": 4.33,
"grad_norm": 4.211722373962402,
"learning_rate": 7.4447646493756e-06,
"loss": 0.2111,
"step": 3005
},
{
"epoch": 4.33,
"grad_norm": 4.821152210235596,
"learning_rate": 7.364713416586616e-06,
"loss": 0.1541,
"step": 3010
},
{
"epoch": 4.34,
"grad_norm": 3.2400951385498047,
"learning_rate": 7.28466218379763e-06,
"loss": 0.21,
"step": 3015
},
{
"epoch": 4.35,
"grad_norm": 3.82334566116333,
"learning_rate": 7.204610951008646e-06,
"loss": 0.1835,
"step": 3020
},
{
"epoch": 4.35,
"grad_norm": 4.301241397857666,
"learning_rate": 7.124559718219661e-06,
"loss": 0.2246,
"step": 3025
},
{
"epoch": 4.36,
"grad_norm": 3.4558205604553223,
"learning_rate": 7.044508485430676e-06,
"loss": 0.1766,
"step": 3030
},
{
"epoch": 4.37,
"grad_norm": 3.872791290283203,
"learning_rate": 6.964457252641691e-06,
"loss": 0.2126,
"step": 3035
},
{
"epoch": 4.38,
"grad_norm": 2.319420099258423,
"learning_rate": 6.884406019852706e-06,
"loss": 0.179,
"step": 3040
},
{
"epoch": 4.38,
"grad_norm": 6.737104892730713,
"learning_rate": 6.8043547870637215e-06,
"loss": 0.1882,
"step": 3045
},
{
"epoch": 4.39,
"grad_norm": 4.559133052825928,
"learning_rate": 6.724303554274736e-06,
"loss": 0.1808,
"step": 3050
},
{
"epoch": 4.4,
"grad_norm": 3.060370922088623,
"learning_rate": 6.644252321485752e-06,
"loss": 0.1923,
"step": 3055
},
{
"epoch": 4.4,
"grad_norm": 5.091296672821045,
"learning_rate": 6.564201088696765e-06,
"loss": 0.2012,
"step": 3060
},
{
"epoch": 4.41,
"grad_norm": 2.942782163619995,
"learning_rate": 6.484149855907781e-06,
"loss": 0.1731,
"step": 3065
},
{
"epoch": 4.42,
"grad_norm": 4.692785263061523,
"learning_rate": 6.404098623118797e-06,
"loss": 0.1765,
"step": 3070
},
{
"epoch": 4.43,
"grad_norm": 4.15416145324707,
"learning_rate": 6.324047390329811e-06,
"loss": 0.168,
"step": 3075
},
{
"epoch": 4.43,
"grad_norm": 4.836540699005127,
"learning_rate": 6.2439961575408265e-06,
"loss": 0.1884,
"step": 3080
},
{
"epoch": 4.44,
"grad_norm": 5.723465442657471,
"learning_rate": 6.163944924751842e-06,
"loss": 0.2006,
"step": 3085
},
{
"epoch": 4.45,
"grad_norm": 3.738910675048828,
"learning_rate": 6.083893691962857e-06,
"loss": 0.152,
"step": 3090
},
{
"epoch": 4.45,
"grad_norm": 4.6227641105651855,
"learning_rate": 6.003842459173871e-06,
"loss": 0.1885,
"step": 3095
},
{
"epoch": 4.46,
"grad_norm": 4.877871036529541,
"learning_rate": 5.923791226384887e-06,
"loss": 0.1635,
"step": 3100
},
{
"epoch": 4.47,
"grad_norm": 3.391716480255127,
"learning_rate": 5.843739993595901e-06,
"loss": 0.1917,
"step": 3105
},
{
"epoch": 4.48,
"grad_norm": 3.0858306884765625,
"learning_rate": 5.763688760806917e-06,
"loss": 0.1981,
"step": 3110
},
{
"epoch": 4.48,
"grad_norm": 3.075488805770874,
"learning_rate": 5.6836375280179315e-06,
"loss": 0.175,
"step": 3115
},
{
"epoch": 4.49,
"grad_norm": 4.415194988250732,
"learning_rate": 5.603586295228947e-06,
"loss": 0.2039,
"step": 3120
},
{
"epoch": 4.5,
"grad_norm": 4.507144451141357,
"learning_rate": 5.523535062439962e-06,
"loss": 0.1816,
"step": 3125
},
{
"epoch": 4.51,
"grad_norm": 4.327670097351074,
"learning_rate": 5.443483829650977e-06,
"loss": 0.2072,
"step": 3130
},
{
"epoch": 4.51,
"grad_norm": 3.314438819885254,
"learning_rate": 5.363432596861992e-06,
"loss": 0.1997,
"step": 3135
},
{
"epoch": 4.52,
"grad_norm": 3.981945753097534,
"learning_rate": 5.283381364073007e-06,
"loss": 0.1643,
"step": 3140
},
{
"epoch": 4.53,
"grad_norm": 3.4533607959747314,
"learning_rate": 5.203330131284022e-06,
"loss": 0.1503,
"step": 3145
},
{
"epoch": 4.53,
"grad_norm": 3.6115882396698,
"learning_rate": 5.123278898495037e-06,
"loss": 0.1712,
"step": 3150
},
{
"epoch": 4.54,
"grad_norm": 2.636838912963867,
"learning_rate": 5.043227665706052e-06,
"loss": 0.1828,
"step": 3155
},
{
"epoch": 4.55,
"grad_norm": 3.045761823654175,
"learning_rate": 4.9631764329170676e-06,
"loss": 0.167,
"step": 3160
},
{
"epoch": 4.56,
"grad_norm": 5.738334655761719,
"learning_rate": 4.883125200128082e-06,
"loss": 0.2237,
"step": 3165
},
{
"epoch": 4.56,
"grad_norm": 2.163240909576416,
"learning_rate": 4.803073967339098e-06,
"loss": 0.1411,
"step": 3170
},
{
"epoch": 4.57,
"grad_norm": 5.213181495666504,
"learning_rate": 4.723022734550112e-06,
"loss": 0.1874,
"step": 3175
},
{
"epoch": 4.58,
"grad_norm": 3.869131565093994,
"learning_rate": 4.642971501761127e-06,
"loss": 0.1756,
"step": 3180
},
{
"epoch": 4.58,
"grad_norm": 3.244732618331909,
"learning_rate": 4.5629202689721425e-06,
"loss": 0.1829,
"step": 3185
},
{
"epoch": 4.59,
"grad_norm": 3.5364272594451904,
"learning_rate": 4.482869036183157e-06,
"loss": 0.1861,
"step": 3190
},
{
"epoch": 4.6,
"grad_norm": 2.5283873081207275,
"learning_rate": 4.402817803394173e-06,
"loss": 0.1931,
"step": 3195
},
{
"epoch": 4.61,
"grad_norm": 3.36181902885437,
"learning_rate": 4.322766570605188e-06,
"loss": 0.2183,
"step": 3200
},
{
"epoch": 4.61,
"grad_norm": 5.513607025146484,
"learning_rate": 4.242715337816203e-06,
"loss": 0.1717,
"step": 3205
},
{
"epoch": 4.62,
"grad_norm": 5.976490497589111,
"learning_rate": 4.162664105027217e-06,
"loss": 0.202,
"step": 3210
},
{
"epoch": 4.63,
"grad_norm": 3.3449387550354004,
"learning_rate": 4.082612872238233e-06,
"loss": 0.2165,
"step": 3215
},
{
"epoch": 4.63,
"grad_norm": 3.3972129821777344,
"learning_rate": 4.0025616394492475e-06,
"loss": 0.1994,
"step": 3220
},
{
"epoch": 4.64,
"grad_norm": 4.022273540496826,
"learning_rate": 3.922510406660262e-06,
"loss": 0.168,
"step": 3225
},
{
"epoch": 4.65,
"grad_norm": 3.2063329219818115,
"learning_rate": 3.8424591738712785e-06,
"loss": 0.1862,
"step": 3230
},
{
"epoch": 4.66,
"grad_norm": 3.1869962215423584,
"learning_rate": 3.762407941082293e-06,
"loss": 0.1583,
"step": 3235
},
{
"epoch": 4.66,
"grad_norm": 3.648125171661377,
"learning_rate": 3.682356708293308e-06,
"loss": 0.2026,
"step": 3240
},
{
"epoch": 4.67,
"grad_norm": 4.182619571685791,
"learning_rate": 3.602305475504323e-06,
"loss": 0.1711,
"step": 3245
},
{
"epoch": 4.68,
"grad_norm": 3.2886900901794434,
"learning_rate": 3.522254242715338e-06,
"loss": 0.1778,
"step": 3250
},
{
"epoch": 4.69,
"grad_norm": 3.8204097747802734,
"learning_rate": 3.442203009926353e-06,
"loss": 0.1906,
"step": 3255
},
{
"epoch": 4.69,
"grad_norm": 4.073367595672607,
"learning_rate": 3.362151777137368e-06,
"loss": 0.1693,
"step": 3260
},
{
"epoch": 4.7,
"grad_norm": 4.779504299163818,
"learning_rate": 3.2821005443483827e-06,
"loss": 0.2031,
"step": 3265
},
{
"epoch": 4.71,
"grad_norm": 4.730034828186035,
"learning_rate": 3.2020493115593986e-06,
"loss": 0.1731,
"step": 3270
},
{
"epoch": 4.71,
"grad_norm": 4.198641300201416,
"learning_rate": 3.1219980787704133e-06,
"loss": 0.1982,
"step": 3275
},
{
"epoch": 4.72,
"grad_norm": 3.796201229095459,
"learning_rate": 3.0419468459814283e-06,
"loss": 0.2502,
"step": 3280
},
{
"epoch": 4.73,
"grad_norm": 3.4022860527038574,
"learning_rate": 2.9618956131924434e-06,
"loss": 0.1746,
"step": 3285
},
{
"epoch": 4.74,
"grad_norm": 3.493821859359741,
"learning_rate": 2.8818443804034585e-06,
"loss": 0.1862,
"step": 3290
},
{
"epoch": 4.74,
"grad_norm": 4.883081436157227,
"learning_rate": 2.8017931476144735e-06,
"loss": 0.1832,
"step": 3295
},
{
"epoch": 4.75,
"grad_norm": 4.014003753662109,
"learning_rate": 2.7217419148254886e-06,
"loss": 0.2232,
"step": 3300
},
{
"epoch": 4.76,
"grad_norm": 3.3797993659973145,
"learning_rate": 2.6416906820365037e-06,
"loss": 0.1791,
"step": 3305
},
{
"epoch": 4.76,
"grad_norm": 2.9076929092407227,
"learning_rate": 2.5616394492475183e-06,
"loss": 0.1557,
"step": 3310
},
{
"epoch": 4.77,
"grad_norm": 5.119110584259033,
"learning_rate": 2.4815882164585338e-06,
"loss": 0.1989,
"step": 3315
},
{
"epoch": 4.78,
"grad_norm": 3.889577627182007,
"learning_rate": 2.401536983669549e-06,
"loss": 0.1771,
"step": 3320
},
{
"epoch": 4.79,
"grad_norm": 2.979879379272461,
"learning_rate": 2.3214857508805635e-06,
"loss": 0.2187,
"step": 3325
},
{
"epoch": 4.79,
"grad_norm": 4.31455135345459,
"learning_rate": 2.2414345180915786e-06,
"loss": 0.1818,
"step": 3330
},
{
"epoch": 4.8,
"grad_norm": 5.267322540283203,
"learning_rate": 2.161383285302594e-06,
"loss": 0.1564,
"step": 3335
},
{
"epoch": 4.81,
"grad_norm": 4.620851516723633,
"learning_rate": 2.0813320525136087e-06,
"loss": 0.2058,
"step": 3340
},
{
"epoch": 4.81,
"grad_norm": 3.6133904457092285,
"learning_rate": 2.0012808197246238e-06,
"loss": 0.1678,
"step": 3345
},
{
"epoch": 4.82,
"grad_norm": 2.955531358718872,
"learning_rate": 1.9212295869356392e-06,
"loss": 0.1771,
"step": 3350
},
{
"epoch": 4.83,
"grad_norm": 5.3159403800964355,
"learning_rate": 1.841178354146654e-06,
"loss": 0.2387,
"step": 3355
},
{
"epoch": 4.84,
"grad_norm": 3.5263235569000244,
"learning_rate": 1.761127121357669e-06,
"loss": 0.2061,
"step": 3360
},
{
"epoch": 4.84,
"grad_norm": 3.794788122177124,
"learning_rate": 1.681075888568684e-06,
"loss": 0.1975,
"step": 3365
},
{
"epoch": 4.85,
"grad_norm": 3.7242631912231445,
"learning_rate": 1.6010246557796993e-06,
"loss": 0.202,
"step": 3370
},
{
"epoch": 4.86,
"grad_norm": 3.291221857070923,
"learning_rate": 1.5209734229907142e-06,
"loss": 0.1749,
"step": 3375
},
{
"epoch": 4.87,
"grad_norm": 7.191506385803223,
"learning_rate": 1.4409221902017292e-06,
"loss": 0.1787,
"step": 3380
},
{
"epoch": 4.87,
"grad_norm": 3.5962772369384766,
"learning_rate": 1.3608709574127443e-06,
"loss": 0.1894,
"step": 3385
},
{
"epoch": 4.88,
"grad_norm": 3.013857126235962,
"learning_rate": 1.2808197246237591e-06,
"loss": 0.1439,
"step": 3390
},
{
"epoch": 4.89,
"grad_norm": 3.8775179386138916,
"learning_rate": 1.2007684918347744e-06,
"loss": 0.1709,
"step": 3395
},
{
"epoch": 4.89,
"grad_norm": 5.876482963562012,
"learning_rate": 1.1207172590457893e-06,
"loss": 0.1823,
"step": 3400
},
{
"epoch": 4.9,
"grad_norm": 3.76519513130188,
"learning_rate": 1.0406660262568043e-06,
"loss": 0.1932,
"step": 3405
},
{
"epoch": 4.91,
"grad_norm": 3.4437146186828613,
"learning_rate": 9.606147934678196e-07,
"loss": 0.2059,
"step": 3410
},
{
"epoch": 4.92,
"grad_norm": 4.459022045135498,
"learning_rate": 8.805635606788345e-07,
"loss": 0.2016,
"step": 3415
},
{
"epoch": 4.92,
"grad_norm": 3.656373977661133,
"learning_rate": 8.005123278898497e-07,
"loss": 0.1869,
"step": 3420
},
{
"epoch": 4.93,
"grad_norm": 2.2337965965270996,
"learning_rate": 7.204610951008646e-07,
"loss": 0.1501,
"step": 3425
},
{
"epoch": 4.94,
"grad_norm": 5.598134994506836,
"learning_rate": 6.404098623118796e-07,
"loss": 0.1659,
"step": 3430
},
{
"epoch": 4.94,
"grad_norm": 4.543219089508057,
"learning_rate": 5.603586295228946e-07,
"loss": 0.2164,
"step": 3435
},
{
"epoch": 4.95,
"grad_norm": 4.817913055419922,
"learning_rate": 4.803073967339098e-07,
"loss": 0.1332,
"step": 3440
},
{
"epoch": 4.96,
"grad_norm": 6.280834674835205,
"learning_rate": 4.002561639449248e-07,
"loss": 0.2054,
"step": 3445
},
{
"epoch": 4.97,
"grad_norm": 3.0518364906311035,
"learning_rate": 3.202049311559398e-07,
"loss": 0.1904,
"step": 3450
},
{
"epoch": 4.97,
"grad_norm": 3.695298910140991,
"learning_rate": 2.401536983669549e-07,
"loss": 0.1784,
"step": 3455
},
{
"epoch": 4.98,
"grad_norm": 6.226070880889893,
"learning_rate": 1.601024655779699e-07,
"loss": 0.1871,
"step": 3460
},
{
"epoch": 4.99,
"grad_norm": 4.446568489074707,
"learning_rate": 8.005123278898495e-08,
"loss": 0.2494,
"step": 3465
},
{
"epoch": 4.99,
"grad_norm": 5.050913333892822,
"learning_rate": 0.0,
"loss": 0.2138,
"step": 3470
},
{
"epoch": 4.99,
"eval_accuracy": 0.9518218623481781,
"eval_loss": 0.1259032040834427,
"eval_runtime": 31.3409,
"eval_samples_per_second": 315.243,
"eval_steps_per_second": 9.859,
"step": 3470
},
{
"epoch": 4.99,
"step": 3470,
"total_flos": 1.1039888050539651e+19,
"train_loss": 0.287632371283402,
"train_runtime": 2790.3597,
"train_samples_per_second": 159.318,
"train_steps_per_second": 1.244
}
],
"logging_steps": 5,
"max_steps": 3470,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.1039888050539651e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}