ozair23's picture
End of training
d892f1d verified
{
"best_metric": 0.9777191259513872,
"best_model_checkpoint": "mobilenet_v2_1.0_224-finetuned-plantdisease/checkpoint-9164",
"epoch": 9.995635093845483,
"eval_steps": 500,
"global_step": 11450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008729812309035356,
"grad_norm": 5.370448589324951,
"learning_rate": 4.366812227074236e-07,
"loss": 3.7534,
"step": 10
},
{
"epoch": 0.017459624618070713,
"grad_norm": 5.382869720458984,
"learning_rate": 8.733624454148472e-07,
"loss": 3.7447,
"step": 20
},
{
"epoch": 0.026189436927106065,
"grad_norm": 5.5538249015808105,
"learning_rate": 1.3100436681222706e-06,
"loss": 3.7409,
"step": 30
},
{
"epoch": 0.034919249236141425,
"grad_norm": 5.5582685470581055,
"learning_rate": 1.7467248908296944e-06,
"loss": 3.7325,
"step": 40
},
{
"epoch": 0.04364906154517678,
"grad_norm": 5.354773998260498,
"learning_rate": 2.1834061135371177e-06,
"loss": 3.7135,
"step": 50
},
{
"epoch": 0.05237887385421213,
"grad_norm": 5.103704929351807,
"learning_rate": 2.6200873362445413e-06,
"loss": 3.714,
"step": 60
},
{
"epoch": 0.06110868616324749,
"grad_norm": 5.359903812408447,
"learning_rate": 3.056768558951965e-06,
"loss": 3.6754,
"step": 70
},
{
"epoch": 0.06983849847228285,
"grad_norm": 5.380093097686768,
"learning_rate": 3.493449781659389e-06,
"loss": 3.6767,
"step": 80
},
{
"epoch": 0.0785683107813182,
"grad_norm": 5.455582141876221,
"learning_rate": 3.930131004366813e-06,
"loss": 3.6431,
"step": 90
},
{
"epoch": 0.08729812309035356,
"grad_norm": 5.4371490478515625,
"learning_rate": 4.3668122270742355e-06,
"loss": 3.6285,
"step": 100
},
{
"epoch": 0.09602793539938892,
"grad_norm": 5.358748435974121,
"learning_rate": 4.80349344978166e-06,
"loss": 3.594,
"step": 110
},
{
"epoch": 0.10475774770842426,
"grad_norm": 5.466850757598877,
"learning_rate": 5.240174672489083e-06,
"loss": 3.5717,
"step": 120
},
{
"epoch": 0.11348756001745962,
"grad_norm": 5.143874168395996,
"learning_rate": 5.676855895196507e-06,
"loss": 3.5456,
"step": 130
},
{
"epoch": 0.12221737232649497,
"grad_norm": 5.1142683029174805,
"learning_rate": 6.11353711790393e-06,
"loss": 3.5045,
"step": 140
},
{
"epoch": 0.13094718463553034,
"grad_norm": 5.174585819244385,
"learning_rate": 6.550218340611354e-06,
"loss": 3.452,
"step": 150
},
{
"epoch": 0.1396769969445657,
"grad_norm": 5.233867168426514,
"learning_rate": 6.986899563318778e-06,
"loss": 3.4269,
"step": 160
},
{
"epoch": 0.14840680925360106,
"grad_norm": 4.996443271636963,
"learning_rate": 7.423580786026202e-06,
"loss": 3.3866,
"step": 170
},
{
"epoch": 0.1571366215626364,
"grad_norm": 5.243433475494385,
"learning_rate": 7.860262008733626e-06,
"loss": 3.3293,
"step": 180
},
{
"epoch": 0.16586643387167177,
"grad_norm": 5.2202911376953125,
"learning_rate": 8.296943231441049e-06,
"loss": 3.2767,
"step": 190
},
{
"epoch": 0.17459624618070713,
"grad_norm": 4.891767501831055,
"learning_rate": 8.733624454148471e-06,
"loss": 3.2698,
"step": 200
},
{
"epoch": 0.18332605848974248,
"grad_norm": 4.960419654846191,
"learning_rate": 9.170305676855896e-06,
"loss": 3.2333,
"step": 210
},
{
"epoch": 0.19205587079877784,
"grad_norm": 4.834344387054443,
"learning_rate": 9.60698689956332e-06,
"loss": 3.1378,
"step": 220
},
{
"epoch": 0.2007856831078132,
"grad_norm": 5.046371936798096,
"learning_rate": 1.0043668122270743e-05,
"loss": 3.0687,
"step": 230
},
{
"epoch": 0.20951549541684852,
"grad_norm": 5.039073467254639,
"learning_rate": 1.0480349344978165e-05,
"loss": 3.0194,
"step": 240
},
{
"epoch": 0.21824530772588388,
"grad_norm": 5.1495771408081055,
"learning_rate": 1.091703056768559e-05,
"loss": 2.9677,
"step": 250
},
{
"epoch": 0.22697512003491924,
"grad_norm": 4.857588768005371,
"learning_rate": 1.1353711790393014e-05,
"loss": 2.9015,
"step": 260
},
{
"epoch": 0.2357049323439546,
"grad_norm": 4.682951927185059,
"learning_rate": 1.1790393013100438e-05,
"loss": 2.8179,
"step": 270
},
{
"epoch": 0.24443474465298995,
"grad_norm": 4.9485602378845215,
"learning_rate": 1.222707423580786e-05,
"loss": 2.8279,
"step": 280
},
{
"epoch": 0.25316455696202533,
"grad_norm": 4.801344871520996,
"learning_rate": 1.2663755458515283e-05,
"loss": 2.7402,
"step": 290
},
{
"epoch": 0.2618943692710607,
"grad_norm": 4.586411952972412,
"learning_rate": 1.3100436681222708e-05,
"loss": 2.7108,
"step": 300
},
{
"epoch": 0.27062418158009605,
"grad_norm": 4.808084487915039,
"learning_rate": 1.3537117903930133e-05,
"loss": 2.585,
"step": 310
},
{
"epoch": 0.2793539938891314,
"grad_norm": 4.550442695617676,
"learning_rate": 1.3973799126637555e-05,
"loss": 2.6376,
"step": 320
},
{
"epoch": 0.28808380619816676,
"grad_norm": 4.389446258544922,
"learning_rate": 1.4410480349344979e-05,
"loss": 2.511,
"step": 330
},
{
"epoch": 0.2968136185072021,
"grad_norm": 4.278360366821289,
"learning_rate": 1.4847161572052404e-05,
"loss": 2.4379,
"step": 340
},
{
"epoch": 0.30554343081623747,
"grad_norm": 4.738358497619629,
"learning_rate": 1.5283842794759826e-05,
"loss": 2.387,
"step": 350
},
{
"epoch": 0.3142732431252728,
"grad_norm": 4.373560428619385,
"learning_rate": 1.572052401746725e-05,
"loss": 2.3534,
"step": 360
},
{
"epoch": 0.3230030554343082,
"grad_norm": 4.679076671600342,
"learning_rate": 1.6157205240174673e-05,
"loss": 2.321,
"step": 370
},
{
"epoch": 0.33173286774334354,
"grad_norm": 4.373522758483887,
"learning_rate": 1.6593886462882098e-05,
"loss": 2.2783,
"step": 380
},
{
"epoch": 0.3404626800523789,
"grad_norm": 4.47551155090332,
"learning_rate": 1.703056768558952e-05,
"loss": 2.1886,
"step": 390
},
{
"epoch": 0.34919249236141425,
"grad_norm": 4.3290181159973145,
"learning_rate": 1.7467248908296942e-05,
"loss": 2.1986,
"step": 400
},
{
"epoch": 0.3579223046704496,
"grad_norm": 4.084609031677246,
"learning_rate": 1.7903930131004367e-05,
"loss": 2.0736,
"step": 410
},
{
"epoch": 0.36665211697948497,
"grad_norm": 4.143791198730469,
"learning_rate": 1.8340611353711792e-05,
"loss": 2.045,
"step": 420
},
{
"epoch": 0.3753819292885203,
"grad_norm": 4.159194469451904,
"learning_rate": 1.8777292576419214e-05,
"loss": 1.9542,
"step": 430
},
{
"epoch": 0.3841117415975557,
"grad_norm": 4.298882007598877,
"learning_rate": 1.921397379912664e-05,
"loss": 1.9863,
"step": 440
},
{
"epoch": 0.39284155390659103,
"grad_norm": 4.460660934448242,
"learning_rate": 1.965065502183406e-05,
"loss": 1.9066,
"step": 450
},
{
"epoch": 0.4015713662156264,
"grad_norm": 4.154500484466553,
"learning_rate": 2.0087336244541487e-05,
"loss": 1.7901,
"step": 460
},
{
"epoch": 0.41030117852466175,
"grad_norm": 4.019309043884277,
"learning_rate": 2.052401746724891e-05,
"loss": 1.8314,
"step": 470
},
{
"epoch": 0.41903099083369705,
"grad_norm": 4.08663272857666,
"learning_rate": 2.096069868995633e-05,
"loss": 1.7671,
"step": 480
},
{
"epoch": 0.4277608031427324,
"grad_norm": 4.213662624359131,
"learning_rate": 2.1397379912663756e-05,
"loss": 1.7467,
"step": 490
},
{
"epoch": 0.43649061545176776,
"grad_norm": 3.9979076385498047,
"learning_rate": 2.183406113537118e-05,
"loss": 1.6146,
"step": 500
},
{
"epoch": 0.4452204277608031,
"grad_norm": 3.973045825958252,
"learning_rate": 2.2270742358078603e-05,
"loss": 1.6054,
"step": 510
},
{
"epoch": 0.4539502400698385,
"grad_norm": 4.335777282714844,
"learning_rate": 2.2707423580786028e-05,
"loss": 1.6053,
"step": 520
},
{
"epoch": 0.46268005237887383,
"grad_norm": 4.511986255645752,
"learning_rate": 2.3144104803493453e-05,
"loss": 1.5458,
"step": 530
},
{
"epoch": 0.4714098646879092,
"grad_norm": 3.8047099113464355,
"learning_rate": 2.3580786026200875e-05,
"loss": 1.5328,
"step": 540
},
{
"epoch": 0.48013967699694454,
"grad_norm": 3.791404962539673,
"learning_rate": 2.4017467248908297e-05,
"loss": 1.4993,
"step": 550
},
{
"epoch": 0.4888694893059799,
"grad_norm": 4.048672199249268,
"learning_rate": 2.445414847161572e-05,
"loss": 1.4274,
"step": 560
},
{
"epoch": 0.49759930161501525,
"grad_norm": 4.073417663574219,
"learning_rate": 2.4890829694323144e-05,
"loss": 1.3728,
"step": 570
},
{
"epoch": 0.5063291139240507,
"grad_norm": 4.089795112609863,
"learning_rate": 2.5327510917030566e-05,
"loss": 1.3612,
"step": 580
},
{
"epoch": 0.515058926233086,
"grad_norm": 3.6688883304595947,
"learning_rate": 2.576419213973799e-05,
"loss": 1.2784,
"step": 590
},
{
"epoch": 0.5237887385421214,
"grad_norm": 3.9120841026306152,
"learning_rate": 2.6200873362445416e-05,
"loss": 1.2878,
"step": 600
},
{
"epoch": 0.5325185508511567,
"grad_norm": 3.715569257736206,
"learning_rate": 2.663755458515284e-05,
"loss": 1.1857,
"step": 610
},
{
"epoch": 0.5412483631601921,
"grad_norm": 3.895285129547119,
"learning_rate": 2.7074235807860267e-05,
"loss": 1.1727,
"step": 620
},
{
"epoch": 0.5499781754692274,
"grad_norm": 3.671576738357544,
"learning_rate": 2.7510917030567685e-05,
"loss": 1.1431,
"step": 630
},
{
"epoch": 0.5587079877782628,
"grad_norm": 4.0363993644714355,
"learning_rate": 2.794759825327511e-05,
"loss": 1.1228,
"step": 640
},
{
"epoch": 0.5674378000872982,
"grad_norm": 4.259347915649414,
"learning_rate": 2.8384279475982532e-05,
"loss": 1.1352,
"step": 650
},
{
"epoch": 0.5761676123963335,
"grad_norm": 3.2646217346191406,
"learning_rate": 2.8820960698689958e-05,
"loss": 1.0213,
"step": 660
},
{
"epoch": 0.5848974247053689,
"grad_norm": 3.9578239917755127,
"learning_rate": 2.9257641921397383e-05,
"loss": 1.0741,
"step": 670
},
{
"epoch": 0.5936272370144042,
"grad_norm": 3.894042730331421,
"learning_rate": 2.9694323144104808e-05,
"loss": 1.0305,
"step": 680
},
{
"epoch": 0.6023570493234396,
"grad_norm": 4.07789945602417,
"learning_rate": 3.0131004366812227e-05,
"loss": 1.0185,
"step": 690
},
{
"epoch": 0.6110868616324749,
"grad_norm": 4.043015003204346,
"learning_rate": 3.056768558951965e-05,
"loss": 1.0215,
"step": 700
},
{
"epoch": 0.6198166739415103,
"grad_norm": 3.8258657455444336,
"learning_rate": 3.1004366812227074e-05,
"loss": 0.9917,
"step": 710
},
{
"epoch": 0.6285464862505457,
"grad_norm": 3.633079767227173,
"learning_rate": 3.14410480349345e-05,
"loss": 0.9212,
"step": 720
},
{
"epoch": 0.637276298559581,
"grad_norm": 4.321951866149902,
"learning_rate": 3.1877729257641924e-05,
"loss": 0.9052,
"step": 730
},
{
"epoch": 0.6460061108686164,
"grad_norm": 3.5607919692993164,
"learning_rate": 3.2314410480349346e-05,
"loss": 0.894,
"step": 740
},
{
"epoch": 0.6547359231776517,
"grad_norm": 3.680312156677246,
"learning_rate": 3.275109170305677e-05,
"loss": 0.8355,
"step": 750
},
{
"epoch": 0.6634657354866871,
"grad_norm": 4.108597755432129,
"learning_rate": 3.3187772925764197e-05,
"loss": 0.7819,
"step": 760
},
{
"epoch": 0.6721955477957224,
"grad_norm": 4.298781871795654,
"learning_rate": 3.362445414847162e-05,
"loss": 0.823,
"step": 770
},
{
"epoch": 0.6809253601047578,
"grad_norm": 3.7209835052490234,
"learning_rate": 3.406113537117904e-05,
"loss": 0.8231,
"step": 780
},
{
"epoch": 0.6896551724137931,
"grad_norm": 3.6912119388580322,
"learning_rate": 3.449781659388647e-05,
"loss": 0.7911,
"step": 790
},
{
"epoch": 0.6983849847228285,
"grad_norm": 3.601679563522339,
"learning_rate": 3.4934497816593884e-05,
"loss": 0.7401,
"step": 800
},
{
"epoch": 0.7071147970318639,
"grad_norm": 3.856973171234131,
"learning_rate": 3.537117903930131e-05,
"loss": 0.7565,
"step": 810
},
{
"epoch": 0.7158446093408992,
"grad_norm": 4.325893878936768,
"learning_rate": 3.5807860262008734e-05,
"loss": 0.7557,
"step": 820
},
{
"epoch": 0.7245744216499346,
"grad_norm": 4.065762996673584,
"learning_rate": 3.624454148471616e-05,
"loss": 0.7348,
"step": 830
},
{
"epoch": 0.7333042339589699,
"grad_norm": 3.6065175533294678,
"learning_rate": 3.6681222707423585e-05,
"loss": 0.7162,
"step": 840
},
{
"epoch": 0.7420340462680053,
"grad_norm": 3.7237186431884766,
"learning_rate": 3.711790393013101e-05,
"loss": 0.6657,
"step": 850
},
{
"epoch": 0.7507638585770406,
"grad_norm": 3.3277764320373535,
"learning_rate": 3.755458515283843e-05,
"loss": 0.6956,
"step": 860
},
{
"epoch": 0.759493670886076,
"grad_norm": 4.210085868835449,
"learning_rate": 3.799126637554585e-05,
"loss": 0.6844,
"step": 870
},
{
"epoch": 0.7682234831951114,
"grad_norm": 3.41595458984375,
"learning_rate": 3.842794759825328e-05,
"loss": 0.6489,
"step": 880
},
{
"epoch": 0.7769532955041467,
"grad_norm": 4.264623641967773,
"learning_rate": 3.88646288209607e-05,
"loss": 0.6868,
"step": 890
},
{
"epoch": 0.7856831078131821,
"grad_norm": 3.9044318199157715,
"learning_rate": 3.930131004366812e-05,
"loss": 0.6235,
"step": 900
},
{
"epoch": 0.7944129201222174,
"grad_norm": 3.767618179321289,
"learning_rate": 3.9737991266375545e-05,
"loss": 0.6105,
"step": 910
},
{
"epoch": 0.8031427324312528,
"grad_norm": 3.6234493255615234,
"learning_rate": 4.017467248908297e-05,
"loss": 0.6195,
"step": 920
},
{
"epoch": 0.8118725447402881,
"grad_norm": 3.8668875694274902,
"learning_rate": 4.0611353711790395e-05,
"loss": 0.6341,
"step": 930
},
{
"epoch": 0.8206023570493235,
"grad_norm": 3.911647081375122,
"learning_rate": 4.104803493449782e-05,
"loss": 0.6803,
"step": 940
},
{
"epoch": 0.8293321693583589,
"grad_norm": 4.5981950759887695,
"learning_rate": 4.1484716157205246e-05,
"loss": 0.5992,
"step": 950
},
{
"epoch": 0.8380619816673941,
"grad_norm": 3.940392017364502,
"learning_rate": 4.192139737991266e-05,
"loss": 0.5896,
"step": 960
},
{
"epoch": 0.8467917939764295,
"grad_norm": 3.4633290767669678,
"learning_rate": 4.235807860262009e-05,
"loss": 0.5743,
"step": 970
},
{
"epoch": 0.8555216062854648,
"grad_norm": 3.471381187438965,
"learning_rate": 4.279475982532751e-05,
"loss": 0.5745,
"step": 980
},
{
"epoch": 0.8642514185945002,
"grad_norm": 3.305868625640869,
"learning_rate": 4.323144104803494e-05,
"loss": 0.6056,
"step": 990
},
{
"epoch": 0.8729812309035355,
"grad_norm": 3.885556221008301,
"learning_rate": 4.366812227074236e-05,
"loss": 0.5638,
"step": 1000
},
{
"epoch": 0.8817110432125709,
"grad_norm": 3.944361448287964,
"learning_rate": 4.4104803493449784e-05,
"loss": 0.524,
"step": 1010
},
{
"epoch": 0.8904408555216062,
"grad_norm": 3.539358377456665,
"learning_rate": 4.4541484716157205e-05,
"loss": 0.5493,
"step": 1020
},
{
"epoch": 0.8991706678306416,
"grad_norm": 3.6482980251312256,
"learning_rate": 4.497816593886463e-05,
"loss": 0.5493,
"step": 1030
},
{
"epoch": 0.907900480139677,
"grad_norm": 3.9914190769195557,
"learning_rate": 4.5414847161572056e-05,
"loss": 0.4909,
"step": 1040
},
{
"epoch": 0.9166302924487123,
"grad_norm": 4.449773788452148,
"learning_rate": 4.585152838427948e-05,
"loss": 0.5089,
"step": 1050
},
{
"epoch": 0.9253601047577477,
"grad_norm": 4.087109565734863,
"learning_rate": 4.6288209606986906e-05,
"loss": 0.5272,
"step": 1060
},
{
"epoch": 0.934089917066783,
"grad_norm": 4.070830345153809,
"learning_rate": 4.672489082969432e-05,
"loss": 0.5054,
"step": 1070
},
{
"epoch": 0.9428197293758184,
"grad_norm": 3.926940679550171,
"learning_rate": 4.716157205240175e-05,
"loss": 0.5005,
"step": 1080
},
{
"epoch": 0.9515495416848537,
"grad_norm": 3.5378997325897217,
"learning_rate": 4.759825327510917e-05,
"loss": 0.5203,
"step": 1090
},
{
"epoch": 0.9602793539938891,
"grad_norm": 3.6640408039093018,
"learning_rate": 4.8034934497816594e-05,
"loss": 0.4391,
"step": 1100
},
{
"epoch": 0.9690091663029244,
"grad_norm": 3.9486780166625977,
"learning_rate": 4.847161572052402e-05,
"loss": 0.4608,
"step": 1110
},
{
"epoch": 0.9777389786119598,
"grad_norm": 3.081714153289795,
"learning_rate": 4.890829694323144e-05,
"loss": 0.4607,
"step": 1120
},
{
"epoch": 0.9864687909209952,
"grad_norm": 4.005092620849609,
"learning_rate": 4.9344978165938866e-05,
"loss": 0.4745,
"step": 1130
},
{
"epoch": 0.9951986032300305,
"grad_norm": 3.1498849391937256,
"learning_rate": 4.978165938864629e-05,
"loss": 0.3974,
"step": 1140
},
{
"epoch": 0.9995635093845482,
"eval_accuracy": 0.8978639823226123,
"eval_loss": 0.3598543405532837,
"eval_runtime": 62.6632,
"eval_samples_per_second": 259.993,
"eval_steps_per_second": 8.139,
"step": 1145
},
{
"epoch": 1.003928415539066,
"grad_norm": 3.5931997299194336,
"learning_rate": 4.997573993207181e-05,
"loss": 0.4407,
"step": 1150
},
{
"epoch": 1.0126582278481013,
"grad_norm": 2.662714719772339,
"learning_rate": 4.9927219796215426e-05,
"loss": 0.4129,
"step": 1160
},
{
"epoch": 1.0213880401571367,
"grad_norm": 3.9502806663513184,
"learning_rate": 4.9878699660359054e-05,
"loss": 0.4622,
"step": 1170
},
{
"epoch": 1.030117852466172,
"grad_norm": 3.2790305614471436,
"learning_rate": 4.9830179524502674e-05,
"loss": 0.4446,
"step": 1180
},
{
"epoch": 1.0388476647752074,
"grad_norm": 3.6010549068450928,
"learning_rate": 4.978165938864629e-05,
"loss": 0.4411,
"step": 1190
},
{
"epoch": 1.0475774770842428,
"grad_norm": 3.6778202056884766,
"learning_rate": 4.9733139252789915e-05,
"loss": 0.4662,
"step": 1200
},
{
"epoch": 1.0563072893932781,
"grad_norm": 3.7837071418762207,
"learning_rate": 4.968461911693353e-05,
"loss": 0.4305,
"step": 1210
},
{
"epoch": 1.0650371017023135,
"grad_norm": 3.306307554244995,
"learning_rate": 4.963609898107715e-05,
"loss": 0.4111,
"step": 1220
},
{
"epoch": 1.0737669140113488,
"grad_norm": 3.2483878135681152,
"learning_rate": 4.9587578845220763e-05,
"loss": 0.3983,
"step": 1230
},
{
"epoch": 1.0824967263203842,
"grad_norm": 3.236823558807373,
"learning_rate": 4.953905870936439e-05,
"loss": 0.386,
"step": 1240
},
{
"epoch": 1.0912265386294195,
"grad_norm": 3.8218398094177246,
"learning_rate": 4.949053857350801e-05,
"loss": 0.4355,
"step": 1250
},
{
"epoch": 1.099956350938455,
"grad_norm": 2.8009932041168213,
"learning_rate": 4.9442018437651625e-05,
"loss": 0.4203,
"step": 1260
},
{
"epoch": 1.1086861632474903,
"grad_norm": 3.2365312576293945,
"learning_rate": 4.9393498301795246e-05,
"loss": 0.3648,
"step": 1270
},
{
"epoch": 1.1174159755565256,
"grad_norm": 3.3708131313323975,
"learning_rate": 4.9344978165938866e-05,
"loss": 0.4026,
"step": 1280
},
{
"epoch": 1.126145787865561,
"grad_norm": 3.124727487564087,
"learning_rate": 4.929645803008249e-05,
"loss": 0.3873,
"step": 1290
},
{
"epoch": 1.1348756001745963,
"grad_norm": 3.1519768238067627,
"learning_rate": 4.92479378942261e-05,
"loss": 0.3864,
"step": 1300
},
{
"epoch": 1.1436054124836317,
"grad_norm": 3.662675619125366,
"learning_rate": 4.919941775836973e-05,
"loss": 0.4071,
"step": 1310
},
{
"epoch": 1.152335224792667,
"grad_norm": 3.7715182304382324,
"learning_rate": 4.915089762251335e-05,
"loss": 0.37,
"step": 1320
},
{
"epoch": 1.1610650371017024,
"grad_norm": 3.4320363998413086,
"learning_rate": 4.910237748665696e-05,
"loss": 0.4143,
"step": 1330
},
{
"epoch": 1.1697948494107377,
"grad_norm": 2.953200101852417,
"learning_rate": 4.905385735080058e-05,
"loss": 0.3522,
"step": 1340
},
{
"epoch": 1.178524661719773,
"grad_norm": 3.239183187484741,
"learning_rate": 4.90053372149442e-05,
"loss": 0.3704,
"step": 1350
},
{
"epoch": 1.1872544740288085,
"grad_norm": 3.6990933418273926,
"learning_rate": 4.8956817079087824e-05,
"loss": 0.392,
"step": 1360
},
{
"epoch": 1.1959842863378438,
"grad_norm": 3.497069835662842,
"learning_rate": 4.890829694323144e-05,
"loss": 0.3628,
"step": 1370
},
{
"epoch": 1.2047140986468792,
"grad_norm": 4.022006034851074,
"learning_rate": 4.8859776807375065e-05,
"loss": 0.3842,
"step": 1380
},
{
"epoch": 1.2134439109559145,
"grad_norm": 3.68331241607666,
"learning_rate": 4.8811256671518685e-05,
"loss": 0.3386,
"step": 1390
},
{
"epoch": 1.2221737232649499,
"grad_norm": 2.9863877296447754,
"learning_rate": 4.87627365356623e-05,
"loss": 0.3398,
"step": 1400
},
{
"epoch": 1.2309035355739852,
"grad_norm": 3.9146687984466553,
"learning_rate": 4.871421639980592e-05,
"loss": 0.3543,
"step": 1410
},
{
"epoch": 1.2396333478830206,
"grad_norm": 3.9005401134490967,
"learning_rate": 4.866569626394954e-05,
"loss": 0.4041,
"step": 1420
},
{
"epoch": 1.248363160192056,
"grad_norm": 2.9291443824768066,
"learning_rate": 4.861717612809316e-05,
"loss": 0.3363,
"step": 1430
},
{
"epoch": 1.2570929725010913,
"grad_norm": 3.2885146141052246,
"learning_rate": 4.8568655992236775e-05,
"loss": 0.375,
"step": 1440
},
{
"epoch": 1.2658227848101267,
"grad_norm": 3.188255548477173,
"learning_rate": 4.85201358563804e-05,
"loss": 0.3467,
"step": 1450
},
{
"epoch": 1.274552597119162,
"grad_norm": 3.08927059173584,
"learning_rate": 4.847161572052402e-05,
"loss": 0.3863,
"step": 1460
},
{
"epoch": 1.2832824094281974,
"grad_norm": 2.3255248069763184,
"learning_rate": 4.8423095584667636e-05,
"loss": 0.3498,
"step": 1470
},
{
"epoch": 1.2920122217372327,
"grad_norm": 3.268420696258545,
"learning_rate": 4.837457544881126e-05,
"loss": 0.3092,
"step": 1480
},
{
"epoch": 1.300742034046268,
"grad_norm": 3.7126426696777344,
"learning_rate": 4.832605531295488e-05,
"loss": 0.3402,
"step": 1490
},
{
"epoch": 1.3094718463553034,
"grad_norm": 2.976686954498291,
"learning_rate": 4.82775351770985e-05,
"loss": 0.3038,
"step": 1500
},
{
"epoch": 1.3182016586643388,
"grad_norm": 3.393733501434326,
"learning_rate": 4.822901504124212e-05,
"loss": 0.3353,
"step": 1510
},
{
"epoch": 1.3269314709733742,
"grad_norm": 3.2993807792663574,
"learning_rate": 4.818049490538574e-05,
"loss": 0.3427,
"step": 1520
},
{
"epoch": 1.3356612832824095,
"grad_norm": 3.6997506618499756,
"learning_rate": 4.813197476952936e-05,
"loss": 0.3535,
"step": 1530
},
{
"epoch": 1.3443910955914449,
"grad_norm": 3.4463653564453125,
"learning_rate": 4.808345463367297e-05,
"loss": 0.3431,
"step": 1540
},
{
"epoch": 1.3531209079004802,
"grad_norm": 3.227196455001831,
"learning_rate": 4.8034934497816594e-05,
"loss": 0.3114,
"step": 1550
},
{
"epoch": 1.3618507202095156,
"grad_norm": 3.5765860080718994,
"learning_rate": 4.7986414361960214e-05,
"loss": 0.3203,
"step": 1560
},
{
"epoch": 1.370580532518551,
"grad_norm": 4.020304203033447,
"learning_rate": 4.7937894226103835e-05,
"loss": 0.3496,
"step": 1570
},
{
"epoch": 1.3793103448275863,
"grad_norm": 3.5590262413024902,
"learning_rate": 4.7889374090247456e-05,
"loss": 0.3294,
"step": 1580
},
{
"epoch": 1.3880401571366217,
"grad_norm": 3.3192408084869385,
"learning_rate": 4.7840853954391076e-05,
"loss": 0.3334,
"step": 1590
},
{
"epoch": 1.396769969445657,
"grad_norm": 3.537508726119995,
"learning_rate": 4.77923338185347e-05,
"loss": 0.3349,
"step": 1600
},
{
"epoch": 1.4054997817546924,
"grad_norm": 4.00840425491333,
"learning_rate": 4.774381368267831e-05,
"loss": 0.334,
"step": 1610
},
{
"epoch": 1.4142295940637277,
"grad_norm": 3.3885114192962646,
"learning_rate": 4.769529354682193e-05,
"loss": 0.2888,
"step": 1620
},
{
"epoch": 1.422959406372763,
"grad_norm": 3.376528739929199,
"learning_rate": 4.764677341096555e-05,
"loss": 0.3122,
"step": 1630
},
{
"epoch": 1.4316892186817984,
"grad_norm": 3.8087868690490723,
"learning_rate": 4.759825327510917e-05,
"loss": 0.3433,
"step": 1640
},
{
"epoch": 1.4404190309908338,
"grad_norm": 2.8861641883850098,
"learning_rate": 4.754973313925279e-05,
"loss": 0.2913,
"step": 1650
},
{
"epoch": 1.4491488432998691,
"grad_norm": 3.975708246231079,
"learning_rate": 4.750121300339641e-05,
"loss": 0.3294,
"step": 1660
},
{
"epoch": 1.4578786556089045,
"grad_norm": 4.433221817016602,
"learning_rate": 4.7452692867540034e-05,
"loss": 0.2937,
"step": 1670
},
{
"epoch": 1.4666084679179399,
"grad_norm": 3.9927902221679688,
"learning_rate": 4.740417273168365e-05,
"loss": 0.2853,
"step": 1680
},
{
"epoch": 1.4753382802269752,
"grad_norm": 3.353848695755005,
"learning_rate": 4.735565259582727e-05,
"loss": 0.3477,
"step": 1690
},
{
"epoch": 1.4840680925360106,
"grad_norm": 3.72670841217041,
"learning_rate": 4.730713245997089e-05,
"loss": 0.3342,
"step": 1700
},
{
"epoch": 1.492797904845046,
"grad_norm": 3.5278804302215576,
"learning_rate": 4.725861232411451e-05,
"loss": 0.2832,
"step": 1710
},
{
"epoch": 1.5015277171540813,
"grad_norm": 3.5316507816314697,
"learning_rate": 4.721009218825813e-05,
"loss": 0.2962,
"step": 1720
},
{
"epoch": 1.5102575294631166,
"grad_norm": 3.522723913192749,
"learning_rate": 4.716157205240175e-05,
"loss": 0.3165,
"step": 1730
},
{
"epoch": 1.518987341772152,
"grad_norm": 3.0365447998046875,
"learning_rate": 4.711305191654537e-05,
"loss": 0.2921,
"step": 1740
},
{
"epoch": 1.5277171540811874,
"grad_norm": 3.355435848236084,
"learning_rate": 4.7064531780688984e-05,
"loss": 0.2887,
"step": 1750
},
{
"epoch": 1.5364469663902227,
"grad_norm": 4.480086326599121,
"learning_rate": 4.7016011644832605e-05,
"loss": 0.28,
"step": 1760
},
{
"epoch": 1.545176778699258,
"grad_norm": 3.315585136413574,
"learning_rate": 4.6967491508976226e-05,
"loss": 0.3122,
"step": 1770
},
{
"epoch": 1.5539065910082934,
"grad_norm": 3.043123483657837,
"learning_rate": 4.6918971373119846e-05,
"loss": 0.2679,
"step": 1780
},
{
"epoch": 1.5626364033173288,
"grad_norm": 2.6386468410491943,
"learning_rate": 4.687045123726347e-05,
"loss": 0.2796,
"step": 1790
},
{
"epoch": 1.5713662156263641,
"grad_norm": 3.1339259147644043,
"learning_rate": 4.682193110140709e-05,
"loss": 0.2942,
"step": 1800
},
{
"epoch": 1.5800960279353995,
"grad_norm": 3.921851873397827,
"learning_rate": 4.677341096555071e-05,
"loss": 0.3087,
"step": 1810
},
{
"epoch": 1.5888258402444349,
"grad_norm": 3.262922525405884,
"learning_rate": 4.672489082969432e-05,
"loss": 0.2879,
"step": 1820
},
{
"epoch": 1.5975556525534702,
"grad_norm": 3.206650733947754,
"learning_rate": 4.667637069383794e-05,
"loss": 0.278,
"step": 1830
},
{
"epoch": 1.6062854648625056,
"grad_norm": 3.8127830028533936,
"learning_rate": 4.662785055798157e-05,
"loss": 0.2656,
"step": 1840
},
{
"epoch": 1.615015277171541,
"grad_norm": 3.552639961242676,
"learning_rate": 4.657933042212518e-05,
"loss": 0.2858,
"step": 1850
},
{
"epoch": 1.6237450894805763,
"grad_norm": 3.771047353744507,
"learning_rate": 4.6530810286268804e-05,
"loss": 0.3176,
"step": 1860
},
{
"epoch": 1.6324749017896116,
"grad_norm": 3.8466198444366455,
"learning_rate": 4.6482290150412424e-05,
"loss": 0.2594,
"step": 1870
},
{
"epoch": 1.641204714098647,
"grad_norm": 3.0491561889648438,
"learning_rate": 4.6433770014556045e-05,
"loss": 0.3074,
"step": 1880
},
{
"epoch": 1.6499345264076823,
"grad_norm": 3.4488463401794434,
"learning_rate": 4.638524987869966e-05,
"loss": 0.2625,
"step": 1890
},
{
"epoch": 1.6586643387167177,
"grad_norm": 3.1591031551361084,
"learning_rate": 4.633672974284328e-05,
"loss": 0.2824,
"step": 1900
},
{
"epoch": 1.667394151025753,
"grad_norm": 3.56533145904541,
"learning_rate": 4.6288209606986906e-05,
"loss": 0.2855,
"step": 1910
},
{
"epoch": 1.6761239633347884,
"grad_norm": 3.567021131515503,
"learning_rate": 4.623968947113052e-05,
"loss": 0.2823,
"step": 1920
},
{
"epoch": 1.6848537756438238,
"grad_norm": 4.153507709503174,
"learning_rate": 4.619116933527414e-05,
"loss": 0.2907,
"step": 1930
},
{
"epoch": 1.6935835879528591,
"grad_norm": 4.215022087097168,
"learning_rate": 4.614264919941776e-05,
"loss": 0.2699,
"step": 1940
},
{
"epoch": 1.7023134002618945,
"grad_norm": 3.388324499130249,
"learning_rate": 4.609412906356138e-05,
"loss": 0.2915,
"step": 1950
},
{
"epoch": 1.7110432125709298,
"grad_norm": 3.654491901397705,
"learning_rate": 4.6045608927704996e-05,
"loss": 0.2804,
"step": 1960
},
{
"epoch": 1.7197730248799652,
"grad_norm": 3.2709882259368896,
"learning_rate": 4.5997088791848616e-05,
"loss": 0.271,
"step": 1970
},
{
"epoch": 1.7285028371890006,
"grad_norm": 3.5081441402435303,
"learning_rate": 4.5948568655992244e-05,
"loss": 0.2807,
"step": 1980
},
{
"epoch": 1.737232649498036,
"grad_norm": 2.9276814460754395,
"learning_rate": 4.590004852013586e-05,
"loss": 0.2756,
"step": 1990
},
{
"epoch": 1.7459624618070713,
"grad_norm": 3.9031786918640137,
"learning_rate": 4.585152838427948e-05,
"loss": 0.2964,
"step": 2000
},
{
"epoch": 1.7546922741161066,
"grad_norm": 3.184328079223633,
"learning_rate": 4.58030082484231e-05,
"loss": 0.2366,
"step": 2010
},
{
"epoch": 1.763422086425142,
"grad_norm": 3.6933434009552,
"learning_rate": 4.575448811256672e-05,
"loss": 0.2574,
"step": 2020
},
{
"epoch": 1.7721518987341773,
"grad_norm": 3.177960157394409,
"learning_rate": 4.570596797671033e-05,
"loss": 0.2602,
"step": 2030
},
{
"epoch": 1.7808817110432127,
"grad_norm": 4.310092926025391,
"learning_rate": 4.565744784085395e-05,
"loss": 0.2821,
"step": 2040
},
{
"epoch": 1.789611523352248,
"grad_norm": 2.7888848781585693,
"learning_rate": 4.560892770499758e-05,
"loss": 0.2409,
"step": 2050
},
{
"epoch": 1.7983413356612834,
"grad_norm": 3.6411709785461426,
"learning_rate": 4.5560407569141194e-05,
"loss": 0.2675,
"step": 2060
},
{
"epoch": 1.8070711479703188,
"grad_norm": 2.790893077850342,
"learning_rate": 4.5511887433284815e-05,
"loss": 0.3142,
"step": 2070
},
{
"epoch": 1.8158009602793541,
"grad_norm": 4.2732768058776855,
"learning_rate": 4.5463367297428435e-05,
"loss": 0.2925,
"step": 2080
},
{
"epoch": 1.8245307725883895,
"grad_norm": 3.220660448074341,
"learning_rate": 4.5414847161572056e-05,
"loss": 0.2705,
"step": 2090
},
{
"epoch": 1.8332605848974248,
"grad_norm": 4.008994102478027,
"learning_rate": 4.5366327025715677e-05,
"loss": 0.256,
"step": 2100
},
{
"epoch": 1.8419903972064602,
"grad_norm": 3.519592761993408,
"learning_rate": 4.531780688985929e-05,
"loss": 0.2503,
"step": 2110
},
{
"epoch": 1.8507202095154955,
"grad_norm": 3.621957540512085,
"learning_rate": 4.526928675400292e-05,
"loss": 0.2641,
"step": 2120
},
{
"epoch": 1.859450021824531,
"grad_norm": 3.7663767337799072,
"learning_rate": 4.522076661814653e-05,
"loss": 0.2256,
"step": 2130
},
{
"epoch": 1.8681798341335663,
"grad_norm": 2.5823781490325928,
"learning_rate": 4.517224648229015e-05,
"loss": 0.267,
"step": 2140
},
{
"epoch": 1.8769096464426016,
"grad_norm": 2.787856340408325,
"learning_rate": 4.512372634643377e-05,
"loss": 0.2841,
"step": 2150
},
{
"epoch": 1.885639458751637,
"grad_norm": 3.87716007232666,
"learning_rate": 4.507520621057739e-05,
"loss": 0.2001,
"step": 2160
},
{
"epoch": 1.8943692710606723,
"grad_norm": 3.641904830932617,
"learning_rate": 4.5026686074721014e-05,
"loss": 0.2586,
"step": 2170
},
{
"epoch": 1.9030990833697077,
"grad_norm": 3.6821677684783936,
"learning_rate": 4.497816593886463e-05,
"loss": 0.2698,
"step": 2180
},
{
"epoch": 1.911828895678743,
"grad_norm": 2.7858848571777344,
"learning_rate": 4.4929645803008255e-05,
"loss": 0.2734,
"step": 2190
},
{
"epoch": 1.9205587079877784,
"grad_norm": 3.4211864471435547,
"learning_rate": 4.488112566715187e-05,
"loss": 0.2387,
"step": 2200
},
{
"epoch": 1.9292885202968137,
"grad_norm": 2.879937171936035,
"learning_rate": 4.483260553129549e-05,
"loss": 0.2375,
"step": 2210
},
{
"epoch": 1.938018332605849,
"grad_norm": 3.930103063583374,
"learning_rate": 4.478408539543911e-05,
"loss": 0.2767,
"step": 2220
},
{
"epoch": 1.9467481449148845,
"grad_norm": 3.938791275024414,
"learning_rate": 4.473556525958273e-05,
"loss": 0.2536,
"step": 2230
},
{
"epoch": 1.9554779572239198,
"grad_norm": 2.574296236038208,
"learning_rate": 4.468704512372635e-05,
"loss": 0.2188,
"step": 2240
},
{
"epoch": 1.9642077695329552,
"grad_norm": 4.026519298553467,
"learning_rate": 4.4638524987869964e-05,
"loss": 0.2557,
"step": 2250
},
{
"epoch": 1.9729375818419905,
"grad_norm": 3.5013089179992676,
"learning_rate": 4.459000485201359e-05,
"loss": 0.2438,
"step": 2260
},
{
"epoch": 1.9816673941510259,
"grad_norm": 2.124563694000244,
"learning_rate": 4.4541484716157205e-05,
"loss": 0.2489,
"step": 2270
},
{
"epoch": 1.9903972064600612,
"grad_norm": 2.5535762310028076,
"learning_rate": 4.4492964580300826e-05,
"loss": 0.2468,
"step": 2280
},
{
"epoch": 1.9991270187690966,
"grad_norm": 3.7411351203918457,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.2155,
"step": 2290
},
{
"epoch": 2.0,
"eval_accuracy": 0.9602872575497177,
"eval_loss": 0.15250827372074127,
"eval_runtime": 62.7478,
"eval_samples_per_second": 259.643,
"eval_steps_per_second": 8.128,
"step": 2291
},
{
"epoch": 2.007856831078132,
"grad_norm": 3.3374812602996826,
"learning_rate": 4.439592430858807e-05,
"loss": 0.2331,
"step": 2300
},
{
"epoch": 2.0165866433871673,
"grad_norm": 2.8195152282714844,
"learning_rate": 4.434740417273169e-05,
"loss": 0.2219,
"step": 2310
},
{
"epoch": 2.0253164556962027,
"grad_norm": 3.702488899230957,
"learning_rate": 4.42988840368753e-05,
"loss": 0.2212,
"step": 2320
},
{
"epoch": 2.034046268005238,
"grad_norm": 2.593886613845825,
"learning_rate": 4.425036390101893e-05,
"loss": 0.2278,
"step": 2330
},
{
"epoch": 2.0427760803142734,
"grad_norm": 3.5325887203216553,
"learning_rate": 4.420184376516254e-05,
"loss": 0.2197,
"step": 2340
},
{
"epoch": 2.0515058926233087,
"grad_norm": 4.355841159820557,
"learning_rate": 4.415332362930616e-05,
"loss": 0.2384,
"step": 2350
},
{
"epoch": 2.060235704932344,
"grad_norm": 3.8640925884246826,
"learning_rate": 4.4104803493449784e-05,
"loss": 0.2102,
"step": 2360
},
{
"epoch": 2.0689655172413794,
"grad_norm": 3.6872246265411377,
"learning_rate": 4.4056283357593404e-05,
"loss": 0.2536,
"step": 2370
},
{
"epoch": 2.077695329550415,
"grad_norm": 3.1011269092559814,
"learning_rate": 4.4007763221737025e-05,
"loss": 0.2231,
"step": 2380
},
{
"epoch": 2.08642514185945,
"grad_norm": 3.202810525894165,
"learning_rate": 4.395924308588064e-05,
"loss": 0.2609,
"step": 2390
},
{
"epoch": 2.0951549541684855,
"grad_norm": 3.2603683471679688,
"learning_rate": 4.3910722950024266e-05,
"loss": 0.2367,
"step": 2400
},
{
"epoch": 2.103884766477521,
"grad_norm": 2.8735013008117676,
"learning_rate": 4.386220281416788e-05,
"loss": 0.2528,
"step": 2410
},
{
"epoch": 2.1126145787865562,
"grad_norm": 2.745392322540283,
"learning_rate": 4.38136826783115e-05,
"loss": 0.2259,
"step": 2420
},
{
"epoch": 2.1213443910955916,
"grad_norm": 2.783156633377075,
"learning_rate": 4.376516254245512e-05,
"loss": 0.2285,
"step": 2430
},
{
"epoch": 2.130074203404627,
"grad_norm": 3.6078295707702637,
"learning_rate": 4.371664240659874e-05,
"loss": 0.2222,
"step": 2440
},
{
"epoch": 2.1388040157136623,
"grad_norm": 3.379075527191162,
"learning_rate": 4.366812227074236e-05,
"loss": 0.2574,
"step": 2450
},
{
"epoch": 2.1475338280226977,
"grad_norm": 2.965963840484619,
"learning_rate": 4.3619602134885976e-05,
"loss": 0.2305,
"step": 2460
},
{
"epoch": 2.156263640331733,
"grad_norm": 3.110180616378784,
"learning_rate": 4.35710819990296e-05,
"loss": 0.2127,
"step": 2470
},
{
"epoch": 2.1649934526407684,
"grad_norm": 2.2609825134277344,
"learning_rate": 4.352256186317322e-05,
"loss": 0.1914,
"step": 2480
},
{
"epoch": 2.1737232649498037,
"grad_norm": 3.6740341186523438,
"learning_rate": 4.347404172731684e-05,
"loss": 0.2116,
"step": 2490
},
{
"epoch": 2.182453077258839,
"grad_norm": 4.600437641143799,
"learning_rate": 4.342552159146046e-05,
"loss": 0.2694,
"step": 2500
},
{
"epoch": 2.1911828895678744,
"grad_norm": 2.320915460586548,
"learning_rate": 4.337700145560408e-05,
"loss": 0.2063,
"step": 2510
},
{
"epoch": 2.19991270187691,
"grad_norm": 3.467024326324463,
"learning_rate": 4.33284813197477e-05,
"loss": 0.2474,
"step": 2520
},
{
"epoch": 2.208642514185945,
"grad_norm": 2.8249664306640625,
"learning_rate": 4.327996118389131e-05,
"loss": 0.2376,
"step": 2530
},
{
"epoch": 2.2173723264949805,
"grad_norm": 3.371345281600952,
"learning_rate": 4.323144104803494e-05,
"loss": 0.2286,
"step": 2540
},
{
"epoch": 2.226102138804016,
"grad_norm": 2.801051139831543,
"learning_rate": 4.3182920912178554e-05,
"loss": 0.1987,
"step": 2550
},
{
"epoch": 2.234831951113051,
"grad_norm": 2.635571002960205,
"learning_rate": 4.3134400776322174e-05,
"loss": 0.2364,
"step": 2560
},
{
"epoch": 2.2435617634220866,
"grad_norm": 3.7617597579956055,
"learning_rate": 4.3085880640465795e-05,
"loss": 0.2607,
"step": 2570
},
{
"epoch": 2.252291575731122,
"grad_norm": 2.7652156352996826,
"learning_rate": 4.3037360504609415e-05,
"loss": 0.2227,
"step": 2580
},
{
"epoch": 2.2610213880401573,
"grad_norm": 2.7807669639587402,
"learning_rate": 4.2988840368753036e-05,
"loss": 0.2403,
"step": 2590
},
{
"epoch": 2.2697512003491926,
"grad_norm": 3.653930425643921,
"learning_rate": 4.294032023289665e-05,
"loss": 0.226,
"step": 2600
},
{
"epoch": 2.278481012658228,
"grad_norm": 2.039379358291626,
"learning_rate": 4.289180009704028e-05,
"loss": 0.2299,
"step": 2610
},
{
"epoch": 2.2872108249672634,
"grad_norm": 2.568441390991211,
"learning_rate": 4.284327996118389e-05,
"loss": 0.2354,
"step": 2620
},
{
"epoch": 2.2959406372762987,
"grad_norm": 3.425060987472534,
"learning_rate": 4.279475982532751e-05,
"loss": 0.2334,
"step": 2630
},
{
"epoch": 2.304670449585334,
"grad_norm": 3.790520429611206,
"learning_rate": 4.274623968947113e-05,
"loss": 0.2126,
"step": 2640
},
{
"epoch": 2.3134002618943694,
"grad_norm": 3.2851438522338867,
"learning_rate": 4.269771955361475e-05,
"loss": 0.202,
"step": 2650
},
{
"epoch": 2.322130074203405,
"grad_norm": 2.7088074684143066,
"learning_rate": 4.264919941775837e-05,
"loss": 0.2,
"step": 2660
},
{
"epoch": 2.33085988651244,
"grad_norm": 2.6394031047821045,
"learning_rate": 4.260067928190199e-05,
"loss": 0.2323,
"step": 2670
},
{
"epoch": 2.3395896988214755,
"grad_norm": 3.6911752223968506,
"learning_rate": 4.2552159146045614e-05,
"loss": 0.2426,
"step": 2680
},
{
"epoch": 2.348319511130511,
"grad_norm": 2.896843671798706,
"learning_rate": 4.250363901018923e-05,
"loss": 0.205,
"step": 2690
},
{
"epoch": 2.357049323439546,
"grad_norm": 3.258150339126587,
"learning_rate": 4.245511887433285e-05,
"loss": 0.2154,
"step": 2700
},
{
"epoch": 2.3657791357485816,
"grad_norm": 2.3189845085144043,
"learning_rate": 4.240659873847647e-05,
"loss": 0.1969,
"step": 2710
},
{
"epoch": 2.374508948057617,
"grad_norm": 3.0700485706329346,
"learning_rate": 4.235807860262009e-05,
"loss": 0.2196,
"step": 2720
},
{
"epoch": 2.3832387603666523,
"grad_norm": 2.3715126514434814,
"learning_rate": 4.230955846676371e-05,
"loss": 0.2214,
"step": 2730
},
{
"epoch": 2.3919685726756876,
"grad_norm": 3.8778791427612305,
"learning_rate": 4.2261038330907324e-05,
"loss": 0.2229,
"step": 2740
},
{
"epoch": 2.400698384984723,
"grad_norm": 2.7153546810150146,
"learning_rate": 4.221251819505095e-05,
"loss": 0.1865,
"step": 2750
},
{
"epoch": 2.4094281972937583,
"grad_norm": 2.5187482833862305,
"learning_rate": 4.216399805919457e-05,
"loss": 0.2027,
"step": 2760
},
{
"epoch": 2.4181580096027937,
"grad_norm": 3.387876510620117,
"learning_rate": 4.2115477923338185e-05,
"loss": 0.2097,
"step": 2770
},
{
"epoch": 2.426887821911829,
"grad_norm": 2.9012184143066406,
"learning_rate": 4.2066957787481806e-05,
"loss": 0.1936,
"step": 2780
},
{
"epoch": 2.4356176342208644,
"grad_norm": 3.1651363372802734,
"learning_rate": 4.2018437651625426e-05,
"loss": 0.2453,
"step": 2790
},
{
"epoch": 2.4443474465298998,
"grad_norm": 3.4698972702026367,
"learning_rate": 4.196991751576905e-05,
"loss": 0.2172,
"step": 2800
},
{
"epoch": 2.453077258838935,
"grad_norm": 3.2747743129730225,
"learning_rate": 4.192139737991266e-05,
"loss": 0.244,
"step": 2810
},
{
"epoch": 2.4618070711479705,
"grad_norm": 4.365453720092773,
"learning_rate": 4.187287724405629e-05,
"loss": 0.2208,
"step": 2820
},
{
"epoch": 2.470536883457006,
"grad_norm": 3.2108471393585205,
"learning_rate": 4.182435710819991e-05,
"loss": 0.2253,
"step": 2830
},
{
"epoch": 2.479266695766041,
"grad_norm": 3.678192377090454,
"learning_rate": 4.177583697234352e-05,
"loss": 0.231,
"step": 2840
},
{
"epoch": 2.4879965080750766,
"grad_norm": 3.295539617538452,
"learning_rate": 4.172731683648714e-05,
"loss": 0.24,
"step": 2850
},
{
"epoch": 2.496726320384112,
"grad_norm": 3.0643677711486816,
"learning_rate": 4.1678796700630764e-05,
"loss": 0.2142,
"step": 2860
},
{
"epoch": 2.505456132693147,
"grad_norm": 3.3195557594299316,
"learning_rate": 4.1630276564774384e-05,
"loss": 0.2218,
"step": 2870
},
{
"epoch": 2.5141859450021826,
"grad_norm": 2.9350225925445557,
"learning_rate": 4.1581756428918e-05,
"loss": 0.1869,
"step": 2880
},
{
"epoch": 2.5229157573112175,
"grad_norm": 3.483301877975464,
"learning_rate": 4.1533236293061625e-05,
"loss": 0.1986,
"step": 2890
},
{
"epoch": 2.5316455696202533,
"grad_norm": 3.9944510459899902,
"learning_rate": 4.1484716157205246e-05,
"loss": 0.2112,
"step": 2900
},
{
"epoch": 2.5403753819292882,
"grad_norm": 3.825562000274658,
"learning_rate": 4.143619602134886e-05,
"loss": 0.2102,
"step": 2910
},
{
"epoch": 2.549105194238324,
"grad_norm": 3.575993776321411,
"learning_rate": 4.138767588549248e-05,
"loss": 0.2303,
"step": 2920
},
{
"epoch": 2.557835006547359,
"grad_norm": 2.6293749809265137,
"learning_rate": 4.13391557496361e-05,
"loss": 0.1759,
"step": 2930
},
{
"epoch": 2.5665648188563948,
"grad_norm": 3.9990651607513428,
"learning_rate": 4.129063561377972e-05,
"loss": 0.247,
"step": 2940
},
{
"epoch": 2.5752946311654297,
"grad_norm": 3.152682065963745,
"learning_rate": 4.1242115477923335e-05,
"loss": 0.22,
"step": 2950
},
{
"epoch": 2.5840244434744655,
"grad_norm": 2.9134416580200195,
"learning_rate": 4.119359534206696e-05,
"loss": 0.1966,
"step": 2960
},
{
"epoch": 2.5927542557835004,
"grad_norm": 4.1252288818359375,
"learning_rate": 4.114507520621058e-05,
"loss": 0.1766,
"step": 2970
},
{
"epoch": 2.601484068092536,
"grad_norm": 3.3701207637786865,
"learning_rate": 4.1096555070354197e-05,
"loss": 0.2062,
"step": 2980
},
{
"epoch": 2.610213880401571,
"grad_norm": 3.7040228843688965,
"learning_rate": 4.104803493449782e-05,
"loss": 0.1997,
"step": 2990
},
{
"epoch": 2.618943692710607,
"grad_norm": 3.8628756999969482,
"learning_rate": 4.099951479864144e-05,
"loss": 0.2126,
"step": 3000
},
{
"epoch": 2.627673505019642,
"grad_norm": 3.3776793479919434,
"learning_rate": 4.095099466278506e-05,
"loss": 0.2277,
"step": 3010
},
{
"epoch": 2.6364033173286776,
"grad_norm": 1.8289759159088135,
"learning_rate": 4.090247452692867e-05,
"loss": 0.2033,
"step": 3020
},
{
"epoch": 2.6451331296377125,
"grad_norm": 3.379987955093384,
"learning_rate": 4.08539543910723e-05,
"loss": 0.216,
"step": 3030
},
{
"epoch": 2.6538629419467483,
"grad_norm": 3.1933844089508057,
"learning_rate": 4.080543425521592e-05,
"loss": 0.175,
"step": 3040
},
{
"epoch": 2.6625927542557832,
"grad_norm": 4.522855758666992,
"learning_rate": 4.0756914119359534e-05,
"loss": 0.2276,
"step": 3050
},
{
"epoch": 2.671322566564819,
"grad_norm": 3.2988367080688477,
"learning_rate": 4.0708393983503154e-05,
"loss": 0.2112,
"step": 3060
},
{
"epoch": 2.680052378873854,
"grad_norm": 3.88380765914917,
"learning_rate": 4.0659873847646775e-05,
"loss": 0.1831,
"step": 3070
},
{
"epoch": 2.6887821911828897,
"grad_norm": 2.2652664184570312,
"learning_rate": 4.0611353711790395e-05,
"loss": 0.2062,
"step": 3080
},
{
"epoch": 2.6975120034919247,
"grad_norm": 2.7714035511016846,
"learning_rate": 4.0562833575934016e-05,
"loss": 0.2016,
"step": 3090
},
{
"epoch": 2.7062418158009605,
"grad_norm": 3.18550181388855,
"learning_rate": 4.0514313440077636e-05,
"loss": 0.2043,
"step": 3100
},
{
"epoch": 2.7149716281099954,
"grad_norm": 4.249231815338135,
"learning_rate": 4.046579330422126e-05,
"loss": 0.1868,
"step": 3110
},
{
"epoch": 2.723701440419031,
"grad_norm": 3.2999160289764404,
"learning_rate": 4.041727316836487e-05,
"loss": 0.1849,
"step": 3120
},
{
"epoch": 2.732431252728066,
"grad_norm": 3.1966731548309326,
"learning_rate": 4.036875303250849e-05,
"loss": 0.2143,
"step": 3130
},
{
"epoch": 2.741161065037102,
"grad_norm": 3.071214199066162,
"learning_rate": 4.032023289665211e-05,
"loss": 0.1541,
"step": 3140
},
{
"epoch": 2.749890877346137,
"grad_norm": 2.8251476287841797,
"learning_rate": 4.027171276079573e-05,
"loss": 0.1818,
"step": 3150
},
{
"epoch": 2.7586206896551726,
"grad_norm": 3.3325555324554443,
"learning_rate": 4.022319262493935e-05,
"loss": 0.2099,
"step": 3160
},
{
"epoch": 2.7673505019642075,
"grad_norm": 3.3313956260681152,
"learning_rate": 4.017467248908297e-05,
"loss": 0.2064,
"step": 3170
},
{
"epoch": 2.7760803142732433,
"grad_norm": 3.0525896549224854,
"learning_rate": 4.0126152353226594e-05,
"loss": 0.1882,
"step": 3180
},
{
"epoch": 2.7848101265822782,
"grad_norm": 2.8672525882720947,
"learning_rate": 4.007763221737021e-05,
"loss": 0.1722,
"step": 3190
},
{
"epoch": 2.793539938891314,
"grad_norm": 3.050518751144409,
"learning_rate": 4.002911208151383e-05,
"loss": 0.1933,
"step": 3200
},
{
"epoch": 2.802269751200349,
"grad_norm": 3.527492046356201,
"learning_rate": 3.998059194565745e-05,
"loss": 0.1614,
"step": 3210
},
{
"epoch": 2.8109995635093847,
"grad_norm": 4.246466159820557,
"learning_rate": 3.993207180980107e-05,
"loss": 0.2099,
"step": 3220
},
{
"epoch": 2.8197293758184196,
"grad_norm": 3.365166425704956,
"learning_rate": 3.988355167394469e-05,
"loss": 0.2046,
"step": 3230
},
{
"epoch": 2.8284591881274554,
"grad_norm": 2.6562325954437256,
"learning_rate": 3.983503153808831e-05,
"loss": 0.1593,
"step": 3240
},
{
"epoch": 2.8371890004364904,
"grad_norm": 3.081038236618042,
"learning_rate": 3.978651140223193e-05,
"loss": 0.1784,
"step": 3250
},
{
"epoch": 2.845918812745526,
"grad_norm": 3.1560895442962646,
"learning_rate": 3.9737991266375545e-05,
"loss": 0.2313,
"step": 3260
},
{
"epoch": 2.854648625054561,
"grad_norm": 3.1035749912261963,
"learning_rate": 3.9689471130519165e-05,
"loss": 0.1988,
"step": 3270
},
{
"epoch": 2.863378437363597,
"grad_norm": 3.2834436893463135,
"learning_rate": 3.9640950994662786e-05,
"loss": 0.2095,
"step": 3280
},
{
"epoch": 2.872108249672632,
"grad_norm": 2.712871789932251,
"learning_rate": 3.9592430858806406e-05,
"loss": 0.1723,
"step": 3290
},
{
"epoch": 2.8808380619816676,
"grad_norm": 2.6281795501708984,
"learning_rate": 3.954391072295003e-05,
"loss": 0.1921,
"step": 3300
},
{
"epoch": 2.8895678742907025,
"grad_norm": 2.3966925144195557,
"learning_rate": 3.949539058709365e-05,
"loss": 0.1927,
"step": 3310
},
{
"epoch": 2.8982976865997383,
"grad_norm": 3.192667245864868,
"learning_rate": 3.944687045123727e-05,
"loss": 0.2165,
"step": 3320
},
{
"epoch": 2.907027498908773,
"grad_norm": 3.994009256362915,
"learning_rate": 3.939835031538088e-05,
"loss": 0.2054,
"step": 3330
},
{
"epoch": 2.915757311217809,
"grad_norm": 3.6502673625946045,
"learning_rate": 3.93498301795245e-05,
"loss": 0.2229,
"step": 3340
},
{
"epoch": 2.924487123526844,
"grad_norm": 3.1017322540283203,
"learning_rate": 3.930131004366812e-05,
"loss": 0.1659,
"step": 3350
},
{
"epoch": 2.9332169358358797,
"grad_norm": 3.0115444660186768,
"learning_rate": 3.9252789907811743e-05,
"loss": 0.182,
"step": 3360
},
{
"epoch": 2.9419467481449146,
"grad_norm": 3.564761161804199,
"learning_rate": 3.9204269771955364e-05,
"loss": 0.2281,
"step": 3370
},
{
"epoch": 2.9506765604539504,
"grad_norm": 2.8959414958953857,
"learning_rate": 3.9155749636098985e-05,
"loss": 0.1896,
"step": 3380
},
{
"epoch": 2.9594063727629853,
"grad_norm": 2.9973928928375244,
"learning_rate": 3.9107229500242605e-05,
"loss": 0.2058,
"step": 3390
},
{
"epoch": 2.968136185072021,
"grad_norm": 3.4878735542297363,
"learning_rate": 3.905870936438622e-05,
"loss": 0.1839,
"step": 3400
},
{
"epoch": 2.976865997381056,
"grad_norm": 2.474815607070923,
"learning_rate": 3.901018922852984e-05,
"loss": 0.2162,
"step": 3410
},
{
"epoch": 2.985595809690092,
"grad_norm": 3.7291135787963867,
"learning_rate": 3.896166909267347e-05,
"loss": 0.1895,
"step": 3420
},
{
"epoch": 2.9943256219991268,
"grad_norm": 3.1307461261749268,
"learning_rate": 3.891314895681708e-05,
"loss": 0.2058,
"step": 3430
},
{
"epoch": 2.999563509384548,
"eval_accuracy": 0.9558679106309845,
"eval_loss": 0.14920948445796967,
"eval_runtime": 61.5134,
"eval_samples_per_second": 264.853,
"eval_steps_per_second": 8.291,
"step": 3436
},
{
"epoch": 3.0030554343081626,
"grad_norm": 3.64821457862854,
"learning_rate": 3.88646288209607e-05,
"loss": 0.1772,
"step": 3440
},
{
"epoch": 3.011785246617198,
"grad_norm": 2.404139280319214,
"learning_rate": 3.881610868510432e-05,
"loss": 0.1475,
"step": 3450
},
{
"epoch": 3.0205150589262333,
"grad_norm": 2.3943281173706055,
"learning_rate": 3.876758854924794e-05,
"loss": 0.1839,
"step": 3460
},
{
"epoch": 3.0292448712352686,
"grad_norm": 2.9818949699401855,
"learning_rate": 3.8719068413391556e-05,
"loss": 0.1981,
"step": 3470
},
{
"epoch": 3.037974683544304,
"grad_norm": 4.090831279754639,
"learning_rate": 3.8670548277535176e-05,
"loss": 0.1597,
"step": 3480
},
{
"epoch": 3.0467044958533394,
"grad_norm": 3.2076456546783447,
"learning_rate": 3.8622028141678804e-05,
"loss": 0.2012,
"step": 3490
},
{
"epoch": 3.0554343081623747,
"grad_norm": 3.2840895652770996,
"learning_rate": 3.857350800582242e-05,
"loss": 0.1901,
"step": 3500
},
{
"epoch": 3.06416412047141,
"grad_norm": 2.6999125480651855,
"learning_rate": 3.852498786996604e-05,
"loss": 0.1808,
"step": 3510
},
{
"epoch": 3.0728939327804454,
"grad_norm": 2.939896821975708,
"learning_rate": 3.847646773410966e-05,
"loss": 0.185,
"step": 3520
},
{
"epoch": 3.081623745089481,
"grad_norm": 3.7036166191101074,
"learning_rate": 3.842794759825328e-05,
"loss": 0.2125,
"step": 3530
},
{
"epoch": 3.090353557398516,
"grad_norm": 3.631962299346924,
"learning_rate": 3.837942746239689e-05,
"loss": 0.1828,
"step": 3540
},
{
"epoch": 3.0990833697075515,
"grad_norm": 2.847594976425171,
"learning_rate": 3.8330907326540513e-05,
"loss": 0.1442,
"step": 3550
},
{
"epoch": 3.107813182016587,
"grad_norm": 2.8544461727142334,
"learning_rate": 3.828238719068414e-05,
"loss": 0.1772,
"step": 3560
},
{
"epoch": 3.116542994325622,
"grad_norm": 2.833056688308716,
"learning_rate": 3.8233867054827755e-05,
"loss": 0.1847,
"step": 3570
},
{
"epoch": 3.1252728066346576,
"grad_norm": 3.2772698402404785,
"learning_rate": 3.8185346918971375e-05,
"loss": 0.1872,
"step": 3580
},
{
"epoch": 3.1340026189436925,
"grad_norm": 2.8444337844848633,
"learning_rate": 3.8136826783114996e-05,
"loss": 0.1829,
"step": 3590
},
{
"epoch": 3.1427324312527283,
"grad_norm": 4.018392562866211,
"learning_rate": 3.8088306647258616e-05,
"loss": 0.1709,
"step": 3600
},
{
"epoch": 3.151462243561763,
"grad_norm": 3.0900518894195557,
"learning_rate": 3.803978651140223e-05,
"loss": 0.1551,
"step": 3610
},
{
"epoch": 3.160192055870799,
"grad_norm": 2.1413562297821045,
"learning_rate": 3.799126637554585e-05,
"loss": 0.1867,
"step": 3620
},
{
"epoch": 3.168921868179834,
"grad_norm": 3.050671100616455,
"learning_rate": 3.794274623968948e-05,
"loss": 0.1884,
"step": 3630
},
{
"epoch": 3.1776516804888697,
"grad_norm": 3.41404390335083,
"learning_rate": 3.789422610383309e-05,
"loss": 0.1864,
"step": 3640
},
{
"epoch": 3.1863814927979046,
"grad_norm": 3.2197773456573486,
"learning_rate": 3.784570596797671e-05,
"loss": 0.1954,
"step": 3650
},
{
"epoch": 3.1951113051069404,
"grad_norm": 4.097252368927002,
"learning_rate": 3.779718583212033e-05,
"loss": 0.1938,
"step": 3660
},
{
"epoch": 3.2038411174159753,
"grad_norm": 3.3386523723602295,
"learning_rate": 3.774866569626395e-05,
"loss": 0.1995,
"step": 3670
},
{
"epoch": 3.212570929725011,
"grad_norm": 2.2313122749328613,
"learning_rate": 3.770014556040757e-05,
"loss": 0.1557,
"step": 3680
},
{
"epoch": 3.221300742034046,
"grad_norm": 2.797321081161499,
"learning_rate": 3.765162542455119e-05,
"loss": 0.1771,
"step": 3690
},
{
"epoch": 3.230030554343082,
"grad_norm": 3.4761507511138916,
"learning_rate": 3.7603105288694815e-05,
"loss": 0.1603,
"step": 3700
},
{
"epoch": 3.2387603666521168,
"grad_norm": 3.6318702697753906,
"learning_rate": 3.755458515283843e-05,
"loss": 0.1596,
"step": 3710
},
{
"epoch": 3.2474901789611526,
"grad_norm": 2.6632442474365234,
"learning_rate": 3.750606501698205e-05,
"loss": 0.1559,
"step": 3720
},
{
"epoch": 3.2562199912701875,
"grad_norm": 3.2897541522979736,
"learning_rate": 3.745754488112567e-05,
"loss": 0.149,
"step": 3730
},
{
"epoch": 3.2649498035792233,
"grad_norm": 2.372366189956665,
"learning_rate": 3.740902474526929e-05,
"loss": 0.1909,
"step": 3740
},
{
"epoch": 3.273679615888258,
"grad_norm": 4.001101493835449,
"learning_rate": 3.736050460941291e-05,
"loss": 0.1891,
"step": 3750
},
{
"epoch": 3.282409428197294,
"grad_norm": 3.442195177078247,
"learning_rate": 3.7311984473556525e-05,
"loss": 0.1883,
"step": 3760
},
{
"epoch": 3.291139240506329,
"grad_norm": 3.3341877460479736,
"learning_rate": 3.726346433770015e-05,
"loss": 0.183,
"step": 3770
},
{
"epoch": 3.2998690528153647,
"grad_norm": 2.7835216522216797,
"learning_rate": 3.7214944201843766e-05,
"loss": 0.1593,
"step": 3780
},
{
"epoch": 3.3085988651243996,
"grad_norm": 4.420855522155762,
"learning_rate": 3.7166424065987386e-05,
"loss": 0.1795,
"step": 3790
},
{
"epoch": 3.3173286774334354,
"grad_norm": 2.7410755157470703,
"learning_rate": 3.711790393013101e-05,
"loss": 0.1519,
"step": 3800
},
{
"epoch": 3.3260584897424703,
"grad_norm": 3.839470863342285,
"learning_rate": 3.706938379427463e-05,
"loss": 0.2107,
"step": 3810
},
{
"epoch": 3.334788302051506,
"grad_norm": 2.829495668411255,
"learning_rate": 3.702086365841825e-05,
"loss": 0.1765,
"step": 3820
},
{
"epoch": 3.343518114360541,
"grad_norm": 3.0382578372955322,
"learning_rate": 3.697234352256186e-05,
"loss": 0.1899,
"step": 3830
},
{
"epoch": 3.352247926669577,
"grad_norm": 2.4491844177246094,
"learning_rate": 3.692382338670549e-05,
"loss": 0.1659,
"step": 3840
},
{
"epoch": 3.3609777389786117,
"grad_norm": 3.5624337196350098,
"learning_rate": 3.68753032508491e-05,
"loss": 0.1681,
"step": 3850
},
{
"epoch": 3.3697075512876475,
"grad_norm": 3.9547486305236816,
"learning_rate": 3.682678311499272e-05,
"loss": 0.2041,
"step": 3860
},
{
"epoch": 3.3784373635966825,
"grad_norm": 3.0073511600494385,
"learning_rate": 3.6778262979136344e-05,
"loss": 0.1754,
"step": 3870
},
{
"epoch": 3.3871671759057183,
"grad_norm": 3.345168352127075,
"learning_rate": 3.6729742843279964e-05,
"loss": 0.1714,
"step": 3880
},
{
"epoch": 3.395896988214753,
"grad_norm": 2.464186668395996,
"learning_rate": 3.6681222707423585e-05,
"loss": 0.1861,
"step": 3890
},
{
"epoch": 3.404626800523789,
"grad_norm": 2.6841201782226562,
"learning_rate": 3.66327025715672e-05,
"loss": 0.1676,
"step": 3900
},
{
"epoch": 3.413356612832824,
"grad_norm": 2.905978202819824,
"learning_rate": 3.6584182435710826e-05,
"loss": 0.1952,
"step": 3910
},
{
"epoch": 3.4220864251418597,
"grad_norm": 2.9638357162475586,
"learning_rate": 3.653566229985444e-05,
"loss": 0.1617,
"step": 3920
},
{
"epoch": 3.4308162374508946,
"grad_norm": 3.0130560398101807,
"learning_rate": 3.648714216399806e-05,
"loss": 0.1554,
"step": 3930
},
{
"epoch": 3.4395460497599304,
"grad_norm": 3.267518997192383,
"learning_rate": 3.643862202814168e-05,
"loss": 0.1896,
"step": 3940
},
{
"epoch": 3.4482758620689653,
"grad_norm": 2.6104979515075684,
"learning_rate": 3.63901018922853e-05,
"loss": 0.1693,
"step": 3950
},
{
"epoch": 3.457005674378001,
"grad_norm": 2.9736759662628174,
"learning_rate": 3.634158175642892e-05,
"loss": 0.1654,
"step": 3960
},
{
"epoch": 3.465735486687036,
"grad_norm": 3.04263973236084,
"learning_rate": 3.6293061620572536e-05,
"loss": 0.1655,
"step": 3970
},
{
"epoch": 3.474465298996072,
"grad_norm": 3.322094202041626,
"learning_rate": 3.624454148471616e-05,
"loss": 0.1718,
"step": 3980
},
{
"epoch": 3.4831951113051067,
"grad_norm": 2.8771190643310547,
"learning_rate": 3.619602134885978e-05,
"loss": 0.188,
"step": 3990
},
{
"epoch": 3.4919249236141425,
"grad_norm": 3.3343777656555176,
"learning_rate": 3.61475012130034e-05,
"loss": 0.1765,
"step": 4000
},
{
"epoch": 3.5006547359231774,
"grad_norm": 3.087955951690674,
"learning_rate": 3.609898107714702e-05,
"loss": 0.1557,
"step": 4010
},
{
"epoch": 3.5093845482322132,
"grad_norm": 2.9191036224365234,
"learning_rate": 3.605046094129064e-05,
"loss": 0.1657,
"step": 4020
},
{
"epoch": 3.518114360541248,
"grad_norm": 3.2453222274780273,
"learning_rate": 3.600194080543426e-05,
"loss": 0.1921,
"step": 4030
},
{
"epoch": 3.526844172850284,
"grad_norm": 3.4277548789978027,
"learning_rate": 3.595342066957787e-05,
"loss": 0.1583,
"step": 4040
},
{
"epoch": 3.535573985159319,
"grad_norm": 2.132359504699707,
"learning_rate": 3.59049005337215e-05,
"loss": 0.168,
"step": 4050
},
{
"epoch": 3.5443037974683547,
"grad_norm": 3.881998062133789,
"learning_rate": 3.5856380397865114e-05,
"loss": 0.1992,
"step": 4060
},
{
"epoch": 3.5530336097773896,
"grad_norm": 3.376692771911621,
"learning_rate": 3.5807860262008734e-05,
"loss": 0.1804,
"step": 4070
},
{
"epoch": 3.5617634220864254,
"grad_norm": 2.9599671363830566,
"learning_rate": 3.5759340126152355e-05,
"loss": 0.1561,
"step": 4080
},
{
"epoch": 3.5704932343954603,
"grad_norm": 3.439408302307129,
"learning_rate": 3.5710819990295976e-05,
"loss": 0.1471,
"step": 4090
},
{
"epoch": 3.579223046704496,
"grad_norm": 3.4653496742248535,
"learning_rate": 3.5662299854439596e-05,
"loss": 0.189,
"step": 4100
},
{
"epoch": 3.587952859013531,
"grad_norm": 3.479707717895508,
"learning_rate": 3.561377971858321e-05,
"loss": 0.1771,
"step": 4110
},
{
"epoch": 3.596682671322567,
"grad_norm": 3.077345848083496,
"learning_rate": 3.556525958272684e-05,
"loss": 0.1764,
"step": 4120
},
{
"epoch": 3.6054124836316017,
"grad_norm": 2.612633228302002,
"learning_rate": 3.551673944687045e-05,
"loss": 0.1336,
"step": 4130
},
{
"epoch": 3.6141422959406375,
"grad_norm": 2.649402379989624,
"learning_rate": 3.546821931101407e-05,
"loss": 0.1609,
"step": 4140
},
{
"epoch": 3.6228721082496724,
"grad_norm": 3.7441020011901855,
"learning_rate": 3.541969917515769e-05,
"loss": 0.1758,
"step": 4150
},
{
"epoch": 3.6316019205587082,
"grad_norm": 2.341038465499878,
"learning_rate": 3.537117903930131e-05,
"loss": 0.1431,
"step": 4160
},
{
"epoch": 3.640331732867743,
"grad_norm": 2.990847587585449,
"learning_rate": 3.532265890344493e-05,
"loss": 0.1621,
"step": 4170
},
{
"epoch": 3.649061545176779,
"grad_norm": 3.005911111831665,
"learning_rate": 3.527413876758855e-05,
"loss": 0.1481,
"step": 4180
},
{
"epoch": 3.657791357485814,
"grad_norm": 3.05849027633667,
"learning_rate": 3.5225618631732174e-05,
"loss": 0.1781,
"step": 4190
},
{
"epoch": 3.6665211697948497,
"grad_norm": 2.643735647201538,
"learning_rate": 3.517709849587579e-05,
"loss": 0.1688,
"step": 4200
},
{
"epoch": 3.6752509821038846,
"grad_norm": 3.1337473392486572,
"learning_rate": 3.512857836001941e-05,
"loss": 0.1463,
"step": 4210
},
{
"epoch": 3.6839807944129204,
"grad_norm": 2.945814609527588,
"learning_rate": 3.508005822416303e-05,
"loss": 0.1545,
"step": 4220
},
{
"epoch": 3.6927106067219553,
"grad_norm": 2.5990426540374756,
"learning_rate": 3.503153808830665e-05,
"loss": 0.1525,
"step": 4230
},
{
"epoch": 3.701440419030991,
"grad_norm": 2.7641971111297607,
"learning_rate": 3.498301795245027e-05,
"loss": 0.1661,
"step": 4240
},
{
"epoch": 3.710170231340026,
"grad_norm": 2.606818437576294,
"learning_rate": 3.4934497816593884e-05,
"loss": 0.1906,
"step": 4250
},
{
"epoch": 3.718900043649062,
"grad_norm": 3.2500600814819336,
"learning_rate": 3.488597768073751e-05,
"loss": 0.1404,
"step": 4260
},
{
"epoch": 3.7276298559580967,
"grad_norm": 3.6567978858947754,
"learning_rate": 3.4837457544881125e-05,
"loss": 0.1715,
"step": 4270
},
{
"epoch": 3.7363596682671325,
"grad_norm": 2.9746952056884766,
"learning_rate": 3.4788937409024746e-05,
"loss": 0.1633,
"step": 4280
},
{
"epoch": 3.7450894805761674,
"grad_norm": 2.986154079437256,
"learning_rate": 3.4740417273168366e-05,
"loss": 0.141,
"step": 4290
},
{
"epoch": 3.753819292885203,
"grad_norm": 2.4456489086151123,
"learning_rate": 3.469189713731199e-05,
"loss": 0.1576,
"step": 4300
},
{
"epoch": 3.762549105194238,
"grad_norm": 2.2719054222106934,
"learning_rate": 3.464337700145561e-05,
"loss": 0.158,
"step": 4310
},
{
"epoch": 3.771278917503274,
"grad_norm": 2.6917176246643066,
"learning_rate": 3.459485686559922e-05,
"loss": 0.1624,
"step": 4320
},
{
"epoch": 3.780008729812309,
"grad_norm": 3.041710615158081,
"learning_rate": 3.454633672974285e-05,
"loss": 0.1573,
"step": 4330
},
{
"epoch": 3.7887385421213446,
"grad_norm": 2.517232894897461,
"learning_rate": 3.449781659388647e-05,
"loss": 0.1256,
"step": 4340
},
{
"epoch": 3.7974683544303796,
"grad_norm": 3.19498348236084,
"learning_rate": 3.444929645803008e-05,
"loss": 0.1399,
"step": 4350
},
{
"epoch": 3.8061981667394154,
"grad_norm": 3.773149013519287,
"learning_rate": 3.44007763221737e-05,
"loss": 0.1627,
"step": 4360
},
{
"epoch": 3.8149279790484503,
"grad_norm": 3.985891819000244,
"learning_rate": 3.4352256186317324e-05,
"loss": 0.161,
"step": 4370
},
{
"epoch": 3.823657791357486,
"grad_norm": 2.994173049926758,
"learning_rate": 3.4303736050460944e-05,
"loss": 0.1604,
"step": 4380
},
{
"epoch": 3.832387603666521,
"grad_norm": 3.8228750228881836,
"learning_rate": 3.425521591460456e-05,
"loss": 0.1609,
"step": 4390
},
{
"epoch": 3.841117415975557,
"grad_norm": 3.412660598754883,
"learning_rate": 3.4206695778748185e-05,
"loss": 0.1591,
"step": 4400
},
{
"epoch": 3.8498472282845917,
"grad_norm": 2.460543632507324,
"learning_rate": 3.4158175642891806e-05,
"loss": 0.171,
"step": 4410
},
{
"epoch": 3.8585770405936275,
"grad_norm": 2.8546485900878906,
"learning_rate": 3.410965550703542e-05,
"loss": 0.1662,
"step": 4420
},
{
"epoch": 3.8673068529026624,
"grad_norm": 3.296644926071167,
"learning_rate": 3.406113537117904e-05,
"loss": 0.1669,
"step": 4430
},
{
"epoch": 3.876036665211698,
"grad_norm": 3.5040674209594727,
"learning_rate": 3.401261523532266e-05,
"loss": 0.1694,
"step": 4440
},
{
"epoch": 3.884766477520733,
"grad_norm": 3.1331686973571777,
"learning_rate": 3.396409509946628e-05,
"loss": 0.1507,
"step": 4450
},
{
"epoch": 3.893496289829769,
"grad_norm": 3.2440221309661865,
"learning_rate": 3.3915574963609895e-05,
"loss": 0.1727,
"step": 4460
},
{
"epoch": 3.902226102138804,
"grad_norm": 2.514347553253174,
"learning_rate": 3.386705482775352e-05,
"loss": 0.1401,
"step": 4470
},
{
"epoch": 3.9109559144478396,
"grad_norm": 3.354827404022217,
"learning_rate": 3.381853469189714e-05,
"loss": 0.1472,
"step": 4480
},
{
"epoch": 3.9196857267568745,
"grad_norm": 2.985978126525879,
"learning_rate": 3.377001455604076e-05,
"loss": 0.1783,
"step": 4490
},
{
"epoch": 3.9284155390659103,
"grad_norm": 2.6409835815429688,
"learning_rate": 3.372149442018438e-05,
"loss": 0.1468,
"step": 4500
},
{
"epoch": 3.9371453513749453,
"grad_norm": 2.718919515609741,
"learning_rate": 3.3672974284328e-05,
"loss": 0.1672,
"step": 4510
},
{
"epoch": 3.945875163683981,
"grad_norm": 3.5522565841674805,
"learning_rate": 3.362445414847162e-05,
"loss": 0.1666,
"step": 4520
},
{
"epoch": 3.954604975993016,
"grad_norm": 3.1858441829681396,
"learning_rate": 3.357593401261523e-05,
"loss": 0.181,
"step": 4530
},
{
"epoch": 3.9633347883020518,
"grad_norm": 3.0110580921173096,
"learning_rate": 3.352741387675886e-05,
"loss": 0.1729,
"step": 4540
},
{
"epoch": 3.9720646006110867,
"grad_norm": 3.0811493396759033,
"learning_rate": 3.347889374090248e-05,
"loss": 0.1651,
"step": 4550
},
{
"epoch": 3.9807944129201225,
"grad_norm": 2.8543362617492676,
"learning_rate": 3.3430373605046094e-05,
"loss": 0.1338,
"step": 4560
},
{
"epoch": 3.9895242252291574,
"grad_norm": 3.2236084938049316,
"learning_rate": 3.3381853469189714e-05,
"loss": 0.1349,
"step": 4570
},
{
"epoch": 3.998254037538193,
"grad_norm": 3.425377368927002,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.1524,
"step": 4580
},
{
"epoch": 4.0,
"eval_accuracy": 0.9694328504787626,
"eval_loss": 0.10254349559545517,
"eval_runtime": 60.9682,
"eval_samples_per_second": 267.221,
"eval_steps_per_second": 8.365,
"step": 4582
},
{
"epoch": 4.006983849847228,
"grad_norm": 3.464984655380249,
"learning_rate": 3.3284813197476955e-05,
"loss": 0.1749,
"step": 4590
},
{
"epoch": 4.015713662156264,
"grad_norm": 3.4568910598754883,
"learning_rate": 3.323629306162057e-05,
"loss": 0.1637,
"step": 4600
},
{
"epoch": 4.024443474465299,
"grad_norm": 4.279798984527588,
"learning_rate": 3.3187772925764197e-05,
"loss": 0.1673,
"step": 4610
},
{
"epoch": 4.033173286774335,
"grad_norm": 2.7573869228363037,
"learning_rate": 3.313925278990782e-05,
"loss": 0.1473,
"step": 4620
},
{
"epoch": 4.0419030990833695,
"grad_norm": 1.6597820520401,
"learning_rate": 3.309073265405143e-05,
"loss": 0.1903,
"step": 4630
},
{
"epoch": 4.050632911392405,
"grad_norm": 2.1218221187591553,
"learning_rate": 3.304221251819505e-05,
"loss": 0.136,
"step": 4640
},
{
"epoch": 4.05936272370144,
"grad_norm": 2.941210985183716,
"learning_rate": 3.299369238233867e-05,
"loss": 0.1613,
"step": 4650
},
{
"epoch": 4.068092536010476,
"grad_norm": 2.4769389629364014,
"learning_rate": 3.294517224648229e-05,
"loss": 0.1299,
"step": 4660
},
{
"epoch": 4.076822348319511,
"grad_norm": 2.917901039123535,
"learning_rate": 3.289665211062591e-05,
"loss": 0.1522,
"step": 4670
},
{
"epoch": 4.085552160628547,
"grad_norm": 2.216571092605591,
"learning_rate": 3.2848131974769534e-05,
"loss": 0.1137,
"step": 4680
},
{
"epoch": 4.094281972937582,
"grad_norm": 3.3516063690185547,
"learning_rate": 3.2799611838913154e-05,
"loss": 0.1479,
"step": 4690
},
{
"epoch": 4.1030117852466175,
"grad_norm": 2.5216290950775146,
"learning_rate": 3.275109170305677e-05,
"loss": 0.1792,
"step": 4700
},
{
"epoch": 4.111741597555652,
"grad_norm": 3.8577330112457275,
"learning_rate": 3.270257156720039e-05,
"loss": 0.1647,
"step": 4710
},
{
"epoch": 4.120471409864688,
"grad_norm": 3.3492794036865234,
"learning_rate": 3.265405143134401e-05,
"loss": 0.1646,
"step": 4720
},
{
"epoch": 4.129201222173723,
"grad_norm": 2.7044191360473633,
"learning_rate": 3.260553129548763e-05,
"loss": 0.1373,
"step": 4730
},
{
"epoch": 4.137931034482759,
"grad_norm": 2.3833065032958984,
"learning_rate": 3.255701115963125e-05,
"loss": 0.1606,
"step": 4740
},
{
"epoch": 4.146660846791794,
"grad_norm": 2.541414260864258,
"learning_rate": 3.250849102377487e-05,
"loss": 0.1249,
"step": 4750
},
{
"epoch": 4.15539065910083,
"grad_norm": 3.2906863689422607,
"learning_rate": 3.245997088791849e-05,
"loss": 0.1458,
"step": 4760
},
{
"epoch": 4.1641204714098645,
"grad_norm": 2.537529706954956,
"learning_rate": 3.2411450752062105e-05,
"loss": 0.1559,
"step": 4770
},
{
"epoch": 4.1728502837189,
"grad_norm": 3.309135675430298,
"learning_rate": 3.2362930616205726e-05,
"loss": 0.1478,
"step": 4780
},
{
"epoch": 4.181580096027935,
"grad_norm": 3.2824246883392334,
"learning_rate": 3.2314410480349346e-05,
"loss": 0.1532,
"step": 4790
},
{
"epoch": 4.190309908336971,
"grad_norm": 2.130497694015503,
"learning_rate": 3.226589034449297e-05,
"loss": 0.1552,
"step": 4800
},
{
"epoch": 4.199039720646006,
"grad_norm": 2.358793020248413,
"learning_rate": 3.221737020863659e-05,
"loss": 0.1633,
"step": 4810
},
{
"epoch": 4.207769532955042,
"grad_norm": 2.5943782329559326,
"learning_rate": 3.216885007278021e-05,
"loss": 0.1277,
"step": 4820
},
{
"epoch": 4.216499345264077,
"grad_norm": 2.494443416595459,
"learning_rate": 3.212032993692383e-05,
"loss": 0.1644,
"step": 4830
},
{
"epoch": 4.2252291575731125,
"grad_norm": 2.339279890060425,
"learning_rate": 3.207180980106744e-05,
"loss": 0.1607,
"step": 4840
},
{
"epoch": 4.233958969882147,
"grad_norm": 2.0483193397521973,
"learning_rate": 3.202328966521106e-05,
"loss": 0.1386,
"step": 4850
},
{
"epoch": 4.242688782191183,
"grad_norm": 3.7117726802825928,
"learning_rate": 3.197476952935468e-05,
"loss": 0.1412,
"step": 4860
},
{
"epoch": 4.251418594500218,
"grad_norm": 1.738502860069275,
"learning_rate": 3.1926249393498304e-05,
"loss": 0.141,
"step": 4870
},
{
"epoch": 4.260148406809254,
"grad_norm": 3.4214649200439453,
"learning_rate": 3.1877729257641924e-05,
"loss": 0.164,
"step": 4880
},
{
"epoch": 4.268878219118289,
"grad_norm": 3.121246576309204,
"learning_rate": 3.1829209121785545e-05,
"loss": 0.1476,
"step": 4890
},
{
"epoch": 4.277608031427325,
"grad_norm": 2.7231833934783936,
"learning_rate": 3.1780688985929165e-05,
"loss": 0.1419,
"step": 4900
},
{
"epoch": 4.2863378437363595,
"grad_norm": 2.7895748615264893,
"learning_rate": 3.173216885007278e-05,
"loss": 0.161,
"step": 4910
},
{
"epoch": 4.295067656045395,
"grad_norm": 2.6406960487365723,
"learning_rate": 3.16836487142164e-05,
"loss": 0.1358,
"step": 4920
},
{
"epoch": 4.30379746835443,
"grad_norm": 2.4051973819732666,
"learning_rate": 3.163512857836002e-05,
"loss": 0.1544,
"step": 4930
},
{
"epoch": 4.312527280663466,
"grad_norm": 2.49039626121521,
"learning_rate": 3.158660844250364e-05,
"loss": 0.1551,
"step": 4940
},
{
"epoch": 4.321257092972501,
"grad_norm": 2.592521905899048,
"learning_rate": 3.153808830664726e-05,
"loss": 0.1836,
"step": 4950
},
{
"epoch": 4.329986905281537,
"grad_norm": 3.2178955078125,
"learning_rate": 3.148956817079088e-05,
"loss": 0.1562,
"step": 4960
},
{
"epoch": 4.338716717590572,
"grad_norm": 2.5670714378356934,
"learning_rate": 3.14410480349345e-05,
"loss": 0.1412,
"step": 4970
},
{
"epoch": 4.3474465298996074,
"grad_norm": 4.081165790557861,
"learning_rate": 3.1392527899078116e-05,
"loss": 0.166,
"step": 4980
},
{
"epoch": 4.356176342208642,
"grad_norm": 2.3925557136535645,
"learning_rate": 3.134400776322174e-05,
"loss": 0.1351,
"step": 4990
},
{
"epoch": 4.364906154517678,
"grad_norm": 2.4587152004241943,
"learning_rate": 3.1295487627365364e-05,
"loss": 0.1389,
"step": 5000
},
{
"epoch": 4.373635966826713,
"grad_norm": 3.3576841354370117,
"learning_rate": 3.124696749150898e-05,
"loss": 0.1431,
"step": 5010
},
{
"epoch": 4.382365779135749,
"grad_norm": 3.1087608337402344,
"learning_rate": 3.11984473556526e-05,
"loss": 0.157,
"step": 5020
},
{
"epoch": 4.391095591444784,
"grad_norm": 3.4390625953674316,
"learning_rate": 3.114992721979622e-05,
"loss": 0.1437,
"step": 5030
},
{
"epoch": 4.39982540375382,
"grad_norm": 2.3669025897979736,
"learning_rate": 3.110140708393984e-05,
"loss": 0.1229,
"step": 5040
},
{
"epoch": 4.4085552160628545,
"grad_norm": 3.4715614318847656,
"learning_rate": 3.105288694808345e-05,
"loss": 0.1436,
"step": 5050
},
{
"epoch": 4.41728502837189,
"grad_norm": 3.359426975250244,
"learning_rate": 3.1004366812227074e-05,
"loss": 0.1563,
"step": 5060
},
{
"epoch": 4.426014840680925,
"grad_norm": 3.365325450897217,
"learning_rate": 3.09558466763707e-05,
"loss": 0.1411,
"step": 5070
},
{
"epoch": 4.434744652989961,
"grad_norm": 2.716036319732666,
"learning_rate": 3.0907326540514315e-05,
"loss": 0.1535,
"step": 5080
},
{
"epoch": 4.443474465298996,
"grad_norm": 2.797657012939453,
"learning_rate": 3.0858806404657935e-05,
"loss": 0.1314,
"step": 5090
},
{
"epoch": 4.452204277608032,
"grad_norm": 2.7117514610290527,
"learning_rate": 3.0810286268801556e-05,
"loss": 0.1436,
"step": 5100
},
{
"epoch": 4.460934089917067,
"grad_norm": 2.1483347415924072,
"learning_rate": 3.0761766132945176e-05,
"loss": 0.1591,
"step": 5110
},
{
"epoch": 4.469663902226102,
"grad_norm": 3.0980610847473145,
"learning_rate": 3.071324599708879e-05,
"loss": 0.1443,
"step": 5120
},
{
"epoch": 4.478393714535137,
"grad_norm": 3.3128209114074707,
"learning_rate": 3.066472586123241e-05,
"loss": 0.1313,
"step": 5130
},
{
"epoch": 4.487123526844173,
"grad_norm": 3.275357484817505,
"learning_rate": 3.061620572537604e-05,
"loss": 0.1744,
"step": 5140
},
{
"epoch": 4.495853339153208,
"grad_norm": 3.222581148147583,
"learning_rate": 3.056768558951965e-05,
"loss": 0.1584,
"step": 5150
},
{
"epoch": 4.504583151462244,
"grad_norm": 3.389233350753784,
"learning_rate": 3.051916545366327e-05,
"loss": 0.1477,
"step": 5160
},
{
"epoch": 4.513312963771279,
"grad_norm": 3.1281654834747314,
"learning_rate": 3.047064531780689e-05,
"loss": 0.1283,
"step": 5170
},
{
"epoch": 4.522042776080315,
"grad_norm": 3.123300075531006,
"learning_rate": 3.042212518195051e-05,
"loss": 0.1496,
"step": 5180
},
{
"epoch": 4.5307725883893495,
"grad_norm": 2.7296836376190186,
"learning_rate": 3.0373605046094127e-05,
"loss": 0.133,
"step": 5190
},
{
"epoch": 4.539502400698385,
"grad_norm": 2.5812196731567383,
"learning_rate": 3.032508491023775e-05,
"loss": 0.1388,
"step": 5200
},
{
"epoch": 4.54823221300742,
"grad_norm": 3.827601671218872,
"learning_rate": 3.0276564774381372e-05,
"loss": 0.145,
"step": 5210
},
{
"epoch": 4.556962025316456,
"grad_norm": 2.3603134155273438,
"learning_rate": 3.022804463852499e-05,
"loss": 0.1126,
"step": 5220
},
{
"epoch": 4.565691837625491,
"grad_norm": 2.6947975158691406,
"learning_rate": 3.017952450266861e-05,
"loss": 0.1282,
"step": 5230
},
{
"epoch": 4.574421649934527,
"grad_norm": 2.6288444995880127,
"learning_rate": 3.0131004366812227e-05,
"loss": 0.1371,
"step": 5240
},
{
"epoch": 4.583151462243562,
"grad_norm": 3.6533212661743164,
"learning_rate": 3.0082484230955847e-05,
"loss": 0.1555,
"step": 5250
},
{
"epoch": 4.591881274552597,
"grad_norm": 2.927175760269165,
"learning_rate": 3.0033964095099464e-05,
"loss": 0.1396,
"step": 5260
},
{
"epoch": 4.600611086861632,
"grad_norm": 3.4521915912628174,
"learning_rate": 2.9985443959243088e-05,
"loss": 0.1399,
"step": 5270
},
{
"epoch": 4.609340899170668,
"grad_norm": 3.1614108085632324,
"learning_rate": 2.993692382338671e-05,
"loss": 0.139,
"step": 5280
},
{
"epoch": 4.618070711479703,
"grad_norm": 2.77795672416687,
"learning_rate": 2.9888403687530326e-05,
"loss": 0.1623,
"step": 5290
},
{
"epoch": 4.626800523788739,
"grad_norm": 3.930523633956909,
"learning_rate": 2.9839883551673947e-05,
"loss": 0.1534,
"step": 5300
},
{
"epoch": 4.635530336097774,
"grad_norm": 2.8020386695861816,
"learning_rate": 2.9791363415817564e-05,
"loss": 0.1309,
"step": 5310
},
{
"epoch": 4.64426014840681,
"grad_norm": 2.3614084720611572,
"learning_rate": 2.9742843279961184e-05,
"loss": 0.1422,
"step": 5320
},
{
"epoch": 4.6529899607158445,
"grad_norm": 3.849187135696411,
"learning_rate": 2.9694323144104808e-05,
"loss": 0.129,
"step": 5330
},
{
"epoch": 4.66171977302488,
"grad_norm": 3.5703964233398438,
"learning_rate": 2.9645803008248425e-05,
"loss": 0.1403,
"step": 5340
},
{
"epoch": 4.670449585333915,
"grad_norm": 2.742767810821533,
"learning_rate": 2.9597282872392046e-05,
"loss": 0.1234,
"step": 5350
},
{
"epoch": 4.679179397642951,
"grad_norm": 2.6478323936462402,
"learning_rate": 2.9548762736535663e-05,
"loss": 0.1395,
"step": 5360
},
{
"epoch": 4.687909209951986,
"grad_norm": 2.8150362968444824,
"learning_rate": 2.9500242600679284e-05,
"loss": 0.1378,
"step": 5370
},
{
"epoch": 4.696639022261022,
"grad_norm": 3.0545525550842285,
"learning_rate": 2.94517224648229e-05,
"loss": 0.1244,
"step": 5380
},
{
"epoch": 4.705368834570057,
"grad_norm": 3.3390815258026123,
"learning_rate": 2.940320232896652e-05,
"loss": 0.1433,
"step": 5390
},
{
"epoch": 4.714098646879092,
"grad_norm": 2.237645149230957,
"learning_rate": 2.9354682193110145e-05,
"loss": 0.1386,
"step": 5400
},
{
"epoch": 4.722828459188127,
"grad_norm": 2.8226351737976074,
"learning_rate": 2.9306162057253762e-05,
"loss": 0.1468,
"step": 5410
},
{
"epoch": 4.731558271497163,
"grad_norm": 2.2140517234802246,
"learning_rate": 2.9257641921397383e-05,
"loss": 0.1271,
"step": 5420
},
{
"epoch": 4.740288083806198,
"grad_norm": 2.853294610977173,
"learning_rate": 2.9209121785541e-05,
"loss": 0.1357,
"step": 5430
},
{
"epoch": 4.749017896115234,
"grad_norm": 3.399142265319824,
"learning_rate": 2.916060164968462e-05,
"loss": 0.1517,
"step": 5440
},
{
"epoch": 4.757747708424269,
"grad_norm": 1.853452205657959,
"learning_rate": 2.9112081513828238e-05,
"loss": 0.1323,
"step": 5450
},
{
"epoch": 4.7664775207333046,
"grad_norm": 2.7379791736602783,
"learning_rate": 2.906356137797186e-05,
"loss": 0.1471,
"step": 5460
},
{
"epoch": 4.7752073330423395,
"grad_norm": 3.7588677406311035,
"learning_rate": 2.9015041242115482e-05,
"loss": 0.1315,
"step": 5470
},
{
"epoch": 4.783937145351375,
"grad_norm": 2.9723212718963623,
"learning_rate": 2.89665211062591e-05,
"loss": 0.131,
"step": 5480
},
{
"epoch": 4.79266695766041,
"grad_norm": 2.8574113845825195,
"learning_rate": 2.891800097040272e-05,
"loss": 0.1495,
"step": 5490
},
{
"epoch": 4.801396769969446,
"grad_norm": 2.9763901233673096,
"learning_rate": 2.8869480834546337e-05,
"loss": 0.1388,
"step": 5500
},
{
"epoch": 4.810126582278481,
"grad_norm": 3.1920862197875977,
"learning_rate": 2.8820960698689958e-05,
"loss": 0.1523,
"step": 5510
},
{
"epoch": 4.818856394587517,
"grad_norm": 2.83996844291687,
"learning_rate": 2.8772440562833575e-05,
"loss": 0.1126,
"step": 5520
},
{
"epoch": 4.827586206896552,
"grad_norm": 2.066861391067505,
"learning_rate": 2.8723920426977195e-05,
"loss": 0.1337,
"step": 5530
},
{
"epoch": 4.836316019205587,
"grad_norm": 3.4259138107299805,
"learning_rate": 2.867540029112082e-05,
"loss": 0.1575,
"step": 5540
},
{
"epoch": 4.845045831514622,
"grad_norm": 3.806020975112915,
"learning_rate": 2.8626880155264436e-05,
"loss": 0.148,
"step": 5550
},
{
"epoch": 4.853775643823658,
"grad_norm": 4.127275466918945,
"learning_rate": 2.8578360019408057e-05,
"loss": 0.1487,
"step": 5560
},
{
"epoch": 4.862505456132693,
"grad_norm": 3.0096826553344727,
"learning_rate": 2.8529839883551674e-05,
"loss": 0.1627,
"step": 5570
},
{
"epoch": 4.871235268441729,
"grad_norm": 3.183000087738037,
"learning_rate": 2.8481319747695295e-05,
"loss": 0.1241,
"step": 5580
},
{
"epoch": 4.879965080750764,
"grad_norm": 2.7466609477996826,
"learning_rate": 2.843279961183892e-05,
"loss": 0.1518,
"step": 5590
},
{
"epoch": 4.8886948930597995,
"grad_norm": 2.0740559101104736,
"learning_rate": 2.8384279475982532e-05,
"loss": 0.143,
"step": 5600
},
{
"epoch": 4.8974247053688345,
"grad_norm": 2.2618136405944824,
"learning_rate": 2.8335759340126156e-05,
"loss": 0.145,
"step": 5610
},
{
"epoch": 4.90615451767787,
"grad_norm": 3.275090217590332,
"learning_rate": 2.8287239204269774e-05,
"loss": 0.141,
"step": 5620
},
{
"epoch": 4.914884329986905,
"grad_norm": 2.9142794609069824,
"learning_rate": 2.8238719068413394e-05,
"loss": 0.1263,
"step": 5630
},
{
"epoch": 4.923614142295941,
"grad_norm": 3.941188097000122,
"learning_rate": 2.819019893255701e-05,
"loss": 0.167,
"step": 5640
},
{
"epoch": 4.932343954604976,
"grad_norm": 3.0273237228393555,
"learning_rate": 2.8141678796700632e-05,
"loss": 0.1444,
"step": 5650
},
{
"epoch": 4.941073766914012,
"grad_norm": 1.5899831056594849,
"learning_rate": 2.8093158660844256e-05,
"loss": 0.1089,
"step": 5660
},
{
"epoch": 4.949803579223047,
"grad_norm": 4.153631687164307,
"learning_rate": 2.804463852498787e-05,
"loss": 0.1728,
"step": 5670
},
{
"epoch": 4.958533391532082,
"grad_norm": 2.7368574142456055,
"learning_rate": 2.7996118389131493e-05,
"loss": 0.1371,
"step": 5680
},
{
"epoch": 4.967263203841117,
"grad_norm": 3.6088647842407227,
"learning_rate": 2.794759825327511e-05,
"loss": 0.1402,
"step": 5690
},
{
"epoch": 4.975993016150153,
"grad_norm": 2.830106496810913,
"learning_rate": 2.789907811741873e-05,
"loss": 0.1483,
"step": 5700
},
{
"epoch": 4.984722828459188,
"grad_norm": 2.925632953643799,
"learning_rate": 2.7850557981562348e-05,
"loss": 0.1268,
"step": 5710
},
{
"epoch": 4.993452640768224,
"grad_norm": 2.303786277770996,
"learning_rate": 2.780203784570597e-05,
"loss": 0.1274,
"step": 5720
},
{
"epoch": 4.999563509384548,
"eval_accuracy": 0.9705990670267616,
"eval_loss": 0.09282852709293365,
"eval_runtime": 61.4449,
"eval_samples_per_second": 265.148,
"eval_steps_per_second": 8.3,
"step": 5727
},
{
"epoch": 5.002182453077259,
"grad_norm": 3.6829633712768555,
"learning_rate": 2.7753517709849593e-05,
"loss": 0.1331,
"step": 5730
},
{
"epoch": 5.0109122653862945,
"grad_norm": 2.4418623447418213,
"learning_rate": 2.7704997573993207e-05,
"loss": 0.1386,
"step": 5740
},
{
"epoch": 5.019642077695329,
"grad_norm": 2.420471429824829,
"learning_rate": 2.765647743813683e-05,
"loss": 0.1287,
"step": 5750
},
{
"epoch": 5.028371890004365,
"grad_norm": 1.8955364227294922,
"learning_rate": 2.7607957302280448e-05,
"loss": 0.1353,
"step": 5760
},
{
"epoch": 5.0371017023134,
"grad_norm": 3.353316307067871,
"learning_rate": 2.7559437166424068e-05,
"loss": 0.1445,
"step": 5770
},
{
"epoch": 5.045831514622436,
"grad_norm": 2.9570837020874023,
"learning_rate": 2.7510917030567685e-05,
"loss": 0.1374,
"step": 5780
},
{
"epoch": 5.054561326931471,
"grad_norm": 3.622004747390747,
"learning_rate": 2.7462396894711306e-05,
"loss": 0.1298,
"step": 5790
},
{
"epoch": 5.063291139240507,
"grad_norm": 2.643129587173462,
"learning_rate": 2.741387675885493e-05,
"loss": 0.146,
"step": 5800
},
{
"epoch": 5.072020951549542,
"grad_norm": 2.9872100353240967,
"learning_rate": 2.7365356622998544e-05,
"loss": 0.17,
"step": 5810
},
{
"epoch": 5.080750763858577,
"grad_norm": 3.114686965942383,
"learning_rate": 2.7316836487142168e-05,
"loss": 0.1391,
"step": 5820
},
{
"epoch": 5.089480576167612,
"grad_norm": 2.8469395637512207,
"learning_rate": 2.7268316351285785e-05,
"loss": 0.1388,
"step": 5830
},
{
"epoch": 5.098210388476648,
"grad_norm": 3.2871453762054443,
"learning_rate": 2.7219796215429405e-05,
"loss": 0.1221,
"step": 5840
},
{
"epoch": 5.106940200785683,
"grad_norm": 4.159573078155518,
"learning_rate": 2.7171276079573022e-05,
"loss": 0.1357,
"step": 5850
},
{
"epoch": 5.115670013094719,
"grad_norm": 2.4209814071655273,
"learning_rate": 2.7122755943716643e-05,
"loss": 0.1315,
"step": 5860
},
{
"epoch": 5.124399825403754,
"grad_norm": 3.1330792903900146,
"learning_rate": 2.7074235807860267e-05,
"loss": 0.144,
"step": 5870
},
{
"epoch": 5.1331296377127895,
"grad_norm": 3.4734888076782227,
"learning_rate": 2.702571567200388e-05,
"loss": 0.1255,
"step": 5880
},
{
"epoch": 5.141859450021824,
"grad_norm": 2.6538310050964355,
"learning_rate": 2.6977195536147505e-05,
"loss": 0.1309,
"step": 5890
},
{
"epoch": 5.15058926233086,
"grad_norm": 2.8028576374053955,
"learning_rate": 2.6928675400291122e-05,
"loss": 0.128,
"step": 5900
},
{
"epoch": 5.159319074639895,
"grad_norm": 2.747344970703125,
"learning_rate": 2.6880155264434742e-05,
"loss": 0.1287,
"step": 5910
},
{
"epoch": 5.168048886948931,
"grad_norm": 2.56372332572937,
"learning_rate": 2.6831635128578363e-05,
"loss": 0.141,
"step": 5920
},
{
"epoch": 5.176778699257966,
"grad_norm": 2.5955655574798584,
"learning_rate": 2.678311499272198e-05,
"loss": 0.1405,
"step": 5930
},
{
"epoch": 5.185508511567002,
"grad_norm": 2.351151943206787,
"learning_rate": 2.6734594856865604e-05,
"loss": 0.1145,
"step": 5940
},
{
"epoch": 5.194238323876037,
"grad_norm": 3.593594551086426,
"learning_rate": 2.6686074721009218e-05,
"loss": 0.1321,
"step": 5950
},
{
"epoch": 5.202968136185072,
"grad_norm": 2.935622453689575,
"learning_rate": 2.663755458515284e-05,
"loss": 0.1233,
"step": 5960
},
{
"epoch": 5.211697948494107,
"grad_norm": 2.3054494857788086,
"learning_rate": 2.658903444929646e-05,
"loss": 0.1238,
"step": 5970
},
{
"epoch": 5.220427760803143,
"grad_norm": 3.240325927734375,
"learning_rate": 2.654051431344008e-05,
"loss": 0.1613,
"step": 5980
},
{
"epoch": 5.229157573112178,
"grad_norm": 2.3457300662994385,
"learning_rate": 2.64919941775837e-05,
"loss": 0.1498,
"step": 5990
},
{
"epoch": 5.237887385421214,
"grad_norm": 3.284043550491333,
"learning_rate": 2.6443474041727317e-05,
"loss": 0.1328,
"step": 6000
},
{
"epoch": 5.246617197730249,
"grad_norm": 2.991929531097412,
"learning_rate": 2.639495390587094e-05,
"loss": 0.1334,
"step": 6010
},
{
"epoch": 5.2553470100392845,
"grad_norm": 3.4685487747192383,
"learning_rate": 2.6346433770014555e-05,
"loss": 0.1463,
"step": 6020
},
{
"epoch": 5.264076822348319,
"grad_norm": 3.8685977458953857,
"learning_rate": 2.629791363415818e-05,
"loss": 0.1219,
"step": 6030
},
{
"epoch": 5.272806634657355,
"grad_norm": 2.8441107273101807,
"learning_rate": 2.6249393498301796e-05,
"loss": 0.1243,
"step": 6040
},
{
"epoch": 5.28153644696639,
"grad_norm": 3.628505229949951,
"learning_rate": 2.6200873362445416e-05,
"loss": 0.1465,
"step": 6050
},
{
"epoch": 5.290266259275426,
"grad_norm": 3.1612138748168945,
"learning_rate": 2.6152353226589037e-05,
"loss": 0.124,
"step": 6060
},
{
"epoch": 5.298996071584461,
"grad_norm": 2.9345169067382812,
"learning_rate": 2.6103833090732654e-05,
"loss": 0.153,
"step": 6070
},
{
"epoch": 5.307725883893497,
"grad_norm": 2.2323479652404785,
"learning_rate": 2.6055312954876278e-05,
"loss": 0.1307,
"step": 6080
},
{
"epoch": 5.3164556962025316,
"grad_norm": 2.986766815185547,
"learning_rate": 2.6006792819019892e-05,
"loss": 0.1137,
"step": 6090
},
{
"epoch": 5.325185508511567,
"grad_norm": 3.07307767868042,
"learning_rate": 2.5958272683163516e-05,
"loss": 0.1372,
"step": 6100
},
{
"epoch": 5.333915320820602,
"grad_norm": 2.370492935180664,
"learning_rate": 2.5909752547307133e-05,
"loss": 0.1549,
"step": 6110
},
{
"epoch": 5.342645133129638,
"grad_norm": 1.7776038646697998,
"learning_rate": 2.5861232411450753e-05,
"loss": 0.1218,
"step": 6120
},
{
"epoch": 5.351374945438673,
"grad_norm": 3.1490137577056885,
"learning_rate": 2.5812712275594374e-05,
"loss": 0.1556,
"step": 6130
},
{
"epoch": 5.360104757747709,
"grad_norm": 2.9647440910339355,
"learning_rate": 2.576419213973799e-05,
"loss": 0.126,
"step": 6140
},
{
"epoch": 5.368834570056744,
"grad_norm": 3.315322160720825,
"learning_rate": 2.5715672003881615e-05,
"loss": 0.133,
"step": 6150
},
{
"epoch": 5.3775643823657795,
"grad_norm": 2.5604379177093506,
"learning_rate": 2.566715186802523e-05,
"loss": 0.1437,
"step": 6160
},
{
"epoch": 5.386294194674814,
"grad_norm": 1.5733325481414795,
"learning_rate": 2.5618631732168853e-05,
"loss": 0.1374,
"step": 6170
},
{
"epoch": 5.39502400698385,
"grad_norm": 2.330185890197754,
"learning_rate": 2.5570111596312467e-05,
"loss": 0.1158,
"step": 6180
},
{
"epoch": 5.403753819292885,
"grad_norm": 3.2574543952941895,
"learning_rate": 2.552159146045609e-05,
"loss": 0.1374,
"step": 6190
},
{
"epoch": 5.412483631601921,
"grad_norm": 3.3817057609558105,
"learning_rate": 2.547307132459971e-05,
"loss": 0.1272,
"step": 6200
},
{
"epoch": 5.421213443910956,
"grad_norm": 3.8969085216522217,
"learning_rate": 2.5424551188743328e-05,
"loss": 0.1297,
"step": 6210
},
{
"epoch": 5.429943256219992,
"grad_norm": 2.5738420486450195,
"learning_rate": 2.5376031052886952e-05,
"loss": 0.121,
"step": 6220
},
{
"epoch": 5.4386730685290265,
"grad_norm": 3.007840633392334,
"learning_rate": 2.5327510917030566e-05,
"loss": 0.1561,
"step": 6230
},
{
"epoch": 5.447402880838062,
"grad_norm": 2.9330294132232666,
"learning_rate": 2.527899078117419e-05,
"loss": 0.1304,
"step": 6240
},
{
"epoch": 5.456132693147097,
"grad_norm": 2.3257663249969482,
"learning_rate": 2.523047064531781e-05,
"loss": 0.1273,
"step": 6250
},
{
"epoch": 5.464862505456133,
"grad_norm": 2.6171205043792725,
"learning_rate": 2.5181950509461428e-05,
"loss": 0.1135,
"step": 6260
},
{
"epoch": 5.473592317765168,
"grad_norm": 3.440798759460449,
"learning_rate": 2.5133430373605048e-05,
"loss": 0.1246,
"step": 6270
},
{
"epoch": 5.482322130074204,
"grad_norm": 3.418937921524048,
"learning_rate": 2.5084910237748665e-05,
"loss": 0.1354,
"step": 6280
},
{
"epoch": 5.491051942383239,
"grad_norm": 2.9420177936553955,
"learning_rate": 2.503639010189229e-05,
"loss": 0.1405,
"step": 6290
},
{
"epoch": 5.4997817546922745,
"grad_norm": 3.042564630508423,
"learning_rate": 2.4987869966035906e-05,
"loss": 0.1357,
"step": 6300
},
{
"epoch": 5.508511567001309,
"grad_norm": 2.1227240562438965,
"learning_rate": 2.4939349830179527e-05,
"loss": 0.1417,
"step": 6310
},
{
"epoch": 5.517241379310345,
"grad_norm": 3.1265482902526855,
"learning_rate": 2.4890829694323144e-05,
"loss": 0.1382,
"step": 6320
},
{
"epoch": 5.52597119161938,
"grad_norm": 2.782801389694214,
"learning_rate": 2.4842309558466765e-05,
"loss": 0.1635,
"step": 6330
},
{
"epoch": 5.534701003928416,
"grad_norm": 3.488227605819702,
"learning_rate": 2.4793789422610382e-05,
"loss": 0.1303,
"step": 6340
},
{
"epoch": 5.543430816237451,
"grad_norm": 2.8877530097961426,
"learning_rate": 2.4745269286754006e-05,
"loss": 0.1676,
"step": 6350
},
{
"epoch": 5.552160628546487,
"grad_norm": 2.2188923358917236,
"learning_rate": 2.4696749150897623e-05,
"loss": 0.1118,
"step": 6360
},
{
"epoch": 5.5608904408555215,
"grad_norm": 2.9819607734680176,
"learning_rate": 2.4648229015041243e-05,
"loss": 0.1603,
"step": 6370
},
{
"epoch": 5.569620253164557,
"grad_norm": 3.2080116271972656,
"learning_rate": 2.4599708879184864e-05,
"loss": 0.1304,
"step": 6380
},
{
"epoch": 5.578350065473592,
"grad_norm": 1.6882622241973877,
"learning_rate": 2.455118874332848e-05,
"loss": 0.1171,
"step": 6390
},
{
"epoch": 5.587079877782628,
"grad_norm": 2.5788047313690186,
"learning_rate": 2.45026686074721e-05,
"loss": 0.1193,
"step": 6400
},
{
"epoch": 5.595809690091663,
"grad_norm": 3.859628915786743,
"learning_rate": 2.445414847161572e-05,
"loss": 0.124,
"step": 6410
},
{
"epoch": 5.604539502400699,
"grad_norm": 1.5977929830551147,
"learning_rate": 2.4405628335759343e-05,
"loss": 0.1194,
"step": 6420
},
{
"epoch": 5.613269314709734,
"grad_norm": 3.073011875152588,
"learning_rate": 2.435710819990296e-05,
"loss": 0.1344,
"step": 6430
},
{
"epoch": 5.6219991270187695,
"grad_norm": 2.9999372959136963,
"learning_rate": 2.430858806404658e-05,
"loss": 0.1487,
"step": 6440
},
{
"epoch": 5.630728939327804,
"grad_norm": 2.077570676803589,
"learning_rate": 2.42600679281902e-05,
"loss": 0.118,
"step": 6450
},
{
"epoch": 5.63945875163684,
"grad_norm": 3.1440892219543457,
"learning_rate": 2.4211547792333818e-05,
"loss": 0.1394,
"step": 6460
},
{
"epoch": 5.648188563945875,
"grad_norm": 2.836007833480835,
"learning_rate": 2.416302765647744e-05,
"loss": 0.1308,
"step": 6470
},
{
"epoch": 5.656918376254911,
"grad_norm": 2.675652027130127,
"learning_rate": 2.411450752062106e-05,
"loss": 0.1208,
"step": 6480
},
{
"epoch": 5.665648188563946,
"grad_norm": 2.833317756652832,
"learning_rate": 2.406598738476468e-05,
"loss": 0.1244,
"step": 6490
},
{
"epoch": 5.674378000872982,
"grad_norm": 2.444568157196045,
"learning_rate": 2.4017467248908297e-05,
"loss": 0.1153,
"step": 6500
},
{
"epoch": 5.6831078131820165,
"grad_norm": 3.581678628921509,
"learning_rate": 2.3968947113051917e-05,
"loss": 0.1431,
"step": 6510
},
{
"epoch": 5.691837625491052,
"grad_norm": 1.7067352533340454,
"learning_rate": 2.3920426977195538e-05,
"loss": 0.1322,
"step": 6520
},
{
"epoch": 5.700567437800087,
"grad_norm": 4.281068325042725,
"learning_rate": 2.3871906841339155e-05,
"loss": 0.1349,
"step": 6530
},
{
"epoch": 5.709297250109123,
"grad_norm": 2.958136558532715,
"learning_rate": 2.3823386705482776e-05,
"loss": 0.1319,
"step": 6540
},
{
"epoch": 5.718027062418158,
"grad_norm": 2.9312613010406494,
"learning_rate": 2.3774866569626396e-05,
"loss": 0.103,
"step": 6550
},
{
"epoch": 5.726756874727194,
"grad_norm": 2.7693583965301514,
"learning_rate": 2.3726346433770017e-05,
"loss": 0.1283,
"step": 6560
},
{
"epoch": 5.735486687036229,
"grad_norm": 2.7267799377441406,
"learning_rate": 2.3677826297913634e-05,
"loss": 0.1131,
"step": 6570
},
{
"epoch": 5.7442164993452645,
"grad_norm": 2.184208631515503,
"learning_rate": 2.3629306162057255e-05,
"loss": 0.1289,
"step": 6580
},
{
"epoch": 5.752946311654299,
"grad_norm": 2.8971517086029053,
"learning_rate": 2.3580786026200875e-05,
"loss": 0.1185,
"step": 6590
},
{
"epoch": 5.761676123963335,
"grad_norm": 3.3640193939208984,
"learning_rate": 2.3532265890344492e-05,
"loss": 0.139,
"step": 6600
},
{
"epoch": 5.77040593627237,
"grad_norm": 3.1321167945861816,
"learning_rate": 2.3483745754488113e-05,
"loss": 0.1594,
"step": 6610
},
{
"epoch": 5.779135748581406,
"grad_norm": 2.4703168869018555,
"learning_rate": 2.3435225618631733e-05,
"loss": 0.1528,
"step": 6620
},
{
"epoch": 5.787865560890441,
"grad_norm": 2.4260854721069336,
"learning_rate": 2.3386705482775354e-05,
"loss": 0.1272,
"step": 6630
},
{
"epoch": 5.796595373199477,
"grad_norm": 2.809253692626953,
"learning_rate": 2.333818534691897e-05,
"loss": 0.1421,
"step": 6640
},
{
"epoch": 5.8053251855085115,
"grad_norm": 2.53581166267395,
"learning_rate": 2.328966521106259e-05,
"loss": 0.1395,
"step": 6650
},
{
"epoch": 5.814054997817547,
"grad_norm": 2.912879467010498,
"learning_rate": 2.3241145075206212e-05,
"loss": 0.1269,
"step": 6660
},
{
"epoch": 5.822784810126582,
"grad_norm": 2.0264980792999268,
"learning_rate": 2.319262493934983e-05,
"loss": 0.1335,
"step": 6670
},
{
"epoch": 5.831514622435618,
"grad_norm": 2.8756725788116455,
"learning_rate": 2.3144104803493453e-05,
"loss": 0.1246,
"step": 6680
},
{
"epoch": 5.840244434744653,
"grad_norm": 2.49039363861084,
"learning_rate": 2.309558466763707e-05,
"loss": 0.1292,
"step": 6690
},
{
"epoch": 5.848974247053688,
"grad_norm": 2.6733226776123047,
"learning_rate": 2.304706453178069e-05,
"loss": 0.1064,
"step": 6700
},
{
"epoch": 5.857704059362724,
"grad_norm": 2.1536784172058105,
"learning_rate": 2.2998544395924308e-05,
"loss": 0.138,
"step": 6710
},
{
"epoch": 5.8664338716717594,
"grad_norm": 3.428746461868286,
"learning_rate": 2.295002426006793e-05,
"loss": 0.1362,
"step": 6720
},
{
"epoch": 5.875163683980794,
"grad_norm": 3.4653851985931396,
"learning_rate": 2.290150412421155e-05,
"loss": 0.1235,
"step": 6730
},
{
"epoch": 5.883893496289829,
"grad_norm": 2.283843755722046,
"learning_rate": 2.2852983988355166e-05,
"loss": 0.1081,
"step": 6740
},
{
"epoch": 5.892623308598865,
"grad_norm": 3.812995433807373,
"learning_rate": 2.280446385249879e-05,
"loss": 0.1305,
"step": 6750
},
{
"epoch": 5.901353120907901,
"grad_norm": 2.6548011302948,
"learning_rate": 2.2755943716642407e-05,
"loss": 0.135,
"step": 6760
},
{
"epoch": 5.910082933216936,
"grad_norm": 2.09759783744812,
"learning_rate": 2.2707423580786028e-05,
"loss": 0.1066,
"step": 6770
},
{
"epoch": 5.918812745525971,
"grad_norm": 2.5961296558380127,
"learning_rate": 2.2658903444929645e-05,
"loss": 0.1312,
"step": 6780
},
{
"epoch": 5.9275425578350065,
"grad_norm": 2.6198313236236572,
"learning_rate": 2.2610383309073266e-05,
"loss": 0.1213,
"step": 6790
},
{
"epoch": 5.936272370144042,
"grad_norm": 2.5672667026519775,
"learning_rate": 2.2561863173216886e-05,
"loss": 0.1097,
"step": 6800
},
{
"epoch": 5.945002182453077,
"grad_norm": 1.9707518815994263,
"learning_rate": 2.2513343037360507e-05,
"loss": 0.1015,
"step": 6810
},
{
"epoch": 5.953731994762112,
"grad_norm": 2.9768049716949463,
"learning_rate": 2.2464822901504127e-05,
"loss": 0.1246,
"step": 6820
},
{
"epoch": 5.962461807071148,
"grad_norm": 3.3005049228668213,
"learning_rate": 2.2416302765647744e-05,
"loss": 0.1286,
"step": 6830
},
{
"epoch": 5.971191619380184,
"grad_norm": 4.172717094421387,
"learning_rate": 2.2367782629791365e-05,
"loss": 0.1346,
"step": 6840
},
{
"epoch": 5.979921431689219,
"grad_norm": 3.4869165420532227,
"learning_rate": 2.2319262493934982e-05,
"loss": 0.1205,
"step": 6850
},
{
"epoch": 5.9886512439982535,
"grad_norm": 3.327515125274658,
"learning_rate": 2.2270742358078603e-05,
"loss": 0.1267,
"step": 6860
},
{
"epoch": 5.997381056307289,
"grad_norm": 3.0895607471466064,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.1141,
"step": 6870
},
{
"epoch": 6.0,
"eval_accuracy": 0.9722563221212865,
"eval_loss": 0.08740793168544769,
"eval_runtime": 61.534,
"eval_samples_per_second": 264.764,
"eval_steps_per_second": 8.288,
"step": 6873
},
{
"epoch": 6.006110868616325,
"grad_norm": 2.24489688873291,
"learning_rate": 2.2173702086365844e-05,
"loss": 0.1132,
"step": 6880
},
{
"epoch": 6.01484068092536,
"grad_norm": 3.8057761192321777,
"learning_rate": 2.2125181950509464e-05,
"loss": 0.123,
"step": 6890
},
{
"epoch": 6.023570493234396,
"grad_norm": 2.2050459384918213,
"learning_rate": 2.207666181465308e-05,
"loss": 0.1365,
"step": 6900
},
{
"epoch": 6.032300305543431,
"grad_norm": 3.0642194747924805,
"learning_rate": 2.2028141678796702e-05,
"loss": 0.1423,
"step": 6910
},
{
"epoch": 6.041030117852467,
"grad_norm": 2.7323355674743652,
"learning_rate": 2.197962154294032e-05,
"loss": 0.1451,
"step": 6920
},
{
"epoch": 6.0497599301615015,
"grad_norm": 3.7342848777770996,
"learning_rate": 2.193110140708394e-05,
"loss": 0.1157,
"step": 6930
},
{
"epoch": 6.058489742470537,
"grad_norm": 1.9272093772888184,
"learning_rate": 2.188258127122756e-05,
"loss": 0.1019,
"step": 6940
},
{
"epoch": 6.067219554779572,
"grad_norm": 2.0700807571411133,
"learning_rate": 2.183406113537118e-05,
"loss": 0.1223,
"step": 6950
},
{
"epoch": 6.075949367088608,
"grad_norm": 2.6618826389312744,
"learning_rate": 2.17855409995148e-05,
"loss": 0.1181,
"step": 6960
},
{
"epoch": 6.084679179397643,
"grad_norm": 4.103969573974609,
"learning_rate": 2.173702086365842e-05,
"loss": 0.1357,
"step": 6970
},
{
"epoch": 6.093408991706679,
"grad_norm": 2.845344066619873,
"learning_rate": 2.168850072780204e-05,
"loss": 0.1183,
"step": 6980
},
{
"epoch": 6.102138804015714,
"grad_norm": 2.7917563915252686,
"learning_rate": 2.1639980591945656e-05,
"loss": 0.1287,
"step": 6990
},
{
"epoch": 6.110868616324749,
"grad_norm": 2.6179635524749756,
"learning_rate": 2.1591460456089277e-05,
"loss": 0.1346,
"step": 7000
},
{
"epoch": 6.119598428633784,
"grad_norm": 1.979193091392517,
"learning_rate": 2.1542940320232897e-05,
"loss": 0.125,
"step": 7010
},
{
"epoch": 6.12832824094282,
"grad_norm": 2.7642877101898193,
"learning_rate": 2.1494420184376518e-05,
"loss": 0.1275,
"step": 7020
},
{
"epoch": 6.137058053251855,
"grad_norm": 2.8622918128967285,
"learning_rate": 2.144590004852014e-05,
"loss": 0.1109,
"step": 7030
},
{
"epoch": 6.145787865560891,
"grad_norm": 2.6740987300872803,
"learning_rate": 2.1397379912663756e-05,
"loss": 0.1144,
"step": 7040
},
{
"epoch": 6.154517677869926,
"grad_norm": 2.9361984729766846,
"learning_rate": 2.1348859776807376e-05,
"loss": 0.1058,
"step": 7050
},
{
"epoch": 6.163247490178962,
"grad_norm": 2.7780208587646484,
"learning_rate": 2.1300339640950993e-05,
"loss": 0.1391,
"step": 7060
},
{
"epoch": 6.1719773024879965,
"grad_norm": 3.0886313915252686,
"learning_rate": 2.1251819505094614e-05,
"loss": 0.134,
"step": 7070
},
{
"epoch": 6.180707114797032,
"grad_norm": 2.369558334350586,
"learning_rate": 2.1203299369238234e-05,
"loss": 0.1589,
"step": 7080
},
{
"epoch": 6.189436927106067,
"grad_norm": 2.826566219329834,
"learning_rate": 2.1154779233381855e-05,
"loss": 0.1125,
"step": 7090
},
{
"epoch": 6.198166739415103,
"grad_norm": 3.705179452896118,
"learning_rate": 2.1106259097525476e-05,
"loss": 0.1422,
"step": 7100
},
{
"epoch": 6.206896551724138,
"grad_norm": 2.0699045658111572,
"learning_rate": 2.1057738961669093e-05,
"loss": 0.1174,
"step": 7110
},
{
"epoch": 6.215626364033174,
"grad_norm": 1.7472636699676514,
"learning_rate": 2.1009218825812713e-05,
"loss": 0.1,
"step": 7120
},
{
"epoch": 6.224356176342209,
"grad_norm": 3.3158297538757324,
"learning_rate": 2.096069868995633e-05,
"loss": 0.1296,
"step": 7130
},
{
"epoch": 6.233085988651244,
"grad_norm": 3.7028303146362305,
"learning_rate": 2.0912178554099954e-05,
"loss": 0.1189,
"step": 7140
},
{
"epoch": 6.241815800960279,
"grad_norm": 3.309446334838867,
"learning_rate": 2.086365841824357e-05,
"loss": 0.1197,
"step": 7150
},
{
"epoch": 6.250545613269315,
"grad_norm": 2.997817277908325,
"learning_rate": 2.0815138282387192e-05,
"loss": 0.095,
"step": 7160
},
{
"epoch": 6.25927542557835,
"grad_norm": 3.9455857276916504,
"learning_rate": 2.0766618146530813e-05,
"loss": 0.1141,
"step": 7170
},
{
"epoch": 6.268005237887385,
"grad_norm": 2.8850817680358887,
"learning_rate": 2.071809801067443e-05,
"loss": 0.1241,
"step": 7180
},
{
"epoch": 6.276735050196421,
"grad_norm": 3.4822731018066406,
"learning_rate": 2.066957787481805e-05,
"loss": 0.1447,
"step": 7190
},
{
"epoch": 6.2854648625054566,
"grad_norm": 1.834179401397705,
"learning_rate": 2.0621057738961667e-05,
"loss": 0.113,
"step": 7200
},
{
"epoch": 6.2941946748144915,
"grad_norm": 2.8471169471740723,
"learning_rate": 2.057253760310529e-05,
"loss": 0.1095,
"step": 7210
},
{
"epoch": 6.302924487123526,
"grad_norm": 2.841275930404663,
"learning_rate": 2.052401746724891e-05,
"loss": 0.1174,
"step": 7220
},
{
"epoch": 6.311654299432562,
"grad_norm": 3.3309431076049805,
"learning_rate": 2.047549733139253e-05,
"loss": 0.1182,
"step": 7230
},
{
"epoch": 6.320384111741598,
"grad_norm": 2.214808225631714,
"learning_rate": 2.042697719553615e-05,
"loss": 0.1345,
"step": 7240
},
{
"epoch": 6.329113924050633,
"grad_norm": 2.0826220512390137,
"learning_rate": 2.0378457059679767e-05,
"loss": 0.1114,
"step": 7250
},
{
"epoch": 6.337843736359668,
"grad_norm": 2.8715829849243164,
"learning_rate": 2.0329936923823387e-05,
"loss": 0.1335,
"step": 7260
},
{
"epoch": 6.346573548668704,
"grad_norm": 2.2511582374572754,
"learning_rate": 2.0281416787967008e-05,
"loss": 0.1558,
"step": 7270
},
{
"epoch": 6.355303360977739,
"grad_norm": 3.4435458183288574,
"learning_rate": 2.023289665211063e-05,
"loss": 0.1433,
"step": 7280
},
{
"epoch": 6.364033173286774,
"grad_norm": 2.411850929260254,
"learning_rate": 2.0184376516254246e-05,
"loss": 0.1224,
"step": 7290
},
{
"epoch": 6.372762985595809,
"grad_norm": 2.7871601581573486,
"learning_rate": 2.0135856380397866e-05,
"loss": 0.1206,
"step": 7300
},
{
"epoch": 6.381492797904845,
"grad_norm": 2.9544012546539307,
"learning_rate": 2.0087336244541487e-05,
"loss": 0.133,
"step": 7310
},
{
"epoch": 6.390222610213881,
"grad_norm": 3.283802032470703,
"learning_rate": 2.0038816108685104e-05,
"loss": 0.1259,
"step": 7320
},
{
"epoch": 6.398952422522916,
"grad_norm": 2.1609585285186768,
"learning_rate": 1.9990295972828724e-05,
"loss": 0.1499,
"step": 7330
},
{
"epoch": 6.407682234831951,
"grad_norm": 2.2527894973754883,
"learning_rate": 1.9941775836972345e-05,
"loss": 0.1206,
"step": 7340
},
{
"epoch": 6.4164120471409865,
"grad_norm": 3.909329891204834,
"learning_rate": 1.9893255701115965e-05,
"loss": 0.099,
"step": 7350
},
{
"epoch": 6.425141859450022,
"grad_norm": 2.997509002685547,
"learning_rate": 1.9844735565259583e-05,
"loss": 0.1399,
"step": 7360
},
{
"epoch": 6.433871671759057,
"grad_norm": 2.639704465866089,
"learning_rate": 1.9796215429403203e-05,
"loss": 0.1224,
"step": 7370
},
{
"epoch": 6.442601484068092,
"grad_norm": 3.1105806827545166,
"learning_rate": 1.9747695293546824e-05,
"loss": 0.1106,
"step": 7380
},
{
"epoch": 6.451331296377128,
"grad_norm": 2.972954273223877,
"learning_rate": 1.969917515769044e-05,
"loss": 0.1216,
"step": 7390
},
{
"epoch": 6.460061108686164,
"grad_norm": 3.996022939682007,
"learning_rate": 1.965065502183406e-05,
"loss": 0.1279,
"step": 7400
},
{
"epoch": 6.468790920995199,
"grad_norm": 3.0898971557617188,
"learning_rate": 1.9602134885977682e-05,
"loss": 0.1247,
"step": 7410
},
{
"epoch": 6.4775207333042335,
"grad_norm": 2.5472044944763184,
"learning_rate": 1.9553614750121303e-05,
"loss": 0.1411,
"step": 7420
},
{
"epoch": 6.486250545613269,
"grad_norm": 3.311650514602661,
"learning_rate": 1.950509461426492e-05,
"loss": 0.1053,
"step": 7430
},
{
"epoch": 6.494980357922305,
"grad_norm": 2.9363017082214355,
"learning_rate": 1.945657447840854e-05,
"loss": 0.1191,
"step": 7440
},
{
"epoch": 6.50371017023134,
"grad_norm": 3.120633602142334,
"learning_rate": 1.940805434255216e-05,
"loss": 0.1202,
"step": 7450
},
{
"epoch": 6.512439982540375,
"grad_norm": 3.49874210357666,
"learning_rate": 1.9359534206695778e-05,
"loss": 0.1128,
"step": 7460
},
{
"epoch": 6.521169794849411,
"grad_norm": 2.98953914642334,
"learning_rate": 1.9311014070839402e-05,
"loss": 0.1191,
"step": 7470
},
{
"epoch": 6.5298996071584465,
"grad_norm": 2.440988779067993,
"learning_rate": 1.926249393498302e-05,
"loss": 0.12,
"step": 7480
},
{
"epoch": 6.538629419467481,
"grad_norm": 2.9546585083007812,
"learning_rate": 1.921397379912664e-05,
"loss": 0.143,
"step": 7490
},
{
"epoch": 6.547359231776516,
"grad_norm": 1.7305012941360474,
"learning_rate": 1.9165453663270257e-05,
"loss": 0.1083,
"step": 7500
},
{
"epoch": 6.556089044085552,
"grad_norm": 2.084421396255493,
"learning_rate": 1.9116933527413877e-05,
"loss": 0.1228,
"step": 7510
},
{
"epoch": 6.564818856394588,
"grad_norm": 3.237086057662964,
"learning_rate": 1.9068413391557498e-05,
"loss": 0.1265,
"step": 7520
},
{
"epoch": 6.573548668703623,
"grad_norm": 2.24908185005188,
"learning_rate": 1.9019893255701115e-05,
"loss": 0.0891,
"step": 7530
},
{
"epoch": 6.582278481012658,
"grad_norm": 3.3673548698425293,
"learning_rate": 1.897137311984474e-05,
"loss": 0.1377,
"step": 7540
},
{
"epoch": 6.591008293321694,
"grad_norm": 2.560291051864624,
"learning_rate": 1.8922852983988356e-05,
"loss": 0.1114,
"step": 7550
},
{
"epoch": 6.599738105630729,
"grad_norm": 1.9948253631591797,
"learning_rate": 1.8874332848131977e-05,
"loss": 0.1261,
"step": 7560
},
{
"epoch": 6.608467917939764,
"grad_norm": 3.7410624027252197,
"learning_rate": 1.8825812712275594e-05,
"loss": 0.1111,
"step": 7570
},
{
"epoch": 6.617197730248799,
"grad_norm": 3.520691156387329,
"learning_rate": 1.8777292576419214e-05,
"loss": 0.1396,
"step": 7580
},
{
"epoch": 6.625927542557835,
"grad_norm": 2.289947986602783,
"learning_rate": 1.8728772440562835e-05,
"loss": 0.1211,
"step": 7590
},
{
"epoch": 6.634657354866871,
"grad_norm": 2.078162431716919,
"learning_rate": 1.8680252304706455e-05,
"loss": 0.1258,
"step": 7600
},
{
"epoch": 6.643387167175906,
"grad_norm": 2.3327958583831787,
"learning_rate": 1.8631732168850076e-05,
"loss": 0.1177,
"step": 7610
},
{
"epoch": 6.652116979484941,
"grad_norm": 2.689931631088257,
"learning_rate": 1.8583212032993693e-05,
"loss": 0.1144,
"step": 7620
},
{
"epoch": 6.660846791793976,
"grad_norm": 2.395716428756714,
"learning_rate": 1.8534691897137314e-05,
"loss": 0.1202,
"step": 7630
},
{
"epoch": 6.669576604103012,
"grad_norm": 2.892514705657959,
"learning_rate": 1.848617176128093e-05,
"loss": 0.1227,
"step": 7640
},
{
"epoch": 6.678306416412047,
"grad_norm": 2.741924285888672,
"learning_rate": 1.843765162542455e-05,
"loss": 0.0994,
"step": 7650
},
{
"epoch": 6.687036228721082,
"grad_norm": 3.1253838539123535,
"learning_rate": 1.8389131489568172e-05,
"loss": 0.1212,
"step": 7660
},
{
"epoch": 6.695766041030118,
"grad_norm": 3.520061492919922,
"learning_rate": 1.8340611353711792e-05,
"loss": 0.1113,
"step": 7670
},
{
"epoch": 6.704495853339154,
"grad_norm": 1.9933674335479736,
"learning_rate": 1.8292091217855413e-05,
"loss": 0.0858,
"step": 7680
},
{
"epoch": 6.713225665648189,
"grad_norm": 2.0083909034729004,
"learning_rate": 1.824357108199903e-05,
"loss": 0.1199,
"step": 7690
},
{
"epoch": 6.7219554779572235,
"grad_norm": 3.1717681884765625,
"learning_rate": 1.819505094614265e-05,
"loss": 0.1207,
"step": 7700
},
{
"epoch": 6.730685290266259,
"grad_norm": 1.9426745176315308,
"learning_rate": 1.8146530810286268e-05,
"loss": 0.1157,
"step": 7710
},
{
"epoch": 6.739415102575295,
"grad_norm": 2.9203710556030273,
"learning_rate": 1.809801067442989e-05,
"loss": 0.1251,
"step": 7720
},
{
"epoch": 6.74814491488433,
"grad_norm": 3.200591564178467,
"learning_rate": 1.804949053857351e-05,
"loss": 0.1481,
"step": 7730
},
{
"epoch": 6.756874727193365,
"grad_norm": 4.032431125640869,
"learning_rate": 1.800097040271713e-05,
"loss": 0.135,
"step": 7740
},
{
"epoch": 6.765604539502401,
"grad_norm": 2.4872477054595947,
"learning_rate": 1.795245026686075e-05,
"loss": 0.0975,
"step": 7750
},
{
"epoch": 6.7743343518114365,
"grad_norm": 2.5336356163024902,
"learning_rate": 1.7903930131004367e-05,
"loss": 0.1311,
"step": 7760
},
{
"epoch": 6.783064164120471,
"grad_norm": 2.6946463584899902,
"learning_rate": 1.7855409995147988e-05,
"loss": 0.1073,
"step": 7770
},
{
"epoch": 6.791793976429506,
"grad_norm": 3.247837543487549,
"learning_rate": 1.7806889859291605e-05,
"loss": 0.1126,
"step": 7780
},
{
"epoch": 6.800523788738542,
"grad_norm": 2.301456928253174,
"learning_rate": 1.7758369723435225e-05,
"loss": 0.1314,
"step": 7790
},
{
"epoch": 6.809253601047578,
"grad_norm": 1.9753727912902832,
"learning_rate": 1.7709849587578846e-05,
"loss": 0.1,
"step": 7800
},
{
"epoch": 6.817983413356613,
"grad_norm": 2.194286584854126,
"learning_rate": 1.7661329451722467e-05,
"loss": 0.1114,
"step": 7810
},
{
"epoch": 6.826713225665648,
"grad_norm": 2.973609447479248,
"learning_rate": 1.7612809315866087e-05,
"loss": 0.1065,
"step": 7820
},
{
"epoch": 6.8354430379746836,
"grad_norm": 2.5955142974853516,
"learning_rate": 1.7564289180009704e-05,
"loss": 0.1326,
"step": 7830
},
{
"epoch": 6.844172850283719,
"grad_norm": 2.9520103931427,
"learning_rate": 1.7515769044153325e-05,
"loss": 0.1285,
"step": 7840
},
{
"epoch": 6.852902662592754,
"grad_norm": 3.955249071121216,
"learning_rate": 1.7467248908296942e-05,
"loss": 0.1029,
"step": 7850
},
{
"epoch": 6.861632474901789,
"grad_norm": 2.9063422679901123,
"learning_rate": 1.7418728772440563e-05,
"loss": 0.0971,
"step": 7860
},
{
"epoch": 6.870362287210825,
"grad_norm": 2.6652753353118896,
"learning_rate": 1.7370208636584183e-05,
"loss": 0.1306,
"step": 7870
},
{
"epoch": 6.879092099519861,
"grad_norm": 2.687307834625244,
"learning_rate": 1.7321688500727804e-05,
"loss": 0.1265,
"step": 7880
},
{
"epoch": 6.887821911828896,
"grad_norm": 2.639251708984375,
"learning_rate": 1.7273168364871424e-05,
"loss": 0.1162,
"step": 7890
},
{
"epoch": 6.896551724137931,
"grad_norm": 3.628904104232788,
"learning_rate": 1.722464822901504e-05,
"loss": 0.1368,
"step": 7900
},
{
"epoch": 6.905281536446966,
"grad_norm": 2.818596124649048,
"learning_rate": 1.7176128093158662e-05,
"loss": 0.1304,
"step": 7910
},
{
"epoch": 6.914011348756002,
"grad_norm": 2.2759499549865723,
"learning_rate": 1.712760795730228e-05,
"loss": 0.1264,
"step": 7920
},
{
"epoch": 6.922741161065037,
"grad_norm": 3.426044225692749,
"learning_rate": 1.7079087821445903e-05,
"loss": 0.1131,
"step": 7930
},
{
"epoch": 6.931470973374072,
"grad_norm": 1.9681727886199951,
"learning_rate": 1.703056768558952e-05,
"loss": 0.1231,
"step": 7940
},
{
"epoch": 6.940200785683108,
"grad_norm": 1.9072273969650269,
"learning_rate": 1.698204754973314e-05,
"loss": 0.096,
"step": 7950
},
{
"epoch": 6.948930597992144,
"grad_norm": 2.6058807373046875,
"learning_rate": 1.693352741387676e-05,
"loss": 0.1137,
"step": 7960
},
{
"epoch": 6.9576604103011785,
"grad_norm": 1.815313696861267,
"learning_rate": 1.688500727802038e-05,
"loss": 0.0869,
"step": 7970
},
{
"epoch": 6.9663902226102135,
"grad_norm": 3.1374306678771973,
"learning_rate": 1.6836487142164e-05,
"loss": 0.1256,
"step": 7980
},
{
"epoch": 6.975120034919249,
"grad_norm": 1.8264007568359375,
"learning_rate": 1.6787967006307616e-05,
"loss": 0.0917,
"step": 7990
},
{
"epoch": 6.983849847228285,
"grad_norm": 2.1605708599090576,
"learning_rate": 1.673944687045124e-05,
"loss": 0.134,
"step": 8000
},
{
"epoch": 6.99257965953732,
"grad_norm": 2.8631725311279297,
"learning_rate": 1.6690926734594857e-05,
"loss": 0.1275,
"step": 8010
},
{
"epoch": 6.999563509384548,
"eval_accuracy": 0.9620058924625583,
"eval_loss": 0.12257199734449387,
"eval_runtime": 61.6953,
"eval_samples_per_second": 264.072,
"eval_steps_per_second": 8.266,
"step": 8018
},
{
"epoch": 7.001309471846355,
"grad_norm": 2.7233829498291016,
"learning_rate": 1.6642406598738478e-05,
"loss": 0.1128,
"step": 8020
},
{
"epoch": 7.010039284155391,
"grad_norm": 2.6346898078918457,
"learning_rate": 1.6593886462882098e-05,
"loss": 0.107,
"step": 8030
},
{
"epoch": 7.018769096464426,
"grad_norm": 3.0289077758789062,
"learning_rate": 1.6545366327025715e-05,
"loss": 0.1282,
"step": 8040
},
{
"epoch": 7.027498908773461,
"grad_norm": 3.5590922832489014,
"learning_rate": 1.6496846191169336e-05,
"loss": 0.1248,
"step": 8050
},
{
"epoch": 7.036228721082496,
"grad_norm": 2.1778759956359863,
"learning_rate": 1.6448326055312957e-05,
"loss": 0.1206,
"step": 8060
},
{
"epoch": 7.044958533391532,
"grad_norm": 3.13328218460083,
"learning_rate": 1.6399805919456577e-05,
"loss": 0.1064,
"step": 8070
},
{
"epoch": 7.053688345700567,
"grad_norm": 2.862576484680176,
"learning_rate": 1.6351285783600194e-05,
"loss": 0.1133,
"step": 8080
},
{
"epoch": 7.062418158009603,
"grad_norm": 2.9792587757110596,
"learning_rate": 1.6302765647743815e-05,
"loss": 0.1542,
"step": 8090
},
{
"epoch": 7.071147970318638,
"grad_norm": 3.1806788444519043,
"learning_rate": 1.6254245511887435e-05,
"loss": 0.1284,
"step": 8100
},
{
"epoch": 7.0798777826276735,
"grad_norm": 2.1008007526397705,
"learning_rate": 1.6205725376031052e-05,
"loss": 0.0993,
"step": 8110
},
{
"epoch": 7.0886075949367084,
"grad_norm": 3.443948984146118,
"learning_rate": 1.6157205240174673e-05,
"loss": 0.1321,
"step": 8120
},
{
"epoch": 7.097337407245744,
"grad_norm": 3.075568675994873,
"learning_rate": 1.6108685104318294e-05,
"loss": 0.1135,
"step": 8130
},
{
"epoch": 7.106067219554779,
"grad_norm": 2.7205452919006348,
"learning_rate": 1.6060164968461914e-05,
"loss": 0.1045,
"step": 8140
},
{
"epoch": 7.114797031863815,
"grad_norm": 1.9306424856185913,
"learning_rate": 1.601164483260553e-05,
"loss": 0.1155,
"step": 8150
},
{
"epoch": 7.12352684417285,
"grad_norm": 1.848753571510315,
"learning_rate": 1.5963124696749152e-05,
"loss": 0.121,
"step": 8160
},
{
"epoch": 7.132256656481886,
"grad_norm": 2.816012144088745,
"learning_rate": 1.5914604560892772e-05,
"loss": 0.1195,
"step": 8170
},
{
"epoch": 7.140986468790921,
"grad_norm": 1.6042299270629883,
"learning_rate": 1.586608442503639e-05,
"loss": 0.109,
"step": 8180
},
{
"epoch": 7.149716281099956,
"grad_norm": 1.9858665466308594,
"learning_rate": 1.581756428918001e-05,
"loss": 0.1218,
"step": 8190
},
{
"epoch": 7.158446093408991,
"grad_norm": 3.661896228790283,
"learning_rate": 1.576904415332363e-05,
"loss": 0.1162,
"step": 8200
},
{
"epoch": 7.167175905718027,
"grad_norm": 3.4283740520477295,
"learning_rate": 1.572052401746725e-05,
"loss": 0.1192,
"step": 8210
},
{
"epoch": 7.175905718027062,
"grad_norm": 2.0922162532806396,
"learning_rate": 1.567200388161087e-05,
"loss": 0.1228,
"step": 8220
},
{
"epoch": 7.184635530336098,
"grad_norm": 3.018186330795288,
"learning_rate": 1.562348374575449e-05,
"loss": 0.1239,
"step": 8230
},
{
"epoch": 7.193365342645133,
"grad_norm": 3.431612491607666,
"learning_rate": 1.557496360989811e-05,
"loss": 0.1199,
"step": 8240
},
{
"epoch": 7.2020951549541685,
"grad_norm": 1.7211098670959473,
"learning_rate": 1.5526443474041727e-05,
"loss": 0.1241,
"step": 8250
},
{
"epoch": 7.210824967263203,
"grad_norm": 3.201613664627075,
"learning_rate": 1.547792333818535e-05,
"loss": 0.1528,
"step": 8260
},
{
"epoch": 7.219554779572239,
"grad_norm": 2.8337209224700928,
"learning_rate": 1.5429403202328968e-05,
"loss": 0.1037,
"step": 8270
},
{
"epoch": 7.228284591881274,
"grad_norm": 2.8004138469696045,
"learning_rate": 1.5380883066472588e-05,
"loss": 0.1221,
"step": 8280
},
{
"epoch": 7.23701440419031,
"grad_norm": 4.067490577697754,
"learning_rate": 1.5332362930616205e-05,
"loss": 0.1399,
"step": 8290
},
{
"epoch": 7.245744216499345,
"grad_norm": 3.5075736045837402,
"learning_rate": 1.5283842794759826e-05,
"loss": 0.1412,
"step": 8300
},
{
"epoch": 7.254474028808381,
"grad_norm": 3.4626917839050293,
"learning_rate": 1.5235322658903445e-05,
"loss": 0.1311,
"step": 8310
},
{
"epoch": 7.263203841117416,
"grad_norm": 3.102483034133911,
"learning_rate": 1.5186802523047064e-05,
"loss": 0.1054,
"step": 8320
},
{
"epoch": 7.271933653426451,
"grad_norm": 3.181889533996582,
"learning_rate": 1.5138282387190686e-05,
"loss": 0.1469,
"step": 8330
},
{
"epoch": 7.280663465735486,
"grad_norm": 2.642778158187866,
"learning_rate": 1.5089762251334305e-05,
"loss": 0.1359,
"step": 8340
},
{
"epoch": 7.289393278044522,
"grad_norm": 2.339479446411133,
"learning_rate": 1.5041242115477924e-05,
"loss": 0.0924,
"step": 8350
},
{
"epoch": 7.298123090353557,
"grad_norm": 3.010129690170288,
"learning_rate": 1.4992721979621544e-05,
"loss": 0.1211,
"step": 8360
},
{
"epoch": 7.306852902662593,
"grad_norm": 2.169443130493164,
"learning_rate": 1.4944201843765163e-05,
"loss": 0.0962,
"step": 8370
},
{
"epoch": 7.315582714971628,
"grad_norm": 2.3417961597442627,
"learning_rate": 1.4895681707908782e-05,
"loss": 0.1263,
"step": 8380
},
{
"epoch": 7.3243125272806635,
"grad_norm": 2.7159245014190674,
"learning_rate": 1.4847161572052404e-05,
"loss": 0.0773,
"step": 8390
},
{
"epoch": 7.333042339589698,
"grad_norm": 1.89850652217865,
"learning_rate": 1.4798641436196023e-05,
"loss": 0.1134,
"step": 8400
},
{
"epoch": 7.341772151898734,
"grad_norm": 2.856013536453247,
"learning_rate": 1.4750121300339642e-05,
"loss": 0.1278,
"step": 8410
},
{
"epoch": 7.350501964207769,
"grad_norm": 2.531646728515625,
"learning_rate": 1.470160116448326e-05,
"loss": 0.1234,
"step": 8420
},
{
"epoch": 7.359231776516805,
"grad_norm": 3.2163310050964355,
"learning_rate": 1.4653081028626881e-05,
"loss": 0.1112,
"step": 8430
},
{
"epoch": 7.36796158882584,
"grad_norm": 2.611832618713379,
"learning_rate": 1.46045608927705e-05,
"loss": 0.1146,
"step": 8440
},
{
"epoch": 7.376691401134876,
"grad_norm": 3.106451988220215,
"learning_rate": 1.4556040756914119e-05,
"loss": 0.1088,
"step": 8450
},
{
"epoch": 7.385421213443911,
"grad_norm": 2.7591700553894043,
"learning_rate": 1.4507520621057741e-05,
"loss": 0.1313,
"step": 8460
},
{
"epoch": 7.394151025752946,
"grad_norm": 2.3061065673828125,
"learning_rate": 1.445900048520136e-05,
"loss": 0.1187,
"step": 8470
},
{
"epoch": 7.402880838061981,
"grad_norm": 2.7407071590423584,
"learning_rate": 1.4410480349344979e-05,
"loss": 0.1076,
"step": 8480
},
{
"epoch": 7.411610650371017,
"grad_norm": 3.5780117511749268,
"learning_rate": 1.4361960213488598e-05,
"loss": 0.1261,
"step": 8490
},
{
"epoch": 7.420340462680052,
"grad_norm": 2.2158961296081543,
"learning_rate": 1.4313440077632218e-05,
"loss": 0.1042,
"step": 8500
},
{
"epoch": 7.429070274989088,
"grad_norm": 3.758617639541626,
"learning_rate": 1.4264919941775837e-05,
"loss": 0.1146,
"step": 8510
},
{
"epoch": 7.437800087298123,
"grad_norm": 3.035114049911499,
"learning_rate": 1.421639980591946e-05,
"loss": 0.1149,
"step": 8520
},
{
"epoch": 7.4465298996071585,
"grad_norm": 3.016139030456543,
"learning_rate": 1.4167879670063078e-05,
"loss": 0.0983,
"step": 8530
},
{
"epoch": 7.455259711916193,
"grad_norm": 1.1190143823623657,
"learning_rate": 1.4119359534206697e-05,
"loss": 0.1016,
"step": 8540
},
{
"epoch": 7.463989524225229,
"grad_norm": 1.6610057353973389,
"learning_rate": 1.4070839398350316e-05,
"loss": 0.1098,
"step": 8550
},
{
"epoch": 7.472719336534264,
"grad_norm": 2.246140956878662,
"learning_rate": 1.4022319262493935e-05,
"loss": 0.1226,
"step": 8560
},
{
"epoch": 7.4814491488433,
"grad_norm": 3.7668988704681396,
"learning_rate": 1.3973799126637555e-05,
"loss": 0.0909,
"step": 8570
},
{
"epoch": 7.490178961152335,
"grad_norm": 1.9712340831756592,
"learning_rate": 1.3925278990781174e-05,
"loss": 0.1065,
"step": 8580
},
{
"epoch": 7.498908773461371,
"grad_norm": 2.9757723808288574,
"learning_rate": 1.3876758854924796e-05,
"loss": 0.1222,
"step": 8590
},
{
"epoch": 7.5076385857704055,
"grad_norm": 3.4910058975219727,
"learning_rate": 1.3828238719068415e-05,
"loss": 0.1156,
"step": 8600
},
{
"epoch": 7.516368398079441,
"grad_norm": 2.6508007049560547,
"learning_rate": 1.3779718583212034e-05,
"loss": 0.099,
"step": 8610
},
{
"epoch": 7.525098210388476,
"grad_norm": 1.8022457361221313,
"learning_rate": 1.3731198447355653e-05,
"loss": 0.0829,
"step": 8620
},
{
"epoch": 7.533828022697512,
"grad_norm": 2.3336634635925293,
"learning_rate": 1.3682678311499272e-05,
"loss": 0.1409,
"step": 8630
},
{
"epoch": 7.542557835006547,
"grad_norm": 3.1852900981903076,
"learning_rate": 1.3634158175642892e-05,
"loss": 0.125,
"step": 8640
},
{
"epoch": 7.551287647315583,
"grad_norm": 2.8601016998291016,
"learning_rate": 1.3585638039786511e-05,
"loss": 0.1035,
"step": 8650
},
{
"epoch": 7.560017459624618,
"grad_norm": 2.5289573669433594,
"learning_rate": 1.3537117903930133e-05,
"loss": 0.1093,
"step": 8660
},
{
"epoch": 7.5687472719336535,
"grad_norm": 2.4447848796844482,
"learning_rate": 1.3488597768073752e-05,
"loss": 0.1076,
"step": 8670
},
{
"epoch": 7.577477084242688,
"grad_norm": 3.016014337539673,
"learning_rate": 1.3440077632217371e-05,
"loss": 0.1189,
"step": 8680
},
{
"epoch": 7.586206896551724,
"grad_norm": 2.6992433071136475,
"learning_rate": 1.339155749636099e-05,
"loss": 0.1088,
"step": 8690
},
{
"epoch": 7.594936708860759,
"grad_norm": 2.2244362831115723,
"learning_rate": 1.3343037360504609e-05,
"loss": 0.0965,
"step": 8700
},
{
"epoch": 7.603666521169795,
"grad_norm": 2.8653414249420166,
"learning_rate": 1.329451722464823e-05,
"loss": 0.1022,
"step": 8710
},
{
"epoch": 7.61239633347883,
"grad_norm": 3.810765027999878,
"learning_rate": 1.324599708879185e-05,
"loss": 0.1208,
"step": 8720
},
{
"epoch": 7.621126145787866,
"grad_norm": 2.089237689971924,
"learning_rate": 1.319747695293547e-05,
"loss": 0.0989,
"step": 8730
},
{
"epoch": 7.6298559580969005,
"grad_norm": 1.9120042324066162,
"learning_rate": 1.314895681707909e-05,
"loss": 0.112,
"step": 8740
},
{
"epoch": 7.638585770405936,
"grad_norm": 2.164149284362793,
"learning_rate": 1.3100436681222708e-05,
"loss": 0.1183,
"step": 8750
},
{
"epoch": 7.647315582714971,
"grad_norm": 1.7820502519607544,
"learning_rate": 1.3051916545366327e-05,
"loss": 0.1187,
"step": 8760
},
{
"epoch": 7.656045395024007,
"grad_norm": 3.0987606048583984,
"learning_rate": 1.3003396409509946e-05,
"loss": 0.1192,
"step": 8770
},
{
"epoch": 7.664775207333042,
"grad_norm": 2.332767963409424,
"learning_rate": 1.2954876273653566e-05,
"loss": 0.1219,
"step": 8780
},
{
"epoch": 7.673505019642078,
"grad_norm": 2.8338541984558105,
"learning_rate": 1.2906356137797187e-05,
"loss": 0.0953,
"step": 8790
},
{
"epoch": 7.682234831951113,
"grad_norm": 2.000577926635742,
"learning_rate": 1.2857836001940808e-05,
"loss": 0.0969,
"step": 8800
},
{
"epoch": 7.6909646442601485,
"grad_norm": 3.364076614379883,
"learning_rate": 1.2809315866084426e-05,
"loss": 0.1117,
"step": 8810
},
{
"epoch": 7.699694456569183,
"grad_norm": 4.16958475112915,
"learning_rate": 1.2760795730228045e-05,
"loss": 0.1216,
"step": 8820
},
{
"epoch": 7.708424268878219,
"grad_norm": 2.2611634731292725,
"learning_rate": 1.2712275594371664e-05,
"loss": 0.0929,
"step": 8830
},
{
"epoch": 7.717154081187254,
"grad_norm": 2.592312812805176,
"learning_rate": 1.2663755458515283e-05,
"loss": 0.1147,
"step": 8840
},
{
"epoch": 7.72588389349629,
"grad_norm": 3.680349588394165,
"learning_rate": 1.2615235322658905e-05,
"loss": 0.1184,
"step": 8850
},
{
"epoch": 7.734613705805325,
"grad_norm": 3.146328926086426,
"learning_rate": 1.2566715186802524e-05,
"loss": 0.1083,
"step": 8860
},
{
"epoch": 7.743343518114361,
"grad_norm": 3.165249824523926,
"learning_rate": 1.2518195050946145e-05,
"loss": 0.1303,
"step": 8870
},
{
"epoch": 7.7520733304233955,
"grad_norm": 3.2401604652404785,
"learning_rate": 1.2469674915089763e-05,
"loss": 0.1032,
"step": 8880
},
{
"epoch": 7.760803142732431,
"grad_norm": 1.9936988353729248,
"learning_rate": 1.2421154779233382e-05,
"loss": 0.1187,
"step": 8890
},
{
"epoch": 7.769532955041466,
"grad_norm": 2.625159502029419,
"learning_rate": 1.2372634643377003e-05,
"loss": 0.1062,
"step": 8900
},
{
"epoch": 7.778262767350502,
"grad_norm": 1.672642707824707,
"learning_rate": 1.2324114507520622e-05,
"loss": 0.1183,
"step": 8910
},
{
"epoch": 7.786992579659537,
"grad_norm": 2.378697156906128,
"learning_rate": 1.227559437166424e-05,
"loss": 0.1197,
"step": 8920
},
{
"epoch": 7.795722391968573,
"grad_norm": 2.9251904487609863,
"learning_rate": 1.222707423580786e-05,
"loss": 0.1141,
"step": 8930
},
{
"epoch": 7.804452204277608,
"grad_norm": 1.902925968170166,
"learning_rate": 1.217855409995148e-05,
"loss": 0.1034,
"step": 8940
},
{
"epoch": 7.8131820165866435,
"grad_norm": 2.213522434234619,
"learning_rate": 1.21300339640951e-05,
"loss": 0.1124,
"step": 8950
},
{
"epoch": 7.821911828895678,
"grad_norm": 3.5406954288482666,
"learning_rate": 1.208151382823872e-05,
"loss": 0.0988,
"step": 8960
},
{
"epoch": 7.830641641204714,
"grad_norm": 2.070265531539917,
"learning_rate": 1.203299369238234e-05,
"loss": 0.1182,
"step": 8970
},
{
"epoch": 7.839371453513749,
"grad_norm": 3.4907076358795166,
"learning_rate": 1.1984473556525959e-05,
"loss": 0.1009,
"step": 8980
},
{
"epoch": 7.848101265822785,
"grad_norm": 2.2645416259765625,
"learning_rate": 1.1935953420669578e-05,
"loss": 0.1021,
"step": 8990
},
{
"epoch": 7.85683107813182,
"grad_norm": 2.2879273891448975,
"learning_rate": 1.1887433284813198e-05,
"loss": 0.1166,
"step": 9000
},
{
"epoch": 7.865560890440856,
"grad_norm": 2.280569553375244,
"learning_rate": 1.1838913148956817e-05,
"loss": 0.1145,
"step": 9010
},
{
"epoch": 7.8742907027498905,
"grad_norm": 2.2898128032684326,
"learning_rate": 1.1790393013100438e-05,
"loss": 0.095,
"step": 9020
},
{
"epoch": 7.883020515058926,
"grad_norm": 2.112298011779785,
"learning_rate": 1.1741872877244056e-05,
"loss": 0.0834,
"step": 9030
},
{
"epoch": 7.891750327367961,
"grad_norm": 3.349541425704956,
"learning_rate": 1.1693352741387677e-05,
"loss": 0.1125,
"step": 9040
},
{
"epoch": 7.900480139676997,
"grad_norm": 2.482618570327759,
"learning_rate": 1.1644832605531296e-05,
"loss": 0.1059,
"step": 9050
},
{
"epoch": 7.909209951986032,
"grad_norm": 2.8729076385498047,
"learning_rate": 1.1596312469674915e-05,
"loss": 0.1312,
"step": 9060
},
{
"epoch": 7.917939764295068,
"grad_norm": 2.159461498260498,
"learning_rate": 1.1547792333818535e-05,
"loss": 0.1032,
"step": 9070
},
{
"epoch": 7.926669576604103,
"grad_norm": 1.993208646774292,
"learning_rate": 1.1499272197962154e-05,
"loss": 0.1139,
"step": 9080
},
{
"epoch": 7.9353993889131385,
"grad_norm": 2.4738166332244873,
"learning_rate": 1.1450752062105775e-05,
"loss": 0.1021,
"step": 9090
},
{
"epoch": 7.944129201222173,
"grad_norm": 2.5491795539855957,
"learning_rate": 1.1402231926249395e-05,
"loss": 0.1195,
"step": 9100
},
{
"epoch": 7.952859013531209,
"grad_norm": 2.6320948600769043,
"learning_rate": 1.1353711790393014e-05,
"loss": 0.0882,
"step": 9110
},
{
"epoch": 7.961588825840244,
"grad_norm": 2.642817497253418,
"learning_rate": 1.1305191654536633e-05,
"loss": 0.1278,
"step": 9120
},
{
"epoch": 7.97031863814928,
"grad_norm": 1.8427996635437012,
"learning_rate": 1.1256671518680253e-05,
"loss": 0.1091,
"step": 9130
},
{
"epoch": 7.979048450458315,
"grad_norm": 3.3067104816436768,
"learning_rate": 1.1208151382823872e-05,
"loss": 0.1177,
"step": 9140
},
{
"epoch": 7.987778262767351,
"grad_norm": 1.9407857656478882,
"learning_rate": 1.1159631246967491e-05,
"loss": 0.1146,
"step": 9150
},
{
"epoch": 7.9965080750763855,
"grad_norm": 2.7859206199645996,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.1323,
"step": 9160
},
{
"epoch": 8.0,
"eval_accuracy": 0.9777191259513872,
"eval_loss": 0.07020638883113861,
"eval_runtime": 60.5088,
"eval_samples_per_second": 269.25,
"eval_steps_per_second": 8.429,
"step": 9164
},
{
"epoch": 8.005237887385421,
"grad_norm": 3.0209567546844482,
"learning_rate": 1.1062590975254732e-05,
"loss": 0.1044,
"step": 9170
},
{
"epoch": 8.013967699694456,
"grad_norm": 3.3612194061279297,
"learning_rate": 1.1014070839398351e-05,
"loss": 0.1367,
"step": 9180
},
{
"epoch": 8.022697512003491,
"grad_norm": 3.807859182357788,
"learning_rate": 1.096555070354197e-05,
"loss": 0.0977,
"step": 9190
},
{
"epoch": 8.031427324312528,
"grad_norm": 1.820942759513855,
"learning_rate": 1.091703056768559e-05,
"loss": 0.1027,
"step": 9200
},
{
"epoch": 8.040157136621563,
"grad_norm": 1.7831873893737793,
"learning_rate": 1.086851043182921e-05,
"loss": 0.114,
"step": 9210
},
{
"epoch": 8.048886948930598,
"grad_norm": 2.5594382286071777,
"learning_rate": 1.0819990295972828e-05,
"loss": 0.1036,
"step": 9220
},
{
"epoch": 8.057616761239633,
"grad_norm": 2.1451315879821777,
"learning_rate": 1.0771470160116449e-05,
"loss": 0.1002,
"step": 9230
},
{
"epoch": 8.06634657354867,
"grad_norm": 2.5839290618896484,
"learning_rate": 1.072295002426007e-05,
"loss": 0.1026,
"step": 9240
},
{
"epoch": 8.075076385857704,
"grad_norm": 2.6166088581085205,
"learning_rate": 1.0674429888403688e-05,
"loss": 0.1,
"step": 9250
},
{
"epoch": 8.083806198166739,
"grad_norm": 3.226916790008545,
"learning_rate": 1.0625909752547307e-05,
"loss": 0.0857,
"step": 9260
},
{
"epoch": 8.092536010475774,
"grad_norm": 2.4332773685455322,
"learning_rate": 1.0577389616690927e-05,
"loss": 0.0862,
"step": 9270
},
{
"epoch": 8.10126582278481,
"grad_norm": 2.966545820236206,
"learning_rate": 1.0528869480834546e-05,
"loss": 0.1025,
"step": 9280
},
{
"epoch": 8.109995635093846,
"grad_norm": 2.112661123275757,
"learning_rate": 1.0480349344978165e-05,
"loss": 0.0996,
"step": 9290
},
{
"epoch": 8.11872544740288,
"grad_norm": 3.0900607109069824,
"learning_rate": 1.0431829209121786e-05,
"loss": 0.1052,
"step": 9300
},
{
"epoch": 8.127455259711915,
"grad_norm": 2.729537010192871,
"learning_rate": 1.0383309073265406e-05,
"loss": 0.1012,
"step": 9310
},
{
"epoch": 8.136185072020952,
"grad_norm": 2.780118227005005,
"learning_rate": 1.0334788937409025e-05,
"loss": 0.1136,
"step": 9320
},
{
"epoch": 8.144914884329987,
"grad_norm": 1.7321745157241821,
"learning_rate": 1.0286268801552646e-05,
"loss": 0.1115,
"step": 9330
},
{
"epoch": 8.153644696639022,
"grad_norm": 2.258513927459717,
"learning_rate": 1.0237748665696265e-05,
"loss": 0.1353,
"step": 9340
},
{
"epoch": 8.162374508948057,
"grad_norm": 3.355522632598877,
"learning_rate": 1.0189228529839883e-05,
"loss": 0.0798,
"step": 9350
},
{
"epoch": 8.171104321257094,
"grad_norm": 1.8760606050491333,
"learning_rate": 1.0140708393983504e-05,
"loss": 0.1136,
"step": 9360
},
{
"epoch": 8.179834133566128,
"grad_norm": 2.638148546218872,
"learning_rate": 1.0092188258127123e-05,
"loss": 0.1096,
"step": 9370
},
{
"epoch": 8.188563945875163,
"grad_norm": 3.003227472305298,
"learning_rate": 1.0043668122270743e-05,
"loss": 0.1027,
"step": 9380
},
{
"epoch": 8.197293758184198,
"grad_norm": 2.2695999145507812,
"learning_rate": 9.995147986414362e-06,
"loss": 0.1027,
"step": 9390
},
{
"epoch": 8.206023570493235,
"grad_norm": 2.753089666366577,
"learning_rate": 9.946627850557983e-06,
"loss": 0.1131,
"step": 9400
},
{
"epoch": 8.21475338280227,
"grad_norm": 2.087954521179199,
"learning_rate": 9.898107714701602e-06,
"loss": 0.1096,
"step": 9410
},
{
"epoch": 8.223483195111305,
"grad_norm": 2.1234874725341797,
"learning_rate": 9.84958757884522e-06,
"loss": 0.1281,
"step": 9420
},
{
"epoch": 8.23221300742034,
"grad_norm": 2.573425531387329,
"learning_rate": 9.801067442988841e-06,
"loss": 0.1221,
"step": 9430
},
{
"epoch": 8.240942819729376,
"grad_norm": 2.972698211669922,
"learning_rate": 9.75254730713246e-06,
"loss": 0.1063,
"step": 9440
},
{
"epoch": 8.249672632038411,
"grad_norm": 3.052992105484009,
"learning_rate": 9.70402717127608e-06,
"loss": 0.1117,
"step": 9450
},
{
"epoch": 8.258402444347446,
"grad_norm": 2.1158103942871094,
"learning_rate": 9.655507035419701e-06,
"loss": 0.092,
"step": 9460
},
{
"epoch": 8.267132256656481,
"grad_norm": 2.4370791912078857,
"learning_rate": 9.60698689956332e-06,
"loss": 0.1092,
"step": 9470
},
{
"epoch": 8.275862068965518,
"grad_norm": 3.0041496753692627,
"learning_rate": 9.558466763706939e-06,
"loss": 0.0869,
"step": 9480
},
{
"epoch": 8.284591881274553,
"grad_norm": 1.6758767366409302,
"learning_rate": 9.509946627850557e-06,
"loss": 0.1012,
"step": 9490
},
{
"epoch": 8.293321693583588,
"grad_norm": 2.937406301498413,
"learning_rate": 9.461426491994178e-06,
"loss": 0.1152,
"step": 9500
},
{
"epoch": 8.302051505892623,
"grad_norm": 2.93967866897583,
"learning_rate": 9.412906356137797e-06,
"loss": 0.099,
"step": 9510
},
{
"epoch": 8.31078131820166,
"grad_norm": 2.0390257835388184,
"learning_rate": 9.364386220281417e-06,
"loss": 0.0926,
"step": 9520
},
{
"epoch": 8.319511130510694,
"grad_norm": 2.6134767532348633,
"learning_rate": 9.315866084425038e-06,
"loss": 0.1139,
"step": 9530
},
{
"epoch": 8.328240942819729,
"grad_norm": 3.398548126220703,
"learning_rate": 9.267345948568657e-06,
"loss": 0.1051,
"step": 9540
},
{
"epoch": 8.336970755128764,
"grad_norm": 3.847663164138794,
"learning_rate": 9.218825812712276e-06,
"loss": 0.119,
"step": 9550
},
{
"epoch": 8.3457005674378,
"grad_norm": 3.1890709400177,
"learning_rate": 9.170305676855896e-06,
"loss": 0.1459,
"step": 9560
},
{
"epoch": 8.354430379746836,
"grad_norm": 1.5987610816955566,
"learning_rate": 9.121785540999515e-06,
"loss": 0.0985,
"step": 9570
},
{
"epoch": 8.36316019205587,
"grad_norm": 2.2405405044555664,
"learning_rate": 9.073265405143134e-06,
"loss": 0.1044,
"step": 9580
},
{
"epoch": 8.371890004364905,
"grad_norm": 2.4690017700195312,
"learning_rate": 9.024745269286754e-06,
"loss": 0.0976,
"step": 9590
},
{
"epoch": 8.380619816673942,
"grad_norm": 2.168797254562378,
"learning_rate": 8.976225133430375e-06,
"loss": 0.0933,
"step": 9600
},
{
"epoch": 8.389349628982977,
"grad_norm": 2.74342942237854,
"learning_rate": 8.927704997573994e-06,
"loss": 0.099,
"step": 9610
},
{
"epoch": 8.398079441292012,
"grad_norm": 3.6532704830169678,
"learning_rate": 8.879184861717613e-06,
"loss": 0.1303,
"step": 9620
},
{
"epoch": 8.406809253601047,
"grad_norm": 2.5002212524414062,
"learning_rate": 8.830664725861233e-06,
"loss": 0.1139,
"step": 9630
},
{
"epoch": 8.415539065910083,
"grad_norm": 2.5484557151794434,
"learning_rate": 8.782144590004852e-06,
"loss": 0.1261,
"step": 9640
},
{
"epoch": 8.424268878219118,
"grad_norm": 2.8932912349700928,
"learning_rate": 8.733624454148471e-06,
"loss": 0.1126,
"step": 9650
},
{
"epoch": 8.432998690528153,
"grad_norm": 2.334230899810791,
"learning_rate": 8.685104318292092e-06,
"loss": 0.1021,
"step": 9660
},
{
"epoch": 8.441728502837188,
"grad_norm": 2.884859561920166,
"learning_rate": 8.636584182435712e-06,
"loss": 0.0958,
"step": 9670
},
{
"epoch": 8.450458315146225,
"grad_norm": 1.8755512237548828,
"learning_rate": 8.588064046579331e-06,
"loss": 0.0928,
"step": 9680
},
{
"epoch": 8.45918812745526,
"grad_norm": 2.3897783756256104,
"learning_rate": 8.539543910722951e-06,
"loss": 0.0897,
"step": 9690
},
{
"epoch": 8.467917939764295,
"grad_norm": 3.4187004566192627,
"learning_rate": 8.49102377486657e-06,
"loss": 0.0978,
"step": 9700
},
{
"epoch": 8.47664775207333,
"grad_norm": 3.359574317932129,
"learning_rate": 8.44250363901019e-06,
"loss": 0.1029,
"step": 9710
},
{
"epoch": 8.485377564382366,
"grad_norm": 2.7510204315185547,
"learning_rate": 8.393983503153808e-06,
"loss": 0.1115,
"step": 9720
},
{
"epoch": 8.494107376691401,
"grad_norm": 2.2325894832611084,
"learning_rate": 8.345463367297429e-06,
"loss": 0.1237,
"step": 9730
},
{
"epoch": 8.502837189000436,
"grad_norm": 2.400143623352051,
"learning_rate": 8.296943231441049e-06,
"loss": 0.1202,
"step": 9740
},
{
"epoch": 8.511567001309471,
"grad_norm": 3.032205581665039,
"learning_rate": 8.248423095584668e-06,
"loss": 0.092,
"step": 9750
},
{
"epoch": 8.520296813618508,
"grad_norm": 3.307790517807007,
"learning_rate": 8.199902959728289e-06,
"loss": 0.1069,
"step": 9760
},
{
"epoch": 8.529026625927543,
"grad_norm": 3.3414857387542725,
"learning_rate": 8.151382823871907e-06,
"loss": 0.1315,
"step": 9770
},
{
"epoch": 8.537756438236578,
"grad_norm": 2.243468999862671,
"learning_rate": 8.102862688015526e-06,
"loss": 0.0986,
"step": 9780
},
{
"epoch": 8.546486250545613,
"grad_norm": 2.635031223297119,
"learning_rate": 8.054342552159147e-06,
"loss": 0.1067,
"step": 9790
},
{
"epoch": 8.55521606285465,
"grad_norm": 2.037966728210449,
"learning_rate": 8.005822416302766e-06,
"loss": 0.1053,
"step": 9800
},
{
"epoch": 8.563945875163684,
"grad_norm": 3.361685276031494,
"learning_rate": 7.957302280446386e-06,
"loss": 0.1166,
"step": 9810
},
{
"epoch": 8.572675687472719,
"grad_norm": 1.941724419593811,
"learning_rate": 7.908782144590005e-06,
"loss": 0.0954,
"step": 9820
},
{
"epoch": 8.581405499781754,
"grad_norm": 2.7277681827545166,
"learning_rate": 7.860262008733626e-06,
"loss": 0.1212,
"step": 9830
},
{
"epoch": 8.59013531209079,
"grad_norm": 2.5248844623565674,
"learning_rate": 7.811741872877244e-06,
"loss": 0.1009,
"step": 9840
},
{
"epoch": 8.598865124399826,
"grad_norm": 0.636101245880127,
"learning_rate": 7.763221737020863e-06,
"loss": 0.0992,
"step": 9850
},
{
"epoch": 8.60759493670886,
"grad_norm": 2.8368330001831055,
"learning_rate": 7.714701601164484e-06,
"loss": 0.0886,
"step": 9860
},
{
"epoch": 8.616324749017895,
"grad_norm": 2.3569271564483643,
"learning_rate": 7.666181465308103e-06,
"loss": 0.0835,
"step": 9870
},
{
"epoch": 8.625054561326932,
"grad_norm": 2.118471622467041,
"learning_rate": 7.617661329451722e-06,
"loss": 0.1155,
"step": 9880
},
{
"epoch": 8.633784373635967,
"grad_norm": 1.9520134925842285,
"learning_rate": 7.569141193595343e-06,
"loss": 0.0935,
"step": 9890
},
{
"epoch": 8.642514185945002,
"grad_norm": 2.778207540512085,
"learning_rate": 7.520621057738962e-06,
"loss": 0.1167,
"step": 9900
},
{
"epoch": 8.651243998254037,
"grad_norm": 3.1950080394744873,
"learning_rate": 7.4721009218825815e-06,
"loss": 0.124,
"step": 9910
},
{
"epoch": 8.659973810563073,
"grad_norm": 1.913509726524353,
"learning_rate": 7.423580786026202e-06,
"loss": 0.0922,
"step": 9920
},
{
"epoch": 8.668703622872108,
"grad_norm": 2.9603874683380127,
"learning_rate": 7.375060650169821e-06,
"loss": 0.1225,
"step": 9930
},
{
"epoch": 8.677433435181143,
"grad_norm": 2.5559980869293213,
"learning_rate": 7.326540514313441e-06,
"loss": 0.1159,
"step": 9940
},
{
"epoch": 8.686163247490178,
"grad_norm": 2.817579507827759,
"learning_rate": 7.2780203784570594e-06,
"loss": 0.069,
"step": 9950
},
{
"epoch": 8.694893059799215,
"grad_norm": 2.3405838012695312,
"learning_rate": 7.22950024260068e-06,
"loss": 0.083,
"step": 9960
},
{
"epoch": 8.70362287210825,
"grad_norm": 4.057965278625488,
"learning_rate": 7.180980106744299e-06,
"loss": 0.1346,
"step": 9970
},
{
"epoch": 8.712352684417285,
"grad_norm": 3.50754976272583,
"learning_rate": 7.1324599708879185e-06,
"loss": 0.113,
"step": 9980
},
{
"epoch": 8.72108249672632,
"grad_norm": 2.4381942749023438,
"learning_rate": 7.083939835031539e-06,
"loss": 0.0959,
"step": 9990
},
{
"epoch": 8.729812309035356,
"grad_norm": 1.3491463661193848,
"learning_rate": 7.035419699175158e-06,
"loss": 0.1141,
"step": 10000
},
{
"epoch": 8.738542121344391,
"grad_norm": 3.835019588470459,
"learning_rate": 6.986899563318778e-06,
"loss": 0.0997,
"step": 10010
},
{
"epoch": 8.747271933653426,
"grad_norm": 1.8673381805419922,
"learning_rate": 6.938379427462398e-06,
"loss": 0.1044,
"step": 10020
},
{
"epoch": 8.756001745962461,
"grad_norm": 3.303786039352417,
"learning_rate": 6.889859291606017e-06,
"loss": 0.1368,
"step": 10030
},
{
"epoch": 8.764731558271498,
"grad_norm": 2.6152758598327637,
"learning_rate": 6.841339155749636e-06,
"loss": 0.1186,
"step": 10040
},
{
"epoch": 8.773461370580533,
"grad_norm": 3.460881471633911,
"learning_rate": 6.792819019893256e-06,
"loss": 0.1263,
"step": 10050
},
{
"epoch": 8.782191182889568,
"grad_norm": 2.3491194248199463,
"learning_rate": 6.744298884036876e-06,
"loss": 0.1147,
"step": 10060
},
{
"epoch": 8.790920995198602,
"grad_norm": 2.6336238384246826,
"learning_rate": 6.695778748180495e-06,
"loss": 0.1044,
"step": 10070
},
{
"epoch": 8.79965080750764,
"grad_norm": 2.4493720531463623,
"learning_rate": 6.647258612324115e-06,
"loss": 0.0914,
"step": 10080
},
{
"epoch": 8.808380619816674,
"grad_norm": 2.0005953311920166,
"learning_rate": 6.598738476467735e-06,
"loss": 0.137,
"step": 10090
},
{
"epoch": 8.817110432125709,
"grad_norm": 2.4096193313598633,
"learning_rate": 6.550218340611354e-06,
"loss": 0.1235,
"step": 10100
},
{
"epoch": 8.825840244434744,
"grad_norm": 2.0157690048217773,
"learning_rate": 6.501698204754973e-06,
"loss": 0.0981,
"step": 10110
},
{
"epoch": 8.83457005674378,
"grad_norm": 1.8452261686325073,
"learning_rate": 6.4531780688985935e-06,
"loss": 0.0971,
"step": 10120
},
{
"epoch": 8.843299869052816,
"grad_norm": 2.7903761863708496,
"learning_rate": 6.404657933042213e-06,
"loss": 0.0935,
"step": 10130
},
{
"epoch": 8.85202968136185,
"grad_norm": 2.508514165878296,
"learning_rate": 6.356137797185832e-06,
"loss": 0.0991,
"step": 10140
},
{
"epoch": 8.860759493670885,
"grad_norm": 2.214094638824463,
"learning_rate": 6.307617661329453e-06,
"loss": 0.11,
"step": 10150
},
{
"epoch": 8.869489305979922,
"grad_norm": 2.2677242755889893,
"learning_rate": 6.259097525473072e-06,
"loss": 0.1491,
"step": 10160
},
{
"epoch": 8.878219118288957,
"grad_norm": 1.6787909269332886,
"learning_rate": 6.210577389616691e-06,
"loss": 0.1153,
"step": 10170
},
{
"epoch": 8.886948930597992,
"grad_norm": 2.0860350131988525,
"learning_rate": 6.162057253760311e-06,
"loss": 0.0906,
"step": 10180
},
{
"epoch": 8.895678742907027,
"grad_norm": 2.6599624156951904,
"learning_rate": 6.11353711790393e-06,
"loss": 0.1011,
"step": 10190
},
{
"epoch": 8.904408555216063,
"grad_norm": 2.9594058990478516,
"learning_rate": 6.06501698204755e-06,
"loss": 0.1179,
"step": 10200
},
{
"epoch": 8.913138367525098,
"grad_norm": 2.69802188873291,
"learning_rate": 6.01649684619117e-06,
"loss": 0.1086,
"step": 10210
},
{
"epoch": 8.921868179834133,
"grad_norm": 2.7719838619232178,
"learning_rate": 5.967976710334789e-06,
"loss": 0.1064,
"step": 10220
},
{
"epoch": 8.930597992143168,
"grad_norm": 2.729365587234497,
"learning_rate": 5.9194565744784085e-06,
"loss": 0.108,
"step": 10230
},
{
"epoch": 8.939327804452205,
"grad_norm": 3.4504830837249756,
"learning_rate": 5.870936438622028e-06,
"loss": 0.095,
"step": 10240
},
{
"epoch": 8.94805761676124,
"grad_norm": 2.7254908084869385,
"learning_rate": 5.822416302765648e-06,
"loss": 0.0973,
"step": 10250
},
{
"epoch": 8.956787429070275,
"grad_norm": 2.1603481769561768,
"learning_rate": 5.773896166909268e-06,
"loss": 0.0989,
"step": 10260
},
{
"epoch": 8.96551724137931,
"grad_norm": 3.3005552291870117,
"learning_rate": 5.725376031052887e-06,
"loss": 0.0931,
"step": 10270
},
{
"epoch": 8.974247053688346,
"grad_norm": 2.790048122406006,
"learning_rate": 5.676855895196507e-06,
"loss": 0.1102,
"step": 10280
},
{
"epoch": 8.982976865997381,
"grad_norm": 2.8936121463775635,
"learning_rate": 5.628335759340127e-06,
"loss": 0.1204,
"step": 10290
},
{
"epoch": 8.991706678306416,
"grad_norm": 3.3234341144561768,
"learning_rate": 5.5798156234837455e-06,
"loss": 0.1212,
"step": 10300
},
{
"epoch": 8.999563509384549,
"eval_accuracy": 0.9607169162779278,
"eval_loss": 0.12570950388908386,
"eval_runtime": 61.7961,
"eval_samples_per_second": 263.641,
"eval_steps_per_second": 8.253,
"step": 10309
},
{
"epoch": 9.000436490615451,
"grad_norm": 4.348520755767822,
"learning_rate": 5.531295487627366e-06,
"loss": 0.132,
"step": 10310
},
{
"epoch": 9.009166302924488,
"grad_norm": 2.5040526390075684,
"learning_rate": 5.482775351770985e-06,
"loss": 0.116,
"step": 10320
},
{
"epoch": 9.017896115233523,
"grad_norm": 2.754953622817993,
"learning_rate": 5.434255215914605e-06,
"loss": 0.1035,
"step": 10330
},
{
"epoch": 9.026625927542558,
"grad_norm": 3.0102930068969727,
"learning_rate": 5.385735080058224e-06,
"loss": 0.1104,
"step": 10340
},
{
"epoch": 9.035355739851592,
"grad_norm": 2.0556371212005615,
"learning_rate": 5.337214944201844e-06,
"loss": 0.1387,
"step": 10350
},
{
"epoch": 9.04408555216063,
"grad_norm": 2.688657760620117,
"learning_rate": 5.288694808345464e-06,
"loss": 0.1082,
"step": 10360
},
{
"epoch": 9.052815364469664,
"grad_norm": 3.8439743518829346,
"learning_rate": 5.240174672489083e-06,
"loss": 0.1144,
"step": 10370
},
{
"epoch": 9.061545176778699,
"grad_norm": 2.683326482772827,
"learning_rate": 5.191654536632703e-06,
"loss": 0.1075,
"step": 10380
},
{
"epoch": 9.070274989087734,
"grad_norm": 2.6530981063842773,
"learning_rate": 5.143134400776323e-06,
"loss": 0.0948,
"step": 10390
},
{
"epoch": 9.07900480139677,
"grad_norm": 3.5326459407806396,
"learning_rate": 5.094614264919942e-06,
"loss": 0.1101,
"step": 10400
},
{
"epoch": 9.087734613705805,
"grad_norm": 3.0565762519836426,
"learning_rate": 5.046094129063561e-06,
"loss": 0.103,
"step": 10410
},
{
"epoch": 9.09646442601484,
"grad_norm": 2.949897050857544,
"learning_rate": 4.997573993207181e-06,
"loss": 0.136,
"step": 10420
},
{
"epoch": 9.105194238323875,
"grad_norm": 2.1031014919281006,
"learning_rate": 4.949053857350801e-06,
"loss": 0.117,
"step": 10430
},
{
"epoch": 9.113924050632912,
"grad_norm": 2.6953930854797363,
"learning_rate": 4.9005337214944205e-06,
"loss": 0.0944,
"step": 10440
},
{
"epoch": 9.122653862941947,
"grad_norm": 2.513293981552124,
"learning_rate": 4.85201358563804e-06,
"loss": 0.1121,
"step": 10450
},
{
"epoch": 9.131383675250982,
"grad_norm": 2.8910365104675293,
"learning_rate": 4.80349344978166e-06,
"loss": 0.0821,
"step": 10460
},
{
"epoch": 9.140113487560017,
"grad_norm": 2.991837978363037,
"learning_rate": 4.754973313925279e-06,
"loss": 0.115,
"step": 10470
},
{
"epoch": 9.148843299869053,
"grad_norm": 3.1621274948120117,
"learning_rate": 4.7064531780688984e-06,
"loss": 0.0987,
"step": 10480
},
{
"epoch": 9.157573112178088,
"grad_norm": 2.6590192317962646,
"learning_rate": 4.657933042212519e-06,
"loss": 0.0924,
"step": 10490
},
{
"epoch": 9.166302924487123,
"grad_norm": 1.3833907842636108,
"learning_rate": 4.609412906356138e-06,
"loss": 0.0772,
"step": 10500
},
{
"epoch": 9.175032736796158,
"grad_norm": 2.954089879989624,
"learning_rate": 4.5608927704997575e-06,
"loss": 0.1372,
"step": 10510
},
{
"epoch": 9.183762549105195,
"grad_norm": 2.8369104862213135,
"learning_rate": 4.512372634643377e-06,
"loss": 0.1054,
"step": 10520
},
{
"epoch": 9.19249236141423,
"grad_norm": 1.4422260522842407,
"learning_rate": 4.463852498786997e-06,
"loss": 0.1025,
"step": 10530
},
{
"epoch": 9.201222173723265,
"grad_norm": 1.9895925521850586,
"learning_rate": 4.415332362930617e-06,
"loss": 0.0992,
"step": 10540
},
{
"epoch": 9.2099519860323,
"grad_norm": 2.9992966651916504,
"learning_rate": 4.3668122270742355e-06,
"loss": 0.0927,
"step": 10550
},
{
"epoch": 9.218681798341336,
"grad_norm": 2.2719290256500244,
"learning_rate": 4.318292091217856e-06,
"loss": 0.0987,
"step": 10560
},
{
"epoch": 9.227411610650371,
"grad_norm": 3.9194726943969727,
"learning_rate": 4.269771955361476e-06,
"loss": 0.0936,
"step": 10570
},
{
"epoch": 9.236141422959406,
"grad_norm": 2.4642674922943115,
"learning_rate": 4.221251819505095e-06,
"loss": 0.1074,
"step": 10580
},
{
"epoch": 9.244871235268441,
"grad_norm": 3.479707956314087,
"learning_rate": 4.172731683648714e-06,
"loss": 0.1051,
"step": 10590
},
{
"epoch": 9.253601047577478,
"grad_norm": 3.2341325283050537,
"learning_rate": 4.124211547792334e-06,
"loss": 0.0878,
"step": 10600
},
{
"epoch": 9.262330859886513,
"grad_norm": 2.302276849746704,
"learning_rate": 4.075691411935954e-06,
"loss": 0.1036,
"step": 10610
},
{
"epoch": 9.271060672195548,
"grad_norm": 2.9073214530944824,
"learning_rate": 4.027171276079573e-06,
"loss": 0.1181,
"step": 10620
},
{
"epoch": 9.279790484504582,
"grad_norm": 2.0145514011383057,
"learning_rate": 3.978651140223193e-06,
"loss": 0.0956,
"step": 10630
},
{
"epoch": 9.28852029681362,
"grad_norm": 4.683056831359863,
"learning_rate": 3.930131004366813e-06,
"loss": 0.138,
"step": 10640
},
{
"epoch": 9.297250109122654,
"grad_norm": 2.681694507598877,
"learning_rate": 3.881610868510432e-06,
"loss": 0.08,
"step": 10650
},
{
"epoch": 9.305979921431689,
"grad_norm": 2.887195348739624,
"learning_rate": 3.833090732654051e-06,
"loss": 0.1113,
"step": 10660
},
{
"epoch": 9.314709733740724,
"grad_norm": 2.9222280979156494,
"learning_rate": 3.7845705967976715e-06,
"loss": 0.1074,
"step": 10670
},
{
"epoch": 9.32343954604976,
"grad_norm": 2.944333076477051,
"learning_rate": 3.7360504609412907e-06,
"loss": 0.106,
"step": 10680
},
{
"epoch": 9.332169358358795,
"grad_norm": 2.468632459640503,
"learning_rate": 3.6875303250849104e-06,
"loss": 0.1049,
"step": 10690
},
{
"epoch": 9.34089917066783,
"grad_norm": 0.9164177775382996,
"learning_rate": 3.6390101892285297e-06,
"loss": 0.0718,
"step": 10700
},
{
"epoch": 9.349628982976865,
"grad_norm": 2.6750760078430176,
"learning_rate": 3.5904900533721494e-06,
"loss": 0.1086,
"step": 10710
},
{
"epoch": 9.358358795285902,
"grad_norm": 1.8694539070129395,
"learning_rate": 3.5419699175157695e-06,
"loss": 0.1099,
"step": 10720
},
{
"epoch": 9.367088607594937,
"grad_norm": 2.571378707885742,
"learning_rate": 3.493449781659389e-06,
"loss": 0.0856,
"step": 10730
},
{
"epoch": 9.375818419903972,
"grad_norm": 1.7155669927597046,
"learning_rate": 3.4449296458030085e-06,
"loss": 0.1044,
"step": 10740
},
{
"epoch": 9.384548232213007,
"grad_norm": 2.1449294090270996,
"learning_rate": 3.396409509946628e-06,
"loss": 0.096,
"step": 10750
},
{
"epoch": 9.393278044522043,
"grad_norm": 2.918898344039917,
"learning_rate": 3.3478893740902475e-06,
"loss": 0.0983,
"step": 10760
},
{
"epoch": 9.402007856831078,
"grad_norm": 2.2519733905792236,
"learning_rate": 3.2993692382338676e-06,
"loss": 0.1027,
"step": 10770
},
{
"epoch": 9.410737669140113,
"grad_norm": 4.343425750732422,
"learning_rate": 3.2508491023774865e-06,
"loss": 0.1118,
"step": 10780
},
{
"epoch": 9.419467481449148,
"grad_norm": 2.5257129669189453,
"learning_rate": 3.2023289665211066e-06,
"loss": 0.1093,
"step": 10790
},
{
"epoch": 9.428197293758185,
"grad_norm": 1.197124719619751,
"learning_rate": 3.1538088306647263e-06,
"loss": 0.1083,
"step": 10800
},
{
"epoch": 9.43692710606722,
"grad_norm": 3.033644437789917,
"learning_rate": 3.1052886948083456e-06,
"loss": 0.1076,
"step": 10810
},
{
"epoch": 9.445656918376255,
"grad_norm": 2.3595004081726074,
"learning_rate": 3.056768558951965e-06,
"loss": 0.0772,
"step": 10820
},
{
"epoch": 9.45438673068529,
"grad_norm": 1.9762500524520874,
"learning_rate": 3.008248423095585e-06,
"loss": 0.1033,
"step": 10830
},
{
"epoch": 9.463116542994326,
"grad_norm": 3.2624242305755615,
"learning_rate": 2.9597282872392042e-06,
"loss": 0.1018,
"step": 10840
},
{
"epoch": 9.471846355303361,
"grad_norm": 2.5619966983795166,
"learning_rate": 2.911208151382824e-06,
"loss": 0.1084,
"step": 10850
},
{
"epoch": 9.480576167612396,
"grad_norm": 2.26334547996521,
"learning_rate": 2.8626880155264436e-06,
"loss": 0.099,
"step": 10860
},
{
"epoch": 9.489305979921431,
"grad_norm": 2.397780656814575,
"learning_rate": 2.8141678796700633e-06,
"loss": 0.1217,
"step": 10870
},
{
"epoch": 9.498035792230468,
"grad_norm": 3.3402929306030273,
"learning_rate": 2.765647743813683e-06,
"loss": 0.1199,
"step": 10880
},
{
"epoch": 9.506765604539503,
"grad_norm": 3.3935964107513428,
"learning_rate": 2.7171276079573023e-06,
"loss": 0.1158,
"step": 10890
},
{
"epoch": 9.515495416848538,
"grad_norm": 2.4392571449279785,
"learning_rate": 2.668607472100922e-06,
"loss": 0.1306,
"step": 10900
},
{
"epoch": 9.524225229157572,
"grad_norm": 2.4739038944244385,
"learning_rate": 2.6200873362445413e-06,
"loss": 0.0886,
"step": 10910
},
{
"epoch": 9.532955041466609,
"grad_norm": 2.2183120250701904,
"learning_rate": 2.5715672003881614e-06,
"loss": 0.1015,
"step": 10920
},
{
"epoch": 9.541684853775644,
"grad_norm": 3.256857395172119,
"learning_rate": 2.5230470645317807e-06,
"loss": 0.1328,
"step": 10930
},
{
"epoch": 9.550414666084679,
"grad_norm": 2.740285634994507,
"learning_rate": 2.4745269286754004e-06,
"loss": 0.0996,
"step": 10940
},
{
"epoch": 9.559144478393714,
"grad_norm": 2.5595703125,
"learning_rate": 2.42600679281902e-06,
"loss": 0.075,
"step": 10950
},
{
"epoch": 9.56787429070275,
"grad_norm": 3.089890480041504,
"learning_rate": 2.3774866569626394e-06,
"loss": 0.1229,
"step": 10960
},
{
"epoch": 9.576604103011785,
"grad_norm": 3.369868755340576,
"learning_rate": 2.3289665211062595e-06,
"loss": 0.1194,
"step": 10970
},
{
"epoch": 9.58533391532082,
"grad_norm": 2.359818458557129,
"learning_rate": 2.2804463852498788e-06,
"loss": 0.107,
"step": 10980
},
{
"epoch": 9.594063727629855,
"grad_norm": 2.449138641357422,
"learning_rate": 2.2319262493934985e-06,
"loss": 0.1078,
"step": 10990
},
{
"epoch": 9.602793539938892,
"grad_norm": 1.8725260496139526,
"learning_rate": 2.1834061135371177e-06,
"loss": 0.0882,
"step": 11000
},
{
"epoch": 9.611523352247927,
"grad_norm": 1.452510952949524,
"learning_rate": 2.134885977680738e-06,
"loss": 0.1029,
"step": 11010
},
{
"epoch": 9.620253164556962,
"grad_norm": 4.510222911834717,
"learning_rate": 2.086365841824357e-06,
"loss": 0.1253,
"step": 11020
},
{
"epoch": 9.628982976865997,
"grad_norm": 2.815927267074585,
"learning_rate": 2.037845705967977e-06,
"loss": 0.1132,
"step": 11030
},
{
"epoch": 9.637712789175033,
"grad_norm": 3.04374098777771,
"learning_rate": 1.9893255701115965e-06,
"loss": 0.1043,
"step": 11040
},
{
"epoch": 9.646442601484068,
"grad_norm": 3.407210111618042,
"learning_rate": 1.940805434255216e-06,
"loss": 0.1207,
"step": 11050
},
{
"epoch": 9.655172413793103,
"grad_norm": 3.3913309574127197,
"learning_rate": 1.8922852983988357e-06,
"loss": 0.1153,
"step": 11060
},
{
"epoch": 9.663902226102138,
"grad_norm": 1.3927017450332642,
"learning_rate": 1.8437651625424552e-06,
"loss": 0.0865,
"step": 11070
},
{
"epoch": 9.672632038411175,
"grad_norm": 1.9914878606796265,
"learning_rate": 1.7952450266860747e-06,
"loss": 0.1071,
"step": 11080
},
{
"epoch": 9.68136185072021,
"grad_norm": 2.445969820022583,
"learning_rate": 1.7467248908296944e-06,
"loss": 0.0921,
"step": 11090
},
{
"epoch": 9.690091663029245,
"grad_norm": 3.2452139854431152,
"learning_rate": 1.698204754973314e-06,
"loss": 0.1383,
"step": 11100
},
{
"epoch": 9.69882147533828,
"grad_norm": 1.981126070022583,
"learning_rate": 1.6496846191169338e-06,
"loss": 0.1139,
"step": 11110
},
{
"epoch": 9.707551287647316,
"grad_norm": 1.5634502172470093,
"learning_rate": 1.6011644832605533e-06,
"loss": 0.0922,
"step": 11120
},
{
"epoch": 9.716281099956351,
"grad_norm": 3.633537769317627,
"learning_rate": 1.5526443474041728e-06,
"loss": 0.107,
"step": 11130
},
{
"epoch": 9.725010912265386,
"grad_norm": 2.735826015472412,
"learning_rate": 1.5041242115477925e-06,
"loss": 0.1027,
"step": 11140
},
{
"epoch": 9.733740724574421,
"grad_norm": 2.9639649391174316,
"learning_rate": 1.455604075691412e-06,
"loss": 0.0849,
"step": 11150
},
{
"epoch": 9.742470536883458,
"grad_norm": 2.332066297531128,
"learning_rate": 1.4070839398350317e-06,
"loss": 0.1055,
"step": 11160
},
{
"epoch": 9.751200349192493,
"grad_norm": 2.8028910160064697,
"learning_rate": 1.3585638039786512e-06,
"loss": 0.1135,
"step": 11170
},
{
"epoch": 9.759930161501527,
"grad_norm": 3.3625102043151855,
"learning_rate": 1.3100436681222706e-06,
"loss": 0.1219,
"step": 11180
},
{
"epoch": 9.768659973810562,
"grad_norm": 3.788644790649414,
"learning_rate": 1.2615235322658903e-06,
"loss": 0.1035,
"step": 11190
},
{
"epoch": 9.777389786119599,
"grad_norm": 3.0981266498565674,
"learning_rate": 1.21300339640951e-06,
"loss": 0.0964,
"step": 11200
},
{
"epoch": 9.786119598428634,
"grad_norm": 2.1601572036743164,
"learning_rate": 1.1644832605531297e-06,
"loss": 0.09,
"step": 11210
},
{
"epoch": 9.794849410737669,
"grad_norm": 2.264108419418335,
"learning_rate": 1.1159631246967492e-06,
"loss": 0.0925,
"step": 11220
},
{
"epoch": 9.803579223046704,
"grad_norm": 4.96955680847168,
"learning_rate": 1.067442988840369e-06,
"loss": 0.1429,
"step": 11230
},
{
"epoch": 9.81230903535574,
"grad_norm": 2.7126574516296387,
"learning_rate": 1.0189228529839884e-06,
"loss": 0.1139,
"step": 11240
},
{
"epoch": 9.821038847664775,
"grad_norm": 2.6963400840759277,
"learning_rate": 9.70402717127608e-07,
"loss": 0.1106,
"step": 11250
},
{
"epoch": 9.82976865997381,
"grad_norm": 2.730841636657715,
"learning_rate": 9.218825812712276e-07,
"loss": 0.1031,
"step": 11260
},
{
"epoch": 9.838498472282845,
"grad_norm": 3.486825704574585,
"learning_rate": 8.733624454148472e-07,
"loss": 0.1023,
"step": 11270
},
{
"epoch": 9.847228284591882,
"grad_norm": 3.1530649662017822,
"learning_rate": 8.248423095584669e-07,
"loss": 0.084,
"step": 11280
},
{
"epoch": 9.855958096900917,
"grad_norm": 1.514291524887085,
"learning_rate": 7.763221737020864e-07,
"loss": 0.1088,
"step": 11290
},
{
"epoch": 9.864687909209952,
"grad_norm": 1.9659836292266846,
"learning_rate": 7.27802037845706e-07,
"loss": 0.0722,
"step": 11300
},
{
"epoch": 9.873417721518987,
"grad_norm": 2.1583216190338135,
"learning_rate": 6.792819019893256e-07,
"loss": 0.1086,
"step": 11310
},
{
"epoch": 9.882147533828023,
"grad_norm": 3.034625291824341,
"learning_rate": 6.307617661329452e-07,
"loss": 0.1046,
"step": 11320
},
{
"epoch": 9.890877346137058,
"grad_norm": 2.7435011863708496,
"learning_rate": 5.822416302765649e-07,
"loss": 0.1083,
"step": 11330
},
{
"epoch": 9.899607158446093,
"grad_norm": 2.3298757076263428,
"learning_rate": 5.337214944201845e-07,
"loss": 0.11,
"step": 11340
},
{
"epoch": 9.908336970755128,
"grad_norm": 3.238316774368286,
"learning_rate": 4.85201358563804e-07,
"loss": 0.1281,
"step": 11350
},
{
"epoch": 9.917066783064165,
"grad_norm": 2.9456119537353516,
"learning_rate": 4.366812227074236e-07,
"loss": 0.0932,
"step": 11360
},
{
"epoch": 9.9257965953732,
"grad_norm": 1.8803859949111938,
"learning_rate": 3.881610868510432e-07,
"loss": 0.1022,
"step": 11370
},
{
"epoch": 9.934526407682235,
"grad_norm": 2.5518546104431152,
"learning_rate": 3.396409509946628e-07,
"loss": 0.0876,
"step": 11380
},
{
"epoch": 9.94325621999127,
"grad_norm": 2.5071511268615723,
"learning_rate": 2.9112081513828244e-07,
"loss": 0.1075,
"step": 11390
},
{
"epoch": 9.951986032300306,
"grad_norm": 4.196099758148193,
"learning_rate": 2.42600679281902e-07,
"loss": 0.1197,
"step": 11400
},
{
"epoch": 9.960715844609341,
"grad_norm": 2.428870916366577,
"learning_rate": 1.940805434255216e-07,
"loss": 0.1262,
"step": 11410
},
{
"epoch": 9.969445656918376,
"grad_norm": 2.2001659870147705,
"learning_rate": 1.4556040756914122e-07,
"loss": 0.1013,
"step": 11420
},
{
"epoch": 9.978175469227411,
"grad_norm": 3.1071674823760986,
"learning_rate": 9.70402717127608e-08,
"loss": 0.0883,
"step": 11430
},
{
"epoch": 9.986905281536448,
"grad_norm": 2.7770655155181885,
"learning_rate": 4.85201358563804e-08,
"loss": 0.1187,
"step": 11440
},
{
"epoch": 9.995635093845483,
"grad_norm": 2.909649133682251,
"learning_rate": 0.0,
"loss": 0.0981,
"step": 11450
},
{
"epoch": 9.995635093845483,
"eval_accuracy": 0.9750797937638105,
"eval_loss": 0.07497124373912811,
"eval_runtime": 61.2653,
"eval_samples_per_second": 265.925,
"eval_steps_per_second": 8.324,
"step": 11450
},
{
"epoch": 9.995635093845483,
"step": 11450,
"total_flos": 3.92899376185344e+18,
"train_loss": 0.3148357209992721,
"train_runtime": 13104.719,
"train_samples_per_second": 111.886,
"train_steps_per_second": 0.874
}
],
"logging_steps": 10,
"max_steps": 11450,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.92899376185344e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}