MyBGE_Model / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.753199268738574,
"eval_steps": 500,
"global_step": 13000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003656307129798903,
"grad_norm": 101.57550811767578,
"learning_rate": 2.9991224862888483e-05,
"loss": 1.5598,
"step": 10
},
{
"epoch": 0.007312614259597806,
"grad_norm": 36.13117218017578,
"learning_rate": 2.996928702010969e-05,
"loss": 5.0075,
"step": 20
},
{
"epoch": 0.010968921389396709,
"grad_norm": 171.64637756347656,
"learning_rate": 2.9947349177330895e-05,
"loss": 1.9036,
"step": 30
},
{
"epoch": 0.014625228519195612,
"grad_norm": 30.211172103881836,
"learning_rate": 2.99254113345521e-05,
"loss": 0.7468,
"step": 40
},
{
"epoch": 0.018281535648994516,
"grad_norm": 0.8975659012794495,
"learning_rate": 2.990347349177331e-05,
"loss": 1.0939,
"step": 50
},
{
"epoch": 0.021937842778793418,
"grad_norm": 16.802227020263672,
"learning_rate": 2.9881535648994517e-05,
"loss": 4.4483,
"step": 60
},
{
"epoch": 0.025594149908592323,
"grad_norm": 22.710329055786133,
"learning_rate": 2.9859597806215723e-05,
"loss": 2.2961,
"step": 70
},
{
"epoch": 0.029250457038391225,
"grad_norm": 42.57908630371094,
"learning_rate": 2.983765996343693e-05,
"loss": 2.2599,
"step": 80
},
{
"epoch": 0.03290676416819013,
"grad_norm": 22.469757080078125,
"learning_rate": 2.981572212065814e-05,
"loss": 0.6366,
"step": 90
},
{
"epoch": 0.03656307129798903,
"grad_norm": 0.616607666015625,
"learning_rate": 2.9793784277879342e-05,
"loss": 2.6768,
"step": 100
},
{
"epoch": 0.04021937842778794,
"grad_norm": 38.607276916503906,
"learning_rate": 2.9776234003656307e-05,
"loss": 3.233,
"step": 110
},
{
"epoch": 0.043875685557586835,
"grad_norm": 20.329072952270508,
"learning_rate": 2.9754296160877513e-05,
"loss": 0.7113,
"step": 120
},
{
"epoch": 0.04753199268738574,
"grad_norm": 12.583466529846191,
"learning_rate": 2.9732358318098722e-05,
"loss": 0.6635,
"step": 130
},
{
"epoch": 0.051188299817184646,
"grad_norm": 16.894561767578125,
"learning_rate": 2.971042047531993e-05,
"loss": 2.5268,
"step": 140
},
{
"epoch": 0.054844606946983544,
"grad_norm": 21.74227523803711,
"learning_rate": 2.9688482632541135e-05,
"loss": 0.6855,
"step": 150
},
{
"epoch": 0.05850091407678245,
"grad_norm": 21.068952560424805,
"learning_rate": 2.966654478976234e-05,
"loss": 0.7735,
"step": 160
},
{
"epoch": 0.062157221206581355,
"grad_norm": 21.1742000579834,
"learning_rate": 2.964460694698355e-05,
"loss": 0.7335,
"step": 170
},
{
"epoch": 0.06581352833638025,
"grad_norm": 21.325294494628906,
"learning_rate": 2.9622669104204753e-05,
"loss": 0.7411,
"step": 180
},
{
"epoch": 0.06946983546617916,
"grad_norm": 14.202475547790527,
"learning_rate": 2.960073126142596e-05,
"loss": 0.6385,
"step": 190
},
{
"epoch": 0.07312614259597806,
"grad_norm": 14.897246360778809,
"learning_rate": 2.9578793418647165e-05,
"loss": 0.6281,
"step": 200
},
{
"epoch": 0.07678244972577697,
"grad_norm": 14.886935234069824,
"learning_rate": 2.955685557586837e-05,
"loss": 0.6687,
"step": 210
},
{
"epoch": 0.08043875685557587,
"grad_norm": 11.127037048339844,
"learning_rate": 2.953491773308958e-05,
"loss": 0.6167,
"step": 220
},
{
"epoch": 0.08409506398537477,
"grad_norm": 15.06424617767334,
"learning_rate": 2.9512979890310787e-05,
"loss": 0.6776,
"step": 230
},
{
"epoch": 0.08775137111517367,
"grad_norm": 16.624258041381836,
"learning_rate": 2.9491042047531993e-05,
"loss": 0.8723,
"step": 240
},
{
"epoch": 0.09140767824497258,
"grad_norm": 15.640335083007812,
"learning_rate": 2.94691042047532e-05,
"loss": 0.6707,
"step": 250
},
{
"epoch": 0.09506398537477148,
"grad_norm": 14.502679824829102,
"learning_rate": 2.944716636197441e-05,
"loss": 0.7391,
"step": 260
},
{
"epoch": 0.09872029250457039,
"grad_norm": 16.92255973815918,
"learning_rate": 2.9425228519195615e-05,
"loss": 0.8533,
"step": 270
},
{
"epoch": 0.10237659963436929,
"grad_norm": 12.30309772491455,
"learning_rate": 2.940329067641682e-05,
"loss": 0.7523,
"step": 280
},
{
"epoch": 0.10603290676416818,
"grad_norm": 10.335432052612305,
"learning_rate": 2.9381352833638024e-05,
"loss": 0.619,
"step": 290
},
{
"epoch": 0.10968921389396709,
"grad_norm": 10.138907432556152,
"learning_rate": 2.935941499085923e-05,
"loss": 0.6184,
"step": 300
},
{
"epoch": 0.113345521023766,
"grad_norm": 10.479036331176758,
"learning_rate": 2.933747714808044e-05,
"loss": 0.8974,
"step": 310
},
{
"epoch": 0.1170018281535649,
"grad_norm": 11.521620750427246,
"learning_rate": 2.9315539305301646e-05,
"loss": 0.9033,
"step": 320
},
{
"epoch": 0.1206581352833638,
"grad_norm": 10.446819305419922,
"learning_rate": 2.9293601462522852e-05,
"loss": 0.7302,
"step": 330
},
{
"epoch": 0.12431444241316271,
"grad_norm": 8.162124633789062,
"learning_rate": 2.9271663619744058e-05,
"loss": 0.6672,
"step": 340
},
{
"epoch": 0.12797074954296161,
"grad_norm": 10.791855812072754,
"learning_rate": 2.9249725776965268e-05,
"loss": 0.7167,
"step": 350
},
{
"epoch": 0.1316270566727605,
"grad_norm": 11.913755416870117,
"learning_rate": 2.9227787934186474e-05,
"loss": 0.7323,
"step": 360
},
{
"epoch": 0.13528336380255943,
"grad_norm": 13.401154518127441,
"learning_rate": 2.920585009140768e-05,
"loss": 0.7404,
"step": 370
},
{
"epoch": 0.13893967093235832,
"grad_norm": 11.721502304077148,
"learning_rate": 2.9183912248628886e-05,
"loss": 0.7038,
"step": 380
},
{
"epoch": 0.1425959780621572,
"grad_norm": 11.148709297180176,
"learning_rate": 2.9161974405850092e-05,
"loss": 0.7144,
"step": 390
},
{
"epoch": 0.14625228519195613,
"grad_norm": 11.626866340637207,
"learning_rate": 2.91400365630713e-05,
"loss": 0.6527,
"step": 400
},
{
"epoch": 0.14990859232175502,
"grad_norm": 14.398078918457031,
"learning_rate": 2.9118098720292505e-05,
"loss": 0.6695,
"step": 410
},
{
"epoch": 0.15356489945155394,
"grad_norm": 14.848665237426758,
"learning_rate": 2.909616087751371e-05,
"loss": 0.7791,
"step": 420
},
{
"epoch": 0.15722120658135283,
"grad_norm": 12.27662181854248,
"learning_rate": 2.9074223034734917e-05,
"loss": 0.7028,
"step": 430
},
{
"epoch": 0.16087751371115175,
"grad_norm": 6.21640157699585,
"learning_rate": 2.9052285191956126e-05,
"loss": 0.6076,
"step": 440
},
{
"epoch": 0.16453382084095064,
"grad_norm": 10.96060562133789,
"learning_rate": 2.9030347349177333e-05,
"loss": 0.6555,
"step": 450
},
{
"epoch": 0.16819012797074953,
"grad_norm": 12.066696166992188,
"learning_rate": 2.900840950639854e-05,
"loss": 0.7395,
"step": 460
},
{
"epoch": 0.17184643510054845,
"grad_norm": 14.306533813476562,
"learning_rate": 2.8986471663619745e-05,
"loss": 0.835,
"step": 470
},
{
"epoch": 0.17550274223034734,
"grad_norm": 15.683281898498535,
"learning_rate": 2.8964533820840954e-05,
"loss": 0.7374,
"step": 480
},
{
"epoch": 0.17915904936014626,
"grad_norm": 9.928467750549316,
"learning_rate": 2.8942595978062157e-05,
"loss": 0.6113,
"step": 490
},
{
"epoch": 0.18281535648994515,
"grad_norm": 12.735904693603516,
"learning_rate": 2.8920658135283363e-05,
"loss": 0.8668,
"step": 500
},
{
"epoch": 0.18647166361974407,
"grad_norm": 5.431804180145264,
"learning_rate": 2.889872029250457e-05,
"loss": 0.7009,
"step": 510
},
{
"epoch": 0.19012797074954296,
"grad_norm": 9.146883010864258,
"learning_rate": 2.887678244972578e-05,
"loss": 0.8343,
"step": 520
},
{
"epoch": 0.19378427787934185,
"grad_norm": 9.630278587341309,
"learning_rate": 2.8854844606946985e-05,
"loss": 0.7074,
"step": 530
},
{
"epoch": 0.19744058500914077,
"grad_norm": 6.3954901695251465,
"learning_rate": 2.883290676416819e-05,
"loss": 0.7857,
"step": 540
},
{
"epoch": 0.20109689213893966,
"grad_norm": 10.803849220275879,
"learning_rate": 2.8810968921389397e-05,
"loss": 0.6814,
"step": 550
},
{
"epoch": 0.20475319926873858,
"grad_norm": 5.025099277496338,
"learning_rate": 2.8789031078610603e-05,
"loss": 0.8558,
"step": 560
},
{
"epoch": 0.20840950639853748,
"grad_norm": 10.094544410705566,
"learning_rate": 2.8767093235831813e-05,
"loss": 0.9323,
"step": 570
},
{
"epoch": 0.21206581352833637,
"grad_norm": 9.443562507629395,
"learning_rate": 2.874515539305302e-05,
"loss": 0.715,
"step": 580
},
{
"epoch": 0.21572212065813529,
"grad_norm": 11.677664756774902,
"learning_rate": 2.8723217550274222e-05,
"loss": 0.8864,
"step": 590
},
{
"epoch": 0.21937842778793418,
"grad_norm": 4.913455009460449,
"learning_rate": 2.8701279707495428e-05,
"loss": 0.6386,
"step": 600
},
{
"epoch": 0.2230347349177331,
"grad_norm": 6.794945240020752,
"learning_rate": 2.8679341864716638e-05,
"loss": 0.6967,
"step": 610
},
{
"epoch": 0.226691042047532,
"grad_norm": 8.743935585021973,
"learning_rate": 2.8657404021937844e-05,
"loss": 0.6984,
"step": 620
},
{
"epoch": 0.2303473491773309,
"grad_norm": 8.499006271362305,
"learning_rate": 2.863546617915905e-05,
"loss": 0.7081,
"step": 630
},
{
"epoch": 0.2340036563071298,
"grad_norm": 7.359218597412109,
"learning_rate": 2.8613528336380256e-05,
"loss": 0.7312,
"step": 640
},
{
"epoch": 0.2376599634369287,
"grad_norm": 8.67283821105957,
"learning_rate": 2.8591590493601462e-05,
"loss": 0.73,
"step": 650
},
{
"epoch": 0.2413162705667276,
"grad_norm": 9.145535469055176,
"learning_rate": 2.856965265082267e-05,
"loss": 0.6741,
"step": 660
},
{
"epoch": 0.2449725776965265,
"grad_norm": 14.087048530578613,
"learning_rate": 2.8547714808043878e-05,
"loss": 0.7393,
"step": 670
},
{
"epoch": 0.24862888482632542,
"grad_norm": 11.732462882995605,
"learning_rate": 2.8525776965265084e-05,
"loss": 0.6989,
"step": 680
},
{
"epoch": 0.2522851919561243,
"grad_norm": 7.398434638977051,
"learning_rate": 2.8503839122486287e-05,
"loss": 0.611,
"step": 690
},
{
"epoch": 0.25594149908592323,
"grad_norm": 5.068675994873047,
"learning_rate": 2.8481901279707496e-05,
"loss": 0.7415,
"step": 700
},
{
"epoch": 0.2595978062157221,
"grad_norm": 9.75862979888916,
"learning_rate": 2.8459963436928702e-05,
"loss": 0.8388,
"step": 710
},
{
"epoch": 0.263254113345521,
"grad_norm": 9.038466453552246,
"learning_rate": 2.843802559414991e-05,
"loss": 0.7232,
"step": 720
},
{
"epoch": 0.26691042047531993,
"grad_norm": 13.121977806091309,
"learning_rate": 2.8416087751371115e-05,
"loss": 0.6463,
"step": 730
},
{
"epoch": 0.27056672760511885,
"grad_norm": 10.064229011535645,
"learning_rate": 2.8394149908592324e-05,
"loss": 0.8457,
"step": 740
},
{
"epoch": 0.2742230347349177,
"grad_norm": 10.455716133117676,
"learning_rate": 2.837221206581353e-05,
"loss": 0.7311,
"step": 750
},
{
"epoch": 0.27787934186471663,
"grad_norm": 9.248018264770508,
"learning_rate": 2.8350274223034736e-05,
"loss": 0.8482,
"step": 760
},
{
"epoch": 0.28153564899451555,
"grad_norm": 7.202044486999512,
"learning_rate": 2.8328336380255943e-05,
"loss": 0.7483,
"step": 770
},
{
"epoch": 0.2851919561243144,
"grad_norm": 5.500239849090576,
"learning_rate": 2.830639853747715e-05,
"loss": 0.812,
"step": 780
},
{
"epoch": 0.28884826325411334,
"grad_norm": 14.437928199768066,
"learning_rate": 2.8284460694698355e-05,
"loss": 0.6839,
"step": 790
},
{
"epoch": 0.29250457038391225,
"grad_norm": 8.881915092468262,
"learning_rate": 2.826252285191956e-05,
"loss": 0.8167,
"step": 800
},
{
"epoch": 0.2961608775137112,
"grad_norm": 13.634603500366211,
"learning_rate": 2.8240585009140767e-05,
"loss": 0.9943,
"step": 810
},
{
"epoch": 0.29981718464351004,
"grad_norm": 11.794356346130371,
"learning_rate": 2.8218647166361973e-05,
"loss": 0.8036,
"step": 820
},
{
"epoch": 0.30347349177330896,
"grad_norm": 9.6803617477417,
"learning_rate": 2.8196709323583183e-05,
"loss": 0.7858,
"step": 830
},
{
"epoch": 0.3071297989031079,
"grad_norm": 7.423046588897705,
"learning_rate": 2.817477148080439e-05,
"loss": 0.7126,
"step": 840
},
{
"epoch": 0.31078610603290674,
"grad_norm": 6.547556400299072,
"learning_rate": 2.8152833638025595e-05,
"loss": 0.886,
"step": 850
},
{
"epoch": 0.31444241316270566,
"grad_norm": 10.207584381103516,
"learning_rate": 2.81308957952468e-05,
"loss": 0.7013,
"step": 860
},
{
"epoch": 0.3180987202925046,
"grad_norm": 9.12232494354248,
"learning_rate": 2.810895795246801e-05,
"loss": 0.821,
"step": 870
},
{
"epoch": 0.3217550274223035,
"grad_norm": 8.086636543273926,
"learning_rate": 2.8087020109689217e-05,
"loss": 0.8873,
"step": 880
},
{
"epoch": 0.32541133455210236,
"grad_norm": 9.748858451843262,
"learning_rate": 2.806508226691042e-05,
"loss": 0.8942,
"step": 890
},
{
"epoch": 0.3290676416819013,
"grad_norm": 11.087379455566406,
"learning_rate": 2.8043144424131626e-05,
"loss": 0.8282,
"step": 900
},
{
"epoch": 0.3327239488117002,
"grad_norm": 10.066028594970703,
"learning_rate": 2.8021206581352832e-05,
"loss": 0.7694,
"step": 910
},
{
"epoch": 0.33638025594149906,
"grad_norm": 10.349629402160645,
"learning_rate": 2.799926873857404e-05,
"loss": 0.9706,
"step": 920
},
{
"epoch": 0.340036563071298,
"grad_norm": 5.540337562561035,
"learning_rate": 2.7977330895795248e-05,
"loss": 0.6998,
"step": 930
},
{
"epoch": 0.3436928702010969,
"grad_norm": 3.4147696495056152,
"learning_rate": 2.7955393053016454e-05,
"loss": 0.6818,
"step": 940
},
{
"epoch": 0.3473491773308958,
"grad_norm": 13.466970443725586,
"learning_rate": 2.793345521023766e-05,
"loss": 0.8013,
"step": 950
},
{
"epoch": 0.3510054844606947,
"grad_norm": 6.585829734802246,
"learning_rate": 2.791151736745887e-05,
"loss": 0.6507,
"step": 960
},
{
"epoch": 0.3546617915904936,
"grad_norm": 3.3851397037506104,
"learning_rate": 2.7889579524680076e-05,
"loss": 0.8193,
"step": 970
},
{
"epoch": 0.3583180987202925,
"grad_norm": 12.482742309570312,
"learning_rate": 2.7867641681901282e-05,
"loss": 0.7622,
"step": 980
},
{
"epoch": 0.3619744058500914,
"grad_norm": 9.126582145690918,
"learning_rate": 2.7845703839122484e-05,
"loss": 0.6539,
"step": 990
},
{
"epoch": 0.3656307129798903,
"grad_norm": 6.254278182983398,
"learning_rate": 2.7823765996343694e-05,
"loss": 0.6231,
"step": 1000
},
{
"epoch": 0.3692870201096892,
"grad_norm": 5.566930294036865,
"learning_rate": 2.78018281535649e-05,
"loss": 0.8925,
"step": 1010
},
{
"epoch": 0.37294332723948814,
"grad_norm": 11.380731582641602,
"learning_rate": 2.7779890310786106e-05,
"loss": 0.8145,
"step": 1020
},
{
"epoch": 0.376599634369287,
"grad_norm": 5.229077339172363,
"learning_rate": 2.7757952468007312e-05,
"loss": 0.6471,
"step": 1030
},
{
"epoch": 0.3802559414990859,
"grad_norm": 7.065961837768555,
"learning_rate": 2.773601462522852e-05,
"loss": 0.658,
"step": 1040
},
{
"epoch": 0.38391224862888484,
"grad_norm": 7.386284828186035,
"learning_rate": 2.7714076782449728e-05,
"loss": 0.6973,
"step": 1050
},
{
"epoch": 0.3875685557586837,
"grad_norm": 4.258168697357178,
"learning_rate": 2.7692138939670934e-05,
"loss": 0.6318,
"step": 1060
},
{
"epoch": 0.3912248628884826,
"grad_norm": 10.302197456359863,
"learning_rate": 2.767020109689214e-05,
"loss": 0.5522,
"step": 1070
},
{
"epoch": 0.39488117001828155,
"grad_norm": 6.281784534454346,
"learning_rate": 2.7648263254113347e-05,
"loss": 0.7723,
"step": 1080
},
{
"epoch": 0.39853747714808047,
"grad_norm": 8.805102348327637,
"learning_rate": 2.7626325411334553e-05,
"loss": 0.668,
"step": 1090
},
{
"epoch": 0.40219378427787933,
"grad_norm": 14.54948902130127,
"learning_rate": 2.760438756855576e-05,
"loss": 0.921,
"step": 1100
},
{
"epoch": 0.40585009140767825,
"grad_norm": 7.115081310272217,
"learning_rate": 2.7582449725776965e-05,
"loss": 0.7194,
"step": 1110
},
{
"epoch": 0.40950639853747717,
"grad_norm": 2.9493892192840576,
"learning_rate": 2.756051188299817e-05,
"loss": 0.6247,
"step": 1120
},
{
"epoch": 0.41316270566727603,
"grad_norm": 16.915966033935547,
"learning_rate": 2.7538574040219377e-05,
"loss": 0.8615,
"step": 1130
},
{
"epoch": 0.41681901279707495,
"grad_norm": 5.787754535675049,
"learning_rate": 2.7516636197440587e-05,
"loss": 0.6051,
"step": 1140
},
{
"epoch": 0.42047531992687387,
"grad_norm": 10.545123100280762,
"learning_rate": 2.7494698354661793e-05,
"loss": 0.7797,
"step": 1150
},
{
"epoch": 0.42413162705667273,
"grad_norm": 15.382741928100586,
"learning_rate": 2.7472760511883e-05,
"loss": 1.0864,
"step": 1160
},
{
"epoch": 0.42778793418647165,
"grad_norm": 5.235750198364258,
"learning_rate": 2.7450822669104205e-05,
"loss": 0.7217,
"step": 1170
},
{
"epoch": 0.43144424131627057,
"grad_norm": 6.794938564300537,
"learning_rate": 2.7428884826325415e-05,
"loss": 0.9402,
"step": 1180
},
{
"epoch": 0.4351005484460695,
"grad_norm": 11.024620056152344,
"learning_rate": 2.7406946983546617e-05,
"loss": 0.7542,
"step": 1190
},
{
"epoch": 0.43875685557586835,
"grad_norm": 11.393266677856445,
"learning_rate": 2.7385009140767824e-05,
"loss": 0.6272,
"step": 1200
},
{
"epoch": 0.4424131627056673,
"grad_norm": 8.483016967773438,
"learning_rate": 2.736307129798903e-05,
"loss": 1.2218,
"step": 1210
},
{
"epoch": 0.4460694698354662,
"grad_norm": 12.325540542602539,
"learning_rate": 2.734113345521024e-05,
"loss": 0.6524,
"step": 1220
},
{
"epoch": 0.44972577696526506,
"grad_norm": 5.426061630249023,
"learning_rate": 2.7319195612431445e-05,
"loss": 0.8768,
"step": 1230
},
{
"epoch": 0.453382084095064,
"grad_norm": 6.959734916687012,
"learning_rate": 2.729725776965265e-05,
"loss": 0.5997,
"step": 1240
},
{
"epoch": 0.4570383912248629,
"grad_norm": 14.661490440368652,
"learning_rate": 2.7275319926873858e-05,
"loss": 0.8667,
"step": 1250
},
{
"epoch": 0.4606946983546618,
"grad_norm": 10.735424995422363,
"learning_rate": 2.7253382084095064e-05,
"loss": 0.6064,
"step": 1260
},
{
"epoch": 0.4643510054844607,
"grad_norm": 10.7152681350708,
"learning_rate": 2.7231444241316273e-05,
"loss": 0.8671,
"step": 1270
},
{
"epoch": 0.4680073126142596,
"grad_norm": 8.87678050994873,
"learning_rate": 2.720950639853748e-05,
"loss": 0.9788,
"step": 1280
},
{
"epoch": 0.4716636197440585,
"grad_norm": 1.8030093908309937,
"learning_rate": 2.7187568555758682e-05,
"loss": 0.7143,
"step": 1290
},
{
"epoch": 0.4753199268738574,
"grad_norm": 10.601454734802246,
"learning_rate": 2.716563071297989e-05,
"loss": 0.8064,
"step": 1300
},
{
"epoch": 0.4789762340036563,
"grad_norm": 7.095282554626465,
"learning_rate": 2.7143692870201098e-05,
"loss": 0.7545,
"step": 1310
},
{
"epoch": 0.4826325411334552,
"grad_norm": 1.267622470855713,
"learning_rate": 2.7121755027422304e-05,
"loss": 0.765,
"step": 1320
},
{
"epoch": 0.48628884826325414,
"grad_norm": 11.8803071975708,
"learning_rate": 2.709981718464351e-05,
"loss": 0.9996,
"step": 1330
},
{
"epoch": 0.489945155393053,
"grad_norm": 9.95639705657959,
"learning_rate": 2.7077879341864716e-05,
"loss": 1.0536,
"step": 1340
},
{
"epoch": 0.4936014625228519,
"grad_norm": 11.731663703918457,
"learning_rate": 2.7055941499085926e-05,
"loss": 0.8901,
"step": 1350
},
{
"epoch": 0.49725776965265084,
"grad_norm": 7.863046646118164,
"learning_rate": 2.7034003656307132e-05,
"loss": 0.6168,
"step": 1360
},
{
"epoch": 0.5009140767824497,
"grad_norm": 7.594435214996338,
"learning_rate": 2.7012065813528338e-05,
"loss": 1.1098,
"step": 1370
},
{
"epoch": 0.5045703839122486,
"grad_norm": 5.769408702850342,
"learning_rate": 2.6990127970749544e-05,
"loss": 0.6672,
"step": 1380
},
{
"epoch": 0.5082266910420475,
"grad_norm": 7.641537666320801,
"learning_rate": 2.696819012797075e-05,
"loss": 0.9141,
"step": 1390
},
{
"epoch": 0.5118829981718465,
"grad_norm": 8.880860328674316,
"learning_rate": 2.6946252285191957e-05,
"loss": 0.7542,
"step": 1400
},
{
"epoch": 0.5155393053016454,
"grad_norm": 3.2335469722747803,
"learning_rate": 2.6924314442413163e-05,
"loss": 0.7966,
"step": 1410
},
{
"epoch": 0.5191956124314442,
"grad_norm": 3.989349365234375,
"learning_rate": 2.690237659963437e-05,
"loss": 0.7838,
"step": 1420
},
{
"epoch": 0.5228519195612431,
"grad_norm": 12.424365043640137,
"learning_rate": 2.6880438756855575e-05,
"loss": 0.9574,
"step": 1430
},
{
"epoch": 0.526508226691042,
"grad_norm": 6.308820724487305,
"learning_rate": 2.6858500914076785e-05,
"loss": 0.6676,
"step": 1440
},
{
"epoch": 0.5301645338208409,
"grad_norm": 6.80699348449707,
"learning_rate": 2.683656307129799e-05,
"loss": 0.6364,
"step": 1450
},
{
"epoch": 0.5338208409506399,
"grad_norm": 7.654812335968018,
"learning_rate": 2.6814625228519197e-05,
"loss": 0.8394,
"step": 1460
},
{
"epoch": 0.5374771480804388,
"grad_norm": 3.173919677734375,
"learning_rate": 2.6792687385740403e-05,
"loss": 0.4993,
"step": 1470
},
{
"epoch": 0.5411334552102377,
"grad_norm": 11.510188102722168,
"learning_rate": 2.677074954296161e-05,
"loss": 1.0199,
"step": 1480
},
{
"epoch": 0.5447897623400365,
"grad_norm": 9.919046401977539,
"learning_rate": 2.674881170018282e-05,
"loss": 0.8567,
"step": 1490
},
{
"epoch": 0.5484460694698354,
"grad_norm": 10.544548034667969,
"learning_rate": 2.672687385740402e-05,
"loss": 0.8208,
"step": 1500
},
{
"epoch": 0.5521023765996343,
"grad_norm": 10.39263916015625,
"learning_rate": 2.6704936014625228e-05,
"loss": 1.0027,
"step": 1510
},
{
"epoch": 0.5557586837294333,
"grad_norm": 7.957463264465332,
"learning_rate": 2.6682998171846434e-05,
"loss": 0.5865,
"step": 1520
},
{
"epoch": 0.5594149908592322,
"grad_norm": 6.65998649597168,
"learning_rate": 2.6661060329067643e-05,
"loss": 1.1056,
"step": 1530
},
{
"epoch": 0.5630712979890311,
"grad_norm": 4.286714553833008,
"learning_rate": 2.663912248628885e-05,
"loss": 0.923,
"step": 1540
},
{
"epoch": 0.56672760511883,
"grad_norm": 12.143743515014648,
"learning_rate": 2.6617184643510055e-05,
"loss": 0.8542,
"step": 1550
},
{
"epoch": 0.5703839122486288,
"grad_norm": 7.362223148345947,
"learning_rate": 2.659524680073126e-05,
"loss": 0.9177,
"step": 1560
},
{
"epoch": 0.5740402193784278,
"grad_norm": 8.774934768676758,
"learning_rate": 2.657330895795247e-05,
"loss": 0.7503,
"step": 1570
},
{
"epoch": 0.5776965265082267,
"grad_norm": 7.924509048461914,
"learning_rate": 2.6551371115173677e-05,
"loss": 0.8291,
"step": 1580
},
{
"epoch": 0.5813528336380256,
"grad_norm": 4.72158145904541,
"learning_rate": 2.6529433272394883e-05,
"loss": 0.6627,
"step": 1590
},
{
"epoch": 0.5850091407678245,
"grad_norm": 4.265242576599121,
"learning_rate": 2.6507495429616086e-05,
"loss": 0.618,
"step": 1600
},
{
"epoch": 0.5886654478976234,
"grad_norm": 7.109083652496338,
"learning_rate": 2.6485557586837292e-05,
"loss": 0.7683,
"step": 1610
},
{
"epoch": 0.5923217550274223,
"grad_norm": 8.804269790649414,
"learning_rate": 2.6463619744058502e-05,
"loss": 0.6226,
"step": 1620
},
{
"epoch": 0.5959780621572212,
"grad_norm": 5.748142242431641,
"learning_rate": 2.6441681901279708e-05,
"loss": 0.6175,
"step": 1630
},
{
"epoch": 0.5996343692870201,
"grad_norm": 10.173929214477539,
"learning_rate": 2.6419744058500914e-05,
"loss": 0.6959,
"step": 1640
},
{
"epoch": 0.603290676416819,
"grad_norm": 6.71423864364624,
"learning_rate": 2.639780621572212e-05,
"loss": 0.6785,
"step": 1650
},
{
"epoch": 0.6069469835466179,
"grad_norm": 11.05833625793457,
"learning_rate": 2.637586837294333e-05,
"loss": 0.6683,
"step": 1660
},
{
"epoch": 0.6106032906764168,
"grad_norm": 8.08876895904541,
"learning_rate": 2.6353930530164536e-05,
"loss": 0.8345,
"step": 1670
},
{
"epoch": 0.6142595978062158,
"grad_norm": 8.007697105407715,
"learning_rate": 2.6331992687385742e-05,
"loss": 0.9306,
"step": 1680
},
{
"epoch": 0.6179159049360147,
"grad_norm": 8.34351921081543,
"learning_rate": 2.6310054844606948e-05,
"loss": 0.9681,
"step": 1690
},
{
"epoch": 0.6215722120658135,
"grad_norm": 9.194400787353516,
"learning_rate": 2.6288117001828154e-05,
"loss": 0.9323,
"step": 1700
},
{
"epoch": 0.6252285191956124,
"grad_norm": 3.603123903274536,
"learning_rate": 2.626617915904936e-05,
"loss": 0.67,
"step": 1710
},
{
"epoch": 0.6288848263254113,
"grad_norm": 6.769972801208496,
"learning_rate": 2.6244241316270567e-05,
"loss": 0.6847,
"step": 1720
},
{
"epoch": 0.6325411334552102,
"grad_norm": 6.123934745788574,
"learning_rate": 2.6222303473491773e-05,
"loss": 0.5735,
"step": 1730
},
{
"epoch": 0.6361974405850092,
"grad_norm": 8.356404304504395,
"learning_rate": 2.620036563071298e-05,
"loss": 0.7249,
"step": 1740
},
{
"epoch": 0.6398537477148081,
"grad_norm": 5.085474014282227,
"learning_rate": 2.617842778793419e-05,
"loss": 0.8015,
"step": 1750
},
{
"epoch": 0.643510054844607,
"grad_norm": 6.887426376342773,
"learning_rate": 2.6156489945155395e-05,
"loss": 0.6637,
"step": 1760
},
{
"epoch": 0.6471663619744058,
"grad_norm": 7.155372619628906,
"learning_rate": 2.61345521023766e-05,
"loss": 0.6163,
"step": 1770
},
{
"epoch": 0.6508226691042047,
"grad_norm": 10.486412048339844,
"learning_rate": 2.6112614259597807e-05,
"loss": 0.7365,
"step": 1780
},
{
"epoch": 0.6544789762340036,
"grad_norm": 8.337804794311523,
"learning_rate": 2.6090676416819016e-05,
"loss": 0.6944,
"step": 1790
},
{
"epoch": 0.6581352833638026,
"grad_norm": 8.610974311828613,
"learning_rate": 2.606873857404022e-05,
"loss": 0.6498,
"step": 1800
},
{
"epoch": 0.6617915904936015,
"grad_norm": 9.723325729370117,
"learning_rate": 2.6046800731261425e-05,
"loss": 0.6993,
"step": 1810
},
{
"epoch": 0.6654478976234004,
"grad_norm": 9.187579154968262,
"learning_rate": 2.602486288848263e-05,
"loss": 0.8795,
"step": 1820
},
{
"epoch": 0.6691042047531993,
"grad_norm": 9.775445938110352,
"learning_rate": 2.600292504570384e-05,
"loss": 0.8081,
"step": 1830
},
{
"epoch": 0.6727605118829981,
"grad_norm": 10.012187004089355,
"learning_rate": 2.5980987202925047e-05,
"loss": 0.7079,
"step": 1840
},
{
"epoch": 0.676416819012797,
"grad_norm": 10.074971199035645,
"learning_rate": 2.5959049360146253e-05,
"loss": 0.6554,
"step": 1850
},
{
"epoch": 0.680073126142596,
"grad_norm": 11.149927139282227,
"learning_rate": 2.593711151736746e-05,
"loss": 0.8357,
"step": 1860
},
{
"epoch": 0.6837294332723949,
"grad_norm": 5.098260879516602,
"learning_rate": 2.5915173674588666e-05,
"loss": 0.7488,
"step": 1870
},
{
"epoch": 0.6873857404021938,
"grad_norm": 8.32321834564209,
"learning_rate": 2.5893235831809875e-05,
"loss": 0.7639,
"step": 1880
},
{
"epoch": 0.6910420475319927,
"grad_norm": 8.753900527954102,
"learning_rate": 2.587129798903108e-05,
"loss": 0.8777,
"step": 1890
},
{
"epoch": 0.6946983546617916,
"grad_norm": 5.129249095916748,
"learning_rate": 2.5849360146252284e-05,
"loss": 0.7593,
"step": 1900
},
{
"epoch": 0.6983546617915904,
"grad_norm": 10.712813377380371,
"learning_rate": 2.582742230347349e-05,
"loss": 0.6266,
"step": 1910
},
{
"epoch": 0.7020109689213894,
"grad_norm": 4.966675758361816,
"learning_rate": 2.58054844606947e-05,
"loss": 0.7851,
"step": 1920
},
{
"epoch": 0.7056672760511883,
"grad_norm": 4.763036727905273,
"learning_rate": 2.5783546617915906e-05,
"loss": 0.9193,
"step": 1930
},
{
"epoch": 0.7093235831809872,
"grad_norm": 10.881400108337402,
"learning_rate": 2.5761608775137112e-05,
"loss": 0.7159,
"step": 1940
},
{
"epoch": 0.7129798903107861,
"grad_norm": 8.307093620300293,
"learning_rate": 2.5739670932358318e-05,
"loss": 0.7091,
"step": 1950
},
{
"epoch": 0.716636197440585,
"grad_norm": 8.85936450958252,
"learning_rate": 2.5717733089579524e-05,
"loss": 0.6216,
"step": 1960
},
{
"epoch": 0.720292504570384,
"grad_norm": 8.200945854187012,
"learning_rate": 2.5695795246800734e-05,
"loss": 0.7674,
"step": 1970
},
{
"epoch": 0.7239488117001828,
"grad_norm": 6.665803909301758,
"learning_rate": 2.567385740402194e-05,
"loss": 0.5824,
"step": 1980
},
{
"epoch": 0.7276051188299817,
"grad_norm": 13.1766357421875,
"learning_rate": 2.5651919561243146e-05,
"loss": 0.8602,
"step": 1990
},
{
"epoch": 0.7312614259597806,
"grad_norm": 12.900677680969238,
"learning_rate": 2.562998171846435e-05,
"loss": 0.7945,
"step": 2000
},
{
"epoch": 0.7349177330895795,
"grad_norm": 8.223727226257324,
"learning_rate": 2.5608043875685558e-05,
"loss": 0.7058,
"step": 2010
},
{
"epoch": 0.7385740402193784,
"grad_norm": 5.132645606994629,
"learning_rate": 2.5586106032906764e-05,
"loss": 0.6005,
"step": 2020
},
{
"epoch": 0.7422303473491774,
"grad_norm": 5.319431304931641,
"learning_rate": 2.556416819012797e-05,
"loss": 0.6141,
"step": 2030
},
{
"epoch": 0.7458866544789763,
"grad_norm": 4.22127628326416,
"learning_rate": 2.5542230347349177e-05,
"loss": 0.7697,
"step": 2040
},
{
"epoch": 0.7495429616087751,
"grad_norm": 7.919135093688965,
"learning_rate": 2.5520292504570386e-05,
"loss": 0.6771,
"step": 2050
},
{
"epoch": 0.753199268738574,
"grad_norm": 8.82950496673584,
"learning_rate": 2.5498354661791592e-05,
"loss": 0.7459,
"step": 2060
},
{
"epoch": 0.7568555758683729,
"grad_norm": 6.079866886138916,
"learning_rate": 2.54764168190128e-05,
"loss": 0.78,
"step": 2070
},
{
"epoch": 0.7605118829981719,
"grad_norm": 9.02277660369873,
"learning_rate": 2.5454478976234005e-05,
"loss": 0.6527,
"step": 2080
},
{
"epoch": 0.7641681901279708,
"grad_norm": 7.963276386260986,
"learning_rate": 2.543254113345521e-05,
"loss": 0.9617,
"step": 2090
},
{
"epoch": 0.7678244972577697,
"grad_norm": 15.237689971923828,
"learning_rate": 2.5410603290676417e-05,
"loss": 0.9292,
"step": 2100
},
{
"epoch": 0.7714808043875686,
"grad_norm": 8.40709114074707,
"learning_rate": 2.5388665447897623e-05,
"loss": 1.1213,
"step": 2110
},
{
"epoch": 0.7751371115173674,
"grad_norm": 15.25880241394043,
"learning_rate": 2.536672760511883e-05,
"loss": 0.9129,
"step": 2120
},
{
"epoch": 0.7787934186471663,
"grad_norm": 9.398399353027344,
"learning_rate": 2.5344789762340035e-05,
"loss": 1.0284,
"step": 2130
},
{
"epoch": 0.7824497257769653,
"grad_norm": 9.999375343322754,
"learning_rate": 2.5322851919561245e-05,
"loss": 0.9143,
"step": 2140
},
{
"epoch": 0.7861060329067642,
"grad_norm": 6.247265815734863,
"learning_rate": 2.530091407678245e-05,
"loss": 0.5627,
"step": 2150
},
{
"epoch": 0.7897623400365631,
"grad_norm": 11.39775276184082,
"learning_rate": 2.5278976234003657e-05,
"loss": 0.5297,
"step": 2160
},
{
"epoch": 0.793418647166362,
"grad_norm": 7.309044361114502,
"learning_rate": 2.5257038391224863e-05,
"loss": 0.7952,
"step": 2170
},
{
"epoch": 0.7970749542961609,
"grad_norm": 4.260741710662842,
"learning_rate": 2.5235100548446073e-05,
"loss": 0.6773,
"step": 2180
},
{
"epoch": 0.8007312614259597,
"grad_norm": 6.936405658721924,
"learning_rate": 2.521316270566728e-05,
"loss": 0.8001,
"step": 2190
},
{
"epoch": 0.8043875685557587,
"grad_norm": 6.857205390930176,
"learning_rate": 2.5191224862888482e-05,
"loss": 0.6795,
"step": 2200
},
{
"epoch": 0.8080438756855576,
"grad_norm": 14.970353126525879,
"learning_rate": 2.5169287020109688e-05,
"loss": 1.1865,
"step": 2210
},
{
"epoch": 0.8117001828153565,
"grad_norm": 16.46977424621582,
"learning_rate": 2.5147349177330894e-05,
"loss": 1.2037,
"step": 2220
},
{
"epoch": 0.8153564899451554,
"grad_norm": 4.785205841064453,
"learning_rate": 2.5125411334552104e-05,
"loss": 0.8542,
"step": 2230
},
{
"epoch": 0.8190127970749543,
"grad_norm": 8.814366340637207,
"learning_rate": 2.510347349177331e-05,
"loss": 1.0594,
"step": 2240
},
{
"epoch": 0.8226691042047533,
"grad_norm": 6.870213031768799,
"learning_rate": 2.5081535648994516e-05,
"loss": 0.8027,
"step": 2250
},
{
"epoch": 0.8263254113345521,
"grad_norm": 3.1548120975494385,
"learning_rate": 2.5059597806215722e-05,
"loss": 0.5979,
"step": 2260
},
{
"epoch": 0.829981718464351,
"grad_norm": 7.584613800048828,
"learning_rate": 2.503765996343693e-05,
"loss": 0.6417,
"step": 2270
},
{
"epoch": 0.8336380255941499,
"grad_norm": 10.385662078857422,
"learning_rate": 2.5015722120658138e-05,
"loss": 0.9269,
"step": 2280
},
{
"epoch": 0.8372943327239488,
"grad_norm": 4.356326103210449,
"learning_rate": 2.4993784277879344e-05,
"loss": 0.7558,
"step": 2290
},
{
"epoch": 0.8409506398537477,
"grad_norm": 12.305597305297852,
"learning_rate": 2.4971846435100547e-05,
"loss": 0.9336,
"step": 2300
},
{
"epoch": 0.8446069469835467,
"grad_norm": 12.440481185913086,
"learning_rate": 2.4949908592321753e-05,
"loss": 0.7337,
"step": 2310
},
{
"epoch": 0.8482632541133455,
"grad_norm": 14.280756950378418,
"learning_rate": 2.4927970749542962e-05,
"loss": 0.7303,
"step": 2320
},
{
"epoch": 0.8519195612431444,
"grad_norm": 3.728710412979126,
"learning_rate": 2.490603290676417e-05,
"loss": 0.8716,
"step": 2330
},
{
"epoch": 0.8555758683729433,
"grad_norm": 7.865159034729004,
"learning_rate": 2.4884095063985374e-05,
"loss": 1.0542,
"step": 2340
},
{
"epoch": 0.8592321755027422,
"grad_norm": 8.721333503723145,
"learning_rate": 2.486215722120658e-05,
"loss": 1.0551,
"step": 2350
},
{
"epoch": 0.8628884826325411,
"grad_norm": 1.7179598808288574,
"learning_rate": 2.484021937842779e-05,
"loss": 0.7777,
"step": 2360
},
{
"epoch": 0.8665447897623401,
"grad_norm": 5.079452037811279,
"learning_rate": 2.4818281535648996e-05,
"loss": 0.7584,
"step": 2370
},
{
"epoch": 0.870201096892139,
"grad_norm": 2.566901683807373,
"learning_rate": 2.4796343692870202e-05,
"loss": 0.9418,
"step": 2380
},
{
"epoch": 0.8738574040219378,
"grad_norm": 0.8049097061157227,
"learning_rate": 2.477440585009141e-05,
"loss": 1.0866,
"step": 2390
},
{
"epoch": 0.8775137111517367,
"grad_norm": 12.45459270477295,
"learning_rate": 2.4752468007312615e-05,
"loss": 0.9516,
"step": 2400
},
{
"epoch": 0.8811700182815356,
"grad_norm": 10.37132453918457,
"learning_rate": 2.473053016453382e-05,
"loss": 0.885,
"step": 2410
},
{
"epoch": 0.8848263254113345,
"grad_norm": 11.392967224121094,
"learning_rate": 2.4708592321755027e-05,
"loss": 0.8999,
"step": 2420
},
{
"epoch": 0.8884826325411335,
"grad_norm": 9.597825050354004,
"learning_rate": 2.4686654478976233e-05,
"loss": 0.7255,
"step": 2430
},
{
"epoch": 0.8921389396709324,
"grad_norm": 6.229734897613525,
"learning_rate": 2.466471663619744e-05,
"loss": 0.6297,
"step": 2440
},
{
"epoch": 0.8957952468007313,
"grad_norm": 6.92341423034668,
"learning_rate": 2.464277879341865e-05,
"loss": 0.8446,
"step": 2450
},
{
"epoch": 0.8994515539305301,
"grad_norm": 6.999603748321533,
"learning_rate": 2.4620840950639855e-05,
"loss": 0.9243,
"step": 2460
},
{
"epoch": 0.903107861060329,
"grad_norm": 6.688783645629883,
"learning_rate": 2.459890310786106e-05,
"loss": 0.9761,
"step": 2470
},
{
"epoch": 0.906764168190128,
"grad_norm": 4.743894577026367,
"learning_rate": 2.4576965265082267e-05,
"loss": 0.8031,
"step": 2480
},
{
"epoch": 0.9104204753199269,
"grad_norm": 5.617483139038086,
"learning_rate": 2.4555027422303477e-05,
"loss": 0.6959,
"step": 2490
},
{
"epoch": 0.9140767824497258,
"grad_norm": 8.579802513122559,
"learning_rate": 2.4533089579524683e-05,
"loss": 0.9484,
"step": 2500
},
{
"epoch": 0.9177330895795247,
"grad_norm": 13.061285018920898,
"learning_rate": 2.4511151736745886e-05,
"loss": 1.2487,
"step": 2510
},
{
"epoch": 0.9213893967093236,
"grad_norm": 9.990108489990234,
"learning_rate": 2.4489213893967092e-05,
"loss": 0.8655,
"step": 2520
},
{
"epoch": 0.9250457038391224,
"grad_norm": 7.661534309387207,
"learning_rate": 2.44672760511883e-05,
"loss": 0.8729,
"step": 2530
},
{
"epoch": 0.9287020109689214,
"grad_norm": 3.4216301441192627,
"learning_rate": 2.4445338208409507e-05,
"loss": 0.6208,
"step": 2540
},
{
"epoch": 0.9323583180987203,
"grad_norm": 6.860260963439941,
"learning_rate": 2.4423400365630714e-05,
"loss": 0.8488,
"step": 2550
},
{
"epoch": 0.9360146252285192,
"grad_norm": 8.332857131958008,
"learning_rate": 2.440146252285192e-05,
"loss": 0.5859,
"step": 2560
},
{
"epoch": 0.9396709323583181,
"grad_norm": 6.4805402755737305,
"learning_rate": 2.4379524680073126e-05,
"loss": 0.7436,
"step": 2570
},
{
"epoch": 0.943327239488117,
"grad_norm": 5.344940662384033,
"learning_rate": 2.4357586837294335e-05,
"loss": 1.0088,
"step": 2580
},
{
"epoch": 0.946983546617916,
"grad_norm": 9.946269035339355,
"learning_rate": 2.433564899451554e-05,
"loss": 0.7087,
"step": 2590
},
{
"epoch": 0.9506398537477148,
"grad_norm": 4.209563255310059,
"learning_rate": 2.4313711151736748e-05,
"loss": 0.5656,
"step": 2600
},
{
"epoch": 0.9542961608775137,
"grad_norm": 4.404534816741943,
"learning_rate": 2.429177330895795e-05,
"loss": 0.6234,
"step": 2610
},
{
"epoch": 0.9579524680073126,
"grad_norm": 4.724971294403076,
"learning_rate": 2.426983546617916e-05,
"loss": 0.7714,
"step": 2620
},
{
"epoch": 0.9616087751371115,
"grad_norm": 6.836884498596191,
"learning_rate": 2.4247897623400366e-05,
"loss": 0.8142,
"step": 2630
},
{
"epoch": 0.9652650822669104,
"grad_norm": 3.4139904975891113,
"learning_rate": 2.4225959780621572e-05,
"loss": 0.5885,
"step": 2640
},
{
"epoch": 0.9689213893967094,
"grad_norm": 13.546429634094238,
"learning_rate": 2.420402193784278e-05,
"loss": 0.8844,
"step": 2650
},
{
"epoch": 0.9725776965265083,
"grad_norm": 5.117456436157227,
"learning_rate": 2.4182084095063988e-05,
"loss": 0.7408,
"step": 2660
},
{
"epoch": 0.9762340036563071,
"grad_norm": 11.973124504089355,
"learning_rate": 2.4160146252285194e-05,
"loss": 0.9015,
"step": 2670
},
{
"epoch": 0.979890310786106,
"grad_norm": 7.9256815910339355,
"learning_rate": 2.41382084095064e-05,
"loss": 0.8179,
"step": 2680
},
{
"epoch": 0.9835466179159049,
"grad_norm": 0.613832414150238,
"learning_rate": 2.4116270566727606e-05,
"loss": 0.8218,
"step": 2690
},
{
"epoch": 0.9872029250457038,
"grad_norm": 2.720730781555176,
"learning_rate": 2.4094332723948813e-05,
"loss": 0.5718,
"step": 2700
},
{
"epoch": 0.9908592321755028,
"grad_norm": 5.895959854125977,
"learning_rate": 2.407239488117002e-05,
"loss": 0.9486,
"step": 2710
},
{
"epoch": 0.9945155393053017,
"grad_norm": 6.581000804901123,
"learning_rate": 2.4050457038391225e-05,
"loss": 0.7959,
"step": 2720
},
{
"epoch": 0.9981718464351006,
"grad_norm": 7.979818344116211,
"learning_rate": 2.402851919561243e-05,
"loss": 0.5748,
"step": 2730
},
{
"epoch": 1.0018281535648994,
"grad_norm": 5.917481422424316,
"learning_rate": 2.4006581352833637e-05,
"loss": 1.0186,
"step": 2740
},
{
"epoch": 1.0054844606946984,
"grad_norm": 1.8859217166900635,
"learning_rate": 2.3984643510054847e-05,
"loss": 0.8374,
"step": 2750
},
{
"epoch": 1.0091407678244972,
"grad_norm": 10.354247093200684,
"learning_rate": 2.3962705667276053e-05,
"loss": 1.0679,
"step": 2760
},
{
"epoch": 1.012797074954296,
"grad_norm": 6.047128677368164,
"learning_rate": 2.394076782449726e-05,
"loss": 1.0716,
"step": 2770
},
{
"epoch": 1.016453382084095,
"grad_norm": 11.777497291564941,
"learning_rate": 2.3918829981718465e-05,
"loss": 0.8161,
"step": 2780
},
{
"epoch": 1.0201096892138939,
"grad_norm": 3.427635908126831,
"learning_rate": 2.389689213893967e-05,
"loss": 0.6662,
"step": 2790
},
{
"epoch": 1.023765996343693,
"grad_norm": 14.091401100158691,
"learning_rate": 2.387495429616088e-05,
"loss": 0.7342,
"step": 2800
},
{
"epoch": 1.0274223034734917,
"grad_norm": 6.376955032348633,
"learning_rate": 2.3853016453382083e-05,
"loss": 0.6482,
"step": 2810
},
{
"epoch": 1.0310786106032908,
"grad_norm": 3.5191450119018555,
"learning_rate": 2.383107861060329e-05,
"loss": 0.9219,
"step": 2820
},
{
"epoch": 1.0347349177330896,
"grad_norm": 4.531268119812012,
"learning_rate": 2.3809140767824496e-05,
"loss": 1.2418,
"step": 2830
},
{
"epoch": 1.0383912248628886,
"grad_norm": 6.366710186004639,
"learning_rate": 2.3787202925045705e-05,
"loss": 0.6533,
"step": 2840
},
{
"epoch": 1.0420475319926874,
"grad_norm": 2.5387659072875977,
"learning_rate": 2.376526508226691e-05,
"loss": 1.0503,
"step": 2850
},
{
"epoch": 1.0457038391224862,
"grad_norm": 3.4339308738708496,
"learning_rate": 2.3743327239488118e-05,
"loss": 1.5317,
"step": 2860
},
{
"epoch": 1.0493601462522852,
"grad_norm": 20.403852462768555,
"learning_rate": 2.3721389396709324e-05,
"loss": 1.1739,
"step": 2870
},
{
"epoch": 1.053016453382084,
"grad_norm": 9.94764232635498,
"learning_rate": 2.3699451553930533e-05,
"loss": 0.9365,
"step": 2880
},
{
"epoch": 1.056672760511883,
"grad_norm": 4.770013332366943,
"learning_rate": 2.367751371115174e-05,
"loss": 0.4251,
"step": 2890
},
{
"epoch": 1.0603290676416819,
"grad_norm": 1.9703326225280762,
"learning_rate": 2.3655575868372945e-05,
"loss": 0.6515,
"step": 2900
},
{
"epoch": 1.0639853747714807,
"grad_norm": 9.562021255493164,
"learning_rate": 2.3633638025594148e-05,
"loss": 0.7161,
"step": 2910
},
{
"epoch": 1.0676416819012797,
"grad_norm": 10.26481819152832,
"learning_rate": 2.3611700182815354e-05,
"loss": 0.7187,
"step": 2920
},
{
"epoch": 1.0712979890310785,
"grad_norm": 3.004570722579956,
"learning_rate": 2.3589762340036564e-05,
"loss": 1.0414,
"step": 2930
},
{
"epoch": 1.0749542961608776,
"grad_norm": 9.800512313842773,
"learning_rate": 2.356782449725777e-05,
"loss": 0.7966,
"step": 2940
},
{
"epoch": 1.0786106032906764,
"grad_norm": 13.301290512084961,
"learning_rate": 2.3545886654478976e-05,
"loss": 0.9953,
"step": 2950
},
{
"epoch": 1.0822669104204754,
"grad_norm": 2.7511966228485107,
"learning_rate": 2.3523948811700182e-05,
"loss": 0.705,
"step": 2960
},
{
"epoch": 1.0859232175502742,
"grad_norm": 5.51497220993042,
"learning_rate": 2.3502010968921392e-05,
"loss": 0.8642,
"step": 2970
},
{
"epoch": 1.0895795246800732,
"grad_norm": 6.78330659866333,
"learning_rate": 2.3480073126142598e-05,
"loss": 0.7787,
"step": 2980
},
{
"epoch": 1.093235831809872,
"grad_norm": 4.385842323303223,
"learning_rate": 2.3458135283363804e-05,
"loss": 1.0877,
"step": 2990
},
{
"epoch": 1.0968921389396709,
"grad_norm": 6.217209815979004,
"learning_rate": 2.343619744058501e-05,
"loss": 0.8384,
"step": 3000
},
{
"epoch": 1.1005484460694699,
"grad_norm": 13.187909126281738,
"learning_rate": 2.3414259597806216e-05,
"loss": 0.9411,
"step": 3010
},
{
"epoch": 1.1042047531992687,
"grad_norm": 0.5087007880210876,
"learning_rate": 2.3392321755027423e-05,
"loss": 0.9831,
"step": 3020
},
{
"epoch": 1.1078610603290677,
"grad_norm": 1.0318357944488525,
"learning_rate": 2.337038391224863e-05,
"loss": 0.6656,
"step": 3030
},
{
"epoch": 1.1115173674588665,
"grad_norm": 7.319566249847412,
"learning_rate": 2.3348446069469835e-05,
"loss": 0.8284,
"step": 3040
},
{
"epoch": 1.1151736745886653,
"grad_norm": 3.79536771774292,
"learning_rate": 2.332650822669104e-05,
"loss": 0.9016,
"step": 3050
},
{
"epoch": 1.1188299817184644,
"grad_norm": 8.989640235900879,
"learning_rate": 2.330457038391225e-05,
"loss": 0.7304,
"step": 3060
},
{
"epoch": 1.1224862888482632,
"grad_norm": 5.405416965484619,
"learning_rate": 2.3282632541133457e-05,
"loss": 1.0426,
"step": 3070
},
{
"epoch": 1.1261425959780622,
"grad_norm": 2.653970241546631,
"learning_rate": 2.3260694698354663e-05,
"loss": 0.9311,
"step": 3080
},
{
"epoch": 1.129798903107861,
"grad_norm": 0.901639997959137,
"learning_rate": 2.323875685557587e-05,
"loss": 0.9487,
"step": 3090
},
{
"epoch": 1.13345521023766,
"grad_norm": 4.24121618270874,
"learning_rate": 2.321681901279708e-05,
"loss": 0.7323,
"step": 3100
},
{
"epoch": 1.1371115173674589,
"grad_norm": 7.49923849105835,
"learning_rate": 2.319488117001828e-05,
"loss": 0.6855,
"step": 3110
},
{
"epoch": 1.1407678244972579,
"grad_norm": 2.1442363262176514,
"learning_rate": 2.3172943327239487e-05,
"loss": 0.8922,
"step": 3120
},
{
"epoch": 1.1444241316270567,
"grad_norm": 7.328529357910156,
"learning_rate": 2.3151005484460694e-05,
"loss": 0.8567,
"step": 3130
},
{
"epoch": 1.1480804387568555,
"grad_norm": 2.2346909046173096,
"learning_rate": 2.31290676416819e-05,
"loss": 0.9295,
"step": 3140
},
{
"epoch": 1.1517367458866545,
"grad_norm": 1.7337790727615356,
"learning_rate": 2.310712979890311e-05,
"loss": 0.9011,
"step": 3150
},
{
"epoch": 1.1553930530164533,
"grad_norm": 9.902853012084961,
"learning_rate": 2.3085191956124315e-05,
"loss": 1.1711,
"step": 3160
},
{
"epoch": 1.1590493601462524,
"grad_norm": 4.4967217445373535,
"learning_rate": 2.306325411334552e-05,
"loss": 0.8189,
"step": 3170
},
{
"epoch": 1.1627056672760512,
"grad_norm": 9.251031875610352,
"learning_rate": 2.3041316270566728e-05,
"loss": 1.0505,
"step": 3180
},
{
"epoch": 1.16636197440585,
"grad_norm": 9.177526473999023,
"learning_rate": 2.3019378427787937e-05,
"loss": 0.9835,
"step": 3190
},
{
"epoch": 1.170018281535649,
"grad_norm": 2.573434352874756,
"learning_rate": 2.2997440585009143e-05,
"loss": 0.7751,
"step": 3200
},
{
"epoch": 1.1736745886654478,
"grad_norm": 8.38436508178711,
"learning_rate": 2.2975502742230346e-05,
"loss": 0.9563,
"step": 3210
},
{
"epoch": 1.1773308957952469,
"grad_norm": 10.322296142578125,
"learning_rate": 2.2953564899451552e-05,
"loss": 1.0326,
"step": 3220
},
{
"epoch": 1.1809872029250457,
"grad_norm": 1.9485523700714111,
"learning_rate": 2.2931627056672762e-05,
"loss": 1.037,
"step": 3230
},
{
"epoch": 1.1846435100548447,
"grad_norm": 4.380084991455078,
"learning_rate": 2.2909689213893968e-05,
"loss": 0.56,
"step": 3240
},
{
"epoch": 1.1882998171846435,
"grad_norm": 6.871321201324463,
"learning_rate": 2.2887751371115174e-05,
"loss": 0.7471,
"step": 3250
},
{
"epoch": 1.1919561243144425,
"grad_norm": 9.694079399108887,
"learning_rate": 2.286581352833638e-05,
"loss": 0.9119,
"step": 3260
},
{
"epoch": 1.1956124314442413,
"grad_norm": 5.262477874755859,
"learning_rate": 2.2843875685557586e-05,
"loss": 0.6997,
"step": 3270
},
{
"epoch": 1.1992687385740401,
"grad_norm": 4.27209997177124,
"learning_rate": 2.2821937842778796e-05,
"loss": 0.5484,
"step": 3280
},
{
"epoch": 1.2029250457038392,
"grad_norm": 7.245287895202637,
"learning_rate": 2.2800000000000002e-05,
"loss": 0.5919,
"step": 3290
},
{
"epoch": 1.206581352833638,
"grad_norm": 4.369983196258545,
"learning_rate": 2.2778062157221208e-05,
"loss": 1.1287,
"step": 3300
},
{
"epoch": 1.210237659963437,
"grad_norm": 1.8020730018615723,
"learning_rate": 2.275612431444241e-05,
"loss": 0.6352,
"step": 3310
},
{
"epoch": 1.2138939670932358,
"grad_norm": 4.279252529144287,
"learning_rate": 2.273418647166362e-05,
"loss": 0.8017,
"step": 3320
},
{
"epoch": 1.2175502742230346,
"grad_norm": 4.222424030303955,
"learning_rate": 2.2712248628884826e-05,
"loss": 0.7692,
"step": 3330
},
{
"epoch": 1.2212065813528337,
"grad_norm": 3.430072069168091,
"learning_rate": 2.2690310786106033e-05,
"loss": 0.7481,
"step": 3340
},
{
"epoch": 1.2248628884826325,
"grad_norm": 5.211468696594238,
"learning_rate": 2.266837294332724e-05,
"loss": 0.8037,
"step": 3350
},
{
"epoch": 1.2285191956124315,
"grad_norm": 9.226336479187012,
"learning_rate": 2.264643510054845e-05,
"loss": 0.9476,
"step": 3360
},
{
"epoch": 1.2321755027422303,
"grad_norm": 4.394392967224121,
"learning_rate": 2.2624497257769654e-05,
"loss": 0.6557,
"step": 3370
},
{
"epoch": 1.2358318098720293,
"grad_norm": 4.641608238220215,
"learning_rate": 2.260255941499086e-05,
"loss": 0.6668,
"step": 3380
},
{
"epoch": 1.2394881170018281,
"grad_norm": 8.342939376831055,
"learning_rate": 2.2580621572212067e-05,
"loss": 0.7831,
"step": 3390
},
{
"epoch": 1.2431444241316272,
"grad_norm": 0.8947893381118774,
"learning_rate": 2.2558683729433273e-05,
"loss": 0.5656,
"step": 3400
},
{
"epoch": 1.246800731261426,
"grad_norm": 6.079960346221924,
"learning_rate": 2.253674588665448e-05,
"loss": 0.9966,
"step": 3410
},
{
"epoch": 1.2504570383912248,
"grad_norm": 9.329411506652832,
"learning_rate": 2.2514808043875685e-05,
"loss": 0.999,
"step": 3420
},
{
"epoch": 1.2541133455210238,
"grad_norm": 5.371129512786865,
"learning_rate": 2.249287020109689e-05,
"loss": 0.5545,
"step": 3430
},
{
"epoch": 1.2577696526508226,
"grad_norm": 5.013857364654541,
"learning_rate": 2.2470932358318097e-05,
"loss": 1.1535,
"step": 3440
},
{
"epoch": 1.2614259597806217,
"grad_norm": 6.94247579574585,
"learning_rate": 2.2448994515539307e-05,
"loss": 1.0131,
"step": 3450
},
{
"epoch": 1.2650822669104205,
"grad_norm": 1.685486078262329,
"learning_rate": 2.2427056672760513e-05,
"loss": 0.8378,
"step": 3460
},
{
"epoch": 1.2687385740402193,
"grad_norm": 4.796342372894287,
"learning_rate": 2.240511882998172e-05,
"loss": 0.6338,
"step": 3470
},
{
"epoch": 1.2723948811700183,
"grad_norm": 5.746938705444336,
"learning_rate": 2.2383180987202925e-05,
"loss": 0.8043,
"step": 3480
},
{
"epoch": 1.2760511882998171,
"grad_norm": 5.947088718414307,
"learning_rate": 2.236124314442413e-05,
"loss": 0.5994,
"step": 3490
},
{
"epoch": 1.2797074954296161,
"grad_norm": 1.3671913146972656,
"learning_rate": 2.233930530164534e-05,
"loss": 0.9907,
"step": 3500
},
{
"epoch": 1.283363802559415,
"grad_norm": 1.2178643941879272,
"learning_rate": 2.2317367458866544e-05,
"loss": 0.6638,
"step": 3510
},
{
"epoch": 1.2870201096892138,
"grad_norm": 8.354637145996094,
"learning_rate": 2.229542961608775e-05,
"loss": 1.1229,
"step": 3520
},
{
"epoch": 1.2906764168190128,
"grad_norm": 3.584672451019287,
"learning_rate": 2.2273491773308956e-05,
"loss": 0.7815,
"step": 3530
},
{
"epoch": 1.2943327239488118,
"grad_norm": 2.3532357215881348,
"learning_rate": 2.2251553930530166e-05,
"loss": 0.8076,
"step": 3540
},
{
"epoch": 1.2979890310786106,
"grad_norm": 3.357630729675293,
"learning_rate": 2.2229616087751372e-05,
"loss": 1.139,
"step": 3550
},
{
"epoch": 1.3016453382084094,
"grad_norm": 7.9423346519470215,
"learning_rate": 2.2207678244972578e-05,
"loss": 1.1081,
"step": 3560
},
{
"epoch": 1.3053016453382085,
"grad_norm": 10.97163200378418,
"learning_rate": 2.2185740402193784e-05,
"loss": 0.6949,
"step": 3570
},
{
"epoch": 1.3089579524680073,
"grad_norm": 3.48557448387146,
"learning_rate": 2.2163802559414994e-05,
"loss": 0.7585,
"step": 3580
},
{
"epoch": 1.3126142595978063,
"grad_norm": 7.3759565353393555,
"learning_rate": 2.21418647166362e-05,
"loss": 1.062,
"step": 3590
},
{
"epoch": 1.3162705667276051,
"grad_norm": 1.880183458328247,
"learning_rate": 2.2119926873857406e-05,
"loss": 0.6005,
"step": 3600
},
{
"epoch": 1.319926873857404,
"grad_norm": 2.7931017875671387,
"learning_rate": 2.2097989031078612e-05,
"loss": 0.5548,
"step": 3610
},
{
"epoch": 1.323583180987203,
"grad_norm": 10.527241706848145,
"learning_rate": 2.2076051188299815e-05,
"loss": 0.549,
"step": 3620
},
{
"epoch": 1.3272394881170018,
"grad_norm": 5.158708095550537,
"learning_rate": 2.2054113345521024e-05,
"loss": 1.5125,
"step": 3630
},
{
"epoch": 1.3308957952468008,
"grad_norm": 2.298628091812134,
"learning_rate": 2.203217550274223e-05,
"loss": 0.854,
"step": 3640
},
{
"epoch": 1.3345521023765996,
"grad_norm": 10.309005737304688,
"learning_rate": 2.2010237659963437e-05,
"loss": 1.0643,
"step": 3650
},
{
"epoch": 1.3382084095063984,
"grad_norm": 3.3284668922424316,
"learning_rate": 2.1988299817184643e-05,
"loss": 1.1608,
"step": 3660
},
{
"epoch": 1.3418647166361974,
"grad_norm": 2.4296984672546387,
"learning_rate": 2.1966361974405852e-05,
"loss": 0.8015,
"step": 3670
},
{
"epoch": 1.3455210237659965,
"grad_norm": 10.130197525024414,
"learning_rate": 2.194442413162706e-05,
"loss": 0.7151,
"step": 3680
},
{
"epoch": 1.3491773308957953,
"grad_norm": 9.950860023498535,
"learning_rate": 2.1922486288848265e-05,
"loss": 0.8923,
"step": 3690
},
{
"epoch": 1.352833638025594,
"grad_norm": 9.493358612060547,
"learning_rate": 2.190054844606947e-05,
"loss": 0.9494,
"step": 3700
},
{
"epoch": 1.3564899451553931,
"grad_norm": 5.511286735534668,
"learning_rate": 2.187861060329068e-05,
"loss": 0.6494,
"step": 3710
},
{
"epoch": 1.360146252285192,
"grad_norm": 0.475504994392395,
"learning_rate": 2.1856672760511883e-05,
"loss": 1.2245,
"step": 3720
},
{
"epoch": 1.363802559414991,
"grad_norm": 8.635137557983398,
"learning_rate": 2.183473491773309e-05,
"loss": 0.5039,
"step": 3730
},
{
"epoch": 1.3674588665447898,
"grad_norm": 3.8953351974487305,
"learning_rate": 2.1812797074954295e-05,
"loss": 0.5876,
"step": 3740
},
{
"epoch": 1.3711151736745886,
"grad_norm": 4.21866512298584,
"learning_rate": 2.17908592321755e-05,
"loss": 0.9671,
"step": 3750
},
{
"epoch": 1.3747714808043876,
"grad_norm": 6.784433364868164,
"learning_rate": 2.176892138939671e-05,
"loss": 1.0172,
"step": 3760
},
{
"epoch": 1.3784277879341864,
"grad_norm": 7.940158367156982,
"learning_rate": 2.1746983546617917e-05,
"loss": 0.8767,
"step": 3770
},
{
"epoch": 1.3820840950639854,
"grad_norm": 0.827899694442749,
"learning_rate": 2.1725045703839123e-05,
"loss": 0.924,
"step": 3780
},
{
"epoch": 1.3857404021937842,
"grad_norm": 4.189643383026123,
"learning_rate": 2.170310786106033e-05,
"loss": 1.0065,
"step": 3790
},
{
"epoch": 1.389396709323583,
"grad_norm": 1.9168022871017456,
"learning_rate": 2.168117001828154e-05,
"loss": 1.5282,
"step": 3800
},
{
"epoch": 1.393053016453382,
"grad_norm": 1.0433759689331055,
"learning_rate": 2.1659232175502745e-05,
"loss": 0.8177,
"step": 3810
},
{
"epoch": 1.3967093235831811,
"grad_norm": 7.197315216064453,
"learning_rate": 2.1637294332723948e-05,
"loss": 0.644,
"step": 3820
},
{
"epoch": 1.40036563071298,
"grad_norm": 4.568287372589111,
"learning_rate": 2.1615356489945154e-05,
"loss": 0.5555,
"step": 3830
},
{
"epoch": 1.4040219378427787,
"grad_norm": 9.683319091796875,
"learning_rate": 2.1593418647166363e-05,
"loss": 0.895,
"step": 3840
},
{
"epoch": 1.4076782449725778,
"grad_norm": 7.343099594116211,
"learning_rate": 2.157148080438757e-05,
"loss": 0.7645,
"step": 3850
},
{
"epoch": 1.4113345521023766,
"grad_norm": 8.893482208251953,
"learning_rate": 2.1549542961608776e-05,
"loss": 0.8237,
"step": 3860
},
{
"epoch": 1.4149908592321756,
"grad_norm": 9.558774948120117,
"learning_rate": 2.1527605118829982e-05,
"loss": 1.4352,
"step": 3870
},
{
"epoch": 1.4186471663619744,
"grad_norm": 3.180133819580078,
"learning_rate": 2.1505667276051188e-05,
"loss": 0.9726,
"step": 3880
},
{
"epoch": 1.4223034734917732,
"grad_norm": 4.226669788360596,
"learning_rate": 2.1483729433272397e-05,
"loss": 0.6608,
"step": 3890
},
{
"epoch": 1.4259597806215722,
"grad_norm": 2.0851640701293945,
"learning_rate": 2.1461791590493604e-05,
"loss": 0.8724,
"step": 3900
},
{
"epoch": 1.429616087751371,
"grad_norm": 1.5792533159255981,
"learning_rate": 2.143985374771481e-05,
"loss": 1.1509,
"step": 3910
},
{
"epoch": 1.43327239488117,
"grad_norm": 5.39309024810791,
"learning_rate": 2.1417915904936013e-05,
"loss": 0.733,
"step": 3920
},
{
"epoch": 1.436928702010969,
"grad_norm": 6.3452677726745605,
"learning_rate": 2.1395978062157222e-05,
"loss": 0.7428,
"step": 3930
},
{
"epoch": 1.4405850091407677,
"grad_norm": 4.476494312286377,
"learning_rate": 2.1374040219378428e-05,
"loss": 0.9109,
"step": 3940
},
{
"epoch": 1.4442413162705667,
"grad_norm": 11.283713340759277,
"learning_rate": 2.1352102376599634e-05,
"loss": 1.0231,
"step": 3950
},
{
"epoch": 1.4478976234003658,
"grad_norm": 3.30483341217041,
"learning_rate": 2.133016453382084e-05,
"loss": 1.0793,
"step": 3960
},
{
"epoch": 1.4515539305301646,
"grad_norm": 5.595132827758789,
"learning_rate": 2.1308226691042047e-05,
"loss": 1.5486,
"step": 3970
},
{
"epoch": 1.4552102376599634,
"grad_norm": 3.3429744243621826,
"learning_rate": 2.1286288848263256e-05,
"loss": 1.4396,
"step": 3980
},
{
"epoch": 1.4588665447897624,
"grad_norm": 2.220364570617676,
"learning_rate": 2.1264351005484462e-05,
"loss": 0.8467,
"step": 3990
},
{
"epoch": 1.4625228519195612,
"grad_norm": 1.5086268186569214,
"learning_rate": 2.124241316270567e-05,
"loss": 0.5857,
"step": 4000
},
{
"epoch": 1.4661791590493602,
"grad_norm": 7.653486251831055,
"learning_rate": 2.1220475319926875e-05,
"loss": 1.1748,
"step": 4010
},
{
"epoch": 1.469835466179159,
"grad_norm": 7.453839302062988,
"learning_rate": 2.119853747714808e-05,
"loss": 0.6077,
"step": 4020
},
{
"epoch": 1.4734917733089579,
"grad_norm": 5.1094441413879395,
"learning_rate": 2.1176599634369287e-05,
"loss": 0.7599,
"step": 4030
},
{
"epoch": 1.477148080438757,
"grad_norm": 8.580041885375977,
"learning_rate": 2.1154661791590493e-05,
"loss": 0.7057,
"step": 4040
},
{
"epoch": 1.4808043875685557,
"grad_norm": 5.279627799987793,
"learning_rate": 2.11327239488117e-05,
"loss": 0.7115,
"step": 4050
},
{
"epoch": 1.4844606946983547,
"grad_norm": 5.886457920074463,
"learning_rate": 2.111078610603291e-05,
"loss": 0.7922,
"step": 4060
},
{
"epoch": 1.4881170018281535,
"grad_norm": 8.935380935668945,
"learning_rate": 2.1088848263254115e-05,
"loss": 0.8798,
"step": 4070
},
{
"epoch": 1.4917733089579523,
"grad_norm": 4.792860984802246,
"learning_rate": 2.106691042047532e-05,
"loss": 0.7714,
"step": 4080
},
{
"epoch": 1.4954296160877514,
"grad_norm": 5.927025318145752,
"learning_rate": 2.1044972577696527e-05,
"loss": 0.8479,
"step": 4090
},
{
"epoch": 1.4990859232175504,
"grad_norm": 4.06768798828125,
"learning_rate": 2.1023034734917733e-05,
"loss": 1.0168,
"step": 4100
},
{
"epoch": 1.5027422303473492,
"grad_norm": 5.292023181915283,
"learning_rate": 2.1001096892138943e-05,
"loss": 1.1981,
"step": 4110
},
{
"epoch": 1.506398537477148,
"grad_norm": 8.131914138793945,
"learning_rate": 2.0979159049360146e-05,
"loss": 1.1474,
"step": 4120
},
{
"epoch": 1.5100548446069468,
"grad_norm": 9.737383842468262,
"learning_rate": 2.095722120658135e-05,
"loss": 0.8616,
"step": 4130
},
{
"epoch": 1.5137111517367459,
"grad_norm": 2.422138214111328,
"learning_rate": 2.0935283363802558e-05,
"loss": 0.8315,
"step": 4140
},
{
"epoch": 1.517367458866545,
"grad_norm": 1.734221339225769,
"learning_rate": 2.0913345521023767e-05,
"loss": 0.8787,
"step": 4150
},
{
"epoch": 1.5210237659963437,
"grad_norm": 0.9889214038848877,
"learning_rate": 2.0891407678244973e-05,
"loss": 0.7116,
"step": 4160
},
{
"epoch": 1.5246800731261425,
"grad_norm": 4.243373394012451,
"learning_rate": 2.086946983546618e-05,
"loss": 0.8402,
"step": 4170
},
{
"epoch": 1.5283363802559415,
"grad_norm": 3.111729860305786,
"learning_rate": 2.0847531992687386e-05,
"loss": 1.1098,
"step": 4180
},
{
"epoch": 1.5319926873857403,
"grad_norm": 1.9713119268417358,
"learning_rate": 2.0825594149908595e-05,
"loss": 0.7797,
"step": 4190
},
{
"epoch": 1.5356489945155394,
"grad_norm": 5.521538734436035,
"learning_rate": 2.08036563071298e-05,
"loss": 0.6614,
"step": 4200
},
{
"epoch": 1.5393053016453382,
"grad_norm": 2.166930675506592,
"learning_rate": 2.0781718464351008e-05,
"loss": 0.9268,
"step": 4210
},
{
"epoch": 1.542961608775137,
"grad_norm": 1.7511789798736572,
"learning_rate": 2.075978062157221e-05,
"loss": 0.8894,
"step": 4220
},
{
"epoch": 1.546617915904936,
"grad_norm": 8.769426345825195,
"learning_rate": 2.0737842778793416e-05,
"loss": 0.8558,
"step": 4230
},
{
"epoch": 1.550274223034735,
"grad_norm": 5.798864364624023,
"learning_rate": 2.0715904936014626e-05,
"loss": 0.767,
"step": 4240
},
{
"epoch": 1.5539305301645339,
"grad_norm": 5.127215385437012,
"learning_rate": 2.0693967093235832e-05,
"loss": 0.5996,
"step": 4250
},
{
"epoch": 1.5575868372943327,
"grad_norm": 3.0306711196899414,
"learning_rate": 2.0672029250457038e-05,
"loss": 0.4811,
"step": 4260
},
{
"epoch": 1.5612431444241315,
"grad_norm": 1.175572156906128,
"learning_rate": 2.0650091407678244e-05,
"loss": 0.6173,
"step": 4270
},
{
"epoch": 1.5648994515539305,
"grad_norm": 4.409485340118408,
"learning_rate": 2.0628153564899454e-05,
"loss": 0.9317,
"step": 4280
},
{
"epoch": 1.5685557586837295,
"grad_norm": 4.677966594696045,
"learning_rate": 2.060621572212066e-05,
"loss": 1.0926,
"step": 4290
},
{
"epoch": 1.5722120658135283,
"grad_norm": 8.307379722595215,
"learning_rate": 2.0584277879341866e-05,
"loss": 0.9221,
"step": 4300
},
{
"epoch": 1.5758683729433272,
"grad_norm": 2.0957555770874023,
"learning_rate": 2.0562340036563072e-05,
"loss": 1.0005,
"step": 4310
},
{
"epoch": 1.5795246800731262,
"grad_norm": 1.3396669626235962,
"learning_rate": 2.0540402193784275e-05,
"loss": 0.7351,
"step": 4320
},
{
"epoch": 1.583180987202925,
"grad_norm": 5.14472770690918,
"learning_rate": 2.0518464351005485e-05,
"loss": 1.1173,
"step": 4330
},
{
"epoch": 1.586837294332724,
"grad_norm": 2.601489782333374,
"learning_rate": 2.049652650822669e-05,
"loss": 0.6813,
"step": 4340
},
{
"epoch": 1.5904936014625228,
"grad_norm": 4.059136867523193,
"learning_rate": 2.0474588665447897e-05,
"loss": 1.0248,
"step": 4350
},
{
"epoch": 1.5941499085923216,
"grad_norm": 6.217931747436523,
"learning_rate": 2.0452650822669103e-05,
"loss": 0.7066,
"step": 4360
},
{
"epoch": 1.5978062157221207,
"grad_norm": 7.017310619354248,
"learning_rate": 2.0430712979890313e-05,
"loss": 0.6382,
"step": 4370
},
{
"epoch": 1.6014625228519197,
"grad_norm": 6.520296096801758,
"learning_rate": 2.040877513711152e-05,
"loss": 0.4315,
"step": 4380
},
{
"epoch": 1.6051188299817185,
"grad_norm": 6.086079120635986,
"learning_rate": 2.0386837294332725e-05,
"loss": 0.6829,
"step": 4390
},
{
"epoch": 1.6087751371115173,
"grad_norm": 3.4015817642211914,
"learning_rate": 2.036489945155393e-05,
"loss": 0.6142,
"step": 4400
},
{
"epoch": 1.6124314442413161,
"grad_norm": 7.188704013824463,
"learning_rate": 2.034296160877514e-05,
"loss": 0.5532,
"step": 4410
},
{
"epoch": 1.6160877513711152,
"grad_norm": 3.989145517349243,
"learning_rate": 2.0321023765996343e-05,
"loss": 1.0227,
"step": 4420
},
{
"epoch": 1.6197440585009142,
"grad_norm": 5.923662185668945,
"learning_rate": 2.029908592321755e-05,
"loss": 0.6978,
"step": 4430
},
{
"epoch": 1.623400365630713,
"grad_norm": 5.101003170013428,
"learning_rate": 2.0277148080438756e-05,
"loss": 0.9833,
"step": 4440
},
{
"epoch": 1.6270566727605118,
"grad_norm": 9.158041000366211,
"learning_rate": 2.0255210237659962e-05,
"loss": 1.0931,
"step": 4450
},
{
"epoch": 1.6307129798903108,
"grad_norm": 6.297501564025879,
"learning_rate": 2.023327239488117e-05,
"loss": 1.1048,
"step": 4460
},
{
"epoch": 1.6343692870201096,
"grad_norm": 3.9536404609680176,
"learning_rate": 2.0211334552102377e-05,
"loss": 0.7332,
"step": 4470
},
{
"epoch": 1.6380255941499087,
"grad_norm": 4.0736212730407715,
"learning_rate": 2.0189396709323584e-05,
"loss": 0.5685,
"step": 4480
},
{
"epoch": 1.6416819012797075,
"grad_norm": 11.199592590332031,
"learning_rate": 2.016745886654479e-05,
"loss": 0.8059,
"step": 4490
},
{
"epoch": 1.6453382084095063,
"grad_norm": 10.829754829406738,
"learning_rate": 2.0145521023766e-05,
"loss": 1.0358,
"step": 4500
},
{
"epoch": 1.6489945155393053,
"grad_norm": 4.670787811279297,
"learning_rate": 2.0123583180987205e-05,
"loss": 0.8369,
"step": 4510
},
{
"epoch": 1.6526508226691043,
"grad_norm": 6.225413799285889,
"learning_rate": 2.0101645338208408e-05,
"loss": 1.2236,
"step": 4520
},
{
"epoch": 1.6563071297989032,
"grad_norm": 3.398374557495117,
"learning_rate": 2.0079707495429614e-05,
"loss": 0.5667,
"step": 4530
},
{
"epoch": 1.659963436928702,
"grad_norm": 3.375204086303711,
"learning_rate": 2.0057769652650824e-05,
"loss": 0.989,
"step": 4540
},
{
"epoch": 1.6636197440585008,
"grad_norm": 4.518038749694824,
"learning_rate": 2.003583180987203e-05,
"loss": 0.7565,
"step": 4550
},
{
"epoch": 1.6672760511882998,
"grad_norm": 3.7947514057159424,
"learning_rate": 2.0013893967093236e-05,
"loss": 0.9918,
"step": 4560
},
{
"epoch": 1.6709323583180988,
"grad_norm": 2.7493553161621094,
"learning_rate": 1.9991956124314442e-05,
"loss": 0.4559,
"step": 4570
},
{
"epoch": 1.6745886654478976,
"grad_norm": 2.3222575187683105,
"learning_rate": 1.997001828153565e-05,
"loss": 0.6695,
"step": 4580
},
{
"epoch": 1.6782449725776964,
"grad_norm": 8.733063697814941,
"learning_rate": 1.9948080438756858e-05,
"loss": 0.6937,
"step": 4590
},
{
"epoch": 1.6819012797074955,
"grad_norm": 5.651478290557861,
"learning_rate": 1.9926142595978064e-05,
"loss": 0.4887,
"step": 4600
},
{
"epoch": 1.6855575868372943,
"grad_norm": 5.600511074066162,
"learning_rate": 1.990420475319927e-05,
"loss": 0.6819,
"step": 4610
},
{
"epoch": 1.6892138939670933,
"grad_norm": 5.3927903175354,
"learning_rate": 1.9882266910420476e-05,
"loss": 0.8285,
"step": 4620
},
{
"epoch": 1.6928702010968921,
"grad_norm": 4.391313076019287,
"learning_rate": 1.9860329067641682e-05,
"loss": 0.7116,
"step": 4630
},
{
"epoch": 1.696526508226691,
"grad_norm": 6.470620155334473,
"learning_rate": 1.983839122486289e-05,
"loss": 1.2811,
"step": 4640
},
{
"epoch": 1.70018281535649,
"grad_norm": 1.9842756986618042,
"learning_rate": 1.9816453382084095e-05,
"loss": 1.1558,
"step": 4650
},
{
"epoch": 1.703839122486289,
"grad_norm": 6.438689708709717,
"learning_rate": 1.97945155393053e-05,
"loss": 0.8181,
"step": 4660
},
{
"epoch": 1.7074954296160878,
"grad_norm": 5.5345845222473145,
"learning_rate": 1.977257769652651e-05,
"loss": 0.4793,
"step": 4670
},
{
"epoch": 1.7111517367458866,
"grad_norm": 6.923543930053711,
"learning_rate": 1.9750639853747717e-05,
"loss": 1.2972,
"step": 4680
},
{
"epoch": 1.7148080438756854,
"grad_norm": 7.229982376098633,
"learning_rate": 1.9728702010968923e-05,
"loss": 1.006,
"step": 4690
},
{
"epoch": 1.7184643510054844,
"grad_norm": 5.0050201416015625,
"learning_rate": 1.970676416819013e-05,
"loss": 0.7382,
"step": 4700
},
{
"epoch": 1.7221206581352835,
"grad_norm": 5.115394115447998,
"learning_rate": 1.9684826325411335e-05,
"loss": 1.0649,
"step": 4710
},
{
"epoch": 1.7257769652650823,
"grad_norm": 6.4145307540893555,
"learning_rate": 1.9662888482632544e-05,
"loss": 0.9784,
"step": 4720
},
{
"epoch": 1.729433272394881,
"grad_norm": 3.8062143325805664,
"learning_rate": 1.9640950639853747e-05,
"loss": 0.809,
"step": 4730
},
{
"epoch": 1.7330895795246801,
"grad_norm": 3.4305763244628906,
"learning_rate": 1.9619012797074953e-05,
"loss": 0.6094,
"step": 4740
},
{
"epoch": 1.736745886654479,
"grad_norm": 4.138398170471191,
"learning_rate": 1.959707495429616e-05,
"loss": 0.6374,
"step": 4750
},
{
"epoch": 1.740402193784278,
"grad_norm": 3.1539058685302734,
"learning_rate": 1.957513711151737e-05,
"loss": 0.4952,
"step": 4760
},
{
"epoch": 1.7440585009140768,
"grad_norm": 2.051999807357788,
"learning_rate": 1.9553199268738575e-05,
"loss": 0.8456,
"step": 4770
},
{
"epoch": 1.7477148080438756,
"grad_norm": 5.383764743804932,
"learning_rate": 1.953126142595978e-05,
"loss": 0.9809,
"step": 4780
},
{
"epoch": 1.7513711151736746,
"grad_norm": 10.34570026397705,
"learning_rate": 1.9509323583180987e-05,
"loss": 1.4191,
"step": 4790
},
{
"epoch": 1.7550274223034736,
"grad_norm": 7.438785552978516,
"learning_rate": 1.9487385740402194e-05,
"loss": 0.9254,
"step": 4800
},
{
"epoch": 1.7586837294332724,
"grad_norm": 5.489014148712158,
"learning_rate": 1.9465447897623403e-05,
"loss": 0.912,
"step": 4810
},
{
"epoch": 1.7623400365630713,
"grad_norm": 2.74650502204895,
"learning_rate": 1.944351005484461e-05,
"loss": 0.7544,
"step": 4820
},
{
"epoch": 1.76599634369287,
"grad_norm": 6.396740436553955,
"learning_rate": 1.9421572212065812e-05,
"loss": 0.5699,
"step": 4830
},
{
"epoch": 1.769652650822669,
"grad_norm": 4.5033745765686035,
"learning_rate": 1.9399634369287018e-05,
"loss": 0.7621,
"step": 4840
},
{
"epoch": 1.7733089579524681,
"grad_norm": 2.8868985176086426,
"learning_rate": 1.9377696526508228e-05,
"loss": 0.5894,
"step": 4850
},
{
"epoch": 1.776965265082267,
"grad_norm": 5.314028739929199,
"learning_rate": 1.9355758683729434e-05,
"loss": 0.8135,
"step": 4860
},
{
"epoch": 1.7806215722120657,
"grad_norm": 7.692873477935791,
"learning_rate": 1.933382084095064e-05,
"loss": 1.6637,
"step": 4870
},
{
"epoch": 1.7842778793418648,
"grad_norm": 6.586564064025879,
"learning_rate": 1.9311882998171846e-05,
"loss": 0.634,
"step": 4880
},
{
"epoch": 1.7879341864716636,
"grad_norm": 4.398944854736328,
"learning_rate": 1.9289945155393056e-05,
"loss": 0.9242,
"step": 4890
},
{
"epoch": 1.7915904936014626,
"grad_norm": 3.091824769973755,
"learning_rate": 1.9268007312614262e-05,
"loss": 0.5293,
"step": 4900
},
{
"epoch": 1.7952468007312614,
"grad_norm": 1.7957733869552612,
"learning_rate": 1.9246069469835468e-05,
"loss": 0.4717,
"step": 4910
},
{
"epoch": 1.7989031078610602,
"grad_norm": 8.411224365234375,
"learning_rate": 1.9224131627056674e-05,
"loss": 0.9129,
"step": 4920
},
{
"epoch": 1.8025594149908593,
"grad_norm": 6.0289788246154785,
"learning_rate": 1.9202193784277877e-05,
"loss": 0.9803,
"step": 4930
},
{
"epoch": 1.8062157221206583,
"grad_norm": 2.4739830493927,
"learning_rate": 1.9180255941499086e-05,
"loss": 0.5742,
"step": 4940
},
{
"epoch": 1.809872029250457,
"grad_norm": 5.185890197753906,
"learning_rate": 1.9158318098720292e-05,
"loss": 0.5904,
"step": 4950
},
{
"epoch": 1.813528336380256,
"grad_norm": 7.785595893859863,
"learning_rate": 1.91363802559415e-05,
"loss": 0.9302,
"step": 4960
},
{
"epoch": 1.8171846435100547,
"grad_norm": 4.2491841316223145,
"learning_rate": 1.9114442413162705e-05,
"loss": 0.6755,
"step": 4970
},
{
"epoch": 1.8208409506398537,
"grad_norm": 5.402482986450195,
"learning_rate": 1.9092504570383914e-05,
"loss": 0.7065,
"step": 4980
},
{
"epoch": 1.8244972577696528,
"grad_norm": 9.053221702575684,
"learning_rate": 1.907056672760512e-05,
"loss": 0.8879,
"step": 4990
},
{
"epoch": 1.8281535648994516,
"grad_norm": 4.956139087677002,
"learning_rate": 1.9048628884826327e-05,
"loss": 0.7763,
"step": 5000
},
{
"epoch": 1.8318098720292504,
"grad_norm": 4.047802925109863,
"learning_rate": 1.9026691042047533e-05,
"loss": 0.985,
"step": 5010
},
{
"epoch": 1.8354661791590492,
"grad_norm": 2.324805736541748,
"learning_rate": 1.9004753199268742e-05,
"loss": 0.8605,
"step": 5020
},
{
"epoch": 1.8391224862888482,
"grad_norm": 8.674615859985352,
"learning_rate": 1.8982815356489945e-05,
"loss": 0.7584,
"step": 5030
},
{
"epoch": 1.8427787934186473,
"grad_norm": 2.8716583251953125,
"learning_rate": 1.896087751371115e-05,
"loss": 0.8896,
"step": 5040
},
{
"epoch": 1.846435100548446,
"grad_norm": 4.845273494720459,
"learning_rate": 1.8938939670932357e-05,
"loss": 0.7585,
"step": 5050
},
{
"epoch": 1.8500914076782449,
"grad_norm": 1.3373600244522095,
"learning_rate": 1.8917001828153563e-05,
"loss": 0.8324,
"step": 5060
},
{
"epoch": 1.853747714808044,
"grad_norm": 3.5930116176605225,
"learning_rate": 1.8895063985374773e-05,
"loss": 0.5972,
"step": 5070
},
{
"epoch": 1.857404021937843,
"grad_norm": 2.8679511547088623,
"learning_rate": 1.887312614259598e-05,
"loss": 1.2015,
"step": 5080
},
{
"epoch": 1.8610603290676417,
"grad_norm": 5.207054615020752,
"learning_rate": 1.8851188299817185e-05,
"loss": 1.1164,
"step": 5090
},
{
"epoch": 1.8647166361974405,
"grad_norm": 4.295830249786377,
"learning_rate": 1.882925045703839e-05,
"loss": 0.9228,
"step": 5100
},
{
"epoch": 1.8683729433272394,
"grad_norm": 6.6493682861328125,
"learning_rate": 1.88073126142596e-05,
"loss": 1.2885,
"step": 5110
},
{
"epoch": 1.8720292504570384,
"grad_norm": 9.316621780395508,
"learning_rate": 1.8785374771480807e-05,
"loss": 0.9024,
"step": 5120
},
{
"epoch": 1.8756855575868374,
"grad_norm": 1.7442660331726074,
"learning_rate": 1.876343692870201e-05,
"loss": 0.6215,
"step": 5130
},
{
"epoch": 1.8793418647166362,
"grad_norm": 3.714203357696533,
"learning_rate": 1.8741499085923216e-05,
"loss": 1.0476,
"step": 5140
},
{
"epoch": 1.882998171846435,
"grad_norm": 8.656035423278809,
"learning_rate": 1.8719561243144422e-05,
"loss": 0.8761,
"step": 5150
},
{
"epoch": 1.8866544789762338,
"grad_norm": 7.139505863189697,
"learning_rate": 1.869762340036563e-05,
"loss": 0.6815,
"step": 5160
},
{
"epoch": 1.8903107861060329,
"grad_norm": 5.897740840911865,
"learning_rate": 1.8675685557586838e-05,
"loss": 0.7645,
"step": 5170
},
{
"epoch": 1.893967093235832,
"grad_norm": 6.025356292724609,
"learning_rate": 1.8653747714808044e-05,
"loss": 0.9224,
"step": 5180
},
{
"epoch": 1.8976234003656307,
"grad_norm": 3.462116003036499,
"learning_rate": 1.863180987202925e-05,
"loss": 0.6179,
"step": 5190
},
{
"epoch": 1.9012797074954295,
"grad_norm": 0.449295312166214,
"learning_rate": 1.860987202925046e-05,
"loss": 0.5513,
"step": 5200
},
{
"epoch": 1.9049360146252285,
"grad_norm": 8.190743446350098,
"learning_rate": 1.8587934186471666e-05,
"loss": 0.9577,
"step": 5210
},
{
"epoch": 1.9085923217550276,
"grad_norm": 8.000064849853516,
"learning_rate": 1.8565996343692872e-05,
"loss": 1.1829,
"step": 5220
},
{
"epoch": 1.9122486288848264,
"grad_norm": 2.7674405574798584,
"learning_rate": 1.8544058500914075e-05,
"loss": 0.9203,
"step": 5230
},
{
"epoch": 1.9159049360146252,
"grad_norm": 3.4354286193847656,
"learning_rate": 1.8522120658135284e-05,
"loss": 0.8279,
"step": 5240
},
{
"epoch": 1.919561243144424,
"grad_norm": 4.011999607086182,
"learning_rate": 1.850018281535649e-05,
"loss": 0.7985,
"step": 5250
},
{
"epoch": 1.923217550274223,
"grad_norm": 6.80394172668457,
"learning_rate": 1.8478244972577696e-05,
"loss": 0.7541,
"step": 5260
},
{
"epoch": 1.926873857404022,
"grad_norm": 9.098631858825684,
"learning_rate": 1.8456307129798903e-05,
"loss": 0.7121,
"step": 5270
},
{
"epoch": 1.9305301645338209,
"grad_norm": 8.139768600463867,
"learning_rate": 1.843436928702011e-05,
"loss": 1.0927,
"step": 5280
},
{
"epoch": 1.9341864716636197,
"grad_norm": 7.283916473388672,
"learning_rate": 1.8412431444241318e-05,
"loss": 0.9501,
"step": 5290
},
{
"epoch": 1.9378427787934185,
"grad_norm": 5.627073764801025,
"learning_rate": 1.8390493601462524e-05,
"loss": 1.2397,
"step": 5300
},
{
"epoch": 1.9414990859232175,
"grad_norm": 4.708215713500977,
"learning_rate": 1.836855575868373e-05,
"loss": 0.8767,
"step": 5310
},
{
"epoch": 1.9451553930530165,
"grad_norm": 5.6944756507873535,
"learning_rate": 1.8346617915904937e-05,
"loss": 0.7765,
"step": 5320
},
{
"epoch": 1.9488117001828154,
"grad_norm": 2.780611038208008,
"learning_rate": 1.8324680073126143e-05,
"loss": 0.9307,
"step": 5330
},
{
"epoch": 1.9524680073126142,
"grad_norm": 6.318012237548828,
"learning_rate": 1.830274223034735e-05,
"loss": 0.9262,
"step": 5340
},
{
"epoch": 1.9561243144424132,
"grad_norm": 3.8964459896087646,
"learning_rate": 1.8280804387568555e-05,
"loss": 0.6519,
"step": 5350
},
{
"epoch": 1.9597806215722122,
"grad_norm": 3.204008102416992,
"learning_rate": 1.825886654478976e-05,
"loss": 1.0352,
"step": 5360
},
{
"epoch": 1.963436928702011,
"grad_norm": 6.150453567504883,
"learning_rate": 1.823692870201097e-05,
"loss": 0.8672,
"step": 5370
},
{
"epoch": 1.9670932358318098,
"grad_norm": 3.9006292819976807,
"learning_rate": 1.8214990859232177e-05,
"loss": 0.6804,
"step": 5380
},
{
"epoch": 1.9707495429616086,
"grad_norm": 3.0023293495178223,
"learning_rate": 1.8193053016453383e-05,
"loss": 0.7373,
"step": 5390
},
{
"epoch": 1.9744058500914077,
"grad_norm": 7.111054420471191,
"learning_rate": 1.817111517367459e-05,
"loss": 0.9125,
"step": 5400
},
{
"epoch": 1.9780621572212067,
"grad_norm": 7.576889991760254,
"learning_rate": 1.8149177330895795e-05,
"loss": 0.5759,
"step": 5410
},
{
"epoch": 1.9817184643510055,
"grad_norm": 9.145369529724121,
"learning_rate": 1.8127239488117005e-05,
"loss": 0.9557,
"step": 5420
},
{
"epoch": 1.9853747714808043,
"grad_norm": 8.636487007141113,
"learning_rate": 1.8105301645338208e-05,
"loss": 0.8976,
"step": 5430
},
{
"epoch": 1.9890310786106031,
"grad_norm": 4.460054874420166,
"learning_rate": 1.8083363802559414e-05,
"loss": 0.7172,
"step": 5440
},
{
"epoch": 1.9926873857404022,
"grad_norm": 8.192395210266113,
"learning_rate": 1.806142595978062e-05,
"loss": 0.856,
"step": 5450
},
{
"epoch": 1.9963436928702012,
"grad_norm": 9.720686912536621,
"learning_rate": 1.803948811700183e-05,
"loss": 0.9177,
"step": 5460
},
{
"epoch": 2.0,
"grad_norm": 7.616659641265869,
"learning_rate": 1.8017550274223036e-05,
"loss": 0.7784,
"step": 5470
},
{
"epoch": 2.003656307129799,
"grad_norm": 5.925053596496582,
"learning_rate": 1.799561243144424e-05,
"loss": 0.6278,
"step": 5480
},
{
"epoch": 2.0073126142595976,
"grad_norm": 5.279562950134277,
"learning_rate": 1.7973674588665448e-05,
"loss": 0.581,
"step": 5490
},
{
"epoch": 2.010968921389397,
"grad_norm": 5.457578182220459,
"learning_rate": 1.7951736745886657e-05,
"loss": 0.9397,
"step": 5500
},
{
"epoch": 2.0146252285191957,
"grad_norm": 2.3031833171844482,
"learning_rate": 1.7929798903107863e-05,
"loss": 0.734,
"step": 5510
},
{
"epoch": 2.0182815356489945,
"grad_norm": 3.8108150959014893,
"learning_rate": 1.790786106032907e-05,
"loss": 0.716,
"step": 5520
},
{
"epoch": 2.0219378427787933,
"grad_norm": 6.341092586517334,
"learning_rate": 1.7885923217550272e-05,
"loss": 0.9646,
"step": 5530
},
{
"epoch": 2.025594149908592,
"grad_norm": 3.282466411590576,
"learning_rate": 1.786398537477148e-05,
"loss": 0.8874,
"step": 5540
},
{
"epoch": 2.0292504570383914,
"grad_norm": 1.760282039642334,
"learning_rate": 1.7842047531992688e-05,
"loss": 0.8131,
"step": 5550
},
{
"epoch": 2.03290676416819,
"grad_norm": 5.197391510009766,
"learning_rate": 1.7820109689213894e-05,
"loss": 0.4766,
"step": 5560
},
{
"epoch": 2.036563071297989,
"grad_norm": 6.330410480499268,
"learning_rate": 1.77981718464351e-05,
"loss": 0.8051,
"step": 5570
},
{
"epoch": 2.0402193784277878,
"grad_norm": 2.116508722305298,
"learning_rate": 1.7776234003656306e-05,
"loss": 0.5772,
"step": 5580
},
{
"epoch": 2.043875685557587,
"grad_norm": 7.164584636688232,
"learning_rate": 1.7754296160877516e-05,
"loss": 1.1541,
"step": 5590
},
{
"epoch": 2.047531992687386,
"grad_norm": 3.2902145385742188,
"learning_rate": 1.7732358318098722e-05,
"loss": 0.8753,
"step": 5600
},
{
"epoch": 2.0511882998171846,
"grad_norm": 4.900457859039307,
"learning_rate": 1.7710420475319928e-05,
"loss": 0.721,
"step": 5610
},
{
"epoch": 2.0548446069469835,
"grad_norm": 1.8482491970062256,
"learning_rate": 1.7688482632541134e-05,
"loss": 0.5912,
"step": 5620
},
{
"epoch": 2.0585009140767823,
"grad_norm": 6.206057548522949,
"learning_rate": 1.766654478976234e-05,
"loss": 0.7698,
"step": 5630
},
{
"epoch": 2.0621572212065815,
"grad_norm": 2.8507750034332275,
"learning_rate": 1.7644606946983547e-05,
"loss": 0.585,
"step": 5640
},
{
"epoch": 2.0658135283363803,
"grad_norm": 1.5750012397766113,
"learning_rate": 1.7622669104204753e-05,
"loss": 0.788,
"step": 5650
},
{
"epoch": 2.069469835466179,
"grad_norm": 0.9211186170578003,
"learning_rate": 1.760073126142596e-05,
"loss": 0.7428,
"step": 5660
},
{
"epoch": 2.073126142595978,
"grad_norm": 6.672236442565918,
"learning_rate": 1.7578793418647165e-05,
"loss": 1.1684,
"step": 5670
},
{
"epoch": 2.076782449725777,
"grad_norm": 5.079084396362305,
"learning_rate": 1.7556855575868375e-05,
"loss": 0.8299,
"step": 5680
},
{
"epoch": 2.080438756855576,
"grad_norm": 2.206005573272705,
"learning_rate": 1.753491773308958e-05,
"loss": 0.7393,
"step": 5690
},
{
"epoch": 2.084095063985375,
"grad_norm": 5.880030155181885,
"learning_rate": 1.7512979890310787e-05,
"loss": 0.7371,
"step": 5700
},
{
"epoch": 2.0877513711151736,
"grad_norm": 2.5095629692077637,
"learning_rate": 1.7491042047531993e-05,
"loss": 0.7439,
"step": 5710
},
{
"epoch": 2.0914076782449724,
"grad_norm": 3.8941352367401123,
"learning_rate": 1.7469104204753203e-05,
"loss": 0.7426,
"step": 5720
},
{
"epoch": 2.0950639853747717,
"grad_norm": 2.9596612453460693,
"learning_rate": 1.7447166361974405e-05,
"loss": 1.0313,
"step": 5730
},
{
"epoch": 2.0987202925045705,
"grad_norm": 5.640470027923584,
"learning_rate": 1.742522851919561e-05,
"loss": 0.8318,
"step": 5740
},
{
"epoch": 2.1023765996343693,
"grad_norm": 9.360175132751465,
"learning_rate": 1.7403290676416818e-05,
"loss": 0.9472,
"step": 5750
},
{
"epoch": 2.106032906764168,
"grad_norm": 3.729229688644409,
"learning_rate": 1.7381352833638024e-05,
"loss": 0.9582,
"step": 5760
},
{
"epoch": 2.109689213893967,
"grad_norm": 4.457205295562744,
"learning_rate": 1.7359414990859233e-05,
"loss": 0.848,
"step": 5770
},
{
"epoch": 2.113345521023766,
"grad_norm": 2.072932243347168,
"learning_rate": 1.733747714808044e-05,
"loss": 1.1283,
"step": 5780
},
{
"epoch": 2.117001828153565,
"grad_norm": 2.81571888923645,
"learning_rate": 1.7315539305301646e-05,
"loss": 0.8487,
"step": 5790
},
{
"epoch": 2.1206581352833638,
"grad_norm": 4.277017593383789,
"learning_rate": 1.7293601462522852e-05,
"loss": 0.7518,
"step": 5800
},
{
"epoch": 2.1243144424131626,
"grad_norm": 4.090396404266357,
"learning_rate": 1.727166361974406e-05,
"loss": 0.8979,
"step": 5810
},
{
"epoch": 2.1279707495429614,
"grad_norm": 1.6413131952285767,
"learning_rate": 1.7249725776965267e-05,
"loss": 0.5561,
"step": 5820
},
{
"epoch": 2.1316270566727606,
"grad_norm": 1.0182098150253296,
"learning_rate": 1.7227787934186474e-05,
"loss": 0.6022,
"step": 5830
},
{
"epoch": 2.1352833638025595,
"grad_norm": 1.9393812417984009,
"learning_rate": 1.7205850091407676e-05,
"loss": 0.916,
"step": 5840
},
{
"epoch": 2.1389396709323583,
"grad_norm": 1.6483741998672485,
"learning_rate": 1.7183912248628886e-05,
"loss": 0.8657,
"step": 5850
},
{
"epoch": 2.142595978062157,
"grad_norm": 5.0950927734375,
"learning_rate": 1.7161974405850092e-05,
"loss": 0.769,
"step": 5860
},
{
"epoch": 2.1462522851919563,
"grad_norm": 5.417265892028809,
"learning_rate": 1.7140036563071298e-05,
"loss": 0.8892,
"step": 5870
},
{
"epoch": 2.149908592321755,
"grad_norm": 2.497882604598999,
"learning_rate": 1.7118098720292504e-05,
"loss": 0.6083,
"step": 5880
},
{
"epoch": 2.153564899451554,
"grad_norm": 2.365013599395752,
"learning_rate": 1.709616087751371e-05,
"loss": 0.8289,
"step": 5890
},
{
"epoch": 2.1572212065813527,
"grad_norm": 4.738333225250244,
"learning_rate": 1.707422303473492e-05,
"loss": 0.8626,
"step": 5900
},
{
"epoch": 2.1608775137111516,
"grad_norm": 3.8534250259399414,
"learning_rate": 1.7052285191956126e-05,
"loss": 0.7599,
"step": 5910
},
{
"epoch": 2.164533820840951,
"grad_norm": 4.418381214141846,
"learning_rate": 1.7030347349177332e-05,
"loss": 0.6414,
"step": 5920
},
{
"epoch": 2.1681901279707496,
"grad_norm": 3.9305009841918945,
"learning_rate": 1.700840950639854e-05,
"loss": 0.8187,
"step": 5930
},
{
"epoch": 2.1718464351005484,
"grad_norm": 3.8605735301971436,
"learning_rate": 1.6986471663619744e-05,
"loss": 0.7981,
"step": 5940
},
{
"epoch": 2.1755027422303472,
"grad_norm": 0.5784508585929871,
"learning_rate": 1.696453382084095e-05,
"loss": 0.5747,
"step": 5950
},
{
"epoch": 2.1791590493601465,
"grad_norm": 3.44700288772583,
"learning_rate": 1.6942595978062157e-05,
"loss": 0.9207,
"step": 5960
},
{
"epoch": 2.1828153564899453,
"grad_norm": 2.130711317062378,
"learning_rate": 1.6920658135283363e-05,
"loss": 0.5485,
"step": 5970
},
{
"epoch": 2.186471663619744,
"grad_norm": 3.466505289077759,
"learning_rate": 1.689872029250457e-05,
"loss": 0.8698,
"step": 5980
},
{
"epoch": 2.190127970749543,
"grad_norm": 2.2737669944763184,
"learning_rate": 1.687678244972578e-05,
"loss": 0.9317,
"step": 5990
},
{
"epoch": 2.1937842778793417,
"grad_norm": 6.341795444488525,
"learning_rate": 1.6854844606946985e-05,
"loss": 0.8099,
"step": 6000
},
{
"epoch": 2.197440585009141,
"grad_norm": 7.006868839263916,
"learning_rate": 1.683290676416819e-05,
"loss": 0.9554,
"step": 6010
},
{
"epoch": 2.2010968921389398,
"grad_norm": 3.7944741249084473,
"learning_rate": 1.6810968921389397e-05,
"loss": 0.7068,
"step": 6020
},
{
"epoch": 2.2047531992687386,
"grad_norm": 3.8332672119140625,
"learning_rate": 1.6789031078610607e-05,
"loss": 0.7278,
"step": 6030
},
{
"epoch": 2.2084095063985374,
"grad_norm": 6.753068447113037,
"learning_rate": 1.676709323583181e-05,
"loss": 0.8768,
"step": 6040
},
{
"epoch": 2.212065813528336,
"grad_norm": 6.275936603546143,
"learning_rate": 1.6745155393053015e-05,
"loss": 0.743,
"step": 6050
},
{
"epoch": 2.2157221206581355,
"grad_norm": 0.639437198638916,
"learning_rate": 1.672321755027422e-05,
"loss": 0.7593,
"step": 6060
},
{
"epoch": 2.2193784277879343,
"grad_norm": 2.318837881088257,
"learning_rate": 1.670127970749543e-05,
"loss": 0.6968,
"step": 6070
},
{
"epoch": 2.223034734917733,
"grad_norm": 4.160284996032715,
"learning_rate": 1.6679341864716637e-05,
"loss": 0.764,
"step": 6080
},
{
"epoch": 2.226691042047532,
"grad_norm": 4.3529744148254395,
"learning_rate": 1.6657404021937843e-05,
"loss": 0.7927,
"step": 6090
},
{
"epoch": 2.2303473491773307,
"grad_norm": 6.292082786560059,
"learning_rate": 1.663546617915905e-05,
"loss": 0.8195,
"step": 6100
},
{
"epoch": 2.23400365630713,
"grad_norm": 1.47853684425354,
"learning_rate": 1.6613528336380256e-05,
"loss": 0.4685,
"step": 6110
},
{
"epoch": 2.2376599634369287,
"grad_norm": 1.9506633281707764,
"learning_rate": 1.6593784277879343e-05,
"loss": 0.8133,
"step": 6120
},
{
"epoch": 2.2413162705667276,
"grad_norm": 3.7667181491851807,
"learning_rate": 1.657184643510055e-05,
"loss": 0.7294,
"step": 6130
},
{
"epoch": 2.2449725776965264,
"grad_norm": 3.3465397357940674,
"learning_rate": 1.6549908592321755e-05,
"loss": 0.6542,
"step": 6140
},
{
"epoch": 2.2486288848263256,
"grad_norm": 6.2452545166015625,
"learning_rate": 1.652797074954296e-05,
"loss": 1.0769,
"step": 6150
},
{
"epoch": 2.2522851919561244,
"grad_norm": 3.649399518966675,
"learning_rate": 1.6506032906764167e-05,
"loss": 0.6094,
"step": 6160
},
{
"epoch": 2.2559414990859232,
"grad_norm": 1.8042731285095215,
"learning_rate": 1.6484095063985374e-05,
"loss": 0.9194,
"step": 6170
},
{
"epoch": 2.259597806215722,
"grad_norm": 6.0087714195251465,
"learning_rate": 1.6462157221206583e-05,
"loss": 0.7116,
"step": 6180
},
{
"epoch": 2.263254113345521,
"grad_norm": 2.632741928100586,
"learning_rate": 1.644021937842779e-05,
"loss": 0.713,
"step": 6190
},
{
"epoch": 2.26691042047532,
"grad_norm": 1.5080722570419312,
"learning_rate": 1.6418281535648995e-05,
"loss": 0.7972,
"step": 6200
},
{
"epoch": 2.270566727605119,
"grad_norm": 5.658291816711426,
"learning_rate": 1.63963436928702e-05,
"loss": 0.8587,
"step": 6210
},
{
"epoch": 2.2742230347349177,
"grad_norm": 2.7925331592559814,
"learning_rate": 1.637440585009141e-05,
"loss": 0.859,
"step": 6220
},
{
"epoch": 2.2778793418647165,
"grad_norm": 7.432958126068115,
"learning_rate": 1.6352468007312614e-05,
"loss": 1.1442,
"step": 6230
},
{
"epoch": 2.2815356489945158,
"grad_norm": 3.1976866722106934,
"learning_rate": 1.633053016453382e-05,
"loss": 0.9542,
"step": 6240
},
{
"epoch": 2.2851919561243146,
"grad_norm": 6.586294174194336,
"learning_rate": 1.6308592321755026e-05,
"loss": 0.9548,
"step": 6250
},
{
"epoch": 2.2888482632541134,
"grad_norm": 4.858059406280518,
"learning_rate": 1.6286654478976232e-05,
"loss": 0.8362,
"step": 6260
},
{
"epoch": 2.292504570383912,
"grad_norm": 0.6416640281677246,
"learning_rate": 1.6264716636197442e-05,
"loss": 0.866,
"step": 6270
},
{
"epoch": 2.296160877513711,
"grad_norm": 2.6693904399871826,
"learning_rate": 1.6242778793418648e-05,
"loss": 0.9518,
"step": 6280
},
{
"epoch": 2.2998171846435103,
"grad_norm": 3.4559848308563232,
"learning_rate": 1.6220840950639854e-05,
"loss": 0.9771,
"step": 6290
},
{
"epoch": 2.303473491773309,
"grad_norm": 4.6828460693359375,
"learning_rate": 1.619890310786106e-05,
"loss": 0.6363,
"step": 6300
},
{
"epoch": 2.307129798903108,
"grad_norm": 7.3838911056518555,
"learning_rate": 1.617696526508227e-05,
"loss": 1.1616,
"step": 6310
},
{
"epoch": 2.3107861060329067,
"grad_norm": 2.1157217025756836,
"learning_rate": 1.6155027422303476e-05,
"loss": 0.6204,
"step": 6320
},
{
"epoch": 2.3144424131627055,
"grad_norm": 5.136549949645996,
"learning_rate": 1.613308957952468e-05,
"loss": 0.9136,
"step": 6330
},
{
"epoch": 2.3180987202925047,
"grad_norm": 4.352057933807373,
"learning_rate": 1.6111151736745885e-05,
"loss": 1.0355,
"step": 6340
},
{
"epoch": 2.3217550274223036,
"grad_norm": 6.010753154754639,
"learning_rate": 1.6089213893967094e-05,
"loss": 0.8964,
"step": 6350
},
{
"epoch": 2.3254113345521024,
"grad_norm": 4.205333232879639,
"learning_rate": 1.60672760511883e-05,
"loss": 0.5251,
"step": 6360
},
{
"epoch": 2.329067641681901,
"grad_norm": 1.6704707145690918,
"learning_rate": 1.6045338208409507e-05,
"loss": 0.8491,
"step": 6370
},
{
"epoch": 2.3327239488117,
"grad_norm": 6.694083213806152,
"learning_rate": 1.6023400365630713e-05,
"loss": 0.6679,
"step": 6380
},
{
"epoch": 2.3363802559414992,
"grad_norm": 3.342144727706909,
"learning_rate": 1.600146252285192e-05,
"loss": 0.7972,
"step": 6390
},
{
"epoch": 2.340036563071298,
"grad_norm": 1.7112003564834595,
"learning_rate": 1.597952468007313e-05,
"loss": 0.6471,
"step": 6400
},
{
"epoch": 2.343692870201097,
"grad_norm": 5.751948833465576,
"learning_rate": 1.5957586837294335e-05,
"loss": 0.7035,
"step": 6410
},
{
"epoch": 2.3473491773308957,
"grad_norm": 5.628826141357422,
"learning_rate": 1.593564899451554e-05,
"loss": 0.9389,
"step": 6420
},
{
"epoch": 2.3510054844606945,
"grad_norm": 2.342500686645508,
"learning_rate": 1.5913711151736743e-05,
"loss": 0.8235,
"step": 6430
},
{
"epoch": 2.3546617915904937,
"grad_norm": 6.325570106506348,
"learning_rate": 1.5891773308957953e-05,
"loss": 0.7512,
"step": 6440
},
{
"epoch": 2.3583180987202925,
"grad_norm": 2.6905734539031982,
"learning_rate": 1.586983546617916e-05,
"loss": 0.7487,
"step": 6450
},
{
"epoch": 2.3619744058500913,
"grad_norm": 6.612014293670654,
"learning_rate": 1.5847897623400365e-05,
"loss": 0.8339,
"step": 6460
},
{
"epoch": 2.36563071297989,
"grad_norm": 5.0433526039123535,
"learning_rate": 1.582595978062157e-05,
"loss": 0.7244,
"step": 6470
},
{
"epoch": 2.3692870201096894,
"grad_norm": 7.0606536865234375,
"learning_rate": 1.580402193784278e-05,
"loss": 0.7509,
"step": 6480
},
{
"epoch": 2.372943327239488,
"grad_norm": 3.0852813720703125,
"learning_rate": 1.5782084095063987e-05,
"loss": 0.9728,
"step": 6490
},
{
"epoch": 2.376599634369287,
"grad_norm": 1.0151329040527344,
"learning_rate": 1.5760146252285193e-05,
"loss": 0.4665,
"step": 6500
},
{
"epoch": 2.380255941499086,
"grad_norm": 4.328806400299072,
"learning_rate": 1.57382084095064e-05,
"loss": 1.0313,
"step": 6510
},
{
"epoch": 2.383912248628885,
"grad_norm": 3.0017240047454834,
"learning_rate": 1.5716270566727605e-05,
"loss": 0.8768,
"step": 6520
},
{
"epoch": 2.387568555758684,
"grad_norm": 5.693215370178223,
"learning_rate": 1.569433272394881e-05,
"loss": 1.0785,
"step": 6530
},
{
"epoch": 2.3912248628884827,
"grad_norm": 1.9527254104614258,
"learning_rate": 1.5672394881170018e-05,
"loss": 0.9748,
"step": 6540
},
{
"epoch": 2.3948811700182815,
"grad_norm": 7.030393123626709,
"learning_rate": 1.5650457038391224e-05,
"loss": 0.9713,
"step": 6550
},
{
"epoch": 2.3985374771480803,
"grad_norm": 4.398252487182617,
"learning_rate": 1.562851919561243e-05,
"loss": 0.8411,
"step": 6560
},
{
"epoch": 2.4021937842778796,
"grad_norm": 4.480136394500732,
"learning_rate": 1.560658135283364e-05,
"loss": 0.7707,
"step": 6570
},
{
"epoch": 2.4058500914076784,
"grad_norm": 1.4185088872909546,
"learning_rate": 1.5584643510054846e-05,
"loss": 0.6503,
"step": 6580
},
{
"epoch": 2.409506398537477,
"grad_norm": 2.1291399002075195,
"learning_rate": 1.5562705667276052e-05,
"loss": 0.865,
"step": 6590
},
{
"epoch": 2.413162705667276,
"grad_norm": 1.884865641593933,
"learning_rate": 1.5540767824497258e-05,
"loss": 0.846,
"step": 6600
},
{
"epoch": 2.416819012797075,
"grad_norm": 5.985278129577637,
"learning_rate": 1.5518829981718464e-05,
"loss": 1.0771,
"step": 6610
},
{
"epoch": 2.420475319926874,
"grad_norm": 2.479788064956665,
"learning_rate": 1.5496892138939674e-05,
"loss": 0.5075,
"step": 6620
},
{
"epoch": 2.424131627056673,
"grad_norm": 5.608107089996338,
"learning_rate": 1.5474954296160876e-05,
"loss": 0.6899,
"step": 6630
},
{
"epoch": 2.4277879341864717,
"grad_norm": 3.776259660720825,
"learning_rate": 1.5453016453382083e-05,
"loss": 1.0147,
"step": 6640
},
{
"epoch": 2.4314442413162705,
"grad_norm": 3.7614502906799316,
"learning_rate": 1.543107861060329e-05,
"loss": 0.8459,
"step": 6650
},
{
"epoch": 2.4351005484460693,
"grad_norm": 2.099912405014038,
"learning_rate": 1.5409140767824498e-05,
"loss": 1.1646,
"step": 6660
},
{
"epoch": 2.4387568555758685,
"grad_norm": 0.44922786951065063,
"learning_rate": 1.5387202925045704e-05,
"loss": 0.8236,
"step": 6670
},
{
"epoch": 2.4424131627056673,
"grad_norm": 5.04080867767334,
"learning_rate": 1.536526508226691e-05,
"loss": 1.0443,
"step": 6680
},
{
"epoch": 2.446069469835466,
"grad_norm": 6.819587707519531,
"learning_rate": 1.5343327239488117e-05,
"loss": 1.0736,
"step": 6690
},
{
"epoch": 2.449725776965265,
"grad_norm": 3.2166435718536377,
"learning_rate": 1.5321389396709326e-05,
"loss": 0.6082,
"step": 6700
},
{
"epoch": 2.4533820840950638,
"grad_norm": 5.284506320953369,
"learning_rate": 1.5299451553930532e-05,
"loss": 0.874,
"step": 6710
},
{
"epoch": 2.457038391224863,
"grad_norm": 4.369775295257568,
"learning_rate": 1.527751371115174e-05,
"loss": 0.7909,
"step": 6720
},
{
"epoch": 2.460694698354662,
"grad_norm": 3.289560079574585,
"learning_rate": 1.5255575868372943e-05,
"loss": 0.5602,
"step": 6730
},
{
"epoch": 2.4643510054844606,
"grad_norm": 1.0491223335266113,
"learning_rate": 1.5233638025594149e-05,
"loss": 0.5654,
"step": 6740
},
{
"epoch": 2.4680073126142594,
"grad_norm": 6.228038311004639,
"learning_rate": 1.5211700182815359e-05,
"loss": 0.7385,
"step": 6750
},
{
"epoch": 2.4716636197440587,
"grad_norm": 1.8368405103683472,
"learning_rate": 1.5189762340036563e-05,
"loss": 0.7565,
"step": 6760
},
{
"epoch": 2.4753199268738575,
"grad_norm": 4.739897727966309,
"learning_rate": 1.516782449725777e-05,
"loss": 0.791,
"step": 6770
},
{
"epoch": 2.4789762340036563,
"grad_norm": 4.953397274017334,
"learning_rate": 1.5145886654478975e-05,
"loss": 0.7508,
"step": 6780
},
{
"epoch": 2.482632541133455,
"grad_norm": 2.6047234535217285,
"learning_rate": 1.5123948811700185e-05,
"loss": 0.7291,
"step": 6790
},
{
"epoch": 2.4862888482632544,
"grad_norm": 6.410040378570557,
"learning_rate": 1.5102010968921391e-05,
"loss": 0.7964,
"step": 6800
},
{
"epoch": 2.489945155393053,
"grad_norm": 3.258415699005127,
"learning_rate": 1.5080073126142595e-05,
"loss": 0.5596,
"step": 6810
},
{
"epoch": 2.493601462522852,
"grad_norm": 3.6299631595611572,
"learning_rate": 1.5058135283363802e-05,
"loss": 0.6576,
"step": 6820
},
{
"epoch": 2.497257769652651,
"grad_norm": 3.453648090362549,
"learning_rate": 1.5036197440585011e-05,
"loss": 0.6678,
"step": 6830
},
{
"epoch": 2.5009140767824496,
"grad_norm": 1.2416099309921265,
"learning_rate": 1.5014259597806217e-05,
"loss": 0.4999,
"step": 6840
},
{
"epoch": 2.504570383912249,
"grad_norm": 6.100232124328613,
"learning_rate": 1.4992321755027423e-05,
"loss": 0.6388,
"step": 6850
},
{
"epoch": 2.5082266910420477,
"grad_norm": 5.142564296722412,
"learning_rate": 1.4970383912248628e-05,
"loss": 0.6444,
"step": 6860
},
{
"epoch": 2.5118829981718465,
"grad_norm": 4.675838947296143,
"learning_rate": 1.4948446069469836e-05,
"loss": 0.7841,
"step": 6870
},
{
"epoch": 2.5155393053016453,
"grad_norm": 2.114088296890259,
"learning_rate": 1.4926508226691042e-05,
"loss": 0.9303,
"step": 6880
},
{
"epoch": 2.519195612431444,
"grad_norm": 2.986456871032715,
"learning_rate": 1.490457038391225e-05,
"loss": 0.7067,
"step": 6890
},
{
"epoch": 2.5228519195612433,
"grad_norm": 4.167447090148926,
"learning_rate": 1.4882632541133456e-05,
"loss": 0.9053,
"step": 6900
},
{
"epoch": 2.526508226691042,
"grad_norm": 4.037667751312256,
"learning_rate": 1.4860694698354662e-05,
"loss": 0.6749,
"step": 6910
},
{
"epoch": 2.530164533820841,
"grad_norm": 3.1031131744384766,
"learning_rate": 1.4838756855575868e-05,
"loss": 0.978,
"step": 6920
},
{
"epoch": 2.5338208409506398,
"grad_norm": 8.000749588012695,
"learning_rate": 1.4816819012797076e-05,
"loss": 0.9373,
"step": 6930
},
{
"epoch": 2.5374771480804386,
"grad_norm": 5.090115070343018,
"learning_rate": 1.4794881170018282e-05,
"loss": 0.6007,
"step": 6940
},
{
"epoch": 2.541133455210238,
"grad_norm": 4.267579555511475,
"learning_rate": 1.477294332723949e-05,
"loss": 0.6713,
"step": 6950
},
{
"epoch": 2.5447897623400366,
"grad_norm": 6.383331775665283,
"learning_rate": 1.4751005484460694e-05,
"loss": 0.8684,
"step": 6960
},
{
"epoch": 2.5484460694698354,
"grad_norm": 5.479264736175537,
"learning_rate": 1.47290676416819e-05,
"loss": 1.0091,
"step": 6970
},
{
"epoch": 2.5521023765996342,
"grad_norm": 1.4539798498153687,
"learning_rate": 1.4707129798903108e-05,
"loss": 1.2803,
"step": 6980
},
{
"epoch": 2.555758683729433,
"grad_norm": 4.5096755027771,
"learning_rate": 1.4685191956124314e-05,
"loss": 1.1298,
"step": 6990
},
{
"epoch": 2.5594149908592323,
"grad_norm": 2.711442232131958,
"learning_rate": 1.4663254113345522e-05,
"loss": 0.8488,
"step": 7000
},
{
"epoch": 2.563071297989031,
"grad_norm": 5.5778069496154785,
"learning_rate": 1.4641316270566727e-05,
"loss": 0.8496,
"step": 7010
},
{
"epoch": 2.56672760511883,
"grad_norm": 6.614429473876953,
"learning_rate": 1.4619378427787935e-05,
"loss": 0.897,
"step": 7020
},
{
"epoch": 2.5703839122486287,
"grad_norm": 4.096016883850098,
"learning_rate": 1.459744058500914e-05,
"loss": 0.9322,
"step": 7030
},
{
"epoch": 2.5740402193784275,
"grad_norm": 3.7577602863311768,
"learning_rate": 1.4575502742230349e-05,
"loss": 0.8438,
"step": 7040
},
{
"epoch": 2.577696526508227,
"grad_norm": 6.623696327209473,
"learning_rate": 1.4553564899451555e-05,
"loss": 0.8535,
"step": 7050
},
{
"epoch": 2.5813528336380256,
"grad_norm": 4.914971828460693,
"learning_rate": 1.453162705667276e-05,
"loss": 0.9281,
"step": 7060
},
{
"epoch": 2.5850091407678244,
"grad_norm": 3.639310359954834,
"learning_rate": 1.4509689213893967e-05,
"loss": 0.5874,
"step": 7070
},
{
"epoch": 2.5886654478976237,
"grad_norm": 4.59980583190918,
"learning_rate": 1.4487751371115173e-05,
"loss": 0.669,
"step": 7080
},
{
"epoch": 2.5923217550274225,
"grad_norm": 3.802577018737793,
"learning_rate": 1.4465813528336381e-05,
"loss": 0.7863,
"step": 7090
},
{
"epoch": 2.5959780621572213,
"grad_norm": 5.985960960388184,
"learning_rate": 1.4443875685557587e-05,
"loss": 1.1215,
"step": 7100
},
{
"epoch": 2.59963436928702,
"grad_norm": 7.36239767074585,
"learning_rate": 1.4421937842778793e-05,
"loss": 0.9102,
"step": 7110
},
{
"epoch": 2.603290676416819,
"grad_norm": 4.171439170837402,
"learning_rate": 1.44e-05,
"loss": 0.5814,
"step": 7120
},
{
"epoch": 2.606946983546618,
"grad_norm": 3.7119038105010986,
"learning_rate": 1.4378062157221207e-05,
"loss": 1.1138,
"step": 7130
},
{
"epoch": 2.610603290676417,
"grad_norm": 0.3623199164867401,
"learning_rate": 1.4356124314442413e-05,
"loss": 0.7191,
"step": 7140
},
{
"epoch": 2.6142595978062158,
"grad_norm": 7.952626705169678,
"learning_rate": 1.4334186471663621e-05,
"loss": 1.008,
"step": 7150
},
{
"epoch": 2.6179159049360146,
"grad_norm": 4.192795753479004,
"learning_rate": 1.4312248628884826e-05,
"loss": 1.0029,
"step": 7160
},
{
"epoch": 2.6215722120658134,
"grad_norm": 1.6941229104995728,
"learning_rate": 1.4290310786106033e-05,
"loss": 0.5333,
"step": 7170
},
{
"epoch": 2.6252285191956126,
"grad_norm": 4.540876865386963,
"learning_rate": 1.426837294332724e-05,
"loss": 0.6368,
"step": 7180
},
{
"epoch": 2.6288848263254114,
"grad_norm": 3.824742078781128,
"learning_rate": 1.4246435100548447e-05,
"loss": 0.8578,
"step": 7190
},
{
"epoch": 2.6325411334552102,
"grad_norm": 6.03521203994751,
"learning_rate": 1.4224497257769654e-05,
"loss": 0.9158,
"step": 7200
},
{
"epoch": 2.636197440585009,
"grad_norm": 1.0500041246414185,
"learning_rate": 1.420255941499086e-05,
"loss": 0.7151,
"step": 7210
},
{
"epoch": 2.639853747714808,
"grad_norm": 3.9835376739501953,
"learning_rate": 1.4180621572212066e-05,
"loss": 0.9856,
"step": 7220
},
{
"epoch": 2.643510054844607,
"grad_norm": 4.260631084442139,
"learning_rate": 1.4158683729433272e-05,
"loss": 0.7225,
"step": 7230
},
{
"epoch": 2.647166361974406,
"grad_norm": 4.900208473205566,
"learning_rate": 1.413674588665448e-05,
"loss": 0.7778,
"step": 7240
},
{
"epoch": 2.6508226691042047,
"grad_norm": 2.9643290042877197,
"learning_rate": 1.4114808043875686e-05,
"loss": 0.6312,
"step": 7250
},
{
"epoch": 2.6544789762340035,
"grad_norm": 2.850414752960205,
"learning_rate": 1.4092870201096894e-05,
"loss": 0.8683,
"step": 7260
},
{
"epoch": 2.6581352833638023,
"grad_norm": 5.803402423858643,
"learning_rate": 1.4070932358318098e-05,
"loss": 0.8922,
"step": 7270
},
{
"epoch": 2.6617915904936016,
"grad_norm": 4.494935512542725,
"learning_rate": 1.4048994515539306e-05,
"loss": 0.6408,
"step": 7280
},
{
"epoch": 2.6654478976234004,
"grad_norm": 2.5925052165985107,
"learning_rate": 1.4027056672760512e-05,
"loss": 0.864,
"step": 7290
},
{
"epoch": 2.669104204753199,
"grad_norm": 3.3631858825683594,
"learning_rate": 1.400511882998172e-05,
"loss": 0.603,
"step": 7300
},
{
"epoch": 2.672760511882998,
"grad_norm": 3.358248472213745,
"learning_rate": 1.3983180987202926e-05,
"loss": 0.5987,
"step": 7310
},
{
"epoch": 2.676416819012797,
"grad_norm": 3.431640386581421,
"learning_rate": 1.396124314442413e-05,
"loss": 0.891,
"step": 7320
},
{
"epoch": 2.680073126142596,
"grad_norm": 5.032719612121582,
"learning_rate": 1.3939305301645338e-05,
"loss": 0.9732,
"step": 7330
},
{
"epoch": 2.683729433272395,
"grad_norm": 7.277076721191406,
"learning_rate": 1.3917367458866545e-05,
"loss": 1.1068,
"step": 7340
},
{
"epoch": 2.6873857404021937,
"grad_norm": 2.9995198249816895,
"learning_rate": 1.3895429616087752e-05,
"loss": 0.8166,
"step": 7350
},
{
"epoch": 2.691042047531993,
"grad_norm": 3.2115001678466797,
"learning_rate": 1.3873491773308959e-05,
"loss": 0.7981,
"step": 7360
},
{
"epoch": 2.6946983546617918,
"grad_norm": 0.935015082359314,
"learning_rate": 1.3851553930530165e-05,
"loss": 0.5872,
"step": 7370
},
{
"epoch": 2.6983546617915906,
"grad_norm": 3.3315343856811523,
"learning_rate": 1.3829616087751371e-05,
"loss": 0.593,
"step": 7380
},
{
"epoch": 2.7020109689213894,
"grad_norm": 8.530818939208984,
"learning_rate": 1.3807678244972579e-05,
"loss": 1.2569,
"step": 7390
},
{
"epoch": 2.705667276051188,
"grad_norm": 3.22756290435791,
"learning_rate": 1.3785740402193785e-05,
"loss": 0.8342,
"step": 7400
},
{
"epoch": 2.7093235831809874,
"grad_norm": 1.0916093587875366,
"learning_rate": 1.3763802559414993e-05,
"loss": 0.8432,
"step": 7410
},
{
"epoch": 2.7129798903107862,
"grad_norm": 5.046055793762207,
"learning_rate": 1.3741864716636197e-05,
"loss": 0.8075,
"step": 7420
},
{
"epoch": 2.716636197440585,
"grad_norm": 4.796830654144287,
"learning_rate": 1.3719926873857405e-05,
"loss": 1.1543,
"step": 7430
},
{
"epoch": 2.720292504570384,
"grad_norm": 5.081254005432129,
"learning_rate": 1.3697989031078611e-05,
"loss": 0.9912,
"step": 7440
},
{
"epoch": 2.7239488117001827,
"grad_norm": 3.72564697265625,
"learning_rate": 1.3676051188299817e-05,
"loss": 0.7743,
"step": 7450
},
{
"epoch": 2.727605118829982,
"grad_norm": 5.735417366027832,
"learning_rate": 1.3654113345521025e-05,
"loss": 0.8145,
"step": 7460
},
{
"epoch": 2.7312614259597807,
"grad_norm": 2.6865832805633545,
"learning_rate": 1.363217550274223e-05,
"loss": 1.0108,
"step": 7470
},
{
"epoch": 2.7349177330895795,
"grad_norm": 4.572368621826172,
"learning_rate": 1.3610237659963437e-05,
"loss": 0.8924,
"step": 7480
},
{
"epoch": 2.7385740402193783,
"grad_norm": 5.849616050720215,
"learning_rate": 1.3588299817184644e-05,
"loss": 0.9521,
"step": 7490
},
{
"epoch": 2.742230347349177,
"grad_norm": 3.1010758876800537,
"learning_rate": 1.3566361974405851e-05,
"loss": 0.9336,
"step": 7500
},
{
"epoch": 2.7458866544789764,
"grad_norm": 4.738924980163574,
"learning_rate": 1.3544424131627057e-05,
"loss": 0.6897,
"step": 7510
},
{
"epoch": 2.749542961608775,
"grad_norm": 6.994441032409668,
"learning_rate": 1.3522486288848264e-05,
"loss": 1.0206,
"step": 7520
},
{
"epoch": 2.753199268738574,
"grad_norm": 2.939159393310547,
"learning_rate": 1.350054844606947e-05,
"loss": 1.2065,
"step": 7530
},
{
"epoch": 2.756855575868373,
"grad_norm": 5.182316780090332,
"learning_rate": 1.3478610603290678e-05,
"loss": 0.9884,
"step": 7540
},
{
"epoch": 2.7605118829981716,
"grad_norm": 4.590856552124023,
"learning_rate": 1.3456672760511884e-05,
"loss": 0.8894,
"step": 7550
},
{
"epoch": 2.764168190127971,
"grad_norm": 5.282886505126953,
"learning_rate": 1.343473491773309e-05,
"loss": 0.8427,
"step": 7560
},
{
"epoch": 2.7678244972577697,
"grad_norm": 3.7344796657562256,
"learning_rate": 1.3412797074954296e-05,
"loss": 0.7983,
"step": 7570
},
{
"epoch": 2.7714808043875685,
"grad_norm": 3.6710190773010254,
"learning_rate": 1.3390859232175502e-05,
"loss": 0.7053,
"step": 7580
},
{
"epoch": 2.7751371115173673,
"grad_norm": 3.3527188301086426,
"learning_rate": 1.336892138939671e-05,
"loss": 0.7733,
"step": 7590
},
{
"epoch": 2.778793418647166,
"grad_norm": 4.841655254364014,
"learning_rate": 1.3346983546617916e-05,
"loss": 0.8462,
"step": 7600
},
{
"epoch": 2.7824497257769654,
"grad_norm": 1.9838179349899292,
"learning_rate": 1.3325045703839124e-05,
"loss": 0.6837,
"step": 7610
},
{
"epoch": 2.786106032906764,
"grad_norm": 4.187015056610107,
"learning_rate": 1.3303107861060328e-05,
"loss": 0.7068,
"step": 7620
},
{
"epoch": 2.789762340036563,
"grad_norm": 4.960452079772949,
"learning_rate": 1.3281170018281536e-05,
"loss": 0.8144,
"step": 7630
},
{
"epoch": 2.7934186471663622,
"grad_norm": 5.154735565185547,
"learning_rate": 1.3259232175502742e-05,
"loss": 0.5711,
"step": 7640
},
{
"epoch": 2.797074954296161,
"grad_norm": 7.650027275085449,
"learning_rate": 1.323729433272395e-05,
"loss": 1.0676,
"step": 7650
},
{
"epoch": 2.80073126142596,
"grad_norm": 2.561450242996216,
"learning_rate": 1.3215356489945156e-05,
"loss": 1.0003,
"step": 7660
},
{
"epoch": 2.8043875685557587,
"grad_norm": 5.075997352600098,
"learning_rate": 1.319341864716636e-05,
"loss": 0.7371,
"step": 7670
},
{
"epoch": 2.8080438756855575,
"grad_norm": 5.0892181396484375,
"learning_rate": 1.3171480804387569e-05,
"loss": 0.7836,
"step": 7680
},
{
"epoch": 2.8117001828153567,
"grad_norm": 2.6121692657470703,
"learning_rate": 1.3149542961608775e-05,
"loss": 0.7845,
"step": 7690
},
{
"epoch": 2.8153564899451555,
"grad_norm": 4.506619453430176,
"learning_rate": 1.3127605118829983e-05,
"loss": 0.9367,
"step": 7700
},
{
"epoch": 2.8190127970749543,
"grad_norm": 6.061919212341309,
"learning_rate": 1.3105667276051189e-05,
"loss": 1.3084,
"step": 7710
},
{
"epoch": 2.822669104204753,
"grad_norm": 5.916521072387695,
"learning_rate": 1.3083729433272395e-05,
"loss": 0.7738,
"step": 7720
},
{
"epoch": 2.826325411334552,
"grad_norm": 4.980602741241455,
"learning_rate": 1.3061791590493601e-05,
"loss": 0.9172,
"step": 7730
},
{
"epoch": 2.829981718464351,
"grad_norm": 5.4095139503479,
"learning_rate": 1.3039853747714809e-05,
"loss": 0.9361,
"step": 7740
},
{
"epoch": 2.83363802559415,
"grad_norm": 2.727238178253174,
"learning_rate": 1.3017915904936015e-05,
"loss": 0.6558,
"step": 7750
},
{
"epoch": 2.837294332723949,
"grad_norm": 6.939225196838379,
"learning_rate": 1.2995978062157223e-05,
"loss": 0.8616,
"step": 7760
},
{
"epoch": 2.8409506398537476,
"grad_norm": 2.1128830909729004,
"learning_rate": 1.2974040219378427e-05,
"loss": 0.7853,
"step": 7770
},
{
"epoch": 2.8446069469835464,
"grad_norm": 5.917961120605469,
"learning_rate": 1.2952102376599635e-05,
"loss": 0.8472,
"step": 7780
},
{
"epoch": 2.8482632541133457,
"grad_norm": 3.7327582836151123,
"learning_rate": 1.2930164533820841e-05,
"loss": 0.7266,
"step": 7790
},
{
"epoch": 2.8519195612431445,
"grad_norm": 6.155743598937988,
"learning_rate": 1.2908226691042047e-05,
"loss": 0.797,
"step": 7800
},
{
"epoch": 2.8555758683729433,
"grad_norm": 2.516705274581909,
"learning_rate": 1.2886288848263255e-05,
"loss": 0.7553,
"step": 7810
},
{
"epoch": 2.859232175502742,
"grad_norm": 8.74838924407959,
"learning_rate": 1.286435100548446e-05,
"loss": 1.0794,
"step": 7820
},
{
"epoch": 2.862888482632541,
"grad_norm": 5.0210113525390625,
"learning_rate": 1.2842413162705668e-05,
"loss": 0.6795,
"step": 7830
},
{
"epoch": 2.86654478976234,
"grad_norm": 6.808406352996826,
"learning_rate": 1.2820475319926874e-05,
"loss": 0.8541,
"step": 7840
},
{
"epoch": 2.870201096892139,
"grad_norm": 8.608129501342773,
"learning_rate": 1.2798537477148082e-05,
"loss": 0.989,
"step": 7850
},
{
"epoch": 2.873857404021938,
"grad_norm": 3.3586058616638184,
"learning_rate": 1.2776599634369288e-05,
"loss": 0.961,
"step": 7860
},
{
"epoch": 2.8775137111517366,
"grad_norm": 3.2911384105682373,
"learning_rate": 1.2754661791590494e-05,
"loss": 0.7997,
"step": 7870
},
{
"epoch": 2.8811700182815354,
"grad_norm": 3.714557647705078,
"learning_rate": 1.27327239488117e-05,
"loss": 0.914,
"step": 7880
},
{
"epoch": 2.8848263254113347,
"grad_norm": 3.879274368286133,
"learning_rate": 1.2710786106032908e-05,
"loss": 0.8965,
"step": 7890
},
{
"epoch": 2.8884826325411335,
"grad_norm": 4.490417003631592,
"learning_rate": 1.2688848263254114e-05,
"loss": 0.7234,
"step": 7900
},
{
"epoch": 2.8921389396709323,
"grad_norm": 2.7484891414642334,
"learning_rate": 1.266691042047532e-05,
"loss": 0.6367,
"step": 7910
},
{
"epoch": 2.8957952468007315,
"grad_norm": 4.121150493621826,
"learning_rate": 1.2644972577696526e-05,
"loss": 0.9689,
"step": 7920
},
{
"epoch": 2.89945155393053,
"grad_norm": 4.113166332244873,
"learning_rate": 1.2623034734917732e-05,
"loss": 0.7614,
"step": 7930
},
{
"epoch": 2.903107861060329,
"grad_norm": 2.689598321914673,
"learning_rate": 1.260109689213894e-05,
"loss": 0.8595,
"step": 7940
},
{
"epoch": 2.906764168190128,
"grad_norm": 4.187771320343018,
"learning_rate": 1.2579159049360146e-05,
"loss": 0.8277,
"step": 7950
},
{
"epoch": 2.9104204753199268,
"grad_norm": 1.3637969493865967,
"learning_rate": 1.2557221206581354e-05,
"loss": 0.4846,
"step": 7960
},
{
"epoch": 2.914076782449726,
"grad_norm": 2.0621910095214844,
"learning_rate": 1.2535283363802559e-05,
"loss": 0.7043,
"step": 7970
},
{
"epoch": 2.917733089579525,
"grad_norm": 3.3105924129486084,
"learning_rate": 1.2513345521023766e-05,
"loss": 0.9526,
"step": 7980
},
{
"epoch": 2.9213893967093236,
"grad_norm": 6.157617092132568,
"learning_rate": 1.2491407678244973e-05,
"loss": 0.709,
"step": 7990
},
{
"epoch": 2.9250457038391224,
"grad_norm": 3.511514663696289,
"learning_rate": 1.246946983546618e-05,
"loss": 0.97,
"step": 8000
},
{
"epoch": 2.9287020109689212,
"grad_norm": 6.093450546264648,
"learning_rate": 1.2447531992687387e-05,
"loss": 0.5408,
"step": 8010
},
{
"epoch": 2.9323583180987205,
"grad_norm": 3.3278634548187256,
"learning_rate": 1.2425594149908593e-05,
"loss": 0.7725,
"step": 8020
},
{
"epoch": 2.9360146252285193,
"grad_norm": 2.85172963142395,
"learning_rate": 1.2403656307129799e-05,
"loss": 0.6774,
"step": 8030
},
{
"epoch": 2.939670932358318,
"grad_norm": 3.4190468788146973,
"learning_rate": 1.2381718464351005e-05,
"loss": 0.7875,
"step": 8040
},
{
"epoch": 2.943327239488117,
"grad_norm": 4.990618705749512,
"learning_rate": 1.2359780621572213e-05,
"loss": 0.8861,
"step": 8050
},
{
"epoch": 2.9469835466179157,
"grad_norm": 3.767422676086426,
"learning_rate": 1.2337842778793419e-05,
"loss": 0.8276,
"step": 8060
},
{
"epoch": 2.950639853747715,
"grad_norm": 5.137510776519775,
"learning_rate": 1.2315904936014625e-05,
"loss": 0.7357,
"step": 8070
},
{
"epoch": 2.954296160877514,
"grad_norm": 7.071557998657227,
"learning_rate": 1.2293967093235831e-05,
"loss": 0.8031,
"step": 8080
},
{
"epoch": 2.9579524680073126,
"grad_norm": 1.2824524641036987,
"learning_rate": 1.2272029250457039e-05,
"loss": 0.7664,
"step": 8090
},
{
"epoch": 2.9616087751371114,
"grad_norm": 3.3414242267608643,
"learning_rate": 1.2250091407678245e-05,
"loss": 0.6475,
"step": 8100
},
{
"epoch": 2.96526508226691,
"grad_norm": 2.206388473510742,
"learning_rate": 1.2228153564899453e-05,
"loss": 0.8417,
"step": 8110
},
{
"epoch": 2.9689213893967095,
"grad_norm": 1.1660181283950806,
"learning_rate": 1.2208409506398537e-05,
"loss": 0.7856,
"step": 8120
},
{
"epoch": 2.9725776965265083,
"grad_norm": 4.5918121337890625,
"learning_rate": 1.2186471663619745e-05,
"loss": 1.0086,
"step": 8130
},
{
"epoch": 2.976234003656307,
"grad_norm": 5.133539199829102,
"learning_rate": 1.2164533820840951e-05,
"loss": 0.7742,
"step": 8140
},
{
"epoch": 2.979890310786106,
"grad_norm": 3.002700090408325,
"learning_rate": 1.2142595978062159e-05,
"loss": 0.6667,
"step": 8150
},
{
"epoch": 2.9835466179159047,
"grad_norm": 2.861591100692749,
"learning_rate": 1.2120658135283363e-05,
"loss": 0.688,
"step": 8160
},
{
"epoch": 2.987202925045704,
"grad_norm": 6.372570037841797,
"learning_rate": 1.2098720292504571e-05,
"loss": 0.7678,
"step": 8170
},
{
"epoch": 2.9908592321755028,
"grad_norm": 2.619347333908081,
"learning_rate": 1.2076782449725777e-05,
"loss": 0.6073,
"step": 8180
},
{
"epoch": 2.9945155393053016,
"grad_norm": 5.605367183685303,
"learning_rate": 1.2054844606946983e-05,
"loss": 1.0449,
"step": 8190
},
{
"epoch": 2.998171846435101,
"grad_norm": 5.622511863708496,
"learning_rate": 1.2032906764168191e-05,
"loss": 0.8425,
"step": 8200
},
{
"epoch": 3.0018281535648996,
"grad_norm": 6.9952712059021,
"learning_rate": 1.2010968921389397e-05,
"loss": 0.6519,
"step": 8210
},
{
"epoch": 3.0054844606946984,
"grad_norm": 3.982757806777954,
"learning_rate": 1.1989031078610603e-05,
"loss": 0.6918,
"step": 8220
},
{
"epoch": 3.0091407678244972,
"grad_norm": 0.8815748691558838,
"learning_rate": 1.196709323583181e-05,
"loss": 0.7903,
"step": 8230
},
{
"epoch": 3.012797074954296,
"grad_norm": 3.3442909717559814,
"learning_rate": 1.1945155393053017e-05,
"loss": 0.8296,
"step": 8240
},
{
"epoch": 3.016453382084095,
"grad_norm": 2.6583852767944336,
"learning_rate": 1.1923217550274223e-05,
"loss": 0.6184,
"step": 8250
},
{
"epoch": 3.020109689213894,
"grad_norm": 7.427060127258301,
"learning_rate": 1.1901279707495431e-05,
"loss": 0.8173,
"step": 8260
},
{
"epoch": 3.023765996343693,
"grad_norm": 2.647944927215576,
"learning_rate": 1.1879341864716636e-05,
"loss": 0.5993,
"step": 8270
},
{
"epoch": 3.0274223034734917,
"grad_norm": 4.050746917724609,
"learning_rate": 1.1857404021937844e-05,
"loss": 0.8844,
"step": 8280
},
{
"epoch": 3.0310786106032905,
"grad_norm": 3.5873947143554688,
"learning_rate": 1.183546617915905e-05,
"loss": 0.9434,
"step": 8290
},
{
"epoch": 3.03473491773309,
"grad_norm": 4.723058223724365,
"learning_rate": 1.1813528336380256e-05,
"loss": 0.9812,
"step": 8300
},
{
"epoch": 3.0383912248628886,
"grad_norm": 3.5461058616638184,
"learning_rate": 1.1791590493601464e-05,
"loss": 0.7292,
"step": 8310
},
{
"epoch": 3.0420475319926874,
"grad_norm": 4.339077949523926,
"learning_rate": 1.1769652650822668e-05,
"loss": 0.6777,
"step": 8320
},
{
"epoch": 3.045703839122486,
"grad_norm": 5.269365310668945,
"learning_rate": 1.1747714808043876e-05,
"loss": 0.9067,
"step": 8330
},
{
"epoch": 3.049360146252285,
"grad_norm": 4.5295562744140625,
"learning_rate": 1.1725776965265082e-05,
"loss": 0.843,
"step": 8340
},
{
"epoch": 3.0530164533820843,
"grad_norm": 2.6268155574798584,
"learning_rate": 1.170383912248629e-05,
"loss": 0.6744,
"step": 8350
},
{
"epoch": 3.056672760511883,
"grad_norm": 0.5700417757034302,
"learning_rate": 1.1681901279707496e-05,
"loss": 0.5303,
"step": 8360
},
{
"epoch": 3.060329067641682,
"grad_norm": 2.430975914001465,
"learning_rate": 1.1659963436928702e-05,
"loss": 0.4397,
"step": 8370
},
{
"epoch": 3.0639853747714807,
"grad_norm": 5.6289167404174805,
"learning_rate": 1.1638025594149908e-05,
"loss": 0.844,
"step": 8380
},
{
"epoch": 3.0676416819012795,
"grad_norm": 4.169682025909424,
"learning_rate": 1.1616087751371116e-05,
"loss": 0.6566,
"step": 8390
},
{
"epoch": 3.0712979890310788,
"grad_norm": 5.4011101722717285,
"learning_rate": 1.1594149908592322e-05,
"loss": 0.8949,
"step": 8400
},
{
"epoch": 3.0749542961608776,
"grad_norm": 6.648904323577881,
"learning_rate": 1.157221206581353e-05,
"loss": 1.166,
"step": 8410
},
{
"epoch": 3.0786106032906764,
"grad_norm": 6.321312427520752,
"learning_rate": 1.1550274223034735e-05,
"loss": 0.8976,
"step": 8420
},
{
"epoch": 3.082266910420475,
"grad_norm": 2.092905044555664,
"learning_rate": 1.152833638025594e-05,
"loss": 0.8257,
"step": 8430
},
{
"epoch": 3.0859232175502744,
"grad_norm": 2.951486825942993,
"learning_rate": 1.1506398537477149e-05,
"loss": 0.7798,
"step": 8440
},
{
"epoch": 3.0895795246800732,
"grad_norm": 2.4010651111602783,
"learning_rate": 1.1484460694698355e-05,
"loss": 0.8107,
"step": 8450
},
{
"epoch": 3.093235831809872,
"grad_norm": 5.102409362792969,
"learning_rate": 1.1462522851919563e-05,
"loss": 0.8601,
"step": 8460
},
{
"epoch": 3.096892138939671,
"grad_norm": 4.4188008308410645,
"learning_rate": 1.1440585009140767e-05,
"loss": 0.5271,
"step": 8470
},
{
"epoch": 3.1005484460694697,
"grad_norm": 2.7852301597595215,
"learning_rate": 1.1418647166361975e-05,
"loss": 0.5515,
"step": 8480
},
{
"epoch": 3.104204753199269,
"grad_norm": 3.6287953853607178,
"learning_rate": 1.1396709323583181e-05,
"loss": 0.9211,
"step": 8490
},
{
"epoch": 3.1078610603290677,
"grad_norm": 3.436657428741455,
"learning_rate": 1.1374771480804389e-05,
"loss": 0.7869,
"step": 8500
},
{
"epoch": 3.1115173674588665,
"grad_norm": 2.1031956672668457,
"learning_rate": 1.1352833638025595e-05,
"loss": 0.7247,
"step": 8510
},
{
"epoch": 3.1151736745886653,
"grad_norm": 3.4341351985931396,
"learning_rate": 1.1330895795246801e-05,
"loss": 0.747,
"step": 8520
},
{
"epoch": 3.118829981718464,
"grad_norm": 5.897623062133789,
"learning_rate": 1.1308957952468007e-05,
"loss": 0.7396,
"step": 8530
},
{
"epoch": 3.1224862888482634,
"grad_norm": 6.546688556671143,
"learning_rate": 1.1287020109689213e-05,
"loss": 0.9603,
"step": 8540
},
{
"epoch": 3.126142595978062,
"grad_norm": 3.705522060394287,
"learning_rate": 1.1265082266910421e-05,
"loss": 0.8995,
"step": 8550
},
{
"epoch": 3.129798903107861,
"grad_norm": 4.903218746185303,
"learning_rate": 1.1243144424131627e-05,
"loss": 0.8082,
"step": 8560
},
{
"epoch": 3.13345521023766,
"grad_norm": 3.2400360107421875,
"learning_rate": 1.1221206581352834e-05,
"loss": 0.8776,
"step": 8570
},
{
"epoch": 3.137111517367459,
"grad_norm": 5.3413472175598145,
"learning_rate": 1.119926873857404e-05,
"loss": 0.9777,
"step": 8580
},
{
"epoch": 3.140767824497258,
"grad_norm": 2.983618974685669,
"learning_rate": 1.1177330895795248e-05,
"loss": 0.8496,
"step": 8590
},
{
"epoch": 3.1444241316270567,
"grad_norm": 5.781644821166992,
"learning_rate": 1.1155393053016454e-05,
"loss": 0.8602,
"step": 8600
},
{
"epoch": 3.1480804387568555,
"grad_norm": 2.5962939262390137,
"learning_rate": 1.1133455210237662e-05,
"loss": 0.7064,
"step": 8610
},
{
"epoch": 3.1517367458866543,
"grad_norm": 1.6077173948287964,
"learning_rate": 1.1111517367458866e-05,
"loss": 0.9999,
"step": 8620
},
{
"epoch": 3.1553930530164536,
"grad_norm": 6.301138401031494,
"learning_rate": 1.1089579524680074e-05,
"loss": 0.7141,
"step": 8630
},
{
"epoch": 3.1590493601462524,
"grad_norm": 6.716737747192383,
"learning_rate": 1.106764168190128e-05,
"loss": 0.9286,
"step": 8640
},
{
"epoch": 3.162705667276051,
"grad_norm": 1.6867204904556274,
"learning_rate": 1.1045703839122488e-05,
"loss": 0.5474,
"step": 8650
},
{
"epoch": 3.16636197440585,
"grad_norm": 4.190735340118408,
"learning_rate": 1.1023765996343694e-05,
"loss": 0.8172,
"step": 8660
},
{
"epoch": 3.170018281535649,
"grad_norm": 4.85944128036499,
"learning_rate": 1.1001828153564898e-05,
"loss": 0.6575,
"step": 8670
},
{
"epoch": 3.173674588665448,
"grad_norm": 3.7237160205841064,
"learning_rate": 1.0979890310786106e-05,
"loss": 0.882,
"step": 8680
},
{
"epoch": 3.177330895795247,
"grad_norm": 3.742342710494995,
"learning_rate": 1.0957952468007312e-05,
"loss": 0.6917,
"step": 8690
},
{
"epoch": 3.1809872029250457,
"grad_norm": 3.6586384773254395,
"learning_rate": 1.093601462522852e-05,
"loss": 1.279,
"step": 8700
},
{
"epoch": 3.1846435100548445,
"grad_norm": 7.146944522857666,
"learning_rate": 1.0914076782449726e-05,
"loss": 0.944,
"step": 8710
},
{
"epoch": 3.1882998171846433,
"grad_norm": 4.166520595550537,
"learning_rate": 1.0892138939670932e-05,
"loss": 0.8725,
"step": 8720
},
{
"epoch": 3.1919561243144425,
"grad_norm": 3.07065486907959,
"learning_rate": 1.0870201096892139e-05,
"loss": 0.7427,
"step": 8730
},
{
"epoch": 3.1956124314442413,
"grad_norm": 3.676762342453003,
"learning_rate": 1.0848263254113346e-05,
"loss": 0.5641,
"step": 8740
},
{
"epoch": 3.19926873857404,
"grad_norm": 6.545246124267578,
"learning_rate": 1.0826325411334553e-05,
"loss": 0.8099,
"step": 8750
},
{
"epoch": 3.202925045703839,
"grad_norm": 4.962130069732666,
"learning_rate": 1.080438756855576e-05,
"loss": 0.7208,
"step": 8760
},
{
"epoch": 3.206581352833638,
"grad_norm": 1.6501739025115967,
"learning_rate": 1.0782449725776965e-05,
"loss": 0.7099,
"step": 8770
},
{
"epoch": 3.210237659963437,
"grad_norm": 1.7010256052017212,
"learning_rate": 1.0760511882998171e-05,
"loss": 0.6378,
"step": 8780
},
{
"epoch": 3.213893967093236,
"grad_norm": 3.4093239307403564,
"learning_rate": 1.0738574040219379e-05,
"loss": 0.9871,
"step": 8790
},
{
"epoch": 3.2175502742230346,
"grad_norm": 3.0757012367248535,
"learning_rate": 1.0716636197440585e-05,
"loss": 0.8088,
"step": 8800
},
{
"epoch": 3.2212065813528334,
"grad_norm": 5.524442672729492,
"learning_rate": 1.0694698354661793e-05,
"loss": 0.6628,
"step": 8810
},
{
"epoch": 3.2248628884826327,
"grad_norm": 5.470324993133545,
"learning_rate": 1.0672760511882997e-05,
"loss": 0.8868,
"step": 8820
},
{
"epoch": 3.2285191956124315,
"grad_norm": 4.4467363357543945,
"learning_rate": 1.0650822669104205e-05,
"loss": 0.8019,
"step": 8830
},
{
"epoch": 3.2321755027422303,
"grad_norm": 4.382303714752197,
"learning_rate": 1.0628884826325411e-05,
"loss": 0.7377,
"step": 8840
},
{
"epoch": 3.235831809872029,
"grad_norm": 5.965306282043457,
"learning_rate": 1.0606946983546619e-05,
"loss": 0.7711,
"step": 8850
},
{
"epoch": 3.2394881170018284,
"grad_norm": 3.7286956310272217,
"learning_rate": 1.0585009140767825e-05,
"loss": 0.6884,
"step": 8860
},
{
"epoch": 3.243144424131627,
"grad_norm": 4.183840274810791,
"learning_rate": 1.0563071297989031e-05,
"loss": 0.6954,
"step": 8870
},
{
"epoch": 3.246800731261426,
"grad_norm": 2.53548526763916,
"learning_rate": 1.0541133455210237e-05,
"loss": 0.674,
"step": 8880
},
{
"epoch": 3.250457038391225,
"grad_norm": 4.073317527770996,
"learning_rate": 1.0519195612431444e-05,
"loss": 0.7438,
"step": 8890
},
{
"epoch": 3.2541133455210236,
"grad_norm": 0.9088375568389893,
"learning_rate": 1.0497257769652651e-05,
"loss": 0.4774,
"step": 8900
},
{
"epoch": 3.257769652650823,
"grad_norm": 3.897162914276123,
"learning_rate": 1.0475319926873858e-05,
"loss": 0.9991,
"step": 8910
},
{
"epoch": 3.2614259597806217,
"grad_norm": 4.331843376159668,
"learning_rate": 1.0453382084095064e-05,
"loss": 0.7728,
"step": 8920
},
{
"epoch": 3.2650822669104205,
"grad_norm": 4.146157264709473,
"learning_rate": 1.043144424131627e-05,
"loss": 0.8487,
"step": 8930
},
{
"epoch": 3.2687385740402193,
"grad_norm": 3.263507127761841,
"learning_rate": 1.0409506398537478e-05,
"loss": 0.7174,
"step": 8940
},
{
"epoch": 3.272394881170018,
"grad_norm": 2.1005804538726807,
"learning_rate": 1.0387568555758684e-05,
"loss": 0.6679,
"step": 8950
},
{
"epoch": 3.2760511882998173,
"grad_norm": 5.195742607116699,
"learning_rate": 1.0365630712979892e-05,
"loss": 0.7573,
"step": 8960
},
{
"epoch": 3.279707495429616,
"grad_norm": 6.104463577270508,
"learning_rate": 1.0343692870201096e-05,
"loss": 0.9765,
"step": 8970
},
{
"epoch": 3.283363802559415,
"grad_norm": 4.647432327270508,
"learning_rate": 1.0321755027422304e-05,
"loss": 0.8738,
"step": 8980
},
{
"epoch": 3.2870201096892138,
"grad_norm": 2.8530044555664062,
"learning_rate": 1.029981718464351e-05,
"loss": 0.9031,
"step": 8990
},
{
"epoch": 3.2906764168190126,
"grad_norm": 3.9043076038360596,
"learning_rate": 1.0277879341864718e-05,
"loss": 0.8052,
"step": 9000
},
{
"epoch": 3.294332723948812,
"grad_norm": 4.055187225341797,
"learning_rate": 1.0255941499085924e-05,
"loss": 1.1009,
"step": 9010
},
{
"epoch": 3.2979890310786106,
"grad_norm": 5.00345516204834,
"learning_rate": 1.0234003656307129e-05,
"loss": 0.6651,
"step": 9020
},
{
"epoch": 3.3016453382084094,
"grad_norm": 6.529092788696289,
"learning_rate": 1.0212065813528336e-05,
"loss": 0.8191,
"step": 9030
},
{
"epoch": 3.3053016453382082,
"grad_norm": 6.646930694580078,
"learning_rate": 1.0190127970749543e-05,
"loss": 0.8653,
"step": 9040
},
{
"epoch": 3.3089579524680075,
"grad_norm": 3.7335169315338135,
"learning_rate": 1.016819012797075e-05,
"loss": 0.5912,
"step": 9050
},
{
"epoch": 3.3126142595978063,
"grad_norm": 4.354644298553467,
"learning_rate": 1.0146252285191956e-05,
"loss": 0.8478,
"step": 9060
},
{
"epoch": 3.316270566727605,
"grad_norm": 5.461722373962402,
"learning_rate": 1.0124314442413163e-05,
"loss": 0.8978,
"step": 9070
},
{
"epoch": 3.319926873857404,
"grad_norm": 5.001184463500977,
"learning_rate": 1.0102376599634369e-05,
"loss": 0.7269,
"step": 9080
},
{
"epoch": 3.3235831809872027,
"grad_norm": 6.416454792022705,
"learning_rate": 1.0080438756855577e-05,
"loss": 0.5774,
"step": 9090
},
{
"epoch": 3.327239488117002,
"grad_norm": 1.3187748193740845,
"learning_rate": 1.0058500914076783e-05,
"loss": 0.7587,
"step": 9100
},
{
"epoch": 3.330895795246801,
"grad_norm": 4.8642120361328125,
"learning_rate": 1.003656307129799e-05,
"loss": 0.5372,
"step": 9110
},
{
"epoch": 3.3345521023765996,
"grad_norm": 7.198103904724121,
"learning_rate": 1.0014625228519195e-05,
"loss": 0.7514,
"step": 9120
},
{
"epoch": 3.3382084095063984,
"grad_norm": 3.342548131942749,
"learning_rate": 9.992687385740401e-06,
"loss": 0.6805,
"step": 9130
},
{
"epoch": 3.3418647166361977,
"grad_norm": 7.126440048217773,
"learning_rate": 9.970749542961609e-06,
"loss": 0.7093,
"step": 9140
},
{
"epoch": 3.3455210237659965,
"grad_norm": 3.5442097187042236,
"learning_rate": 9.948811700182815e-06,
"loss": 1.0982,
"step": 9150
},
{
"epoch": 3.3491773308957953,
"grad_norm": 6.7846550941467285,
"learning_rate": 9.926873857404023e-06,
"loss": 1.0415,
"step": 9160
},
{
"epoch": 3.352833638025594,
"grad_norm": 4.274459362030029,
"learning_rate": 9.904936014625227e-06,
"loss": 0.8924,
"step": 9170
},
{
"epoch": 3.356489945155393,
"grad_norm": 1.8313312530517578,
"learning_rate": 9.882998171846435e-06,
"loss": 0.5321,
"step": 9180
},
{
"epoch": 3.360146252285192,
"grad_norm": 3.1850969791412354,
"learning_rate": 9.861060329067641e-06,
"loss": 0.5233,
"step": 9190
},
{
"epoch": 3.363802559414991,
"grad_norm": 4.866973400115967,
"learning_rate": 9.83912248628885e-06,
"loss": 1.079,
"step": 9200
},
{
"epoch": 3.3674588665447898,
"grad_norm": 6.718703269958496,
"learning_rate": 9.817184643510055e-06,
"loss": 0.8018,
"step": 9210
},
{
"epoch": 3.3711151736745886,
"grad_norm": 2.3948628902435303,
"learning_rate": 9.795246800731262e-06,
"loss": 0.7269,
"step": 9220
},
{
"epoch": 3.3747714808043874,
"grad_norm": 5.219935417175293,
"learning_rate": 9.773308957952468e-06,
"loss": 1.0058,
"step": 9230
},
{
"epoch": 3.3784277879341866,
"grad_norm": 2.0924437046051025,
"learning_rate": 9.751371115173675e-06,
"loss": 0.6634,
"step": 9240
},
{
"epoch": 3.3820840950639854,
"grad_norm": 3.7175605297088623,
"learning_rate": 9.729433272394882e-06,
"loss": 0.7466,
"step": 9250
},
{
"epoch": 3.3857404021937842,
"grad_norm": 2.551532745361328,
"learning_rate": 9.707495429616088e-06,
"loss": 0.5294,
"step": 9260
},
{
"epoch": 3.389396709323583,
"grad_norm": 4.496357440948486,
"learning_rate": 9.685557586837294e-06,
"loss": 0.6347,
"step": 9270
},
{
"epoch": 3.393053016453382,
"grad_norm": 3.644022226333618,
"learning_rate": 9.6636197440585e-06,
"loss": 0.6684,
"step": 9280
},
{
"epoch": 3.396709323583181,
"grad_norm": 7.155831336975098,
"learning_rate": 9.641681901279708e-06,
"loss": 0.6775,
"step": 9290
},
{
"epoch": 3.40036563071298,
"grad_norm": 2.396113872528076,
"learning_rate": 9.619744058500914e-06,
"loss": 0.9559,
"step": 9300
},
{
"epoch": 3.4040219378427787,
"grad_norm": 4.719156742095947,
"learning_rate": 9.597806215722122e-06,
"loss": 0.7062,
"step": 9310
},
{
"epoch": 3.4076782449725775,
"grad_norm": 6.176454544067383,
"learning_rate": 9.575868372943328e-06,
"loss": 0.6693,
"step": 9320
},
{
"epoch": 3.411334552102377,
"grad_norm": 5.314862251281738,
"learning_rate": 9.553930530164534e-06,
"loss": 0.7944,
"step": 9330
},
{
"epoch": 3.4149908592321756,
"grad_norm": 3.4913902282714844,
"learning_rate": 9.53199268738574e-06,
"loss": 0.648,
"step": 9340
},
{
"epoch": 3.4186471663619744,
"grad_norm": 5.6252217292785645,
"learning_rate": 9.510054844606948e-06,
"loss": 1.126,
"step": 9350
},
{
"epoch": 3.422303473491773,
"grad_norm": 2.5324652194976807,
"learning_rate": 9.488117001828154e-06,
"loss": 0.6604,
"step": 9360
},
{
"epoch": 3.425959780621572,
"grad_norm": 1.6346749067306519,
"learning_rate": 9.46617915904936e-06,
"loss": 0.7154,
"step": 9370
},
{
"epoch": 3.4296160877513713,
"grad_norm": 2.9343535900115967,
"learning_rate": 9.444241316270567e-06,
"loss": 0.8223,
"step": 9380
},
{
"epoch": 3.43327239488117,
"grad_norm": 5.402102947235107,
"learning_rate": 9.422303473491773e-06,
"loss": 1.0261,
"step": 9390
},
{
"epoch": 3.436928702010969,
"grad_norm": 4.360336780548096,
"learning_rate": 9.40036563071298e-06,
"loss": 0.8469,
"step": 9400
},
{
"epoch": 3.4405850091407677,
"grad_norm": 2.100147008895874,
"learning_rate": 9.378427787934187e-06,
"loss": 0.8319,
"step": 9410
},
{
"epoch": 3.444241316270567,
"grad_norm": 5.960880279541016,
"learning_rate": 9.356489945155395e-06,
"loss": 0.8822,
"step": 9420
},
{
"epoch": 3.4478976234003658,
"grad_norm": 1.511212706565857,
"learning_rate": 9.334552102376599e-06,
"loss": 0.6203,
"step": 9430
},
{
"epoch": 3.4515539305301646,
"grad_norm": 6.034298896789551,
"learning_rate": 9.312614259597807e-06,
"loss": 1.0723,
"step": 9440
},
{
"epoch": 3.4552102376599634,
"grad_norm": 3.4445579051971436,
"learning_rate": 9.290676416819013e-06,
"loss": 0.8172,
"step": 9450
},
{
"epoch": 3.458866544789762,
"grad_norm": 3.7017529010772705,
"learning_rate": 9.26873857404022e-06,
"loss": 0.8642,
"step": 9460
},
{
"epoch": 3.4625228519195614,
"grad_norm": 1.1782617568969727,
"learning_rate": 9.246800731261427e-06,
"loss": 0.738,
"step": 9470
},
{
"epoch": 3.4661791590493602,
"grad_norm": 3.201063394546509,
"learning_rate": 9.224862888482633e-06,
"loss": 0.658,
"step": 9480
},
{
"epoch": 3.469835466179159,
"grad_norm": 6.244758605957031,
"learning_rate": 9.20292504570384e-06,
"loss": 0.8515,
"step": 9490
},
{
"epoch": 3.473491773308958,
"grad_norm": 1.5054762363433838,
"learning_rate": 9.180987202925045e-06,
"loss": 0.6815,
"step": 9500
},
{
"epoch": 3.4771480804387567,
"grad_norm": 4.566993236541748,
"learning_rate": 9.159049360146253e-06,
"loss": 0.9876,
"step": 9510
},
{
"epoch": 3.480804387568556,
"grad_norm": 2.5225489139556885,
"learning_rate": 9.13711151736746e-06,
"loss": 0.6651,
"step": 9520
},
{
"epoch": 3.4844606946983547,
"grad_norm": 2.050199031829834,
"learning_rate": 9.115173674588665e-06,
"loss": 0.5577,
"step": 9530
},
{
"epoch": 3.4881170018281535,
"grad_norm": 4.673213958740234,
"learning_rate": 9.093235831809872e-06,
"loss": 0.4605,
"step": 9540
},
{
"epoch": 3.4917733089579523,
"grad_norm": 3.7386956214904785,
"learning_rate": 9.07129798903108e-06,
"loss": 0.7403,
"step": 9550
},
{
"epoch": 3.495429616087751,
"grad_norm": 3.0746006965637207,
"learning_rate": 9.049360146252286e-06,
"loss": 0.7544,
"step": 9560
},
{
"epoch": 3.4990859232175504,
"grad_norm": 2.793351650238037,
"learning_rate": 9.027422303473493e-06,
"loss": 0.6867,
"step": 9570
},
{
"epoch": 3.502742230347349,
"grad_norm": 2.7322490215301514,
"learning_rate": 9.005484460694698e-06,
"loss": 0.5481,
"step": 9580
},
{
"epoch": 3.506398537477148,
"grad_norm": 5.938803195953369,
"learning_rate": 8.983546617915906e-06,
"loss": 0.7047,
"step": 9590
},
{
"epoch": 3.510054844606947,
"grad_norm": 4.601770877838135,
"learning_rate": 8.961608775137112e-06,
"loss": 0.9434,
"step": 9600
},
{
"epoch": 3.5137111517367456,
"grad_norm": 4.575321674346924,
"learning_rate": 8.939670932358318e-06,
"loss": 0.9526,
"step": 9610
},
{
"epoch": 3.517367458866545,
"grad_norm": 2.3321361541748047,
"learning_rate": 8.917733089579526e-06,
"loss": 0.8838,
"step": 9620
},
{
"epoch": 3.5210237659963437,
"grad_norm": 4.160899639129639,
"learning_rate": 8.89579524680073e-06,
"loss": 0.7722,
"step": 9630
},
{
"epoch": 3.5246800731261425,
"grad_norm": 4.240328311920166,
"learning_rate": 8.873857404021938e-06,
"loss": 0.8319,
"step": 9640
},
{
"epoch": 3.5283363802559418,
"grad_norm": 5.453382968902588,
"learning_rate": 8.851919561243144e-06,
"loss": 0.7496,
"step": 9650
},
{
"epoch": 3.53199268738574,
"grad_norm": 4.4032087326049805,
"learning_rate": 8.829981718464352e-06,
"loss": 0.7413,
"step": 9660
},
{
"epoch": 3.5356489945155394,
"grad_norm": 1.5674322843551636,
"learning_rate": 8.808043875685558e-06,
"loss": 0.7772,
"step": 9670
},
{
"epoch": 3.539305301645338,
"grad_norm": 1.919179916381836,
"learning_rate": 8.786106032906764e-06,
"loss": 0.6127,
"step": 9680
},
{
"epoch": 3.542961608775137,
"grad_norm": 5.616965293884277,
"learning_rate": 8.76416819012797e-06,
"loss": 1.054,
"step": 9690
},
{
"epoch": 3.5466179159049362,
"grad_norm": 4.339515209197998,
"learning_rate": 8.742230347349178e-06,
"loss": 0.8764,
"step": 9700
},
{
"epoch": 3.550274223034735,
"grad_norm": 2.599030017852783,
"learning_rate": 8.720292504570384e-06,
"loss": 0.6655,
"step": 9710
},
{
"epoch": 3.553930530164534,
"grad_norm": 7.379239082336426,
"learning_rate": 8.69835466179159e-06,
"loss": 0.8186,
"step": 9720
},
{
"epoch": 3.5575868372943327,
"grad_norm": 5.922464847564697,
"learning_rate": 8.676416819012797e-06,
"loss": 0.7687,
"step": 9730
},
{
"epoch": 3.5612431444241315,
"grad_norm": 1.1867303848266602,
"learning_rate": 8.654478976234003e-06,
"loss": 0.7539,
"step": 9740
},
{
"epoch": 3.5648994515539307,
"grad_norm": 3.390425205230713,
"learning_rate": 8.63254113345521e-06,
"loss": 0.6565,
"step": 9750
},
{
"epoch": 3.5685557586837295,
"grad_norm": 3.1860547065734863,
"learning_rate": 8.610603290676417e-06,
"loss": 0.6849,
"step": 9760
},
{
"epoch": 3.5722120658135283,
"grad_norm": 2.4596757888793945,
"learning_rate": 8.588665447897625e-06,
"loss": 0.7395,
"step": 9770
},
{
"epoch": 3.575868372943327,
"grad_norm": 2.9441282749176025,
"learning_rate": 8.566727605118829e-06,
"loss": 0.5293,
"step": 9780
},
{
"epoch": 3.579524680073126,
"grad_norm": 1.4628350734710693,
"learning_rate": 8.544789762340037e-06,
"loss": 0.8293,
"step": 9790
},
{
"epoch": 3.583180987202925,
"grad_norm": 7.661937236785889,
"learning_rate": 8.522851919561243e-06,
"loss": 0.7429,
"step": 9800
},
{
"epoch": 3.586837294332724,
"grad_norm": 2.91107177734375,
"learning_rate": 8.500914076782451e-06,
"loss": 0.6905,
"step": 9810
},
{
"epoch": 3.590493601462523,
"grad_norm": 1.8382437229156494,
"learning_rate": 8.478976234003657e-06,
"loss": 0.7484,
"step": 9820
},
{
"epoch": 3.5941499085923216,
"grad_norm": 5.709616661071777,
"learning_rate": 8.457038391224863e-06,
"loss": 0.9248,
"step": 9830
},
{
"epoch": 3.5978062157221204,
"grad_norm": 4.454899311065674,
"learning_rate": 8.43510054844607e-06,
"loss": 0.6578,
"step": 9840
},
{
"epoch": 3.6014625228519197,
"grad_norm": 6.460973739624023,
"learning_rate": 8.413162705667276e-06,
"loss": 0.8633,
"step": 9850
},
{
"epoch": 3.6051188299817185,
"grad_norm": 2.352285146713257,
"learning_rate": 8.391224862888483e-06,
"loss": 0.6608,
"step": 9860
},
{
"epoch": 3.6087751371115173,
"grad_norm": 2.8091228008270264,
"learning_rate": 8.36928702010969e-06,
"loss": 0.642,
"step": 9870
},
{
"epoch": 3.612431444241316,
"grad_norm": 2.4271621704101562,
"learning_rate": 8.347349177330896e-06,
"loss": 0.5951,
"step": 9880
},
{
"epoch": 3.616087751371115,
"grad_norm": 5.804758548736572,
"learning_rate": 8.325411334552102e-06,
"loss": 0.6621,
"step": 9890
},
{
"epoch": 3.619744058500914,
"grad_norm": 3.8473427295684814,
"learning_rate": 8.30347349177331e-06,
"loss": 0.6347,
"step": 9900
},
{
"epoch": 3.623400365630713,
"grad_norm": 3.387230396270752,
"learning_rate": 8.281535648994516e-06,
"loss": 0.7107,
"step": 9910
},
{
"epoch": 3.627056672760512,
"grad_norm": 7.850528240203857,
"learning_rate": 8.259597806215724e-06,
"loss": 0.7608,
"step": 9920
},
{
"epoch": 3.630712979890311,
"grad_norm": 4.779109954833984,
"learning_rate": 8.237659963436928e-06,
"loss": 0.9004,
"step": 9930
},
{
"epoch": 3.6343692870201094,
"grad_norm": 7.75559139251709,
"learning_rate": 8.215722120658136e-06,
"loss": 0.9884,
"step": 9940
},
{
"epoch": 3.6380255941499087,
"grad_norm": 3.2816567420959473,
"learning_rate": 8.193784277879342e-06,
"loss": 0.9046,
"step": 9950
},
{
"epoch": 3.6416819012797075,
"grad_norm": 3.8553521633148193,
"learning_rate": 8.171846435100548e-06,
"loss": 0.6122,
"step": 9960
},
{
"epoch": 3.6453382084095063,
"grad_norm": 4.713034152984619,
"learning_rate": 8.149908592321756e-06,
"loss": 0.7977,
"step": 9970
},
{
"epoch": 3.6489945155393055,
"grad_norm": 8.331437110900879,
"learning_rate": 8.12797074954296e-06,
"loss": 0.7995,
"step": 9980
},
{
"epoch": 3.6526508226691043,
"grad_norm": 2.3194291591644287,
"learning_rate": 8.106032906764168e-06,
"loss": 0.9511,
"step": 9990
},
{
"epoch": 3.656307129798903,
"grad_norm": 5.6562676429748535,
"learning_rate": 8.084095063985374e-06,
"loss": 0.7415,
"step": 10000
},
{
"epoch": 3.659963436928702,
"grad_norm": 3.207094192504883,
"learning_rate": 8.062157221206582e-06,
"loss": 0.7371,
"step": 10010
},
{
"epoch": 3.6636197440585008,
"grad_norm": 5.320219993591309,
"learning_rate": 8.040219378427788e-06,
"loss": 0.8301,
"step": 10020
},
{
"epoch": 3.6672760511883,
"grad_norm": 3.936784505844116,
"learning_rate": 8.018281535648995e-06,
"loss": 0.759,
"step": 10030
},
{
"epoch": 3.670932358318099,
"grad_norm": 1.9420430660247803,
"learning_rate": 7.9963436928702e-06,
"loss": 0.9414,
"step": 10040
},
{
"epoch": 3.6745886654478976,
"grad_norm": 5.9929728507995605,
"learning_rate": 7.974405850091408e-06,
"loss": 0.6761,
"step": 10050
},
{
"epoch": 3.6782449725776964,
"grad_norm": 5.185636520385742,
"learning_rate": 7.952468007312615e-06,
"loss": 0.775,
"step": 10060
},
{
"epoch": 3.6819012797074953,
"grad_norm": 2.565422534942627,
"learning_rate": 7.930530164533822e-06,
"loss": 0.7599,
"step": 10070
},
{
"epoch": 3.6855575868372945,
"grad_norm": 6.941178321838379,
"learning_rate": 7.908592321755027e-06,
"loss": 0.6673,
"step": 10080
},
{
"epoch": 3.6892138939670933,
"grad_norm": 3.0745110511779785,
"learning_rate": 7.886654478976233e-06,
"loss": 0.685,
"step": 10090
},
{
"epoch": 3.692870201096892,
"grad_norm": 4.359233379364014,
"learning_rate": 7.864716636197441e-06,
"loss": 0.7306,
"step": 10100
},
{
"epoch": 3.696526508226691,
"grad_norm": 2.1655170917510986,
"learning_rate": 7.842778793418647e-06,
"loss": 0.6061,
"step": 10110
},
{
"epoch": 3.7001828153564897,
"grad_norm": 2.5100502967834473,
"learning_rate": 7.820840950639855e-06,
"loss": 0.5755,
"step": 10120
},
{
"epoch": 3.703839122486289,
"grad_norm": 2.577319383621216,
"learning_rate": 7.801096892138939e-06,
"loss": 0.8511,
"step": 10130
},
{
"epoch": 3.707495429616088,
"grad_norm": 4.023679733276367,
"learning_rate": 7.779159049360147e-06,
"loss": 0.9649,
"step": 10140
},
{
"epoch": 3.7111517367458866,
"grad_norm": 3.2172110080718994,
"learning_rate": 7.757221206581353e-06,
"loss": 0.9499,
"step": 10150
},
{
"epoch": 3.7148080438756854,
"grad_norm": 4.36275053024292,
"learning_rate": 7.73528336380256e-06,
"loss": 0.7763,
"step": 10160
},
{
"epoch": 3.7184643510054842,
"grad_norm": 4.072483062744141,
"learning_rate": 7.713345521023765e-06,
"loss": 0.7541,
"step": 10170
},
{
"epoch": 3.7221206581352835,
"grad_norm": 4.370612144470215,
"learning_rate": 7.691407678244973e-06,
"loss": 1.0629,
"step": 10180
},
{
"epoch": 3.7257769652650823,
"grad_norm": 3.0197012424468994,
"learning_rate": 7.669469835466179e-06,
"loss": 0.6874,
"step": 10190
},
{
"epoch": 3.729433272394881,
"grad_norm": 2.190140962600708,
"learning_rate": 7.647531992687387e-06,
"loss": 0.784,
"step": 10200
},
{
"epoch": 3.7330895795246803,
"grad_norm": 1.6328208446502686,
"learning_rate": 7.625594149908592e-06,
"loss": 0.7953,
"step": 10210
},
{
"epoch": 3.7367458866544787,
"grad_norm": 4.16575288772583,
"learning_rate": 7.6036563071298e-06,
"loss": 0.8314,
"step": 10220
},
{
"epoch": 3.740402193784278,
"grad_norm": 6.011321067810059,
"learning_rate": 7.581718464351006e-06,
"loss": 0.6144,
"step": 10230
},
{
"epoch": 3.7440585009140768,
"grad_norm": 4.7472710609436035,
"learning_rate": 7.559780621572211e-06,
"loss": 0.7889,
"step": 10240
},
{
"epoch": 3.7477148080438756,
"grad_norm": 2.6220803260803223,
"learning_rate": 7.537842778793419e-06,
"loss": 0.7036,
"step": 10250
},
{
"epoch": 3.751371115173675,
"grad_norm": 2.190154552459717,
"learning_rate": 7.5159049360146245e-06,
"loss": 0.9437,
"step": 10260
},
{
"epoch": 3.7550274223034736,
"grad_norm": 4.362695693969727,
"learning_rate": 7.493967093235832e-06,
"loss": 0.8136,
"step": 10270
},
{
"epoch": 3.7586837294332724,
"grad_norm": 3.7511837482452393,
"learning_rate": 7.472029250457039e-06,
"loss": 0.7365,
"step": 10280
},
{
"epoch": 3.7623400365630713,
"grad_norm": 2.2203571796417236,
"learning_rate": 7.450091407678245e-06,
"loss": 0.6625,
"step": 10290
},
{
"epoch": 3.76599634369287,
"grad_norm": 4.447721004486084,
"learning_rate": 7.4281535648994516e-06,
"loss": 0.8594,
"step": 10300
},
{
"epoch": 3.7696526508226693,
"grad_norm": 5.554366111755371,
"learning_rate": 7.406215722120658e-06,
"loss": 0.891,
"step": 10310
},
{
"epoch": 3.773308957952468,
"grad_norm": 5.551204681396484,
"learning_rate": 7.384277879341865e-06,
"loss": 0.6996,
"step": 10320
},
{
"epoch": 3.776965265082267,
"grad_norm": 2.1783394813537598,
"learning_rate": 7.362340036563072e-06,
"loss": 0.7032,
"step": 10330
},
{
"epoch": 3.7806215722120657,
"grad_norm": 2.8184330463409424,
"learning_rate": 7.340402193784278e-06,
"loss": 0.7221,
"step": 10340
},
{
"epoch": 3.7842778793418645,
"grad_norm": 3.869269609451294,
"learning_rate": 7.318464351005485e-06,
"loss": 0.8456,
"step": 10350
},
{
"epoch": 3.787934186471664,
"grad_norm": 1.1639561653137207,
"learning_rate": 7.296526508226691e-06,
"loss": 0.9325,
"step": 10360
},
{
"epoch": 3.7915904936014626,
"grad_norm": 2.3072006702423096,
"learning_rate": 7.274588665447898e-06,
"loss": 0.6822,
"step": 10370
},
{
"epoch": 3.7952468007312614,
"grad_norm": 4.453368186950684,
"learning_rate": 7.252650822669105e-06,
"loss": 0.6115,
"step": 10380
},
{
"epoch": 3.7989031078610602,
"grad_norm": 2.4103519916534424,
"learning_rate": 7.230712979890311e-06,
"loss": 0.7449,
"step": 10390
},
{
"epoch": 3.802559414990859,
"grad_norm": 5.65090274810791,
"learning_rate": 7.208775137111518e-06,
"loss": 0.9821,
"step": 10400
},
{
"epoch": 3.8062157221206583,
"grad_norm": 4.507080078125,
"learning_rate": 7.186837294332723e-06,
"loss": 0.7213,
"step": 10410
},
{
"epoch": 3.809872029250457,
"grad_norm": 2.4969277381896973,
"learning_rate": 7.16489945155393e-06,
"loss": 1.2508,
"step": 10420
},
{
"epoch": 3.813528336380256,
"grad_norm": 1.090476393699646,
"learning_rate": 7.142961608775137e-06,
"loss": 0.6157,
"step": 10430
},
{
"epoch": 3.8171846435100547,
"grad_norm": 2.3121488094329834,
"learning_rate": 7.1210237659963435e-06,
"loss": 0.816,
"step": 10440
},
{
"epoch": 3.8208409506398535,
"grad_norm": 3.1048355102539062,
"learning_rate": 7.0990859232175505e-06,
"loss": 0.5809,
"step": 10450
},
{
"epoch": 3.8244972577696528,
"grad_norm": 4.020531177520752,
"learning_rate": 7.077148080438757e-06,
"loss": 0.9782,
"step": 10460
},
{
"epoch": 3.8281535648994516,
"grad_norm": 3.6427266597747803,
"learning_rate": 7.055210237659964e-06,
"loss": 0.6038,
"step": 10470
},
{
"epoch": 3.8318098720292504,
"grad_norm": 4.342096328735352,
"learning_rate": 7.033272394881171e-06,
"loss": 1.2473,
"step": 10480
},
{
"epoch": 3.835466179159049,
"grad_norm": 3.162109136581421,
"learning_rate": 7.011334552102377e-06,
"loss": 0.6195,
"step": 10490
},
{
"epoch": 3.839122486288848,
"grad_norm": 2.9012115001678467,
"learning_rate": 6.989396709323584e-06,
"loss": 0.8822,
"step": 10500
},
{
"epoch": 3.8427787934186473,
"grad_norm": 6.881933212280273,
"learning_rate": 6.96745886654479e-06,
"loss": 0.8352,
"step": 10510
},
{
"epoch": 3.846435100548446,
"grad_norm": 6.350467681884766,
"learning_rate": 6.945521023765997e-06,
"loss": 0.9048,
"step": 10520
},
{
"epoch": 3.850091407678245,
"grad_norm": 2.833682060241699,
"learning_rate": 6.923583180987203e-06,
"loss": 0.8362,
"step": 10530
},
{
"epoch": 3.853747714808044,
"grad_norm": 5.460103511810303,
"learning_rate": 6.901645338208409e-06,
"loss": 0.5568,
"step": 10540
},
{
"epoch": 3.857404021937843,
"grad_norm": 2.551905870437622,
"learning_rate": 6.879707495429616e-06,
"loss": 0.8165,
"step": 10550
},
{
"epoch": 3.8610603290676417,
"grad_norm": 4.430031776428223,
"learning_rate": 6.857769652650823e-06,
"loss": 0.9246,
"step": 10560
},
{
"epoch": 3.8647166361974405,
"grad_norm": 2.5683767795562744,
"learning_rate": 6.835831809872029e-06,
"loss": 0.7689,
"step": 10570
},
{
"epoch": 3.8683729433272394,
"grad_norm": 2.5122482776641846,
"learning_rate": 6.813893967093236e-06,
"loss": 0.8126,
"step": 10580
},
{
"epoch": 3.8720292504570386,
"grad_norm": 3.8249447345733643,
"learning_rate": 6.791956124314442e-06,
"loss": 0.6062,
"step": 10590
},
{
"epoch": 3.8756855575868374,
"grad_norm": 3.5439441204071045,
"learning_rate": 6.770018281535649e-06,
"loss": 0.621,
"step": 10600
},
{
"epoch": 3.8793418647166362,
"grad_norm": 4.30275297164917,
"learning_rate": 6.748080438756856e-06,
"loss": 0.8285,
"step": 10610
},
{
"epoch": 3.882998171846435,
"grad_norm": 4.716472148895264,
"learning_rate": 6.7261425959780625e-06,
"loss": 0.5947,
"step": 10620
},
{
"epoch": 3.886654478976234,
"grad_norm": 5.69554328918457,
"learning_rate": 6.7042047531992695e-06,
"loss": 0.6366,
"step": 10630
},
{
"epoch": 3.890310786106033,
"grad_norm": 6.0481133460998535,
"learning_rate": 6.682266910420476e-06,
"loss": 0.6759,
"step": 10640
},
{
"epoch": 3.893967093235832,
"grad_norm": 5.054582118988037,
"learning_rate": 6.660329067641682e-06,
"loss": 0.9449,
"step": 10650
},
{
"epoch": 3.8976234003656307,
"grad_norm": 4.874343395233154,
"learning_rate": 6.638391224862889e-06,
"loss": 0.542,
"step": 10660
},
{
"epoch": 3.9012797074954295,
"grad_norm": 1.556717872619629,
"learning_rate": 6.616453382084095e-06,
"loss": 0.9563,
"step": 10670
},
{
"epoch": 3.9049360146252283,
"grad_norm": 1.8250552415847778,
"learning_rate": 6.594515539305302e-06,
"loss": 0.6699,
"step": 10680
},
{
"epoch": 3.9085923217550276,
"grad_norm": 1.2335312366485596,
"learning_rate": 6.572577696526508e-06,
"loss": 0.8833,
"step": 10690
},
{
"epoch": 3.9122486288848264,
"grad_norm": 4.553168296813965,
"learning_rate": 6.550639853747715e-06,
"loss": 0.7619,
"step": 10700
},
{
"epoch": 3.915904936014625,
"grad_norm": 5.518167495727539,
"learning_rate": 6.528702010968922e-06,
"loss": 0.9136,
"step": 10710
},
{
"epoch": 3.919561243144424,
"grad_norm": 4.577470302581787,
"learning_rate": 6.506764168190128e-06,
"loss": 0.8478,
"step": 10720
},
{
"epoch": 3.923217550274223,
"grad_norm": 5.852701663970947,
"learning_rate": 6.484826325411335e-06,
"loss": 0.9866,
"step": 10730
},
{
"epoch": 3.926873857404022,
"grad_norm": 2.7787961959838867,
"learning_rate": 6.462888482632541e-06,
"loss": 0.788,
"step": 10740
},
{
"epoch": 3.930530164533821,
"grad_norm": 5.320887565612793,
"learning_rate": 6.440950639853748e-06,
"loss": 0.8878,
"step": 10750
},
{
"epoch": 3.9341864716636197,
"grad_norm": 5.620364665985107,
"learning_rate": 6.419012797074955e-06,
"loss": 0.7458,
"step": 10760
},
{
"epoch": 3.9378427787934185,
"grad_norm": 4.398257732391357,
"learning_rate": 6.3970749542961605e-06,
"loss": 0.7291,
"step": 10770
},
{
"epoch": 3.9414990859232173,
"grad_norm": 1.9630357027053833,
"learning_rate": 6.3751371115173675e-06,
"loss": 0.8595,
"step": 10780
},
{
"epoch": 3.9451553930530165,
"grad_norm": 3.069357395172119,
"learning_rate": 6.353199268738574e-06,
"loss": 0.5498,
"step": 10790
},
{
"epoch": 3.9488117001828154,
"grad_norm": 8.382603645324707,
"learning_rate": 6.331261425959781e-06,
"loss": 1.0477,
"step": 10800
},
{
"epoch": 3.952468007312614,
"grad_norm": 2.2028815746307373,
"learning_rate": 6.309323583180988e-06,
"loss": 0.7652,
"step": 10810
},
{
"epoch": 3.9561243144424134,
"grad_norm": 5.587583541870117,
"learning_rate": 6.287385740402194e-06,
"loss": 0.777,
"step": 10820
},
{
"epoch": 3.9597806215722122,
"grad_norm": 4.032431602478027,
"learning_rate": 6.265447897623401e-06,
"loss": 0.7834,
"step": 10830
},
{
"epoch": 3.963436928702011,
"grad_norm": 3.680415630340576,
"learning_rate": 6.243510054844607e-06,
"loss": 0.5833,
"step": 10840
},
{
"epoch": 3.96709323583181,
"grad_norm": 2.4800500869750977,
"learning_rate": 6.221572212065814e-06,
"loss": 0.858,
"step": 10850
},
{
"epoch": 3.9707495429616086,
"grad_norm": 4.882104873657227,
"learning_rate": 6.199634369287021e-06,
"loss": 0.8244,
"step": 10860
},
{
"epoch": 3.974405850091408,
"grad_norm": 5.2901411056518555,
"learning_rate": 6.177696526508227e-06,
"loss": 0.8658,
"step": 10870
},
{
"epoch": 3.9780621572212067,
"grad_norm": 7.267496109008789,
"learning_rate": 6.155758683729433e-06,
"loss": 0.7934,
"step": 10880
},
{
"epoch": 3.9817184643510055,
"grad_norm": 5.89931058883667,
"learning_rate": 6.133820840950639e-06,
"loss": 0.7339,
"step": 10890
},
{
"epoch": 3.9853747714808043,
"grad_norm": 5.361083507537842,
"learning_rate": 6.111882998171846e-06,
"loss": 0.8397,
"step": 10900
},
{
"epoch": 3.989031078610603,
"grad_norm": 3.948314666748047,
"learning_rate": 6.089945155393053e-06,
"loss": 0.7595,
"step": 10910
},
{
"epoch": 3.9926873857404024,
"grad_norm": 1.0359902381896973,
"learning_rate": 6.068007312614259e-06,
"loss": 0.6643,
"step": 10920
},
{
"epoch": 3.996343692870201,
"grad_norm": 5.438472270965576,
"learning_rate": 6.046069469835466e-06,
"loss": 0.8025,
"step": 10930
},
{
"epoch": 4.0,
"grad_norm": 5.690487384796143,
"learning_rate": 6.0241316270566725e-06,
"loss": 0.8951,
"step": 10940
},
{
"epoch": 4.003656307129799,
"grad_norm": 6.72605562210083,
"learning_rate": 6.0021937842778795e-06,
"loss": 0.7852,
"step": 10950
},
{
"epoch": 4.007312614259598,
"grad_norm": 6.367304801940918,
"learning_rate": 5.9802559414990865e-06,
"loss": 0.9468,
"step": 10960
},
{
"epoch": 4.010968921389397,
"grad_norm": 4.209175109863281,
"learning_rate": 5.958318098720293e-06,
"loss": 0.7559,
"step": 10970
},
{
"epoch": 4.014625228519195,
"grad_norm": 2.612675428390503,
"learning_rate": 5.9363802559415e-06,
"loss": 0.6464,
"step": 10980
},
{
"epoch": 4.0182815356489945,
"grad_norm": 3.516434907913208,
"learning_rate": 5.914442413162706e-06,
"loss": 0.761,
"step": 10990
},
{
"epoch": 4.021937842778794,
"grad_norm": 3.522313117980957,
"learning_rate": 5.892504570383912e-06,
"loss": 0.9033,
"step": 11000
},
{
"epoch": 4.025594149908592,
"grad_norm": 3.2648613452911377,
"learning_rate": 5.870566727605119e-06,
"loss": 0.5933,
"step": 11010
},
{
"epoch": 4.029250457038391,
"grad_norm": 4.611745357513428,
"learning_rate": 5.848628884826325e-06,
"loss": 1.1358,
"step": 11020
},
{
"epoch": 4.03290676416819,
"grad_norm": 2.9652693271636963,
"learning_rate": 5.826691042047532e-06,
"loss": 0.7771,
"step": 11030
},
{
"epoch": 4.036563071297989,
"grad_norm": 4.490486145019531,
"learning_rate": 5.804753199268738e-06,
"loss": 0.8091,
"step": 11040
},
{
"epoch": 4.040219378427788,
"grad_norm": 4.961881637573242,
"learning_rate": 5.782815356489945e-06,
"loss": 0.7322,
"step": 11050
},
{
"epoch": 4.043875685557587,
"grad_norm": 4.714334011077881,
"learning_rate": 5.760877513711152e-06,
"loss": 0.6583,
"step": 11060
},
{
"epoch": 4.047531992687386,
"grad_norm": 4.1526570320129395,
"learning_rate": 5.738939670932358e-06,
"loss": 0.9548,
"step": 11070
},
{
"epoch": 4.051188299817184,
"grad_norm": 6.9063239097595215,
"learning_rate": 5.717001828153565e-06,
"loss": 1.1938,
"step": 11080
},
{
"epoch": 4.0548446069469835,
"grad_norm": 4.847267150878906,
"learning_rate": 5.6950639853747714e-06,
"loss": 0.7049,
"step": 11090
},
{
"epoch": 4.058500914076783,
"grad_norm": 4.072165489196777,
"learning_rate": 5.6731261425959784e-06,
"loss": 0.5814,
"step": 11100
},
{
"epoch": 4.062157221206581,
"grad_norm": 7.266864776611328,
"learning_rate": 5.651188299817185e-06,
"loss": 0.9592,
"step": 11110
},
{
"epoch": 4.06581352833638,
"grad_norm": 4.926406383514404,
"learning_rate": 5.629250457038391e-06,
"loss": 0.5798,
"step": 11120
},
{
"epoch": 4.06946983546618,
"grad_norm": 5.232889175415039,
"learning_rate": 5.607312614259598e-06,
"loss": 0.7774,
"step": 11130
},
{
"epoch": 4.073126142595978,
"grad_norm": 2.904597759246826,
"learning_rate": 5.585374771480805e-06,
"loss": 0.9523,
"step": 11140
},
{
"epoch": 4.076782449725777,
"grad_norm": 2.809514045715332,
"learning_rate": 5.563436928702011e-06,
"loss": 0.9262,
"step": 11150
},
{
"epoch": 4.0804387568555756,
"grad_norm": 3.8771932125091553,
"learning_rate": 5.541499085923218e-06,
"loss": 0.6266,
"step": 11160
},
{
"epoch": 4.084095063985375,
"grad_norm": 1.681246280670166,
"learning_rate": 5.519561243144424e-06,
"loss": 0.8984,
"step": 11170
},
{
"epoch": 4.087751371115174,
"grad_norm": 5.3567795753479,
"learning_rate": 5.497623400365631e-06,
"loss": 0.8853,
"step": 11180
},
{
"epoch": 4.091407678244972,
"grad_norm": 4.239979267120361,
"learning_rate": 5.475685557586838e-06,
"loss": 0.8099,
"step": 11190
},
{
"epoch": 4.095063985374772,
"grad_norm": 1.681639313697815,
"learning_rate": 5.453747714808044e-06,
"loss": 0.582,
"step": 11200
},
{
"epoch": 4.09872029250457,
"grad_norm": 5.81494140625,
"learning_rate": 5.431809872029251e-06,
"loss": 0.7281,
"step": 11210
},
{
"epoch": 4.102376599634369,
"grad_norm": 4.564912796020508,
"learning_rate": 5.409872029250457e-06,
"loss": 0.7389,
"step": 11220
},
{
"epoch": 4.1060329067641685,
"grad_norm": 3.4300804138183594,
"learning_rate": 5.387934186471664e-06,
"loss": 0.6513,
"step": 11230
},
{
"epoch": 4.109689213893967,
"grad_norm": 3.141324281692505,
"learning_rate": 5.36599634369287e-06,
"loss": 0.9758,
"step": 11240
},
{
"epoch": 4.113345521023766,
"grad_norm": 1.9948968887329102,
"learning_rate": 5.3440585009140765e-06,
"loss": 0.8143,
"step": 11250
},
{
"epoch": 4.1170018281535645,
"grad_norm": 0.9731669425964355,
"learning_rate": 5.3221206581352835e-06,
"loss": 0.6493,
"step": 11260
},
{
"epoch": 4.120658135283364,
"grad_norm": 3.1560752391815186,
"learning_rate": 5.30018281535649e-06,
"loss": 0.7211,
"step": 11270
},
{
"epoch": 4.124314442413163,
"grad_norm": 1.2107890844345093,
"learning_rate": 5.278244972577697e-06,
"loss": 0.7278,
"step": 11280
},
{
"epoch": 4.127970749542961,
"grad_norm": 1.0509763956069946,
"learning_rate": 5.256307129798904e-06,
"loss": 0.5858,
"step": 11290
},
{
"epoch": 4.131627056672761,
"grad_norm": 3.70460844039917,
"learning_rate": 5.23436928702011e-06,
"loss": 0.8024,
"step": 11300
},
{
"epoch": 4.135283363802559,
"grad_norm": 4.243873596191406,
"learning_rate": 5.212431444241317e-06,
"loss": 1.0335,
"step": 11310
},
{
"epoch": 4.138939670932358,
"grad_norm": 4.228180885314941,
"learning_rate": 5.190493601462523e-06,
"loss": 0.865,
"step": 11320
},
{
"epoch": 4.1425959780621575,
"grad_norm": 8.000550270080566,
"learning_rate": 5.16855575868373e-06,
"loss": 0.9858,
"step": 11330
},
{
"epoch": 4.146252285191956,
"grad_norm": 7.326601505279541,
"learning_rate": 5.146617915904937e-06,
"loss": 1.1155,
"step": 11340
},
{
"epoch": 4.149908592321755,
"grad_norm": 6.109528064727783,
"learning_rate": 5.124680073126143e-06,
"loss": 0.5274,
"step": 11350
},
{
"epoch": 4.153564899451554,
"grad_norm": 3.239499568939209,
"learning_rate": 5.102742230347349e-06,
"loss": 0.9185,
"step": 11360
},
{
"epoch": 4.157221206581353,
"grad_norm": 5.765626430511475,
"learning_rate": 5.080804387568555e-06,
"loss": 0.6147,
"step": 11370
},
{
"epoch": 4.160877513711152,
"grad_norm": 3.189391613006592,
"learning_rate": 5.058866544789762e-06,
"loss": 0.7233,
"step": 11380
},
{
"epoch": 4.16453382084095,
"grad_norm": 5.938801288604736,
"learning_rate": 5.036928702010969e-06,
"loss": 0.7033,
"step": 11390
},
{
"epoch": 4.16819012797075,
"grad_norm": 1.7474747896194458,
"learning_rate": 5.014990859232175e-06,
"loss": 0.644,
"step": 11400
},
{
"epoch": 4.171846435100549,
"grad_norm": 2.9664547443389893,
"learning_rate": 4.993053016453382e-06,
"loss": 0.7644,
"step": 11410
},
{
"epoch": 4.175502742230347,
"grad_norm": 3.7296855449676514,
"learning_rate": 4.9711151736745885e-06,
"loss": 0.8427,
"step": 11420
},
{
"epoch": 4.1791590493601465,
"grad_norm": 5.18561315536499,
"learning_rate": 4.9491773308957955e-06,
"loss": 0.6164,
"step": 11430
},
{
"epoch": 4.182815356489945,
"grad_norm": 4.443209648132324,
"learning_rate": 4.9272394881170025e-06,
"loss": 0.6032,
"step": 11440
},
{
"epoch": 4.186471663619744,
"grad_norm": 5.131235599517822,
"learning_rate": 4.905301645338209e-06,
"loss": 0.9402,
"step": 11450
},
{
"epoch": 4.190127970749543,
"grad_norm": 7.778912544250488,
"learning_rate": 4.883363802559416e-06,
"loss": 0.7508,
"step": 11460
},
{
"epoch": 4.193784277879342,
"grad_norm": 3.81158709526062,
"learning_rate": 4.861425959780622e-06,
"loss": 0.6463,
"step": 11470
},
{
"epoch": 4.197440585009141,
"grad_norm": 3.7394750118255615,
"learning_rate": 4.839488117001828e-06,
"loss": 0.8441,
"step": 11480
},
{
"epoch": 4.201096892138939,
"grad_norm": 5.460958003997803,
"learning_rate": 4.817550274223035e-06,
"loss": 0.758,
"step": 11490
},
{
"epoch": 4.204753199268739,
"grad_norm": 3.6687943935394287,
"learning_rate": 4.795612431444241e-06,
"loss": 0.6764,
"step": 11500
},
{
"epoch": 4.208409506398538,
"grad_norm": 5.314717769622803,
"learning_rate": 4.773674588665448e-06,
"loss": 0.599,
"step": 11510
},
{
"epoch": 4.212065813528336,
"grad_norm": 1.7225974798202515,
"learning_rate": 4.751736745886654e-06,
"loss": 0.7536,
"step": 11520
},
{
"epoch": 4.2157221206581355,
"grad_norm": 4.815572261810303,
"learning_rate": 4.729798903107861e-06,
"loss": 0.7072,
"step": 11530
},
{
"epoch": 4.219378427787934,
"grad_norm": 6.468070983886719,
"learning_rate": 4.707861060329068e-06,
"loss": 1.0721,
"step": 11540
},
{
"epoch": 4.223034734917733,
"grad_norm": 2.3022828102111816,
"learning_rate": 4.685923217550274e-06,
"loss": 0.5984,
"step": 11550
},
{
"epoch": 4.226691042047532,
"grad_norm": 6.993771553039551,
"learning_rate": 4.663985374771481e-06,
"loss": 1.0772,
"step": 11560
},
{
"epoch": 4.230347349177331,
"grad_norm": 3.061063766479492,
"learning_rate": 4.642047531992687e-06,
"loss": 0.5588,
"step": 11570
},
{
"epoch": 4.23400365630713,
"grad_norm": 1.7412669658660889,
"learning_rate": 4.620109689213894e-06,
"loss": 0.7227,
"step": 11580
},
{
"epoch": 4.237659963436928,
"grad_norm": 8.846871376037598,
"learning_rate": 4.598171846435101e-06,
"loss": 0.6635,
"step": 11590
},
{
"epoch": 4.2413162705667276,
"grad_norm": 17.051027297973633,
"learning_rate": 4.576234003656307e-06,
"loss": 0.9688,
"step": 11600
},
{
"epoch": 4.244972577696527,
"grad_norm": 8.541803359985352,
"learning_rate": 4.554296160877514e-06,
"loss": 0.8742,
"step": 11610
},
{
"epoch": 4.248628884826325,
"grad_norm": 6.193167686462402,
"learning_rate": 4.53235831809872e-06,
"loss": 0.6511,
"step": 11620
},
{
"epoch": 4.252285191956124,
"grad_norm": 7.520668983459473,
"learning_rate": 4.510420475319927e-06,
"loss": 0.6708,
"step": 11630
},
{
"epoch": 4.255941499085923,
"grad_norm": 4.377003192901611,
"learning_rate": 4.488482632541134e-06,
"loss": 0.5833,
"step": 11640
},
{
"epoch": 4.259597806215722,
"grad_norm": 8.407455444335938,
"learning_rate": 4.46654478976234e-06,
"loss": 0.6796,
"step": 11650
},
{
"epoch": 4.263254113345521,
"grad_norm": 6.994277477264404,
"learning_rate": 4.444606946983547e-06,
"loss": 0.6158,
"step": 11660
},
{
"epoch": 4.26691042047532,
"grad_norm": 4.836822032928467,
"learning_rate": 4.422669104204753e-06,
"loss": 0.5752,
"step": 11670
},
{
"epoch": 4.270566727605119,
"grad_norm": 8.016481399536133,
"learning_rate": 4.40073126142596e-06,
"loss": 0.6766,
"step": 11680
},
{
"epoch": 4.274223034734918,
"grad_norm": 7.545466423034668,
"learning_rate": 4.378793418647167e-06,
"loss": 0.756,
"step": 11690
},
{
"epoch": 4.2778793418647165,
"grad_norm": 6.334908485412598,
"learning_rate": 4.356855575868373e-06,
"loss": 0.6505,
"step": 11700
},
{
"epoch": 4.281535648994516,
"grad_norm": 8.154512405395508,
"learning_rate": 4.334917733089579e-06,
"loss": 0.7931,
"step": 11710
},
{
"epoch": 4.285191956124314,
"grad_norm": 6.061620712280273,
"learning_rate": 4.312979890310786e-06,
"loss": 0.7183,
"step": 11720
},
{
"epoch": 4.288848263254113,
"grad_norm": 8.32985782623291,
"learning_rate": 4.291042047531992e-06,
"loss": 0.6878,
"step": 11730
},
{
"epoch": 4.292504570383913,
"grad_norm": 5.723931312561035,
"learning_rate": 4.269104204753199e-06,
"loss": 0.7703,
"step": 11740
},
{
"epoch": 4.296160877513711,
"grad_norm": 8.518719673156738,
"learning_rate": 4.2471663619744055e-06,
"loss": 0.6322,
"step": 11750
},
{
"epoch": 4.29981718464351,
"grad_norm": 6.429286956787109,
"learning_rate": 4.2252285191956125e-06,
"loss": 0.6678,
"step": 11760
},
{
"epoch": 4.303473491773309,
"grad_norm": 6.832225799560547,
"learning_rate": 4.2032906764168195e-06,
"loss": 0.7779,
"step": 11770
},
{
"epoch": 4.307129798903108,
"grad_norm": 5.4020233154296875,
"learning_rate": 4.181352833638026e-06,
"loss": 0.6867,
"step": 11780
},
{
"epoch": 4.310786106032907,
"grad_norm": 5.374074459075928,
"learning_rate": 4.159414990859233e-06,
"loss": 0.6868,
"step": 11790
},
{
"epoch": 4.3144424131627055,
"grad_norm": 5.138687610626221,
"learning_rate": 4.137477148080439e-06,
"loss": 0.8428,
"step": 11800
},
{
"epoch": 4.318098720292505,
"grad_norm": 10.137980461120605,
"learning_rate": 4.115539305301646e-06,
"loss": 0.7087,
"step": 11810
},
{
"epoch": 4.321755027422303,
"grad_norm": 4.559932231903076,
"learning_rate": 4.093601462522853e-06,
"loss": 0.6856,
"step": 11820
},
{
"epoch": 4.325411334552102,
"grad_norm": 6.470888137817383,
"learning_rate": 4.071663619744058e-06,
"loss": 0.7017,
"step": 11830
},
{
"epoch": 4.329067641681902,
"grad_norm": 7.216504096984863,
"learning_rate": 4.049725776965265e-06,
"loss": 0.6752,
"step": 11840
},
{
"epoch": 4.3327239488117,
"grad_norm": 6.0090460777282715,
"learning_rate": 4.027787934186471e-06,
"loss": 0.8154,
"step": 11850
},
{
"epoch": 4.336380255941499,
"grad_norm": 9.844496726989746,
"learning_rate": 4.005850091407678e-06,
"loss": 0.7323,
"step": 11860
},
{
"epoch": 4.340036563071298,
"grad_norm": 10.084904670715332,
"learning_rate": 3.983912248628885e-06,
"loss": 0.8326,
"step": 11870
},
{
"epoch": 4.343692870201097,
"grad_norm": 8.137714385986328,
"learning_rate": 3.961974405850091e-06,
"loss": 0.7666,
"step": 11880
},
{
"epoch": 4.347349177330896,
"grad_norm": 5.626021385192871,
"learning_rate": 3.940036563071298e-06,
"loss": 0.7711,
"step": 11890
},
{
"epoch": 4.3510054844606945,
"grad_norm": 7.723333358764648,
"learning_rate": 3.9180987202925044e-06,
"loss": 0.6027,
"step": 11900
},
{
"epoch": 4.354661791590494,
"grad_norm": 7.431672096252441,
"learning_rate": 3.8961608775137114e-06,
"loss": 0.666,
"step": 11910
},
{
"epoch": 4.358318098720293,
"grad_norm": 6.387314319610596,
"learning_rate": 3.874223034734918e-06,
"loss": 0.7947,
"step": 11920
},
{
"epoch": 4.361974405850091,
"grad_norm": 9.410737991333008,
"learning_rate": 3.8522851919561246e-06,
"loss": 0.7556,
"step": 11930
},
{
"epoch": 4.365630712979891,
"grad_norm": 5.66964864730835,
"learning_rate": 3.8303473491773315e-06,
"loss": 0.7359,
"step": 11940
},
{
"epoch": 4.369287020109689,
"grad_norm": 10.612873077392578,
"learning_rate": 3.8084095063985373e-06,
"loss": 0.7017,
"step": 11950
},
{
"epoch": 4.372943327239488,
"grad_norm": 5.808506488800049,
"learning_rate": 3.786471663619744e-06,
"loss": 0.7297,
"step": 11960
},
{
"epoch": 4.376599634369287,
"grad_norm": 9.308990478515625,
"learning_rate": 3.7645338208409504e-06,
"loss": 0.8003,
"step": 11970
},
{
"epoch": 4.380255941499086,
"grad_norm": 7.967999458312988,
"learning_rate": 3.742595978062157e-06,
"loss": 0.6148,
"step": 11980
},
{
"epoch": 4.383912248628885,
"grad_norm": 5.871792316436768,
"learning_rate": 3.720658135283364e-06,
"loss": 0.6653,
"step": 11990
},
{
"epoch": 4.387568555758683,
"grad_norm": 7.684876918792725,
"learning_rate": 3.6987202925045705e-06,
"loss": 0.6063,
"step": 12000
},
{
"epoch": 4.391224862888483,
"grad_norm": 8.816610336303711,
"learning_rate": 3.676782449725777e-06,
"loss": 0.6981,
"step": 12010
},
{
"epoch": 4.394881170018282,
"grad_norm": 6.932671546936035,
"learning_rate": 3.6548446069469836e-06,
"loss": 0.763,
"step": 12020
},
{
"epoch": 4.39853747714808,
"grad_norm": 7.768485069274902,
"learning_rate": 3.63290676416819e-06,
"loss": 0.6659,
"step": 12030
},
{
"epoch": 4.4021937842778796,
"grad_norm": 6.058159828186035,
"learning_rate": 3.6109689213893968e-06,
"loss": 0.7142,
"step": 12040
},
{
"epoch": 4.405850091407678,
"grad_norm": 7.062812805175781,
"learning_rate": 3.5890310786106033e-06,
"loss": 0.8288,
"step": 12050
},
{
"epoch": 4.409506398537477,
"grad_norm": 10.744300842285156,
"learning_rate": 3.56709323583181e-06,
"loss": 0.8523,
"step": 12060
},
{
"epoch": 4.413162705667276,
"grad_norm": 5.199676036834717,
"learning_rate": 3.5451553930530165e-06,
"loss": 0.6737,
"step": 12070
},
{
"epoch": 4.416819012797075,
"grad_norm": 7.22199821472168,
"learning_rate": 3.5232175502742234e-06,
"loss": 0.6945,
"step": 12080
},
{
"epoch": 4.420475319926874,
"grad_norm": 7.236554145812988,
"learning_rate": 3.50127970749543e-06,
"loss": 0.7632,
"step": 12090
},
{
"epoch": 4.424131627056672,
"grad_norm": 5.387056350708008,
"learning_rate": 3.479341864716636e-06,
"loss": 0.795,
"step": 12100
},
{
"epoch": 4.427787934186472,
"grad_norm": 5.174760341644287,
"learning_rate": 3.4574040219378427e-06,
"loss": 0.6356,
"step": 12110
},
{
"epoch": 4.431444241316271,
"grad_norm": 8.171443939208984,
"learning_rate": 3.4354661791590493e-06,
"loss": 0.916,
"step": 12120
},
{
"epoch": 4.435100548446069,
"grad_norm": 6.220861911773682,
"learning_rate": 3.4157221206581357e-06,
"loss": 0.7167,
"step": 12130
},
{
"epoch": 4.4387568555758685,
"grad_norm": 4.111860275268555,
"learning_rate": 3.393784277879342e-06,
"loss": 0.7382,
"step": 12140
},
{
"epoch": 4.442413162705667,
"grad_norm": 9.302396774291992,
"learning_rate": 3.3718464351005484e-06,
"loss": 0.7702,
"step": 12150
},
{
"epoch": 4.446069469835466,
"grad_norm": 6.859189987182617,
"learning_rate": 3.349908592321755e-06,
"loss": 0.7118,
"step": 12160
},
{
"epoch": 4.449725776965265,
"grad_norm": 8.368714332580566,
"learning_rate": 3.327970749542962e-06,
"loss": 0.6512,
"step": 12170
},
{
"epoch": 4.453382084095064,
"grad_norm": 4.548081398010254,
"learning_rate": 3.3060329067641685e-06,
"loss": 0.5731,
"step": 12180
},
{
"epoch": 4.457038391224863,
"grad_norm": 6.483217239379883,
"learning_rate": 3.2840950639853746e-06,
"loss": 0.689,
"step": 12190
},
{
"epoch": 4.460694698354661,
"grad_norm": 6.644962310791016,
"learning_rate": 3.262157221206581e-06,
"loss": 0.544,
"step": 12200
},
{
"epoch": 4.464351005484461,
"grad_norm": 5.917163848876953,
"learning_rate": 3.2402193784277877e-06,
"loss": 0.6778,
"step": 12210
},
{
"epoch": 4.46800731261426,
"grad_norm": 8.300089836120605,
"learning_rate": 3.2182815356489947e-06,
"loss": 0.6243,
"step": 12220
},
{
"epoch": 4.471663619744058,
"grad_norm": 6.0708184242248535,
"learning_rate": 3.1963436928702013e-06,
"loss": 0.7093,
"step": 12230
},
{
"epoch": 4.4753199268738575,
"grad_norm": 7.4208526611328125,
"learning_rate": 3.174405850091408e-06,
"loss": 0.7837,
"step": 12240
},
{
"epoch": 4.478976234003657,
"grad_norm": 6.546789169311523,
"learning_rate": 3.152468007312614e-06,
"loss": 0.6736,
"step": 12250
},
{
"epoch": 4.482632541133455,
"grad_norm": 4.865387916564941,
"learning_rate": 3.130530164533821e-06,
"loss": 0.6906,
"step": 12260
},
{
"epoch": 4.486288848263254,
"grad_norm": 8.03560733795166,
"learning_rate": 3.1085923217550276e-06,
"loss": 0.8666,
"step": 12270
},
{
"epoch": 4.489945155393053,
"grad_norm": 7.61192512512207,
"learning_rate": 3.086654478976234e-06,
"loss": 0.754,
"step": 12280
},
{
"epoch": 4.493601462522852,
"grad_norm": 5.770723342895508,
"learning_rate": 3.0647166361974407e-06,
"loss": 0.7219,
"step": 12290
},
{
"epoch": 4.497257769652651,
"grad_norm": 10.299765586853027,
"learning_rate": 3.0427787934186473e-06,
"loss": 0.9244,
"step": 12300
},
{
"epoch": 4.50091407678245,
"grad_norm": 7.810846328735352,
"learning_rate": 3.020840950639854e-06,
"loss": 0.5936,
"step": 12310
},
{
"epoch": 4.504570383912249,
"grad_norm": 6.715174674987793,
"learning_rate": 2.9989031078610604e-06,
"loss": 0.7276,
"step": 12320
},
{
"epoch": 4.508226691042047,
"grad_norm": 8.37287712097168,
"learning_rate": 2.976965265082267e-06,
"loss": 0.6952,
"step": 12330
},
{
"epoch": 4.5118829981718465,
"grad_norm": 4.971324443817139,
"learning_rate": 2.9550274223034735e-06,
"loss": 0.7399,
"step": 12340
},
{
"epoch": 4.515539305301646,
"grad_norm": 4.20208740234375,
"learning_rate": 2.93308957952468e-06,
"loss": 0.6338,
"step": 12350
},
{
"epoch": 4.519195612431444,
"grad_norm": 7.0777177810668945,
"learning_rate": 2.911151736745887e-06,
"loss": 0.709,
"step": 12360
},
{
"epoch": 4.522851919561243,
"grad_norm": 4.147567272186279,
"learning_rate": 2.889213893967093e-06,
"loss": 0.8053,
"step": 12370
},
{
"epoch": 4.526508226691042,
"grad_norm": 6.275250434875488,
"learning_rate": 2.8672760511882998e-06,
"loss": 0.6594,
"step": 12380
},
{
"epoch": 4.530164533820841,
"grad_norm": 8.304201126098633,
"learning_rate": 2.8453382084095063e-06,
"loss": 0.686,
"step": 12390
},
{
"epoch": 4.53382084095064,
"grad_norm": 2.2459535598754883,
"learning_rate": 2.823400365630713e-06,
"loss": 0.6314,
"step": 12400
},
{
"epoch": 4.537477148080439,
"grad_norm": 7.190771102905273,
"learning_rate": 2.80146252285192e-06,
"loss": 0.7207,
"step": 12410
},
{
"epoch": 4.541133455210238,
"grad_norm": 5.161981105804443,
"learning_rate": 2.7795246800731265e-06,
"loss": 0.6005,
"step": 12420
},
{
"epoch": 4.544789762340036,
"grad_norm": 10.310462951660156,
"learning_rate": 2.7575868372943326e-06,
"loss": 0.7278,
"step": 12430
},
{
"epoch": 4.548446069469835,
"grad_norm": 5.244387626647949,
"learning_rate": 2.735648994515539e-06,
"loss": 0.7996,
"step": 12440
},
{
"epoch": 4.552102376599635,
"grad_norm": 7.997178554534912,
"learning_rate": 2.7137111517367457e-06,
"loss": 0.792,
"step": 12450
},
{
"epoch": 4.555758683729433,
"grad_norm": 7.470856666564941,
"learning_rate": 2.6917733089579527e-06,
"loss": 0.7353,
"step": 12460
},
{
"epoch": 4.559414990859232,
"grad_norm": 8.2652006149292,
"learning_rate": 2.6698354661791593e-06,
"loss": 0.7909,
"step": 12470
},
{
"epoch": 4.5630712979890315,
"grad_norm": 10.023780822753906,
"learning_rate": 2.647897623400366e-06,
"loss": 0.7524,
"step": 12480
},
{
"epoch": 4.56672760511883,
"grad_norm": 5.3603949546813965,
"learning_rate": 2.625959780621572e-06,
"loss": 0.6137,
"step": 12490
},
{
"epoch": 4.570383912248629,
"grad_norm": 10.119514465332031,
"learning_rate": 2.6040219378427785e-06,
"loss": 0.7993,
"step": 12500
},
{
"epoch": 4.5740402193784275,
"grad_norm": 7.202580451965332,
"learning_rate": 2.5820840950639855e-06,
"loss": 0.6839,
"step": 12510
},
{
"epoch": 4.577696526508227,
"grad_norm": 6.155348777770996,
"learning_rate": 2.560146252285192e-06,
"loss": 0.6466,
"step": 12520
},
{
"epoch": 4.581352833638025,
"grad_norm": 7.591714859008789,
"learning_rate": 2.5382084095063987e-06,
"loss": 0.6854,
"step": 12530
},
{
"epoch": 4.585009140767824,
"grad_norm": 8.901870727539062,
"learning_rate": 2.5162705667276052e-06,
"loss": 0.725,
"step": 12540
},
{
"epoch": 4.588665447897624,
"grad_norm": 4.959688663482666,
"learning_rate": 2.494332723948812e-06,
"loss": 0.648,
"step": 12550
},
{
"epoch": 4.592321755027422,
"grad_norm": 3.7066445350646973,
"learning_rate": 2.4723948811700184e-06,
"loss": 0.6049,
"step": 12560
},
{
"epoch": 4.595978062157221,
"grad_norm": 7.572804927825928,
"learning_rate": 2.450457038391225e-06,
"loss": 0.6417,
"step": 12570
},
{
"epoch": 4.5996343692870205,
"grad_norm": 6.546285152435303,
"learning_rate": 2.4285191956124315e-06,
"loss": 0.6253,
"step": 12580
},
{
"epoch": 4.603290676416819,
"grad_norm": 7.330362319946289,
"learning_rate": 2.406581352833638e-06,
"loss": 0.6836,
"step": 12590
},
{
"epoch": 4.606946983546618,
"grad_norm": 7.050893306732178,
"learning_rate": 2.384643510054845e-06,
"loss": 0.6751,
"step": 12600
},
{
"epoch": 4.6106032906764165,
"grad_norm": 8.03470516204834,
"learning_rate": 2.362705667276051e-06,
"loss": 0.7185,
"step": 12610
},
{
"epoch": 4.614259597806216,
"grad_norm": 10.927925109863281,
"learning_rate": 2.3407678244972577e-06,
"loss": 0.8281,
"step": 12620
},
{
"epoch": 4.617915904936015,
"grad_norm": 12.535290718078613,
"learning_rate": 2.3188299817184643e-06,
"loss": 0.8109,
"step": 12630
},
{
"epoch": 4.621572212065813,
"grad_norm": 10.733165740966797,
"learning_rate": 2.296892138939671e-06,
"loss": 0.7886,
"step": 12640
},
{
"epoch": 4.625228519195613,
"grad_norm": 6.554678916931152,
"learning_rate": 2.274954296160878e-06,
"loss": 0.7984,
"step": 12650
},
{
"epoch": 4.628884826325411,
"grad_norm": 6.383825302124023,
"learning_rate": 2.2530164533820844e-06,
"loss": 0.7802,
"step": 12660
},
{
"epoch": 4.63254113345521,
"grad_norm": 6.0972795486450195,
"learning_rate": 2.2310786106032906e-06,
"loss": 0.5982,
"step": 12670
},
{
"epoch": 4.6361974405850095,
"grad_norm": 5.708790302276611,
"learning_rate": 2.209140767824497e-06,
"loss": 0.8417,
"step": 12680
},
{
"epoch": 4.639853747714808,
"grad_norm": 5.925148963928223,
"learning_rate": 2.1872029250457037e-06,
"loss": 0.7847,
"step": 12690
},
{
"epoch": 4.643510054844607,
"grad_norm": 8.306936264038086,
"learning_rate": 2.1652650822669107e-06,
"loss": 0.6869,
"step": 12700
},
{
"epoch": 4.6471663619744055,
"grad_norm": 5.944139003753662,
"learning_rate": 2.1433272394881172e-06,
"loss": 0.7387,
"step": 12710
},
{
"epoch": 4.650822669104205,
"grad_norm": 11.451881408691406,
"learning_rate": 2.121389396709324e-06,
"loss": 0.7313,
"step": 12720
},
{
"epoch": 4.654478976234004,
"grad_norm": 7.1728715896606445,
"learning_rate": 2.09945155393053e-06,
"loss": 0.7783,
"step": 12730
},
{
"epoch": 4.658135283363802,
"grad_norm": 10.634977340698242,
"learning_rate": 2.0775137111517365e-06,
"loss": 0.7819,
"step": 12740
},
{
"epoch": 4.661791590493602,
"grad_norm": 5.473633766174316,
"learning_rate": 2.0555758683729435e-06,
"loss": 0.914,
"step": 12750
},
{
"epoch": 4.6654478976234,
"grad_norm": 7.64341926574707,
"learning_rate": 2.03363802559415e-06,
"loss": 0.6453,
"step": 12760
},
{
"epoch": 4.669104204753199,
"grad_norm": 7.986457347869873,
"learning_rate": 2.0117001828153566e-06,
"loss": 0.6979,
"step": 12770
},
{
"epoch": 4.6727605118829985,
"grad_norm": 7.322612762451172,
"learning_rate": 1.989762340036563e-06,
"loss": 0.8874,
"step": 12780
},
{
"epoch": 4.676416819012797,
"grad_norm": 7.666032314300537,
"learning_rate": 1.9678244972577698e-06,
"loss": 0.8391,
"step": 12790
},
{
"epoch": 4.680073126142596,
"grad_norm": 8.544524192810059,
"learning_rate": 1.9458866544789763e-06,
"loss": 0.7378,
"step": 12800
},
{
"epoch": 4.683729433272395,
"grad_norm": 9.552132606506348,
"learning_rate": 1.923948811700183e-06,
"loss": 0.6094,
"step": 12810
},
{
"epoch": 4.687385740402194,
"grad_norm": 8.779314994812012,
"learning_rate": 1.9020109689213895e-06,
"loss": 0.7032,
"step": 12820
},
{
"epoch": 4.691042047531993,
"grad_norm": 4.859720230102539,
"learning_rate": 1.8800731261425962e-06,
"loss": 0.795,
"step": 12830
},
{
"epoch": 4.694698354661791,
"grad_norm": 6.823448181152344,
"learning_rate": 1.8581352833638026e-06,
"loss": 0.6433,
"step": 12840
},
{
"epoch": 4.698354661791591,
"grad_norm": 6.933642387390137,
"learning_rate": 1.8361974405850092e-06,
"loss": 0.763,
"step": 12850
},
{
"epoch": 4.702010968921389,
"grad_norm": 7.405396938323975,
"learning_rate": 1.8142595978062157e-06,
"loss": 0.8466,
"step": 12860
},
{
"epoch": 4.705667276051188,
"grad_norm": 8.228802680969238,
"learning_rate": 1.7923217550274223e-06,
"loss": 0.8017,
"step": 12870
},
{
"epoch": 4.709323583180987,
"grad_norm": 5.067279815673828,
"learning_rate": 1.770383912248629e-06,
"loss": 0.6612,
"step": 12880
},
{
"epoch": 4.712979890310786,
"grad_norm": 7.058690547943115,
"learning_rate": 1.7484460694698354e-06,
"loss": 0.7171,
"step": 12890
},
{
"epoch": 4.716636197440585,
"grad_norm": 7.31235933303833,
"learning_rate": 1.7265082266910422e-06,
"loss": 0.6917,
"step": 12900
},
{
"epoch": 4.720292504570384,
"grad_norm": 7.289247989654541,
"learning_rate": 1.7045703839122487e-06,
"loss": 0.6917,
"step": 12910
},
{
"epoch": 4.723948811700183,
"grad_norm": 10.546690940856934,
"learning_rate": 1.682632541133455e-06,
"loss": 0.718,
"step": 12920
},
{
"epoch": 4.727605118829982,
"grad_norm": 6.604415416717529,
"learning_rate": 1.6606946983546619e-06,
"loss": 0.7037,
"step": 12930
},
{
"epoch": 4.73126142595978,
"grad_norm": 5.056285381317139,
"learning_rate": 1.6387568555758684e-06,
"loss": 0.6712,
"step": 12940
},
{
"epoch": 4.7349177330895795,
"grad_norm": 6.835060119628906,
"learning_rate": 1.616819012797075e-06,
"loss": 0.8142,
"step": 12950
},
{
"epoch": 4.738574040219379,
"grad_norm": 7.166338920593262,
"learning_rate": 1.5948811700182816e-06,
"loss": 0.6812,
"step": 12960
},
{
"epoch": 4.742230347349177,
"grad_norm": 8.841276168823242,
"learning_rate": 1.5729433272394881e-06,
"loss": 0.695,
"step": 12970
},
{
"epoch": 4.745886654478976,
"grad_norm": 6.730128288269043,
"learning_rate": 1.5510054844606947e-06,
"loss": 0.6517,
"step": 12980
},
{
"epoch": 4.749542961608775,
"grad_norm": 6.670187473297119,
"learning_rate": 1.5290676416819013e-06,
"loss": 0.8538,
"step": 12990
},
{
"epoch": 4.753199268738574,
"grad_norm": 5.65201997756958,
"learning_rate": 1.507129798903108e-06,
"loss": 0.5967,
"step": 13000
}
],
"logging_steps": 10,
"max_steps": 13675,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}