{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5949,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016809547823163557,
"grad_norm": 79552.359375,
"learning_rate": 4.9915952260884186e-05,
"loss": 1.0656,
"step": 10
},
{
"epoch": 0.0033619095646327114,
"grad_norm": 79056.6640625,
"learning_rate": 4.983190452176837e-05,
"loss": 1.0215,
"step": 20
},
{
"epoch": 0.005042864346949067,
"grad_norm": 85323.828125,
"learning_rate": 4.9747856782652546e-05,
"loss": 0.9578,
"step": 30
},
{
"epoch": 0.006723819129265423,
"grad_norm": 71872.125,
"learning_rate": 4.966380904353673e-05,
"loss": 0.9151,
"step": 40
},
{
"epoch": 0.008404773911581778,
"grad_norm": 71638.46875,
"learning_rate": 4.957976130442091e-05,
"loss": 0.8803,
"step": 50
},
{
"epoch": 0.010085728693898134,
"grad_norm": 74304.5234375,
"learning_rate": 4.9495713565305096e-05,
"loss": 0.9141,
"step": 60
},
{
"epoch": 0.01176668347621449,
"grad_norm": 73676.1953125,
"learning_rate": 4.941166582618928e-05,
"loss": 0.9949,
"step": 70
},
{
"epoch": 0.013447638258530846,
"grad_norm": 76955.7890625,
"learning_rate": 4.9327618087073463e-05,
"loss": 0.8852,
"step": 80
},
{
"epoch": 0.015128593040847202,
"grad_norm": 69415.2578125,
"learning_rate": 4.924357034795764e-05,
"loss": 0.8891,
"step": 90
},
{
"epoch": 0.016809547823163556,
"grad_norm": 84991.7734375,
"learning_rate": 4.9159522608841824e-05,
"loss": 0.939,
"step": 100
},
{
"epoch": 0.01849050260547991,
"grad_norm": 72803.203125,
"learning_rate": 4.907547486972601e-05,
"loss": 0.8879,
"step": 110
},
{
"epoch": 0.020171457387796268,
"grad_norm": 73581.6640625,
"learning_rate": 4.899142713061019e-05,
"loss": 0.923,
"step": 120
},
{
"epoch": 0.021852412170112624,
"grad_norm": 80610.7421875,
"learning_rate": 4.8907379391494374e-05,
"loss": 0.8933,
"step": 130
},
{
"epoch": 0.02353336695242898,
"grad_norm": 80112.125,
"learning_rate": 4.882333165237855e-05,
"loss": 0.8761,
"step": 140
},
{
"epoch": 0.025214321734745335,
"grad_norm": 73625.0546875,
"learning_rate": 4.8739283913262734e-05,
"loss": 0.8442,
"step": 150
},
{
"epoch": 0.02689527651706169,
"grad_norm": 79199.546875,
"learning_rate": 4.865523617414692e-05,
"loss": 0.9312,
"step": 160
},
{
"epoch": 0.028576231299378047,
"grad_norm": 69096.6484375,
"learning_rate": 4.85711884350311e-05,
"loss": 0.9435,
"step": 170
},
{
"epoch": 0.030257186081694403,
"grad_norm": 66993.53125,
"learning_rate": 4.8487140695915285e-05,
"loss": 0.8933,
"step": 180
},
{
"epoch": 0.031938140864010756,
"grad_norm": 72366.515625,
"learning_rate": 4.840309295679946e-05,
"loss": 0.8513,
"step": 190
},
{
"epoch": 0.03361909564632711,
"grad_norm": 70981.46875,
"learning_rate": 4.8319045217683645e-05,
"loss": 0.9126,
"step": 200
},
{
"epoch": 0.03530005042864347,
"grad_norm": 73961.296875,
"learning_rate": 4.823499747856783e-05,
"loss": 0.908,
"step": 210
},
{
"epoch": 0.03698100521095982,
"grad_norm": 71402.046875,
"learning_rate": 4.815094973945201e-05,
"loss": 0.915,
"step": 220
},
{
"epoch": 0.03866195999327618,
"grad_norm": 69597.8984375,
"learning_rate": 4.8066902000336195e-05,
"loss": 0.892,
"step": 230
},
{
"epoch": 0.040342914775592535,
"grad_norm": 77749.3359375,
"learning_rate": 4.798285426122038e-05,
"loss": 0.9004,
"step": 240
},
{
"epoch": 0.04202386955790889,
"grad_norm": 69176.328125,
"learning_rate": 4.7898806522104555e-05,
"loss": 0.9878,
"step": 250
},
{
"epoch": 0.04370482434022525,
"grad_norm": 76940.0625,
"learning_rate": 4.781475878298874e-05,
"loss": 0.8873,
"step": 260
},
{
"epoch": 0.0453857791225416,
"grad_norm": 81460.0546875,
"learning_rate": 4.773071104387292e-05,
"loss": 0.9022,
"step": 270
},
{
"epoch": 0.04706673390485796,
"grad_norm": 69520.1953125,
"learning_rate": 4.7646663304757106e-05,
"loss": 0.8729,
"step": 280
},
{
"epoch": 0.048747688687174315,
"grad_norm": 76301.21875,
"learning_rate": 4.756261556564129e-05,
"loss": 0.9464,
"step": 290
},
{
"epoch": 0.05042864346949067,
"grad_norm": 72831.2265625,
"learning_rate": 4.7478567826525466e-05,
"loss": 0.8346,
"step": 300
},
{
"epoch": 0.05210959825180703,
"grad_norm": 73389.6328125,
"learning_rate": 4.739452008740965e-05,
"loss": 0.8794,
"step": 310
},
{
"epoch": 0.05379055303412338,
"grad_norm": 78976.9453125,
"learning_rate": 4.731047234829383e-05,
"loss": 0.9318,
"step": 320
},
{
"epoch": 0.05547150781643974,
"grad_norm": 72745.15625,
"learning_rate": 4.7226424609178016e-05,
"loss": 0.923,
"step": 330
},
{
"epoch": 0.057152462598756094,
"grad_norm": 65032.38671875,
"learning_rate": 4.71423768700622e-05,
"loss": 0.9431,
"step": 340
},
{
"epoch": 0.05883341738107245,
"grad_norm": 67867.875,
"learning_rate": 4.705832913094638e-05,
"loss": 0.853,
"step": 350
},
{
"epoch": 0.060514372163388806,
"grad_norm": 132520.328125,
"learning_rate": 4.697428139183056e-05,
"loss": 0.8772,
"step": 360
},
{
"epoch": 0.06219532694570516,
"grad_norm": 71777.140625,
"learning_rate": 4.6890233652714743e-05,
"loss": 0.841,
"step": 370
},
{
"epoch": 0.06387628172802151,
"grad_norm": 71626.65625,
"learning_rate": 4.680618591359893e-05,
"loss": 0.8281,
"step": 380
},
{
"epoch": 0.06555723651033787,
"grad_norm": 66127.0703125,
"learning_rate": 4.672213817448311e-05,
"loss": 0.9221,
"step": 390
},
{
"epoch": 0.06723819129265422,
"grad_norm": 66148.8203125,
"learning_rate": 4.6638090435367294e-05,
"loss": 0.8607,
"step": 400
},
{
"epoch": 0.06891914607497059,
"grad_norm": 75503.5078125,
"learning_rate": 4.655404269625147e-05,
"loss": 0.9062,
"step": 410
},
{
"epoch": 0.07060010085728693,
"grad_norm": 61276.5625,
"learning_rate": 4.6469994957135654e-05,
"loss": 0.8669,
"step": 420
},
{
"epoch": 0.0722810556396033,
"grad_norm": 71196.9453125,
"learning_rate": 4.638594721801984e-05,
"loss": 0.8686,
"step": 430
},
{
"epoch": 0.07396201042191965,
"grad_norm": 66918.8671875,
"learning_rate": 4.630189947890402e-05,
"loss": 0.9121,
"step": 440
},
{
"epoch": 0.07564296520423601,
"grad_norm": 64304.73828125,
"learning_rate": 4.6217851739788204e-05,
"loss": 0.8663,
"step": 450
},
{
"epoch": 0.07732391998655236,
"grad_norm": 62031.97265625,
"learning_rate": 4.613380400067238e-05,
"loss": 0.8258,
"step": 460
},
{
"epoch": 0.07900487476886872,
"grad_norm": 63083.9140625,
"learning_rate": 4.6049756261556565e-05,
"loss": 0.8657,
"step": 470
},
{
"epoch": 0.08068582955118507,
"grad_norm": 67486.75,
"learning_rate": 4.596570852244075e-05,
"loss": 0.9225,
"step": 480
},
{
"epoch": 0.08236678433350143,
"grad_norm": 68711.7890625,
"learning_rate": 4.588166078332493e-05,
"loss": 0.8172,
"step": 490
},
{
"epoch": 0.08404773911581778,
"grad_norm": 69319.46875,
"learning_rate": 4.5797613044209115e-05,
"loss": 0.8034,
"step": 500
},
{
"epoch": 0.08572869389813415,
"grad_norm": 67749.515625,
"learning_rate": 4.57135653050933e-05,
"loss": 0.9285,
"step": 510
},
{
"epoch": 0.0874096486804505,
"grad_norm": 78357.3203125,
"learning_rate": 4.5629517565977475e-05,
"loss": 0.8566,
"step": 520
},
{
"epoch": 0.08909060346276686,
"grad_norm": 59291.59375,
"learning_rate": 4.554546982686166e-05,
"loss": 0.8088,
"step": 530
},
{
"epoch": 0.0907715582450832,
"grad_norm": 72898.515625,
"learning_rate": 4.546142208774584e-05,
"loss": 0.8411,
"step": 540
},
{
"epoch": 0.09245251302739957,
"grad_norm": 65205.4296875,
"learning_rate": 4.5377374348630026e-05,
"loss": 0.9546,
"step": 550
},
{
"epoch": 0.09413346780971592,
"grad_norm": 104009.6953125,
"learning_rate": 4.529332660951421e-05,
"loss": 0.8406,
"step": 560
},
{
"epoch": 0.09581442259203228,
"grad_norm": 68204.453125,
"learning_rate": 4.5209278870398386e-05,
"loss": 0.897,
"step": 570
},
{
"epoch": 0.09749537737434863,
"grad_norm": 70454.1484375,
"learning_rate": 4.512523113128257e-05,
"loss": 0.839,
"step": 580
},
{
"epoch": 0.09917633215666499,
"grad_norm": 67070.96875,
"learning_rate": 4.504118339216675e-05,
"loss": 0.9008,
"step": 590
},
{
"epoch": 0.10085728693898134,
"grad_norm": 70355.5234375,
"learning_rate": 4.4957135653050936e-05,
"loss": 0.9318,
"step": 600
},
{
"epoch": 0.10253824172129769,
"grad_norm": 68497.7109375,
"learning_rate": 4.487308791393512e-05,
"loss": 0.9127,
"step": 610
},
{
"epoch": 0.10421919650361405,
"grad_norm": 81705.2578125,
"learning_rate": 4.47890401748193e-05,
"loss": 0.8575,
"step": 620
},
{
"epoch": 0.1059001512859304,
"grad_norm": 72572.75,
"learning_rate": 4.470499243570348e-05,
"loss": 0.8387,
"step": 630
},
{
"epoch": 0.10758110606824677,
"grad_norm": 66429.328125,
"learning_rate": 4.462094469658766e-05,
"loss": 0.8322,
"step": 640
},
{
"epoch": 0.10926206085056311,
"grad_norm": 73876.7265625,
"learning_rate": 4.453689695747185e-05,
"loss": 0.8788,
"step": 650
},
{
"epoch": 0.11094301563287948,
"grad_norm": 73238.40625,
"learning_rate": 4.445284921835603e-05,
"loss": 0.8983,
"step": 660
},
{
"epoch": 0.11262397041519583,
"grad_norm": 65609.2578125,
"learning_rate": 4.4368801479240214e-05,
"loss": 0.8406,
"step": 670
},
{
"epoch": 0.11430492519751219,
"grad_norm": 79022.8515625,
"learning_rate": 4.428475374012439e-05,
"loss": 0.8204,
"step": 680
},
{
"epoch": 0.11598587997982854,
"grad_norm": 63224.625,
"learning_rate": 4.4200706001008574e-05,
"loss": 0.8252,
"step": 690
},
{
"epoch": 0.1176668347621449,
"grad_norm": 61681.51171875,
"learning_rate": 4.411665826189276e-05,
"loss": 0.9041,
"step": 700
},
{
"epoch": 0.11934778954446125,
"grad_norm": 65631.578125,
"learning_rate": 4.403261052277694e-05,
"loss": 0.8541,
"step": 710
},
{
"epoch": 0.12102874432677761,
"grad_norm": 70977.875,
"learning_rate": 4.3948562783661124e-05,
"loss": 0.9705,
"step": 720
},
{
"epoch": 0.12270969910909396,
"grad_norm": 67375.078125,
"learning_rate": 4.38645150445453e-05,
"loss": 0.8542,
"step": 730
},
{
"epoch": 0.12439065389141032,
"grad_norm": 60750.97265625,
"learning_rate": 4.3780467305429484e-05,
"loss": 0.7832,
"step": 740
},
{
"epoch": 0.1260716086737267,
"grad_norm": 62710.984375,
"learning_rate": 4.369641956631367e-05,
"loss": 0.855,
"step": 750
},
{
"epoch": 0.12775256345604302,
"grad_norm": 56731.50390625,
"learning_rate": 4.361237182719785e-05,
"loss": 0.8591,
"step": 760
},
{
"epoch": 0.12943351823835939,
"grad_norm": 67084.78125,
"learning_rate": 4.3528324088082035e-05,
"loss": 0.8406,
"step": 770
},
{
"epoch": 0.13111447302067575,
"grad_norm": 66152.15625,
"learning_rate": 4.344427634896622e-05,
"loss": 0.9062,
"step": 780
},
{
"epoch": 0.1327954278029921,
"grad_norm": 71438.4765625,
"learning_rate": 4.3360228609850395e-05,
"loss": 0.8206,
"step": 790
},
{
"epoch": 0.13447638258530845,
"grad_norm": 60924.875,
"learning_rate": 4.327618087073458e-05,
"loss": 0.8164,
"step": 800
},
{
"epoch": 0.1361573373676248,
"grad_norm": 70178.5390625,
"learning_rate": 4.319213313161876e-05,
"loss": 0.8303,
"step": 810
},
{
"epoch": 0.13783829214994117,
"grad_norm": 64838.84765625,
"learning_rate": 4.3108085392502945e-05,
"loss": 0.8503,
"step": 820
},
{
"epoch": 0.13951924693225753,
"grad_norm": 72861.828125,
"learning_rate": 4.302403765338713e-05,
"loss": 0.9165,
"step": 830
},
{
"epoch": 0.14120020171457387,
"grad_norm": 74359.4140625,
"learning_rate": 4.2939989914271306e-05,
"loss": 0.9102,
"step": 840
},
{
"epoch": 0.14288115649689023,
"grad_norm": 70118.2109375,
"learning_rate": 4.285594217515549e-05,
"loss": 0.8703,
"step": 850
},
{
"epoch": 0.1445621112792066,
"grad_norm": 63720.28125,
"learning_rate": 4.277189443603967e-05,
"loss": 0.893,
"step": 860
},
{
"epoch": 0.14624306606152296,
"grad_norm": 77761.90625,
"learning_rate": 4.2687846696923856e-05,
"loss": 0.9613,
"step": 870
},
{
"epoch": 0.1479240208438393,
"grad_norm": 64969.37890625,
"learning_rate": 4.260379895780804e-05,
"loss": 0.8691,
"step": 880
},
{
"epoch": 0.14960497562615566,
"grad_norm": 70299.8203125,
"learning_rate": 4.2519751218692216e-05,
"loss": 0.8719,
"step": 890
},
{
"epoch": 0.15128593040847202,
"grad_norm": 65577.1171875,
"learning_rate": 4.24357034795764e-05,
"loss": 0.8554,
"step": 900
},
{
"epoch": 0.15296688519078835,
"grad_norm": 71740.8671875,
"learning_rate": 4.235165574046058e-05,
"loss": 0.9948,
"step": 910
},
{
"epoch": 0.15464783997310472,
"grad_norm": 59819.0234375,
"learning_rate": 4.226760800134477e-05,
"loss": 0.8605,
"step": 920
},
{
"epoch": 0.15632879475542108,
"grad_norm": 57238.7109375,
"learning_rate": 4.218356026222895e-05,
"loss": 0.8281,
"step": 930
},
{
"epoch": 0.15800974953773744,
"grad_norm": 70315.765625,
"learning_rate": 4.2099512523113134e-05,
"loss": 0.9126,
"step": 940
},
{
"epoch": 0.15969070432005378,
"grad_norm": 63374.6796875,
"learning_rate": 4.201546478399731e-05,
"loss": 0.8744,
"step": 950
},
{
"epoch": 0.16137165910237014,
"grad_norm": 75860.09375,
"learning_rate": 4.1931417044881494e-05,
"loss": 0.8078,
"step": 960
},
{
"epoch": 0.1630526138846865,
"grad_norm": 58454.1484375,
"learning_rate": 4.184736930576568e-05,
"loss": 0.8084,
"step": 970
},
{
"epoch": 0.16473356866700287,
"grad_norm": 68843.625,
"learning_rate": 4.176332156664986e-05,
"loss": 0.8458,
"step": 980
},
{
"epoch": 0.1664145234493192,
"grad_norm": 58082.43359375,
"learning_rate": 4.1679273827534044e-05,
"loss": 0.8148,
"step": 990
},
{
"epoch": 0.16809547823163556,
"grad_norm": 58417.78125,
"learning_rate": 4.159522608841822e-05,
"loss": 0.8373,
"step": 1000
},
{
"epoch": 0.16977643301395193,
"grad_norm": 61807.9140625,
"learning_rate": 4.1511178349302404e-05,
"loss": 0.8633,
"step": 1010
},
{
"epoch": 0.1714573877962683,
"grad_norm": 61544.55078125,
"learning_rate": 4.142713061018659e-05,
"loss": 0.7921,
"step": 1020
},
{
"epoch": 0.17313834257858463,
"grad_norm": 78909.9921875,
"learning_rate": 4.134308287107077e-05,
"loss": 0.8422,
"step": 1030
},
{
"epoch": 0.174819297360901,
"grad_norm": 75892.40625,
"learning_rate": 4.1259035131954955e-05,
"loss": 0.8307,
"step": 1040
},
{
"epoch": 0.17650025214321735,
"grad_norm": 62162.703125,
"learning_rate": 4.117498739283914e-05,
"loss": 0.8243,
"step": 1050
},
{
"epoch": 0.1781812069255337,
"grad_norm": 66981.4609375,
"learning_rate": 4.1090939653723315e-05,
"loss": 0.819,
"step": 1060
},
{
"epoch": 0.17986216170785005,
"grad_norm": 69593.6796875,
"learning_rate": 4.10068919146075e-05,
"loss": 0.8817,
"step": 1070
},
{
"epoch": 0.1815431164901664,
"grad_norm": 67435.6640625,
"learning_rate": 4.092284417549168e-05,
"loss": 0.8077,
"step": 1080
},
{
"epoch": 0.18322407127248277,
"grad_norm": 66041.1640625,
"learning_rate": 4.0838796436375865e-05,
"loss": 0.841,
"step": 1090
},
{
"epoch": 0.18490502605479914,
"grad_norm": 65792.2890625,
"learning_rate": 4.075474869726005e-05,
"loss": 0.8742,
"step": 1100
},
{
"epoch": 0.18658598083711547,
"grad_norm": 61461.2265625,
"learning_rate": 4.0670700958144226e-05,
"loss": 0.8381,
"step": 1110
},
{
"epoch": 0.18826693561943184,
"grad_norm": 62316.87109375,
"learning_rate": 4.058665321902841e-05,
"loss": 0.8629,
"step": 1120
},
{
"epoch": 0.1899478904017482,
"grad_norm": 60252.7421875,
"learning_rate": 4.050260547991259e-05,
"loss": 0.8003,
"step": 1130
},
{
"epoch": 0.19162884518406456,
"grad_norm": 133503.328125,
"learning_rate": 4.0418557740796776e-05,
"loss": 0.8469,
"step": 1140
},
{
"epoch": 0.1933097999663809,
"grad_norm": 64984.43359375,
"learning_rate": 4.033451000168096e-05,
"loss": 0.8024,
"step": 1150
},
{
"epoch": 0.19499075474869726,
"grad_norm": 64012.15234375,
"learning_rate": 4.0250462262565136e-05,
"loss": 0.8655,
"step": 1160
},
{
"epoch": 0.19667170953101362,
"grad_norm": 80887.5859375,
"learning_rate": 4.016641452344932e-05,
"loss": 0.8273,
"step": 1170
},
{
"epoch": 0.19835266431332998,
"grad_norm": 67626.34375,
"learning_rate": 4.00823667843335e-05,
"loss": 0.7986,
"step": 1180
},
{
"epoch": 0.20003361909564632,
"grad_norm": 71404.109375,
"learning_rate": 3.9998319045217687e-05,
"loss": 0.796,
"step": 1190
},
{
"epoch": 0.20171457387796268,
"grad_norm": 69063.609375,
"learning_rate": 3.991427130610187e-05,
"loss": 0.8819,
"step": 1200
},
{
"epoch": 0.20339552866027905,
"grad_norm": 64813.16796875,
"learning_rate": 3.9830223566986053e-05,
"loss": 0.8451,
"step": 1210
},
{
"epoch": 0.20507648344259538,
"grad_norm": 76470.9765625,
"learning_rate": 3.974617582787023e-05,
"loss": 0.8079,
"step": 1220
},
{
"epoch": 0.20675743822491174,
"grad_norm": 57304.13671875,
"learning_rate": 3.9662128088754414e-05,
"loss": 0.7686,
"step": 1230
},
{
"epoch": 0.2084383930072281,
"grad_norm": 58822.703125,
"learning_rate": 3.95780803496386e-05,
"loss": 0.8457,
"step": 1240
},
{
"epoch": 0.21011934778954447,
"grad_norm": 61174.65234375,
"learning_rate": 3.949403261052278e-05,
"loss": 0.752,
"step": 1250
},
{
"epoch": 0.2118003025718608,
"grad_norm": 72221.9296875,
"learning_rate": 3.9409984871406964e-05,
"loss": 0.8431,
"step": 1260
},
{
"epoch": 0.21348125735417717,
"grad_norm": 64120.45703125,
"learning_rate": 3.932593713229114e-05,
"loss": 0.8505,
"step": 1270
},
{
"epoch": 0.21516221213649353,
"grad_norm": 67320.4140625,
"learning_rate": 3.9241889393175324e-05,
"loss": 0.9042,
"step": 1280
},
{
"epoch": 0.2168431669188099,
"grad_norm": 64738.58203125,
"learning_rate": 3.915784165405951e-05,
"loss": 0.8681,
"step": 1290
},
{
"epoch": 0.21852412170112623,
"grad_norm": 87432.8515625,
"learning_rate": 3.907379391494369e-05,
"loss": 0.8661,
"step": 1300
},
{
"epoch": 0.2202050764834426,
"grad_norm": 56789.4453125,
"learning_rate": 3.8989746175827875e-05,
"loss": 0.8058,
"step": 1310
},
{
"epoch": 0.22188603126575895,
"grad_norm": 55766.0234375,
"learning_rate": 3.890569843671206e-05,
"loss": 0.8627,
"step": 1320
},
{
"epoch": 0.22356698604807532,
"grad_norm": 68595.6640625,
"learning_rate": 3.8821650697596235e-05,
"loss": 0.8299,
"step": 1330
},
{
"epoch": 0.22524794083039165,
"grad_norm": 67234.25,
"learning_rate": 3.873760295848042e-05,
"loss": 0.8106,
"step": 1340
},
{
"epoch": 0.22692889561270801,
"grad_norm": 67707.8828125,
"learning_rate": 3.86535552193646e-05,
"loss": 0.8399,
"step": 1350
},
{
"epoch": 0.22860985039502438,
"grad_norm": 60259.1015625,
"learning_rate": 3.8569507480248785e-05,
"loss": 0.8484,
"step": 1360
},
{
"epoch": 0.23029080517734074,
"grad_norm": 68358.1484375,
"learning_rate": 3.848545974113297e-05,
"loss": 0.8264,
"step": 1370
},
{
"epoch": 0.23197175995965708,
"grad_norm": 62014.7109375,
"learning_rate": 3.8401412002017145e-05,
"loss": 0.8507,
"step": 1380
},
{
"epoch": 0.23365271474197344,
"grad_norm": 61060.28125,
"learning_rate": 3.831736426290133e-05,
"loss": 0.8586,
"step": 1390
},
{
"epoch": 0.2353336695242898,
"grad_norm": 57990.08984375,
"learning_rate": 3.823331652378551e-05,
"loss": 0.8011,
"step": 1400
},
{
"epoch": 0.23701462430660616,
"grad_norm": 69242.015625,
"learning_rate": 3.8149268784669696e-05,
"loss": 0.7943,
"step": 1410
},
{
"epoch": 0.2386955790889225,
"grad_norm": 85857.8046875,
"learning_rate": 3.806522104555388e-05,
"loss": 0.864,
"step": 1420
},
{
"epoch": 0.24037653387123886,
"grad_norm": 69873.5234375,
"learning_rate": 3.7981173306438056e-05,
"loss": 0.8578,
"step": 1430
},
{
"epoch": 0.24205748865355523,
"grad_norm": 59830.73046875,
"learning_rate": 3.789712556732224e-05,
"loss": 0.8517,
"step": 1440
},
{
"epoch": 0.2437384434358716,
"grad_norm": 59296.34765625,
"learning_rate": 3.781307782820642e-05,
"loss": 0.864,
"step": 1450
},
{
"epoch": 0.24541939821818792,
"grad_norm": 63888.60546875,
"learning_rate": 3.7729030089090606e-05,
"loss": 0.8161,
"step": 1460
},
{
"epoch": 0.24710035300050429,
"grad_norm": 63231.421875,
"learning_rate": 3.764498234997479e-05,
"loss": 0.8901,
"step": 1470
},
{
"epoch": 0.24878130778282065,
"grad_norm": 62661.12890625,
"learning_rate": 3.756093461085897e-05,
"loss": 0.8638,
"step": 1480
},
{
"epoch": 0.250462262565137,
"grad_norm": 69614.8984375,
"learning_rate": 3.747688687174315e-05,
"loss": 0.8644,
"step": 1490
},
{
"epoch": 0.2521432173474534,
"grad_norm": 67442.9296875,
"learning_rate": 3.7392839132627334e-05,
"loss": 0.7659,
"step": 1500
},
{
"epoch": 0.2538241721297697,
"grad_norm": 67846.0390625,
"learning_rate": 3.730879139351152e-05,
"loss": 0.8243,
"step": 1510
},
{
"epoch": 0.25550512691208604,
"grad_norm": 65967.703125,
"learning_rate": 3.72247436543957e-05,
"loss": 0.8573,
"step": 1520
},
{
"epoch": 0.25718608169440244,
"grad_norm": 66188.1484375,
"learning_rate": 3.7140695915279884e-05,
"loss": 0.8491,
"step": 1530
},
{
"epoch": 0.25886703647671877,
"grad_norm": 55857.7890625,
"learning_rate": 3.705664817616406e-05,
"loss": 0.8314,
"step": 1540
},
{
"epoch": 0.2605479912590351,
"grad_norm": 78828.6328125,
"learning_rate": 3.6972600437048244e-05,
"loss": 0.7589,
"step": 1550
},
{
"epoch": 0.2622289460413515,
"grad_norm": 64696.65234375,
"learning_rate": 3.688855269793243e-05,
"loss": 0.774,
"step": 1560
},
{
"epoch": 0.26390990082366783,
"grad_norm": 71397.7265625,
"learning_rate": 3.680450495881661e-05,
"loss": 0.8351,
"step": 1570
},
{
"epoch": 0.2655908556059842,
"grad_norm": 66192.203125,
"learning_rate": 3.6720457219700795e-05,
"loss": 0.8099,
"step": 1580
},
{
"epoch": 0.26727181038830056,
"grad_norm": 70641.421875,
"learning_rate": 3.663640948058498e-05,
"loss": 0.8571,
"step": 1590
},
{
"epoch": 0.2689527651706169,
"grad_norm": 75064.75,
"learning_rate": 3.6552361741469155e-05,
"loss": 0.8681,
"step": 1600
},
{
"epoch": 0.2706337199529333,
"grad_norm": 75823.125,
"learning_rate": 3.646831400235334e-05,
"loss": 0.8258,
"step": 1610
},
{
"epoch": 0.2723146747352496,
"grad_norm": 72296.765625,
"learning_rate": 3.638426626323752e-05,
"loss": 0.8561,
"step": 1620
},
{
"epoch": 0.27399562951756595,
"grad_norm": 64309.3125,
"learning_rate": 3.6300218524121705e-05,
"loss": 0.856,
"step": 1630
},
{
"epoch": 0.27567658429988234,
"grad_norm": 73902.8359375,
"learning_rate": 3.621617078500589e-05,
"loss": 0.8424,
"step": 1640
},
{
"epoch": 0.2773575390821987,
"grad_norm": 63552.06640625,
"learning_rate": 3.6132123045890065e-05,
"loss": 0.861,
"step": 1650
},
{
"epoch": 0.27903849386451507,
"grad_norm": 74067.7265625,
"learning_rate": 3.604807530677425e-05,
"loss": 0.7832,
"step": 1660
},
{
"epoch": 0.2807194486468314,
"grad_norm": 68881.3046875,
"learning_rate": 3.596402756765843e-05,
"loss": 0.8443,
"step": 1670
},
{
"epoch": 0.28240040342914774,
"grad_norm": 62996.19921875,
"learning_rate": 3.5879979828542616e-05,
"loss": 0.8075,
"step": 1680
},
{
"epoch": 0.28408135821146413,
"grad_norm": 73364.8125,
"learning_rate": 3.57959320894268e-05,
"loss": 0.8425,
"step": 1690
},
{
"epoch": 0.28576231299378047,
"grad_norm": 67509.296875,
"learning_rate": 3.5711884350310976e-05,
"loss": 0.755,
"step": 1700
},
{
"epoch": 0.2874432677760968,
"grad_norm": 66616.984375,
"learning_rate": 3.562783661119516e-05,
"loss": 0.8679,
"step": 1710
},
{
"epoch": 0.2891242225584132,
"grad_norm": 74004.359375,
"learning_rate": 3.554378887207934e-05,
"loss": 0.8793,
"step": 1720
},
{
"epoch": 0.2908051773407295,
"grad_norm": 59084.9609375,
"learning_rate": 3.5459741132963526e-05,
"loss": 0.8322,
"step": 1730
},
{
"epoch": 0.2924861321230459,
"grad_norm": 74027.28125,
"learning_rate": 3.537569339384771e-05,
"loss": 0.8661,
"step": 1740
},
{
"epoch": 0.29416708690536225,
"grad_norm": 64524.97265625,
"learning_rate": 3.529164565473189e-05,
"loss": 0.7334,
"step": 1750
},
{
"epoch": 0.2958480416876786,
"grad_norm": 74809.4921875,
"learning_rate": 3.520759791561607e-05,
"loss": 0.8091,
"step": 1760
},
{
"epoch": 0.297528996469995,
"grad_norm": 66084.609375,
"learning_rate": 3.5123550176500253e-05,
"loss": 0.8715,
"step": 1770
},
{
"epoch": 0.2992099512523113,
"grad_norm": 62919.05859375,
"learning_rate": 3.503950243738444e-05,
"loss": 0.8326,
"step": 1780
},
{
"epoch": 0.30089090603462765,
"grad_norm": 68219.046875,
"learning_rate": 3.495545469826862e-05,
"loss": 0.8387,
"step": 1790
},
{
"epoch": 0.30257186081694404,
"grad_norm": 61066.58203125,
"learning_rate": 3.4871406959152804e-05,
"loss": 0.8113,
"step": 1800
},
{
"epoch": 0.3042528155992604,
"grad_norm": 64199.75390625,
"learning_rate": 3.478735922003698e-05,
"loss": 0.8359,
"step": 1810
},
{
"epoch": 0.3059337703815767,
"grad_norm": 64780.81640625,
"learning_rate": 3.4703311480921164e-05,
"loss": 0.8528,
"step": 1820
},
{
"epoch": 0.3076147251638931,
"grad_norm": 66866.6875,
"learning_rate": 3.461926374180535e-05,
"loss": 0.7795,
"step": 1830
},
{
"epoch": 0.30929567994620943,
"grad_norm": 74536.5078125,
"learning_rate": 3.453521600268953e-05,
"loss": 0.8432,
"step": 1840
},
{
"epoch": 0.3109766347285258,
"grad_norm": 63969.90234375,
"learning_rate": 3.4451168263573714e-05,
"loss": 0.8459,
"step": 1850
},
{
"epoch": 0.31265758951084216,
"grad_norm": 68175.203125,
"learning_rate": 3.43671205244579e-05,
"loss": 0.8437,
"step": 1860
},
{
"epoch": 0.3143385442931585,
"grad_norm": 61941.46484375,
"learning_rate": 3.4283072785342075e-05,
"loss": 0.8182,
"step": 1870
},
{
"epoch": 0.3160194990754749,
"grad_norm": 67861.203125,
"learning_rate": 3.419902504622626e-05,
"loss": 0.8742,
"step": 1880
},
{
"epoch": 0.3177004538577912,
"grad_norm": 66184.25,
"learning_rate": 3.411497730711044e-05,
"loss": 0.8312,
"step": 1890
},
{
"epoch": 0.31938140864010756,
"grad_norm": 63603.37109375,
"learning_rate": 3.4030929567994625e-05,
"loss": 0.8162,
"step": 1900
},
{
"epoch": 0.32106236342242395,
"grad_norm": 76040.2421875,
"learning_rate": 3.394688182887881e-05,
"loss": 0.8555,
"step": 1910
},
{
"epoch": 0.3227433182047403,
"grad_norm": 62280.6328125,
"learning_rate": 3.3862834089762985e-05,
"loss": 0.8086,
"step": 1920
},
{
"epoch": 0.32442427298705667,
"grad_norm": 68005.015625,
"learning_rate": 3.377878635064717e-05,
"loss": 0.7433,
"step": 1930
},
{
"epoch": 0.326105227769373,
"grad_norm": 58576.92578125,
"learning_rate": 3.369473861153135e-05,
"loss": 0.7728,
"step": 1940
},
{
"epoch": 0.32778618255168934,
"grad_norm": 64847.859375,
"learning_rate": 3.3610690872415536e-05,
"loss": 0.7596,
"step": 1950
},
{
"epoch": 0.32946713733400573,
"grad_norm": 63781.38671875,
"learning_rate": 3.352664313329972e-05,
"loss": 0.7308,
"step": 1960
},
{
"epoch": 0.33114809211632207,
"grad_norm": 66446.5859375,
"learning_rate": 3.3442595394183896e-05,
"loss": 0.8058,
"step": 1970
},
{
"epoch": 0.3328290468986384,
"grad_norm": 61722.046875,
"learning_rate": 3.335854765506808e-05,
"loss": 0.869,
"step": 1980
},
{
"epoch": 0.3345100016809548,
"grad_norm": 66245.3359375,
"learning_rate": 3.327449991595226e-05,
"loss": 0.8153,
"step": 1990
},
{
"epoch": 0.33619095646327113,
"grad_norm": 60609.90625,
"learning_rate": 3.3190452176836446e-05,
"loss": 0.8484,
"step": 2000
},
{
"epoch": 0.3378719112455875,
"grad_norm": 70234.8828125,
"learning_rate": 3.310640443772063e-05,
"loss": 0.8249,
"step": 2010
},
{
"epoch": 0.33955286602790385,
"grad_norm": 52655.56640625,
"learning_rate": 3.302235669860481e-05,
"loss": 0.7768,
"step": 2020
},
{
"epoch": 0.3412338208102202,
"grad_norm": 73065.375,
"learning_rate": 3.293830895948899e-05,
"loss": 0.8911,
"step": 2030
},
{
"epoch": 0.3429147755925366,
"grad_norm": 59607.45703125,
"learning_rate": 3.285426122037317e-05,
"loss": 0.7773,
"step": 2040
},
{
"epoch": 0.3445957303748529,
"grad_norm": 64399.9375,
"learning_rate": 3.277021348125736e-05,
"loss": 0.8161,
"step": 2050
},
{
"epoch": 0.34627668515716925,
"grad_norm": 75411.3359375,
"learning_rate": 3.268616574214154e-05,
"loss": 0.7615,
"step": 2060
},
{
"epoch": 0.34795763993948564,
"grad_norm": 69012.03125,
"learning_rate": 3.2602118003025724e-05,
"loss": 0.8245,
"step": 2070
},
{
"epoch": 0.349638594721802,
"grad_norm": 69290.5625,
"learning_rate": 3.25180702639099e-05,
"loss": 0.786,
"step": 2080
},
{
"epoch": 0.35131954950411837,
"grad_norm": 68525.0859375,
"learning_rate": 3.2434022524794084e-05,
"loss": 0.7572,
"step": 2090
},
{
"epoch": 0.3530005042864347,
"grad_norm": 61886.20703125,
"learning_rate": 3.234997478567827e-05,
"loss": 0.8115,
"step": 2100
},
{
"epoch": 0.35468145906875104,
"grad_norm": 73001.8828125,
"learning_rate": 3.226592704656245e-05,
"loss": 0.8336,
"step": 2110
},
{
"epoch": 0.3563624138510674,
"grad_norm": 65161.6484375,
"learning_rate": 3.2181879307446634e-05,
"loss": 0.8124,
"step": 2120
},
{
"epoch": 0.35804336863338376,
"grad_norm": 61001.9140625,
"learning_rate": 3.209783156833082e-05,
"loss": 0.8067,
"step": 2130
},
{
"epoch": 0.3597243234157001,
"grad_norm": 56932.25,
"learning_rate": 3.2013783829214994e-05,
"loss": 0.77,
"step": 2140
},
{
"epoch": 0.3614052781980165,
"grad_norm": 68792.859375,
"learning_rate": 3.192973609009918e-05,
"loss": 0.8217,
"step": 2150
},
{
"epoch": 0.3630862329803328,
"grad_norm": 61329.98828125,
"learning_rate": 3.184568835098336e-05,
"loss": 0.7714,
"step": 2160
},
{
"epoch": 0.36476718776264916,
"grad_norm": 66838.0078125,
"learning_rate": 3.1761640611867545e-05,
"loss": 0.8314,
"step": 2170
},
{
"epoch": 0.36644814254496555,
"grad_norm": 73817.578125,
"learning_rate": 3.167759287275173e-05,
"loss": 0.8413,
"step": 2180
},
{
"epoch": 0.3681290973272819,
"grad_norm": 67156.03125,
"learning_rate": 3.1593545133635905e-05,
"loss": 0.7996,
"step": 2190
},
{
"epoch": 0.3698100521095983,
"grad_norm": 83176.359375,
"learning_rate": 3.150949739452009e-05,
"loss": 0.7875,
"step": 2200
},
{
"epoch": 0.3714910068919146,
"grad_norm": 68843.5859375,
"learning_rate": 3.142544965540427e-05,
"loss": 0.8156,
"step": 2210
},
{
"epoch": 0.37317196167423095,
"grad_norm": 61444.25,
"learning_rate": 3.1341401916288455e-05,
"loss": 0.7747,
"step": 2220
},
{
"epoch": 0.37485291645654734,
"grad_norm": 70228.59375,
"learning_rate": 3.125735417717264e-05,
"loss": 0.8088,
"step": 2230
},
{
"epoch": 0.37653387123886367,
"grad_norm": 56036.578125,
"learning_rate": 3.1173306438056816e-05,
"loss": 0.7834,
"step": 2240
},
{
"epoch": 0.37821482602118,
"grad_norm": 62951.3828125,
"learning_rate": 3.1089258698941e-05,
"loss": 0.7642,
"step": 2250
},
{
"epoch": 0.3798957808034964,
"grad_norm": 66556.96875,
"learning_rate": 3.100521095982518e-05,
"loss": 0.8115,
"step": 2260
},
{
"epoch": 0.38157673558581273,
"grad_norm": 83066.890625,
"learning_rate": 3.0921163220709366e-05,
"loss": 0.8311,
"step": 2270
},
{
"epoch": 0.3832576903681291,
"grad_norm": 68849.5078125,
"learning_rate": 3.083711548159355e-05,
"loss": 0.7935,
"step": 2280
},
{
"epoch": 0.38493864515044546,
"grad_norm": 68215.6953125,
"learning_rate": 3.075306774247773e-05,
"loss": 0.8071,
"step": 2290
},
{
"epoch": 0.3866195999327618,
"grad_norm": 65878.6484375,
"learning_rate": 3.066902000336191e-05,
"loss": 0.9158,
"step": 2300
},
{
"epoch": 0.3883005547150782,
"grad_norm": 59912.58203125,
"learning_rate": 3.058497226424609e-05,
"loss": 0.8217,
"step": 2310
},
{
"epoch": 0.3899815094973945,
"grad_norm": 65707.3828125,
"learning_rate": 3.0500924525130277e-05,
"loss": 0.7693,
"step": 2320
},
{
"epoch": 0.39166246427971085,
"grad_norm": 75819.90625,
"learning_rate": 3.041687678601446e-05,
"loss": 0.8007,
"step": 2330
},
{
"epoch": 0.39334341906202724,
"grad_norm": 66755.734375,
"learning_rate": 3.0332829046898644e-05,
"loss": 0.8416,
"step": 2340
},
{
"epoch": 0.3950243738443436,
"grad_norm": 68420.984375,
"learning_rate": 3.024878130778282e-05,
"loss": 0.8119,
"step": 2350
},
{
"epoch": 0.39670532862665997,
"grad_norm": 70407.578125,
"learning_rate": 3.0164733568667004e-05,
"loss": 0.7907,
"step": 2360
},
{
"epoch": 0.3983862834089763,
"grad_norm": 70415.9453125,
"learning_rate": 3.0080685829551187e-05,
"loss": 0.7508,
"step": 2370
},
{
"epoch": 0.40006723819129264,
"grad_norm": 64331.296875,
"learning_rate": 2.999663809043537e-05,
"loss": 0.7812,
"step": 2380
},
{
"epoch": 0.40174819297360903,
"grad_norm": 55358.15625,
"learning_rate": 2.9912590351319554e-05,
"loss": 0.8044,
"step": 2390
},
{
"epoch": 0.40342914775592537,
"grad_norm": 74381.0078125,
"learning_rate": 2.9828542612203734e-05,
"loss": 0.7862,
"step": 2400
},
{
"epoch": 0.4051101025382417,
"grad_norm": 69503.421875,
"learning_rate": 2.9744494873087914e-05,
"loss": 0.8414,
"step": 2410
},
{
"epoch": 0.4067910573205581,
"grad_norm": 63817.671875,
"learning_rate": 2.9660447133972098e-05,
"loss": 0.8239,
"step": 2420
},
{
"epoch": 0.4084720121028744,
"grad_norm": 64829.07421875,
"learning_rate": 2.957639939485628e-05,
"loss": 0.8409,
"step": 2430
},
{
"epoch": 0.41015296688519076,
"grad_norm": 90130.75,
"learning_rate": 2.9492351655740465e-05,
"loss": 0.7979,
"step": 2440
},
{
"epoch": 0.41183392166750715,
"grad_norm": 65037.46484375,
"learning_rate": 2.9408303916624648e-05,
"loss": 0.7724,
"step": 2450
},
{
"epoch": 0.4135148764498235,
"grad_norm": 74054.9765625,
"learning_rate": 2.9324256177508825e-05,
"loss": 0.9302,
"step": 2460
},
{
"epoch": 0.4151958312321399,
"grad_norm": 65572.390625,
"learning_rate": 2.924020843839301e-05,
"loss": 0.8096,
"step": 2470
},
{
"epoch": 0.4168767860144562,
"grad_norm": 70729.125,
"learning_rate": 2.9156160699277192e-05,
"loss": 0.7749,
"step": 2480
},
{
"epoch": 0.41855774079677255,
"grad_norm": 60806.2265625,
"learning_rate": 2.9072112960161375e-05,
"loss": 0.7964,
"step": 2490
},
{
"epoch": 0.42023869557908894,
"grad_norm": 66842.796875,
"learning_rate": 2.898806522104556e-05,
"loss": 0.7956,
"step": 2500
},
{
"epoch": 0.4219196503614053,
"grad_norm": 65062.66015625,
"learning_rate": 2.8904017481929735e-05,
"loss": 0.765,
"step": 2510
},
{
"epoch": 0.4236006051437216,
"grad_norm": 70468.4921875,
"learning_rate": 2.881996974281392e-05,
"loss": 0.8029,
"step": 2520
},
{
"epoch": 0.425281559926038,
"grad_norm": 63679.37109375,
"learning_rate": 2.8735922003698102e-05,
"loss": 0.8116,
"step": 2530
},
{
"epoch": 0.42696251470835433,
"grad_norm": 79301.3984375,
"learning_rate": 2.8651874264582286e-05,
"loss": 0.8561,
"step": 2540
},
{
"epoch": 0.4286434694906707,
"grad_norm": 66358.953125,
"learning_rate": 2.856782652546647e-05,
"loss": 0.8753,
"step": 2550
},
{
"epoch": 0.43032442427298706,
"grad_norm": 64068.6875,
"learning_rate": 2.848377878635065e-05,
"loss": 0.7874,
"step": 2560
},
{
"epoch": 0.4320053790553034,
"grad_norm": 70842.203125,
"learning_rate": 2.839973104723483e-05,
"loss": 0.8036,
"step": 2570
},
{
"epoch": 0.4336863338376198,
"grad_norm": 69636.0390625,
"learning_rate": 2.8315683308119013e-05,
"loss": 0.8069,
"step": 2580
},
{
"epoch": 0.4353672886199361,
"grad_norm": 89295.2421875,
"learning_rate": 2.8231635569003196e-05,
"loss": 0.8322,
"step": 2590
},
{
"epoch": 0.43704824340225246,
"grad_norm": 70937.7265625,
"learning_rate": 2.814758782988738e-05,
"loss": 0.9161,
"step": 2600
},
{
"epoch": 0.43872919818456885,
"grad_norm": 88488.953125,
"learning_rate": 2.8063540090771563e-05,
"loss": 0.7753,
"step": 2610
},
{
"epoch": 0.4404101529668852,
"grad_norm": 68679.078125,
"learning_rate": 2.797949235165574e-05,
"loss": 0.8489,
"step": 2620
},
{
"epoch": 0.4420911077492016,
"grad_norm": 62188.109375,
"learning_rate": 2.7895444612539924e-05,
"loss": 0.7826,
"step": 2630
},
{
"epoch": 0.4437720625315179,
"grad_norm": 72500.828125,
"learning_rate": 2.7811396873424107e-05,
"loss": 0.8122,
"step": 2640
},
{
"epoch": 0.44545301731383424,
"grad_norm": 61758.7421875,
"learning_rate": 2.772734913430829e-05,
"loss": 0.7645,
"step": 2650
},
{
"epoch": 0.44713397209615063,
"grad_norm": 59873.4375,
"learning_rate": 2.7643301395192474e-05,
"loss": 0.7859,
"step": 2660
},
{
"epoch": 0.44881492687846697,
"grad_norm": 78891.734375,
"learning_rate": 2.755925365607665e-05,
"loss": 0.7696,
"step": 2670
},
{
"epoch": 0.4504958816607833,
"grad_norm": 69256.2578125,
"learning_rate": 2.7475205916960834e-05,
"loss": 0.8161,
"step": 2680
},
{
"epoch": 0.4521768364430997,
"grad_norm": 68757.21875,
"learning_rate": 2.7391158177845018e-05,
"loss": 0.7662,
"step": 2690
},
{
"epoch": 0.45385779122541603,
"grad_norm": 80412.984375,
"learning_rate": 2.73071104387292e-05,
"loss": 0.7875,
"step": 2700
},
{
"epoch": 0.45553874600773236,
"grad_norm": 74670.53125,
"learning_rate": 2.7223062699613385e-05,
"loss": 0.7664,
"step": 2710
},
{
"epoch": 0.45721970079004876,
"grad_norm": 69425.640625,
"learning_rate": 2.7139014960497565e-05,
"loss": 0.8521,
"step": 2720
},
{
"epoch": 0.4589006555723651,
"grad_norm": 70010.1484375,
"learning_rate": 2.7054967221381745e-05,
"loss": 0.7819,
"step": 2730
},
{
"epoch": 0.4605816103546815,
"grad_norm": 76622.5078125,
"learning_rate": 2.6970919482265928e-05,
"loss": 0.8632,
"step": 2740
},
{
"epoch": 0.4622625651369978,
"grad_norm": 63262.62890625,
"learning_rate": 2.6886871743150112e-05,
"loss": 0.7303,
"step": 2750
},
{
"epoch": 0.46394351991931415,
"grad_norm": 63295.68359375,
"learning_rate": 2.6802824004034295e-05,
"loss": 0.773,
"step": 2760
},
{
"epoch": 0.46562447470163054,
"grad_norm": 61289.41015625,
"learning_rate": 2.6718776264918475e-05,
"loss": 0.7776,
"step": 2770
},
{
"epoch": 0.4673054294839469,
"grad_norm": 75931.3671875,
"learning_rate": 2.6634728525802655e-05,
"loss": 0.826,
"step": 2780
},
{
"epoch": 0.4689863842662632,
"grad_norm": 69288.890625,
"learning_rate": 2.655068078668684e-05,
"loss": 0.7978,
"step": 2790
},
{
"epoch": 0.4706673390485796,
"grad_norm": 61528.40234375,
"learning_rate": 2.6466633047571022e-05,
"loss": 0.7891,
"step": 2800
},
{
"epoch": 0.47234829383089594,
"grad_norm": 67145.6796875,
"learning_rate": 2.6382585308455206e-05,
"loss": 0.7483,
"step": 2810
},
{
"epoch": 0.47402924861321233,
"grad_norm": 62958.0703125,
"learning_rate": 2.629853756933939e-05,
"loss": 0.8407,
"step": 2820
},
{
"epoch": 0.47571020339552866,
"grad_norm": 71565.0078125,
"learning_rate": 2.621448983022357e-05,
"loss": 0.8093,
"step": 2830
},
{
"epoch": 0.477391158177845,
"grad_norm": 70295.3125,
"learning_rate": 2.613044209110775e-05,
"loss": 0.8328,
"step": 2840
},
{
"epoch": 0.4790721129601614,
"grad_norm": 66831.65625,
"learning_rate": 2.6046394351991933e-05,
"loss": 0.7844,
"step": 2850
},
{
"epoch": 0.4807530677424777,
"grad_norm": 62661.7421875,
"learning_rate": 2.5962346612876116e-05,
"loss": 0.7925,
"step": 2860
},
{
"epoch": 0.48243402252479406,
"grad_norm": 61856.66796875,
"learning_rate": 2.58782988737603e-05,
"loss": 0.7488,
"step": 2870
},
{
"epoch": 0.48411497730711045,
"grad_norm": 73633.4765625,
"learning_rate": 2.579425113464448e-05,
"loss": 0.7881,
"step": 2880
},
{
"epoch": 0.4857959320894268,
"grad_norm": 69115.28125,
"learning_rate": 2.571020339552866e-05,
"loss": 0.8175,
"step": 2890
},
{
"epoch": 0.4874768868717432,
"grad_norm": 61891.3828125,
"learning_rate": 2.5626155656412843e-05,
"loss": 0.8784,
"step": 2900
},
{
"epoch": 0.4891578416540595,
"grad_norm": 77769.9921875,
"learning_rate": 2.5542107917297027e-05,
"loss": 0.8684,
"step": 2910
},
{
"epoch": 0.49083879643637585,
"grad_norm": 68048.734375,
"learning_rate": 2.545806017818121e-05,
"loss": 0.7853,
"step": 2920
},
{
"epoch": 0.49251975121869224,
"grad_norm": 61355.94140625,
"learning_rate": 2.537401243906539e-05,
"loss": 0.7845,
"step": 2930
},
{
"epoch": 0.49420070600100857,
"grad_norm": 69287.953125,
"learning_rate": 2.528996469994957e-05,
"loss": 0.8007,
"step": 2940
},
{
"epoch": 0.4958816607833249,
"grad_norm": 68851.8359375,
"learning_rate": 2.5205916960833754e-05,
"loss": 0.8159,
"step": 2950
},
{
"epoch": 0.4975626155656413,
"grad_norm": 69794.859375,
"learning_rate": 2.5121869221717938e-05,
"loss": 0.7958,
"step": 2960
},
{
"epoch": 0.49924357034795763,
"grad_norm": 73403.1953125,
"learning_rate": 2.503782148260212e-05,
"loss": 0.7855,
"step": 2970
},
{
"epoch": 0.500924525130274,
"grad_norm": 74224.1328125,
"learning_rate": 2.49537737434863e-05,
"loss": 0.7728,
"step": 2980
},
{
"epoch": 0.5026054799125903,
"grad_norm": 63414.859375,
"learning_rate": 2.4869726004370485e-05,
"loss": 0.8105,
"step": 2990
},
{
"epoch": 0.5042864346949067,
"grad_norm": 77378.109375,
"learning_rate": 2.4785678265254668e-05,
"loss": 0.8636,
"step": 3000
},
{
"epoch": 0.5059673894772231,
"grad_norm": 70468.2734375,
"learning_rate": 2.4701630526138848e-05,
"loss": 0.7242,
"step": 3010
},
{
"epoch": 0.5076483442595394,
"grad_norm": 60809.28515625,
"learning_rate": 2.461758278702303e-05,
"loss": 0.8078,
"step": 3020
},
{
"epoch": 0.5093292990418558,
"grad_norm": 51373.625,
"learning_rate": 2.4533535047907215e-05,
"loss": 0.8366,
"step": 3030
},
{
"epoch": 0.5110102538241721,
"grad_norm": 59719.765625,
"learning_rate": 2.4449487308791395e-05,
"loss": 0.8665,
"step": 3040
},
{
"epoch": 0.5126912086064885,
"grad_norm": 64678.41015625,
"learning_rate": 2.436543956967558e-05,
"loss": 0.7776,
"step": 3050
},
{
"epoch": 0.5143721633888049,
"grad_norm": 74790.40625,
"learning_rate": 2.4281391830559762e-05,
"loss": 0.7729,
"step": 3060
},
{
"epoch": 0.5160531181711212,
"grad_norm": 71427.921875,
"learning_rate": 2.4197344091443942e-05,
"loss": 0.7956,
"step": 3070
},
{
"epoch": 0.5177340729534375,
"grad_norm": 66004.3671875,
"learning_rate": 2.4113296352328126e-05,
"loss": 0.7588,
"step": 3080
},
{
"epoch": 0.5194150277357539,
"grad_norm": 64083.765625,
"learning_rate": 2.4029248613212306e-05,
"loss": 0.7931,
"step": 3090
},
{
"epoch": 0.5210959825180702,
"grad_norm": 54726.55078125,
"learning_rate": 2.394520087409649e-05,
"loss": 0.7163,
"step": 3100
},
{
"epoch": 0.5227769373003867,
"grad_norm": 73129.3125,
"learning_rate": 2.3861153134980673e-05,
"loss": 0.8364,
"step": 3110
},
{
"epoch": 0.524457892082703,
"grad_norm": 62834.08984375,
"learning_rate": 2.3777105395864853e-05,
"loss": 0.7431,
"step": 3120
},
{
"epoch": 0.5261388468650193,
"grad_norm": 61612.23828125,
"learning_rate": 2.3693057656749036e-05,
"loss": 0.7975,
"step": 3130
},
{
"epoch": 0.5278198016473357,
"grad_norm": 71137.6640625,
"learning_rate": 2.3609009917633216e-05,
"loss": 0.783,
"step": 3140
},
{
"epoch": 0.529500756429652,
"grad_norm": 63406.10546875,
"learning_rate": 2.35249621785174e-05,
"loss": 0.824,
"step": 3150
},
{
"epoch": 0.5311817112119684,
"grad_norm": 64795.73828125,
"learning_rate": 2.3440914439401583e-05,
"loss": 0.8715,
"step": 3160
},
{
"epoch": 0.5328626659942848,
"grad_norm": 59525.79296875,
"learning_rate": 2.3356866700285763e-05,
"loss": 0.7934,
"step": 3170
},
{
"epoch": 0.5345436207766011,
"grad_norm": 72914.4609375,
"learning_rate": 2.3272818961169947e-05,
"loss": 0.7243,
"step": 3180
},
{
"epoch": 0.5362245755589174,
"grad_norm": 62966.078125,
"learning_rate": 2.318877122205413e-05,
"loss": 0.712,
"step": 3190
},
{
"epoch": 0.5379055303412338,
"grad_norm": 66647.0546875,
"learning_rate": 2.310472348293831e-05,
"loss": 0.7674,
"step": 3200
},
{
"epoch": 0.5395864851235502,
"grad_norm": 65908.0703125,
"learning_rate": 2.3020675743822494e-05,
"loss": 0.8484,
"step": 3210
},
{
"epoch": 0.5412674399058666,
"grad_norm": 60565.56640625,
"learning_rate": 2.2936628004706674e-05,
"loss": 0.7844,
"step": 3220
},
{
"epoch": 0.5429483946881829,
"grad_norm": 67560.3828125,
"learning_rate": 2.2852580265590857e-05,
"loss": 0.9071,
"step": 3230
},
{
"epoch": 0.5446293494704992,
"grad_norm": 66084.265625,
"learning_rate": 2.276853252647504e-05,
"loss": 0.7323,
"step": 3240
},
{
"epoch": 0.5463103042528156,
"grad_norm": 78780.078125,
"learning_rate": 2.268448478735922e-05,
"loss": 0.9259,
"step": 3250
},
{
"epoch": 0.5479912590351319,
"grad_norm": 67810.703125,
"learning_rate": 2.2600437048243404e-05,
"loss": 0.7813,
"step": 3260
},
{
"epoch": 0.5496722138174484,
"grad_norm": 62699.97265625,
"learning_rate": 2.2516389309127588e-05,
"loss": 0.7831,
"step": 3270
},
{
"epoch": 0.5513531685997647,
"grad_norm": 66553.1640625,
"learning_rate": 2.2432341570011768e-05,
"loss": 0.8185,
"step": 3280
},
{
"epoch": 0.553034123382081,
"grad_norm": 60711.96875,
"learning_rate": 2.234829383089595e-05,
"loss": 0.7551,
"step": 3290
},
{
"epoch": 0.5547150781643974,
"grad_norm": 64588.28515625,
"learning_rate": 2.226424609178013e-05,
"loss": 0.6919,
"step": 3300
},
{
"epoch": 0.5563960329467137,
"grad_norm": 73545.9765625,
"learning_rate": 2.2180198352664315e-05,
"loss": 0.7834,
"step": 3310
},
{
"epoch": 0.5580769877290301,
"grad_norm": 66515.1796875,
"learning_rate": 2.20961506135485e-05,
"loss": 0.8135,
"step": 3320
},
{
"epoch": 0.5597579425113465,
"grad_norm": 66021.4140625,
"learning_rate": 2.201210287443268e-05,
"loss": 0.7767,
"step": 3330
},
{
"epoch": 0.5614388972936628,
"grad_norm": 72027.734375,
"learning_rate": 2.1928055135316862e-05,
"loss": 0.7623,
"step": 3340
},
{
"epoch": 0.5631198520759791,
"grad_norm": 69724.7109375,
"learning_rate": 2.1844007396201046e-05,
"loss": 0.8539,
"step": 3350
},
{
"epoch": 0.5648008068582955,
"grad_norm": 70565.2890625,
"learning_rate": 2.1759959657085226e-05,
"loss": 0.8081,
"step": 3360
},
{
"epoch": 0.5664817616406118,
"grad_norm": 69982.8515625,
"learning_rate": 2.167591191796941e-05,
"loss": 0.766,
"step": 3370
},
{
"epoch": 0.5681627164229283,
"grad_norm": 73730.5859375,
"learning_rate": 2.159186417885359e-05,
"loss": 0.7858,
"step": 3380
},
{
"epoch": 0.5698436712052446,
"grad_norm": 68504.3359375,
"learning_rate": 2.1507816439737773e-05,
"loss": 0.7774,
"step": 3390
},
{
"epoch": 0.5715246259875609,
"grad_norm": 68371.71875,
"learning_rate": 2.1423768700621956e-05,
"loss": 0.7397,
"step": 3400
},
{
"epoch": 0.5732055807698773,
"grad_norm": 62352.87890625,
"learning_rate": 2.1339720961506136e-05,
"loss": 0.7727,
"step": 3410
},
{
"epoch": 0.5748865355521936,
"grad_norm": 67821.3671875,
"learning_rate": 2.125567322239032e-05,
"loss": 0.7494,
"step": 3420
},
{
"epoch": 0.57656749033451,
"grad_norm": 75150.0703125,
"learning_rate": 2.1171625483274503e-05,
"loss": 0.738,
"step": 3430
},
{
"epoch": 0.5782484451168264,
"grad_norm": 71489.2109375,
"learning_rate": 2.1087577744158683e-05,
"loss": 0.7666,
"step": 3440
},
{
"epoch": 0.5799293998991427,
"grad_norm": 77000.265625,
"learning_rate": 2.1003530005042867e-05,
"loss": 0.8051,
"step": 3450
},
{
"epoch": 0.581610354681459,
"grad_norm": 63612.04296875,
"learning_rate": 2.0919482265927047e-05,
"loss": 0.7408,
"step": 3460
},
{
"epoch": 0.5832913094637754,
"grad_norm": 65412.390625,
"learning_rate": 2.083543452681123e-05,
"loss": 0.7585,
"step": 3470
},
{
"epoch": 0.5849722642460918,
"grad_norm": 63992.859375,
"learning_rate": 2.0751386787695414e-05,
"loss": 0.8036,
"step": 3480
},
{
"epoch": 0.5866532190284082,
"grad_norm": 67541.6015625,
"learning_rate": 2.0667339048579594e-05,
"loss": 0.7835,
"step": 3490
},
{
"epoch": 0.5883341738107245,
"grad_norm": 87275.59375,
"learning_rate": 2.0583291309463777e-05,
"loss": 0.843,
"step": 3500
},
{
"epoch": 0.5900151285930408,
"grad_norm": 62353.66015625,
"learning_rate": 2.0499243570347957e-05,
"loss": 0.8229,
"step": 3510
},
{
"epoch": 0.5916960833753572,
"grad_norm": 76160.890625,
"learning_rate": 2.041519583123214e-05,
"loss": 0.839,
"step": 3520
},
{
"epoch": 0.5933770381576735,
"grad_norm": 71393.7109375,
"learning_rate": 2.0331148092116324e-05,
"loss": 0.7136,
"step": 3530
},
{
"epoch": 0.59505799293999,
"grad_norm": 70572.75,
"learning_rate": 2.0247100353000504e-05,
"loss": 0.7987,
"step": 3540
},
{
"epoch": 0.5967389477223063,
"grad_norm": 66998.0703125,
"learning_rate": 2.0163052613884688e-05,
"loss": 0.7874,
"step": 3550
},
{
"epoch": 0.5984199025046226,
"grad_norm": 53749.26953125,
"learning_rate": 2.007900487476887e-05,
"loss": 0.767,
"step": 3560
},
{
"epoch": 0.600100857286939,
"grad_norm": 71350.6328125,
"learning_rate": 1.999495713565305e-05,
"loss": 0.8136,
"step": 3570
},
{
"epoch": 0.6017818120692553,
"grad_norm": 64163.25390625,
"learning_rate": 1.9910909396537235e-05,
"loss": 0.7565,
"step": 3580
},
{
"epoch": 0.6034627668515717,
"grad_norm": 61004.05078125,
"learning_rate": 1.9826861657421415e-05,
"loss": 0.7502,
"step": 3590
},
{
"epoch": 0.6051437216338881,
"grad_norm": 66495.4375,
"learning_rate": 1.97428139183056e-05,
"loss": 0.7485,
"step": 3600
},
{
"epoch": 0.6068246764162044,
"grad_norm": 62442.1015625,
"learning_rate": 1.9658766179189782e-05,
"loss": 0.7484,
"step": 3610
},
{
"epoch": 0.6085056311985207,
"grad_norm": 66803.453125,
"learning_rate": 1.9574718440073962e-05,
"loss": 0.8459,
"step": 3620
},
{
"epoch": 0.6101865859808371,
"grad_norm": 71394.6875,
"learning_rate": 1.9490670700958145e-05,
"loss": 0.8108,
"step": 3630
},
{
"epoch": 0.6118675407631534,
"grad_norm": 72386.0859375,
"learning_rate": 1.940662296184233e-05,
"loss": 0.8039,
"step": 3640
},
{
"epoch": 0.6135484955454699,
"grad_norm": 69155.015625,
"learning_rate": 1.932257522272651e-05,
"loss": 0.823,
"step": 3650
},
{
"epoch": 0.6152294503277862,
"grad_norm": 60343.54296875,
"learning_rate": 1.9238527483610693e-05,
"loss": 0.7284,
"step": 3660
},
{
"epoch": 0.6169104051101025,
"grad_norm": 70258.7734375,
"learning_rate": 1.9154479744494873e-05,
"loss": 0.7925,
"step": 3670
},
{
"epoch": 0.6185913598924189,
"grad_norm": 68638.21875,
"learning_rate": 1.9070432005379056e-05,
"loss": 0.7802,
"step": 3680
},
{
"epoch": 0.6202723146747352,
"grad_norm": 67208.890625,
"learning_rate": 1.898638426626324e-05,
"loss": 0.7921,
"step": 3690
},
{
"epoch": 0.6219532694570516,
"grad_norm": 56271.5703125,
"learning_rate": 1.890233652714742e-05,
"loss": 0.7752,
"step": 3700
},
{
"epoch": 0.623634224239368,
"grad_norm": 84835.21875,
"learning_rate": 1.8818288788031603e-05,
"loss": 0.8012,
"step": 3710
},
{
"epoch": 0.6253151790216843,
"grad_norm": 58683.70703125,
"learning_rate": 1.8734241048915787e-05,
"loss": 0.7547,
"step": 3720
},
{
"epoch": 0.6269961338040007,
"grad_norm": 66791.8828125,
"learning_rate": 1.8650193309799967e-05,
"loss": 0.782,
"step": 3730
},
{
"epoch": 0.628677088586317,
"grad_norm": 60903.69140625,
"learning_rate": 1.856614557068415e-05,
"loss": 0.7304,
"step": 3740
},
{
"epoch": 0.6303580433686334,
"grad_norm": 63727.53515625,
"learning_rate": 1.848209783156833e-05,
"loss": 0.7591,
"step": 3750
},
{
"epoch": 0.6320389981509498,
"grad_norm": 65615.4140625,
"learning_rate": 1.8398050092452514e-05,
"loss": 0.7483,
"step": 3760
},
{
"epoch": 0.6337199529332661,
"grad_norm": 72326.921875,
"learning_rate": 1.8314002353336697e-05,
"loss": 0.7795,
"step": 3770
},
{
"epoch": 0.6354009077155824,
"grad_norm": 65443.81640625,
"learning_rate": 1.8229954614220877e-05,
"loss": 0.8154,
"step": 3780
},
{
"epoch": 0.6370818624978988,
"grad_norm": 66016.125,
"learning_rate": 1.814590687510506e-05,
"loss": 0.8109,
"step": 3790
},
{
"epoch": 0.6387628172802151,
"grad_norm": 70201.703125,
"learning_rate": 1.8061859135989244e-05,
"loss": 0.8039,
"step": 3800
},
{
"epoch": 0.6404437720625316,
"grad_norm": 72543.0859375,
"learning_rate": 1.7977811396873424e-05,
"loss": 0.8201,
"step": 3810
},
{
"epoch": 0.6421247268448479,
"grad_norm": 65741.5703125,
"learning_rate": 1.7893763657757608e-05,
"loss": 0.7919,
"step": 3820
},
{
"epoch": 0.6438056816271642,
"grad_norm": 67635.5078125,
"learning_rate": 1.7809715918641788e-05,
"loss": 0.8174,
"step": 3830
},
{
"epoch": 0.6454866364094806,
"grad_norm": 64830.48046875,
"learning_rate": 1.772566817952597e-05,
"loss": 0.7375,
"step": 3840
},
{
"epoch": 0.6471675911917969,
"grad_norm": 66888.5234375,
"learning_rate": 1.7641620440410155e-05,
"loss": 0.7547,
"step": 3850
},
{
"epoch": 0.6488485459741133,
"grad_norm": 67550.5234375,
"learning_rate": 1.7557572701294335e-05,
"loss": 0.7532,
"step": 3860
},
{
"epoch": 0.6505295007564297,
"grad_norm": 66258.5859375,
"learning_rate": 1.747352496217852e-05,
"loss": 0.7623,
"step": 3870
},
{
"epoch": 0.652210455538746,
"grad_norm": 70212.1875,
"learning_rate": 1.7389477223062702e-05,
"loss": 0.8022,
"step": 3880
},
{
"epoch": 0.6538914103210623,
"grad_norm": 69388.6171875,
"learning_rate": 1.7305429483946882e-05,
"loss": 0.7672,
"step": 3890
},
{
"epoch": 0.6555723651033787,
"grad_norm": 61498.984375,
"learning_rate": 1.7221381744831065e-05,
"loss": 0.7243,
"step": 3900
},
{
"epoch": 0.657253319885695,
"grad_norm": 73348.640625,
"learning_rate": 1.7137334005715245e-05,
"loss": 0.7699,
"step": 3910
},
{
"epoch": 0.6589342746680115,
"grad_norm": 76186.703125,
"learning_rate": 1.705328626659943e-05,
"loss": 0.8293,
"step": 3920
},
{
"epoch": 0.6606152294503278,
"grad_norm": 72204.1171875,
"learning_rate": 1.6969238527483612e-05,
"loss": 0.8325,
"step": 3930
},
{
"epoch": 0.6622961842326441,
"grad_norm": 68690.1328125,
"learning_rate": 1.6885190788367792e-05,
"loss": 0.8174,
"step": 3940
},
{
"epoch": 0.6639771390149605,
"grad_norm": 69304.6015625,
"learning_rate": 1.6801143049251976e-05,
"loss": 0.7395,
"step": 3950
},
{
"epoch": 0.6656580937972768,
"grad_norm": 67933.7890625,
"learning_rate": 1.6717095310136156e-05,
"loss": 0.7976,
"step": 3960
},
{
"epoch": 0.6673390485795933,
"grad_norm": 74347.6953125,
"learning_rate": 1.663304757102034e-05,
"loss": 0.8003,
"step": 3970
},
{
"epoch": 0.6690200033619096,
"grad_norm": 74672.625,
"learning_rate": 1.6548999831904523e-05,
"loss": 0.7809,
"step": 3980
},
{
"epoch": 0.6707009581442259,
"grad_norm": 69611.8984375,
"learning_rate": 1.6464952092788703e-05,
"loss": 0.7884,
"step": 3990
},
{
"epoch": 0.6723819129265423,
"grad_norm": 61687.546875,
"learning_rate": 1.6380904353672887e-05,
"loss": 0.7726,
"step": 4000
},
{
"epoch": 0.6740628677088586,
"grad_norm": 57029.6953125,
"learning_rate": 1.629685661455707e-05,
"loss": 0.7537,
"step": 4010
},
{
"epoch": 0.675743822491175,
"grad_norm": 62528.2265625,
"learning_rate": 1.621280887544125e-05,
"loss": 0.7542,
"step": 4020
},
{
"epoch": 0.6774247772734914,
"grad_norm": 69657.46875,
"learning_rate": 1.6128761136325434e-05,
"loss": 0.7846,
"step": 4030
},
{
"epoch": 0.6791057320558077,
"grad_norm": 61733.50390625,
"learning_rate": 1.6044713397209614e-05,
"loss": 0.7342,
"step": 4040
},
{
"epoch": 0.680786686838124,
"grad_norm": 63571.60546875,
"learning_rate": 1.5960665658093797e-05,
"loss": 0.7046,
"step": 4050
},
{
"epoch": 0.6824676416204404,
"grad_norm": 80437.828125,
"learning_rate": 1.587661791897798e-05,
"loss": 0.8413,
"step": 4060
},
{
"epoch": 0.6841485964027567,
"grad_norm": 107205.921875,
"learning_rate": 1.579257017986216e-05,
"loss": 0.8052,
"step": 4070
},
{
"epoch": 0.6858295511850732,
"grad_norm": 71237.375,
"learning_rate": 1.5708522440746344e-05,
"loss": 0.7729,
"step": 4080
},
{
"epoch": 0.6875105059673895,
"grad_norm": 61227.4296875,
"learning_rate": 1.5624474701630528e-05,
"loss": 0.7972,
"step": 4090
},
{
"epoch": 0.6891914607497058,
"grad_norm": 81108.9296875,
"learning_rate": 1.5540426962514708e-05,
"loss": 0.7173,
"step": 4100
},
{
"epoch": 0.6908724155320222,
"grad_norm": 58387.5703125,
"learning_rate": 1.545637922339889e-05,
"loss": 0.8341,
"step": 4110
},
{
"epoch": 0.6925533703143385,
"grad_norm": 77154.796875,
"learning_rate": 1.537233148428307e-05,
"loss": 0.7459,
"step": 4120
},
{
"epoch": 0.694234325096655,
"grad_norm": 66626.140625,
"learning_rate": 1.5288283745167255e-05,
"loss": 0.7965,
"step": 4130
},
{
"epoch": 0.6959152798789713,
"grad_norm": 65993.8671875,
"learning_rate": 1.5204236006051437e-05,
"loss": 0.8077,
"step": 4140
},
{
"epoch": 0.6975962346612876,
"grad_norm": 65909.390625,
"learning_rate": 1.512018826693562e-05,
"loss": 0.7933,
"step": 4150
},
{
"epoch": 0.699277189443604,
"grad_norm": 73940.3671875,
"learning_rate": 1.5036140527819803e-05,
"loss": 0.7937,
"step": 4160
},
{
"epoch": 0.7009581442259203,
"grad_norm": 80516.8203125,
"learning_rate": 1.4952092788703984e-05,
"loss": 0.7921,
"step": 4170
},
{
"epoch": 0.7026390990082367,
"grad_norm": 62047.8046875,
"learning_rate": 1.4868045049588167e-05,
"loss": 0.7405,
"step": 4180
},
{
"epoch": 0.7043200537905531,
"grad_norm": 68648.3828125,
"learning_rate": 1.478399731047235e-05,
"loss": 0.7956,
"step": 4190
},
{
"epoch": 0.7060010085728694,
"grad_norm": 74798.203125,
"learning_rate": 1.469994957135653e-05,
"loss": 0.7026,
"step": 4200
},
{
"epoch": 0.7076819633551857,
"grad_norm": 65563.265625,
"learning_rate": 1.4615901832240714e-05,
"loss": 0.7846,
"step": 4210
},
{
"epoch": 0.7093629181375021,
"grad_norm": 76866.96875,
"learning_rate": 1.4531854093124898e-05,
"loss": 0.829,
"step": 4220
},
{
"epoch": 0.7110438729198184,
"grad_norm": 64904.796875,
"learning_rate": 1.4447806354009078e-05,
"loss": 0.8053,
"step": 4230
},
{
"epoch": 0.7127248277021349,
"grad_norm": 72151.6484375,
"learning_rate": 1.4363758614893261e-05,
"loss": 0.7583,
"step": 4240
},
{
"epoch": 0.7144057824844512,
"grad_norm": 57045.8203125,
"learning_rate": 1.4279710875777441e-05,
"loss": 0.7584,
"step": 4250
},
{
"epoch": 0.7160867372667675,
"grad_norm": 74145.6953125,
"learning_rate": 1.4195663136661625e-05,
"loss": 0.7637,
"step": 4260
},
{
"epoch": 0.7177676920490839,
"grad_norm": 63434.96875,
"learning_rate": 1.4111615397545808e-05,
"loss": 0.8278,
"step": 4270
},
{
"epoch": 0.7194486468314002,
"grad_norm": 70511.5546875,
"learning_rate": 1.4027567658429988e-05,
"loss": 0.7512,
"step": 4280
},
{
"epoch": 0.7211296016137166,
"grad_norm": 65691.6640625,
"learning_rate": 1.3943519919314172e-05,
"loss": 0.7509,
"step": 4290
},
{
"epoch": 0.722810556396033,
"grad_norm": 72238.515625,
"learning_rate": 1.3859472180198355e-05,
"loss": 0.7977,
"step": 4300
},
{
"epoch": 0.7244915111783493,
"grad_norm": 79115.5625,
"learning_rate": 1.3775424441082535e-05,
"loss": 0.8284,
"step": 4310
},
{
"epoch": 0.7261724659606656,
"grad_norm": 64790.1484375,
"learning_rate": 1.3691376701966719e-05,
"loss": 0.7124,
"step": 4320
},
{
"epoch": 0.727853420742982,
"grad_norm": 75220.65625,
"learning_rate": 1.3607328962850899e-05,
"loss": 0.7687,
"step": 4330
},
{
"epoch": 0.7295343755252983,
"grad_norm": 60745.61328125,
"learning_rate": 1.3523281223735082e-05,
"loss": 0.7283,
"step": 4340
},
{
"epoch": 0.7312153303076148,
"grad_norm": 54197.08203125,
"learning_rate": 1.3439233484619266e-05,
"loss": 0.768,
"step": 4350
},
{
"epoch": 0.7328962850899311,
"grad_norm": 61422.91015625,
"learning_rate": 1.3355185745503446e-05,
"loss": 0.7324,
"step": 4360
},
{
"epoch": 0.7345772398722474,
"grad_norm": 68666.9765625,
"learning_rate": 1.327113800638763e-05,
"loss": 0.8131,
"step": 4370
},
{
"epoch": 0.7362581946545638,
"grad_norm": 54904.734375,
"learning_rate": 1.3187090267271813e-05,
"loss": 0.751,
"step": 4380
},
{
"epoch": 0.7379391494368801,
"grad_norm": 74403.234375,
"learning_rate": 1.3103042528155993e-05,
"loss": 0.7564,
"step": 4390
},
{
"epoch": 0.7396201042191966,
"grad_norm": 83832.09375,
"learning_rate": 1.3018994789040176e-05,
"loss": 0.7588,
"step": 4400
},
{
"epoch": 0.7413010590015129,
"grad_norm": 65015.13671875,
"learning_rate": 1.2934947049924356e-05,
"loss": 0.7333,
"step": 4410
},
{
"epoch": 0.7429820137838292,
"grad_norm": 70224.0234375,
"learning_rate": 1.285089931080854e-05,
"loss": 0.7437,
"step": 4420
},
{
"epoch": 0.7446629685661456,
"grad_norm": 65735.6015625,
"learning_rate": 1.2766851571692723e-05,
"loss": 0.7311,
"step": 4430
},
{
"epoch": 0.7463439233484619,
"grad_norm": 64739.11328125,
"learning_rate": 1.2682803832576903e-05,
"loss": 0.752,
"step": 4440
},
{
"epoch": 0.7480248781307783,
"grad_norm": 84499.734375,
"learning_rate": 1.2598756093461087e-05,
"loss": 0.7085,
"step": 4450
},
{
"epoch": 0.7497058329130947,
"grad_norm": 58072.46484375,
"learning_rate": 1.251470835434527e-05,
"loss": 0.7869,
"step": 4460
},
{
"epoch": 0.751386787695411,
"grad_norm": 60343.64453125,
"learning_rate": 1.243066061522945e-05,
"loss": 0.748,
"step": 4470
},
{
"epoch": 0.7530677424777273,
"grad_norm": 73976.90625,
"learning_rate": 1.2346612876113634e-05,
"loss": 0.8125,
"step": 4480
},
{
"epoch": 0.7547486972600437,
"grad_norm": 83017.234375,
"learning_rate": 1.2262565136997816e-05,
"loss": 0.7647,
"step": 4490
},
{
"epoch": 0.75642965204236,
"grad_norm": 63707.15625,
"learning_rate": 1.2178517397881997e-05,
"loss": 0.689,
"step": 4500
},
{
"epoch": 0.7581106068246765,
"grad_norm": 62471.03125,
"learning_rate": 1.2094469658766181e-05,
"loss": 0.7642,
"step": 4510
},
{
"epoch": 0.7597915616069928,
"grad_norm": 71938.2421875,
"learning_rate": 1.2010421919650363e-05,
"loss": 0.7018,
"step": 4520
},
{
"epoch": 0.7614725163893091,
"grad_norm": 62137.34375,
"learning_rate": 1.1926374180534544e-05,
"loss": 0.7742,
"step": 4530
},
{
"epoch": 0.7631534711716255,
"grad_norm": 74779.3515625,
"learning_rate": 1.1842326441418726e-05,
"loss": 0.76,
"step": 4540
},
{
"epoch": 0.7648344259539418,
"grad_norm": 65167.984375,
"learning_rate": 1.175827870230291e-05,
"loss": 0.7814,
"step": 4550
},
{
"epoch": 0.7665153807362582,
"grad_norm": 64530.6015625,
"learning_rate": 1.1674230963187092e-05,
"loss": 0.7496,
"step": 4560
},
{
"epoch": 0.7681963355185746,
"grad_norm": 69663.7109375,
"learning_rate": 1.1590183224071273e-05,
"loss": 0.794,
"step": 4570
},
{
"epoch": 0.7698772903008909,
"grad_norm": 58575.296875,
"learning_rate": 1.1506135484955455e-05,
"loss": 0.7438,
"step": 4580
},
{
"epoch": 0.7715582450832073,
"grad_norm": 60153.59375,
"learning_rate": 1.1422087745839639e-05,
"loss": 0.7246,
"step": 4590
},
{
"epoch": 0.7732391998655236,
"grad_norm": 63029.4140625,
"learning_rate": 1.133804000672382e-05,
"loss": 0.7449,
"step": 4600
},
{
"epoch": 0.7749201546478399,
"grad_norm": 70431.859375,
"learning_rate": 1.1253992267608002e-05,
"loss": 0.8162,
"step": 4610
},
{
"epoch": 0.7766011094301564,
"grad_norm": 56848.18359375,
"learning_rate": 1.1169944528492184e-05,
"loss": 0.8272,
"step": 4620
},
{
"epoch": 0.7782820642124727,
"grad_norm": 63638.44921875,
"learning_rate": 1.1085896789376367e-05,
"loss": 0.6735,
"step": 4630
},
{
"epoch": 0.779963018994789,
"grad_norm": 81583.1328125,
"learning_rate": 1.1001849050260549e-05,
"loss": 0.7195,
"step": 4640
},
{
"epoch": 0.7816439737771054,
"grad_norm": 65090.41796875,
"learning_rate": 1.0917801311144731e-05,
"loss": 0.7348,
"step": 4650
},
{
"epoch": 0.7833249285594217,
"grad_norm": 81298.7265625,
"learning_rate": 1.0833753572028913e-05,
"loss": 0.7036,
"step": 4660
},
{
"epoch": 0.7850058833417382,
"grad_norm": 72416.609375,
"learning_rate": 1.0749705832913096e-05,
"loss": 0.759,
"step": 4670
},
{
"epoch": 0.7866868381240545,
"grad_norm": 62808.859375,
"learning_rate": 1.0665658093797278e-05,
"loss": 0.8043,
"step": 4680
},
{
"epoch": 0.7883677929063708,
"grad_norm": 57125.68359375,
"learning_rate": 1.058161035468146e-05,
"loss": 0.695,
"step": 4690
},
{
"epoch": 0.7900487476886872,
"grad_norm": 70024.90625,
"learning_rate": 1.0497562615565642e-05,
"loss": 0.8026,
"step": 4700
},
{
"epoch": 0.7917297024710035,
"grad_norm": 67909.5,
"learning_rate": 1.0413514876449825e-05,
"loss": 0.7139,
"step": 4710
},
{
"epoch": 0.7934106572533199,
"grad_norm": 63104.14453125,
"learning_rate": 1.0329467137334007e-05,
"loss": 0.7578,
"step": 4720
},
{
"epoch": 0.7950916120356363,
"grad_norm": 59612.7578125,
"learning_rate": 1.0245419398218189e-05,
"loss": 0.8141,
"step": 4730
},
{
"epoch": 0.7967725668179526,
"grad_norm": 61766.25,
"learning_rate": 1.016137165910237e-05,
"loss": 0.7066,
"step": 4740
},
{
"epoch": 0.798453521600269,
"grad_norm": 61020.91015625,
"learning_rate": 1.0077323919986554e-05,
"loss": 0.7129,
"step": 4750
},
{
"epoch": 0.8001344763825853,
"grad_norm": 73282.953125,
"learning_rate": 9.993276180870736e-06,
"loss": 0.7443,
"step": 4760
},
{
"epoch": 0.8018154311649016,
"grad_norm": 61251.37890625,
"learning_rate": 9.909228441754917e-06,
"loss": 0.7202,
"step": 4770
},
{
"epoch": 0.8034963859472181,
"grad_norm": 57658.30859375,
"learning_rate": 9.825180702639099e-06,
"loss": 0.7261,
"step": 4780
},
{
"epoch": 0.8051773407295344,
"grad_norm": 58032.66796875,
"learning_rate": 9.741132963523283e-06,
"loss": 0.7772,
"step": 4790
},
{
"epoch": 0.8068582955118507,
"grad_norm": 67425.0703125,
"learning_rate": 9.657085224407464e-06,
"loss": 0.75,
"step": 4800
},
{
"epoch": 0.8085392502941671,
"grad_norm": 61734.875,
"learning_rate": 9.573037485291646e-06,
"loss": 0.7046,
"step": 4810
},
{
"epoch": 0.8102202050764834,
"grad_norm": 68259.4453125,
"learning_rate": 9.48898974617583e-06,
"loss": 0.7702,
"step": 4820
},
{
"epoch": 0.8119011598587998,
"grad_norm": 62056.20703125,
"learning_rate": 9.404942007060011e-06,
"loss": 0.7043,
"step": 4830
},
{
"epoch": 0.8135821146411162,
"grad_norm": 65011.296875,
"learning_rate": 9.320894267944193e-06,
"loss": 0.7163,
"step": 4840
},
{
"epoch": 0.8152630694234325,
"grad_norm": 74655.8203125,
"learning_rate": 9.236846528828375e-06,
"loss": 0.8321,
"step": 4850
},
{
"epoch": 0.8169440242057489,
"grad_norm": 75672.0703125,
"learning_rate": 9.152798789712558e-06,
"loss": 0.7922,
"step": 4860
},
{
"epoch": 0.8186249789880652,
"grad_norm": 61180.29296875,
"learning_rate": 9.06875105059674e-06,
"loss": 0.7159,
"step": 4870
},
{
"epoch": 0.8203059337703815,
"grad_norm": 87931.515625,
"learning_rate": 8.984703311480922e-06,
"loss": 0.6781,
"step": 4880
},
{
"epoch": 0.821986888552698,
"grad_norm": 73175.0390625,
"learning_rate": 8.900655572365104e-06,
"loss": 0.7326,
"step": 4890
},
{
"epoch": 0.8236678433350143,
"grad_norm": 63801.890625,
"learning_rate": 8.816607833249287e-06,
"loss": 0.7975,
"step": 4900
},
{
"epoch": 0.8253487981173306,
"grad_norm": 80039.4375,
"learning_rate": 8.732560094133469e-06,
"loss": 0.6984,
"step": 4910
},
{
"epoch": 0.827029752899647,
"grad_norm": 71785.921875,
"learning_rate": 8.64851235501765e-06,
"loss": 0.7481,
"step": 4920
},
{
"epoch": 0.8287107076819633,
"grad_norm": 72605.5234375,
"learning_rate": 8.564464615901833e-06,
"loss": 0.7317,
"step": 4930
},
{
"epoch": 0.8303916624642798,
"grad_norm": 72289.1171875,
"learning_rate": 8.480416876786016e-06,
"loss": 0.752,
"step": 4940
},
{
"epoch": 0.8320726172465961,
"grad_norm": 69316.015625,
"learning_rate": 8.396369137670198e-06,
"loss": 0.722,
"step": 4950
},
{
"epoch": 0.8337535720289124,
"grad_norm": 68252.515625,
"learning_rate": 8.31232139855438e-06,
"loss": 0.7674,
"step": 4960
},
{
"epoch": 0.8354345268112288,
"grad_norm": 66645.7421875,
"learning_rate": 8.228273659438561e-06,
"loss": 0.7386,
"step": 4970
},
{
"epoch": 0.8371154815935451,
"grad_norm": 64145.96875,
"learning_rate": 8.144225920322745e-06,
"loss": 0.7213,
"step": 4980
},
{
"epoch": 0.8387964363758615,
"grad_norm": 62728.8125,
"learning_rate": 8.060178181206927e-06,
"loss": 0.6535,
"step": 4990
},
{
"epoch": 0.8404773911581779,
"grad_norm": 72365.625,
"learning_rate": 7.976130442091108e-06,
"loss": 0.6833,
"step": 5000
},
{
"epoch": 0.8421583459404942,
"grad_norm": 62805.015625,
"learning_rate": 7.89208270297529e-06,
"loss": 0.8574,
"step": 5010
},
{
"epoch": 0.8438393007228105,
"grad_norm": 65521.5703125,
"learning_rate": 7.808034963859474e-06,
"loss": 0.7408,
"step": 5020
},
{
"epoch": 0.8455202555051269,
"grad_norm": 67055.6171875,
"learning_rate": 7.723987224743655e-06,
"loss": 0.6515,
"step": 5030
},
{
"epoch": 0.8472012102874432,
"grad_norm": 56498.43359375,
"learning_rate": 7.639939485627837e-06,
"loss": 0.8291,
"step": 5040
},
{
"epoch": 0.8488821650697597,
"grad_norm": 68116.96875,
"learning_rate": 7.555891746512018e-06,
"loss": 0.7886,
"step": 5050
},
{
"epoch": 0.850563119852076,
"grad_norm": 64178.546875,
"learning_rate": 7.471844007396202e-06,
"loss": 0.7746,
"step": 5060
},
{
"epoch": 0.8522440746343923,
"grad_norm": 63573.09375,
"learning_rate": 7.387796268280383e-06,
"loss": 0.7769,
"step": 5070
},
{
"epoch": 0.8539250294167087,
"grad_norm": 57395.2421875,
"learning_rate": 7.303748529164565e-06,
"loss": 0.7334,
"step": 5080
},
{
"epoch": 0.855605984199025,
"grad_norm": 74868.375,
"learning_rate": 7.219700790048749e-06,
"loss": 0.7575,
"step": 5090
},
{
"epoch": 0.8572869389813415,
"grad_norm": 69290.953125,
"learning_rate": 7.1356530509329304e-06,
"loss": 0.6967,
"step": 5100
},
{
"epoch": 0.8589678937636578,
"grad_norm": 59573.5234375,
"learning_rate": 7.051605311817112e-06,
"loss": 0.8442,
"step": 5110
},
{
"epoch": 0.8606488485459741,
"grad_norm": 61021.09375,
"learning_rate": 6.967557572701294e-06,
"loss": 0.7069,
"step": 5120
},
{
"epoch": 0.8623298033282905,
"grad_norm": 71908.3359375,
"learning_rate": 6.8835098335854775e-06,
"loss": 0.6972,
"step": 5130
},
{
"epoch": 0.8640107581106068,
"grad_norm": 71261.375,
"learning_rate": 6.799462094469659e-06,
"loss": 0.7308,
"step": 5140
},
{
"epoch": 0.8656917128929231,
"grad_norm": 76563.328125,
"learning_rate": 6.715414355353841e-06,
"loss": 0.8391,
"step": 5150
},
{
"epoch": 0.8673726676752396,
"grad_norm": 73876.8515625,
"learning_rate": 6.631366616238023e-06,
"loss": 0.6593,
"step": 5160
},
{
"epoch": 0.8690536224575559,
"grad_norm": 58473.9140625,
"learning_rate": 6.547318877122206e-06,
"loss": 0.7766,
"step": 5170
},
{
"epoch": 0.8707345772398722,
"grad_norm": 61153.5390625,
"learning_rate": 6.463271138006388e-06,
"loss": 0.7679,
"step": 5180
},
{
"epoch": 0.8724155320221886,
"grad_norm": 58491.39453125,
"learning_rate": 6.37922339889057e-06,
"loss": 0.7115,
"step": 5190
},
{
"epoch": 0.8740964868045049,
"grad_norm": 64786.53125,
"learning_rate": 6.295175659774752e-06,
"loss": 0.6976,
"step": 5200
},
{
"epoch": 0.8757774415868214,
"grad_norm": 68189.5546875,
"learning_rate": 6.211127920658934e-06,
"loss": 0.8054,
"step": 5210
},
{
"epoch": 0.8774583963691377,
"grad_norm": 66427.9921875,
"learning_rate": 6.127080181543117e-06,
"loss": 0.7512,
"step": 5220
},
{
"epoch": 0.879139351151454,
"grad_norm": 57290.70703125,
"learning_rate": 6.043032442427299e-06,
"loss": 0.6927,
"step": 5230
},
{
"epoch": 0.8808203059337704,
"grad_norm": 72431.390625,
"learning_rate": 5.958984703311481e-06,
"loss": 0.8137,
"step": 5240
},
{
"epoch": 0.8825012607160867,
"grad_norm": 70580.7265625,
"learning_rate": 5.874936964195663e-06,
"loss": 0.7712,
"step": 5250
},
{
"epoch": 0.8841822154984031,
"grad_norm": 73078.3046875,
"learning_rate": 5.790889225079846e-06,
"loss": 0.6761,
"step": 5260
},
{
"epoch": 0.8858631702807195,
"grad_norm": 78010.0703125,
"learning_rate": 5.7068414859640274e-06,
"loss": 0.7715,
"step": 5270
},
{
"epoch": 0.8875441250630358,
"grad_norm": 66805.8671875,
"learning_rate": 5.62279374684821e-06,
"loss": 0.7067,
"step": 5280
},
{
"epoch": 0.8892250798453522,
"grad_norm": 68583.328125,
"learning_rate": 5.538746007732392e-06,
"loss": 0.7403,
"step": 5290
},
{
"epoch": 0.8909060346276685,
"grad_norm": 80909.6796875,
"learning_rate": 5.4546982686165745e-06,
"loss": 0.7469,
"step": 5300
},
{
"epoch": 0.8925869894099848,
"grad_norm": 62168.92578125,
"learning_rate": 5.370650529500756e-06,
"loss": 0.7284,
"step": 5310
},
{
"epoch": 0.8942679441923013,
"grad_norm": 70689.0546875,
"learning_rate": 5.286602790384939e-06,
"loss": 0.7804,
"step": 5320
},
{
"epoch": 0.8959488989746176,
"grad_norm": 70834.4609375,
"learning_rate": 5.202555051269121e-06,
"loss": 0.7504,
"step": 5330
},
{
"epoch": 0.8976298537569339,
"grad_norm": 72244.8828125,
"learning_rate": 5.118507312153303e-06,
"loss": 0.6909,
"step": 5340
},
{
"epoch": 0.8993108085392503,
"grad_norm": 68406.7734375,
"learning_rate": 5.034459573037485e-06,
"loss": 0.7554,
"step": 5350
},
{
"epoch": 0.9009917633215666,
"grad_norm": 59602.8515625,
"learning_rate": 4.950411833921668e-06,
"loss": 0.7673,
"step": 5360
},
{
"epoch": 0.902672718103883,
"grad_norm": 61461.9375,
"learning_rate": 4.8663640948058495e-06,
"loss": 0.7082,
"step": 5370
},
{
"epoch": 0.9043536728861994,
"grad_norm": 64041.15625,
"learning_rate": 4.782316355690032e-06,
"loss": 0.682,
"step": 5380
},
{
"epoch": 0.9060346276685157,
"grad_norm": 67531.3046875,
"learning_rate": 4.698268616574215e-06,
"loss": 0.8184,
"step": 5390
},
{
"epoch": 0.9077155824508321,
"grad_norm": 64709.15625,
"learning_rate": 4.6142208774583965e-06,
"loss": 0.7433,
"step": 5400
},
{
"epoch": 0.9093965372331484,
"grad_norm": 73473.203125,
"learning_rate": 4.530173138342579e-06,
"loss": 0.7386,
"step": 5410
},
{
"epoch": 0.9110774920154647,
"grad_norm": 66414.84375,
"learning_rate": 4.446125399226761e-06,
"loss": 0.7839,
"step": 5420
},
{
"epoch": 0.9127584467977812,
"grad_norm": 73395.6015625,
"learning_rate": 4.3620776601109435e-06,
"loss": 0.7699,
"step": 5430
},
{
"epoch": 0.9144394015800975,
"grad_norm": 60129.15625,
"learning_rate": 4.278029920995125e-06,
"loss": 0.7111,
"step": 5440
},
{
"epoch": 0.9161203563624138,
"grad_norm": 81137.3203125,
"learning_rate": 4.193982181879308e-06,
"loss": 0.7419,
"step": 5450
},
{
"epoch": 0.9178013111447302,
"grad_norm": 69321.53125,
"learning_rate": 4.10993444276349e-06,
"loss": 0.6546,
"step": 5460
},
{
"epoch": 0.9194822659270465,
"grad_norm": 73445.609375,
"learning_rate": 4.025886703647672e-06,
"loss": 0.7571,
"step": 5470
},
{
"epoch": 0.921163220709363,
"grad_norm": 55211.56640625,
"learning_rate": 3.941838964531854e-06,
"loss": 0.7361,
"step": 5480
},
{
"epoch": 0.9228441754916793,
"grad_norm": 60575.71484375,
"learning_rate": 3.857791225416037e-06,
"loss": 0.7854,
"step": 5490
},
{
"epoch": 0.9245251302739956,
"grad_norm": 70186.140625,
"learning_rate": 3.7737434863002185e-06,
"loss": 0.8255,
"step": 5500
},
{
"epoch": 0.926206085056312,
"grad_norm": 73100.078125,
"learning_rate": 3.689695747184401e-06,
"loss": 0.7469,
"step": 5510
},
{
"epoch": 0.9278870398386283,
"grad_norm": 75805.078125,
"learning_rate": 3.605648008068583e-06,
"loss": 0.7616,
"step": 5520
},
{
"epoch": 0.9295679946209447,
"grad_norm": 77849.140625,
"learning_rate": 3.5216002689527655e-06,
"loss": 0.7562,
"step": 5530
},
{
"epoch": 0.9312489494032611,
"grad_norm": 68207.265625,
"learning_rate": 3.4375525298369473e-06,
"loss": 0.6813,
"step": 5540
},
{
"epoch": 0.9329299041855774,
"grad_norm": 71734.1640625,
"learning_rate": 3.35350479072113e-06,
"loss": 0.6668,
"step": 5550
},
{
"epoch": 0.9346108589678938,
"grad_norm": 80375.4375,
"learning_rate": 3.2694570516053117e-06,
"loss": 0.7542,
"step": 5560
},
{
"epoch": 0.9362918137502101,
"grad_norm": 76297.75,
"learning_rate": 3.1854093124894943e-06,
"loss": 0.7539,
"step": 5570
},
{
"epoch": 0.9379727685325264,
"grad_norm": 59915.84765625,
"learning_rate": 3.1013615733736765e-06,
"loss": 0.6965,
"step": 5580
},
{
"epoch": 0.9396537233148429,
"grad_norm": 66482.078125,
"learning_rate": 3.0173138342578587e-06,
"loss": 0.7416,
"step": 5590
},
{
"epoch": 0.9413346780971592,
"grad_norm": 73498.7578125,
"learning_rate": 2.933266095142041e-06,
"loss": 0.7404,
"step": 5600
},
{
"epoch": 0.9430156328794755,
"grad_norm": 56677.17578125,
"learning_rate": 2.849218356026223e-06,
"loss": 0.7199,
"step": 5610
},
{
"epoch": 0.9446965876617919,
"grad_norm": 63067.734375,
"learning_rate": 2.7651706169104053e-06,
"loss": 0.738,
"step": 5620
},
{
"epoch": 0.9463775424441082,
"grad_norm": 65398.22265625,
"learning_rate": 2.6811228777945875e-06,
"loss": 0.7177,
"step": 5630
},
{
"epoch": 0.9480584972264247,
"grad_norm": 56372.125,
"learning_rate": 2.5970751386787698e-06,
"loss": 0.7594,
"step": 5640
},
{
"epoch": 0.949739452008741,
"grad_norm": 66133.2109375,
"learning_rate": 2.513027399562952e-06,
"loss": 0.755,
"step": 5650
},
{
"epoch": 0.9514204067910573,
"grad_norm": 73855.5859375,
"learning_rate": 2.428979660447134e-06,
"loss": 0.7286,
"step": 5660
},
{
"epoch": 0.9531013615733737,
"grad_norm": 80979.5703125,
"learning_rate": 2.3449319213313164e-06,
"loss": 0.7508,
"step": 5670
},
{
"epoch": 0.95478231635569,
"grad_norm": 59096.21484375,
"learning_rate": 2.2608841822154986e-06,
"loss": 0.7273,
"step": 5680
},
{
"epoch": 0.9564632711380063,
"grad_norm": 73199.34375,
"learning_rate": 2.1768364430996808e-06,
"loss": 0.8463,
"step": 5690
},
{
"epoch": 0.9581442259203228,
"grad_norm": 70557.265625,
"learning_rate": 2.092788703983863e-06,
"loss": 0.7518,
"step": 5700
},
{
"epoch": 0.9598251807026391,
"grad_norm": 64123.15234375,
"learning_rate": 2.008740964868045e-06,
"loss": 0.6743,
"step": 5710
},
{
"epoch": 0.9615061354849554,
"grad_norm": 70495.03125,
"learning_rate": 1.9246932257522274e-06,
"loss": 0.7179,
"step": 5720
},
{
"epoch": 0.9631870902672718,
"grad_norm": 62604.77734375,
"learning_rate": 1.8406454866364096e-06,
"loss": 0.7073,
"step": 5730
},
{
"epoch": 0.9648680450495881,
"grad_norm": 69528.75,
"learning_rate": 1.7565977475205918e-06,
"loss": 0.7738,
"step": 5740
},
{
"epoch": 0.9665489998319046,
"grad_norm": 64379.51953125,
"learning_rate": 1.672550008404774e-06,
"loss": 0.774,
"step": 5750
},
{
"epoch": 0.9682299546142209,
"grad_norm": 70070.359375,
"learning_rate": 1.5885022692889562e-06,
"loss": 0.7902,
"step": 5760
},
{
"epoch": 0.9699109093965372,
"grad_norm": 75525.7265625,
"learning_rate": 1.5044545301731386e-06,
"loss": 0.754,
"step": 5770
},
{
"epoch": 0.9715918641788536,
"grad_norm": 72437.4375,
"learning_rate": 1.4204067910573208e-06,
"loss": 0.7122,
"step": 5780
},
{
"epoch": 0.9732728189611699,
"grad_norm": 55177.8125,
"learning_rate": 1.3363590519415028e-06,
"loss": 0.7083,
"step": 5790
},
{
"epoch": 0.9749537737434864,
"grad_norm": 74118.140625,
"learning_rate": 1.252311312825685e-06,
"loss": 0.7124,
"step": 5800
},
{
"epoch": 0.9766347285258027,
"grad_norm": 75796.015625,
"learning_rate": 1.1682635737098672e-06,
"loss": 0.7563,
"step": 5810
},
{
"epoch": 0.978315683308119,
"grad_norm": 66329.46875,
"learning_rate": 1.0842158345940494e-06,
"loss": 0.7379,
"step": 5820
},
{
"epoch": 0.9799966380904354,
"grad_norm": 71971.1796875,
"learning_rate": 1.0001680954782316e-06,
"loss": 0.7237,
"step": 5830
},
{
"epoch": 0.9816775928727517,
"grad_norm": 70406.546875,
"learning_rate": 9.161203563624139e-07,
"loss": 0.7518,
"step": 5840
},
{
"epoch": 0.983358547655068,
"grad_norm": 65358.0,
"learning_rate": 8.320726172465961e-07,
"loss": 0.7263,
"step": 5850
},
{
"epoch": 0.9850395024373845,
"grad_norm": 56430.7265625,
"learning_rate": 7.480248781307783e-07,
"loss": 0.7174,
"step": 5860
},
{
"epoch": 0.9867204572197008,
"grad_norm": 69264.21875,
"learning_rate": 6.639771390149606e-07,
"loss": 0.7782,
"step": 5870
},
{
"epoch": 0.9884014120020171,
"grad_norm": 60301.59765625,
"learning_rate": 5.799293998991428e-07,
"loss": 0.7487,
"step": 5880
},
{
"epoch": 0.9900823667843335,
"grad_norm": 66026.0078125,
"learning_rate": 4.95881660783325e-07,
"loss": 0.7565,
"step": 5890
},
{
"epoch": 0.9917633215666498,
"grad_norm": 63975.43359375,
"learning_rate": 4.1183392166750716e-07,
"loss": 0.7349,
"step": 5900
},
{
"epoch": 0.9934442763489663,
"grad_norm": 72585.8046875,
"learning_rate": 3.2778618255168936e-07,
"loss": 0.7754,
"step": 5910
},
{
"epoch": 0.9951252311312826,
"grad_norm": 67646.4296875,
"learning_rate": 2.4373844343587156e-07,
"loss": 0.7622,
"step": 5920
},
{
"epoch": 0.9968061859135989,
"grad_norm": 74414.9375,
"learning_rate": 1.596907043200538e-07,
"loss": 0.7312,
"step": 5930
},
{
"epoch": 0.9984871406959153,
"grad_norm": 69075.203125,
"learning_rate": 7.5642965204236e-08,
"loss": 0.7293,
"step": 5940
},
{
"epoch": 1.0,
"step": 5949,
"total_flos": 9.074524143432499e+17,
"train_loss": 0.8014113598860578,
"train_runtime": 82603.7739,
"train_samples_per_second": 0.864,
"train_steps_per_second": 0.072
}
],
"logging_steps": 10,
"max_steps": 5949,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5949,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.074524143432499e+17,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}