mamba-790m-chat / trainer_state.json
voidful's picture
Upload folder using huggingface_hub
2dbd746 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5936927565773118,
"eval_steps": 500,
"global_step": 120000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 296.6432189941406,
"learning_rate": 2.0000000000000002e-07,
"loss": 36.1442,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 282.62713623046875,
"learning_rate": 4.0000000000000003e-07,
"loss": 36.2463,
"step": 200
},
{
"epoch": 0.0,
"grad_norm": 271.8738098144531,
"learning_rate": 6.000000000000001e-07,
"loss": 36.3589,
"step": 300
},
{
"epoch": 0.0,
"grad_norm": 287.5734558105469,
"learning_rate": 8.000000000000001e-07,
"loss": 36.0855,
"step": 400
},
{
"epoch": 0.0,
"grad_norm": 259.08685302734375,
"learning_rate": 1.0000000000000002e-06,
"loss": 35.8685,
"step": 500
},
{
"epoch": 0.0,
"grad_norm": 292.2701416015625,
"learning_rate": 1.2000000000000002e-06,
"loss": 35.4046,
"step": 600
},
{
"epoch": 0.0,
"grad_norm": 307.0222473144531,
"learning_rate": 1.4000000000000001e-06,
"loss": 34.6875,
"step": 700
},
{
"epoch": 0.0,
"grad_norm": 274.7489929199219,
"learning_rate": 1.6000000000000001e-06,
"loss": 34.1517,
"step": 800
},
{
"epoch": 0.0,
"grad_norm": 240.46612548828125,
"learning_rate": 1.8000000000000001e-06,
"loss": 33.4424,
"step": 900
},
{
"epoch": 0.0,
"grad_norm": 268.32684326171875,
"learning_rate": 2.0000000000000003e-06,
"loss": 32.5807,
"step": 1000
},
{
"epoch": 0.01,
"grad_norm": 258.87274169921875,
"learning_rate": 2.2e-06,
"loss": 30.8752,
"step": 1100
},
{
"epoch": 0.01,
"grad_norm": 288.45611572265625,
"learning_rate": 2.4000000000000003e-06,
"loss": 29.5351,
"step": 1200
},
{
"epoch": 0.01,
"grad_norm": 261.72149658203125,
"learning_rate": 2.6e-06,
"loss": 27.398,
"step": 1300
},
{
"epoch": 0.01,
"grad_norm": 331.3612365722656,
"learning_rate": 2.8000000000000003e-06,
"loss": 24.6465,
"step": 1400
},
{
"epoch": 0.01,
"grad_norm": 215.5654296875,
"learning_rate": 3e-06,
"loss": 21.7979,
"step": 1500
},
{
"epoch": 0.01,
"grad_norm": 222.94651794433594,
"learning_rate": 3.2000000000000003e-06,
"loss": 18.6465,
"step": 1600
},
{
"epoch": 0.01,
"grad_norm": 252.55087280273438,
"learning_rate": 3.4000000000000005e-06,
"loss": 15.7462,
"step": 1700
},
{
"epoch": 0.01,
"grad_norm": 273.9644470214844,
"learning_rate": 3.6000000000000003e-06,
"loss": 13.9379,
"step": 1800
},
{
"epoch": 0.01,
"grad_norm": 205.786376953125,
"learning_rate": 3.8000000000000005e-06,
"loss": 12.2574,
"step": 1900
},
{
"epoch": 0.01,
"grad_norm": 151.63124084472656,
"learning_rate": 4.000000000000001e-06,
"loss": 9.865,
"step": 2000
},
{
"epoch": 0.01,
"grad_norm": 120.38298034667969,
"learning_rate": 4.2000000000000004e-06,
"loss": 8.9936,
"step": 2100
},
{
"epoch": 0.01,
"grad_norm": 93.93321990966797,
"learning_rate": 4.4e-06,
"loss": 8.3415,
"step": 2200
},
{
"epoch": 0.01,
"grad_norm": 102.68135833740234,
"learning_rate": 4.600000000000001e-06,
"loss": 7.4711,
"step": 2300
},
{
"epoch": 0.01,
"grad_norm": 250.72817993164062,
"learning_rate": 4.800000000000001e-06,
"loss": 6.5714,
"step": 2400
},
{
"epoch": 0.01,
"grad_norm": 249.8506317138672,
"learning_rate": 5e-06,
"loss": 5.9448,
"step": 2500
},
{
"epoch": 0.01,
"grad_norm": 66.83155059814453,
"learning_rate": 5.2e-06,
"loss": 5.6368,
"step": 2600
},
{
"epoch": 0.01,
"grad_norm": 51.391082763671875,
"learning_rate": 5.400000000000001e-06,
"loss": 4.8538,
"step": 2700
},
{
"epoch": 0.01,
"grad_norm": 50.51924133300781,
"learning_rate": 5.600000000000001e-06,
"loss": 4.5733,
"step": 2800
},
{
"epoch": 0.01,
"grad_norm": 33.91701889038086,
"learning_rate": 5.8e-06,
"loss": 4.1586,
"step": 2900
},
{
"epoch": 0.01,
"grad_norm": 41.544532775878906,
"learning_rate": 6e-06,
"loss": 3.8914,
"step": 3000
},
{
"epoch": 0.02,
"grad_norm": 44.3348274230957,
"learning_rate": 6.200000000000001e-06,
"loss": 3.4145,
"step": 3100
},
{
"epoch": 0.02,
"grad_norm": 27.11107063293457,
"learning_rate": 6.4000000000000006e-06,
"loss": 3.2646,
"step": 3200
},
{
"epoch": 0.02,
"grad_norm": 144.55479431152344,
"learning_rate": 6.600000000000001e-06,
"loss": 3.1211,
"step": 3300
},
{
"epoch": 0.02,
"grad_norm": 16.845191955566406,
"learning_rate": 6.800000000000001e-06,
"loss": 3.0997,
"step": 3400
},
{
"epoch": 0.02,
"grad_norm": 13.76279067993164,
"learning_rate": 7e-06,
"loss": 2.7326,
"step": 3500
},
{
"epoch": 0.02,
"grad_norm": 10.799291610717773,
"learning_rate": 7.2000000000000005e-06,
"loss": 2.743,
"step": 3600
},
{
"epoch": 0.02,
"grad_norm": 10.540129661560059,
"learning_rate": 7.4e-06,
"loss": 2.6056,
"step": 3700
},
{
"epoch": 0.02,
"grad_norm": 11.966181755065918,
"learning_rate": 7.600000000000001e-06,
"loss": 2.4526,
"step": 3800
},
{
"epoch": 0.02,
"grad_norm": 16.517465591430664,
"learning_rate": 7.800000000000002e-06,
"loss": 2.419,
"step": 3900
},
{
"epoch": 0.02,
"grad_norm": 11.533743858337402,
"learning_rate": 8.000000000000001e-06,
"loss": 2.3781,
"step": 4000
},
{
"epoch": 0.02,
"grad_norm": 12.458216667175293,
"learning_rate": 8.2e-06,
"loss": 2.296,
"step": 4100
},
{
"epoch": 0.02,
"grad_norm": 7.576855182647705,
"learning_rate": 8.400000000000001e-06,
"loss": 2.269,
"step": 4200
},
{
"epoch": 0.02,
"grad_norm": 6.14406681060791,
"learning_rate": 8.6e-06,
"loss": 2.2047,
"step": 4300
},
{
"epoch": 0.02,
"grad_norm": 8.389999389648438,
"learning_rate": 8.8e-06,
"loss": 2.1709,
"step": 4400
},
{
"epoch": 0.02,
"grad_norm": 9.2235689163208,
"learning_rate": 9e-06,
"loss": 2.0764,
"step": 4500
},
{
"epoch": 0.02,
"grad_norm": 19.56966209411621,
"learning_rate": 9.200000000000002e-06,
"loss": 2.0616,
"step": 4600
},
{
"epoch": 0.02,
"grad_norm": 14.260141372680664,
"learning_rate": 9.4e-06,
"loss": 2.0158,
"step": 4700
},
{
"epoch": 0.02,
"grad_norm": 6.950816631317139,
"learning_rate": 9.600000000000001e-06,
"loss": 1.9903,
"step": 4800
},
{
"epoch": 0.02,
"grad_norm": 6.523265361785889,
"learning_rate": 9.800000000000001e-06,
"loss": 1.9903,
"step": 4900
},
{
"epoch": 0.02,
"grad_norm": 5.682758331298828,
"learning_rate": 1e-05,
"loss": 2.0119,
"step": 5000
},
{
"epoch": 0.03,
"grad_norm": 6.691493988037109,
"learning_rate": 9.99937486520531e-06,
"loss": 1.966,
"step": 5100
},
{
"epoch": 0.03,
"grad_norm": 6.045505046844482,
"learning_rate": 9.99874973041062e-06,
"loss": 1.9175,
"step": 5200
},
{
"epoch": 0.03,
"grad_norm": 5.446534633636475,
"learning_rate": 9.99812459561593e-06,
"loss": 1.9086,
"step": 5300
},
{
"epoch": 0.03,
"grad_norm": 6.22329044342041,
"learning_rate": 9.99749946082124e-06,
"loss": 1.8706,
"step": 5400
},
{
"epoch": 0.03,
"grad_norm": 6.028206825256348,
"learning_rate": 9.99687432602655e-06,
"loss": 1.8183,
"step": 5500
},
{
"epoch": 0.03,
"grad_norm": 5.474781036376953,
"learning_rate": 9.99624919123186e-06,
"loss": 1.9045,
"step": 5600
},
{
"epoch": 0.03,
"grad_norm": 4.7177886962890625,
"learning_rate": 9.99562405643717e-06,
"loss": 1.8141,
"step": 5700
},
{
"epoch": 0.03,
"grad_norm": 6.469454288482666,
"learning_rate": 9.99499892164248e-06,
"loss": 1.8079,
"step": 5800
},
{
"epoch": 0.03,
"grad_norm": 5.826772689819336,
"learning_rate": 9.99437378684779e-06,
"loss": 1.852,
"step": 5900
},
{
"epoch": 0.03,
"grad_norm": 4.368248462677002,
"learning_rate": 9.9937486520531e-06,
"loss": 1.8207,
"step": 6000
},
{
"epoch": 0.03,
"grad_norm": 6.686717987060547,
"learning_rate": 9.99312351725841e-06,
"loss": 1.7868,
"step": 6100
},
{
"epoch": 0.03,
"grad_norm": 4.562761306762695,
"learning_rate": 9.99249838246372e-06,
"loss": 1.8201,
"step": 6200
},
{
"epoch": 0.03,
"grad_norm": 4.788825511932373,
"learning_rate": 9.99187324766903e-06,
"loss": 1.7568,
"step": 6300
},
{
"epoch": 0.03,
"grad_norm": 5.2414870262146,
"learning_rate": 9.991248112874338e-06,
"loss": 1.7635,
"step": 6400
},
{
"epoch": 0.03,
"grad_norm": 4.775527477264404,
"learning_rate": 9.99062297807965e-06,
"loss": 1.7465,
"step": 6500
},
{
"epoch": 0.03,
"grad_norm": 5.719698429107666,
"learning_rate": 9.989997843284958e-06,
"loss": 1.7776,
"step": 6600
},
{
"epoch": 0.03,
"grad_norm": 5.103610515594482,
"learning_rate": 9.98937270849027e-06,
"loss": 1.7364,
"step": 6700
},
{
"epoch": 0.03,
"grad_norm": 5.553420543670654,
"learning_rate": 9.988747573695578e-06,
"loss": 1.7341,
"step": 6800
},
{
"epoch": 0.03,
"grad_norm": 4.687087535858154,
"learning_rate": 9.98812243890089e-06,
"loss": 1.7586,
"step": 6900
},
{
"epoch": 0.03,
"grad_norm": 6.242082595825195,
"learning_rate": 9.987497304106198e-06,
"loss": 1.7255,
"step": 7000
},
{
"epoch": 0.04,
"grad_norm": 7.58695650100708,
"learning_rate": 9.98687216931151e-06,
"loss": 1.724,
"step": 7100
},
{
"epoch": 0.04,
"grad_norm": 4.764819622039795,
"learning_rate": 9.986247034516818e-06,
"loss": 1.6934,
"step": 7200
},
{
"epoch": 0.04,
"grad_norm": 5.300253868103027,
"learning_rate": 9.985621899722129e-06,
"loss": 1.6773,
"step": 7300
},
{
"epoch": 0.04,
"grad_norm": 4.917991638183594,
"learning_rate": 9.984996764927437e-06,
"loss": 1.7492,
"step": 7400
},
{
"epoch": 0.04,
"grad_norm": 9.860074043273926,
"learning_rate": 9.984371630132749e-06,
"loss": 1.6835,
"step": 7500
},
{
"epoch": 0.04,
"grad_norm": 4.517050743103027,
"learning_rate": 9.983746495338059e-06,
"loss": 1.6981,
"step": 7600
},
{
"epoch": 0.04,
"grad_norm": 4.88366174697876,
"learning_rate": 9.983121360543367e-06,
"loss": 1.7226,
"step": 7700
},
{
"epoch": 0.04,
"grad_norm": 4.612452983856201,
"learning_rate": 9.982496225748679e-06,
"loss": 1.6904,
"step": 7800
},
{
"epoch": 0.04,
"grad_norm": 4.865972518920898,
"learning_rate": 9.981871090953987e-06,
"loss": 1.6969,
"step": 7900
},
{
"epoch": 0.04,
"grad_norm": 4.375401973724365,
"learning_rate": 9.981245956159299e-06,
"loss": 1.6524,
"step": 8000
},
{
"epoch": 0.04,
"grad_norm": 5.660288333892822,
"learning_rate": 9.980620821364607e-06,
"loss": 1.6866,
"step": 8100
},
{
"epoch": 0.04,
"grad_norm": 4.874125957489014,
"learning_rate": 9.979995686569918e-06,
"loss": 1.6697,
"step": 8200
},
{
"epoch": 0.04,
"grad_norm": 5.102114200592041,
"learning_rate": 9.979370551775227e-06,
"loss": 1.6429,
"step": 8300
},
{
"epoch": 0.04,
"grad_norm": 4.398207664489746,
"learning_rate": 9.978745416980538e-06,
"loss": 1.6489,
"step": 8400
},
{
"epoch": 0.04,
"grad_norm": 8.623647689819336,
"learning_rate": 9.978120282185847e-06,
"loss": 1.6278,
"step": 8500
},
{
"epoch": 0.04,
"grad_norm": 4.62777853012085,
"learning_rate": 9.977495147391158e-06,
"loss": 1.6555,
"step": 8600
},
{
"epoch": 0.04,
"grad_norm": 5.873004913330078,
"learning_rate": 9.976870012596466e-06,
"loss": 1.6624,
"step": 8700
},
{
"epoch": 0.04,
"grad_norm": 4.456192493438721,
"learning_rate": 9.976244877801778e-06,
"loss": 1.6201,
"step": 8800
},
{
"epoch": 0.04,
"grad_norm": 5.2842183113098145,
"learning_rate": 9.975619743007086e-06,
"loss": 1.6334,
"step": 8900
},
{
"epoch": 0.04,
"grad_norm": 5.298410415649414,
"learning_rate": 9.974994608212396e-06,
"loss": 1.6472,
"step": 9000
},
{
"epoch": 0.05,
"grad_norm": 5.887086391448975,
"learning_rate": 9.974369473417706e-06,
"loss": 1.6124,
"step": 9100
},
{
"epoch": 0.05,
"grad_norm": 4.660162925720215,
"learning_rate": 9.973744338623016e-06,
"loss": 1.6131,
"step": 9200
},
{
"epoch": 0.05,
"grad_norm": 5.750434398651123,
"learning_rate": 9.973119203828326e-06,
"loss": 1.6205,
"step": 9300
},
{
"epoch": 0.05,
"grad_norm": 5.6416707038879395,
"learning_rate": 9.972494069033636e-06,
"loss": 1.619,
"step": 9400
},
{
"epoch": 0.05,
"grad_norm": 4.29620361328125,
"learning_rate": 9.971868934238946e-06,
"loss": 1.6062,
"step": 9500
},
{
"epoch": 0.05,
"grad_norm": 4.172244071960449,
"learning_rate": 9.971243799444256e-06,
"loss": 1.6139,
"step": 9600
},
{
"epoch": 0.05,
"grad_norm": 3.9731390476226807,
"learning_rate": 9.970618664649566e-06,
"loss": 1.5706,
"step": 9700
},
{
"epoch": 0.05,
"grad_norm": 4.9260454177856445,
"learning_rate": 9.969993529854876e-06,
"loss": 1.6224,
"step": 9800
},
{
"epoch": 0.05,
"grad_norm": 5.228986740112305,
"learning_rate": 9.969368395060186e-06,
"loss": 1.5969,
"step": 9900
},
{
"epoch": 0.05,
"grad_norm": 5.494061470031738,
"learning_rate": 9.968743260265495e-06,
"loss": 1.6129,
"step": 10000
},
{
"epoch": 0.05,
"grad_norm": 4.893834590911865,
"learning_rate": 9.968118125470805e-06,
"loss": 1.5918,
"step": 10100
},
{
"epoch": 0.05,
"grad_norm": 4.201370716094971,
"learning_rate": 9.967492990676115e-06,
"loss": 1.571,
"step": 10200
},
{
"epoch": 0.05,
"grad_norm": 6.3033576011657715,
"learning_rate": 9.966867855881425e-06,
"loss": 1.6216,
"step": 10300
},
{
"epoch": 0.05,
"grad_norm": 5.211835861206055,
"learning_rate": 9.966242721086735e-06,
"loss": 1.6006,
"step": 10400
},
{
"epoch": 0.05,
"grad_norm": 4.3779730796813965,
"learning_rate": 9.965617586292045e-06,
"loss": 1.6184,
"step": 10500
},
{
"epoch": 0.05,
"grad_norm": 4.778099060058594,
"learning_rate": 9.964992451497355e-06,
"loss": 1.5855,
"step": 10600
},
{
"epoch": 0.05,
"grad_norm": 7.489856243133545,
"learning_rate": 9.964367316702665e-06,
"loss": 1.5827,
"step": 10700
},
{
"epoch": 0.05,
"grad_norm": 4.601972579956055,
"learning_rate": 9.963742181907975e-06,
"loss": 1.6029,
"step": 10800
},
{
"epoch": 0.05,
"grad_norm": 4.222909450531006,
"learning_rate": 9.963117047113285e-06,
"loss": 1.6027,
"step": 10900
},
{
"epoch": 0.05,
"grad_norm": 4.561893939971924,
"learning_rate": 9.962491912318595e-06,
"loss": 1.592,
"step": 11000
},
{
"epoch": 0.05,
"grad_norm": 4.908820152282715,
"learning_rate": 9.961866777523905e-06,
"loss": 1.612,
"step": 11100
},
{
"epoch": 0.06,
"grad_norm": 4.185163974761963,
"learning_rate": 9.961241642729215e-06,
"loss": 1.5834,
"step": 11200
},
{
"epoch": 0.06,
"grad_norm": 5.215177536010742,
"learning_rate": 9.960616507934524e-06,
"loss": 1.5741,
"step": 11300
},
{
"epoch": 0.06,
"grad_norm": 6.018292427062988,
"learning_rate": 9.959991373139834e-06,
"loss": 1.5687,
"step": 11400
},
{
"epoch": 0.06,
"grad_norm": 6.539705276489258,
"learning_rate": 9.959366238345144e-06,
"loss": 1.5967,
"step": 11500
},
{
"epoch": 0.06,
"grad_norm": 5.168763637542725,
"learning_rate": 9.958741103550454e-06,
"loss": 1.5716,
"step": 11600
},
{
"epoch": 0.06,
"grad_norm": 4.548024654388428,
"learning_rate": 9.958115968755764e-06,
"loss": 1.5576,
"step": 11700
},
{
"epoch": 0.06,
"grad_norm": 5.756062030792236,
"learning_rate": 9.957490833961072e-06,
"loss": 1.5742,
"step": 11800
},
{
"epoch": 0.06,
"grad_norm": 5.219858646392822,
"learning_rate": 9.956865699166384e-06,
"loss": 1.5807,
"step": 11900
},
{
"epoch": 0.06,
"grad_norm": 4.460545063018799,
"learning_rate": 9.956240564371692e-06,
"loss": 1.5859,
"step": 12000
},
{
"epoch": 0.06,
"grad_norm": 4.683807849884033,
"learning_rate": 9.955615429577004e-06,
"loss": 1.5769,
"step": 12100
},
{
"epoch": 0.06,
"grad_norm": 6.083448886871338,
"learning_rate": 9.954990294782312e-06,
"loss": 1.5701,
"step": 12200
},
{
"epoch": 0.06,
"grad_norm": 5.151342391967773,
"learning_rate": 9.954365159987624e-06,
"loss": 1.5834,
"step": 12300
},
{
"epoch": 0.06,
"grad_norm": 4.23958158493042,
"learning_rate": 9.953740025192932e-06,
"loss": 1.5797,
"step": 12400
},
{
"epoch": 0.06,
"grad_norm": 5.883495330810547,
"learning_rate": 9.953114890398244e-06,
"loss": 1.5584,
"step": 12500
},
{
"epoch": 0.06,
"grad_norm": 4.5151190757751465,
"learning_rate": 9.952489755603552e-06,
"loss": 1.5723,
"step": 12600
},
{
"epoch": 0.06,
"grad_norm": 4.234920024871826,
"learning_rate": 9.951864620808863e-06,
"loss": 1.5404,
"step": 12700
},
{
"epoch": 0.06,
"grad_norm": 4.942254543304443,
"learning_rate": 9.951239486014173e-06,
"loss": 1.5373,
"step": 12800
},
{
"epoch": 0.06,
"grad_norm": 4.068475723266602,
"learning_rate": 9.950614351219483e-06,
"loss": 1.5584,
"step": 12900
},
{
"epoch": 0.06,
"grad_norm": 4.250300407409668,
"learning_rate": 9.949989216424793e-06,
"loss": 1.6023,
"step": 13000
},
{
"epoch": 0.06,
"grad_norm": 4.279661178588867,
"learning_rate": 9.949364081630101e-06,
"loss": 1.5517,
"step": 13100
},
{
"epoch": 0.07,
"grad_norm": 4.5876946449279785,
"learning_rate": 9.948738946835413e-06,
"loss": 1.5142,
"step": 13200
},
{
"epoch": 0.07,
"grad_norm": 4.220710754394531,
"learning_rate": 9.948113812040721e-06,
"loss": 1.601,
"step": 13300
},
{
"epoch": 0.07,
"grad_norm": 4.183436870574951,
"learning_rate": 9.947488677246033e-06,
"loss": 1.5508,
"step": 13400
},
{
"epoch": 0.07,
"grad_norm": 4.20064640045166,
"learning_rate": 9.946863542451341e-06,
"loss": 1.532,
"step": 13500
},
{
"epoch": 0.07,
"grad_norm": 5.151244640350342,
"learning_rate": 9.946238407656653e-06,
"loss": 1.5256,
"step": 13600
},
{
"epoch": 0.07,
"grad_norm": 4.308995246887207,
"learning_rate": 9.945613272861961e-06,
"loss": 1.5349,
"step": 13700
},
{
"epoch": 0.07,
"grad_norm": 5.477377891540527,
"learning_rate": 9.944988138067273e-06,
"loss": 1.5365,
"step": 13800
},
{
"epoch": 0.07,
"grad_norm": 5.085025310516357,
"learning_rate": 9.944363003272581e-06,
"loss": 1.5038,
"step": 13900
},
{
"epoch": 0.07,
"grad_norm": 4.769080638885498,
"learning_rate": 9.943737868477893e-06,
"loss": 1.5387,
"step": 14000
},
{
"epoch": 0.07,
"grad_norm": 4.0054931640625,
"learning_rate": 9.9431127336832e-06,
"loss": 1.5018,
"step": 14100
},
{
"epoch": 0.07,
"grad_norm": 4.712356090545654,
"learning_rate": 9.942487598888512e-06,
"loss": 1.5049,
"step": 14200
},
{
"epoch": 0.07,
"grad_norm": 4.574007034301758,
"learning_rate": 9.94186246409382e-06,
"loss": 1.5316,
"step": 14300
},
{
"epoch": 0.07,
"grad_norm": 4.079704761505127,
"learning_rate": 9.94123732929913e-06,
"loss": 1.527,
"step": 14400
},
{
"epoch": 0.07,
"grad_norm": 4.1134490966796875,
"learning_rate": 9.94061219450444e-06,
"loss": 1.5472,
"step": 14500
},
{
"epoch": 0.07,
"grad_norm": 5.486052989959717,
"learning_rate": 9.93998705970975e-06,
"loss": 1.5063,
"step": 14600
},
{
"epoch": 0.07,
"grad_norm": 4.8249921798706055,
"learning_rate": 9.93936192491506e-06,
"loss": 1.5116,
"step": 14700
},
{
"epoch": 0.07,
"grad_norm": 4.1462931632995605,
"learning_rate": 9.93873679012037e-06,
"loss": 1.5259,
"step": 14800
},
{
"epoch": 0.07,
"grad_norm": 4.222506999969482,
"learning_rate": 9.93811165532568e-06,
"loss": 1.5378,
"step": 14900
},
{
"epoch": 0.07,
"grad_norm": 3.8890185356140137,
"learning_rate": 9.93748652053099e-06,
"loss": 1.5424,
"step": 15000
},
{
"epoch": 0.07,
"grad_norm": 6.006450176239014,
"learning_rate": 9.9368613857363e-06,
"loss": 1.5276,
"step": 15100
},
{
"epoch": 0.08,
"grad_norm": 4.570881366729736,
"learning_rate": 9.93623625094161e-06,
"loss": 1.5173,
"step": 15200
},
{
"epoch": 0.08,
"grad_norm": 4.1144633293151855,
"learning_rate": 9.93561111614692e-06,
"loss": 1.4905,
"step": 15300
},
{
"epoch": 0.08,
"grad_norm": 5.034255027770996,
"learning_rate": 9.93498598135223e-06,
"loss": 1.5321,
"step": 15400
},
{
"epoch": 0.08,
"grad_norm": 4.831255912780762,
"learning_rate": 9.93436084655754e-06,
"loss": 1.5297,
"step": 15500
},
{
"epoch": 0.08,
"grad_norm": 4.800346851348877,
"learning_rate": 9.93373571176285e-06,
"loss": 1.5109,
"step": 15600
},
{
"epoch": 0.08,
"grad_norm": 4.187744617462158,
"learning_rate": 9.93311057696816e-06,
"loss": 1.4896,
"step": 15700
},
{
"epoch": 0.08,
"grad_norm": 4.569481372833252,
"learning_rate": 9.93248544217347e-06,
"loss": 1.5254,
"step": 15800
},
{
"epoch": 0.08,
"grad_norm": 4.826055526733398,
"learning_rate": 9.93186030737878e-06,
"loss": 1.5003,
"step": 15900
},
{
"epoch": 0.08,
"grad_norm": 5.592813491821289,
"learning_rate": 9.93123517258409e-06,
"loss": 1.5386,
"step": 16000
},
{
"epoch": 0.08,
"grad_norm": 4.18519926071167,
"learning_rate": 9.9306100377894e-06,
"loss": 1.5192,
"step": 16100
},
{
"epoch": 0.08,
"grad_norm": 3.737257719039917,
"learning_rate": 9.92998490299471e-06,
"loss": 1.5049,
"step": 16200
},
{
"epoch": 0.08,
"grad_norm": 4.077634334564209,
"learning_rate": 9.929359768200019e-06,
"loss": 1.4786,
"step": 16300
},
{
"epoch": 0.08,
"grad_norm": 3.7141683101654053,
"learning_rate": 9.928734633405329e-06,
"loss": 1.5008,
"step": 16400
},
{
"epoch": 0.08,
"grad_norm": 5.656344413757324,
"learning_rate": 9.928109498610639e-06,
"loss": 1.4907,
"step": 16500
},
{
"epoch": 0.08,
"grad_norm": 4.02158784866333,
"learning_rate": 9.927484363815949e-06,
"loss": 1.5436,
"step": 16600
},
{
"epoch": 0.08,
"grad_norm": 4.094794273376465,
"learning_rate": 9.926859229021259e-06,
"loss": 1.5135,
"step": 16700
},
{
"epoch": 0.08,
"grad_norm": 5.3504533767700195,
"learning_rate": 9.926234094226569e-06,
"loss": 1.4825,
"step": 16800
},
{
"epoch": 0.08,
"grad_norm": 4.326951503753662,
"learning_rate": 9.925608959431879e-06,
"loss": 1.5042,
"step": 16900
},
{
"epoch": 0.08,
"grad_norm": 4.341583728790283,
"learning_rate": 9.924983824637187e-06,
"loss": 1.5239,
"step": 17000
},
{
"epoch": 0.08,
"grad_norm": 4.5446648597717285,
"learning_rate": 9.924358689842499e-06,
"loss": 1.5104,
"step": 17100
},
{
"epoch": 0.09,
"grad_norm": 4.787079811096191,
"learning_rate": 9.923733555047807e-06,
"loss": 1.4917,
"step": 17200
},
{
"epoch": 0.09,
"grad_norm": 4.259307384490967,
"learning_rate": 9.923108420253118e-06,
"loss": 1.5162,
"step": 17300
},
{
"epoch": 0.09,
"grad_norm": 4.553911209106445,
"learning_rate": 9.922483285458427e-06,
"loss": 1.4805,
"step": 17400
},
{
"epoch": 0.09,
"grad_norm": 4.846059322357178,
"learning_rate": 9.921858150663738e-06,
"loss": 1.51,
"step": 17500
},
{
"epoch": 0.09,
"grad_norm": 4.385834217071533,
"learning_rate": 9.921233015869046e-06,
"loss": 1.4883,
"step": 17600
},
{
"epoch": 0.09,
"grad_norm": 4.686222553253174,
"learning_rate": 9.920607881074358e-06,
"loss": 1.525,
"step": 17700
},
{
"epoch": 0.09,
"grad_norm": 3.7362844944000244,
"learning_rate": 9.919982746279666e-06,
"loss": 1.4877,
"step": 17800
},
{
"epoch": 0.09,
"grad_norm": 4.107117652893066,
"learning_rate": 9.919357611484978e-06,
"loss": 1.5353,
"step": 17900
},
{
"epoch": 0.09,
"grad_norm": 4.318368911743164,
"learning_rate": 9.918732476690288e-06,
"loss": 1.4682,
"step": 18000
},
{
"epoch": 0.09,
"grad_norm": 4.914721488952637,
"learning_rate": 9.918107341895598e-06,
"loss": 1.5261,
"step": 18100
},
{
"epoch": 0.09,
"grad_norm": 4.494168281555176,
"learning_rate": 9.917482207100908e-06,
"loss": 1.5013,
"step": 18200
},
{
"epoch": 0.09,
"grad_norm": 5.026334762573242,
"learning_rate": 9.916857072306216e-06,
"loss": 1.5093,
"step": 18300
},
{
"epoch": 0.09,
"grad_norm": 4.3312907218933105,
"learning_rate": 9.916231937511528e-06,
"loss": 1.5224,
"step": 18400
},
{
"epoch": 0.09,
"grad_norm": 3.9422335624694824,
"learning_rate": 9.915606802716836e-06,
"loss": 1.5059,
"step": 18500
},
{
"epoch": 0.09,
"grad_norm": 4.773715496063232,
"learning_rate": 9.914981667922147e-06,
"loss": 1.5031,
"step": 18600
},
{
"epoch": 0.09,
"grad_norm": 5.202546119689941,
"learning_rate": 9.914356533127456e-06,
"loss": 1.5133,
"step": 18700
},
{
"epoch": 0.09,
"grad_norm": 4.315513610839844,
"learning_rate": 9.913731398332767e-06,
"loss": 1.5343,
"step": 18800
},
{
"epoch": 0.09,
"grad_norm": 4.325439929962158,
"learning_rate": 9.913106263538076e-06,
"loss": 1.4993,
"step": 18900
},
{
"epoch": 0.09,
"grad_norm": 4.189039707183838,
"learning_rate": 9.912481128743387e-06,
"loss": 1.4871,
"step": 19000
},
{
"epoch": 0.09,
"grad_norm": 4.019628524780273,
"learning_rate": 9.911855993948695e-06,
"loss": 1.5294,
"step": 19100
},
{
"epoch": 0.09,
"grad_norm": 3.682359457015991,
"learning_rate": 9.911230859154007e-06,
"loss": 1.4728,
"step": 19200
},
{
"epoch": 0.1,
"grad_norm": 3.9543027877807617,
"learning_rate": 9.910605724359315e-06,
"loss": 1.473,
"step": 19300
},
{
"epoch": 0.1,
"grad_norm": 3.9522476196289062,
"learning_rate": 9.909980589564627e-06,
"loss": 1.5023,
"step": 19400
},
{
"epoch": 0.1,
"grad_norm": 3.9608728885650635,
"learning_rate": 9.909355454769935e-06,
"loss": 1.4532,
"step": 19500
},
{
"epoch": 0.1,
"grad_norm": 4.24020528793335,
"learning_rate": 9.908730319975245e-06,
"loss": 1.4691,
"step": 19600
},
{
"epoch": 0.1,
"grad_norm": 4.217301845550537,
"learning_rate": 9.908105185180555e-06,
"loss": 1.4704,
"step": 19700
},
{
"epoch": 0.1,
"grad_norm": 3.651137113571167,
"learning_rate": 9.907480050385865e-06,
"loss": 1.4933,
"step": 19800
},
{
"epoch": 0.1,
"grad_norm": 4.657069206237793,
"learning_rate": 9.906854915591175e-06,
"loss": 1.4778,
"step": 19900
},
{
"epoch": 0.1,
"grad_norm": 4.250380992889404,
"learning_rate": 9.906229780796485e-06,
"loss": 1.4931,
"step": 20000
},
{
"epoch": 0.1,
"grad_norm": 4.308979034423828,
"learning_rate": 9.905604646001795e-06,
"loss": 1.4946,
"step": 20100
},
{
"epoch": 0.1,
"grad_norm": 4.854809284210205,
"learning_rate": 9.904979511207105e-06,
"loss": 1.4399,
"step": 20200
},
{
"epoch": 0.1,
"grad_norm": 4.725897789001465,
"learning_rate": 9.904354376412414e-06,
"loss": 1.4711,
"step": 20300
},
{
"epoch": 0.1,
"grad_norm": 4.014156818389893,
"learning_rate": 9.903729241617724e-06,
"loss": 1.5038,
"step": 20400
},
{
"epoch": 0.1,
"grad_norm": 6.402193546295166,
"learning_rate": 9.903104106823034e-06,
"loss": 1.4607,
"step": 20500
},
{
"epoch": 0.1,
"grad_norm": 4.356836795806885,
"learning_rate": 9.902478972028344e-06,
"loss": 1.4767,
"step": 20600
},
{
"epoch": 0.1,
"grad_norm": 4.174656867980957,
"learning_rate": 9.901853837233654e-06,
"loss": 1.4675,
"step": 20700
},
{
"epoch": 0.1,
"grad_norm": 3.668475866317749,
"learning_rate": 9.901228702438964e-06,
"loss": 1.4529,
"step": 20800
},
{
"epoch": 0.1,
"grad_norm": 3.7700912952423096,
"learning_rate": 9.900603567644274e-06,
"loss": 1.4521,
"step": 20900
},
{
"epoch": 0.1,
"grad_norm": 3.570835828781128,
"learning_rate": 9.899978432849584e-06,
"loss": 1.4323,
"step": 21000
},
{
"epoch": 0.1,
"grad_norm": 3.7499380111694336,
"learning_rate": 9.899353298054894e-06,
"loss": 1.4644,
"step": 21100
},
{
"epoch": 0.1,
"grad_norm": 3.8630640506744385,
"learning_rate": 9.898728163260204e-06,
"loss": 1.4801,
"step": 21200
},
{
"epoch": 0.11,
"grad_norm": 4.1705145835876465,
"learning_rate": 9.898103028465514e-06,
"loss": 1.4649,
"step": 21300
},
{
"epoch": 0.11,
"grad_norm": 4.312972545623779,
"learning_rate": 9.897477893670824e-06,
"loss": 1.4696,
"step": 21400
},
{
"epoch": 0.11,
"grad_norm": 4.0083088874816895,
"learning_rate": 9.896852758876134e-06,
"loss": 1.4557,
"step": 21500
},
{
"epoch": 0.11,
"grad_norm": 3.791517496109009,
"learning_rate": 9.896227624081444e-06,
"loss": 1.4507,
"step": 21600
},
{
"epoch": 0.11,
"grad_norm": 4.635531425476074,
"learning_rate": 9.895602489286753e-06,
"loss": 1.4922,
"step": 21700
},
{
"epoch": 0.11,
"grad_norm": 3.6790366172790527,
"learning_rate": 9.894977354492063e-06,
"loss": 1.456,
"step": 21800
},
{
"epoch": 0.11,
"grad_norm": 4.739284038543701,
"learning_rate": 9.894352219697373e-06,
"loss": 1.4993,
"step": 21900
},
{
"epoch": 0.11,
"grad_norm": 3.6052489280700684,
"learning_rate": 9.893727084902683e-06,
"loss": 1.4599,
"step": 22000
},
{
"epoch": 0.11,
"grad_norm": 4.582137584686279,
"learning_rate": 9.893101950107993e-06,
"loss": 1.464,
"step": 22100
},
{
"epoch": 0.11,
"grad_norm": 4.414693355560303,
"learning_rate": 9.892476815313303e-06,
"loss": 1.4383,
"step": 22200
},
{
"epoch": 0.11,
"grad_norm": 4.012635707855225,
"learning_rate": 9.891851680518613e-06,
"loss": 1.4496,
"step": 22300
},
{
"epoch": 0.11,
"grad_norm": 3.935889482498169,
"learning_rate": 9.891226545723921e-06,
"loss": 1.44,
"step": 22400
},
{
"epoch": 0.11,
"grad_norm": 3.735189199447632,
"learning_rate": 9.890601410929233e-06,
"loss": 1.4396,
"step": 22500
},
{
"epoch": 0.11,
"grad_norm": 7.265974998474121,
"learning_rate": 9.889976276134541e-06,
"loss": 1.4367,
"step": 22600
},
{
"epoch": 0.11,
"grad_norm": 3.6876790523529053,
"learning_rate": 9.889351141339853e-06,
"loss": 1.4477,
"step": 22700
},
{
"epoch": 0.11,
"grad_norm": 4.87882661819458,
"learning_rate": 9.888726006545161e-06,
"loss": 1.4687,
"step": 22800
},
{
"epoch": 0.11,
"grad_norm": 4.071088790893555,
"learning_rate": 9.888100871750473e-06,
"loss": 1.5096,
"step": 22900
},
{
"epoch": 0.11,
"grad_norm": 3.7322299480438232,
"learning_rate": 9.887475736955783e-06,
"loss": 1.4701,
"step": 23000
},
{
"epoch": 0.11,
"grad_norm": 4.085651397705078,
"learning_rate": 9.886850602161092e-06,
"loss": 1.4518,
"step": 23100
},
{
"epoch": 0.11,
"grad_norm": 3.951169729232788,
"learning_rate": 9.886225467366402e-06,
"loss": 1.4336,
"step": 23200
},
{
"epoch": 0.12,
"grad_norm": 4.026634693145752,
"learning_rate": 9.885600332571712e-06,
"loss": 1.476,
"step": 23300
},
{
"epoch": 0.12,
"grad_norm": 5.0271477699279785,
"learning_rate": 9.884975197777022e-06,
"loss": 1.4925,
"step": 23400
},
{
"epoch": 0.12,
"grad_norm": 3.727571725845337,
"learning_rate": 9.884350062982332e-06,
"loss": 1.452,
"step": 23500
},
{
"epoch": 0.12,
"grad_norm": 3.8245084285736084,
"learning_rate": 9.883724928187642e-06,
"loss": 1.4597,
"step": 23600
},
{
"epoch": 0.12,
"grad_norm": 4.508169174194336,
"learning_rate": 9.88309979339295e-06,
"loss": 1.4481,
"step": 23700
},
{
"epoch": 0.12,
"grad_norm": 4.080810070037842,
"learning_rate": 9.882474658598262e-06,
"loss": 1.4807,
"step": 23800
},
{
"epoch": 0.12,
"grad_norm": 4.269100666046143,
"learning_rate": 9.88184952380357e-06,
"loss": 1.4108,
"step": 23900
},
{
"epoch": 0.12,
"grad_norm": 4.10108757019043,
"learning_rate": 9.881224389008882e-06,
"loss": 1.4592,
"step": 24000
},
{
"epoch": 0.12,
"grad_norm": 4.869362831115723,
"learning_rate": 9.88059925421419e-06,
"loss": 1.4687,
"step": 24100
},
{
"epoch": 0.12,
"grad_norm": 3.7101335525512695,
"learning_rate": 9.879974119419502e-06,
"loss": 1.4748,
"step": 24200
},
{
"epoch": 0.12,
"grad_norm": 3.676862955093384,
"learning_rate": 9.87934898462481e-06,
"loss": 1.4439,
"step": 24300
},
{
"epoch": 0.12,
"grad_norm": 4.2332844734191895,
"learning_rate": 9.878723849830121e-06,
"loss": 1.4371,
"step": 24400
},
{
"epoch": 0.12,
"grad_norm": 3.947660207748413,
"learning_rate": 9.87809871503543e-06,
"loss": 1.4619,
"step": 24500
},
{
"epoch": 0.12,
"grad_norm": 4.331580638885498,
"learning_rate": 9.877473580240741e-06,
"loss": 1.4332,
"step": 24600
},
{
"epoch": 0.12,
"grad_norm": 5.24333381652832,
"learning_rate": 9.87684844544605e-06,
"loss": 1.4474,
"step": 24700
},
{
"epoch": 0.12,
"grad_norm": 3.499051570892334,
"learning_rate": 9.876223310651361e-06,
"loss": 1.4303,
"step": 24800
},
{
"epoch": 0.12,
"grad_norm": 3.9360058307647705,
"learning_rate": 9.87559817585667e-06,
"loss": 1.4731,
"step": 24900
},
{
"epoch": 0.12,
"grad_norm": 4.0263352394104,
"learning_rate": 9.87497304106198e-06,
"loss": 1.4429,
"step": 25000
},
{
"epoch": 0.12,
"grad_norm": 3.827765941619873,
"learning_rate": 9.87434790626729e-06,
"loss": 1.4312,
"step": 25100
},
{
"epoch": 0.12,
"grad_norm": 3.680577278137207,
"learning_rate": 9.8737227714726e-06,
"loss": 1.4571,
"step": 25200
},
{
"epoch": 0.13,
"grad_norm": 4.409987926483154,
"learning_rate": 9.873097636677909e-06,
"loss": 1.4708,
"step": 25300
},
{
"epoch": 0.13,
"grad_norm": 4.233061790466309,
"learning_rate": 9.872472501883219e-06,
"loss": 1.4079,
"step": 25400
},
{
"epoch": 0.13,
"grad_norm": 3.923621416091919,
"learning_rate": 9.871847367088529e-06,
"loss": 1.4389,
"step": 25500
},
{
"epoch": 0.13,
"grad_norm": 3.695704698562622,
"learning_rate": 9.871222232293839e-06,
"loss": 1.448,
"step": 25600
},
{
"epoch": 0.13,
"grad_norm": 4.188453674316406,
"learning_rate": 9.870597097499149e-06,
"loss": 1.4356,
"step": 25700
},
{
"epoch": 0.13,
"grad_norm": 4.8711934089660645,
"learning_rate": 9.869971962704459e-06,
"loss": 1.4221,
"step": 25800
},
{
"epoch": 0.13,
"grad_norm": 4.016773223876953,
"learning_rate": 9.869346827909769e-06,
"loss": 1.4409,
"step": 25900
},
{
"epoch": 0.13,
"grad_norm": 4.488391399383545,
"learning_rate": 9.868721693115079e-06,
"loss": 1.4609,
"step": 26000
},
{
"epoch": 0.13,
"grad_norm": 4.66510534286499,
"learning_rate": 9.868096558320389e-06,
"loss": 1.4132,
"step": 26100
},
{
"epoch": 0.13,
"grad_norm": 3.9409704208374023,
"learning_rate": 9.867471423525698e-06,
"loss": 1.4406,
"step": 26200
},
{
"epoch": 0.13,
"grad_norm": 3.3582875728607178,
"learning_rate": 9.866846288731008e-06,
"loss": 1.438,
"step": 26300
},
{
"epoch": 0.13,
"grad_norm": 4.238399505615234,
"learning_rate": 9.866221153936318e-06,
"loss": 1.4201,
"step": 26400
},
{
"epoch": 0.13,
"grad_norm": 3.6502299308776855,
"learning_rate": 9.865596019141628e-06,
"loss": 1.4592,
"step": 26500
},
{
"epoch": 0.13,
"grad_norm": 3.8077006340026855,
"learning_rate": 9.864970884346938e-06,
"loss": 1.4556,
"step": 26600
},
{
"epoch": 0.13,
"grad_norm": 4.270641326904297,
"learning_rate": 9.864345749552248e-06,
"loss": 1.4101,
"step": 26700
},
{
"epoch": 0.13,
"grad_norm": 4.743376731872559,
"learning_rate": 9.863720614757558e-06,
"loss": 1.4719,
"step": 26800
},
{
"epoch": 0.13,
"grad_norm": 4.345736980438232,
"learning_rate": 9.863095479962868e-06,
"loss": 1.4002,
"step": 26900
},
{
"epoch": 0.13,
"grad_norm": 3.8192217350006104,
"learning_rate": 9.862470345168178e-06,
"loss": 1.4697,
"step": 27000
},
{
"epoch": 0.13,
"grad_norm": 4.685102939605713,
"learning_rate": 9.861845210373488e-06,
"loss": 1.4519,
"step": 27100
},
{
"epoch": 0.13,
"grad_norm": 3.690993070602417,
"learning_rate": 9.861220075578798e-06,
"loss": 1.4424,
"step": 27200
},
{
"epoch": 0.14,
"grad_norm": 3.8806326389312744,
"learning_rate": 9.860594940784108e-06,
"loss": 1.4416,
"step": 27300
},
{
"epoch": 0.14,
"grad_norm": 3.362546443939209,
"learning_rate": 9.859969805989418e-06,
"loss": 1.4245,
"step": 27400
},
{
"epoch": 0.14,
"grad_norm": 4.167792320251465,
"learning_rate": 9.859344671194728e-06,
"loss": 1.4645,
"step": 27500
},
{
"epoch": 0.14,
"grad_norm": 4.120845317840576,
"learning_rate": 9.858719536400036e-06,
"loss": 1.3847,
"step": 27600
},
{
"epoch": 0.14,
"grad_norm": 3.8441598415374756,
"learning_rate": 9.858094401605347e-06,
"loss": 1.4244,
"step": 27700
},
{
"epoch": 0.14,
"grad_norm": 3.3462672233581543,
"learning_rate": 9.857469266810656e-06,
"loss": 1.4401,
"step": 27800
},
{
"epoch": 0.14,
"grad_norm": 4.065661430358887,
"learning_rate": 9.856844132015967e-06,
"loss": 1.4212,
"step": 27900
},
{
"epoch": 0.14,
"grad_norm": 3.5907657146453857,
"learning_rate": 9.856218997221275e-06,
"loss": 1.4639,
"step": 28000
},
{
"epoch": 0.14,
"grad_norm": 3.701472759246826,
"learning_rate": 9.855593862426587e-06,
"loss": 1.4052,
"step": 28100
},
{
"epoch": 0.14,
"grad_norm": 3.7131853103637695,
"learning_rate": 9.854968727631897e-06,
"loss": 1.4237,
"step": 28200
},
{
"epoch": 0.14,
"grad_norm": 3.156214475631714,
"learning_rate": 9.854343592837207e-06,
"loss": 1.4364,
"step": 28300
},
{
"epoch": 0.14,
"grad_norm": 4.9435715675354,
"learning_rate": 9.853718458042517e-06,
"loss": 1.4352,
"step": 28400
},
{
"epoch": 0.14,
"grad_norm": 3.94811749458313,
"learning_rate": 9.853093323247827e-06,
"loss": 1.4303,
"step": 28500
},
{
"epoch": 0.14,
"grad_norm": 3.5269935131073,
"learning_rate": 9.852468188453137e-06,
"loss": 1.4214,
"step": 28600
},
{
"epoch": 0.14,
"grad_norm": 4.688473224639893,
"learning_rate": 9.851843053658447e-06,
"loss": 1.3854,
"step": 28700
},
{
"epoch": 0.14,
"grad_norm": 4.054961204528809,
"learning_rate": 9.851217918863757e-06,
"loss": 1.432,
"step": 28800
},
{
"epoch": 0.14,
"grad_norm": 3.178467273712158,
"learning_rate": 9.850592784069065e-06,
"loss": 1.4439,
"step": 28900
},
{
"epoch": 0.14,
"grad_norm": 4.031513690948486,
"learning_rate": 9.849967649274376e-06,
"loss": 1.4228,
"step": 29000
},
{
"epoch": 0.14,
"grad_norm": 3.9268980026245117,
"learning_rate": 9.849342514479685e-06,
"loss": 1.4002,
"step": 29100
},
{
"epoch": 0.14,
"grad_norm": 3.176645040512085,
"learning_rate": 9.848717379684996e-06,
"loss": 1.4732,
"step": 29200
},
{
"epoch": 0.14,
"grad_norm": 2.7952117919921875,
"learning_rate": 9.848092244890304e-06,
"loss": 1.4609,
"step": 29300
},
{
"epoch": 0.15,
"grad_norm": 3.6165409088134766,
"learning_rate": 9.847467110095616e-06,
"loss": 1.4179,
"step": 29400
},
{
"epoch": 0.15,
"grad_norm": 4.359500408172607,
"learning_rate": 9.846841975300924e-06,
"loss": 1.4443,
"step": 29500
},
{
"epoch": 0.15,
"grad_norm": 4.256430625915527,
"learning_rate": 9.846216840506236e-06,
"loss": 1.4205,
"step": 29600
},
{
"epoch": 0.15,
"grad_norm": 4.939763069152832,
"learning_rate": 9.845591705711544e-06,
"loss": 1.3889,
"step": 29700
},
{
"epoch": 0.15,
"grad_norm": 3.5934700965881348,
"learning_rate": 9.844966570916856e-06,
"loss": 1.4045,
"step": 29800
},
{
"epoch": 0.15,
"grad_norm": 3.0760035514831543,
"learning_rate": 9.844341436122164e-06,
"loss": 1.4534,
"step": 29900
},
{
"epoch": 0.15,
"grad_norm": 4.314694881439209,
"learning_rate": 9.843716301327476e-06,
"loss": 1.4284,
"step": 30000
},
{
"epoch": 0.15,
"grad_norm": 3.9042022228240967,
"learning_rate": 9.843091166532784e-06,
"loss": 1.4249,
"step": 30100
},
{
"epoch": 0.15,
"grad_norm": 3.454749822616577,
"learning_rate": 9.842466031738094e-06,
"loss": 1.4291,
"step": 30200
},
{
"epoch": 0.15,
"grad_norm": 3.8640189170837402,
"learning_rate": 9.841840896943404e-06,
"loss": 1.4514,
"step": 30300
},
{
"epoch": 0.15,
"grad_norm": 4.65750789642334,
"learning_rate": 9.841215762148714e-06,
"loss": 1.4626,
"step": 30400
},
{
"epoch": 0.15,
"grad_norm": 4.030206680297852,
"learning_rate": 9.840590627354024e-06,
"loss": 1.4038,
"step": 30500
},
{
"epoch": 0.15,
"grad_norm": 4.036793231964111,
"learning_rate": 9.839965492559334e-06,
"loss": 1.4225,
"step": 30600
},
{
"epoch": 0.15,
"grad_norm": 3.980349063873291,
"learning_rate": 9.839340357764643e-06,
"loss": 1.4334,
"step": 30700
},
{
"epoch": 0.15,
"grad_norm": 4.157260894775391,
"learning_rate": 9.838715222969953e-06,
"loss": 1.453,
"step": 30800
},
{
"epoch": 0.15,
"grad_norm": 3.416947841644287,
"learning_rate": 9.838090088175263e-06,
"loss": 1.4176,
"step": 30900
},
{
"epoch": 0.15,
"grad_norm": 5.0742645263671875,
"learning_rate": 9.837464953380573e-06,
"loss": 1.4492,
"step": 31000
},
{
"epoch": 0.15,
"grad_norm": 3.5331027507781982,
"learning_rate": 9.836839818585883e-06,
"loss": 1.4258,
"step": 31100
},
{
"epoch": 0.15,
"grad_norm": 3.268676280975342,
"learning_rate": 9.836214683791193e-06,
"loss": 1.4081,
"step": 31200
},
{
"epoch": 0.15,
"grad_norm": 3.619158983230591,
"learning_rate": 9.835589548996503e-06,
"loss": 1.3998,
"step": 31300
},
{
"epoch": 0.16,
"grad_norm": 3.513633966445923,
"learning_rate": 9.834964414201813e-06,
"loss": 1.4102,
"step": 31400
},
{
"epoch": 0.16,
"grad_norm": 3.7973320484161377,
"learning_rate": 9.834339279407123e-06,
"loss": 1.3846,
"step": 31500
},
{
"epoch": 0.16,
"grad_norm": 4.910383701324463,
"learning_rate": 9.833714144612433e-06,
"loss": 1.3886,
"step": 31600
},
{
"epoch": 0.16,
"grad_norm": 3.820688009262085,
"learning_rate": 9.833089009817743e-06,
"loss": 1.4178,
"step": 31700
},
{
"epoch": 0.16,
"grad_norm": 3.5854384899139404,
"learning_rate": 9.832463875023053e-06,
"loss": 1.4154,
"step": 31800
},
{
"epoch": 0.16,
"grad_norm": 3.8664228916168213,
"learning_rate": 9.831838740228363e-06,
"loss": 1.4255,
"step": 31900
},
{
"epoch": 0.16,
"grad_norm": 3.179574728012085,
"learning_rate": 9.831213605433673e-06,
"loss": 1.4039,
"step": 32000
},
{
"epoch": 0.16,
"grad_norm": 4.206747531890869,
"learning_rate": 9.830588470638982e-06,
"loss": 1.4066,
"step": 32100
},
{
"epoch": 0.16,
"grad_norm": 4.408509731292725,
"learning_rate": 9.829963335844292e-06,
"loss": 1.4028,
"step": 32200
},
{
"epoch": 0.16,
"grad_norm": 5.01927375793457,
"learning_rate": 9.829338201049602e-06,
"loss": 1.4415,
"step": 32300
},
{
"epoch": 0.16,
"grad_norm": 3.166085720062256,
"learning_rate": 9.828713066254912e-06,
"loss": 1.4698,
"step": 32400
},
{
"epoch": 0.16,
"grad_norm": 3.4799346923828125,
"learning_rate": 9.828087931460222e-06,
"loss": 1.4055,
"step": 32500
},
{
"epoch": 0.16,
"grad_norm": 3.68662428855896,
"learning_rate": 9.827462796665532e-06,
"loss": 1.3942,
"step": 32600
},
{
"epoch": 0.16,
"grad_norm": 4.4798970222473145,
"learning_rate": 9.826837661870842e-06,
"loss": 1.4293,
"step": 32700
},
{
"epoch": 0.16,
"grad_norm": 3.4319188594818115,
"learning_rate": 9.826212527076152e-06,
"loss": 1.4134,
"step": 32800
},
{
"epoch": 0.16,
"grad_norm": 3.2756521701812744,
"learning_rate": 9.825587392281462e-06,
"loss": 1.4535,
"step": 32900
},
{
"epoch": 0.16,
"grad_norm": 3.3544061183929443,
"learning_rate": 9.82496225748677e-06,
"loss": 1.3997,
"step": 33000
},
{
"epoch": 0.16,
"grad_norm": 3.7909374237060547,
"learning_rate": 9.824337122692082e-06,
"loss": 1.4043,
"step": 33100
},
{
"epoch": 0.16,
"grad_norm": 3.8240981101989746,
"learning_rate": 9.82371198789739e-06,
"loss": 1.428,
"step": 33200
},
{
"epoch": 0.16,
"grad_norm": 3.214618682861328,
"learning_rate": 9.823086853102702e-06,
"loss": 1.3999,
"step": 33300
},
{
"epoch": 0.17,
"grad_norm": 8.105681419372559,
"learning_rate": 9.822461718308011e-06,
"loss": 1.4082,
"step": 33400
},
{
"epoch": 0.17,
"grad_norm": 4.449899196624756,
"learning_rate": 9.821836583513321e-06,
"loss": 1.4273,
"step": 33500
},
{
"epoch": 0.17,
"grad_norm": 4.17997932434082,
"learning_rate": 9.821211448718631e-06,
"loss": 1.4291,
"step": 33600
},
{
"epoch": 0.17,
"grad_norm": 3.813826322555542,
"learning_rate": 9.820586313923941e-06,
"loss": 1.407,
"step": 33700
},
{
"epoch": 0.17,
"grad_norm": 3.7879397869110107,
"learning_rate": 9.819961179129251e-06,
"loss": 1.4159,
"step": 33800
},
{
"epoch": 0.17,
"grad_norm": 3.9027743339538574,
"learning_rate": 9.819336044334561e-06,
"loss": 1.439,
"step": 33900
},
{
"epoch": 0.17,
"grad_norm": 3.7069435119628906,
"learning_rate": 9.818710909539871e-06,
"loss": 1.4117,
"step": 34000
},
{
"epoch": 0.17,
"grad_norm": 3.904519557952881,
"learning_rate": 9.818085774745181e-06,
"loss": 1.4227,
"step": 34100
},
{
"epoch": 0.17,
"grad_norm": 3.545767068862915,
"learning_rate": 9.817460639950491e-06,
"loss": 1.4344,
"step": 34200
},
{
"epoch": 0.17,
"grad_norm": 3.717536687850952,
"learning_rate": 9.816835505155799e-06,
"loss": 1.447,
"step": 34300
},
{
"epoch": 0.17,
"grad_norm": 3.036220073699951,
"learning_rate": 9.81621037036111e-06,
"loss": 1.4124,
"step": 34400
},
{
"epoch": 0.17,
"grad_norm": 4.334647178649902,
"learning_rate": 9.815585235566419e-06,
"loss": 1.4347,
"step": 34500
},
{
"epoch": 0.17,
"grad_norm": 3.7795979976654053,
"learning_rate": 9.81496010077173e-06,
"loss": 1.417,
"step": 34600
},
{
"epoch": 0.17,
"grad_norm": 3.4146125316619873,
"learning_rate": 9.814334965977039e-06,
"loss": 1.3847,
"step": 34700
},
{
"epoch": 0.17,
"grad_norm": 3.193895101547241,
"learning_rate": 9.81370983118235e-06,
"loss": 1.4084,
"step": 34800
},
{
"epoch": 0.17,
"grad_norm": 3.3683910369873047,
"learning_rate": 9.813084696387659e-06,
"loss": 1.4393,
"step": 34900
},
{
"epoch": 0.17,
"grad_norm": 4.252421855926514,
"learning_rate": 9.81245956159297e-06,
"loss": 1.4294,
"step": 35000
},
{
"epoch": 0.17,
"grad_norm": 3.6922848224639893,
"learning_rate": 9.811834426798279e-06,
"loss": 1.4212,
"step": 35100
},
{
"epoch": 0.17,
"grad_norm": 3.27756667137146,
"learning_rate": 9.81120929200359e-06,
"loss": 1.4185,
"step": 35200
},
{
"epoch": 0.17,
"grad_norm": 3.377180337905884,
"learning_rate": 9.810584157208898e-06,
"loss": 1.3991,
"step": 35300
},
{
"epoch": 0.18,
"grad_norm": 3.4141881465911865,
"learning_rate": 9.80995902241421e-06,
"loss": 1.3816,
"step": 35400
},
{
"epoch": 0.18,
"grad_norm": 3.6975343227386475,
"learning_rate": 9.809333887619518e-06,
"loss": 1.4275,
"step": 35500
},
{
"epoch": 0.18,
"grad_norm": 3.303208112716675,
"learning_rate": 9.808708752824828e-06,
"loss": 1.3996,
"step": 35600
},
{
"epoch": 0.18,
"grad_norm": 3.1281208992004395,
"learning_rate": 9.808083618030138e-06,
"loss": 1.4573,
"step": 35700
},
{
"epoch": 0.18,
"grad_norm": 4.216818809509277,
"learning_rate": 9.807458483235448e-06,
"loss": 1.4264,
"step": 35800
},
{
"epoch": 0.18,
"grad_norm": 3.6236705780029297,
"learning_rate": 9.806833348440758e-06,
"loss": 1.42,
"step": 35900
},
{
"epoch": 0.18,
"grad_norm": 3.4652881622314453,
"learning_rate": 9.806208213646068e-06,
"loss": 1.4509,
"step": 36000
},
{
"epoch": 0.18,
"grad_norm": 4.3565449714660645,
"learning_rate": 9.805583078851378e-06,
"loss": 1.3968,
"step": 36100
},
{
"epoch": 0.18,
"grad_norm": 5.522129535675049,
"learning_rate": 9.804957944056688e-06,
"loss": 1.4402,
"step": 36200
},
{
"epoch": 0.18,
"grad_norm": 3.9470767974853516,
"learning_rate": 9.804332809261998e-06,
"loss": 1.3922,
"step": 36300
},
{
"epoch": 0.18,
"grad_norm": 3.978543758392334,
"learning_rate": 9.803707674467308e-06,
"loss": 1.403,
"step": 36400
},
{
"epoch": 0.18,
"grad_norm": 5.382244110107422,
"learning_rate": 9.803082539672618e-06,
"loss": 1.3968,
"step": 36500
},
{
"epoch": 0.18,
"grad_norm": 4.595647811889648,
"learning_rate": 9.802457404877927e-06,
"loss": 1.4002,
"step": 36600
},
{
"epoch": 0.18,
"grad_norm": 3.6310489177703857,
"learning_rate": 9.801832270083237e-06,
"loss": 1.4122,
"step": 36700
},
{
"epoch": 0.18,
"grad_norm": 3.4216363430023193,
"learning_rate": 9.801207135288547e-06,
"loss": 1.3859,
"step": 36800
},
{
"epoch": 0.18,
"grad_norm": 3.4577724933624268,
"learning_rate": 9.800582000493857e-06,
"loss": 1.4206,
"step": 36900
},
{
"epoch": 0.18,
"grad_norm": 4.211758136749268,
"learning_rate": 9.799956865699167e-06,
"loss": 1.4146,
"step": 37000
},
{
"epoch": 0.18,
"grad_norm": 3.5187759399414062,
"learning_rate": 9.799331730904477e-06,
"loss": 1.3983,
"step": 37100
},
{
"epoch": 0.18,
"grad_norm": 3.524277925491333,
"learning_rate": 9.798706596109787e-06,
"loss": 1.39,
"step": 37200
},
{
"epoch": 0.18,
"grad_norm": 3.746493339538574,
"learning_rate": 9.798081461315097e-06,
"loss": 1.4351,
"step": 37300
},
{
"epoch": 0.19,
"grad_norm": 3.489757537841797,
"learning_rate": 9.797456326520407e-06,
"loss": 1.4215,
"step": 37400
},
{
"epoch": 0.19,
"grad_norm": 3.862546443939209,
"learning_rate": 9.796831191725717e-06,
"loss": 1.3839,
"step": 37500
},
{
"epoch": 0.19,
"grad_norm": 3.700289487838745,
"learning_rate": 9.796206056931027e-06,
"loss": 1.4134,
"step": 37600
},
{
"epoch": 0.19,
"grad_norm": 4.463230609893799,
"learning_rate": 9.795580922136337e-06,
"loss": 1.4094,
"step": 37700
},
{
"epoch": 0.19,
"grad_norm": 3.6630661487579346,
"learning_rate": 9.794955787341647e-06,
"loss": 1.4008,
"step": 37800
},
{
"epoch": 0.19,
"grad_norm": 4.630967140197754,
"learning_rate": 9.794330652546956e-06,
"loss": 1.3759,
"step": 37900
},
{
"epoch": 0.19,
"grad_norm": 3.3025717735290527,
"learning_rate": 9.793705517752266e-06,
"loss": 1.3884,
"step": 38000
},
{
"epoch": 0.19,
"grad_norm": 3.3258678913116455,
"learning_rate": 9.793080382957576e-06,
"loss": 1.386,
"step": 38100
},
{
"epoch": 0.19,
"grad_norm": 3.719531536102295,
"learning_rate": 9.792455248162885e-06,
"loss": 1.3964,
"step": 38200
},
{
"epoch": 0.19,
"grad_norm": 3.2938575744628906,
"learning_rate": 9.791830113368196e-06,
"loss": 1.4057,
"step": 38300
},
{
"epoch": 0.19,
"grad_norm": 4.785384654998779,
"learning_rate": 9.791204978573506e-06,
"loss": 1.4134,
"step": 38400
},
{
"epoch": 0.19,
"grad_norm": 3.3767313957214355,
"learning_rate": 9.790579843778816e-06,
"loss": 1.4139,
"step": 38500
},
{
"epoch": 0.19,
"grad_norm": 2.9999425411224365,
"learning_rate": 9.789954708984126e-06,
"loss": 1.4039,
"step": 38600
},
{
"epoch": 0.19,
"grad_norm": 4.019780158996582,
"learning_rate": 9.789329574189436e-06,
"loss": 1.3637,
"step": 38700
},
{
"epoch": 0.19,
"grad_norm": 3.2933456897735596,
"learning_rate": 9.788704439394746e-06,
"loss": 1.3784,
"step": 38800
},
{
"epoch": 0.19,
"grad_norm": 3.511465549468994,
"learning_rate": 9.788079304600056e-06,
"loss": 1.3845,
"step": 38900
},
{
"epoch": 0.19,
"grad_norm": 3.031588077545166,
"learning_rate": 9.787454169805366e-06,
"loss": 1.3815,
"step": 39000
},
{
"epoch": 0.19,
"grad_norm": 3.726041078567505,
"learning_rate": 9.786829035010676e-06,
"loss": 1.4031,
"step": 39100
},
{
"epoch": 0.19,
"grad_norm": 3.459808349609375,
"learning_rate": 9.786203900215986e-06,
"loss": 1.4168,
"step": 39200
},
{
"epoch": 0.19,
"grad_norm": 5.884055137634277,
"learning_rate": 9.785578765421295e-06,
"loss": 1.3897,
"step": 39300
},
{
"epoch": 0.19,
"grad_norm": 3.8769099712371826,
"learning_rate": 9.784953630626605e-06,
"loss": 1.4214,
"step": 39400
},
{
"epoch": 0.2,
"grad_norm": 4.353100776672363,
"learning_rate": 9.784328495831914e-06,
"loss": 1.3875,
"step": 39500
},
{
"epoch": 0.2,
"grad_norm": 3.665733575820923,
"learning_rate": 9.783703361037225e-06,
"loss": 1.4033,
"step": 39600
},
{
"epoch": 0.2,
"grad_norm": 4.098516941070557,
"learning_rate": 9.783078226242533e-06,
"loss": 1.3755,
"step": 39700
},
{
"epoch": 0.2,
"grad_norm": 3.6719651222229004,
"learning_rate": 9.782453091447845e-06,
"loss": 1.4025,
"step": 39800
},
{
"epoch": 0.2,
"grad_norm": 3.7323970794677734,
"learning_rate": 9.781827956653153e-06,
"loss": 1.4072,
"step": 39900
},
{
"epoch": 0.2,
"grad_norm": 3.6012964248657227,
"learning_rate": 9.781202821858465e-06,
"loss": 1.3839,
"step": 40000
},
{
"epoch": 0.2,
"grad_norm": 3.1830966472625732,
"learning_rate": 9.780577687063773e-06,
"loss": 1.4025,
"step": 40100
},
{
"epoch": 0.2,
"grad_norm": 3.503458261489868,
"learning_rate": 9.779952552269085e-06,
"loss": 1.3849,
"step": 40200
},
{
"epoch": 0.2,
"grad_norm": 3.934358596801758,
"learning_rate": 9.779327417474393e-06,
"loss": 1.3908,
"step": 40300
},
{
"epoch": 0.2,
"grad_norm": 3.263597249984741,
"learning_rate": 9.778702282679705e-06,
"loss": 1.4144,
"step": 40400
},
{
"epoch": 0.2,
"grad_norm": 3.6019351482391357,
"learning_rate": 9.778077147885013e-06,
"loss": 1.3966,
"step": 40500
},
{
"epoch": 0.2,
"grad_norm": 3.211871862411499,
"learning_rate": 9.777452013090324e-06,
"loss": 1.4029,
"step": 40600
},
{
"epoch": 0.2,
"grad_norm": 4.445366382598877,
"learning_rate": 9.776826878295633e-06,
"loss": 1.4192,
"step": 40700
},
{
"epoch": 0.2,
"grad_norm": 3.834134340286255,
"learning_rate": 9.776201743500943e-06,
"loss": 1.3971,
"step": 40800
},
{
"epoch": 0.2,
"grad_norm": 11.223153114318848,
"learning_rate": 9.775576608706253e-06,
"loss": 1.4199,
"step": 40900
},
{
"epoch": 0.2,
"grad_norm": 3.168875217437744,
"learning_rate": 9.774951473911563e-06,
"loss": 1.403,
"step": 41000
},
{
"epoch": 0.2,
"grad_norm": 4.082376480102539,
"learning_rate": 9.774326339116872e-06,
"loss": 1.3758,
"step": 41100
},
{
"epoch": 0.2,
"grad_norm": 3.381903648376465,
"learning_rate": 9.773701204322182e-06,
"loss": 1.4124,
"step": 41200
},
{
"epoch": 0.2,
"grad_norm": 3.220072031021118,
"learning_rate": 9.773076069527492e-06,
"loss": 1.3731,
"step": 41300
},
{
"epoch": 0.2,
"grad_norm": 4.445113182067871,
"learning_rate": 9.772450934732802e-06,
"loss": 1.409,
"step": 41400
},
{
"epoch": 0.21,
"grad_norm": 4.551964282989502,
"learning_rate": 9.771825799938112e-06,
"loss": 1.4092,
"step": 41500
},
{
"epoch": 0.21,
"grad_norm": 3.120997905731201,
"learning_rate": 9.771200665143422e-06,
"loss": 1.4316,
"step": 41600
},
{
"epoch": 0.21,
"grad_norm": 3.3942294120788574,
"learning_rate": 9.770575530348732e-06,
"loss": 1.3752,
"step": 41700
},
{
"epoch": 0.21,
"grad_norm": 3.3764448165893555,
"learning_rate": 9.769950395554042e-06,
"loss": 1.4238,
"step": 41800
},
{
"epoch": 0.21,
"grad_norm": 3.600349187850952,
"learning_rate": 9.769325260759352e-06,
"loss": 1.4012,
"step": 41900
},
{
"epoch": 0.21,
"grad_norm": 3.6565279960632324,
"learning_rate": 9.768700125964662e-06,
"loss": 1.3768,
"step": 42000
},
{
"epoch": 0.21,
"grad_norm": 3.65138578414917,
"learning_rate": 9.768074991169972e-06,
"loss": 1.4022,
"step": 42100
},
{
"epoch": 0.21,
"grad_norm": 3.3732988834381104,
"learning_rate": 9.767449856375282e-06,
"loss": 1.409,
"step": 42200
},
{
"epoch": 0.21,
"grad_norm": 3.8248541355133057,
"learning_rate": 9.766824721580592e-06,
"loss": 1.4121,
"step": 42300
},
{
"epoch": 0.21,
"grad_norm": 3.3323121070861816,
"learning_rate": 9.766199586785901e-06,
"loss": 1.4203,
"step": 42400
},
{
"epoch": 0.21,
"grad_norm": 3.0881714820861816,
"learning_rate": 9.765574451991211e-06,
"loss": 1.3993,
"step": 42500
},
{
"epoch": 0.21,
"grad_norm": 3.6461262702941895,
"learning_rate": 9.764949317196521e-06,
"loss": 1.3842,
"step": 42600
},
{
"epoch": 0.21,
"grad_norm": 3.2910470962524414,
"learning_rate": 9.764324182401831e-06,
"loss": 1.4108,
"step": 42700
},
{
"epoch": 0.21,
"grad_norm": 3.8803622722625732,
"learning_rate": 9.763699047607141e-06,
"loss": 1.3962,
"step": 42800
},
{
"epoch": 0.21,
"grad_norm": 3.9977149963378906,
"learning_rate": 9.763073912812451e-06,
"loss": 1.3876,
"step": 42900
},
{
"epoch": 0.21,
"grad_norm": 3.4803717136383057,
"learning_rate": 9.762448778017761e-06,
"loss": 1.3601,
"step": 43000
},
{
"epoch": 0.21,
"grad_norm": 3.819903612136841,
"learning_rate": 9.761823643223071e-06,
"loss": 1.3891,
"step": 43100
},
{
"epoch": 0.21,
"grad_norm": 3.582854747772217,
"learning_rate": 9.761198508428381e-06,
"loss": 1.4145,
"step": 43200
},
{
"epoch": 0.21,
"grad_norm": 3.323837995529175,
"learning_rate": 9.76057337363369e-06,
"loss": 1.4141,
"step": 43300
},
{
"epoch": 0.21,
"grad_norm": 2.7181520462036133,
"learning_rate": 9.759948238839e-06,
"loss": 1.3464,
"step": 43400
},
{
"epoch": 0.22,
"grad_norm": 3.4599475860595703,
"learning_rate": 9.75932310404431e-06,
"loss": 1.3775,
"step": 43500
},
{
"epoch": 0.22,
"grad_norm": 3.005889654159546,
"learning_rate": 9.75869796924962e-06,
"loss": 1.4078,
"step": 43600
},
{
"epoch": 0.22,
"grad_norm": 3.152175188064575,
"learning_rate": 9.75807283445493e-06,
"loss": 1.411,
"step": 43700
},
{
"epoch": 0.22,
"grad_norm": 3.951000213623047,
"learning_rate": 9.75744769966024e-06,
"loss": 1.3737,
"step": 43800
},
{
"epoch": 0.22,
"grad_norm": 3.722508192062378,
"learning_rate": 9.75682256486555e-06,
"loss": 1.4092,
"step": 43900
},
{
"epoch": 0.22,
"grad_norm": 4.0199761390686035,
"learning_rate": 9.75619743007086e-06,
"loss": 1.3964,
"step": 44000
},
{
"epoch": 0.22,
"grad_norm": 3.306147336959839,
"learning_rate": 9.75557229527617e-06,
"loss": 1.3772,
"step": 44100
},
{
"epoch": 0.22,
"grad_norm": 3.0931670665740967,
"learning_rate": 9.75494716048148e-06,
"loss": 1.3848,
"step": 44200
},
{
"epoch": 0.22,
"grad_norm": 3.4775798320770264,
"learning_rate": 9.75432202568679e-06,
"loss": 1.391,
"step": 44300
},
{
"epoch": 0.22,
"grad_norm": 2.973440170288086,
"learning_rate": 9.7536968908921e-06,
"loss": 1.3993,
"step": 44400
},
{
"epoch": 0.22,
"grad_norm": 2.9864256381988525,
"learning_rate": 9.75307175609741e-06,
"loss": 1.3523,
"step": 44500
},
{
"epoch": 0.22,
"grad_norm": 3.9546356201171875,
"learning_rate": 9.75244662130272e-06,
"loss": 1.3527,
"step": 44600
},
{
"epoch": 0.22,
"grad_norm": 6.238116264343262,
"learning_rate": 9.75182148650803e-06,
"loss": 1.3624,
"step": 44700
},
{
"epoch": 0.22,
"grad_norm": 3.241060495376587,
"learning_rate": 9.75119635171334e-06,
"loss": 1.3875,
"step": 44800
},
{
"epoch": 0.22,
"grad_norm": 3.17099666595459,
"learning_rate": 9.750571216918648e-06,
"loss": 1.381,
"step": 44900
},
{
"epoch": 0.22,
"grad_norm": 3.647505283355713,
"learning_rate": 9.74994608212396e-06,
"loss": 1.3673,
"step": 45000
},
{
"epoch": 0.22,
"grad_norm": 3.5791754722595215,
"learning_rate": 9.749320947329268e-06,
"loss": 1.3936,
"step": 45100
},
{
"epoch": 0.22,
"grad_norm": 4.946603775024414,
"learning_rate": 9.74869581253458e-06,
"loss": 1.4135,
"step": 45200
},
{
"epoch": 0.22,
"grad_norm": 2.9948465824127197,
"learning_rate": 9.748070677739888e-06,
"loss": 1.3824,
"step": 45300
},
{
"epoch": 0.22,
"grad_norm": 3.355520725250244,
"learning_rate": 9.7474455429452e-06,
"loss": 1.3757,
"step": 45400
},
{
"epoch": 0.23,
"grad_norm": 3.599808931350708,
"learning_rate": 9.746820408150508e-06,
"loss": 1.4019,
"step": 45500
},
{
"epoch": 0.23,
"grad_norm": 2.60089111328125,
"learning_rate": 9.746195273355819e-06,
"loss": 1.4147,
"step": 45600
},
{
"epoch": 0.23,
"grad_norm": 3.074833393096924,
"learning_rate": 9.745570138561127e-06,
"loss": 1.4084,
"step": 45700
},
{
"epoch": 0.23,
"grad_norm": 3.3081626892089844,
"learning_rate": 9.744945003766439e-06,
"loss": 1.3999,
"step": 45800
},
{
"epoch": 0.23,
"grad_norm": 4.088558673858643,
"learning_rate": 9.744319868971747e-06,
"loss": 1.398,
"step": 45900
},
{
"epoch": 0.23,
"grad_norm": 3.7625129222869873,
"learning_rate": 9.743694734177059e-06,
"loss": 1.388,
"step": 46000
},
{
"epoch": 0.23,
"grad_norm": 4.328056335449219,
"learning_rate": 9.743069599382367e-06,
"loss": 1.3721,
"step": 46100
},
{
"epoch": 0.23,
"grad_norm": 3.1336140632629395,
"learning_rate": 9.742444464587677e-06,
"loss": 1.3865,
"step": 46200
},
{
"epoch": 0.23,
"grad_norm": 3.0789365768432617,
"learning_rate": 9.741819329792987e-06,
"loss": 1.3609,
"step": 46300
},
{
"epoch": 0.23,
"grad_norm": 3.569803237915039,
"learning_rate": 9.741194194998297e-06,
"loss": 1.3954,
"step": 46400
},
{
"epoch": 0.23,
"grad_norm": 2.9342846870422363,
"learning_rate": 9.740569060203607e-06,
"loss": 1.3381,
"step": 46500
},
{
"epoch": 0.23,
"grad_norm": 3.4400010108947754,
"learning_rate": 9.739943925408917e-06,
"loss": 1.3582,
"step": 46600
},
{
"epoch": 0.23,
"grad_norm": 3.698220729827881,
"learning_rate": 9.739318790614227e-06,
"loss": 1.4165,
"step": 46700
},
{
"epoch": 0.23,
"grad_norm": 3.7116315364837646,
"learning_rate": 9.738693655819537e-06,
"loss": 1.3643,
"step": 46800
},
{
"epoch": 0.23,
"grad_norm": 4.515981674194336,
"learning_rate": 9.738068521024846e-06,
"loss": 1.3899,
"step": 46900
},
{
"epoch": 0.23,
"grad_norm": 3.1222646236419678,
"learning_rate": 9.737443386230156e-06,
"loss": 1.3497,
"step": 47000
},
{
"epoch": 0.23,
"grad_norm": 3.3143482208251953,
"learning_rate": 9.736818251435466e-06,
"loss": 1.3906,
"step": 47100
},
{
"epoch": 0.23,
"grad_norm": 3.673861026763916,
"learning_rate": 9.736193116640776e-06,
"loss": 1.3742,
"step": 47200
},
{
"epoch": 0.23,
"grad_norm": 3.1223580837249756,
"learning_rate": 9.735567981846086e-06,
"loss": 1.3679,
"step": 47300
},
{
"epoch": 0.23,
"grad_norm": 3.321925640106201,
"learning_rate": 9.734942847051396e-06,
"loss": 1.3872,
"step": 47400
},
{
"epoch": 0.24,
"grad_norm": 3.3574061393737793,
"learning_rate": 9.734317712256706e-06,
"loss": 1.3536,
"step": 47500
},
{
"epoch": 0.24,
"grad_norm": 3.0097270011901855,
"learning_rate": 9.733692577462016e-06,
"loss": 1.3867,
"step": 47600
},
{
"epoch": 0.24,
"grad_norm": 6.731925964355469,
"learning_rate": 9.733067442667326e-06,
"loss": 1.3671,
"step": 47700
},
{
"epoch": 0.24,
"grad_norm": 4.5036468505859375,
"learning_rate": 9.732442307872636e-06,
"loss": 1.3935,
"step": 47800
},
{
"epoch": 0.24,
"grad_norm": 5.059147357940674,
"learning_rate": 9.731817173077946e-06,
"loss": 1.3693,
"step": 47900
},
{
"epoch": 0.24,
"grad_norm": 3.65627384185791,
"learning_rate": 9.731192038283256e-06,
"loss": 1.3838,
"step": 48000
},
{
"epoch": 0.24,
"grad_norm": 3.8300678730010986,
"learning_rate": 9.730566903488566e-06,
"loss": 1.4273,
"step": 48100
},
{
"epoch": 0.24,
"grad_norm": 3.2943530082702637,
"learning_rate": 9.729941768693876e-06,
"loss": 1.3598,
"step": 48200
},
{
"epoch": 0.24,
"grad_norm": 3.6148500442504883,
"learning_rate": 9.729316633899185e-06,
"loss": 1.3916,
"step": 48300
},
{
"epoch": 0.24,
"grad_norm": 3.14809250831604,
"learning_rate": 9.728691499104495e-06,
"loss": 1.3596,
"step": 48400
},
{
"epoch": 0.24,
"grad_norm": 3.4826507568359375,
"learning_rate": 9.728066364309805e-06,
"loss": 1.4113,
"step": 48500
},
{
"epoch": 0.24,
"grad_norm": 3.761301279067993,
"learning_rate": 9.727441229515115e-06,
"loss": 1.3811,
"step": 48600
},
{
"epoch": 0.24,
"grad_norm": 3.280597448348999,
"learning_rate": 9.726816094720425e-06,
"loss": 1.3735,
"step": 48700
},
{
"epoch": 0.24,
"grad_norm": 3.2056515216827393,
"learning_rate": 9.726190959925735e-06,
"loss": 1.3928,
"step": 48800
},
{
"epoch": 0.24,
"grad_norm": 5.535262107849121,
"learning_rate": 9.725565825131045e-06,
"loss": 1.3795,
"step": 48900
},
{
"epoch": 0.24,
"grad_norm": 3.71197509765625,
"learning_rate": 9.724940690336355e-06,
"loss": 1.3956,
"step": 49000
},
{
"epoch": 0.24,
"grad_norm": 3.048292875289917,
"learning_rate": 9.724315555541665e-06,
"loss": 1.3756,
"step": 49100
},
{
"epoch": 0.24,
"grad_norm": 3.6094841957092285,
"learning_rate": 9.723690420746975e-06,
"loss": 1.4143,
"step": 49200
},
{
"epoch": 0.24,
"grad_norm": 4.416449546813965,
"learning_rate": 9.723065285952285e-06,
"loss": 1.3618,
"step": 49300
},
{
"epoch": 0.24,
"grad_norm": 4.372152328491211,
"learning_rate": 9.722440151157595e-06,
"loss": 1.402,
"step": 49400
},
{
"epoch": 0.24,
"grad_norm": 3.1622934341430664,
"learning_rate": 9.721815016362905e-06,
"loss": 1.3914,
"step": 49500
},
{
"epoch": 0.25,
"grad_norm": 3.1704394817352295,
"learning_rate": 9.721189881568214e-06,
"loss": 1.3827,
"step": 49600
},
{
"epoch": 0.25,
"grad_norm": 3.9178764820098877,
"learning_rate": 9.720564746773524e-06,
"loss": 1.371,
"step": 49700
},
{
"epoch": 0.25,
"grad_norm": 3.839916706085205,
"learning_rate": 9.719939611978834e-06,
"loss": 1.361,
"step": 49800
},
{
"epoch": 0.25,
"grad_norm": 3.5647811889648438,
"learning_rate": 9.719314477184144e-06,
"loss": 1.3857,
"step": 49900
},
{
"epoch": 0.25,
"grad_norm": 3.2756240367889404,
"learning_rate": 9.718689342389454e-06,
"loss": 1.375,
"step": 50000
},
{
"epoch": 0.25,
"grad_norm": 4.051654815673828,
"learning_rate": 9.718064207594762e-06,
"loss": 1.3941,
"step": 50100
},
{
"epoch": 0.25,
"grad_norm": 4.137097358703613,
"learning_rate": 9.717439072800074e-06,
"loss": 1.3892,
"step": 50200
},
{
"epoch": 0.25,
"grad_norm": 3.646369457244873,
"learning_rate": 9.716813938005382e-06,
"loss": 1.3846,
"step": 50300
},
{
"epoch": 0.25,
"grad_norm": 3.162900686264038,
"learning_rate": 9.716188803210694e-06,
"loss": 1.4173,
"step": 50400
},
{
"epoch": 0.25,
"grad_norm": 3.1182548999786377,
"learning_rate": 9.715563668416002e-06,
"loss": 1.3769,
"step": 50500
},
{
"epoch": 0.25,
"grad_norm": 5.595252513885498,
"learning_rate": 9.714938533621314e-06,
"loss": 1.4087,
"step": 50600
},
{
"epoch": 0.25,
"grad_norm": 2.782058000564575,
"learning_rate": 9.714313398826622e-06,
"loss": 1.4179,
"step": 50700
},
{
"epoch": 0.25,
"grad_norm": 3.0897421836853027,
"learning_rate": 9.713688264031934e-06,
"loss": 1.3671,
"step": 50800
},
{
"epoch": 0.25,
"grad_norm": 3.455578565597534,
"learning_rate": 9.713063129237242e-06,
"loss": 1.401,
"step": 50900
},
{
"epoch": 0.25,
"grad_norm": 3.877256155014038,
"learning_rate": 9.712437994442553e-06,
"loss": 1.4114,
"step": 51000
},
{
"epoch": 0.25,
"grad_norm": 3.3103723526000977,
"learning_rate": 9.711812859647862e-06,
"loss": 1.3895,
"step": 51100
},
{
"epoch": 0.25,
"grad_norm": 3.909396171569824,
"learning_rate": 9.711187724853173e-06,
"loss": 1.3632,
"step": 51200
},
{
"epoch": 0.25,
"grad_norm": 3.4170010089874268,
"learning_rate": 9.710562590058482e-06,
"loss": 1.3819,
"step": 51300
},
{
"epoch": 0.25,
"grad_norm": 3.1259448528289795,
"learning_rate": 9.709937455263791e-06,
"loss": 1.3894,
"step": 51400
},
{
"epoch": 0.25,
"grad_norm": 4.449690818786621,
"learning_rate": 9.709312320469101e-06,
"loss": 1.3998,
"step": 51500
},
{
"epoch": 0.26,
"grad_norm": 3.47631573677063,
"learning_rate": 9.708687185674411e-06,
"loss": 1.374,
"step": 51600
},
{
"epoch": 0.26,
"grad_norm": 3.5324013233184814,
"learning_rate": 9.708062050879721e-06,
"loss": 1.3518,
"step": 51700
},
{
"epoch": 0.26,
"grad_norm": 3.9155433177948,
"learning_rate": 9.707436916085031e-06,
"loss": 1.3848,
"step": 51800
},
{
"epoch": 0.26,
"grad_norm": 3.4327306747436523,
"learning_rate": 9.706811781290341e-06,
"loss": 1.3653,
"step": 51900
},
{
"epoch": 0.26,
"grad_norm": 6.06643533706665,
"learning_rate": 9.706186646495651e-06,
"loss": 1.3892,
"step": 52000
},
{
"epoch": 0.26,
"grad_norm": 3.518132448196411,
"learning_rate": 9.705561511700961e-06,
"loss": 1.3433,
"step": 52100
},
{
"epoch": 0.26,
"grad_norm": 3.2915585041046143,
"learning_rate": 9.704936376906271e-06,
"loss": 1.3579,
"step": 52200
},
{
"epoch": 0.26,
"grad_norm": 3.804596185684204,
"learning_rate": 9.70431124211158e-06,
"loss": 1.3809,
"step": 52300
},
{
"epoch": 0.26,
"grad_norm": 4.617377758026123,
"learning_rate": 9.70368610731689e-06,
"loss": 1.398,
"step": 52400
},
{
"epoch": 0.26,
"grad_norm": 3.0816659927368164,
"learning_rate": 9.7030609725222e-06,
"loss": 1.3652,
"step": 52500
},
{
"epoch": 0.26,
"grad_norm": 3.5903923511505127,
"learning_rate": 9.70243583772751e-06,
"loss": 1.3767,
"step": 52600
},
{
"epoch": 0.26,
"grad_norm": 3.43280029296875,
"learning_rate": 9.70181070293282e-06,
"loss": 1.3802,
"step": 52700
},
{
"epoch": 0.26,
"grad_norm": 4.5906081199646,
"learning_rate": 9.70118556813813e-06,
"loss": 1.3726,
"step": 52800
},
{
"epoch": 0.26,
"grad_norm": 3.5110647678375244,
"learning_rate": 9.70056043334344e-06,
"loss": 1.4125,
"step": 52900
},
{
"epoch": 0.26,
"grad_norm": 3.5731992721557617,
"learning_rate": 9.69993529854875e-06,
"loss": 1.3971,
"step": 53000
},
{
"epoch": 0.26,
"grad_norm": 3.522200584411621,
"learning_rate": 9.69931016375406e-06,
"loss": 1.3758,
"step": 53100
},
{
"epoch": 0.26,
"grad_norm": 3.754093647003174,
"learning_rate": 9.69868502895937e-06,
"loss": 1.4125,
"step": 53200
},
{
"epoch": 0.26,
"grad_norm": 4.088795185089111,
"learning_rate": 9.69805989416468e-06,
"loss": 1.374,
"step": 53300
},
{
"epoch": 0.26,
"grad_norm": 3.095700263977051,
"learning_rate": 9.69743475936999e-06,
"loss": 1.3475,
"step": 53400
},
{
"epoch": 0.26,
"grad_norm": 3.6446001529693604,
"learning_rate": 9.6968096245753e-06,
"loss": 1.3675,
"step": 53500
},
{
"epoch": 0.27,
"grad_norm": 3.0287554264068604,
"learning_rate": 9.69618448978061e-06,
"loss": 1.3648,
"step": 53600
},
{
"epoch": 0.27,
"grad_norm": 3.5153772830963135,
"learning_rate": 9.69555935498592e-06,
"loss": 1.3526,
"step": 53700
},
{
"epoch": 0.27,
"grad_norm": 3.402449131011963,
"learning_rate": 9.69493422019123e-06,
"loss": 1.4021,
"step": 53800
},
{
"epoch": 0.27,
"grad_norm": 4.223129749298096,
"learning_rate": 9.69430908539654e-06,
"loss": 1.3754,
"step": 53900
},
{
"epoch": 0.27,
"grad_norm": 2.7301337718963623,
"learning_rate": 9.69368395060185e-06,
"loss": 1.411,
"step": 54000
},
{
"epoch": 0.27,
"grad_norm": 3.3157832622528076,
"learning_rate": 9.69305881580716e-06,
"loss": 1.3695,
"step": 54100
},
{
"epoch": 0.27,
"grad_norm": 4.0217671394348145,
"learning_rate": 9.69243368101247e-06,
"loss": 1.3822,
"step": 54200
},
{
"epoch": 0.27,
"grad_norm": 3.575080633163452,
"learning_rate": 9.69180854621778e-06,
"loss": 1.3578,
"step": 54300
},
{
"epoch": 0.27,
"grad_norm": 3.4445888996124268,
"learning_rate": 9.69118341142309e-06,
"loss": 1.3795,
"step": 54400
},
{
"epoch": 0.27,
"grad_norm": 3.173060894012451,
"learning_rate": 9.6905582766284e-06,
"loss": 1.3446,
"step": 54500
},
{
"epoch": 0.27,
"grad_norm": 3.8823065757751465,
"learning_rate": 9.689933141833709e-06,
"loss": 1.3696,
"step": 54600
},
{
"epoch": 0.27,
"grad_norm": 3.5092880725860596,
"learning_rate": 9.689308007039019e-06,
"loss": 1.3262,
"step": 54700
},
{
"epoch": 0.27,
"grad_norm": 3.270498514175415,
"learning_rate": 9.688682872244329e-06,
"loss": 1.3905,
"step": 54800
},
{
"epoch": 0.27,
"grad_norm": 3.0580673217773438,
"learning_rate": 9.688057737449639e-06,
"loss": 1.3707,
"step": 54900
},
{
"epoch": 0.27,
"grad_norm": 2.982407569885254,
"learning_rate": 9.687432602654949e-06,
"loss": 1.3909,
"step": 55000
},
{
"epoch": 0.27,
"grad_norm": 4.194490432739258,
"learning_rate": 9.686807467860259e-06,
"loss": 1.3703,
"step": 55100
},
{
"epoch": 0.27,
"grad_norm": 3.5486743450164795,
"learning_rate": 9.686182333065569e-06,
"loss": 1.3528,
"step": 55200
},
{
"epoch": 0.27,
"grad_norm": 3.081116199493408,
"learning_rate": 9.685557198270879e-06,
"loss": 1.3929,
"step": 55300
},
{
"epoch": 0.27,
"grad_norm": 3.644366979598999,
"learning_rate": 9.684932063476189e-06,
"loss": 1.3847,
"step": 55400
},
{
"epoch": 0.27,
"grad_norm": 2.9201712608337402,
"learning_rate": 9.684306928681497e-06,
"loss": 1.3689,
"step": 55500
},
{
"epoch": 0.28,
"grad_norm": 3.1488590240478516,
"learning_rate": 9.683681793886808e-06,
"loss": 1.3573,
"step": 55600
},
{
"epoch": 0.28,
"grad_norm": 3.4069724082946777,
"learning_rate": 9.683056659092117e-06,
"loss": 1.3447,
"step": 55700
},
{
"epoch": 0.28,
"grad_norm": 3.669130563735962,
"learning_rate": 9.682431524297428e-06,
"loss": 1.3629,
"step": 55800
},
{
"epoch": 0.28,
"grad_norm": 3.5612809658050537,
"learning_rate": 9.681806389502736e-06,
"loss": 1.3463,
"step": 55900
},
{
"epoch": 0.28,
"grad_norm": 3.3988492488861084,
"learning_rate": 9.681181254708048e-06,
"loss": 1.3262,
"step": 56000
},
{
"epoch": 0.28,
"grad_norm": 3.286510705947876,
"learning_rate": 9.680556119913356e-06,
"loss": 1.3671,
"step": 56100
},
{
"epoch": 0.28,
"grad_norm": 4.079017639160156,
"learning_rate": 9.679930985118668e-06,
"loss": 1.3691,
"step": 56200
},
{
"epoch": 0.28,
"grad_norm": 5.038201808929443,
"learning_rate": 9.679305850323976e-06,
"loss": 1.3684,
"step": 56300
},
{
"epoch": 0.28,
"grad_norm": 2.9335787296295166,
"learning_rate": 9.678680715529288e-06,
"loss": 1.3651,
"step": 56400
},
{
"epoch": 0.28,
"grad_norm": 3.750838279724121,
"learning_rate": 9.678055580734596e-06,
"loss": 1.3979,
"step": 56500
},
{
"epoch": 0.28,
"grad_norm": 3.269113779067993,
"learning_rate": 9.677430445939908e-06,
"loss": 1.4005,
"step": 56600
},
{
"epoch": 0.28,
"grad_norm": 2.9525506496429443,
"learning_rate": 9.676805311145216e-06,
"loss": 1.3535,
"step": 56700
},
{
"epoch": 0.28,
"grad_norm": 4.0349273681640625,
"learning_rate": 9.676180176350526e-06,
"loss": 1.3568,
"step": 56800
},
{
"epoch": 0.28,
"grad_norm": 3.9644954204559326,
"learning_rate": 9.675555041555836e-06,
"loss": 1.3368,
"step": 56900
},
{
"epoch": 0.28,
"grad_norm": 3.748861312866211,
"learning_rate": 9.674929906761146e-06,
"loss": 1.3788,
"step": 57000
},
{
"epoch": 0.28,
"grad_norm": 3.927027940750122,
"learning_rate": 9.674304771966456e-06,
"loss": 1.3369,
"step": 57100
},
{
"epoch": 0.28,
"grad_norm": 3.5700511932373047,
"learning_rate": 9.673679637171766e-06,
"loss": 1.3476,
"step": 57200
},
{
"epoch": 0.28,
"grad_norm": 3.2039616107940674,
"learning_rate": 9.673054502377075e-06,
"loss": 1.358,
"step": 57300
},
{
"epoch": 0.28,
"grad_norm": 3.7656171321868896,
"learning_rate": 9.672429367582385e-06,
"loss": 1.3656,
"step": 57400
},
{
"epoch": 0.28,
"grad_norm": 3.84936261177063,
"learning_rate": 9.671804232787695e-06,
"loss": 1.4221,
"step": 57500
},
{
"epoch": 0.28,
"grad_norm": 3.145597457885742,
"learning_rate": 9.671179097993005e-06,
"loss": 1.3444,
"step": 57600
},
{
"epoch": 0.29,
"grad_norm": 3.1017513275146484,
"learning_rate": 9.670553963198315e-06,
"loss": 1.3604,
"step": 57700
},
{
"epoch": 0.29,
"grad_norm": 3.460015058517456,
"learning_rate": 9.669928828403625e-06,
"loss": 1.3556,
"step": 57800
},
{
"epoch": 0.29,
"grad_norm": 3.2410836219787598,
"learning_rate": 9.669303693608935e-06,
"loss": 1.3468,
"step": 57900
},
{
"epoch": 0.29,
"grad_norm": 4.7556843757629395,
"learning_rate": 9.668678558814245e-06,
"loss": 1.3535,
"step": 58000
},
{
"epoch": 0.29,
"grad_norm": 2.8711953163146973,
"learning_rate": 9.668053424019555e-06,
"loss": 1.3587,
"step": 58100
},
{
"epoch": 0.29,
"grad_norm": 2.9098100662231445,
"learning_rate": 9.667428289224865e-06,
"loss": 1.3866,
"step": 58200
},
{
"epoch": 0.29,
"grad_norm": 3.6795027256011963,
"learning_rate": 9.666803154430175e-06,
"loss": 1.3692,
"step": 58300
},
{
"epoch": 0.29,
"grad_norm": 4.239531517028809,
"learning_rate": 9.666178019635485e-06,
"loss": 1.3898,
"step": 58400
},
{
"epoch": 0.29,
"grad_norm": 2.871461868286133,
"learning_rate": 9.665552884840795e-06,
"loss": 1.3863,
"step": 58500
},
{
"epoch": 0.29,
"grad_norm": 3.8275647163391113,
"learning_rate": 9.664927750046104e-06,
"loss": 1.4035,
"step": 58600
},
{
"epoch": 0.29,
"grad_norm": 3.168945074081421,
"learning_rate": 9.664302615251414e-06,
"loss": 1.4028,
"step": 58700
},
{
"epoch": 0.29,
"grad_norm": 3.4457874298095703,
"learning_rate": 9.663677480456724e-06,
"loss": 1.3484,
"step": 58800
},
{
"epoch": 0.29,
"grad_norm": 3.0998809337615967,
"learning_rate": 9.663052345662034e-06,
"loss": 1.3533,
"step": 58900
},
{
"epoch": 0.29,
"grad_norm": 3.2760820388793945,
"learning_rate": 9.662427210867344e-06,
"loss": 1.3733,
"step": 59000
},
{
"epoch": 0.29,
"grad_norm": 3.1642961502075195,
"learning_rate": 9.661802076072654e-06,
"loss": 1.3675,
"step": 59100
},
{
"epoch": 0.29,
"grad_norm": 3.5796260833740234,
"learning_rate": 9.661176941277964e-06,
"loss": 1.3842,
"step": 59200
},
{
"epoch": 0.29,
"grad_norm": 3.379223108291626,
"learning_rate": 9.660551806483274e-06,
"loss": 1.3,
"step": 59300
},
{
"epoch": 0.29,
"grad_norm": 3.7059500217437744,
"learning_rate": 9.659926671688584e-06,
"loss": 1.3801,
"step": 59400
},
{
"epoch": 0.29,
"grad_norm": 3.9644994735717773,
"learning_rate": 9.659301536893894e-06,
"loss": 1.3882,
"step": 59500
},
{
"epoch": 0.29,
"grad_norm": 3.516009569168091,
"learning_rate": 9.658676402099204e-06,
"loss": 1.3507,
"step": 59600
},
{
"epoch": 0.3,
"grad_norm": 3.6307122707366943,
"learning_rate": 9.658051267304514e-06,
"loss": 1.3887,
"step": 59700
},
{
"epoch": 0.3,
"grad_norm": 2.963676929473877,
"learning_rate": 9.657426132509824e-06,
"loss": 1.3574,
"step": 59800
},
{
"epoch": 0.3,
"grad_norm": 3.6590583324432373,
"learning_rate": 9.656800997715134e-06,
"loss": 1.3766,
"step": 59900
},
{
"epoch": 0.3,
"grad_norm": 3.9890248775482178,
"learning_rate": 9.656175862920443e-06,
"loss": 1.3766,
"step": 60000
},
{
"epoch": 0.3,
"grad_norm": 3.7033519744873047,
"learning_rate": 9.655550728125753e-06,
"loss": 1.3653,
"step": 60100
},
{
"epoch": 0.3,
"grad_norm": 3.2749149799346924,
"learning_rate": 9.654925593331063e-06,
"loss": 1.3555,
"step": 60200
},
{
"epoch": 0.3,
"grad_norm": 2.6017117500305176,
"learning_rate": 9.654300458536373e-06,
"loss": 1.3379,
"step": 60300
},
{
"epoch": 0.3,
"grad_norm": 3.2133805751800537,
"learning_rate": 9.653675323741683e-06,
"loss": 1.3383,
"step": 60400
},
{
"epoch": 0.3,
"grad_norm": 3.5152649879455566,
"learning_rate": 9.653050188946993e-06,
"loss": 1.3821,
"step": 60500
},
{
"epoch": 0.3,
"grad_norm": 3.8433949947357178,
"learning_rate": 9.652425054152303e-06,
"loss": 1.3648,
"step": 60600
},
{
"epoch": 0.3,
"grad_norm": 3.219630241394043,
"learning_rate": 9.651799919357611e-06,
"loss": 1.3381,
"step": 60700
},
{
"epoch": 0.3,
"grad_norm": 2.9422874450683594,
"learning_rate": 9.651174784562923e-06,
"loss": 1.3596,
"step": 60800
},
{
"epoch": 0.3,
"grad_norm": 3.0160861015319824,
"learning_rate": 9.650549649768231e-06,
"loss": 1.4086,
"step": 60900
},
{
"epoch": 0.3,
"grad_norm": 3.6983797550201416,
"learning_rate": 9.649924514973543e-06,
"loss": 1.3653,
"step": 61000
},
{
"epoch": 0.3,
"grad_norm": 3.4345366954803467,
"learning_rate": 9.649299380178851e-06,
"loss": 1.3836,
"step": 61100
},
{
"epoch": 0.3,
"grad_norm": 4.864907741546631,
"learning_rate": 9.648674245384163e-06,
"loss": 1.3395,
"step": 61200
},
{
"epoch": 0.3,
"grad_norm": 3.0346243381500244,
"learning_rate": 9.64804911058947e-06,
"loss": 1.3483,
"step": 61300
},
{
"epoch": 0.3,
"grad_norm": 3.271688938140869,
"learning_rate": 9.647423975794782e-06,
"loss": 1.3718,
"step": 61400
},
{
"epoch": 0.3,
"grad_norm": 5.519439697265625,
"learning_rate": 9.64679884100009e-06,
"loss": 1.3516,
"step": 61500
},
{
"epoch": 0.3,
"grad_norm": 3.676679849624634,
"learning_rate": 9.646173706205402e-06,
"loss": 1.3786,
"step": 61600
},
{
"epoch": 0.31,
"grad_norm": 2.799685001373291,
"learning_rate": 9.64554857141071e-06,
"loss": 1.3622,
"step": 61700
},
{
"epoch": 0.31,
"grad_norm": 3.5693440437316895,
"learning_rate": 9.644923436616022e-06,
"loss": 1.3506,
"step": 61800
},
{
"epoch": 0.31,
"grad_norm": 4.081248760223389,
"learning_rate": 9.64429830182133e-06,
"loss": 1.3551,
"step": 61900
},
{
"epoch": 0.31,
"grad_norm": 3.275651454925537,
"learning_rate": 9.64367316702664e-06,
"loss": 1.3691,
"step": 62000
},
{
"epoch": 0.31,
"grad_norm": 3.2802531719207764,
"learning_rate": 9.64304803223195e-06,
"loss": 1.3654,
"step": 62100
},
{
"epoch": 0.31,
"grad_norm": 2.8903188705444336,
"learning_rate": 9.64242289743726e-06,
"loss": 1.3694,
"step": 62200
},
{
"epoch": 0.31,
"grad_norm": 2.767051935195923,
"learning_rate": 9.64179776264257e-06,
"loss": 1.4189,
"step": 62300
},
{
"epoch": 0.31,
"grad_norm": 3.3065268993377686,
"learning_rate": 9.64117262784788e-06,
"loss": 1.3528,
"step": 62400
},
{
"epoch": 0.31,
"grad_norm": 3.6622681617736816,
"learning_rate": 9.64054749305319e-06,
"loss": 1.3744,
"step": 62500
},
{
"epoch": 0.31,
"grad_norm": 2.5773024559020996,
"learning_rate": 9.6399223582585e-06,
"loss": 1.3678,
"step": 62600
},
{
"epoch": 0.31,
"grad_norm": 4.050888538360596,
"learning_rate": 9.63929722346381e-06,
"loss": 1.3459,
"step": 62700
},
{
"epoch": 0.31,
"grad_norm": 3.3641510009765625,
"learning_rate": 9.63867208866912e-06,
"loss": 1.3289,
"step": 62800
},
{
"epoch": 0.31,
"grad_norm": 3.2410778999328613,
"learning_rate": 9.63804695387443e-06,
"loss": 1.3582,
"step": 62900
},
{
"epoch": 0.31,
"grad_norm": 3.7819199562072754,
"learning_rate": 9.63742181907974e-06,
"loss": 1.3535,
"step": 63000
},
{
"epoch": 0.31,
"grad_norm": 3.4329464435577393,
"learning_rate": 9.63679668428505e-06,
"loss": 1.3637,
"step": 63100
},
{
"epoch": 0.31,
"grad_norm": 3.7776496410369873,
"learning_rate": 9.63617154949036e-06,
"loss": 1.3427,
"step": 63200
},
{
"epoch": 0.31,
"grad_norm": 3.6807868480682373,
"learning_rate": 9.63554641469567e-06,
"loss": 1.3366,
"step": 63300
},
{
"epoch": 0.31,
"grad_norm": 3.182055711746216,
"learning_rate": 9.63492127990098e-06,
"loss": 1.3907,
"step": 63400
},
{
"epoch": 0.31,
"grad_norm": 3.0613508224487305,
"learning_rate": 9.63429614510629e-06,
"loss": 1.3628,
"step": 63500
},
{
"epoch": 0.31,
"grad_norm": 3.813504219055176,
"learning_rate": 9.633671010311599e-06,
"loss": 1.3534,
"step": 63600
},
{
"epoch": 0.32,
"grad_norm": 3.0388875007629395,
"learning_rate": 9.633045875516909e-06,
"loss": 1.3701,
"step": 63700
},
{
"epoch": 0.32,
"grad_norm": 3.5311150550842285,
"learning_rate": 9.632420740722219e-06,
"loss": 1.3419,
"step": 63800
},
{
"epoch": 0.32,
"grad_norm": 3.283538341522217,
"learning_rate": 9.631795605927529e-06,
"loss": 1.3772,
"step": 63900
},
{
"epoch": 0.32,
"grad_norm": 2.6988024711608887,
"learning_rate": 9.631170471132839e-06,
"loss": 1.3633,
"step": 64000
},
{
"epoch": 0.32,
"grad_norm": 3.738215684890747,
"learning_rate": 9.630545336338149e-06,
"loss": 1.3814,
"step": 64100
},
{
"epoch": 0.32,
"grad_norm": 3.899857997894287,
"learning_rate": 9.629920201543459e-06,
"loss": 1.3787,
"step": 64200
},
{
"epoch": 0.32,
"grad_norm": 3.2490193843841553,
"learning_rate": 9.629295066748769e-06,
"loss": 1.347,
"step": 64300
},
{
"epoch": 0.32,
"grad_norm": 3.262529134750366,
"learning_rate": 9.628669931954079e-06,
"loss": 1.3405,
"step": 64400
},
{
"epoch": 0.32,
"grad_norm": 3.1799771785736084,
"learning_rate": 9.628044797159388e-06,
"loss": 1.3796,
"step": 64500
},
{
"epoch": 0.32,
"grad_norm": 3.5044260025024414,
"learning_rate": 9.627419662364698e-06,
"loss": 1.322,
"step": 64600
},
{
"epoch": 0.32,
"grad_norm": 3.560049295425415,
"learning_rate": 9.626794527570008e-06,
"loss": 1.356,
"step": 64700
},
{
"epoch": 0.32,
"grad_norm": 2.3910019397735596,
"learning_rate": 9.626169392775318e-06,
"loss": 1.3546,
"step": 64800
},
{
"epoch": 0.32,
"grad_norm": 4.631550312042236,
"learning_rate": 9.625544257980628e-06,
"loss": 1.3664,
"step": 64900
},
{
"epoch": 0.32,
"grad_norm": 3.415191650390625,
"learning_rate": 9.624919123185938e-06,
"loss": 1.3818,
"step": 65000
},
{
"epoch": 0.32,
"grad_norm": 3.071653127670288,
"learning_rate": 9.624293988391248e-06,
"loss": 1.3947,
"step": 65100
},
{
"epoch": 0.32,
"grad_norm": 3.575773000717163,
"learning_rate": 9.623668853596558e-06,
"loss": 1.3392,
"step": 65200
},
{
"epoch": 0.32,
"grad_norm": 3.1757047176361084,
"learning_rate": 9.623043718801868e-06,
"loss": 1.3619,
"step": 65300
},
{
"epoch": 0.32,
"grad_norm": 2.487311840057373,
"learning_rate": 9.622418584007178e-06,
"loss": 1.3538,
"step": 65400
},
{
"epoch": 0.32,
"grad_norm": 2.791187047958374,
"learning_rate": 9.621793449212488e-06,
"loss": 1.343,
"step": 65500
},
{
"epoch": 0.32,
"grad_norm": 2.840940237045288,
"learning_rate": 9.621168314417798e-06,
"loss": 1.3479,
"step": 65600
},
{
"epoch": 0.33,
"grad_norm": 3.4006075859069824,
"learning_rate": 9.620543179623108e-06,
"loss": 1.3742,
"step": 65700
},
{
"epoch": 0.33,
"grad_norm": 2.8290023803710938,
"learning_rate": 9.619918044828418e-06,
"loss": 1.3825,
"step": 65800
},
{
"epoch": 0.33,
"grad_norm": 4.114961624145508,
"learning_rate": 9.619292910033727e-06,
"loss": 1.3742,
"step": 65900
},
{
"epoch": 0.33,
"grad_norm": 3.4792447090148926,
"learning_rate": 9.618667775239037e-06,
"loss": 1.3442,
"step": 66000
},
{
"epoch": 0.33,
"grad_norm": 3.3174169063568115,
"learning_rate": 9.618042640444346e-06,
"loss": 1.3896,
"step": 66100
},
{
"epoch": 0.33,
"grad_norm": 2.7161898612976074,
"learning_rate": 9.617417505649657e-06,
"loss": 1.3492,
"step": 66200
},
{
"epoch": 0.33,
"grad_norm": 3.3158161640167236,
"learning_rate": 9.616792370854965e-06,
"loss": 1.3378,
"step": 66300
},
{
"epoch": 0.33,
"grad_norm": 4.810378074645996,
"learning_rate": 9.616167236060277e-06,
"loss": 1.3482,
"step": 66400
},
{
"epoch": 0.33,
"grad_norm": 3.8272716999053955,
"learning_rate": 9.615542101265585e-06,
"loss": 1.3624,
"step": 66500
},
{
"epoch": 0.33,
"grad_norm": 3.1543540954589844,
"learning_rate": 9.614916966470897e-06,
"loss": 1.3703,
"step": 66600
},
{
"epoch": 0.33,
"grad_norm": 3.571789503097534,
"learning_rate": 9.614291831676205e-06,
"loss": 1.3467,
"step": 66700
},
{
"epoch": 0.33,
"grad_norm": 3.8913381099700928,
"learning_rate": 9.613666696881517e-06,
"loss": 1.3629,
"step": 66800
},
{
"epoch": 0.33,
"grad_norm": 3.3162176609039307,
"learning_rate": 9.613041562086825e-06,
"loss": 1.399,
"step": 66900
},
{
"epoch": 0.33,
"grad_norm": 3.26802396774292,
"learning_rate": 9.612416427292137e-06,
"loss": 1.3481,
"step": 67000
},
{
"epoch": 0.33,
"grad_norm": 3.7507224082946777,
"learning_rate": 9.611791292497445e-06,
"loss": 1.3875,
"step": 67100
},
{
"epoch": 0.33,
"grad_norm": 3.059849262237549,
"learning_rate": 9.611166157702756e-06,
"loss": 1.3402,
"step": 67200
},
{
"epoch": 0.33,
"grad_norm": 2.9418632984161377,
"learning_rate": 9.610541022908065e-06,
"loss": 1.385,
"step": 67300
},
{
"epoch": 0.33,
"grad_norm": 3.126817464828491,
"learning_rate": 9.609915888113375e-06,
"loss": 1.3342,
"step": 67400
},
{
"epoch": 0.33,
"grad_norm": 3.908066749572754,
"learning_rate": 9.609290753318685e-06,
"loss": 1.3658,
"step": 67500
},
{
"epoch": 0.33,
"grad_norm": 3.3799283504486084,
"learning_rate": 9.608665618523994e-06,
"loss": 1.3392,
"step": 67600
},
{
"epoch": 0.33,
"grad_norm": 3.0953500270843506,
"learning_rate": 9.608040483729304e-06,
"loss": 1.3664,
"step": 67700
},
{
"epoch": 0.34,
"grad_norm": 3.4090096950531006,
"learning_rate": 9.607415348934614e-06,
"loss": 1.3913,
"step": 67800
},
{
"epoch": 0.34,
"grad_norm": 3.0916600227355957,
"learning_rate": 9.606790214139924e-06,
"loss": 1.3482,
"step": 67900
},
{
"epoch": 0.34,
"grad_norm": 4.232104778289795,
"learning_rate": 9.606165079345234e-06,
"loss": 1.3663,
"step": 68000
},
{
"epoch": 0.34,
"grad_norm": 3.300558090209961,
"learning_rate": 9.605539944550544e-06,
"loss": 1.3608,
"step": 68100
},
{
"epoch": 0.34,
"grad_norm": 2.795227527618408,
"learning_rate": 9.604914809755854e-06,
"loss": 1.3637,
"step": 68200
},
{
"epoch": 0.34,
"grad_norm": 3.083174467086792,
"learning_rate": 9.604289674961164e-06,
"loss": 1.3269,
"step": 68300
},
{
"epoch": 0.34,
"grad_norm": 3.8292133808135986,
"learning_rate": 9.603664540166474e-06,
"loss": 1.3311,
"step": 68400
},
{
"epoch": 0.34,
"grad_norm": 3.3727259635925293,
"learning_rate": 9.603039405371784e-06,
"loss": 1.3233,
"step": 68500
},
{
"epoch": 0.34,
"grad_norm": 3.0696310997009277,
"learning_rate": 9.602414270577094e-06,
"loss": 1.3217,
"step": 68600
},
{
"epoch": 0.34,
"grad_norm": 5.1085591316223145,
"learning_rate": 9.601789135782404e-06,
"loss": 1.395,
"step": 68700
},
{
"epoch": 0.34,
"grad_norm": 4.036706447601318,
"learning_rate": 9.601164000987714e-06,
"loss": 1.3763,
"step": 68800
},
{
"epoch": 0.34,
"grad_norm": 3.823237419128418,
"learning_rate": 9.600538866193024e-06,
"loss": 1.3868,
"step": 68900
},
{
"epoch": 0.34,
"grad_norm": 3.535228729248047,
"learning_rate": 9.599913731398333e-06,
"loss": 1.3714,
"step": 69000
},
{
"epoch": 0.34,
"grad_norm": 3.333162546157837,
"learning_rate": 9.599288596603643e-06,
"loss": 1.3509,
"step": 69100
},
{
"epoch": 0.34,
"grad_norm": 3.901670455932617,
"learning_rate": 9.598663461808953e-06,
"loss": 1.3486,
"step": 69200
},
{
"epoch": 0.34,
"grad_norm": 2.89204478263855,
"learning_rate": 9.598038327014263e-06,
"loss": 1.3975,
"step": 69300
},
{
"epoch": 0.34,
"grad_norm": 3.5590710639953613,
"learning_rate": 9.597413192219573e-06,
"loss": 1.3822,
"step": 69400
},
{
"epoch": 0.34,
"grad_norm": 3.234952449798584,
"learning_rate": 9.596788057424883e-06,
"loss": 1.3942,
"step": 69500
},
{
"epoch": 0.34,
"grad_norm": 3.125939130783081,
"learning_rate": 9.596162922630193e-06,
"loss": 1.3573,
"step": 69600
},
{
"epoch": 0.34,
"grad_norm": 3.25191593170166,
"learning_rate": 9.595537787835503e-06,
"loss": 1.3742,
"step": 69700
},
{
"epoch": 0.35,
"grad_norm": 3.0981853008270264,
"learning_rate": 9.594912653040813e-06,
"loss": 1.3272,
"step": 69800
},
{
"epoch": 0.35,
"grad_norm": 2.660688638687134,
"learning_rate": 9.594287518246123e-06,
"loss": 1.3859,
"step": 69900
},
{
"epoch": 0.35,
"grad_norm": 4.212889671325684,
"learning_rate": 9.593662383451433e-06,
"loss": 1.3685,
"step": 70000
},
{
"epoch": 0.35,
"grad_norm": 2.602475643157959,
"learning_rate": 9.593037248656743e-06,
"loss": 1.3486,
"step": 70100
},
{
"epoch": 0.35,
"grad_norm": 3.8037405014038086,
"learning_rate": 9.592412113862053e-06,
"loss": 1.3094,
"step": 70200
},
{
"epoch": 0.35,
"grad_norm": 3.735767364501953,
"learning_rate": 9.591786979067363e-06,
"loss": 1.3443,
"step": 70300
},
{
"epoch": 0.35,
"grad_norm": 3.10837984085083,
"learning_rate": 9.591161844272672e-06,
"loss": 1.3637,
"step": 70400
},
{
"epoch": 0.35,
"grad_norm": 3.339202880859375,
"learning_rate": 9.590536709477982e-06,
"loss": 1.374,
"step": 70500
},
{
"epoch": 0.35,
"grad_norm": 4.676008224487305,
"learning_rate": 9.589911574683292e-06,
"loss": 1.3609,
"step": 70600
},
{
"epoch": 0.35,
"grad_norm": 3.2127492427825928,
"learning_rate": 9.589286439888602e-06,
"loss": 1.3443,
"step": 70700
},
{
"epoch": 0.35,
"grad_norm": 3.552145481109619,
"learning_rate": 9.588661305093912e-06,
"loss": 1.3653,
"step": 70800
},
{
"epoch": 0.35,
"grad_norm": 4.267813205718994,
"learning_rate": 9.588036170299222e-06,
"loss": 1.3316,
"step": 70900
},
{
"epoch": 0.35,
"grad_norm": 4.004978179931641,
"learning_rate": 9.587411035504532e-06,
"loss": 1.3259,
"step": 71000
},
{
"epoch": 0.35,
"grad_norm": 3.433945417404175,
"learning_rate": 9.586785900709842e-06,
"loss": 1.3475,
"step": 71100
},
{
"epoch": 0.35,
"grad_norm": 3.3748490810394287,
"learning_rate": 9.586160765915152e-06,
"loss": 1.3442,
"step": 71200
},
{
"epoch": 0.35,
"grad_norm": 3.2221007347106934,
"learning_rate": 9.58553563112046e-06,
"loss": 1.3705,
"step": 71300
},
{
"epoch": 0.35,
"grad_norm": 2.846968173980713,
"learning_rate": 9.584910496325772e-06,
"loss": 1.3411,
"step": 71400
},
{
"epoch": 0.35,
"grad_norm": 3.973281145095825,
"learning_rate": 9.58428536153108e-06,
"loss": 1.3503,
"step": 71500
},
{
"epoch": 0.35,
"grad_norm": 2.7122104167938232,
"learning_rate": 9.583660226736392e-06,
"loss": 1.3581,
"step": 71600
},
{
"epoch": 0.35,
"grad_norm": 4.12910795211792,
"learning_rate": 9.5830350919417e-06,
"loss": 1.385,
"step": 71700
},
{
"epoch": 0.36,
"grad_norm": 3.4491500854492188,
"learning_rate": 9.582409957147011e-06,
"loss": 1.3626,
"step": 71800
},
{
"epoch": 0.36,
"grad_norm": 4.059682846069336,
"learning_rate": 9.58178482235232e-06,
"loss": 1.3595,
"step": 71900
},
{
"epoch": 0.36,
"grad_norm": 3.2482686042785645,
"learning_rate": 9.581159687557631e-06,
"loss": 1.3339,
"step": 72000
},
{
"epoch": 0.36,
"grad_norm": 3.3053741455078125,
"learning_rate": 9.58053455276294e-06,
"loss": 1.3656,
"step": 72100
},
{
"epoch": 0.36,
"grad_norm": 3.101283311843872,
"learning_rate": 9.579909417968251e-06,
"loss": 1.3751,
"step": 72200
},
{
"epoch": 0.36,
"grad_norm": 3.7894277572631836,
"learning_rate": 9.57928428317356e-06,
"loss": 1.3744,
"step": 72300
},
{
"epoch": 0.36,
"grad_norm": 3.6949033737182617,
"learning_rate": 9.578659148378871e-06,
"loss": 1.3434,
"step": 72400
},
{
"epoch": 0.36,
"grad_norm": 3.2511017322540283,
"learning_rate": 9.57803401358418e-06,
"loss": 1.3809,
"step": 72500
},
{
"epoch": 0.36,
"grad_norm": 2.6631274223327637,
"learning_rate": 9.577408878789489e-06,
"loss": 1.3232,
"step": 72600
},
{
"epoch": 0.36,
"grad_norm": 3.0832998752593994,
"learning_rate": 9.576783743994799e-06,
"loss": 1.3785,
"step": 72700
},
{
"epoch": 0.36,
"grad_norm": 4.4912238121032715,
"learning_rate": 9.576158609200109e-06,
"loss": 1.3558,
"step": 72800
},
{
"epoch": 0.36,
"grad_norm": 3.720935821533203,
"learning_rate": 9.575533474405419e-06,
"loss": 1.3547,
"step": 72900
},
{
"epoch": 0.36,
"grad_norm": 3.69688081741333,
"learning_rate": 9.574908339610729e-06,
"loss": 1.3409,
"step": 73000
},
{
"epoch": 0.36,
"grad_norm": 4.521012783050537,
"learning_rate": 9.574283204816039e-06,
"loss": 1.3701,
"step": 73100
},
{
"epoch": 0.36,
"grad_norm": 3.9866528511047363,
"learning_rate": 9.573658070021349e-06,
"loss": 1.3623,
"step": 73200
},
{
"epoch": 0.36,
"grad_norm": 4.300259590148926,
"learning_rate": 9.573032935226659e-06,
"loss": 1.3562,
"step": 73300
},
{
"epoch": 0.36,
"grad_norm": 3.585087299346924,
"learning_rate": 9.572407800431969e-06,
"loss": 1.3574,
"step": 73400
},
{
"epoch": 0.36,
"grad_norm": 3.3413264751434326,
"learning_rate": 9.571782665637278e-06,
"loss": 1.3586,
"step": 73500
},
{
"epoch": 0.36,
"grad_norm": 3.7272746562957764,
"learning_rate": 9.571157530842588e-06,
"loss": 1.3525,
"step": 73600
},
{
"epoch": 0.36,
"grad_norm": 3.167235851287842,
"learning_rate": 9.570532396047898e-06,
"loss": 1.3545,
"step": 73700
},
{
"epoch": 0.37,
"grad_norm": 3.007138252258301,
"learning_rate": 9.569907261253208e-06,
"loss": 1.3638,
"step": 73800
},
{
"epoch": 0.37,
"grad_norm": 3.402449607849121,
"learning_rate": 9.569282126458518e-06,
"loss": 1.3394,
"step": 73900
},
{
"epoch": 0.37,
"grad_norm": 3.050807237625122,
"learning_rate": 9.568656991663828e-06,
"loss": 1.3477,
"step": 74000
},
{
"epoch": 0.37,
"grad_norm": 3.470465660095215,
"learning_rate": 9.568031856869138e-06,
"loss": 1.3573,
"step": 74100
},
{
"epoch": 0.37,
"grad_norm": 3.1874587535858154,
"learning_rate": 9.567406722074448e-06,
"loss": 1.3545,
"step": 74200
},
{
"epoch": 0.37,
"grad_norm": 3.022789478302002,
"learning_rate": 9.566781587279758e-06,
"loss": 1.3857,
"step": 74300
},
{
"epoch": 0.37,
"grad_norm": 3.696437358856201,
"learning_rate": 9.566156452485068e-06,
"loss": 1.3845,
"step": 74400
},
{
"epoch": 0.37,
"grad_norm": 3.3129115104675293,
"learning_rate": 9.565531317690378e-06,
"loss": 1.3342,
"step": 74500
},
{
"epoch": 0.37,
"grad_norm": 3.0286476612091064,
"learning_rate": 9.564906182895688e-06,
"loss": 1.3451,
"step": 74600
},
{
"epoch": 0.37,
"grad_norm": 3.14780330657959,
"learning_rate": 9.564281048100998e-06,
"loss": 1.3501,
"step": 74700
},
{
"epoch": 0.37,
"grad_norm": 3.660125732421875,
"learning_rate": 9.563655913306308e-06,
"loss": 1.3974,
"step": 74800
},
{
"epoch": 0.37,
"grad_norm": 3.7873997688293457,
"learning_rate": 9.563030778511617e-06,
"loss": 1.3328,
"step": 74900
},
{
"epoch": 0.37,
"grad_norm": 4.175543785095215,
"learning_rate": 9.562405643716927e-06,
"loss": 1.3871,
"step": 75000
},
{
"epoch": 0.37,
"grad_norm": 3.0854318141937256,
"learning_rate": 9.561780508922237e-06,
"loss": 1.3436,
"step": 75100
},
{
"epoch": 0.37,
"grad_norm": 3.430039882659912,
"learning_rate": 9.561155374127547e-06,
"loss": 1.3614,
"step": 75200
},
{
"epoch": 0.37,
"grad_norm": 3.1078710556030273,
"learning_rate": 9.560530239332857e-06,
"loss": 1.3788,
"step": 75300
},
{
"epoch": 0.37,
"grad_norm": 3.394430160522461,
"learning_rate": 9.559905104538167e-06,
"loss": 1.3534,
"step": 75400
},
{
"epoch": 0.37,
"grad_norm": 3.4498708248138428,
"learning_rate": 9.559279969743477e-06,
"loss": 1.3875,
"step": 75500
},
{
"epoch": 0.37,
"grad_norm": 3.169480562210083,
"learning_rate": 9.558654834948787e-06,
"loss": 1.3391,
"step": 75600
},
{
"epoch": 0.37,
"grad_norm": 3.4374375343322754,
"learning_rate": 9.558029700154097e-06,
"loss": 1.3657,
"step": 75700
},
{
"epoch": 0.38,
"grad_norm": 3.0859546661376953,
"learning_rate": 9.557404565359407e-06,
"loss": 1.3765,
"step": 75800
},
{
"epoch": 0.38,
"grad_norm": 3.567939281463623,
"learning_rate": 9.556779430564717e-06,
"loss": 1.3432,
"step": 75900
},
{
"epoch": 0.38,
"grad_norm": 3.236070156097412,
"learning_rate": 9.556154295770027e-06,
"loss": 1.3402,
"step": 76000
},
{
"epoch": 0.38,
"grad_norm": 3.366365432739258,
"learning_rate": 9.555529160975337e-06,
"loss": 1.3736,
"step": 76100
},
{
"epoch": 0.38,
"grad_norm": 4.573514461517334,
"learning_rate": 9.554904026180646e-06,
"loss": 1.3581,
"step": 76200
},
{
"epoch": 0.38,
"grad_norm": 3.199225664138794,
"learning_rate": 9.554278891385956e-06,
"loss": 1.3452,
"step": 76300
},
{
"epoch": 0.38,
"grad_norm": 3.0722098350524902,
"learning_rate": 9.553653756591266e-06,
"loss": 1.3785,
"step": 76400
},
{
"epoch": 0.38,
"grad_norm": 3.137385606765747,
"learning_rate": 9.553028621796576e-06,
"loss": 1.3521,
"step": 76500
},
{
"epoch": 0.38,
"grad_norm": 4.893807888031006,
"learning_rate": 9.552403487001886e-06,
"loss": 1.3572,
"step": 76600
},
{
"epoch": 0.38,
"grad_norm": 4.010082721710205,
"learning_rate": 9.551778352207194e-06,
"loss": 1.3483,
"step": 76700
},
{
"epoch": 0.38,
"grad_norm": 3.6857099533081055,
"learning_rate": 9.551153217412506e-06,
"loss": 1.3347,
"step": 76800
},
{
"epoch": 0.38,
"grad_norm": 2.510134696960449,
"learning_rate": 9.550528082617814e-06,
"loss": 1.3461,
"step": 76900
},
{
"epoch": 0.38,
"grad_norm": 3.9825291633605957,
"learning_rate": 9.549902947823126e-06,
"loss": 1.3415,
"step": 77000
},
{
"epoch": 0.38,
"grad_norm": 3.156740427017212,
"learning_rate": 9.549277813028434e-06,
"loss": 1.3305,
"step": 77100
},
{
"epoch": 0.38,
"grad_norm": 3.245800256729126,
"learning_rate": 9.548652678233746e-06,
"loss": 1.3573,
"step": 77200
},
{
"epoch": 0.38,
"grad_norm": 2.6874351501464844,
"learning_rate": 9.548027543439054e-06,
"loss": 1.3443,
"step": 77300
},
{
"epoch": 0.38,
"grad_norm": 3.6892011165618896,
"learning_rate": 9.547402408644366e-06,
"loss": 1.3192,
"step": 77400
},
{
"epoch": 0.38,
"grad_norm": 2.505993604660034,
"learning_rate": 9.546777273849674e-06,
"loss": 1.3812,
"step": 77500
},
{
"epoch": 0.38,
"grad_norm": 3.5395193099975586,
"learning_rate": 9.546152139054985e-06,
"loss": 1.3604,
"step": 77600
},
{
"epoch": 0.38,
"grad_norm": 3.2124781608581543,
"learning_rate": 9.545527004260294e-06,
"loss": 1.312,
"step": 77700
},
{
"epoch": 0.38,
"grad_norm": 3.8713743686676025,
"learning_rate": 9.544901869465605e-06,
"loss": 1.3435,
"step": 77800
},
{
"epoch": 0.39,
"grad_norm": 3.1610865592956543,
"learning_rate": 9.544276734670914e-06,
"loss": 1.3696,
"step": 77900
},
{
"epoch": 0.39,
"grad_norm": 7.323131561279297,
"learning_rate": 9.543651599876223e-06,
"loss": 1.357,
"step": 78000
},
{
"epoch": 0.39,
"grad_norm": 3.0851237773895264,
"learning_rate": 9.543026465081533e-06,
"loss": 1.3406,
"step": 78100
},
{
"epoch": 0.39,
"grad_norm": 3.637321949005127,
"learning_rate": 9.542401330286843e-06,
"loss": 1.376,
"step": 78200
},
{
"epoch": 0.39,
"grad_norm": 2.876664638519287,
"learning_rate": 9.541776195492153e-06,
"loss": 1.4049,
"step": 78300
},
{
"epoch": 0.39,
"grad_norm": 3.146031618118286,
"learning_rate": 9.541151060697463e-06,
"loss": 1.3484,
"step": 78400
},
{
"epoch": 0.39,
"grad_norm": 4.596341609954834,
"learning_rate": 9.540525925902773e-06,
"loss": 1.3518,
"step": 78500
},
{
"epoch": 0.39,
"grad_norm": 5.041236400604248,
"learning_rate": 9.539900791108083e-06,
"loss": 1.362,
"step": 78600
},
{
"epoch": 0.39,
"grad_norm": 3.9177463054656982,
"learning_rate": 9.539275656313393e-06,
"loss": 1.3586,
"step": 78700
},
{
"epoch": 0.39,
"grad_norm": 3.115206003189087,
"learning_rate": 9.538650521518703e-06,
"loss": 1.367,
"step": 78800
},
{
"epoch": 0.39,
"grad_norm": 2.846676826477051,
"learning_rate": 9.538025386724013e-06,
"loss": 1.3699,
"step": 78900
},
{
"epoch": 0.39,
"grad_norm": 3.625420331954956,
"learning_rate": 9.537400251929323e-06,
"loss": 1.3505,
"step": 79000
},
{
"epoch": 0.39,
"grad_norm": 3.315352439880371,
"learning_rate": 9.536775117134633e-06,
"loss": 1.3456,
"step": 79100
},
{
"epoch": 0.39,
"grad_norm": 3.249753475189209,
"learning_rate": 9.536149982339943e-06,
"loss": 1.3538,
"step": 79200
},
{
"epoch": 0.39,
"grad_norm": 3.9315223693847656,
"learning_rate": 9.535524847545253e-06,
"loss": 1.3459,
"step": 79300
},
{
"epoch": 0.39,
"grad_norm": 3.4720170497894287,
"learning_rate": 9.534899712750562e-06,
"loss": 1.3935,
"step": 79400
},
{
"epoch": 0.39,
"grad_norm": 2.97334885597229,
"learning_rate": 9.534274577955872e-06,
"loss": 1.3526,
"step": 79500
},
{
"epoch": 0.39,
"grad_norm": 2.218647003173828,
"learning_rate": 9.533649443161182e-06,
"loss": 1.3845,
"step": 79600
},
{
"epoch": 0.39,
"grad_norm": 3.644829034805298,
"learning_rate": 9.533024308366492e-06,
"loss": 1.3515,
"step": 79700
},
{
"epoch": 0.39,
"grad_norm": 4.018405437469482,
"learning_rate": 9.532399173571802e-06,
"loss": 1.3544,
"step": 79800
},
{
"epoch": 0.4,
"grad_norm": 3.210761308670044,
"learning_rate": 9.531774038777112e-06,
"loss": 1.3554,
"step": 79900
},
{
"epoch": 0.4,
"grad_norm": 3.046523094177246,
"learning_rate": 9.531148903982422e-06,
"loss": 1.3733,
"step": 80000
},
{
"epoch": 0.4,
"grad_norm": 3.437032699584961,
"learning_rate": 9.530523769187732e-06,
"loss": 1.347,
"step": 80100
},
{
"epoch": 0.4,
"grad_norm": 3.712858200073242,
"learning_rate": 9.529898634393042e-06,
"loss": 1.3856,
"step": 80200
},
{
"epoch": 0.4,
"grad_norm": 2.859689474105835,
"learning_rate": 9.529273499598352e-06,
"loss": 1.3312,
"step": 80300
},
{
"epoch": 0.4,
"grad_norm": 3.8814845085144043,
"learning_rate": 9.528648364803662e-06,
"loss": 1.3362,
"step": 80400
},
{
"epoch": 0.4,
"grad_norm": 2.983851909637451,
"learning_rate": 9.528023230008972e-06,
"loss": 1.3689,
"step": 80500
},
{
"epoch": 0.4,
"grad_norm": 3.3227264881134033,
"learning_rate": 9.527398095214282e-06,
"loss": 1.3362,
"step": 80600
},
{
"epoch": 0.4,
"grad_norm": 3.825824499130249,
"learning_rate": 9.526772960419591e-06,
"loss": 1.3496,
"step": 80700
},
{
"epoch": 0.4,
"grad_norm": 3.376059055328369,
"learning_rate": 9.526147825624901e-06,
"loss": 1.3781,
"step": 80800
},
{
"epoch": 0.4,
"grad_norm": 3.2187156677246094,
"learning_rate": 9.525522690830211e-06,
"loss": 1.4142,
"step": 80900
},
{
"epoch": 0.4,
"grad_norm": 3.073812246322632,
"learning_rate": 9.524897556035521e-06,
"loss": 1.3185,
"step": 81000
},
{
"epoch": 0.4,
"grad_norm": 2.7107346057891846,
"learning_rate": 9.524272421240831e-06,
"loss": 1.322,
"step": 81100
},
{
"epoch": 0.4,
"grad_norm": 3.378969669342041,
"learning_rate": 9.523647286446141e-06,
"loss": 1.3438,
"step": 81200
},
{
"epoch": 0.4,
"grad_norm": 4.337489604949951,
"learning_rate": 9.523022151651451e-06,
"loss": 1.3326,
"step": 81300
},
{
"epoch": 0.4,
"grad_norm": 4.453660488128662,
"learning_rate": 9.522397016856761e-06,
"loss": 1.3624,
"step": 81400
},
{
"epoch": 0.4,
"grad_norm": 3.236886501312256,
"learning_rate": 9.521771882062071e-06,
"loss": 1.3704,
"step": 81500
},
{
"epoch": 0.4,
"grad_norm": 3.969984531402588,
"learning_rate": 9.52114674726738e-06,
"loss": 1.3605,
"step": 81600
},
{
"epoch": 0.4,
"grad_norm": 2.707930326461792,
"learning_rate": 9.52052161247269e-06,
"loss": 1.3272,
"step": 81700
},
{
"epoch": 0.4,
"grad_norm": 3.0617573261260986,
"learning_rate": 9.519896477678e-06,
"loss": 1.3779,
"step": 81800
},
{
"epoch": 0.41,
"grad_norm": 2.8938345909118652,
"learning_rate": 9.519271342883309e-06,
"loss": 1.3453,
"step": 81900
},
{
"epoch": 0.41,
"grad_norm": 3.273656129837036,
"learning_rate": 9.51864620808862e-06,
"loss": 1.3535,
"step": 82000
},
{
"epoch": 0.41,
"grad_norm": 3.6416726112365723,
"learning_rate": 9.518021073293929e-06,
"loss": 1.393,
"step": 82100
},
{
"epoch": 0.41,
"grad_norm": 2.7089104652404785,
"learning_rate": 9.51739593849924e-06,
"loss": 1.3568,
"step": 82200
},
{
"epoch": 0.41,
"grad_norm": 3.872784376144409,
"learning_rate": 9.516770803704549e-06,
"loss": 1.3725,
"step": 82300
},
{
"epoch": 0.41,
"grad_norm": 3.3895182609558105,
"learning_rate": 9.51614566890986e-06,
"loss": 1.3347,
"step": 82400
},
{
"epoch": 0.41,
"grad_norm": 3.349815845489502,
"learning_rate": 9.515520534115168e-06,
"loss": 1.2965,
"step": 82500
},
{
"epoch": 0.41,
"grad_norm": 3.8851418495178223,
"learning_rate": 9.51489539932048e-06,
"loss": 1.3547,
"step": 82600
},
{
"epoch": 0.41,
"grad_norm": 3.7153375148773193,
"learning_rate": 9.514270264525788e-06,
"loss": 1.3502,
"step": 82700
},
{
"epoch": 0.41,
"grad_norm": 2.9336204528808594,
"learning_rate": 9.5136451297311e-06,
"loss": 1.3727,
"step": 82800
},
{
"epoch": 0.41,
"grad_norm": 2.910884141921997,
"learning_rate": 9.513019994936408e-06,
"loss": 1.3643,
"step": 82900
},
{
"epoch": 0.41,
"grad_norm": 2.9535582065582275,
"learning_rate": 9.51239486014172e-06,
"loss": 1.3532,
"step": 83000
},
{
"epoch": 0.41,
"grad_norm": 3.453658103942871,
"learning_rate": 9.511769725347028e-06,
"loss": 1.3747,
"step": 83100
},
{
"epoch": 0.41,
"grad_norm": 4.163629055023193,
"learning_rate": 9.511144590552338e-06,
"loss": 1.3686,
"step": 83200
},
{
"epoch": 0.41,
"grad_norm": 3.291599988937378,
"learning_rate": 9.510519455757648e-06,
"loss": 1.3195,
"step": 83300
},
{
"epoch": 0.41,
"grad_norm": 4.140781879425049,
"learning_rate": 9.509894320962958e-06,
"loss": 1.3454,
"step": 83400
},
{
"epoch": 0.41,
"grad_norm": 3.2356150150299072,
"learning_rate": 9.509269186168268e-06,
"loss": 1.3656,
"step": 83500
},
{
"epoch": 0.41,
"grad_norm": 2.98710298538208,
"learning_rate": 9.508644051373578e-06,
"loss": 1.371,
"step": 83600
},
{
"epoch": 0.41,
"grad_norm": 2.949601650238037,
"learning_rate": 9.508018916578888e-06,
"loss": 1.3794,
"step": 83700
},
{
"epoch": 0.41,
"grad_norm": 2.5830845832824707,
"learning_rate": 9.507393781784198e-06,
"loss": 1.3665,
"step": 83800
},
{
"epoch": 0.42,
"grad_norm": 3.1843700408935547,
"learning_rate": 9.506768646989507e-06,
"loss": 1.3463,
"step": 83900
},
{
"epoch": 0.42,
"grad_norm": 3.233429193496704,
"learning_rate": 9.506143512194817e-06,
"loss": 1.3741,
"step": 84000
},
{
"epoch": 0.42,
"grad_norm": 3.79780650138855,
"learning_rate": 9.505518377400127e-06,
"loss": 1.3006,
"step": 84100
},
{
"epoch": 0.42,
"grad_norm": 3.684920310974121,
"learning_rate": 9.504893242605437e-06,
"loss": 1.3613,
"step": 84200
},
{
"epoch": 0.42,
"grad_norm": 4.043038368225098,
"learning_rate": 9.504268107810747e-06,
"loss": 1.3762,
"step": 84300
},
{
"epoch": 0.42,
"grad_norm": 3.4520349502563477,
"learning_rate": 9.503642973016057e-06,
"loss": 1.3339,
"step": 84400
},
{
"epoch": 0.42,
"grad_norm": 3.5900933742523193,
"learning_rate": 9.503017838221367e-06,
"loss": 1.3507,
"step": 84500
},
{
"epoch": 0.42,
"grad_norm": 4.365208625793457,
"learning_rate": 9.502392703426677e-06,
"loss": 1.354,
"step": 84600
},
{
"epoch": 0.42,
"grad_norm": 3.7963385581970215,
"learning_rate": 9.501767568631987e-06,
"loss": 1.3377,
"step": 84700
},
{
"epoch": 0.42,
"grad_norm": 3.07368803024292,
"learning_rate": 9.501142433837297e-06,
"loss": 1.3544,
"step": 84800
},
{
"epoch": 0.42,
"grad_norm": 3.9033076763153076,
"learning_rate": 9.500517299042607e-06,
"loss": 1.3466,
"step": 84900
},
{
"epoch": 0.42,
"grad_norm": 2.946506977081299,
"learning_rate": 9.499892164247917e-06,
"loss": 1.3524,
"step": 85000
},
{
"epoch": 0.42,
"grad_norm": 4.097044944763184,
"learning_rate": 9.499267029453227e-06,
"loss": 1.3342,
"step": 85100
},
{
"epoch": 0.42,
"grad_norm": 3.7137930393218994,
"learning_rate": 9.498641894658536e-06,
"loss": 1.3737,
"step": 85200
},
{
"epoch": 0.42,
"grad_norm": 3.2496094703674316,
"learning_rate": 9.498016759863846e-06,
"loss": 1.339,
"step": 85300
},
{
"epoch": 0.42,
"grad_norm": 4.326569557189941,
"learning_rate": 9.497391625069156e-06,
"loss": 1.372,
"step": 85400
},
{
"epoch": 0.42,
"grad_norm": 2.918201208114624,
"learning_rate": 9.496766490274466e-06,
"loss": 1.341,
"step": 85500
},
{
"epoch": 0.42,
"grad_norm": 3.4720118045806885,
"learning_rate": 9.496141355479776e-06,
"loss": 1.3525,
"step": 85600
},
{
"epoch": 0.42,
"grad_norm": 3.20745587348938,
"learning_rate": 9.495516220685086e-06,
"loss": 1.3664,
"step": 85700
},
{
"epoch": 0.42,
"grad_norm": 3.320747137069702,
"learning_rate": 9.494891085890396e-06,
"loss": 1.326,
"step": 85800
},
{
"epoch": 0.42,
"grad_norm": 2.690807342529297,
"learning_rate": 9.494265951095706e-06,
"loss": 1.3689,
"step": 85900
},
{
"epoch": 0.43,
"grad_norm": 4.453171253204346,
"learning_rate": 9.493640816301016e-06,
"loss": 1.374,
"step": 86000
},
{
"epoch": 0.43,
"grad_norm": 3.375361204147339,
"learning_rate": 9.493015681506326e-06,
"loss": 1.3427,
"step": 86100
},
{
"epoch": 0.43,
"grad_norm": 3.053560495376587,
"learning_rate": 9.492390546711636e-06,
"loss": 1.3418,
"step": 86200
},
{
"epoch": 0.43,
"grad_norm": 4.028963565826416,
"learning_rate": 9.491765411916946e-06,
"loss": 1.3203,
"step": 86300
},
{
"epoch": 0.43,
"grad_norm": 3.746544599533081,
"learning_rate": 9.491140277122256e-06,
"loss": 1.3121,
"step": 86400
},
{
"epoch": 0.43,
"grad_norm": 3.1117103099823,
"learning_rate": 9.490515142327566e-06,
"loss": 1.3383,
"step": 86500
},
{
"epoch": 0.43,
"grad_norm": 3.2640998363494873,
"learning_rate": 9.489890007532875e-06,
"loss": 1.3384,
"step": 86600
},
{
"epoch": 0.43,
"grad_norm": 3.436328172683716,
"learning_rate": 9.489264872738185e-06,
"loss": 1.3587,
"step": 86700
},
{
"epoch": 0.43,
"grad_norm": 3.372560977935791,
"learning_rate": 9.488639737943495e-06,
"loss": 1.4054,
"step": 86800
},
{
"epoch": 0.43,
"grad_norm": 2.880247116088867,
"learning_rate": 9.488014603148805e-06,
"loss": 1.3166,
"step": 86900
},
{
"epoch": 0.43,
"grad_norm": 3.686885356903076,
"learning_rate": 9.487389468354115e-06,
"loss": 1.3629,
"step": 87000
},
{
"epoch": 0.43,
"grad_norm": 3.168898820877075,
"learning_rate": 9.486764333559425e-06,
"loss": 1.3868,
"step": 87100
},
{
"epoch": 0.43,
"grad_norm": 3.3519859313964844,
"learning_rate": 9.486139198764735e-06,
"loss": 1.3696,
"step": 87200
},
{
"epoch": 0.43,
"grad_norm": 2.844688892364502,
"learning_rate": 9.485514063970043e-06,
"loss": 1.3498,
"step": 87300
},
{
"epoch": 0.43,
"grad_norm": 3.061849594116211,
"learning_rate": 9.484888929175355e-06,
"loss": 1.3692,
"step": 87400
},
{
"epoch": 0.43,
"grad_norm": 4.100019931793213,
"learning_rate": 9.484263794380663e-06,
"loss": 1.3407,
"step": 87500
},
{
"epoch": 0.43,
"grad_norm": 3.519801378250122,
"learning_rate": 9.483638659585975e-06,
"loss": 1.3565,
"step": 87600
},
{
"epoch": 0.43,
"grad_norm": 6.410887241363525,
"learning_rate": 9.483013524791283e-06,
"loss": 1.3291,
"step": 87700
},
{
"epoch": 0.43,
"grad_norm": 3.080322504043579,
"learning_rate": 9.482388389996595e-06,
"loss": 1.3519,
"step": 87800
},
{
"epoch": 0.43,
"grad_norm": 3.0409817695617676,
"learning_rate": 9.481763255201903e-06,
"loss": 1.3519,
"step": 87900
},
{
"epoch": 0.44,
"grad_norm": 4.2845025062561035,
"learning_rate": 9.481138120407214e-06,
"loss": 1.3436,
"step": 88000
},
{
"epoch": 0.44,
"grad_norm": 3.1132304668426514,
"learning_rate": 9.480512985612523e-06,
"loss": 1.3145,
"step": 88100
},
{
"epoch": 0.44,
"grad_norm": 4.386362075805664,
"learning_rate": 9.479887850817834e-06,
"loss": 1.3483,
"step": 88200
},
{
"epoch": 0.44,
"grad_norm": 2.7619481086730957,
"learning_rate": 9.479262716023143e-06,
"loss": 1.312,
"step": 88300
},
{
"epoch": 0.44,
"grad_norm": 3.451927900314331,
"learning_rate": 9.478637581228454e-06,
"loss": 1.3807,
"step": 88400
},
{
"epoch": 0.44,
"grad_norm": 3.5724120140075684,
"learning_rate": 9.478012446433762e-06,
"loss": 1.3469,
"step": 88500
},
{
"epoch": 0.44,
"grad_norm": 4.330935955047607,
"learning_rate": 9.477387311639072e-06,
"loss": 1.3136,
"step": 88600
},
{
"epoch": 0.44,
"grad_norm": 3.6509666442871094,
"learning_rate": 9.476762176844382e-06,
"loss": 1.3645,
"step": 88700
},
{
"epoch": 0.44,
"grad_norm": 3.39678692817688,
"learning_rate": 9.476137042049692e-06,
"loss": 1.3229,
"step": 88800
},
{
"epoch": 0.44,
"grad_norm": 3.2337393760681152,
"learning_rate": 9.475511907255002e-06,
"loss": 1.3473,
"step": 88900
},
{
"epoch": 0.44,
"grad_norm": 2.9486355781555176,
"learning_rate": 9.474886772460312e-06,
"loss": 1.341,
"step": 89000
},
{
"epoch": 0.44,
"grad_norm": 3.5861918926239014,
"learning_rate": 9.474261637665622e-06,
"loss": 1.3708,
"step": 89100
},
{
"epoch": 0.44,
"grad_norm": 2.8153584003448486,
"learning_rate": 9.473636502870932e-06,
"loss": 1.35,
"step": 89200
},
{
"epoch": 0.44,
"grad_norm": 3.0656278133392334,
"learning_rate": 9.473011368076242e-06,
"loss": 1.3564,
"step": 89300
},
{
"epoch": 0.44,
"grad_norm": 3.5475146770477295,
"learning_rate": 9.472386233281552e-06,
"loss": 1.3178,
"step": 89400
},
{
"epoch": 0.44,
"grad_norm": 4.837975025177002,
"learning_rate": 9.471761098486862e-06,
"loss": 1.3588,
"step": 89500
},
{
"epoch": 0.44,
"grad_norm": 3.626478433609009,
"learning_rate": 9.471135963692172e-06,
"loss": 1.3288,
"step": 89600
},
{
"epoch": 0.44,
"grad_norm": 2.8399198055267334,
"learning_rate": 9.470510828897481e-06,
"loss": 1.3542,
"step": 89700
},
{
"epoch": 0.44,
"grad_norm": 3.478510856628418,
"learning_rate": 9.469885694102791e-06,
"loss": 1.3282,
"step": 89800
},
{
"epoch": 0.44,
"grad_norm": 3.0036330223083496,
"learning_rate": 9.469260559308101e-06,
"loss": 1.3135,
"step": 89900
},
{
"epoch": 0.45,
"grad_norm": 3.49764084815979,
"learning_rate": 9.468635424513411e-06,
"loss": 1.3216,
"step": 90000
},
{
"epoch": 0.45,
"grad_norm": 4.711456298828125,
"learning_rate": 9.468010289718721e-06,
"loss": 1.3746,
"step": 90100
},
{
"epoch": 0.45,
"grad_norm": 3.8252532482147217,
"learning_rate": 9.467385154924031e-06,
"loss": 1.3375,
"step": 90200
},
{
"epoch": 0.45,
"grad_norm": 3.445317029953003,
"learning_rate": 9.466760020129341e-06,
"loss": 1.3523,
"step": 90300
},
{
"epoch": 0.45,
"grad_norm": 2.879566192626953,
"learning_rate": 9.466134885334651e-06,
"loss": 1.3184,
"step": 90400
},
{
"epoch": 0.45,
"grad_norm": 3.891055107116699,
"learning_rate": 9.465509750539961e-06,
"loss": 1.3246,
"step": 90500
},
{
"epoch": 0.45,
"grad_norm": 2.6852951049804688,
"learning_rate": 9.46488461574527e-06,
"loss": 1.3803,
"step": 90600
},
{
"epoch": 0.45,
"grad_norm": 2.9267516136169434,
"learning_rate": 9.46425948095058e-06,
"loss": 1.3254,
"step": 90700
},
{
"epoch": 0.45,
"grad_norm": 2.6373448371887207,
"learning_rate": 9.46363434615589e-06,
"loss": 1.3266,
"step": 90800
},
{
"epoch": 0.45,
"grad_norm": 3.703024387359619,
"learning_rate": 9.4630092113612e-06,
"loss": 1.3552,
"step": 90900
},
{
"epoch": 0.45,
"grad_norm": 2.5792810916900635,
"learning_rate": 9.46238407656651e-06,
"loss": 1.3365,
"step": 91000
},
{
"epoch": 0.45,
"grad_norm": 3.6020054817199707,
"learning_rate": 9.46175894177182e-06,
"loss": 1.3287,
"step": 91100
},
{
"epoch": 0.45,
"grad_norm": 3.713806390762329,
"learning_rate": 9.46113380697713e-06,
"loss": 1.3537,
"step": 91200
},
{
"epoch": 0.45,
"grad_norm": 4.407512187957764,
"learning_rate": 9.46050867218244e-06,
"loss": 1.3835,
"step": 91300
},
{
"epoch": 0.45,
"grad_norm": 2.683220863342285,
"learning_rate": 9.45988353738775e-06,
"loss": 1.3206,
"step": 91400
},
{
"epoch": 0.45,
"grad_norm": 3.046828031539917,
"learning_rate": 9.45925840259306e-06,
"loss": 1.3251,
"step": 91500
},
{
"epoch": 0.45,
"grad_norm": 2.8515381813049316,
"learning_rate": 9.45863326779837e-06,
"loss": 1.3432,
"step": 91600
},
{
"epoch": 0.45,
"grad_norm": 3.381223678588867,
"learning_rate": 9.45800813300368e-06,
"loss": 1.355,
"step": 91700
},
{
"epoch": 0.45,
"grad_norm": 3.301053762435913,
"learning_rate": 9.45738299820899e-06,
"loss": 1.3406,
"step": 91800
},
{
"epoch": 0.45,
"grad_norm": 2.924475908279419,
"learning_rate": 9.4567578634143e-06,
"loss": 1.3444,
"step": 91900
},
{
"epoch": 0.46,
"grad_norm": 3.036510705947876,
"learning_rate": 9.45613272861961e-06,
"loss": 1.3519,
"step": 92000
},
{
"epoch": 0.46,
"grad_norm": 2.7162649631500244,
"learning_rate": 9.45550759382492e-06,
"loss": 1.3407,
"step": 92100
},
{
"epoch": 0.46,
"grad_norm": 2.7335431575775146,
"learning_rate": 9.45488245903023e-06,
"loss": 1.3941,
"step": 92200
},
{
"epoch": 0.46,
"grad_norm": 2.4760313034057617,
"learning_rate": 9.45425732423554e-06,
"loss": 1.3481,
"step": 92300
},
{
"epoch": 0.46,
"grad_norm": 3.327454090118408,
"learning_rate": 9.45363218944085e-06,
"loss": 1.3313,
"step": 92400
},
{
"epoch": 0.46,
"grad_norm": 3.170297861099243,
"learning_rate": 9.453007054646158e-06,
"loss": 1.3222,
"step": 92500
},
{
"epoch": 0.46,
"grad_norm": 3.097593307495117,
"learning_rate": 9.45238191985147e-06,
"loss": 1.3615,
"step": 92600
},
{
"epoch": 0.46,
"grad_norm": 2.889549493789673,
"learning_rate": 9.451756785056778e-06,
"loss": 1.3447,
"step": 92700
},
{
"epoch": 0.46,
"grad_norm": 2.5188488960266113,
"learning_rate": 9.45113165026209e-06,
"loss": 1.3844,
"step": 92800
},
{
"epoch": 0.46,
"grad_norm": 2.8199424743652344,
"learning_rate": 9.450506515467397e-06,
"loss": 1.3751,
"step": 92900
},
{
"epoch": 0.46,
"grad_norm": 2.9179065227508545,
"learning_rate": 9.449881380672709e-06,
"loss": 1.3483,
"step": 93000
},
{
"epoch": 0.46,
"grad_norm": 3.69584584236145,
"learning_rate": 9.449256245878017e-06,
"loss": 1.3613,
"step": 93100
},
{
"epoch": 0.46,
"grad_norm": 4.401488780975342,
"learning_rate": 9.448631111083329e-06,
"loss": 1.3625,
"step": 93200
},
{
"epoch": 0.46,
"grad_norm": 2.9850871562957764,
"learning_rate": 9.448005976288637e-06,
"loss": 1.3522,
"step": 93300
},
{
"epoch": 0.46,
"grad_norm": 3.8156750202178955,
"learning_rate": 9.447380841493949e-06,
"loss": 1.3387,
"step": 93400
},
{
"epoch": 0.46,
"grad_norm": 3.664689779281616,
"learning_rate": 9.446755706699257e-06,
"loss": 1.3233,
"step": 93500
},
{
"epoch": 0.46,
"grad_norm": 4.119280815124512,
"learning_rate": 9.446130571904569e-06,
"loss": 1.396,
"step": 93600
},
{
"epoch": 0.46,
"grad_norm": 2.9794814586639404,
"learning_rate": 9.445505437109877e-06,
"loss": 1.3731,
"step": 93700
},
{
"epoch": 0.46,
"grad_norm": 2.943528890609741,
"learning_rate": 9.444880302315187e-06,
"loss": 1.3452,
"step": 93800
},
{
"epoch": 0.46,
"grad_norm": 2.734614610671997,
"learning_rate": 9.444255167520497e-06,
"loss": 1.346,
"step": 93900
},
{
"epoch": 0.47,
"grad_norm": 3.5047719478607178,
"learning_rate": 9.443630032725807e-06,
"loss": 1.3618,
"step": 94000
},
{
"epoch": 0.47,
"grad_norm": 3.040126323699951,
"learning_rate": 9.443004897931117e-06,
"loss": 1.3461,
"step": 94100
},
{
"epoch": 0.47,
"grad_norm": 3.951183795928955,
"learning_rate": 9.442379763136426e-06,
"loss": 1.3402,
"step": 94200
},
{
"epoch": 0.47,
"grad_norm": 3.8336355686187744,
"learning_rate": 9.441754628341736e-06,
"loss": 1.3211,
"step": 94300
},
{
"epoch": 0.47,
"grad_norm": 2.9649171829223633,
"learning_rate": 9.441129493547046e-06,
"loss": 1.3538,
"step": 94400
},
{
"epoch": 0.47,
"grad_norm": 3.0716583728790283,
"learning_rate": 9.440504358752356e-06,
"loss": 1.3518,
"step": 94500
},
{
"epoch": 0.47,
"grad_norm": 2.94270658493042,
"learning_rate": 9.439879223957666e-06,
"loss": 1.3278,
"step": 94600
},
{
"epoch": 0.47,
"grad_norm": 2.8442766666412354,
"learning_rate": 9.439254089162976e-06,
"loss": 1.32,
"step": 94700
},
{
"epoch": 0.47,
"grad_norm": 3.8846325874328613,
"learning_rate": 9.438628954368286e-06,
"loss": 1.3391,
"step": 94800
},
{
"epoch": 0.47,
"grad_norm": 2.698730230331421,
"learning_rate": 9.438003819573596e-06,
"loss": 1.3777,
"step": 94900
},
{
"epoch": 0.47,
"grad_norm": 3.3867924213409424,
"learning_rate": 9.437378684778906e-06,
"loss": 1.3563,
"step": 95000
},
{
"epoch": 0.47,
"grad_norm": 2.769615411758423,
"learning_rate": 9.436753549984216e-06,
"loss": 1.3298,
"step": 95100
},
{
"epoch": 0.47,
"grad_norm": 3.6002724170684814,
"learning_rate": 9.436128415189526e-06,
"loss": 1.3425,
"step": 95200
},
{
"epoch": 0.47,
"grad_norm": 3.228452205657959,
"learning_rate": 9.435503280394836e-06,
"loss": 1.3429,
"step": 95300
},
{
"epoch": 0.47,
"grad_norm": 3.423189401626587,
"learning_rate": 9.434878145600146e-06,
"loss": 1.3768,
"step": 95400
},
{
"epoch": 0.47,
"grad_norm": 3.5708446502685547,
"learning_rate": 9.434253010805456e-06,
"loss": 1.3125,
"step": 95500
},
{
"epoch": 0.47,
"grad_norm": 2.9108211994171143,
"learning_rate": 9.433627876010765e-06,
"loss": 1.3347,
"step": 95600
},
{
"epoch": 0.47,
"grad_norm": 3.365302324295044,
"learning_rate": 9.433002741216075e-06,
"loss": 1.3277,
"step": 95700
},
{
"epoch": 0.47,
"grad_norm": 3.0356671810150146,
"learning_rate": 9.432377606421385e-06,
"loss": 1.3345,
"step": 95800
},
{
"epoch": 0.47,
"grad_norm": 2.6186368465423584,
"learning_rate": 9.431752471626695e-06,
"loss": 1.3059,
"step": 95900
},
{
"epoch": 0.47,
"grad_norm": 3.4578585624694824,
"learning_rate": 9.431127336832005e-06,
"loss": 1.3635,
"step": 96000
},
{
"epoch": 0.48,
"grad_norm": 2.8224425315856934,
"learning_rate": 9.430502202037315e-06,
"loss": 1.3325,
"step": 96100
},
{
"epoch": 0.48,
"grad_norm": 3.635671377182007,
"learning_rate": 9.429877067242625e-06,
"loss": 1.3007,
"step": 96200
},
{
"epoch": 0.48,
"grad_norm": 2.900747299194336,
"learning_rate": 9.429251932447935e-06,
"loss": 1.3317,
"step": 96300
},
{
"epoch": 0.48,
"grad_norm": 2.8643271923065186,
"learning_rate": 9.428626797653245e-06,
"loss": 1.3489,
"step": 96400
},
{
"epoch": 0.48,
"grad_norm": 3.498797655105591,
"learning_rate": 9.428001662858555e-06,
"loss": 1.3263,
"step": 96500
},
{
"epoch": 0.48,
"grad_norm": 2.8632445335388184,
"learning_rate": 9.427376528063865e-06,
"loss": 1.3465,
"step": 96600
},
{
"epoch": 0.48,
"grad_norm": 2.886178970336914,
"learning_rate": 9.426751393269175e-06,
"loss": 1.3622,
"step": 96700
},
{
"epoch": 0.48,
"grad_norm": 3.605872869491577,
"learning_rate": 9.426126258474485e-06,
"loss": 1.3404,
"step": 96800
},
{
"epoch": 0.48,
"grad_norm": 4.709196090698242,
"learning_rate": 9.425501123679795e-06,
"loss": 1.3344,
"step": 96900
},
{
"epoch": 0.48,
"grad_norm": 3.5497000217437744,
"learning_rate": 9.424875988885104e-06,
"loss": 1.3536,
"step": 97000
},
{
"epoch": 0.48,
"grad_norm": 3.165081024169922,
"learning_rate": 9.424250854090414e-06,
"loss": 1.3453,
"step": 97100
},
{
"epoch": 0.48,
"grad_norm": 3.4329254627227783,
"learning_rate": 9.423625719295724e-06,
"loss": 1.3044,
"step": 97200
},
{
"epoch": 0.48,
"grad_norm": 3.0184082984924316,
"learning_rate": 9.423000584501034e-06,
"loss": 1.344,
"step": 97300
},
{
"epoch": 0.48,
"grad_norm": 3.776301383972168,
"learning_rate": 9.422375449706344e-06,
"loss": 1.3315,
"step": 97400
},
{
"epoch": 0.48,
"grad_norm": 2.908074140548706,
"learning_rate": 9.421750314911654e-06,
"loss": 1.3613,
"step": 97500
},
{
"epoch": 0.48,
"grad_norm": 3.8458778858184814,
"learning_rate": 9.421125180116964e-06,
"loss": 1.3665,
"step": 97600
},
{
"epoch": 0.48,
"grad_norm": 3.778986930847168,
"learning_rate": 9.420500045322274e-06,
"loss": 1.3271,
"step": 97700
},
{
"epoch": 0.48,
"grad_norm": 4.841845512390137,
"learning_rate": 9.419874910527584e-06,
"loss": 1.3842,
"step": 97800
},
{
"epoch": 0.48,
"grad_norm": 2.875431537628174,
"learning_rate": 9.419249775732892e-06,
"loss": 1.3734,
"step": 97900
},
{
"epoch": 0.48,
"grad_norm": 3.327831268310547,
"learning_rate": 9.418624640938204e-06,
"loss": 1.3225,
"step": 98000
},
{
"epoch": 0.49,
"grad_norm": 3.921052932739258,
"learning_rate": 9.417999506143512e-06,
"loss": 1.3103,
"step": 98100
},
{
"epoch": 0.49,
"grad_norm": 3.3352317810058594,
"learning_rate": 9.417374371348824e-06,
"loss": 1.33,
"step": 98200
},
{
"epoch": 0.49,
"grad_norm": 2.6515772342681885,
"learning_rate": 9.416749236554132e-06,
"loss": 1.3325,
"step": 98300
},
{
"epoch": 0.49,
"grad_norm": 2.6556906700134277,
"learning_rate": 9.416124101759443e-06,
"loss": 1.3537,
"step": 98400
},
{
"epoch": 0.49,
"grad_norm": 3.394216775894165,
"learning_rate": 9.415498966964752e-06,
"loss": 1.337,
"step": 98500
},
{
"epoch": 0.49,
"grad_norm": 3.2017979621887207,
"learning_rate": 9.414873832170063e-06,
"loss": 1.3697,
"step": 98600
},
{
"epoch": 0.49,
"grad_norm": 4.548534393310547,
"learning_rate": 9.414248697375371e-06,
"loss": 1.353,
"step": 98700
},
{
"epoch": 0.49,
"grad_norm": 3.0345072746276855,
"learning_rate": 9.413623562580683e-06,
"loss": 1.3475,
"step": 98800
},
{
"epoch": 0.49,
"grad_norm": 2.3942067623138428,
"learning_rate": 9.412998427785991e-06,
"loss": 1.293,
"step": 98900
},
{
"epoch": 0.49,
"grad_norm": 2.577939033508301,
"learning_rate": 9.412373292991303e-06,
"loss": 1.3267,
"step": 99000
},
{
"epoch": 0.49,
"grad_norm": 3.599987745285034,
"learning_rate": 9.411748158196611e-06,
"loss": 1.3248,
"step": 99100
},
{
"epoch": 0.49,
"grad_norm": 2.732025623321533,
"learning_rate": 9.411123023401921e-06,
"loss": 1.3331,
"step": 99200
},
{
"epoch": 0.49,
"grad_norm": 3.382721185684204,
"learning_rate": 9.410497888607231e-06,
"loss": 1.3492,
"step": 99300
},
{
"epoch": 0.49,
"grad_norm": 3.670431613922119,
"learning_rate": 9.409872753812541e-06,
"loss": 1.3651,
"step": 99400
},
{
"epoch": 0.49,
"grad_norm": 2.932300329208374,
"learning_rate": 9.409247619017851e-06,
"loss": 1.3176,
"step": 99500
},
{
"epoch": 0.49,
"grad_norm": 4.241666316986084,
"learning_rate": 9.40862248422316e-06,
"loss": 1.3056,
"step": 99600
},
{
"epoch": 0.49,
"grad_norm": 2.6911585330963135,
"learning_rate": 9.40799734942847e-06,
"loss": 1.3425,
"step": 99700
},
{
"epoch": 0.49,
"grad_norm": 2.879465103149414,
"learning_rate": 9.40737221463378e-06,
"loss": 1.3395,
"step": 99800
},
{
"epoch": 0.49,
"grad_norm": 2.778740167617798,
"learning_rate": 9.40674707983909e-06,
"loss": 1.3426,
"step": 99900
},
{
"epoch": 0.49,
"grad_norm": 2.6691386699676514,
"learning_rate": 9.4061219450444e-06,
"loss": 1.3696,
"step": 100000
},
{
"epoch": 0.5,
"grad_norm": 4.0432562828063965,
"learning_rate": 9.40549681024971e-06,
"loss": 1.368,
"step": 100100
},
{
"epoch": 0.5,
"grad_norm": 2.7415411472320557,
"learning_rate": 9.40487167545502e-06,
"loss": 1.3368,
"step": 100200
},
{
"epoch": 0.5,
"grad_norm": 2.6961042881011963,
"learning_rate": 9.404246540660332e-06,
"loss": 1.3821,
"step": 100300
},
{
"epoch": 0.5,
"grad_norm": 3.592819929122925,
"learning_rate": 9.40362140586564e-06,
"loss": 1.3313,
"step": 100400
},
{
"epoch": 0.5,
"grad_norm": 3.588106632232666,
"learning_rate": 9.40299627107095e-06,
"loss": 1.3369,
"step": 100500
},
{
"epoch": 0.5,
"grad_norm": 3.1717209815979004,
"learning_rate": 9.40237113627626e-06,
"loss": 1.3063,
"step": 100600
},
{
"epoch": 0.5,
"grad_norm": 3.9011149406433105,
"learning_rate": 9.40174600148157e-06,
"loss": 1.3371,
"step": 100700
},
{
"epoch": 0.5,
"grad_norm": 2.861337184906006,
"learning_rate": 9.40112086668688e-06,
"loss": 1.3511,
"step": 100800
},
{
"epoch": 0.5,
"grad_norm": 3.2174508571624756,
"learning_rate": 9.40049573189219e-06,
"loss": 1.3374,
"step": 100900
},
{
"epoch": 0.5,
"grad_norm": 3.301086664199829,
"learning_rate": 9.3998705970975e-06,
"loss": 1.3024,
"step": 101000
},
{
"epoch": 0.5,
"grad_norm": 3.3187129497528076,
"learning_rate": 9.39924546230281e-06,
"loss": 1.349,
"step": 101100
},
{
"epoch": 0.5,
"grad_norm": 2.6953561305999756,
"learning_rate": 9.39862032750812e-06,
"loss": 1.3528,
"step": 101200
},
{
"epoch": 0.5,
"grad_norm": 3.3338325023651123,
"learning_rate": 9.39799519271343e-06,
"loss": 1.3511,
"step": 101300
},
{
"epoch": 0.5,
"grad_norm": 3.3522443771362305,
"learning_rate": 9.39737005791874e-06,
"loss": 1.3373,
"step": 101400
},
{
"epoch": 0.5,
"grad_norm": 2.7400362491607666,
"learning_rate": 9.39674492312405e-06,
"loss": 1.3535,
"step": 101500
},
{
"epoch": 0.5,
"grad_norm": 2.793731212615967,
"learning_rate": 9.39611978832936e-06,
"loss": 1.3184,
"step": 101600
},
{
"epoch": 0.5,
"grad_norm": 2.759066581726074,
"learning_rate": 9.39549465353467e-06,
"loss": 1.3489,
"step": 101700
},
{
"epoch": 0.5,
"grad_norm": 4.7479681968688965,
"learning_rate": 9.39486951873998e-06,
"loss": 1.3498,
"step": 101800
},
{
"epoch": 0.5,
"grad_norm": 3.2522835731506348,
"learning_rate": 9.394244383945289e-06,
"loss": 1.3476,
"step": 101900
},
{
"epoch": 0.5,
"grad_norm": 4.208197593688965,
"learning_rate": 9.393619249150599e-06,
"loss": 1.3289,
"step": 102000
},
{
"epoch": 0.51,
"grad_norm": 2.8332533836364746,
"learning_rate": 9.392994114355909e-06,
"loss": 1.3501,
"step": 102100
},
{
"epoch": 0.51,
"grad_norm": 3.675553798675537,
"learning_rate": 9.392368979561219e-06,
"loss": 1.3559,
"step": 102200
},
{
"epoch": 0.51,
"grad_norm": 2.7442257404327393,
"learning_rate": 9.391743844766529e-06,
"loss": 1.346,
"step": 102300
},
{
"epoch": 0.51,
"grad_norm": 3.129180431365967,
"learning_rate": 9.391118709971839e-06,
"loss": 1.3265,
"step": 102400
},
{
"epoch": 0.51,
"grad_norm": 3.1826012134552,
"learning_rate": 9.390493575177149e-06,
"loss": 1.3488,
"step": 102500
},
{
"epoch": 0.51,
"grad_norm": 3.8879926204681396,
"learning_rate": 9.389868440382459e-06,
"loss": 1.3814,
"step": 102600
},
{
"epoch": 0.51,
"grad_norm": 4.066867828369141,
"learning_rate": 9.389243305587769e-06,
"loss": 1.3454,
"step": 102700
},
{
"epoch": 0.51,
"grad_norm": 3.3028340339660645,
"learning_rate": 9.388618170793078e-06,
"loss": 1.3661,
"step": 102800
},
{
"epoch": 0.51,
"grad_norm": 2.9503161907196045,
"learning_rate": 9.387993035998388e-06,
"loss": 1.3326,
"step": 102900
},
{
"epoch": 0.51,
"grad_norm": 3.030353546142578,
"learning_rate": 9.387367901203698e-06,
"loss": 1.3436,
"step": 103000
},
{
"epoch": 0.51,
"grad_norm": 3.6172800064086914,
"learning_rate": 9.386742766409007e-06,
"loss": 1.3545,
"step": 103100
},
{
"epoch": 0.51,
"grad_norm": 3.2115883827209473,
"learning_rate": 9.386117631614318e-06,
"loss": 1.3345,
"step": 103200
},
{
"epoch": 0.51,
"grad_norm": 2.9105865955352783,
"learning_rate": 9.385492496819626e-06,
"loss": 1.3667,
"step": 103300
},
{
"epoch": 0.51,
"grad_norm": 4.278082847595215,
"learning_rate": 9.384867362024938e-06,
"loss": 1.3497,
"step": 103400
},
{
"epoch": 0.51,
"grad_norm": 3.113901376724243,
"learning_rate": 9.384242227230246e-06,
"loss": 1.3216,
"step": 103500
},
{
"epoch": 0.51,
"grad_norm": 3.9379656314849854,
"learning_rate": 9.383617092435558e-06,
"loss": 1.3469,
"step": 103600
},
{
"epoch": 0.51,
"grad_norm": 3.53202748298645,
"learning_rate": 9.382991957640866e-06,
"loss": 1.3488,
"step": 103700
},
{
"epoch": 0.51,
"grad_norm": 2.973238468170166,
"learning_rate": 9.382366822846178e-06,
"loss": 1.3387,
"step": 103800
},
{
"epoch": 0.51,
"grad_norm": 3.464711904525757,
"learning_rate": 9.381741688051486e-06,
"loss": 1.3472,
"step": 103900
},
{
"epoch": 0.51,
"grad_norm": 2.8774147033691406,
"learning_rate": 9.381116553256798e-06,
"loss": 1.3786,
"step": 104000
},
{
"epoch": 0.52,
"grad_norm": 3.0821099281311035,
"learning_rate": 9.380491418462106e-06,
"loss": 1.3329,
"step": 104100
},
{
"epoch": 0.52,
"grad_norm": 4.228484630584717,
"learning_rate": 9.379866283667417e-06,
"loss": 1.3329,
"step": 104200
},
{
"epoch": 0.52,
"grad_norm": 3.113833427429199,
"learning_rate": 9.379241148872726e-06,
"loss": 1.373,
"step": 104300
},
{
"epoch": 0.52,
"grad_norm": 3.078624963760376,
"learning_rate": 9.378616014078036e-06,
"loss": 1.3409,
"step": 104400
},
{
"epoch": 0.52,
"grad_norm": 2.510251045227051,
"learning_rate": 9.377990879283346e-06,
"loss": 1.3393,
"step": 104500
},
{
"epoch": 0.52,
"grad_norm": 3.0739970207214355,
"learning_rate": 9.377365744488655e-06,
"loss": 1.3696,
"step": 104600
},
{
"epoch": 0.52,
"grad_norm": 3.1729655265808105,
"learning_rate": 9.376740609693965e-06,
"loss": 1.3229,
"step": 104700
},
{
"epoch": 0.52,
"grad_norm": 3.003714084625244,
"learning_rate": 9.376115474899275e-06,
"loss": 1.3309,
"step": 104800
},
{
"epoch": 0.52,
"grad_norm": 3.0130865573883057,
"learning_rate": 9.375490340104585e-06,
"loss": 1.3736,
"step": 104900
},
{
"epoch": 0.52,
"grad_norm": 3.3182711601257324,
"learning_rate": 9.374865205309895e-06,
"loss": 1.342,
"step": 105000
},
{
"epoch": 0.52,
"grad_norm": 2.8089771270751953,
"learning_rate": 9.374240070515205e-06,
"loss": 1.3187,
"step": 105100
},
{
"epoch": 0.52,
"grad_norm": 3.321974515914917,
"learning_rate": 9.373614935720515e-06,
"loss": 1.3676,
"step": 105200
},
{
"epoch": 0.52,
"grad_norm": 3.114701986312866,
"learning_rate": 9.372989800925825e-06,
"loss": 1.3746,
"step": 105300
},
{
"epoch": 0.52,
"grad_norm": 3.1047348976135254,
"learning_rate": 9.372364666131135e-06,
"loss": 1.3437,
"step": 105400
},
{
"epoch": 0.52,
"grad_norm": 3.5590476989746094,
"learning_rate": 9.371739531336446e-06,
"loss": 1.3496,
"step": 105500
},
{
"epoch": 0.52,
"grad_norm": 3.6819374561309814,
"learning_rate": 9.371114396541755e-06,
"loss": 1.3254,
"step": 105600
},
{
"epoch": 0.52,
"grad_norm": 3.0282516479492188,
"learning_rate": 9.370489261747066e-06,
"loss": 1.3568,
"step": 105700
},
{
"epoch": 0.52,
"grad_norm": 3.659374475479126,
"learning_rate": 9.369864126952375e-06,
"loss": 1.3331,
"step": 105800
},
{
"epoch": 0.52,
"grad_norm": 2.9225759506225586,
"learning_rate": 9.369238992157685e-06,
"loss": 1.3293,
"step": 105900
},
{
"epoch": 0.52,
"grad_norm": 3.179713726043701,
"learning_rate": 9.368613857362994e-06,
"loss": 1.3342,
"step": 106000
},
{
"epoch": 0.52,
"grad_norm": 3.126467704772949,
"learning_rate": 9.367988722568304e-06,
"loss": 1.3441,
"step": 106100
},
{
"epoch": 0.53,
"grad_norm": 4.179965019226074,
"learning_rate": 9.367363587773614e-06,
"loss": 1.3225,
"step": 106200
},
{
"epoch": 0.53,
"grad_norm": 4.020696640014648,
"learning_rate": 9.366738452978924e-06,
"loss": 1.3263,
"step": 106300
},
{
"epoch": 0.53,
"grad_norm": 4.02736759185791,
"learning_rate": 9.366113318184234e-06,
"loss": 1.3621,
"step": 106400
},
{
"epoch": 0.53,
"grad_norm": 2.7809269428253174,
"learning_rate": 9.365488183389544e-06,
"loss": 1.3583,
"step": 106500
},
{
"epoch": 0.53,
"grad_norm": 3.92323637008667,
"learning_rate": 9.364863048594854e-06,
"loss": 1.359,
"step": 106600
},
{
"epoch": 0.53,
"grad_norm": 3.1310439109802246,
"learning_rate": 9.364237913800164e-06,
"loss": 1.3296,
"step": 106700
},
{
"epoch": 0.53,
"grad_norm": 2.9712395668029785,
"learning_rate": 9.363612779005474e-06,
"loss": 1.3118,
"step": 106800
},
{
"epoch": 0.53,
"grad_norm": 3.047405481338501,
"learning_rate": 9.362987644210784e-06,
"loss": 1.3441,
"step": 106900
},
{
"epoch": 0.53,
"grad_norm": 4.126023292541504,
"learning_rate": 9.362362509416094e-06,
"loss": 1.3484,
"step": 107000
},
{
"epoch": 0.53,
"grad_norm": 4.234996318817139,
"learning_rate": 9.361737374621404e-06,
"loss": 1.2947,
"step": 107100
},
{
"epoch": 0.53,
"grad_norm": 3.2066574096679688,
"learning_rate": 9.361112239826714e-06,
"loss": 1.3595,
"step": 107200
},
{
"epoch": 0.53,
"grad_norm": 2.9832849502563477,
"learning_rate": 9.360487105032023e-06,
"loss": 1.324,
"step": 107300
},
{
"epoch": 0.53,
"grad_norm": 3.168886661529541,
"learning_rate": 9.359861970237333e-06,
"loss": 1.3508,
"step": 107400
},
{
"epoch": 0.53,
"grad_norm": 4.0027995109558105,
"learning_rate": 9.359236835442643e-06,
"loss": 1.344,
"step": 107500
},
{
"epoch": 0.53,
"grad_norm": 3.394458770751953,
"learning_rate": 9.358611700647953e-06,
"loss": 1.3295,
"step": 107600
},
{
"epoch": 0.53,
"grad_norm": 2.4304399490356445,
"learning_rate": 9.357986565853263e-06,
"loss": 1.3473,
"step": 107700
},
{
"epoch": 0.53,
"grad_norm": 2.8694140911102295,
"learning_rate": 9.357361431058573e-06,
"loss": 1.3052,
"step": 107800
},
{
"epoch": 0.53,
"grad_norm": 2.8801755905151367,
"learning_rate": 9.356736296263883e-06,
"loss": 1.3495,
"step": 107900
},
{
"epoch": 0.53,
"grad_norm": 2.713139057159424,
"learning_rate": 9.356111161469193e-06,
"loss": 1.3552,
"step": 108000
},
{
"epoch": 0.53,
"grad_norm": 3.1576766967773438,
"learning_rate": 9.355486026674503e-06,
"loss": 1.3425,
"step": 108100
},
{
"epoch": 0.54,
"grad_norm": 3.07737398147583,
"learning_rate": 9.354860891879813e-06,
"loss": 1.2991,
"step": 108200
},
{
"epoch": 0.54,
"grad_norm": 3.1460793018341064,
"learning_rate": 9.354235757085123e-06,
"loss": 1.3528,
"step": 108300
},
{
"epoch": 0.54,
"grad_norm": 3.4123237133026123,
"learning_rate": 9.353610622290433e-06,
"loss": 1.3549,
"step": 108400
},
{
"epoch": 0.54,
"grad_norm": 4.295971870422363,
"learning_rate": 9.352985487495741e-06,
"loss": 1.32,
"step": 108500
},
{
"epoch": 0.54,
"grad_norm": 7.587291240692139,
"learning_rate": 9.352360352701053e-06,
"loss": 1.3205,
"step": 108600
},
{
"epoch": 0.54,
"grad_norm": 2.6797661781311035,
"learning_rate": 9.35173521790636e-06,
"loss": 1.3383,
"step": 108700
},
{
"epoch": 0.54,
"grad_norm": 2.9257397651672363,
"learning_rate": 9.351110083111672e-06,
"loss": 1.343,
"step": 108800
},
{
"epoch": 0.54,
"grad_norm": 2.9473204612731934,
"learning_rate": 9.35048494831698e-06,
"loss": 1.3423,
"step": 108900
},
{
"epoch": 0.54,
"grad_norm": 3.4466347694396973,
"learning_rate": 9.349859813522292e-06,
"loss": 1.3261,
"step": 109000
},
{
"epoch": 0.54,
"grad_norm": 3.173017978668213,
"learning_rate": 9.3492346787276e-06,
"loss": 1.3222,
"step": 109100
},
{
"epoch": 0.54,
"grad_norm": 3.7112643718719482,
"learning_rate": 9.348609543932912e-06,
"loss": 1.3528,
"step": 109200
},
{
"epoch": 0.54,
"grad_norm": 2.747380495071411,
"learning_rate": 9.34798440913822e-06,
"loss": 1.343,
"step": 109300
},
{
"epoch": 0.54,
"grad_norm": 2.6822452545166016,
"learning_rate": 9.347359274343532e-06,
"loss": 1.3478,
"step": 109400
},
{
"epoch": 0.54,
"grad_norm": 3.6231350898742676,
"learning_rate": 9.34673413954884e-06,
"loss": 1.3569,
"step": 109500
},
{
"epoch": 0.54,
"grad_norm": 2.985164165496826,
"learning_rate": 9.346109004754152e-06,
"loss": 1.358,
"step": 109600
},
{
"epoch": 0.54,
"grad_norm": 3.304058790206909,
"learning_rate": 9.34548386995946e-06,
"loss": 1.3312,
"step": 109700
},
{
"epoch": 0.54,
"grad_norm": 3.1674203872680664,
"learning_rate": 9.34485873516477e-06,
"loss": 1.3499,
"step": 109800
},
{
"epoch": 0.54,
"grad_norm": 3.0080056190490723,
"learning_rate": 9.34423360037008e-06,
"loss": 1.3268,
"step": 109900
},
{
"epoch": 0.54,
"grad_norm": 3.8802080154418945,
"learning_rate": 9.34360846557539e-06,
"loss": 1.3494,
"step": 110000
},
{
"epoch": 0.54,
"grad_norm": 3.842288017272949,
"learning_rate": 9.3429833307807e-06,
"loss": 1.3368,
"step": 110100
},
{
"epoch": 0.55,
"grad_norm": 3.315469980239868,
"learning_rate": 9.34235819598601e-06,
"loss": 1.3744,
"step": 110200
},
{
"epoch": 0.55,
"grad_norm": 5.507584095001221,
"learning_rate": 9.34173306119132e-06,
"loss": 1.3635,
"step": 110300
},
{
"epoch": 0.55,
"grad_norm": 3.494532585144043,
"learning_rate": 9.34110792639663e-06,
"loss": 1.3553,
"step": 110400
},
{
"epoch": 0.55,
"grad_norm": 2.602483034133911,
"learning_rate": 9.34048279160194e-06,
"loss": 1.3783,
"step": 110500
},
{
"epoch": 0.55,
"grad_norm": 4.201229572296143,
"learning_rate": 9.33985765680725e-06,
"loss": 1.3466,
"step": 110600
},
{
"epoch": 0.55,
"grad_norm": 3.809846878051758,
"learning_rate": 9.339232522012561e-06,
"loss": 1.3274,
"step": 110700
},
{
"epoch": 0.55,
"grad_norm": 2.964759588241577,
"learning_rate": 9.33860738721787e-06,
"loss": 1.3954,
"step": 110800
},
{
"epoch": 0.55,
"grad_norm": 2.925959587097168,
"learning_rate": 9.33798225242318e-06,
"loss": 1.3258,
"step": 110900
},
{
"epoch": 0.55,
"grad_norm": 2.654022455215454,
"learning_rate": 9.337357117628489e-06,
"loss": 1.3504,
"step": 111000
},
{
"epoch": 0.55,
"grad_norm": 3.296046257019043,
"learning_rate": 9.336731982833799e-06,
"loss": 1.3136,
"step": 111100
},
{
"epoch": 0.55,
"grad_norm": 3.804032564163208,
"learning_rate": 9.336106848039109e-06,
"loss": 1.3428,
"step": 111200
},
{
"epoch": 0.55,
"grad_norm": 3.254333019256592,
"learning_rate": 9.335481713244419e-06,
"loss": 1.3583,
"step": 111300
},
{
"epoch": 0.55,
"grad_norm": 2.7955849170684814,
"learning_rate": 9.334856578449729e-06,
"loss": 1.3294,
"step": 111400
},
{
"epoch": 0.55,
"grad_norm": 4.444815635681152,
"learning_rate": 9.334231443655039e-06,
"loss": 1.3727,
"step": 111500
},
{
"epoch": 0.55,
"grad_norm": 2.5120747089385986,
"learning_rate": 9.333606308860349e-06,
"loss": 1.3454,
"step": 111600
},
{
"epoch": 0.55,
"grad_norm": 3.015342950820923,
"learning_rate": 9.332981174065659e-06,
"loss": 1.319,
"step": 111700
},
{
"epoch": 0.55,
"grad_norm": 3.021049976348877,
"learning_rate": 9.332356039270968e-06,
"loss": 1.3498,
"step": 111800
},
{
"epoch": 0.55,
"grad_norm": 3.354524850845337,
"learning_rate": 9.331730904476278e-06,
"loss": 1.3552,
"step": 111900
},
{
"epoch": 0.55,
"grad_norm": 2.649719476699829,
"learning_rate": 9.331105769681588e-06,
"loss": 1.3276,
"step": 112000
},
{
"epoch": 0.55,
"grad_norm": 3.1411306858062744,
"learning_rate": 9.330480634886898e-06,
"loss": 1.3616,
"step": 112100
},
{
"epoch": 0.56,
"grad_norm": 3.030653238296509,
"learning_rate": 9.329855500092208e-06,
"loss": 1.3253,
"step": 112200
},
{
"epoch": 0.56,
"grad_norm": 2.987105369567871,
"learning_rate": 9.329230365297518e-06,
"loss": 1.3133,
"step": 112300
},
{
"epoch": 0.56,
"grad_norm": 3.9337832927703857,
"learning_rate": 9.328605230502828e-06,
"loss": 1.3697,
"step": 112400
},
{
"epoch": 0.56,
"grad_norm": 2.6805777549743652,
"learning_rate": 9.327980095708138e-06,
"loss": 1.3689,
"step": 112500
},
{
"epoch": 0.56,
"grad_norm": 3.582444667816162,
"learning_rate": 9.327354960913448e-06,
"loss": 1.3746,
"step": 112600
},
{
"epoch": 0.56,
"grad_norm": 4.040530204772949,
"learning_rate": 9.326729826118758e-06,
"loss": 1.3484,
"step": 112700
},
{
"epoch": 0.56,
"grad_norm": 3.6632535457611084,
"learning_rate": 9.326104691324068e-06,
"loss": 1.3191,
"step": 112800
},
{
"epoch": 0.56,
"grad_norm": 2.814882278442383,
"learning_rate": 9.325479556529378e-06,
"loss": 1.3331,
"step": 112900
},
{
"epoch": 0.56,
"grad_norm": 3.2695939540863037,
"learning_rate": 9.324854421734688e-06,
"loss": 1.3307,
"step": 113000
},
{
"epoch": 0.56,
"grad_norm": 3.6386842727661133,
"learning_rate": 9.324229286939998e-06,
"loss": 1.3464,
"step": 113100
},
{
"epoch": 0.56,
"grad_norm": 2.7537262439727783,
"learning_rate": 9.323604152145307e-06,
"loss": 1.3177,
"step": 113200
},
{
"epoch": 0.56,
"grad_norm": 3.279010057449341,
"learning_rate": 9.322979017350617e-06,
"loss": 1.353,
"step": 113300
},
{
"epoch": 0.56,
"grad_norm": 3.230193614959717,
"learning_rate": 9.322353882555927e-06,
"loss": 1.3301,
"step": 113400
},
{
"epoch": 0.56,
"grad_norm": 2.851243257522583,
"learning_rate": 9.321728747761237e-06,
"loss": 1.3535,
"step": 113500
},
{
"epoch": 0.56,
"grad_norm": 2.865309000015259,
"learning_rate": 9.321103612966547e-06,
"loss": 1.3836,
"step": 113600
},
{
"epoch": 0.56,
"grad_norm": 3.171292781829834,
"learning_rate": 9.320478478171855e-06,
"loss": 1.3426,
"step": 113700
},
{
"epoch": 0.56,
"grad_norm": 3.774327278137207,
"learning_rate": 9.319853343377167e-06,
"loss": 1.3376,
"step": 113800
},
{
"epoch": 0.56,
"grad_norm": 3.775113344192505,
"learning_rate": 9.319228208582475e-06,
"loss": 1.3253,
"step": 113900
},
{
"epoch": 0.56,
"grad_norm": 3.0207529067993164,
"learning_rate": 9.318603073787787e-06,
"loss": 1.3176,
"step": 114000
},
{
"epoch": 0.56,
"grad_norm": 3.2777695655822754,
"learning_rate": 9.317977938993095e-06,
"loss": 1.3699,
"step": 114100
},
{
"epoch": 0.56,
"grad_norm": 3.0100061893463135,
"learning_rate": 9.317352804198407e-06,
"loss": 1.3158,
"step": 114200
},
{
"epoch": 0.57,
"grad_norm": 3.428809881210327,
"learning_rate": 9.316727669403715e-06,
"loss": 1.3165,
"step": 114300
},
{
"epoch": 0.57,
"grad_norm": 2.833083391189575,
"learning_rate": 9.316102534609027e-06,
"loss": 1.3487,
"step": 114400
},
{
"epoch": 0.57,
"grad_norm": 2.81231951713562,
"learning_rate": 9.315477399814335e-06,
"loss": 1.327,
"step": 114500
},
{
"epoch": 0.57,
"grad_norm": 2.9721994400024414,
"learning_rate": 9.314852265019646e-06,
"loss": 1.3189,
"step": 114600
},
{
"epoch": 0.57,
"grad_norm": 2.969564437866211,
"learning_rate": 9.314227130224955e-06,
"loss": 1.3487,
"step": 114700
},
{
"epoch": 0.57,
"grad_norm": 3.1244125366210938,
"learning_rate": 9.313601995430266e-06,
"loss": 1.3391,
"step": 114800
},
{
"epoch": 0.57,
"grad_norm": 3.785893201828003,
"learning_rate": 9.312976860635575e-06,
"loss": 1.3017,
"step": 114900
},
{
"epoch": 0.57,
"grad_norm": 2.9908628463745117,
"learning_rate": 9.312351725840884e-06,
"loss": 1.3004,
"step": 115000
},
{
"epoch": 0.57,
"grad_norm": 2.7877655029296875,
"learning_rate": 9.311726591046194e-06,
"loss": 1.3345,
"step": 115100
},
{
"epoch": 0.57,
"grad_norm": 2.73725962638855,
"learning_rate": 9.311101456251504e-06,
"loss": 1.3164,
"step": 115200
},
{
"epoch": 0.57,
"grad_norm": 2.6744511127471924,
"learning_rate": 9.310476321456814e-06,
"loss": 1.3362,
"step": 115300
},
{
"epoch": 0.57,
"grad_norm": 2.9426522254943848,
"learning_rate": 9.309851186662124e-06,
"loss": 1.3111,
"step": 115400
},
{
"epoch": 0.57,
"grad_norm": 3.818319797515869,
"learning_rate": 9.309226051867434e-06,
"loss": 1.3193,
"step": 115500
},
{
"epoch": 0.57,
"grad_norm": 3.0991666316986084,
"learning_rate": 9.308600917072744e-06,
"loss": 1.3144,
"step": 115600
},
{
"epoch": 0.57,
"grad_norm": 3.6219863891601562,
"learning_rate": 9.307975782278054e-06,
"loss": 1.3397,
"step": 115700
},
{
"epoch": 0.57,
"grad_norm": 3.1432971954345703,
"learning_rate": 9.307350647483364e-06,
"loss": 1.3259,
"step": 115800
},
{
"epoch": 0.57,
"grad_norm": 4.15132999420166,
"learning_rate": 9.306725512688675e-06,
"loss": 1.3362,
"step": 115900
},
{
"epoch": 0.57,
"grad_norm": 4.191103935241699,
"learning_rate": 9.306100377893984e-06,
"loss": 1.3117,
"step": 116000
},
{
"epoch": 0.57,
"grad_norm": 3.6365630626678467,
"learning_rate": 9.305475243099295e-06,
"loss": 1.3397,
"step": 116100
},
{
"epoch": 0.57,
"grad_norm": 2.899077892303467,
"learning_rate": 9.304850108304604e-06,
"loss": 1.3434,
"step": 116200
},
{
"epoch": 0.58,
"grad_norm": 3.006063461303711,
"learning_rate": 9.304224973509915e-06,
"loss": 1.3148,
"step": 116300
},
{
"epoch": 0.58,
"grad_norm": 3.2103986740112305,
"learning_rate": 9.303599838715223e-06,
"loss": 1.3313,
"step": 116400
},
{
"epoch": 0.58,
"grad_norm": 2.6371185779571533,
"learning_rate": 9.302974703920533e-06,
"loss": 1.3219,
"step": 116500
},
{
"epoch": 0.58,
"grad_norm": 3.5810282230377197,
"learning_rate": 9.302349569125843e-06,
"loss": 1.3199,
"step": 116600
},
{
"epoch": 0.58,
"grad_norm": 3.7903432846069336,
"learning_rate": 9.301724434331153e-06,
"loss": 1.3604,
"step": 116700
},
{
"epoch": 0.58,
"grad_norm": 3.0497376918792725,
"learning_rate": 9.301099299536463e-06,
"loss": 1.3517,
"step": 116800
},
{
"epoch": 0.58,
"grad_norm": 3.0921273231506348,
"learning_rate": 9.300474164741773e-06,
"loss": 1.3192,
"step": 116900
},
{
"epoch": 0.58,
"grad_norm": 4.081624507904053,
"learning_rate": 9.299849029947083e-06,
"loss": 1.3376,
"step": 117000
},
{
"epoch": 0.58,
"grad_norm": 3.9647045135498047,
"learning_rate": 9.299223895152393e-06,
"loss": 1.3501,
"step": 117100
},
{
"epoch": 0.58,
"grad_norm": 3.3014702796936035,
"learning_rate": 9.298598760357703e-06,
"loss": 1.3467,
"step": 117200
},
{
"epoch": 0.58,
"grad_norm": 4.240477085113525,
"learning_rate": 9.297973625563013e-06,
"loss": 1.3094,
"step": 117300
},
{
"epoch": 0.58,
"grad_norm": 3.317046642303467,
"learning_rate": 9.297348490768323e-06,
"loss": 1.3586,
"step": 117400
},
{
"epoch": 0.58,
"grad_norm": 2.8633594512939453,
"learning_rate": 9.296723355973633e-06,
"loss": 1.3272,
"step": 117500
},
{
"epoch": 0.58,
"grad_norm": 3.435241937637329,
"learning_rate": 9.296098221178943e-06,
"loss": 1.3683,
"step": 117600
},
{
"epoch": 0.58,
"grad_norm": 2.955159902572632,
"learning_rate": 9.295473086384252e-06,
"loss": 1.35,
"step": 117700
},
{
"epoch": 0.58,
"grad_norm": 2.941067695617676,
"learning_rate": 9.294847951589562e-06,
"loss": 1.3438,
"step": 117800
},
{
"epoch": 0.58,
"grad_norm": 4.773413181304932,
"learning_rate": 9.294222816794872e-06,
"loss": 1.3167,
"step": 117900
},
{
"epoch": 0.58,
"grad_norm": 2.976818084716797,
"learning_rate": 9.293597682000182e-06,
"loss": 1.3415,
"step": 118000
},
{
"epoch": 0.58,
"grad_norm": 2.769272804260254,
"learning_rate": 9.292972547205492e-06,
"loss": 1.3596,
"step": 118100
},
{
"epoch": 0.58,
"grad_norm": 3.0079257488250732,
"learning_rate": 9.292347412410802e-06,
"loss": 1.3479,
"step": 118200
},
{
"epoch": 0.59,
"grad_norm": 6.148379802703857,
"learning_rate": 9.291722277616112e-06,
"loss": 1.3324,
"step": 118300
},
{
"epoch": 0.59,
"grad_norm": 3.0416197776794434,
"learning_rate": 9.291097142821422e-06,
"loss": 1.307,
"step": 118400
},
{
"epoch": 0.59,
"grad_norm": 2.869318962097168,
"learning_rate": 9.290472008026732e-06,
"loss": 1.3217,
"step": 118500
},
{
"epoch": 0.59,
"grad_norm": 3.5434398651123047,
"learning_rate": 9.289846873232042e-06,
"loss": 1.3356,
"step": 118600
},
{
"epoch": 0.59,
"grad_norm": 3.5270133018493652,
"learning_rate": 9.289221738437352e-06,
"loss": 1.3206,
"step": 118700
},
{
"epoch": 0.59,
"grad_norm": 5.0058369636535645,
"learning_rate": 9.288596603642662e-06,
"loss": 1.3386,
"step": 118800
},
{
"epoch": 0.59,
"grad_norm": 3.7998316287994385,
"learning_rate": 9.287971468847972e-06,
"loss": 1.2882,
"step": 118900
},
{
"epoch": 0.59,
"grad_norm": 4.403027534484863,
"learning_rate": 9.287346334053281e-06,
"loss": 1.3318,
"step": 119000
},
{
"epoch": 0.59,
"grad_norm": 3.3011553287506104,
"learning_rate": 9.28672119925859e-06,
"loss": 1.3395,
"step": 119100
},
{
"epoch": 0.59,
"grad_norm": 3.5392231941223145,
"learning_rate": 9.286096064463901e-06,
"loss": 1.3309,
"step": 119200
},
{
"epoch": 0.59,
"grad_norm": 3.0157182216644287,
"learning_rate": 9.28547092966921e-06,
"loss": 1.3212,
"step": 119300
},
{
"epoch": 0.59,
"grad_norm": 3.292978048324585,
"learning_rate": 9.284845794874521e-06,
"loss": 1.3419,
"step": 119400
},
{
"epoch": 0.59,
"grad_norm": 3.0125534534454346,
"learning_rate": 9.28422066007983e-06,
"loss": 1.3237,
"step": 119500
},
{
"epoch": 0.59,
"grad_norm": 3.421067476272583,
"learning_rate": 9.283595525285141e-06,
"loss": 1.343,
"step": 119600
},
{
"epoch": 0.59,
"grad_norm": 3.0257959365844727,
"learning_rate": 9.28297039049045e-06,
"loss": 1.3339,
"step": 119700
},
{
"epoch": 0.59,
"grad_norm": 3.5511181354522705,
"learning_rate": 9.282345255695761e-06,
"loss": 1.2891,
"step": 119800
},
{
"epoch": 0.59,
"grad_norm": 3.5419836044311523,
"learning_rate": 9.281720120901069e-06,
"loss": 1.3189,
"step": 119900
},
{
"epoch": 0.59,
"grad_norm": 2.700242757797241,
"learning_rate": 9.28109498610638e-06,
"loss": 1.3302,
"step": 120000
}
],
"logging_steps": 100,
"max_steps": 1604655,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 10000,
"total_flos": 2.4386875134941594e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}