{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5936927565773118, "eval_steps": 500, "global_step": 120000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 296.6432189941406, "learning_rate": 2.0000000000000002e-07, "loss": 36.1442, "step": 100 }, { "epoch": 0.0, "grad_norm": 282.62713623046875, "learning_rate": 4.0000000000000003e-07, "loss": 36.2463, "step": 200 }, { "epoch": 0.0, "grad_norm": 271.8738098144531, "learning_rate": 6.000000000000001e-07, "loss": 36.3589, "step": 300 }, { "epoch": 0.0, "grad_norm": 287.5734558105469, "learning_rate": 8.000000000000001e-07, "loss": 36.0855, "step": 400 }, { "epoch": 0.0, "grad_norm": 259.08685302734375, "learning_rate": 1.0000000000000002e-06, "loss": 35.8685, "step": 500 }, { "epoch": 0.0, "grad_norm": 292.2701416015625, "learning_rate": 1.2000000000000002e-06, "loss": 35.4046, "step": 600 }, { "epoch": 0.0, "grad_norm": 307.0222473144531, "learning_rate": 1.4000000000000001e-06, "loss": 34.6875, "step": 700 }, { "epoch": 0.0, "grad_norm": 274.7489929199219, "learning_rate": 1.6000000000000001e-06, "loss": 34.1517, "step": 800 }, { "epoch": 0.0, "grad_norm": 240.46612548828125, "learning_rate": 1.8000000000000001e-06, "loss": 33.4424, "step": 900 }, { "epoch": 0.0, "grad_norm": 268.32684326171875, "learning_rate": 2.0000000000000003e-06, "loss": 32.5807, "step": 1000 }, { "epoch": 0.01, "grad_norm": 258.87274169921875, "learning_rate": 2.2e-06, "loss": 30.8752, "step": 1100 }, { "epoch": 0.01, "grad_norm": 288.45611572265625, "learning_rate": 2.4000000000000003e-06, "loss": 29.5351, "step": 1200 }, { "epoch": 0.01, "grad_norm": 261.72149658203125, "learning_rate": 2.6e-06, "loss": 27.398, "step": 1300 }, { "epoch": 0.01, "grad_norm": 331.3612365722656, "learning_rate": 2.8000000000000003e-06, "loss": 24.6465, "step": 1400 }, { "epoch": 0.01, "grad_norm": 215.5654296875, "learning_rate": 3e-06, "loss": 21.7979, "step": 1500 }, { "epoch": 0.01, "grad_norm": 222.94651794433594, "learning_rate": 3.2000000000000003e-06, "loss": 18.6465, "step": 1600 }, { "epoch": 0.01, "grad_norm": 252.55087280273438, "learning_rate": 3.4000000000000005e-06, "loss": 15.7462, "step": 1700 }, { "epoch": 0.01, "grad_norm": 273.9644470214844, "learning_rate": 3.6000000000000003e-06, "loss": 13.9379, "step": 1800 }, { "epoch": 0.01, "grad_norm": 205.786376953125, "learning_rate": 3.8000000000000005e-06, "loss": 12.2574, "step": 1900 }, { "epoch": 0.01, "grad_norm": 151.63124084472656, "learning_rate": 4.000000000000001e-06, "loss": 9.865, "step": 2000 }, { "epoch": 0.01, "grad_norm": 120.38298034667969, "learning_rate": 4.2000000000000004e-06, "loss": 8.9936, "step": 2100 }, { "epoch": 0.01, "grad_norm": 93.93321990966797, "learning_rate": 4.4e-06, "loss": 8.3415, "step": 2200 }, { "epoch": 0.01, "grad_norm": 102.68135833740234, "learning_rate": 4.600000000000001e-06, "loss": 7.4711, "step": 2300 }, { "epoch": 0.01, "grad_norm": 250.72817993164062, "learning_rate": 4.800000000000001e-06, "loss": 6.5714, "step": 2400 }, { "epoch": 0.01, "grad_norm": 249.8506317138672, "learning_rate": 5e-06, "loss": 5.9448, "step": 2500 }, { "epoch": 0.01, "grad_norm": 66.83155059814453, "learning_rate": 5.2e-06, "loss": 5.6368, "step": 2600 }, { "epoch": 0.01, "grad_norm": 51.391082763671875, "learning_rate": 5.400000000000001e-06, "loss": 4.8538, "step": 2700 }, { "epoch": 0.01, "grad_norm": 50.51924133300781, "learning_rate": 5.600000000000001e-06, "loss": 4.5733, "step": 2800 }, { "epoch": 0.01, "grad_norm": 33.91701889038086, "learning_rate": 5.8e-06, "loss": 4.1586, "step": 2900 }, { "epoch": 0.01, "grad_norm": 41.544532775878906, "learning_rate": 6e-06, "loss": 3.8914, "step": 3000 }, { "epoch": 0.02, "grad_norm": 44.3348274230957, "learning_rate": 6.200000000000001e-06, "loss": 3.4145, "step": 3100 }, { "epoch": 0.02, "grad_norm": 27.11107063293457, "learning_rate": 6.4000000000000006e-06, "loss": 3.2646, "step": 3200 }, { "epoch": 0.02, "grad_norm": 144.55479431152344, "learning_rate": 6.600000000000001e-06, "loss": 3.1211, "step": 3300 }, { "epoch": 0.02, "grad_norm": 16.845191955566406, "learning_rate": 6.800000000000001e-06, "loss": 3.0997, "step": 3400 }, { "epoch": 0.02, "grad_norm": 13.76279067993164, "learning_rate": 7e-06, "loss": 2.7326, "step": 3500 }, { "epoch": 0.02, "grad_norm": 10.799291610717773, "learning_rate": 7.2000000000000005e-06, "loss": 2.743, "step": 3600 }, { "epoch": 0.02, "grad_norm": 10.540129661560059, "learning_rate": 7.4e-06, "loss": 2.6056, "step": 3700 }, { "epoch": 0.02, "grad_norm": 11.966181755065918, "learning_rate": 7.600000000000001e-06, "loss": 2.4526, "step": 3800 }, { "epoch": 0.02, "grad_norm": 16.517465591430664, "learning_rate": 7.800000000000002e-06, "loss": 2.419, "step": 3900 }, { "epoch": 0.02, "grad_norm": 11.533743858337402, "learning_rate": 8.000000000000001e-06, "loss": 2.3781, "step": 4000 }, { "epoch": 0.02, "grad_norm": 12.458216667175293, "learning_rate": 8.2e-06, "loss": 2.296, "step": 4100 }, { "epoch": 0.02, "grad_norm": 7.576855182647705, "learning_rate": 8.400000000000001e-06, "loss": 2.269, "step": 4200 }, { "epoch": 0.02, "grad_norm": 6.14406681060791, "learning_rate": 8.6e-06, "loss": 2.2047, "step": 4300 }, { "epoch": 0.02, "grad_norm": 8.389999389648438, "learning_rate": 8.8e-06, "loss": 2.1709, "step": 4400 }, { "epoch": 0.02, "grad_norm": 9.2235689163208, "learning_rate": 9e-06, "loss": 2.0764, "step": 4500 }, { "epoch": 0.02, "grad_norm": 19.56966209411621, "learning_rate": 9.200000000000002e-06, "loss": 2.0616, "step": 4600 }, { "epoch": 0.02, "grad_norm": 14.260141372680664, "learning_rate": 9.4e-06, "loss": 2.0158, "step": 4700 }, { "epoch": 0.02, "grad_norm": 6.950816631317139, "learning_rate": 9.600000000000001e-06, "loss": 1.9903, "step": 4800 }, { "epoch": 0.02, "grad_norm": 6.523265361785889, "learning_rate": 9.800000000000001e-06, "loss": 1.9903, "step": 4900 }, { "epoch": 0.02, "grad_norm": 5.682758331298828, "learning_rate": 1e-05, "loss": 2.0119, "step": 5000 }, { "epoch": 0.03, "grad_norm": 6.691493988037109, "learning_rate": 9.99937486520531e-06, "loss": 1.966, "step": 5100 }, { "epoch": 0.03, "grad_norm": 6.045505046844482, "learning_rate": 9.99874973041062e-06, "loss": 1.9175, "step": 5200 }, { "epoch": 0.03, "grad_norm": 5.446534633636475, "learning_rate": 9.99812459561593e-06, "loss": 1.9086, "step": 5300 }, { "epoch": 0.03, "grad_norm": 6.22329044342041, "learning_rate": 9.99749946082124e-06, "loss": 1.8706, "step": 5400 }, { "epoch": 0.03, "grad_norm": 6.028206825256348, "learning_rate": 9.99687432602655e-06, "loss": 1.8183, "step": 5500 }, { "epoch": 0.03, "grad_norm": 5.474781036376953, "learning_rate": 9.99624919123186e-06, "loss": 1.9045, "step": 5600 }, { "epoch": 0.03, "grad_norm": 4.7177886962890625, "learning_rate": 9.99562405643717e-06, "loss": 1.8141, "step": 5700 }, { "epoch": 0.03, "grad_norm": 6.469454288482666, "learning_rate": 9.99499892164248e-06, "loss": 1.8079, "step": 5800 }, { "epoch": 0.03, "grad_norm": 5.826772689819336, "learning_rate": 9.99437378684779e-06, "loss": 1.852, "step": 5900 }, { "epoch": 0.03, "grad_norm": 4.368248462677002, "learning_rate": 9.9937486520531e-06, "loss": 1.8207, "step": 6000 }, { "epoch": 0.03, "grad_norm": 6.686717987060547, "learning_rate": 9.99312351725841e-06, "loss": 1.7868, "step": 6100 }, { "epoch": 0.03, "grad_norm": 4.562761306762695, "learning_rate": 9.99249838246372e-06, "loss": 1.8201, "step": 6200 }, { "epoch": 0.03, "grad_norm": 4.788825511932373, "learning_rate": 9.99187324766903e-06, "loss": 1.7568, "step": 6300 }, { "epoch": 0.03, "grad_norm": 5.2414870262146, "learning_rate": 9.991248112874338e-06, "loss": 1.7635, "step": 6400 }, { "epoch": 0.03, "grad_norm": 4.775527477264404, "learning_rate": 9.99062297807965e-06, "loss": 1.7465, "step": 6500 }, { "epoch": 0.03, "grad_norm": 5.719698429107666, "learning_rate": 9.989997843284958e-06, "loss": 1.7776, "step": 6600 }, { "epoch": 0.03, "grad_norm": 5.103610515594482, "learning_rate": 9.98937270849027e-06, "loss": 1.7364, "step": 6700 }, { "epoch": 0.03, "grad_norm": 5.553420543670654, "learning_rate": 9.988747573695578e-06, "loss": 1.7341, "step": 6800 }, { "epoch": 0.03, "grad_norm": 4.687087535858154, "learning_rate": 9.98812243890089e-06, "loss": 1.7586, "step": 6900 }, { "epoch": 0.03, "grad_norm": 6.242082595825195, "learning_rate": 9.987497304106198e-06, "loss": 1.7255, "step": 7000 }, { "epoch": 0.04, "grad_norm": 7.58695650100708, "learning_rate": 9.98687216931151e-06, "loss": 1.724, "step": 7100 }, { "epoch": 0.04, "grad_norm": 4.764819622039795, "learning_rate": 9.986247034516818e-06, "loss": 1.6934, "step": 7200 }, { "epoch": 0.04, "grad_norm": 5.300253868103027, "learning_rate": 9.985621899722129e-06, "loss": 1.6773, "step": 7300 }, { "epoch": 0.04, "grad_norm": 4.917991638183594, "learning_rate": 9.984996764927437e-06, "loss": 1.7492, "step": 7400 }, { "epoch": 0.04, "grad_norm": 9.860074043273926, "learning_rate": 9.984371630132749e-06, "loss": 1.6835, "step": 7500 }, { "epoch": 0.04, "grad_norm": 4.517050743103027, "learning_rate": 9.983746495338059e-06, "loss": 1.6981, "step": 7600 }, { "epoch": 0.04, "grad_norm": 4.88366174697876, "learning_rate": 9.983121360543367e-06, "loss": 1.7226, "step": 7700 }, { "epoch": 0.04, "grad_norm": 4.612452983856201, "learning_rate": 9.982496225748679e-06, "loss": 1.6904, "step": 7800 }, { "epoch": 0.04, "grad_norm": 4.865972518920898, "learning_rate": 9.981871090953987e-06, "loss": 1.6969, "step": 7900 }, { "epoch": 0.04, "grad_norm": 4.375401973724365, "learning_rate": 9.981245956159299e-06, "loss": 1.6524, "step": 8000 }, { "epoch": 0.04, "grad_norm": 5.660288333892822, "learning_rate": 9.980620821364607e-06, "loss": 1.6866, "step": 8100 }, { "epoch": 0.04, "grad_norm": 4.874125957489014, "learning_rate": 9.979995686569918e-06, "loss": 1.6697, "step": 8200 }, { "epoch": 0.04, "grad_norm": 5.102114200592041, "learning_rate": 9.979370551775227e-06, "loss": 1.6429, "step": 8300 }, { "epoch": 0.04, "grad_norm": 4.398207664489746, "learning_rate": 9.978745416980538e-06, "loss": 1.6489, "step": 8400 }, { "epoch": 0.04, "grad_norm": 8.623647689819336, "learning_rate": 9.978120282185847e-06, "loss": 1.6278, "step": 8500 }, { "epoch": 0.04, "grad_norm": 4.62777853012085, "learning_rate": 9.977495147391158e-06, "loss": 1.6555, "step": 8600 }, { "epoch": 0.04, "grad_norm": 5.873004913330078, "learning_rate": 9.976870012596466e-06, "loss": 1.6624, "step": 8700 }, { "epoch": 0.04, "grad_norm": 4.456192493438721, "learning_rate": 9.976244877801778e-06, "loss": 1.6201, "step": 8800 }, { "epoch": 0.04, "grad_norm": 5.2842183113098145, "learning_rate": 9.975619743007086e-06, "loss": 1.6334, "step": 8900 }, { "epoch": 0.04, "grad_norm": 5.298410415649414, "learning_rate": 9.974994608212396e-06, "loss": 1.6472, "step": 9000 }, { "epoch": 0.05, "grad_norm": 5.887086391448975, "learning_rate": 9.974369473417706e-06, "loss": 1.6124, "step": 9100 }, { "epoch": 0.05, "grad_norm": 4.660162925720215, "learning_rate": 9.973744338623016e-06, "loss": 1.6131, "step": 9200 }, { "epoch": 0.05, "grad_norm": 5.750434398651123, "learning_rate": 9.973119203828326e-06, "loss": 1.6205, "step": 9300 }, { "epoch": 0.05, "grad_norm": 5.6416707038879395, "learning_rate": 9.972494069033636e-06, "loss": 1.619, "step": 9400 }, { "epoch": 0.05, "grad_norm": 4.29620361328125, "learning_rate": 9.971868934238946e-06, "loss": 1.6062, "step": 9500 }, { "epoch": 0.05, "grad_norm": 4.172244071960449, "learning_rate": 9.971243799444256e-06, "loss": 1.6139, "step": 9600 }, { "epoch": 0.05, "grad_norm": 3.9731390476226807, "learning_rate": 9.970618664649566e-06, "loss": 1.5706, "step": 9700 }, { "epoch": 0.05, "grad_norm": 4.9260454177856445, "learning_rate": 9.969993529854876e-06, "loss": 1.6224, "step": 9800 }, { "epoch": 0.05, "grad_norm": 5.228986740112305, "learning_rate": 9.969368395060186e-06, "loss": 1.5969, "step": 9900 }, { "epoch": 0.05, "grad_norm": 5.494061470031738, "learning_rate": 9.968743260265495e-06, "loss": 1.6129, "step": 10000 }, { "epoch": 0.05, "grad_norm": 4.893834590911865, "learning_rate": 9.968118125470805e-06, "loss": 1.5918, "step": 10100 }, { "epoch": 0.05, "grad_norm": 4.201370716094971, "learning_rate": 9.967492990676115e-06, "loss": 1.571, "step": 10200 }, { "epoch": 0.05, "grad_norm": 6.3033576011657715, "learning_rate": 9.966867855881425e-06, "loss": 1.6216, "step": 10300 }, { "epoch": 0.05, "grad_norm": 5.211835861206055, "learning_rate": 9.966242721086735e-06, "loss": 1.6006, "step": 10400 }, { "epoch": 0.05, "grad_norm": 4.3779730796813965, "learning_rate": 9.965617586292045e-06, "loss": 1.6184, "step": 10500 }, { "epoch": 0.05, "grad_norm": 4.778099060058594, "learning_rate": 9.964992451497355e-06, "loss": 1.5855, "step": 10600 }, { "epoch": 0.05, "grad_norm": 7.489856243133545, "learning_rate": 9.964367316702665e-06, "loss": 1.5827, "step": 10700 }, { "epoch": 0.05, "grad_norm": 4.601972579956055, "learning_rate": 9.963742181907975e-06, "loss": 1.6029, "step": 10800 }, { "epoch": 0.05, "grad_norm": 4.222909450531006, "learning_rate": 9.963117047113285e-06, "loss": 1.6027, "step": 10900 }, { "epoch": 0.05, "grad_norm": 4.561893939971924, "learning_rate": 9.962491912318595e-06, "loss": 1.592, "step": 11000 }, { "epoch": 0.05, "grad_norm": 4.908820152282715, "learning_rate": 9.961866777523905e-06, "loss": 1.612, "step": 11100 }, { "epoch": 0.06, "grad_norm": 4.185163974761963, "learning_rate": 9.961241642729215e-06, "loss": 1.5834, "step": 11200 }, { "epoch": 0.06, "grad_norm": 5.215177536010742, "learning_rate": 9.960616507934524e-06, "loss": 1.5741, "step": 11300 }, { "epoch": 0.06, "grad_norm": 6.018292427062988, "learning_rate": 9.959991373139834e-06, "loss": 1.5687, "step": 11400 }, { "epoch": 0.06, "grad_norm": 6.539705276489258, "learning_rate": 9.959366238345144e-06, "loss": 1.5967, "step": 11500 }, { "epoch": 0.06, "grad_norm": 5.168763637542725, "learning_rate": 9.958741103550454e-06, "loss": 1.5716, "step": 11600 }, { "epoch": 0.06, "grad_norm": 4.548024654388428, "learning_rate": 9.958115968755764e-06, "loss": 1.5576, "step": 11700 }, { "epoch": 0.06, "grad_norm": 5.756062030792236, "learning_rate": 9.957490833961072e-06, "loss": 1.5742, "step": 11800 }, { "epoch": 0.06, "grad_norm": 5.219858646392822, "learning_rate": 9.956865699166384e-06, "loss": 1.5807, "step": 11900 }, { "epoch": 0.06, "grad_norm": 4.460545063018799, "learning_rate": 9.956240564371692e-06, "loss": 1.5859, "step": 12000 }, { "epoch": 0.06, "grad_norm": 4.683807849884033, "learning_rate": 9.955615429577004e-06, "loss": 1.5769, "step": 12100 }, { "epoch": 0.06, "grad_norm": 6.083448886871338, "learning_rate": 9.954990294782312e-06, "loss": 1.5701, "step": 12200 }, { "epoch": 0.06, "grad_norm": 5.151342391967773, "learning_rate": 9.954365159987624e-06, "loss": 1.5834, "step": 12300 }, { "epoch": 0.06, "grad_norm": 4.23958158493042, "learning_rate": 9.953740025192932e-06, "loss": 1.5797, "step": 12400 }, { "epoch": 0.06, "grad_norm": 5.883495330810547, "learning_rate": 9.953114890398244e-06, "loss": 1.5584, "step": 12500 }, { "epoch": 0.06, "grad_norm": 4.5151190757751465, "learning_rate": 9.952489755603552e-06, "loss": 1.5723, "step": 12600 }, { "epoch": 0.06, "grad_norm": 4.234920024871826, "learning_rate": 9.951864620808863e-06, "loss": 1.5404, "step": 12700 }, { "epoch": 0.06, "grad_norm": 4.942254543304443, "learning_rate": 9.951239486014173e-06, "loss": 1.5373, "step": 12800 }, { "epoch": 0.06, "grad_norm": 4.068475723266602, "learning_rate": 9.950614351219483e-06, "loss": 1.5584, "step": 12900 }, { "epoch": 0.06, "grad_norm": 4.250300407409668, "learning_rate": 9.949989216424793e-06, "loss": 1.6023, "step": 13000 }, { "epoch": 0.06, "grad_norm": 4.279661178588867, "learning_rate": 9.949364081630101e-06, "loss": 1.5517, "step": 13100 }, { "epoch": 0.07, "grad_norm": 4.5876946449279785, "learning_rate": 9.948738946835413e-06, "loss": 1.5142, "step": 13200 }, { "epoch": 0.07, "grad_norm": 4.220710754394531, "learning_rate": 9.948113812040721e-06, "loss": 1.601, "step": 13300 }, { "epoch": 0.07, "grad_norm": 4.183436870574951, "learning_rate": 9.947488677246033e-06, "loss": 1.5508, "step": 13400 }, { "epoch": 0.07, "grad_norm": 4.20064640045166, "learning_rate": 9.946863542451341e-06, "loss": 1.532, "step": 13500 }, { "epoch": 0.07, "grad_norm": 5.151244640350342, "learning_rate": 9.946238407656653e-06, "loss": 1.5256, "step": 13600 }, { "epoch": 0.07, "grad_norm": 4.308995246887207, "learning_rate": 9.945613272861961e-06, "loss": 1.5349, "step": 13700 }, { "epoch": 0.07, "grad_norm": 5.477377891540527, "learning_rate": 9.944988138067273e-06, "loss": 1.5365, "step": 13800 }, { "epoch": 0.07, "grad_norm": 5.085025310516357, "learning_rate": 9.944363003272581e-06, "loss": 1.5038, "step": 13900 }, { "epoch": 0.07, "grad_norm": 4.769080638885498, "learning_rate": 9.943737868477893e-06, "loss": 1.5387, "step": 14000 }, { "epoch": 0.07, "grad_norm": 4.0054931640625, "learning_rate": 9.9431127336832e-06, "loss": 1.5018, "step": 14100 }, { "epoch": 0.07, "grad_norm": 4.712356090545654, "learning_rate": 9.942487598888512e-06, "loss": 1.5049, "step": 14200 }, { "epoch": 0.07, "grad_norm": 4.574007034301758, "learning_rate": 9.94186246409382e-06, "loss": 1.5316, "step": 14300 }, { "epoch": 0.07, "grad_norm": 4.079704761505127, "learning_rate": 9.94123732929913e-06, "loss": 1.527, "step": 14400 }, { "epoch": 0.07, "grad_norm": 4.1134490966796875, "learning_rate": 9.94061219450444e-06, "loss": 1.5472, "step": 14500 }, { "epoch": 0.07, "grad_norm": 5.486052989959717, "learning_rate": 9.93998705970975e-06, "loss": 1.5063, "step": 14600 }, { "epoch": 0.07, "grad_norm": 4.8249921798706055, "learning_rate": 9.93936192491506e-06, "loss": 1.5116, "step": 14700 }, { "epoch": 0.07, "grad_norm": 4.1462931632995605, "learning_rate": 9.93873679012037e-06, "loss": 1.5259, "step": 14800 }, { "epoch": 0.07, "grad_norm": 4.222506999969482, "learning_rate": 9.93811165532568e-06, "loss": 1.5378, "step": 14900 }, { "epoch": 0.07, "grad_norm": 3.8890185356140137, "learning_rate": 9.93748652053099e-06, "loss": 1.5424, "step": 15000 }, { "epoch": 0.07, "grad_norm": 6.006450176239014, "learning_rate": 9.9368613857363e-06, "loss": 1.5276, "step": 15100 }, { "epoch": 0.08, "grad_norm": 4.570881366729736, "learning_rate": 9.93623625094161e-06, "loss": 1.5173, "step": 15200 }, { "epoch": 0.08, "grad_norm": 4.1144633293151855, "learning_rate": 9.93561111614692e-06, "loss": 1.4905, "step": 15300 }, { "epoch": 0.08, "grad_norm": 5.034255027770996, "learning_rate": 9.93498598135223e-06, "loss": 1.5321, "step": 15400 }, { "epoch": 0.08, "grad_norm": 4.831255912780762, "learning_rate": 9.93436084655754e-06, "loss": 1.5297, "step": 15500 }, { "epoch": 0.08, "grad_norm": 4.800346851348877, "learning_rate": 9.93373571176285e-06, "loss": 1.5109, "step": 15600 }, { "epoch": 0.08, "grad_norm": 4.187744617462158, "learning_rate": 9.93311057696816e-06, "loss": 1.4896, "step": 15700 }, { "epoch": 0.08, "grad_norm": 4.569481372833252, "learning_rate": 9.93248544217347e-06, "loss": 1.5254, "step": 15800 }, { "epoch": 0.08, "grad_norm": 4.826055526733398, "learning_rate": 9.93186030737878e-06, "loss": 1.5003, "step": 15900 }, { "epoch": 0.08, "grad_norm": 5.592813491821289, "learning_rate": 9.93123517258409e-06, "loss": 1.5386, "step": 16000 }, { "epoch": 0.08, "grad_norm": 4.18519926071167, "learning_rate": 9.9306100377894e-06, "loss": 1.5192, "step": 16100 }, { "epoch": 0.08, "grad_norm": 3.737257719039917, "learning_rate": 9.92998490299471e-06, "loss": 1.5049, "step": 16200 }, { "epoch": 0.08, "grad_norm": 4.077634334564209, "learning_rate": 9.929359768200019e-06, "loss": 1.4786, "step": 16300 }, { "epoch": 0.08, "grad_norm": 3.7141683101654053, "learning_rate": 9.928734633405329e-06, "loss": 1.5008, "step": 16400 }, { "epoch": 0.08, "grad_norm": 5.656344413757324, "learning_rate": 9.928109498610639e-06, "loss": 1.4907, "step": 16500 }, { "epoch": 0.08, "grad_norm": 4.02158784866333, "learning_rate": 9.927484363815949e-06, "loss": 1.5436, "step": 16600 }, { "epoch": 0.08, "grad_norm": 4.094794273376465, "learning_rate": 9.926859229021259e-06, "loss": 1.5135, "step": 16700 }, { "epoch": 0.08, "grad_norm": 5.3504533767700195, "learning_rate": 9.926234094226569e-06, "loss": 1.4825, "step": 16800 }, { "epoch": 0.08, "grad_norm": 4.326951503753662, "learning_rate": 9.925608959431879e-06, "loss": 1.5042, "step": 16900 }, { "epoch": 0.08, "grad_norm": 4.341583728790283, "learning_rate": 9.924983824637187e-06, "loss": 1.5239, "step": 17000 }, { "epoch": 0.08, "grad_norm": 4.5446648597717285, "learning_rate": 9.924358689842499e-06, "loss": 1.5104, "step": 17100 }, { "epoch": 0.09, "grad_norm": 4.787079811096191, "learning_rate": 9.923733555047807e-06, "loss": 1.4917, "step": 17200 }, { "epoch": 0.09, "grad_norm": 4.259307384490967, "learning_rate": 9.923108420253118e-06, "loss": 1.5162, "step": 17300 }, { "epoch": 0.09, "grad_norm": 4.553911209106445, "learning_rate": 9.922483285458427e-06, "loss": 1.4805, "step": 17400 }, { "epoch": 0.09, "grad_norm": 4.846059322357178, "learning_rate": 9.921858150663738e-06, "loss": 1.51, "step": 17500 }, { "epoch": 0.09, "grad_norm": 4.385834217071533, "learning_rate": 9.921233015869046e-06, "loss": 1.4883, "step": 17600 }, { "epoch": 0.09, "grad_norm": 4.686222553253174, "learning_rate": 9.920607881074358e-06, "loss": 1.525, "step": 17700 }, { "epoch": 0.09, "grad_norm": 3.7362844944000244, "learning_rate": 9.919982746279666e-06, "loss": 1.4877, "step": 17800 }, { "epoch": 0.09, "grad_norm": 4.107117652893066, "learning_rate": 9.919357611484978e-06, "loss": 1.5353, "step": 17900 }, { "epoch": 0.09, "grad_norm": 4.318368911743164, "learning_rate": 9.918732476690288e-06, "loss": 1.4682, "step": 18000 }, { "epoch": 0.09, "grad_norm": 4.914721488952637, "learning_rate": 9.918107341895598e-06, "loss": 1.5261, "step": 18100 }, { "epoch": 0.09, "grad_norm": 4.494168281555176, "learning_rate": 9.917482207100908e-06, "loss": 1.5013, "step": 18200 }, { "epoch": 0.09, "grad_norm": 5.026334762573242, "learning_rate": 9.916857072306216e-06, "loss": 1.5093, "step": 18300 }, { "epoch": 0.09, "grad_norm": 4.3312907218933105, "learning_rate": 9.916231937511528e-06, "loss": 1.5224, "step": 18400 }, { "epoch": 0.09, "grad_norm": 3.9422335624694824, "learning_rate": 9.915606802716836e-06, "loss": 1.5059, "step": 18500 }, { "epoch": 0.09, "grad_norm": 4.773715496063232, "learning_rate": 9.914981667922147e-06, "loss": 1.5031, "step": 18600 }, { "epoch": 0.09, "grad_norm": 5.202546119689941, "learning_rate": 9.914356533127456e-06, "loss": 1.5133, "step": 18700 }, { "epoch": 0.09, "grad_norm": 4.315513610839844, "learning_rate": 9.913731398332767e-06, "loss": 1.5343, "step": 18800 }, { "epoch": 0.09, "grad_norm": 4.325439929962158, "learning_rate": 9.913106263538076e-06, "loss": 1.4993, "step": 18900 }, { "epoch": 0.09, "grad_norm": 4.189039707183838, "learning_rate": 9.912481128743387e-06, "loss": 1.4871, "step": 19000 }, { "epoch": 0.09, "grad_norm": 4.019628524780273, "learning_rate": 9.911855993948695e-06, "loss": 1.5294, "step": 19100 }, { "epoch": 0.09, "grad_norm": 3.682359457015991, "learning_rate": 9.911230859154007e-06, "loss": 1.4728, "step": 19200 }, { "epoch": 0.1, "grad_norm": 3.9543027877807617, "learning_rate": 9.910605724359315e-06, "loss": 1.473, "step": 19300 }, { "epoch": 0.1, "grad_norm": 3.9522476196289062, "learning_rate": 9.909980589564627e-06, "loss": 1.5023, "step": 19400 }, { "epoch": 0.1, "grad_norm": 3.9608728885650635, "learning_rate": 9.909355454769935e-06, "loss": 1.4532, "step": 19500 }, { "epoch": 0.1, "grad_norm": 4.24020528793335, "learning_rate": 9.908730319975245e-06, "loss": 1.4691, "step": 19600 }, { "epoch": 0.1, "grad_norm": 4.217301845550537, "learning_rate": 9.908105185180555e-06, "loss": 1.4704, "step": 19700 }, { "epoch": 0.1, "grad_norm": 3.651137113571167, "learning_rate": 9.907480050385865e-06, "loss": 1.4933, "step": 19800 }, { "epoch": 0.1, "grad_norm": 4.657069206237793, "learning_rate": 9.906854915591175e-06, "loss": 1.4778, "step": 19900 }, { "epoch": 0.1, "grad_norm": 4.250380992889404, "learning_rate": 9.906229780796485e-06, "loss": 1.4931, "step": 20000 }, { "epoch": 0.1, "grad_norm": 4.308979034423828, "learning_rate": 9.905604646001795e-06, "loss": 1.4946, "step": 20100 }, { "epoch": 0.1, "grad_norm": 4.854809284210205, "learning_rate": 9.904979511207105e-06, "loss": 1.4399, "step": 20200 }, { "epoch": 0.1, "grad_norm": 4.725897789001465, "learning_rate": 9.904354376412414e-06, "loss": 1.4711, "step": 20300 }, { "epoch": 0.1, "grad_norm": 4.014156818389893, "learning_rate": 9.903729241617724e-06, "loss": 1.5038, "step": 20400 }, { "epoch": 0.1, "grad_norm": 6.402193546295166, "learning_rate": 9.903104106823034e-06, "loss": 1.4607, "step": 20500 }, { "epoch": 0.1, "grad_norm": 4.356836795806885, "learning_rate": 9.902478972028344e-06, "loss": 1.4767, "step": 20600 }, { "epoch": 0.1, "grad_norm": 4.174656867980957, "learning_rate": 9.901853837233654e-06, "loss": 1.4675, "step": 20700 }, { "epoch": 0.1, "grad_norm": 3.668475866317749, "learning_rate": 9.901228702438964e-06, "loss": 1.4529, "step": 20800 }, { "epoch": 0.1, "grad_norm": 3.7700912952423096, "learning_rate": 9.900603567644274e-06, "loss": 1.4521, "step": 20900 }, { "epoch": 0.1, "grad_norm": 3.570835828781128, "learning_rate": 9.899978432849584e-06, "loss": 1.4323, "step": 21000 }, { "epoch": 0.1, "grad_norm": 3.7499380111694336, "learning_rate": 9.899353298054894e-06, "loss": 1.4644, "step": 21100 }, { "epoch": 0.1, "grad_norm": 3.8630640506744385, "learning_rate": 9.898728163260204e-06, "loss": 1.4801, "step": 21200 }, { "epoch": 0.11, "grad_norm": 4.1705145835876465, "learning_rate": 9.898103028465514e-06, "loss": 1.4649, "step": 21300 }, { "epoch": 0.11, "grad_norm": 4.312972545623779, "learning_rate": 9.897477893670824e-06, "loss": 1.4696, "step": 21400 }, { "epoch": 0.11, "grad_norm": 4.0083088874816895, "learning_rate": 9.896852758876134e-06, "loss": 1.4557, "step": 21500 }, { "epoch": 0.11, "grad_norm": 3.791517496109009, "learning_rate": 9.896227624081444e-06, "loss": 1.4507, "step": 21600 }, { "epoch": 0.11, "grad_norm": 4.635531425476074, "learning_rate": 9.895602489286753e-06, "loss": 1.4922, "step": 21700 }, { "epoch": 0.11, "grad_norm": 3.6790366172790527, "learning_rate": 9.894977354492063e-06, "loss": 1.456, "step": 21800 }, { "epoch": 0.11, "grad_norm": 4.739284038543701, "learning_rate": 9.894352219697373e-06, "loss": 1.4993, "step": 21900 }, { "epoch": 0.11, "grad_norm": 3.6052489280700684, "learning_rate": 9.893727084902683e-06, "loss": 1.4599, "step": 22000 }, { "epoch": 0.11, "grad_norm": 4.582137584686279, "learning_rate": 9.893101950107993e-06, "loss": 1.464, "step": 22100 }, { "epoch": 0.11, "grad_norm": 4.414693355560303, "learning_rate": 9.892476815313303e-06, "loss": 1.4383, "step": 22200 }, { "epoch": 0.11, "grad_norm": 4.012635707855225, "learning_rate": 9.891851680518613e-06, "loss": 1.4496, "step": 22300 }, { "epoch": 0.11, "grad_norm": 3.935889482498169, "learning_rate": 9.891226545723921e-06, "loss": 1.44, "step": 22400 }, { "epoch": 0.11, "grad_norm": 3.735189199447632, "learning_rate": 9.890601410929233e-06, "loss": 1.4396, "step": 22500 }, { "epoch": 0.11, "grad_norm": 7.265974998474121, "learning_rate": 9.889976276134541e-06, "loss": 1.4367, "step": 22600 }, { "epoch": 0.11, "grad_norm": 3.6876790523529053, "learning_rate": 9.889351141339853e-06, "loss": 1.4477, "step": 22700 }, { "epoch": 0.11, "grad_norm": 4.87882661819458, "learning_rate": 9.888726006545161e-06, "loss": 1.4687, "step": 22800 }, { "epoch": 0.11, "grad_norm": 4.071088790893555, "learning_rate": 9.888100871750473e-06, "loss": 1.5096, "step": 22900 }, { "epoch": 0.11, "grad_norm": 3.7322299480438232, "learning_rate": 9.887475736955783e-06, "loss": 1.4701, "step": 23000 }, { "epoch": 0.11, "grad_norm": 4.085651397705078, "learning_rate": 9.886850602161092e-06, "loss": 1.4518, "step": 23100 }, { "epoch": 0.11, "grad_norm": 3.951169729232788, "learning_rate": 9.886225467366402e-06, "loss": 1.4336, "step": 23200 }, { "epoch": 0.12, "grad_norm": 4.026634693145752, "learning_rate": 9.885600332571712e-06, "loss": 1.476, "step": 23300 }, { "epoch": 0.12, "grad_norm": 5.0271477699279785, "learning_rate": 9.884975197777022e-06, "loss": 1.4925, "step": 23400 }, { "epoch": 0.12, "grad_norm": 3.727571725845337, "learning_rate": 9.884350062982332e-06, "loss": 1.452, "step": 23500 }, { "epoch": 0.12, "grad_norm": 3.8245084285736084, "learning_rate": 9.883724928187642e-06, "loss": 1.4597, "step": 23600 }, { "epoch": 0.12, "grad_norm": 4.508169174194336, "learning_rate": 9.88309979339295e-06, "loss": 1.4481, "step": 23700 }, { "epoch": 0.12, "grad_norm": 4.080810070037842, "learning_rate": 9.882474658598262e-06, "loss": 1.4807, "step": 23800 }, { "epoch": 0.12, "grad_norm": 4.269100666046143, "learning_rate": 9.88184952380357e-06, "loss": 1.4108, "step": 23900 }, { "epoch": 0.12, "grad_norm": 4.10108757019043, "learning_rate": 9.881224389008882e-06, "loss": 1.4592, "step": 24000 }, { "epoch": 0.12, "grad_norm": 4.869362831115723, "learning_rate": 9.88059925421419e-06, "loss": 1.4687, "step": 24100 }, { "epoch": 0.12, "grad_norm": 3.7101335525512695, "learning_rate": 9.879974119419502e-06, "loss": 1.4748, "step": 24200 }, { "epoch": 0.12, "grad_norm": 3.676862955093384, "learning_rate": 9.87934898462481e-06, "loss": 1.4439, "step": 24300 }, { "epoch": 0.12, "grad_norm": 4.2332844734191895, "learning_rate": 9.878723849830121e-06, "loss": 1.4371, "step": 24400 }, { "epoch": 0.12, "grad_norm": 3.947660207748413, "learning_rate": 9.87809871503543e-06, "loss": 1.4619, "step": 24500 }, { "epoch": 0.12, "grad_norm": 4.331580638885498, "learning_rate": 9.877473580240741e-06, "loss": 1.4332, "step": 24600 }, { "epoch": 0.12, "grad_norm": 5.24333381652832, "learning_rate": 9.87684844544605e-06, "loss": 1.4474, "step": 24700 }, { "epoch": 0.12, "grad_norm": 3.499051570892334, "learning_rate": 9.876223310651361e-06, "loss": 1.4303, "step": 24800 }, { "epoch": 0.12, "grad_norm": 3.9360058307647705, "learning_rate": 9.87559817585667e-06, "loss": 1.4731, "step": 24900 }, { "epoch": 0.12, "grad_norm": 4.0263352394104, "learning_rate": 9.87497304106198e-06, "loss": 1.4429, "step": 25000 }, { "epoch": 0.12, "grad_norm": 3.827765941619873, "learning_rate": 9.87434790626729e-06, "loss": 1.4312, "step": 25100 }, { "epoch": 0.12, "grad_norm": 3.680577278137207, "learning_rate": 9.8737227714726e-06, "loss": 1.4571, "step": 25200 }, { "epoch": 0.13, "grad_norm": 4.409987926483154, "learning_rate": 9.873097636677909e-06, "loss": 1.4708, "step": 25300 }, { "epoch": 0.13, "grad_norm": 4.233061790466309, "learning_rate": 9.872472501883219e-06, "loss": 1.4079, "step": 25400 }, { "epoch": 0.13, "grad_norm": 3.923621416091919, "learning_rate": 9.871847367088529e-06, "loss": 1.4389, "step": 25500 }, { "epoch": 0.13, "grad_norm": 3.695704698562622, "learning_rate": 9.871222232293839e-06, "loss": 1.448, "step": 25600 }, { "epoch": 0.13, "grad_norm": 4.188453674316406, "learning_rate": 9.870597097499149e-06, "loss": 1.4356, "step": 25700 }, { "epoch": 0.13, "grad_norm": 4.8711934089660645, "learning_rate": 9.869971962704459e-06, "loss": 1.4221, "step": 25800 }, { "epoch": 0.13, "grad_norm": 4.016773223876953, "learning_rate": 9.869346827909769e-06, "loss": 1.4409, "step": 25900 }, { "epoch": 0.13, "grad_norm": 4.488391399383545, "learning_rate": 9.868721693115079e-06, "loss": 1.4609, "step": 26000 }, { "epoch": 0.13, "grad_norm": 4.66510534286499, "learning_rate": 9.868096558320389e-06, "loss": 1.4132, "step": 26100 }, { "epoch": 0.13, "grad_norm": 3.9409704208374023, "learning_rate": 9.867471423525698e-06, "loss": 1.4406, "step": 26200 }, { "epoch": 0.13, "grad_norm": 3.3582875728607178, "learning_rate": 9.866846288731008e-06, "loss": 1.438, "step": 26300 }, { "epoch": 0.13, "grad_norm": 4.238399505615234, "learning_rate": 9.866221153936318e-06, "loss": 1.4201, "step": 26400 }, { "epoch": 0.13, "grad_norm": 3.6502299308776855, "learning_rate": 9.865596019141628e-06, "loss": 1.4592, "step": 26500 }, { "epoch": 0.13, "grad_norm": 3.8077006340026855, "learning_rate": 9.864970884346938e-06, "loss": 1.4556, "step": 26600 }, { "epoch": 0.13, "grad_norm": 4.270641326904297, "learning_rate": 9.864345749552248e-06, "loss": 1.4101, "step": 26700 }, { "epoch": 0.13, "grad_norm": 4.743376731872559, "learning_rate": 9.863720614757558e-06, "loss": 1.4719, "step": 26800 }, { "epoch": 0.13, "grad_norm": 4.345736980438232, "learning_rate": 9.863095479962868e-06, "loss": 1.4002, "step": 26900 }, { "epoch": 0.13, "grad_norm": 3.8192217350006104, "learning_rate": 9.862470345168178e-06, "loss": 1.4697, "step": 27000 }, { "epoch": 0.13, "grad_norm": 4.685102939605713, "learning_rate": 9.861845210373488e-06, "loss": 1.4519, "step": 27100 }, { "epoch": 0.13, "grad_norm": 3.690993070602417, "learning_rate": 9.861220075578798e-06, "loss": 1.4424, "step": 27200 }, { "epoch": 0.14, "grad_norm": 3.8806326389312744, "learning_rate": 9.860594940784108e-06, "loss": 1.4416, "step": 27300 }, { "epoch": 0.14, "grad_norm": 3.362546443939209, "learning_rate": 9.859969805989418e-06, "loss": 1.4245, "step": 27400 }, { "epoch": 0.14, "grad_norm": 4.167792320251465, "learning_rate": 9.859344671194728e-06, "loss": 1.4645, "step": 27500 }, { "epoch": 0.14, "grad_norm": 4.120845317840576, "learning_rate": 9.858719536400036e-06, "loss": 1.3847, "step": 27600 }, { "epoch": 0.14, "grad_norm": 3.8441598415374756, "learning_rate": 9.858094401605347e-06, "loss": 1.4244, "step": 27700 }, { "epoch": 0.14, "grad_norm": 3.3462672233581543, "learning_rate": 9.857469266810656e-06, "loss": 1.4401, "step": 27800 }, { "epoch": 0.14, "grad_norm": 4.065661430358887, "learning_rate": 9.856844132015967e-06, "loss": 1.4212, "step": 27900 }, { "epoch": 0.14, "grad_norm": 3.5907657146453857, "learning_rate": 9.856218997221275e-06, "loss": 1.4639, "step": 28000 }, { "epoch": 0.14, "grad_norm": 3.701472759246826, "learning_rate": 9.855593862426587e-06, "loss": 1.4052, "step": 28100 }, { "epoch": 0.14, "grad_norm": 3.7131853103637695, "learning_rate": 9.854968727631897e-06, "loss": 1.4237, "step": 28200 }, { "epoch": 0.14, "grad_norm": 3.156214475631714, "learning_rate": 9.854343592837207e-06, "loss": 1.4364, "step": 28300 }, { "epoch": 0.14, "grad_norm": 4.9435715675354, "learning_rate": 9.853718458042517e-06, "loss": 1.4352, "step": 28400 }, { "epoch": 0.14, "grad_norm": 3.94811749458313, "learning_rate": 9.853093323247827e-06, "loss": 1.4303, "step": 28500 }, { "epoch": 0.14, "grad_norm": 3.5269935131073, "learning_rate": 9.852468188453137e-06, "loss": 1.4214, "step": 28600 }, { "epoch": 0.14, "grad_norm": 4.688473224639893, "learning_rate": 9.851843053658447e-06, "loss": 1.3854, "step": 28700 }, { "epoch": 0.14, "grad_norm": 4.054961204528809, "learning_rate": 9.851217918863757e-06, "loss": 1.432, "step": 28800 }, { "epoch": 0.14, "grad_norm": 3.178467273712158, "learning_rate": 9.850592784069065e-06, "loss": 1.4439, "step": 28900 }, { "epoch": 0.14, "grad_norm": 4.031513690948486, "learning_rate": 9.849967649274376e-06, "loss": 1.4228, "step": 29000 }, { "epoch": 0.14, "grad_norm": 3.9268980026245117, "learning_rate": 9.849342514479685e-06, "loss": 1.4002, "step": 29100 }, { "epoch": 0.14, "grad_norm": 3.176645040512085, "learning_rate": 9.848717379684996e-06, "loss": 1.4732, "step": 29200 }, { "epoch": 0.14, "grad_norm": 2.7952117919921875, "learning_rate": 9.848092244890304e-06, "loss": 1.4609, "step": 29300 }, { "epoch": 0.15, "grad_norm": 3.6165409088134766, "learning_rate": 9.847467110095616e-06, "loss": 1.4179, "step": 29400 }, { "epoch": 0.15, "grad_norm": 4.359500408172607, "learning_rate": 9.846841975300924e-06, "loss": 1.4443, "step": 29500 }, { "epoch": 0.15, "grad_norm": 4.256430625915527, "learning_rate": 9.846216840506236e-06, "loss": 1.4205, "step": 29600 }, { "epoch": 0.15, "grad_norm": 4.939763069152832, "learning_rate": 9.845591705711544e-06, "loss": 1.3889, "step": 29700 }, { "epoch": 0.15, "grad_norm": 3.5934700965881348, "learning_rate": 9.844966570916856e-06, "loss": 1.4045, "step": 29800 }, { "epoch": 0.15, "grad_norm": 3.0760035514831543, "learning_rate": 9.844341436122164e-06, "loss": 1.4534, "step": 29900 }, { "epoch": 0.15, "grad_norm": 4.314694881439209, "learning_rate": 9.843716301327476e-06, "loss": 1.4284, "step": 30000 }, { "epoch": 0.15, "grad_norm": 3.9042022228240967, "learning_rate": 9.843091166532784e-06, "loss": 1.4249, "step": 30100 }, { "epoch": 0.15, "grad_norm": 3.454749822616577, "learning_rate": 9.842466031738094e-06, "loss": 1.4291, "step": 30200 }, { "epoch": 0.15, "grad_norm": 3.8640189170837402, "learning_rate": 9.841840896943404e-06, "loss": 1.4514, "step": 30300 }, { "epoch": 0.15, "grad_norm": 4.65750789642334, "learning_rate": 9.841215762148714e-06, "loss": 1.4626, "step": 30400 }, { "epoch": 0.15, "grad_norm": 4.030206680297852, "learning_rate": 9.840590627354024e-06, "loss": 1.4038, "step": 30500 }, { "epoch": 0.15, "grad_norm": 4.036793231964111, "learning_rate": 9.839965492559334e-06, "loss": 1.4225, "step": 30600 }, { "epoch": 0.15, "grad_norm": 3.980349063873291, "learning_rate": 9.839340357764643e-06, "loss": 1.4334, "step": 30700 }, { "epoch": 0.15, "grad_norm": 4.157260894775391, "learning_rate": 9.838715222969953e-06, "loss": 1.453, "step": 30800 }, { "epoch": 0.15, "grad_norm": 3.416947841644287, "learning_rate": 9.838090088175263e-06, "loss": 1.4176, "step": 30900 }, { "epoch": 0.15, "grad_norm": 5.0742645263671875, "learning_rate": 9.837464953380573e-06, "loss": 1.4492, "step": 31000 }, { "epoch": 0.15, "grad_norm": 3.5331027507781982, "learning_rate": 9.836839818585883e-06, "loss": 1.4258, "step": 31100 }, { "epoch": 0.15, "grad_norm": 3.268676280975342, "learning_rate": 9.836214683791193e-06, "loss": 1.4081, "step": 31200 }, { "epoch": 0.15, "grad_norm": 3.619158983230591, "learning_rate": 9.835589548996503e-06, "loss": 1.3998, "step": 31300 }, { "epoch": 0.16, "grad_norm": 3.513633966445923, "learning_rate": 9.834964414201813e-06, "loss": 1.4102, "step": 31400 }, { "epoch": 0.16, "grad_norm": 3.7973320484161377, "learning_rate": 9.834339279407123e-06, "loss": 1.3846, "step": 31500 }, { "epoch": 0.16, "grad_norm": 4.910383701324463, "learning_rate": 9.833714144612433e-06, "loss": 1.3886, "step": 31600 }, { "epoch": 0.16, "grad_norm": 3.820688009262085, "learning_rate": 9.833089009817743e-06, "loss": 1.4178, "step": 31700 }, { "epoch": 0.16, "grad_norm": 3.5854384899139404, "learning_rate": 9.832463875023053e-06, "loss": 1.4154, "step": 31800 }, { "epoch": 0.16, "grad_norm": 3.8664228916168213, "learning_rate": 9.831838740228363e-06, "loss": 1.4255, "step": 31900 }, { "epoch": 0.16, "grad_norm": 3.179574728012085, "learning_rate": 9.831213605433673e-06, "loss": 1.4039, "step": 32000 }, { "epoch": 0.16, "grad_norm": 4.206747531890869, "learning_rate": 9.830588470638982e-06, "loss": 1.4066, "step": 32100 }, { "epoch": 0.16, "grad_norm": 4.408509731292725, "learning_rate": 9.829963335844292e-06, "loss": 1.4028, "step": 32200 }, { "epoch": 0.16, "grad_norm": 5.01927375793457, "learning_rate": 9.829338201049602e-06, "loss": 1.4415, "step": 32300 }, { "epoch": 0.16, "grad_norm": 3.166085720062256, "learning_rate": 9.828713066254912e-06, "loss": 1.4698, "step": 32400 }, { "epoch": 0.16, "grad_norm": 3.4799346923828125, "learning_rate": 9.828087931460222e-06, "loss": 1.4055, "step": 32500 }, { "epoch": 0.16, "grad_norm": 3.68662428855896, "learning_rate": 9.827462796665532e-06, "loss": 1.3942, "step": 32600 }, { "epoch": 0.16, "grad_norm": 4.4798970222473145, "learning_rate": 9.826837661870842e-06, "loss": 1.4293, "step": 32700 }, { "epoch": 0.16, "grad_norm": 3.4319188594818115, "learning_rate": 9.826212527076152e-06, "loss": 1.4134, "step": 32800 }, { "epoch": 0.16, "grad_norm": 3.2756521701812744, "learning_rate": 9.825587392281462e-06, "loss": 1.4535, "step": 32900 }, { "epoch": 0.16, "grad_norm": 3.3544061183929443, "learning_rate": 9.82496225748677e-06, "loss": 1.3997, "step": 33000 }, { "epoch": 0.16, "grad_norm": 3.7909374237060547, "learning_rate": 9.824337122692082e-06, "loss": 1.4043, "step": 33100 }, { "epoch": 0.16, "grad_norm": 3.8240981101989746, "learning_rate": 9.82371198789739e-06, "loss": 1.428, "step": 33200 }, { "epoch": 0.16, "grad_norm": 3.214618682861328, "learning_rate": 9.823086853102702e-06, "loss": 1.3999, "step": 33300 }, { "epoch": 0.17, "grad_norm": 8.105681419372559, "learning_rate": 9.822461718308011e-06, "loss": 1.4082, "step": 33400 }, { "epoch": 0.17, "grad_norm": 4.449899196624756, "learning_rate": 9.821836583513321e-06, "loss": 1.4273, "step": 33500 }, { "epoch": 0.17, "grad_norm": 4.17997932434082, "learning_rate": 9.821211448718631e-06, "loss": 1.4291, "step": 33600 }, { "epoch": 0.17, "grad_norm": 3.813826322555542, "learning_rate": 9.820586313923941e-06, "loss": 1.407, "step": 33700 }, { "epoch": 0.17, "grad_norm": 3.7879397869110107, "learning_rate": 9.819961179129251e-06, "loss": 1.4159, "step": 33800 }, { "epoch": 0.17, "grad_norm": 3.9027743339538574, "learning_rate": 9.819336044334561e-06, "loss": 1.439, "step": 33900 }, { "epoch": 0.17, "grad_norm": 3.7069435119628906, "learning_rate": 9.818710909539871e-06, "loss": 1.4117, "step": 34000 }, { "epoch": 0.17, "grad_norm": 3.904519557952881, "learning_rate": 9.818085774745181e-06, "loss": 1.4227, "step": 34100 }, { "epoch": 0.17, "grad_norm": 3.545767068862915, "learning_rate": 9.817460639950491e-06, "loss": 1.4344, "step": 34200 }, { "epoch": 0.17, "grad_norm": 3.717536687850952, "learning_rate": 9.816835505155799e-06, "loss": 1.447, "step": 34300 }, { "epoch": 0.17, "grad_norm": 3.036220073699951, "learning_rate": 9.81621037036111e-06, "loss": 1.4124, "step": 34400 }, { "epoch": 0.17, "grad_norm": 4.334647178649902, "learning_rate": 9.815585235566419e-06, "loss": 1.4347, "step": 34500 }, { "epoch": 0.17, "grad_norm": 3.7795979976654053, "learning_rate": 9.81496010077173e-06, "loss": 1.417, "step": 34600 }, { "epoch": 0.17, "grad_norm": 3.4146125316619873, "learning_rate": 9.814334965977039e-06, "loss": 1.3847, "step": 34700 }, { "epoch": 0.17, "grad_norm": 3.193895101547241, "learning_rate": 9.81370983118235e-06, "loss": 1.4084, "step": 34800 }, { "epoch": 0.17, "grad_norm": 3.3683910369873047, "learning_rate": 9.813084696387659e-06, "loss": 1.4393, "step": 34900 }, { "epoch": 0.17, "grad_norm": 4.252421855926514, "learning_rate": 9.81245956159297e-06, "loss": 1.4294, "step": 35000 }, { "epoch": 0.17, "grad_norm": 3.6922848224639893, "learning_rate": 9.811834426798279e-06, "loss": 1.4212, "step": 35100 }, { "epoch": 0.17, "grad_norm": 3.27756667137146, "learning_rate": 9.81120929200359e-06, "loss": 1.4185, "step": 35200 }, { "epoch": 0.17, "grad_norm": 3.377180337905884, "learning_rate": 9.810584157208898e-06, "loss": 1.3991, "step": 35300 }, { "epoch": 0.18, "grad_norm": 3.4141881465911865, "learning_rate": 9.80995902241421e-06, "loss": 1.3816, "step": 35400 }, { "epoch": 0.18, "grad_norm": 3.6975343227386475, "learning_rate": 9.809333887619518e-06, "loss": 1.4275, "step": 35500 }, { "epoch": 0.18, "grad_norm": 3.303208112716675, "learning_rate": 9.808708752824828e-06, "loss": 1.3996, "step": 35600 }, { "epoch": 0.18, "grad_norm": 3.1281208992004395, "learning_rate": 9.808083618030138e-06, "loss": 1.4573, "step": 35700 }, { "epoch": 0.18, "grad_norm": 4.216818809509277, "learning_rate": 9.807458483235448e-06, "loss": 1.4264, "step": 35800 }, { "epoch": 0.18, "grad_norm": 3.6236705780029297, "learning_rate": 9.806833348440758e-06, "loss": 1.42, "step": 35900 }, { "epoch": 0.18, "grad_norm": 3.4652881622314453, "learning_rate": 9.806208213646068e-06, "loss": 1.4509, "step": 36000 }, { "epoch": 0.18, "grad_norm": 4.3565449714660645, "learning_rate": 9.805583078851378e-06, "loss": 1.3968, "step": 36100 }, { "epoch": 0.18, "grad_norm": 5.522129535675049, "learning_rate": 9.804957944056688e-06, "loss": 1.4402, "step": 36200 }, { "epoch": 0.18, "grad_norm": 3.9470767974853516, "learning_rate": 9.804332809261998e-06, "loss": 1.3922, "step": 36300 }, { "epoch": 0.18, "grad_norm": 3.978543758392334, "learning_rate": 9.803707674467308e-06, "loss": 1.403, "step": 36400 }, { "epoch": 0.18, "grad_norm": 5.382244110107422, "learning_rate": 9.803082539672618e-06, "loss": 1.3968, "step": 36500 }, { "epoch": 0.18, "grad_norm": 4.595647811889648, "learning_rate": 9.802457404877927e-06, "loss": 1.4002, "step": 36600 }, { "epoch": 0.18, "grad_norm": 3.6310489177703857, "learning_rate": 9.801832270083237e-06, "loss": 1.4122, "step": 36700 }, { "epoch": 0.18, "grad_norm": 3.4216363430023193, "learning_rate": 9.801207135288547e-06, "loss": 1.3859, "step": 36800 }, { "epoch": 0.18, "grad_norm": 3.4577724933624268, "learning_rate": 9.800582000493857e-06, "loss": 1.4206, "step": 36900 }, { "epoch": 0.18, "grad_norm": 4.211758136749268, "learning_rate": 9.799956865699167e-06, "loss": 1.4146, "step": 37000 }, { "epoch": 0.18, "grad_norm": 3.5187759399414062, "learning_rate": 9.799331730904477e-06, "loss": 1.3983, "step": 37100 }, { "epoch": 0.18, "grad_norm": 3.524277925491333, "learning_rate": 9.798706596109787e-06, "loss": 1.39, "step": 37200 }, { "epoch": 0.18, "grad_norm": 3.746493339538574, "learning_rate": 9.798081461315097e-06, "loss": 1.4351, "step": 37300 }, { "epoch": 0.19, "grad_norm": 3.489757537841797, "learning_rate": 9.797456326520407e-06, "loss": 1.4215, "step": 37400 }, { "epoch": 0.19, "grad_norm": 3.862546443939209, "learning_rate": 9.796831191725717e-06, "loss": 1.3839, "step": 37500 }, { "epoch": 0.19, "grad_norm": 3.700289487838745, "learning_rate": 9.796206056931027e-06, "loss": 1.4134, "step": 37600 }, { "epoch": 0.19, "grad_norm": 4.463230609893799, "learning_rate": 9.795580922136337e-06, "loss": 1.4094, "step": 37700 }, { "epoch": 0.19, "grad_norm": 3.6630661487579346, "learning_rate": 9.794955787341647e-06, "loss": 1.4008, "step": 37800 }, { "epoch": 0.19, "grad_norm": 4.630967140197754, "learning_rate": 9.794330652546956e-06, "loss": 1.3759, "step": 37900 }, { "epoch": 0.19, "grad_norm": 3.3025717735290527, "learning_rate": 9.793705517752266e-06, "loss": 1.3884, "step": 38000 }, { "epoch": 0.19, "grad_norm": 3.3258678913116455, "learning_rate": 9.793080382957576e-06, "loss": 1.386, "step": 38100 }, { "epoch": 0.19, "grad_norm": 3.719531536102295, "learning_rate": 9.792455248162885e-06, "loss": 1.3964, "step": 38200 }, { "epoch": 0.19, "grad_norm": 3.2938575744628906, "learning_rate": 9.791830113368196e-06, "loss": 1.4057, "step": 38300 }, { "epoch": 0.19, "grad_norm": 4.785384654998779, "learning_rate": 9.791204978573506e-06, "loss": 1.4134, "step": 38400 }, { "epoch": 0.19, "grad_norm": 3.3767313957214355, "learning_rate": 9.790579843778816e-06, "loss": 1.4139, "step": 38500 }, { "epoch": 0.19, "grad_norm": 2.9999425411224365, "learning_rate": 9.789954708984126e-06, "loss": 1.4039, "step": 38600 }, { "epoch": 0.19, "grad_norm": 4.019780158996582, "learning_rate": 9.789329574189436e-06, "loss": 1.3637, "step": 38700 }, { "epoch": 0.19, "grad_norm": 3.2933456897735596, "learning_rate": 9.788704439394746e-06, "loss": 1.3784, "step": 38800 }, { "epoch": 0.19, "grad_norm": 3.511465549468994, "learning_rate": 9.788079304600056e-06, "loss": 1.3845, "step": 38900 }, { "epoch": 0.19, "grad_norm": 3.031588077545166, "learning_rate": 9.787454169805366e-06, "loss": 1.3815, "step": 39000 }, { "epoch": 0.19, "grad_norm": 3.726041078567505, "learning_rate": 9.786829035010676e-06, "loss": 1.4031, "step": 39100 }, { "epoch": 0.19, "grad_norm": 3.459808349609375, "learning_rate": 9.786203900215986e-06, "loss": 1.4168, "step": 39200 }, { "epoch": 0.19, "grad_norm": 5.884055137634277, "learning_rate": 9.785578765421295e-06, "loss": 1.3897, "step": 39300 }, { "epoch": 0.19, "grad_norm": 3.8769099712371826, "learning_rate": 9.784953630626605e-06, "loss": 1.4214, "step": 39400 }, { "epoch": 0.2, "grad_norm": 4.353100776672363, "learning_rate": 9.784328495831914e-06, "loss": 1.3875, "step": 39500 }, { "epoch": 0.2, "grad_norm": 3.665733575820923, "learning_rate": 9.783703361037225e-06, "loss": 1.4033, "step": 39600 }, { "epoch": 0.2, "grad_norm": 4.098516941070557, "learning_rate": 9.783078226242533e-06, "loss": 1.3755, "step": 39700 }, { "epoch": 0.2, "grad_norm": 3.6719651222229004, "learning_rate": 9.782453091447845e-06, "loss": 1.4025, "step": 39800 }, { "epoch": 0.2, "grad_norm": 3.7323970794677734, "learning_rate": 9.781827956653153e-06, "loss": 1.4072, "step": 39900 }, { "epoch": 0.2, "grad_norm": 3.6012964248657227, "learning_rate": 9.781202821858465e-06, "loss": 1.3839, "step": 40000 }, { "epoch": 0.2, "grad_norm": 3.1830966472625732, "learning_rate": 9.780577687063773e-06, "loss": 1.4025, "step": 40100 }, { "epoch": 0.2, "grad_norm": 3.503458261489868, "learning_rate": 9.779952552269085e-06, "loss": 1.3849, "step": 40200 }, { "epoch": 0.2, "grad_norm": 3.934358596801758, "learning_rate": 9.779327417474393e-06, "loss": 1.3908, "step": 40300 }, { "epoch": 0.2, "grad_norm": 3.263597249984741, "learning_rate": 9.778702282679705e-06, "loss": 1.4144, "step": 40400 }, { "epoch": 0.2, "grad_norm": 3.6019351482391357, "learning_rate": 9.778077147885013e-06, "loss": 1.3966, "step": 40500 }, { "epoch": 0.2, "grad_norm": 3.211871862411499, "learning_rate": 9.777452013090324e-06, "loss": 1.4029, "step": 40600 }, { "epoch": 0.2, "grad_norm": 4.445366382598877, "learning_rate": 9.776826878295633e-06, "loss": 1.4192, "step": 40700 }, { "epoch": 0.2, "grad_norm": 3.834134340286255, "learning_rate": 9.776201743500943e-06, "loss": 1.3971, "step": 40800 }, { "epoch": 0.2, "grad_norm": 11.223153114318848, "learning_rate": 9.775576608706253e-06, "loss": 1.4199, "step": 40900 }, { "epoch": 0.2, "grad_norm": 3.168875217437744, "learning_rate": 9.774951473911563e-06, "loss": 1.403, "step": 41000 }, { "epoch": 0.2, "grad_norm": 4.082376480102539, "learning_rate": 9.774326339116872e-06, "loss": 1.3758, "step": 41100 }, { "epoch": 0.2, "grad_norm": 3.381903648376465, "learning_rate": 9.773701204322182e-06, "loss": 1.4124, "step": 41200 }, { "epoch": 0.2, "grad_norm": 3.220072031021118, "learning_rate": 9.773076069527492e-06, "loss": 1.3731, "step": 41300 }, { "epoch": 0.2, "grad_norm": 4.445113182067871, "learning_rate": 9.772450934732802e-06, "loss": 1.409, "step": 41400 }, { "epoch": 0.21, "grad_norm": 4.551964282989502, "learning_rate": 9.771825799938112e-06, "loss": 1.4092, "step": 41500 }, { "epoch": 0.21, "grad_norm": 3.120997905731201, "learning_rate": 9.771200665143422e-06, "loss": 1.4316, "step": 41600 }, { "epoch": 0.21, "grad_norm": 3.3942294120788574, "learning_rate": 9.770575530348732e-06, "loss": 1.3752, "step": 41700 }, { "epoch": 0.21, "grad_norm": 3.3764448165893555, "learning_rate": 9.769950395554042e-06, "loss": 1.4238, "step": 41800 }, { "epoch": 0.21, "grad_norm": 3.600349187850952, "learning_rate": 9.769325260759352e-06, "loss": 1.4012, "step": 41900 }, { "epoch": 0.21, "grad_norm": 3.6565279960632324, "learning_rate": 9.768700125964662e-06, "loss": 1.3768, "step": 42000 }, { "epoch": 0.21, "grad_norm": 3.65138578414917, "learning_rate": 9.768074991169972e-06, "loss": 1.4022, "step": 42100 }, { "epoch": 0.21, "grad_norm": 3.3732988834381104, "learning_rate": 9.767449856375282e-06, "loss": 1.409, "step": 42200 }, { "epoch": 0.21, "grad_norm": 3.8248541355133057, "learning_rate": 9.766824721580592e-06, "loss": 1.4121, "step": 42300 }, { "epoch": 0.21, "grad_norm": 3.3323121070861816, "learning_rate": 9.766199586785901e-06, "loss": 1.4203, "step": 42400 }, { "epoch": 0.21, "grad_norm": 3.0881714820861816, "learning_rate": 9.765574451991211e-06, "loss": 1.3993, "step": 42500 }, { "epoch": 0.21, "grad_norm": 3.6461262702941895, "learning_rate": 9.764949317196521e-06, "loss": 1.3842, "step": 42600 }, { "epoch": 0.21, "grad_norm": 3.2910470962524414, "learning_rate": 9.764324182401831e-06, "loss": 1.4108, "step": 42700 }, { "epoch": 0.21, "grad_norm": 3.8803622722625732, "learning_rate": 9.763699047607141e-06, "loss": 1.3962, "step": 42800 }, { "epoch": 0.21, "grad_norm": 3.9977149963378906, "learning_rate": 9.763073912812451e-06, "loss": 1.3876, "step": 42900 }, { "epoch": 0.21, "grad_norm": 3.4803717136383057, "learning_rate": 9.762448778017761e-06, "loss": 1.3601, "step": 43000 }, { "epoch": 0.21, "grad_norm": 3.819903612136841, "learning_rate": 9.761823643223071e-06, "loss": 1.3891, "step": 43100 }, { "epoch": 0.21, "grad_norm": 3.582854747772217, "learning_rate": 9.761198508428381e-06, "loss": 1.4145, "step": 43200 }, { "epoch": 0.21, "grad_norm": 3.323837995529175, "learning_rate": 9.76057337363369e-06, "loss": 1.4141, "step": 43300 }, { "epoch": 0.21, "grad_norm": 2.7181520462036133, "learning_rate": 9.759948238839e-06, "loss": 1.3464, "step": 43400 }, { "epoch": 0.22, "grad_norm": 3.4599475860595703, "learning_rate": 9.75932310404431e-06, "loss": 1.3775, "step": 43500 }, { "epoch": 0.22, "grad_norm": 3.005889654159546, "learning_rate": 9.75869796924962e-06, "loss": 1.4078, "step": 43600 }, { "epoch": 0.22, "grad_norm": 3.152175188064575, "learning_rate": 9.75807283445493e-06, "loss": 1.411, "step": 43700 }, { "epoch": 0.22, "grad_norm": 3.951000213623047, "learning_rate": 9.75744769966024e-06, "loss": 1.3737, "step": 43800 }, { "epoch": 0.22, "grad_norm": 3.722508192062378, "learning_rate": 9.75682256486555e-06, "loss": 1.4092, "step": 43900 }, { "epoch": 0.22, "grad_norm": 4.0199761390686035, "learning_rate": 9.75619743007086e-06, "loss": 1.3964, "step": 44000 }, { "epoch": 0.22, "grad_norm": 3.306147336959839, "learning_rate": 9.75557229527617e-06, "loss": 1.3772, "step": 44100 }, { "epoch": 0.22, "grad_norm": 3.0931670665740967, "learning_rate": 9.75494716048148e-06, "loss": 1.3848, "step": 44200 }, { "epoch": 0.22, "grad_norm": 3.4775798320770264, "learning_rate": 9.75432202568679e-06, "loss": 1.391, "step": 44300 }, { "epoch": 0.22, "grad_norm": 2.973440170288086, "learning_rate": 9.7536968908921e-06, "loss": 1.3993, "step": 44400 }, { "epoch": 0.22, "grad_norm": 2.9864256381988525, "learning_rate": 9.75307175609741e-06, "loss": 1.3523, "step": 44500 }, { "epoch": 0.22, "grad_norm": 3.9546356201171875, "learning_rate": 9.75244662130272e-06, "loss": 1.3527, "step": 44600 }, { "epoch": 0.22, "grad_norm": 6.238116264343262, "learning_rate": 9.75182148650803e-06, "loss": 1.3624, "step": 44700 }, { "epoch": 0.22, "grad_norm": 3.241060495376587, "learning_rate": 9.75119635171334e-06, "loss": 1.3875, "step": 44800 }, { "epoch": 0.22, "grad_norm": 3.17099666595459, "learning_rate": 9.750571216918648e-06, "loss": 1.381, "step": 44900 }, { "epoch": 0.22, "grad_norm": 3.647505283355713, "learning_rate": 9.74994608212396e-06, "loss": 1.3673, "step": 45000 }, { "epoch": 0.22, "grad_norm": 3.5791754722595215, "learning_rate": 9.749320947329268e-06, "loss": 1.3936, "step": 45100 }, { "epoch": 0.22, "grad_norm": 4.946603775024414, "learning_rate": 9.74869581253458e-06, "loss": 1.4135, "step": 45200 }, { "epoch": 0.22, "grad_norm": 2.9948465824127197, "learning_rate": 9.748070677739888e-06, "loss": 1.3824, "step": 45300 }, { "epoch": 0.22, "grad_norm": 3.355520725250244, "learning_rate": 9.7474455429452e-06, "loss": 1.3757, "step": 45400 }, { "epoch": 0.23, "grad_norm": 3.599808931350708, "learning_rate": 9.746820408150508e-06, "loss": 1.4019, "step": 45500 }, { "epoch": 0.23, "grad_norm": 2.60089111328125, "learning_rate": 9.746195273355819e-06, "loss": 1.4147, "step": 45600 }, { "epoch": 0.23, "grad_norm": 3.074833393096924, "learning_rate": 9.745570138561127e-06, "loss": 1.4084, "step": 45700 }, { "epoch": 0.23, "grad_norm": 3.3081626892089844, "learning_rate": 9.744945003766439e-06, "loss": 1.3999, "step": 45800 }, { "epoch": 0.23, "grad_norm": 4.088558673858643, "learning_rate": 9.744319868971747e-06, "loss": 1.398, "step": 45900 }, { "epoch": 0.23, "grad_norm": 3.7625129222869873, "learning_rate": 9.743694734177059e-06, "loss": 1.388, "step": 46000 }, { "epoch": 0.23, "grad_norm": 4.328056335449219, "learning_rate": 9.743069599382367e-06, "loss": 1.3721, "step": 46100 }, { "epoch": 0.23, "grad_norm": 3.1336140632629395, "learning_rate": 9.742444464587677e-06, "loss": 1.3865, "step": 46200 }, { "epoch": 0.23, "grad_norm": 3.0789365768432617, "learning_rate": 9.741819329792987e-06, "loss": 1.3609, "step": 46300 }, { "epoch": 0.23, "grad_norm": 3.569803237915039, "learning_rate": 9.741194194998297e-06, "loss": 1.3954, "step": 46400 }, { "epoch": 0.23, "grad_norm": 2.9342846870422363, "learning_rate": 9.740569060203607e-06, "loss": 1.3381, "step": 46500 }, { "epoch": 0.23, "grad_norm": 3.4400010108947754, "learning_rate": 9.739943925408917e-06, "loss": 1.3582, "step": 46600 }, { "epoch": 0.23, "grad_norm": 3.698220729827881, "learning_rate": 9.739318790614227e-06, "loss": 1.4165, "step": 46700 }, { "epoch": 0.23, "grad_norm": 3.7116315364837646, "learning_rate": 9.738693655819537e-06, "loss": 1.3643, "step": 46800 }, { "epoch": 0.23, "grad_norm": 4.515981674194336, "learning_rate": 9.738068521024846e-06, "loss": 1.3899, "step": 46900 }, { "epoch": 0.23, "grad_norm": 3.1222646236419678, "learning_rate": 9.737443386230156e-06, "loss": 1.3497, "step": 47000 }, { "epoch": 0.23, "grad_norm": 3.3143482208251953, "learning_rate": 9.736818251435466e-06, "loss": 1.3906, "step": 47100 }, { "epoch": 0.23, "grad_norm": 3.673861026763916, "learning_rate": 9.736193116640776e-06, "loss": 1.3742, "step": 47200 }, { "epoch": 0.23, "grad_norm": 3.1223580837249756, "learning_rate": 9.735567981846086e-06, "loss": 1.3679, "step": 47300 }, { "epoch": 0.23, "grad_norm": 3.321925640106201, "learning_rate": 9.734942847051396e-06, "loss": 1.3872, "step": 47400 }, { "epoch": 0.24, "grad_norm": 3.3574061393737793, "learning_rate": 9.734317712256706e-06, "loss": 1.3536, "step": 47500 }, { "epoch": 0.24, "grad_norm": 3.0097270011901855, "learning_rate": 9.733692577462016e-06, "loss": 1.3867, "step": 47600 }, { "epoch": 0.24, "grad_norm": 6.731925964355469, "learning_rate": 9.733067442667326e-06, "loss": 1.3671, "step": 47700 }, { "epoch": 0.24, "grad_norm": 4.5036468505859375, "learning_rate": 9.732442307872636e-06, "loss": 1.3935, "step": 47800 }, { "epoch": 0.24, "grad_norm": 5.059147357940674, "learning_rate": 9.731817173077946e-06, "loss": 1.3693, "step": 47900 }, { "epoch": 0.24, "grad_norm": 3.65627384185791, "learning_rate": 9.731192038283256e-06, "loss": 1.3838, "step": 48000 }, { "epoch": 0.24, "grad_norm": 3.8300678730010986, "learning_rate": 9.730566903488566e-06, "loss": 1.4273, "step": 48100 }, { "epoch": 0.24, "grad_norm": 3.2943530082702637, "learning_rate": 9.729941768693876e-06, "loss": 1.3598, "step": 48200 }, { "epoch": 0.24, "grad_norm": 3.6148500442504883, "learning_rate": 9.729316633899185e-06, "loss": 1.3916, "step": 48300 }, { "epoch": 0.24, "grad_norm": 3.14809250831604, "learning_rate": 9.728691499104495e-06, "loss": 1.3596, "step": 48400 }, { "epoch": 0.24, "grad_norm": 3.4826507568359375, "learning_rate": 9.728066364309805e-06, "loss": 1.4113, "step": 48500 }, { "epoch": 0.24, "grad_norm": 3.761301279067993, "learning_rate": 9.727441229515115e-06, "loss": 1.3811, "step": 48600 }, { "epoch": 0.24, "grad_norm": 3.280597448348999, "learning_rate": 9.726816094720425e-06, "loss": 1.3735, "step": 48700 }, { "epoch": 0.24, "grad_norm": 3.2056515216827393, "learning_rate": 9.726190959925735e-06, "loss": 1.3928, "step": 48800 }, { "epoch": 0.24, "grad_norm": 5.535262107849121, "learning_rate": 9.725565825131045e-06, "loss": 1.3795, "step": 48900 }, { "epoch": 0.24, "grad_norm": 3.71197509765625, "learning_rate": 9.724940690336355e-06, "loss": 1.3956, "step": 49000 }, { "epoch": 0.24, "grad_norm": 3.048292875289917, "learning_rate": 9.724315555541665e-06, "loss": 1.3756, "step": 49100 }, { "epoch": 0.24, "grad_norm": 3.6094841957092285, "learning_rate": 9.723690420746975e-06, "loss": 1.4143, "step": 49200 }, { "epoch": 0.24, "grad_norm": 4.416449546813965, "learning_rate": 9.723065285952285e-06, "loss": 1.3618, "step": 49300 }, { "epoch": 0.24, "grad_norm": 4.372152328491211, "learning_rate": 9.722440151157595e-06, "loss": 1.402, "step": 49400 }, { "epoch": 0.24, "grad_norm": 3.1622934341430664, "learning_rate": 9.721815016362905e-06, "loss": 1.3914, "step": 49500 }, { "epoch": 0.25, "grad_norm": 3.1704394817352295, "learning_rate": 9.721189881568214e-06, "loss": 1.3827, "step": 49600 }, { "epoch": 0.25, "grad_norm": 3.9178764820098877, "learning_rate": 9.720564746773524e-06, "loss": 1.371, "step": 49700 }, { "epoch": 0.25, "grad_norm": 3.839916706085205, "learning_rate": 9.719939611978834e-06, "loss": 1.361, "step": 49800 }, { "epoch": 0.25, "grad_norm": 3.5647811889648438, "learning_rate": 9.719314477184144e-06, "loss": 1.3857, "step": 49900 }, { "epoch": 0.25, "grad_norm": 3.2756240367889404, "learning_rate": 9.718689342389454e-06, "loss": 1.375, "step": 50000 }, { "epoch": 0.25, "grad_norm": 4.051654815673828, "learning_rate": 9.718064207594762e-06, "loss": 1.3941, "step": 50100 }, { "epoch": 0.25, "grad_norm": 4.137097358703613, "learning_rate": 9.717439072800074e-06, "loss": 1.3892, "step": 50200 }, { "epoch": 0.25, "grad_norm": 3.646369457244873, "learning_rate": 9.716813938005382e-06, "loss": 1.3846, "step": 50300 }, { "epoch": 0.25, "grad_norm": 3.162900686264038, "learning_rate": 9.716188803210694e-06, "loss": 1.4173, "step": 50400 }, { "epoch": 0.25, "grad_norm": 3.1182548999786377, "learning_rate": 9.715563668416002e-06, "loss": 1.3769, "step": 50500 }, { "epoch": 0.25, "grad_norm": 5.595252513885498, "learning_rate": 9.714938533621314e-06, "loss": 1.4087, "step": 50600 }, { "epoch": 0.25, "grad_norm": 2.782058000564575, "learning_rate": 9.714313398826622e-06, "loss": 1.4179, "step": 50700 }, { "epoch": 0.25, "grad_norm": 3.0897421836853027, "learning_rate": 9.713688264031934e-06, "loss": 1.3671, "step": 50800 }, { "epoch": 0.25, "grad_norm": 3.455578565597534, "learning_rate": 9.713063129237242e-06, "loss": 1.401, "step": 50900 }, { "epoch": 0.25, "grad_norm": 3.877256155014038, "learning_rate": 9.712437994442553e-06, "loss": 1.4114, "step": 51000 }, { "epoch": 0.25, "grad_norm": 3.3103723526000977, "learning_rate": 9.711812859647862e-06, "loss": 1.3895, "step": 51100 }, { "epoch": 0.25, "grad_norm": 3.909396171569824, "learning_rate": 9.711187724853173e-06, "loss": 1.3632, "step": 51200 }, { "epoch": 0.25, "grad_norm": 3.4170010089874268, "learning_rate": 9.710562590058482e-06, "loss": 1.3819, "step": 51300 }, { "epoch": 0.25, "grad_norm": 3.1259448528289795, "learning_rate": 9.709937455263791e-06, "loss": 1.3894, "step": 51400 }, { "epoch": 0.25, "grad_norm": 4.449690818786621, "learning_rate": 9.709312320469101e-06, "loss": 1.3998, "step": 51500 }, { "epoch": 0.26, "grad_norm": 3.47631573677063, "learning_rate": 9.708687185674411e-06, "loss": 1.374, "step": 51600 }, { "epoch": 0.26, "grad_norm": 3.5324013233184814, "learning_rate": 9.708062050879721e-06, "loss": 1.3518, "step": 51700 }, { "epoch": 0.26, "grad_norm": 3.9155433177948, "learning_rate": 9.707436916085031e-06, "loss": 1.3848, "step": 51800 }, { "epoch": 0.26, "grad_norm": 3.4327306747436523, "learning_rate": 9.706811781290341e-06, "loss": 1.3653, "step": 51900 }, { "epoch": 0.26, "grad_norm": 6.06643533706665, "learning_rate": 9.706186646495651e-06, "loss": 1.3892, "step": 52000 }, { "epoch": 0.26, "grad_norm": 3.518132448196411, "learning_rate": 9.705561511700961e-06, "loss": 1.3433, "step": 52100 }, { "epoch": 0.26, "grad_norm": 3.2915585041046143, "learning_rate": 9.704936376906271e-06, "loss": 1.3579, "step": 52200 }, { "epoch": 0.26, "grad_norm": 3.804596185684204, "learning_rate": 9.70431124211158e-06, "loss": 1.3809, "step": 52300 }, { "epoch": 0.26, "grad_norm": 4.617377758026123, "learning_rate": 9.70368610731689e-06, "loss": 1.398, "step": 52400 }, { "epoch": 0.26, "grad_norm": 3.0816659927368164, "learning_rate": 9.7030609725222e-06, "loss": 1.3652, "step": 52500 }, { "epoch": 0.26, "grad_norm": 3.5903923511505127, "learning_rate": 9.70243583772751e-06, "loss": 1.3767, "step": 52600 }, { "epoch": 0.26, "grad_norm": 3.43280029296875, "learning_rate": 9.70181070293282e-06, "loss": 1.3802, "step": 52700 }, { "epoch": 0.26, "grad_norm": 4.5906081199646, "learning_rate": 9.70118556813813e-06, "loss": 1.3726, "step": 52800 }, { "epoch": 0.26, "grad_norm": 3.5110647678375244, "learning_rate": 9.70056043334344e-06, "loss": 1.4125, "step": 52900 }, { "epoch": 0.26, "grad_norm": 3.5731992721557617, "learning_rate": 9.69993529854875e-06, "loss": 1.3971, "step": 53000 }, { "epoch": 0.26, "grad_norm": 3.522200584411621, "learning_rate": 9.69931016375406e-06, "loss": 1.3758, "step": 53100 }, { "epoch": 0.26, "grad_norm": 3.754093647003174, "learning_rate": 9.69868502895937e-06, "loss": 1.4125, "step": 53200 }, { "epoch": 0.26, "grad_norm": 4.088795185089111, "learning_rate": 9.69805989416468e-06, "loss": 1.374, "step": 53300 }, { "epoch": 0.26, "grad_norm": 3.095700263977051, "learning_rate": 9.69743475936999e-06, "loss": 1.3475, "step": 53400 }, { "epoch": 0.26, "grad_norm": 3.6446001529693604, "learning_rate": 9.6968096245753e-06, "loss": 1.3675, "step": 53500 }, { "epoch": 0.27, "grad_norm": 3.0287554264068604, "learning_rate": 9.69618448978061e-06, "loss": 1.3648, "step": 53600 }, { "epoch": 0.27, "grad_norm": 3.5153772830963135, "learning_rate": 9.69555935498592e-06, "loss": 1.3526, "step": 53700 }, { "epoch": 0.27, "grad_norm": 3.402449131011963, "learning_rate": 9.69493422019123e-06, "loss": 1.4021, "step": 53800 }, { "epoch": 0.27, "grad_norm": 4.223129749298096, "learning_rate": 9.69430908539654e-06, "loss": 1.3754, "step": 53900 }, { "epoch": 0.27, "grad_norm": 2.7301337718963623, "learning_rate": 9.69368395060185e-06, "loss": 1.411, "step": 54000 }, { "epoch": 0.27, "grad_norm": 3.3157832622528076, "learning_rate": 9.69305881580716e-06, "loss": 1.3695, "step": 54100 }, { "epoch": 0.27, "grad_norm": 4.0217671394348145, "learning_rate": 9.69243368101247e-06, "loss": 1.3822, "step": 54200 }, { "epoch": 0.27, "grad_norm": 3.575080633163452, "learning_rate": 9.69180854621778e-06, "loss": 1.3578, "step": 54300 }, { "epoch": 0.27, "grad_norm": 3.4445888996124268, "learning_rate": 9.69118341142309e-06, "loss": 1.3795, "step": 54400 }, { "epoch": 0.27, "grad_norm": 3.173060894012451, "learning_rate": 9.6905582766284e-06, "loss": 1.3446, "step": 54500 }, { "epoch": 0.27, "grad_norm": 3.8823065757751465, "learning_rate": 9.689933141833709e-06, "loss": 1.3696, "step": 54600 }, { "epoch": 0.27, "grad_norm": 3.5092880725860596, "learning_rate": 9.689308007039019e-06, "loss": 1.3262, "step": 54700 }, { "epoch": 0.27, "grad_norm": 3.270498514175415, "learning_rate": 9.688682872244329e-06, "loss": 1.3905, "step": 54800 }, { "epoch": 0.27, "grad_norm": 3.0580673217773438, "learning_rate": 9.688057737449639e-06, "loss": 1.3707, "step": 54900 }, { "epoch": 0.27, "grad_norm": 2.982407569885254, "learning_rate": 9.687432602654949e-06, "loss": 1.3909, "step": 55000 }, { "epoch": 0.27, "grad_norm": 4.194490432739258, "learning_rate": 9.686807467860259e-06, "loss": 1.3703, "step": 55100 }, { "epoch": 0.27, "grad_norm": 3.5486743450164795, "learning_rate": 9.686182333065569e-06, "loss": 1.3528, "step": 55200 }, { "epoch": 0.27, "grad_norm": 3.081116199493408, "learning_rate": 9.685557198270879e-06, "loss": 1.3929, "step": 55300 }, { "epoch": 0.27, "grad_norm": 3.644366979598999, "learning_rate": 9.684932063476189e-06, "loss": 1.3847, "step": 55400 }, { "epoch": 0.27, "grad_norm": 2.9201712608337402, "learning_rate": 9.684306928681497e-06, "loss": 1.3689, "step": 55500 }, { "epoch": 0.28, "grad_norm": 3.1488590240478516, "learning_rate": 9.683681793886808e-06, "loss": 1.3573, "step": 55600 }, { "epoch": 0.28, "grad_norm": 3.4069724082946777, "learning_rate": 9.683056659092117e-06, "loss": 1.3447, "step": 55700 }, { "epoch": 0.28, "grad_norm": 3.669130563735962, "learning_rate": 9.682431524297428e-06, "loss": 1.3629, "step": 55800 }, { "epoch": 0.28, "grad_norm": 3.5612809658050537, "learning_rate": 9.681806389502736e-06, "loss": 1.3463, "step": 55900 }, { "epoch": 0.28, "grad_norm": 3.3988492488861084, "learning_rate": 9.681181254708048e-06, "loss": 1.3262, "step": 56000 }, { "epoch": 0.28, "grad_norm": 3.286510705947876, "learning_rate": 9.680556119913356e-06, "loss": 1.3671, "step": 56100 }, { "epoch": 0.28, "grad_norm": 4.079017639160156, "learning_rate": 9.679930985118668e-06, "loss": 1.3691, "step": 56200 }, { "epoch": 0.28, "grad_norm": 5.038201808929443, "learning_rate": 9.679305850323976e-06, "loss": 1.3684, "step": 56300 }, { "epoch": 0.28, "grad_norm": 2.9335787296295166, "learning_rate": 9.678680715529288e-06, "loss": 1.3651, "step": 56400 }, { "epoch": 0.28, "grad_norm": 3.750838279724121, "learning_rate": 9.678055580734596e-06, "loss": 1.3979, "step": 56500 }, { "epoch": 0.28, "grad_norm": 3.269113779067993, "learning_rate": 9.677430445939908e-06, "loss": 1.4005, "step": 56600 }, { "epoch": 0.28, "grad_norm": 2.9525506496429443, "learning_rate": 9.676805311145216e-06, "loss": 1.3535, "step": 56700 }, { "epoch": 0.28, "grad_norm": 4.0349273681640625, "learning_rate": 9.676180176350526e-06, "loss": 1.3568, "step": 56800 }, { "epoch": 0.28, "grad_norm": 3.9644954204559326, "learning_rate": 9.675555041555836e-06, "loss": 1.3368, "step": 56900 }, { "epoch": 0.28, "grad_norm": 3.748861312866211, "learning_rate": 9.674929906761146e-06, "loss": 1.3788, "step": 57000 }, { "epoch": 0.28, "grad_norm": 3.927027940750122, "learning_rate": 9.674304771966456e-06, "loss": 1.3369, "step": 57100 }, { "epoch": 0.28, "grad_norm": 3.5700511932373047, "learning_rate": 9.673679637171766e-06, "loss": 1.3476, "step": 57200 }, { "epoch": 0.28, "grad_norm": 3.2039616107940674, "learning_rate": 9.673054502377075e-06, "loss": 1.358, "step": 57300 }, { "epoch": 0.28, "grad_norm": 3.7656171321868896, "learning_rate": 9.672429367582385e-06, "loss": 1.3656, "step": 57400 }, { "epoch": 0.28, "grad_norm": 3.84936261177063, "learning_rate": 9.671804232787695e-06, "loss": 1.4221, "step": 57500 }, { "epoch": 0.28, "grad_norm": 3.145597457885742, "learning_rate": 9.671179097993005e-06, "loss": 1.3444, "step": 57600 }, { "epoch": 0.29, "grad_norm": 3.1017513275146484, "learning_rate": 9.670553963198315e-06, "loss": 1.3604, "step": 57700 }, { "epoch": 0.29, "grad_norm": 3.460015058517456, "learning_rate": 9.669928828403625e-06, "loss": 1.3556, "step": 57800 }, { "epoch": 0.29, "grad_norm": 3.2410836219787598, "learning_rate": 9.669303693608935e-06, "loss": 1.3468, "step": 57900 }, { "epoch": 0.29, "grad_norm": 4.7556843757629395, "learning_rate": 9.668678558814245e-06, "loss": 1.3535, "step": 58000 }, { "epoch": 0.29, "grad_norm": 2.8711953163146973, "learning_rate": 9.668053424019555e-06, "loss": 1.3587, "step": 58100 }, { "epoch": 0.29, "grad_norm": 2.9098100662231445, "learning_rate": 9.667428289224865e-06, "loss": 1.3866, "step": 58200 }, { "epoch": 0.29, "grad_norm": 3.6795027256011963, "learning_rate": 9.666803154430175e-06, "loss": 1.3692, "step": 58300 }, { "epoch": 0.29, "grad_norm": 4.239531517028809, "learning_rate": 9.666178019635485e-06, "loss": 1.3898, "step": 58400 }, { "epoch": 0.29, "grad_norm": 2.871461868286133, "learning_rate": 9.665552884840795e-06, "loss": 1.3863, "step": 58500 }, { "epoch": 0.29, "grad_norm": 3.8275647163391113, "learning_rate": 9.664927750046104e-06, "loss": 1.4035, "step": 58600 }, { "epoch": 0.29, "grad_norm": 3.168945074081421, "learning_rate": 9.664302615251414e-06, "loss": 1.4028, "step": 58700 }, { "epoch": 0.29, "grad_norm": 3.4457874298095703, "learning_rate": 9.663677480456724e-06, "loss": 1.3484, "step": 58800 }, { "epoch": 0.29, "grad_norm": 3.0998809337615967, "learning_rate": 9.663052345662034e-06, "loss": 1.3533, "step": 58900 }, { "epoch": 0.29, "grad_norm": 3.2760820388793945, "learning_rate": 9.662427210867344e-06, "loss": 1.3733, "step": 59000 }, { "epoch": 0.29, "grad_norm": 3.1642961502075195, "learning_rate": 9.661802076072654e-06, "loss": 1.3675, "step": 59100 }, { "epoch": 0.29, "grad_norm": 3.5796260833740234, "learning_rate": 9.661176941277964e-06, "loss": 1.3842, "step": 59200 }, { "epoch": 0.29, "grad_norm": 3.379223108291626, "learning_rate": 9.660551806483274e-06, "loss": 1.3, "step": 59300 }, { "epoch": 0.29, "grad_norm": 3.7059500217437744, "learning_rate": 9.659926671688584e-06, "loss": 1.3801, "step": 59400 }, { "epoch": 0.29, "grad_norm": 3.9644994735717773, "learning_rate": 9.659301536893894e-06, "loss": 1.3882, "step": 59500 }, { "epoch": 0.29, "grad_norm": 3.516009569168091, "learning_rate": 9.658676402099204e-06, "loss": 1.3507, "step": 59600 }, { "epoch": 0.3, "grad_norm": 3.6307122707366943, "learning_rate": 9.658051267304514e-06, "loss": 1.3887, "step": 59700 }, { "epoch": 0.3, "grad_norm": 2.963676929473877, "learning_rate": 9.657426132509824e-06, "loss": 1.3574, "step": 59800 }, { "epoch": 0.3, "grad_norm": 3.6590583324432373, "learning_rate": 9.656800997715134e-06, "loss": 1.3766, "step": 59900 }, { "epoch": 0.3, "grad_norm": 3.9890248775482178, "learning_rate": 9.656175862920443e-06, "loss": 1.3766, "step": 60000 }, { "epoch": 0.3, "grad_norm": 3.7033519744873047, "learning_rate": 9.655550728125753e-06, "loss": 1.3653, "step": 60100 }, { "epoch": 0.3, "grad_norm": 3.2749149799346924, "learning_rate": 9.654925593331063e-06, "loss": 1.3555, "step": 60200 }, { "epoch": 0.3, "grad_norm": 2.6017117500305176, "learning_rate": 9.654300458536373e-06, "loss": 1.3379, "step": 60300 }, { "epoch": 0.3, "grad_norm": 3.2133805751800537, "learning_rate": 9.653675323741683e-06, "loss": 1.3383, "step": 60400 }, { "epoch": 0.3, "grad_norm": 3.5152649879455566, "learning_rate": 9.653050188946993e-06, "loss": 1.3821, "step": 60500 }, { "epoch": 0.3, "grad_norm": 3.8433949947357178, "learning_rate": 9.652425054152303e-06, "loss": 1.3648, "step": 60600 }, { "epoch": 0.3, "grad_norm": 3.219630241394043, "learning_rate": 9.651799919357611e-06, "loss": 1.3381, "step": 60700 }, { "epoch": 0.3, "grad_norm": 2.9422874450683594, "learning_rate": 9.651174784562923e-06, "loss": 1.3596, "step": 60800 }, { "epoch": 0.3, "grad_norm": 3.0160861015319824, "learning_rate": 9.650549649768231e-06, "loss": 1.4086, "step": 60900 }, { "epoch": 0.3, "grad_norm": 3.6983797550201416, "learning_rate": 9.649924514973543e-06, "loss": 1.3653, "step": 61000 }, { "epoch": 0.3, "grad_norm": 3.4345366954803467, "learning_rate": 9.649299380178851e-06, "loss": 1.3836, "step": 61100 }, { "epoch": 0.3, "grad_norm": 4.864907741546631, "learning_rate": 9.648674245384163e-06, "loss": 1.3395, "step": 61200 }, { "epoch": 0.3, "grad_norm": 3.0346243381500244, "learning_rate": 9.64804911058947e-06, "loss": 1.3483, "step": 61300 }, { "epoch": 0.3, "grad_norm": 3.271688938140869, "learning_rate": 9.647423975794782e-06, "loss": 1.3718, "step": 61400 }, { "epoch": 0.3, "grad_norm": 5.519439697265625, "learning_rate": 9.64679884100009e-06, "loss": 1.3516, "step": 61500 }, { "epoch": 0.3, "grad_norm": 3.676679849624634, "learning_rate": 9.646173706205402e-06, "loss": 1.3786, "step": 61600 }, { "epoch": 0.31, "grad_norm": 2.799685001373291, "learning_rate": 9.64554857141071e-06, "loss": 1.3622, "step": 61700 }, { "epoch": 0.31, "grad_norm": 3.5693440437316895, "learning_rate": 9.644923436616022e-06, "loss": 1.3506, "step": 61800 }, { "epoch": 0.31, "grad_norm": 4.081248760223389, "learning_rate": 9.64429830182133e-06, "loss": 1.3551, "step": 61900 }, { "epoch": 0.31, "grad_norm": 3.275651454925537, "learning_rate": 9.64367316702664e-06, "loss": 1.3691, "step": 62000 }, { "epoch": 0.31, "grad_norm": 3.2802531719207764, "learning_rate": 9.64304803223195e-06, "loss": 1.3654, "step": 62100 }, { "epoch": 0.31, "grad_norm": 2.8903188705444336, "learning_rate": 9.64242289743726e-06, "loss": 1.3694, "step": 62200 }, { "epoch": 0.31, "grad_norm": 2.767051935195923, "learning_rate": 9.64179776264257e-06, "loss": 1.4189, "step": 62300 }, { "epoch": 0.31, "grad_norm": 3.3065268993377686, "learning_rate": 9.64117262784788e-06, "loss": 1.3528, "step": 62400 }, { "epoch": 0.31, "grad_norm": 3.6622681617736816, "learning_rate": 9.64054749305319e-06, "loss": 1.3744, "step": 62500 }, { "epoch": 0.31, "grad_norm": 2.5773024559020996, "learning_rate": 9.6399223582585e-06, "loss": 1.3678, "step": 62600 }, { "epoch": 0.31, "grad_norm": 4.050888538360596, "learning_rate": 9.63929722346381e-06, "loss": 1.3459, "step": 62700 }, { "epoch": 0.31, "grad_norm": 3.3641510009765625, "learning_rate": 9.63867208866912e-06, "loss": 1.3289, "step": 62800 }, { "epoch": 0.31, "grad_norm": 3.2410778999328613, "learning_rate": 9.63804695387443e-06, "loss": 1.3582, "step": 62900 }, { "epoch": 0.31, "grad_norm": 3.7819199562072754, "learning_rate": 9.63742181907974e-06, "loss": 1.3535, "step": 63000 }, { "epoch": 0.31, "grad_norm": 3.4329464435577393, "learning_rate": 9.63679668428505e-06, "loss": 1.3637, "step": 63100 }, { "epoch": 0.31, "grad_norm": 3.7776496410369873, "learning_rate": 9.63617154949036e-06, "loss": 1.3427, "step": 63200 }, { "epoch": 0.31, "grad_norm": 3.6807868480682373, "learning_rate": 9.63554641469567e-06, "loss": 1.3366, "step": 63300 }, { "epoch": 0.31, "grad_norm": 3.182055711746216, "learning_rate": 9.63492127990098e-06, "loss": 1.3907, "step": 63400 }, { "epoch": 0.31, "grad_norm": 3.0613508224487305, "learning_rate": 9.63429614510629e-06, "loss": 1.3628, "step": 63500 }, { "epoch": 0.31, "grad_norm": 3.813504219055176, "learning_rate": 9.633671010311599e-06, "loss": 1.3534, "step": 63600 }, { "epoch": 0.32, "grad_norm": 3.0388875007629395, "learning_rate": 9.633045875516909e-06, "loss": 1.3701, "step": 63700 }, { "epoch": 0.32, "grad_norm": 3.5311150550842285, "learning_rate": 9.632420740722219e-06, "loss": 1.3419, "step": 63800 }, { "epoch": 0.32, "grad_norm": 3.283538341522217, "learning_rate": 9.631795605927529e-06, "loss": 1.3772, "step": 63900 }, { "epoch": 0.32, "grad_norm": 2.6988024711608887, "learning_rate": 9.631170471132839e-06, "loss": 1.3633, "step": 64000 }, { "epoch": 0.32, "grad_norm": 3.738215684890747, "learning_rate": 9.630545336338149e-06, "loss": 1.3814, "step": 64100 }, { "epoch": 0.32, "grad_norm": 3.899857997894287, "learning_rate": 9.629920201543459e-06, "loss": 1.3787, "step": 64200 }, { "epoch": 0.32, "grad_norm": 3.2490193843841553, "learning_rate": 9.629295066748769e-06, "loss": 1.347, "step": 64300 }, { "epoch": 0.32, "grad_norm": 3.262529134750366, "learning_rate": 9.628669931954079e-06, "loss": 1.3405, "step": 64400 }, { "epoch": 0.32, "grad_norm": 3.1799771785736084, "learning_rate": 9.628044797159388e-06, "loss": 1.3796, "step": 64500 }, { "epoch": 0.32, "grad_norm": 3.5044260025024414, "learning_rate": 9.627419662364698e-06, "loss": 1.322, "step": 64600 }, { "epoch": 0.32, "grad_norm": 3.560049295425415, "learning_rate": 9.626794527570008e-06, "loss": 1.356, "step": 64700 }, { "epoch": 0.32, "grad_norm": 2.3910019397735596, "learning_rate": 9.626169392775318e-06, "loss": 1.3546, "step": 64800 }, { "epoch": 0.32, "grad_norm": 4.631550312042236, "learning_rate": 9.625544257980628e-06, "loss": 1.3664, "step": 64900 }, { "epoch": 0.32, "grad_norm": 3.415191650390625, "learning_rate": 9.624919123185938e-06, "loss": 1.3818, "step": 65000 }, { "epoch": 0.32, "grad_norm": 3.071653127670288, "learning_rate": 9.624293988391248e-06, "loss": 1.3947, "step": 65100 }, { "epoch": 0.32, "grad_norm": 3.575773000717163, "learning_rate": 9.623668853596558e-06, "loss": 1.3392, "step": 65200 }, { "epoch": 0.32, "grad_norm": 3.1757047176361084, "learning_rate": 9.623043718801868e-06, "loss": 1.3619, "step": 65300 }, { "epoch": 0.32, "grad_norm": 2.487311840057373, "learning_rate": 9.622418584007178e-06, "loss": 1.3538, "step": 65400 }, { "epoch": 0.32, "grad_norm": 2.791187047958374, "learning_rate": 9.621793449212488e-06, "loss": 1.343, "step": 65500 }, { "epoch": 0.32, "grad_norm": 2.840940237045288, "learning_rate": 9.621168314417798e-06, "loss": 1.3479, "step": 65600 }, { "epoch": 0.33, "grad_norm": 3.4006075859069824, "learning_rate": 9.620543179623108e-06, "loss": 1.3742, "step": 65700 }, { "epoch": 0.33, "grad_norm": 2.8290023803710938, "learning_rate": 9.619918044828418e-06, "loss": 1.3825, "step": 65800 }, { "epoch": 0.33, "grad_norm": 4.114961624145508, "learning_rate": 9.619292910033727e-06, "loss": 1.3742, "step": 65900 }, { "epoch": 0.33, "grad_norm": 3.4792447090148926, "learning_rate": 9.618667775239037e-06, "loss": 1.3442, "step": 66000 }, { "epoch": 0.33, "grad_norm": 3.3174169063568115, "learning_rate": 9.618042640444346e-06, "loss": 1.3896, "step": 66100 }, { "epoch": 0.33, "grad_norm": 2.7161898612976074, "learning_rate": 9.617417505649657e-06, "loss": 1.3492, "step": 66200 }, { "epoch": 0.33, "grad_norm": 3.3158161640167236, "learning_rate": 9.616792370854965e-06, "loss": 1.3378, "step": 66300 }, { "epoch": 0.33, "grad_norm": 4.810378074645996, "learning_rate": 9.616167236060277e-06, "loss": 1.3482, "step": 66400 }, { "epoch": 0.33, "grad_norm": 3.8272716999053955, "learning_rate": 9.615542101265585e-06, "loss": 1.3624, "step": 66500 }, { "epoch": 0.33, "grad_norm": 3.1543540954589844, "learning_rate": 9.614916966470897e-06, "loss": 1.3703, "step": 66600 }, { "epoch": 0.33, "grad_norm": 3.571789503097534, "learning_rate": 9.614291831676205e-06, "loss": 1.3467, "step": 66700 }, { "epoch": 0.33, "grad_norm": 3.8913381099700928, "learning_rate": 9.613666696881517e-06, "loss": 1.3629, "step": 66800 }, { "epoch": 0.33, "grad_norm": 3.3162176609039307, "learning_rate": 9.613041562086825e-06, "loss": 1.399, "step": 66900 }, { "epoch": 0.33, "grad_norm": 3.26802396774292, "learning_rate": 9.612416427292137e-06, "loss": 1.3481, "step": 67000 }, { "epoch": 0.33, "grad_norm": 3.7507224082946777, "learning_rate": 9.611791292497445e-06, "loss": 1.3875, "step": 67100 }, { "epoch": 0.33, "grad_norm": 3.059849262237549, "learning_rate": 9.611166157702756e-06, "loss": 1.3402, "step": 67200 }, { "epoch": 0.33, "grad_norm": 2.9418632984161377, "learning_rate": 9.610541022908065e-06, "loss": 1.385, "step": 67300 }, { "epoch": 0.33, "grad_norm": 3.126817464828491, "learning_rate": 9.609915888113375e-06, "loss": 1.3342, "step": 67400 }, { "epoch": 0.33, "grad_norm": 3.908066749572754, "learning_rate": 9.609290753318685e-06, "loss": 1.3658, "step": 67500 }, { "epoch": 0.33, "grad_norm": 3.3799283504486084, "learning_rate": 9.608665618523994e-06, "loss": 1.3392, "step": 67600 }, { "epoch": 0.33, "grad_norm": 3.0953500270843506, "learning_rate": 9.608040483729304e-06, "loss": 1.3664, "step": 67700 }, { "epoch": 0.34, "grad_norm": 3.4090096950531006, "learning_rate": 9.607415348934614e-06, "loss": 1.3913, "step": 67800 }, { "epoch": 0.34, "grad_norm": 3.0916600227355957, "learning_rate": 9.606790214139924e-06, "loss": 1.3482, "step": 67900 }, { "epoch": 0.34, "grad_norm": 4.232104778289795, "learning_rate": 9.606165079345234e-06, "loss": 1.3663, "step": 68000 }, { "epoch": 0.34, "grad_norm": 3.300558090209961, "learning_rate": 9.605539944550544e-06, "loss": 1.3608, "step": 68100 }, { "epoch": 0.34, "grad_norm": 2.795227527618408, "learning_rate": 9.604914809755854e-06, "loss": 1.3637, "step": 68200 }, { "epoch": 0.34, "grad_norm": 3.083174467086792, "learning_rate": 9.604289674961164e-06, "loss": 1.3269, "step": 68300 }, { "epoch": 0.34, "grad_norm": 3.8292133808135986, "learning_rate": 9.603664540166474e-06, "loss": 1.3311, "step": 68400 }, { "epoch": 0.34, "grad_norm": 3.3727259635925293, "learning_rate": 9.603039405371784e-06, "loss": 1.3233, "step": 68500 }, { "epoch": 0.34, "grad_norm": 3.0696310997009277, "learning_rate": 9.602414270577094e-06, "loss": 1.3217, "step": 68600 }, { "epoch": 0.34, "grad_norm": 5.1085591316223145, "learning_rate": 9.601789135782404e-06, "loss": 1.395, "step": 68700 }, { "epoch": 0.34, "grad_norm": 4.036706447601318, "learning_rate": 9.601164000987714e-06, "loss": 1.3763, "step": 68800 }, { "epoch": 0.34, "grad_norm": 3.823237419128418, "learning_rate": 9.600538866193024e-06, "loss": 1.3868, "step": 68900 }, { "epoch": 0.34, "grad_norm": 3.535228729248047, "learning_rate": 9.599913731398333e-06, "loss": 1.3714, "step": 69000 }, { "epoch": 0.34, "grad_norm": 3.333162546157837, "learning_rate": 9.599288596603643e-06, "loss": 1.3509, "step": 69100 }, { "epoch": 0.34, "grad_norm": 3.901670455932617, "learning_rate": 9.598663461808953e-06, "loss": 1.3486, "step": 69200 }, { "epoch": 0.34, "grad_norm": 2.89204478263855, "learning_rate": 9.598038327014263e-06, "loss": 1.3975, "step": 69300 }, { "epoch": 0.34, "grad_norm": 3.5590710639953613, "learning_rate": 9.597413192219573e-06, "loss": 1.3822, "step": 69400 }, { "epoch": 0.34, "grad_norm": 3.234952449798584, "learning_rate": 9.596788057424883e-06, "loss": 1.3942, "step": 69500 }, { "epoch": 0.34, "grad_norm": 3.125939130783081, "learning_rate": 9.596162922630193e-06, "loss": 1.3573, "step": 69600 }, { "epoch": 0.34, "grad_norm": 3.25191593170166, "learning_rate": 9.595537787835503e-06, "loss": 1.3742, "step": 69700 }, { "epoch": 0.35, "grad_norm": 3.0981853008270264, "learning_rate": 9.594912653040813e-06, "loss": 1.3272, "step": 69800 }, { "epoch": 0.35, "grad_norm": 2.660688638687134, "learning_rate": 9.594287518246123e-06, "loss": 1.3859, "step": 69900 }, { "epoch": 0.35, "grad_norm": 4.212889671325684, "learning_rate": 9.593662383451433e-06, "loss": 1.3685, "step": 70000 }, { "epoch": 0.35, "grad_norm": 2.602475643157959, "learning_rate": 9.593037248656743e-06, "loss": 1.3486, "step": 70100 }, { "epoch": 0.35, "grad_norm": 3.8037405014038086, "learning_rate": 9.592412113862053e-06, "loss": 1.3094, "step": 70200 }, { "epoch": 0.35, "grad_norm": 3.735767364501953, "learning_rate": 9.591786979067363e-06, "loss": 1.3443, "step": 70300 }, { "epoch": 0.35, "grad_norm": 3.10837984085083, "learning_rate": 9.591161844272672e-06, "loss": 1.3637, "step": 70400 }, { "epoch": 0.35, "grad_norm": 3.339202880859375, "learning_rate": 9.590536709477982e-06, "loss": 1.374, "step": 70500 }, { "epoch": 0.35, "grad_norm": 4.676008224487305, "learning_rate": 9.589911574683292e-06, "loss": 1.3609, "step": 70600 }, { "epoch": 0.35, "grad_norm": 3.2127492427825928, "learning_rate": 9.589286439888602e-06, "loss": 1.3443, "step": 70700 }, { "epoch": 0.35, "grad_norm": 3.552145481109619, "learning_rate": 9.588661305093912e-06, "loss": 1.3653, "step": 70800 }, { "epoch": 0.35, "grad_norm": 4.267813205718994, "learning_rate": 9.588036170299222e-06, "loss": 1.3316, "step": 70900 }, { "epoch": 0.35, "grad_norm": 4.004978179931641, "learning_rate": 9.587411035504532e-06, "loss": 1.3259, "step": 71000 }, { "epoch": 0.35, "grad_norm": 3.433945417404175, "learning_rate": 9.586785900709842e-06, "loss": 1.3475, "step": 71100 }, { "epoch": 0.35, "grad_norm": 3.3748490810394287, "learning_rate": 9.586160765915152e-06, "loss": 1.3442, "step": 71200 }, { "epoch": 0.35, "grad_norm": 3.2221007347106934, "learning_rate": 9.58553563112046e-06, "loss": 1.3705, "step": 71300 }, { "epoch": 0.35, "grad_norm": 2.846968173980713, "learning_rate": 9.584910496325772e-06, "loss": 1.3411, "step": 71400 }, { "epoch": 0.35, "grad_norm": 3.973281145095825, "learning_rate": 9.58428536153108e-06, "loss": 1.3503, "step": 71500 }, { "epoch": 0.35, "grad_norm": 2.7122104167938232, "learning_rate": 9.583660226736392e-06, "loss": 1.3581, "step": 71600 }, { "epoch": 0.35, "grad_norm": 4.12910795211792, "learning_rate": 9.5830350919417e-06, "loss": 1.385, "step": 71700 }, { "epoch": 0.36, "grad_norm": 3.4491500854492188, "learning_rate": 9.582409957147011e-06, "loss": 1.3626, "step": 71800 }, { "epoch": 0.36, "grad_norm": 4.059682846069336, "learning_rate": 9.58178482235232e-06, "loss": 1.3595, "step": 71900 }, { "epoch": 0.36, "grad_norm": 3.2482686042785645, "learning_rate": 9.581159687557631e-06, "loss": 1.3339, "step": 72000 }, { "epoch": 0.36, "grad_norm": 3.3053741455078125, "learning_rate": 9.58053455276294e-06, "loss": 1.3656, "step": 72100 }, { "epoch": 0.36, "grad_norm": 3.101283311843872, "learning_rate": 9.579909417968251e-06, "loss": 1.3751, "step": 72200 }, { "epoch": 0.36, "grad_norm": 3.7894277572631836, "learning_rate": 9.57928428317356e-06, "loss": 1.3744, "step": 72300 }, { "epoch": 0.36, "grad_norm": 3.6949033737182617, "learning_rate": 9.578659148378871e-06, "loss": 1.3434, "step": 72400 }, { "epoch": 0.36, "grad_norm": 3.2511017322540283, "learning_rate": 9.57803401358418e-06, "loss": 1.3809, "step": 72500 }, { "epoch": 0.36, "grad_norm": 2.6631274223327637, "learning_rate": 9.577408878789489e-06, "loss": 1.3232, "step": 72600 }, { "epoch": 0.36, "grad_norm": 3.0832998752593994, "learning_rate": 9.576783743994799e-06, "loss": 1.3785, "step": 72700 }, { "epoch": 0.36, "grad_norm": 4.4912238121032715, "learning_rate": 9.576158609200109e-06, "loss": 1.3558, "step": 72800 }, { "epoch": 0.36, "grad_norm": 3.720935821533203, "learning_rate": 9.575533474405419e-06, "loss": 1.3547, "step": 72900 }, { "epoch": 0.36, "grad_norm": 3.69688081741333, "learning_rate": 9.574908339610729e-06, "loss": 1.3409, "step": 73000 }, { "epoch": 0.36, "grad_norm": 4.521012783050537, "learning_rate": 9.574283204816039e-06, "loss": 1.3701, "step": 73100 }, { "epoch": 0.36, "grad_norm": 3.9866528511047363, "learning_rate": 9.573658070021349e-06, "loss": 1.3623, "step": 73200 }, { "epoch": 0.36, "grad_norm": 4.300259590148926, "learning_rate": 9.573032935226659e-06, "loss": 1.3562, "step": 73300 }, { "epoch": 0.36, "grad_norm": 3.585087299346924, "learning_rate": 9.572407800431969e-06, "loss": 1.3574, "step": 73400 }, { "epoch": 0.36, "grad_norm": 3.3413264751434326, "learning_rate": 9.571782665637278e-06, "loss": 1.3586, "step": 73500 }, { "epoch": 0.36, "grad_norm": 3.7272746562957764, "learning_rate": 9.571157530842588e-06, "loss": 1.3525, "step": 73600 }, { "epoch": 0.36, "grad_norm": 3.167235851287842, "learning_rate": 9.570532396047898e-06, "loss": 1.3545, "step": 73700 }, { "epoch": 0.37, "grad_norm": 3.007138252258301, "learning_rate": 9.569907261253208e-06, "loss": 1.3638, "step": 73800 }, { "epoch": 0.37, "grad_norm": 3.402449607849121, "learning_rate": 9.569282126458518e-06, "loss": 1.3394, "step": 73900 }, { "epoch": 0.37, "grad_norm": 3.050807237625122, "learning_rate": 9.568656991663828e-06, "loss": 1.3477, "step": 74000 }, { "epoch": 0.37, "grad_norm": 3.470465660095215, "learning_rate": 9.568031856869138e-06, "loss": 1.3573, "step": 74100 }, { "epoch": 0.37, "grad_norm": 3.1874587535858154, "learning_rate": 9.567406722074448e-06, "loss": 1.3545, "step": 74200 }, { "epoch": 0.37, "grad_norm": 3.022789478302002, "learning_rate": 9.566781587279758e-06, "loss": 1.3857, "step": 74300 }, { "epoch": 0.37, "grad_norm": 3.696437358856201, "learning_rate": 9.566156452485068e-06, "loss": 1.3845, "step": 74400 }, { "epoch": 0.37, "grad_norm": 3.3129115104675293, "learning_rate": 9.565531317690378e-06, "loss": 1.3342, "step": 74500 }, { "epoch": 0.37, "grad_norm": 3.0286476612091064, "learning_rate": 9.564906182895688e-06, "loss": 1.3451, "step": 74600 }, { "epoch": 0.37, "grad_norm": 3.14780330657959, "learning_rate": 9.564281048100998e-06, "loss": 1.3501, "step": 74700 }, { "epoch": 0.37, "grad_norm": 3.660125732421875, "learning_rate": 9.563655913306308e-06, "loss": 1.3974, "step": 74800 }, { "epoch": 0.37, "grad_norm": 3.7873997688293457, "learning_rate": 9.563030778511617e-06, "loss": 1.3328, "step": 74900 }, { "epoch": 0.37, "grad_norm": 4.175543785095215, "learning_rate": 9.562405643716927e-06, "loss": 1.3871, "step": 75000 }, { "epoch": 0.37, "grad_norm": 3.0854318141937256, "learning_rate": 9.561780508922237e-06, "loss": 1.3436, "step": 75100 }, { "epoch": 0.37, "grad_norm": 3.430039882659912, "learning_rate": 9.561155374127547e-06, "loss": 1.3614, "step": 75200 }, { "epoch": 0.37, "grad_norm": 3.1078710556030273, "learning_rate": 9.560530239332857e-06, "loss": 1.3788, "step": 75300 }, { "epoch": 0.37, "grad_norm": 3.394430160522461, "learning_rate": 9.559905104538167e-06, "loss": 1.3534, "step": 75400 }, { "epoch": 0.37, "grad_norm": 3.4498708248138428, "learning_rate": 9.559279969743477e-06, "loss": 1.3875, "step": 75500 }, { "epoch": 0.37, "grad_norm": 3.169480562210083, "learning_rate": 9.558654834948787e-06, "loss": 1.3391, "step": 75600 }, { "epoch": 0.37, "grad_norm": 3.4374375343322754, "learning_rate": 9.558029700154097e-06, "loss": 1.3657, "step": 75700 }, { "epoch": 0.38, "grad_norm": 3.0859546661376953, "learning_rate": 9.557404565359407e-06, "loss": 1.3765, "step": 75800 }, { "epoch": 0.38, "grad_norm": 3.567939281463623, "learning_rate": 9.556779430564717e-06, "loss": 1.3432, "step": 75900 }, { "epoch": 0.38, "grad_norm": 3.236070156097412, "learning_rate": 9.556154295770027e-06, "loss": 1.3402, "step": 76000 }, { "epoch": 0.38, "grad_norm": 3.366365432739258, "learning_rate": 9.555529160975337e-06, "loss": 1.3736, "step": 76100 }, { "epoch": 0.38, "grad_norm": 4.573514461517334, "learning_rate": 9.554904026180646e-06, "loss": 1.3581, "step": 76200 }, { "epoch": 0.38, "grad_norm": 3.199225664138794, "learning_rate": 9.554278891385956e-06, "loss": 1.3452, "step": 76300 }, { "epoch": 0.38, "grad_norm": 3.0722098350524902, "learning_rate": 9.553653756591266e-06, "loss": 1.3785, "step": 76400 }, { "epoch": 0.38, "grad_norm": 3.137385606765747, "learning_rate": 9.553028621796576e-06, "loss": 1.3521, "step": 76500 }, { "epoch": 0.38, "grad_norm": 4.893807888031006, "learning_rate": 9.552403487001886e-06, "loss": 1.3572, "step": 76600 }, { "epoch": 0.38, "grad_norm": 4.010082721710205, "learning_rate": 9.551778352207194e-06, "loss": 1.3483, "step": 76700 }, { "epoch": 0.38, "grad_norm": 3.6857099533081055, "learning_rate": 9.551153217412506e-06, "loss": 1.3347, "step": 76800 }, { "epoch": 0.38, "grad_norm": 2.510134696960449, "learning_rate": 9.550528082617814e-06, "loss": 1.3461, "step": 76900 }, { "epoch": 0.38, "grad_norm": 3.9825291633605957, "learning_rate": 9.549902947823126e-06, "loss": 1.3415, "step": 77000 }, { "epoch": 0.38, "grad_norm": 3.156740427017212, "learning_rate": 9.549277813028434e-06, "loss": 1.3305, "step": 77100 }, { "epoch": 0.38, "grad_norm": 3.245800256729126, "learning_rate": 9.548652678233746e-06, "loss": 1.3573, "step": 77200 }, { "epoch": 0.38, "grad_norm": 2.6874351501464844, "learning_rate": 9.548027543439054e-06, "loss": 1.3443, "step": 77300 }, { "epoch": 0.38, "grad_norm": 3.6892011165618896, "learning_rate": 9.547402408644366e-06, "loss": 1.3192, "step": 77400 }, { "epoch": 0.38, "grad_norm": 2.505993604660034, "learning_rate": 9.546777273849674e-06, "loss": 1.3812, "step": 77500 }, { "epoch": 0.38, "grad_norm": 3.5395193099975586, "learning_rate": 9.546152139054985e-06, "loss": 1.3604, "step": 77600 }, { "epoch": 0.38, "grad_norm": 3.2124781608581543, "learning_rate": 9.545527004260294e-06, "loss": 1.312, "step": 77700 }, { "epoch": 0.38, "grad_norm": 3.8713743686676025, "learning_rate": 9.544901869465605e-06, "loss": 1.3435, "step": 77800 }, { "epoch": 0.39, "grad_norm": 3.1610865592956543, "learning_rate": 9.544276734670914e-06, "loss": 1.3696, "step": 77900 }, { "epoch": 0.39, "grad_norm": 7.323131561279297, "learning_rate": 9.543651599876223e-06, "loss": 1.357, "step": 78000 }, { "epoch": 0.39, "grad_norm": 3.0851237773895264, "learning_rate": 9.543026465081533e-06, "loss": 1.3406, "step": 78100 }, { "epoch": 0.39, "grad_norm": 3.637321949005127, "learning_rate": 9.542401330286843e-06, "loss": 1.376, "step": 78200 }, { "epoch": 0.39, "grad_norm": 2.876664638519287, "learning_rate": 9.541776195492153e-06, "loss": 1.4049, "step": 78300 }, { "epoch": 0.39, "grad_norm": 3.146031618118286, "learning_rate": 9.541151060697463e-06, "loss": 1.3484, "step": 78400 }, { "epoch": 0.39, "grad_norm": 4.596341609954834, "learning_rate": 9.540525925902773e-06, "loss": 1.3518, "step": 78500 }, { "epoch": 0.39, "grad_norm": 5.041236400604248, "learning_rate": 9.539900791108083e-06, "loss": 1.362, "step": 78600 }, { "epoch": 0.39, "grad_norm": 3.9177463054656982, "learning_rate": 9.539275656313393e-06, "loss": 1.3586, "step": 78700 }, { "epoch": 0.39, "grad_norm": 3.115206003189087, "learning_rate": 9.538650521518703e-06, "loss": 1.367, "step": 78800 }, { "epoch": 0.39, "grad_norm": 2.846676826477051, "learning_rate": 9.538025386724013e-06, "loss": 1.3699, "step": 78900 }, { "epoch": 0.39, "grad_norm": 3.625420331954956, "learning_rate": 9.537400251929323e-06, "loss": 1.3505, "step": 79000 }, { "epoch": 0.39, "grad_norm": 3.315352439880371, "learning_rate": 9.536775117134633e-06, "loss": 1.3456, "step": 79100 }, { "epoch": 0.39, "grad_norm": 3.249753475189209, "learning_rate": 9.536149982339943e-06, "loss": 1.3538, "step": 79200 }, { "epoch": 0.39, "grad_norm": 3.9315223693847656, "learning_rate": 9.535524847545253e-06, "loss": 1.3459, "step": 79300 }, { "epoch": 0.39, "grad_norm": 3.4720170497894287, "learning_rate": 9.534899712750562e-06, "loss": 1.3935, "step": 79400 }, { "epoch": 0.39, "grad_norm": 2.97334885597229, "learning_rate": 9.534274577955872e-06, "loss": 1.3526, "step": 79500 }, { "epoch": 0.39, "grad_norm": 2.218647003173828, "learning_rate": 9.533649443161182e-06, "loss": 1.3845, "step": 79600 }, { "epoch": 0.39, "grad_norm": 3.644829034805298, "learning_rate": 9.533024308366492e-06, "loss": 1.3515, "step": 79700 }, { "epoch": 0.39, "grad_norm": 4.018405437469482, "learning_rate": 9.532399173571802e-06, "loss": 1.3544, "step": 79800 }, { "epoch": 0.4, "grad_norm": 3.210761308670044, "learning_rate": 9.531774038777112e-06, "loss": 1.3554, "step": 79900 }, { "epoch": 0.4, "grad_norm": 3.046523094177246, "learning_rate": 9.531148903982422e-06, "loss": 1.3733, "step": 80000 }, { "epoch": 0.4, "grad_norm": 3.437032699584961, "learning_rate": 9.530523769187732e-06, "loss": 1.347, "step": 80100 }, { "epoch": 0.4, "grad_norm": 3.712858200073242, "learning_rate": 9.529898634393042e-06, "loss": 1.3856, "step": 80200 }, { "epoch": 0.4, "grad_norm": 2.859689474105835, "learning_rate": 9.529273499598352e-06, "loss": 1.3312, "step": 80300 }, { "epoch": 0.4, "grad_norm": 3.8814845085144043, "learning_rate": 9.528648364803662e-06, "loss": 1.3362, "step": 80400 }, { "epoch": 0.4, "grad_norm": 2.983851909637451, "learning_rate": 9.528023230008972e-06, "loss": 1.3689, "step": 80500 }, { "epoch": 0.4, "grad_norm": 3.3227264881134033, "learning_rate": 9.527398095214282e-06, "loss": 1.3362, "step": 80600 }, { "epoch": 0.4, "grad_norm": 3.825824499130249, "learning_rate": 9.526772960419591e-06, "loss": 1.3496, "step": 80700 }, { "epoch": 0.4, "grad_norm": 3.376059055328369, "learning_rate": 9.526147825624901e-06, "loss": 1.3781, "step": 80800 }, { "epoch": 0.4, "grad_norm": 3.2187156677246094, "learning_rate": 9.525522690830211e-06, "loss": 1.4142, "step": 80900 }, { "epoch": 0.4, "grad_norm": 3.073812246322632, "learning_rate": 9.524897556035521e-06, "loss": 1.3185, "step": 81000 }, { "epoch": 0.4, "grad_norm": 2.7107346057891846, "learning_rate": 9.524272421240831e-06, "loss": 1.322, "step": 81100 }, { "epoch": 0.4, "grad_norm": 3.378969669342041, "learning_rate": 9.523647286446141e-06, "loss": 1.3438, "step": 81200 }, { "epoch": 0.4, "grad_norm": 4.337489604949951, "learning_rate": 9.523022151651451e-06, "loss": 1.3326, "step": 81300 }, { "epoch": 0.4, "grad_norm": 4.453660488128662, "learning_rate": 9.522397016856761e-06, "loss": 1.3624, "step": 81400 }, { "epoch": 0.4, "grad_norm": 3.236886501312256, "learning_rate": 9.521771882062071e-06, "loss": 1.3704, "step": 81500 }, { "epoch": 0.4, "grad_norm": 3.969984531402588, "learning_rate": 9.52114674726738e-06, "loss": 1.3605, "step": 81600 }, { "epoch": 0.4, "grad_norm": 2.707930326461792, "learning_rate": 9.52052161247269e-06, "loss": 1.3272, "step": 81700 }, { "epoch": 0.4, "grad_norm": 3.0617573261260986, "learning_rate": 9.519896477678e-06, "loss": 1.3779, "step": 81800 }, { "epoch": 0.41, "grad_norm": 2.8938345909118652, "learning_rate": 9.519271342883309e-06, "loss": 1.3453, "step": 81900 }, { "epoch": 0.41, "grad_norm": 3.273656129837036, "learning_rate": 9.51864620808862e-06, "loss": 1.3535, "step": 82000 }, { "epoch": 0.41, "grad_norm": 3.6416726112365723, "learning_rate": 9.518021073293929e-06, "loss": 1.393, "step": 82100 }, { "epoch": 0.41, "grad_norm": 2.7089104652404785, "learning_rate": 9.51739593849924e-06, "loss": 1.3568, "step": 82200 }, { "epoch": 0.41, "grad_norm": 3.872784376144409, "learning_rate": 9.516770803704549e-06, "loss": 1.3725, "step": 82300 }, { "epoch": 0.41, "grad_norm": 3.3895182609558105, "learning_rate": 9.51614566890986e-06, "loss": 1.3347, "step": 82400 }, { "epoch": 0.41, "grad_norm": 3.349815845489502, "learning_rate": 9.515520534115168e-06, "loss": 1.2965, "step": 82500 }, { "epoch": 0.41, "grad_norm": 3.8851418495178223, "learning_rate": 9.51489539932048e-06, "loss": 1.3547, "step": 82600 }, { "epoch": 0.41, "grad_norm": 3.7153375148773193, "learning_rate": 9.514270264525788e-06, "loss": 1.3502, "step": 82700 }, { "epoch": 0.41, "grad_norm": 2.9336204528808594, "learning_rate": 9.5136451297311e-06, "loss": 1.3727, "step": 82800 }, { "epoch": 0.41, "grad_norm": 2.910884141921997, "learning_rate": 9.513019994936408e-06, "loss": 1.3643, "step": 82900 }, { "epoch": 0.41, "grad_norm": 2.9535582065582275, "learning_rate": 9.51239486014172e-06, "loss": 1.3532, "step": 83000 }, { "epoch": 0.41, "grad_norm": 3.453658103942871, "learning_rate": 9.511769725347028e-06, "loss": 1.3747, "step": 83100 }, { "epoch": 0.41, "grad_norm": 4.163629055023193, "learning_rate": 9.511144590552338e-06, "loss": 1.3686, "step": 83200 }, { "epoch": 0.41, "grad_norm": 3.291599988937378, "learning_rate": 9.510519455757648e-06, "loss": 1.3195, "step": 83300 }, { "epoch": 0.41, "grad_norm": 4.140781879425049, "learning_rate": 9.509894320962958e-06, "loss": 1.3454, "step": 83400 }, { "epoch": 0.41, "grad_norm": 3.2356150150299072, "learning_rate": 9.509269186168268e-06, "loss": 1.3656, "step": 83500 }, { "epoch": 0.41, "grad_norm": 2.98710298538208, "learning_rate": 9.508644051373578e-06, "loss": 1.371, "step": 83600 }, { "epoch": 0.41, "grad_norm": 2.949601650238037, "learning_rate": 9.508018916578888e-06, "loss": 1.3794, "step": 83700 }, { "epoch": 0.41, "grad_norm": 2.5830845832824707, "learning_rate": 9.507393781784198e-06, "loss": 1.3665, "step": 83800 }, { "epoch": 0.42, "grad_norm": 3.1843700408935547, "learning_rate": 9.506768646989507e-06, "loss": 1.3463, "step": 83900 }, { "epoch": 0.42, "grad_norm": 3.233429193496704, "learning_rate": 9.506143512194817e-06, "loss": 1.3741, "step": 84000 }, { "epoch": 0.42, "grad_norm": 3.79780650138855, "learning_rate": 9.505518377400127e-06, "loss": 1.3006, "step": 84100 }, { "epoch": 0.42, "grad_norm": 3.684920310974121, "learning_rate": 9.504893242605437e-06, "loss": 1.3613, "step": 84200 }, { "epoch": 0.42, "grad_norm": 4.043038368225098, "learning_rate": 9.504268107810747e-06, "loss": 1.3762, "step": 84300 }, { "epoch": 0.42, "grad_norm": 3.4520349502563477, "learning_rate": 9.503642973016057e-06, "loss": 1.3339, "step": 84400 }, { "epoch": 0.42, "grad_norm": 3.5900933742523193, "learning_rate": 9.503017838221367e-06, "loss": 1.3507, "step": 84500 }, { "epoch": 0.42, "grad_norm": 4.365208625793457, "learning_rate": 9.502392703426677e-06, "loss": 1.354, "step": 84600 }, { "epoch": 0.42, "grad_norm": 3.7963385581970215, "learning_rate": 9.501767568631987e-06, "loss": 1.3377, "step": 84700 }, { "epoch": 0.42, "grad_norm": 3.07368803024292, "learning_rate": 9.501142433837297e-06, "loss": 1.3544, "step": 84800 }, { "epoch": 0.42, "grad_norm": 3.9033076763153076, "learning_rate": 9.500517299042607e-06, "loss": 1.3466, "step": 84900 }, { "epoch": 0.42, "grad_norm": 2.946506977081299, "learning_rate": 9.499892164247917e-06, "loss": 1.3524, "step": 85000 }, { "epoch": 0.42, "grad_norm": 4.097044944763184, "learning_rate": 9.499267029453227e-06, "loss": 1.3342, "step": 85100 }, { "epoch": 0.42, "grad_norm": 3.7137930393218994, "learning_rate": 9.498641894658536e-06, "loss": 1.3737, "step": 85200 }, { "epoch": 0.42, "grad_norm": 3.2496094703674316, "learning_rate": 9.498016759863846e-06, "loss": 1.339, "step": 85300 }, { "epoch": 0.42, "grad_norm": 4.326569557189941, "learning_rate": 9.497391625069156e-06, "loss": 1.372, "step": 85400 }, { "epoch": 0.42, "grad_norm": 2.918201208114624, "learning_rate": 9.496766490274466e-06, "loss": 1.341, "step": 85500 }, { "epoch": 0.42, "grad_norm": 3.4720118045806885, "learning_rate": 9.496141355479776e-06, "loss": 1.3525, "step": 85600 }, { "epoch": 0.42, "grad_norm": 3.20745587348938, "learning_rate": 9.495516220685086e-06, "loss": 1.3664, "step": 85700 }, { "epoch": 0.42, "grad_norm": 3.320747137069702, "learning_rate": 9.494891085890396e-06, "loss": 1.326, "step": 85800 }, { "epoch": 0.42, "grad_norm": 2.690807342529297, "learning_rate": 9.494265951095706e-06, "loss": 1.3689, "step": 85900 }, { "epoch": 0.43, "grad_norm": 4.453171253204346, "learning_rate": 9.493640816301016e-06, "loss": 1.374, "step": 86000 }, { "epoch": 0.43, "grad_norm": 3.375361204147339, "learning_rate": 9.493015681506326e-06, "loss": 1.3427, "step": 86100 }, { "epoch": 0.43, "grad_norm": 3.053560495376587, "learning_rate": 9.492390546711636e-06, "loss": 1.3418, "step": 86200 }, { "epoch": 0.43, "grad_norm": 4.028963565826416, "learning_rate": 9.491765411916946e-06, "loss": 1.3203, "step": 86300 }, { "epoch": 0.43, "grad_norm": 3.746544599533081, "learning_rate": 9.491140277122256e-06, "loss": 1.3121, "step": 86400 }, { "epoch": 0.43, "grad_norm": 3.1117103099823, "learning_rate": 9.490515142327566e-06, "loss": 1.3383, "step": 86500 }, { "epoch": 0.43, "grad_norm": 3.2640998363494873, "learning_rate": 9.489890007532875e-06, "loss": 1.3384, "step": 86600 }, { "epoch": 0.43, "grad_norm": 3.436328172683716, "learning_rate": 9.489264872738185e-06, "loss": 1.3587, "step": 86700 }, { "epoch": 0.43, "grad_norm": 3.372560977935791, "learning_rate": 9.488639737943495e-06, "loss": 1.4054, "step": 86800 }, { "epoch": 0.43, "grad_norm": 2.880247116088867, "learning_rate": 9.488014603148805e-06, "loss": 1.3166, "step": 86900 }, { "epoch": 0.43, "grad_norm": 3.686885356903076, "learning_rate": 9.487389468354115e-06, "loss": 1.3629, "step": 87000 }, { "epoch": 0.43, "grad_norm": 3.168898820877075, "learning_rate": 9.486764333559425e-06, "loss": 1.3868, "step": 87100 }, { "epoch": 0.43, "grad_norm": 3.3519859313964844, "learning_rate": 9.486139198764735e-06, "loss": 1.3696, "step": 87200 }, { "epoch": 0.43, "grad_norm": 2.844688892364502, "learning_rate": 9.485514063970043e-06, "loss": 1.3498, "step": 87300 }, { "epoch": 0.43, "grad_norm": 3.061849594116211, "learning_rate": 9.484888929175355e-06, "loss": 1.3692, "step": 87400 }, { "epoch": 0.43, "grad_norm": 4.100019931793213, "learning_rate": 9.484263794380663e-06, "loss": 1.3407, "step": 87500 }, { "epoch": 0.43, "grad_norm": 3.519801378250122, "learning_rate": 9.483638659585975e-06, "loss": 1.3565, "step": 87600 }, { "epoch": 0.43, "grad_norm": 6.410887241363525, "learning_rate": 9.483013524791283e-06, "loss": 1.3291, "step": 87700 }, { "epoch": 0.43, "grad_norm": 3.080322504043579, "learning_rate": 9.482388389996595e-06, "loss": 1.3519, "step": 87800 }, { "epoch": 0.43, "grad_norm": 3.0409817695617676, "learning_rate": 9.481763255201903e-06, "loss": 1.3519, "step": 87900 }, { "epoch": 0.44, "grad_norm": 4.2845025062561035, "learning_rate": 9.481138120407214e-06, "loss": 1.3436, "step": 88000 }, { "epoch": 0.44, "grad_norm": 3.1132304668426514, "learning_rate": 9.480512985612523e-06, "loss": 1.3145, "step": 88100 }, { "epoch": 0.44, "grad_norm": 4.386362075805664, "learning_rate": 9.479887850817834e-06, "loss": 1.3483, "step": 88200 }, { "epoch": 0.44, "grad_norm": 2.7619481086730957, "learning_rate": 9.479262716023143e-06, "loss": 1.312, "step": 88300 }, { "epoch": 0.44, "grad_norm": 3.451927900314331, "learning_rate": 9.478637581228454e-06, "loss": 1.3807, "step": 88400 }, { "epoch": 0.44, "grad_norm": 3.5724120140075684, "learning_rate": 9.478012446433762e-06, "loss": 1.3469, "step": 88500 }, { "epoch": 0.44, "grad_norm": 4.330935955047607, "learning_rate": 9.477387311639072e-06, "loss": 1.3136, "step": 88600 }, { "epoch": 0.44, "grad_norm": 3.6509666442871094, "learning_rate": 9.476762176844382e-06, "loss": 1.3645, "step": 88700 }, { "epoch": 0.44, "grad_norm": 3.39678692817688, "learning_rate": 9.476137042049692e-06, "loss": 1.3229, "step": 88800 }, { "epoch": 0.44, "grad_norm": 3.2337393760681152, "learning_rate": 9.475511907255002e-06, "loss": 1.3473, "step": 88900 }, { "epoch": 0.44, "grad_norm": 2.9486355781555176, "learning_rate": 9.474886772460312e-06, "loss": 1.341, "step": 89000 }, { "epoch": 0.44, "grad_norm": 3.5861918926239014, "learning_rate": 9.474261637665622e-06, "loss": 1.3708, "step": 89100 }, { "epoch": 0.44, "grad_norm": 2.8153584003448486, "learning_rate": 9.473636502870932e-06, "loss": 1.35, "step": 89200 }, { "epoch": 0.44, "grad_norm": 3.0656278133392334, "learning_rate": 9.473011368076242e-06, "loss": 1.3564, "step": 89300 }, { "epoch": 0.44, "grad_norm": 3.5475146770477295, "learning_rate": 9.472386233281552e-06, "loss": 1.3178, "step": 89400 }, { "epoch": 0.44, "grad_norm": 4.837975025177002, "learning_rate": 9.471761098486862e-06, "loss": 1.3588, "step": 89500 }, { "epoch": 0.44, "grad_norm": 3.626478433609009, "learning_rate": 9.471135963692172e-06, "loss": 1.3288, "step": 89600 }, { "epoch": 0.44, "grad_norm": 2.8399198055267334, "learning_rate": 9.470510828897481e-06, "loss": 1.3542, "step": 89700 }, { "epoch": 0.44, "grad_norm": 3.478510856628418, "learning_rate": 9.469885694102791e-06, "loss": 1.3282, "step": 89800 }, { "epoch": 0.44, "grad_norm": 3.0036330223083496, "learning_rate": 9.469260559308101e-06, "loss": 1.3135, "step": 89900 }, { "epoch": 0.45, "grad_norm": 3.49764084815979, "learning_rate": 9.468635424513411e-06, "loss": 1.3216, "step": 90000 }, { "epoch": 0.45, "grad_norm": 4.711456298828125, "learning_rate": 9.468010289718721e-06, "loss": 1.3746, "step": 90100 }, { "epoch": 0.45, "grad_norm": 3.8252532482147217, "learning_rate": 9.467385154924031e-06, "loss": 1.3375, "step": 90200 }, { "epoch": 0.45, "grad_norm": 3.445317029953003, "learning_rate": 9.466760020129341e-06, "loss": 1.3523, "step": 90300 }, { "epoch": 0.45, "grad_norm": 2.879566192626953, "learning_rate": 9.466134885334651e-06, "loss": 1.3184, "step": 90400 }, { "epoch": 0.45, "grad_norm": 3.891055107116699, "learning_rate": 9.465509750539961e-06, "loss": 1.3246, "step": 90500 }, { "epoch": 0.45, "grad_norm": 2.6852951049804688, "learning_rate": 9.46488461574527e-06, "loss": 1.3803, "step": 90600 }, { "epoch": 0.45, "grad_norm": 2.9267516136169434, "learning_rate": 9.46425948095058e-06, "loss": 1.3254, "step": 90700 }, { "epoch": 0.45, "grad_norm": 2.6373448371887207, "learning_rate": 9.46363434615589e-06, "loss": 1.3266, "step": 90800 }, { "epoch": 0.45, "grad_norm": 3.703024387359619, "learning_rate": 9.4630092113612e-06, "loss": 1.3552, "step": 90900 }, { "epoch": 0.45, "grad_norm": 2.5792810916900635, "learning_rate": 9.46238407656651e-06, "loss": 1.3365, "step": 91000 }, { "epoch": 0.45, "grad_norm": 3.6020054817199707, "learning_rate": 9.46175894177182e-06, "loss": 1.3287, "step": 91100 }, { "epoch": 0.45, "grad_norm": 3.713806390762329, "learning_rate": 9.46113380697713e-06, "loss": 1.3537, "step": 91200 }, { "epoch": 0.45, "grad_norm": 4.407512187957764, "learning_rate": 9.46050867218244e-06, "loss": 1.3835, "step": 91300 }, { "epoch": 0.45, "grad_norm": 2.683220863342285, "learning_rate": 9.45988353738775e-06, "loss": 1.3206, "step": 91400 }, { "epoch": 0.45, "grad_norm": 3.046828031539917, "learning_rate": 9.45925840259306e-06, "loss": 1.3251, "step": 91500 }, { "epoch": 0.45, "grad_norm": 2.8515381813049316, "learning_rate": 9.45863326779837e-06, "loss": 1.3432, "step": 91600 }, { "epoch": 0.45, "grad_norm": 3.381223678588867, "learning_rate": 9.45800813300368e-06, "loss": 1.355, "step": 91700 }, { "epoch": 0.45, "grad_norm": 3.301053762435913, "learning_rate": 9.45738299820899e-06, "loss": 1.3406, "step": 91800 }, { "epoch": 0.45, "grad_norm": 2.924475908279419, "learning_rate": 9.4567578634143e-06, "loss": 1.3444, "step": 91900 }, { "epoch": 0.46, "grad_norm": 3.036510705947876, "learning_rate": 9.45613272861961e-06, "loss": 1.3519, "step": 92000 }, { "epoch": 0.46, "grad_norm": 2.7162649631500244, "learning_rate": 9.45550759382492e-06, "loss": 1.3407, "step": 92100 }, { "epoch": 0.46, "grad_norm": 2.7335431575775146, "learning_rate": 9.45488245903023e-06, "loss": 1.3941, "step": 92200 }, { "epoch": 0.46, "grad_norm": 2.4760313034057617, "learning_rate": 9.45425732423554e-06, "loss": 1.3481, "step": 92300 }, { "epoch": 0.46, "grad_norm": 3.327454090118408, "learning_rate": 9.45363218944085e-06, "loss": 1.3313, "step": 92400 }, { "epoch": 0.46, "grad_norm": 3.170297861099243, "learning_rate": 9.453007054646158e-06, "loss": 1.3222, "step": 92500 }, { "epoch": 0.46, "grad_norm": 3.097593307495117, "learning_rate": 9.45238191985147e-06, "loss": 1.3615, "step": 92600 }, { "epoch": 0.46, "grad_norm": 2.889549493789673, "learning_rate": 9.451756785056778e-06, "loss": 1.3447, "step": 92700 }, { "epoch": 0.46, "grad_norm": 2.5188488960266113, "learning_rate": 9.45113165026209e-06, "loss": 1.3844, "step": 92800 }, { "epoch": 0.46, "grad_norm": 2.8199424743652344, "learning_rate": 9.450506515467397e-06, "loss": 1.3751, "step": 92900 }, { "epoch": 0.46, "grad_norm": 2.9179065227508545, "learning_rate": 9.449881380672709e-06, "loss": 1.3483, "step": 93000 }, { "epoch": 0.46, "grad_norm": 3.69584584236145, "learning_rate": 9.449256245878017e-06, "loss": 1.3613, "step": 93100 }, { "epoch": 0.46, "grad_norm": 4.401488780975342, "learning_rate": 9.448631111083329e-06, "loss": 1.3625, "step": 93200 }, { "epoch": 0.46, "grad_norm": 2.9850871562957764, "learning_rate": 9.448005976288637e-06, "loss": 1.3522, "step": 93300 }, { "epoch": 0.46, "grad_norm": 3.8156750202178955, "learning_rate": 9.447380841493949e-06, "loss": 1.3387, "step": 93400 }, { "epoch": 0.46, "grad_norm": 3.664689779281616, "learning_rate": 9.446755706699257e-06, "loss": 1.3233, "step": 93500 }, { "epoch": 0.46, "grad_norm": 4.119280815124512, "learning_rate": 9.446130571904569e-06, "loss": 1.396, "step": 93600 }, { "epoch": 0.46, "grad_norm": 2.9794814586639404, "learning_rate": 9.445505437109877e-06, "loss": 1.3731, "step": 93700 }, { "epoch": 0.46, "grad_norm": 2.943528890609741, "learning_rate": 9.444880302315187e-06, "loss": 1.3452, "step": 93800 }, { "epoch": 0.46, "grad_norm": 2.734614610671997, "learning_rate": 9.444255167520497e-06, "loss": 1.346, "step": 93900 }, { "epoch": 0.47, "grad_norm": 3.5047719478607178, "learning_rate": 9.443630032725807e-06, "loss": 1.3618, "step": 94000 }, { "epoch": 0.47, "grad_norm": 3.040126323699951, "learning_rate": 9.443004897931117e-06, "loss": 1.3461, "step": 94100 }, { "epoch": 0.47, "grad_norm": 3.951183795928955, "learning_rate": 9.442379763136426e-06, "loss": 1.3402, "step": 94200 }, { "epoch": 0.47, "grad_norm": 3.8336355686187744, "learning_rate": 9.441754628341736e-06, "loss": 1.3211, "step": 94300 }, { "epoch": 0.47, "grad_norm": 2.9649171829223633, "learning_rate": 9.441129493547046e-06, "loss": 1.3538, "step": 94400 }, { "epoch": 0.47, "grad_norm": 3.0716583728790283, "learning_rate": 9.440504358752356e-06, "loss": 1.3518, "step": 94500 }, { "epoch": 0.47, "grad_norm": 2.94270658493042, "learning_rate": 9.439879223957666e-06, "loss": 1.3278, "step": 94600 }, { "epoch": 0.47, "grad_norm": 2.8442766666412354, "learning_rate": 9.439254089162976e-06, "loss": 1.32, "step": 94700 }, { "epoch": 0.47, "grad_norm": 3.8846325874328613, "learning_rate": 9.438628954368286e-06, "loss": 1.3391, "step": 94800 }, { "epoch": 0.47, "grad_norm": 2.698730230331421, "learning_rate": 9.438003819573596e-06, "loss": 1.3777, "step": 94900 }, { "epoch": 0.47, "grad_norm": 3.3867924213409424, "learning_rate": 9.437378684778906e-06, "loss": 1.3563, "step": 95000 }, { "epoch": 0.47, "grad_norm": 2.769615411758423, "learning_rate": 9.436753549984216e-06, "loss": 1.3298, "step": 95100 }, { "epoch": 0.47, "grad_norm": 3.6002724170684814, "learning_rate": 9.436128415189526e-06, "loss": 1.3425, "step": 95200 }, { "epoch": 0.47, "grad_norm": 3.228452205657959, "learning_rate": 9.435503280394836e-06, "loss": 1.3429, "step": 95300 }, { "epoch": 0.47, "grad_norm": 3.423189401626587, "learning_rate": 9.434878145600146e-06, "loss": 1.3768, "step": 95400 }, { "epoch": 0.47, "grad_norm": 3.5708446502685547, "learning_rate": 9.434253010805456e-06, "loss": 1.3125, "step": 95500 }, { "epoch": 0.47, "grad_norm": 2.9108211994171143, "learning_rate": 9.433627876010765e-06, "loss": 1.3347, "step": 95600 }, { "epoch": 0.47, "grad_norm": 3.365302324295044, "learning_rate": 9.433002741216075e-06, "loss": 1.3277, "step": 95700 }, { "epoch": 0.47, "grad_norm": 3.0356671810150146, "learning_rate": 9.432377606421385e-06, "loss": 1.3345, "step": 95800 }, { "epoch": 0.47, "grad_norm": 2.6186368465423584, "learning_rate": 9.431752471626695e-06, "loss": 1.3059, "step": 95900 }, { "epoch": 0.47, "grad_norm": 3.4578585624694824, "learning_rate": 9.431127336832005e-06, "loss": 1.3635, "step": 96000 }, { "epoch": 0.48, "grad_norm": 2.8224425315856934, "learning_rate": 9.430502202037315e-06, "loss": 1.3325, "step": 96100 }, { "epoch": 0.48, "grad_norm": 3.635671377182007, "learning_rate": 9.429877067242625e-06, "loss": 1.3007, "step": 96200 }, { "epoch": 0.48, "grad_norm": 2.900747299194336, "learning_rate": 9.429251932447935e-06, "loss": 1.3317, "step": 96300 }, { "epoch": 0.48, "grad_norm": 2.8643271923065186, "learning_rate": 9.428626797653245e-06, "loss": 1.3489, "step": 96400 }, { "epoch": 0.48, "grad_norm": 3.498797655105591, "learning_rate": 9.428001662858555e-06, "loss": 1.3263, "step": 96500 }, { "epoch": 0.48, "grad_norm": 2.8632445335388184, "learning_rate": 9.427376528063865e-06, "loss": 1.3465, "step": 96600 }, { "epoch": 0.48, "grad_norm": 2.886178970336914, "learning_rate": 9.426751393269175e-06, "loss": 1.3622, "step": 96700 }, { "epoch": 0.48, "grad_norm": 3.605872869491577, "learning_rate": 9.426126258474485e-06, "loss": 1.3404, "step": 96800 }, { "epoch": 0.48, "grad_norm": 4.709196090698242, "learning_rate": 9.425501123679795e-06, "loss": 1.3344, "step": 96900 }, { "epoch": 0.48, "grad_norm": 3.5497000217437744, "learning_rate": 9.424875988885104e-06, "loss": 1.3536, "step": 97000 }, { "epoch": 0.48, "grad_norm": 3.165081024169922, "learning_rate": 9.424250854090414e-06, "loss": 1.3453, "step": 97100 }, { "epoch": 0.48, "grad_norm": 3.4329254627227783, "learning_rate": 9.423625719295724e-06, "loss": 1.3044, "step": 97200 }, { "epoch": 0.48, "grad_norm": 3.0184082984924316, "learning_rate": 9.423000584501034e-06, "loss": 1.344, "step": 97300 }, { "epoch": 0.48, "grad_norm": 3.776301383972168, "learning_rate": 9.422375449706344e-06, "loss": 1.3315, "step": 97400 }, { "epoch": 0.48, "grad_norm": 2.908074140548706, "learning_rate": 9.421750314911654e-06, "loss": 1.3613, "step": 97500 }, { "epoch": 0.48, "grad_norm": 3.8458778858184814, "learning_rate": 9.421125180116964e-06, "loss": 1.3665, "step": 97600 }, { "epoch": 0.48, "grad_norm": 3.778986930847168, "learning_rate": 9.420500045322274e-06, "loss": 1.3271, "step": 97700 }, { "epoch": 0.48, "grad_norm": 4.841845512390137, "learning_rate": 9.419874910527584e-06, "loss": 1.3842, "step": 97800 }, { "epoch": 0.48, "grad_norm": 2.875431537628174, "learning_rate": 9.419249775732892e-06, "loss": 1.3734, "step": 97900 }, { "epoch": 0.48, "grad_norm": 3.327831268310547, "learning_rate": 9.418624640938204e-06, "loss": 1.3225, "step": 98000 }, { "epoch": 0.49, "grad_norm": 3.921052932739258, "learning_rate": 9.417999506143512e-06, "loss": 1.3103, "step": 98100 }, { "epoch": 0.49, "grad_norm": 3.3352317810058594, "learning_rate": 9.417374371348824e-06, "loss": 1.33, "step": 98200 }, { "epoch": 0.49, "grad_norm": 2.6515772342681885, "learning_rate": 9.416749236554132e-06, "loss": 1.3325, "step": 98300 }, { "epoch": 0.49, "grad_norm": 2.6556906700134277, "learning_rate": 9.416124101759443e-06, "loss": 1.3537, "step": 98400 }, { "epoch": 0.49, "grad_norm": 3.394216775894165, "learning_rate": 9.415498966964752e-06, "loss": 1.337, "step": 98500 }, { "epoch": 0.49, "grad_norm": 3.2017979621887207, "learning_rate": 9.414873832170063e-06, "loss": 1.3697, "step": 98600 }, { "epoch": 0.49, "grad_norm": 4.548534393310547, "learning_rate": 9.414248697375371e-06, "loss": 1.353, "step": 98700 }, { "epoch": 0.49, "grad_norm": 3.0345072746276855, "learning_rate": 9.413623562580683e-06, "loss": 1.3475, "step": 98800 }, { "epoch": 0.49, "grad_norm": 2.3942067623138428, "learning_rate": 9.412998427785991e-06, "loss": 1.293, "step": 98900 }, { "epoch": 0.49, "grad_norm": 2.577939033508301, "learning_rate": 9.412373292991303e-06, "loss": 1.3267, "step": 99000 }, { "epoch": 0.49, "grad_norm": 3.599987745285034, "learning_rate": 9.411748158196611e-06, "loss": 1.3248, "step": 99100 }, { "epoch": 0.49, "grad_norm": 2.732025623321533, "learning_rate": 9.411123023401921e-06, "loss": 1.3331, "step": 99200 }, { "epoch": 0.49, "grad_norm": 3.382721185684204, "learning_rate": 9.410497888607231e-06, "loss": 1.3492, "step": 99300 }, { "epoch": 0.49, "grad_norm": 3.670431613922119, "learning_rate": 9.409872753812541e-06, "loss": 1.3651, "step": 99400 }, { "epoch": 0.49, "grad_norm": 2.932300329208374, "learning_rate": 9.409247619017851e-06, "loss": 1.3176, "step": 99500 }, { "epoch": 0.49, "grad_norm": 4.241666316986084, "learning_rate": 9.40862248422316e-06, "loss": 1.3056, "step": 99600 }, { "epoch": 0.49, "grad_norm": 2.6911585330963135, "learning_rate": 9.40799734942847e-06, "loss": 1.3425, "step": 99700 }, { "epoch": 0.49, "grad_norm": 2.879465103149414, "learning_rate": 9.40737221463378e-06, "loss": 1.3395, "step": 99800 }, { "epoch": 0.49, "grad_norm": 2.778740167617798, "learning_rate": 9.40674707983909e-06, "loss": 1.3426, "step": 99900 }, { "epoch": 0.49, "grad_norm": 2.6691386699676514, "learning_rate": 9.4061219450444e-06, "loss": 1.3696, "step": 100000 }, { "epoch": 0.5, "grad_norm": 4.0432562828063965, "learning_rate": 9.40549681024971e-06, "loss": 1.368, "step": 100100 }, { "epoch": 0.5, "grad_norm": 2.7415411472320557, "learning_rate": 9.40487167545502e-06, "loss": 1.3368, "step": 100200 }, { "epoch": 0.5, "grad_norm": 2.6961042881011963, "learning_rate": 9.404246540660332e-06, "loss": 1.3821, "step": 100300 }, { "epoch": 0.5, "grad_norm": 3.592819929122925, "learning_rate": 9.40362140586564e-06, "loss": 1.3313, "step": 100400 }, { "epoch": 0.5, "grad_norm": 3.588106632232666, "learning_rate": 9.40299627107095e-06, "loss": 1.3369, "step": 100500 }, { "epoch": 0.5, "grad_norm": 3.1717209815979004, "learning_rate": 9.40237113627626e-06, "loss": 1.3063, "step": 100600 }, { "epoch": 0.5, "grad_norm": 3.9011149406433105, "learning_rate": 9.40174600148157e-06, "loss": 1.3371, "step": 100700 }, { "epoch": 0.5, "grad_norm": 2.861337184906006, "learning_rate": 9.40112086668688e-06, "loss": 1.3511, "step": 100800 }, { "epoch": 0.5, "grad_norm": 3.2174508571624756, "learning_rate": 9.40049573189219e-06, "loss": 1.3374, "step": 100900 }, { "epoch": 0.5, "grad_norm": 3.301086664199829, "learning_rate": 9.3998705970975e-06, "loss": 1.3024, "step": 101000 }, { "epoch": 0.5, "grad_norm": 3.3187129497528076, "learning_rate": 9.39924546230281e-06, "loss": 1.349, "step": 101100 }, { "epoch": 0.5, "grad_norm": 2.6953561305999756, "learning_rate": 9.39862032750812e-06, "loss": 1.3528, "step": 101200 }, { "epoch": 0.5, "grad_norm": 3.3338325023651123, "learning_rate": 9.39799519271343e-06, "loss": 1.3511, "step": 101300 }, { "epoch": 0.5, "grad_norm": 3.3522443771362305, "learning_rate": 9.39737005791874e-06, "loss": 1.3373, "step": 101400 }, { "epoch": 0.5, "grad_norm": 2.7400362491607666, "learning_rate": 9.39674492312405e-06, "loss": 1.3535, "step": 101500 }, { "epoch": 0.5, "grad_norm": 2.793731212615967, "learning_rate": 9.39611978832936e-06, "loss": 1.3184, "step": 101600 }, { "epoch": 0.5, "grad_norm": 2.759066581726074, "learning_rate": 9.39549465353467e-06, "loss": 1.3489, "step": 101700 }, { "epoch": 0.5, "grad_norm": 4.7479681968688965, "learning_rate": 9.39486951873998e-06, "loss": 1.3498, "step": 101800 }, { "epoch": 0.5, "grad_norm": 3.2522835731506348, "learning_rate": 9.394244383945289e-06, "loss": 1.3476, "step": 101900 }, { "epoch": 0.5, "grad_norm": 4.208197593688965, "learning_rate": 9.393619249150599e-06, "loss": 1.3289, "step": 102000 }, { "epoch": 0.51, "grad_norm": 2.8332533836364746, "learning_rate": 9.392994114355909e-06, "loss": 1.3501, "step": 102100 }, { "epoch": 0.51, "grad_norm": 3.675553798675537, "learning_rate": 9.392368979561219e-06, "loss": 1.3559, "step": 102200 }, { "epoch": 0.51, "grad_norm": 2.7442257404327393, "learning_rate": 9.391743844766529e-06, "loss": 1.346, "step": 102300 }, { "epoch": 0.51, "grad_norm": 3.129180431365967, "learning_rate": 9.391118709971839e-06, "loss": 1.3265, "step": 102400 }, { "epoch": 0.51, "grad_norm": 3.1826012134552, "learning_rate": 9.390493575177149e-06, "loss": 1.3488, "step": 102500 }, { "epoch": 0.51, "grad_norm": 3.8879926204681396, "learning_rate": 9.389868440382459e-06, "loss": 1.3814, "step": 102600 }, { "epoch": 0.51, "grad_norm": 4.066867828369141, "learning_rate": 9.389243305587769e-06, "loss": 1.3454, "step": 102700 }, { "epoch": 0.51, "grad_norm": 3.3028340339660645, "learning_rate": 9.388618170793078e-06, "loss": 1.3661, "step": 102800 }, { "epoch": 0.51, "grad_norm": 2.9503161907196045, "learning_rate": 9.387993035998388e-06, "loss": 1.3326, "step": 102900 }, { "epoch": 0.51, "grad_norm": 3.030353546142578, "learning_rate": 9.387367901203698e-06, "loss": 1.3436, "step": 103000 }, { "epoch": 0.51, "grad_norm": 3.6172800064086914, "learning_rate": 9.386742766409007e-06, "loss": 1.3545, "step": 103100 }, { "epoch": 0.51, "grad_norm": 3.2115883827209473, "learning_rate": 9.386117631614318e-06, "loss": 1.3345, "step": 103200 }, { "epoch": 0.51, "grad_norm": 2.9105865955352783, "learning_rate": 9.385492496819626e-06, "loss": 1.3667, "step": 103300 }, { "epoch": 0.51, "grad_norm": 4.278082847595215, "learning_rate": 9.384867362024938e-06, "loss": 1.3497, "step": 103400 }, { "epoch": 0.51, "grad_norm": 3.113901376724243, "learning_rate": 9.384242227230246e-06, "loss": 1.3216, "step": 103500 }, { "epoch": 0.51, "grad_norm": 3.9379656314849854, "learning_rate": 9.383617092435558e-06, "loss": 1.3469, "step": 103600 }, { "epoch": 0.51, "grad_norm": 3.53202748298645, "learning_rate": 9.382991957640866e-06, "loss": 1.3488, "step": 103700 }, { "epoch": 0.51, "grad_norm": 2.973238468170166, "learning_rate": 9.382366822846178e-06, "loss": 1.3387, "step": 103800 }, { "epoch": 0.51, "grad_norm": 3.464711904525757, "learning_rate": 9.381741688051486e-06, "loss": 1.3472, "step": 103900 }, { "epoch": 0.51, "grad_norm": 2.8774147033691406, "learning_rate": 9.381116553256798e-06, "loss": 1.3786, "step": 104000 }, { "epoch": 0.52, "grad_norm": 3.0821099281311035, "learning_rate": 9.380491418462106e-06, "loss": 1.3329, "step": 104100 }, { "epoch": 0.52, "grad_norm": 4.228484630584717, "learning_rate": 9.379866283667417e-06, "loss": 1.3329, "step": 104200 }, { "epoch": 0.52, "grad_norm": 3.113833427429199, "learning_rate": 9.379241148872726e-06, "loss": 1.373, "step": 104300 }, { "epoch": 0.52, "grad_norm": 3.078624963760376, "learning_rate": 9.378616014078036e-06, "loss": 1.3409, "step": 104400 }, { "epoch": 0.52, "grad_norm": 2.510251045227051, "learning_rate": 9.377990879283346e-06, "loss": 1.3393, "step": 104500 }, { "epoch": 0.52, "grad_norm": 3.0739970207214355, "learning_rate": 9.377365744488655e-06, "loss": 1.3696, "step": 104600 }, { "epoch": 0.52, "grad_norm": 3.1729655265808105, "learning_rate": 9.376740609693965e-06, "loss": 1.3229, "step": 104700 }, { "epoch": 0.52, "grad_norm": 3.003714084625244, "learning_rate": 9.376115474899275e-06, "loss": 1.3309, "step": 104800 }, { "epoch": 0.52, "grad_norm": 3.0130865573883057, "learning_rate": 9.375490340104585e-06, "loss": 1.3736, "step": 104900 }, { "epoch": 0.52, "grad_norm": 3.3182711601257324, "learning_rate": 9.374865205309895e-06, "loss": 1.342, "step": 105000 }, { "epoch": 0.52, "grad_norm": 2.8089771270751953, "learning_rate": 9.374240070515205e-06, "loss": 1.3187, "step": 105100 }, { "epoch": 0.52, "grad_norm": 3.321974515914917, "learning_rate": 9.373614935720515e-06, "loss": 1.3676, "step": 105200 }, { "epoch": 0.52, "grad_norm": 3.114701986312866, "learning_rate": 9.372989800925825e-06, "loss": 1.3746, "step": 105300 }, { "epoch": 0.52, "grad_norm": 3.1047348976135254, "learning_rate": 9.372364666131135e-06, "loss": 1.3437, "step": 105400 }, { "epoch": 0.52, "grad_norm": 3.5590476989746094, "learning_rate": 9.371739531336446e-06, "loss": 1.3496, "step": 105500 }, { "epoch": 0.52, "grad_norm": 3.6819374561309814, "learning_rate": 9.371114396541755e-06, "loss": 1.3254, "step": 105600 }, { "epoch": 0.52, "grad_norm": 3.0282516479492188, "learning_rate": 9.370489261747066e-06, "loss": 1.3568, "step": 105700 }, { "epoch": 0.52, "grad_norm": 3.659374475479126, "learning_rate": 9.369864126952375e-06, "loss": 1.3331, "step": 105800 }, { "epoch": 0.52, "grad_norm": 2.9225759506225586, "learning_rate": 9.369238992157685e-06, "loss": 1.3293, "step": 105900 }, { "epoch": 0.52, "grad_norm": 3.179713726043701, "learning_rate": 9.368613857362994e-06, "loss": 1.3342, "step": 106000 }, { "epoch": 0.52, "grad_norm": 3.126467704772949, "learning_rate": 9.367988722568304e-06, "loss": 1.3441, "step": 106100 }, { "epoch": 0.53, "grad_norm": 4.179965019226074, "learning_rate": 9.367363587773614e-06, "loss": 1.3225, "step": 106200 }, { "epoch": 0.53, "grad_norm": 4.020696640014648, "learning_rate": 9.366738452978924e-06, "loss": 1.3263, "step": 106300 }, { "epoch": 0.53, "grad_norm": 4.02736759185791, "learning_rate": 9.366113318184234e-06, "loss": 1.3621, "step": 106400 }, { "epoch": 0.53, "grad_norm": 2.7809269428253174, "learning_rate": 9.365488183389544e-06, "loss": 1.3583, "step": 106500 }, { "epoch": 0.53, "grad_norm": 3.92323637008667, "learning_rate": 9.364863048594854e-06, "loss": 1.359, "step": 106600 }, { "epoch": 0.53, "grad_norm": 3.1310439109802246, "learning_rate": 9.364237913800164e-06, "loss": 1.3296, "step": 106700 }, { "epoch": 0.53, "grad_norm": 2.9712395668029785, "learning_rate": 9.363612779005474e-06, "loss": 1.3118, "step": 106800 }, { "epoch": 0.53, "grad_norm": 3.047405481338501, "learning_rate": 9.362987644210784e-06, "loss": 1.3441, "step": 106900 }, { "epoch": 0.53, "grad_norm": 4.126023292541504, "learning_rate": 9.362362509416094e-06, "loss": 1.3484, "step": 107000 }, { "epoch": 0.53, "grad_norm": 4.234996318817139, "learning_rate": 9.361737374621404e-06, "loss": 1.2947, "step": 107100 }, { "epoch": 0.53, "grad_norm": 3.2066574096679688, "learning_rate": 9.361112239826714e-06, "loss": 1.3595, "step": 107200 }, { "epoch": 0.53, "grad_norm": 2.9832849502563477, "learning_rate": 9.360487105032023e-06, "loss": 1.324, "step": 107300 }, { "epoch": 0.53, "grad_norm": 3.168886661529541, "learning_rate": 9.359861970237333e-06, "loss": 1.3508, "step": 107400 }, { "epoch": 0.53, "grad_norm": 4.0027995109558105, "learning_rate": 9.359236835442643e-06, "loss": 1.344, "step": 107500 }, { "epoch": 0.53, "grad_norm": 3.394458770751953, "learning_rate": 9.358611700647953e-06, "loss": 1.3295, "step": 107600 }, { "epoch": 0.53, "grad_norm": 2.4304399490356445, "learning_rate": 9.357986565853263e-06, "loss": 1.3473, "step": 107700 }, { "epoch": 0.53, "grad_norm": 2.8694140911102295, "learning_rate": 9.357361431058573e-06, "loss": 1.3052, "step": 107800 }, { "epoch": 0.53, "grad_norm": 2.8801755905151367, "learning_rate": 9.356736296263883e-06, "loss": 1.3495, "step": 107900 }, { "epoch": 0.53, "grad_norm": 2.713139057159424, "learning_rate": 9.356111161469193e-06, "loss": 1.3552, "step": 108000 }, { "epoch": 0.53, "grad_norm": 3.1576766967773438, "learning_rate": 9.355486026674503e-06, "loss": 1.3425, "step": 108100 }, { "epoch": 0.54, "grad_norm": 3.07737398147583, "learning_rate": 9.354860891879813e-06, "loss": 1.2991, "step": 108200 }, { "epoch": 0.54, "grad_norm": 3.1460793018341064, "learning_rate": 9.354235757085123e-06, "loss": 1.3528, "step": 108300 }, { "epoch": 0.54, "grad_norm": 3.4123237133026123, "learning_rate": 9.353610622290433e-06, "loss": 1.3549, "step": 108400 }, { "epoch": 0.54, "grad_norm": 4.295971870422363, "learning_rate": 9.352985487495741e-06, "loss": 1.32, "step": 108500 }, { "epoch": 0.54, "grad_norm": 7.587291240692139, "learning_rate": 9.352360352701053e-06, "loss": 1.3205, "step": 108600 }, { "epoch": 0.54, "grad_norm": 2.6797661781311035, "learning_rate": 9.35173521790636e-06, "loss": 1.3383, "step": 108700 }, { "epoch": 0.54, "grad_norm": 2.9257397651672363, "learning_rate": 9.351110083111672e-06, "loss": 1.343, "step": 108800 }, { "epoch": 0.54, "grad_norm": 2.9473204612731934, "learning_rate": 9.35048494831698e-06, "loss": 1.3423, "step": 108900 }, { "epoch": 0.54, "grad_norm": 3.4466347694396973, "learning_rate": 9.349859813522292e-06, "loss": 1.3261, "step": 109000 }, { "epoch": 0.54, "grad_norm": 3.173017978668213, "learning_rate": 9.3492346787276e-06, "loss": 1.3222, "step": 109100 }, { "epoch": 0.54, "grad_norm": 3.7112643718719482, "learning_rate": 9.348609543932912e-06, "loss": 1.3528, "step": 109200 }, { "epoch": 0.54, "grad_norm": 2.747380495071411, "learning_rate": 9.34798440913822e-06, "loss": 1.343, "step": 109300 }, { "epoch": 0.54, "grad_norm": 2.6822452545166016, "learning_rate": 9.347359274343532e-06, "loss": 1.3478, "step": 109400 }, { "epoch": 0.54, "grad_norm": 3.6231350898742676, "learning_rate": 9.34673413954884e-06, "loss": 1.3569, "step": 109500 }, { "epoch": 0.54, "grad_norm": 2.985164165496826, "learning_rate": 9.346109004754152e-06, "loss": 1.358, "step": 109600 }, { "epoch": 0.54, "grad_norm": 3.304058790206909, "learning_rate": 9.34548386995946e-06, "loss": 1.3312, "step": 109700 }, { "epoch": 0.54, "grad_norm": 3.1674203872680664, "learning_rate": 9.34485873516477e-06, "loss": 1.3499, "step": 109800 }, { "epoch": 0.54, "grad_norm": 3.0080056190490723, "learning_rate": 9.34423360037008e-06, "loss": 1.3268, "step": 109900 }, { "epoch": 0.54, "grad_norm": 3.8802080154418945, "learning_rate": 9.34360846557539e-06, "loss": 1.3494, "step": 110000 }, { "epoch": 0.54, "grad_norm": 3.842288017272949, "learning_rate": 9.3429833307807e-06, "loss": 1.3368, "step": 110100 }, { "epoch": 0.55, "grad_norm": 3.315469980239868, "learning_rate": 9.34235819598601e-06, "loss": 1.3744, "step": 110200 }, { "epoch": 0.55, "grad_norm": 5.507584095001221, "learning_rate": 9.34173306119132e-06, "loss": 1.3635, "step": 110300 }, { "epoch": 0.55, "grad_norm": 3.494532585144043, "learning_rate": 9.34110792639663e-06, "loss": 1.3553, "step": 110400 }, { "epoch": 0.55, "grad_norm": 2.602483034133911, "learning_rate": 9.34048279160194e-06, "loss": 1.3783, "step": 110500 }, { "epoch": 0.55, "grad_norm": 4.201229572296143, "learning_rate": 9.33985765680725e-06, "loss": 1.3466, "step": 110600 }, { "epoch": 0.55, "grad_norm": 3.809846878051758, "learning_rate": 9.339232522012561e-06, "loss": 1.3274, "step": 110700 }, { "epoch": 0.55, "grad_norm": 2.964759588241577, "learning_rate": 9.33860738721787e-06, "loss": 1.3954, "step": 110800 }, { "epoch": 0.55, "grad_norm": 2.925959587097168, "learning_rate": 9.33798225242318e-06, "loss": 1.3258, "step": 110900 }, { "epoch": 0.55, "grad_norm": 2.654022455215454, "learning_rate": 9.337357117628489e-06, "loss": 1.3504, "step": 111000 }, { "epoch": 0.55, "grad_norm": 3.296046257019043, "learning_rate": 9.336731982833799e-06, "loss": 1.3136, "step": 111100 }, { "epoch": 0.55, "grad_norm": 3.804032564163208, "learning_rate": 9.336106848039109e-06, "loss": 1.3428, "step": 111200 }, { "epoch": 0.55, "grad_norm": 3.254333019256592, "learning_rate": 9.335481713244419e-06, "loss": 1.3583, "step": 111300 }, { "epoch": 0.55, "grad_norm": 2.7955849170684814, "learning_rate": 9.334856578449729e-06, "loss": 1.3294, "step": 111400 }, { "epoch": 0.55, "grad_norm": 4.444815635681152, "learning_rate": 9.334231443655039e-06, "loss": 1.3727, "step": 111500 }, { "epoch": 0.55, "grad_norm": 2.5120747089385986, "learning_rate": 9.333606308860349e-06, "loss": 1.3454, "step": 111600 }, { "epoch": 0.55, "grad_norm": 3.015342950820923, "learning_rate": 9.332981174065659e-06, "loss": 1.319, "step": 111700 }, { "epoch": 0.55, "grad_norm": 3.021049976348877, "learning_rate": 9.332356039270968e-06, "loss": 1.3498, "step": 111800 }, { "epoch": 0.55, "grad_norm": 3.354524850845337, "learning_rate": 9.331730904476278e-06, "loss": 1.3552, "step": 111900 }, { "epoch": 0.55, "grad_norm": 2.649719476699829, "learning_rate": 9.331105769681588e-06, "loss": 1.3276, "step": 112000 }, { "epoch": 0.55, "grad_norm": 3.1411306858062744, "learning_rate": 9.330480634886898e-06, "loss": 1.3616, "step": 112100 }, { "epoch": 0.56, "grad_norm": 3.030653238296509, "learning_rate": 9.329855500092208e-06, "loss": 1.3253, "step": 112200 }, { "epoch": 0.56, "grad_norm": 2.987105369567871, "learning_rate": 9.329230365297518e-06, "loss": 1.3133, "step": 112300 }, { "epoch": 0.56, "grad_norm": 3.9337832927703857, "learning_rate": 9.328605230502828e-06, "loss": 1.3697, "step": 112400 }, { "epoch": 0.56, "grad_norm": 2.6805777549743652, "learning_rate": 9.327980095708138e-06, "loss": 1.3689, "step": 112500 }, { "epoch": 0.56, "grad_norm": 3.582444667816162, "learning_rate": 9.327354960913448e-06, "loss": 1.3746, "step": 112600 }, { "epoch": 0.56, "grad_norm": 4.040530204772949, "learning_rate": 9.326729826118758e-06, "loss": 1.3484, "step": 112700 }, { "epoch": 0.56, "grad_norm": 3.6632535457611084, "learning_rate": 9.326104691324068e-06, "loss": 1.3191, "step": 112800 }, { "epoch": 0.56, "grad_norm": 2.814882278442383, "learning_rate": 9.325479556529378e-06, "loss": 1.3331, "step": 112900 }, { "epoch": 0.56, "grad_norm": 3.2695939540863037, "learning_rate": 9.324854421734688e-06, "loss": 1.3307, "step": 113000 }, { "epoch": 0.56, "grad_norm": 3.6386842727661133, "learning_rate": 9.324229286939998e-06, "loss": 1.3464, "step": 113100 }, { "epoch": 0.56, "grad_norm": 2.7537262439727783, "learning_rate": 9.323604152145307e-06, "loss": 1.3177, "step": 113200 }, { "epoch": 0.56, "grad_norm": 3.279010057449341, "learning_rate": 9.322979017350617e-06, "loss": 1.353, "step": 113300 }, { "epoch": 0.56, "grad_norm": 3.230193614959717, "learning_rate": 9.322353882555927e-06, "loss": 1.3301, "step": 113400 }, { "epoch": 0.56, "grad_norm": 2.851243257522583, "learning_rate": 9.321728747761237e-06, "loss": 1.3535, "step": 113500 }, { "epoch": 0.56, "grad_norm": 2.865309000015259, "learning_rate": 9.321103612966547e-06, "loss": 1.3836, "step": 113600 }, { "epoch": 0.56, "grad_norm": 3.171292781829834, "learning_rate": 9.320478478171855e-06, "loss": 1.3426, "step": 113700 }, { "epoch": 0.56, "grad_norm": 3.774327278137207, "learning_rate": 9.319853343377167e-06, "loss": 1.3376, "step": 113800 }, { "epoch": 0.56, "grad_norm": 3.775113344192505, "learning_rate": 9.319228208582475e-06, "loss": 1.3253, "step": 113900 }, { "epoch": 0.56, "grad_norm": 3.0207529067993164, "learning_rate": 9.318603073787787e-06, "loss": 1.3176, "step": 114000 }, { "epoch": 0.56, "grad_norm": 3.2777695655822754, "learning_rate": 9.317977938993095e-06, "loss": 1.3699, "step": 114100 }, { "epoch": 0.56, "grad_norm": 3.0100061893463135, "learning_rate": 9.317352804198407e-06, "loss": 1.3158, "step": 114200 }, { "epoch": 0.57, "grad_norm": 3.428809881210327, "learning_rate": 9.316727669403715e-06, "loss": 1.3165, "step": 114300 }, { "epoch": 0.57, "grad_norm": 2.833083391189575, "learning_rate": 9.316102534609027e-06, "loss": 1.3487, "step": 114400 }, { "epoch": 0.57, "grad_norm": 2.81231951713562, "learning_rate": 9.315477399814335e-06, "loss": 1.327, "step": 114500 }, { "epoch": 0.57, "grad_norm": 2.9721994400024414, "learning_rate": 9.314852265019646e-06, "loss": 1.3189, "step": 114600 }, { "epoch": 0.57, "grad_norm": 2.969564437866211, "learning_rate": 9.314227130224955e-06, "loss": 1.3487, "step": 114700 }, { "epoch": 0.57, "grad_norm": 3.1244125366210938, "learning_rate": 9.313601995430266e-06, "loss": 1.3391, "step": 114800 }, { "epoch": 0.57, "grad_norm": 3.785893201828003, "learning_rate": 9.312976860635575e-06, "loss": 1.3017, "step": 114900 }, { "epoch": 0.57, "grad_norm": 2.9908628463745117, "learning_rate": 9.312351725840884e-06, "loss": 1.3004, "step": 115000 }, { "epoch": 0.57, "grad_norm": 2.7877655029296875, "learning_rate": 9.311726591046194e-06, "loss": 1.3345, "step": 115100 }, { "epoch": 0.57, "grad_norm": 2.73725962638855, "learning_rate": 9.311101456251504e-06, "loss": 1.3164, "step": 115200 }, { "epoch": 0.57, "grad_norm": 2.6744511127471924, "learning_rate": 9.310476321456814e-06, "loss": 1.3362, "step": 115300 }, { "epoch": 0.57, "grad_norm": 2.9426522254943848, "learning_rate": 9.309851186662124e-06, "loss": 1.3111, "step": 115400 }, { "epoch": 0.57, "grad_norm": 3.818319797515869, "learning_rate": 9.309226051867434e-06, "loss": 1.3193, "step": 115500 }, { "epoch": 0.57, "grad_norm": 3.0991666316986084, "learning_rate": 9.308600917072744e-06, "loss": 1.3144, "step": 115600 }, { "epoch": 0.57, "grad_norm": 3.6219863891601562, "learning_rate": 9.307975782278054e-06, "loss": 1.3397, "step": 115700 }, { "epoch": 0.57, "grad_norm": 3.1432971954345703, "learning_rate": 9.307350647483364e-06, "loss": 1.3259, "step": 115800 }, { "epoch": 0.57, "grad_norm": 4.15132999420166, "learning_rate": 9.306725512688675e-06, "loss": 1.3362, "step": 115900 }, { "epoch": 0.57, "grad_norm": 4.191103935241699, "learning_rate": 9.306100377893984e-06, "loss": 1.3117, "step": 116000 }, { "epoch": 0.57, "grad_norm": 3.6365630626678467, "learning_rate": 9.305475243099295e-06, "loss": 1.3397, "step": 116100 }, { "epoch": 0.57, "grad_norm": 2.899077892303467, "learning_rate": 9.304850108304604e-06, "loss": 1.3434, "step": 116200 }, { "epoch": 0.58, "grad_norm": 3.006063461303711, "learning_rate": 9.304224973509915e-06, "loss": 1.3148, "step": 116300 }, { "epoch": 0.58, "grad_norm": 3.2103986740112305, "learning_rate": 9.303599838715223e-06, "loss": 1.3313, "step": 116400 }, { "epoch": 0.58, "grad_norm": 2.6371185779571533, "learning_rate": 9.302974703920533e-06, "loss": 1.3219, "step": 116500 }, { "epoch": 0.58, "grad_norm": 3.5810282230377197, "learning_rate": 9.302349569125843e-06, "loss": 1.3199, "step": 116600 }, { "epoch": 0.58, "grad_norm": 3.7903432846069336, "learning_rate": 9.301724434331153e-06, "loss": 1.3604, "step": 116700 }, { "epoch": 0.58, "grad_norm": 3.0497376918792725, "learning_rate": 9.301099299536463e-06, "loss": 1.3517, "step": 116800 }, { "epoch": 0.58, "grad_norm": 3.0921273231506348, "learning_rate": 9.300474164741773e-06, "loss": 1.3192, "step": 116900 }, { "epoch": 0.58, "grad_norm": 4.081624507904053, "learning_rate": 9.299849029947083e-06, "loss": 1.3376, "step": 117000 }, { "epoch": 0.58, "grad_norm": 3.9647045135498047, "learning_rate": 9.299223895152393e-06, "loss": 1.3501, "step": 117100 }, { "epoch": 0.58, "grad_norm": 3.3014702796936035, "learning_rate": 9.298598760357703e-06, "loss": 1.3467, "step": 117200 }, { "epoch": 0.58, "grad_norm": 4.240477085113525, "learning_rate": 9.297973625563013e-06, "loss": 1.3094, "step": 117300 }, { "epoch": 0.58, "grad_norm": 3.317046642303467, "learning_rate": 9.297348490768323e-06, "loss": 1.3586, "step": 117400 }, { "epoch": 0.58, "grad_norm": 2.8633594512939453, "learning_rate": 9.296723355973633e-06, "loss": 1.3272, "step": 117500 }, { "epoch": 0.58, "grad_norm": 3.435241937637329, "learning_rate": 9.296098221178943e-06, "loss": 1.3683, "step": 117600 }, { "epoch": 0.58, "grad_norm": 2.955159902572632, "learning_rate": 9.295473086384252e-06, "loss": 1.35, "step": 117700 }, { "epoch": 0.58, "grad_norm": 2.941067695617676, "learning_rate": 9.294847951589562e-06, "loss": 1.3438, "step": 117800 }, { "epoch": 0.58, "grad_norm": 4.773413181304932, "learning_rate": 9.294222816794872e-06, "loss": 1.3167, "step": 117900 }, { "epoch": 0.58, "grad_norm": 2.976818084716797, "learning_rate": 9.293597682000182e-06, "loss": 1.3415, "step": 118000 }, { "epoch": 0.58, "grad_norm": 2.769272804260254, "learning_rate": 9.292972547205492e-06, "loss": 1.3596, "step": 118100 }, { "epoch": 0.58, "grad_norm": 3.0079257488250732, "learning_rate": 9.292347412410802e-06, "loss": 1.3479, "step": 118200 }, { "epoch": 0.59, "grad_norm": 6.148379802703857, "learning_rate": 9.291722277616112e-06, "loss": 1.3324, "step": 118300 }, { "epoch": 0.59, "grad_norm": 3.0416197776794434, "learning_rate": 9.291097142821422e-06, "loss": 1.307, "step": 118400 }, { "epoch": 0.59, "grad_norm": 2.869318962097168, "learning_rate": 9.290472008026732e-06, "loss": 1.3217, "step": 118500 }, { "epoch": 0.59, "grad_norm": 3.5434398651123047, "learning_rate": 9.289846873232042e-06, "loss": 1.3356, "step": 118600 }, { "epoch": 0.59, "grad_norm": 3.5270133018493652, "learning_rate": 9.289221738437352e-06, "loss": 1.3206, "step": 118700 }, { "epoch": 0.59, "grad_norm": 5.0058369636535645, "learning_rate": 9.288596603642662e-06, "loss": 1.3386, "step": 118800 }, { "epoch": 0.59, "grad_norm": 3.7998316287994385, "learning_rate": 9.287971468847972e-06, "loss": 1.2882, "step": 118900 }, { "epoch": 0.59, "grad_norm": 4.403027534484863, "learning_rate": 9.287346334053281e-06, "loss": 1.3318, "step": 119000 }, { "epoch": 0.59, "grad_norm": 3.3011553287506104, "learning_rate": 9.28672119925859e-06, "loss": 1.3395, "step": 119100 }, { "epoch": 0.59, "grad_norm": 3.5392231941223145, "learning_rate": 9.286096064463901e-06, "loss": 1.3309, "step": 119200 }, { "epoch": 0.59, "grad_norm": 3.0157182216644287, "learning_rate": 9.28547092966921e-06, "loss": 1.3212, "step": 119300 }, { "epoch": 0.59, "grad_norm": 3.292978048324585, "learning_rate": 9.284845794874521e-06, "loss": 1.3419, "step": 119400 }, { "epoch": 0.59, "grad_norm": 3.0125534534454346, "learning_rate": 9.28422066007983e-06, "loss": 1.3237, "step": 119500 }, { "epoch": 0.59, "grad_norm": 3.421067476272583, "learning_rate": 9.283595525285141e-06, "loss": 1.343, "step": 119600 }, { "epoch": 0.59, "grad_norm": 3.0257959365844727, "learning_rate": 9.28297039049045e-06, "loss": 1.3339, "step": 119700 }, { "epoch": 0.59, "grad_norm": 3.5511181354522705, "learning_rate": 9.282345255695761e-06, "loss": 1.2891, "step": 119800 }, { "epoch": 0.59, "grad_norm": 3.5419836044311523, "learning_rate": 9.281720120901069e-06, "loss": 1.3189, "step": 119900 }, { "epoch": 0.59, "grad_norm": 2.700242757797241, "learning_rate": 9.28109498610638e-06, "loss": 1.3302, "step": 120000 } ], "logging_steps": 100, "max_steps": 1604655, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 10000, "total_flos": 2.4386875134941594e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }