diff --git "a/Dubs/v0.0.2/P_C/checkpoint-110500/trainer_state.json" "b/Dubs/v0.0.2/P_C/checkpoint-110500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Dubs/v0.0.2/P_C/checkpoint-110500/trainer_state.json" @@ -0,0 +1,30973 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 100.0, + "eval_steps": 500, + "global_step": 110500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02262443438914027, + "grad_norm": 1.5616087913513184, + "learning_rate": 3.122171945701357e-07, + "loss": 12.3328, + "step": 25 + }, + { + "epoch": 0.04524886877828054, + "grad_norm": 1.4146922826766968, + "learning_rate": 6.515837104072398e-07, + "loss": 12.5401, + "step": 50 + }, + { + "epoch": 0.06787330316742081, + "grad_norm": 2.0305981636047363, + "learning_rate": 9.909502262443438e-07, + "loss": 12.2748, + "step": 75 + }, + { + "epoch": 0.09049773755656108, + "grad_norm": 1.0272996425628662, + "learning_rate": 1.3303167420814479e-06, + "loss": 12.5666, + "step": 100 + }, + { + "epoch": 0.11312217194570136, + "grad_norm": 1.0590907335281372, + "learning_rate": 1.6696832579185518e-06, + "loss": 12.6943, + "step": 125 + }, + { + "epoch": 0.13574660633484162, + "grad_norm": 1.196068525314331, + "learning_rate": 2.009049773755656e-06, + "loss": 12.6146, + "step": 150 + }, + { + "epoch": 0.1583710407239819, + "grad_norm": 1.4678159952163696, + "learning_rate": 2.34841628959276e-06, + "loss": 11.977, + "step": 175 + }, + { + "epoch": 0.18099547511312217, + "grad_norm": 1.9250414371490479, + "learning_rate": 2.6877828054298643e-06, + "loss": 12.002, + "step": 200 + }, + { + "epoch": 0.20361990950226244, + "grad_norm": 2.0926384925842285, + "learning_rate": 3.027149321266968e-06, + "loss": 12.1648, + "step": 225 + }, + { + "epoch": 0.22624434389140272, + "grad_norm": 1.752112627029419, + "learning_rate": 3.366515837104072e-06, + "loss": 11.4138, + "step": 250 + }, + { + 
"epoch": 0.248868778280543, + "grad_norm": 1.5051826238632202, + "learning_rate": 3.705882352941176e-06, + "loss": 11.7424, + "step": 275 + }, + { + "epoch": 0.27149321266968324, + "grad_norm": 1.4604785442352295, + "learning_rate": 4.045248868778281e-06, + "loss": 10.9347, + "step": 300 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 1.1712795495986938, + "learning_rate": 4.384615384615384e-06, + "loss": 11.0688, + "step": 325 + }, + { + "epoch": 0.3167420814479638, + "grad_norm": 1.562002420425415, + "learning_rate": 4.723981900452488e-06, + "loss": 10.3233, + "step": 350 + }, + { + "epoch": 0.3393665158371041, + "grad_norm": 1.423509120941162, + "learning_rate": 5.063348416289593e-06, + "loss": 10.8483, + "step": 375 + }, + { + "epoch": 0.36199095022624433, + "grad_norm": 1.488276481628418, + "learning_rate": 5.402714932126696e-06, + "loss": 10.4095, + "step": 400 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 1.0716525316238403, + "learning_rate": 5.7420814479638004e-06, + "loss": 10.0423, + "step": 425 + }, + { + "epoch": 0.4072398190045249, + "grad_norm": 1.3523911237716675, + "learning_rate": 6.081447963800904e-06, + "loss": 9.3279, + "step": 450 + }, + { + "epoch": 0.4298642533936652, + "grad_norm": 1.3519021272659302, + "learning_rate": 6.420814479638009e-06, + "loss": 8.6838, + "step": 475 + }, + { + "epoch": 0.45248868778280543, + "grad_norm": 0.8001397252082825, + "learning_rate": 6.7601809954751125e-06, + "loss": 8.5509, + "step": 500 + }, + { + "epoch": 0.4751131221719457, + "grad_norm": 1.0891382694244385, + "learning_rate": 7.099547511312217e-06, + "loss": 7.935, + "step": 525 + }, + { + "epoch": 0.497737556561086, + "grad_norm": 0.7729541659355164, + "learning_rate": 7.43891402714932e-06, + "loss": 7.5149, + "step": 550 + }, + { + "epoch": 0.5203619909502263, + "grad_norm": 0.8944075703620911, + "learning_rate": 7.778280542986424e-06, + "loss": 7.1296, + "step": 575 + }, + { + "epoch": 0.5429864253393665, + "grad_norm": 
0.6566529273986816, + "learning_rate": 8.117647058823528e-06, + "loss": 7.0305, + "step": 600 + }, + { + "epoch": 0.5656108597285068, + "grad_norm": 0.871402382850647, + "learning_rate": 8.457013574660632e-06, + "loss": 6.7858, + "step": 625 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.4330376386642456, + "learning_rate": 8.796380090497737e-06, + "loss": 6.7152, + "step": 650 + }, + { + "epoch": 0.6108597285067874, + "grad_norm": 0.7765570878982544, + "learning_rate": 9.135746606334841e-06, + "loss": 6.524, + "step": 675 + }, + { + "epoch": 0.6334841628959276, + "grad_norm": 0.7222900390625, + "learning_rate": 9.475113122171945e-06, + "loss": 6.5056, + "step": 700 + }, + { + "epoch": 0.6561085972850679, + "grad_norm": 0.8623555302619934, + "learning_rate": 9.81447963800905e-06, + "loss": 6.4704, + "step": 725 + }, + { + "epoch": 0.6787330316742082, + "grad_norm": 0.7340829968452454, + "learning_rate": 1.0153846153846152e-05, + "loss": 6.4959, + "step": 750 + }, + { + "epoch": 0.7013574660633484, + "grad_norm": 0.8369361162185669, + "learning_rate": 1.0493212669683258e-05, + "loss": 6.352, + "step": 775 + }, + { + "epoch": 0.7239819004524887, + "grad_norm": 1.066939353942871, + "learning_rate": 1.083257918552036e-05, + "loss": 6.3632, + "step": 800 + }, + { + "epoch": 0.746606334841629, + "grad_norm": 0.9356958866119385, + "learning_rate": 1.1171945701357465e-05, + "loss": 6.3694, + "step": 825 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.9633516073226929, + "learning_rate": 1.1511312217194568e-05, + "loss": 6.3532, + "step": 850 + }, + { + "epoch": 0.7918552036199095, + "grad_norm": 0.8974762558937073, + "learning_rate": 1.1850678733031674e-05, + "loss": 6.3075, + "step": 875 + }, + { + "epoch": 0.8144796380090498, + "grad_norm": 0.9667727947235107, + "learning_rate": 1.2190045248868778e-05, + "loss": 6.3347, + "step": 900 + }, + { + "epoch": 0.8371040723981901, + "grad_norm": 1.1597741842269897, + "learning_rate": 1.252941176470588e-05, + 
"loss": 6.301, + "step": 925 + }, + { + "epoch": 0.8597285067873304, + "grad_norm": 1.0287904739379883, + "learning_rate": 1.2868778280542987e-05, + "loss": 6.4209, + "step": 950 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.2371269464492798, + "learning_rate": 1.320814479638009e-05, + "loss": 6.2208, + "step": 975 + }, + { + "epoch": 0.9049773755656109, + "grad_norm": 1.331064224243164, + "learning_rate": 1.3547511312217193e-05, + "loss": 6.3464, + "step": 1000 + }, + { + "epoch": 0.9276018099547512, + "grad_norm": 1.0429011583328247, + "learning_rate": 1.3886877828054298e-05, + "loss": 6.1974, + "step": 1025 + }, + { + "epoch": 0.9502262443438914, + "grad_norm": 1.0016249418258667, + "learning_rate": 1.4226244343891402e-05, + "loss": 6.1865, + "step": 1050 + }, + { + "epoch": 0.9728506787330317, + "grad_norm": 1.299787163734436, + "learning_rate": 1.4565610859728506e-05, + "loss": 6.1026, + "step": 1075 + }, + { + "epoch": 0.995475113122172, + "grad_norm": 1.2434639930725098, + "learning_rate": 1.4904977375565609e-05, + "loss": 6.1164, + "step": 1100 + }, + { + "epoch": 1.0180995475113122, + "grad_norm": 1.4540140628814697, + "learning_rate": 1.5244343891402713e-05, + "loss": 6.2016, + "step": 1125 + }, + { + "epoch": 1.0407239819004526, + "grad_norm": 1.1259605884552002, + "learning_rate": 1.5583710407239816e-05, + "loss": 6.1216, + "step": 1150 + }, + { + "epoch": 1.0633484162895928, + "grad_norm": 1.0611627101898193, + "learning_rate": 1.592307692307692e-05, + "loss": 6.2301, + "step": 1175 + }, + { + "epoch": 1.085972850678733, + "grad_norm": 0.903337836265564, + "learning_rate": 1.6262443438914028e-05, + "loss": 6.102, + "step": 1200 + }, + { + "epoch": 1.1085972850678734, + "grad_norm": 1.3304970264434814, + "learning_rate": 1.660180995475113e-05, + "loss": 6.0894, + "step": 1225 + }, + { + "epoch": 1.1312217194570136, + "grad_norm": 1.39982008934021, + "learning_rate": 1.6941176470588233e-05, + "loss": 6.1409, + "step": 1250 + }, + { + "epoch": 
1.1538461538461537, + "grad_norm": 2.8062944412231445, + "learning_rate": 1.7280542986425337e-05, + "loss": 6.1516, + "step": 1275 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.2145191431045532, + "learning_rate": 1.7619909502262442e-05, + "loss": 6.1426, + "step": 1300 + }, + { + "epoch": 1.1990950226244343, + "grad_norm": 1.5336560010910034, + "learning_rate": 1.7959276018099546e-05, + "loss": 6.1265, + "step": 1325 + }, + { + "epoch": 1.2217194570135748, + "grad_norm": 1.1498029232025146, + "learning_rate": 1.829864253393665e-05, + "loss": 6.1197, + "step": 1350 + }, + { + "epoch": 1.244343891402715, + "grad_norm": 1.1084051132202148, + "learning_rate": 1.8638009049773755e-05, + "loss": 6.085, + "step": 1375 + }, + { + "epoch": 1.2669683257918551, + "grad_norm": 1.5752390623092651, + "learning_rate": 1.897737556561086e-05, + "loss": 6.0966, + "step": 1400 + }, + { + "epoch": 1.2895927601809956, + "grad_norm": 1.2039798498153687, + "learning_rate": 1.9316742081447963e-05, + "loss": 6.023, + "step": 1425 + }, + { + "epoch": 1.3122171945701357, + "grad_norm": 1.3117939233779907, + "learning_rate": 1.9656108597285064e-05, + "loss": 6.0805, + "step": 1450 + }, + { + "epoch": 1.334841628959276, + "grad_norm": 1.2346285581588745, + "learning_rate": 1.999547511312217e-05, + "loss": 6.1057, + "step": 1475 + }, + { + "epoch": 1.3574660633484164, + "grad_norm": 1.4661617279052734, + "learning_rate": 2.0334841628959276e-05, + "loss": 6.1266, + "step": 1500 + }, + { + "epoch": 1.3800904977375565, + "grad_norm": 0.9981404542922974, + "learning_rate": 2.0674208144796377e-05, + "loss": 6.1344, + "step": 1525 + }, + { + "epoch": 1.4027149321266967, + "grad_norm": 1.305759072303772, + "learning_rate": 2.101357466063348e-05, + "loss": 5.9825, + "step": 1550 + }, + { + "epoch": 1.4253393665158371, + "grad_norm": 1.2048537731170654, + "learning_rate": 2.135294117647059e-05, + "loss": 5.8368, + "step": 1575 + }, + { + "epoch": 1.4479638009049773, + "grad_norm": 
1.233276128768921, + "learning_rate": 2.169230769230769e-05, + "loss": 6.0436, + "step": 1600 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.9798985719680786, + "learning_rate": 2.2031674208144794e-05, + "loss": 5.9756, + "step": 1625 + }, + { + "epoch": 1.493212669683258, + "grad_norm": 1.366964340209961, + "learning_rate": 2.2371040723981895e-05, + "loss": 6.0294, + "step": 1650 + }, + { + "epoch": 1.5158371040723981, + "grad_norm": 1.187187910079956, + "learning_rate": 2.2710407239819003e-05, + "loss": 6.0066, + "step": 1675 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 1.3633379936218262, + "learning_rate": 2.3049773755656107e-05, + "loss": 5.9852, + "step": 1700 + }, + { + "epoch": 1.5610859728506787, + "grad_norm": 1.536897897720337, + "learning_rate": 2.3389140271493208e-05, + "loss": 6.0531, + "step": 1725 + }, + { + "epoch": 1.5837104072398192, + "grad_norm": 1.3276063203811646, + "learning_rate": 2.3728506787330316e-05, + "loss": 5.9943, + "step": 1750 + }, + { + "epoch": 1.6063348416289593, + "grad_norm": 1.8121161460876465, + "learning_rate": 2.406787330316742e-05, + "loss": 5.938, + "step": 1775 + }, + { + "epoch": 1.6289592760180995, + "grad_norm": 1.8641277551651, + "learning_rate": 2.440723981900452e-05, + "loss": 5.8515, + "step": 1800 + }, + { + "epoch": 1.6515837104072397, + "grad_norm": 1.6875813007354736, + "learning_rate": 2.474660633484163e-05, + "loss": 6.0208, + "step": 1825 + }, + { + "epoch": 1.6742081447963801, + "grad_norm": 1.865849256515503, + "learning_rate": 2.5085972850678733e-05, + "loss": 5.9911, + "step": 1850 + }, + { + "epoch": 1.6968325791855203, + "grad_norm": 2.502399444580078, + "learning_rate": 2.5425339366515834e-05, + "loss": 6.0454, + "step": 1875 + }, + { + "epoch": 1.7194570135746607, + "grad_norm": 1.32041597366333, + "learning_rate": 2.5764705882352938e-05, + "loss": 6.0646, + "step": 1900 + }, + { + "epoch": 1.742081447963801, + "grad_norm": 1.2567394971847534, + "learning_rate": 
2.6104072398190046e-05, + "loss": 5.8021, + "step": 1925 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.3535124063491821, + "learning_rate": 2.6443438914027147e-05, + "loss": 5.9069, + "step": 1950 + }, + { + "epoch": 1.7873303167420813, + "grad_norm": 2.1293888092041016, + "learning_rate": 2.678280542986425e-05, + "loss": 5.931, + "step": 1975 + }, + { + "epoch": 1.8099547511312217, + "grad_norm": 1.651511311531067, + "learning_rate": 2.7122171945701355e-05, + "loss": 6.0154, + "step": 2000 + }, + { + "epoch": 1.8325791855203621, + "grad_norm": 1.5086476802825928, + "learning_rate": 2.746153846153846e-05, + "loss": 5.984, + "step": 2025 + }, + { + "epoch": 1.8552036199095023, + "grad_norm": 1.448927402496338, + "learning_rate": 2.7800904977375564e-05, + "loss": 5.9867, + "step": 2050 + }, + { + "epoch": 1.8778280542986425, + "grad_norm": 1.412226676940918, + "learning_rate": 2.8140271493212665e-05, + "loss": 5.9561, + "step": 2075 + }, + { + "epoch": 1.9004524886877827, + "grad_norm": 1.206612467765808, + "learning_rate": 2.8479638009049773e-05, + "loss": 5.9315, + "step": 2100 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 2.53358793258667, + "learning_rate": 2.8819004524886877e-05, + "loss": 5.8414, + "step": 2125 + }, + { + "epoch": 1.9457013574660633, + "grad_norm": 1.4013510942459106, + "learning_rate": 2.9158371040723978e-05, + "loss": 5.9605, + "step": 2150 + }, + { + "epoch": 1.9683257918552037, + "grad_norm": 1.202830195426941, + "learning_rate": 2.9497737556561086e-05, + "loss": 5.9057, + "step": 2175 + }, + { + "epoch": 1.990950226244344, + "grad_norm": 1.3365705013275146, + "learning_rate": 2.9837104072398186e-05, + "loss": 5.904, + "step": 2200 + }, + { + "epoch": 2.013574660633484, + "grad_norm": 1.5933775901794434, + "learning_rate": 3.017647058823529e-05, + "loss": 5.8016, + "step": 2225 + }, + { + "epoch": 2.0361990950226243, + "grad_norm": 1.8763705492019653, + "learning_rate": 3.0515837104072395e-05, + "loss": 5.7132, + "step": 
2250 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 1.6877716779708862, + "learning_rate": 3.08552036199095e-05, + "loss": 5.9003, + "step": 2275 + }, + { + "epoch": 2.081447963800905, + "grad_norm": 1.698456883430481, + "learning_rate": 3.1194570135746604e-05, + "loss": 5.8553, + "step": 2300 + }, + { + "epoch": 2.1040723981900453, + "grad_norm": 1.278631567955017, + "learning_rate": 3.1533936651583705e-05, + "loss": 5.8442, + "step": 2325 + }, + { + "epoch": 2.1266968325791855, + "grad_norm": 1.5354814529418945, + "learning_rate": 3.187330316742081e-05, + "loss": 5.8794, + "step": 2350 + }, + { + "epoch": 2.1493212669683257, + "grad_norm": 1.8818252086639404, + "learning_rate": 3.221266968325791e-05, + "loss": 5.7308, + "step": 2375 + }, + { + "epoch": 2.171945701357466, + "grad_norm": 1.439894437789917, + "learning_rate": 3.255203619909502e-05, + "loss": 5.841, + "step": 2400 + }, + { + "epoch": 2.1945701357466065, + "grad_norm": 1.8124769926071167, + "learning_rate": 3.289140271493212e-05, + "loss": 5.8127, + "step": 2425 + }, + { + "epoch": 2.2171945701357467, + "grad_norm": 1.4732517004013062, + "learning_rate": 3.323076923076923e-05, + "loss": 5.7981, + "step": 2450 + }, + { + "epoch": 2.239819004524887, + "grad_norm": 1.1843006610870361, + "learning_rate": 3.357013574660633e-05, + "loss": 5.8788, + "step": 2475 + }, + { + "epoch": 2.262443438914027, + "grad_norm": 1.776401162147522, + "learning_rate": 3.390950226244343e-05, + "loss": 5.7363, + "step": 2500 + }, + { + "epoch": 2.2850678733031673, + "grad_norm": 1.7445324659347534, + "learning_rate": 3.424886877828054e-05, + "loss": 5.8269, + "step": 2525 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.3931142091751099, + "learning_rate": 3.458823529411765e-05, + "loss": 5.7918, + "step": 2550 + }, + { + "epoch": 2.330316742081448, + "grad_norm": 1.1032215356826782, + "learning_rate": 3.492760180995475e-05, + "loss": 5.7233, + "step": 2575 + }, + { + "epoch": 2.3529411764705883, + 
"grad_norm": 1.4925826787948608, + "learning_rate": 3.526696832579185e-05, + "loss": 5.8363, + "step": 2600 + }, + { + "epoch": 2.3755656108597285, + "grad_norm": 1.5988218784332275, + "learning_rate": 3.5606334841628956e-05, + "loss": 5.8818, + "step": 2625 + }, + { + "epoch": 2.3981900452488687, + "grad_norm": 1.3079369068145752, + "learning_rate": 3.594570135746606e-05, + "loss": 5.8039, + "step": 2650 + }, + { + "epoch": 2.420814479638009, + "grad_norm": 1.6587289571762085, + "learning_rate": 3.6285067873303165e-05, + "loss": 5.774, + "step": 2675 + }, + { + "epoch": 2.4434389140271495, + "grad_norm": 1.2775558233261108, + "learning_rate": 3.662443438914027e-05, + "loss": 5.7375, + "step": 2700 + }, + { + "epoch": 2.4660633484162897, + "grad_norm": 1.5299643278121948, + "learning_rate": 3.6963800904977373e-05, + "loss": 5.8269, + "step": 2725 + }, + { + "epoch": 2.48868778280543, + "grad_norm": 1.7605458498001099, + "learning_rate": 3.7303167420814474e-05, + "loss": 5.8024, + "step": 2750 + }, + { + "epoch": 2.51131221719457, + "grad_norm": 1.6457359790802002, + "learning_rate": 3.7642533936651575e-05, + "loss": 5.8177, + "step": 2775 + }, + { + "epoch": 2.5339366515837103, + "grad_norm": 1.691697120666504, + "learning_rate": 3.798190045248868e-05, + "loss": 5.8871, + "step": 2800 + }, + { + "epoch": 2.5565610859728505, + "grad_norm": 1.1488367319107056, + "learning_rate": 3.832126696832579e-05, + "loss": 5.8424, + "step": 2825 + }, + { + "epoch": 2.579185520361991, + "grad_norm": 1.314544677734375, + "learning_rate": 3.86606334841629e-05, + "loss": 5.7421, + "step": 2850 + }, + { + "epoch": 2.6018099547511313, + "grad_norm": 1.5784523487091064, + "learning_rate": 3.9e-05, + "loss": 5.9461, + "step": 2875 + }, + { + "epoch": 2.6244343891402715, + "grad_norm": 1.5426156520843506, + "learning_rate": 3.93393665158371e-05, + "loss": 5.8428, + "step": 2900 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 1.8941651582717896, + "learning_rate": 
3.96787330316742e-05, + "loss": 5.8117, + "step": 2925 + }, + { + "epoch": 2.669683257918552, + "grad_norm": 1.6045634746551514, + "learning_rate": 4.001809954751131e-05, + "loss": 5.7422, + "step": 2950 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.4464021921157837, + "learning_rate": 4.035746606334841e-05, + "loss": 5.802, + "step": 2975 + }, + { + "epoch": 2.7149321266968327, + "grad_norm": 1.4441245794296265, + "learning_rate": 4.069683257918552e-05, + "loss": 5.8503, + "step": 3000 + }, + { + "epoch": 2.737556561085973, + "grad_norm": 1.614174485206604, + "learning_rate": 4.1036199095022625e-05, + "loss": 5.8229, + "step": 3025 + }, + { + "epoch": 2.760180995475113, + "grad_norm": 2.3225739002227783, + "learning_rate": 4.1375565610859726e-05, + "loss": 5.7577, + "step": 3050 + }, + { + "epoch": 2.7828054298642533, + "grad_norm": 1.2462753057479858, + "learning_rate": 4.171493212669683e-05, + "loss": 5.8104, + "step": 3075 + }, + { + "epoch": 2.8054298642533935, + "grad_norm": 1.1745747327804565, + "learning_rate": 4.2054298642533935e-05, + "loss": 5.8207, + "step": 3100 + }, + { + "epoch": 2.8280542986425337, + "grad_norm": 1.7370704412460327, + "learning_rate": 4.2393665158371036e-05, + "loss": 5.7556, + "step": 3125 + }, + { + "epoch": 2.8506787330316743, + "grad_norm": 1.9213569164276123, + "learning_rate": 4.2733031674208136e-05, + "loss": 5.797, + "step": 3150 + }, + { + "epoch": 2.8733031674208145, + "grad_norm": 1.8364306688308716, + "learning_rate": 4.307239819004525e-05, + "loss": 5.8159, + "step": 3175 + }, + { + "epoch": 2.8959276018099547, + "grad_norm": 1.2981412410736084, + "learning_rate": 4.341176470588235e-05, + "loss": 5.816, + "step": 3200 + }, + { + "epoch": 2.918552036199095, + "grad_norm": 1.7525430917739868, + "learning_rate": 4.375113122171945e-05, + "loss": 5.7447, + "step": 3225 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 1.1247882843017578, + "learning_rate": 4.409049773755656e-05, + "loss": 5.7757, + "step": 
3250 + }, + { + "epoch": 2.9638009049773757, + "grad_norm": 1.7951980829238892, + "learning_rate": 4.442986425339366e-05, + "loss": 5.7815, + "step": 3275 + }, + { + "epoch": 2.986425339366516, + "grad_norm": 2.1062705516815186, + "learning_rate": 4.476923076923076e-05, + "loss": 5.9039, + "step": 3300 + }, + { + "epoch": 3.009049773755656, + "grad_norm": 1.276943325996399, + "learning_rate": 4.510859728506786e-05, + "loss": 5.798, + "step": 3325 + }, + { + "epoch": 3.0316742081447963, + "grad_norm": 1.2819347381591797, + "learning_rate": 4.544796380090498e-05, + "loss": 5.587, + "step": 3350 + }, + { + "epoch": 3.0542986425339365, + "grad_norm": 1.5981217622756958, + "learning_rate": 4.578733031674208e-05, + "loss": 5.6573, + "step": 3375 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.9141427278518677, + "learning_rate": 4.612669683257918e-05, + "loss": 5.6523, + "step": 3400 + }, + { + "epoch": 3.0995475113122173, + "grad_norm": 2.002366304397583, + "learning_rate": 4.646606334841629e-05, + "loss": 5.7209, + "step": 3425 + }, + { + "epoch": 3.1221719457013575, + "grad_norm": 1.3727688789367676, + "learning_rate": 4.680542986425339e-05, + "loss": 5.7155, + "step": 3450 + }, + { + "epoch": 3.1447963800904977, + "grad_norm": 1.5259437561035156, + "learning_rate": 4.714479638009049e-05, + "loss": 5.6509, + "step": 3475 + }, + { + "epoch": 3.167420814479638, + "grad_norm": 1.8274619579315186, + "learning_rate": 4.74841628959276e-05, + "loss": 5.8386, + "step": 3500 + }, + { + "epoch": 3.1900452488687785, + "grad_norm": 1.372196912765503, + "learning_rate": 4.7823529411764704e-05, + "loss": 5.6764, + "step": 3525 + }, + { + "epoch": 3.2126696832579187, + "grad_norm": 2.9106500148773193, + "learning_rate": 4.8162895927601805e-05, + "loss": 5.6857, + "step": 3550 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 1.648293137550354, + "learning_rate": 4.850226244343891e-05, + "loss": 5.6533, + "step": 3575 + }, + { + "epoch": 3.257918552036199, + "grad_norm": 
1.904435634613037, + "learning_rate": 4.8841628959276014e-05, + "loss": 5.7202, + "step": 3600 + }, + { + "epoch": 3.2805429864253393, + "grad_norm": 1.3707633018493652, + "learning_rate": 4.9180995475113115e-05, + "loss": 5.6916, + "step": 3625 + }, + { + "epoch": 3.3031674208144794, + "grad_norm": 1.5876151323318481, + "learning_rate": 4.952036199095022e-05, + "loss": 5.7601, + "step": 3650 + }, + { + "epoch": 3.32579185520362, + "grad_norm": 1.605332851409912, + "learning_rate": 4.9859728506787323e-05, + "loss": 5.7429, + "step": 3675 + }, + { + "epoch": 3.3484162895927603, + "grad_norm": 1.4415479898452759, + "learning_rate": 5.019909502262443e-05, + "loss": 5.655, + "step": 3700 + }, + { + "epoch": 3.3710407239819005, + "grad_norm": 1.6629538536071777, + "learning_rate": 5.053846153846154e-05, + "loss": 5.763, + "step": 3725 + }, + { + "epoch": 3.3936651583710407, + "grad_norm": 1.9927380084991455, + "learning_rate": 5.087782805429864e-05, + "loss": 5.7412, + "step": 3750 + }, + { + "epoch": 3.416289592760181, + "grad_norm": 1.6488847732543945, + "learning_rate": 5.121719457013574e-05, + "loss": 5.6904, + "step": 3775 + }, + { + "epoch": 3.4389140271493215, + "grad_norm": 1.2794617414474487, + "learning_rate": 5.155656108597284e-05, + "loss": 5.715, + "step": 3800 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 1.5437264442443848, + "learning_rate": 5.189592760180995e-05, + "loss": 5.708, + "step": 3825 + }, + { + "epoch": 3.484162895927602, + "grad_norm": 1.80771005153656, + "learning_rate": 5.223529411764705e-05, + "loss": 5.7182, + "step": 3850 + }, + { + "epoch": 3.506787330316742, + "grad_norm": 1.4776058197021484, + "learning_rate": 5.2574660633484165e-05, + "loss": 5.7784, + "step": 3875 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 2.749145030975342, + "learning_rate": 5.2914027149321266e-05, + "loss": 5.619, + "step": 3900 + }, + { + "epoch": 3.5520361990950224, + "grad_norm": 1.321062684059143, + "learning_rate": 
5.3253393665158366e-05, + "loss": 5.7243, + "step": 3925 + }, + { + "epoch": 3.5746606334841626, + "grad_norm": 1.7517143487930298, + "learning_rate": 5.359276018099547e-05, + "loss": 5.6926, + "step": 3950 + }, + { + "epoch": 3.5972850678733033, + "grad_norm": 1.2275956869125366, + "learning_rate": 5.3932126696832575e-05, + "loss": 5.7326, + "step": 3975 + }, + { + "epoch": 3.6199095022624435, + "grad_norm": 1.4906036853790283, + "learning_rate": 5.4271493212669676e-05, + "loss": 5.7831, + "step": 4000 + }, + { + "epoch": 3.6425339366515836, + "grad_norm": 1.6958218812942505, + "learning_rate": 5.4610859728506784e-05, + "loss": 5.7038, + "step": 4025 + }, + { + "epoch": 3.665158371040724, + "grad_norm": 1.6957277059555054, + "learning_rate": 5.495022624434389e-05, + "loss": 5.6161, + "step": 4050 + }, + { + "epoch": 3.6877828054298645, + "grad_norm": 1.7718782424926758, + "learning_rate": 5.528959276018099e-05, + "loss": 5.7296, + "step": 4075 + }, + { + "epoch": 3.7104072398190047, + "grad_norm": 1.8896294832229614, + "learning_rate": 5.562895927601809e-05, + "loss": 5.7165, + "step": 4100 + }, + { + "epoch": 3.733031674208145, + "grad_norm": 3.1193795204162598, + "learning_rate": 5.59683257918552e-05, + "loss": 5.716, + "step": 4125 + }, + { + "epoch": 3.755656108597285, + "grad_norm": 1.3106483221054077, + "learning_rate": 5.63076923076923e-05, + "loss": 5.7097, + "step": 4150 + }, + { + "epoch": 3.7782805429864252, + "grad_norm": 1.7375882863998413, + "learning_rate": 5.66470588235294e-05, + "loss": 5.7747, + "step": 4175 + }, + { + "epoch": 3.8009049773755654, + "grad_norm": 1.6757769584655762, + "learning_rate": 5.698642533936652e-05, + "loss": 5.6659, + "step": 4200 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 2.179318428039551, + "learning_rate": 5.732579185520362e-05, + "loss": 5.6603, + "step": 4225 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 1.1887872219085693, + "learning_rate": 5.766515837104072e-05, + "loss": 5.6603, + "step": 
4250 + }, + { + "epoch": 3.8687782805429864, + "grad_norm": 1.7193459272384644, + "learning_rate": 5.800452488687783e-05, + "loss": 5.5825, + "step": 4275 + }, + { + "epoch": 3.8914027149321266, + "grad_norm": 1.5481328964233398, + "learning_rate": 5.834389140271493e-05, + "loss": 5.7928, + "step": 4300 + }, + { + "epoch": 3.914027149321267, + "grad_norm": 1.2775366306304932, + "learning_rate": 5.868325791855203e-05, + "loss": 5.803, + "step": 4325 + }, + { + "epoch": 3.9366515837104075, + "grad_norm": 1.4987918138504028, + "learning_rate": 5.902262443438913e-05, + "loss": 5.5869, + "step": 4350 + }, + { + "epoch": 3.9592760180995477, + "grad_norm": 1.6811946630477905, + "learning_rate": 5.9361990950226244e-05, + "loss": 5.7896, + "step": 4375 + }, + { + "epoch": 3.981900452488688, + "grad_norm": 1.367522120475769, + "learning_rate": 5.9701357466063345e-05, + "loss": 5.6617, + "step": 4400 + }, + { + "epoch": 4.004524886877828, + "grad_norm": 1.4017376899719238, + "learning_rate": 6.0040723981900446e-05, + "loss": 5.6081, + "step": 4425 + }, + { + "epoch": 4.027149321266968, + "grad_norm": 1.3864396810531616, + "learning_rate": 6.0380090497737553e-05, + "loss": 5.6787, + "step": 4450 + }, + { + "epoch": 4.049773755656108, + "grad_norm": 1.5472630262374878, + "learning_rate": 6.0719457013574654e-05, + "loss": 5.6549, + "step": 4475 + }, + { + "epoch": 4.072398190045249, + "grad_norm": 1.563694715499878, + "learning_rate": 6.105882352941176e-05, + "loss": 5.6172, + "step": 4500 + }, + { + "epoch": 4.095022624434389, + "grad_norm": 1.7143316268920898, + "learning_rate": 6.139819004524886e-05, + "loss": 5.5593, + "step": 4525 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 1.314466118812561, + "learning_rate": 6.173755656108598e-05, + "loss": 5.5821, + "step": 4550 + }, + { + "epoch": 4.14027149321267, + "grad_norm": 1.114738941192627, + "learning_rate": 6.207692307692308e-05, + "loss": 5.6303, + "step": 4575 + }, + { + "epoch": 4.16289592760181, + "grad_norm": 
1.5527963638305664, + "learning_rate": 6.241628959276018e-05, + "loss": 5.7382, + "step": 4600 + }, + { + "epoch": 4.1855203619909505, + "grad_norm": 0.9862022995948792, + "learning_rate": 6.275565610859728e-05, + "loss": 5.661, + "step": 4625 + }, + { + "epoch": 4.208144796380091, + "grad_norm": 1.3943437337875366, + "learning_rate": 6.309502262443438e-05, + "loss": 5.6551, + "step": 4650 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 1.2521923780441284, + "learning_rate": 6.343438914027148e-05, + "loss": 5.5984, + "step": 4675 + }, + { + "epoch": 4.253393665158371, + "grad_norm": 1.297293782234192, + "learning_rate": 6.377375565610858e-05, + "loss": 5.6354, + "step": 4700 + }, + { + "epoch": 4.276018099547511, + "grad_norm": 1.7689846754074097, + "learning_rate": 6.41131221719457e-05, + "loss": 5.5656, + "step": 4725 + }, + { + "epoch": 4.298642533936651, + "grad_norm": 1.3994182348251343, + "learning_rate": 6.44524886877828e-05, + "loss": 5.5455, + "step": 4750 + }, + { + "epoch": 4.321266968325792, + "grad_norm": 1.4447396993637085, + "learning_rate": 6.47918552036199e-05, + "loss": 5.6526, + "step": 4775 + }, + { + "epoch": 4.343891402714932, + "grad_norm": 1.3658605813980103, + "learning_rate": 6.513122171945701e-05, + "loss": 5.6019, + "step": 4800 + }, + { + "epoch": 4.366515837104072, + "grad_norm": 1.4008032083511353, + "learning_rate": 6.547058823529411e-05, + "loss": 5.6222, + "step": 4825 + }, + { + "epoch": 4.389140271493213, + "grad_norm": 1.3936430215835571, + "learning_rate": 6.580995475113122e-05, + "loss": 5.5291, + "step": 4850 + }, + { + "epoch": 4.411764705882353, + "grad_norm": 1.140010118484497, + "learning_rate": 6.614932126696832e-05, + "loss": 5.5792, + "step": 4875 + }, + { + "epoch": 4.4343891402714934, + "grad_norm": 1.2577894926071167, + "learning_rate": 6.648868778280543e-05, + "loss": 5.6013, + "step": 4900 + }, + { + "epoch": 4.457013574660634, + "grad_norm": 1.3935832977294922, + "learning_rate": 6.682805429864253e-05, + 
"loss": 5.603, + "step": 4925 + }, + { + "epoch": 4.479638009049774, + "grad_norm": 1.5030955076217651, + "learning_rate": 6.716742081447963e-05, + "loss": 5.7377, + "step": 4950 + }, + { + "epoch": 4.502262443438914, + "grad_norm": 1.145139455795288, + "learning_rate": 6.750678733031673e-05, + "loss": 5.6013, + "step": 4975 + }, + { + "epoch": 4.524886877828054, + "grad_norm": 1.4582146406173706, + "learning_rate": 6.784615384615383e-05, + "loss": 5.6743, + "step": 5000 + }, + { + "epoch": 4.547511312217194, + "grad_norm": 1.0025731325149536, + "learning_rate": 6.818552036199094e-05, + "loss": 5.7206, + "step": 5025 + }, + { + "epoch": 4.570135746606335, + "grad_norm": 1.39948308467865, + "learning_rate": 6.852488687782805e-05, + "loss": 5.6864, + "step": 5050 + }, + { + "epoch": 4.592760180995475, + "grad_norm": 1.5719788074493408, + "learning_rate": 6.886425339366515e-05, + "loss": 5.5086, + "step": 5075 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 1.4834094047546387, + "learning_rate": 6.920361990950227e-05, + "loss": 5.5901, + "step": 5100 + }, + { + "epoch": 4.638009049773755, + "grad_norm": 1.3456339836120605, + "learning_rate": 6.954298642533937e-05, + "loss": 5.6564, + "step": 5125 + }, + { + "epoch": 4.660633484162896, + "grad_norm": 1.3672384023666382, + "learning_rate": 6.988235294117647e-05, + "loss": 5.4837, + "step": 5150 + }, + { + "epoch": 4.683257918552036, + "grad_norm": 1.3631017208099365, + "learning_rate": 7.022171945701357e-05, + "loss": 5.5312, + "step": 5175 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 1.1694447994232178, + "learning_rate": 7.056108597285067e-05, + "loss": 5.5842, + "step": 5200 + }, + { + "epoch": 4.728506787330317, + "grad_norm": 1.489476203918457, + "learning_rate": 7.090045248868777e-05, + "loss": 5.564, + "step": 5225 + }, + { + "epoch": 4.751131221719457, + "grad_norm": 1.2117191553115845, + "learning_rate": 7.123981900452488e-05, + "loss": 5.6299, + "step": 5250 + }, + { + "epoch": 
4.773755656108597, + "grad_norm": 1.2240872383117676, + "learning_rate": 7.157918552036199e-05, + "loss": 5.5516, + "step": 5275 + }, + { + "epoch": 4.796380090497737, + "grad_norm": 1.3566014766693115, + "learning_rate": 7.191855203619909e-05, + "loss": 5.5811, + "step": 5300 + }, + { + "epoch": 4.819004524886878, + "grad_norm": 1.6840308904647827, + "learning_rate": 7.225791855203619e-05, + "loss": 5.675, + "step": 5325 + }, + { + "epoch": 4.841628959276018, + "grad_norm": 1.5663491487503052, + "learning_rate": 7.25972850678733e-05, + "loss": 5.6887, + "step": 5350 + }, + { + "epoch": 4.864253393665159, + "grad_norm": 1.5798016786575317, + "learning_rate": 7.29366515837104e-05, + "loss": 5.5949, + "step": 5375 + }, + { + "epoch": 4.886877828054299, + "grad_norm": 2.12789249420166, + "learning_rate": 7.32760180995475e-05, + "loss": 5.5658, + "step": 5400 + }, + { + "epoch": 4.909502262443439, + "grad_norm": 1.7635608911514282, + "learning_rate": 7.361538461538462e-05, + "loss": 5.7001, + "step": 5425 + }, + { + "epoch": 4.932126696832579, + "grad_norm": 1.314758539199829, + "learning_rate": 7.395475113122172e-05, + "loss": 5.58, + "step": 5450 + }, + { + "epoch": 4.95475113122172, + "grad_norm": 1.0996482372283936, + "learning_rate": 7.429411764705882e-05, + "loss": 5.6766, + "step": 5475 + }, + { + "epoch": 4.97737556561086, + "grad_norm": 2.1102426052093506, + "learning_rate": 7.463348416289592e-05, + "loss": 5.4924, + "step": 5500 + }, + { + "epoch": 5.0, + "grad_norm": 1.0663578510284424, + "learning_rate": 7.497285067873302e-05, + "loss": 5.6522, + "step": 5525 + }, + { + "epoch": 5.02262443438914, + "grad_norm": 1.2302531003952026, + "learning_rate": 7.531221719457014e-05, + "loss": 5.4822, + "step": 5550 + }, + { + "epoch": 5.04524886877828, + "grad_norm": 1.242390751838684, + "learning_rate": 7.565158371040724e-05, + "loss": 5.4806, + "step": 5575 + }, + { + "epoch": 5.067873303167421, + "grad_norm": 1.377537488937378, + "learning_rate": 
7.599095022624434e-05, + "loss": 5.4515, + "step": 5600 + }, + { + "epoch": 5.090497737556561, + "grad_norm": 1.372882604598999, + "learning_rate": 7.633031674208144e-05, + "loss": 5.5466, + "step": 5625 + }, + { + "epoch": 5.113122171945701, + "grad_norm": 1.1628869771957397, + "learning_rate": 7.666968325791854e-05, + "loss": 5.5631, + "step": 5650 + }, + { + "epoch": 5.135746606334842, + "grad_norm": 1.1661573648452759, + "learning_rate": 7.700904977375565e-05, + "loss": 5.5736, + "step": 5675 + }, + { + "epoch": 5.158371040723982, + "grad_norm": 1.680083990097046, + "learning_rate": 7.734841628959276e-05, + "loss": 5.5193, + "step": 5700 + }, + { + "epoch": 5.180995475113122, + "grad_norm": 1.4866079092025757, + "learning_rate": 7.768778280542986e-05, + "loss": 5.7053, + "step": 5725 + }, + { + "epoch": 5.203619909502263, + "grad_norm": 1.2793829441070557, + "learning_rate": 7.802714932126696e-05, + "loss": 5.4355, + "step": 5750 + }, + { + "epoch": 5.226244343891403, + "grad_norm": 1.63210928440094, + "learning_rate": 7.836651583710406e-05, + "loss": 5.6327, + "step": 5775 + }, + { + "epoch": 5.248868778280543, + "grad_norm": 1.3546538352966309, + "learning_rate": 7.870588235294116e-05, + "loss": 5.5684, + "step": 5800 + }, + { + "epoch": 5.271493212669683, + "grad_norm": 1.1607636213302612, + "learning_rate": 7.904524886877826e-05, + "loss": 5.5127, + "step": 5825 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 1.2475472688674927, + "learning_rate": 7.938461538461539e-05, + "loss": 5.6689, + "step": 5850 + }, + { + "epoch": 5.316742081447964, + "grad_norm": 1.0922496318817139, + "learning_rate": 7.972398190045249e-05, + "loss": 5.5478, + "step": 5875 + }, + { + "epoch": 5.339366515837104, + "grad_norm": 1.4692274332046509, + "learning_rate": 8.006334841628959e-05, + "loss": 5.5565, + "step": 5900 + }, + { + "epoch": 5.361990950226244, + "grad_norm": 1.2589067220687866, + "learning_rate": 8.040271493212669e-05, + "loss": 5.5281, + "step": 5925 + }, + { + 
"epoch": 5.384615384615385, + "grad_norm": 2.016085147857666, + "learning_rate": 8.074208144796379e-05, + "loss": 5.5333, + "step": 5950 + }, + { + "epoch": 5.407239819004525, + "grad_norm": 1.1720553636550903, + "learning_rate": 8.10814479638009e-05, + "loss": 5.5755, + "step": 5975 + }, + { + "epoch": 5.429864253393665, + "grad_norm": 1.0042240619659424, + "learning_rate": 8.1420814479638e-05, + "loss": 5.586, + "step": 6000 + }, + { + "epoch": 5.452488687782806, + "grad_norm": 1.2361646890640259, + "learning_rate": 8.176018099547511e-05, + "loss": 5.5136, + "step": 6025 + }, + { + "epoch": 5.475113122171946, + "grad_norm": 0.9489585161209106, + "learning_rate": 8.209954751131221e-05, + "loss": 5.7089, + "step": 6050 + }, + { + "epoch": 5.497737556561086, + "grad_norm": 1.1919461488723755, + "learning_rate": 8.243891402714931e-05, + "loss": 5.4838, + "step": 6075 + }, + { + "epoch": 5.520361990950226, + "grad_norm": 1.2139184474945068, + "learning_rate": 8.277828054298641e-05, + "loss": 5.4898, + "step": 6100 + }, + { + "epoch": 5.542986425339366, + "grad_norm": 1.0839821100234985, + "learning_rate": 8.311764705882351e-05, + "loss": 5.6115, + "step": 6125 + }, + { + "epoch": 5.5656108597285066, + "grad_norm": 1.201003074645996, + "learning_rate": 8.345701357466063e-05, + "loss": 5.5291, + "step": 6150 + }, + { + "epoch": 5.588235294117647, + "grad_norm": 1.3064321279525757, + "learning_rate": 8.379638009049773e-05, + "loss": 5.5308, + "step": 6175 + }, + { + "epoch": 5.610859728506787, + "grad_norm": 1.372571587562561, + "learning_rate": 8.413574660633484e-05, + "loss": 5.5055, + "step": 6200 + }, + { + "epoch": 5.633484162895927, + "grad_norm": 1.0197138786315918, + "learning_rate": 8.447511312217194e-05, + "loss": 5.524, + "step": 6225 + }, + { + "epoch": 5.656108597285068, + "grad_norm": 1.2021855115890503, + "learning_rate": 8.481447963800904e-05, + "loss": 5.4335, + "step": 6250 + }, + { + "epoch": 5.678733031674208, + "grad_norm": 1.289168357849121, + 
"learning_rate": 8.515384615384614e-05, + "loss": 5.4808, + "step": 6275 + }, + { + "epoch": 5.701357466063349, + "grad_norm": 1.5402923822402954, + "learning_rate": 8.549321266968326e-05, + "loss": 5.5559, + "step": 6300 + }, + { + "epoch": 5.723981900452489, + "grad_norm": 0.9780600070953369, + "learning_rate": 8.583257918552036e-05, + "loss": 5.5102, + "step": 6325 + }, + { + "epoch": 5.746606334841629, + "grad_norm": 1.465335726737976, + "learning_rate": 8.617194570135746e-05, + "loss": 5.4427, + "step": 6350 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 1.1970276832580566, + "learning_rate": 8.651131221719456e-05, + "loss": 5.549, + "step": 6375 + }, + { + "epoch": 5.791855203619909, + "grad_norm": 1.118125319480896, + "learning_rate": 8.685067873303166e-05, + "loss": 5.4598, + "step": 6400 + }, + { + "epoch": 5.8144796380090495, + "grad_norm": 1.0834537744522095, + "learning_rate": 8.719004524886876e-05, + "loss": 5.3887, + "step": 6425 + }, + { + "epoch": 5.83710407239819, + "grad_norm": 1.057357907295227, + "learning_rate": 8.752941176470586e-05, + "loss": 5.6295, + "step": 6450 + }, + { + "epoch": 5.859728506787331, + "grad_norm": 1.5805262327194214, + "learning_rate": 8.786877828054298e-05, + "loss": 5.5536, + "step": 6475 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 1.3391293287277222, + "learning_rate": 8.820814479638008e-05, + "loss": 5.5675, + "step": 6500 + }, + { + "epoch": 5.904977375565611, + "grad_norm": 1.3164405822753906, + "learning_rate": 8.854751131221718e-05, + "loss": 5.4408, + "step": 6525 + }, + { + "epoch": 5.927601809954751, + "grad_norm": 1.160893201828003, + "learning_rate": 8.88868778280543e-05, + "loss": 5.5364, + "step": 6550 + }, + { + "epoch": 5.950226244343892, + "grad_norm": 0.9940909147262573, + "learning_rate": 8.92262443438914e-05, + "loss": 5.5756, + "step": 6575 + }, + { + "epoch": 5.972850678733032, + "grad_norm": 1.1090673208236694, + "learning_rate": 8.956561085972851e-05, + "loss": 5.4551, + "step": 
6600 + }, + { + "epoch": 5.995475113122172, + "grad_norm": 1.2676074504852295, + "learning_rate": 8.990497737556561e-05, + "loss": 5.6009, + "step": 6625 + }, + { + "epoch": 6.018099547511312, + "grad_norm": 1.510372519493103, + "learning_rate": 9.024434389140271e-05, + "loss": 5.4012, + "step": 6650 + }, + { + "epoch": 6.040723981900452, + "grad_norm": 1.1791483163833618, + "learning_rate": 9.058371040723981e-05, + "loss": 5.3326, + "step": 6675 + }, + { + "epoch": 6.0633484162895925, + "grad_norm": 1.407641887664795, + "learning_rate": 9.092307692307691e-05, + "loss": 5.374, + "step": 6700 + }, + { + "epoch": 6.085972850678733, + "grad_norm": 1.3388482332229614, + "learning_rate": 9.126244343891401e-05, + "loss": 5.4042, + "step": 6725 + }, + { + "epoch": 6.108597285067873, + "grad_norm": 1.554003119468689, + "learning_rate": 9.160180995475112e-05, + "loss": 5.4236, + "step": 6750 + }, + { + "epoch": 6.131221719457014, + "grad_norm": 1.3483645915985107, + "learning_rate": 9.194117647058823e-05, + "loss": 5.4674, + "step": 6775 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 1.0115711688995361, + "learning_rate": 9.228054298642533e-05, + "loss": 5.4868, + "step": 6800 + }, + { + "epoch": 6.176470588235294, + "grad_norm": 1.439205527305603, + "learning_rate": 9.261990950226243e-05, + "loss": 5.4786, + "step": 6825 + }, + { + "epoch": 6.199095022624435, + "grad_norm": 0.9785445928573608, + "learning_rate": 9.295927601809953e-05, + "loss": 5.5347, + "step": 6850 + }, + { + "epoch": 6.221719457013575, + "grad_norm": 1.1020057201385498, + "learning_rate": 9.329864253393665e-05, + "loss": 5.4051, + "step": 6875 + }, + { + "epoch": 6.244343891402715, + "grad_norm": 1.0274757146835327, + "learning_rate": 9.363800904977375e-05, + "loss": 5.5246, + "step": 6900 + }, + { + "epoch": 6.266968325791855, + "grad_norm": 1.0783212184906006, + "learning_rate": 9.397737556561086e-05, + "loss": 5.5393, + "step": 6925 + }, + { + "epoch": 6.289592760180995, + "grad_norm": 
1.0508118867874146, + "learning_rate": 9.431674208144796e-05, + "loss": 5.4552, + "step": 6950 + }, + { + "epoch": 6.3122171945701355, + "grad_norm": 1.1625144481658936, + "learning_rate": 9.465610859728506e-05, + "loss": 5.4529, + "step": 6975 + }, + { + "epoch": 6.334841628959276, + "grad_norm": 1.0248526334762573, + "learning_rate": 9.499547511312217e-05, + "loss": 5.3419, + "step": 7000 + }, + { + "epoch": 6.357466063348416, + "grad_norm": 1.1253679990768433, + "learning_rate": 9.533484162895927e-05, + "loss": 5.418, + "step": 7025 + }, + { + "epoch": 6.380090497737557, + "grad_norm": 1.1107271909713745, + "learning_rate": 9.567420814479637e-05, + "loss": 5.449, + "step": 7050 + }, + { + "epoch": 6.402714932126697, + "grad_norm": 1.0638843774795532, + "learning_rate": 9.601357466063347e-05, + "loss": 5.6126, + "step": 7075 + }, + { + "epoch": 6.425339366515837, + "grad_norm": 1.6658477783203125, + "learning_rate": 9.635294117647058e-05, + "loss": 5.5386, + "step": 7100 + }, + { + "epoch": 6.447963800904978, + "grad_norm": 1.1139315366744995, + "learning_rate": 9.669230769230768e-05, + "loss": 5.4514, + "step": 7125 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 1.1143317222595215, + "learning_rate": 9.703167420814478e-05, + "loss": 5.5782, + "step": 7150 + }, + { + "epoch": 6.493212669683258, + "grad_norm": 1.4471315145492554, + "learning_rate": 9.737104072398189e-05, + "loss": 5.4109, + "step": 7175 + }, + { + "epoch": 6.515837104072398, + "grad_norm": 1.167921781539917, + "learning_rate": 9.771040723981899e-05, + "loss": 5.49, + "step": 7200 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 0.9504801630973816, + "learning_rate": 9.804977375565611e-05, + "loss": 5.3976, + "step": 7225 + }, + { + "epoch": 6.5610859728506785, + "grad_norm": 1.4360880851745605, + "learning_rate": 9.838914027149322e-05, + "loss": 5.414, + "step": 7250 + }, + { + "epoch": 6.583710407239819, + "grad_norm": 1.3413575887680054, + "learning_rate": 9.871493212669681e-05, + 
"loss": 5.4689, + "step": 7275 + }, + { + "epoch": 6.606334841628959, + "grad_norm": 1.0288740396499634, + "learning_rate": 9.905429864253394e-05, + "loss": 5.455, + "step": 7300 + }, + { + "epoch": 6.628959276018099, + "grad_norm": 1.0424234867095947, + "learning_rate": 9.939366515837104e-05, + "loss": 5.3584, + "step": 7325 + }, + { + "epoch": 6.65158371040724, + "grad_norm": 1.119850516319275, + "learning_rate": 9.973303167420814e-05, + "loss": 5.3877, + "step": 7350 + }, + { + "epoch": 6.67420814479638, + "grad_norm": 1.2746446132659912, + "learning_rate": 0.00010007239819004524, + "loss": 5.4438, + "step": 7375 + }, + { + "epoch": 6.6968325791855206, + "grad_norm": 1.3023581504821777, + "learning_rate": 0.00010041176470588234, + "loss": 5.3398, + "step": 7400 + }, + { + "epoch": 6.719457013574661, + "grad_norm": 1.214418888092041, + "learning_rate": 0.00010075113122171946, + "loss": 5.4295, + "step": 7425 + }, + { + "epoch": 6.742081447963801, + "grad_norm": 1.1940439939498901, + "learning_rate": 0.00010109049773755656, + "loss": 5.4227, + "step": 7450 + }, + { + "epoch": 6.764705882352941, + "grad_norm": 1.1146560907363892, + "learning_rate": 0.00010142986425339366, + "loss": 5.3667, + "step": 7475 + }, + { + "epoch": 6.787330316742081, + "grad_norm": 1.2646377086639404, + "learning_rate": 0.00010176923076923076, + "loss": 5.4075, + "step": 7500 + }, + { + "epoch": 6.8099547511312215, + "grad_norm": 1.029643177986145, + "learning_rate": 0.00010210859728506786, + "loss": 5.5222, + "step": 7525 + }, + { + "epoch": 6.832579185520362, + "grad_norm": 1.259440541267395, + "learning_rate": 0.00010244796380090496, + "loss": 5.5015, + "step": 7550 + }, + { + "epoch": 6.855203619909502, + "grad_norm": 0.9059805870056152, + "learning_rate": 0.00010278733031674206, + "loss": 5.5371, + "step": 7575 + }, + { + "epoch": 6.877828054298643, + "grad_norm": 1.9123485088348389, + "learning_rate": 0.00010312669683257918, + "loss": 5.6179, + "step": 7600 + }, + { + "epoch": 
6.900452488687783, + "grad_norm": 1.1449521780014038, + "learning_rate": 0.00010346606334841628, + "loss": 5.5484, + "step": 7625 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 1.0224618911743164, + "learning_rate": 0.00010380542986425339, + "loss": 5.2939, + "step": 7650 + }, + { + "epoch": 6.9457013574660635, + "grad_norm": 1.2241264581680298, + "learning_rate": 0.00010414479638009049, + "loss": 5.5328, + "step": 7675 + }, + { + "epoch": 6.968325791855204, + "grad_norm": 1.059005856513977, + "learning_rate": 0.00010448416289592759, + "loss": 5.4795, + "step": 7700 + }, + { + "epoch": 6.990950226244344, + "grad_norm": 0.9612129926681519, + "learning_rate": 0.0001048235294117647, + "loss": 5.553, + "step": 7725 + }, + { + "epoch": 7.013574660633484, + "grad_norm": 1.1683646440505981, + "learning_rate": 0.00010516289592760181, + "loss": 5.4763, + "step": 7750 + }, + { + "epoch": 7.036199095022624, + "grad_norm": 1.212646722793579, + "learning_rate": 0.00010550226244343891, + "loss": 5.3376, + "step": 7775 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 1.05197274684906, + "learning_rate": 0.00010584162895927601, + "loss": 5.2155, + "step": 7800 + }, + { + "epoch": 7.081447963800905, + "grad_norm": 1.0929110050201416, + "learning_rate": 0.00010618099547511311, + "loss": 5.3498, + "step": 7825 + }, + { + "epoch": 7.104072398190045, + "grad_norm": 1.2394142150878906, + "learning_rate": 0.00010652036199095021, + "loss": 5.3593, + "step": 7850 + }, + { + "epoch": 7.126696832579185, + "grad_norm": 1.4099429845809937, + "learning_rate": 0.00010685972850678731, + "loss": 5.2032, + "step": 7875 + }, + { + "epoch": 7.149321266968326, + "grad_norm": 1.3127585649490356, + "learning_rate": 0.00010719909502262441, + "loss": 5.4849, + "step": 7900 + }, + { + "epoch": 7.171945701357466, + "grad_norm": 1.2615708112716675, + "learning_rate": 0.00010753846153846153, + "loss": 5.4461, + "step": 7925 + }, + { + "epoch": 7.1945701357466065, + "grad_norm": 1.43902587890625, 
+ "learning_rate": 0.00010787782805429863, + "loss": 5.4668, + "step": 7950 + }, + { + "epoch": 7.217194570135747, + "grad_norm": 1.8109885454177856, + "learning_rate": 0.00010821719457013573, + "loss": 5.2972, + "step": 7975 + }, + { + "epoch": 7.239819004524887, + "grad_norm": 1.0213814973831177, + "learning_rate": 0.00010855656108597284, + "loss": 5.359, + "step": 8000 + }, + { + "epoch": 7.262443438914027, + "grad_norm": 1.1312299966812134, + "learning_rate": 0.00010889592760180995, + "loss": 5.4318, + "step": 8025 + }, + { + "epoch": 7.285067873303167, + "grad_norm": 1.2350519895553589, + "learning_rate": 0.00010923529411764706, + "loss": 5.3081, + "step": 8050 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 1.3539677858352661, + "learning_rate": 0.00010957466063348416, + "loss": 5.3457, + "step": 8075 + }, + { + "epoch": 7.330316742081448, + "grad_norm": 1.0154722929000854, + "learning_rate": 0.00010991402714932126, + "loss": 5.4666, + "step": 8100 + }, + { + "epoch": 7.352941176470588, + "grad_norm": 1.1144194602966309, + "learning_rate": 0.00011025339366515836, + "loss": 5.442, + "step": 8125 + }, + { + "epoch": 7.375565610859729, + "grad_norm": 0.8434383869171143, + "learning_rate": 0.00011059276018099546, + "loss": 5.3537, + "step": 8150 + }, + { + "epoch": 7.398190045248869, + "grad_norm": 1.0677796602249146, + "learning_rate": 0.00011093212669683256, + "loss": 5.5026, + "step": 8175 + }, + { + "epoch": 7.420814479638009, + "grad_norm": 1.1033122539520264, + "learning_rate": 0.00011127149321266967, + "loss": 5.4341, + "step": 8200 + }, + { + "epoch": 7.4434389140271495, + "grad_norm": 0.9210260510444641, + "learning_rate": 0.00011161085972850678, + "loss": 5.4144, + "step": 8225 + }, + { + "epoch": 7.46606334841629, + "grad_norm": 1.0614783763885498, + "learning_rate": 0.00011195022624434388, + "loss": 5.4164, + "step": 8250 + }, + { + "epoch": 7.48868778280543, + "grad_norm": 1.2655754089355469, + "learning_rate": 0.00011228959276018098, + 
"loss": 5.2604, + "step": 8275 + }, + { + "epoch": 7.51131221719457, + "grad_norm": 0.9641904234886169, + "learning_rate": 0.00011262895927601808, + "loss": 5.3504, + "step": 8300 + }, + { + "epoch": 7.53393665158371, + "grad_norm": 1.2946969270706177, + "learning_rate": 0.00011296832579185518, + "loss": 5.4227, + "step": 8325 + }, + { + "epoch": 7.5565610859728505, + "grad_norm": 0.9463045597076416, + "learning_rate": 0.0001133076923076923, + "loss": 5.4054, + "step": 8350 + }, + { + "epoch": 7.579185520361991, + "grad_norm": 1.0814507007598877, + "learning_rate": 0.00011364705882352941, + "loss": 5.3966, + "step": 8375 + }, + { + "epoch": 7.601809954751131, + "grad_norm": 1.1838008165359497, + "learning_rate": 0.00011398642533936651, + "loss": 5.4821, + "step": 8400 + }, + { + "epoch": 7.624434389140271, + "grad_norm": 1.0479843616485596, + "learning_rate": 0.00011432579185520361, + "loss": 5.4574, + "step": 8425 + }, + { + "epoch": 7.647058823529412, + "grad_norm": 0.976851761341095, + "learning_rate": 0.00011466515837104072, + "loss": 5.4213, + "step": 8450 + }, + { + "epoch": 7.669683257918552, + "grad_norm": 1.4671597480773926, + "learning_rate": 0.00011500452488687782, + "loss": 5.417, + "step": 8475 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 1.2483266592025757, + "learning_rate": 0.00011534389140271492, + "loss": 5.294, + "step": 8500 + }, + { + "epoch": 7.714932126696833, + "grad_norm": 1.1279124021530151, + "learning_rate": 0.00011568325791855202, + "loss": 5.4524, + "step": 8525 + }, + { + "epoch": 7.737556561085973, + "grad_norm": 1.4917343854904175, + "learning_rate": 0.00011602262443438913, + "loss": 5.3789, + "step": 8550 + }, + { + "epoch": 7.760180995475113, + "grad_norm": 1.0781971216201782, + "learning_rate": 0.00011636199095022623, + "loss": 5.3371, + "step": 8575 + }, + { + "epoch": 7.782805429864253, + "grad_norm": 1.143670916557312, + "learning_rate": 0.00011670135746606333, + "loss": 5.3115, + "step": 8600 + }, + { + "epoch": 
7.8054298642533935, + "grad_norm": 1.2002426385879517, + "learning_rate": 0.00011704072398190044, + "loss": 5.447, + "step": 8625 + }, + { + "epoch": 7.828054298642534, + "grad_norm": 1.0116580724716187, + "learning_rate": 0.00011738009049773754, + "loss": 5.3397, + "step": 8650 + }, + { + "epoch": 7.850678733031674, + "grad_norm": 1.029729962348938, + "learning_rate": 0.00011771945701357466, + "loss": 5.3606, + "step": 8675 + }, + { + "epoch": 7.873303167420815, + "grad_norm": 0.9455130696296692, + "learning_rate": 0.00011805882352941177, + "loss": 5.3195, + "step": 8700 + }, + { + "epoch": 7.895927601809955, + "grad_norm": 0.9694691300392151, + "learning_rate": 0.00011839819004524887, + "loss": 5.3867, + "step": 8725 + }, + { + "epoch": 7.918552036199095, + "grad_norm": 1.2311065196990967, + "learning_rate": 0.00011873755656108597, + "loss": 5.3923, + "step": 8750 + }, + { + "epoch": 7.9411764705882355, + "grad_norm": 0.8772637248039246, + "learning_rate": 0.00011907692307692307, + "loss": 5.3279, + "step": 8775 + }, + { + "epoch": 7.963800904977376, + "grad_norm": 1.106030821800232, + "learning_rate": 0.00011941628959276017, + "loss": 5.3267, + "step": 8800 + }, + { + "epoch": 7.986425339366516, + "grad_norm": 1.1599453687667847, + "learning_rate": 0.00011975565610859727, + "loss": 5.529, + "step": 8825 + }, + { + "epoch": 8.009049773755656, + "grad_norm": 1.8157141208648682, + "learning_rate": 0.00012009502262443438, + "loss": 5.3869, + "step": 8850 + }, + { + "epoch": 8.031674208144796, + "grad_norm": 1.1919975280761719, + "learning_rate": 0.00012043438914027149, + "loss": 5.2746, + "step": 8875 + }, + { + "epoch": 8.054298642533936, + "grad_norm": 1.0922486782073975, + "learning_rate": 0.00012077375565610859, + "loss": 5.2856, + "step": 8900 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 1.2444593906402588, + "learning_rate": 0.00012111312217194569, + "loss": 5.403, + "step": 8925 + }, + { + "epoch": 8.099547511312217, + "grad_norm": 
0.8721426725387573, + "learning_rate": 0.00012145248868778279, + "loss": 5.2567, + "step": 8950 + }, + { + "epoch": 8.122171945701357, + "grad_norm": 1.3781883716583252, + "learning_rate": 0.00012179185520361989, + "loss": 5.2798, + "step": 8975 + }, + { + "epoch": 8.144796380090497, + "grad_norm": 1.1509733200073242, + "learning_rate": 0.000122131221719457, + "loss": 5.2044, + "step": 9000 + }, + { + "epoch": 8.167420814479637, + "grad_norm": 1.4785964488983154, + "learning_rate": 0.00012247058823529412, + "loss": 5.3677, + "step": 9025 + }, + { + "epoch": 8.190045248868778, + "grad_norm": 1.2807246446609497, + "learning_rate": 0.00012280995475113122, + "loss": 5.3202, + "step": 9050 + }, + { + "epoch": 8.212669683257918, + "grad_norm": 1.1646101474761963, + "learning_rate": 0.00012314932126696832, + "loss": 5.271, + "step": 9075 + }, + { + "epoch": 8.235294117647058, + "grad_norm": 1.301400899887085, + "learning_rate": 0.00012348868778280542, + "loss": 5.2795, + "step": 9100 + }, + { + "epoch": 8.2579185520362, + "grad_norm": 1.4196361303329468, + "learning_rate": 0.00012382805429864252, + "loss": 5.4176, + "step": 9125 + }, + { + "epoch": 8.28054298642534, + "grad_norm": 1.5214911699295044, + "learning_rate": 0.00012416742081447962, + "loss": 5.2882, + "step": 9150 + }, + { + "epoch": 8.30316742081448, + "grad_norm": 1.1181511878967285, + "learning_rate": 0.00012450678733031672, + "loss": 5.2897, + "step": 9175 + }, + { + "epoch": 8.32579185520362, + "grad_norm": 1.581581473350525, + "learning_rate": 0.00012484615384615382, + "loss": 5.3059, + "step": 9200 + }, + { + "epoch": 8.34841628959276, + "grad_norm": 0.9101288914680481, + "learning_rate": 0.00012518552036199093, + "loss": 5.2616, + "step": 9225 + }, + { + "epoch": 8.371040723981901, + "grad_norm": 0.9665130972862244, + "learning_rate": 0.00012552488687782805, + "loss": 5.2998, + "step": 9250 + }, + { + "epoch": 8.393665158371041, + "grad_norm": 1.1748067140579224, + "learning_rate": 
0.00012586425339366515, + "loss": 5.3675, + "step": 9275 + }, + { + "epoch": 8.416289592760181, + "grad_norm": 1.115011215209961, + "learning_rate": 0.00012619004524886876, + "loss": 5.4141, + "step": 9300 + }, + { + "epoch": 8.438914027149321, + "grad_norm": 1.0583595037460327, + "learning_rate": 0.00012652941176470586, + "loss": 5.3022, + "step": 9325 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 1.1308449506759644, + "learning_rate": 0.00012686877828054296, + "loss": 5.3779, + "step": 9350 + }, + { + "epoch": 8.484162895927602, + "grad_norm": 1.1921645402908325, + "learning_rate": 0.00012720814479638006, + "loss": 5.2918, + "step": 9375 + }, + { + "epoch": 8.506787330316742, + "grad_norm": 0.9783223271369934, + "learning_rate": 0.00012754751131221717, + "loss": 5.3096, + "step": 9400 + }, + { + "epoch": 8.529411764705882, + "grad_norm": 1.0405751466751099, + "learning_rate": 0.00012788687782805427, + "loss": 5.2166, + "step": 9425 + }, + { + "epoch": 8.552036199095022, + "grad_norm": 0.9867390990257263, + "learning_rate": 0.0001282262443438914, + "loss": 5.2788, + "step": 9450 + }, + { + "epoch": 8.574660633484163, + "grad_norm": 1.3076112270355225, + "learning_rate": 0.0001285656108597285, + "loss": 5.2147, + "step": 9475 + }, + { + "epoch": 8.597285067873303, + "grad_norm": 1.2440968751907349, + "learning_rate": 0.0001289049773755656, + "loss": 5.2012, + "step": 9500 + }, + { + "epoch": 8.619909502262443, + "grad_norm": 1.0401350259780884, + "learning_rate": 0.0001292443438914027, + "loss": 5.3668, + "step": 9525 + }, + { + "epoch": 8.642533936651583, + "grad_norm": 1.071785569190979, + "learning_rate": 0.0001295837104072398, + "loss": 5.3017, + "step": 9550 + }, + { + "epoch": 8.665158371040723, + "grad_norm": 0.9909277558326721, + "learning_rate": 0.00012992307692307693, + "loss": 5.2481, + "step": 9575 + }, + { + "epoch": 8.687782805429864, + "grad_norm": 0.9336756467819214, + "learning_rate": 0.00013026244343891403, + "loss": 5.2832, + "step": 9600 
+ }, + { + "epoch": 8.710407239819004, + "grad_norm": 0.944492518901825, + "learning_rate": 0.00013060180995475113, + "loss": 5.342, + "step": 9625 + }, + { + "epoch": 8.733031674208144, + "grad_norm": 1.096969485282898, + "learning_rate": 0.00013094117647058823, + "loss": 5.3501, + "step": 9650 + }, + { + "epoch": 8.755656108597286, + "grad_norm": 1.1634862422943115, + "learning_rate": 0.00013128054298642533, + "loss": 5.321, + "step": 9675 + }, + { + "epoch": 8.778280542986426, + "grad_norm": 0.9251194000244141, + "learning_rate": 0.00013161990950226243, + "loss": 5.4857, + "step": 9700 + }, + { + "epoch": 8.800904977375566, + "grad_norm": 1.0412198305130005, + "learning_rate": 0.00013195927601809953, + "loss": 5.3761, + "step": 9725 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 1.352041244506836, + "learning_rate": 0.00013229864253393663, + "loss": 5.377, + "step": 9750 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 1.015771746635437, + "learning_rate": 0.00013263800904977373, + "loss": 5.3823, + "step": 9775 + }, + { + "epoch": 8.868778280542987, + "grad_norm": 0.9252334237098694, + "learning_rate": 0.00013297737556561086, + "loss": 5.2645, + "step": 9800 + }, + { + "epoch": 8.891402714932127, + "grad_norm": 1.1377252340316772, + "learning_rate": 0.00013331674208144796, + "loss": 5.2292, + "step": 9825 + }, + { + "epoch": 8.914027149321267, + "grad_norm": 1.8188557624816895, + "learning_rate": 0.00013365610859728506, + "loss": 5.4221, + "step": 9850 + }, + { + "epoch": 8.936651583710407, + "grad_norm": 1.1808732748031616, + "learning_rate": 0.00013398190045248867, + "loss": 5.3275, + "step": 9875 + }, + { + "epoch": 8.959276018099548, + "grad_norm": 0.935534656047821, + "learning_rate": 0.00013432126696832577, + "loss": 5.2564, + "step": 9900 + }, + { + "epoch": 8.981900452488688, + "grad_norm": 0.9960452318191528, + "learning_rate": 0.0001346606334841629, + "loss": 5.3774, + "step": 9925 + }, + { + "epoch": 9.004524886877828, + "grad_norm": 
3.8774402141571045, + "learning_rate": 0.000135, + "loss": 5.1604, + "step": 9950 + }, + { + "epoch": 9.027149321266968, + "grad_norm": 1.0658574104309082, + "learning_rate": 0.0001353393665158371, + "loss": 5.1074, + "step": 9975 + }, + { + "epoch": 9.049773755656108, + "grad_norm": 1.2405463457107544, + "learning_rate": 0.0001356787330316742, + "loss": 5.0691, + "step": 10000 + }, + { + "epoch": 9.072398190045249, + "grad_norm": 1.2019230127334595, + "learning_rate": 0.0001360180995475113, + "loss": 5.1978, + "step": 10025 + }, + { + "epoch": 9.095022624434389, + "grad_norm": 1.9909340143203735, + "learning_rate": 0.0001363574660633484, + "loss": 5.2004, + "step": 10050 + }, + { + "epoch": 9.117647058823529, + "grad_norm": 1.0435068607330322, + "learning_rate": 0.0001366968325791855, + "loss": 5.209, + "step": 10075 + }, + { + "epoch": 9.14027149321267, + "grad_norm": 1.2937798500061035, + "learning_rate": 0.0001370361990950226, + "loss": 5.2087, + "step": 10100 + }, + { + "epoch": 9.16289592760181, + "grad_norm": 1.2895238399505615, + "learning_rate": 0.0001373755656108597, + "loss": 5.2103, + "step": 10125 + }, + { + "epoch": 9.18552036199095, + "grad_norm": 0.9975462555885315, + "learning_rate": 0.0001377149321266968, + "loss": 5.1116, + "step": 10150 + }, + { + "epoch": 9.20814479638009, + "grad_norm": 1.296386957168579, + "learning_rate": 0.0001380542986425339, + "loss": 5.1656, + "step": 10175 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 1.4493881464004517, + "learning_rate": 0.00013839366515837104, + "loss": 5.1834, + "step": 10200 + }, + { + "epoch": 9.25339366515837, + "grad_norm": 1.0695624351501465, + "learning_rate": 0.00013873303167420814, + "loss": 5.1005, + "step": 10225 + }, + { + "epoch": 9.276018099547512, + "grad_norm": 1.0306825637817383, + "learning_rate": 0.00013907239819004524, + "loss": 5.1712, + "step": 10250 + }, + { + "epoch": 9.298642533936652, + "grad_norm": 1.3647042512893677, + "learning_rate": 0.00013941176470588234, + 
"loss": 5.15, + "step": 10275 + }, + { + "epoch": 9.321266968325792, + "grad_norm": 2.479020357131958, + "learning_rate": 0.00013975113122171944, + "loss": 5.1985, + "step": 10300 + }, + { + "epoch": 9.343891402714933, + "grad_norm": 0.9140968918800354, + "learning_rate": 0.00014009049773755654, + "loss": 5.2683, + "step": 10325 + }, + { + "epoch": 9.366515837104073, + "grad_norm": 1.062315583229065, + "learning_rate": 0.00014042986425339364, + "loss": 5.2319, + "step": 10350 + }, + { + "epoch": 9.389140271493213, + "grad_norm": 1.0792220830917358, + "learning_rate": 0.00014076923076923074, + "loss": 5.2176, + "step": 10375 + }, + { + "epoch": 9.411764705882353, + "grad_norm": 0.9862438440322876, + "learning_rate": 0.00014110859728506787, + "loss": 5.2732, + "step": 10400 + }, + { + "epoch": 9.434389140271493, + "grad_norm": 0.9580933451652527, + "learning_rate": 0.00014144796380090497, + "loss": 5.3087, + "step": 10425 + }, + { + "epoch": 9.457013574660634, + "grad_norm": 1.1492204666137695, + "learning_rate": 0.00014178733031674207, + "loss": 5.0943, + "step": 10450 + }, + { + "epoch": 9.479638009049774, + "grad_norm": 1.028110384941101, + "learning_rate": 0.00014212669683257918, + "loss": 5.2554, + "step": 10475 + }, + { + "epoch": 9.502262443438914, + "grad_norm": 1.0601478815078735, + "learning_rate": 0.00014246606334841628, + "loss": 5.1979, + "step": 10500 + }, + { + "epoch": 9.524886877828054, + "grad_norm": 1.2302405834197998, + "learning_rate": 0.00014280542986425338, + "loss": 5.3466, + "step": 10525 + }, + { + "epoch": 9.547511312217194, + "grad_norm": 1.12709379196167, + "learning_rate": 0.0001431447963800905, + "loss": 5.2889, + "step": 10550 + }, + { + "epoch": 9.570135746606335, + "grad_norm": 1.0190308094024658, + "learning_rate": 0.0001434841628959276, + "loss": 5.3005, + "step": 10575 + }, + { + "epoch": 9.592760180995475, + "grad_norm": 1.5944112539291382, + "learning_rate": 0.0001438235294117647, + "loss": 5.2775, + "step": 10600 + }, + { + 
"epoch": 9.615384615384615, + "grad_norm": 1.1170854568481445, + "learning_rate": 0.0001441628959276018, + "loss": 5.1568, + "step": 10625 + }, + { + "epoch": 9.638009049773755, + "grad_norm": 0.9360098838806152, + "learning_rate": 0.0001445022624434389, + "loss": 5.2575, + "step": 10650 + }, + { + "epoch": 9.660633484162895, + "grad_norm": 1.9224114418029785, + "learning_rate": 0.000144841628959276, + "loss": 5.1985, + "step": 10675 + }, + { + "epoch": 9.683257918552036, + "grad_norm": 1.250430703163147, + "learning_rate": 0.0001451809954751131, + "loss": 5.2976, + "step": 10700 + }, + { + "epoch": 9.705882352941176, + "grad_norm": 1.004128098487854, + "learning_rate": 0.0001455203619909502, + "loss": 5.3517, + "step": 10725 + }, + { + "epoch": 9.728506787330316, + "grad_norm": 1.2826331853866577, + "learning_rate": 0.0001458597285067873, + "loss": 5.2889, + "step": 10750 + }, + { + "epoch": 9.751131221719458, + "grad_norm": 1.0989943742752075, + "learning_rate": 0.0001461990950226244, + "loss": 5.2646, + "step": 10775 + }, + { + "epoch": 9.773755656108598, + "grad_norm": 1.5269279479980469, + "learning_rate": 0.00014653846153846151, + "loss": 5.2721, + "step": 10800 + }, + { + "epoch": 9.796380090497738, + "grad_norm": 1.0799510478973389, + "learning_rate": 0.00014687782805429862, + "loss": 5.2698, + "step": 10825 + }, + { + "epoch": 9.819004524886878, + "grad_norm": 0.9514308571815491, + "learning_rate": 0.00014721719457013572, + "loss": 5.3147, + "step": 10850 + }, + { + "epoch": 9.841628959276019, + "grad_norm": 0.9488893747329712, + "learning_rate": 0.00014755656108597282, + "loss": 5.36, + "step": 10875 + }, + { + "epoch": 9.864253393665159, + "grad_norm": 1.1077697277069092, + "learning_rate": 0.00014789592760180994, + "loss": 5.1958, + "step": 10900 + }, + { + "epoch": 9.886877828054299, + "grad_norm": 0.8970409035682678, + "learning_rate": 0.00014823529411764705, + "loss": 5.3716, + "step": 10925 + }, + { + "epoch": 9.90950226244344, + "grad_norm": 
1.3561121225357056, + "learning_rate": 0.00014857466063348415, + "loss": 5.2173, + "step": 10950 + }, + { + "epoch": 9.93212669683258, + "grad_norm": 1.3258692026138306, + "learning_rate": 0.00014891402714932125, + "loss": 5.1904, + "step": 10975 + }, + { + "epoch": 9.95475113122172, + "grad_norm": 0.8535260558128357, + "learning_rate": 0.00014925339366515835, + "loss": 5.3146, + "step": 11000 + }, + { + "epoch": 9.97737556561086, + "grad_norm": 1.0721464157104492, + "learning_rate": 0.00014959276018099548, + "loss": 5.3224, + "step": 11025 + }, + { + "epoch": 10.0, + "grad_norm": 1.2367472648620605, + "learning_rate": 0.00014993212669683258, + "loss": 5.2298, + "step": 11050 + }, + { + "epoch": 10.02262443438914, + "grad_norm": 1.176303505897522, + "learning_rate": 0.000149999985031392, + "loss": 5.1865, + "step": 11075 + }, + { + "epoch": 10.04524886877828, + "grad_norm": 1.6310497522354126, + "learning_rate": 0.00014999992422143224, + "loss": 5.0658, + "step": 11100 + }, + { + "epoch": 10.06787330316742, + "grad_norm": 0.8968245983123779, + "learning_rate": 0.00014999981663462063, + "loss": 5.2153, + "step": 11125 + }, + { + "epoch": 10.09049773755656, + "grad_norm": 0.8865295052528381, + "learning_rate": 0.00014999966227102431, + "loss": 4.9703, + "step": 11150 + }, + { + "epoch": 10.113122171945701, + "grad_norm": 1.1915286779403687, + "learning_rate": 0.00014999946113073947, + "loss": 5.0086, + "step": 11175 + }, + { + "epoch": 10.135746606334841, + "grad_norm": 1.1938319206237793, + "learning_rate": 0.00014999921321389164, + "loss": 5.128, + "step": 11200 + }, + { + "epoch": 10.158371040723981, + "grad_norm": 1.2753313779830933, + "learning_rate": 0.00014999891852063535, + "loss": 5.0999, + "step": 11225 + }, + { + "epoch": 10.180995475113122, + "grad_norm": 0.9530912637710571, + "learning_rate": 0.0001499985770511545, + "loss": 5.2185, + "step": 11250 + }, + { + "epoch": 10.203619909502262, + "grad_norm": 1.3997453451156616, + "learning_rate": 
0.000149998188805662, + "loss": 5.0885, + "step": 11275 + }, + { + "epoch": 10.226244343891402, + "grad_norm": 1.0850180387496948, + "learning_rate": 0.00014999775378440005, + "loss": 5.154, + "step": 11300 + }, + { + "epoch": 10.248868778280542, + "grad_norm": 1.292991042137146, + "learning_rate": 0.00014999727198763987, + "loss": 5.1104, + "step": 11325 + }, + { + "epoch": 10.271493212669684, + "grad_norm": 1.158610463142395, + "learning_rate": 0.00014999674341568207, + "loss": 5.1012, + "step": 11350 + }, + { + "epoch": 10.294117647058824, + "grad_norm": 1.1036847829818726, + "learning_rate": 0.00014999616806885623, + "loss": 5.2547, + "step": 11375 + }, + { + "epoch": 10.316742081447964, + "grad_norm": 2.1555657386779785, + "learning_rate": 0.00014999554594752123, + "loss": 5.1573, + "step": 11400 + }, + { + "epoch": 10.339366515837105, + "grad_norm": 1.2989445924758911, + "learning_rate": 0.00014999487705206506, + "loss": 5.0822, + "step": 11425 + }, + { + "epoch": 10.361990950226245, + "grad_norm": 0.9577361345291138, + "learning_rate": 0.00014999416138290492, + "loss": 5.1344, + "step": 11450 + }, + { + "epoch": 10.384615384615385, + "grad_norm": 1.484882116317749, + "learning_rate": 0.00014999339894048718, + "loss": 5.0269, + "step": 11475 + }, + { + "epoch": 10.407239819004525, + "grad_norm": 0.9797399044036865, + "learning_rate": 0.00014999258972528734, + "loss": 5.0701, + "step": 11500 + }, + { + "epoch": 10.429864253393665, + "grad_norm": 1.0783559083938599, + "learning_rate": 0.00014999173373781013, + "loss": 5.2029, + "step": 11525 + }, + { + "epoch": 10.452488687782806, + "grad_norm": 1.226446270942688, + "learning_rate": 0.0001499908309785894, + "loss": 5.0235, + "step": 11550 + }, + { + "epoch": 10.475113122171946, + "grad_norm": 1.1730021238327026, + "learning_rate": 0.0001499898814481882, + "loss": 5.13, + "step": 11575 + }, + { + "epoch": 10.497737556561086, + "grad_norm": 1.233260989189148, + "learning_rate": 0.00014998888514719874, + "loss": 
5.1153, + "step": 11600 + }, + { + "epoch": 10.520361990950226, + "grad_norm": 1.1690484285354614, + "learning_rate": 0.0001499878420762424, + "loss": 5.1357, + "step": 11625 + }, + { + "epoch": 10.542986425339366, + "grad_norm": 1.0624953508377075, + "learning_rate": 0.0001499867522359698, + "loss": 5.1712, + "step": 11650 + }, + { + "epoch": 10.565610859728507, + "grad_norm": 1.3550862073898315, + "learning_rate": 0.00014998561562706055, + "loss": 5.2174, + "step": 11675 + }, + { + "epoch": 10.588235294117647, + "grad_norm": 1.1426056623458862, + "learning_rate": 0.0001499844322502236, + "loss": 5.1503, + "step": 11700 + }, + { + "epoch": 10.610859728506787, + "grad_norm": 1.130356788635254, + "learning_rate": 0.00014998320210619706, + "loss": 5.2003, + "step": 11725 + }, + { + "epoch": 10.633484162895927, + "grad_norm": 2.088479518890381, + "learning_rate": 0.00014998192519574807, + "loss": 5.0855, + "step": 11750 + }, + { + "epoch": 10.656108597285067, + "grad_norm": 1.282894253730774, + "learning_rate": 0.0001499806015196731, + "loss": 5.2212, + "step": 11775 + }, + { + "epoch": 10.678733031674208, + "grad_norm": 1.458074927330017, + "learning_rate": 0.0001499792310787977, + "loss": 5.1635, + "step": 11800 + }, + { + "epoch": 10.701357466063348, + "grad_norm": 1.0478781461715698, + "learning_rate": 0.00014997781387397657, + "loss": 5.1052, + "step": 11825 + }, + { + "epoch": 10.723981900452488, + "grad_norm": 1.191235065460205, + "learning_rate": 0.00014997634990609367, + "loss": 5.2958, + "step": 11850 + }, + { + "epoch": 10.74660633484163, + "grad_norm": 1.4063588380813599, + "learning_rate": 0.000149974839176062, + "loss": 5.17, + "step": 11875 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 1.1008962392807007, + "learning_rate": 0.00014997328168482384, + "loss": 5.082, + "step": 11900 + }, + { + "epoch": 10.79185520361991, + "grad_norm": 1.262567162513733, + "learning_rate": 0.00014997167743335054, + "loss": 5.2031, + "step": 11925 + }, + { + 
"epoch": 10.81447963800905, + "grad_norm": 0.9310780763626099, + "learning_rate": 0.00014997002642264272, + "loss": 5.1467, + "step": 11950 + }, + { + "epoch": 10.83710407239819, + "grad_norm": 1.0549923181533813, + "learning_rate": 0.00014996832865373004, + "loss": 5.0579, + "step": 11975 + }, + { + "epoch": 10.85972850678733, + "grad_norm": 1.1291685104370117, + "learning_rate": 0.0001499665841276714, + "loss": 5.0962, + "step": 12000 + }, + { + "epoch": 10.882352941176471, + "grad_norm": 1.5607845783233643, + "learning_rate": 0.00014996479284555488, + "loss": 5.2022, + "step": 12025 + }, + { + "epoch": 10.904977375565611, + "grad_norm": 1.444187045097351, + "learning_rate": 0.00014996295480849766, + "loss": 5.1123, + "step": 12050 + }, + { + "epoch": 10.927601809954751, + "grad_norm": 0.8266554474830627, + "learning_rate": 0.0001499610700176461, + "loss": 5.2064, + "step": 12075 + }, + { + "epoch": 10.950226244343892, + "grad_norm": 0.994787335395813, + "learning_rate": 0.00014995913847417575, + "loss": 5.2265, + "step": 12100 + }, + { + "epoch": 10.972850678733032, + "grad_norm": 1.2321454286575317, + "learning_rate": 0.0001499571601792913, + "loss": 5.1676, + "step": 12125 + }, + { + "epoch": 10.995475113122172, + "grad_norm": 1.3413864374160767, + "learning_rate": 0.0001499551351342266, + "loss": 5.1838, + "step": 12150 + }, + { + "epoch": 11.018099547511312, + "grad_norm": 1.178889274597168, + "learning_rate": 0.00014995306334024462, + "loss": 5.1451, + "step": 12175 + }, + { + "epoch": 11.040723981900452, + "grad_norm": 1.1773490905761719, + "learning_rate": 0.00014995094479863756, + "loss": 5.036, + "step": 12200 + }, + { + "epoch": 11.063348416289593, + "grad_norm": 1.0849095582962036, + "learning_rate": 0.0001499487795107267, + "loss": 4.9792, + "step": 12225 + }, + { + "epoch": 11.085972850678733, + "grad_norm": 1.5654512643814087, + "learning_rate": 0.00014994656747786256, + "loss": 5.0234, + "step": 12250 + }, + { + "epoch": 11.108597285067873, + 
"grad_norm": 1.1146122217178345, + "learning_rate": 0.00014994430870142472, + "loss": 4.9398, + "step": 12275 + }, + { + "epoch": 11.131221719457013, + "grad_norm": 1.0082900524139404, + "learning_rate": 0.00014994200318282198, + "loss": 4.9555, + "step": 12300 + }, + { + "epoch": 11.153846153846153, + "grad_norm": 1.175589919090271, + "learning_rate": 0.0001499396509234923, + "loss": 5.0247, + "step": 12325 + }, + { + "epoch": 11.176470588235293, + "grad_norm": 1.3833733797073364, + "learning_rate": 0.0001499372519249027, + "loss": 4.9253, + "step": 12350 + }, + { + "epoch": 11.199095022624434, + "grad_norm": 1.128486156463623, + "learning_rate": 0.00014993480618854952, + "loss": 5.0122, + "step": 12375 + }, + { + "epoch": 11.221719457013574, + "grad_norm": 1.1842920780181885, + "learning_rate": 0.00014993231371595802, + "loss": 5.0244, + "step": 12400 + }, + { + "epoch": 11.244343891402714, + "grad_norm": 1.0491068363189697, + "learning_rate": 0.00014992977450868284, + "loss": 5.046, + "step": 12425 + }, + { + "epoch": 11.266968325791856, + "grad_norm": 1.563358187675476, + "learning_rate": 0.00014992718856830762, + "loss": 4.9557, + "step": 12450 + }, + { + "epoch": 11.289592760180996, + "grad_norm": 1.1112754344940186, + "learning_rate": 0.00014992455589644515, + "loss": 5.0276, + "step": 12475 + }, + { + "epoch": 11.312217194570136, + "grad_norm": 1.1768701076507568, + "learning_rate": 0.00014992187649473748, + "loss": 4.9013, + "step": 12500 + }, + { + "epoch": 11.334841628959277, + "grad_norm": 1.175905704498291, + "learning_rate": 0.0001499191503648557, + "loss": 5.0562, + "step": 12525 + }, + { + "epoch": 11.357466063348417, + "grad_norm": 1.1734528541564941, + "learning_rate": 0.0001499163775085001, + "loss": 4.9798, + "step": 12550 + }, + { + "epoch": 11.380090497737557, + "grad_norm": 1.2192497253417969, + "learning_rate": 0.00014991355792740003, + "loss": 5.0336, + "step": 12575 + }, + { + "epoch": 11.402714932126697, + "grad_norm": 0.9738785028457642, 
+ "learning_rate": 0.00014991069162331405, + "loss": 4.947, + "step": 12600 + }, + { + "epoch": 11.425339366515837, + "grad_norm": 1.001910924911499, + "learning_rate": 0.00014990777859802992, + "loss": 5.0363, + "step": 12625 + }, + { + "epoch": 11.447963800904978, + "grad_norm": 1.4487667083740234, + "learning_rate": 0.0001499048188533644, + "loss": 5.0579, + "step": 12650 + }, + { + "epoch": 11.470588235294118, + "grad_norm": 0.9874216318130493, + "learning_rate": 0.00014990181239116348, + "loss": 5.1646, + "step": 12675 + }, + { + "epoch": 11.493212669683258, + "grad_norm": 1.091408371925354, + "learning_rate": 0.00014989875921330229, + "loss": 5.1041, + "step": 12700 + }, + { + "epoch": 11.515837104072398, + "grad_norm": 0.8557041883468628, + "learning_rate": 0.00014989565932168504, + "loss": 5.033, + "step": 12725 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 1.2733933925628662, + "learning_rate": 0.00014989251271824513, + "loss": 5.0351, + "step": 12750 + }, + { + "epoch": 11.561085972850679, + "grad_norm": 1.1447516679763794, + "learning_rate": 0.00014988931940494507, + "loss": 5.0698, + "step": 12775 + }, + { + "epoch": 11.583710407239819, + "grad_norm": 1.2866042852401733, + "learning_rate": 0.00014988607938377647, + "loss": 5.0858, + "step": 12800 + }, + { + "epoch": 11.606334841628959, + "grad_norm": 0.9531421065330505, + "learning_rate": 0.00014988279265676013, + "loss": 5.0072, + "step": 12825 + }, + { + "epoch": 11.628959276018099, + "grad_norm": 1.4517631530761719, + "learning_rate": 0.00014987945922594599, + "loss": 5.0316, + "step": 12850 + }, + { + "epoch": 11.65158371040724, + "grad_norm": 1.2381407022476196, + "learning_rate": 0.00014987607909341304, + "loss": 5.0895, + "step": 12875 + }, + { + "epoch": 11.67420814479638, + "grad_norm": 1.1914258003234863, + "learning_rate": 0.00014987265226126944, + "loss": 4.9131, + "step": 12900 + }, + { + "epoch": 11.69683257918552, + "grad_norm": 1.4007694721221924, + "learning_rate": 
0.00014986917873165248, + "loss": 5.0471, + "step": 12925 + }, + { + "epoch": 11.71945701357466, + "grad_norm": 1.4290871620178223, + "learning_rate": 0.0001498656585067286, + "loss": 5.0424, + "step": 12950 + }, + { + "epoch": 11.742081447963802, + "grad_norm": 1.190192699432373, + "learning_rate": 0.00014986209158869332, + "loss": 5.0347, + "step": 12975 + }, + { + "epoch": 11.764705882352942, + "grad_norm": 1.1445356607437134, + "learning_rate": 0.0001498584779797713, + "loss": 5.0117, + "step": 13000 + }, + { + "epoch": 11.787330316742082, + "grad_norm": 1.3588181734085083, + "learning_rate": 0.0001498548176822163, + "loss": 5.0594, + "step": 13025 + }, + { + "epoch": 11.809954751131222, + "grad_norm": 1.5752766132354736, + "learning_rate": 0.00014985111069831122, + "loss": 5.1229, + "step": 13050 + }, + { + "epoch": 11.832579185520363, + "grad_norm": 1.3154525756835938, + "learning_rate": 0.00014984735703036812, + "loss": 5.1844, + "step": 13075 + }, + { + "epoch": 11.855203619909503, + "grad_norm": 0.9763768315315247, + "learning_rate": 0.0001498435566807281, + "loss": 5.1022, + "step": 13100 + }, + { + "epoch": 11.877828054298643, + "grad_norm": 1.2734490633010864, + "learning_rate": 0.00014983970965176137, + "loss": 5.0628, + "step": 13125 + }, + { + "epoch": 11.900452488687783, + "grad_norm": 1.2248128652572632, + "learning_rate": 0.00014983581594586737, + "loss": 5.0853, + "step": 13150 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 1.1970940828323364, + "learning_rate": 0.00014983187556547454, + "loss": 4.9832, + "step": 13175 + }, + { + "epoch": 11.945701357466064, + "grad_norm": 1.3674147129058838, + "learning_rate": 0.00014982788851304046, + "loss": 5.0107, + "step": 13200 + }, + { + "epoch": 11.968325791855204, + "grad_norm": 0.918038547039032, + "learning_rate": 0.0001498238547910518, + "loss": 5.114, + "step": 13225 + }, + { + "epoch": 11.990950226244344, + "grad_norm": 0.9390908479690552, + "learning_rate": 0.0001498197744020244, + "loss": 
5.1094, + "step": 13250 + }, + { + "epoch": 12.013574660633484, + "grad_norm": 1.1689878702163696, + "learning_rate": 0.00014981564734850312, + "loss": 4.8828, + "step": 13275 + }, + { + "epoch": 12.036199095022624, + "grad_norm": 1.544630527496338, + "learning_rate": 0.00014981147363306202, + "loss": 4.9284, + "step": 13300 + }, + { + "epoch": 12.058823529411764, + "grad_norm": 1.171164870262146, + "learning_rate": 0.00014980725325830418, + "loss": 4.8129, + "step": 13325 + }, + { + "epoch": 12.081447963800905, + "grad_norm": 1.3011549711227417, + "learning_rate": 0.00014980298622686183, + "loss": 4.9128, + "step": 13350 + }, + { + "epoch": 12.104072398190045, + "grad_norm": 1.7315293550491333, + "learning_rate": 0.00014979867254139628, + "loss": 4.7659, + "step": 13375 + }, + { + "epoch": 12.126696832579185, + "grad_norm": 1.319021224975586, + "learning_rate": 0.00014979431220459792, + "loss": 4.9106, + "step": 13400 + }, + { + "epoch": 12.149321266968325, + "grad_norm": 1.1789779663085938, + "learning_rate": 0.00014978990521918628, + "loss": 4.8845, + "step": 13425 + }, + { + "epoch": 12.171945701357465, + "grad_norm": 1.3722392320632935, + "learning_rate": 0.00014978545158791, + "loss": 4.8086, + "step": 13450 + }, + { + "epoch": 12.194570135746606, + "grad_norm": 1.3678064346313477, + "learning_rate": 0.00014978095131354665, + "loss": 4.8411, + "step": 13475 + }, + { + "epoch": 12.217194570135746, + "grad_norm": 1.5004942417144775, + "learning_rate": 0.00014977640439890316, + "loss": 4.9215, + "step": 13500 + }, + { + "epoch": 12.239819004524886, + "grad_norm": 1.622236728668213, + "learning_rate": 0.00014977181084681532, + "loss": 4.8444, + "step": 13525 + }, + { + "epoch": 12.262443438914028, + "grad_norm": 1.582205891609192, + "learning_rate": 0.0001497671706601481, + "loss": 4.8589, + "step": 13550 + }, + { + "epoch": 12.285067873303168, + "grad_norm": 1.7920467853546143, + "learning_rate": 0.0001497624838417956, + "loss": 4.8509, + "step": 13575 + }, + { 
+ "epoch": 12.307692307692308, + "grad_norm": 1.2452937364578247, + "learning_rate": 0.00014975775039468086, + "loss": 4.9321, + "step": 13600 + }, + { + "epoch": 12.330316742081449, + "grad_norm": 1.0658358335494995, + "learning_rate": 0.00014975297032175617, + "loss": 4.9616, + "step": 13625 + }, + { + "epoch": 12.352941176470589, + "grad_norm": 1.2660192251205444, + "learning_rate": 0.0001497481436260028, + "loss": 4.9297, + "step": 13650 + }, + { + "epoch": 12.375565610859729, + "grad_norm": 0.9705486297607422, + "learning_rate": 0.0001497432703104311, + "loss": 4.8978, + "step": 13675 + }, + { + "epoch": 12.39819004524887, + "grad_norm": 1.2590233087539673, + "learning_rate": 0.00014973835037808056, + "loss": 4.8164, + "step": 13700 + }, + { + "epoch": 12.42081447963801, + "grad_norm": 1.3895432949066162, + "learning_rate": 0.00014973338383201965, + "loss": 4.9924, + "step": 13725 + }, + { + "epoch": 12.44343891402715, + "grad_norm": 1.2928717136383057, + "learning_rate": 0.000149728370675346, + "loss": 4.9683, + "step": 13750 + }, + { + "epoch": 12.46606334841629, + "grad_norm": 1.352980375289917, + "learning_rate": 0.00014972331091118627, + "loss": 4.8819, + "step": 13775 + }, + { + "epoch": 12.48868778280543, + "grad_norm": 1.4463528394699097, + "learning_rate": 0.00014971820454269622, + "loss": 4.9767, + "step": 13800 + }, + { + "epoch": 12.51131221719457, + "grad_norm": 1.238166093826294, + "learning_rate": 0.0001497130515730606, + "loss": 4.987, + "step": 13825 + }, + { + "epoch": 12.53393665158371, + "grad_norm": 1.2077312469482422, + "learning_rate": 0.00014970785200549332, + "loss": 4.9379, + "step": 13850 + }, + { + "epoch": 12.55656108597285, + "grad_norm": 1.2092487812042236, + "learning_rate": 0.00014970260584323724, + "loss": 4.9096, + "step": 13875 + }, + { + "epoch": 12.57918552036199, + "grad_norm": 1.4933909177780151, + "learning_rate": 0.00014969731308956443, + "loss": 4.9451, + "step": 13900 + }, + { + "epoch": 12.60180995475113, + 
"grad_norm": 1.228698968887329, + "learning_rate": 0.0001496921882158976, + "loss": 4.8687, + "step": 13925 + }, + { + "epoch": 12.624434389140271, + "grad_norm": 1.2857065200805664, + "learning_rate": 0.00014968680415265059, + "loss": 4.9593, + "step": 13950 + }, + { + "epoch": 12.647058823529411, + "grad_norm": 1.3414708375930786, + "learning_rate": 0.00014968137350784223, + "loss": 4.9016, + "step": 13975 + }, + { + "epoch": 12.669683257918551, + "grad_norm": 1.3420109748840332, + "learning_rate": 0.00014967589628485953, + "loss": 4.8301, + "step": 14000 + }, + { + "epoch": 12.692307692307692, + "grad_norm": 1.5517535209655762, + "learning_rate": 0.00014967037248711856, + "loss": 4.9292, + "step": 14025 + }, + { + "epoch": 12.714932126696832, + "grad_norm": 1.4037293195724487, + "learning_rate": 0.00014966480211806458, + "loss": 5.0179, + "step": 14050 + }, + { + "epoch": 12.737556561085974, + "grad_norm": 1.1001484394073486, + "learning_rate": 0.00014965918518117168, + "loss": 4.9745, + "step": 14075 + }, + { + "epoch": 12.760180995475114, + "grad_norm": 1.636365532875061, + "learning_rate": 0.00014965352167994317, + "loss": 4.9478, + "step": 14100 + }, + { + "epoch": 12.782805429864254, + "grad_norm": 1.43099045753479, + "learning_rate": 0.00014964781161791126, + "loss": 5.0, + "step": 14125 + }, + { + "epoch": 12.805429864253394, + "grad_norm": 1.5528738498687744, + "learning_rate": 0.0001496420549986373, + "loss": 4.8686, + "step": 14150 + }, + { + "epoch": 12.828054298642535, + "grad_norm": 1.145129680633545, + "learning_rate": 0.0001496362518257117, + "loss": 5.0184, + "step": 14175 + }, + { + "epoch": 12.850678733031675, + "grad_norm": 1.2828490734100342, + "learning_rate": 0.00014963040210275378, + "loss": 4.8833, + "step": 14200 + }, + { + "epoch": 12.873303167420815, + "grad_norm": 1.3507344722747803, + "learning_rate": 0.00014962450583341202, + "loss": 5.0941, + "step": 14225 + }, + { + "epoch": 12.895927601809955, + "grad_norm": 1.2493312358856201, + 
"learning_rate": 0.00014961856302136381, + "loss": 4.987, + "step": 14250 + }, + { + "epoch": 12.918552036199095, + "grad_norm": 0.9961503148078918, + "learning_rate": 0.00014961257367031568, + "loss": 5.085, + "step": 14275 + }, + { + "epoch": 12.941176470588236, + "grad_norm": 1.2279330492019653, + "learning_rate": 0.00014960653778400317, + "loss": 5.0184, + "step": 14300 + }, + { + "epoch": 12.963800904977376, + "grad_norm": 1.4541242122650146, + "learning_rate": 0.00014960045536619075, + "loss": 4.9985, + "step": 14325 + }, + { + "epoch": 12.986425339366516, + "grad_norm": 1.1447900533676147, + "learning_rate": 0.000149594326420672, + "loss": 4.9128, + "step": 14350 + }, + { + "epoch": 13.009049773755656, + "grad_norm": 1.1077444553375244, + "learning_rate": 0.0001495881509512695, + "loss": 5.0568, + "step": 14375 + }, + { + "epoch": 13.031674208144796, + "grad_norm": 1.2826627492904663, + "learning_rate": 0.00014958192896183484, + "loss": 4.8062, + "step": 14400 + }, + { + "epoch": 13.054298642533936, + "grad_norm": 1.337775468826294, + "learning_rate": 0.00014957566045624863, + "loss": 4.7318, + "step": 14425 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 1.5631524324417114, + "learning_rate": 0.00014956934543842047, + "loss": 4.7804, + "step": 14450 + }, + { + "epoch": 13.099547511312217, + "grad_norm": 1.4884055852890015, + "learning_rate": 0.000149562983912289, + "loss": 4.8913, + "step": 14475 + }, + { + "epoch": 13.122171945701357, + "grad_norm": 1.8440004587173462, + "learning_rate": 0.00014955657588182182, + "loss": 4.7437, + "step": 14500 + }, + { + "epoch": 13.144796380090497, + "grad_norm": 1.6917012929916382, + "learning_rate": 0.0001495501213510156, + "loss": 4.6571, + "step": 14525 + }, + { + "epoch": 13.167420814479637, + "grad_norm": 1.4976379871368408, + "learning_rate": 0.000149543620323896, + "loss": 4.8869, + "step": 14550 + }, + { + "epoch": 13.190045248868778, + "grad_norm": 1.5437226295471191, + "learning_rate": 
0.00014953707280451764, + "loss": 4.8286, + "step": 14575 + }, + { + "epoch": 13.212669683257918, + "grad_norm": 1.7578870058059692, + "learning_rate": 0.00014953047879696414, + "loss": 4.8591, + "step": 14600 + }, + { + "epoch": 13.235294117647058, + "grad_norm": 1.410611867904663, + "learning_rate": 0.00014952383830534814, + "loss": 4.7898, + "step": 14625 + }, + { + "epoch": 13.2579185520362, + "grad_norm": 1.135124683380127, + "learning_rate": 0.00014951715133381123, + "loss": 4.7035, + "step": 14650 + }, + { + "epoch": 13.28054298642534, + "grad_norm": 1.2869302034378052, + "learning_rate": 0.00014951041788652407, + "loss": 4.8667, + "step": 14675 + }, + { + "epoch": 13.30316742081448, + "grad_norm": 2.459376811981201, + "learning_rate": 0.00014950363796768624, + "loss": 4.7734, + "step": 14700 + }, + { + "epoch": 13.32579185520362, + "grad_norm": 1.4525374174118042, + "learning_rate": 0.00014949681158152631, + "loss": 4.9633, + "step": 14725 + }, + { + "epoch": 13.34841628959276, + "grad_norm": 1.7316921949386597, + "learning_rate": 0.00014948993873230187, + "loss": 4.894, + "step": 14750 + }, + { + "epoch": 13.371040723981901, + "grad_norm": 1.5175212621688843, + "learning_rate": 0.00014948301942429941, + "loss": 4.8522, + "step": 14775 + }, + { + "epoch": 13.393665158371041, + "grad_norm": 1.1831189393997192, + "learning_rate": 0.0001494760536618345, + "loss": 4.8597, + "step": 14800 + }, + { + "epoch": 13.416289592760181, + "grad_norm": 1.3064175844192505, + "learning_rate": 0.0001494690414492516, + "loss": 4.7122, + "step": 14825 + }, + { + "epoch": 13.438914027149321, + "grad_norm": 1.6598682403564453, + "learning_rate": 0.0001494619827909242, + "loss": 4.835, + "step": 14850 + }, + { + "epoch": 13.461538461538462, + "grad_norm": 1.4541395902633667, + "learning_rate": 0.00014945487769125467, + "loss": 4.8324, + "step": 14875 + }, + { + "epoch": 13.484162895927602, + "grad_norm": 1.1823992729187012, + "learning_rate": 0.00014944772615467448, + "loss": 
4.8323, + "step": 14900 + }, + { + "epoch": 13.506787330316742, + "grad_norm": 1.439037799835205, + "learning_rate": 0.00014944052818564394, + "loss": 4.7843, + "step": 14925 + }, + { + "epoch": 13.529411764705882, + "grad_norm": 1.3563629388809204, + "learning_rate": 0.00014943328378865236, + "loss": 4.8002, + "step": 14950 + }, + { + "epoch": 13.552036199095022, + "grad_norm": 1.4317904710769653, + "learning_rate": 0.00014942599296821803, + "loss": 4.8379, + "step": 14975 + }, + { + "epoch": 13.574660633484163, + "grad_norm": 2.0179126262664795, + "learning_rate": 0.00014941865572888816, + "loss": 4.6603, + "step": 15000 + }, + { + "epoch": 13.597285067873303, + "grad_norm": 1.239160418510437, + "learning_rate": 0.00014941127207523898, + "loss": 4.8227, + "step": 15025 + }, + { + "epoch": 13.619909502262443, + "grad_norm": 1.4565255641937256, + "learning_rate": 0.00014940384201187553, + "loss": 4.8444, + "step": 15050 + }, + { + "epoch": 13.642533936651583, + "grad_norm": 1.5881317853927612, + "learning_rate": 0.00014939636554343194, + "loss": 4.8009, + "step": 15075 + }, + { + "epoch": 13.665158371040723, + "grad_norm": 1.9541592597961426, + "learning_rate": 0.0001493891444801549, + "loss": 4.7418, + "step": 15100 + }, + { + "epoch": 13.687782805429864, + "grad_norm": 1.6791893243789673, + "learning_rate": 0.00014938157707130754, + "loss": 4.8094, + "step": 15125 + }, + { + "epoch": 13.710407239819004, + "grad_norm": 1.267594814300537, + "learning_rate": 0.0001493739632712665, + "loss": 4.813, + "step": 15150 + }, + { + "epoch": 13.733031674208144, + "grad_norm": 1.4446032047271729, + "learning_rate": 0.00014936630308478042, + "loss": 4.7961, + "step": 15175 + }, + { + "epoch": 13.755656108597286, + "grad_norm": 1.5222852230072021, + "learning_rate": 0.00014935859651662696, + "loss": 4.8379, + "step": 15200 + }, + { + "epoch": 13.778280542986426, + "grad_norm": 2.0166687965393066, + "learning_rate": 0.00014935084357161255, + "loss": 4.7495, + "step": 15225 + }, 
+ { + "epoch": 13.800904977375566, + "grad_norm": 1.8037384748458862, + "learning_rate": 0.0001493430442545727, + "loss": 4.844, + "step": 15250 + }, + { + "epoch": 13.823529411764707, + "grad_norm": 1.2547695636749268, + "learning_rate": 0.0001493351985703718, + "loss": 4.7547, + "step": 15275 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 1.2022550106048584, + "learning_rate": 0.0001493273065239031, + "loss": 4.8512, + "step": 15300 + }, + { + "epoch": 13.868778280542987, + "grad_norm": 1.9524818658828735, + "learning_rate": 0.0001493193681200888, + "loss": 4.7884, + "step": 15325 + }, + { + "epoch": 13.891402714932127, + "grad_norm": 1.3403666019439697, + "learning_rate": 0.00014931138336388004, + "loss": 4.7881, + "step": 15350 + }, + { + "epoch": 13.914027149321267, + "grad_norm": 1.6197713613510132, + "learning_rate": 0.00014930335226025684, + "loss": 4.8554, + "step": 15375 + }, + { + "epoch": 13.936651583710407, + "grad_norm": 1.4861394166946411, + "learning_rate": 0.00014929527481422815, + "loss": 4.8498, + "step": 15400 + }, + { + "epoch": 13.959276018099548, + "grad_norm": 1.1756571531295776, + "learning_rate": 0.00014928715103083178, + "loss": 4.8845, + "step": 15425 + }, + { + "epoch": 13.981900452488688, + "grad_norm": 1.8476282358169556, + "learning_rate": 0.0001492789809151345, + "loss": 4.7783, + "step": 15450 + }, + { + "epoch": 14.004524886877828, + "grad_norm": 1.3503631353378296, + "learning_rate": 0.00014927076447223195, + "loss": 4.7305, + "step": 15475 + }, + { + "epoch": 14.027149321266968, + "grad_norm": 1.109547734260559, + "learning_rate": 0.00014926250170724863, + "loss": 4.6489, + "step": 15500 + }, + { + "epoch": 14.049773755656108, + "grad_norm": 1.9085489511489868, + "learning_rate": 0.00014925419262533794, + "loss": 4.64, + "step": 15525 + }, + { + "epoch": 14.072398190045249, + "grad_norm": 1.5549107789993286, + "learning_rate": 0.00014924583723168226, + "loss": 4.6353, + "step": 15550 + }, + { + "epoch": 
14.095022624434389, + "grad_norm": 2.0388152599334717, + "learning_rate": 0.00014923743553149271, + "loss": 4.6727, + "step": 15575 + }, + { + "epoch": 14.117647058823529, + "grad_norm": 1.3196210861206055, + "learning_rate": 0.00014922898753000943, + "loss": 4.7442, + "step": 15600 + }, + { + "epoch": 14.14027149321267, + "grad_norm": 1.8600906133651733, + "learning_rate": 0.00014922049323250132, + "loss": 4.6711, + "step": 15625 + }, + { + "epoch": 14.16289592760181, + "grad_norm": 1.3792847394943237, + "learning_rate": 0.0001492119526442662, + "loss": 4.5971, + "step": 15650 + }, + { + "epoch": 14.18552036199095, + "grad_norm": 1.8213170766830444, + "learning_rate": 0.0001492033657706308, + "loss": 4.5187, + "step": 15675 + }, + { + "epoch": 14.20814479638009, + "grad_norm": 1.7856183052062988, + "learning_rate": 0.00014919473261695067, + "loss": 4.6313, + "step": 15700 + }, + { + "epoch": 14.23076923076923, + "grad_norm": 1.993672490119934, + "learning_rate": 0.00014918605318861027, + "loss": 4.8331, + "step": 15725 + }, + { + "epoch": 14.25339366515837, + "grad_norm": 1.207847237586975, + "learning_rate": 0.00014917732749102284, + "loss": 4.7103, + "step": 15750 + }, + { + "epoch": 14.276018099547512, + "grad_norm": 1.5237271785736084, + "learning_rate": 0.00014916855552963052, + "loss": 4.6217, + "step": 15775 + }, + { + "epoch": 14.298642533936652, + "grad_norm": 1.4603103399276733, + "learning_rate": 0.00014915973730990437, + "loss": 4.5937, + "step": 15800 + }, + { + "epoch": 14.321266968325792, + "grad_norm": 1.55234956741333, + "learning_rate": 0.00014915087283734422, + "loss": 4.6703, + "step": 15825 + }, + { + "epoch": 14.343891402714933, + "grad_norm": 1.4504629373550415, + "learning_rate": 0.00014914196211747875, + "loss": 4.7315, + "step": 15850 + }, + { + "epoch": 14.366515837104073, + "grad_norm": 1.5225130319595337, + "learning_rate": 0.00014913300515586553, + "loss": 4.8271, + "step": 15875 + }, + { + "epoch": 14.389140271493213, + "grad_norm": 
1.748902678489685, + "learning_rate": 0.00014912436297366587, + "loss": 4.7046, + "step": 15900 + }, + { + "epoch": 14.411764705882353, + "grad_norm": 1.5265341997146606, + "learning_rate": 0.00014911531539445877, + "loss": 4.5435, + "step": 15925 + }, + { + "epoch": 14.434389140271493, + "grad_norm": 2.092003107070923, + "learning_rate": 0.00014910622159012326, + "loss": 4.7161, + "step": 15950 + }, + { + "epoch": 14.457013574660634, + "grad_norm": 1.8419088125228882, + "learning_rate": 0.00014909708156633108, + "loss": 4.6548, + "step": 15975 + }, + { + "epoch": 14.479638009049774, + "grad_norm": 1.6830592155456543, + "learning_rate": 0.00014908789532878277, + "loss": 4.6093, + "step": 16000 + }, + { + "epoch": 14.502262443438914, + "grad_norm": 2.628295660018921, + "learning_rate": 0.00014907866288320774, + "loss": 4.6621, + "step": 16025 + }, + { + "epoch": 14.524886877828054, + "grad_norm": 1.7091392278671265, + "learning_rate": 0.00014906938423536417, + "loss": 4.6033, + "step": 16050 + }, + { + "epoch": 14.547511312217194, + "grad_norm": 1.7395411729812622, + "learning_rate": 0.00014906005939103906, + "loss": 4.5248, + "step": 16075 + }, + { + "epoch": 14.570135746606335, + "grad_norm": 1.5759046077728271, + "learning_rate": 0.00014905068835604826, + "loss": 4.6969, + "step": 16100 + }, + { + "epoch": 14.592760180995475, + "grad_norm": 1.5003995895385742, + "learning_rate": 0.00014904127113623644, + "loss": 4.6489, + "step": 16125 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 2.510037660598755, + "learning_rate": 0.000149031807737477, + "loss": 4.9008, + "step": 16150 + }, + { + "epoch": 14.638009049773755, + "grad_norm": 2.241529941558838, + "learning_rate": 0.0001490222981656722, + "loss": 4.7296, + "step": 16175 + }, + { + "epoch": 14.660633484162895, + "grad_norm": 1.7212368249893188, + "learning_rate": 0.0001490127424267531, + "loss": 4.7851, + "step": 16200 + }, + { + "epoch": 14.683257918552036, + "grad_norm": 1.4144231081008911, + 
"learning_rate": 0.00014900314052667952, + "loss": 4.8715, + "step": 16225 + }, + { + "epoch": 14.705882352941176, + "grad_norm": 1.6816970109939575, + "learning_rate": 0.00014899349247144008, + "loss": 4.8645, + "step": 16250 + }, + { + "epoch": 14.728506787330316, + "grad_norm": 1.9775121212005615, + "learning_rate": 0.00014898379826705223, + "loss": 4.6665, + "step": 16275 + }, + { + "epoch": 14.751131221719458, + "grad_norm": 1.2360163927078247, + "learning_rate": 0.00014897405791956212, + "loss": 4.7686, + "step": 16300 + }, + { + "epoch": 14.773755656108598, + "grad_norm": 1.3875223398208618, + "learning_rate": 0.00014896427143504476, + "loss": 4.6434, + "step": 16325 + }, + { + "epoch": 14.796380090497738, + "grad_norm": 1.443358063697815, + "learning_rate": 0.0001489544388196039, + "loss": 4.6037, + "step": 16350 + }, + { + "epoch": 14.819004524886878, + "grad_norm": 1.9732576608657837, + "learning_rate": 0.00014894456007937204, + "loss": 4.6332, + "step": 16375 + }, + { + "epoch": 14.841628959276019, + "grad_norm": 1.4852114915847778, + "learning_rate": 0.0001489346352205105, + "loss": 4.7808, + "step": 16400 + }, + { + "epoch": 14.864253393665159, + "grad_norm": 1.50007164478302, + "learning_rate": 0.00014892466424920933, + "loss": 4.6615, + "step": 16425 + }, + { + "epoch": 14.886877828054299, + "grad_norm": 1.3639907836914062, + "learning_rate": 0.00014891464717168732, + "loss": 4.7119, + "step": 16450 + }, + { + "epoch": 14.90950226244344, + "grad_norm": 1.2364827394485474, + "learning_rate": 0.00014890458399419209, + "loss": 4.7193, + "step": 16475 + }, + { + "epoch": 14.93212669683258, + "grad_norm": 1.5049721002578735, + "learning_rate": 0.0001488944747229999, + "loss": 4.6358, + "step": 16500 + }, + { + "epoch": 14.95475113122172, + "grad_norm": 1.619442343711853, + "learning_rate": 0.00014888431936441586, + "loss": 4.7165, + "step": 16525 + }, + { + "epoch": 14.97737556561086, + "grad_norm": 1.3429360389709473, + "learning_rate": 
0.00014887411792477377, + "loss": 4.7141, + "step": 16550 + }, + { + "epoch": 15.0, + "grad_norm": 1.5158376693725586, + "learning_rate": 0.00014886387041043622, + "loss": 4.769, + "step": 16575 + }, + { + "epoch": 15.02262443438914, + "grad_norm": 1.6161320209503174, + "learning_rate": 0.00014885357682779447, + "loss": 4.4315, + "step": 16600 + }, + { + "epoch": 15.04524886877828, + "grad_norm": 1.5701944828033447, + "learning_rate": 0.00014884323718326853, + "loss": 4.5037, + "step": 16625 + }, + { + "epoch": 15.06787330316742, + "grad_norm": 1.572098970413208, + "learning_rate": 0.00014883326779548884, + "loss": 4.5302, + "step": 16650 + }, + { + "epoch": 15.09049773755656, + "grad_norm": 1.624096393585205, + "learning_rate": 0.0001488228378884032, + "loss": 4.5176, + "step": 16675 + }, + { + "epoch": 15.113122171945701, + "grad_norm": 3.299954414367676, + "learning_rate": 0.00014881236193860494, + "loss": 4.5393, + "step": 16700 + }, + { + "epoch": 15.135746606334841, + "grad_norm": 1.5165592432022095, + "learning_rate": 0.000148801839952628, + "loss": 4.4814, + "step": 16725 + }, + { + "epoch": 15.158371040723981, + "grad_norm": 2.0233843326568604, + "learning_rate": 0.0001487912719370347, + "loss": 4.5063, + "step": 16750 + }, + { + "epoch": 15.180995475113122, + "grad_norm": 1.7269341945648193, + "learning_rate": 0.0001487806578984163, + "loss": 4.4549, + "step": 16775 + }, + { + "epoch": 15.203619909502262, + "grad_norm": 2.506732702255249, + "learning_rate": 0.0001487699978433927, + "loss": 4.5649, + "step": 16800 + }, + { + "epoch": 15.226244343891402, + "grad_norm": 1.6503850221633911, + "learning_rate": 0.0001487592917786125, + "loss": 4.4911, + "step": 16825 + }, + { + "epoch": 15.248868778280542, + "grad_norm": 2.2163994312286377, + "learning_rate": 0.00014874853971075293, + "loss": 4.3962, + "step": 16850 + }, + { + "epoch": 15.271493212669684, + "grad_norm": 2.2149605751037598, + "learning_rate": 0.00014873774164652, + "loss": 4.5588, + "step": 
16875 + }, + { + "epoch": 15.294117647058824, + "grad_norm": 1.7098424434661865, + "learning_rate": 0.00014872689759264839, + "loss": 4.5646, + "step": 16900 + }, + { + "epoch": 15.316742081447964, + "grad_norm": 1.7517985105514526, + "learning_rate": 0.00014871600755590142, + "loss": 4.5443, + "step": 16925 + }, + { + "epoch": 15.339366515837105, + "grad_norm": 2.4214367866516113, + "learning_rate": 0.00014870507154307114, + "loss": 4.5232, + "step": 16950 + }, + { + "epoch": 15.361990950226245, + "grad_norm": 1.5129008293151855, + "learning_rate": 0.00014869408956097826, + "loss": 4.5897, + "step": 16975 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 1.4799803495407104, + "learning_rate": 0.00014868306161647214, + "loss": 4.6032, + "step": 17000 + }, + { + "epoch": 15.407239819004525, + "grad_norm": 1.4959242343902588, + "learning_rate": 0.00014867198771643085, + "loss": 4.6202, + "step": 17025 + }, + { + "epoch": 15.429864253393665, + "grad_norm": 2.0012917518615723, + "learning_rate": 0.00014866086786776103, + "loss": 4.5087, + "step": 17050 + }, + { + "epoch": 15.452488687782806, + "grad_norm": 2.2068300247192383, + "learning_rate": 0.00014864970207739808, + "loss": 4.6576, + "step": 17075 + }, + { + "epoch": 15.475113122171946, + "grad_norm": 1.85796320438385, + "learning_rate": 0.00014863849035230602, + "loss": 4.597, + "step": 17100 + }, + { + "epoch": 15.497737556561086, + "grad_norm": 1.781432867050171, + "learning_rate": 0.0001486272326994775, + "loss": 4.5005, + "step": 17125 + }, + { + "epoch": 15.520361990950226, + "grad_norm": 1.7553761005401611, + "learning_rate": 0.00014861592912593385, + "loss": 4.564, + "step": 17150 + }, + { + "epoch": 15.542986425339366, + "grad_norm": 1.4817267656326294, + "learning_rate": 0.00014860457963872497, + "loss": 4.5873, + "step": 17175 + }, + { + "epoch": 15.565610859728507, + "grad_norm": 1.7653452157974243, + "learning_rate": 0.0001485931842449295, + "loss": 4.3915, + "step": 17200 + }, + { + "epoch": 
15.588235294117647, + "grad_norm": 1.4498839378356934, + "learning_rate": 0.00014858174295165463, + "loss": 4.5328, + "step": 17225 + }, + { + "epoch": 15.610859728506787, + "grad_norm": 1.341663122177124, + "learning_rate": 0.0001485702557660362, + "loss": 4.5389, + "step": 17250 + }, + { + "epoch": 15.633484162895927, + "grad_norm": 2.314711093902588, + "learning_rate": 0.00014855872269523866, + "loss": 4.5966, + "step": 17275 + }, + { + "epoch": 15.656108597285067, + "grad_norm": 1.455085039138794, + "learning_rate": 0.00014854714374645513, + "loss": 4.6546, + "step": 17300 + }, + { + "epoch": 15.678733031674208, + "grad_norm": 1.658861517906189, + "learning_rate": 0.0001485355189269073, + "loss": 4.7516, + "step": 17325 + }, + { + "epoch": 15.701357466063348, + "grad_norm": 1.514167308807373, + "learning_rate": 0.00014852384824384546, + "loss": 4.5733, + "step": 17350 + }, + { + "epoch": 15.723981900452488, + "grad_norm": 1.9092057943344116, + "learning_rate": 0.00014851213170454853, + "loss": 4.7214, + "step": 17375 + }, + { + "epoch": 15.74660633484163, + "grad_norm": 1.7849903106689453, + "learning_rate": 0.000148500369316324, + "loss": 4.5989, + "step": 17400 + }, + { + "epoch": 15.76923076923077, + "grad_norm": 2.334677219390869, + "learning_rate": 0.00014848856108650802, + "loss": 4.5126, + "step": 17425 + }, + { + "epoch": 15.79185520361991, + "grad_norm": 1.6683495044708252, + "learning_rate": 0.00014847670702246527, + "loss": 4.6018, + "step": 17450 + }, + { + "epoch": 15.81447963800905, + "grad_norm": 1.8269922733306885, + "learning_rate": 0.000148464807131589, + "loss": 4.5902, + "step": 17475 + }, + { + "epoch": 15.83710407239819, + "grad_norm": 1.3060346841812134, + "learning_rate": 0.00014845286142130116, + "loss": 4.7792, + "step": 17500 + }, + { + "epoch": 15.85972850678733, + "grad_norm": 2.0103907585144043, + "learning_rate": 0.0001484408698990521, + "loss": 4.696, + "step": 17525 + }, + { + "epoch": 15.882352941176471, + "grad_norm": 
1.733870029449463, + "learning_rate": 0.0001484288325723209, + "loss": 4.5294, + "step": 17550 + }, + { + "epoch": 15.904977375565611, + "grad_norm": 1.6223927736282349, + "learning_rate": 0.0001484167494486151, + "loss": 4.6102, + "step": 17575 + }, + { + "epoch": 15.927601809954751, + "grad_norm": 1.5040452480316162, + "learning_rate": 0.0001484046205354709, + "loss": 4.5765, + "step": 17600 + }, + { + "epoch": 15.950226244343892, + "grad_norm": 2.060255527496338, + "learning_rate": 0.00014839244584045295, + "loss": 4.5268, + "step": 17625 + }, + { + "epoch": 15.972850678733032, + "grad_norm": 1.6869535446166992, + "learning_rate": 0.00014838022537115453, + "loss": 4.6779, + "step": 17650 + }, + { + "epoch": 15.995475113122172, + "grad_norm": 1.6950558423995972, + "learning_rate": 0.00014836795913519748, + "loss": 4.4615, + "step": 17675 + }, + { + "epoch": 16.018099547511312, + "grad_norm": 1.6395280361175537, + "learning_rate": 0.0001483556471402321, + "loss": 4.3711, + "step": 17700 + }, + { + "epoch": 16.040723981900452, + "grad_norm": 2.1945571899414062, + "learning_rate": 0.00014834328939393733, + "loss": 4.3561, + "step": 17725 + }, + { + "epoch": 16.063348416289593, + "grad_norm": 1.7082488536834717, + "learning_rate": 0.0001483308859040206, + "loss": 4.4945, + "step": 17750 + }, + { + "epoch": 16.085972850678733, + "grad_norm": 2.0914411544799805, + "learning_rate": 0.00014831843667821777, + "loss": 4.2182, + "step": 17775 + }, + { + "epoch": 16.108597285067873, + "grad_norm": 2.07283878326416, + "learning_rate": 0.00014830594172429346, + "loss": 4.2707, + "step": 17800 + }, + { + "epoch": 16.131221719457013, + "grad_norm": 1.7264245748519897, + "learning_rate": 0.0001482934010500406, + "loss": 4.5788, + "step": 17825 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 1.8298250436782837, + "learning_rate": 0.0001482808146632807, + "loss": 4.5108, + "step": 17850 + }, + { + "epoch": 16.176470588235293, + "grad_norm": 1.6344093084335327, + 
"learning_rate": 0.00014826818257186383, + "loss": 4.2816, + "step": 17875 + }, + { + "epoch": 16.199095022624434, + "grad_norm": 1.8113044500350952, + "learning_rate": 0.00014825550478366847, + "loss": 4.3255, + "step": 17900 + }, + { + "epoch": 16.221719457013574, + "grad_norm": 1.8958951234817505, + "learning_rate": 0.0001482427813066017, + "loss": 4.3559, + "step": 17925 + }, + { + "epoch": 16.244343891402714, + "grad_norm": 1.710372805595398, + "learning_rate": 0.00014823001214859903, + "loss": 4.2967, + "step": 17950 + }, + { + "epoch": 16.266968325791854, + "grad_norm": 2.1480696201324463, + "learning_rate": 0.0001482171973176245, + "loss": 4.4731, + "step": 17975 + }, + { + "epoch": 16.289592760180994, + "grad_norm": 1.6352964639663696, + "learning_rate": 0.0001482043368216706, + "loss": 4.336, + "step": 18000 + }, + { + "epoch": 16.312217194570135, + "grad_norm": 1.9280736446380615, + "learning_rate": 0.00014819143066875832, + "loss": 4.2713, + "step": 18025 + }, + { + "epoch": 16.334841628959275, + "grad_norm": 2.057253837585449, + "learning_rate": 0.00014817847886693713, + "loss": 4.3916, + "step": 18050 + }, + { + "epoch": 16.357466063348415, + "grad_norm": 1.9234403371810913, + "learning_rate": 0.00014816548142428495, + "loss": 4.4578, + "step": 18075 + }, + { + "epoch": 16.380090497737555, + "grad_norm": 1.9432510137557983, + "learning_rate": 0.0001481524383489082, + "loss": 4.4288, + "step": 18100 + }, + { + "epoch": 16.402714932126695, + "grad_norm": 1.8046300411224365, + "learning_rate": 0.00014813934964894176, + "loss": 4.5106, + "step": 18125 + }, + { + "epoch": 16.425339366515836, + "grad_norm": 1.8123124837875366, + "learning_rate": 0.00014812621533254888, + "loss": 4.3949, + "step": 18150 + }, + { + "epoch": 16.447963800904976, + "grad_norm": 2.1310598850250244, + "learning_rate": 0.0001481130354079214, + "loss": 4.532, + "step": 18175 + }, + { + "epoch": 16.470588235294116, + "grad_norm": 2.790117025375366, + "learning_rate": 
0.0001480998098832795, + "loss": 4.3405, + "step": 18200 + }, + { + "epoch": 16.49321266968326, + "grad_norm": 2.3215723037719727, + "learning_rate": 0.00014808653876687185, + "loss": 4.4327, + "step": 18225 + }, + { + "epoch": 16.5158371040724, + "grad_norm": 1.8296551704406738, + "learning_rate": 0.0001480732220669755, + "loss": 4.5334, + "step": 18250 + }, + { + "epoch": 16.53846153846154, + "grad_norm": 2.881291627883911, + "learning_rate": 0.00014805985979189602, + "loss": 4.4545, + "step": 18275 + }, + { + "epoch": 16.56108597285068, + "grad_norm": 2.629380941390991, + "learning_rate": 0.0001480464519499673, + "loss": 4.4504, + "step": 18300 + }, + { + "epoch": 16.58371040723982, + "grad_norm": 1.830816626548767, + "learning_rate": 0.00014803299854955173, + "loss": 4.5261, + "step": 18325 + }, + { + "epoch": 16.60633484162896, + "grad_norm": 1.7896275520324707, + "learning_rate": 0.0001480194995990401, + "loss": 4.5665, + "step": 18350 + }, + { + "epoch": 16.6289592760181, + "grad_norm": 1.8729816675186157, + "learning_rate": 0.00014800595510685162, + "loss": 4.4799, + "step": 18375 + }, + { + "epoch": 16.65158371040724, + "grad_norm": 2.032773494720459, + "learning_rate": 0.0001479923650814338, + "loss": 4.4249, + "step": 18400 + }, + { + "epoch": 16.67420814479638, + "grad_norm": 2.0493974685668945, + "learning_rate": 0.0001479787295312627, + "loss": 4.4361, + "step": 18425 + }, + { + "epoch": 16.69683257918552, + "grad_norm": 1.7331079244613647, + "learning_rate": 0.0001479650484648427, + "loss": 4.4077, + "step": 18450 + }, + { + "epoch": 16.71945701357466, + "grad_norm": 1.6861578226089478, + "learning_rate": 0.00014795132189070653, + "loss": 4.4833, + "step": 18475 + }, + { + "epoch": 16.742081447963802, + "grad_norm": 1.9591151475906372, + "learning_rate": 0.0001479375498174154, + "loss": 4.5211, + "step": 18500 + }, + { + "epoch": 16.764705882352942, + "grad_norm": 1.5738307237625122, + "learning_rate": 0.00014792373225355879, + "loss": 4.5, + "step": 
18525 + }, + { + "epoch": 16.787330316742082, + "grad_norm": 2.668651819229126, + "learning_rate": 0.00014790986920775462, + "loss": 4.4206, + "step": 18550 + }, + { + "epoch": 16.809954751131222, + "grad_norm": 1.7280417680740356, + "learning_rate": 0.00014789596068864915, + "loss": 4.3528, + "step": 18575 + }, + { + "epoch": 16.832579185520363, + "grad_norm": 2.1722147464752197, + "learning_rate": 0.00014788200670491706, + "loss": 4.5144, + "step": 18600 + }, + { + "epoch": 16.855203619909503, + "grad_norm": 2.2709977626800537, + "learning_rate": 0.00014786800726526126, + "loss": 4.4895, + "step": 18625 + }, + { + "epoch": 16.877828054298643, + "grad_norm": 2.3444464206695557, + "learning_rate": 0.00014785396237841316, + "loss": 4.5319, + "step": 18650 + }, + { + "epoch": 16.900452488687783, + "grad_norm": 2.3623125553131104, + "learning_rate": 0.00014783987205313243, + "loss": 4.4935, + "step": 18675 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 2.035130262374878, + "learning_rate": 0.00014782573629820706, + "loss": 4.4686, + "step": 18700 + }, + { + "epoch": 16.945701357466064, + "grad_norm": 2.233100175857544, + "learning_rate": 0.00014781155512245343, + "loss": 4.5366, + "step": 18725 + }, + { + "epoch": 16.968325791855204, + "grad_norm": 1.819060206413269, + "learning_rate": 0.00014779732853471624, + "loss": 4.5277, + "step": 18750 + }, + { + "epoch": 16.990950226244344, + "grad_norm": 3.098139524459839, + "learning_rate": 0.00014778305654386848, + "loss": 4.3685, + "step": 18775 + }, + { + "epoch": 17.013574660633484, + "grad_norm": 2.7072527408599854, + "learning_rate": 0.00014776873915881147, + "loss": 4.3317, + "step": 18800 + }, + { + "epoch": 17.036199095022624, + "grad_norm": 1.5817420482635498, + "learning_rate": 0.00014775437638847485, + "loss": 4.2269, + "step": 18825 + }, + { + "epoch": 17.058823529411764, + "grad_norm": 1.7589970827102661, + "learning_rate": 0.00014773996824181656, + "loss": 4.2141, + "step": 18850 + }, + { + "epoch": 
17.081447963800905, + "grad_norm": 2.6992104053497314, + "learning_rate": 0.00014772551472782286, + "loss": 4.1172, + "step": 18875 + }, + { + "epoch": 17.104072398190045, + "grad_norm": 1.9499256610870361, + "learning_rate": 0.00014771101585550828, + "loss": 4.2247, + "step": 18900 + }, + { + "epoch": 17.126696832579185, + "grad_norm": 1.7662402391433716, + "learning_rate": 0.00014769647163391568, + "loss": 4.3208, + "step": 18925 + }, + { + "epoch": 17.149321266968325, + "grad_norm": 1.9599316120147705, + "learning_rate": 0.00014768188207211615, + "loss": 4.149, + "step": 18950 + }, + { + "epoch": 17.171945701357465, + "grad_norm": 1.9052058458328247, + "learning_rate": 0.00014766724717920907, + "loss": 4.2268, + "step": 18975 + }, + { + "epoch": 17.194570135746606, + "grad_norm": 1.827323079109192, + "learning_rate": 0.00014765256696432213, + "loss": 4.3944, + "step": 19000 + }, + { + "epoch": 17.217194570135746, + "grad_norm": 1.8786205053329468, + "learning_rate": 0.00014763784143661125, + "loss": 4.2911, + "step": 19025 + }, + { + "epoch": 17.239819004524886, + "grad_norm": 1.890916109085083, + "learning_rate": 0.00014762307060526064, + "loss": 4.3243, + "step": 19050 + }, + { + "epoch": 17.262443438914026, + "grad_norm": 2.4130940437316895, + "learning_rate": 0.0001476082544794827, + "loss": 4.267, + "step": 19075 + }, + { + "epoch": 17.285067873303166, + "grad_norm": 2.0690503120422363, + "learning_rate": 0.0001475933930685182, + "loss": 4.4042, + "step": 19100 + }, + { + "epoch": 17.307692307692307, + "grad_norm": 1.4941195249557495, + "learning_rate": 0.00014757848638163602, + "loss": 4.3633, + "step": 19125 + }, + { + "epoch": 17.330316742081447, + "grad_norm": 2.508427858352661, + "learning_rate": 0.0001475635344281334, + "loss": 4.3816, + "step": 19150 + }, + { + "epoch": 17.352941176470587, + "grad_norm": 1.9983786344528198, + "learning_rate": 0.0001475485372173357, + "loss": 4.2859, + "step": 19175 + }, + { + "epoch": 17.375565610859727, + 
"grad_norm": 5.220970153808594, + "learning_rate": 0.00014753349475859657, + "loss": 4.4096, + "step": 19200 + }, + { + "epoch": 17.398190045248867, + "grad_norm": 2.2304937839508057, + "learning_rate": 0.0001475184070612979, + "loss": 4.2902, + "step": 19225 + }, + { + "epoch": 17.420814479638008, + "grad_norm": 1.8377021551132202, + "learning_rate": 0.00014750327413484975, + "loss": 4.2992, + "step": 19250 + }, + { + "epoch": 17.443438914027148, + "grad_norm": 6.677389621734619, + "learning_rate": 0.00014748809598869042, + "loss": 4.4381, + "step": 19275 + }, + { + "epoch": 17.466063348416288, + "grad_norm": 2.76698637008667, + "learning_rate": 0.00014747287263228634, + "loss": 4.1557, + "step": 19300 + }, + { + "epoch": 17.488687782805428, + "grad_norm": 2.1575393676757812, + "learning_rate": 0.00014745760407513226, + "loss": 4.1819, + "step": 19325 + }, + { + "epoch": 17.511312217194572, + "grad_norm": 2.5181052684783936, + "learning_rate": 0.00014744229032675105, + "loss": 4.3038, + "step": 19350 + }, + { + "epoch": 17.533936651583712, + "grad_norm": 3.4468493461608887, + "learning_rate": 0.00014742693139669375, + "loss": 4.3447, + "step": 19375 + }, + { + "epoch": 17.556561085972852, + "grad_norm": 3.16564679145813, + "learning_rate": 0.0001474115272945396, + "loss": 4.2663, + "step": 19400 + }, + { + "epoch": 17.579185520361992, + "grad_norm": 1.846726417541504, + "learning_rate": 0.00014739607802989602, + "loss": 4.3996, + "step": 19425 + }, + { + "epoch": 17.601809954751133, + "grad_norm": 2.7358832359313965, + "learning_rate": 0.0001473805836123986, + "loss": 4.4441, + "step": 19450 + }, + { + "epoch": 17.624434389140273, + "grad_norm": 1.9710636138916016, + "learning_rate": 0.0001473650440517111, + "loss": 4.3825, + "step": 19475 + }, + { + "epoch": 17.647058823529413, + "grad_norm": 2.2102885246276855, + "learning_rate": 0.00014734945935752537, + "loss": 4.2444, + "step": 19500 + }, + { + "epoch": 17.669683257918553, + "grad_norm": 2.3739373683929443, + 
"learning_rate": 0.00014733382953956148, + "loss": 4.2263, + "step": 19525 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 2.2633421421051025, + "learning_rate": 0.00014731815460756765, + "loss": 4.2954, + "step": 19550 + }, + { + "epoch": 17.714932126696834, + "grad_norm": 1.925898790359497, + "learning_rate": 0.0001473024345713202, + "loss": 4.3745, + "step": 19575 + }, + { + "epoch": 17.737556561085974, + "grad_norm": 2.121878147125244, + "learning_rate": 0.00014728666944062357, + "loss": 4.2597, + "step": 19600 + }, + { + "epoch": 17.760180995475114, + "grad_norm": 2.168797492980957, + "learning_rate": 0.00014727085922531036, + "loss": 4.3293, + "step": 19625 + }, + { + "epoch": 17.782805429864254, + "grad_norm": 2.117703914642334, + "learning_rate": 0.00014725500393524126, + "loss": 4.3164, + "step": 19650 + }, + { + "epoch": 17.805429864253394, + "grad_norm": 2.157322883605957, + "learning_rate": 0.00014723910358030513, + "loss": 4.3772, + "step": 19675 + }, + { + "epoch": 17.828054298642535, + "grad_norm": 2.02313494682312, + "learning_rate": 0.00014722315817041883, + "loss": 4.4356, + "step": 19700 + }, + { + "epoch": 17.850678733031675, + "grad_norm": 2.4830150604248047, + "learning_rate": 0.0001472071677155274, + "loss": 4.312, + "step": 19725 + }, + { + "epoch": 17.873303167420815, + "grad_norm": 2.015939235687256, + "learning_rate": 0.00014719113222560402, + "loss": 4.3141, + "step": 19750 + }, + { + "epoch": 17.895927601809955, + "grad_norm": 2.0206422805786133, + "learning_rate": 0.00014717505171064983, + "loss": 4.2916, + "step": 19775 + }, + { + "epoch": 17.918552036199095, + "grad_norm": 3.191218852996826, + "learning_rate": 0.00014715892618069417, + "loss": 4.3458, + "step": 19800 + }, + { + "epoch": 17.941176470588236, + "grad_norm": 1.7842390537261963, + "learning_rate": 0.00014714275564579432, + "loss": 4.3788, + "step": 19825 + }, + { + "epoch": 17.963800904977376, + "grad_norm": 2.51277494430542, + "learning_rate": 
0.0001471265401160358, + "loss": 4.2968, + "step": 19850 + }, + { + "epoch": 17.986425339366516, + "grad_norm": 1.9772491455078125, + "learning_rate": 0.00014711027960153208, + "loss": 4.2711, + "step": 19875 + }, + { + "epoch": 18.009049773755656, + "grad_norm": 2.1392099857330322, + "learning_rate": 0.00014709397411242467, + "loss": 4.2256, + "step": 19900 + }, + { + "epoch": 18.031674208144796, + "grad_norm": 2.5523273944854736, + "learning_rate": 0.00014707762365888326, + "loss": 4.1125, + "step": 19925 + }, + { + "epoch": 18.054298642533936, + "grad_norm": 1.976250171661377, + "learning_rate": 0.00014706122825110541, + "loss": 4.1466, + "step": 19950 + }, + { + "epoch": 18.076923076923077, + "grad_norm": 1.9020646810531616, + "learning_rate": 0.00014704478789931687, + "loss": 4.1489, + "step": 19975 + }, + { + "epoch": 18.099547511312217, + "grad_norm": 2.142031192779541, + "learning_rate": 0.0001470283026137713, + "loss": 4.0118, + "step": 20000 + }, + { + "epoch": 18.122171945701357, + "grad_norm": 2.3549931049346924, + "learning_rate": 0.00014701177240475046, + "loss": 4.1826, + "step": 20025 + }, + { + "epoch": 18.144796380090497, + "grad_norm": 2.1854188442230225, + "learning_rate": 0.00014699519728256414, + "loss": 4.1429, + "step": 20050 + }, + { + "epoch": 18.167420814479637, + "grad_norm": 2.0742249488830566, + "learning_rate": 0.00014697857725755006, + "loss": 4.188, + "step": 20075 + }, + { + "epoch": 18.190045248868778, + "grad_norm": 2.2574656009674072, + "learning_rate": 0.00014696191234007404, + "loss": 4.0573, + "step": 20100 + }, + { + "epoch": 18.212669683257918, + "grad_norm": 2.203341245651245, + "learning_rate": 0.00014694520254052984, + "loss": 4.0884, + "step": 20125 + }, + { + "epoch": 18.235294117647058, + "grad_norm": 2.074465274810791, + "learning_rate": 0.00014692844786933922, + "loss": 4.1462, + "step": 20150 + }, + { + "epoch": 18.257918552036198, + "grad_norm": 2.674535036087036, + "learning_rate": 0.00014691164833695197, + 
"loss": 4.0187, + "step": 20175 + }, + { + "epoch": 18.28054298642534, + "grad_norm": 1.9000567197799683, + "learning_rate": 0.00014689480395384575, + "loss": 4.2204, + "step": 20200 + }, + { + "epoch": 18.30316742081448, + "grad_norm": 1.8418059349060059, + "learning_rate": 0.00014687791473052633, + "loss": 4.1562, + "step": 20225 + }, + { + "epoch": 18.32579185520362, + "grad_norm": 2.8064866065979004, + "learning_rate": 0.00014686098067752737, + "loss": 4.081, + "step": 20250 + }, + { + "epoch": 18.34841628959276, + "grad_norm": 2.413703203201294, + "learning_rate": 0.00014684400180541048, + "loss": 4.1337, + "step": 20275 + }, + { + "epoch": 18.3710407239819, + "grad_norm": 3.4849491119384766, + "learning_rate": 0.00014682697812476529, + "loss": 4.2092, + "step": 20300 + }, + { + "epoch": 18.39366515837104, + "grad_norm": 2.3783164024353027, + "learning_rate": 0.0001468099096462093, + "loss": 4.1779, + "step": 20325 + }, + { + "epoch": 18.41628959276018, + "grad_norm": 1.7030620574951172, + "learning_rate": 0.00014679279638038796, + "loss": 4.2995, + "step": 20350 + }, + { + "epoch": 18.43891402714932, + "grad_norm": 2.441593885421753, + "learning_rate": 0.0001467756383379747, + "loss": 4.2323, + "step": 20375 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 2.21101713180542, + "learning_rate": 0.00014675843552967093, + "loss": 4.2335, + "step": 20400 + }, + { + "epoch": 18.4841628959276, + "grad_norm": 2.7679131031036377, + "learning_rate": 0.0001467411879662058, + "loss": 4.3095, + "step": 20425 + }, + { + "epoch": 18.50678733031674, + "grad_norm": 2.2078464031219482, + "learning_rate": 0.0001467238956583365, + "loss": 4.0579, + "step": 20450 + }, + { + "epoch": 18.529411764705884, + "grad_norm": 2.1665759086608887, + "learning_rate": 0.00014670655861684812, + "loss": 4.2085, + "step": 20475 + }, + { + "epoch": 18.552036199095024, + "grad_norm": 2.4058430194854736, + "learning_rate": 0.00014668917685255366, + "loss": 4.0638, + "step": 20500 + }, + { + 
"epoch": 18.574660633484164, + "grad_norm": 2.21022629737854, + "learning_rate": 0.0001466717503762939, + "loss": 4.1707, + "step": 20525 + }, + { + "epoch": 18.597285067873305, + "grad_norm": 2.420351982116699, + "learning_rate": 0.00014665427919893767, + "loss": 4.1226, + "step": 20550 + }, + { + "epoch": 18.619909502262445, + "grad_norm": 2.014174699783325, + "learning_rate": 0.0001466367633313816, + "loss": 4.2822, + "step": 20575 + }, + { + "epoch": 18.642533936651585, + "grad_norm": 2.4564104080200195, + "learning_rate": 0.00014661920278455018, + "loss": 4.234, + "step": 20600 + }, + { + "epoch": 18.665158371040725, + "grad_norm": 2.478470802307129, + "learning_rate": 0.00014660159756939577, + "loss": 4.1423, + "step": 20625 + }, + { + "epoch": 18.687782805429865, + "grad_norm": 2.064303159713745, + "learning_rate": 0.00014658394769689865, + "loss": 4.3023, + "step": 20650 + }, + { + "epoch": 18.710407239819006, + "grad_norm": 3.076389789581299, + "learning_rate": 0.00014656625317806683, + "loss": 4.2387, + "step": 20675 + }, + { + "epoch": 18.733031674208146, + "grad_norm": 4.118660926818848, + "learning_rate": 0.00014654851402393627, + "loss": 4.1435, + "step": 20700 + }, + { + "epoch": 18.755656108597286, + "grad_norm": 1.9801812171936035, + "learning_rate": 0.00014653073024557077, + "loss": 4.1353, + "step": 20725 + }, + { + "epoch": 18.778280542986426, + "grad_norm": 1.930025339126587, + "learning_rate": 0.0001465129018540619, + "loss": 4.1906, + "step": 20750 + }, + { + "epoch": 18.800904977375566, + "grad_norm": 1.7695097923278809, + "learning_rate": 0.00014649502886052908, + "loss": 4.1309, + "step": 20775 + }, + { + "epoch": 18.823529411764707, + "grad_norm": 2.5337889194488525, + "learning_rate": 0.00014647711127611959, + "loss": 4.013, + "step": 20800 + }, + { + "epoch": 18.846153846153847, + "grad_norm": 2.259813070297241, + "learning_rate": 0.00014645914911200843, + "loss": 4.3515, + "step": 20825 + }, + { + "epoch": 18.868778280542987, + 
"grad_norm": 1.9734256267547607, + "learning_rate": 0.0001464411423793985, + "loss": 4.0642, + "step": 20850 + }, + { + "epoch": 18.891402714932127, + "grad_norm": 2.131457567214966, + "learning_rate": 0.00014642309108952044, + "loss": 4.1572, + "step": 20875 + }, + { + "epoch": 18.914027149321267, + "grad_norm": 2.4415125846862793, + "learning_rate": 0.0001464049952536327, + "loss": 4.2472, + "step": 20900 + }, + { + "epoch": 18.936651583710407, + "grad_norm": 2.270073175430298, + "learning_rate": 0.00014638685488302147, + "loss": 4.2295, + "step": 20925 + }, + { + "epoch": 18.959276018099548, + "grad_norm": 2.5856406688690186, + "learning_rate": 0.00014636866998900082, + "loss": 4.2938, + "step": 20950 + }, + { + "epoch": 18.981900452488688, + "grad_norm": 1.826500654220581, + "learning_rate": 0.00014635044058291247, + "loss": 4.2264, + "step": 20975 + }, + { + "epoch": 19.004524886877828, + "grad_norm": 3.0457630157470703, + "learning_rate": 0.000146332166676126, + "loss": 4.1044, + "step": 21000 + }, + { + "epoch": 19.02714932126697, + "grad_norm": 2.027559280395508, + "learning_rate": 0.00014631384828003865, + "loss": 4.0324, + "step": 21025 + }, + { + "epoch": 19.04977375565611, + "grad_norm": 3.3464810848236084, + "learning_rate": 0.0001462954854060755, + "loss": 4.0011, + "step": 21050 + }, + { + "epoch": 19.07239819004525, + "grad_norm": 2.8238072395324707, + "learning_rate": 0.0001462770780656893, + "loss": 4.0729, + "step": 21075 + }, + { + "epoch": 19.09502262443439, + "grad_norm": 2.94176983833313, + "learning_rate": 0.00014625862627036054, + "loss": 3.9543, + "step": 21100 + }, + { + "epoch": 19.11764705882353, + "grad_norm": 2.5876317024230957, + "learning_rate": 0.00014624013003159753, + "loss": 3.9983, + "step": 21125 + }, + { + "epoch": 19.14027149321267, + "grad_norm": 2.7959675788879395, + "learning_rate": 0.00014622158936093617, + "loss": 3.9553, + "step": 21150 + }, + { + "epoch": 19.16289592760181, + "grad_norm": 2.9518980979919434, + 
"learning_rate": 0.00014620300426994014, + "loss": 4.0165, + "step": 21175 + }, + { + "epoch": 19.18552036199095, + "grad_norm": 2.9283668994903564, + "learning_rate": 0.0001461843747702008, + "loss": 3.9963, + "step": 21200 + }, + { + "epoch": 19.20814479638009, + "grad_norm": 2.777820587158203, + "learning_rate": 0.00014616570087333723, + "loss": 4.0544, + "step": 21225 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 2.8670153617858887, + "learning_rate": 0.0001461469825909962, + "loss": 4.0468, + "step": 21250 + }, + { + "epoch": 19.25339366515837, + "grad_norm": 2.9561901092529297, + "learning_rate": 0.00014612821993485213, + "loss": 4.0144, + "step": 21275 + }, + { + "epoch": 19.27601809954751, + "grad_norm": 3.267868757247925, + "learning_rate": 0.00014610941291660716, + "loss": 3.9227, + "step": 21300 + }, + { + "epoch": 19.29864253393665, + "grad_norm": 2.76511812210083, + "learning_rate": 0.00014609056154799108, + "loss": 4.0184, + "step": 21325 + }, + { + "epoch": 19.32126696832579, + "grad_norm": 2.691312551498413, + "learning_rate": 0.00014607166584076133, + "loss": 4.0367, + "step": 21350 + }, + { + "epoch": 19.34389140271493, + "grad_norm": 2.1221303939819336, + "learning_rate": 0.00014605272580670296, + "loss": 3.9295, + "step": 21375 + }, + { + "epoch": 19.36651583710407, + "grad_norm": 2.8809330463409424, + "learning_rate": 0.0001460337414576288, + "loss": 4.0271, + "step": 21400 + }, + { + "epoch": 19.38914027149321, + "grad_norm": 2.1942460536956787, + "learning_rate": 0.0001460147128053792, + "loss": 3.9666, + "step": 21425 + }, + { + "epoch": 19.41176470588235, + "grad_norm": 1.810325026512146, + "learning_rate": 0.0001459956398618222, + "loss": 4.0222, + "step": 21450 + }, + { + "epoch": 19.43438914027149, + "grad_norm": 3.1612627506256104, + "learning_rate": 0.0001459765226388534, + "loss": 4.1373, + "step": 21475 + }, + { + "epoch": 19.457013574660632, + "grad_norm": 1.9534374475479126, + "learning_rate": 0.00014595736114839607, + 
"loss": 4.2342, + "step": 21500 + }, + { + "epoch": 19.479638009049772, + "grad_norm": 2.4724044799804688, + "learning_rate": 0.0001459381554024011, + "loss": 4.0709, + "step": 21525 + }, + { + "epoch": 19.502262443438916, + "grad_norm": 2.7774980068206787, + "learning_rate": 0.00014591890541284695, + "loss": 4.0093, + "step": 21550 + }, + { + "epoch": 19.524886877828056, + "grad_norm": 3.0610437393188477, + "learning_rate": 0.0001458996111917397, + "loss": 4.0114, + "step": 21575 + }, + { + "epoch": 19.547511312217196, + "grad_norm": 2.0043299198150635, + "learning_rate": 0.00014588027275111293, + "loss": 4.0228, + "step": 21600 + }, + { + "epoch": 19.570135746606336, + "grad_norm": 2.2680184841156006, + "learning_rate": 0.00014586089010302795, + "loss": 4.0565, + "step": 21625 + }, + { + "epoch": 19.592760180995477, + "grad_norm": 4.519238471984863, + "learning_rate": 0.00014584146325957357, + "loss": 3.9648, + "step": 21650 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 2.4883666038513184, + "learning_rate": 0.0001458219922328661, + "loss": 4.0511, + "step": 21675 + }, + { + "epoch": 19.638009049773757, + "grad_norm": 2.097761392593384, + "learning_rate": 0.00014580247703504948, + "loss": 4.0443, + "step": 21700 + }, + { + "epoch": 19.660633484162897, + "grad_norm": 2.8141157627105713, + "learning_rate": 0.00014578291767829518, + "loss": 4.0383, + "step": 21725 + }, + { + "epoch": 19.683257918552037, + "grad_norm": 2.520421266555786, + "learning_rate": 0.00014576331417480226, + "loss": 3.8808, + "step": 21750 + }, + { + "epoch": 19.705882352941178, + "grad_norm": 2.791774034500122, + "learning_rate": 0.0001457436665367972, + "loss": 4.0448, + "step": 21775 + }, + { + "epoch": 19.728506787330318, + "grad_norm": 2.9688174724578857, + "learning_rate": 0.0001457239747765341, + "loss": 4.1494, + "step": 21800 + }, + { + "epoch": 19.751131221719458, + "grad_norm": 2.548532009124756, + "learning_rate": 0.00014570423890629457, + "loss": 3.9962, + "step": 21825 + 
}, + { + "epoch": 19.773755656108598, + "grad_norm": 2.1748194694519043, + "learning_rate": 0.0001456844589383877, + "loss": 4.1168, + "step": 21850 + }, + { + "epoch": 19.79638009049774, + "grad_norm": 3.12788987159729, + "learning_rate": 0.00014566463488515012, + "loss": 4.037, + "step": 21875 + }, + { + "epoch": 19.81900452488688, + "grad_norm": 3.020284414291382, + "learning_rate": 0.0001456447667589459, + "loss": 4.0868, + "step": 21900 + }, + { + "epoch": 19.84162895927602, + "grad_norm": 2.217618942260742, + "learning_rate": 0.00014562485457216663, + "loss": 4.1297, + "step": 21925 + }, + { + "epoch": 19.86425339366516, + "grad_norm": 2.423015594482422, + "learning_rate": 0.0001456048983372314, + "loss": 4.0776, + "step": 21950 + }, + { + "epoch": 19.8868778280543, + "grad_norm": 2.9892938137054443, + "learning_rate": 0.00014558489806658676, + "loss": 3.9935, + "step": 21975 + }, + { + "epoch": 19.90950226244344, + "grad_norm": 2.593134880065918, + "learning_rate": 0.0001455648537727067, + "loss": 4.1027, + "step": 22000 + }, + { + "epoch": 19.93212669683258, + "grad_norm": 2.268334150314331, + "learning_rate": 0.0001455447654680927, + "loss": 4.008, + "step": 22025 + }, + { + "epoch": 19.95475113122172, + "grad_norm": 2.3195865154266357, + "learning_rate": 0.00014552463316527367, + "loss": 4.1872, + "step": 22050 + }, + { + "epoch": 19.97737556561086, + "grad_norm": 1.8791290521621704, + "learning_rate": 0.00014550445687680597, + "loss": 3.9837, + "step": 22075 + }, + { + "epoch": 20.0, + "grad_norm": 2.1045377254486084, + "learning_rate": 0.00014548423661527336, + "loss": 4.0442, + "step": 22100 + }, + { + "epoch": 20.02262443438914, + "grad_norm": 2.456329107284546, + "learning_rate": 0.0001454639723932871, + "loss": 3.7574, + "step": 22125 + }, + { + "epoch": 20.04524886877828, + "grad_norm": 3.4917218685150146, + "learning_rate": 0.0001454436642234858, + "loss": 3.8406, + "step": 22150 + }, + { + "epoch": 20.06787330316742, + "grad_norm": 
3.367820978164673, + "learning_rate": 0.0001454233121185355, + "loss": 3.8001, + "step": 22175 + }, + { + "epoch": 20.09049773755656, + "grad_norm": 2.7249231338500977, + "learning_rate": 0.00014540291609112965, + "loss": 3.8397, + "step": 22200 + }, + { + "epoch": 20.1131221719457, + "grad_norm": 2.733926773071289, + "learning_rate": 0.0001453824761539891, + "loss": 3.7859, + "step": 22225 + }, + { + "epoch": 20.13574660633484, + "grad_norm": 3.5410749912261963, + "learning_rate": 0.00014536199231986204, + "loss": 3.924, + "step": 22250 + }, + { + "epoch": 20.15837104072398, + "grad_norm": 2.766115188598633, + "learning_rate": 0.00014534146460152409, + "loss": 3.8949, + "step": 22275 + }, + { + "epoch": 20.18099547511312, + "grad_norm": 3.4365551471710205, + "learning_rate": 0.00014532089301177826, + "loss": 3.8416, + "step": 22300 + }, + { + "epoch": 20.20361990950226, + "grad_norm": 2.3583567142486572, + "learning_rate": 0.00014530027756345487, + "loss": 3.8667, + "step": 22325 + }, + { + "epoch": 20.226244343891402, + "grad_norm": 2.6554083824157715, + "learning_rate": 0.00014527961826941155, + "loss": 3.8911, + "step": 22350 + }, + { + "epoch": 20.248868778280542, + "grad_norm": 2.0309135913848877, + "learning_rate": 0.0001452589151425334, + "loss": 3.8475, + "step": 22375 + }, + { + "epoch": 20.271493212669682, + "grad_norm": 2.347342014312744, + "learning_rate": 0.00014523816819573277, + "loss": 3.953, + "step": 22400 + }, + { + "epoch": 20.294117647058822, + "grad_norm": 2.267829656600952, + "learning_rate": 0.0001452173774419494, + "loss": 3.7009, + "step": 22425 + }, + { + "epoch": 20.316742081447963, + "grad_norm": 2.2686140537261963, + "learning_rate": 0.00014519654289415026, + "loss": 3.9219, + "step": 22450 + }, + { + "epoch": 20.339366515837103, + "grad_norm": 3.073906898498535, + "learning_rate": 0.0001451756645653297, + "loss": 3.8887, + "step": 22475 + }, + { + "epoch": 20.361990950226243, + "grad_norm": 2.2049973011016846, + "learning_rate": 
0.00014515474246850943, + "loss": 3.8091, + "step": 22500 + }, + { + "epoch": 20.384615384615383, + "grad_norm": 2.884639263153076, + "learning_rate": 0.00014513377661673832, + "loss": 3.9657, + "step": 22525 + }, + { + "epoch": 20.407239819004523, + "grad_norm": 2.205660104751587, + "learning_rate": 0.00014511276702309264, + "loss": 3.9964, + "step": 22550 + }, + { + "epoch": 20.429864253393664, + "grad_norm": 2.565671682357788, + "learning_rate": 0.0001450917137006759, + "loss": 3.9969, + "step": 22575 + }, + { + "epoch": 20.452488687782804, + "grad_norm": 2.937319278717041, + "learning_rate": 0.0001450706166626189, + "loss": 3.901, + "step": 22600 + }, + { + "epoch": 20.475113122171944, + "grad_norm": 2.7020833492279053, + "learning_rate": 0.00014504947592207965, + "loss": 3.8461, + "step": 22625 + }, + { + "epoch": 20.497737556561084, + "grad_norm": 2.310760259628296, + "learning_rate": 0.0001450282914922435, + "loss": 3.8047, + "step": 22650 + }, + { + "epoch": 20.520361990950228, + "grad_norm": 2.482426881790161, + "learning_rate": 0.00014500706338632302, + "loss": 3.9389, + "step": 22675 + }, + { + "epoch": 20.542986425339368, + "grad_norm": 3.0349998474121094, + "learning_rate": 0.00014498664332646884, + "loss": 3.8287, + "step": 22700 + }, + { + "epoch": 20.56561085972851, + "grad_norm": 3.132357597351074, + "learning_rate": 0.0001449653296538543, + "loss": 4.0393, + "step": 22725 + }, + { + "epoch": 20.58823529411765, + "grad_norm": 2.248824119567871, + "learning_rate": 0.0001449439723444242, + "loss": 3.952, + "step": 22750 + }, + { + "epoch": 20.61085972850679, + "grad_norm": 2.4726078510284424, + "learning_rate": 0.00014492257141149895, + "loss": 4.0212, + "step": 22775 + }, + { + "epoch": 20.63348416289593, + "grad_norm": 3.7489490509033203, + "learning_rate": 0.0001449011268684261, + "loss": 4.0147, + "step": 22800 + }, + { + "epoch": 20.65610859728507, + "grad_norm": 3.4584591388702393, + "learning_rate": 0.00014487963872858046, + "loss": 4.0384, + 
"step": 22825 + }, + { + "epoch": 20.67873303167421, + "grad_norm": 3.992187023162842, + "learning_rate": 0.000144858107005364, + "loss": 3.8956, + "step": 22850 + }, + { + "epoch": 20.70135746606335, + "grad_norm": 2.215635061264038, + "learning_rate": 0.0001448365317122059, + "loss": 3.8018, + "step": 22875 + }, + { + "epoch": 20.72398190045249, + "grad_norm": 2.9964985847473145, + "learning_rate": 0.00014481491286256248, + "loss": 3.8225, + "step": 22900 + }, + { + "epoch": 20.74660633484163, + "grad_norm": 4.619599342346191, + "learning_rate": 0.00014479325046991726, + "loss": 3.9947, + "step": 22925 + }, + { + "epoch": 20.76923076923077, + "grad_norm": 2.641470432281494, + "learning_rate": 0.00014477154454778086, + "loss": 4.0523, + "step": 22950 + }, + { + "epoch": 20.79185520361991, + "grad_norm": 3.320308208465576, + "learning_rate": 0.00014474979510969117, + "loss": 3.8796, + "step": 22975 + }, + { + "epoch": 20.81447963800905, + "grad_norm": 3.0288734436035156, + "learning_rate": 0.00014472800216921305, + "loss": 3.8345, + "step": 23000 + }, + { + "epoch": 20.83710407239819, + "grad_norm": 2.2802088260650635, + "learning_rate": 0.00014470616573993865, + "loss": 3.8913, + "step": 23025 + }, + { + "epoch": 20.85972850678733, + "grad_norm": 2.45820689201355, + "learning_rate": 0.00014468428583548716, + "loss": 3.9711, + "step": 23050 + }, + { + "epoch": 20.88235294117647, + "grad_norm": 1.9973255395889282, + "learning_rate": 0.00014466236246950487, + "loss": 3.9193, + "step": 23075 + }, + { + "epoch": 20.90497737556561, + "grad_norm": 2.4245059490203857, + "learning_rate": 0.0001446403956556652, + "loss": 3.9111, + "step": 23100 + }, + { + "epoch": 20.92760180995475, + "grad_norm": 4.0652546882629395, + "learning_rate": 0.00014461838540766875, + "loss": 4.0191, + "step": 23125 + }, + { + "epoch": 20.95022624434389, + "grad_norm": 2.8320844173431396, + "learning_rate": 0.0001445963317392431, + "loss": 3.9825, + "step": 23150 + }, + { + "epoch": 
20.97285067873303, + "grad_norm": 2.5035881996154785, + "learning_rate": 0.0001445742346641429, + "loss": 3.9313, + "step": 23175 + }, + { + "epoch": 20.995475113122172, + "grad_norm": 2.2385265827178955, + "learning_rate": 0.00014455209419614998, + "loss": 3.9076, + "step": 23200 + }, + { + "epoch": 21.018099547511312, + "grad_norm": 3.920473098754883, + "learning_rate": 0.00014452991034907313, + "loss": 3.7294, + "step": 23225 + }, + { + "epoch": 21.040723981900452, + "grad_norm": 3.026109218597412, + "learning_rate": 0.00014450768313674825, + "loss": 3.5229, + "step": 23250 + }, + { + "epoch": 21.063348416289593, + "grad_norm": 3.1287951469421387, + "learning_rate": 0.00014448541257303828, + "loss": 3.6663, + "step": 23275 + }, + { + "epoch": 21.085972850678733, + "grad_norm": 3.583796977996826, + "learning_rate": 0.0001444630986718332, + "loss": 3.6865, + "step": 23300 + }, + { + "epoch": 21.108597285067873, + "grad_norm": 2.2165699005126953, + "learning_rate": 0.00014444074144705002, + "loss": 3.6303, + "step": 23325 + }, + { + "epoch": 21.131221719457013, + "grad_norm": 3.276175022125244, + "learning_rate": 0.00014441834091263276, + "loss": 3.7655, + "step": 23350 + }, + { + "epoch": 21.153846153846153, + "grad_norm": 2.832486152648926, + "learning_rate": 0.0001443958970825524, + "loss": 3.7292, + "step": 23375 + }, + { + "epoch": 21.176470588235293, + "grad_norm": 2.4505176544189453, + "learning_rate": 0.00014437340997080703, + "loss": 3.6258, + "step": 23400 + }, + { + "epoch": 21.199095022624434, + "grad_norm": 2.6741573810577393, + "learning_rate": 0.00014435087959142166, + "loss": 3.8267, + "step": 23425 + }, + { + "epoch": 21.221719457013574, + "grad_norm": 2.6699492931365967, + "learning_rate": 0.00014432830595844832, + "loss": 3.7696, + "step": 23450 + }, + { + "epoch": 21.244343891402714, + "grad_norm": 2.3976047039031982, + "learning_rate": 0.000144305689085966, + "loss": 3.67, + "step": 23475 + }, + { + "epoch": 21.266968325791854, + "grad_norm": 
2.6721272468566895, + "learning_rate": 0.00014428302898808067, + "loss": 3.798, + "step": 23500 + }, + { + "epoch": 21.289592760180994, + "grad_norm": 3.6001930236816406, + "learning_rate": 0.0001442603256789252, + "loss": 3.6711, + "step": 23525 + }, + { + "epoch": 21.312217194570135, + "grad_norm": 3.659670114517212, + "learning_rate": 0.00014423757917265956, + "loss": 3.6844, + "step": 23550 + }, + { + "epoch": 21.334841628959275, + "grad_norm": 3.1840333938598633, + "learning_rate": 0.00014421478948347047, + "loss": 3.8196, + "step": 23575 + }, + { + "epoch": 21.357466063348415, + "grad_norm": 2.3045458793640137, + "learning_rate": 0.00014419195662557173, + "loss": 3.7621, + "step": 23600 + }, + { + "epoch": 21.380090497737555, + "grad_norm": 2.785276412963867, + "learning_rate": 0.000144169080613204, + "loss": 3.8661, + "step": 23625 + }, + { + "epoch": 21.402714932126695, + "grad_norm": 3.0140879154205322, + "learning_rate": 0.00014414616146063485, + "loss": 3.8701, + "step": 23650 + }, + { + "epoch": 21.425339366515836, + "grad_norm": 2.6863458156585693, + "learning_rate": 0.0001441231991821588, + "loss": 3.7959, + "step": 23675 + }, + { + "epoch": 21.447963800904976, + "grad_norm": 2.7733335494995117, + "learning_rate": 0.0001441001937920972, + "loss": 3.9786, + "step": 23700 + }, + { + "epoch": 21.470588235294116, + "grad_norm": 3.5744516849517822, + "learning_rate": 0.00014407714530479835, + "loss": 3.5387, + "step": 23725 + }, + { + "epoch": 21.49321266968326, + "grad_norm": 2.776697874069214, + "learning_rate": 0.0001440540537346374, + "loss": 3.7142, + "step": 23750 + }, + { + "epoch": 21.5158371040724, + "grad_norm": 3.0770251750946045, + "learning_rate": 0.0001440309190960164, + "loss": 3.8133, + "step": 23775 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 2.604779005050659, + "learning_rate": 0.00014400774140336422, + "loss": 3.7582, + "step": 23800 + }, + { + "epoch": 21.56108597285068, + "grad_norm": 2.170243263244629, + "learning_rate": 
0.0001439845206711366, + "loss": 3.9327, + "step": 23825 + }, + { + "epoch": 21.58371040723982, + "grad_norm": 3.603564500808716, + "learning_rate": 0.00014396125691381613, + "loss": 3.7161, + "step": 23850 + }, + { + "epoch": 21.60633484162896, + "grad_norm": 2.814188241958618, + "learning_rate": 0.0001439379501459122, + "loss": 3.8073, + "step": 23875 + }, + { + "epoch": 21.6289592760181, + "grad_norm": 2.9708809852600098, + "learning_rate": 0.00014391460038196114, + "loss": 3.7837, + "step": 23900 + }, + { + "epoch": 21.65158371040724, + "grad_norm": 2.4882922172546387, + "learning_rate": 0.00014389120763652592, + "loss": 3.7579, + "step": 23925 + }, + { + "epoch": 21.67420814479638, + "grad_norm": 3.3712284564971924, + "learning_rate": 0.00014386777192419643, + "loss": 3.6708, + "step": 23950 + }, + { + "epoch": 21.69683257918552, + "grad_norm": 3.6276750564575195, + "learning_rate": 0.00014384429325958937, + "loss": 3.7829, + "step": 23975 + }, + { + "epoch": 21.71945701357466, + "grad_norm": 2.4860033988952637, + "learning_rate": 0.00014382077165734814, + "loss": 3.8101, + "step": 24000 + }, + { + "epoch": 21.742081447963802, + "grad_norm": 2.5607101917266846, + "learning_rate": 0.000143797207132143, + "loss": 3.6407, + "step": 24025 + }, + { + "epoch": 21.764705882352942, + "grad_norm": 3.5957260131835938, + "learning_rate": 0.00014377359969867102, + "loss": 3.8883, + "step": 24050 + }, + { + "epoch": 21.787330316742082, + "grad_norm": 2.6889231204986572, + "learning_rate": 0.00014374994937165587, + "loss": 3.8151, + "step": 24075 + }, + { + "epoch": 21.809954751131222, + "grad_norm": 2.925708532333374, + "learning_rate": 0.0001437262561658481, + "loss": 3.8295, + "step": 24100 + }, + { + "epoch": 21.832579185520363, + "grad_norm": 2.946596145629883, + "learning_rate": 0.000143702520096025, + "loss": 3.8385, + "step": 24125 + }, + { + "epoch": 21.855203619909503, + "grad_norm": 3.8493125438690186, + "learning_rate": 0.00014367874117699053, + "loss": 3.8796, 
+ "step": 24150 + }, + { + "epoch": 21.877828054298643, + "grad_norm": 2.1728506088256836, + "learning_rate": 0.00014365491942357545, + "loss": 3.9122, + "step": 24175 + }, + { + "epoch": 21.900452488687783, + "grad_norm": 2.742302656173706, + "learning_rate": 0.00014363105485063716, + "loss": 3.8483, + "step": 24200 + }, + { + "epoch": 21.923076923076923, + "grad_norm": 4.172833442687988, + "learning_rate": 0.00014360714747305983, + "loss": 3.7386, + "step": 24225 + }, + { + "epoch": 21.945701357466064, + "grad_norm": 2.3973381519317627, + "learning_rate": 0.00014358319730575428, + "loss": 3.7992, + "step": 24250 + }, + { + "epoch": 21.968325791855204, + "grad_norm": 4.66103458404541, + "learning_rate": 0.00014355920436365802, + "loss": 3.8109, + "step": 24275 + }, + { + "epoch": 21.990950226244344, + "grad_norm": 2.4127964973449707, + "learning_rate": 0.00014353516866173532, + "loss": 3.7642, + "step": 24300 + }, + { + "epoch": 22.013574660633484, + "grad_norm": 3.1286137104034424, + "learning_rate": 0.000143511090214977, + "loss": 3.5502, + "step": 24325 + }, + { + "epoch": 22.036199095022624, + "grad_norm": 2.3641533851623535, + "learning_rate": 0.00014348696903840062, + "loss": 3.5343, + "step": 24350 + }, + { + "epoch": 22.058823529411764, + "grad_norm": 3.4149348735809326, + "learning_rate": 0.00014346280514705034, + "loss": 3.4885, + "step": 24375 + }, + { + "epoch": 22.081447963800905, + "grad_norm": 4.007994651794434, + "learning_rate": 0.000143438598555997, + "loss": 3.5516, + "step": 24400 + }, + { + "epoch": 22.104072398190045, + "grad_norm": 3.8492393493652344, + "learning_rate": 0.00014341434928033807, + "loss": 3.553, + "step": 24425 + }, + { + "epoch": 22.126696832579185, + "grad_norm": 4.333260536193848, + "learning_rate": 0.00014339005733519762, + "loss": 3.4589, + "step": 24450 + }, + { + "epoch": 22.149321266968325, + "grad_norm": 3.36545729637146, + "learning_rate": 0.0001433657227357263, + "loss": 3.6346, + "step": 24475 + }, + { + "epoch": 
22.171945701357465, + "grad_norm": 3.1914398670196533, + "learning_rate": 0.00014334134549710148, + "loss": 3.5903, + "step": 24500 + }, + { + "epoch": 22.194570135746606, + "grad_norm": 2.467092514038086, + "learning_rate": 0.00014331692563452703, + "loss": 3.6353, + "step": 24525 + }, + { + "epoch": 22.217194570135746, + "grad_norm": 4.812871932983398, + "learning_rate": 0.00014329246316323338, + "loss": 3.412, + "step": 24550 + }, + { + "epoch": 22.239819004524886, + "grad_norm": 3.4200334548950195, + "learning_rate": 0.00014326795809847757, + "loss": 3.5378, + "step": 24575 + }, + { + "epoch": 22.262443438914026, + "grad_norm": 4.153021812438965, + "learning_rate": 0.0001432434104555433, + "loss": 3.5938, + "step": 24600 + }, + { + "epoch": 22.285067873303166, + "grad_norm": 3.033048629760742, + "learning_rate": 0.00014321882024974063, + "loss": 3.8194, + "step": 24625 + }, + { + "epoch": 22.307692307692307, + "grad_norm": 2.6659903526306152, + "learning_rate": 0.00014319418749640637, + "loss": 3.3993, + "step": 24650 + }, + { + "epoch": 22.330316742081447, + "grad_norm": 3.0235040187835693, + "learning_rate": 0.0001431695122109037, + "loss": 3.5442, + "step": 24675 + }, + { + "epoch": 22.352941176470587, + "grad_norm": 2.6311628818511963, + "learning_rate": 0.00014314479440862243, + "loss": 3.7309, + "step": 24700 + }, + { + "epoch": 22.375565610859727, + "grad_norm": 2.480937957763672, + "learning_rate": 0.00014312003410497885, + "loss": 3.5935, + "step": 24725 + }, + { + "epoch": 22.398190045248867, + "grad_norm": 2.8534250259399414, + "learning_rate": 0.00014309523131541575, + "loss": 3.7793, + "step": 24750 + }, + { + "epoch": 22.420814479638008, + "grad_norm": 3.454343557357788, + "learning_rate": 0.00014307038605540246, + "loss": 3.6217, + "step": 24775 + }, + { + "epoch": 22.443438914027148, + "grad_norm": 2.928800106048584, + "learning_rate": 0.00014304549834043476, + "loss": 3.5282, + "step": 24800 + }, + { + "epoch": 22.466063348416288, + 
"grad_norm": 2.2259414196014404, + "learning_rate": 0.00014302056818603492, + "loss": 3.6573, + "step": 24825 + }, + { + "epoch": 22.488687782805428, + "grad_norm": 2.492011547088623, + "learning_rate": 0.00014299559560775163, + "loss": 3.6935, + "step": 24850 + }, + { + "epoch": 22.511312217194572, + "grad_norm": 3.9176621437072754, + "learning_rate": 0.00014297058062116014, + "loss": 3.6242, + "step": 24875 + }, + { + "epoch": 22.533936651583712, + "grad_norm": 3.987274646759033, + "learning_rate": 0.0001429455232418621, + "loss": 3.6212, + "step": 24900 + }, + { + "epoch": 22.556561085972852, + "grad_norm": 2.9213876724243164, + "learning_rate": 0.00014292042348548558, + "loss": 3.7327, + "step": 24925 + }, + { + "epoch": 22.579185520361992, + "grad_norm": 3.0751595497131348, + "learning_rate": 0.0001428952813676851, + "loss": 3.6155, + "step": 24950 + }, + { + "epoch": 22.601809954751133, + "grad_norm": 4.081638813018799, + "learning_rate": 0.00014287009690414158, + "loss": 3.6378, + "step": 24975 + }, + { + "epoch": 22.624434389140273, + "grad_norm": 3.4721250534057617, + "learning_rate": 0.0001428448701105624, + "loss": 3.7146, + "step": 25000 + }, + { + "epoch": 22.647058823529413, + "grad_norm": 3.622154712677002, + "learning_rate": 0.00014281960100268127, + "loss": 3.7822, + "step": 25025 + }, + { + "epoch": 22.669683257918553, + "grad_norm": 2.6639842987060547, + "learning_rate": 0.0001427942895962584, + "loss": 3.6893, + "step": 25050 + }, + { + "epoch": 22.692307692307693, + "grad_norm": 2.982954740524292, + "learning_rate": 0.0001427689359070802, + "loss": 3.6211, + "step": 25075 + }, + { + "epoch": 22.714932126696834, + "grad_norm": 3.1290342807769775, + "learning_rate": 0.00014274353995095965, + "loss": 3.7861, + "step": 25100 + }, + { + "epoch": 22.737556561085974, + "grad_norm": 3.3933656215667725, + "learning_rate": 0.00014271810174373598, + "loss": 3.7582, + "step": 25125 + }, + { + "epoch": 22.760180995475114, + "grad_norm": 2.868032932281494, + 
"learning_rate": 0.00014269262130127481, + "loss": 3.7166, + "step": 25150 + }, + { + "epoch": 22.782805429864254, + "grad_norm": 2.7672760486602783, + "learning_rate": 0.00014266709863946806, + "loss": 3.6105, + "step": 25175 + }, + { + "epoch": 22.805429864253394, + "grad_norm": 2.7547879219055176, + "learning_rate": 0.00014264153377423403, + "loss": 3.7227, + "step": 25200 + }, + { + "epoch": 22.828054298642535, + "grad_norm": 3.555361032485962, + "learning_rate": 0.00014261592672151733, + "loss": 3.8386, + "step": 25225 + }, + { + "epoch": 22.850678733031675, + "grad_norm": 2.814558982849121, + "learning_rate": 0.00014259027749728885, + "loss": 3.8839, + "step": 25250 + }, + { + "epoch": 22.873303167420815, + "grad_norm": 2.9268410205841064, + "learning_rate": 0.0001425645861175458, + "loss": 3.7736, + "step": 25275 + }, + { + "epoch": 22.895927601809955, + "grad_norm": 3.4612526893615723, + "learning_rate": 0.00014253885259831172, + "loss": 3.6202, + "step": 25300 + }, + { + "epoch": 22.918552036199095, + "grad_norm": 3.606201171875, + "learning_rate": 0.00014251307695563637, + "loss": 3.6119, + "step": 25325 + }, + { + "epoch": 22.941176470588236, + "grad_norm": 3.235017776489258, + "learning_rate": 0.00014248725920559582, + "loss": 3.664, + "step": 25350 + }, + { + "epoch": 22.963800904977376, + "grad_norm": 3.830575942993164, + "learning_rate": 0.0001424613993642924, + "loss": 3.673, + "step": 25375 + }, + { + "epoch": 22.986425339366516, + "grad_norm": 3.381676435470581, + "learning_rate": 0.0001424354974478547, + "loss": 3.6577, + "step": 25400 + }, + { + "epoch": 23.009049773755656, + "grad_norm": 2.7601847648620605, + "learning_rate": 0.0001424095534724375, + "loss": 3.4432, + "step": 25425 + }, + { + "epoch": 23.031674208144796, + "grad_norm": 2.761472225189209, + "learning_rate": 0.0001423835674542219, + "loss": 3.4836, + "step": 25450 + }, + { + "epoch": 23.054298642533936, + "grad_norm": 3.1776866912841797, + "learning_rate": 0.00014235753940941513, 
+ "loss": 3.273, + "step": 25475 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 2.9398069381713867, + "learning_rate": 0.00014233146935425066, + "loss": 3.4708, + "step": 25500 + }, + { + "epoch": 23.099547511312217, + "grad_norm": 2.938962459564209, + "learning_rate": 0.00014230535730498824, + "loss": 3.5316, + "step": 25525 + }, + { + "epoch": 23.122171945701357, + "grad_norm": 2.923698663711548, + "learning_rate": 0.0001422792032779137, + "loss": 3.4431, + "step": 25550 + }, + { + "epoch": 23.144796380090497, + "grad_norm": 4.093198299407959, + "learning_rate": 0.0001422530072893391, + "loss": 3.4515, + "step": 25575 + }, + { + "epoch": 23.167420814479637, + "grad_norm": 3.009486436843872, + "learning_rate": 0.00014222676935560265, + "loss": 3.4196, + "step": 25600 + }, + { + "epoch": 23.190045248868778, + "grad_norm": 3.3584773540496826, + "learning_rate": 0.0001422004894930688, + "loss": 3.4173, + "step": 25625 + }, + { + "epoch": 23.212669683257918, + "grad_norm": 3.2386748790740967, + "learning_rate": 0.000142174167718128, + "loss": 3.3641, + "step": 25650 + }, + { + "epoch": 23.235294117647058, + "grad_norm": 3.175867795944214, + "learning_rate": 0.000142147804047197, + "loss": 3.5917, + "step": 25675 + }, + { + "epoch": 23.257918552036198, + "grad_norm": 2.7285513877868652, + "learning_rate": 0.00014212139849671863, + "loss": 3.3328, + "step": 25700 + }, + { + "epoch": 23.28054298642534, + "grad_norm": 2.6384241580963135, + "learning_rate": 0.00014209495108316174, + "loss": 3.4482, + "step": 25725 + }, + { + "epoch": 23.30316742081448, + "grad_norm": 4.104657173156738, + "learning_rate": 0.00014206846182302142, + "loss": 3.3214, + "step": 25750 + }, + { + "epoch": 23.32579185520362, + "grad_norm": 3.784193277359009, + "learning_rate": 0.00014204193073281878, + "loss": 3.3955, + "step": 25775 + }, + { + "epoch": 23.34841628959276, + "grad_norm": 2.452960252761841, + "learning_rate": 0.0001420153578291011, + "loss": 3.5232, + "step": 25800 + }, + { + 
"epoch": 23.3710407239819, + "grad_norm": 3.177434206008911, + "learning_rate": 0.00014198874312844163, + "loss": 3.5626, + "step": 25825 + }, + { + "epoch": 23.39366515837104, + "grad_norm": 2.790126323699951, + "learning_rate": 0.0001419620866474398, + "loss": 3.5347, + "step": 25850 + }, + { + "epoch": 23.41628959276018, + "grad_norm": 4.466036319732666, + "learning_rate": 0.000141935388402721, + "loss": 3.5762, + "step": 25875 + }, + { + "epoch": 23.43891402714932, + "grad_norm": 3.542550563812256, + "learning_rate": 0.00014190864841093673, + "loss": 3.6707, + "step": 25900 + }, + { + "epoch": 23.46153846153846, + "grad_norm": 3.3853087425231934, + "learning_rate": 0.0001418818666887645, + "loss": 3.5571, + "step": 25925 + }, + { + "epoch": 23.4841628959276, + "grad_norm": 3.19075345993042, + "learning_rate": 0.00014185504325290788, + "loss": 3.583, + "step": 25950 + }, + { + "epoch": 23.50678733031674, + "grad_norm": 2.814525604248047, + "learning_rate": 0.00014182817812009637, + "loss": 3.6345, + "step": 25975 + }, + { + "epoch": 23.529411764705884, + "grad_norm": 2.687741756439209, + "learning_rate": 0.00014180127130708562, + "loss": 3.3791, + "step": 26000 + }, + { + "epoch": 23.552036199095024, + "grad_norm": 2.655111074447632, + "learning_rate": 0.00014177432283065712, + "loss": 3.5476, + "step": 26025 + }, + { + "epoch": 23.574660633484164, + "grad_norm": 3.3815510272979736, + "learning_rate": 0.0001417473327076185, + "loss": 3.5151, + "step": 26050 + }, + { + "epoch": 23.597285067873305, + "grad_norm": 3.5296308994293213, + "learning_rate": 0.00014172030095480322, + "loss": 3.3458, + "step": 26075 + }, + { + "epoch": 23.619909502262445, + "grad_norm": 3.340770959854126, + "learning_rate": 0.00014169322758907077, + "loss": 3.7039, + "step": 26100 + }, + { + "epoch": 23.642533936651585, + "grad_norm": 4.046700477600098, + "learning_rate": 0.0001416661126273066, + "loss": 3.3456, + "step": 26125 + }, + { + "epoch": 23.665158371040725, + "grad_norm": 
4.637563705444336, + "learning_rate": 0.00014163895608642214, + "loss": 3.6017, + "step": 26150 + }, + { + "epoch": 23.687782805429865, + "grad_norm": 4.189379692077637, + "learning_rate": 0.0001416117579833546, + "loss": 3.4353, + "step": 26175 + }, + { + "epoch": 23.710407239819006, + "grad_norm": 2.8796141147613525, + "learning_rate": 0.00014158451833506735, + "loss": 3.6313, + "step": 26200 + }, + { + "epoch": 23.733031674208146, + "grad_norm": 2.5167758464813232, + "learning_rate": 0.00014155723715854944, + "loss": 3.5882, + "step": 26225 + }, + { + "epoch": 23.755656108597286, + "grad_norm": 4.902872562408447, + "learning_rate": 0.00014152991447081599, + "loss": 3.5322, + "step": 26250 + }, + { + "epoch": 23.778280542986426, + "grad_norm": 2.816702365875244, + "learning_rate": 0.00014150255028890787, + "loss": 3.5301, + "step": 26275 + }, + { + "epoch": 23.800904977375566, + "grad_norm": 3.1594150066375732, + "learning_rate": 0.00014147514462989195, + "loss": 3.5826, + "step": 26300 + }, + { + "epoch": 23.823529411764707, + "grad_norm": 2.405186414718628, + "learning_rate": 0.00014144769751086095, + "loss": 3.5372, + "step": 26325 + }, + { + "epoch": 23.846153846153847, + "grad_norm": 2.4925551414489746, + "learning_rate": 0.00014142020894893334, + "loss": 3.5462, + "step": 26350 + }, + { + "epoch": 23.868778280542987, + "grad_norm": 3.11789608001709, + "learning_rate": 0.00014139267896125357, + "loss": 3.6252, + "step": 26375 + }, + { + "epoch": 23.891402714932127, + "grad_norm": 3.4235665798187256, + "learning_rate": 0.00014136510756499184, + "loss": 3.5679, + "step": 26400 + }, + { + "epoch": 23.914027149321267, + "grad_norm": 3.0763099193573, + "learning_rate": 0.00014133749477734424, + "loss": 3.6435, + "step": 26425 + }, + { + "epoch": 23.936651583710407, + "grad_norm": 3.3343071937561035, + "learning_rate": 0.0001413098406155326, + "loss": 3.6377, + "step": 26450 + }, + { + "epoch": 23.959276018099548, + "grad_norm": 2.7103164196014404, + 
"learning_rate": 0.00014128214509680467, + "loss": 3.6505, + "step": 26475 + }, + { + "epoch": 23.981900452488688, + "grad_norm": 3.13881778717041, + "learning_rate": 0.00014125440823843386, + "loss": 3.6165, + "step": 26500 + }, + { + "epoch": 24.004524886877828, + "grad_norm": 3.2268309593200684, + "learning_rate": 0.00014122663005771948, + "loss": 3.4545, + "step": 26525 + }, + { + "epoch": 24.02714932126697, + "grad_norm": 3.1312403678894043, + "learning_rate": 0.0001411988105719865, + "loss": 3.2974, + "step": 26550 + }, + { + "epoch": 24.04977375565611, + "grad_norm": 3.918436288833618, + "learning_rate": 0.00014117094979858573, + "loss": 3.2575, + "step": 26575 + }, + { + "epoch": 24.07239819004525, + "grad_norm": 4.358361721038818, + "learning_rate": 0.00014114304775489375, + "loss": 3.278, + "step": 26600 + }, + { + "epoch": 24.09502262443439, + "grad_norm": 2.8544600009918213, + "learning_rate": 0.0001411151044583128, + "loss": 3.233, + "step": 26625 + }, + { + "epoch": 24.11764705882353, + "grad_norm": 3.5348682403564453, + "learning_rate": 0.00014108711992627087, + "loss": 3.2931, + "step": 26650 + }, + { + "epoch": 24.14027149321267, + "grad_norm": 3.0109381675720215, + "learning_rate": 0.00014105909417622174, + "loss": 3.3485, + "step": 26675 + }, + { + "epoch": 24.16289592760181, + "grad_norm": 3.3919458389282227, + "learning_rate": 0.00014103102722564485, + "loss": 3.3735, + "step": 26700 + }, + { + "epoch": 24.18552036199095, + "grad_norm": 3.133079767227173, + "learning_rate": 0.00014100291909204527, + "loss": 3.4457, + "step": 26725 + }, + { + "epoch": 24.20814479638009, + "grad_norm": 3.33530855178833, + "learning_rate": 0.0001409747697929539, + "loss": 3.2646, + "step": 26750 + }, + { + "epoch": 24.23076923076923, + "grad_norm": 3.767277717590332, + "learning_rate": 0.0001409477077536281, + "loss": 3.2765, + "step": 26775 + }, + { + "epoch": 24.25339366515837, + "grad_norm": 3.2638959884643555, + "learning_rate": 0.0001409194778211244, + 
"loss": 3.4323, + "step": 26800 + }, + { + "epoch": 24.27601809954751, + "grad_norm": 4.183350086212158, + "learning_rate": 0.00014089120677517053, + "loss": 3.399, + "step": 26825 + }, + { + "epoch": 24.29864253393665, + "grad_norm": 3.9280920028686523, + "learning_rate": 0.00014086289463339886, + "loss": 3.37, + "step": 26850 + }, + { + "epoch": 24.32126696832579, + "grad_norm": 4.650427341461182, + "learning_rate": 0.00014083454141346753, + "loss": 3.4239, + "step": 26875 + }, + { + "epoch": 24.34389140271493, + "grad_norm": 4.651412487030029, + "learning_rate": 0.00014080614713306015, + "loss": 3.4509, + "step": 26900 + }, + { + "epoch": 24.36651583710407, + "grad_norm": 2.813253402709961, + "learning_rate": 0.00014077771180988604, + "loss": 3.2197, + "step": 26925 + }, + { + "epoch": 24.38914027149321, + "grad_norm": 2.85752272605896, + "learning_rate": 0.00014075037530306622, + "loss": 3.4156, + "step": 26950 + }, + { + "epoch": 24.41176470588235, + "grad_norm": 3.765904664993286, + "learning_rate": 0.0001407218595875384, + "loss": 3.4427, + "step": 26975 + }, + { + "epoch": 24.43438914027149, + "grad_norm": 3.2647831439971924, + "learning_rate": 0.0001406933028818133, + "loss": 3.3161, + "step": 27000 + }, + { + "epoch": 24.457013574660632, + "grad_norm": 3.5723791122436523, + "learning_rate": 0.0001406647052037015, + "loss": 3.3568, + "step": 27025 + }, + { + "epoch": 24.479638009049772, + "grad_norm": 3.9250237941741943, + "learning_rate": 0.00014063606657103918, + "loss": 3.3633, + "step": 27050 + }, + { + "epoch": 24.502262443438916, + "grad_norm": 3.2530288696289062, + "learning_rate": 0.000140607387001688, + "loss": 3.1968, + "step": 27075 + }, + { + "epoch": 24.524886877828056, + "grad_norm": 3.3201465606689453, + "learning_rate": 0.00014057866651353518, + "loss": 3.4533, + "step": 27100 + }, + { + "epoch": 24.547511312217196, + "grad_norm": 3.3233988285064697, + "learning_rate": 0.0001405499051244935, + "loss": 3.434, + "step": 27125 + }, + { + 
"epoch": 24.570135746606336, + "grad_norm": 4.571245193481445, + "learning_rate": 0.0001405211028525011, + "loss": 3.3871, + "step": 27150 + }, + { + "epoch": 24.592760180995477, + "grad_norm": 3.496021032333374, + "learning_rate": 0.0001404922597155218, + "loss": 3.2646, + "step": 27175 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 3.7779834270477295, + "learning_rate": 0.00014046337573154485, + "loss": 3.7001, + "step": 27200 + }, + { + "epoch": 24.638009049773757, + "grad_norm": 4.0103936195373535, + "learning_rate": 0.00014043445091858493, + "loss": 3.3776, + "step": 27225 + }, + { + "epoch": 24.660633484162897, + "grad_norm": 3.5195281505584717, + "learning_rate": 0.00014040548529468223, + "loss": 3.4887, + "step": 27250 + }, + { + "epoch": 24.683257918552037, + "grad_norm": 4.090165615081787, + "learning_rate": 0.00014037647887790237, + "loss": 3.4716, + "step": 27275 + }, + { + "epoch": 24.705882352941178, + "grad_norm": 3.485745668411255, + "learning_rate": 0.00014034743168633637, + "loss": 3.509, + "step": 27300 + }, + { + "epoch": 24.728506787330318, + "grad_norm": 2.8224310874938965, + "learning_rate": 0.00014031834373810082, + "loss": 3.4638, + "step": 27325 + }, + { + "epoch": 24.751131221719458, + "grad_norm": 3.4466655254364014, + "learning_rate": 0.0001402892150513376, + "loss": 3.2815, + "step": 27350 + }, + { + "epoch": 24.773755656108598, + "grad_norm": 3.448620319366455, + "learning_rate": 0.000140260045644214, + "loss": 3.4555, + "step": 27375 + }, + { + "epoch": 24.79638009049774, + "grad_norm": 2.5938901901245117, + "learning_rate": 0.00014023083553492283, + "loss": 3.3474, + "step": 27400 + }, + { + "epoch": 24.81900452488688, + "grad_norm": 3.0207743644714355, + "learning_rate": 0.00014020158474168214, + "loss": 3.4351, + "step": 27425 + }, + { + "epoch": 24.84162895927602, + "grad_norm": 3.6358518600463867, + "learning_rate": 0.0001401722932827354, + "loss": 3.5771, + "step": 27450 + }, + { + "epoch": 24.86425339366516, + 
"grad_norm": 3.3951761722564697, + "learning_rate": 0.00014014296117635154, + "loss": 3.6193, + "step": 27475 + }, + { + "epoch": 24.8868778280543, + "grad_norm": 4.6842827796936035, + "learning_rate": 0.00014011358844082466, + "loss": 3.4015, + "step": 27500 + }, + { + "epoch": 24.90950226244344, + "grad_norm": 2.8488800525665283, + "learning_rate": 0.00014008417509447438, + "loss": 3.2968, + "step": 27525 + }, + { + "epoch": 24.93212669683258, + "grad_norm": 3.6501495838165283, + "learning_rate": 0.0001400547211556455, + "loss": 3.4843, + "step": 27550 + }, + { + "epoch": 24.95475113122172, + "grad_norm": 2.7535831928253174, + "learning_rate": 0.0001400252266427083, + "loss": 3.2811, + "step": 27575 + }, + { + "epoch": 24.97737556561086, + "grad_norm": 3.883152484893799, + "learning_rate": 0.00013999569157405816, + "loss": 3.5346, + "step": 27600 + }, + { + "epoch": 25.0, + "grad_norm": 3.7707326412200928, + "learning_rate": 0.0001399661159681159, + "loss": 3.4849, + "step": 27625 + }, + { + "epoch": 25.02262443438914, + "grad_norm": 4.191146373748779, + "learning_rate": 0.00013993649984332765, + "loss": 3.0419, + "step": 27650 + }, + { + "epoch": 25.04524886877828, + "grad_norm": 3.368551731109619, + "learning_rate": 0.0001399068432181647, + "loss": 3.4149, + "step": 27675 + }, + { + "epoch": 25.06787330316742, + "grad_norm": 2.966998338699341, + "learning_rate": 0.00013987714611112364, + "loss": 3.0632, + "step": 27700 + }, + { + "epoch": 25.09049773755656, + "grad_norm": 5.442501068115234, + "learning_rate": 0.00013984740854072636, + "loss": 3.2078, + "step": 27725 + }, + { + "epoch": 25.1131221719457, + "grad_norm": 3.4973864555358887, + "learning_rate": 0.00013981763052551988, + "loss": 3.2176, + "step": 27750 + }, + { + "epoch": 25.13574660633484, + "grad_norm": 3.5123682022094727, + "learning_rate": 0.00013978781208407657, + "loss": 3.1047, + "step": 27775 + }, + { + "epoch": 25.15837104072398, + "grad_norm": 3.922881841659546, + "learning_rate": 
0.00013975795323499393, + "loss": 3.2109, + "step": 27800 + }, + { + "epoch": 25.18099547511312, + "grad_norm": 2.8657217025756836, + "learning_rate": 0.0001397280539968947, + "loss": 3.1468, + "step": 27825 + }, + { + "epoch": 25.20361990950226, + "grad_norm": 3.662860631942749, + "learning_rate": 0.00013969811438842677, + "loss": 3.1534, + "step": 27850 + }, + { + "epoch": 25.226244343891402, + "grad_norm": 3.196195125579834, + "learning_rate": 0.00013966813442826324, + "loss": 3.1925, + "step": 27875 + }, + { + "epoch": 25.248868778280542, + "grad_norm": 4.097235202789307, + "learning_rate": 0.00013963811413510236, + "loss": 3.3288, + "step": 27900 + }, + { + "epoch": 25.271493212669682, + "grad_norm": 2.881373643875122, + "learning_rate": 0.0001396080535276676, + "loss": 3.2905, + "step": 27925 + }, + { + "epoch": 25.294117647058822, + "grad_norm": 4.8148193359375, + "learning_rate": 0.00013957795262470744, + "loss": 3.3332, + "step": 27950 + }, + { + "epoch": 25.316742081447963, + "grad_norm": 2.971619129180908, + "learning_rate": 0.00013954781144499565, + "loss": 3.1742, + "step": 27975 + }, + { + "epoch": 25.339366515837103, + "grad_norm": 3.4500572681427, + "learning_rate": 0.00013951763000733097, + "loss": 3.2529, + "step": 28000 + }, + { + "epoch": 25.361990950226243, + "grad_norm": 3.8950579166412354, + "learning_rate": 0.00013948740833053737, + "loss": 3.3713, + "step": 28025 + }, + { + "epoch": 25.384615384615383, + "grad_norm": 3.466339588165283, + "learning_rate": 0.00013945714643346388, + "loss": 3.2068, + "step": 28050 + }, + { + "epoch": 25.407239819004523, + "grad_norm": 3.300884246826172, + "learning_rate": 0.00013942684433498455, + "loss": 3.313, + "step": 28075 + }, + { + "epoch": 25.429864253393664, + "grad_norm": 3.316925287246704, + "learning_rate": 0.0001393965020539986, + "loss": 3.2274, + "step": 28100 + }, + { + "epoch": 25.452488687782804, + "grad_norm": 3.2406859397888184, + "learning_rate": 0.00013936611960943022, + "loss": 3.2634, + 
"step": 28125 + }, + { + "epoch": 25.475113122171944, + "grad_norm": 3.2880921363830566, + "learning_rate": 0.00013933569702022876, + "loss": 3.2326, + "step": 28150 + }, + { + "epoch": 25.497737556561084, + "grad_norm": 5.280263423919678, + "learning_rate": 0.0001393052343053685, + "loss": 3.493, + "step": 28175 + }, + { + "epoch": 25.520361990950228, + "grad_norm": 3.5389604568481445, + "learning_rate": 0.00013927473148384883, + "loss": 3.1836, + "step": 28200 + }, + { + "epoch": 25.542986425339368, + "grad_norm": 3.564344882965088, + "learning_rate": 0.00013924418857469406, + "loss": 3.1464, + "step": 28225 + }, + { + "epoch": 25.56561085972851, + "grad_norm": 3.950079917907715, + "learning_rate": 0.00013921360559695362, + "loss": 3.2156, + "step": 28250 + }, + { + "epoch": 25.58823529411765, + "grad_norm": 3.957066297531128, + "learning_rate": 0.0001391829825697018, + "loss": 3.3102, + "step": 28275 + }, + { + "epoch": 25.61085972850679, + "grad_norm": 3.5355470180511475, + "learning_rate": 0.000139152319512038, + "loss": 3.3057, + "step": 28300 + }, + { + "epoch": 25.63348416289593, + "grad_norm": 3.0977582931518555, + "learning_rate": 0.00013912161644308646, + "loss": 3.2756, + "step": 28325 + }, + { + "epoch": 25.65610859728507, + "grad_norm": 2.89414644241333, + "learning_rate": 0.00013909087338199652, + "loss": 3.276, + "step": 28350 + }, + { + "epoch": 25.67873303167421, + "grad_norm": 3.588494062423706, + "learning_rate": 0.00013906009034794228, + "loss": 3.2859, + "step": 28375 + }, + { + "epoch": 25.70135746606335, + "grad_norm": 2.6155803203582764, + "learning_rate": 0.000139029267360123, + "loss": 3.265, + "step": 28400 + }, + { + "epoch": 25.72398190045249, + "grad_norm": 8.915180206298828, + "learning_rate": 0.0001389984044377626, + "loss": 3.31, + "step": 28425 + }, + { + "epoch": 25.74660633484163, + "grad_norm": 4.498159885406494, + "learning_rate": 0.0001389675016001101, + "loss": 3.2415, + "step": 28450 + }, + { + "epoch": 25.76923076923077, + 
"grad_norm": 3.761918544769287, + "learning_rate": 0.00013893655886643939, + "loss": 3.4283, + "step": 28475 + }, + { + "epoch": 25.79185520361991, + "grad_norm": 3.198148488998413, + "learning_rate": 0.0001389055762560491, + "loss": 3.4218, + "step": 28500 + }, + { + "epoch": 25.81447963800905, + "grad_norm": 5.093716621398926, + "learning_rate": 0.00013887455378826293, + "loss": 3.4517, + "step": 28525 + }, + { + "epoch": 25.83710407239819, + "grad_norm": 2.8361928462982178, + "learning_rate": 0.00013884349148242934, + "loss": 3.4102, + "step": 28550 + }, + { + "epoch": 25.85972850678733, + "grad_norm": 3.1751811504364014, + "learning_rate": 0.00013881238935792157, + "loss": 3.2427, + "step": 28575 + }, + { + "epoch": 25.88235294117647, + "grad_norm": 3.8904829025268555, + "learning_rate": 0.0001387812474341378, + "loss": 3.3164, + "step": 28600 + }, + { + "epoch": 25.90497737556561, + "grad_norm": 4.2356414794921875, + "learning_rate": 0.00013875006573050105, + "loss": 3.2934, + "step": 28625 + }, + { + "epoch": 25.92760180995475, + "grad_norm": 5.149202823638916, + "learning_rate": 0.00013871884426645904, + "loss": 3.3569, + "step": 28650 + }, + { + "epoch": 25.95022624434389, + "grad_norm": 2.6554596424102783, + "learning_rate": 0.00013868758306148437, + "loss": 3.2018, + "step": 28675 + }, + { + "epoch": 25.97285067873303, + "grad_norm": 3.185791492462158, + "learning_rate": 0.00013865628213507439, + "loss": 3.4526, + "step": 28700 + }, + { + "epoch": 25.995475113122172, + "grad_norm": 4.2164387702941895, + "learning_rate": 0.00013862494150675126, + "loss": 3.2728, + "step": 28725 + }, + { + "epoch": 26.018099547511312, + "grad_norm": 4.304487228393555, + "learning_rate": 0.00013859356119606185, + "loss": 2.983, + "step": 28750 + }, + { + "epoch": 26.040723981900452, + "grad_norm": 3.6233808994293213, + "learning_rate": 0.0001385621412225778, + "loss": 2.9708, + "step": 28775 + }, + { + "epoch": 26.063348416289593, + "grad_norm": 2.99818754196167, + 
"learning_rate": 0.00013853068160589555, + "loss": 3.0539, + "step": 28800 + }, + { + "epoch": 26.085972850678733, + "grad_norm": 3.181682586669922, + "learning_rate": 0.00013849918236563617, + "loss": 2.9993, + "step": 28825 + }, + { + "epoch": 26.108597285067873, + "grad_norm": 3.563267946243286, + "learning_rate": 0.00013846764352144547, + "loss": 3.0493, + "step": 28850 + }, + { + "epoch": 26.131221719457013, + "grad_norm": 3.5948915481567383, + "learning_rate": 0.00013843606509299404, + "loss": 3.084, + "step": 28875 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 3.071012020111084, + "learning_rate": 0.000138404447099977, + "loss": 3.0579, + "step": 28900 + }, + { + "epoch": 26.176470588235293, + "grad_norm": 3.4085752964019775, + "learning_rate": 0.0001383727895621143, + "loss": 3.1644, + "step": 28925 + }, + { + "epoch": 26.199095022624434, + "grad_norm": 5.598212718963623, + "learning_rate": 0.0001383410924991505, + "loss": 2.9763, + "step": 28950 + }, + { + "epoch": 26.221719457013574, + "grad_norm": 4.041131496429443, + "learning_rate": 0.00013830935593085478, + "loss": 3.157, + "step": 28975 + }, + { + "epoch": 26.244343891402714, + "grad_norm": 3.9623188972473145, + "learning_rate": 0.00013827757987702098, + "loss": 3.182, + "step": 29000 + }, + { + "epoch": 26.266968325791854, + "grad_norm": 3.68810772895813, + "learning_rate": 0.00013824576435746757, + "loss": 3.1772, + "step": 29025 + }, + { + "epoch": 26.289592760180994, + "grad_norm": 4.063492298126221, + "learning_rate": 0.00013821390939203765, + "loss": 3.0717, + "step": 29050 + }, + { + "epoch": 26.312217194570135, + "grad_norm": 3.7294957637786865, + "learning_rate": 0.00013818201500059892, + "loss": 3.2203, + "step": 29075 + }, + { + "epoch": 26.334841628959275, + "grad_norm": 3.2714200019836426, + "learning_rate": 0.0001381500812030436, + "loss": 3.2725, + "step": 29100 + }, + { + "epoch": 26.357466063348415, + "grad_norm": 3.9746792316436768, + "learning_rate": 
0.00013811810801928862, + "loss": 3.1863, + "step": 29125 + }, + { + "epoch": 26.380090497737555, + "grad_norm": 5.214170455932617, + "learning_rate": 0.00013808609546927533, + "loss": 3.1069, + "step": 29150 + }, + { + "epoch": 26.402714932126695, + "grad_norm": 3.6367127895355225, + "learning_rate": 0.00013805404357296973, + "loss": 3.2726, + "step": 29175 + }, + { + "epoch": 26.425339366515836, + "grad_norm": 3.8441576957702637, + "learning_rate": 0.00013802195235036236, + "loss": 3.31, + "step": 29200 + }, + { + "epoch": 26.447963800904976, + "grad_norm": 3.904715061187744, + "learning_rate": 0.00013798982182146817, + "loss": 3.4129, + "step": 29225 + }, + { + "epoch": 26.470588235294116, + "grad_norm": 3.413214921951294, + "learning_rate": 0.00013795765200632683, + "loss": 2.9399, + "step": 29250 + }, + { + "epoch": 26.49321266968326, + "grad_norm": 4.17348051071167, + "learning_rate": 0.00013792544292500232, + "loss": 3.1527, + "step": 29275 + }, + { + "epoch": 26.5158371040724, + "grad_norm": 3.2856154441833496, + "learning_rate": 0.00013789319459758318, + "loss": 3.2105, + "step": 29300 + }, + { + "epoch": 26.53846153846154, + "grad_norm": 3.4974355697631836, + "learning_rate": 0.00013786090704418248, + "loss": 3.0755, + "step": 29325 + }, + { + "epoch": 26.56108597285068, + "grad_norm": 4.032309055328369, + "learning_rate": 0.0001378285802849377, + "loss": 3.2051, + "step": 29350 + }, + { + "epoch": 26.58371040723982, + "grad_norm": 3.3807785511016846, + "learning_rate": 0.00013779621434001075, + "loss": 3.1604, + "step": 29375 + }, + { + "epoch": 26.60633484162896, + "grad_norm": 3.105994701385498, + "learning_rate": 0.00013776380922958802, + "loss": 3.16, + "step": 29400 + }, + { + "epoch": 26.6289592760181, + "grad_norm": 4.5413312911987305, + "learning_rate": 0.00013773136497388034, + "loss": 3.1033, + "step": 29425 + }, + { + "epoch": 26.65158371040724, + "grad_norm": 3.6155831813812256, + "learning_rate": 0.00013769888159312292, + "loss": 3.3687, + 
"step": 29450 + }, + { + "epoch": 26.67420814479638, + "grad_norm": 4.292983055114746, + "learning_rate": 0.00013766635910757537, + "loss": 3.0782, + "step": 29475 + }, + { + "epoch": 26.69683257918552, + "grad_norm": 4.59736967086792, + "learning_rate": 0.00013763379753752172, + "loss": 3.045, + "step": 29500 + }, + { + "epoch": 26.71945701357466, + "grad_norm": 4.385296821594238, + "learning_rate": 0.00013760119690327035, + "loss": 3.1458, + "step": 29525 + }, + { + "epoch": 26.742081447963802, + "grad_norm": 3.282942295074463, + "learning_rate": 0.000137568557225154, + "loss": 3.1732, + "step": 29550 + }, + { + "epoch": 26.764705882352942, + "grad_norm": 3.5215020179748535, + "learning_rate": 0.00013753587852352985, + "loss": 3.2069, + "step": 29575 + }, + { + "epoch": 26.787330316742082, + "grad_norm": 3.795607328414917, + "learning_rate": 0.00013750316081877925, + "loss": 3.3613, + "step": 29600 + }, + { + "epoch": 26.809954751131222, + "grad_norm": 2.8766205310821533, + "learning_rate": 0.00013747040413130803, + "loss": 3.3316, + "step": 29625 + }, + { + "epoch": 26.832579185520363, + "grad_norm": 3.666149377822876, + "learning_rate": 0.00013743760848154623, + "loss": 3.1864, + "step": 29650 + }, + { + "epoch": 26.855203619909503, + "grad_norm": 3.5978901386260986, + "learning_rate": 0.00013740477388994826, + "loss": 3.1937, + "step": 29675 + }, + { + "epoch": 26.877828054298643, + "grad_norm": 3.9239766597747803, + "learning_rate": 0.0001373719003769928, + "loss": 3.1024, + "step": 29700 + }, + { + "epoch": 26.900452488687783, + "grad_norm": 3.981957197189331, + "learning_rate": 0.00013733898796318279, + "loss": 3.1872, + "step": 29725 + }, + { + "epoch": 26.923076923076923, + "grad_norm": 3.4527359008789062, + "learning_rate": 0.00013730603666904542, + "loss": 3.0253, + "step": 29750 + }, + { + "epoch": 26.945701357466064, + "grad_norm": 4.188531398773193, + "learning_rate": 0.0001372730465151322, + "loss": 3.2988, + "step": 29775 + }, + { + "epoch": 
26.968325791855204, + "grad_norm": 5.179065227508545, + "learning_rate": 0.0001372413394271976, + "loss": 3.2894, + "step": 29800 + }, + { + "epoch": 26.990950226244344, + "grad_norm": 3.946202039718628, + "learning_rate": 0.00013720827316783207, + "loss": 3.1759, + "step": 29825 + }, + { + "epoch": 27.013574660633484, + "grad_norm": 4.342083930969238, + "learning_rate": 0.00013717516810966498, + "loss": 3.0488, + "step": 29850 + }, + { + "epoch": 27.036199095022624, + "grad_norm": 4.0787177085876465, + "learning_rate": 0.00013714202427334368, + "loss": 2.9319, + "step": 29875 + }, + { + "epoch": 27.058823529411764, + "grad_norm": 4.135145664215088, + "learning_rate": 0.0001371088416795397, + "loss": 2.759, + "step": 29900 + }, + { + "epoch": 27.081447963800905, + "grad_norm": 5.538492202758789, + "learning_rate": 0.00013707562034894876, + "loss": 2.9119, + "step": 29925 + }, + { + "epoch": 27.104072398190045, + "grad_norm": 2.9948039054870605, + "learning_rate": 0.00013704236030229078, + "loss": 3.0108, + "step": 29950 + }, + { + "epoch": 27.126696832579185, + "grad_norm": 3.5534348487854004, + "learning_rate": 0.0001370090615603097, + "loss": 3.0078, + "step": 29975 + }, + { + "epoch": 27.149321266968325, + "grad_norm": 3.3250679969787598, + "learning_rate": 0.00013697572414377376, + "loss": 3.0801, + "step": 30000 + }, + { + "epoch": 27.171945701357465, + "grad_norm": 2.5745601654052734, + "learning_rate": 0.0001369423480734752, + "loss": 3.0052, + "step": 30025 + }, + { + "epoch": 27.194570135746606, + "grad_norm": 3.4980592727661133, + "learning_rate": 0.00013690893337023043, + "loss": 2.9534, + "step": 30050 + }, + { + "epoch": 27.217194570135746, + "grad_norm": 3.5090231895446777, + "learning_rate": 0.0001368754800548799, + "loss": 2.965, + "step": 30075 + }, + { + "epoch": 27.239819004524886, + "grad_norm": 4.154080867767334, + "learning_rate": 0.0001368419881482882, + "loss": 2.7661, + "step": 30100 + }, + { + "epoch": 27.262443438914026, + "grad_norm": 
5.53240966796875, + "learning_rate": 0.00013680845767134395, + "loss": 2.7583, + "step": 30125 + }, + { + "epoch": 27.285067873303166, + "grad_norm": 3.128809928894043, + "learning_rate": 0.00013677488864495985, + "loss": 2.9682, + "step": 30150 + }, + { + "epoch": 27.307692307692307, + "grad_norm": 4.812078952789307, + "learning_rate": 0.00013674128109007267, + "loss": 2.7923, + "step": 30175 + }, + { + "epoch": 27.330316742081447, + "grad_norm": 3.9619510173797607, + "learning_rate": 0.0001367076350276431, + "loss": 2.9877, + "step": 30200 + }, + { + "epoch": 27.352941176470587, + "grad_norm": 4.209227085113525, + "learning_rate": 0.00013667395047865593, + "loss": 2.9891, + "step": 30225 + }, + { + "epoch": 27.375565610859727, + "grad_norm": 3.904585838317871, + "learning_rate": 0.00013664022746412, + "loss": 3.0813, + "step": 30250 + }, + { + "epoch": 27.398190045248867, + "grad_norm": 5.194969654083252, + "learning_rate": 0.00013660646600506803, + "loss": 3.1126, + "step": 30275 + }, + { + "epoch": 27.420814479638008, + "grad_norm": 3.965092420578003, + "learning_rate": 0.00013657266612255683, + "loss": 3.1711, + "step": 30300 + }, + { + "epoch": 27.443438914027148, + "grad_norm": 4.104466438293457, + "learning_rate": 0.00013653882783766706, + "loss": 3.003, + "step": 30325 + }, + { + "epoch": 27.466063348416288, + "grad_norm": 4.142114639282227, + "learning_rate": 0.00013650495117150337, + "loss": 3.1556, + "step": 30350 + }, + { + "epoch": 27.488687782805428, + "grad_norm": 3.6268327236175537, + "learning_rate": 0.0001364710361451944, + "loss": 2.8924, + "step": 30375 + }, + { + "epoch": 27.511312217194572, + "grad_norm": 3.663881301879883, + "learning_rate": 0.00013643708277989274, + "loss": 3.0629, + "step": 30400 + }, + { + "epoch": 27.533936651583712, + "grad_norm": 3.707071304321289, + "learning_rate": 0.00013640309109677474, + "loss": 3.0408, + "step": 30425 + }, + { + "epoch": 27.556561085972852, + "grad_norm": 2.7248923778533936, + "learning_rate": 
0.00013636906111704077, + "loss": 2.9734, + "step": 30450 + }, + { + "epoch": 27.579185520361992, + "grad_norm": 3.750122547149658, + "learning_rate": 0.00013633499286191505, + "loss": 3.0136, + "step": 30475 + }, + { + "epoch": 27.601809954751133, + "grad_norm": 4.197097301483154, + "learning_rate": 0.0001363008863526457, + "loss": 3.0078, + "step": 30500 + }, + { + "epoch": 27.624434389140273, + "grad_norm": 4.05198860168457, + "learning_rate": 0.00013626674161050465, + "loss": 2.9035, + "step": 30525 + }, + { + "epoch": 27.647058823529413, + "grad_norm": 4.189694404602051, + "learning_rate": 0.0001362325586567877, + "loss": 3.1849, + "step": 30550 + }, + { + "epoch": 27.669683257918553, + "grad_norm": 3.242140531539917, + "learning_rate": 0.00013619833751281454, + "loss": 2.9062, + "step": 30575 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 3.5438835620880127, + "learning_rate": 0.00013616407819992858, + "loss": 3.0231, + "step": 30600 + }, + { + "epoch": 27.714932126696834, + "grad_norm": 3.196305990219116, + "learning_rate": 0.00013612978073949706, + "loss": 3.0501, + "step": 30625 + }, + { + "epoch": 27.737556561085974, + "grad_norm": 3.3109047412872314, + "learning_rate": 0.00013609544515291106, + "loss": 3.133, + "step": 30650 + }, + { + "epoch": 27.760180995475114, + "grad_norm": 4.766122341156006, + "learning_rate": 0.00013606107146158538, + "loss": 3.187, + "step": 30675 + }, + { + "epoch": 27.782805429864254, + "grad_norm": 4.751678943634033, + "learning_rate": 0.00013602665968695865, + "loss": 2.9891, + "step": 30700 + }, + { + "epoch": 27.805429864253394, + "grad_norm": 3.198948621749878, + "learning_rate": 0.00013599220985049322, + "loss": 3.1548, + "step": 30725 + }, + { + "epoch": 27.828054298642535, + "grad_norm": 4.021533012390137, + "learning_rate": 0.00013595772197367515, + "loss": 3.0128, + "step": 30750 + }, + { + "epoch": 27.850678733031675, + "grad_norm": 3.177055597305298, + "learning_rate": 0.0001359231960780143, + "loss": 
3.2086, + "step": 30775 + }, + { + "epoch": 27.873303167420815, + "grad_norm": 3.0573225021362305, + "learning_rate": 0.00013588863218504414, + "loss": 3.1362, + "step": 30800 + }, + { + "epoch": 27.895927601809955, + "grad_norm": 4.4769697189331055, + "learning_rate": 0.00013585403031632189, + "loss": 3.0807, + "step": 30825 + }, + { + "epoch": 27.918552036199095, + "grad_norm": 4.18534517288208, + "learning_rate": 0.0001358193904934285, + "loss": 3.2425, + "step": 30850 + }, + { + "epoch": 27.941176470588236, + "grad_norm": 3.6010515689849854, + "learning_rate": 0.00013578471273796857, + "loss": 3.3131, + "step": 30875 + }, + { + "epoch": 27.963800904977376, + "grad_norm": 3.723889112472534, + "learning_rate": 0.00013574999707157025, + "loss": 3.0206, + "step": 30900 + }, + { + "epoch": 27.986425339366516, + "grad_norm": 4.026259422302246, + "learning_rate": 0.00013571524351588547, + "loss": 3.19, + "step": 30925 + }, + { + "epoch": 28.009049773755656, + "grad_norm": 3.8600564002990723, + "learning_rate": 0.0001356804520925898, + "loss": 3.0606, + "step": 30950 + }, + { + "epoch": 28.031674208144796, + "grad_norm": 3.6338248252868652, + "learning_rate": 0.00013564562282338227, + "loss": 3.0325, + "step": 30975 + }, + { + "epoch": 28.054298642533936, + "grad_norm": 4.088747501373291, + "learning_rate": 0.00013561075572998568, + "loss": 2.7951, + "step": 31000 + }, + { + "epoch": 28.076923076923077, + "grad_norm": 4.479493618011475, + "learning_rate": 0.00013557585083414636, + "loss": 2.9833, + "step": 31025 + }, + { + "epoch": 28.099547511312217, + "grad_norm": 4.0963664054870605, + "learning_rate": 0.00013554090815763418, + "loss": 2.9664, + "step": 31050 + }, + { + "epoch": 28.122171945701357, + "grad_norm": 4.823516368865967, + "learning_rate": 0.00013550592772224263, + "loss": 2.8624, + "step": 31075 + }, + { + "epoch": 28.144796380090497, + "grad_norm": 4.446453094482422, + "learning_rate": 0.0001354709095497887, + "loss": 2.9645, + "step": 31100 + }, + { + 
"epoch": 28.167420814479637, + "grad_norm": 4.259945392608643, + "learning_rate": 0.000135435853662113, + "loss": 3.016, + "step": 31125 + }, + { + "epoch": 28.190045248868778, + "grad_norm": 4.517056465148926, + "learning_rate": 0.00013540076008107955, + "loss": 3.0072, + "step": 31150 + }, + { + "epoch": 28.212669683257918, + "grad_norm": 3.433553695678711, + "learning_rate": 0.00013536562882857594, + "loss": 2.9396, + "step": 31175 + }, + { + "epoch": 28.235294117647058, + "grad_norm": 4.888617992401123, + "learning_rate": 0.00013533045992651332, + "loss": 2.7976, + "step": 31200 + }, + { + "epoch": 28.257918552036198, + "grad_norm": 4.243067264556885, + "learning_rate": 0.00013529525339682616, + "loss": 2.9328, + "step": 31225 + }, + { + "epoch": 28.28054298642534, + "grad_norm": 3.8665149211883545, + "learning_rate": 0.00013526000926147253, + "loss": 2.903, + "step": 31250 + }, + { + "epoch": 28.30316742081448, + "grad_norm": 3.0539114475250244, + "learning_rate": 0.0001352247275424339, + "loss": 2.6476, + "step": 31275 + }, + { + "epoch": 28.32579185520362, + "grad_norm": 3.3414466381073, + "learning_rate": 0.00013518940826171526, + "loss": 2.7779, + "step": 31300 + }, + { + "epoch": 28.34841628959276, + "grad_norm": 3.770616054534912, + "learning_rate": 0.00013515405144134488, + "loss": 2.9363, + "step": 31325 + }, + { + "epoch": 28.3710407239819, + "grad_norm": 3.9259214401245117, + "learning_rate": 0.00013511865710337455, + "loss": 2.7611, + "step": 31350 + }, + { + "epoch": 28.39366515837104, + "grad_norm": 3.8687143325805664, + "learning_rate": 0.00013508322526987947, + "loss": 3.0073, + "step": 31375 + }, + { + "epoch": 28.41628959276018, + "grad_norm": 4.19588565826416, + "learning_rate": 0.00013504775596295815, + "loss": 2.9568, + "step": 31400 + }, + { + "epoch": 28.43891402714932, + "grad_norm": 3.60166597366333, + "learning_rate": 0.00013501224920473253, + "loss": 2.8525, + "step": 31425 + }, + { + "epoch": 28.46153846153846, + "grad_norm": 
4.343369007110596, + "learning_rate": 0.0001349767050173479, + "loss": 2.953, + "step": 31450 + }, + { + "epoch": 28.4841628959276, + "grad_norm": 4.2392168045043945, + "learning_rate": 0.00013494112342297285, + "loss": 2.8835, + "step": 31475 + }, + { + "epoch": 28.50678733031674, + "grad_norm": 5.6029558181762695, + "learning_rate": 0.00013490550444379936, + "loss": 2.9266, + "step": 31500 + }, + { + "epoch": 28.529411764705884, + "grad_norm": 4.1231536865234375, + "learning_rate": 0.00013486984810204272, + "loss": 2.9438, + "step": 31525 + }, + { + "epoch": 28.552036199095024, + "grad_norm": 3.6548025608062744, + "learning_rate": 0.00013483415441994145, + "loss": 2.9277, + "step": 31550 + }, + { + "epoch": 28.574660633484164, + "grad_norm": 4.746679782867432, + "learning_rate": 0.00013479842341975747, + "loss": 2.8741, + "step": 31575 + }, + { + "epoch": 28.597285067873305, + "grad_norm": 5.058715343475342, + "learning_rate": 0.00013476265512377591, + "loss": 2.9333, + "step": 31600 + }, + { + "epoch": 28.619909502262445, + "grad_norm": 4.017326831817627, + "learning_rate": 0.00013472684955430516, + "loss": 2.8836, + "step": 31625 + }, + { + "epoch": 28.642533936651585, + "grad_norm": 3.4557321071624756, + "learning_rate": 0.00013469100673367684, + "loss": 2.8834, + "step": 31650 + }, + { + "epoch": 28.665158371040725, + "grad_norm": 3.441136598587036, + "learning_rate": 0.00013465512668424585, + "loss": 3.1403, + "step": 31675 + }, + { + "epoch": 28.687782805429865, + "grad_norm": 4.262326240539551, + "learning_rate": 0.00013461920942839029, + "loss": 2.7424, + "step": 31700 + }, + { + "epoch": 28.710407239819006, + "grad_norm": 4.127242565155029, + "learning_rate": 0.00013458325498851147, + "loss": 2.8001, + "step": 31725 + }, + { + "epoch": 28.733031674208146, + "grad_norm": 5.030572891235352, + "learning_rate": 0.0001345472633870339, + "loss": 3.1041, + "step": 31750 + }, + { + "epoch": 28.755656108597286, + "grad_norm": 4.85727596282959, + "learning_rate": 
0.0001345112346464052, + "loss": 2.8716, + "step": 31775 + }, + { + "epoch": 28.778280542986426, + "grad_norm": 3.561958074569702, + "learning_rate": 0.0001344751687890963, + "loss": 3.0226, + "step": 31800 + }, + { + "epoch": 28.800904977375566, + "grad_norm": 3.3147952556610107, + "learning_rate": 0.0001344390658376011, + "loss": 2.9349, + "step": 31825 + }, + { + "epoch": 28.823529411764707, + "grad_norm": 4.120586395263672, + "learning_rate": 0.00013440292581443674, + "loss": 2.8454, + "step": 31850 + }, + { + "epoch": 28.846153846153847, + "grad_norm": 3.3597311973571777, + "learning_rate": 0.00013436674874214348, + "loss": 2.822, + "step": 31875 + }, + { + "epoch": 28.868778280542987, + "grad_norm": 5.217077255249023, + "learning_rate": 0.00013433053464328466, + "loss": 2.956, + "step": 31900 + }, + { + "epoch": 28.891402714932127, + "grad_norm": 5.234213829040527, + "learning_rate": 0.00013429428354044674, + "loss": 2.8244, + "step": 31925 + }, + { + "epoch": 28.914027149321267, + "grad_norm": 3.850409984588623, + "learning_rate": 0.00013425799545623923, + "loss": 3.1384, + "step": 31950 + }, + { + "epoch": 28.936651583710407, + "grad_norm": 3.3998496532440186, + "learning_rate": 0.00013422167041329472, + "loss": 3.1237, + "step": 31975 + }, + { + "epoch": 28.959276018099548, + "grad_norm": 3.317704439163208, + "learning_rate": 0.0001341853084342688, + "loss": 2.9861, + "step": 32000 + }, + { + "epoch": 28.981900452488688, + "grad_norm": 3.755441188812256, + "learning_rate": 0.00013414890954184026, + "loss": 2.9027, + "step": 32025 + }, + { + "epoch": 29.004524886877828, + "grad_norm": 2.548013687133789, + "learning_rate": 0.0001341124737587107, + "loss": 2.9465, + "step": 32050 + }, + { + "epoch": 29.02714932126697, + "grad_norm": 3.4946072101593018, + "learning_rate": 0.00013407600110760485, + "loss": 2.8628, + "step": 32075 + }, + { + "epoch": 29.04977375565611, + "grad_norm": 4.340054988861084, + "learning_rate": 0.00013403949161127043, + "loss": 2.4742, 
+ "step": 32100 + }, + { + "epoch": 29.07239819004525, + "grad_norm": 4.377787113189697, + "learning_rate": 0.0001340029452924781, + "loss": 2.6672, + "step": 32125 + }, + { + "epoch": 29.09502262443439, + "grad_norm": 4.529804706573486, + "learning_rate": 0.0001339663621740215, + "loss": 2.7543, + "step": 32150 + }, + { + "epoch": 29.11764705882353, + "grad_norm": 4.187199592590332, + "learning_rate": 0.00013392974227871722, + "loss": 2.9031, + "step": 32175 + }, + { + "epoch": 29.14027149321267, + "grad_norm": 4.080550193786621, + "learning_rate": 0.00013389308562940485, + "loss": 2.7065, + "step": 32200 + }, + { + "epoch": 29.16289592760181, + "grad_norm": 5.53787088394165, + "learning_rate": 0.0001338563922489468, + "loss": 2.8107, + "step": 32225 + }, + { + "epoch": 29.18552036199095, + "grad_norm": 3.6485674381256104, + "learning_rate": 0.00013381966216022845, + "loss": 2.7821, + "step": 32250 + }, + { + "epoch": 29.20814479638009, + "grad_norm": 4.681895732879639, + "learning_rate": 0.00013378289538615805, + "loss": 2.8319, + "step": 32275 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 4.285266399383545, + "learning_rate": 0.00013374609194966676, + "loss": 2.8651, + "step": 32300 + }, + { + "epoch": 29.25339366515837, + "grad_norm": 3.7702324390411377, + "learning_rate": 0.0001337092518737086, + "loss": 2.7786, + "step": 32325 + }, + { + "epoch": 29.27601809954751, + "grad_norm": 3.929197072982788, + "learning_rate": 0.0001336723751812604, + "loss": 2.7335, + "step": 32350 + }, + { + "epoch": 29.29864253393665, + "grad_norm": 5.015972137451172, + "learning_rate": 0.00013363546189532188, + "loss": 3.0028, + "step": 32375 + }, + { + "epoch": 29.32126696832579, + "grad_norm": 4.664575099945068, + "learning_rate": 0.00013359851203891554, + "loss": 2.6164, + "step": 32400 + }, + { + "epoch": 29.34389140271493, + "grad_norm": 5.186281204223633, + "learning_rate": 0.0001335630057926611, + "loss": 2.7075, + "step": 32425 + }, + { + "epoch": 29.36651583710407, 
+ "grad_norm": 4.499082565307617, + "learning_rate": 0.00013352598432500893, + "loss": 2.6906, + "step": 32450 + }, + { + "epoch": 29.38914027149321, + "grad_norm": 4.127201557159424, + "learning_rate": 0.0001334889263551692, + "loss": 2.8804, + "step": 32475 + }, + { + "epoch": 29.41176470588235, + "grad_norm": 4.216423034667969, + "learning_rate": 0.00013345183190625475, + "loss": 2.9464, + "step": 32500 + }, + { + "epoch": 29.43438914027149, + "grad_norm": 4.783670425415039, + "learning_rate": 0.000133414701001401, + "loss": 2.5978, + "step": 32525 + }, + { + "epoch": 29.457013574660632, + "grad_norm": 4.002803802490234, + "learning_rate": 0.00013337753366376626, + "loss": 2.7778, + "step": 32550 + }, + { + "epoch": 29.479638009049772, + "grad_norm": 4.036042213439941, + "learning_rate": 0.00013334032991653148, + "loss": 2.8319, + "step": 32575 + }, + { + "epoch": 29.502262443438916, + "grad_norm": 4.025344371795654, + "learning_rate": 0.00013330308978290033, + "loss": 2.9733, + "step": 32600 + }, + { + "epoch": 29.524886877828056, + "grad_norm": 4.4124040603637695, + "learning_rate": 0.00013326581328609922, + "loss": 2.7095, + "step": 32625 + }, + { + "epoch": 29.547511312217196, + "grad_norm": 3.700578451156616, + "learning_rate": 0.0001332285004493772, + "loss": 2.8302, + "step": 32650 + }, + { + "epoch": 29.570135746606336, + "grad_norm": 4.622064113616943, + "learning_rate": 0.0001331911512960059, + "loss": 2.9011, + "step": 32675 + }, + { + "epoch": 29.592760180995477, + "grad_norm": 3.7043676376342773, + "learning_rate": 0.0001331537658492798, + "loss": 2.8195, + "step": 32700 + }, + { + "epoch": 29.615384615384617, + "grad_norm": 3.6045854091644287, + "learning_rate": 0.00013311634413251585, + "loss": 2.898, + "step": 32725 + }, + { + "epoch": 29.638009049773757, + "grad_norm": 3.9226889610290527, + "learning_rate": 0.00013307888616905365, + "loss": 2.8238, + "step": 32750 + }, + { + "epoch": 29.660633484162897, + "grad_norm": 4.132287979125977, + 
"learning_rate": 0.0001330413919822555, + "loss": 2.8462, + "step": 32775 + }, + { + "epoch": 29.683257918552037, + "grad_norm": 3.3745787143707275, + "learning_rate": 0.00013300386159550618, + "loss": 2.8541, + "step": 32800 + }, + { + "epoch": 29.705882352941178, + "grad_norm": 3.9182276725769043, + "learning_rate": 0.0001329662950322131, + "loss": 2.9974, + "step": 32825 + }, + { + "epoch": 29.728506787330318, + "grad_norm": 4.313821315765381, + "learning_rate": 0.0001329286923158062, + "loss": 2.7951, + "step": 32850 + }, + { + "epoch": 29.751131221719458, + "grad_norm": 4.022199630737305, + "learning_rate": 0.00013289105346973802, + "loss": 2.9561, + "step": 32875 + }, + { + "epoch": 29.773755656108598, + "grad_norm": 4.818426132202148, + "learning_rate": 0.00013285337851748363, + "loss": 2.8191, + "step": 32900 + }, + { + "epoch": 29.79638009049774, + "grad_norm": 4.591739654541016, + "learning_rate": 0.00013281566748254056, + "loss": 2.7544, + "step": 32925 + }, + { + "epoch": 29.81900452488688, + "grad_norm": 4.046112060546875, + "learning_rate": 0.00013277792038842888, + "loss": 2.6974, + "step": 32950 + }, + { + "epoch": 29.84162895927602, + "grad_norm": 3.9047091007232666, + "learning_rate": 0.00013274013725869115, + "loss": 2.7516, + "step": 32975 + }, + { + "epoch": 29.86425339366516, + "grad_norm": 4.003262519836426, + "learning_rate": 0.00013270231811689242, + "loss": 2.8326, + "step": 33000 + }, + { + "epoch": 29.8868778280543, + "grad_norm": 3.445901393890381, + "learning_rate": 0.0001326644629866202, + "loss": 2.8347, + "step": 33025 + }, + { + "epoch": 29.90950226244344, + "grad_norm": 5.109114646911621, + "learning_rate": 0.0001326265718914844, + "loss": 2.88, + "step": 33050 + }, + { + "epoch": 29.93212669683258, + "grad_norm": 5.185047149658203, + "learning_rate": 0.0001325886448551174, + "loss": 2.9353, + "step": 33075 + }, + { + "epoch": 29.95475113122172, + "grad_norm": 3.7265560626983643, + "learning_rate": 0.00013255068190117398, + 
"loss": 2.7464, + "step": 33100 + }, + { + "epoch": 29.97737556561086, + "grad_norm": 4.132345199584961, + "learning_rate": 0.00013251268305333137, + "loss": 2.8342, + "step": 33125 + }, + { + "epoch": 30.0, + "grad_norm": 4.887376308441162, + "learning_rate": 0.0001324746483352891, + "loss": 2.8665, + "step": 33150 + }, + { + "epoch": 30.02262443438914, + "grad_norm": 3.8556153774261475, + "learning_rate": 0.00013243657777076915, + "loss": 2.5729, + "step": 33175 + }, + { + "epoch": 30.04524886877828, + "grad_norm": 4.831211566925049, + "learning_rate": 0.00013239847138351581, + "loss": 2.4019, + "step": 33200 + }, + { + "epoch": 30.06787330316742, + "grad_norm": 4.507389545440674, + "learning_rate": 0.00013236032919729574, + "loss": 2.6132, + "step": 33225 + }, + { + "epoch": 30.09049773755656, + "grad_norm": 4.609244346618652, + "learning_rate": 0.0001323221512358979, + "loss": 2.6143, + "step": 33250 + }, + { + "epoch": 30.1131221719457, + "grad_norm": 6.194278717041016, + "learning_rate": 0.00013228393752313358, + "loss": 2.6638, + "step": 33275 + }, + { + "epoch": 30.13574660633484, + "grad_norm": 3.9110960960388184, + "learning_rate": 0.00013224568808283641, + "loss": 2.6422, + "step": 33300 + }, + { + "epoch": 30.15837104072398, + "grad_norm": 3.939910411834717, + "learning_rate": 0.0001322074029388622, + "loss": 2.7747, + "step": 33325 + }, + { + "epoch": 30.18099547511312, + "grad_norm": 4.057479381561279, + "learning_rate": 0.0001321690821150891, + "loss": 2.6193, + "step": 33350 + }, + { + "epoch": 30.20361990950226, + "grad_norm": 3.234480619430542, + "learning_rate": 0.00013213072563541753, + "loss": 2.6003, + "step": 33375 + }, + { + "epoch": 30.226244343891402, + "grad_norm": 4.1614179611206055, + "learning_rate": 0.0001320923335237701, + "loss": 2.8562, + "step": 33400 + }, + { + "epoch": 30.248868778280542, + "grad_norm": 4.460386276245117, + "learning_rate": 0.00013205390580409165, + "loss": 2.7884, + "step": 33425 + }, + { + "epoch": 
30.271493212669682, + "grad_norm": 5.725566387176514, + "learning_rate": 0.00013201544250034927, + "loss": 2.7352, + "step": 33450 + }, + { + "epoch": 30.294117647058822, + "grad_norm": 5.769152641296387, + "learning_rate": 0.0001319769436365322, + "loss": 2.6063, + "step": 33475 + }, + { + "epoch": 30.316742081447963, + "grad_norm": 4.5305047035217285, + "learning_rate": 0.00013193840923665187, + "loss": 2.4199, + "step": 33500 + }, + { + "epoch": 30.339366515837103, + "grad_norm": 3.5889010429382324, + "learning_rate": 0.00013189983932474186, + "loss": 2.8019, + "step": 33525 + }, + { + "epoch": 30.361990950226243, + "grad_norm": 3.8920085430145264, + "learning_rate": 0.00013186123392485794, + "loss": 2.6213, + "step": 33550 + }, + { + "epoch": 30.384615384615383, + "grad_norm": 4.817618370056152, + "learning_rate": 0.000131822593061078, + "loss": 2.7214, + "step": 33575 + }, + { + "epoch": 30.407239819004523, + "grad_norm": 4.146068096160889, + "learning_rate": 0.000131783916757502, + "loss": 2.7239, + "step": 33600 + }, + { + "epoch": 30.429864253393664, + "grad_norm": 5.7670674324035645, + "learning_rate": 0.0001317452050382521, + "loss": 2.6981, + "step": 33625 + }, + { + "epoch": 30.452488687782804, + "grad_norm": 4.598320484161377, + "learning_rate": 0.0001317064579274724, + "loss": 2.6299, + "step": 33650 + }, + { + "epoch": 30.475113122171944, + "grad_norm": 4.3051300048828125, + "learning_rate": 0.00013166767544932922, + "loss": 2.8231, + "step": 33675 + }, + { + "epoch": 30.497737556561084, + "grad_norm": 4.633981704711914, + "learning_rate": 0.0001316288576280109, + "loss": 2.7843, + "step": 33700 + }, + { + "epoch": 30.520361990950228, + "grad_norm": 4.050837516784668, + "learning_rate": 0.00013159000448772777, + "loss": 2.5236, + "step": 33725 + }, + { + "epoch": 30.542986425339368, + "grad_norm": 3.8911004066467285, + "learning_rate": 0.00013155111605271221, + "loss": 2.6911, + "step": 33750 + }, + { + "epoch": 30.56561085972851, + "grad_norm": 
4.056077003479004, + "learning_rate": 0.00013151219234721866, + "loss": 2.5631, + "step": 33775 + }, + { + "epoch": 30.58823529411765, + "grad_norm": 4.661139488220215, + "learning_rate": 0.00013147323339552348, + "loss": 2.7013, + "step": 33800 + }, + { + "epoch": 30.61085972850679, + "grad_norm": 4.946695327758789, + "learning_rate": 0.00013143423922192514, + "loss": 2.6462, + "step": 33825 + }, + { + "epoch": 30.63348416289593, + "grad_norm": 3.819631338119507, + "learning_rate": 0.00013139520985074388, + "loss": 2.7273, + "step": 33850 + }, + { + "epoch": 30.65610859728507, + "grad_norm": 4.634519100189209, + "learning_rate": 0.0001313561453063221, + "loss": 2.8124, + "step": 33875 + }, + { + "epoch": 30.67873303167421, + "grad_norm": 3.164106607437134, + "learning_rate": 0.00013131704561302398, + "loss": 2.5955, + "step": 33900 + }, + { + "epoch": 30.70135746606335, + "grad_norm": 4.083550453186035, + "learning_rate": 0.00013127791079523574, + "loss": 2.742, + "step": 33925 + }, + { + "epoch": 30.72398190045249, + "grad_norm": 3.631793737411499, + "learning_rate": 0.0001312387408773654, + "loss": 2.7186, + "step": 33950 + }, + { + "epoch": 30.74660633484163, + "grad_norm": 4.551006317138672, + "learning_rate": 0.00013119953588384294, + "loss": 2.5732, + "step": 33975 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 5.5608062744140625, + "learning_rate": 0.00013116029583912022, + "loss": 2.9259, + "step": 34000 + }, + { + "epoch": 30.79185520361991, + "grad_norm": 4.855534553527832, + "learning_rate": 0.00013112102076767097, + "loss": 2.8942, + "step": 34025 + }, + { + "epoch": 30.81447963800905, + "grad_norm": 4.316082000732422, + "learning_rate": 0.00013108171069399065, + "loss": 2.7487, + "step": 34050 + }, + { + "epoch": 30.83710407239819, + "grad_norm": 5.009440898895264, + "learning_rate": 0.00013104236564259668, + "loss": 2.5815, + "step": 34075 + }, + { + "epoch": 30.85972850678733, + "grad_norm": 4.553768157958984, + "learning_rate": 
0.0001310029856380283, + "loss": 2.8171, + "step": 34100 + }, + { + "epoch": 30.88235294117647, + "grad_norm": 4.048161506652832, + "learning_rate": 0.00013096357070484644, + "loss": 2.8778, + "step": 34125 + }, + { + "epoch": 30.90497737556561, + "grad_norm": 4.5986433029174805, + "learning_rate": 0.00013092412086763392, + "loss": 3.0391, + "step": 34150 + }, + { + "epoch": 30.92760180995475, + "grad_norm": 4.356034278869629, + "learning_rate": 0.00013088463615099525, + "loss": 2.7292, + "step": 34175 + }, + { + "epoch": 30.95022624434389, + "grad_norm": 4.125836372375488, + "learning_rate": 0.00013084511657955673, + "loss": 2.6635, + "step": 34200 + }, + { + "epoch": 30.97285067873303, + "grad_norm": 4.798816680908203, + "learning_rate": 0.00013080556217796646, + "loss": 2.7088, + "step": 34225 + }, + { + "epoch": 30.995475113122172, + "grad_norm": 4.807849407196045, + "learning_rate": 0.00013076597297089412, + "loss": 2.6887, + "step": 34250 + }, + { + "epoch": 31.018099547511312, + "grad_norm": 4.601236343383789, + "learning_rate": 0.00013072634898303126, + "loss": 2.5557, + "step": 34275 + }, + { + "epoch": 31.040723981900452, + "grad_norm": 4.286696434020996, + "learning_rate": 0.000130686690239091, + "loss": 2.5424, + "step": 34300 + }, + { + "epoch": 31.063348416289593, + "grad_norm": 5.3083367347717285, + "learning_rate": 0.00013064699676380818, + "loss": 2.4756, + "step": 34325 + }, + { + "epoch": 31.085972850678733, + "grad_norm": 4.651934623718262, + "learning_rate": 0.00013060726858193933, + "loss": 2.4422, + "step": 34350 + }, + { + "epoch": 31.108597285067873, + "grad_norm": 6.969370365142822, + "learning_rate": 0.00013056750571826254, + "loss": 2.5326, + "step": 34375 + }, + { + "epoch": 31.131221719457013, + "grad_norm": 4.533806324005127, + "learning_rate": 0.00013052770819757767, + "loss": 2.5938, + "step": 34400 + }, + { + "epoch": 31.153846153846153, + "grad_norm": 4.460901737213135, + "learning_rate": 0.00013048787604470606, + "loss": 2.5329, 
+ "step": 34425 + }, + { + "epoch": 31.176470588235293, + "grad_norm": 4.053142070770264, + "learning_rate": 0.0001304480092844907, + "loss": 2.5739, + "step": 34450 + }, + { + "epoch": 31.199095022624434, + "grad_norm": 4.864528179168701, + "learning_rate": 0.00013040810794179622, + "loss": 2.5561, + "step": 34475 + }, + { + "epoch": 31.221719457013574, + "grad_norm": 4.160267353057861, + "learning_rate": 0.00013036817204150873, + "loss": 2.4335, + "step": 34500 + }, + { + "epoch": 31.244343891402714, + "grad_norm": 4.050480365753174, + "learning_rate": 0.0001303282016085359, + "loss": 2.6838, + "step": 34525 + }, + { + "epoch": 31.266968325791854, + "grad_norm": 3.8022878170013428, + "learning_rate": 0.00013028819666780706, + "loss": 2.5064, + "step": 34550 + }, + { + "epoch": 31.289592760180994, + "grad_norm": 3.9805221557617188, + "learning_rate": 0.00013024815724427288, + "loss": 2.702, + "step": 34575 + }, + { + "epoch": 31.312217194570135, + "grad_norm": 4.316376686096191, + "learning_rate": 0.0001302096869794373, + "loss": 2.6071, + "step": 34600 + }, + { + "epoch": 31.334841628959275, + "grad_norm": 4.840281963348389, + "learning_rate": 0.00013016958004206426, + "loss": 2.5611, + "step": 34625 + }, + { + "epoch": 31.357466063348415, + "grad_norm": 4.259674549102783, + "learning_rate": 0.00013012943869586613, + "loss": 2.5317, + "step": 34650 + }, + { + "epoch": 31.380090497737555, + "grad_norm": 3.153834342956543, + "learning_rate": 0.0001300892629658788, + "loss": 2.6136, + "step": 34675 + }, + { + "epoch": 31.402714932126695, + "grad_norm": 4.826272487640381, + "learning_rate": 0.00013004905287715947, + "loss": 2.6296, + "step": 34700 + }, + { + "epoch": 31.425339366515836, + "grad_norm": 3.353173017501831, + "learning_rate": 0.00013000880845478693, + "loss": 2.5559, + "step": 34725 + }, + { + "epoch": 31.447963800904976, + "grad_norm": 4.404378414154053, + "learning_rate": 0.0001299685297238613, + "loss": 2.589, + "step": 34750 + }, + { + "epoch": 
31.470588235294116, + "grad_norm": 5.003884315490723, + "learning_rate": 0.00012992821670950404, + "loss": 2.7557, + "step": 34775 + }, + { + "epoch": 31.49321266968326, + "grad_norm": 4.578713417053223, + "learning_rate": 0.00012988786943685812, + "loss": 2.5047, + "step": 34800 + }, + { + "epoch": 31.5158371040724, + "grad_norm": 5.10005521774292, + "learning_rate": 0.00012984748793108775, + "loss": 2.6792, + "step": 34825 + }, + { + "epoch": 31.53846153846154, + "grad_norm": 4.338147163391113, + "learning_rate": 0.00012980707221737863, + "loss": 2.6348, + "step": 34850 + }, + { + "epoch": 31.56108597285068, + "grad_norm": 5.2252984046936035, + "learning_rate": 0.00012976662232093764, + "loss": 2.6911, + "step": 34875 + }, + { + "epoch": 31.58371040723982, + "grad_norm": 3.880182981491089, + "learning_rate": 0.0001297277582846583, + "loss": 2.5268, + "step": 34900 + }, + { + "epoch": 31.60633484162896, + "grad_norm": 4.190444469451904, + "learning_rate": 0.00012968724146326485, + "loss": 2.5337, + "step": 34925 + }, + { + "epoch": 31.6289592760181, + "grad_norm": 3.701547622680664, + "learning_rate": 0.0001296466905338771, + "loss": 2.5426, + "step": 34950 + }, + { + "epoch": 31.65158371040724, + "grad_norm": 4.068524360656738, + "learning_rate": 0.0001296061055217862, + "loss": 2.6174, + "step": 34975 + }, + { + "epoch": 31.67420814479638, + "grad_norm": 5.140333652496338, + "learning_rate": 0.00012956548645230487, + "loss": 2.5687, + "step": 35000 + }, + { + "epoch": 31.69683257918552, + "grad_norm": 4.030555725097656, + "learning_rate": 0.00012952483335076676, + "loss": 2.5699, + "step": 35025 + }, + { + "epoch": 31.71945701357466, + "grad_norm": 4.85916805267334, + "learning_rate": 0.00012948414624252693, + "loss": 2.6461, + "step": 35050 + }, + { + "epoch": 31.742081447963802, + "grad_norm": 4.350060939788818, + "learning_rate": 0.0001294434251529616, + "loss": 2.6628, + "step": 35075 + }, + { + "epoch": 31.764705882352942, + "grad_norm": 3.316129207611084, 
+ "learning_rate": 0.00012940267010746822, + "loss": 2.4926, + "step": 35100 + }, + { + "epoch": 31.787330316742082, + "grad_norm": 5.346104621887207, + "learning_rate": 0.00012936188113146535, + "loss": 2.6199, + "step": 35125 + }, + { + "epoch": 31.809954751131222, + "grad_norm": 4.797980785369873, + "learning_rate": 0.00012932105825039274, + "loss": 2.5169, + "step": 35150 + }, + { + "epoch": 31.832579185520363, + "grad_norm": 4.692802906036377, + "learning_rate": 0.00012928020148971132, + "loss": 2.5529, + "step": 35175 + }, + { + "epoch": 31.855203619909503, + "grad_norm": 5.026429653167725, + "learning_rate": 0.00012923931087490312, + "loss": 2.6802, + "step": 35200 + }, + { + "epoch": 31.877828054298643, + "grad_norm": 4.618716716766357, + "learning_rate": 0.0001291983864314713, + "loss": 2.5839, + "step": 35225 + }, + { + "epoch": 31.900452488687783, + "grad_norm": 4.131834030151367, + "learning_rate": 0.00012915742818494003, + "loss": 2.6939, + "step": 35250 + }, + { + "epoch": 31.923076923076923, + "grad_norm": 4.771623611450195, + "learning_rate": 0.00012911643616085472, + "loss": 2.6028, + "step": 35275 + }, + { + "epoch": 31.945701357466064, + "grad_norm": 4.510164737701416, + "learning_rate": 0.00012907541038478177, + "loss": 2.7633, + "step": 35300 + }, + { + "epoch": 31.968325791855204, + "grad_norm": 3.9268558025360107, + "learning_rate": 0.0001290343508823086, + "loss": 2.786, + "step": 35325 + }, + { + "epoch": 31.990950226244344, + "grad_norm": 4.578567981719971, + "learning_rate": 0.0001289932576790437, + "loss": 2.6832, + "step": 35350 + }, + { + "epoch": 32.01357466063349, + "grad_norm": 4.22797966003418, + "learning_rate": 0.00012895213080061656, + "loss": 2.587, + "step": 35375 + }, + { + "epoch": 32.036199095022624, + "grad_norm": 4.337325572967529, + "learning_rate": 0.00012891097027267767, + "loss": 2.3603, + "step": 35400 + }, + { + "epoch": 32.05882352941177, + "grad_norm": 5.987388610839844, + "learning_rate": 0.0001288697761208986, + 
"loss": 2.5849, + "step": 35425 + }, + { + "epoch": 32.081447963800905, + "grad_norm": 3.6974217891693115, + "learning_rate": 0.0001288285483709717, + "loss": 2.4429, + "step": 35450 + }, + { + "epoch": 32.10407239819005, + "grad_norm": 4.210789680480957, + "learning_rate": 0.0001287872870486105, + "loss": 2.6268, + "step": 35475 + }, + { + "epoch": 32.126696832579185, + "grad_norm": 5.0200676918029785, + "learning_rate": 0.00012874599217954926, + "loss": 2.402, + "step": 35500 + }, + { + "epoch": 32.14932126696833, + "grad_norm": 3.6578259468078613, + "learning_rate": 0.00012870466378954332, + "loss": 2.3686, + "step": 35525 + }, + { + "epoch": 32.171945701357465, + "grad_norm": 4.825088977813721, + "learning_rate": 0.00012866330190436883, + "loss": 2.4563, + "step": 35550 + }, + { + "epoch": 32.19457013574661, + "grad_norm": 4.426782131195068, + "learning_rate": 0.0001286219065498229, + "loss": 2.5751, + "step": 35575 + }, + { + "epoch": 32.217194570135746, + "grad_norm": 4.516976833343506, + "learning_rate": 0.00012858047775172346, + "loss": 2.5115, + "step": 35600 + }, + { + "epoch": 32.23981900452489, + "grad_norm": 5.571347236633301, + "learning_rate": 0.0001285390155359093, + "loss": 2.4532, + "step": 35625 + }, + { + "epoch": 32.262443438914026, + "grad_norm": 4.27960205078125, + "learning_rate": 0.00012849751992824012, + "loss": 2.371, + "step": 35650 + }, + { + "epoch": 32.28506787330317, + "grad_norm": 5.029642105102539, + "learning_rate": 0.00012845599095459635, + "loss": 2.4602, + "step": 35675 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 4.342826843261719, + "learning_rate": 0.0001284144286408793, + "loss": 2.5523, + "step": 35700 + }, + { + "epoch": 32.33031674208145, + "grad_norm": 4.571736812591553, + "learning_rate": 0.00012837283301301108, + "loss": 2.2815, + "step": 35725 + }, + { + "epoch": 32.35294117647059, + "grad_norm": 3.9872703552246094, + "learning_rate": 0.0001283312040969345, + "loss": 2.6431, + "step": 35750 + }, + { + 
"epoch": 32.37556561085973, + "grad_norm": 4.537867069244385, + "learning_rate": 0.00012828954191861322, + "loss": 2.4634, + "step": 35775 + }, + { + "epoch": 32.39819004524887, + "grad_norm": 4.200901031494141, + "learning_rate": 0.0001282478465040316, + "loss": 2.5172, + "step": 35800 + }, + { + "epoch": 32.42081447963801, + "grad_norm": 3.6147212982177734, + "learning_rate": 0.0001282061178791947, + "loss": 2.6195, + "step": 35825 + }, + { + "epoch": 32.44343891402715, + "grad_norm": 3.921602725982666, + "learning_rate": 0.00012816435607012838, + "loss": 2.3279, + "step": 35850 + }, + { + "epoch": 32.46606334841629, + "grad_norm": 6.205512523651123, + "learning_rate": 0.0001281225611028791, + "loss": 2.4272, + "step": 35875 + }, + { + "epoch": 32.48868778280543, + "grad_norm": 4.8625898361206055, + "learning_rate": 0.00012808073300351407, + "loss": 2.6181, + "step": 35900 + }, + { + "epoch": 32.51131221719457, + "grad_norm": 5.045218467712402, + "learning_rate": 0.00012803887179812116, + "loss": 2.5377, + "step": 35925 + }, + { + "epoch": 32.53393665158371, + "grad_norm": 4.018486499786377, + "learning_rate": 0.00012799697751280883, + "loss": 2.4845, + "step": 35950 + }, + { + "epoch": 32.55656108597285, + "grad_norm": 4.254944324493408, + "learning_rate": 0.00012795505017370622, + "loss": 2.3467, + "step": 35975 + }, + { + "epoch": 32.57918552036199, + "grad_norm": 4.213393688201904, + "learning_rate": 0.0001279130898069631, + "loss": 2.3798, + "step": 36000 + }, + { + "epoch": 32.60180995475113, + "grad_norm": 3.9159576892852783, + "learning_rate": 0.00012787109643874978, + "loss": 2.3131, + "step": 36025 + }, + { + "epoch": 32.62443438914027, + "grad_norm": 4.4072747230529785, + "learning_rate": 0.0001278290700952572, + "loss": 2.4228, + "step": 36050 + }, + { + "epoch": 32.64705882352941, + "grad_norm": 4.352427005767822, + "learning_rate": 0.00012778701080269685, + "loss": 2.4964, + "step": 36075 + }, + { + "epoch": 32.66968325791855, + "grad_norm": 
3.7858567237854004, + "learning_rate": 0.00012774491858730082, + "loss": 2.5937, + "step": 36100 + }, + { + "epoch": 32.69230769230769, + "grad_norm": 4.358039855957031, + "learning_rate": 0.0001277027934753216, + "loss": 2.3238, + "step": 36125 + }, + { + "epoch": 32.71493212669683, + "grad_norm": 4.003338813781738, + "learning_rate": 0.0001276606354930324, + "loss": 2.5184, + "step": 36150 + }, + { + "epoch": 32.737556561085974, + "grad_norm": 3.9666638374328613, + "learning_rate": 0.0001276184446667267, + "loss": 2.4886, + "step": 36175 + }, + { + "epoch": 32.76018099547511, + "grad_norm": 4.199832916259766, + "learning_rate": 0.00012757622102271864, + "loss": 2.6314, + "step": 36200 + }, + { + "epoch": 32.782805429864254, + "grad_norm": 5.6295881271362305, + "learning_rate": 0.00012753396458734274, + "loss": 2.627, + "step": 36225 + }, + { + "epoch": 32.80542986425339, + "grad_norm": 4.842776298522949, + "learning_rate": 0.00012749167538695405, + "loss": 2.594, + "step": 36250 + }, + { + "epoch": 32.828054298642535, + "grad_norm": 4.408575534820557, + "learning_rate": 0.00012744935344792795, + "loss": 2.4685, + "step": 36275 + }, + { + "epoch": 32.85067873303167, + "grad_norm": 4.2867817878723145, + "learning_rate": 0.00012740699879666033, + "loss": 2.536, + "step": 36300 + }, + { + "epoch": 32.873303167420815, + "grad_norm": 4.182464599609375, + "learning_rate": 0.00012736461145956745, + "loss": 2.4234, + "step": 36325 + }, + { + "epoch": 32.89592760180995, + "grad_norm": 3.8789894580841064, + "learning_rate": 0.00012732219146308592, + "loss": 2.5252, + "step": 36350 + }, + { + "epoch": 32.918552036199095, + "grad_norm": 5.3863606452941895, + "learning_rate": 0.0001272797388336728, + "loss": 2.6901, + "step": 36375 + }, + { + "epoch": 32.94117647058823, + "grad_norm": 3.9546992778778076, + "learning_rate": 0.0001272372535978054, + "loss": 2.6173, + "step": 36400 + }, + { + "epoch": 32.963800904977376, + "grad_norm": 4.50889778137207, + "learning_rate": 
0.00012719473578198145, + "loss": 2.5091, + "step": 36425 + }, + { + "epoch": 32.98642533936652, + "grad_norm": 4.707273006439209, + "learning_rate": 0.00012715218541271893, + "loss": 2.5365, + "step": 36450 + }, + { + "epoch": 33.009049773755656, + "grad_norm": 3.601125717163086, + "learning_rate": 0.00012710960251655627, + "loss": 2.3054, + "step": 36475 + }, + { + "epoch": 33.0316742081448, + "grad_norm": 4.469501972198486, + "learning_rate": 0.00012706698712005196, + "loss": 2.3404, + "step": 36500 + }, + { + "epoch": 33.05429864253394, + "grad_norm": 3.9866597652435303, + "learning_rate": 0.00012702433924978494, + "loss": 2.3609, + "step": 36525 + }, + { + "epoch": 33.07692307692308, + "grad_norm": 5.863500118255615, + "learning_rate": 0.00012698165893235434, + "loss": 2.3967, + "step": 36550 + }, + { + "epoch": 33.09954751131222, + "grad_norm": 4.805734157562256, + "learning_rate": 0.00012693894619437954, + "loss": 2.2383, + "step": 36575 + }, + { + "epoch": 33.12217194570136, + "grad_norm": 4.294662952423096, + "learning_rate": 0.00012689620106250013, + "loss": 2.4667, + "step": 36600 + }, + { + "epoch": 33.1447963800905, + "grad_norm": 4.397215366363525, + "learning_rate": 0.0001268534235633759, + "loss": 2.3737, + "step": 36625 + }, + { + "epoch": 33.16742081447964, + "grad_norm": 4.107430934906006, + "learning_rate": 0.00012681061372368682, + "loss": 2.2985, + "step": 36650 + }, + { + "epoch": 33.19004524886878, + "grad_norm": 5.470655918121338, + "learning_rate": 0.00012676777157013305, + "loss": 2.3414, + "step": 36675 + }, + { + "epoch": 33.21266968325792, + "grad_norm": 4.423906326293945, + "learning_rate": 0.0001267248971294349, + "loss": 2.5055, + "step": 36700 + }, + { + "epoch": 33.23529411764706, + "grad_norm": 4.633875846862793, + "learning_rate": 0.00012668199042833284, + "loss": 2.4087, + "step": 36725 + }, + { + "epoch": 33.2579185520362, + "grad_norm": 3.9326577186584473, + "learning_rate": 0.00012663905149358742, + "loss": 2.2786, + "step": 
36750 + }, + { + "epoch": 33.28054298642534, + "grad_norm": 4.227899551391602, + "learning_rate": 0.0001265960803519793, + "loss": 2.1966, + "step": 36775 + }, + { + "epoch": 33.30316742081448, + "grad_norm": 4.313636779785156, + "learning_rate": 0.00012655307703030925, + "loss": 2.3442, + "step": 36800 + }, + { + "epoch": 33.32579185520362, + "grad_norm": 5.133744239807129, + "learning_rate": 0.00012651004155539807, + "loss": 2.4795, + "step": 36825 + }, + { + "epoch": 33.34841628959276, + "grad_norm": 4.362292766571045, + "learning_rate": 0.00012646697395408667, + "loss": 2.2518, + "step": 36850 + }, + { + "epoch": 33.3710407239819, + "grad_norm": 5.804786682128906, + "learning_rate": 0.00012642387425323596, + "loss": 2.4099, + "step": 36875 + }, + { + "epoch": 33.39366515837104, + "grad_norm": 4.712783336639404, + "learning_rate": 0.00012638074247972686, + "loss": 2.3866, + "step": 36900 + }, + { + "epoch": 33.41628959276018, + "grad_norm": 5.200821399688721, + "learning_rate": 0.00012633757866046036, + "loss": 2.3939, + "step": 36925 + }, + { + "epoch": 33.43891402714932, + "grad_norm": 4.40043830871582, + "learning_rate": 0.00012629438282235733, + "loss": 2.2179, + "step": 36950 + }, + { + "epoch": 33.46153846153846, + "grad_norm": 4.598316669464111, + "learning_rate": 0.0001262511549923587, + "loss": 2.3507, + "step": 36975 + }, + { + "epoch": 33.484162895927604, + "grad_norm": 4.276734352111816, + "learning_rate": 0.00012620789519742534, + "loss": 2.4628, + "step": 37000 + }, + { + "epoch": 33.50678733031674, + "grad_norm": 4.842257499694824, + "learning_rate": 0.00012616460346453798, + "loss": 2.4459, + "step": 37025 + }, + { + "epoch": 33.529411764705884, + "grad_norm": 4.445224761962891, + "learning_rate": 0.00012612127982069738, + "loss": 2.3065, + "step": 37050 + }, + { + "epoch": 33.55203619909502, + "grad_norm": 5.114562511444092, + "learning_rate": 0.0001260779242929241, + "loss": 2.4811, + "step": 37075 + }, + { + "epoch": 33.574660633484164, + 
"grad_norm": 4.964504718780518, + "learning_rate": 0.0001260345369082587, + "loss": 2.3818, + "step": 37100 + }, + { + "epoch": 33.5972850678733, + "grad_norm": 4.5414886474609375, + "learning_rate": 0.0001259911176937615, + "loss": 2.545, + "step": 37125 + }, + { + "epoch": 33.619909502262445, + "grad_norm": 4.3598856925964355, + "learning_rate": 0.00012594766667651272, + "loss": 2.4586, + "step": 37150 + }, + { + "epoch": 33.64253393665158, + "grad_norm": 4.801278591156006, + "learning_rate": 0.00012590418388361242, + "loss": 2.439, + "step": 37175 + }, + { + "epoch": 33.665158371040725, + "grad_norm": 4.317627906799316, + "learning_rate": 0.00012586066934218047, + "loss": 2.4285, + "step": 37200 + }, + { + "epoch": 33.68778280542986, + "grad_norm": 3.9917690753936768, + "learning_rate": 0.0001258171230793565, + "loss": 2.3523, + "step": 37225 + }, + { + "epoch": 33.710407239819006, + "grad_norm": 4.744004249572754, + "learning_rate": 0.00012577354512230003, + "loss": 2.3969, + "step": 37250 + }, + { + "epoch": 33.73303167420814, + "grad_norm": 5.244507312774658, + "learning_rate": 0.00012572993549819027, + "loss": 2.3932, + "step": 37275 + }, + { + "epoch": 33.755656108597286, + "grad_norm": 5.12740421295166, + "learning_rate": 0.00012568629423422617, + "loss": 2.4197, + "step": 37300 + }, + { + "epoch": 33.77828054298642, + "grad_norm": 3.774641752243042, + "learning_rate": 0.00012564262135762643, + "loss": 2.4066, + "step": 37325 + }, + { + "epoch": 33.800904977375566, + "grad_norm": 4.841151237487793, + "learning_rate": 0.0001255989168956295, + "loss": 2.5366, + "step": 37350 + }, + { + "epoch": 33.8235294117647, + "grad_norm": 5.057889461517334, + "learning_rate": 0.0001255551808754935, + "loss": 2.4209, + "step": 37375 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 3.9078261852264404, + "learning_rate": 0.0001255114133244962, + "loss": 2.3331, + "step": 37400 + }, + { + "epoch": 33.86877828054298, + "grad_norm": 5.785364151000977, + "learning_rate": 
0.0001254676142699351, + "loss": 2.5448, + "step": 37425 + }, + { + "epoch": 33.89140271493213, + "grad_norm": 4.612705230712891, + "learning_rate": 0.00012542378373912736, + "loss": 2.3608, + "step": 37450 + }, + { + "epoch": 33.914027149321264, + "grad_norm": 5.307022571563721, + "learning_rate": 0.00012537992175940964, + "loss": 2.4911, + "step": 37475 + }, + { + "epoch": 33.93665158371041, + "grad_norm": 5.742743968963623, + "learning_rate": 0.00012533602835813838, + "loss": 2.3899, + "step": 37500 + }, + { + "epoch": 33.959276018099544, + "grad_norm": 5.305797100067139, + "learning_rate": 0.0001252921035626895, + "loss": 2.3406, + "step": 37525 + }, + { + "epoch": 33.98190045248869, + "grad_norm": 3.845489501953125, + "learning_rate": 0.00012524814740045857, + "loss": 2.4819, + "step": 37550 + }, + { + "epoch": 34.00452488687783, + "grad_norm": 4.11320161819458, + "learning_rate": 0.00012520415989886066, + "loss": 2.3448, + "step": 37575 + }, + { + "epoch": 34.02714932126697, + "grad_norm": 4.280022144317627, + "learning_rate": 0.00012516014108533049, + "loss": 2.2643, + "step": 37600 + }, + { + "epoch": 34.04977375565611, + "grad_norm": 4.960660934448242, + "learning_rate": 0.00012511609098732215, + "loss": 2.1732, + "step": 37625 + }, + { + "epoch": 34.07239819004525, + "grad_norm": 4.4615254402160645, + "learning_rate": 0.0001250720096323094, + "loss": 2.2452, + "step": 37650 + }, + { + "epoch": 34.09502262443439, + "grad_norm": 4.478513240814209, + "learning_rate": 0.0001250278970477854, + "loss": 2.2065, + "step": 37675 + }, + { + "epoch": 34.11764705882353, + "grad_norm": 4.7386345863342285, + "learning_rate": 0.00012498375326126286, + "loss": 2.33, + "step": 37700 + }, + { + "epoch": 34.14027149321267, + "grad_norm": 4.773339748382568, + "learning_rate": 0.00012493957830027384, + "loss": 2.1296, + "step": 37725 + }, + { + "epoch": 34.16289592760181, + "grad_norm": 4.943683624267578, + "learning_rate": 0.00012489537219236994, + "loss": 2.1463, + "step": 
37750 + }, + { + "epoch": 34.18552036199095, + "grad_norm": 4.314094543457031, + "learning_rate": 0.00012485113496512218, + "loss": 2.2566, + "step": 37775 + }, + { + "epoch": 34.20814479638009, + "grad_norm": 5.287987232208252, + "learning_rate": 0.00012480686664612093, + "loss": 2.3733, + "step": 37800 + }, + { + "epoch": 34.23076923076923, + "grad_norm": 5.228750228881836, + "learning_rate": 0.00012476256726297598, + "loss": 2.1226, + "step": 37825 + }, + { + "epoch": 34.25339366515837, + "grad_norm": 3.784313201904297, + "learning_rate": 0.00012471823684331653, + "loss": 2.1873, + "step": 37850 + }, + { + "epoch": 34.276018099547514, + "grad_norm": 4.201169490814209, + "learning_rate": 0.0001246738754147911, + "loss": 2.3755, + "step": 37875 + }, + { + "epoch": 34.29864253393665, + "grad_norm": 4.945826053619385, + "learning_rate": 0.00012462948300506754, + "loss": 2.1317, + "step": 37900 + }, + { + "epoch": 34.321266968325794, + "grad_norm": 3.940614700317383, + "learning_rate": 0.00012458505964183306, + "loss": 2.1285, + "step": 37925 + }, + { + "epoch": 34.34389140271493, + "grad_norm": 5.837182998657227, + "learning_rate": 0.00012454060535279412, + "loss": 2.1953, + "step": 37950 + }, + { + "epoch": 34.366515837104075, + "grad_norm": 4.630831241607666, + "learning_rate": 0.00012449612016567657, + "loss": 2.2692, + "step": 37975 + }, + { + "epoch": 34.38914027149321, + "grad_norm": 4.768281936645508, + "learning_rate": 0.00012445160410822542, + "loss": 2.1857, + "step": 38000 + }, + { + "epoch": 34.411764705882355, + "grad_norm": 5.820812702178955, + "learning_rate": 0.00012440705720820496, + "loss": 2.2188, + "step": 38025 + }, + { + "epoch": 34.43438914027149, + "grad_norm": 4.905755043029785, + "learning_rate": 0.00012436247949339875, + "loss": 2.2482, + "step": 38050 + }, + { + "epoch": 34.457013574660635, + "grad_norm": 5.567986488342285, + "learning_rate": 0.0001243178709916096, + "loss": 2.4189, + "step": 38075 + }, + { + "epoch": 34.47963800904977, + 
"grad_norm": 4.231449127197266, + "learning_rate": 0.0001242732317306594, + "loss": 2.2835, + "step": 38100 + }, + { + "epoch": 34.502262443438916, + "grad_norm": 3.215325117111206, + "learning_rate": 0.00012422856173838938, + "loss": 2.4559, + "step": 38125 + }, + { + "epoch": 34.52488687782805, + "grad_norm": 3.921882152557373, + "learning_rate": 0.0001241838610426598, + "loss": 2.2331, + "step": 38150 + }, + { + "epoch": 34.547511312217196, + "grad_norm": 5.587146282196045, + "learning_rate": 0.00012413912967135013, + "loss": 2.2795, + "step": 38175 + }, + { + "epoch": 34.57013574660633, + "grad_norm": 4.40065860748291, + "learning_rate": 0.00012409436765235896, + "loss": 2.3123, + "step": 38200 + }, + { + "epoch": 34.59276018099548, + "grad_norm": 4.005879878997803, + "learning_rate": 0.00012404957501360405, + "loss": 2.2526, + "step": 38225 + }, + { + "epoch": 34.61538461538461, + "grad_norm": 3.5936102867126465, + "learning_rate": 0.00012400475178302216, + "loss": 2.4858, + "step": 38250 + }, + { + "epoch": 34.63800904977376, + "grad_norm": 3.343513011932373, + "learning_rate": 0.0001239598979885692, + "loss": 2.3716, + "step": 38275 + }, + { + "epoch": 34.660633484162894, + "grad_norm": 4.3708648681640625, + "learning_rate": 0.00012391501365822014, + "loss": 2.2519, + "step": 38300 + }, + { + "epoch": 34.68325791855204, + "grad_norm": 4.603794097900391, + "learning_rate": 0.00012387009881996894, + "loss": 2.4559, + "step": 38325 + }, + { + "epoch": 34.705882352941174, + "grad_norm": 4.484412670135498, + "learning_rate": 0.00012382515350182867, + "loss": 2.4036, + "step": 38350 + }, + { + "epoch": 34.72850678733032, + "grad_norm": 5.898510456085205, + "learning_rate": 0.0001237801777318313, + "loss": 2.2546, + "step": 38375 + }, + { + "epoch": 34.751131221719454, + "grad_norm": 5.234091758728027, + "learning_rate": 0.00012373517153802793, + "loss": 2.3146, + "step": 38400 + }, + { + "epoch": 34.7737556561086, + "grad_norm": 5.243960380554199, + 
"learning_rate": 0.0001236901349484885, + "loss": 2.1139, + "step": 38425 + }, + { + "epoch": 34.796380090497735, + "grad_norm": 5.678958892822266, + "learning_rate": 0.00012364506799130201, + "loss": 2.2946, + "step": 38450 + }, + { + "epoch": 34.81900452488688, + "grad_norm": 5.459033966064453, + "learning_rate": 0.00012359997069457635, + "loss": 2.4424, + "step": 38475 + }, + { + "epoch": 34.841628959276015, + "grad_norm": 4.285768508911133, + "learning_rate": 0.00012355484308643837, + "loss": 2.251, + "step": 38500 + }, + { + "epoch": 34.86425339366516, + "grad_norm": 3.4532508850097656, + "learning_rate": 0.0001235096851950337, + "loss": 2.3726, + "step": 38525 + }, + { + "epoch": 34.886877828054295, + "grad_norm": 5.380105495452881, + "learning_rate": 0.0001234644970485271, + "loss": 2.4636, + "step": 38550 + }, + { + "epoch": 34.90950226244344, + "grad_norm": 4.285428524017334, + "learning_rate": 0.00012341927867510192, + "loss": 2.4352, + "step": 38575 + }, + { + "epoch": 34.932126696832576, + "grad_norm": 5.082425117492676, + "learning_rate": 0.00012337403010296059, + "loss": 2.2011, + "step": 38600 + }, + { + "epoch": 34.95475113122172, + "grad_norm": 5.039985656738281, + "learning_rate": 0.00012332875136032424, + "loss": 2.3386, + "step": 38625 + }, + { + "epoch": 34.977375565610856, + "grad_norm": 4.668622970581055, + "learning_rate": 0.00012328344247543286, + "loss": 2.4197, + "step": 38650 + }, + { + "epoch": 35.0, + "grad_norm": 3.9261083602905273, + "learning_rate": 0.00012323810347654525, + "loss": 2.3796, + "step": 38675 + }, + { + "epoch": 35.022624434389144, + "grad_norm": 4.162503719329834, + "learning_rate": 0.000123192734391939, + "loss": 2.271, + "step": 38700 + }, + { + "epoch": 35.04524886877828, + "grad_norm": 5.307286262512207, + "learning_rate": 0.00012314733524991037, + "loss": 2.0773, + "step": 38725 + }, + { + "epoch": 35.067873303167424, + "grad_norm": 3.6611545085906982, + "learning_rate": 0.00012310190607877454, + "loss": 2.2386, 
+ "step": 38750 + }, + { + "epoch": 35.09049773755656, + "grad_norm": 5.716986656188965, + "learning_rate": 0.00012305644690686524, + "loss": 2.1327, + "step": 38775 + }, + { + "epoch": 35.113122171945705, + "grad_norm": 4.889161586761475, + "learning_rate": 0.00012301095776253506, + "loss": 1.8899, + "step": 38800 + }, + { + "epoch": 35.13574660633484, + "grad_norm": 4.419322967529297, + "learning_rate": 0.00012296543867415513, + "loss": 2.0504, + "step": 38825 + }, + { + "epoch": 35.158371040723985, + "grad_norm": 4.462159156799316, + "learning_rate": 0.00012291988967011542, + "loss": 2.2039, + "step": 38850 + }, + { + "epoch": 35.18099547511312, + "grad_norm": 4.497054576873779, + "learning_rate": 0.00012287431077882442, + "loss": 1.9936, + "step": 38875 + }, + { + "epoch": 35.203619909502265, + "grad_norm": 4.174394607543945, + "learning_rate": 0.00012283052695164664, + "loss": 2.3584, + "step": 38900 + }, + { + "epoch": 35.2262443438914, + "grad_norm": 6.2371602058410645, + "learning_rate": 0.00012278488956382204, + "loss": 2.1509, + "step": 38925 + }, + { + "epoch": 35.248868778280546, + "grad_norm": 4.77733039855957, + "learning_rate": 0.0001227392223729447, + "loss": 2.1533, + "step": 38950 + }, + { + "epoch": 35.27149321266968, + "grad_norm": 4.860963344573975, + "learning_rate": 0.0001226935254074968, + "loss": 2.2584, + "step": 38975 + }, + { + "epoch": 35.294117647058826, + "grad_norm": 4.231897354125977, + "learning_rate": 0.00012264779869597926, + "loss": 2.2071, + "step": 39000 + }, + { + "epoch": 35.31674208144796, + "grad_norm": 4.994824409484863, + "learning_rate": 0.00012260204226691138, + "loss": 2.0147, + "step": 39025 + }, + { + "epoch": 35.339366515837106, + "grad_norm": 5.301912784576416, + "learning_rate": 0.00012255625614883116, + "loss": 2.4201, + "step": 39050 + }, + { + "epoch": 35.36199095022624, + "grad_norm": 5.6750335693359375, + "learning_rate": 0.00012251044037029496, + "loss": 2.2504, + "step": 39075 + }, + { + "epoch": 
35.38461538461539, + "grad_norm": 4.550475597381592, + "learning_rate": 0.00012246459495987775, + "loss": 2.3047, + "step": 39100 + }, + { + "epoch": 35.40723981900452, + "grad_norm": 4.855838298797607, + "learning_rate": 0.00012241871994617294, + "loss": 2.2243, + "step": 39125 + }, + { + "epoch": 35.42986425339367, + "grad_norm": 4.490835666656494, + "learning_rate": 0.00012237281535779242, + "loss": 2.1518, + "step": 39150 + }, + { + "epoch": 35.452488687782804, + "grad_norm": 4.116247653961182, + "learning_rate": 0.0001223268812233665, + "loss": 2.1949, + "step": 39175 + }, + { + "epoch": 35.47511312217195, + "grad_norm": 5.929376125335693, + "learning_rate": 0.00012228091757154392, + "loss": 2.1322, + "step": 39200 + }, + { + "epoch": 35.497737556561084, + "grad_norm": 4.142200469970703, + "learning_rate": 0.00012223492443099186, + "loss": 2.0254, + "step": 39225 + }, + { + "epoch": 35.52036199095023, + "grad_norm": 3.8631250858306885, + "learning_rate": 0.00012218890183039589, + "loss": 2.1582, + "step": 39250 + }, + { + "epoch": 35.542986425339365, + "grad_norm": 4.793134689331055, + "learning_rate": 0.0001221428497984599, + "loss": 2.1888, + "step": 39275 + }, + { + "epoch": 35.56561085972851, + "grad_norm": 4.693505764007568, + "learning_rate": 0.0001220967683639062, + "loss": 2.1438, + "step": 39300 + }, + { + "epoch": 35.588235294117645, + "grad_norm": 4.33529806137085, + "learning_rate": 0.00012205065755547539, + "loss": 2.1771, + "step": 39325 + }, + { + "epoch": 35.61085972850679, + "grad_norm": 4.993594646453857, + "learning_rate": 0.00012200451740192644, + "loss": 2.2037, + "step": 39350 + }, + { + "epoch": 35.633484162895925, + "grad_norm": 4.973903179168701, + "learning_rate": 0.00012195834793203655, + "loss": 2.1383, + "step": 39375 + }, + { + "epoch": 35.65610859728507, + "grad_norm": 4.853772163391113, + "learning_rate": 0.00012191214917460131, + "loss": 2.2724, + "step": 39400 + }, + { + "epoch": 35.678733031674206, + "grad_norm": 
5.172274589538574, + "learning_rate": 0.00012186592115843446, + "loss": 2.2928, + "step": 39425 + }, + { + "epoch": 35.70135746606335, + "grad_norm": 4.263808250427246, + "learning_rate": 0.00012181966391236806, + "loss": 2.19, + "step": 39450 + }, + { + "epoch": 35.723981900452486, + "grad_norm": 4.12034273147583, + "learning_rate": 0.00012177337746525237, + "loss": 2.0226, + "step": 39475 + }, + { + "epoch": 35.74660633484163, + "grad_norm": 4.177559852600098, + "learning_rate": 0.00012172706184595594, + "loss": 2.1455, + "step": 39500 + }, + { + "epoch": 35.76923076923077, + "grad_norm": 5.90501594543457, + "learning_rate": 0.00012168071708336537, + "loss": 2.3705, + "step": 39525 + }, + { + "epoch": 35.79185520361991, + "grad_norm": 5.299849033355713, + "learning_rate": 0.00012163434320638556, + "loss": 2.2741, + "step": 39550 + }, + { + "epoch": 35.81447963800905, + "grad_norm": 4.647896766662598, + "learning_rate": 0.00012158794024393952, + "loss": 2.2383, + "step": 39575 + }, + { + "epoch": 35.83710407239819, + "grad_norm": 4.593433380126953, + "learning_rate": 0.00012154150822496841, + "loss": 2.1183, + "step": 39600 + }, + { + "epoch": 35.85972850678733, + "grad_norm": 3.6611621379852295, + "learning_rate": 0.00012149504717843149, + "loss": 2.2052, + "step": 39625 + }, + { + "epoch": 35.88235294117647, + "grad_norm": 3.904351234436035, + "learning_rate": 0.00012144855713330618, + "loss": 2.2897, + "step": 39650 + }, + { + "epoch": 35.90497737556561, + "grad_norm": 4.795818328857422, + "learning_rate": 0.00012140203811858789, + "loss": 2.1473, + "step": 39675 + }, + { + "epoch": 35.92760180995475, + "grad_norm": 5.492254257202148, + "learning_rate": 0.0001213554901632902, + "loss": 2.2725, + "step": 39700 + }, + { + "epoch": 35.95022624434389, + "grad_norm": 4.611868858337402, + "learning_rate": 0.0001213089132964447, + "loss": 2.197, + "step": 39725 + }, + { + "epoch": 35.97285067873303, + "grad_norm": 4.947539329528809, + "learning_rate": 
0.00012126230754710099, + "loss": 2.2163, + "step": 39750 + }, + { + "epoch": 35.99547511312217, + "grad_norm": 4.95683479309082, + "learning_rate": 0.0001212156729443267, + "loss": 2.1723, + "step": 39775 + }, + { + "epoch": 36.01809954751131, + "grad_norm": 4.998210906982422, + "learning_rate": 0.00012116900951720745, + "loss": 2.045, + "step": 39800 + }, + { + "epoch": 36.040723981900456, + "grad_norm": 4.544924736022949, + "learning_rate": 0.00012112231729484689, + "loss": 1.9196, + "step": 39825 + }, + { + "epoch": 36.06334841628959, + "grad_norm": 3.828965663909912, + "learning_rate": 0.00012107559630636655, + "loss": 2.0334, + "step": 39850 + }, + { + "epoch": 36.085972850678736, + "grad_norm": 5.906606197357178, + "learning_rate": 0.00012102884658090593, + "loss": 2.168, + "step": 39875 + }, + { + "epoch": 36.10859728506787, + "grad_norm": 4.3583831787109375, + "learning_rate": 0.00012098206814762247, + "loss": 2.0809, + "step": 39900 + }, + { + "epoch": 36.13122171945702, + "grad_norm": 6.541887283325195, + "learning_rate": 0.00012093526103569152, + "loss": 2.0264, + "step": 39925 + }, + { + "epoch": 36.15384615384615, + "grad_norm": 5.1993536949157715, + "learning_rate": 0.00012088842527430629, + "loss": 2.1022, + "step": 39950 + }, + { + "epoch": 36.1764705882353, + "grad_norm": 5.358850955963135, + "learning_rate": 0.00012084156089267785, + "loss": 2.0869, + "step": 39975 + }, + { + "epoch": 36.199095022624434, + "grad_norm": 4.883439064025879, + "learning_rate": 0.00012079466792003517, + "loss": 2.1404, + "step": 40000 + }, + { + "epoch": 36.22171945701358, + "grad_norm": 5.307112693786621, + "learning_rate": 0.000120747746385625, + "loss": 2.0697, + "step": 40025 + }, + { + "epoch": 36.244343891402714, + "grad_norm": 4.577943325042725, + "learning_rate": 0.00012070079631871192, + "loss": 2.013, + "step": 40050 + }, + { + "epoch": 36.26696832579186, + "grad_norm": 5.501679420471191, + "learning_rate": 0.00012065381774857832, + "loss": 2.0398, + "step": 
40075 + }, + { + "epoch": 36.289592760180994, + "grad_norm": 5.405413627624512, + "learning_rate": 0.00012060681070452438, + "loss": 2.1319, + "step": 40100 + }, + { + "epoch": 36.31221719457014, + "grad_norm": 4.200492858886719, + "learning_rate": 0.00012055977521586798, + "loss": 2.2056, + "step": 40125 + }, + { + "epoch": 36.334841628959275, + "grad_norm": 4.862429141998291, + "learning_rate": 0.0001205127113119448, + "loss": 1.9553, + "step": 40150 + }, + { + "epoch": 36.35746606334842, + "grad_norm": 5.6993536949157715, + "learning_rate": 0.00012046561902210822, + "loss": 2.3308, + "step": 40175 + }, + { + "epoch": 36.380090497737555, + "grad_norm": 5.029634952545166, + "learning_rate": 0.00012041849837572929, + "loss": 2.2173, + "step": 40200 + }, + { + "epoch": 36.4027149321267, + "grad_norm": 6.117025852203369, + "learning_rate": 0.00012037134940219684, + "loss": 2.0294, + "step": 40225 + }, + { + "epoch": 36.425339366515836, + "grad_norm": 4.50570821762085, + "learning_rate": 0.00012032417213091728, + "loss": 2.1731, + "step": 40250 + }, + { + "epoch": 36.44796380090498, + "grad_norm": 4.521318435668945, + "learning_rate": 0.00012027696659131466, + "loss": 2.003, + "step": 40275 + }, + { + "epoch": 36.470588235294116, + "grad_norm": 4.803319454193115, + "learning_rate": 0.00012022973281283073, + "loss": 1.9514, + "step": 40300 + }, + { + "epoch": 36.49321266968326, + "grad_norm": 5.161010265350342, + "learning_rate": 0.00012018247082492483, + "loss": 2.2183, + "step": 40325 + }, + { + "epoch": 36.515837104072396, + "grad_norm": 5.701634407043457, + "learning_rate": 0.00012013518065707387, + "loss": 2.0652, + "step": 40350 + }, + { + "epoch": 36.53846153846154, + "grad_norm": 5.202276229858398, + "learning_rate": 0.00012008786233877233, + "loss": 2.0276, + "step": 40375 + }, + { + "epoch": 36.56108597285068, + "grad_norm": 5.018892765045166, + "learning_rate": 0.00012004051589953232, + "loss": 2.1121, + "step": 40400 + }, + { + "epoch": 36.58371040723982, + 
"grad_norm": 5.593802452087402, + "learning_rate": 0.00011999314136888338, + "loss": 2.0423, + "step": 40425 + }, + { + "epoch": 36.60633484162896, + "grad_norm": 5.779587268829346, + "learning_rate": 0.00011994573877637264, + "loss": 2.0321, + "step": 40450 + }, + { + "epoch": 36.6289592760181, + "grad_norm": 4.755776882171631, + "learning_rate": 0.00011989830815156473, + "loss": 2.0956, + "step": 40475 + }, + { + "epoch": 36.65158371040724, + "grad_norm": 4.763101577758789, + "learning_rate": 0.00011985084952404173, + "loss": 2.0916, + "step": 40500 + }, + { + "epoch": 36.67420814479638, + "grad_norm": 4.131762981414795, + "learning_rate": 0.00011980336292340324, + "loss": 2.0092, + "step": 40525 + }, + { + "epoch": 36.69683257918552, + "grad_norm": 4.448458671569824, + "learning_rate": 0.00011975584837926623, + "loss": 2.1099, + "step": 40550 + }, + { + "epoch": 36.71945701357466, + "grad_norm": 4.395147323608398, + "learning_rate": 0.00011970830592126517, + "loss": 1.9198, + "step": 40575 + }, + { + "epoch": 36.7420814479638, + "grad_norm": 5.14371919631958, + "learning_rate": 0.00011966073557905188, + "loss": 2.138, + "step": 40600 + }, + { + "epoch": 36.76470588235294, + "grad_norm": 5.342911720275879, + "learning_rate": 0.00011961313738229565, + "loss": 2.1533, + "step": 40625 + }, + { + "epoch": 36.78733031674208, + "grad_norm": 4.580821990966797, + "learning_rate": 0.00011956551136068306, + "loss": 2.2054, + "step": 40650 + }, + { + "epoch": 36.80995475113122, + "grad_norm": 4.565236568450928, + "learning_rate": 0.00011951785754391807, + "loss": 2.1256, + "step": 40675 + }, + { + "epoch": 36.83257918552036, + "grad_norm": 4.817327499389648, + "learning_rate": 0.00011947017596172202, + "loss": 2.2157, + "step": 40700 + }, + { + "epoch": 36.8552036199095, + "grad_norm": 4.755205154418945, + "learning_rate": 0.0001194224666438335, + "loss": 2.017, + "step": 40725 + }, + { + "epoch": 36.87782805429864, + "grad_norm": 4.711893558502197, + "learning_rate": 
0.00011937472962000844, + "loss": 2.2593, + "step": 40750 + }, + { + "epoch": 36.90045248868778, + "grad_norm": 5.197144508361816, + "learning_rate": 0.00011932696492002003, + "loss": 2.1794, + "step": 40775 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 3.8158843517303467, + "learning_rate": 0.00011927917257365873, + "loss": 2.0601, + "step": 40800 + }, + { + "epoch": 36.94570135746606, + "grad_norm": 5.204895973205566, + "learning_rate": 0.00011923135261073229, + "loss": 2.1827, + "step": 40825 + }, + { + "epoch": 36.9683257918552, + "grad_norm": 4.822196960449219, + "learning_rate": 0.00011918350506106556, + "loss": 2.1934, + "step": 40850 + }, + { + "epoch": 36.990950226244344, + "grad_norm": 3.964585304260254, + "learning_rate": 0.00011913562995450072, + "loss": 2.1056, + "step": 40875 + }, + { + "epoch": 37.01357466063349, + "grad_norm": 3.375369071960449, + "learning_rate": 0.00011908772732089709, + "loss": 1.899, + "step": 40900 + }, + { + "epoch": 37.036199095022624, + "grad_norm": 4.7697834968566895, + "learning_rate": 0.00011903979719013116, + "loss": 1.8776, + "step": 40925 + }, + { + "epoch": 37.05882352941177, + "grad_norm": 4.175597190856934, + "learning_rate": 0.00011899183959209656, + "loss": 1.8977, + "step": 40950 + }, + { + "epoch": 37.081447963800905, + "grad_norm": 4.101481914520264, + "learning_rate": 0.00011894385455670405, + "loss": 1.9835, + "step": 40975 + }, + { + "epoch": 37.10407239819005, + "grad_norm": 4.512917518615723, + "learning_rate": 0.00011889584211388152, + "loss": 1.9868, + "step": 41000 + }, + { + "epoch": 37.126696832579185, + "grad_norm": 5.312557697296143, + "learning_rate": 0.00011884780229357397, + "loss": 1.9328, + "step": 41025 + }, + { + "epoch": 37.14932126696833, + "grad_norm": 5.03694486618042, + "learning_rate": 0.0001187997351257434, + "loss": 1.8786, + "step": 41050 + }, + { + "epoch": 37.171945701357465, + "grad_norm": 4.9827070236206055, + "learning_rate": 0.00011875164064036896, + "loss": 1.9491, + 
"step": 41075 + }, + { + "epoch": 37.19457013574661, + "grad_norm": 5.550374507904053, + "learning_rate": 0.0001187035188674468, + "loss": 1.9217, + "step": 41100 + }, + { + "epoch": 37.217194570135746, + "grad_norm": 4.104196071624756, + "learning_rate": 0.00011865536983699005, + "loss": 1.93, + "step": 41125 + }, + { + "epoch": 37.23981900452489, + "grad_norm": 5.8158369064331055, + "learning_rate": 0.0001186071935790289, + "loss": 1.9963, + "step": 41150 + }, + { + "epoch": 37.262443438914026, + "grad_norm": 4.323169708251953, + "learning_rate": 0.00011855899012361047, + "loss": 1.909, + "step": 41175 + }, + { + "epoch": 37.28506787330317, + "grad_norm": 4.632725715637207, + "learning_rate": 0.0001185107595007989, + "loss": 2.0336, + "step": 41200 + }, + { + "epoch": 37.30769230769231, + "grad_norm": 4.106157302856445, + "learning_rate": 0.00011846250174067522, + "loss": 2.1866, + "step": 41225 + }, + { + "epoch": 37.33031674208145, + "grad_norm": 4.956432342529297, + "learning_rate": 0.00011841421687333743, + "loss": 2.0205, + "step": 41250 + }, + { + "epoch": 37.35294117647059, + "grad_norm": 6.1458964347839355, + "learning_rate": 0.00011836590492890039, + "loss": 2.0438, + "step": 41275 + }, + { + "epoch": 37.37556561085973, + "grad_norm": 5.244638919830322, + "learning_rate": 0.0001183175659374959, + "loss": 2.1053, + "step": 41300 + }, + { + "epoch": 37.39819004524887, + "grad_norm": 4.762988567352295, + "learning_rate": 0.00011826919992927255, + "loss": 1.9517, + "step": 41325 + }, + { + "epoch": 37.42081447963801, + "grad_norm": 4.060920238494873, + "learning_rate": 0.00011822080693439589, + "loss": 2.0046, + "step": 41350 + }, + { + "epoch": 37.44343891402715, + "grad_norm": 4.41787576675415, + "learning_rate": 0.00011817238698304823, + "loss": 2.0745, + "step": 41375 + }, + { + "epoch": 37.46606334841629, + "grad_norm": 3.6469852924346924, + "learning_rate": 0.00011812394010542869, + "loss": 1.8661, + "step": 41400 + }, + { + "epoch": 37.48868778280543, 
+ "grad_norm": 4.105870246887207, + "learning_rate": 0.00011807546633175323, + "loss": 2.0869, + "step": 41425 + }, + { + "epoch": 37.51131221719457, + "grad_norm": 4.687159061431885, + "learning_rate": 0.0001180269656922545, + "loss": 1.8561, + "step": 41450 + }, + { + "epoch": 37.53393665158371, + "grad_norm": 4.697596073150635, + "learning_rate": 0.00011797843821718201, + "loss": 2.0736, + "step": 41475 + }, + { + "epoch": 37.55656108597285, + "grad_norm": 5.025703430175781, + "learning_rate": 0.00011792988393680192, + "loss": 1.927, + "step": 41500 + }, + { + "epoch": 37.57918552036199, + "grad_norm": 4.535080909729004, + "learning_rate": 0.00011788130288139719, + "loss": 2.0446, + "step": 41525 + }, + { + "epoch": 37.60180995475113, + "grad_norm": 5.132175445556641, + "learning_rate": 0.0001178326950812674, + "loss": 1.9801, + "step": 41550 + }, + { + "epoch": 37.62443438914027, + "grad_norm": 4.711911678314209, + "learning_rate": 0.00011778406056672883, + "loss": 1.9259, + "step": 41575 + }, + { + "epoch": 37.64705882352941, + "grad_norm": 4.1732282638549805, + "learning_rate": 0.00011773539936811449, + "loss": 1.9793, + "step": 41600 + }, + { + "epoch": 37.66968325791855, + "grad_norm": 4.863363742828369, + "learning_rate": 0.00011768671151577396, + "loss": 1.9115, + "step": 41625 + }, + { + "epoch": 37.69230769230769, + "grad_norm": 4.690446853637695, + "learning_rate": 0.00011763799704007343, + "loss": 2.1073, + "step": 41650 + }, + { + "epoch": 37.71493212669683, + "grad_norm": 5.44816255569458, + "learning_rate": 0.00011758925597139577, + "loss": 1.9904, + "step": 41675 + }, + { + "epoch": 37.737556561085974, + "grad_norm": 4.960061073303223, + "learning_rate": 0.00011754048834014034, + "loss": 2.3171, + "step": 41700 + }, + { + "epoch": 37.76018099547511, + "grad_norm": 4.3642354011535645, + "learning_rate": 0.00011749169417672319, + "loss": 1.9993, + "step": 41725 + }, + { + "epoch": 37.782805429864254, + "grad_norm": 3.7109146118164062, + 
"learning_rate": 0.00011744287351157682, + "loss": 2.0134, + "step": 41750 + }, + { + "epoch": 37.80542986425339, + "grad_norm": 4.4268927574157715, + "learning_rate": 0.00011739402637515027, + "loss": 2.1329, + "step": 41775 + }, + { + "epoch": 37.828054298642535, + "grad_norm": 4.4178643226623535, + "learning_rate": 0.00011734515279790915, + "loss": 2.1266, + "step": 41800 + }, + { + "epoch": 37.85067873303167, + "grad_norm": 5.504481315612793, + "learning_rate": 0.00011729625281033546, + "loss": 2.1022, + "step": 41825 + }, + { + "epoch": 37.873303167420815, + "grad_norm": 4.019266128540039, + "learning_rate": 0.00011724732644292778, + "loss": 1.9571, + "step": 41850 + }, + { + "epoch": 37.89592760180995, + "grad_norm": 4.862491607666016, + "learning_rate": 0.00011719837372620108, + "loss": 1.9971, + "step": 41875 + }, + { + "epoch": 37.918552036199095, + "grad_norm": 4.543803691864014, + "learning_rate": 0.00011714939469068675, + "loss": 2.0642, + "step": 41900 + }, + { + "epoch": 37.94117647058823, + "grad_norm": 5.209831714630127, + "learning_rate": 0.00011710038936693266, + "loss": 1.9744, + "step": 41925 + }, + { + "epoch": 37.963800904977376, + "grad_norm": 3.90395188331604, + "learning_rate": 0.00011705135778550302, + "loss": 2.0004, + "step": 41950 + }, + { + "epoch": 37.98642533936652, + "grad_norm": 4.3217878341674805, + "learning_rate": 0.00011700229997697843, + "loss": 1.9563, + "step": 41975 + }, + { + "epoch": 38.009049773755656, + "grad_norm": 3.418581485748291, + "learning_rate": 0.00011695321597195587, + "loss": 1.9149, + "step": 42000 + }, + { + "epoch": 38.0316742081448, + "grad_norm": 4.049460411071777, + "learning_rate": 0.00011690410580104862, + "loss": 1.8076, + "step": 42025 + }, + { + "epoch": 38.05429864253394, + "grad_norm": 5.510961532592773, + "learning_rate": 0.00011685496949488631, + "loss": 1.793, + "step": 42050 + }, + { + "epoch": 38.07692307692308, + "grad_norm": 5.471452713012695, + "learning_rate": 0.00011680580708411488, + 
"loss": 1.8731, + "step": 42075 + }, + { + "epoch": 38.09954751131222, + "grad_norm": 5.05703592300415, + "learning_rate": 0.00011675661859939648, + "loss": 1.8416, + "step": 42100 + }, + { + "epoch": 38.12217194570136, + "grad_norm": 4.373697280883789, + "learning_rate": 0.00011670740407140963, + "loss": 1.9638, + "step": 42125 + }, + { + "epoch": 38.1447963800905, + "grad_norm": 5.67213773727417, + "learning_rate": 0.00011665816353084898, + "loss": 1.7583, + "step": 42150 + }, + { + "epoch": 38.16742081447964, + "grad_norm": 4.141268253326416, + "learning_rate": 0.00011660889700842552, + "loss": 2.103, + "step": 42175 + }, + { + "epoch": 38.19004524886878, + "grad_norm": 5.167728424072266, + "learning_rate": 0.00011655960453486637, + "loss": 2.0116, + "step": 42200 + }, + { + "epoch": 38.21266968325792, + "grad_norm": 6.356166839599609, + "learning_rate": 0.00011651028614091482, + "loss": 1.9145, + "step": 42225 + }, + { + "epoch": 38.23529411764706, + "grad_norm": 4.968979358673096, + "learning_rate": 0.00011646094185733036, + "loss": 1.8108, + "step": 42250 + }, + { + "epoch": 38.2579185520362, + "grad_norm": 7.776438236236572, + "learning_rate": 0.00011641157171488867, + "loss": 1.9705, + "step": 42275 + }, + { + "epoch": 38.28054298642534, + "grad_norm": 4.122015476226807, + "learning_rate": 0.00011636217574438146, + "loss": 1.9069, + "step": 42300 + }, + { + "epoch": 38.30316742081448, + "grad_norm": 3.944101333618164, + "learning_rate": 0.00011631275397661664, + "loss": 1.8944, + "step": 42325 + }, + { + "epoch": 38.32579185520362, + "grad_norm": 4.42734432220459, + "learning_rate": 0.00011626330644241815, + "loss": 1.9438, + "step": 42350 + }, + { + "epoch": 38.34841628959276, + "grad_norm": 3.9971983432769775, + "learning_rate": 0.00011621383317262603, + "loss": 1.7437, + "step": 42375 + }, + { + "epoch": 38.3710407239819, + "grad_norm": 5.1051106452941895, + "learning_rate": 0.00011616433419809634, + "loss": 1.9386, + "step": 42400 + }, + { + "epoch": 
38.39366515837104, + "grad_norm": 6.243931770324707, + "learning_rate": 0.00011611480954970122, + "loss": 1.9027, + "step": 42425 + }, + { + "epoch": 38.41628959276018, + "grad_norm": 4.356624603271484, + "learning_rate": 0.0001160652592583288, + "loss": 1.9252, + "step": 42450 + }, + { + "epoch": 38.43891402714932, + "grad_norm": 4.535741806030273, + "learning_rate": 0.00011601568335488318, + "loss": 2.0281, + "step": 42475 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 5.314903736114502, + "learning_rate": 0.00011596608187028447, + "loss": 1.8095, + "step": 42500 + }, + { + "epoch": 38.484162895927604, + "grad_norm": 5.069538116455078, + "learning_rate": 0.0001159164548354687, + "loss": 2.0085, + "step": 42525 + }, + { + "epoch": 38.50678733031674, + "grad_norm": 5.700867176055908, + "learning_rate": 0.00011586680228138787, + "loss": 1.8026, + "step": 42550 + }, + { + "epoch": 38.529411764705884, + "grad_norm": 3.9822821617126465, + "learning_rate": 0.00011581712423900985, + "loss": 1.8676, + "step": 42575 + }, + { + "epoch": 38.55203619909502, + "grad_norm": 4.560797214508057, + "learning_rate": 0.00011576940936769776, + "loss": 1.9595, + "step": 42600 + }, + { + "epoch": 38.574660633484164, + "grad_norm": 5.297098159790039, + "learning_rate": 0.00011571968145814983, + "loss": 1.8811, + "step": 42625 + }, + { + "epoch": 38.5972850678733, + "grad_norm": 3.8053483963012695, + "learning_rate": 0.00011566992815206284, + "loss": 2.0062, + "step": 42650 + }, + { + "epoch": 38.619909502262445, + "grad_norm": 4.440960884094238, + "learning_rate": 0.00011562014948046748, + "loss": 2.052, + "step": 42675 + }, + { + "epoch": 38.64253393665158, + "grad_norm": 6.567005634307861, + "learning_rate": 0.00011557034547441034, + "loss": 2.0608, + "step": 42700 + }, + { + "epoch": 38.665158371040725, + "grad_norm": 4.547772407531738, + "learning_rate": 0.00011552051616495379, + "loss": 1.9677, + "step": 42725 + }, + { + "epoch": 38.68778280542986, + "grad_norm": 
4.409372806549072, + "learning_rate": 0.00011547066158317594, + "loss": 2.003, + "step": 42750 + }, + { + "epoch": 38.710407239819006, + "grad_norm": 4.9843058586120605, + "learning_rate": 0.00011542078176017068, + "loss": 1.9311, + "step": 42775 + }, + { + "epoch": 38.73303167420814, + "grad_norm": 4.709278106689453, + "learning_rate": 0.0001153708767270477, + "loss": 1.7398, + "step": 42800 + }, + { + "epoch": 38.755656108597286, + "grad_norm": 5.142955303192139, + "learning_rate": 0.00011532094651493235, + "loss": 1.9743, + "step": 42825 + }, + { + "epoch": 38.77828054298642, + "grad_norm": 5.384269714355469, + "learning_rate": 0.00011527099115496569, + "loss": 1.8471, + "step": 42850 + }, + { + "epoch": 38.800904977375566, + "grad_norm": 5.019404411315918, + "learning_rate": 0.00011522101067830449, + "loss": 1.8774, + "step": 42875 + }, + { + "epoch": 38.8235294117647, + "grad_norm": 4.817010402679443, + "learning_rate": 0.00011517100511612118, + "loss": 1.903, + "step": 42900 + }, + { + "epoch": 38.84615384615385, + "grad_norm": 4.09938907623291, + "learning_rate": 0.00011512097449960381, + "loss": 2.0208, + "step": 42925 + }, + { + "epoch": 38.86877828054298, + "grad_norm": 5.036036491394043, + "learning_rate": 0.0001150709188599561, + "loss": 1.9105, + "step": 42950 + }, + { + "epoch": 38.89140271493213, + "grad_norm": 4.437981605529785, + "learning_rate": 0.00011502083822839734, + "loss": 1.9323, + "step": 42975 + }, + { + "epoch": 38.914027149321264, + "grad_norm": 4.4150390625, + "learning_rate": 0.00011497073263616241, + "loss": 1.9789, + "step": 43000 + }, + { + "epoch": 38.93665158371041, + "grad_norm": 4.1862640380859375, + "learning_rate": 0.00011492060211450178, + "loss": 2.0008, + "step": 43025 + }, + { + "epoch": 38.959276018099544, + "grad_norm": 4.7120361328125, + "learning_rate": 0.00011487044669468144, + "loss": 1.9275, + "step": 43050 + }, + { + "epoch": 38.98190045248869, + "grad_norm": 4.61646842956543, + "learning_rate": 
0.00011482026640798293, + "loss": 1.926, + "step": 43075 + }, + { + "epoch": 39.00452488687783, + "grad_norm": 5.012235164642334, + "learning_rate": 0.00011477006128570328, + "loss": 1.9905, + "step": 43100 + }, + { + "epoch": 39.02714932126697, + "grad_norm": 4.445367336273193, + "learning_rate": 0.00011471983135915506, + "loss": 1.7675, + "step": 43125 + }, + { + "epoch": 39.04977375565611, + "grad_norm": 3.9145455360412598, + "learning_rate": 0.00011466957665966624, + "loss": 1.6291, + "step": 43150 + }, + { + "epoch": 39.07239819004525, + "grad_norm": 4.605747699737549, + "learning_rate": 0.00011461929721858028, + "loss": 1.8455, + "step": 43175 + }, + { + "epoch": 39.09502262443439, + "grad_norm": 4.1831440925598145, + "learning_rate": 0.00011456899306725608, + "loss": 1.6843, + "step": 43200 + }, + { + "epoch": 39.11764705882353, + "grad_norm": 4.9170026779174805, + "learning_rate": 0.0001145186642370679, + "loss": 1.8931, + "step": 43225 + }, + { + "epoch": 39.14027149321267, + "grad_norm": 5.320871353149414, + "learning_rate": 0.00011446831075940548, + "loss": 1.752, + "step": 43250 + }, + { + "epoch": 39.16289592760181, + "grad_norm": 5.1726975440979, + "learning_rate": 0.00011441793266567382, + "loss": 1.7615, + "step": 43275 + }, + { + "epoch": 39.18552036199095, + "grad_norm": 4.3221964836120605, + "learning_rate": 0.00011436752998729339, + "loss": 1.8699, + "step": 43300 + }, + { + "epoch": 39.20814479638009, + "grad_norm": 5.191243648529053, + "learning_rate": 0.00011431710275569989, + "loss": 1.8307, + "step": 43325 + }, + { + "epoch": 39.23076923076923, + "grad_norm": 5.0341796875, + "learning_rate": 0.00011426665100234442, + "loss": 1.8825, + "step": 43350 + }, + { + "epoch": 39.25339366515837, + "grad_norm": 3.7480239868164062, + "learning_rate": 0.00011421617475869331, + "loss": 1.6761, + "step": 43375 + }, + { + "epoch": 39.276018099547514, + "grad_norm": 4.302507400512695, + "learning_rate": 0.0001141656740562282, + "loss": 1.935, + "step": 
43400 + }, + { + "epoch": 39.29864253393665, + "grad_norm": 4.641387462615967, + "learning_rate": 0.00011411514892644595, + "loss": 1.9015, + "step": 43425 + }, + { + "epoch": 39.321266968325794, + "grad_norm": 4.742063045501709, + "learning_rate": 0.00011406459940085872, + "loss": 1.8855, + "step": 43450 + }, + { + "epoch": 39.34389140271493, + "grad_norm": 6.475007057189941, + "learning_rate": 0.0001140140255109938, + "loss": 1.7541, + "step": 43475 + }, + { + "epoch": 39.366515837104075, + "grad_norm": 4.648007869720459, + "learning_rate": 0.00011396342728839376, + "loss": 1.8593, + "step": 43500 + }, + { + "epoch": 39.38914027149321, + "grad_norm": 4.917444229125977, + "learning_rate": 0.00011391280476461629, + "loss": 1.9333, + "step": 43525 + }, + { + "epoch": 39.411764705882355, + "grad_norm": 5.695059299468994, + "learning_rate": 0.00011386215797123425, + "loss": 1.8917, + "step": 43550 + }, + { + "epoch": 39.43438914027149, + "grad_norm": 5.259223937988281, + "learning_rate": 0.00011381148693983562, + "loss": 1.9861, + "step": 43575 + }, + { + "epoch": 39.457013574660635, + "grad_norm": 4.331487655639648, + "learning_rate": 0.00011376079170202356, + "loss": 1.6762, + "step": 43600 + }, + { + "epoch": 39.47963800904977, + "grad_norm": 5.437529563903809, + "learning_rate": 0.00011371007228941624, + "loss": 1.8088, + "step": 43625 + }, + { + "epoch": 39.502262443438916, + "grad_norm": 5.5596022605896, + "learning_rate": 0.00011365932873364697, + "loss": 1.7223, + "step": 43650 + }, + { + "epoch": 39.52488687782805, + "grad_norm": 4.620628356933594, + "learning_rate": 0.00011360856106636412, + "loss": 1.7117, + "step": 43675 + }, + { + "epoch": 39.547511312217196, + "grad_norm": 4.542058944702148, + "learning_rate": 0.00011355776931923104, + "loss": 1.9292, + "step": 43700 + }, + { + "epoch": 39.57013574660633, + "grad_norm": 4.393435955047607, + "learning_rate": 0.00011350695352392617, + "loss": 1.91, + "step": 43725 + }, + { + "epoch": 39.59276018099548, + 
"grad_norm": 5.202274322509766, + "learning_rate": 0.00011345611371214287, + "loss": 1.7783, + "step": 43750 + }, + { + "epoch": 39.61538461538461, + "grad_norm": 4.276601791381836, + "learning_rate": 0.00011340524991558958, + "loss": 1.7813, + "step": 43775 + }, + { + "epoch": 39.63800904977376, + "grad_norm": 4.053043365478516, + "learning_rate": 0.0001133543621659896, + "loss": 1.9062, + "step": 43800 + }, + { + "epoch": 39.660633484162894, + "grad_norm": 5.021280765533447, + "learning_rate": 0.00011330345049508122, + "loss": 1.7271, + "step": 43825 + }, + { + "epoch": 39.68325791855204, + "grad_norm": 4.679367542266846, + "learning_rate": 0.00011325251493461763, + "loss": 1.9449, + "step": 43850 + }, + { + "epoch": 39.705882352941174, + "grad_norm": 4.967319488525391, + "learning_rate": 0.00011320155551636697, + "loss": 1.8454, + "step": 43875 + }, + { + "epoch": 39.72850678733032, + "grad_norm": 5.371568202972412, + "learning_rate": 0.00011315057227211218, + "loss": 1.9814, + "step": 43900 + }, + { + "epoch": 39.751131221719454, + "grad_norm": 4.940624237060547, + "learning_rate": 0.00011309956523365114, + "loss": 1.8542, + "step": 43925 + }, + { + "epoch": 39.7737556561086, + "grad_norm": 5.8213043212890625, + "learning_rate": 0.0001130485344327965, + "loss": 1.8071, + "step": 43950 + }, + { + "epoch": 39.796380090497735, + "grad_norm": 4.60387659072876, + "learning_rate": 0.00011299747990137579, + "loss": 1.8441, + "step": 43975 + }, + { + "epoch": 39.81900452488688, + "grad_norm": 4.248369216918945, + "learning_rate": 0.00011294640167123127, + "loss": 1.7951, + "step": 44000 + }, + { + "epoch": 39.841628959276015, + "grad_norm": 5.201953887939453, + "learning_rate": 0.00011289529977422006, + "loss": 1.8703, + "step": 44025 + }, + { + "epoch": 39.86425339366516, + "grad_norm": 3.783778190612793, + "learning_rate": 0.00011284417424221399, + "loss": 1.8602, + "step": 44050 + }, + { + "epoch": 39.886877828054295, + "grad_norm": 5.063864707946777, + 
"learning_rate": 0.00011279302510709964, + "loss": 1.8747, + "step": 44075 + }, + { + "epoch": 39.90950226244344, + "grad_norm": 5.825732707977295, + "learning_rate": 0.00011274185240077831, + "loss": 1.8468, + "step": 44100 + }, + { + "epoch": 39.932126696832576, + "grad_norm": 4.459935188293457, + "learning_rate": 0.00011269065615516604, + "loss": 1.8034, + "step": 44125 + }, + { + "epoch": 39.95475113122172, + "grad_norm": 4.279222011566162, + "learning_rate": 0.00011263943640219348, + "loss": 1.7833, + "step": 44150 + }, + { + "epoch": 39.977375565610856, + "grad_norm": 4.123258590698242, + "learning_rate": 0.00011258819317380599, + "loss": 2.0082, + "step": 44175 + }, + { + "epoch": 40.0, + "grad_norm": 3.5205819606781006, + "learning_rate": 0.00011253692650196358, + "loss": 1.7227, + "step": 44200 + }, + { + "epoch": 40.022624434389144, + "grad_norm": 4.596939563751221, + "learning_rate": 0.00011248563641864084, + "loss": 1.8559, + "step": 44225 + }, + { + "epoch": 40.04524886877828, + "grad_norm": 5.928715229034424, + "learning_rate": 0.000112434322955827, + "loss": 1.5835, + "step": 44250 + }, + { + "epoch": 40.067873303167424, + "grad_norm": 5.3721699714660645, + "learning_rate": 0.00011238298614552586, + "loss": 1.7946, + "step": 44275 + }, + { + "epoch": 40.09049773755656, + "grad_norm": 5.1657633781433105, + "learning_rate": 0.00011233162601975576, + "loss": 1.5946, + "step": 44300 + }, + { + "epoch": 40.113122171945705, + "grad_norm": 5.189513206481934, + "learning_rate": 0.0001122802426105496, + "loss": 1.7134, + "step": 44325 + }, + { + "epoch": 40.13574660633484, + "grad_norm": 4.161139488220215, + "learning_rate": 0.00011222883594995482, + "loss": 1.7487, + "step": 44350 + }, + { + "epoch": 40.158371040723985, + "grad_norm": 4.272087097167969, + "learning_rate": 0.0001121774060700333, + "loss": 1.7234, + "step": 44375 + }, + { + "epoch": 40.18099547511312, + "grad_norm": 3.691531181335449, + "learning_rate": 0.0001121259530028615, + "loss": 1.6367, 
+ "step": 44400 + }, + { + "epoch": 40.203619909502265, + "grad_norm": 4.305932998657227, + "learning_rate": 0.00011207447678053024, + "loss": 1.646, + "step": 44425 + }, + { + "epoch": 40.2262443438914, + "grad_norm": 4.7150397300720215, + "learning_rate": 0.00011202297743514485, + "loss": 1.6649, + "step": 44450 + }, + { + "epoch": 40.248868778280546, + "grad_norm": 5.014179229736328, + "learning_rate": 0.00011197145499882505, + "loss": 1.7508, + "step": 44475 + }, + { + "epoch": 40.27149321266968, + "grad_norm": 4.31679105758667, + "learning_rate": 0.000111919909503705, + "loss": 1.631, + "step": 44500 + }, + { + "epoch": 40.294117647058826, + "grad_norm": 3.9892804622650146, + "learning_rate": 0.00011186834098193317, + "loss": 1.7101, + "step": 44525 + }, + { + "epoch": 40.31674208144796, + "grad_norm": 5.431767463684082, + "learning_rate": 0.00011181674946567244, + "loss": 1.8287, + "step": 44550 + }, + { + "epoch": 40.339366515837106, + "grad_norm": 5.472475051879883, + "learning_rate": 0.00011176513498710005, + "loss": 1.5993, + "step": 44575 + }, + { + "epoch": 40.36199095022624, + "grad_norm": 4.497899055480957, + "learning_rate": 0.00011171349757840752, + "loss": 1.7544, + "step": 44600 + }, + { + "epoch": 40.38461538461539, + "grad_norm": 4.565390110015869, + "learning_rate": 0.00011166183727180069, + "loss": 1.5822, + "step": 44625 + }, + { + "epoch": 40.40723981900452, + "grad_norm": 5.392773151397705, + "learning_rate": 0.00011161015409949968, + "loss": 1.7425, + "step": 44650 + }, + { + "epoch": 40.42986425339367, + "grad_norm": 5.015474319458008, + "learning_rate": 0.00011155844809373889, + "loss": 1.8255, + "step": 44675 + }, + { + "epoch": 40.452488687782804, + "grad_norm": 5.054648399353027, + "learning_rate": 0.00011150671928676691, + "loss": 1.8321, + "step": 44700 + }, + { + "epoch": 40.47511312217195, + "grad_norm": 4.974578380584717, + "learning_rate": 0.00011145496771084659, + "loss": 1.8072, + "step": 44725 + }, + { + "epoch": 
40.497737556561084, + "grad_norm": 5.514859676361084, + "learning_rate": 0.00011140319339825497, + "loss": 1.6787, + "step": 44750 + }, + { + "epoch": 40.52036199095023, + "grad_norm": 4.790339946746826, + "learning_rate": 0.00011135139638128332, + "loss": 1.5476, + "step": 44775 + }, + { + "epoch": 40.542986425339365, + "grad_norm": 6.611651420593262, + "learning_rate": 0.00011129957669223695, + "loss": 1.8185, + "step": 44800 + }, + { + "epoch": 40.56561085972851, + "grad_norm": 4.54742956161499, + "learning_rate": 0.00011124773436343543, + "loss": 1.8504, + "step": 44825 + }, + { + "epoch": 40.588235294117645, + "grad_norm": 3.9661245346069336, + "learning_rate": 0.0001111958694272124, + "loss": 1.8194, + "step": 44850 + }, + { + "epoch": 40.61085972850679, + "grad_norm": 4.951076507568359, + "learning_rate": 0.00011114398191591562, + "loss": 2.0075, + "step": 44875 + }, + { + "epoch": 40.633484162895925, + "grad_norm": 4.756132125854492, + "learning_rate": 0.00011109207186190689, + "loss": 1.5663, + "step": 44900 + }, + { + "epoch": 40.65610859728507, + "grad_norm": 5.745034217834473, + "learning_rate": 0.00011104013929756209, + "loss": 1.7752, + "step": 44925 + }, + { + "epoch": 40.678733031674206, + "grad_norm": 4.642129421234131, + "learning_rate": 0.00011098818425527114, + "loss": 1.642, + "step": 44950 + }, + { + "epoch": 40.70135746606335, + "grad_norm": 4.858119010925293, + "learning_rate": 0.00011093620676743805, + "loss": 1.7187, + "step": 44975 + }, + { + "epoch": 40.723981900452486, + "grad_norm": 4.606063365936279, + "learning_rate": 0.00011088420686648067, + "loss": 1.8942, + "step": 45000 + }, + { + "epoch": 40.74660633484163, + "grad_norm": 5.150576591491699, + "learning_rate": 0.000110832184584831, + "loss": 1.8682, + "step": 45025 + }, + { + "epoch": 40.76923076923077, + "grad_norm": 3.524111032485962, + "learning_rate": 0.00011078013995493485, + "loss": 1.868, + "step": 45050 + }, + { + "epoch": 40.79185520361991, + "grad_norm": 
5.863454818725586, + "learning_rate": 0.00011072807300925209, + "loss": 1.8941, + "step": 45075 + }, + { + "epoch": 40.81447963800905, + "grad_norm": 5.784281253814697, + "learning_rate": 0.00011067598378025643, + "loss": 1.7836, + "step": 45100 + }, + { + "epoch": 40.83710407239819, + "grad_norm": 5.31019401550293, + "learning_rate": 0.00011062387230043554, + "loss": 1.8444, + "step": 45125 + }, + { + "epoch": 40.85972850678733, + "grad_norm": 4.280027866363525, + "learning_rate": 0.00011057173860229088, + "loss": 1.647, + "step": 45150 + }, + { + "epoch": 40.88235294117647, + "grad_norm": 5.887366771697998, + "learning_rate": 0.00011051958271833787, + "loss": 1.8289, + "step": 45175 + }, + { + "epoch": 40.90497737556561, + "grad_norm": 4.642131328582764, + "learning_rate": 0.00011046740468110568, + "loss": 1.8778, + "step": 45200 + }, + { + "epoch": 40.92760180995475, + "grad_norm": 5.087174892425537, + "learning_rate": 0.00011041520452313732, + "loss": 1.6238, + "step": 45225 + }, + { + "epoch": 40.95022624434389, + "grad_norm": 4.7690582275390625, + "learning_rate": 0.00011036298227698969, + "loss": 1.5938, + "step": 45250 + }, + { + "epoch": 40.97285067873303, + "grad_norm": 5.323506832122803, + "learning_rate": 0.00011031073797523332, + "loss": 1.887, + "step": 45275 + }, + { + "epoch": 40.99547511312217, + "grad_norm": 4.844785690307617, + "learning_rate": 0.00011025847165045257, + "loss": 1.7679, + "step": 45300 + }, + { + "epoch": 41.01809954751131, + "grad_norm": 4.683863162994385, + "learning_rate": 0.00011020618333524554, + "loss": 1.6957, + "step": 45325 + }, + { + "epoch": 41.040723981900456, + "grad_norm": 2.92931866645813, + "learning_rate": 0.00011015387306222402, + "loss": 1.6158, + "step": 45350 + }, + { + "epoch": 41.06334841628959, + "grad_norm": 4.800383567810059, + "learning_rate": 0.00011010154086401354, + "loss": 1.613, + "step": 45375 + }, + { + "epoch": 41.085972850678736, + "grad_norm": 5.136124134063721, + "learning_rate": 
0.00011004918677325321, + "loss": 1.4679, + "step": 45400 + }, + { + "epoch": 41.10859728506787, + "grad_norm": 4.789379596710205, + "learning_rate": 0.00010999681082259594, + "loss": 1.7905, + "step": 45425 + }, + { + "epoch": 41.13122171945702, + "grad_norm": 4.386842727661133, + "learning_rate": 0.00010994441304470811, + "loss": 1.5171, + "step": 45450 + }, + { + "epoch": 41.15384615384615, + "grad_norm": 4.789997577667236, + "learning_rate": 0.00010989199347226987, + "loss": 1.6731, + "step": 45475 + }, + { + "epoch": 41.1764705882353, + "grad_norm": 4.864757061004639, + "learning_rate": 0.00010983955213797482, + "loss": 1.5448, + "step": 45500 + }, + { + "epoch": 41.199095022624434, + "grad_norm": 5.094907760620117, + "learning_rate": 0.00010978708907453026, + "loss": 1.6458, + "step": 45525 + }, + { + "epoch": 41.22171945701358, + "grad_norm": 5.919474124908447, + "learning_rate": 0.00010973460431465693, + "loss": 1.6497, + "step": 45550 + }, + { + "epoch": 41.244343891402714, + "grad_norm": 5.220949649810791, + "learning_rate": 0.00010968209789108917, + "loss": 1.634, + "step": 45575 + }, + { + "epoch": 41.26696832579186, + "grad_norm": 4.432976245880127, + "learning_rate": 0.00010962956983657482, + "loss": 1.4961, + "step": 45600 + }, + { + "epoch": 41.289592760180994, + "grad_norm": 4.356385707855225, + "learning_rate": 0.00010957702018387521, + "loss": 1.5595, + "step": 45625 + }, + { + "epoch": 41.31221719457014, + "grad_norm": 3.8516087532043457, + "learning_rate": 0.00010952444896576515, + "loss": 1.6545, + "step": 45650 + }, + { + "epoch": 41.334841628959275, + "grad_norm": 6.359644412994385, + "learning_rate": 0.00010947185621503287, + "loss": 1.7344, + "step": 45675 + }, + { + "epoch": 41.35746606334842, + "grad_norm": 5.247097492218018, + "learning_rate": 0.00010941924196448005, + "loss": 1.5788, + "step": 45700 + }, + { + "epoch": 41.380090497737555, + "grad_norm": 4.667223930358887, + "learning_rate": 0.00010936660624692176, + "loss": 1.7606, + 
"step": 45725 + }, + { + "epoch": 41.4027149321267, + "grad_norm": 4.9451518058776855, + "learning_rate": 0.0001093139490951865, + "loss": 1.6596, + "step": 45750 + }, + { + "epoch": 41.425339366515836, + "grad_norm": 4.578155517578125, + "learning_rate": 0.00010926127054211612, + "loss": 1.5681, + "step": 45775 + }, + { + "epoch": 41.44796380090498, + "grad_norm": 5.186984539031982, + "learning_rate": 0.00010920857062056577, + "loss": 1.7949, + "step": 45800 + }, + { + "epoch": 41.470588235294116, + "grad_norm": 4.541292190551758, + "learning_rate": 0.00010915584936340401, + "loss": 1.6994, + "step": 45825 + }, + { + "epoch": 41.49321266968326, + "grad_norm": 4.632137298583984, + "learning_rate": 0.00010910310680351266, + "loss": 1.8626, + "step": 45850 + }, + { + "epoch": 41.515837104072396, + "grad_norm": 4.453769207000732, + "learning_rate": 0.00010905034297378684, + "loss": 1.7099, + "step": 45875 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 5.0133867263793945, + "learning_rate": 0.00010899755790713488, + "loss": 1.7266, + "step": 45900 + }, + { + "epoch": 41.56108597285068, + "grad_norm": 4.538997173309326, + "learning_rate": 0.00010894475163647845, + "loss": 1.67, + "step": 45925 + }, + { + "epoch": 41.58371040723982, + "grad_norm": 5.796776294708252, + "learning_rate": 0.00010889192419475238, + "loss": 1.6908, + "step": 45950 + }, + { + "epoch": 41.60633484162896, + "grad_norm": 6.282868385314941, + "learning_rate": 0.00010883907561490472, + "loss": 1.6817, + "step": 45975 + }, + { + "epoch": 41.6289592760181, + "grad_norm": 5.680196762084961, + "learning_rate": 0.00010878620592989672, + "loss": 1.7501, + "step": 46000 + }, + { + "epoch": 41.65158371040724, + "grad_norm": 4.4939398765563965, + "learning_rate": 0.00010873331517270277, + "loss": 1.6653, + "step": 46025 + }, + { + "epoch": 41.67420814479638, + "grad_norm": 5.307185649871826, + "learning_rate": 0.00010868040337631042, + "loss": 1.8483, + "step": 46050 + }, + { + "epoch": 
41.69683257918552, + "grad_norm": 5.628327369689941, + "learning_rate": 0.00010862747057372032, + "loss": 1.6202, + "step": 46075 + }, + { + "epoch": 41.71945701357466, + "grad_norm": 5.606970310211182, + "learning_rate": 0.00010857451679794621, + "loss": 1.5604, + "step": 46100 + }, + { + "epoch": 41.7420814479638, + "grad_norm": 5.794508934020996, + "learning_rate": 0.00010852154208201502, + "loss": 1.6942, + "step": 46125 + }, + { + "epoch": 41.76470588235294, + "grad_norm": 4.090907573699951, + "learning_rate": 0.00010846854645896657, + "loss": 1.5644, + "step": 46150 + }, + { + "epoch": 41.78733031674208, + "grad_norm": 4.428072452545166, + "learning_rate": 0.00010841552996185383, + "loss": 1.5533, + "step": 46175 + }, + { + "epoch": 41.80995475113122, + "grad_norm": 5.153327941894531, + "learning_rate": 0.00010836249262374277, + "loss": 1.5703, + "step": 46200 + }, + { + "epoch": 41.83257918552036, + "grad_norm": 4.66979455947876, + "learning_rate": 0.00010830943447771238, + "loss": 1.8543, + "step": 46225 + }, + { + "epoch": 41.8552036199095, + "grad_norm": 4.50140905380249, + "learning_rate": 0.00010825635555685456, + "loss": 1.6253, + "step": 46250 + }, + { + "epoch": 41.87782805429864, + "grad_norm": 3.9756264686584473, + "learning_rate": 0.00010820325589427422, + "loss": 1.6251, + "step": 46275 + }, + { + "epoch": 41.90045248868778, + "grad_norm": 5.265647888183594, + "learning_rate": 0.00010815013552308918, + "loss": 1.6218, + "step": 46300 + }, + { + "epoch": 41.92307692307692, + "grad_norm": 5.2123308181762695, + "learning_rate": 0.00010809699447643023, + "loss": 1.705, + "step": 46325 + }, + { + "epoch": 41.94570135746606, + "grad_norm": 2.903510808944702, + "learning_rate": 0.000108043832787441, + "loss": 1.6392, + "step": 46350 + }, + { + "epoch": 41.9683257918552, + "grad_norm": 4.76161003112793, + "learning_rate": 0.00010799065048927798, + "loss": 1.8432, + "step": 46375 + }, + { + "epoch": 41.990950226244344, + "grad_norm": 4.800807476043701, + 
"learning_rate": 0.00010793744761511057, + "loss": 1.7893, + "step": 46400 + }, + { + "epoch": 42.01357466063349, + "grad_norm": 4.19058895111084, + "learning_rate": 0.00010788422419812098, + "loss": 1.5279, + "step": 46425 + }, + { + "epoch": 42.036199095022624, + "grad_norm": 5.557961940765381, + "learning_rate": 0.0001078309802715042, + "loss": 1.6492, + "step": 46450 + }, + { + "epoch": 42.05882352941177, + "grad_norm": 4.525270938873291, + "learning_rate": 0.00010777771586846808, + "loss": 1.3892, + "step": 46475 + }, + { + "epoch": 42.081447963800905, + "grad_norm": 5.34505033493042, + "learning_rate": 0.00010772443102223318, + "loss": 1.7326, + "step": 46500 + }, + { + "epoch": 42.10407239819005, + "grad_norm": 5.436420440673828, + "learning_rate": 0.00010767112576603282, + "loss": 1.4617, + "step": 46525 + }, + { + "epoch": 42.126696832579185, + "grad_norm": 4.170498371124268, + "learning_rate": 0.00010761780013311307, + "loss": 1.6095, + "step": 46550 + }, + { + "epoch": 42.14932126696833, + "grad_norm": 6.534799575805664, + "learning_rate": 0.00010756445415673272, + "loss": 1.5952, + "step": 46575 + }, + { + "epoch": 42.171945701357465, + "grad_norm": 4.798860549926758, + "learning_rate": 0.00010751108787016321, + "loss": 1.5917, + "step": 46600 + }, + { + "epoch": 42.19457013574661, + "grad_norm": 4.602447032928467, + "learning_rate": 0.00010745770130668866, + "loss": 1.4728, + "step": 46625 + }, + { + "epoch": 42.217194570135746, + "grad_norm": 4.544776439666748, + "learning_rate": 0.00010740429449960586, + "loss": 1.4985, + "step": 46650 + }, + { + "epoch": 42.23981900452489, + "grad_norm": 7.215638637542725, + "learning_rate": 0.00010735086748222419, + "loss": 1.5785, + "step": 46675 + }, + { + "epoch": 42.262443438914026, + "grad_norm": 4.556062698364258, + "learning_rate": 0.00010729742028786567, + "loss": 1.5428, + "step": 46700 + }, + { + "epoch": 42.28506787330317, + "grad_norm": 4.447551727294922, + "learning_rate": 0.00010724395294986487, + 
"loss": 1.6337, + "step": 46725 + }, + { + "epoch": 42.30769230769231, + "grad_norm": 3.2624270915985107, + "learning_rate": 0.00010719046550156895, + "loss": 1.5049, + "step": 46750 + }, + { + "epoch": 42.33031674208145, + "grad_norm": 5.950804233551025, + "learning_rate": 0.00010713695797633759, + "loss": 1.612, + "step": 46775 + }, + { + "epoch": 42.35294117647059, + "grad_norm": 4.925538063049316, + "learning_rate": 0.00010708343040754303, + "loss": 1.6729, + "step": 46800 + }, + { + "epoch": 42.37556561085973, + "grad_norm": 4.276064395904541, + "learning_rate": 0.00010702988282856997, + "loss": 1.5955, + "step": 46825 + }, + { + "epoch": 42.39819004524887, + "grad_norm": 5.456986427307129, + "learning_rate": 0.00010697631527281561, + "loss": 1.4984, + "step": 46850 + }, + { + "epoch": 42.42081447963801, + "grad_norm": 4.983468532562256, + "learning_rate": 0.0001069227277736896, + "loss": 1.6025, + "step": 46875 + }, + { + "epoch": 42.44343891402715, + "grad_norm": 5.594575881958008, + "learning_rate": 0.00010686912036461401, + "loss": 1.5873, + "step": 46900 + }, + { + "epoch": 42.46606334841629, + "grad_norm": 3.534917116165161, + "learning_rate": 0.00010681549307902341, + "loss": 1.7042, + "step": 46925 + }, + { + "epoch": 42.48868778280543, + "grad_norm": 4.687277793884277, + "learning_rate": 0.00010676184595036465, + "loss": 1.4687, + "step": 46950 + }, + { + "epoch": 42.51131221719457, + "grad_norm": 4.967089653015137, + "learning_rate": 0.00010670817901209707, + "loss": 1.5756, + "step": 46975 + }, + { + "epoch": 42.53393665158371, + "grad_norm": 4.12251091003418, + "learning_rate": 0.00010665449229769228, + "loss": 1.5803, + "step": 47000 + }, + { + "epoch": 42.55656108597285, + "grad_norm": 4.591906547546387, + "learning_rate": 0.00010660078584063423, + "loss": 1.7013, + "step": 47025 + }, + { + "epoch": 42.57918552036199, + "grad_norm": 4.8183393478393555, + "learning_rate": 0.00010654705967441924, + "loss": 1.566, + "step": 47050 + }, + { + "epoch": 
42.60180995475113, + "grad_norm": 4.178544998168945, + "learning_rate": 0.00010649331383255589, + "loss": 1.514, + "step": 47075 + }, + { + "epoch": 42.62443438914027, + "grad_norm": 6.708840370178223, + "learning_rate": 0.00010643954834856499, + "loss": 1.7042, + "step": 47100 + }, + { + "epoch": 42.64705882352941, + "grad_norm": 4.6382598876953125, + "learning_rate": 0.0001063857632559797, + "loss": 1.5801, + "step": 47125 + }, + { + "epoch": 42.66968325791855, + "grad_norm": 5.215579032897949, + "learning_rate": 0.0001063341111504707, + "loss": 1.5118, + "step": 47150 + }, + { + "epoch": 42.69230769230769, + "grad_norm": 5.23929500579834, + "learning_rate": 0.00010628028772235998, + "loss": 1.5391, + "step": 47175 + }, + { + "epoch": 42.71493212669683, + "grad_norm": 4.934380531311035, + "learning_rate": 0.00010622644478498442, + "loss": 1.6349, + "step": 47200 + }, + { + "epoch": 42.737556561085974, + "grad_norm": 5.085113525390625, + "learning_rate": 0.00010617258237192542, + "loss": 1.7049, + "step": 47225 + }, + { + "epoch": 42.76018099547511, + "grad_norm": 6.066861629486084, + "learning_rate": 0.00010611870051677655, + "loss": 1.5581, + "step": 47250 + }, + { + "epoch": 42.782805429864254, + "grad_norm": 5.6061601638793945, + "learning_rate": 0.00010606479925314348, + "loss": 1.4464, + "step": 47275 + }, + { + "epoch": 42.80542986425339, + "grad_norm": 5.139316558837891, + "learning_rate": 0.000106010878614644, + "loss": 1.5885, + "step": 47300 + }, + { + "epoch": 42.828054298642535, + "grad_norm": 3.9583466053009033, + "learning_rate": 0.00010595693863490798, + "loss": 1.6028, + "step": 47325 + }, + { + "epoch": 42.85067873303167, + "grad_norm": 4.169937610626221, + "learning_rate": 0.00010590297934757735, + "loss": 1.6479, + "step": 47350 + }, + { + "epoch": 42.873303167420815, + "grad_norm": 5.27691650390625, + "learning_rate": 0.0001058490007863061, + "loss": 1.6315, + "step": 47375 + }, + { + "epoch": 42.89592760180995, + "grad_norm": 
5.77647590637207, + "learning_rate": 0.0001057950029847602, + "loss": 1.6568, + "step": 47400 + }, + { + "epoch": 42.918552036199095, + "grad_norm": 5.759611129760742, + "learning_rate": 0.00010574098597661768, + "loss": 1.7188, + "step": 47425 + }, + { + "epoch": 42.94117647058823, + "grad_norm": 5.73583984375, + "learning_rate": 0.00010568694979556849, + "loss": 1.6243, + "step": 47450 + }, + { + "epoch": 42.963800904977376, + "grad_norm": 4.507272243499756, + "learning_rate": 0.00010563289447531457, + "loss": 1.7092, + "step": 47475 + }, + { + "epoch": 42.98642533936652, + "grad_norm": 5.818449020385742, + "learning_rate": 0.00010557882004956979, + "loss": 1.431, + "step": 47500 + }, + { + "epoch": 43.009049773755656, + "grad_norm": 5.47661018371582, + "learning_rate": 0.00010552472655205996, + "loss": 1.4526, + "step": 47525 + }, + { + "epoch": 43.0316742081448, + "grad_norm": 5.197766304016113, + "learning_rate": 0.00010547061401652269, + "loss": 1.4359, + "step": 47550 + }, + { + "epoch": 43.05429864253394, + "grad_norm": 3.4543919563293457, + "learning_rate": 0.00010541648247670762, + "loss": 1.5048, + "step": 47575 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 6.372855186462402, + "learning_rate": 0.00010536233196637611, + "loss": 1.5558, + "step": 47600 + }, + { + "epoch": 43.09954751131222, + "grad_norm": 5.4943413734436035, + "learning_rate": 0.0001053081625193014, + "loss": 1.4904, + "step": 47625 + }, + { + "epoch": 43.12217194570136, + "grad_norm": 3.5853707790374756, + "learning_rate": 0.00010525397416926856, + "loss": 1.4444, + "step": 47650 + }, + { + "epoch": 43.1447963800905, + "grad_norm": 5.288698196411133, + "learning_rate": 0.00010519976695007442, + "loss": 1.4926, + "step": 47675 + }, + { + "epoch": 43.16742081447964, + "grad_norm": 5.487499713897705, + "learning_rate": 0.00010514554089552758, + "loss": 1.5038, + "step": 47700 + }, + { + "epoch": 43.19004524886878, + "grad_norm": 5.475346088409424, + "learning_rate": 
0.00010509129603944842, + "loss": 1.4347, + "step": 47725 + }, + { + "epoch": 43.21266968325792, + "grad_norm": 4.218531608581543, + "learning_rate": 0.00010503703241566899, + "loss": 1.6238, + "step": 47750 + }, + { + "epoch": 43.23529411764706, + "grad_norm": 5.887923717498779, + "learning_rate": 0.0001049827500580331, + "loss": 1.5715, + "step": 47775 + }, + { + "epoch": 43.2579185520362, + "grad_norm": 4.928927898406982, + "learning_rate": 0.00010492844900039621, + "loss": 1.5173, + "step": 47800 + }, + { + "epoch": 43.28054298642534, + "grad_norm": 4.930531024932861, + "learning_rate": 0.00010487412927662547, + "loss": 1.5222, + "step": 47825 + }, + { + "epoch": 43.30316742081448, + "grad_norm": 4.241865158081055, + "learning_rate": 0.00010481979092059963, + "loss": 1.4223, + "step": 47850 + }, + { + "epoch": 43.32579185520362, + "grad_norm": 5.365390777587891, + "learning_rate": 0.00010476543396620911, + "loss": 1.5511, + "step": 47875 + }, + { + "epoch": 43.34841628959276, + "grad_norm": 4.603792190551758, + "learning_rate": 0.00010471105844735592, + "loss": 1.4558, + "step": 47900 + }, + { + "epoch": 43.3710407239819, + "grad_norm": 5.6232781410217285, + "learning_rate": 0.00010465666439795359, + "loss": 1.4777, + "step": 47925 + }, + { + "epoch": 43.39366515837104, + "grad_norm": 5.014750003814697, + "learning_rate": 0.00010460225185192727, + "loss": 1.664, + "step": 47950 + }, + { + "epoch": 43.41628959276018, + "grad_norm": 4.773794651031494, + "learning_rate": 0.00010454782084321365, + "loss": 1.4406, + "step": 47975 + }, + { + "epoch": 43.43891402714932, + "grad_norm": 5.268489837646484, + "learning_rate": 0.0001044933714057609, + "loss": 1.3602, + "step": 48000 + }, + { + "epoch": 43.46153846153846, + "grad_norm": 3.3007686138153076, + "learning_rate": 0.0001044389035735287, + "loss": 1.4394, + "step": 48025 + }, + { + "epoch": 43.484162895927604, + "grad_norm": 4.0678253173828125, + "learning_rate": 0.0001043844173804882, + "loss": 1.513, + "step": 
48050 + }, + { + "epoch": 43.50678733031674, + "grad_norm": 4.807238578796387, + "learning_rate": 0.00010432991286062201, + "loss": 1.4545, + "step": 48075 + }, + { + "epoch": 43.529411764705884, + "grad_norm": 4.851547718048096, + "learning_rate": 0.00010427539004792414, + "loss": 1.5125, + "step": 48100 + }, + { + "epoch": 43.55203619909502, + "grad_norm": 4.754851341247559, + "learning_rate": 0.00010422084897640007, + "loss": 1.5075, + "step": 48125 + }, + { + "epoch": 43.574660633484164, + "grad_norm": 5.759084701538086, + "learning_rate": 0.00010416628968006659, + "loss": 1.498, + "step": 48150 + }, + { + "epoch": 43.5972850678733, + "grad_norm": 5.977701663970947, + "learning_rate": 0.0001041117121929519, + "loss": 1.6014, + "step": 48175 + }, + { + "epoch": 43.619909502262445, + "grad_norm": 6.444182872772217, + "learning_rate": 0.00010405711654909558, + "loss": 1.3752, + "step": 48200 + }, + { + "epoch": 43.64253393665158, + "grad_norm": 4.371002197265625, + "learning_rate": 0.00010400250278254844, + "loss": 1.3662, + "step": 48225 + }, + { + "epoch": 43.665158371040725, + "grad_norm": 4.785689830780029, + "learning_rate": 0.00010394787092737267, + "loss": 1.5946, + "step": 48250 + }, + { + "epoch": 43.68778280542986, + "grad_norm": 5.227655410766602, + "learning_rate": 0.00010389322101764175, + "loss": 1.4658, + "step": 48275 + }, + { + "epoch": 43.710407239819006, + "grad_norm": 4.917147159576416, + "learning_rate": 0.00010383855308744037, + "loss": 1.4717, + "step": 48300 + }, + { + "epoch": 43.73303167420814, + "grad_norm": 5.020429611206055, + "learning_rate": 0.00010378386717086447, + "loss": 1.6081, + "step": 48325 + }, + { + "epoch": 43.755656108597286, + "grad_norm": 5.552148818969727, + "learning_rate": 0.00010372916330202122, + "loss": 1.5598, + "step": 48350 + }, + { + "epoch": 43.77828054298642, + "grad_norm": 4.826664924621582, + "learning_rate": 0.00010367444151502902, + "loss": 1.5655, + "step": 48375 + }, + { + "epoch": 43.800904977375566, 
+ "grad_norm": 4.741078853607178, + "learning_rate": 0.00010361970184401735, + "loss": 1.5584, + "step": 48400 + }, + { + "epoch": 43.8235294117647, + "grad_norm": 5.433865070343018, + "learning_rate": 0.00010356494432312695, + "loss": 1.4365, + "step": 48425 + }, + { + "epoch": 43.84615384615385, + "grad_norm": 4.902193546295166, + "learning_rate": 0.00010351016898650963, + "loss": 1.6606, + "step": 48450 + }, + { + "epoch": 43.86877828054298, + "grad_norm": 5.173399925231934, + "learning_rate": 0.00010345537586832833, + "loss": 1.5314, + "step": 48475 + }, + { + "epoch": 43.89140271493213, + "grad_norm": 5.055915832519531, + "learning_rate": 0.00010340056500275707, + "loss": 1.5808, + "step": 48500 + }, + { + "epoch": 43.914027149321264, + "grad_norm": 5.21277379989624, + "learning_rate": 0.00010334573642398098, + "loss": 1.4901, + "step": 48525 + }, + { + "epoch": 43.93665158371041, + "grad_norm": 5.40691614151001, + "learning_rate": 0.00010329089016619616, + "loss": 1.6291, + "step": 48550 + }, + { + "epoch": 43.959276018099544, + "grad_norm": 4.3971405029296875, + "learning_rate": 0.00010323602626360982, + "loss": 1.5533, + "step": 48575 + }, + { + "epoch": 43.98190045248869, + "grad_norm": 4.630727767944336, + "learning_rate": 0.00010318114475044012, + "loss": 1.3832, + "step": 48600 + }, + { + "epoch": 44.00452488687783, + "grad_norm": 4.368161678314209, + "learning_rate": 0.00010312624566091621, + "loss": 1.5873, + "step": 48625 + }, + { + "epoch": 44.02714932126697, + "grad_norm": 3.7434568405151367, + "learning_rate": 0.00010307132902927823, + "loss": 1.4848, + "step": 48650 + }, + { + "epoch": 44.04977375565611, + "grad_norm": 4.684480667114258, + "learning_rate": 0.00010301639488977724, + "loss": 1.3814, + "step": 48675 + }, + { + "epoch": 44.07239819004525, + "grad_norm": 3.173452138900757, + "learning_rate": 0.00010296144327667522, + "loss": 1.4749, + "step": 48700 + }, + { + "epoch": 44.09502262443439, + "grad_norm": 5.971280574798584, + 
"learning_rate": 0.00010290647422424504, + "loss": 1.3801, + "step": 48725 + }, + { + "epoch": 44.11764705882353, + "grad_norm": 4.780723571777344, + "learning_rate": 0.00010285148776677046, + "loss": 1.402, + "step": 48750 + }, + { + "epoch": 44.14027149321267, + "grad_norm": 4.479318618774414, + "learning_rate": 0.00010279648393854613, + "loss": 1.4263, + "step": 48775 + }, + { + "epoch": 44.16289592760181, + "grad_norm": 4.854914665222168, + "learning_rate": 0.00010274146277387746, + "loss": 1.4235, + "step": 48800 + }, + { + "epoch": 44.18552036199095, + "grad_norm": 4.779385566711426, + "learning_rate": 0.0001026864243070807, + "loss": 1.3708, + "step": 48825 + }, + { + "epoch": 44.20814479638009, + "grad_norm": 5.719334602355957, + "learning_rate": 0.00010263136857248292, + "loss": 1.4497, + "step": 48850 + }, + { + "epoch": 44.23076923076923, + "grad_norm": 5.735396862030029, + "learning_rate": 0.00010257629560442195, + "loss": 1.2651, + "step": 48875 + }, + { + "epoch": 44.25339366515837, + "grad_norm": 4.51582670211792, + "learning_rate": 0.00010252120543724635, + "loss": 1.5399, + "step": 48900 + }, + { + "epoch": 44.276018099547514, + "grad_norm": 5.506528854370117, + "learning_rate": 0.00010246609810531541, + "loss": 1.4355, + "step": 48925 + }, + { + "epoch": 44.29864253393665, + "grad_norm": 5.63154411315918, + "learning_rate": 0.00010241097364299913, + "loss": 1.3229, + "step": 48950 + }, + { + "epoch": 44.321266968325794, + "grad_norm": 5.07582950592041, + "learning_rate": 0.00010235583208467818, + "loss": 1.4395, + "step": 48975 + }, + { + "epoch": 44.34389140271493, + "grad_norm": 5.703203201293945, + "learning_rate": 0.00010230067346474395, + "loss": 1.5553, + "step": 49000 + }, + { + "epoch": 44.366515837104075, + "grad_norm": 4.868218421936035, + "learning_rate": 0.00010224549781759842, + "loss": 1.3299, + "step": 49025 + }, + { + "epoch": 44.38914027149321, + "grad_norm": 5.52424430847168, + "learning_rate": 0.00010219030517765418, + "loss": 
1.52, + "step": 49050 + }, + { + "epoch": 44.411764705882355, + "grad_norm": 6.301329135894775, + "learning_rate": 0.00010213509557933443, + "loss": 1.3564, + "step": 49075 + }, + { + "epoch": 44.43438914027149, + "grad_norm": 5.152988433837891, + "learning_rate": 0.00010207986905707296, + "loss": 1.4055, + "step": 49100 + }, + { + "epoch": 44.457013574660635, + "grad_norm": 5.810203552246094, + "learning_rate": 0.00010202462564531415, + "loss": 1.4116, + "step": 49125 + }, + { + "epoch": 44.47963800904977, + "grad_norm": 5.6694655418396, + "learning_rate": 0.00010196936537851282, + "loss": 1.5294, + "step": 49150 + }, + { + "epoch": 44.502262443438916, + "grad_norm": 5.7976789474487305, + "learning_rate": 0.00010191408829113439, + "loss": 1.5283, + "step": 49175 + }, + { + "epoch": 44.52488687782805, + "grad_norm": 5.283827781677246, + "learning_rate": 0.0001018587944176547, + "loss": 1.3695, + "step": 49200 + }, + { + "epoch": 44.547511312217196, + "grad_norm": 5.265637397766113, + "learning_rate": 0.00010180348379256013, + "loss": 1.3707, + "step": 49225 + }, + { + "epoch": 44.57013574660633, + "grad_norm": 4.790452003479004, + "learning_rate": 0.00010174815645034747, + "loss": 1.5486, + "step": 49250 + }, + { + "epoch": 44.59276018099548, + "grad_norm": 4.9435248374938965, + "learning_rate": 0.00010169281242552394, + "loss": 1.4313, + "step": 49275 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 5.264336109161377, + "learning_rate": 0.00010163745175260714, + "loss": 1.4269, + "step": 49300 + }, + { + "epoch": 44.63800904977376, + "grad_norm": 4.573849678039551, + "learning_rate": 0.00010158207446612511, + "loss": 1.4565, + "step": 49325 + }, + { + "epoch": 44.660633484162894, + "grad_norm": 4.074705123901367, + "learning_rate": 0.00010152668060061618, + "loss": 1.5345, + "step": 49350 + }, + { + "epoch": 44.68325791855204, + "grad_norm": 6.016761779785156, + "learning_rate": 0.0001014712701906291, + "loss": 1.6035, + "step": 49375 + }, + { + "epoch": 
44.705882352941174, + "grad_norm": 5.152374267578125, + "learning_rate": 0.0001014158432707229, + "loss": 1.4211, + "step": 49400 + }, + { + "epoch": 44.72850678733032, + "grad_norm": 5.270075798034668, + "learning_rate": 0.00010136039987546688, + "loss": 1.4602, + "step": 49425 + }, + { + "epoch": 44.751131221719454, + "grad_norm": 5.11018180847168, + "learning_rate": 0.00010130494003944063, + "loss": 1.3028, + "step": 49450 + }, + { + "epoch": 44.7737556561086, + "grad_norm": 4.941214561462402, + "learning_rate": 0.00010124946379723408, + "loss": 1.4229, + "step": 49475 + }, + { + "epoch": 44.796380090497735, + "grad_norm": 4.68281888961792, + "learning_rate": 0.00010119397118344723, + "loss": 1.4369, + "step": 49500 + }, + { + "epoch": 44.81900452488688, + "grad_norm": 4.22309684753418, + "learning_rate": 0.00010113846223269042, + "loss": 1.4697, + "step": 49525 + }, + { + "epoch": 44.841628959276015, + "grad_norm": 5.230871200561523, + "learning_rate": 0.00010108293697958412, + "loss": 1.5112, + "step": 49550 + }, + { + "epoch": 44.86425339366516, + "grad_norm": 4.829254627227783, + "learning_rate": 0.00010102739545875901, + "loss": 1.424, + "step": 49575 + }, + { + "epoch": 44.886877828054295, + "grad_norm": 5.477245807647705, + "learning_rate": 0.00010097183770485589, + "loss": 1.5255, + "step": 49600 + }, + { + "epoch": 44.90950226244344, + "grad_norm": 4.841703414916992, + "learning_rate": 0.00010091626375252565, + "loss": 1.5065, + "step": 49625 + }, + { + "epoch": 44.932126696832576, + "grad_norm": 5.197177410125732, + "learning_rate": 0.00010086067363642935, + "loss": 1.3484, + "step": 49650 + }, + { + "epoch": 44.95475113122172, + "grad_norm": 5.4379682540893555, + "learning_rate": 0.0001008050673912381, + "loss": 1.5079, + "step": 49675 + }, + { + "epoch": 44.977375565610856, + "grad_norm": 5.954039573669434, + "learning_rate": 0.00010074944505163306, + "loss": 1.4882, + "step": 49700 + }, + { + "epoch": 45.0, + "grad_norm": 5.441366672515869, + 
"learning_rate": 0.00010069380665230545, + "loss": 1.5511, + "step": 49725 + }, + { + "epoch": 45.022624434389144, + "grad_norm": NaN, + "learning_rate": 0.00010064037871217546, + "loss": 1.2875, + "step": 49750 + }, + { + "epoch": 45.04524886877828, + "grad_norm": 8.035948753356934, + "learning_rate": 0.00010058470893646217, + "loss": 1.4301, + "step": 49775 + }, + { + "epoch": 45.067873303167424, + "grad_norm": 5.814290523529053, + "learning_rate": 0.0001005290232037709, + "loss": 1.3189, + "step": 49800 + }, + { + "epoch": 45.09049773755656, + "grad_norm": 4.740077018737793, + "learning_rate": 0.0001004733215488324, + "loss": 1.346, + "step": 49825 + }, + { + "epoch": 45.113122171945705, + "grad_norm": 5.189807891845703, + "learning_rate": 0.0001004176040063873, + "loss": 1.2482, + "step": 49850 + }, + { + "epoch": 45.13574660633484, + "grad_norm": 5.7160491943359375, + "learning_rate": 0.00010036187061118628, + "loss": 1.4034, + "step": 49875 + }, + { + "epoch": 45.158371040723985, + "grad_norm": 3.8227548599243164, + "learning_rate": 0.00010030612139798972, + "loss": 1.2693, + "step": 49900 + }, + { + "epoch": 45.18099547511312, + "grad_norm": 4.569591522216797, + "learning_rate": 0.000100250356401568, + "loss": 1.3758, + "step": 49925 + }, + { + "epoch": 45.203619909502265, + "grad_norm": 5.352408409118652, + "learning_rate": 0.00010019457565670129, + "loss": 1.3286, + "step": 49950 + }, + { + "epoch": 45.2262443438914, + "grad_norm": 4.980902671813965, + "learning_rate": 0.00010013877919817958, + "loss": 1.3187, + "step": 49975 + }, + { + "epoch": 45.248868778280546, + "grad_norm": 5.208808898925781, + "learning_rate": 0.00010008296706080273, + "loss": 1.4312, + "step": 50000 + }, + { + "epoch": 45.27149321266968, + "grad_norm": 3.345900058746338, + "learning_rate": 0.00010002713927938026, + "loss": 1.3032, + "step": 50025 + }, + { + "epoch": 45.294117647058826, + "grad_norm": 4.701058387756348, + "learning_rate": 9.997129588873153e-05, + "loss": 1.4663, + 
"step": 50050 + }, + { + "epoch": 45.31674208144796, + "grad_norm": 4.687870979309082, + "learning_rate": 9.991543692368565e-05, + "loss": 1.1913, + "step": 50075 + }, + { + "epoch": 45.339366515837106, + "grad_norm": 5.097414493560791, + "learning_rate": 9.985956241908134e-05, + "loss": 1.488, + "step": 50100 + }, + { + "epoch": 45.36199095022624, + "grad_norm": 5.062108516693115, + "learning_rate": 9.980367240976714e-05, + "loss": 1.3711, + "step": 50125 + }, + { + "epoch": 45.38461538461539, + "grad_norm": 4.611981391906738, + "learning_rate": 9.974776693060117e-05, + "loss": 1.3808, + "step": 50150 + }, + { + "epoch": 45.40723981900452, + "grad_norm": 6.802997589111328, + "learning_rate": 9.969184601645124e-05, + "loss": 1.2606, + "step": 50175 + }, + { + "epoch": 45.42986425339367, + "grad_norm": 4.539482593536377, + "learning_rate": 9.963590970219478e-05, + "loss": 1.4523, + "step": 50200 + }, + { + "epoch": 45.452488687782804, + "grad_norm": 4.706707000732422, + "learning_rate": 9.957995802271883e-05, + "loss": 1.3775, + "step": 50225 + }, + { + "epoch": 45.47511312217195, + "grad_norm": 5.560224533081055, + "learning_rate": 9.952399101291996e-05, + "loss": 1.3178, + "step": 50250 + }, + { + "epoch": 45.497737556561084, + "grad_norm": 5.96610164642334, + "learning_rate": 9.94680087077044e-05, + "loss": 1.2672, + "step": 50275 + }, + { + "epoch": 45.52036199095023, + "grad_norm": 4.098049163818359, + "learning_rate": 9.941201114198785e-05, + "loss": 1.3768, + "step": 50300 + }, + { + "epoch": 45.542986425339365, + "grad_norm": 4.603151321411133, + "learning_rate": 9.935599835069552e-05, + "loss": 1.3075, + "step": 50325 + }, + { + "epoch": 45.56561085972851, + "grad_norm": 4.098180770874023, + "learning_rate": 9.929997036876215e-05, + "loss": 1.2581, + "step": 50350 + }, + { + "epoch": 45.588235294117645, + "grad_norm": 4.840689182281494, + "learning_rate": 9.924392723113195e-05, + "loss": 1.3037, + "step": 50375 + }, + { + "epoch": 45.61085972850679, + 
"grad_norm": 4.30715799331665, + "learning_rate": 9.918786897275859e-05, + "loss": 1.3482, + "step": 50400 + }, + { + "epoch": 45.633484162895925, + "grad_norm": 5.419538497924805, + "learning_rate": 9.913179562860512e-05, + "loss": 1.3613, + "step": 50425 + }, + { + "epoch": 45.65610859728507, + "grad_norm": 5.290268898010254, + "learning_rate": 9.907570723364405e-05, + "loss": 1.3416, + "step": 50450 + }, + { + "epoch": 45.678733031674206, + "grad_norm": 4.399529933929443, + "learning_rate": 9.901960382285728e-05, + "loss": 1.3726, + "step": 50475 + }, + { + "epoch": 45.70135746606335, + "grad_norm": 5.522080898284912, + "learning_rate": 9.896348543123606e-05, + "loss": 1.4135, + "step": 50500 + }, + { + "epoch": 45.723981900452486, + "grad_norm": 5.97099494934082, + "learning_rate": 9.890735209378095e-05, + "loss": 1.4295, + "step": 50525 + }, + { + "epoch": 45.74660633484163, + "grad_norm": 5.0166120529174805, + "learning_rate": 9.885120384550189e-05, + "loss": 1.387, + "step": 50550 + }, + { + "epoch": 45.76923076923077, + "grad_norm": 4.919198989868164, + "learning_rate": 9.879504072141808e-05, + "loss": 1.4421, + "step": 50575 + }, + { + "epoch": 45.79185520361991, + "grad_norm": 5.102237224578857, + "learning_rate": 9.873886275655801e-05, + "loss": 1.4403, + "step": 50600 + }, + { + "epoch": 45.81447963800905, + "grad_norm": 5.842960834503174, + "learning_rate": 9.868266998595943e-05, + "loss": 1.3875, + "step": 50625 + }, + { + "epoch": 45.83710407239819, + "grad_norm": 5.246617317199707, + "learning_rate": 9.862646244466932e-05, + "loss": 1.5681, + "step": 50650 + }, + { + "epoch": 45.85972850678733, + "grad_norm": 5.355321884155273, + "learning_rate": 9.857024016774387e-05, + "loss": 1.3422, + "step": 50675 + }, + { + "epoch": 45.88235294117647, + "grad_norm": 5.763421058654785, + "learning_rate": 9.851400319024845e-05, + "loss": 1.3366, + "step": 50700 + }, + { + "epoch": 45.90497737556561, + "grad_norm": 5.695327281951904, + "learning_rate": 
9.845775154725766e-05, + "loss": 1.394, + "step": 50725 + }, + { + "epoch": 45.92760180995475, + "grad_norm": 5.689870357513428, + "learning_rate": 9.840148527385517e-05, + "loss": 1.3939, + "step": 50750 + }, + { + "epoch": 45.95022624434389, + "grad_norm": 4.690075397491455, + "learning_rate": 9.834520440513379e-05, + "loss": 1.5407, + "step": 50775 + }, + { + "epoch": 45.97285067873303, + "grad_norm": 3.7511346340179443, + "learning_rate": 9.828890897619545e-05, + "loss": 1.4854, + "step": 50800 + }, + { + "epoch": 45.99547511312217, + "grad_norm": 4.550795555114746, + "learning_rate": 9.82325990221512e-05, + "loss": 1.405, + "step": 50825 + }, + { + "epoch": 46.01809954751131, + "grad_norm": 5.129349231719971, + "learning_rate": 9.817627457812105e-05, + "loss": 1.31, + "step": 50850 + }, + { + "epoch": 46.040723981900456, + "grad_norm": 4.822476387023926, + "learning_rate": 9.811993567923413e-05, + "loss": 1.2706, + "step": 50875 + }, + { + "epoch": 46.06334841628959, + "grad_norm": 5.464430809020996, + "learning_rate": 9.806358236062858e-05, + "loss": 1.184, + "step": 50900 + }, + { + "epoch": 46.085972850678736, + "grad_norm": 4.488974571228027, + "learning_rate": 9.800721465745147e-05, + "loss": 1.2153, + "step": 50925 + }, + { + "epoch": 46.10859728506787, + "grad_norm": 4.2137346267700195, + "learning_rate": 9.795083260485891e-05, + "loss": 1.2017, + "step": 50950 + }, + { + "epoch": 46.13122171945702, + "grad_norm": 5.356151580810547, + "learning_rate": 9.789443623801593e-05, + "loss": 1.2963, + "step": 50975 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 4.786318302154541, + "learning_rate": 9.783802559209652e-05, + "loss": 1.284, + "step": 51000 + }, + { + "epoch": 46.1764705882353, + "grad_norm": 4.5469865798950195, + "learning_rate": 9.77816007022835e-05, + "loss": 1.304, + "step": 51025 + }, + { + "epoch": 46.199095022624434, + "grad_norm": 6.047292709350586, + "learning_rate": 9.772516160376866e-05, + "loss": 1.3066, + "step": 51050 + }, + { 
+ "epoch": 46.22171945701358, + "grad_norm": 3.767551898956299, + "learning_rate": 9.766870833175256e-05, + "loss": 1.201, + "step": 51075 + }, + { + "epoch": 46.244343891402714, + "grad_norm": 4.860236644744873, + "learning_rate": 9.76122409214447e-05, + "loss": 1.2653, + "step": 51100 + }, + { + "epoch": 46.26696832579186, + "grad_norm": 4.5802459716796875, + "learning_rate": 9.755575940806337e-05, + "loss": 1.4322, + "step": 51125 + }, + { + "epoch": 46.289592760180994, + "grad_norm": 5.008360862731934, + "learning_rate": 9.74992638268356e-05, + "loss": 1.2949, + "step": 51150 + }, + { + "epoch": 46.31221719457014, + "grad_norm": 4.551988124847412, + "learning_rate": 9.744275421299724e-05, + "loss": 1.2047, + "step": 51175 + }, + { + "epoch": 46.334841628959275, + "grad_norm": 6.203924179077148, + "learning_rate": 9.738623060179288e-05, + "loss": 1.2815, + "step": 51200 + }, + { + "epoch": 46.35746606334842, + "grad_norm": 4.722567081451416, + "learning_rate": 9.732969302847585e-05, + "loss": 1.2592, + "step": 51225 + }, + { + "epoch": 46.380090497737555, + "grad_norm": 6.056229114532471, + "learning_rate": 9.727314152830819e-05, + "loss": 1.3173, + "step": 51250 + }, + { + "epoch": 46.4027149321267, + "grad_norm": 3.917114019393921, + "learning_rate": 9.721657613656058e-05, + "loss": 1.2974, + "step": 51275 + }, + { + "epoch": 46.425339366515836, + "grad_norm": 4.1579203605651855, + "learning_rate": 9.715999688851245e-05, + "loss": 1.3222, + "step": 51300 + }, + { + "epoch": 46.44796380090498, + "grad_norm": 5.7063469886779785, + "learning_rate": 9.710340381945179e-05, + "loss": 1.3636, + "step": 51325 + }, + { + "epoch": 46.470588235294116, + "grad_norm": 4.55832576751709, + "learning_rate": 9.704679696467525e-05, + "loss": 1.2854, + "step": 51350 + }, + { + "epoch": 46.49321266968326, + "grad_norm": 5.123040199279785, + "learning_rate": 9.699017635948812e-05, + "loss": 1.4846, + "step": 51375 + }, + { + "epoch": 46.515837104072396, + "grad_norm": 
4.968398094177246, + "learning_rate": 9.693354203920413e-05, + "loss": 1.3019, + "step": 51400 + }, + { + "epoch": 46.53846153846154, + "grad_norm": 4.814697265625, + "learning_rate": 9.687689403914572e-05, + "loss": 1.2732, + "step": 51425 + }, + { + "epoch": 46.56108597285068, + "grad_norm": 5.311944484710693, + "learning_rate": 9.682023239464377e-05, + "loss": 1.3774, + "step": 51450 + }, + { + "epoch": 46.58371040723982, + "grad_norm": 4.640413761138916, + "learning_rate": 9.676355714103769e-05, + "loss": 1.2123, + "step": 51475 + }, + { + "epoch": 46.60633484162896, + "grad_norm": 4.477193832397461, + "learning_rate": 9.670686831367536e-05, + "loss": 1.2285, + "step": 51500 + }, + { + "epoch": 46.6289592760181, + "grad_norm": 4.573556900024414, + "learning_rate": 9.665016594791321e-05, + "loss": 1.3481, + "step": 51525 + }, + { + "epoch": 46.65158371040724, + "grad_norm": 4.527987957000732, + "learning_rate": 9.659345007911601e-05, + "loss": 1.2507, + "step": 51550 + }, + { + "epoch": 46.67420814479638, + "grad_norm": 5.239626407623291, + "learning_rate": 9.6536720742657e-05, + "loss": 1.4123, + "step": 51575 + }, + { + "epoch": 46.69683257918552, + "grad_norm": 5.0842413902282715, + "learning_rate": 9.64799779739178e-05, + "loss": 1.362, + "step": 51600 + }, + { + "epoch": 46.71945701357466, + "grad_norm": 5.790838241577148, + "learning_rate": 9.642322180828843e-05, + "loss": 1.3565, + "step": 51625 + }, + { + "epoch": 46.7420814479638, + "grad_norm": 5.47201681137085, + "learning_rate": 9.636645228116726e-05, + "loss": 1.2825, + "step": 51650 + }, + { + "epoch": 46.76470588235294, + "grad_norm": 3.6880836486816406, + "learning_rate": 9.6309669427961e-05, + "loss": 1.4405, + "step": 51675 + }, + { + "epoch": 46.78733031674208, + "grad_norm": 5.187725067138672, + "learning_rate": 9.625287328408463e-05, + "loss": 1.1651, + "step": 51700 + }, + { + "epoch": 46.80995475113122, + "grad_norm": 6.163667678833008, + "learning_rate": 9.619606388496146e-05, + "loss": 
1.2395, + "step": 51725 + }, + { + "epoch": 46.83257918552036, + "grad_norm": 4.501791954040527, + "learning_rate": 9.613924126602308e-05, + "loss": 1.3948, + "step": 51750 + }, + { + "epoch": 46.8552036199095, + "grad_norm": 4.55626916885376, + "learning_rate": 9.608240546270928e-05, + "loss": 1.2392, + "step": 51775 + }, + { + "epoch": 46.87782805429864, + "grad_norm": 4.100018501281738, + "learning_rate": 9.602555651046811e-05, + "loss": 1.2744, + "step": 51800 + }, + { + "epoch": 46.90045248868778, + "grad_norm": 4.403803825378418, + "learning_rate": 9.596869444475578e-05, + "loss": 1.2697, + "step": 51825 + }, + { + "epoch": 46.92307692307692, + "grad_norm": 5.647763729095459, + "learning_rate": 9.591181930103675e-05, + "loss": 1.3068, + "step": 51850 + }, + { + "epoch": 46.94570135746606, + "grad_norm": 4.663811206817627, + "learning_rate": 9.585493111478352e-05, + "loss": 1.3152, + "step": 51875 + }, + { + "epoch": 46.9683257918552, + "grad_norm": 4.8924455642700195, + "learning_rate": 9.579802992147688e-05, + "loss": 1.3996, + "step": 51900 + }, + { + "epoch": 46.990950226244344, + "grad_norm": 4.687934398651123, + "learning_rate": 9.574111575660559e-05, + "loss": 1.2499, + "step": 51925 + }, + { + "epoch": 47.01357466063349, + "grad_norm": 4.841540813446045, + "learning_rate": 9.568418865566658e-05, + "loss": 1.2103, + "step": 51950 + }, + { + "epoch": 47.036199095022624, + "grad_norm": 4.487457275390625, + "learning_rate": 9.562724865416483e-05, + "loss": 1.1023, + "step": 51975 + }, + { + "epoch": 47.05882352941177, + "grad_norm": 5.510406017303467, + "learning_rate": 9.557029578761332e-05, + "loss": 1.2958, + "step": 52000 + }, + { + "epoch": 47.081447963800905, + "grad_norm": 5.464969635009766, + "learning_rate": 9.551333009153317e-05, + "loss": 0.9992, + "step": 52025 + }, + { + "epoch": 47.10407239819005, + "grad_norm": 4.596621513366699, + "learning_rate": 9.545635160145339e-05, + "loss": 1.1281, + "step": 52050 + }, + { + "epoch": 
47.126696832579185, + "grad_norm": 5.372847557067871, + "learning_rate": 9.5399360352911e-05, + "loss": 1.4568, + "step": 52075 + }, + { + "epoch": 47.14932126696833, + "grad_norm": 5.727240085601807, + "learning_rate": 9.534235638145098e-05, + "loss": 1.2155, + "step": 52100 + }, + { + "epoch": 47.171945701357465, + "grad_norm": 4.874152183532715, + "learning_rate": 9.528533972262628e-05, + "loss": 1.2021, + "step": 52125 + }, + { + "epoch": 47.19457013574661, + "grad_norm": 4.732327938079834, + "learning_rate": 9.523059182689132e-05, + "loss": 1.3154, + "step": 52150 + }, + { + "epoch": 47.217194570135746, + "grad_norm": 5.301266670227051, + "learning_rate": 9.517355040399401e-05, + "loss": 1.0518, + "step": 52175 + }, + { + "epoch": 47.23981900452489, + "grad_norm": 3.8940746784210205, + "learning_rate": 9.511649639901494e-05, + "loss": 1.1216, + "step": 52200 + }, + { + "epoch": 47.262443438914026, + "grad_norm": 3.6668171882629395, + "learning_rate": 9.505942984753822e-05, + "loss": 1.3145, + "step": 52225 + }, + { + "epoch": 47.28506787330317, + "grad_norm": 4.5170416831970215, + "learning_rate": 9.500235078515583e-05, + "loss": 1.3284, + "step": 52250 + }, + { + "epoch": 47.30769230769231, + "grad_norm": 6.2065863609313965, + "learning_rate": 9.494525924746748e-05, + "loss": 1.1503, + "step": 52275 + }, + { + "epoch": 47.33031674208145, + "grad_norm": 4.878929615020752, + "learning_rate": 9.488815527008077e-05, + "loss": 1.1821, + "step": 52300 + }, + { + "epoch": 47.35294117647059, + "grad_norm": 5.171248912811279, + "learning_rate": 9.483103888861095e-05, + "loss": 1.163, + "step": 52325 + }, + { + "epoch": 47.37556561085973, + "grad_norm": 6.436504364013672, + "learning_rate": 9.477391013868105e-05, + "loss": 1.2094, + "step": 52350 + }, + { + "epoch": 47.39819004524887, + "grad_norm": 3.0119566917419434, + "learning_rate": 9.471676905592184e-05, + "loss": 1.1008, + "step": 52375 + }, + { + "epoch": 47.42081447963801, + "grad_norm": 4.798369407653809, + 
"learning_rate": 9.46596156759717e-05, + "loss": 1.2489, + "step": 52400 + }, + { + "epoch": 47.44343891402715, + "grad_norm": 4.6900811195373535, + "learning_rate": 9.460245003447679e-05, + "loss": 1.1736, + "step": 52425 + }, + { + "epoch": 47.46606334841629, + "grad_norm": 4.819660663604736, + "learning_rate": 9.454527216709086e-05, + "loss": 1.3886, + "step": 52450 + }, + { + "epoch": 47.48868778280543, + "grad_norm": 4.714395046234131, + "learning_rate": 9.448808210947526e-05, + "loss": 1.3167, + "step": 52475 + }, + { + "epoch": 47.51131221719457, + "grad_norm": 4.665770053863525, + "learning_rate": 9.443087989729899e-05, + "loss": 1.1773, + "step": 52500 + }, + { + "epoch": 47.53393665158371, + "grad_norm": 5.140773773193359, + "learning_rate": 9.43736655662386e-05, + "loss": 1.3442, + "step": 52525 + }, + { + "epoch": 47.55656108597285, + "grad_norm": 3.1296918392181396, + "learning_rate": 9.431643915197818e-05, + "loss": 1.1789, + "step": 52550 + }, + { + "epoch": 47.57918552036199, + "grad_norm": 5.336180686950684, + "learning_rate": 9.425920069020947e-05, + "loss": 1.185, + "step": 52575 + }, + { + "epoch": 47.60180995475113, + "grad_norm": 5.8437113761901855, + "learning_rate": 9.420195021663156e-05, + "loss": 1.2764, + "step": 52600 + }, + { + "epoch": 47.62443438914027, + "grad_norm": 4.632587909698486, + "learning_rate": 9.414468776695116e-05, + "loss": 1.3965, + "step": 52625 + }, + { + "epoch": 47.64705882352941, + "grad_norm": 4.85280179977417, + "learning_rate": 9.408741337688238e-05, + "loss": 1.1969, + "step": 52650 + }, + { + "epoch": 47.66968325791855, + "grad_norm": 5.9409499168396, + "learning_rate": 9.40301270821468e-05, + "loss": 1.1177, + "step": 52675 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 5.9572978019714355, + "learning_rate": 9.397282891847343e-05, + "loss": 1.3279, + "step": 52700 + }, + { + "epoch": 47.71493212669683, + "grad_norm": 4.965830326080322, + "learning_rate": 9.391551892159867e-05, + "loss": 1.3477, + 
"step": 52725 + }, + { + "epoch": 47.737556561085974, + "grad_norm": 5.128588676452637, + "learning_rate": 9.385819712726629e-05, + "loss": 1.3403, + "step": 52750 + }, + { + "epoch": 47.76018099547511, + "grad_norm": 4.773208141326904, + "learning_rate": 9.380086357122747e-05, + "loss": 1.283, + "step": 52775 + }, + { + "epoch": 47.782805429864254, + "grad_norm": 5.443982124328613, + "learning_rate": 9.374351828924065e-05, + "loss": 1.1588, + "step": 52800 + }, + { + "epoch": 47.80542986425339, + "grad_norm": 5.004065036773682, + "learning_rate": 9.368616131707165e-05, + "loss": 1.3078, + "step": 52825 + }, + { + "epoch": 47.828054298642535, + "grad_norm": 4.608489990234375, + "learning_rate": 9.362879269049356e-05, + "loss": 1.2215, + "step": 52850 + }, + { + "epoch": 47.85067873303167, + "grad_norm": 3.259402275085449, + "learning_rate": 9.357141244528671e-05, + "loss": 1.2802, + "step": 52875 + }, + { + "epoch": 47.873303167420815, + "grad_norm": 5.098649978637695, + "learning_rate": 9.35140206172387e-05, + "loss": 1.2212, + "step": 52900 + }, + { + "epoch": 47.89592760180995, + "grad_norm": 5.7414751052856445, + "learning_rate": 9.345661724214437e-05, + "loss": 1.2021, + "step": 52925 + }, + { + "epoch": 47.918552036199095, + "grad_norm": 4.869835376739502, + "learning_rate": 9.339920235580574e-05, + "loss": 1.2323, + "step": 52950 + }, + { + "epoch": 47.94117647058823, + "grad_norm": 4.785629749298096, + "learning_rate": 9.3341775994032e-05, + "loss": 1.2435, + "step": 52975 + }, + { + "epoch": 47.963800904977376, + "grad_norm": 5.641818046569824, + "learning_rate": 9.328433819263953e-05, + "loss": 1.1873, + "step": 53000 + }, + { + "epoch": 47.98642533936652, + "grad_norm": 5.285050392150879, + "learning_rate": 9.322688898745181e-05, + "loss": 1.3702, + "step": 53025 + }, + { + "epoch": 48.009049773755656, + "grad_norm": 5.142643451690674, + "learning_rate": 9.316942841429947e-05, + "loss": 1.3408, + "step": 53050 + }, + { + "epoch": 48.0316742081448, + 
"grad_norm": 5.038753986358643, + "learning_rate": 9.31119565090202e-05, + "loss": 1.1248, + "step": 53075 + }, + { + "epoch": 48.05429864253394, + "grad_norm": 3.7307887077331543, + "learning_rate": 9.305447330745876e-05, + "loss": 1.2337, + "step": 53100 + }, + { + "epoch": 48.07692307692308, + "grad_norm": 4.286962985992432, + "learning_rate": 9.299697884546696e-05, + "loss": 1.1177, + "step": 53125 + }, + { + "epoch": 48.09954751131222, + "grad_norm": 4.528583526611328, + "learning_rate": 9.293947315890367e-05, + "loss": 1.1977, + "step": 53150 + }, + { + "epoch": 48.12217194570136, + "grad_norm": 6.151467800140381, + "learning_rate": 9.288195628363467e-05, + "loss": 1.1828, + "step": 53175 + }, + { + "epoch": 48.1447963800905, + "grad_norm": 4.997425556182861, + "learning_rate": 9.282442825553279e-05, + "loss": 1.1186, + "step": 53200 + }, + { + "epoch": 48.16742081447964, + "grad_norm": 5.374769687652588, + "learning_rate": 9.276688911047785e-05, + "loss": 1.1204, + "step": 53225 + }, + { + "epoch": 48.19004524886878, + "grad_norm": 4.511106014251709, + "learning_rate": 9.27093388843565e-05, + "loss": 1.1014, + "step": 53250 + }, + { + "epoch": 48.21266968325792, + "grad_norm": 4.060224533081055, + "learning_rate": 9.265177761306237e-05, + "loss": 1.1864, + "step": 53275 + }, + { + "epoch": 48.23529411764706, + "grad_norm": 5.773272514343262, + "learning_rate": 9.259420533249596e-05, + "loss": 1.0438, + "step": 53300 + }, + { + "epoch": 48.2579185520362, + "grad_norm": 5.258281707763672, + "learning_rate": 9.253662207856466e-05, + "loss": 1.1108, + "step": 53325 + }, + { + "epoch": 48.28054298642534, + "grad_norm": 5.085644721984863, + "learning_rate": 9.247902788718266e-05, + "loss": 1.2611, + "step": 53350 + }, + { + "epoch": 48.30316742081448, + "grad_norm": 4.6476240158081055, + "learning_rate": 9.2421422794271e-05, + "loss": 1.1513, + "step": 53375 + }, + { + "epoch": 48.32579185520362, + "grad_norm": 4.446463584899902, + "learning_rate": 
9.236380683575753e-05, + "loss": 1.1763, + "step": 53400 + }, + { + "epoch": 48.34841628959276, + "grad_norm": 4.901967525482178, + "learning_rate": 9.230618004757686e-05, + "loss": 1.2309, + "step": 53425 + }, + { + "epoch": 48.3710407239819, + "grad_norm": 4.112976551055908, + "learning_rate": 9.224854246567034e-05, + "loss": 1.0473, + "step": 53450 + }, + { + "epoch": 48.39366515837104, + "grad_norm": 3.424607038497925, + "learning_rate": 9.219089412598608e-05, + "loss": 1.1428, + "step": 53475 + }, + { + "epoch": 48.41628959276018, + "grad_norm": 3.574810028076172, + "learning_rate": 9.213323506447888e-05, + "loss": 1.2392, + "step": 53500 + }, + { + "epoch": 48.43891402714932, + "grad_norm": 6.763994216918945, + "learning_rate": 9.207556531711024e-05, + "loss": 1.1168, + "step": 53525 + }, + { + "epoch": 48.46153846153846, + "grad_norm": 3.5914273262023926, + "learning_rate": 9.201788491984829e-05, + "loss": 1.2307, + "step": 53550 + }, + { + "epoch": 48.484162895927604, + "grad_norm": 4.489185333251953, + "learning_rate": 9.19601939086679e-05, + "loss": 1.2362, + "step": 53575 + }, + { + "epoch": 48.50678733031674, + "grad_norm": 4.0316033363342285, + "learning_rate": 9.190249231955043e-05, + "loss": 1.2428, + "step": 53600 + }, + { + "epoch": 48.529411764705884, + "grad_norm": 3.3174991607666016, + "learning_rate": 9.184478018848392e-05, + "loss": 0.9921, + "step": 53625 + }, + { + "epoch": 48.55203619909502, + "grad_norm": 5.367835521697998, + "learning_rate": 9.178705755146298e-05, + "loss": 1.1551, + "step": 53650 + }, + { + "epoch": 48.574660633484164, + "grad_norm": 5.262642860412598, + "learning_rate": 9.172932444448872e-05, + "loss": 1.0222, + "step": 53675 + }, + { + "epoch": 48.5972850678733, + "grad_norm": 4.616153240203857, + "learning_rate": 9.167158090356884e-05, + "loss": 1.173, + "step": 53700 + }, + { + "epoch": 48.619909502262445, + "grad_norm": 4.295658111572266, + "learning_rate": 9.161382696471753e-05, + "loss": 1.0673, + "step": 53725 + 
}, + { + "epoch": 48.64253393665158, + "grad_norm": 4.903369426727295, + "learning_rate": 9.155606266395545e-05, + "loss": 1.2454, + "step": 53750 + }, + { + "epoch": 48.665158371040725, + "grad_norm": 3.6059064865112305, + "learning_rate": 9.149828803730971e-05, + "loss": 1.1858, + "step": 53775 + }, + { + "epoch": 48.68778280542986, + "grad_norm": 5.3199286460876465, + "learning_rate": 9.144050312081392e-05, + "loss": 1.1035, + "step": 53800 + }, + { + "epoch": 48.710407239819006, + "grad_norm": 4.197053909301758, + "learning_rate": 9.138270795050804e-05, + "loss": 1.2688, + "step": 53825 + }, + { + "epoch": 48.73303167420814, + "grad_norm": 4.945096492767334, + "learning_rate": 9.132490256243849e-05, + "loss": 1.0091, + "step": 53850 + }, + { + "epoch": 48.755656108597286, + "grad_norm": 3.641407012939453, + "learning_rate": 9.126708699265797e-05, + "loss": 1.2336, + "step": 53875 + }, + { + "epoch": 48.77828054298642, + "grad_norm": 4.669260025024414, + "learning_rate": 9.120926127722563e-05, + "loss": 1.1354, + "step": 53900 + }, + { + "epoch": 48.800904977375566, + "grad_norm": 4.605678081512451, + "learning_rate": 9.115142545220692e-05, + "loss": 1.2065, + "step": 53925 + }, + { + "epoch": 48.8235294117647, + "grad_norm": 4.441488742828369, + "learning_rate": 9.109357955367354e-05, + "loss": 1.2603, + "step": 53950 + }, + { + "epoch": 48.84615384615385, + "grad_norm": 4.292562961578369, + "learning_rate": 9.103572361770353e-05, + "loss": 1.2716, + "step": 53975 + }, + { + "epoch": 48.86877828054298, + "grad_norm": 5.542261123657227, + "learning_rate": 9.097785768038118e-05, + "loss": 1.192, + "step": 54000 + }, + { + "epoch": 48.89140271493213, + "grad_norm": 5.503359317779541, + "learning_rate": 9.0919981777797e-05, + "loss": 1.2559, + "step": 54025 + }, + { + "epoch": 48.914027149321264, + "grad_norm": 4.744175434112549, + "learning_rate": 9.086209594604775e-05, + "loss": 1.2943, + "step": 54050 + }, + { + "epoch": 48.93665158371041, + "grad_norm": 
3.3318700790405273, + "learning_rate": 9.080420022123631e-05, + "loss": 1.1944, + "step": 54075 + }, + { + "epoch": 48.959276018099544, + "grad_norm": 4.345468521118164, + "learning_rate": 9.074629463947185e-05, + "loss": 1.1818, + "step": 54100 + }, + { + "epoch": 48.98190045248869, + "grad_norm": 4.530157566070557, + "learning_rate": 9.068837923686955e-05, + "loss": 1.4225, + "step": 54125 + }, + { + "epoch": 49.00452488687783, + "grad_norm": 4.266589164733887, + "learning_rate": 9.063045404955082e-05, + "loss": 1.2109, + "step": 54150 + }, + { + "epoch": 49.02714932126697, + "grad_norm": 4.339365482330322, + "learning_rate": 9.057251911364314e-05, + "loss": 1.009, + "step": 54175 + }, + { + "epoch": 49.04977375565611, + "grad_norm": 5.674631118774414, + "learning_rate": 9.051457446528005e-05, + "loss": 1.1196, + "step": 54200 + }, + { + "epoch": 49.07239819004525, + "grad_norm": 5.143533229827881, + "learning_rate": 9.045662014060117e-05, + "loss": 1.0855, + "step": 54225 + }, + { + "epoch": 49.09502262443439, + "grad_norm": 5.3571882247924805, + "learning_rate": 9.039865617575213e-05, + "loss": 1.1771, + "step": 54250 + }, + { + "epoch": 49.11764705882353, + "grad_norm": 4.910420894622803, + "learning_rate": 9.03406826068846e-05, + "loss": 1.0568, + "step": 54275 + }, + { + "epoch": 49.14027149321267, + "grad_norm": 3.894571304321289, + "learning_rate": 9.028269947015625e-05, + "loss": 1.0524, + "step": 54300 + }, + { + "epoch": 49.16289592760181, + "grad_norm": 5.98056173324585, + "learning_rate": 9.022702669102259e-05, + "loss": 1.0619, + "step": 54325 + }, + { + "epoch": 49.18552036199095, + "grad_norm": 4.829105854034424, + "learning_rate": 9.016902490619592e-05, + "loss": 1.1119, + "step": 54350 + }, + { + "epoch": 49.20814479638009, + "grad_norm": 4.5941619873046875, + "learning_rate": 9.011101366056998e-05, + "loss": 1.2032, + "step": 54375 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 5.961572170257568, + "learning_rate": 9.005299299032587e-05, 
+ "loss": 1.008, + "step": 54400 + }, + { + "epoch": 49.25339366515837, + "grad_norm": 5.3435187339782715, + "learning_rate": 8.999496293165061e-05, + "loss": 1.1795, + "step": 54425 + }, + { + "epoch": 49.276018099547514, + "grad_norm": 3.5825347900390625, + "learning_rate": 8.993692352073714e-05, + "loss": 1.1525, + "step": 54450 + }, + { + "epoch": 49.29864253393665, + "grad_norm": 5.072323322296143, + "learning_rate": 8.987887479378413e-05, + "loss": 1.0846, + "step": 54475 + }, + { + "epoch": 49.321266968325794, + "grad_norm": 3.932933807373047, + "learning_rate": 8.982081678699613e-05, + "loss": 1.187, + "step": 54500 + }, + { + "epoch": 49.34389140271493, + "grad_norm": 4.800660133361816, + "learning_rate": 8.976274953658343e-05, + "loss": 1.0316, + "step": 54525 + }, + { + "epoch": 49.366515837104075, + "grad_norm": 4.958492279052734, + "learning_rate": 8.970467307876213e-05, + "loss": 1.1783, + "step": 54550 + }, + { + "epoch": 49.38914027149321, + "grad_norm": 5.268551826477051, + "learning_rate": 8.964658744975403e-05, + "loss": 1.1196, + "step": 54575 + }, + { + "epoch": 49.411764705882355, + "grad_norm": 5.094686985015869, + "learning_rate": 8.958849268578667e-05, + "loss": 1.0254, + "step": 54600 + }, + { + "epoch": 49.43438914027149, + "grad_norm": 6.6234846115112305, + "learning_rate": 8.953038882309333e-05, + "loss": 1.1582, + "step": 54625 + }, + { + "epoch": 49.457013574660635, + "grad_norm": 3.9450597763061523, + "learning_rate": 8.947227589791287e-05, + "loss": 1.1137, + "step": 54650 + }, + { + "epoch": 49.47963800904977, + "grad_norm": 5.424504280090332, + "learning_rate": 8.941415394648991e-05, + "loss": 1.042, + "step": 54675 + }, + { + "epoch": 49.502262443438916, + "grad_norm": 4.318606376647949, + "learning_rate": 8.93560230050746e-05, + "loss": 1.1823, + "step": 54700 + }, + { + "epoch": 49.52488687782805, + "grad_norm": 4.582488536834717, + "learning_rate": 8.929788310992276e-05, + "loss": 1.0477, + "step": 54725 + }, + { + "epoch": 
49.547511312217196, + "grad_norm": 5.677311420440674, + "learning_rate": 8.923973429729578e-05, + "loss": 1.2954, + "step": 54750 + }, + { + "epoch": 49.57013574660633, + "grad_norm": 5.549562931060791, + "learning_rate": 8.918157660346061e-05, + "loss": 1.0914, + "step": 54775 + }, + { + "epoch": 49.59276018099548, + "grad_norm": 4.94139289855957, + "learning_rate": 8.912341006468973e-05, + "loss": 1.1153, + "step": 54800 + }, + { + "epoch": 49.61538461538461, + "grad_norm": 4.835080623626709, + "learning_rate": 8.906523471726113e-05, + "loss": 1.1043, + "step": 54825 + }, + { + "epoch": 49.63800904977376, + "grad_norm": 5.070520401000977, + "learning_rate": 8.900705059745834e-05, + "loss": 1.2282, + "step": 54850 + }, + { + "epoch": 49.660633484162894, + "grad_norm": 4.417810440063477, + "learning_rate": 8.89488577415703e-05, + "loss": 1.0648, + "step": 54875 + }, + { + "epoch": 49.68325791855204, + "grad_norm": 5.827963352203369, + "learning_rate": 8.889065618589147e-05, + "loss": 1.1563, + "step": 54900 + }, + { + "epoch": 49.705882352941174, + "grad_norm": 4.687410831451416, + "learning_rate": 8.883244596672165e-05, + "loss": 1.0463, + "step": 54925 + }, + { + "epoch": 49.72850678733032, + "grad_norm": 4.386495113372803, + "learning_rate": 8.87742271203661e-05, + "loss": 1.1286, + "step": 54950 + }, + { + "epoch": 49.751131221719454, + "grad_norm": 4.633265972137451, + "learning_rate": 8.871599968313545e-05, + "loss": 1.1123, + "step": 54975 + }, + { + "epoch": 49.7737556561086, + "grad_norm": 5.492758750915527, + "learning_rate": 8.865776369134569e-05, + "loss": 1.1681, + "step": 55000 + }, + { + "epoch": 49.796380090497735, + "grad_norm": 4.6047868728637695, + "learning_rate": 8.859951918131815e-05, + "loss": 1.0757, + "step": 55025 + }, + { + "epoch": 49.81900452488688, + "grad_norm": 4.503223419189453, + "learning_rate": 8.854126618937945e-05, + "loss": 1.2659, + "step": 55050 + }, + { + "epoch": 49.841628959276015, + "grad_norm": 5.064691543579102, + 
"learning_rate": 8.84830047518615e-05, + "loss": 1.1565, + "step": 55075 + }, + { + "epoch": 49.86425339366516, + "grad_norm": 4.539597511291504, + "learning_rate": 8.842473490510153e-05, + "loss": 1.1721, + "step": 55100 + }, + { + "epoch": 49.886877828054295, + "grad_norm": 5.208485126495361, + "learning_rate": 8.836645668544193e-05, + "loss": 1.0792, + "step": 55125 + }, + { + "epoch": 49.90950226244344, + "grad_norm": 4.17509126663208, + "learning_rate": 8.830817012923041e-05, + "loss": 1.1144, + "step": 55150 + }, + { + "epoch": 49.932126696832576, + "grad_norm": 4.037477970123291, + "learning_rate": 8.82498752728198e-05, + "loss": 1.0173, + "step": 55175 + }, + { + "epoch": 49.95475113122172, + "grad_norm": 4.351037979125977, + "learning_rate": 8.819157215256813e-05, + "loss": 0.9713, + "step": 55200 + }, + { + "epoch": 49.977375565610856, + "grad_norm": 5.295916557312012, + "learning_rate": 8.813326080483859e-05, + "loss": 1.1529, + "step": 55225 + }, + { + "epoch": 50.0, + "grad_norm": 4.7759270668029785, + "learning_rate": 8.807494126599952e-05, + "loss": 1.1772, + "step": 55250 + }, + { + "epoch": 50.022624434389144, + "grad_norm": 4.5403971672058105, + "learning_rate": 8.801661357242433e-05, + "loss": 1.0469, + "step": 55275 + }, + { + "epoch": 50.04524886877828, + "grad_norm": 5.13372802734375, + "learning_rate": 8.795827776049156e-05, + "loss": 1.0775, + "step": 55300 + }, + { + "epoch": 50.067873303167424, + "grad_norm": 4.247966289520264, + "learning_rate": 8.789993386658474e-05, + "loss": 1.0043, + "step": 55325 + }, + { + "epoch": 50.09049773755656, + "grad_norm": 5.053829193115234, + "learning_rate": 8.784158192709253e-05, + "loss": 1.0262, + "step": 55350 + }, + { + "epoch": 50.113122171945705, + "grad_norm": 6.302519798278809, + "learning_rate": 8.778322197840855e-05, + "loss": 0.9002, + "step": 55375 + }, + { + "epoch": 50.13574660633484, + "grad_norm": 5.568416595458984, + "learning_rate": 8.772485405693146e-05, + "loss": 1.0576, + "step": 
55400 + }, + { + "epoch": 50.158371040723985, + "grad_norm": 5.653662204742432, + "learning_rate": 8.766647819906483e-05, + "loss": 1.0275, + "step": 55425 + }, + { + "epoch": 50.18099547511312, + "grad_norm": 5.099042892456055, + "learning_rate": 8.760809444121722e-05, + "loss": 1.1494, + "step": 55450 + }, + { + "epoch": 50.203619909502265, + "grad_norm": 3.052720308303833, + "learning_rate": 8.754970281980214e-05, + "loss": 1.0364, + "step": 55475 + }, + { + "epoch": 50.2262443438914, + "grad_norm": 5.043280601501465, + "learning_rate": 8.749130337123795e-05, + "loss": 1.0985, + "step": 55500 + }, + { + "epoch": 50.248868778280546, + "grad_norm": 5.506175994873047, + "learning_rate": 8.743289613194792e-05, + "loss": 0.9248, + "step": 55525 + }, + { + "epoch": 50.27149321266968, + "grad_norm": 5.041479587554932, + "learning_rate": 8.737448113836019e-05, + "loss": 1.1453, + "step": 55550 + }, + { + "epoch": 50.294117647058826, + "grad_norm": 4.894893169403076, + "learning_rate": 8.731605842690771e-05, + "loss": 1.1438, + "step": 55575 + }, + { + "epoch": 50.31674208144796, + "grad_norm": 5.285278797149658, + "learning_rate": 8.725762803402827e-05, + "loss": 0.9856, + "step": 55600 + }, + { + "epoch": 50.339366515837106, + "grad_norm": 5.30338191986084, + "learning_rate": 8.719918999616442e-05, + "loss": 1.1053, + "step": 55625 + }, + { + "epoch": 50.36199095022624, + "grad_norm": 4.593639373779297, + "learning_rate": 8.714074434976352e-05, + "loss": 1.0672, + "step": 55650 + }, + { + "epoch": 50.38461538461539, + "grad_norm": 4.997950553894043, + "learning_rate": 8.70822911312776e-05, + "loss": 1.0073, + "step": 55675 + }, + { + "epoch": 50.40723981900452, + "grad_norm": 4.9740824699401855, + "learning_rate": 8.702383037716355e-05, + "loss": 1.0856, + "step": 55700 + }, + { + "epoch": 50.42986425339367, + "grad_norm": 4.203141689300537, + "learning_rate": 8.69653621238828e-05, + "loss": 1.2288, + "step": 55725 + }, + { + "epoch": 50.452488687782804, + "grad_norm": 
4.776144504547119, + "learning_rate": 8.690688640790157e-05, + "loss": 1.0695, + "step": 55750 + }, + { + "epoch": 50.47511312217195, + "grad_norm": 5.120831489562988, + "learning_rate": 8.684840326569068e-05, + "loss": 1.1256, + "step": 55775 + }, + { + "epoch": 50.497737556561084, + "grad_norm": 4.816657543182373, + "learning_rate": 8.678991273372561e-05, + "loss": 1.1635, + "step": 55800 + }, + { + "epoch": 50.52036199095023, + "grad_norm": 4.2659196853637695, + "learning_rate": 8.673141484848641e-05, + "loss": 1.0529, + "step": 55825 + }, + { + "epoch": 50.542986425339365, + "grad_norm": 4.1815972328186035, + "learning_rate": 8.667290964645777e-05, + "loss": 0.9332, + "step": 55850 + }, + { + "epoch": 50.56561085972851, + "grad_norm": 4.719061851501465, + "learning_rate": 8.661439716412889e-05, + "loss": 1.0962, + "step": 55875 + }, + { + "epoch": 50.588235294117645, + "grad_norm": 4.619856357574463, + "learning_rate": 8.655587743799356e-05, + "loss": 1.0358, + "step": 55900 + }, + { + "epoch": 50.61085972850679, + "grad_norm": 4.741133213043213, + "learning_rate": 8.649735050455006e-05, + "loss": 1.2345, + "step": 55925 + }, + { + "epoch": 50.633484162895925, + "grad_norm": 4.988595962524414, + "learning_rate": 8.643881640030116e-05, + "loss": 1.073, + "step": 55950 + }, + { + "epoch": 50.65610859728507, + "grad_norm": 4.526401996612549, + "learning_rate": 8.638027516175412e-05, + "loss": 1.1112, + "step": 55975 + }, + { + "epoch": 50.678733031674206, + "grad_norm": 6.290268421173096, + "learning_rate": 8.632172682542064e-05, + "loss": 1.0573, + "step": 56000 + }, + { + "epoch": 50.70135746606335, + "grad_norm": 5.91135311126709, + "learning_rate": 8.626317142781684e-05, + "loss": 0.9932, + "step": 56025 + }, + { + "epoch": 50.723981900452486, + "grad_norm": 5.749835968017578, + "learning_rate": 8.620460900546326e-05, + "loss": 1.0493, + "step": 56050 + }, + { + "epoch": 50.74660633484163, + "grad_norm": 5.972226619720459, + "learning_rate": 
8.614603959488482e-05, + "loss": 1.1073, + "step": 56075 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 3.0617969036102295, + "learning_rate": 8.608746323261079e-05, + "loss": 0.9442, + "step": 56100 + }, + { + "epoch": 50.79185520361991, + "grad_norm": 3.8403468132019043, + "learning_rate": 8.602887995517476e-05, + "loss": 1.032, + "step": 56125 + }, + { + "epoch": 50.81447963800905, + "grad_norm": 4.818698883056641, + "learning_rate": 8.597028979911466e-05, + "loss": 1.0355, + "step": 56150 + }, + { + "epoch": 50.83710407239819, + "grad_norm": 4.8740153312683105, + "learning_rate": 8.59116928009727e-05, + "loss": 0.9741, + "step": 56175 + }, + { + "epoch": 50.85972850678733, + "grad_norm": 4.436838150024414, + "learning_rate": 8.585308899729538e-05, + "loss": 0.9861, + "step": 56200 + }, + { + "epoch": 50.88235294117647, + "grad_norm": 5.0014472007751465, + "learning_rate": 8.579447842463339e-05, + "loss": 0.9933, + "step": 56225 + }, + { + "epoch": 50.90497737556561, + "grad_norm": 5.533863067626953, + "learning_rate": 8.57358611195417e-05, + "loss": 1.0641, + "step": 56250 + }, + { + "epoch": 50.92760180995475, + "grad_norm": 4.220861434936523, + "learning_rate": 8.567723711857944e-05, + "loss": 1.0358, + "step": 56275 + }, + { + "epoch": 50.95022624434389, + "grad_norm": 4.768381595611572, + "learning_rate": 8.561860645830993e-05, + "loss": 1.115, + "step": 56300 + }, + { + "epoch": 50.97285067873303, + "grad_norm": 4.91024112701416, + "learning_rate": 8.555996917530065e-05, + "loss": 1.0712, + "step": 56325 + }, + { + "epoch": 50.99547511312217, + "grad_norm": 3.9711110591888428, + "learning_rate": 8.550132530612319e-05, + "loss": 1.1407, + "step": 56350 + }, + { + "epoch": 51.01809954751131, + "grad_norm": 4.726860046386719, + "learning_rate": 8.544267488735329e-05, + "loss": 1.0346, + "step": 56375 + }, + { + "epoch": 51.040723981900456, + "grad_norm": 4.389982223510742, + "learning_rate": 8.53840179555707e-05, + "loss": 1.088, + "step": 56400 + }, + 
{ + "epoch": 51.06334841628959, + "grad_norm": 4.467222213745117, + "learning_rate": 8.532535454735934e-05, + "loss": 1.0083, + "step": 56425 + }, + { + "epoch": 51.085972850678736, + "grad_norm": 5.905579090118408, + "learning_rate": 8.526668469930705e-05, + "loss": 0.9034, + "step": 56450 + }, + { + "epoch": 51.10859728506787, + "grad_norm": 5.432013511657715, + "learning_rate": 8.520800844800578e-05, + "loss": 0.9099, + "step": 56475 + }, + { + "epoch": 51.13122171945702, + "grad_norm": 5.049311637878418, + "learning_rate": 8.515167325655024e-05, + "loss": 0.93, + "step": 56500 + }, + { + "epoch": 51.15384615384615, + "grad_norm": 4.538837432861328, + "learning_rate": 8.509298456104203e-05, + "loss": 0.8998, + "step": 56525 + }, + { + "epoch": 51.1764705882353, + "grad_norm": 5.023305416107178, + "learning_rate": 8.503428957062021e-05, + "loss": 0.9987, + "step": 56550 + }, + { + "epoch": 51.199095022624434, + "grad_norm": 3.4950027465820312, + "learning_rate": 8.49755883218924e-05, + "loss": 0.8631, + "step": 56575 + }, + { + "epoch": 51.22171945701358, + "grad_norm": 4.973414897918701, + "learning_rate": 8.491688085147005e-05, + "loss": 1.0109, + "step": 56600 + }, + { + "epoch": 51.244343891402714, + "grad_norm": 5.232577323913574, + "learning_rate": 8.485816719596856e-05, + "loss": 1.0334, + "step": 56625 + }, + { + "epoch": 51.26696832579186, + "grad_norm": 4.882142066955566, + "learning_rate": 8.47994473920072e-05, + "loss": 0.9905, + "step": 56650 + }, + { + "epoch": 51.289592760180994, + "grad_norm": 6.884396553039551, + "learning_rate": 8.4740721476209e-05, + "loss": 0.9886, + "step": 56675 + }, + { + "epoch": 51.31221719457014, + "grad_norm": 4.622968673706055, + "learning_rate": 8.468198948520084e-05, + "loss": 1.0852, + "step": 56700 + }, + { + "epoch": 51.334841628959275, + "grad_norm": 4.605698585510254, + "learning_rate": 8.462325145561343e-05, + "loss": 1.0237, + "step": 56725 + }, + { + "epoch": 51.35746606334842, + "grad_norm": 
5.381290912628174, + "learning_rate": 8.456450742408119e-05, + "loss": 1.0082, + "step": 56750 + }, + { + "epoch": 51.380090497737555, + "grad_norm": 4.742079734802246, + "learning_rate": 8.450575742724228e-05, + "loss": 1.0455, + "step": 56775 + }, + { + "epoch": 51.4027149321267, + "grad_norm": 5.130732536315918, + "learning_rate": 8.444700150173863e-05, + "loss": 0.9221, + "step": 56800 + }, + { + "epoch": 51.425339366515836, + "grad_norm": 3.486804962158203, + "learning_rate": 8.438823968421584e-05, + "loss": 0.9971, + "step": 56825 + }, + { + "epoch": 51.44796380090498, + "grad_norm": 4.974049091339111, + "learning_rate": 8.432947201132317e-05, + "loss": 1.0028, + "step": 56850 + }, + { + "epoch": 51.470588235294116, + "grad_norm": 5.254629611968994, + "learning_rate": 8.427069851971354e-05, + "loss": 1.1134, + "step": 56875 + }, + { + "epoch": 51.49321266968326, + "grad_norm": 5.264469623565674, + "learning_rate": 8.421191924604354e-05, + "loss": 1.027, + "step": 56900 + }, + { + "epoch": 51.515837104072396, + "grad_norm": 4.641003131866455, + "learning_rate": 8.415313422697329e-05, + "loss": 1.0775, + "step": 56925 + }, + { + "epoch": 51.53846153846154, + "grad_norm": 5.969740867614746, + "learning_rate": 8.409434349916655e-05, + "loss": 0.976, + "step": 56950 + }, + { + "epoch": 51.56108597285068, + "grad_norm": 4.478930950164795, + "learning_rate": 8.403554709929067e-05, + "loss": 0.8974, + "step": 56975 + }, + { + "epoch": 51.58371040723982, + "grad_norm": 5.73991584777832, + "learning_rate": 8.397674506401642e-05, + "loss": 0.9987, + "step": 57000 + }, + { + "epoch": 51.60633484162896, + "grad_norm": 5.22064208984375, + "learning_rate": 8.39179374300182e-05, + "loss": 0.9793, + "step": 57025 + }, + { + "epoch": 51.6289592760181, + "grad_norm": 4.16978120803833, + "learning_rate": 8.385912423397387e-05, + "loss": 0.965, + "step": 57050 + }, + { + "epoch": 51.65158371040724, + "grad_norm": 3.6818130016326904, + "learning_rate": 8.38003055125647e-05, + 
"loss": 0.9747, + "step": 57075 + }, + { + "epoch": 51.67420814479638, + "grad_norm": 4.096356391906738, + "learning_rate": 8.37414813024755e-05, + "loss": 0.8952, + "step": 57100 + }, + { + "epoch": 51.69683257918552, + "grad_norm": 3.7166366577148438, + "learning_rate": 8.368265164039447e-05, + "loss": 1.0305, + "step": 57125 + }, + { + "epoch": 51.71945701357466, + "grad_norm": 4.9898176193237305, + "learning_rate": 8.362381656301315e-05, + "loss": 0.9782, + "step": 57150 + }, + { + "epoch": 51.7420814479638, + "grad_norm": 3.6234934329986572, + "learning_rate": 8.356497610702655e-05, + "loss": 1.0226, + "step": 57175 + }, + { + "epoch": 51.76470588235294, + "grad_norm": 4.622697830200195, + "learning_rate": 8.350613030913296e-05, + "loss": 1.1135, + "step": 57200 + }, + { + "epoch": 51.78733031674208, + "grad_norm": 4.255482196807861, + "learning_rate": 8.344727920603407e-05, + "loss": 0.9715, + "step": 57225 + }, + { + "epoch": 51.80995475113122, + "grad_norm": 4.796724796295166, + "learning_rate": 8.338842283443479e-05, + "loss": 1.0713, + "step": 57250 + }, + { + "epoch": 51.83257918552036, + "grad_norm": 3.0090603828430176, + "learning_rate": 8.332956123104341e-05, + "loss": 1.0199, + "step": 57275 + }, + { + "epoch": 51.8552036199095, + "grad_norm": 3.2074475288391113, + "learning_rate": 8.327069443257142e-05, + "loss": 0.9717, + "step": 57300 + }, + { + "epoch": 51.87782805429864, + "grad_norm": 4.8500776290893555, + "learning_rate": 8.321182247573357e-05, + "loss": 1.0186, + "step": 57325 + }, + { + "epoch": 51.90045248868778, + "grad_norm": 5.010964393615723, + "learning_rate": 8.315294539724782e-05, + "loss": 0.988, + "step": 57350 + }, + { + "epoch": 51.92307692307692, + "grad_norm": 4.531972885131836, + "learning_rate": 8.309406323383538e-05, + "loss": 0.9834, + "step": 57375 + }, + { + "epoch": 51.94570135746606, + "grad_norm": 5.096385955810547, + "learning_rate": 8.303517602222053e-05, + "loss": 1.2033, + "step": 57400 + }, + { + "epoch": 
51.9683257918552, + "grad_norm": 4.396825790405273, + "learning_rate": 8.297628379913079e-05, + "loss": 1.0491, + "step": 57425 + }, + { + "epoch": 51.990950226244344, + "grad_norm": 4.0146803855896, + "learning_rate": 8.29173866012967e-05, + "loss": 1.1816, + "step": 57450 + }, + { + "epoch": 52.01357466063349, + "grad_norm": 4.34961462020874, + "learning_rate": 8.285848446545207e-05, + "loss": 0.9503, + "step": 57475 + }, + { + "epoch": 52.036199095022624, + "grad_norm": 5.64193058013916, + "learning_rate": 8.279957742833363e-05, + "loss": 1.0821, + "step": 57500 + }, + { + "epoch": 52.05882352941177, + "grad_norm": 4.364360809326172, + "learning_rate": 8.274066552668122e-05, + "loss": 0.9489, + "step": 57525 + }, + { + "epoch": 52.081447963800905, + "grad_norm": 5.169826507568359, + "learning_rate": 8.268174879723775e-05, + "loss": 1.0077, + "step": 57550 + }, + { + "epoch": 52.10407239819005, + "grad_norm": 5.747620582580566, + "learning_rate": 8.262282727674908e-05, + "loss": 0.8733, + "step": 57575 + }, + { + "epoch": 52.126696832579185, + "grad_norm": 3.697861909866333, + "learning_rate": 8.256390100196413e-05, + "loss": 0.9009, + "step": 57600 + }, + { + "epoch": 52.14932126696833, + "grad_norm": 5.627383708953857, + "learning_rate": 8.250497000963473e-05, + "loss": 0.8749, + "step": 57625 + }, + { + "epoch": 52.171945701357465, + "grad_norm": 5.055898666381836, + "learning_rate": 8.244603433651566e-05, + "loss": 0.9451, + "step": 57650 + }, + { + "epoch": 52.19457013574661, + "grad_norm": 5.240993976593018, + "learning_rate": 8.238709401936462e-05, + "loss": 0.8263, + "step": 57675 + }, + { + "epoch": 52.217194570135746, + "grad_norm": 4.983405113220215, + "learning_rate": 8.232814909494223e-05, + "loss": 1.029, + "step": 57700 + }, + { + "epoch": 52.23981900452489, + "grad_norm": 4.142995834350586, + "learning_rate": 8.226919960001196e-05, + "loss": 0.9562, + "step": 57725 + }, + { + "epoch": 52.262443438914026, + "grad_norm": 4.967777729034424, + 
"learning_rate": 8.221024557134015e-05, + "loss": 0.8929, + "step": 57750 + }, + { + "epoch": 52.28506787330317, + "grad_norm": 4.299960613250732, + "learning_rate": 8.215128704569592e-05, + "loss": 0.8692, + "step": 57775 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 3.7758352756500244, + "learning_rate": 8.209232405985127e-05, + "loss": 0.8449, + "step": 57800 + }, + { + "epoch": 52.33031674208145, + "grad_norm": 5.353555202484131, + "learning_rate": 8.203335665058093e-05, + "loss": 1.0161, + "step": 57825 + }, + { + "epoch": 52.35294117647059, + "grad_norm": 4.520666599273682, + "learning_rate": 8.197438485466239e-05, + "loss": 1.0039, + "step": 57850 + }, + { + "epoch": 52.37556561085973, + "grad_norm": 4.586175441741943, + "learning_rate": 8.191540870887588e-05, + "loss": 0.7861, + "step": 57875 + }, + { + "epoch": 52.39819004524887, + "grad_norm": 4.806995868682861, + "learning_rate": 8.185642825000438e-05, + "loss": 0.9014, + "step": 57900 + }, + { + "epoch": 52.42081447963801, + "grad_norm": 5.556037902832031, + "learning_rate": 8.179744351483352e-05, + "loss": 1.0142, + "step": 57925 + }, + { + "epoch": 52.44343891402715, + "grad_norm": 5.44512414932251, + "learning_rate": 8.17384545401516e-05, + "loss": 1.0591, + "step": 57950 + }, + { + "epoch": 52.46606334841629, + "grad_norm": 5.103080749511719, + "learning_rate": 8.167946136274956e-05, + "loss": 0.9555, + "step": 57975 + }, + { + "epoch": 52.48868778280543, + "grad_norm": 5.634963512420654, + "learning_rate": 8.162046401942097e-05, + "loss": 0.9449, + "step": 58000 + }, + { + "epoch": 52.51131221719457, + "grad_norm": 5.9640793800354, + "learning_rate": 8.156146254696202e-05, + "loss": 1.0044, + "step": 58025 + }, + { + "epoch": 52.53393665158371, + "grad_norm": 4.814497470855713, + "learning_rate": 8.150245698217146e-05, + "loss": 0.8738, + "step": 58050 + }, + { + "epoch": 52.55656108597285, + "grad_norm": 3.970576047897339, + "learning_rate": 8.144344736185057e-05, + "loss": 0.862, + 
"step": 58075 + }, + { + "epoch": 52.57918552036199, + "grad_norm": 5.5982489585876465, + "learning_rate": 8.138443372280319e-05, + "loss": 1.0608, + "step": 58100 + }, + { + "epoch": 52.60180995475113, + "grad_norm": 6.342493057250977, + "learning_rate": 8.132541610183564e-05, + "loss": 1.0313, + "step": 58125 + }, + { + "epoch": 52.62443438914027, + "grad_norm": 3.9694483280181885, + "learning_rate": 8.126639453575674e-05, + "loss": 0.8397, + "step": 58150 + }, + { + "epoch": 52.64705882352941, + "grad_norm": 4.348864555358887, + "learning_rate": 8.120736906137778e-05, + "loss": 0.988, + "step": 58175 + }, + { + "epoch": 52.66968325791855, + "grad_norm": 3.7852602005004883, + "learning_rate": 8.114833971551248e-05, + "loss": 0.9123, + "step": 58200 + }, + { + "epoch": 52.69230769230769, + "grad_norm": 4.544547080993652, + "learning_rate": 8.108930653497694e-05, + "loss": 1.0721, + "step": 58225 + }, + { + "epoch": 52.71493212669683, + "grad_norm": 4.0904693603515625, + "learning_rate": 8.103026955658971e-05, + "loss": 0.8775, + "step": 58250 + }, + { + "epoch": 52.737556561085974, + "grad_norm": 4.872154235839844, + "learning_rate": 8.097122881717167e-05, + "loss": 0.9625, + "step": 58275 + }, + { + "epoch": 52.76018099547511, + "grad_norm": 4.9419941902160645, + "learning_rate": 8.091218435354605e-05, + "loss": 0.9202, + "step": 58300 + }, + { + "epoch": 52.782805429864254, + "grad_norm": 4.382291316986084, + "learning_rate": 8.085313620253843e-05, + "loss": 0.9289, + "step": 58325 + }, + { + "epoch": 52.80542986425339, + "grad_norm": 5.273210048675537, + "learning_rate": 8.079408440097666e-05, + "loss": 0.9524, + "step": 58350 + }, + { + "epoch": 52.828054298642535, + "grad_norm": 4.947889804840088, + "learning_rate": 8.073502898569082e-05, + "loss": 0.9681, + "step": 58375 + }, + { + "epoch": 52.85067873303167, + "grad_norm": 2.935694932937622, + "learning_rate": 8.067596999351339e-05, + "loss": 0.9267, + "step": 58400 + }, + { + "epoch": 52.873303167420815, + 
"grad_norm": 4.31731653213501, + "learning_rate": 8.061690746127895e-05, + "loss": 1.0439, + "step": 58425 + }, + { + "epoch": 52.89592760180995, + "grad_norm": 4.913259983062744, + "learning_rate": 8.055784142582433e-05, + "loss": 1.0238, + "step": 58450 + }, + { + "epoch": 52.918552036199095, + "grad_norm": 4.733499050140381, + "learning_rate": 8.049877192398854e-05, + "loss": 0.9618, + "step": 58475 + }, + { + "epoch": 52.94117647058823, + "grad_norm": 4.514467716217041, + "learning_rate": 8.043969899261277e-05, + "loss": 0.9964, + "step": 58500 + }, + { + "epoch": 52.963800904977376, + "grad_norm": 5.010599136352539, + "learning_rate": 8.038062266854029e-05, + "loss": 0.9073, + "step": 58525 + }, + { + "epoch": 52.98642533936652, + "grad_norm": 4.960625648498535, + "learning_rate": 8.032154298861657e-05, + "loss": 1.0038, + "step": 58550 + }, + { + "epoch": 53.009049773755656, + "grad_norm": 4.952969074249268, + "learning_rate": 8.026245998968913e-05, + "loss": 0.8599, + "step": 58575 + }, + { + "epoch": 53.0316742081448, + "grad_norm": 4.405599594116211, + "learning_rate": 8.020337370860755e-05, + "loss": 0.8009, + "step": 58600 + }, + { + "epoch": 53.05429864253394, + "grad_norm": 4.26874303817749, + "learning_rate": 8.014428418222347e-05, + "loss": 0.9077, + "step": 58625 + }, + { + "epoch": 53.07692307692308, + "grad_norm": 6.236453056335449, + "learning_rate": 8.008519144739058e-05, + "loss": 0.8767, + "step": 58650 + }, + { + "epoch": 53.09954751131222, + "grad_norm": 4.521067142486572, + "learning_rate": 8.002609554096451e-05, + "loss": 0.9162, + "step": 58675 + }, + { + "epoch": 53.12217194570136, + "grad_norm": 3.8446598052978516, + "learning_rate": 7.996699649980292e-05, + "loss": 0.8431, + "step": 58700 + }, + { + "epoch": 53.1447963800905, + "grad_norm": 5.036252498626709, + "learning_rate": 7.990789436076541e-05, + "loss": 0.8377, + "step": 58725 + }, + { + "epoch": 53.16742081447964, + "grad_norm": 4.960869789123535, + "learning_rate": 
7.984878916071353e-05, + "loss": 0.937, + "step": 58750 + }, + { + "epoch": 53.19004524886878, + "grad_norm": 5.040843963623047, + "learning_rate": 7.978968093651067e-05, + "loss": 0.8643, + "step": 58775 + }, + { + "epoch": 53.21266968325792, + "grad_norm": 5.60190486907959, + "learning_rate": 7.97305697250222e-05, + "loss": 1.087, + "step": 58800 + }, + { + "epoch": 53.23529411764706, + "grad_norm": 4.930598735809326, + "learning_rate": 7.967145556311528e-05, + "loss": 1.0033, + "step": 58825 + }, + { + "epoch": 53.2579185520362, + "grad_norm": 3.259086847305298, + "learning_rate": 7.961233848765895e-05, + "loss": 0.8735, + "step": 58850 + }, + { + "epoch": 53.28054298642534, + "grad_norm": 4.807427406311035, + "learning_rate": 7.955321853552407e-05, + "loss": 0.9848, + "step": 58875 + }, + { + "epoch": 53.30316742081448, + "grad_norm": 4.867278099060059, + "learning_rate": 7.949409574358326e-05, + "loss": 0.8761, + "step": 58900 + }, + { + "epoch": 53.32579185520362, + "grad_norm": 5.033575534820557, + "learning_rate": 7.943497014871094e-05, + "loss": 0.7422, + "step": 58925 + }, + { + "epoch": 53.34841628959276, + "grad_norm": 4.1396684646606445, + "learning_rate": 7.93758417877833e-05, + "loss": 0.9512, + "step": 58950 + }, + { + "epoch": 53.3710407239819, + "grad_norm": 4.426119327545166, + "learning_rate": 7.931671069767817e-05, + "loss": 0.8334, + "step": 58975 + }, + { + "epoch": 53.39366515837104, + "grad_norm": 5.277045726776123, + "learning_rate": 7.925757691527516e-05, + "loss": 0.8212, + "step": 59000 + }, + { + "epoch": 53.41628959276018, + "grad_norm": 5.944455146789551, + "learning_rate": 7.919844047745553e-05, + "loss": 0.8919, + "step": 59025 + }, + { + "epoch": 53.43891402714932, + "grad_norm": 5.586777687072754, + "learning_rate": 7.913930142110222e-05, + "loss": 0.8989, + "step": 59050 + }, + { + "epoch": 53.46153846153846, + "grad_norm": 5.0498552322387695, + "learning_rate": 7.908015978309979e-05, + "loss": 0.8728, + "step": 59075 + }, + { + 
"epoch": 53.484162895927604, + "grad_norm": 4.9062910079956055, + "learning_rate": 7.902101560033438e-05, + "loss": 0.8128, + "step": 59100 + }, + { + "epoch": 53.50678733031674, + "grad_norm": 4.3646159172058105, + "learning_rate": 7.896186890969375e-05, + "loss": 0.8927, + "step": 59125 + }, + { + "epoch": 53.529411764705884, + "grad_norm": 4.290850639343262, + "learning_rate": 7.890271974806724e-05, + "loss": 0.9639, + "step": 59150 + }, + { + "epoch": 53.55203619909502, + "grad_norm": 2.934475898742676, + "learning_rate": 7.884356815234569e-05, + "loss": 0.8293, + "step": 59175 + }, + { + "epoch": 53.574660633484164, + "grad_norm": 4.962638854980469, + "learning_rate": 7.878441415942146e-05, + "loss": 0.8695, + "step": 59200 + }, + { + "epoch": 53.5972850678733, + "grad_norm": 5.541092395782471, + "learning_rate": 7.872525780618844e-05, + "loss": 0.9574, + "step": 59225 + }, + { + "epoch": 53.619909502262445, + "grad_norm": 5.185718059539795, + "learning_rate": 7.866609912954199e-05, + "loss": 1.018, + "step": 59250 + }, + { + "epoch": 53.64253393665158, + "grad_norm": 4.9260663986206055, + "learning_rate": 7.860930464834365e-05, + "loss": 1.0888, + "step": 59275 + }, + { + "epoch": 53.665158371040725, + "grad_norm": 5.476282596588135, + "learning_rate": 7.855014152483838e-05, + "loss": 0.8982, + "step": 59300 + }, + { + "epoch": 53.68778280542986, + "grad_norm": 6.069909572601318, + "learning_rate": 7.849097618713829e-05, + "loss": 0.8551, + "step": 59325 + }, + { + "epoch": 53.710407239819006, + "grad_norm": 4.913949966430664, + "learning_rate": 7.84318086721443e-05, + "loss": 0.9009, + "step": 59350 + }, + { + "epoch": 53.73303167420814, + "grad_norm": 4.0589399337768555, + "learning_rate": 7.837263901675874e-05, + "loss": 0.8428, + "step": 59375 + }, + { + "epoch": 53.755656108597286, + "grad_norm": 4.331938743591309, + "learning_rate": 7.831346725788526e-05, + "loss": 0.9352, + "step": 59400 + }, + { + "epoch": 53.77828054298642, + "grad_norm": 
4.615716457366943, + "learning_rate": 7.825429343242879e-05, + "loss": 0.9279, + "step": 59425 + }, + { + "epoch": 53.800904977375566, + "grad_norm": 4.417220592498779, + "learning_rate": 7.819511757729558e-05, + "loss": 0.8169, + "step": 59450 + }, + { + "epoch": 53.8235294117647, + "grad_norm": 3.6304221153259277, + "learning_rate": 7.813593972939313e-05, + "loss": 0.9274, + "step": 59475 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 3.4916086196899414, + "learning_rate": 7.80767599256302e-05, + "loss": 0.9707, + "step": 59500 + }, + { + "epoch": 53.86877828054298, + "grad_norm": 4.305111885070801, + "learning_rate": 7.801757820291675e-05, + "loss": 0.9303, + "step": 59525 + }, + { + "epoch": 53.89140271493213, + "grad_norm": 4.907180309295654, + "learning_rate": 7.795839459816396e-05, + "loss": 1.0336, + "step": 59550 + }, + { + "epoch": 53.914027149321264, + "grad_norm": 4.160072326660156, + "learning_rate": 7.789920914828416e-05, + "loss": 1.0352, + "step": 59575 + }, + { + "epoch": 53.93665158371041, + "grad_norm": 5.184953212738037, + "learning_rate": 7.784002189019085e-05, + "loss": 0.904, + "step": 59600 + }, + { + "epoch": 53.959276018099544, + "grad_norm": 5.4037699699401855, + "learning_rate": 7.778083286079861e-05, + "loss": 0.8632, + "step": 59625 + }, + { + "epoch": 53.98190045248869, + "grad_norm": 4.708549499511719, + "learning_rate": 7.772164209702321e-05, + "loss": 0.8251, + "step": 59650 + }, + { + "epoch": 54.00452488687783, + "grad_norm": 4.25620698928833, + "learning_rate": 7.766244963578145e-05, + "loss": 0.7864, + "step": 59675 + }, + { + "epoch": 54.02714932126697, + "grad_norm": 4.483149528503418, + "learning_rate": 7.760325551399117e-05, + "loss": 0.7154, + "step": 59700 + }, + { + "epoch": 54.04977375565611, + "grad_norm": 4.288634777069092, + "learning_rate": 7.754405976857129e-05, + "loss": 0.9173, + "step": 59725 + }, + { + "epoch": 54.07239819004525, + "grad_norm": 4.393452167510986, + "learning_rate": 7.748486243644173e-05, 
+ "loss": 0.9591, + "step": 59750 + }, + { + "epoch": 54.09502262443439, + "grad_norm": 3.991946220397949, + "learning_rate": 7.742566355452335e-05, + "loss": 0.8082, + "step": 59775 + }, + { + "epoch": 54.11764705882353, + "grad_norm": 3.410980701446533, + "learning_rate": 7.736646315973805e-05, + "loss": 0.8256, + "step": 59800 + }, + { + "epoch": 54.14027149321267, + "grad_norm": 4.950623512268066, + "learning_rate": 7.730726128900864e-05, + "loss": 0.7366, + "step": 59825 + }, + { + "epoch": 54.16289592760181, + "grad_norm": 3.422947883605957, + "learning_rate": 7.724805797925886e-05, + "loss": 0.7395, + "step": 59850 + }, + { + "epoch": 54.18552036199095, + "grad_norm": 4.740551948547363, + "learning_rate": 7.71888532674133e-05, + "loss": 0.7957, + "step": 59875 + }, + { + "epoch": 54.20814479638009, + "grad_norm": 5.016266822814941, + "learning_rate": 7.71296471903975e-05, + "loss": 0.9355, + "step": 59900 + }, + { + "epoch": 54.23076923076923, + "grad_norm": 6.452232360839844, + "learning_rate": 7.707043978513784e-05, + "loss": 0.9089, + "step": 59925 + }, + { + "epoch": 54.25339366515837, + "grad_norm": 6.509178638458252, + "learning_rate": 7.701123108856147e-05, + "loss": 0.8108, + "step": 59950 + }, + { + "epoch": 54.276018099547514, + "grad_norm": 5.48393440246582, + "learning_rate": 7.695202113759637e-05, + "loss": 0.7972, + "step": 59975 + }, + { + "epoch": 54.29864253393665, + "grad_norm": 4.971682071685791, + "learning_rate": 7.689280996917132e-05, + "loss": 0.7777, + "step": 60000 + }, + { + "epoch": 54.321266968325794, + "grad_norm": 4.500193119049072, + "learning_rate": 7.683359762021586e-05, + "loss": 0.9309, + "step": 60025 + }, + { + "epoch": 54.34389140271493, + "grad_norm": 4.3350934982299805, + "learning_rate": 7.677438412766026e-05, + "loss": 0.9981, + "step": 60050 + }, + { + "epoch": 54.366515837104075, + "grad_norm": 4.638091087341309, + "learning_rate": 7.671516952843549e-05, + "loss": 0.8094, + "step": 60075 + }, + { + "epoch": 
54.38914027149321, + "grad_norm": 5.043220043182373, + "learning_rate": 7.665595385947324e-05, + "loss": 0.7614, + "step": 60100 + }, + { + "epoch": 54.411764705882355, + "grad_norm": 4.639249324798584, + "learning_rate": 7.659673715770582e-05, + "loss": 0.8464, + "step": 60125 + }, + { + "epoch": 54.43438914027149, + "grad_norm": 3.943392038345337, + "learning_rate": 7.653751946006623e-05, + "loss": 0.8505, + "step": 60150 + }, + { + "epoch": 54.457013574660635, + "grad_norm": 3.4700372219085693, + "learning_rate": 7.647830080348808e-05, + "loss": 0.8874, + "step": 60175 + }, + { + "epoch": 54.47963800904977, + "grad_norm": 4.969888687133789, + "learning_rate": 7.641908122490556e-05, + "loss": 0.9128, + "step": 60200 + }, + { + "epoch": 54.502262443438916, + "grad_norm": 4.452884674072266, + "learning_rate": 7.635986076125344e-05, + "loss": 0.9618, + "step": 60225 + }, + { + "epoch": 54.52488687782805, + "grad_norm": 4.573580265045166, + "learning_rate": 7.630063944946708e-05, + "loss": 0.7469, + "step": 60250 + }, + { + "epoch": 54.547511312217196, + "grad_norm": 6.330535411834717, + "learning_rate": 7.62414173264823e-05, + "loss": 0.7804, + "step": 60275 + }, + { + "epoch": 54.57013574660633, + "grad_norm": 4.171976566314697, + "learning_rate": 7.618219442923547e-05, + "loss": 0.8142, + "step": 60300 + }, + { + "epoch": 54.59276018099548, + "grad_norm": 4.339895248413086, + "learning_rate": 7.612297079466346e-05, + "loss": 0.9616, + "step": 60325 + }, + { + "epoch": 54.61538461538461, + "grad_norm": 4.75609016418457, + "learning_rate": 7.606374645970356e-05, + "loss": 1.0351, + "step": 60350 + }, + { + "epoch": 54.63800904977376, + "grad_norm": 4.1576995849609375, + "learning_rate": 7.600452146129352e-05, + "loss": 0.7971, + "step": 60375 + }, + { + "epoch": 54.660633484162894, + "grad_norm": 4.641231536865234, + "learning_rate": 7.594529583637149e-05, + "loss": 0.8702, + "step": 60400 + }, + { + "epoch": 54.68325791855204, + "grad_norm": 3.3054144382476807, + 
"learning_rate": 7.588606962187601e-05, + "loss": 0.7996, + "step": 60425 + }, + { + "epoch": 54.705882352941174, + "grad_norm": 4.372158527374268, + "learning_rate": 7.582684285474603e-05, + "loss": 0.9059, + "step": 60450 + }, + { + "epoch": 54.72850678733032, + "grad_norm": 3.55283522605896, + "learning_rate": 7.576761557192076e-05, + "loss": 0.8029, + "step": 60475 + }, + { + "epoch": 54.751131221719454, + "grad_norm": 3.247828960418701, + "learning_rate": 7.57083878103398e-05, + "loss": 0.8858, + "step": 60500 + }, + { + "epoch": 54.7737556561086, + "grad_norm": 6.058444023132324, + "learning_rate": 7.564915960694308e-05, + "loss": 0.8342, + "step": 60525 + }, + { + "epoch": 54.796380090497735, + "grad_norm": 4.797789573669434, + "learning_rate": 7.558993099867068e-05, + "loss": 0.8698, + "step": 60550 + }, + { + "epoch": 54.81900452488688, + "grad_norm": 4.413180351257324, + "learning_rate": 7.553070202246305e-05, + "loss": 0.9446, + "step": 60575 + }, + { + "epoch": 54.841628959276015, + "grad_norm": 4.170695781707764, + "learning_rate": 7.547147271526081e-05, + "loss": 0.9185, + "step": 60600 + }, + { + "epoch": 54.86425339366516, + "grad_norm": 5.421101093292236, + "learning_rate": 7.541224311400484e-05, + "loss": 0.8482, + "step": 60625 + }, + { + "epoch": 54.886877828054295, + "grad_norm": 4.560364723205566, + "learning_rate": 7.535301325563611e-05, + "loss": 0.8413, + "step": 60650 + }, + { + "epoch": 54.90950226244344, + "grad_norm": 6.658802032470703, + "learning_rate": 7.529378317709587e-05, + "loss": 0.9026, + "step": 60675 + }, + { + "epoch": 54.932126696832576, + "grad_norm": 2.510969400405884, + "learning_rate": 7.52345529153254e-05, + "loss": 0.8417, + "step": 60700 + }, + { + "epoch": 54.95475113122172, + "grad_norm": 5.733388423919678, + "learning_rate": 7.517532250726617e-05, + "loss": 0.9093, + "step": 60725 + }, + { + "epoch": 54.977375565610856, + "grad_norm": 5.590017318725586, + "learning_rate": 7.511609198985969e-05, + "loss": 0.8656, + 
"step": 60750 + }, + { + "epoch": 55.0, + "grad_norm": 5.159645080566406, + "learning_rate": 7.505686140004757e-05, + "loss": 0.747, + "step": 60775 + }, + { + "epoch": 55.022624434389144, + "grad_norm": 3.632514238357544, + "learning_rate": 7.499763077477145e-05, + "loss": 0.8007, + "step": 60800 + }, + { + "epoch": 55.04524886877828, + "grad_norm": 4.552527904510498, + "learning_rate": 7.493840015097298e-05, + "loss": 0.8871, + "step": 60825 + }, + { + "epoch": 55.067873303167424, + "grad_norm": 3.7181003093719482, + "learning_rate": 7.487916956559385e-05, + "loss": 0.8081, + "step": 60850 + }, + { + "epoch": 55.09049773755656, + "grad_norm": 4.032871246337891, + "learning_rate": 7.481993905557571e-05, + "loss": 0.7013, + "step": 60875 + }, + { + "epoch": 55.113122171945705, + "grad_norm": 4.573465824127197, + "learning_rate": 7.476070865786012e-05, + "loss": 0.8295, + "step": 60900 + }, + { + "epoch": 55.13574660633484, + "grad_norm": 3.690110445022583, + "learning_rate": 7.470147840938863e-05, + "loss": 0.8161, + "step": 60925 + }, + { + "epoch": 55.158371040723985, + "grad_norm": 3.9547295570373535, + "learning_rate": 7.464224834710267e-05, + "loss": 0.7405, + "step": 60950 + }, + { + "epoch": 55.18099547511312, + "grad_norm": 4.441616535186768, + "learning_rate": 7.458301850794355e-05, + "loss": 0.7908, + "step": 60975 + }, + { + "epoch": 55.203619909502265, + "grad_norm": 4.410475730895996, + "learning_rate": 7.452378892885243e-05, + "loss": 0.7324, + "step": 61000 + }, + { + "epoch": 55.2262443438914, + "grad_norm": 4.958261489868164, + "learning_rate": 7.446455964677036e-05, + "loss": 0.7666, + "step": 61025 + }, + { + "epoch": 55.248868778280546, + "grad_norm": 4.1845784187316895, + "learning_rate": 7.440533069863813e-05, + "loss": 0.7368, + "step": 61050 + }, + { + "epoch": 55.27149321266968, + "grad_norm": 4.775341510772705, + "learning_rate": 7.434610212139639e-05, + "loss": 0.8197, + "step": 61075 + }, + { + "epoch": 55.294117647058826, + "grad_norm": 
4.15249490737915, + "learning_rate": 7.428687395198551e-05, + "loss": 0.8121, + "step": 61100 + }, + { + "epoch": 55.31674208144796, + "grad_norm": 4.936108589172363, + "learning_rate": 7.422764622734565e-05, + "loss": 0.7077, + "step": 61125 + }, + { + "epoch": 55.339366515837106, + "grad_norm": 3.2025458812713623, + "learning_rate": 7.416841898441663e-05, + "loss": 0.8949, + "step": 61150 + }, + { + "epoch": 55.36199095022624, + "grad_norm": 5.4227399826049805, + "learning_rate": 7.410919226013802e-05, + "loss": 0.8844, + "step": 61175 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 5.631619930267334, + "learning_rate": 7.404996609144908e-05, + "loss": 0.842, + "step": 61200 + }, + { + "epoch": 55.40723981900452, + "grad_norm": 5.11384391784668, + "learning_rate": 7.399074051528867e-05, + "loss": 0.8668, + "step": 61225 + }, + { + "epoch": 55.42986425339367, + "grad_norm": 5.616774082183838, + "learning_rate": 7.393151556859532e-05, + "loss": 0.7929, + "step": 61250 + }, + { + "epoch": 55.452488687782804, + "grad_norm": 4.66877555847168, + "learning_rate": 7.387229128830714e-05, + "loss": 0.6773, + "step": 61275 + }, + { + "epoch": 55.47511312217195, + "grad_norm": 5.053830623626709, + "learning_rate": 7.381306771136186e-05, + "loss": 0.7212, + "step": 61300 + }, + { + "epoch": 55.497737556561084, + "grad_norm": 4.312491416931152, + "learning_rate": 7.375384487469673e-05, + "loss": 0.8143, + "step": 61325 + }, + { + "epoch": 55.52036199095023, + "grad_norm": 2.47932767868042, + "learning_rate": 7.369462281524857e-05, + "loss": 0.776, + "step": 61350 + }, + { + "epoch": 55.542986425339365, + "grad_norm": null, + "learning_rate": 7.363777040367044e-05, + "loss": 0.7555, + "step": 61375 + }, + { + "epoch": 55.56561085972851, + "grad_norm": 3.8299970626831055, + "learning_rate": 7.357854997471195e-05, + "loss": 0.7993, + "step": 61400 + }, + { + "epoch": 55.588235294117645, + "grad_norm": 4.222673416137695, + "learning_rate": 7.351933043230046e-05, + "loss": 
0.8498, + "step": 61425 + }, + { + "epoch": 55.61085972850679, + "grad_norm": 4.237673282623291, + "learning_rate": 7.346011181337071e-05, + "loss": 0.8143, + "step": 61450 + }, + { + "epoch": 55.633484162895925, + "grad_norm": 4.1661505699157715, + "learning_rate": 7.34008941548569e-05, + "loss": 0.8586, + "step": 61475 + }, + { + "epoch": 55.65610859728507, + "grad_norm": 2.9293253421783447, + "learning_rate": 7.334167749369258e-05, + "loss": 0.8458, + "step": 61500 + }, + { + "epoch": 55.678733031674206, + "grad_norm": 4.5011067390441895, + "learning_rate": 7.328246186681073e-05, + "loss": 0.8494, + "step": 61525 + }, + { + "epoch": 55.70135746606335, + "grad_norm": 4.723522186279297, + "learning_rate": 7.322324731114364e-05, + "loss": 0.9023, + "step": 61550 + }, + { + "epoch": 55.723981900452486, + "grad_norm": 2.886542797088623, + "learning_rate": 7.316403386362297e-05, + "loss": 0.7982, + "step": 61575 + }, + { + "epoch": 55.74660633484163, + "grad_norm": 4.536559104919434, + "learning_rate": 7.310482156117968e-05, + "loss": 0.8375, + "step": 61600 + }, + { + "epoch": 55.76923076923077, + "grad_norm": 4.458587646484375, + "learning_rate": 7.304561044074399e-05, + "loss": 0.7473, + "step": 61625 + }, + { + "epoch": 55.79185520361991, + "grad_norm": 4.814058780670166, + "learning_rate": 7.298640053924542e-05, + "loss": 0.9217, + "step": 61650 + }, + { + "epoch": 55.81447963800905, + "grad_norm": 4.6384196281433105, + "learning_rate": 7.29271918936127e-05, + "loss": 0.7983, + "step": 61675 + }, + { + "epoch": 55.83710407239819, + "grad_norm": 4.325346946716309, + "learning_rate": 7.286798454077377e-05, + "loss": 0.8147, + "step": 61700 + }, + { + "epoch": 55.85972850678733, + "grad_norm": 5.219418525695801, + "learning_rate": 7.280877851765582e-05, + "loss": 0.7371, + "step": 61725 + }, + { + "epoch": 55.88235294117647, + "grad_norm": 3.9857089519500732, + "learning_rate": 7.274957386118515e-05, + "loss": 0.9557, + "step": 61750 + }, + { + "epoch": 
55.90497737556561, + "grad_norm": 3.2314071655273438, + "learning_rate": 7.269037060828723e-05, + "loss": 0.7471, + "step": 61775 + }, + { + "epoch": 55.92760180995475, + "grad_norm": 4.041252613067627, + "learning_rate": 7.263116879588665e-05, + "loss": 0.8052, + "step": 61800 + }, + { + "epoch": 55.95022624434389, + "grad_norm": 5.032653331756592, + "learning_rate": 7.257196846090713e-05, + "loss": 0.7964, + "step": 61825 + }, + { + "epoch": 55.97285067873303, + "grad_norm": 5.317902565002441, + "learning_rate": 7.251276964027141e-05, + "loss": 0.8317, + "step": 61850 + }, + { + "epoch": 55.99547511312217, + "grad_norm": 3.7628276348114014, + "learning_rate": 7.245357237090136e-05, + "loss": 0.8196, + "step": 61875 + }, + { + "epoch": 56.01809954751131, + "grad_norm": 2.3814358711242676, + "learning_rate": 7.23943766897178e-05, + "loss": 0.6783, + "step": 61900 + }, + { + "epoch": 56.040723981900456, + "grad_norm": 4.1640305519104, + "learning_rate": 7.233518263364064e-05, + "loss": 0.7511, + "step": 61925 + }, + { + "epoch": 56.06334841628959, + "grad_norm": 4.725026607513428, + "learning_rate": 7.22759902395887e-05, + "loss": 0.7144, + "step": 61950 + }, + { + "epoch": 56.085972850678736, + "grad_norm": 5.087151527404785, + "learning_rate": 7.221679954447983e-05, + "loss": 0.7817, + "step": 61975 + }, + { + "epoch": 56.10859728506787, + "grad_norm": 4.164005279541016, + "learning_rate": 7.21576105852308e-05, + "loss": 0.7576, + "step": 62000 + }, + { + "epoch": 56.13122171945702, + "grad_norm": 5.5483293533325195, + "learning_rate": 7.209842339875726e-05, + "loss": 0.7417, + "step": 62025 + }, + { + "epoch": 56.15384615384615, + "grad_norm": 4.518779754638672, + "learning_rate": 7.203923802197381e-05, + "loss": 0.743, + "step": 62050 + }, + { + "epoch": 56.1764705882353, + "grad_norm": 3.0123450756073, + "learning_rate": 7.198005449179387e-05, + "loss": 0.6979, + "step": 62075 + }, + { + "epoch": 56.199095022624434, + "grad_norm": 4.908483028411865, + 
"learning_rate": 7.192087284512977e-05, + "loss": 0.7268, + "step": 62100 + }, + { + "epoch": 56.22171945701358, + "grad_norm": 4.837472915649414, + "learning_rate": 7.18616931188926e-05, + "loss": 0.703, + "step": 62125 + }, + { + "epoch": 56.244343891402714, + "grad_norm": 4.4174299240112305, + "learning_rate": 7.180251534999227e-05, + "loss": 0.7239, + "step": 62150 + }, + { + "epoch": 56.26696832579186, + "grad_norm": 3.4075124263763428, + "learning_rate": 7.174333957533752e-05, + "loss": 0.7709, + "step": 62175 + }, + { + "epoch": 56.289592760180994, + "grad_norm": 4.364498615264893, + "learning_rate": 7.168416583183577e-05, + "loss": 0.8033, + "step": 62200 + }, + { + "epoch": 56.31221719457014, + "grad_norm": 3.780409336090088, + "learning_rate": 7.162499415639324e-05, + "loss": 0.7739, + "step": 62225 + }, + { + "epoch": 56.334841628959275, + "grad_norm": 5.191378116607666, + "learning_rate": 7.15658245859148e-05, + "loss": 0.7106, + "step": 62250 + }, + { + "epoch": 56.35746606334842, + "grad_norm": 3.34543514251709, + "learning_rate": 7.150665715730405e-05, + "loss": 0.6838, + "step": 62275 + }, + { + "epoch": 56.380090497737555, + "grad_norm": 4.146462440490723, + "learning_rate": 7.144749190746326e-05, + "loss": 0.6753, + "step": 62300 + }, + { + "epoch": 56.4027149321267, + "grad_norm": 6.109175682067871, + "learning_rate": 7.13883288732933e-05, + "loss": 0.7479, + "step": 62325 + }, + { + "epoch": 56.425339366515836, + "grad_norm": 4.915993690490723, + "learning_rate": 7.132916809169368e-05, + "loss": 0.7604, + "step": 62350 + }, + { + "epoch": 56.44796380090498, + "grad_norm": 4.397975444793701, + "learning_rate": 7.127000959956248e-05, + "loss": 0.8161, + "step": 62375 + }, + { + "epoch": 56.470588235294116, + "grad_norm": 4.625315189361572, + "learning_rate": 7.121085343379644e-05, + "loss": 0.7908, + "step": 62400 + }, + { + "epoch": 56.49321266968326, + "grad_norm": 3.804797649383545, + "learning_rate": 7.115169963129076e-05, + "loss": 0.7554, + 
"step": 62425 + }, + { + "epoch": 56.515837104072396, + "grad_norm": 4.955995082855225, + "learning_rate": 7.109254822893919e-05, + "loss": 0.7662, + "step": 62450 + }, + { + "epoch": 56.53846153846154, + "grad_norm": 4.094953536987305, + "learning_rate": 7.103339926363398e-05, + "loss": 0.7181, + "step": 62475 + }, + { + "epoch": 56.56108597285068, + "grad_norm": 5.037959575653076, + "learning_rate": 7.097425277226586e-05, + "loss": 0.8225, + "step": 62500 + }, + { + "epoch": 56.58371040723982, + "grad_norm": 4.111741542816162, + "learning_rate": 7.091510879172405e-05, + "loss": 0.8178, + "step": 62525 + }, + { + "epoch": 56.60633484162896, + "grad_norm": 4.771442890167236, + "learning_rate": 7.085596735889615e-05, + "loss": 0.7598, + "step": 62550 + }, + { + "epoch": 56.6289592760181, + "grad_norm": 5.622108459472656, + "learning_rate": 7.079682851066821e-05, + "loss": 0.8022, + "step": 62575 + }, + { + "epoch": 56.65158371040724, + "grad_norm": 4.363972187042236, + "learning_rate": 7.073769228392465e-05, + "loss": 0.8628, + "step": 62600 + }, + { + "epoch": 56.67420814479638, + "grad_norm": 3.973304271697998, + "learning_rate": 7.067855871554826e-05, + "loss": 0.6982, + "step": 62625 + }, + { + "epoch": 56.69683257918552, + "grad_norm": 4.642250061035156, + "learning_rate": 7.061942784242019e-05, + "loss": 0.7985, + "step": 62650 + }, + { + "epoch": 56.71945701357466, + "grad_norm": 2.3813135623931885, + "learning_rate": 7.056029970141988e-05, + "loss": 0.7239, + "step": 62675 + }, + { + "epoch": 56.7420814479638, + "grad_norm": 3.675196647644043, + "learning_rate": 7.050117432942506e-05, + "loss": 0.7956, + "step": 62700 + }, + { + "epoch": 56.76470588235294, + "grad_norm": 4.203518867492676, + "learning_rate": 7.044205176331178e-05, + "loss": 0.8164, + "step": 62725 + }, + { + "epoch": 56.78733031674208, + "grad_norm": 5.610568046569824, + "learning_rate": 7.038293203995428e-05, + "loss": 0.82, + "step": 62750 + }, + { + "epoch": 56.80995475113122, + 
"grad_norm": 5.120931148529053, + "learning_rate": 7.032381519622508e-05, + "loss": 0.801, + "step": 62775 + }, + { + "epoch": 56.83257918552036, + "grad_norm": 4.9113335609436035, + "learning_rate": 7.026470126899489e-05, + "loss": 0.7235, + "step": 62800 + }, + { + "epoch": 56.8552036199095, + "grad_norm": 4.712889671325684, + "learning_rate": 7.020559029513258e-05, + "loss": 0.9791, + "step": 62825 + }, + { + "epoch": 56.87782805429864, + "grad_norm": 3.4738965034484863, + "learning_rate": 7.014648231150519e-05, + "loss": 0.7358, + "step": 62850 + }, + { + "epoch": 56.90045248868778, + "grad_norm": 4.060792446136475, + "learning_rate": 7.00873773549779e-05, + "loss": 0.8057, + "step": 62875 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 4.6342644691467285, + "learning_rate": 7.002827546241396e-05, + "loss": 0.8536, + "step": 62900 + }, + { + "epoch": 56.94570135746606, + "grad_norm": 4.327664852142334, + "learning_rate": 6.99691766706748e-05, + "loss": 0.7719, + "step": 62925 + }, + { + "epoch": 56.9683257918552, + "grad_norm": 4.646831035614014, + "learning_rate": 6.991008101661986e-05, + "loss": 0.9035, + "step": 62950 + }, + { + "epoch": 56.990950226244344, + "grad_norm": 4.714659690856934, + "learning_rate": 6.98509885371066e-05, + "loss": 0.761, + "step": 62975 + }, + { + "epoch": 57.01357466063349, + "grad_norm": 4.753892421722412, + "learning_rate": 6.979189926899054e-05, + "loss": 0.7183, + "step": 63000 + }, + { + "epoch": 57.036199095022624, + "grad_norm": 4.693076133728027, + "learning_rate": 6.973281324912518e-05, + "loss": 0.6936, + "step": 63025 + }, + { + "epoch": 57.05882352941177, + "grad_norm": 2.9521639347076416, + "learning_rate": 6.967373051436201e-05, + "loss": 0.6493, + "step": 63050 + }, + { + "epoch": 57.081447963800905, + "grad_norm": 5.09092378616333, + "learning_rate": 6.961465110155042e-05, + "loss": 0.7024, + "step": 63075 + }, + { + "epoch": 57.10407239819005, + "grad_norm": 5.996335029602051, + "learning_rate": 
6.95555750475378e-05, + "loss": 0.7514, + "step": 63100 + }, + { + "epoch": 57.126696832579185, + "grad_norm": 4.089336395263672, + "learning_rate": 6.949650238916941e-05, + "loss": 0.6803, + "step": 63125 + }, + { + "epoch": 57.14932126696833, + "grad_norm": 2.8486642837524414, + "learning_rate": 6.943743316328838e-05, + "loss": 0.7033, + "step": 63150 + }, + { + "epoch": 57.171945701357465, + "grad_norm": 4.217557430267334, + "learning_rate": 6.937836740673573e-05, + "loss": 0.7176, + "step": 63175 + }, + { + "epoch": 57.19457013574661, + "grad_norm": 4.2329511642456055, + "learning_rate": 6.931930515635028e-05, + "loss": 0.7707, + "step": 63200 + }, + { + "epoch": 57.217194570135746, + "grad_norm": 6.204469203948975, + "learning_rate": 6.926024644896869e-05, + "loss": 0.6269, + "step": 63225 + }, + { + "epoch": 57.23981900452489, + "grad_norm": 4.37061071395874, + "learning_rate": 6.920119132142542e-05, + "loss": 0.7308, + "step": 63250 + }, + { + "epoch": 57.262443438914026, + "grad_norm": 4.992610454559326, + "learning_rate": 6.914213981055264e-05, + "loss": 0.8525, + "step": 63275 + }, + { + "epoch": 57.28506787330317, + "grad_norm": 5.8490447998046875, + "learning_rate": 6.908309195318034e-05, + "loss": 0.7, + "step": 63300 + }, + { + "epoch": 57.30769230769231, + "grad_norm": 4.511977672576904, + "learning_rate": 6.90240477861362e-05, + "loss": 0.7824, + "step": 63325 + }, + { + "epoch": 57.33031674208145, + "grad_norm": 4.135030269622803, + "learning_rate": 6.896500734624555e-05, + "loss": 0.7225, + "step": 63350 + }, + { + "epoch": 57.35294117647059, + "grad_norm": 4.676153182983398, + "learning_rate": 6.890597067033148e-05, + "loss": 0.7085, + "step": 63375 + }, + { + "epoch": 57.37556561085973, + "grad_norm": 4.3189215660095215, + "learning_rate": 6.884693779521468e-05, + "loss": 0.705, + "step": 63400 + }, + { + "epoch": 57.39819004524887, + "grad_norm": 4.620569705963135, + "learning_rate": 6.878790875771347e-05, + "loss": 0.7154, + "step": 63425 + }, 
+ { + "epoch": 57.42081447963801, + "grad_norm": 5.138914108276367, + "learning_rate": 6.87288835946438e-05, + "loss": 0.6988, + "step": 63450 + }, + { + "epoch": 57.44343891402715, + "grad_norm": 4.84014368057251, + "learning_rate": 6.86698623428192e-05, + "loss": 0.6541, + "step": 63475 + }, + { + "epoch": 57.46606334841629, + "grad_norm": 3.530526638031006, + "learning_rate": 6.861320565493707e-05, + "loss": 0.8056, + "step": 63500 + }, + { + "epoch": 57.48868778280543, + "grad_norm": 5.182336807250977, + "learning_rate": 6.855419217593208e-05, + "loss": 0.8006, + "step": 63525 + }, + { + "epoch": 57.51131221719457, + "grad_norm": 4.710208892822266, + "learning_rate": 6.849518271712579e-05, + "loss": 0.7378, + "step": 63550 + }, + { + "epoch": 57.53393665158371, + "grad_norm": 4.753151893615723, + "learning_rate": 6.843617731532191e-05, + "loss": 0.7038, + "step": 63575 + }, + { + "epoch": 57.55656108597285, + "grad_norm": 4.007762432098389, + "learning_rate": 6.837717600732167e-05, + "loss": 0.6604, + "step": 63600 + }, + { + "epoch": 57.57918552036199, + "grad_norm": 4.074855804443359, + "learning_rate": 6.831817882992368e-05, + "loss": 0.7431, + "step": 63625 + }, + { + "epoch": 57.60180995475113, + "grad_norm": 4.61572265625, + "learning_rate": 6.825918581992403e-05, + "loss": 0.7457, + "step": 63650 + }, + { + "epoch": 57.62443438914027, + "grad_norm": 4.903160095214844, + "learning_rate": 6.820019701411617e-05, + "loss": 0.8099, + "step": 63675 + }, + { + "epoch": 57.64705882352941, + "grad_norm": 4.184039115905762, + "learning_rate": 6.814121244929096e-05, + "loss": 0.7859, + "step": 63700 + }, + { + "epoch": 57.66968325791855, + "grad_norm": 4.689314842224121, + "learning_rate": 6.808223216223658e-05, + "loss": 0.7541, + "step": 63725 + }, + { + "epoch": 57.69230769230769, + "grad_norm": 4.083841323852539, + "learning_rate": 6.80232561897386e-05, + "loss": 0.686, + "step": 63750 + }, + { + "epoch": 57.71493212669683, + "grad_norm": 3.9760990142822266, + 
"learning_rate": 6.796428456857983e-05, + "loss": 0.6643, + "step": 63775 + }, + { + "epoch": 57.737556561085974, + "grad_norm": 5.40849494934082, + "learning_rate": 6.79053173355404e-05, + "loss": 0.7009, + "step": 63800 + }, + { + "epoch": 57.76018099547511, + "grad_norm": 5.317495346069336, + "learning_rate": 6.784635452739771e-05, + "loss": 0.727, + "step": 63825 + }, + { + "epoch": 57.782805429864254, + "grad_norm": 4.315069675445557, + "learning_rate": 6.778739618092638e-05, + "loss": 0.8383, + "step": 63850 + }, + { + "epoch": 57.80542986425339, + "grad_norm": 4.335770130157471, + "learning_rate": 6.772844233289827e-05, + "loss": 0.6921, + "step": 63875 + }, + { + "epoch": 57.828054298642535, + "grad_norm": 3.6915290355682373, + "learning_rate": 6.766949302008243e-05, + "loss": 0.7353, + "step": 63900 + }, + { + "epoch": 57.85067873303167, + "grad_norm": 4.659553527832031, + "learning_rate": 6.761054827924506e-05, + "loss": 0.6632, + "step": 63925 + }, + { + "epoch": 57.873303167420815, + "grad_norm": 4.827514171600342, + "learning_rate": 6.75516081471495e-05, + "loss": 0.7025, + "step": 63950 + }, + { + "epoch": 57.89592760180995, + "grad_norm": 4.103525638580322, + "learning_rate": 6.74926726605563e-05, + "loss": 0.7812, + "step": 63975 + }, + { + "epoch": 57.918552036199095, + "grad_norm": 3.6395256519317627, + "learning_rate": 6.7433741856223e-05, + "loss": 0.8079, + "step": 64000 + }, + { + "epoch": 57.94117647058823, + "grad_norm": 4.708597660064697, + "learning_rate": 6.737481577090427e-05, + "loss": 0.6813, + "step": 64025 + }, + { + "epoch": 57.963800904977376, + "grad_norm": 5.471441745758057, + "learning_rate": 6.731589444135186e-05, + "loss": 0.7924, + "step": 64050 + }, + { + "epoch": 57.98642533936652, + "grad_norm": 3.4653618335723877, + "learning_rate": 6.725697790431454e-05, + "loss": 0.6593, + "step": 64075 + }, + { + "epoch": 58.009049773755656, + "grad_norm": 4.245968341827393, + "learning_rate": 6.719806619653805e-05, + "loss": 0.6898, + 
"step": 64100 + }, + { + "epoch": 58.0316742081448, + "grad_norm": 5.01800012588501, + "learning_rate": 6.713915935476516e-05, + "loss": 0.6993, + "step": 64125 + }, + { + "epoch": 58.05429864253394, + "grad_norm": 5.440726280212402, + "learning_rate": 6.70802574157356e-05, + "loss": 0.5498, + "step": 64150 + }, + { + "epoch": 58.07692307692308, + "grad_norm": 2.5011403560638428, + "learning_rate": 6.702136041618605e-05, + "loss": 0.6508, + "step": 64175 + }, + { + "epoch": 58.09954751131222, + "grad_norm": 4.115577697753906, + "learning_rate": 6.696246839285009e-05, + "loss": 0.6571, + "step": 64200 + }, + { + "epoch": 58.12217194570136, + "grad_norm": 3.631739377975464, + "learning_rate": 6.690358138245819e-05, + "loss": 0.6546, + "step": 64225 + }, + { + "epoch": 58.1447963800905, + "grad_norm": 4.247576713562012, + "learning_rate": 6.684469942173772e-05, + "loss": 0.6615, + "step": 64250 + }, + { + "epoch": 58.16742081447964, + "grad_norm": 4.359410762786865, + "learning_rate": 6.678582254741286e-05, + "loss": 0.5832, + "step": 64275 + }, + { + "epoch": 58.19004524886878, + "grad_norm": 2.6934914588928223, + "learning_rate": 6.672695079620469e-05, + "loss": 0.6694, + "step": 64300 + }, + { + "epoch": 58.21266968325792, + "grad_norm": 3.67513108253479, + "learning_rate": 6.666808420483102e-05, + "loss": 0.7659, + "step": 64325 + }, + { + "epoch": 58.23529411764706, + "grad_norm": 4.915542125701904, + "learning_rate": 6.660922281000649e-05, + "loss": 0.6633, + "step": 64350 + }, + { + "epoch": 58.2579185520362, + "grad_norm": 3.5720250606536865, + "learning_rate": 6.655036664844245e-05, + "loss": 0.6374, + "step": 64375 + }, + { + "epoch": 58.28054298642534, + "grad_norm": 5.004158020019531, + "learning_rate": 6.649151575684705e-05, + "loss": 0.6356, + "step": 64400 + }, + { + "epoch": 58.30316742081448, + "grad_norm": 3.67757511138916, + "learning_rate": 6.643267017192509e-05, + "loss": 0.6521, + "step": 64425 + }, + { + "epoch": 58.32579185520362, + 
"grad_norm": 3.7194998264312744, + "learning_rate": 6.63738299303781e-05, + "loss": 0.6626, + "step": 64450 + }, + { + "epoch": 58.34841628959276, + "grad_norm": 4.588992595672607, + "learning_rate": 6.63149950689043e-05, + "loss": 0.6551, + "step": 64475 + }, + { + "epoch": 58.3710407239819, + "grad_norm": 4.48909330368042, + "learning_rate": 6.625616562419846e-05, + "loss": 0.6209, + "step": 64500 + }, + { + "epoch": 58.39366515837104, + "grad_norm": 5.264707565307617, + "learning_rate": 6.619734163295209e-05, + "loss": 0.7786, + "step": 64525 + }, + { + "epoch": 58.41628959276018, + "grad_norm": 4.463858604431152, + "learning_rate": 6.613852313185321e-05, + "loss": 0.7384, + "step": 64550 + }, + { + "epoch": 58.43891402714932, + "grad_norm": 4.5495829582214355, + "learning_rate": 6.607971015758645e-05, + "loss": 0.6313, + "step": 64575 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 4.662189483642578, + "learning_rate": 6.602090274683301e-05, + "loss": 0.63, + "step": 64600 + }, + { + "epoch": 58.484162895927604, + "grad_norm": 4.075993537902832, + "learning_rate": 6.596210093627058e-05, + "loss": 0.6615, + "step": 64625 + }, + { + "epoch": 58.50678733031674, + "grad_norm": 3.6193978786468506, + "learning_rate": 6.590330476257338e-05, + "loss": 0.7725, + "step": 64650 + }, + { + "epoch": 58.529411764705884, + "grad_norm": 4.732771873474121, + "learning_rate": 6.584451426241213e-05, + "loss": 0.6606, + "step": 64675 + }, + { + "epoch": 58.55203619909502, + "grad_norm": 4.847504615783691, + "learning_rate": 6.578572947245397e-05, + "loss": 0.7519, + "step": 64700 + }, + { + "epoch": 58.574660633484164, + "grad_norm": 5.2629075050354, + "learning_rate": 6.572695042936253e-05, + "loss": 0.7245, + "step": 64725 + }, + { + "epoch": 58.5972850678733, + "grad_norm": 3.563847541809082, + "learning_rate": 6.56681771697978e-05, + "loss": 0.6861, + "step": 64750 + }, + { + "epoch": 58.619909502262445, + "grad_norm": 3.5651309490203857, + "learning_rate": 
6.560940973041621e-05, + "loss": 0.7774, + "step": 64775 + }, + { + "epoch": 58.64253393665158, + "grad_norm": 4.128613471984863, + "learning_rate": 6.555064814787053e-05, + "loss": 0.6914, + "step": 64800 + }, + { + "epoch": 58.665158371040725, + "grad_norm": 4.838799476623535, + "learning_rate": 6.54918924588099e-05, + "loss": 0.7919, + "step": 64825 + }, + { + "epoch": 58.68778280542986, + "grad_norm": 3.5059125423431396, + "learning_rate": 6.543314269987974e-05, + "loss": 0.7642, + "step": 64850 + }, + { + "epoch": 58.710407239819006, + "grad_norm": 4.279022693634033, + "learning_rate": 6.537439890772185e-05, + "loss": 0.7714, + "step": 64875 + }, + { + "epoch": 58.73303167420814, + "grad_norm": 3.9953958988189697, + "learning_rate": 6.531566111897426e-05, + "loss": 0.7696, + "step": 64900 + }, + { + "epoch": 58.755656108597286, + "grad_norm": 5.27321195602417, + "learning_rate": 6.525692937027122e-05, + "loss": 0.5896, + "step": 64925 + }, + { + "epoch": 58.77828054298642, + "grad_norm": 5.410314083099365, + "learning_rate": 6.519820369824329e-05, + "loss": 0.6908, + "step": 64950 + }, + { + "epoch": 58.800904977375566, + "grad_norm": 4.452655792236328, + "learning_rate": 6.513948413951717e-05, + "loss": 0.7339, + "step": 64975 + }, + { + "epoch": 58.8235294117647, + "grad_norm": 2.9418070316314697, + "learning_rate": 6.50807707307158e-05, + "loss": 0.7615, + "step": 65000 + }, + { + "epoch": 58.84615384615385, + "grad_norm": 4.953251361846924, + "learning_rate": 6.502206350845825e-05, + "loss": 0.6923, + "step": 65025 + }, + { + "epoch": 58.86877828054298, + "grad_norm": 3.1625888347625732, + "learning_rate": 6.496336250935975e-05, + "loss": 0.6791, + "step": 65050 + }, + { + "epoch": 58.89140271493213, + "grad_norm": 4.483161926269531, + "learning_rate": 6.490466777003164e-05, + "loss": 0.7056, + "step": 65075 + }, + { + "epoch": 58.914027149321264, + "grad_norm": 4.233424186706543, + "learning_rate": 6.484597932708136e-05, + "loss": 0.7401, + "step": 65100 
+ }, + { + "epoch": 58.93665158371041, + "grad_norm": 4.563268661499023, + "learning_rate": 6.478729721711243e-05, + "loss": 0.6932, + "step": 65125 + }, + { + "epoch": 58.959276018099544, + "grad_norm": 5.159553050994873, + "learning_rate": 6.472862147672439e-05, + "loss": 0.691, + "step": 65150 + }, + { + "epoch": 58.98190045248869, + "grad_norm": 3.6239402294158936, + "learning_rate": 6.466995214251286e-05, + "loss": 0.6401, + "step": 65175 + }, + { + "epoch": 59.00452488687783, + "grad_norm": 3.668715476989746, + "learning_rate": 6.46112892510694e-05, + "loss": 0.7049, + "step": 65200 + }, + { + "epoch": 59.02714932126697, + "grad_norm": 4.769604206085205, + "learning_rate": 6.45526328389816e-05, + "loss": 0.6767, + "step": 65225 + }, + { + "epoch": 59.04977375565611, + "grad_norm": 3.7245876789093018, + "learning_rate": 6.449398294283301e-05, + "loss": 0.5708, + "step": 65250 + }, + { + "epoch": 59.07239819004525, + "grad_norm": 4.986959457397461, + "learning_rate": 6.443533959920309e-05, + "loss": 0.8017, + "step": 65275 + }, + { + "epoch": 59.09502262443439, + "grad_norm": 5.122184753417969, + "learning_rate": 6.43767028446672e-05, + "loss": 0.6403, + "step": 65300 + }, + { + "epoch": 59.11764705882353, + "grad_norm": 4.31292200088501, + "learning_rate": 6.431807271579664e-05, + "loss": 0.6101, + "step": 65325 + }, + { + "epoch": 59.14027149321267, + "grad_norm": 5.1685309410095215, + "learning_rate": 6.425944924915857e-05, + "loss": 0.5861, + "step": 65350 + }, + { + "epoch": 59.16289592760181, + "grad_norm": 3.9274120330810547, + "learning_rate": 6.420083248131591e-05, + "loss": 0.6415, + "step": 65375 + }, + { + "epoch": 59.18552036199095, + "grad_norm": 3.911078929901123, + "learning_rate": 6.414222244882754e-05, + "loss": 0.6568, + "step": 65400 + }, + { + "epoch": 59.20814479638009, + "grad_norm": 3.8726963996887207, + "learning_rate": 6.408361918824803e-05, + "loss": 0.5736, + "step": 65425 + }, + { + "epoch": 59.23076923076923, + "grad_norm": 
3.2742345333099365, + "learning_rate": 6.402502273612777e-05, + "loss": 0.6087, + "step": 65450 + }, + { + "epoch": 59.25339366515837, + "grad_norm": 4.598509788513184, + "learning_rate": 6.396643312901291e-05, + "loss": 0.7204, + "step": 65475 + }, + { + "epoch": 59.276018099547514, + "grad_norm": 4.440429210662842, + "learning_rate": 6.39078504034453e-05, + "loss": 0.6373, + "step": 65500 + }, + { + "epoch": 59.29864253393665, + "grad_norm": 4.567727565765381, + "learning_rate": 6.384927459596254e-05, + "loss": 0.6433, + "step": 65525 + }, + { + "epoch": 59.321266968325794, + "grad_norm": 3.317457914352417, + "learning_rate": 6.379070574309786e-05, + "loss": 0.7142, + "step": 65550 + }, + { + "epoch": 59.34389140271493, + "grad_norm": 4.630799770355225, + "learning_rate": 6.37321438813802e-05, + "loss": 0.5899, + "step": 65575 + }, + { + "epoch": 59.366515837104075, + "grad_norm": 3.2978603839874268, + "learning_rate": 6.367358904733413e-05, + "loss": 0.6016, + "step": 65600 + }, + { + "epoch": 59.38914027149321, + "grad_norm": 3.8394994735717773, + "learning_rate": 6.361738305218345e-05, + "loss": 0.6981, + "step": 65625 + }, + { + "epoch": 59.411764705882355, + "grad_norm": 1.3056546449661255, + "learning_rate": 6.355884209830735e-05, + "loss": 0.6676, + "step": 65650 + }, + { + "epoch": 59.43438914027149, + "grad_norm": 4.848611354827881, + "learning_rate": 6.350030828018976e-05, + "loss": 0.7122, + "step": 65675 + }, + { + "epoch": 59.457013574660635, + "grad_norm": 2.762185573577881, + "learning_rate": 6.344178163433774e-05, + "loss": 0.6294, + "step": 65700 + }, + { + "epoch": 59.47963800904977, + "grad_norm": 4.507059097290039, + "learning_rate": 6.338326219725394e-05, + "loss": 0.5763, + "step": 65725 + }, + { + "epoch": 59.502262443438916, + "grad_norm": 4.1835222244262695, + "learning_rate": 6.332475000543644e-05, + "loss": 0.6056, + "step": 65750 + }, + { + "epoch": 59.52488687782805, + "grad_norm": 4.698659420013428, + "learning_rate": 
6.326624509537881e-05, + "loss": 0.662, + "step": 65775 + }, + { + "epoch": 59.547511312217196, + "grad_norm": 4.21934175491333, + "learning_rate": 6.32077475035701e-05, + "loss": 0.6435, + "step": 65800 + }, + { + "epoch": 59.57013574660633, + "grad_norm": 3.255751609802246, + "learning_rate": 6.31492572664948e-05, + "loss": 0.7144, + "step": 65825 + }, + { + "epoch": 59.59276018099548, + "grad_norm": 3.099938154220581, + "learning_rate": 6.30907744206328e-05, + "loss": 0.76, + "step": 65850 + }, + { + "epoch": 59.61538461538461, + "grad_norm": 4.311501979827881, + "learning_rate": 6.303229900245936e-05, + "loss": 0.6285, + "step": 65875 + }, + { + "epoch": 59.63800904977376, + "grad_norm": 3.967350482940674, + "learning_rate": 6.297383104844517e-05, + "loss": 0.7041, + "step": 65900 + }, + { + "epoch": 59.660633484162894, + "grad_norm": 3.6102454662323, + "learning_rate": 6.29153705950562e-05, + "loss": 0.6672, + "step": 65925 + }, + { + "epoch": 59.68325791855204, + "grad_norm": 4.378409385681152, + "learning_rate": 6.285691767875377e-05, + "loss": 0.6725, + "step": 65950 + }, + { + "epoch": 59.705882352941174, + "grad_norm": 4.492663860321045, + "learning_rate": 6.27984723359945e-05, + "loss": 0.6248, + "step": 65975 + }, + { + "epoch": 59.72850678733032, + "grad_norm": 4.401359558105469, + "learning_rate": 6.274003460323027e-05, + "loss": 0.652, + "step": 66000 + }, + { + "epoch": 59.751131221719454, + "grad_norm": 4.687775611877441, + "learning_rate": 6.268160451690824e-05, + "loss": 0.675, + "step": 66025 + }, + { + "epoch": 59.7737556561086, + "grad_norm": 5.653653621673584, + "learning_rate": 6.262318211347079e-05, + "loss": 0.6625, + "step": 66050 + }, + { + "epoch": 59.796380090497735, + "grad_norm": 4.8085784912109375, + "learning_rate": 6.256476742935548e-05, + "loss": 0.6134, + "step": 66075 + }, + { + "epoch": 59.81900452488688, + "grad_norm": 3.568744421005249, + "learning_rate": 6.250636050099509e-05, + "loss": 0.5429, + "step": 66100 + }, + { + 
"epoch": 59.841628959276015, + "grad_norm": 4.331258773803711, + "learning_rate": 6.244796136481757e-05, + "loss": 0.7067, + "step": 66125 + }, + { + "epoch": 59.86425339366516, + "grad_norm": 5.338553428649902, + "learning_rate": 6.238957005724598e-05, + "loss": 0.677, + "step": 66150 + }, + { + "epoch": 59.886877828054295, + "grad_norm": 3.8364171981811523, + "learning_rate": 6.233118661469852e-05, + "loss": 0.6128, + "step": 66175 + }, + { + "epoch": 59.90950226244344, + "grad_norm": 3.696770191192627, + "learning_rate": 6.227281107358846e-05, + "loss": 0.792, + "step": 66200 + }, + { + "epoch": 59.932126696832576, + "grad_norm": 5.0461106300354, + "learning_rate": 6.221444347032417e-05, + "loss": 0.6329, + "step": 66225 + }, + { + "epoch": 59.95475113122172, + "grad_norm": 4.868934631347656, + "learning_rate": 6.215608384130905e-05, + "loss": 0.627, + "step": 66250 + }, + { + "epoch": 59.977375565610856, + "grad_norm": 3.4682068824768066, + "learning_rate": 6.209773222294153e-05, + "loss": 0.5947, + "step": 66275 + }, + { + "epoch": 60.0, + "grad_norm": 5.2894062995910645, + "learning_rate": 6.203938865161506e-05, + "loss": 0.6666, + "step": 66300 + }, + { + "epoch": 60.022624434389144, + "grad_norm": 3.4546284675598145, + "learning_rate": 6.198105316371804e-05, + "loss": 0.6174, + "step": 66325 + }, + { + "epoch": 60.04524886877828, + "grad_norm": 4.68898344039917, + "learning_rate": 6.192272579563387e-05, + "loss": 0.5547, + "step": 66350 + }, + { + "epoch": 60.067873303167424, + "grad_norm": 1.7499679327011108, + "learning_rate": 6.186440658374084e-05, + "loss": 0.5807, + "step": 66375 + }, + { + "epoch": 60.09049773755656, + "grad_norm": 5.337199687957764, + "learning_rate": 6.180609556441219e-05, + "loss": 0.6037, + "step": 66400 + }, + { + "epoch": 60.113122171945705, + "grad_norm": 3.1942121982574463, + "learning_rate": 6.174779277401602e-05, + "loss": 0.6353, + "step": 66425 + }, + { + "epoch": 60.13574660633484, + "grad_norm": 6.741465091705322, + 
"learning_rate": 6.168949824891534e-05, + "loss": 0.6482, + "step": 66450 + }, + { + "epoch": 60.158371040723985, + "grad_norm": 5.219030857086182, + "learning_rate": 6.163121202546794e-05, + "loss": 0.6007, + "step": 66475 + }, + { + "epoch": 60.18099547511312, + "grad_norm": 4.876162052154541, + "learning_rate": 6.157293414002651e-05, + "loss": 0.6126, + "step": 66500 + }, + { + "epoch": 60.203619909502265, + "grad_norm": 4.365516185760498, + "learning_rate": 6.151466462893847e-05, + "loss": 0.5603, + "step": 66525 + }, + { + "epoch": 60.2262443438914, + "grad_norm": 4.76017951965332, + "learning_rate": 6.145640352854606e-05, + "loss": 0.5364, + "step": 66550 + }, + { + "epoch": 60.248868778280546, + "grad_norm": 2.8500778675079346, + "learning_rate": 6.139815087518626e-05, + "loss": 0.6049, + "step": 66575 + }, + { + "epoch": 60.27149321266968, + "grad_norm": 4.174798011779785, + "learning_rate": 6.133990670519079e-05, + "loss": 0.7041, + "step": 66600 + }, + { + "epoch": 60.294117647058826, + "grad_norm": 5.168081283569336, + "learning_rate": 6.128167105488605e-05, + "loss": 0.7466, + "step": 66625 + }, + { + "epoch": 60.31674208144796, + "grad_norm": 5.011589527130127, + "learning_rate": 6.122344396059319e-05, + "loss": 0.616, + "step": 66650 + }, + { + "epoch": 60.339366515837106, + "grad_norm": 2.1540355682373047, + "learning_rate": 6.116522545862794e-05, + "loss": 0.5621, + "step": 66675 + }, + { + "epoch": 60.36199095022624, + "grad_norm": 3.5327677726745605, + "learning_rate": 6.110701558530073e-05, + "loss": 0.6676, + "step": 66700 + }, + { + "epoch": 60.38461538461539, + "grad_norm": 3.2821593284606934, + "learning_rate": 6.10488143769166e-05, + "loss": 0.5885, + "step": 66725 + }, + { + "epoch": 60.40723981900452, + "grad_norm": 2.0960774421691895, + "learning_rate": 6.099062186977516e-05, + "loss": 0.6644, + "step": 66750 + }, + { + "epoch": 60.42986425339367, + "grad_norm": 2.393394708633423, + "learning_rate": 6.093243810017062e-05, + "loss": 
0.5608, + "step": 66775 + }, + { + "epoch": 60.452488687782804, + "grad_norm": 4.688882827758789, + "learning_rate": 6.087426310439174e-05, + "loss": 0.6636, + "step": 66800 + }, + { + "epoch": 60.47511312217195, + "grad_norm": 4.645911693572998, + "learning_rate": 6.081609691872178e-05, + "loss": 0.5493, + "step": 66825 + }, + { + "epoch": 60.497737556561084, + "grad_norm": 3.82240891456604, + "learning_rate": 6.075793957943854e-05, + "loss": 0.5768, + "step": 66850 + }, + { + "epoch": 60.52036199095023, + "grad_norm": 4.518941402435303, + "learning_rate": 6.0699791122814275e-05, + "loss": 0.6824, + "step": 66875 + }, + { + "epoch": 60.542986425339365, + "grad_norm": 4.387720108032227, + "learning_rate": 6.0641651585115715e-05, + "loss": 0.7327, + "step": 66900 + }, + { + "epoch": 60.56561085972851, + "grad_norm": 5.1645331382751465, + "learning_rate": 6.058352100260403e-05, + "loss": 0.592, + "step": 66925 + }, + { + "epoch": 60.588235294117645, + "grad_norm": 3.3075625896453857, + "learning_rate": 6.052539941153477e-05, + "loss": 0.6545, + "step": 66950 + }, + { + "epoch": 60.61085972850679, + "grad_norm": 2.7808585166931152, + "learning_rate": 6.0467286848157954e-05, + "loss": 0.6129, + "step": 66975 + }, + { + "epoch": 60.633484162895925, + "grad_norm": 4.265286445617676, + "learning_rate": 6.04091833487179e-05, + "loss": 0.6214, + "step": 67000 + }, + { + "epoch": 60.65610859728507, + "grad_norm": 4.914118766784668, + "learning_rate": 6.035108894945328e-05, + "loss": 0.5254, + "step": 67025 + }, + { + "epoch": 60.678733031674206, + "grad_norm": 5.129421710968018, + "learning_rate": 6.029300368659712e-05, + "loss": 0.674, + "step": 67050 + }, + { + "epoch": 60.70135746606335, + "grad_norm": 4.754515647888184, + "learning_rate": 6.0234927596376744e-05, + "loss": 0.6048, + "step": 67075 + }, + { + "epoch": 60.723981900452486, + "grad_norm": 3.9844114780426025, + "learning_rate": 6.0176860715013715e-05, + "loss": 0.6366, + "step": 67100 + }, + { + "epoch": 
60.74660633484163, + "grad_norm": 3.719999074935913, + "learning_rate": 6.011880307872391e-05, + "loss": 0.6566, + "step": 67125 + }, + { + "epoch": 60.76923076923077, + "grad_norm": 4.930984973907471, + "learning_rate": 6.00607547237174e-05, + "loss": 0.6577, + "step": 67150 + }, + { + "epoch": 60.79185520361991, + "grad_norm": 3.436265707015991, + "learning_rate": 6.000271568619847e-05, + "loss": 0.6, + "step": 67175 + }, + { + "epoch": 60.81447963800905, + "grad_norm": 5.183216571807861, + "learning_rate": 5.994468600236561e-05, + "loss": 0.675, + "step": 67200 + }, + { + "epoch": 60.83710407239819, + "grad_norm": 3.767946243286133, + "learning_rate": 5.988666570841148e-05, + "loss": 0.6304, + "step": 67225 + }, + { + "epoch": 60.85972850678733, + "grad_norm": 2.762791872024536, + "learning_rate": 5.9828654840522855e-05, + "loss": 0.5571, + "step": 67250 + }, + { + "epoch": 60.88235294117647, + "grad_norm": 4.998724937438965, + "learning_rate": 5.977065343488066e-05, + "loss": 0.6721, + "step": 67275 + }, + { + "epoch": 60.90497737556561, + "grad_norm": 3.832132577896118, + "learning_rate": 5.97126615276599e-05, + "loss": 0.5589, + "step": 67300 + }, + { + "epoch": 60.92760180995475, + "grad_norm": 3.30859375, + "learning_rate": 5.965467915502967e-05, + "loss": 0.5113, + "step": 67325 + }, + { + "epoch": 60.95022624434389, + "grad_norm": 4.903571128845215, + "learning_rate": 5.959670635315308e-05, + "loss": 0.6619, + "step": 67350 + }, + { + "epoch": 60.97285067873303, + "grad_norm": 4.615489959716797, + "learning_rate": 5.953874315818736e-05, + "loss": 0.5516, + "step": 67375 + }, + { + "epoch": 60.99547511312217, + "grad_norm": 4.0281243324279785, + "learning_rate": 5.948078960628366e-05, + "loss": 0.5842, + "step": 67400 + }, + { + "epoch": 61.01809954751131, + "grad_norm": 3.07563853263855, + "learning_rate": 5.942284573358712e-05, + "loss": 0.62, + "step": 67425 + }, + { + "epoch": 61.040723981900456, + "grad_norm": 4.654354095458984, + "learning_rate": 
5.9364911576236915e-05, + "loss": 0.5309, + "step": 67450 + }, + { + "epoch": 61.06334841628959, + "grad_norm": 4.621328830718994, + "learning_rate": 5.930698717036609e-05, + "loss": 0.6051, + "step": 67475 + }, + { + "epoch": 61.085972850678736, + "grad_norm": 3.6699090003967285, + "learning_rate": 5.9249072552101643e-05, + "loss": 0.5934, + "step": 67500 + }, + { + "epoch": 61.10859728506787, + "grad_norm": 3.4764626026153564, + "learning_rate": 5.919116775756445e-05, + "loss": 0.6266, + "step": 67525 + }, + { + "epoch": 61.13122171945702, + "grad_norm": 4.3350510597229, + "learning_rate": 5.9133272822869274e-05, + "loss": 0.4966, + "step": 67550 + }, + { + "epoch": 61.15384615384615, + "grad_norm": 2.9503839015960693, + "learning_rate": 5.907538778412471e-05, + "loss": 0.5372, + "step": 67575 + }, + { + "epoch": 61.1764705882353, + "grad_norm": 3.6014959812164307, + "learning_rate": 5.901751267743321e-05, + "loss": 0.5589, + "step": 67600 + }, + { + "epoch": 61.199095022624434, + "grad_norm": 2.387441635131836, + "learning_rate": 5.8959647538891e-05, + "loss": 0.568, + "step": 67625 + }, + { + "epoch": 61.22171945701358, + "grad_norm": 4.004934310913086, + "learning_rate": 5.8901792404588097e-05, + "loss": 0.5501, + "step": 67650 + }, + { + "epoch": 61.244343891402714, + "grad_norm": 4.333060264587402, + "learning_rate": 5.8843947310608306e-05, + "loss": 0.4982, + "step": 67675 + }, + { + "epoch": 61.26696832579186, + "grad_norm": 3.765822649002075, + "learning_rate": 5.878611229302914e-05, + "loss": 0.5822, + "step": 67700 + }, + { + "epoch": 61.289592760180994, + "grad_norm": 4.504117012023926, + "learning_rate": 5.872828738792183e-05, + "loss": 0.6039, + "step": 67725 + }, + { + "epoch": 61.31221719457014, + "grad_norm": 4.02402925491333, + "learning_rate": 5.867047263135131e-05, + "loss": 0.5885, + "step": 67750 + }, + { + "epoch": 61.334841628959275, + "grad_norm": 4.467599391937256, + "learning_rate": 5.8612668059376184e-05, + "loss": 0.5821, + "step": 
67775 + }, + { + "epoch": 61.35746606334842, + "grad_norm": 4.618236064910889, + "learning_rate": 5.8554873708048695e-05, + "loss": 0.5983, + "step": 67800 + }, + { + "epoch": 61.380090497737555, + "grad_norm": 4.545291900634766, + "learning_rate": 5.849708961341472e-05, + "loss": 0.6362, + "step": 67825 + }, + { + "epoch": 61.4027149321267, + "grad_norm": 4.7155585289001465, + "learning_rate": 5.843931581151373e-05, + "loss": 0.5973, + "step": 67850 + }, + { + "epoch": 61.425339366515836, + "grad_norm": 3.715583086013794, + "learning_rate": 5.838155233837879e-05, + "loss": 0.6329, + "step": 67875 + }, + { + "epoch": 61.44796380090498, + "grad_norm": 4.6777472496032715, + "learning_rate": 5.832379923003652e-05, + "loss": 0.5239, + "step": 67900 + }, + { + "epoch": 61.470588235294116, + "grad_norm": 2.7520992755889893, + "learning_rate": 5.826605652250705e-05, + "loss": 0.5685, + "step": 67925 + }, + { + "epoch": 61.49321266968326, + "grad_norm": 3.158763885498047, + "learning_rate": 5.820832425180407e-05, + "loss": 0.5413, + "step": 67950 + }, + { + "epoch": 61.515837104072396, + "grad_norm": 3.884981632232666, + "learning_rate": 5.8150602453934725e-05, + "loss": 0.5559, + "step": 67975 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 1.9774912595748901, + "learning_rate": 5.809289116489963e-05, + "loss": 0.527, + "step": 68000 + }, + { + "epoch": 61.56108597285068, + "grad_norm": 4.468634605407715, + "learning_rate": 5.8035190420692854e-05, + "loss": 0.6478, + "step": 68025 + }, + { + "epoch": 61.58371040723982, + "grad_norm": 4.440135955810547, + "learning_rate": 5.79775002573019e-05, + "loss": 0.702, + "step": 68050 + }, + { + "epoch": 61.60633484162896, + "grad_norm": 3.9630126953125, + "learning_rate": 5.791982071070765e-05, + "loss": 0.4886, + "step": 68075 + }, + { + "epoch": 61.6289592760181, + "grad_norm": 3.3422679901123047, + "learning_rate": 5.786215181688438e-05, + "loss": 0.5723, + "step": 68100 + }, + { + "epoch": 61.65158371040724, + 
"grad_norm": 3.2867820262908936, + "learning_rate": 5.780449361179972e-05, + "loss": 0.6226, + "step": 68125 + }, + { + "epoch": 61.67420814479638, + "grad_norm": 3.681912899017334, + "learning_rate": 5.774684613141463e-05, + "loss": 0.6728, + "step": 68150 + }, + { + "epoch": 61.69683257918552, + "grad_norm": 4.133671760559082, + "learning_rate": 5.768920941168337e-05, + "loss": 0.6028, + "step": 68175 + }, + { + "epoch": 61.71945701357466, + "grad_norm": 5.205613136291504, + "learning_rate": 5.763158348855351e-05, + "loss": 0.6276, + "step": 68200 + }, + { + "epoch": 61.7420814479638, + "grad_norm": 4.072926044464111, + "learning_rate": 5.757396839796589e-05, + "loss": 0.5494, + "step": 68225 + }, + { + "epoch": 61.76470588235294, + "grad_norm": 4.312483310699463, + "learning_rate": 5.751636417585455e-05, + "loss": 0.5893, + "step": 68250 + }, + { + "epoch": 61.78733031674208, + "grad_norm": 1.9435641765594482, + "learning_rate": 5.7458770858146817e-05, + "loss": 0.6388, + "step": 68275 + }, + { + "epoch": 61.80995475113122, + "grad_norm": 3.7756338119506836, + "learning_rate": 5.740118848076314e-05, + "loss": 0.5754, + "step": 68300 + }, + { + "epoch": 61.83257918552036, + "grad_norm": 4.449934005737305, + "learning_rate": 5.7343617079617225e-05, + "loss": 0.5657, + "step": 68325 + }, + { + "epoch": 61.8552036199095, + "grad_norm": 4.232459545135498, + "learning_rate": 5.728605669061587e-05, + "loss": 0.5684, + "step": 68350 + }, + { + "epoch": 61.87782805429864, + "grad_norm": 3.241680383682251, + "learning_rate": 5.7228507349659045e-05, + "loss": 0.5568, + "step": 68375 + }, + { + "epoch": 61.90045248868778, + "grad_norm": 4.112957000732422, + "learning_rate": 5.71709690926398e-05, + "loss": 0.6108, + "step": 68400 + }, + { + "epoch": 61.92307692307692, + "grad_norm": 4.771388053894043, + "learning_rate": 5.711344195544428e-05, + "loss": 0.5494, + "step": 68425 + }, + { + "epoch": 61.94570135746606, + "grad_norm": 2.903620958328247, + "learning_rate": 
5.705592597395171e-05, + "loss": 0.6083, + "step": 68450 + }, + { + "epoch": 61.9683257918552, + "grad_norm": 3.416398763656616, + "learning_rate": 5.699842118403436e-05, + "loss": 0.532, + "step": 68475 + }, + { + "epoch": 61.990950226244344, + "grad_norm": 4.085901737213135, + "learning_rate": 5.694092762155747e-05, + "loss": 0.6211, + "step": 68500 + }, + { + "epoch": 62.01357466063349, + "grad_norm": 5.162738800048828, + "learning_rate": 5.688344532237935e-05, + "loss": 0.5795, + "step": 68525 + }, + { + "epoch": 62.036199095022624, + "grad_norm": 3.3909380435943604, + "learning_rate": 5.682597432235123e-05, + "loss": 0.4504, + "step": 68550 + }, + { + "epoch": 62.05882352941177, + "grad_norm": 4.008853435516357, + "learning_rate": 5.676851465731733e-05, + "loss": 0.5542, + "step": 68575 + }, + { + "epoch": 62.081447963800905, + "grad_norm": 4.662866115570068, + "learning_rate": 5.671106636311477e-05, + "loss": 0.5655, + "step": 68600 + }, + { + "epoch": 62.10407239819005, + "grad_norm": 3.8678126335144043, + "learning_rate": 5.665362947557359e-05, + "loss": 0.6555, + "step": 68625 + }, + { + "epoch": 62.126696832579185, + "grad_norm": 3.644484043121338, + "learning_rate": 5.659620403051671e-05, + "loss": 0.5688, + "step": 68650 + }, + { + "epoch": 62.14932126696833, + "grad_norm": 3.2867202758789062, + "learning_rate": 5.653879006375994e-05, + "loss": 0.6283, + "step": 68675 + }, + { + "epoch": 62.171945701357465, + "grad_norm": 3.1707193851470947, + "learning_rate": 5.648138761111189e-05, + "loss": 0.5258, + "step": 68700 + }, + { + "epoch": 62.19457013574661, + "grad_norm": 4.200314998626709, + "learning_rate": 5.642399670837403e-05, + "loss": 0.4637, + "step": 68725 + }, + { + "epoch": 62.217194570135746, + "grad_norm": 5.262591361999512, + "learning_rate": 5.636661739134059e-05, + "loss": 0.6027, + "step": 68750 + }, + { + "epoch": 62.23981900452489, + "grad_norm": 3.71937894821167, + "learning_rate": 5.6309249695798596e-05, + "loss": 0.6497, + "step": 
68775 + }, + { + "epoch": 62.262443438914026, + "grad_norm": 2.9670209884643555, + "learning_rate": 5.625189365752782e-05, + "loss": 0.6145, + "step": 68800 + }, + { + "epoch": 62.28506787330317, + "grad_norm": 4.04097318649292, + "learning_rate": 5.619454931230076e-05, + "loss": 0.5354, + "step": 68825 + }, + { + "epoch": 62.30769230769231, + "grad_norm": 3.8297502994537354, + "learning_rate": 5.613721669588264e-05, + "loss": 0.5392, + "step": 68850 + }, + { + "epoch": 62.33031674208145, + "grad_norm": 3.143273115158081, + "learning_rate": 5.607989584403133e-05, + "loss": 0.6856, + "step": 68875 + }, + { + "epoch": 62.35294117647059, + "grad_norm": 3.210341453552246, + "learning_rate": 5.60225867924974e-05, + "loss": 0.5466, + "step": 68900 + }, + { + "epoch": 62.37556561085973, + "grad_norm": 3.1118643283843994, + "learning_rate": 5.596528957702405e-05, + "loss": 0.4679, + "step": 68925 + }, + { + "epoch": 62.39819004524887, + "grad_norm": 3.7933237552642822, + "learning_rate": 5.5908004233347086e-05, + "loss": 0.5791, + "step": 68950 + }, + { + "epoch": 62.42081447963801, + "grad_norm": 3.7283291816711426, + "learning_rate": 5.585073079719492e-05, + "loss": 0.609, + "step": 68975 + }, + { + "epoch": 62.44343891402715, + "grad_norm": 5.116457462310791, + "learning_rate": 5.579346930428853e-05, + "loss": 0.564, + "step": 69000 + }, + { + "epoch": 62.46606334841629, + "grad_norm": 4.880287170410156, + "learning_rate": 5.573621979034147e-05, + "loss": 0.5384, + "step": 69025 + }, + { + "epoch": 62.48868778280543, + "grad_norm": 2.7976601123809814, + "learning_rate": 5.567898229105977e-05, + "loss": 0.598, + "step": 69050 + }, + { + "epoch": 62.51131221719457, + "grad_norm": 6.518646240234375, + "learning_rate": 5.5621756842142026e-05, + "loss": 0.6078, + "step": 69075 + }, + { + "epoch": 62.53393665158371, + "grad_norm": 3.570544958114624, + "learning_rate": 5.556454347927929e-05, + "loss": 0.575, + "step": 69100 + }, + { + "epoch": 62.55656108597285, + "grad_norm": 
3.5969743728637695, + "learning_rate": 5.5507342238155075e-05, + "loss": 0.4654, + "step": 69125 + }, + { + "epoch": 62.57918552036199, + "grad_norm": 3.625481367111206, + "learning_rate": 5.545015315444533e-05, + "loss": 0.5297, + "step": 69150 + }, + { + "epoch": 62.60180995475113, + "grad_norm": 3.9849331378936768, + "learning_rate": 5.539297626381843e-05, + "loss": 0.495, + "step": 69175 + }, + { + "epoch": 62.62443438914027, + "grad_norm": 4.907066345214844, + "learning_rate": 5.533581160193515e-05, + "loss": 0.5037, + "step": 69200 + }, + { + "epoch": 62.64705882352941, + "grad_norm": 1.6982041597366333, + "learning_rate": 5.527865920444863e-05, + "loss": 0.5503, + "step": 69225 + }, + { + "epoch": 62.66968325791855, + "grad_norm": 3.2752087116241455, + "learning_rate": 5.522151910700436e-05, + "loss": 0.6308, + "step": 69250 + }, + { + "epoch": 62.69230769230769, + "grad_norm": 3.391486644744873, + "learning_rate": 5.516439134524016e-05, + "loss": 0.5923, + "step": 69275 + }, + { + "epoch": 62.71493212669683, + "grad_norm": 2.5114595890045166, + "learning_rate": 5.5107275954786155e-05, + "loss": 0.5571, + "step": 69300 + }, + { + "epoch": 62.737556561085974, + "grad_norm": 4.318356513977051, + "learning_rate": 5.505017297126477e-05, + "loss": 0.5301, + "step": 69325 + }, + { + "epoch": 62.76018099547511, + "grad_norm": 5.065729141235352, + "learning_rate": 5.499308243029066e-05, + "loss": 0.4962, + "step": 69350 + }, + { + "epoch": 62.782805429864254, + "grad_norm": 3.5812008380889893, + "learning_rate": 5.493600436747075e-05, + "loss": 0.6212, + "step": 69375 + }, + { + "epoch": 62.80542986425339, + "grad_norm": 5.2495622634887695, + "learning_rate": 5.487893881840417e-05, + "loss": 0.5656, + "step": 69400 + }, + { + "epoch": 62.828054298642535, + "grad_norm": 3.9354496002197266, + "learning_rate": 5.482188581868223e-05, + "loss": 0.5469, + "step": 69425 + }, + { + "epoch": 62.85067873303167, + "grad_norm": 4.970963478088379, + "learning_rate": 
5.476484540388846e-05, + "loss": 0.4997, + "step": 69450 + }, + { + "epoch": 62.873303167420815, + "grad_norm": 4.774138450622559, + "learning_rate": 5.4707817609598484e-05, + "loss": 0.6145, + "step": 69475 + }, + { + "epoch": 62.89592760180995, + "grad_norm": 4.530363082885742, + "learning_rate": 5.4650802471380084e-05, + "loss": 0.4556, + "step": 69500 + }, + { + "epoch": 62.918552036199095, + "grad_norm": 3.339048385620117, + "learning_rate": 5.4593800024793155e-05, + "loss": 0.5627, + "step": 69525 + }, + { + "epoch": 62.94117647058823, + "grad_norm": 3.6626808643341064, + "learning_rate": 5.453681030538966e-05, + "loss": 0.5036, + "step": 69550 + }, + { + "epoch": 62.963800904977376, + "grad_norm": 3.765770435333252, + "learning_rate": 5.447983334871362e-05, + "loss": 0.4979, + "step": 69575 + }, + { + "epoch": 62.98642533936652, + "grad_norm": 3.9678280353546143, + "learning_rate": 5.442286919030111e-05, + "loss": 0.4783, + "step": 69600 + }, + { + "epoch": 63.009049773755656, + "grad_norm": 2.8816945552825928, + "learning_rate": 5.436591786568025e-05, + "loss": 0.632, + "step": 69625 + }, + { + "epoch": 63.0316742081448, + "grad_norm": 3.625143051147461, + "learning_rate": 5.4308979410371085e-05, + "loss": 0.4666, + "step": 69650 + }, + { + "epoch": 63.05429864253394, + "grad_norm": 4.200559139251709, + "learning_rate": 5.4252053859885696e-05, + "loss": 0.4263, + "step": 69675 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 2.9522573947906494, + "learning_rate": 5.4195141249728086e-05, + "loss": 0.5067, + "step": 69700 + }, + { + "epoch": 63.09954751131222, + "grad_norm": 2.31929874420166, + "learning_rate": 5.413824161539419e-05, + "loss": 0.5216, + "step": 69725 + }, + { + "epoch": 63.12217194570136, + "grad_norm": 5.139983654022217, + "learning_rate": 5.40836302070305e-05, + "loss": 0.5242, + "step": 69750 + }, + { + "epoch": 63.1447963800905, + "grad_norm": 3.778296709060669, + "learning_rate": 5.40267561082467e-05, + "loss": 0.5228, + "step": 
69775 + }, + { + "epoch": 63.16742081447964, + "grad_norm": 3.6103365421295166, + "learning_rate": 5.396989509030709e-05, + "loss": 0.5152, + "step": 69800 + }, + { + "epoch": 63.19004524886878, + "grad_norm": 3.8815252780914307, + "learning_rate": 5.391304718867541e-05, + "loss": 0.5407, + "step": 69825 + }, + { + "epoch": 63.21266968325792, + "grad_norm": 2.3992159366607666, + "learning_rate": 5.385621243880726e-05, + "loss": 0.5378, + "step": 69850 + }, + { + "epoch": 63.23529411764706, + "grad_norm": 3.226144790649414, + "learning_rate": 5.379939087615002e-05, + "loss": 0.5152, + "step": 69875 + }, + { + "epoch": 63.2579185520362, + "grad_norm": 3.268803596496582, + "learning_rate": 5.3742582536142826e-05, + "loss": 0.4802, + "step": 69900 + }, + { + "epoch": 63.28054298642534, + "grad_norm": 3.825580358505249, + "learning_rate": 5.3685787454216605e-05, + "loss": 0.553, + "step": 69925 + }, + { + "epoch": 63.30316742081448, + "grad_norm": 4.392209053039551, + "learning_rate": 5.3629005665793976e-05, + "loss": 0.4879, + "step": 69950 + }, + { + "epoch": 63.32579185520362, + "grad_norm": 3.3955893516540527, + "learning_rate": 5.35722372062893e-05, + "loss": 0.5015, + "step": 69975 + }, + { + "epoch": 63.34841628959276, + "grad_norm": 2.4158828258514404, + "learning_rate": 5.35154821111086e-05, + "loss": 0.431, + "step": 70000 + }, + { + "epoch": 63.3710407239819, + "grad_norm": 3.4631412029266357, + "learning_rate": 5.3458740415649584e-05, + "loss": 0.5473, + "step": 70025 + }, + { + "epoch": 63.39366515837104, + "grad_norm": 4.1777024269104, + "learning_rate": 5.3402012155301574e-05, + "loss": 0.6049, + "step": 70050 + }, + { + "epoch": 63.41628959276018, + "grad_norm": 3.4698071479797363, + "learning_rate": 5.334529736544555e-05, + "loss": 0.4622, + "step": 70075 + }, + { + "epoch": 63.43891402714932, + "grad_norm": 4.021111488342285, + "learning_rate": 5.328859608145406e-05, + "loss": 0.5109, + "step": 70100 + }, + { + "epoch": 63.46153846153846, + 
"grad_norm": 4.438292026519775, + "learning_rate": 5.323190833869125e-05, + "loss": 0.5073, + "step": 70125 + }, + { + "epoch": 63.484162895927604, + "grad_norm": 3.206141710281372, + "learning_rate": 5.3175234172512813e-05, + "loss": 0.547, + "step": 70150 + }, + { + "epoch": 63.50678733031674, + "grad_norm": 4.061110973358154, + "learning_rate": 5.3118573618265966e-05, + "loss": 0.5482, + "step": 70175 + }, + { + "epoch": 63.529411764705884, + "grad_norm": 3.305384397506714, + "learning_rate": 5.3061926711289445e-05, + "loss": 0.5466, + "step": 70200 + }, + { + "epoch": 63.55203619909502, + "grad_norm": 3.861522912979126, + "learning_rate": 5.300529348691344e-05, + "loss": 0.4888, + "step": 70225 + }, + { + "epoch": 63.574660633484164, + "grad_norm": 2.5031259059906006, + "learning_rate": 5.2948673980459694e-05, + "loss": 0.5227, + "step": 70250 + }, + { + "epoch": 63.5972850678733, + "grad_norm": 5.511804580688477, + "learning_rate": 5.289206822724132e-05, + "loss": 0.5474, + "step": 70275 + }, + { + "epoch": 63.619909502262445, + "grad_norm": 4.490988254547119, + "learning_rate": 5.283547626256287e-05, + "loss": 0.5946, + "step": 70300 + }, + { + "epoch": 63.64253393665158, + "grad_norm": 3.6166396141052246, + "learning_rate": 5.277889812172029e-05, + "loss": 0.5263, + "step": 70325 + }, + { + "epoch": 63.665158371040725, + "grad_norm": 3.9081203937530518, + "learning_rate": 5.2722333840000926e-05, + "loss": 0.4762, + "step": 70350 + }, + { + "epoch": 63.68778280542986, + "grad_norm": 3.9215359687805176, + "learning_rate": 5.266578345268345e-05, + "loss": 0.5689, + "step": 70375 + }, + { + "epoch": 63.710407239819006, + "grad_norm": 4.067574501037598, + "learning_rate": 5.260924699503791e-05, + "loss": 0.5259, + "step": 70400 + }, + { + "epoch": 63.73303167420814, + "grad_norm": 4.492548942565918, + "learning_rate": 5.2552724502325633e-05, + "loss": 0.5292, + "step": 70425 + }, + { + "epoch": 63.755656108597286, + "grad_norm": 2.3186278343200684, + 
"learning_rate": 5.249621600979923e-05, + "loss": 0.4537, + "step": 70450 + }, + { + "epoch": 63.77828054298642, + "grad_norm": 3.9395086765289307, + "learning_rate": 5.2439721552702614e-05, + "loss": 0.4591, + "step": 70475 + }, + { + "epoch": 63.800904977375566, + "grad_norm": 2.755739450454712, + "learning_rate": 5.23832411662709e-05, + "loss": 0.5483, + "step": 70500 + }, + { + "epoch": 63.8235294117647, + "grad_norm": 3.0595924854278564, + "learning_rate": 5.2326774885730485e-05, + "loss": 0.5067, + "step": 70525 + }, + { + "epoch": 63.84615384615385, + "grad_norm": 4.360012054443359, + "learning_rate": 5.227032274629892e-05, + "loss": 0.6306, + "step": 70550 + }, + { + "epoch": 63.86877828054298, + "grad_norm": 2.9489736557006836, + "learning_rate": 5.2213884783184947e-05, + "loss": 0.5541, + "step": 70575 + }, + { + "epoch": 63.89140271493213, + "grad_norm": 2.208599090576172, + "learning_rate": 5.2157461031588484e-05, + "loss": 0.5129, + "step": 70600 + }, + { + "epoch": 63.914027149321264, + "grad_norm": 5.114026069641113, + "learning_rate": 5.210105152670055e-05, + "loss": 0.5569, + "step": 70625 + }, + { + "epoch": 63.93665158371041, + "grad_norm": 5.624028205871582, + "learning_rate": 5.204465630370334e-05, + "loss": 0.6499, + "step": 70650 + }, + { + "epoch": 63.959276018099544, + "grad_norm": 3.8451778888702393, + "learning_rate": 5.198827539777006e-05, + "loss": 0.4687, + "step": 70675 + }, + { + "epoch": 63.98190045248869, + "grad_norm": 2.6324074268341064, + "learning_rate": 5.193190884406505e-05, + "loss": 0.5226, + "step": 70700 + }, + { + "epoch": 64.00452488687783, + "grad_norm": 4.485694408416748, + "learning_rate": 5.187555667774369e-05, + "loss": 0.5219, + "step": 70725 + }, + { + "epoch": 64.02714932126698, + "grad_norm": 1.449777364730835, + "learning_rate": 5.181921893395235e-05, + "loss": 0.4093, + "step": 70750 + }, + { + "epoch": 64.0497737556561, + "grad_norm": 4.14157772064209, + "learning_rate": 5.176289564782844e-05, + "loss": 
0.5353, + "step": 70775 + }, + { + "epoch": 64.07239819004525, + "grad_norm": 3.385127305984497, + "learning_rate": 5.170658685450035e-05, + "loss": 0.488, + "step": 70800 + }, + { + "epoch": 64.09502262443439, + "grad_norm": 3.332422971725464, + "learning_rate": 5.165029258908742e-05, + "loss": 0.4705, + "step": 70825 + }, + { + "epoch": 64.11764705882354, + "grad_norm": 4.453216075897217, + "learning_rate": 5.159401288669993e-05, + "loss": 0.5551, + "step": 70850 + }, + { + "epoch": 64.14027149321267, + "grad_norm": 3.221008062362671, + "learning_rate": 5.153774778243906e-05, + "loss": 0.5066, + "step": 70875 + }, + { + "epoch": 64.16289592760181, + "grad_norm": 2.463557481765747, + "learning_rate": 5.148149731139694e-05, + "loss": 0.4539, + "step": 70900 + }, + { + "epoch": 64.18552036199095, + "grad_norm": 2.7777414321899414, + "learning_rate": 5.142526150865652e-05, + "loss": 0.489, + "step": 70925 + }, + { + "epoch": 64.2081447963801, + "grad_norm": 3.453420877456665, + "learning_rate": 5.136904040929162e-05, + "loss": 0.476, + "step": 70950 + }, + { + "epoch": 64.23076923076923, + "grad_norm": 3.4403789043426514, + "learning_rate": 5.131283404836688e-05, + "loss": 0.565, + "step": 70975 + }, + { + "epoch": 64.25339366515837, + "grad_norm": 0.8236686587333679, + "learning_rate": 5.1256642460937774e-05, + "loss": 0.471, + "step": 71000 + }, + { + "epoch": 64.27601809954751, + "grad_norm": 2.603306531906128, + "learning_rate": 5.120046568205054e-05, + "loss": 0.4589, + "step": 71025 + }, + { + "epoch": 64.29864253393666, + "grad_norm": 4.371375560760498, + "learning_rate": 5.1144303746742185e-05, + "loss": 0.4263, + "step": 71050 + }, + { + "epoch": 64.32126696832579, + "grad_norm": 4.094240188598633, + "learning_rate": 5.1088156690040455e-05, + "loss": 0.499, + "step": 71075 + }, + { + "epoch": 64.34389140271493, + "grad_norm": 3.8048579692840576, + "learning_rate": 5.1032024546963815e-05, + "loss": 0.453, + "step": 71100 + }, + { + "epoch": 64.36651583710407, 
+ "grad_norm": 3.6536977291107178, + "learning_rate": 5.0975907352521445e-05, + "loss": 0.5122, + "step": 71125 + }, + { + "epoch": 64.38914027149322, + "grad_norm": 3.905212640762329, + "learning_rate": 5.091980514171318e-05, + "loss": 0.5703, + "step": 71150 + }, + { + "epoch": 64.41176470588235, + "grad_norm": 3.993847370147705, + "learning_rate": 5.086371794952952e-05, + "loss": 0.509, + "step": 71175 + }, + { + "epoch": 64.43438914027149, + "grad_norm": 3.4553163051605225, + "learning_rate": 5.08076458109516e-05, + "loss": 0.5573, + "step": 71200 + }, + { + "epoch": 64.45701357466064, + "grad_norm": 3.7322750091552734, + "learning_rate": 5.075158876095117e-05, + "loss": 0.5615, + "step": 71225 + }, + { + "epoch": 64.47963800904978, + "grad_norm": 4.832956314086914, + "learning_rate": 5.0695546834490546e-05, + "loss": 0.5044, + "step": 71250 + }, + { + "epoch": 64.50226244343891, + "grad_norm": 4.214544296264648, + "learning_rate": 5.063952006652264e-05, + "loss": 0.5492, + "step": 71275 + }, + { + "epoch": 64.52488687782805, + "grad_norm": 4.111082553863525, + "learning_rate": 5.0583508491990884e-05, + "loss": 0.4865, + "step": 71300 + }, + { + "epoch": 64.5475113122172, + "grad_norm": 3.1240320205688477, + "learning_rate": 5.0527512145829285e-05, + "loss": 0.4663, + "step": 71325 + }, + { + "epoch": 64.57013574660634, + "grad_norm": 3.761641025543213, + "learning_rate": 5.047153106296228e-05, + "loss": 0.4849, + "step": 71350 + }, + { + "epoch": 64.59276018099547, + "grad_norm": 3.206021785736084, + "learning_rate": 5.0415565278304835e-05, + "loss": 0.4669, + "step": 71375 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 2.882524251937866, + "learning_rate": 5.035961482676237e-05, + "loss": 0.5029, + "step": 71400 + }, + { + "epoch": 64.63800904977376, + "grad_norm": 2.521789073944092, + "learning_rate": 5.030367974323071e-05, + "loss": 0.5085, + "step": 71425 + }, + { + "epoch": 64.6606334841629, + "grad_norm": 5.065183162689209, + "learning_rate": 
5.024776006259615e-05, + "loss": 0.5224, + "step": 71450 + }, + { + "epoch": 64.68325791855203, + "grad_norm": 4.135383605957031, + "learning_rate": 5.019185581973532e-05, + "loss": 0.564, + "step": 71475 + }, + { + "epoch": 64.70588235294117, + "grad_norm": 3.4252192974090576, + "learning_rate": 5.0135967049515255e-05, + "loss": 0.4829, + "step": 71500 + }, + { + "epoch": 64.72850678733032, + "grad_norm": 4.423304080963135, + "learning_rate": 5.008009378679332e-05, + "loss": 0.4485, + "step": 71525 + }, + { + "epoch": 64.75113122171946, + "grad_norm": 3.599609136581421, + "learning_rate": 5.0024236066417256e-05, + "loss": 0.5045, + "step": 71550 + }, + { + "epoch": 64.77375565610859, + "grad_norm": 3.402135133743286, + "learning_rate": 4.996839392322507e-05, + "loss": 0.6037, + "step": 71575 + }, + { + "epoch": 64.79638009049773, + "grad_norm": 4.259146690368652, + "learning_rate": 4.991256739204502e-05, + "loss": 0.56, + "step": 71600 + }, + { + "epoch": 64.81900452488688, + "grad_norm": 3.316690444946289, + "learning_rate": 4.985675650769569e-05, + "loss": 0.4922, + "step": 71625 + }, + { + "epoch": 64.84162895927602, + "grad_norm": 3.8324856758117676, + "learning_rate": 4.98009613049859e-05, + "loss": 0.5149, + "step": 71650 + }, + { + "epoch": 64.86425339366515, + "grad_norm": 4.328530788421631, + "learning_rate": 4.9745181818714644e-05, + "loss": 0.5364, + "step": 71675 + }, + { + "epoch": 64.8868778280543, + "grad_norm": 1.284960150718689, + "learning_rate": 4.968941808367116e-05, + "loss": 0.4889, + "step": 71700 + }, + { + "epoch": 64.90950226244344, + "grad_norm": 2.807213068008423, + "learning_rate": 4.9633670134634827e-05, + "loss": 0.4275, + "step": 71725 + }, + { + "epoch": 64.93212669683258, + "grad_norm": 2.836705207824707, + "learning_rate": 4.9577938006375206e-05, + "loss": 0.4192, + "step": 71750 + }, + { + "epoch": 64.95475113122171, + "grad_norm": 3.825068235397339, + "learning_rate": 4.952222173365197e-05, + "loss": 0.4179, + "step": 71775 + 
}, + { + "epoch": 64.97737556561086, + "grad_norm": 3.5482659339904785, + "learning_rate": 4.94687490609831e-05, + "loss": 0.4845, + "step": 71800 + }, + { + "epoch": 65.0, + "grad_norm": 3.292893409729004, + "learning_rate": 4.941306396590419e-05, + "loss": 0.4963, + "step": 71825 + }, + { + "epoch": 65.02262443438914, + "grad_norm": 3.8134896755218506, + "learning_rate": 4.935739482919228e-05, + "loss": 0.4938, + "step": 71850 + }, + { + "epoch": 65.04524886877829, + "grad_norm": 5.965142250061035, + "learning_rate": 4.930174168556778e-05, + "loss": 0.5126, + "step": 71875 + }, + { + "epoch": 65.06787330316742, + "grad_norm": 3.516213893890381, + "learning_rate": 4.924610456974109e-05, + "loss": 0.465, + "step": 71900 + }, + { + "epoch": 65.09049773755656, + "grad_norm": 3.8203318119049072, + "learning_rate": 4.919048351641266e-05, + "loss": 0.3844, + "step": 71925 + }, + { + "epoch": 65.1131221719457, + "grad_norm": 4.222325325012207, + "learning_rate": 4.913487856027287e-05, + "loss": 0.5269, + "step": 71950 + }, + { + "epoch": 65.13574660633485, + "grad_norm": 3.3444793224334717, + "learning_rate": 4.907928973600209e-05, + "loss": 0.5801, + "step": 71975 + }, + { + "epoch": 65.15837104072398, + "grad_norm": 3.294954299926758, + "learning_rate": 4.902371707827064e-05, + "loss": 0.4357, + "step": 72000 + }, + { + "epoch": 65.18099547511312, + "grad_norm": 3.5857014656066895, + "learning_rate": 4.8968160621738725e-05, + "loss": 0.5202, + "step": 72025 + }, + { + "epoch": 65.20361990950227, + "grad_norm": 3.8422534465789795, + "learning_rate": 4.891262040105648e-05, + "loss": 0.5648, + "step": 72050 + }, + { + "epoch": 65.22624434389141, + "grad_norm": 4.33748197555542, + "learning_rate": 4.885709645086387e-05, + "loss": 0.4577, + "step": 72075 + }, + { + "epoch": 65.24886877828054, + "grad_norm": 4.151119232177734, + "learning_rate": 4.880158880579076e-05, + "loss": 0.4767, + "step": 72100 + }, + { + "epoch": 65.27149321266968, + "grad_norm": 3.3274641036987305, 
+ "learning_rate": 4.8746097500456816e-05, + "loss": 0.4201, + "step": 72125 + }, + { + "epoch": 65.29411764705883, + "grad_norm": 2.085918426513672, + "learning_rate": 4.869062256947151e-05, + "loss": 0.4389, + "step": 72150 + }, + { + "epoch": 65.31674208144797, + "grad_norm": 4.64586067199707, + "learning_rate": 4.863516404743413e-05, + "loss": 0.4356, + "step": 72175 + }, + { + "epoch": 65.3393665158371, + "grad_norm": 4.617623329162598, + "learning_rate": 4.85797219689337e-05, + "loss": 0.552, + "step": 72200 + }, + { + "epoch": 65.36199095022624, + "grad_norm": 4.925380229949951, + "learning_rate": 4.852429636854901e-05, + "loss": 0.4565, + "step": 72225 + }, + { + "epoch": 65.38461538461539, + "grad_norm": 3.209160566329956, + "learning_rate": 4.846888728084855e-05, + "loss": 0.4161, + "step": 72250 + }, + { + "epoch": 65.40723981900453, + "grad_norm": 2.874236822128296, + "learning_rate": 4.8413494740390534e-05, + "loss": 0.4984, + "step": 72275 + }, + { + "epoch": 65.42986425339366, + "grad_norm": 2.963542938232422, + "learning_rate": 4.835811878172284e-05, + "loss": 0.4449, + "step": 72300 + }, + { + "epoch": 65.4524886877828, + "grad_norm": 3.9334418773651123, + "learning_rate": 4.830275943938298e-05, + "loss": 0.4491, + "step": 72325 + }, + { + "epoch": 65.47511312217195, + "grad_norm": 3.364246368408203, + "learning_rate": 4.82474167478982e-05, + "loss": 0.4536, + "step": 72350 + }, + { + "epoch": 65.49773755656109, + "grad_norm": 2.7113635540008545, + "learning_rate": 4.8192090741785256e-05, + "loss": 0.5391, + "step": 72375 + }, + { + "epoch": 65.52036199095022, + "grad_norm": 4.298723220825195, + "learning_rate": 4.813678145555054e-05, + "loss": 0.4959, + "step": 72400 + }, + { + "epoch": 65.54298642533936, + "grad_norm": 3.5630440711975098, + "learning_rate": 4.808148892369e-05, + "loss": 0.4216, + "step": 72425 + }, + { + "epoch": 65.56561085972851, + "grad_norm": 3.007699728012085, + "learning_rate": 4.802621318068916e-05, + "loss": 0.4718, + 
"step": 72450 + }, + { + "epoch": 65.58823529411765, + "grad_norm": 3.4051260948181152, + "learning_rate": 4.797095426102305e-05, + "loss": 0.4499, + "step": 72475 + }, + { + "epoch": 65.61085972850678, + "grad_norm": 3.9288182258605957, + "learning_rate": 4.7915712199156215e-05, + "loss": 0.5228, + "step": 72500 + }, + { + "epoch": 65.63348416289593, + "grad_norm": 3.282602548599243, + "learning_rate": 4.7860487029542684e-05, + "loss": 0.4455, + "step": 72525 + }, + { + "epoch": 65.65610859728507, + "grad_norm": 3.789949893951416, + "learning_rate": 4.780527878662596e-05, + "loss": 0.4649, + "step": 72550 + }, + { + "epoch": 65.67873303167421, + "grad_norm": 3.194241523742676, + "learning_rate": 4.7750087504838975e-05, + "loss": 0.4583, + "step": 72575 + }, + { + "epoch": 65.70135746606334, + "grad_norm": 5.476235866546631, + "learning_rate": 4.76971198633072e-05, + "loss": 0.4904, + "step": 72600 + }, + { + "epoch": 65.72398190045249, + "grad_norm": 3.226522207260132, + "learning_rate": 4.7641961925177045e-05, + "loss": 0.45, + "step": 72625 + }, + { + "epoch": 65.74660633484163, + "grad_norm": 2.643019437789917, + "learning_rate": 4.758682105003606e-05, + "loss": 0.5505, + "step": 72650 + }, + { + "epoch": 65.76923076923077, + "grad_norm": 3.408048152923584, + "learning_rate": 4.753169727227516e-05, + "loss": 0.4742, + "step": 72675 + }, + { + "epoch": 65.7918552036199, + "grad_norm": 4.125067234039307, + "learning_rate": 4.747659062627459e-05, + "loss": 0.4487, + "step": 72700 + }, + { + "epoch": 65.81447963800905, + "grad_norm": 3.495706558227539, + "learning_rate": 4.742150114640394e-05, + "loss": 0.5123, + "step": 72725 + }, + { + "epoch": 65.83710407239819, + "grad_norm": 2.2247610092163086, + "learning_rate": 4.7366428867022075e-05, + "loss": 0.4085, + "step": 72750 + }, + { + "epoch": 65.85972850678733, + "grad_norm": 3.591494560241699, + "learning_rate": 4.7311373822477125e-05, + "loss": 0.469, + "step": 72775 + }, + { + "epoch": 65.88235294117646, + 
"grad_norm": 3.502779722213745, + "learning_rate": 4.7256336047106486e-05, + "loss": 0.522, + "step": 72800 + }, + { + "epoch": 65.90497737556561, + "grad_norm": 4.009083271026611, + "learning_rate": 4.7201315575236784e-05, + "loss": 0.5237, + "step": 72825 + }, + { + "epoch": 65.92760180995475, + "grad_norm": 2.376384973526001, + "learning_rate": 4.714631244118384e-05, + "loss": 0.5136, + "step": 72850 + }, + { + "epoch": 65.9502262443439, + "grad_norm": 4.574949264526367, + "learning_rate": 4.709132667925266e-05, + "loss": 0.3673, + "step": 72875 + }, + { + "epoch": 65.97285067873302, + "grad_norm": 2.778244972229004, + "learning_rate": 4.703635832373743e-05, + "loss": 0.4782, + "step": 72900 + }, + { + "epoch": 65.99547511312217, + "grad_norm": 3.3691933155059814, + "learning_rate": 4.698140740892149e-05, + "loss": 0.3929, + "step": 72925 + }, + { + "epoch": 66.01809954751131, + "grad_norm": 3.810154676437378, + "learning_rate": 4.692647396907726e-05, + "loss": 0.455, + "step": 72950 + }, + { + "epoch": 66.04072398190046, + "grad_norm": 3.4145562648773193, + "learning_rate": 4.687155803846629e-05, + "loss": 0.3828, + "step": 72975 + }, + { + "epoch": 66.0633484162896, + "grad_norm": 3.596548557281494, + "learning_rate": 4.681665965133922e-05, + "loss": 0.4272, + "step": 73000 + }, + { + "epoch": 66.08597285067873, + "grad_norm": 3.4454643726348877, + "learning_rate": 4.676177884193572e-05, + "loss": 0.4169, + "step": 73025 + }, + { + "epoch": 66.10859728506787, + "grad_norm": 2.733118772506714, + "learning_rate": 4.670691564448452e-05, + "loss": 0.4115, + "step": 73050 + }, + { + "epoch": 66.13122171945702, + "grad_norm": 2.9450082778930664, + "learning_rate": 4.6652070093203356e-05, + "loss": 0.3903, + "step": 73075 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 3.848235607147217, + "learning_rate": 4.6597242222298955e-05, + "loss": 0.4043, + "step": 73100 + }, + { + "epoch": 66.17647058823529, + "grad_norm": 2.8531174659729004, + "learning_rate": 
4.654243206596703e-05, + "loss": 0.4569, + "step": 73125 + }, + { + "epoch": 66.19909502262443, + "grad_norm": 2.625898599624634, + "learning_rate": 4.6487639658392224e-05, + "loss": 0.4542, + "step": 73150 + }, + { + "epoch": 66.22171945701358, + "grad_norm": 3.0660314559936523, + "learning_rate": 4.643286503374812e-05, + "loss": 0.3989, + "step": 73175 + }, + { + "epoch": 66.24434389140272, + "grad_norm": 3.1097512245178223, + "learning_rate": 4.6378108226197205e-05, + "loss": 0.5148, + "step": 73200 + }, + { + "epoch": 66.26696832579185, + "grad_norm": 4.034802436828613, + "learning_rate": 4.632336926989091e-05, + "loss": 0.5242, + "step": 73225 + }, + { + "epoch": 66.289592760181, + "grad_norm": 3.696357011795044, + "learning_rate": 4.626864819896943e-05, + "loss": 0.4765, + "step": 73250 + }, + { + "epoch": 66.31221719457014, + "grad_norm": 2.8210299015045166, + "learning_rate": 4.621394504756188e-05, + "loss": 0.4046, + "step": 73275 + }, + { + "epoch": 66.33484162895928, + "grad_norm": 3.4342691898345947, + "learning_rate": 4.615925984978619e-05, + "loss": 0.4902, + "step": 73300 + }, + { + "epoch": 66.35746606334841, + "grad_norm": 3.3892955780029297, + "learning_rate": 4.610459263974905e-05, + "loss": 0.3694, + "step": 73325 + }, + { + "epoch": 66.38009049773756, + "grad_norm": 3.8561410903930664, + "learning_rate": 4.6049943451545985e-05, + "loss": 0.4382, + "step": 73350 + }, + { + "epoch": 66.4027149321267, + "grad_norm": 3.929945230484009, + "learning_rate": 4.599531231926125e-05, + "loss": 0.3728, + "step": 73375 + }, + { + "epoch": 66.42533936651584, + "grad_norm": 4.837695121765137, + "learning_rate": 4.594069927696783e-05, + "loss": 0.4722, + "step": 73400 + }, + { + "epoch": 66.44796380090497, + "grad_norm": 2.2682697772979736, + "learning_rate": 4.588610435872746e-05, + "loss": 0.406, + "step": 73425 + }, + { + "epoch": 66.47058823529412, + "grad_norm": 4.606935024261475, + "learning_rate": 4.5831527598590544e-05, + "loss": 0.3981, + "step": 
73450 + }, + { + "epoch": 66.49321266968326, + "grad_norm": 3.6162781715393066, + "learning_rate": 4.577696903059617e-05, + "loss": 0.4641, + "step": 73475 + }, + { + "epoch": 66.5158371040724, + "grad_norm": 2.6316192150115967, + "learning_rate": 4.572242868877209e-05, + "loss": 0.4188, + "step": 73500 + }, + { + "epoch": 66.53846153846153, + "grad_norm": 3.1077880859375, + "learning_rate": 4.566790660713468e-05, + "loss": 0.4499, + "step": 73525 + }, + { + "epoch": 66.56108597285068, + "grad_norm": 3.875701904296875, + "learning_rate": 4.561340281968889e-05, + "loss": 0.4369, + "step": 73550 + }, + { + "epoch": 66.58371040723982, + "grad_norm": 3.852557420730591, + "learning_rate": 4.555891736042833e-05, + "loss": 0.4332, + "step": 73575 + }, + { + "epoch": 66.60633484162896, + "grad_norm": 4.01460075378418, + "learning_rate": 4.550445026333515e-05, + "loss": 0.4226, + "step": 73600 + }, + { + "epoch": 66.6289592760181, + "grad_norm": 1.935514211654663, + "learning_rate": 4.545000156238002e-05, + "loss": 0.4093, + "step": 73625 + }, + { + "epoch": 66.65158371040724, + "grad_norm": 3.7266364097595215, + "learning_rate": 4.539557129152217e-05, + "loss": 0.3904, + "step": 73650 + }, + { + "epoch": 66.67420814479638, + "grad_norm": 4.496799468994141, + "learning_rate": 4.534115948470931e-05, + "loss": 0.5374, + "step": 73675 + }, + { + "epoch": 66.69683257918552, + "grad_norm": 2.9410617351531982, + "learning_rate": 4.528676617587767e-05, + "loss": 0.3951, + "step": 73700 + }, + { + "epoch": 66.71945701357465, + "grad_norm": 3.5392343997955322, + "learning_rate": 4.523239139895191e-05, + "loss": 0.5283, + "step": 73725 + }, + { + "epoch": 66.7420814479638, + "grad_norm": 3.0796167850494385, + "learning_rate": 4.517803518784513e-05, + "loss": 0.4688, + "step": 73750 + }, + { + "epoch": 66.76470588235294, + "grad_norm": 3.29840087890625, + "learning_rate": 4.512369757645889e-05, + "loss": 0.5688, + "step": 73775 + }, + { + "epoch": 66.78733031674209, + "grad_norm": 
3.5389914512634277, + "learning_rate": 4.506937859868309e-05, + "loss": 0.4534, + "step": 73800 + }, + { + "epoch": 66.80995475113122, + "grad_norm": 2.3450798988342285, + "learning_rate": 4.501507828839607e-05, + "loss": 0.497, + "step": 73825 + }, + { + "epoch": 66.83257918552036, + "grad_norm": 2.9214231967926025, + "learning_rate": 4.496079667946449e-05, + "loss": 0.4667, + "step": 73850 + }, + { + "epoch": 66.8552036199095, + "grad_norm": 4.5358476638793945, + "learning_rate": 4.490653380574336e-05, + "loss": 0.4841, + "step": 73875 + }, + { + "epoch": 66.87782805429865, + "grad_norm": 3.7113118171691895, + "learning_rate": 4.485228970107598e-05, + "loss": 0.5856, + "step": 73900 + }, + { + "epoch": 66.90045248868778, + "grad_norm": 3.699545383453369, + "learning_rate": 4.4798064399293976e-05, + "loss": 0.4488, + "step": 73925 + }, + { + "epoch": 66.92307692307692, + "grad_norm": 4.203170299530029, + "learning_rate": 4.474385793421724e-05, + "loss": 0.5001, + "step": 73950 + }, + { + "epoch": 66.94570135746606, + "grad_norm": 4.271376609802246, + "learning_rate": 4.468967033965391e-05, + "loss": 0.4255, + "step": 73975 + }, + { + "epoch": 66.96832579185521, + "grad_norm": 2.9453163146972656, + "learning_rate": 4.463550164940034e-05, + "loss": 0.4729, + "step": 74000 + }, + { + "epoch": 66.99095022624434, + "grad_norm": 3.467568874359131, + "learning_rate": 4.4581351897241116e-05, + "loss": 0.4127, + "step": 74025 + }, + { + "epoch": 67.01357466063348, + "grad_norm": 2.1562981605529785, + "learning_rate": 4.4527221116949e-05, + "loss": 0.4176, + "step": 74050 + }, + { + "epoch": 67.03619909502262, + "grad_norm": 3.6084489822387695, + "learning_rate": 4.447310934228494e-05, + "loss": 0.4044, + "step": 74075 + }, + { + "epoch": 67.05882352941177, + "grad_norm": 2.777543306350708, + "learning_rate": 4.441901660699801e-05, + "loss": 0.4769, + "step": 74100 + }, + { + "epoch": 67.08144796380091, + "grad_norm": 2.9665920734405518, + "learning_rate": 
4.4364942944825416e-05, + "loss": 0.3794, + "step": 74125 + }, + { + "epoch": 67.10407239819004, + "grad_norm": 3.2442235946655273, + "learning_rate": 4.431088838949245e-05, + "loss": 0.445, + "step": 74150 + }, + { + "epoch": 67.12669683257919, + "grad_norm": 5.0728960037231445, + "learning_rate": 4.4256852974712534e-05, + "loss": 0.4795, + "step": 74175 + }, + { + "epoch": 67.14932126696833, + "grad_norm": 3.2921695709228516, + "learning_rate": 4.4202836734187106e-05, + "loss": 0.3874, + "step": 74200 + }, + { + "epoch": 67.17194570135747, + "grad_norm": 1.5285372734069824, + "learning_rate": 4.4148839701605664e-05, + "loss": 0.4343, + "step": 74225 + }, + { + "epoch": 67.1945701357466, + "grad_norm": 3.4112703800201416, + "learning_rate": 4.409486191064573e-05, + "loss": 0.4788, + "step": 74250 + }, + { + "epoch": 67.21719457013575, + "grad_norm": 3.9787042140960693, + "learning_rate": 4.4040903394972793e-05, + "loss": 0.4578, + "step": 74275 + }, + { + "epoch": 67.23981900452489, + "grad_norm": 3.6014389991760254, + "learning_rate": 4.398696418824039e-05, + "loss": 0.3759, + "step": 74300 + }, + { + "epoch": 67.26244343891403, + "grad_norm": 3.469893217086792, + "learning_rate": 4.393304432408996e-05, + "loss": 0.409, + "step": 74325 + }, + { + "epoch": 67.28506787330316, + "grad_norm": 3.641577959060669, + "learning_rate": 4.3879143836150873e-05, + "loss": 0.3616, + "step": 74350 + }, + { + "epoch": 67.3076923076923, + "grad_norm": 3.0347094535827637, + "learning_rate": 4.382526275804044e-05, + "loss": 0.4235, + "step": 74375 + }, + { + "epoch": 67.33031674208145, + "grad_norm": 2.8964600563049316, + "learning_rate": 4.3771401123363886e-05, + "loss": 0.3915, + "step": 74400 + }, + { + "epoch": 67.3529411764706, + "grad_norm": 3.0891056060791016, + "learning_rate": 4.371755896571421e-05, + "loss": 0.4741, + "step": 74425 + }, + { + "epoch": 67.37556561085972, + "grad_norm": 3.889401912689209, + "learning_rate": 4.366373631867242e-05, + "loss": 0.3821, + "step": 
74450 + }, + { + "epoch": 67.39819004524887, + "grad_norm": 2.750678300857544, + "learning_rate": 4.36099332158072e-05, + "loss": 0.3626, + "step": 74475 + }, + { + "epoch": 67.42081447963801, + "grad_norm": 2.762613296508789, + "learning_rate": 4.355614969067516e-05, + "loss": 0.3529, + "step": 74500 + }, + { + "epoch": 67.44343891402715, + "grad_norm": 3.64766263961792, + "learning_rate": 4.35023857768206e-05, + "loss": 0.4109, + "step": 74525 + }, + { + "epoch": 67.46606334841628, + "grad_norm": 3.7489163875579834, + "learning_rate": 4.344864150777573e-05, + "loss": 0.4445, + "step": 74550 + }, + { + "epoch": 67.48868778280543, + "grad_norm": 3.121638536453247, + "learning_rate": 4.339491691706033e-05, + "loss": 0.4574, + "step": 74575 + }, + { + "epoch": 67.51131221719457, + "grad_norm": 3.443589210510254, + "learning_rate": 4.3341212038182054e-05, + "loss": 0.4053, + "step": 74600 + }, + { + "epoch": 67.53393665158372, + "grad_norm": 6.428598880767822, + "learning_rate": 4.328752690463617e-05, + "loss": 0.3812, + "step": 74625 + }, + { + "epoch": 67.55656108597285, + "grad_norm": 1.6356045007705688, + "learning_rate": 4.32338615499057e-05, + "loss": 0.3695, + "step": 74650 + }, + { + "epoch": 67.57918552036199, + "grad_norm": 2.584383726119995, + "learning_rate": 4.3180216007461257e-05, + "loss": 0.4634, + "step": 74675 + }, + { + "epoch": 67.60180995475113, + "grad_norm": 3.4905943870544434, + "learning_rate": 4.312659031076118e-05, + "loss": 0.4604, + "step": 74700 + }, + { + "epoch": 67.62443438914028, + "grad_norm": 3.9946959018707275, + "learning_rate": 4.3072984493251336e-05, + "loss": 0.4917, + "step": 74725 + }, + { + "epoch": 67.6470588235294, + "grad_norm": 3.935030221939087, + "learning_rate": 4.301939858836529e-05, + "loss": 0.3362, + "step": 74750 + }, + { + "epoch": 67.66968325791855, + "grad_norm": 3.106130838394165, + "learning_rate": 4.296583262952411e-05, + "loss": 0.3965, + "step": 74775 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 
2.663750410079956, + "learning_rate": 4.291228665013646e-05, + "loss": 0.3978, + "step": 74800 + }, + { + "epoch": 67.71493212669684, + "grad_norm": 3.5414390563964844, + "learning_rate": 4.2858760683598544e-05, + "loss": 0.4735, + "step": 74825 + }, + { + "epoch": 67.73755656108597, + "grad_norm": 2.744206428527832, + "learning_rate": 4.2805254763294114e-05, + "loss": 0.3553, + "step": 74850 + }, + { + "epoch": 67.76018099547511, + "grad_norm": 3.3571581840515137, + "learning_rate": 4.275176892259432e-05, + "loss": 0.3899, + "step": 74875 + }, + { + "epoch": 67.78280542986425, + "grad_norm": 4.216256618499756, + "learning_rate": 4.26983031948579e-05, + "loss": 0.5193, + "step": 74900 + }, + { + "epoch": 67.8054298642534, + "grad_norm": 3.1385810375213623, + "learning_rate": 4.264485761343097e-05, + "loss": 0.4591, + "step": 74925 + }, + { + "epoch": 67.82805429864253, + "grad_norm": 3.5836966037750244, + "learning_rate": 4.259143221164715e-05, + "loss": 0.4142, + "step": 74950 + }, + { + "epoch": 67.85067873303167, + "grad_norm": 3.812453508377075, + "learning_rate": 4.253802702282737e-05, + "loss": 0.4652, + "step": 74975 + }, + { + "epoch": 67.87330316742081, + "grad_norm": 2.7617790699005127, + "learning_rate": 4.248464208028009e-05, + "loss": 0.4776, + "step": 75000 + }, + { + "epoch": 67.89592760180996, + "grad_norm": 2.787205457687378, + "learning_rate": 4.2431277417301005e-05, + "loss": 0.4688, + "step": 75025 + }, + { + "epoch": 67.91855203619909, + "grad_norm": 4.954131126403809, + "learning_rate": 4.23779330671733e-05, + "loss": 0.5092, + "step": 75050 + }, + { + "epoch": 67.94117647058823, + "grad_norm": 3.119158983230591, + "learning_rate": 4.232460906316734e-05, + "loss": 0.4368, + "step": 75075 + }, + { + "epoch": 67.96380090497738, + "grad_norm": 3.8148066997528076, + "learning_rate": 4.227130543854095e-05, + "loss": 0.4735, + "step": 75100 + }, + { + "epoch": 67.98642533936652, + "grad_norm": 2.6367037296295166, + "learning_rate": 
4.221802222653911e-05, + "loss": 0.4561, + "step": 75125 + }, + { + "epoch": 68.00904977375566, + "grad_norm": 3.58677339553833, + "learning_rate": 4.2164759460394195e-05, + "loss": 0.4825, + "step": 75150 + }, + { + "epoch": 68.03167420814479, + "grad_norm": 4.2442240715026855, + "learning_rate": 4.211151717332572e-05, + "loss": 0.3115, + "step": 75175 + }, + { + "epoch": 68.05429864253394, + "grad_norm": 3.5822954177856445, + "learning_rate": 4.205829539854051e-05, + "loss": 0.4369, + "step": 75200 + }, + { + "epoch": 68.07692307692308, + "grad_norm": 2.1157548427581787, + "learning_rate": 4.200509416923249e-05, + "loss": 0.2988, + "step": 75225 + }, + { + "epoch": 68.09954751131222, + "grad_norm": 3.1891684532165527, + "learning_rate": 4.195191351858294e-05, + "loss": 0.3535, + "step": 75250 + }, + { + "epoch": 68.12217194570135, + "grad_norm": 3.4645771980285645, + "learning_rate": 4.1898753479760116e-05, + "loss": 0.3996, + "step": 75275 + }, + { + "epoch": 68.1447963800905, + "grad_norm": 1.7759895324707031, + "learning_rate": 4.184561408591955e-05, + "loss": 0.4161, + "step": 75300 + }, + { + "epoch": 68.16742081447964, + "grad_norm": 3.8754966259002686, + "learning_rate": 4.179249537020388e-05, + "loss": 0.3405, + "step": 75325 + }, + { + "epoch": 68.19004524886878, + "grad_norm": 3.558027744293213, + "learning_rate": 4.173939736574275e-05, + "loss": 0.3907, + "step": 75350 + }, + { + "epoch": 68.21266968325791, + "grad_norm": 4.56781530380249, + "learning_rate": 4.168632010565306e-05, + "loss": 0.4352, + "step": 75375 + }, + { + "epoch": 68.23529411764706, + "grad_norm": 2.9799764156341553, + "learning_rate": 4.1633263623038566e-05, + "loss": 0.3617, + "step": 75400 + }, + { + "epoch": 68.2579185520362, + "grad_norm": 3.8714258670806885, + "learning_rate": 4.158022795099026e-05, + "loss": 0.4573, + "step": 75425 + }, + { + "epoch": 68.28054298642535, + "grad_norm": 2.85005259513855, + "learning_rate": 4.152721312258601e-05, + "loss": 0.3795, + "step": 
75450 + }, + { + "epoch": 68.30316742081448, + "grad_norm": 3.3433988094329834, + "learning_rate": 4.147421917089077e-05, + "loss": 0.3775, + "step": 75475 + }, + { + "epoch": 68.32579185520362, + "grad_norm": 3.080737829208374, + "learning_rate": 4.142124612895642e-05, + "loss": 0.4629, + "step": 75500 + }, + { + "epoch": 68.34841628959276, + "grad_norm": 2.6309893131256104, + "learning_rate": 4.136829402982185e-05, + "loss": 0.3704, + "step": 75525 + }, + { + "epoch": 68.3710407239819, + "grad_norm": 2.5583741664886475, + "learning_rate": 4.131536290651282e-05, + "loss": 0.3958, + "step": 75550 + }, + { + "epoch": 68.39366515837104, + "grad_norm": 3.351884365081787, + "learning_rate": 4.1262452792042086e-05, + "loss": 0.4445, + "step": 75575 + }, + { + "epoch": 68.41628959276018, + "grad_norm": 3.149663209915161, + "learning_rate": 4.120956371940923e-05, + "loss": 0.4257, + "step": 75600 + }, + { + "epoch": 68.43891402714932, + "grad_norm": 2.4363698959350586, + "learning_rate": 4.115669572160079e-05, + "loss": 0.3825, + "step": 75625 + }, + { + "epoch": 68.46153846153847, + "grad_norm": 3.3466544151306152, + "learning_rate": 4.1103848831590055e-05, + "loss": 0.3214, + "step": 75650 + }, + { + "epoch": 68.4841628959276, + "grad_norm": 3.8922653198242188, + "learning_rate": 4.1051023082337254e-05, + "loss": 0.3655, + "step": 75675 + }, + { + "epoch": 68.50678733031674, + "grad_norm": 2.990739107131958, + "learning_rate": 4.0998218506789346e-05, + "loss": 0.4013, + "step": 75700 + }, + { + "epoch": 68.52941176470588, + "grad_norm": 2.7267568111419678, + "learning_rate": 4.0945435137880164e-05, + "loss": 0.3437, + "step": 75725 + }, + { + "epoch": 68.55203619909503, + "grad_norm": 2.7915239334106445, + "learning_rate": 4.0892673008530206e-05, + "loss": 0.377, + "step": 75750 + }, + { + "epoch": 68.57466063348416, + "grad_norm": 2.374497413635254, + "learning_rate": 4.083993215164687e-05, + "loss": 0.3747, + "step": 75775 + }, + { + "epoch": 68.5972850678733, + 
"grad_norm": 2.1000912189483643, + "learning_rate": 4.0787212600124145e-05, + "loss": 0.4749, + "step": 75800 + }, + { + "epoch": 68.61990950226244, + "grad_norm": 3.066020965576172, + "learning_rate": 4.073451438684284e-05, + "loss": 0.3983, + "step": 75825 + }, + { + "epoch": 68.64253393665159, + "grad_norm": 1.8375914096832275, + "learning_rate": 4.0681837544670356e-05, + "loss": 0.4454, + "step": 75850 + }, + { + "epoch": 68.66515837104072, + "grad_norm": 3.709446430206299, + "learning_rate": 4.062918210646087e-05, + "loss": 0.5053, + "step": 75875 + }, + { + "epoch": 68.68778280542986, + "grad_norm": 3.295236587524414, + "learning_rate": 4.0576548105055096e-05, + "loss": 0.4938, + "step": 75900 + }, + { + "epoch": 68.710407239819, + "grad_norm": 3.026183843612671, + "learning_rate": 4.0523935573280514e-05, + "loss": 0.3928, + "step": 75925 + }, + { + "epoch": 68.73303167420815, + "grad_norm": 3.2350196838378906, + "learning_rate": 4.047134454395107e-05, + "loss": 0.4085, + "step": 75950 + }, + { + "epoch": 68.75565610859728, + "grad_norm": 2.931197166442871, + "learning_rate": 4.041877504986743e-05, + "loss": 0.4082, + "step": 75975 + }, + { + "epoch": 68.77828054298642, + "grad_norm": 3.3384668827056885, + "learning_rate": 4.036622712381669e-05, + "loss": 0.3353, + "step": 76000 + }, + { + "epoch": 68.80090497737557, + "grad_norm": 2.9945406913757324, + "learning_rate": 4.031370079857266e-05, + "loss": 0.4683, + "step": 76025 + }, + { + "epoch": 68.82352941176471, + "grad_norm": 2.8785560131073, + "learning_rate": 4.0261196106895506e-05, + "loss": 0.4455, + "step": 76050 + }, + { + "epoch": 68.84615384615384, + "grad_norm": 2.258955717086792, + "learning_rate": 4.020871308153204e-05, + "loss": 0.4795, + "step": 76075 + }, + { + "epoch": 68.86877828054298, + "grad_norm": 3.7678897380828857, + "learning_rate": 4.015625175521547e-05, + "loss": 0.544, + "step": 76100 + }, + { + "epoch": 68.89140271493213, + "grad_norm": 3.1933746337890625, + "learning_rate": 
4.0103812160665535e-05, + "loss": 0.4258, + "step": 76125 + }, + { + "epoch": 68.91402714932127, + "grad_norm": 3.049928903579712, + "learning_rate": 4.005139433058835e-05, + "loss": 0.4686, + "step": 76150 + }, + { + "epoch": 68.9366515837104, + "grad_norm": 2.922126054763794, + "learning_rate": 3.999899829767655e-05, + "loss": 0.43, + "step": 76175 + }, + { + "epoch": 68.95927601809954, + "grad_norm": 3.004406213760376, + "learning_rate": 3.994662409460906e-05, + "loss": 0.3562, + "step": 76200 + }, + { + "epoch": 68.98190045248869, + "grad_norm": 3.188622236251831, + "learning_rate": 3.989427175405131e-05, + "loss": 0.4254, + "step": 76225 + }, + { + "epoch": 69.00452488687783, + "grad_norm": 3.9997904300689697, + "learning_rate": 3.9841941308654985e-05, + "loss": 0.3802, + "step": 76250 + }, + { + "epoch": 69.02714932126698, + "grad_norm": 3.5980288982391357, + "learning_rate": 3.978963279105821e-05, + "loss": 0.386, + "step": 76275 + }, + { + "epoch": 69.0497737556561, + "grad_norm": 3.104586601257324, + "learning_rate": 3.973734623388533e-05, + "loss": 0.3537, + "step": 76300 + }, + { + "epoch": 69.07239819004525, + "grad_norm": 2.5005218982696533, + "learning_rate": 3.968508166974712e-05, + "loss": 0.3382, + "step": 76325 + }, + { + "epoch": 69.09502262443439, + "grad_norm": 2.546375036239624, + "learning_rate": 3.963283913124051e-05, + "loss": 0.3581, + "step": 76350 + }, + { + "epoch": 69.11764705882354, + "grad_norm": 4.206288814544678, + "learning_rate": 3.958061865094877e-05, + "loss": 0.3366, + "step": 76375 + }, + { + "epoch": 69.14027149321267, + "grad_norm": 2.7173237800598145, + "learning_rate": 3.952842026144144e-05, + "loss": 0.3442, + "step": 76400 + }, + { + "epoch": 69.16289592760181, + "grad_norm": 3.7858710289001465, + "learning_rate": 3.947624399527418e-05, + "loss": 0.3715, + "step": 76425 + }, + { + "epoch": 69.18552036199095, + "grad_norm": 3.3816254138946533, + "learning_rate": 3.9424089884988945e-05, + "loss": 0.417, + "step": 76450 + 
}, + { + "epoch": 69.2081447963801, + "grad_norm": 3.8528568744659424, + "learning_rate": 3.937195796311381e-05, + "loss": 0.3602, + "step": 76475 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 2.9668381214141846, + "learning_rate": 3.931984826216307e-05, + "loss": 0.4146, + "step": 76500 + }, + { + "epoch": 69.25339366515837, + "grad_norm": 3.6018624305725098, + "learning_rate": 3.9267760814637075e-05, + "loss": 0.3841, + "step": 76525 + }, + { + "epoch": 69.27601809954751, + "grad_norm": 1.6048812866210938, + "learning_rate": 3.921569565302239e-05, + "loss": 0.3891, + "step": 76550 + }, + { + "epoch": 69.29864253393666, + "grad_norm": 3.363898277282715, + "learning_rate": 3.916365280979162e-05, + "loss": 0.3802, + "step": 76575 + }, + { + "epoch": 69.32126696832579, + "grad_norm": 2.56240177154541, + "learning_rate": 3.9111632317403514e-05, + "loss": 0.3569, + "step": 76600 + }, + { + "epoch": 69.34389140271493, + "grad_norm": 3.013315439224243, + "learning_rate": 3.905963420830277e-05, + "loss": 0.3686, + "step": 76625 + }, + { + "epoch": 69.36651583710407, + "grad_norm": 3.691343069076538, + "learning_rate": 3.900765851492025e-05, + "loss": 0.3341, + "step": 76650 + }, + { + "epoch": 69.38914027149322, + "grad_norm": 3.1747207641601562, + "learning_rate": 3.8955705269672716e-05, + "loss": 0.3069, + "step": 76675 + }, + { + "epoch": 69.41176470588235, + "grad_norm": 2.1772170066833496, + "learning_rate": 3.8903774504963056e-05, + "loss": 0.4192, + "step": 76700 + }, + { + "epoch": 69.43438914027149, + "grad_norm": 2.9762048721313477, + "learning_rate": 3.885186625318001e-05, + "loss": 0.4359, + "step": 76725 + }, + { + "epoch": 69.45701357466064, + "grad_norm": 2.8205325603485107, + "learning_rate": 3.87999805466984e-05, + "loss": 0.3983, + "step": 76750 + }, + { + "epoch": 69.47963800904978, + "grad_norm": 2.5052621364593506, + "learning_rate": 3.8748117417878875e-05, + "loss": 0.3964, + "step": 76775 + }, + { + "epoch": 69.50226244343891, + "grad_norm": 
4.439595699310303, + "learning_rate": 3.86962768990681e-05, + "loss": 0.3912, + "step": 76800 + }, + { + "epoch": 69.52488687782805, + "grad_norm": 1.4123729467391968, + "learning_rate": 3.864445902259853e-05, + "loss": 0.5169, + "step": 76825 + }, + { + "epoch": 69.5475113122172, + "grad_norm": 3.92445969581604, + "learning_rate": 3.859266382078863e-05, + "loss": 0.3252, + "step": 76850 + }, + { + "epoch": 69.57013574660634, + "grad_norm": 2.97933030128479, + "learning_rate": 3.854089132594257e-05, + "loss": 0.3811, + "step": 76875 + }, + { + "epoch": 69.59276018099547, + "grad_norm": 3.185816526412964, + "learning_rate": 3.848914157035053e-05, + "loss": 0.3827, + "step": 76900 + }, + { + "epoch": 69.61538461538461, + "grad_norm": 2.848999500274658, + "learning_rate": 3.8437414586288346e-05, + "loss": 0.4058, + "step": 76925 + }, + { + "epoch": 69.63800904977376, + "grad_norm": 3.1349036693573, + "learning_rate": 3.838571040601778e-05, + "loss": 0.3609, + "step": 76950 + }, + { + "epoch": 69.6606334841629, + "grad_norm": 2.6944453716278076, + "learning_rate": 3.833402906178626e-05, + "loss": 0.3657, + "step": 76975 + }, + { + "epoch": 69.68325791855203, + "grad_norm": 2.8627524375915527, + "learning_rate": 3.8282370585827084e-05, + "loss": 0.37, + "step": 77000 + }, + { + "epoch": 69.70588235294117, + "grad_norm": 2.7694175243377686, + "learning_rate": 3.823073501035916e-05, + "loss": 0.4469, + "step": 77025 + }, + { + "epoch": 69.72850678733032, + "grad_norm": 2.7689924240112305, + "learning_rate": 3.817912236758726e-05, + "loss": 0.3819, + "step": 77050 + }, + { + "epoch": 69.75113122171946, + "grad_norm": 2.8514535427093506, + "learning_rate": 3.8127532689701705e-05, + "loss": 0.4633, + "step": 77075 + }, + { + "epoch": 69.77375565610859, + "grad_norm": 3.1230578422546387, + "learning_rate": 3.807596600887862e-05, + "loss": 0.4819, + "step": 77100 + }, + { + "epoch": 69.79638009049773, + "grad_norm": 4.111325740814209, + "learning_rate": 3.802442235727969e-05, 
+ "loss": 0.3846, + "step": 77125 + }, + { + "epoch": 69.81900452488688, + "grad_norm": 2.426924467086792, + "learning_rate": 3.797290176705232e-05, + "loss": 0.4069, + "step": 77150 + }, + { + "epoch": 69.84162895927602, + "grad_norm": 2.085695743560791, + "learning_rate": 3.792140427032944e-05, + "loss": 0.3929, + "step": 77175 + }, + { + "epoch": 69.86425339366515, + "grad_norm": 4.2562103271484375, + "learning_rate": 3.7869929899229666e-05, + "loss": 0.5067, + "step": 77200 + }, + { + "epoch": 69.8868778280543, + "grad_norm": 3.177539587020874, + "learning_rate": 3.781847868585711e-05, + "loss": 0.4238, + "step": 77225 + }, + { + "epoch": 69.90950226244344, + "grad_norm": 3.2816081047058105, + "learning_rate": 3.776910733759687e-05, + "loss": 0.373, + "step": 77250 + }, + { + "epoch": 69.93212669683258, + "grad_norm": 2.236802816390991, + "learning_rate": 3.7717701606442145e-05, + "loss": 0.3423, + "step": 77275 + }, + { + "epoch": 69.95475113122171, + "grad_norm": 2.7069408893585205, + "learning_rate": 3.766631912795817e-05, + "loss": 0.3994, + "step": 77300 + }, + { + "epoch": 69.97737556561086, + "grad_norm": 3.7963755130767822, + "learning_rate": 3.7614959934191905e-05, + "loss": 0.4046, + "step": 77325 + }, + { + "epoch": 70.0, + "grad_norm": 3.8006396293640137, + "learning_rate": 3.756362405717558e-05, + "loss": 0.3248, + "step": 77350 + }, + { + "epoch": 70.02262443438914, + "grad_norm": 3.3729357719421387, + "learning_rate": 3.751231152892702e-05, + "loss": 0.3544, + "step": 77375 + }, + { + "epoch": 70.04524886877829, + "grad_norm": 2.3429107666015625, + "learning_rate": 3.7461022381449395e-05, + "loss": 0.3622, + "step": 77400 + }, + { + "epoch": 70.06787330316742, + "grad_norm": 2.8610446453094482, + "learning_rate": 3.740975664673138e-05, + "loss": 0.4314, + "step": 77425 + }, + { + "epoch": 70.09049773755656, + "grad_norm": 3.3164401054382324, + "learning_rate": 3.735851435674695e-05, + "loss": 0.3804, + "step": 77450 + }, + { + "epoch": 
70.1131221719457, + "grad_norm": 3.767671823501587, + "learning_rate": 3.730729554345557e-05, + "loss": 0.3723, + "step": 77475 + }, + { + "epoch": 70.13574660633485, + "grad_norm": 2.632603645324707, + "learning_rate": 3.7256100238801936e-05, + "loss": 0.3289, + "step": 77500 + }, + { + "epoch": 70.15837104072398, + "grad_norm": 2.85009765625, + "learning_rate": 3.7204928474716224e-05, + "loss": 0.3121, + "step": 77525 + }, + { + "epoch": 70.18099547511312, + "grad_norm": 4.107291221618652, + "learning_rate": 3.7153780283113785e-05, + "loss": 0.3146, + "step": 77550 + }, + { + "epoch": 70.20361990950227, + "grad_norm": 3.3742053508758545, + "learning_rate": 3.71026556958954e-05, + "loss": 0.4125, + "step": 77575 + }, + { + "epoch": 70.22624434389141, + "grad_norm": 2.78633451461792, + "learning_rate": 3.7051554744946995e-05, + "loss": 0.3578, + "step": 77600 + }, + { + "epoch": 70.24886877828054, + "grad_norm": 2.340951919555664, + "learning_rate": 3.700047746213989e-05, + "loss": 0.3427, + "step": 77625 + }, + { + "epoch": 70.27149321266968, + "grad_norm": 1.9949183464050293, + "learning_rate": 3.694942387933053e-05, + "loss": 0.3342, + "step": 77650 + }, + { + "epoch": 70.29411764705883, + "grad_norm": 5.352598190307617, + "learning_rate": 3.689839402836066e-05, + "loss": 0.3045, + "step": 77675 + }, + { + "epoch": 70.31674208144797, + "grad_norm": 3.4376213550567627, + "learning_rate": 3.684738794105714e-05, + "loss": 0.3284, + "step": 77700 + }, + { + "epoch": 70.3393665158371, + "grad_norm": 3.1421401500701904, + "learning_rate": 3.6796405649232125e-05, + "loss": 0.4091, + "step": 77725 + }, + { + "epoch": 70.36199095022624, + "grad_norm": 4.102700233459473, + "learning_rate": 3.67454471846828e-05, + "loss": 0.3916, + "step": 77750 + }, + { + "epoch": 70.38461538461539, + "grad_norm": 3.809084415435791, + "learning_rate": 3.669451257919161e-05, + "loss": 0.3505, + "step": 77775 + }, + { + "epoch": 70.40723981900453, + "grad_norm": 3.107203722000122, + 
"learning_rate": 3.6643601864525986e-05, + "loss": 0.4183, + "step": 77800 + }, + { + "epoch": 70.42986425339366, + "grad_norm": 3.738818645477295, + "learning_rate": 3.6592715072438604e-05, + "loss": 0.3844, + "step": 77825 + }, + { + "epoch": 70.4524886877828, + "grad_norm": 3.4404258728027344, + "learning_rate": 3.65418522346671e-05, + "loss": 0.4346, + "step": 77850 + }, + { + "epoch": 70.47511312217195, + "grad_norm": 3.7546753883361816, + "learning_rate": 3.649101338293427e-05, + "loss": 0.3497, + "step": 77875 + }, + { + "epoch": 70.49773755656109, + "grad_norm": 3.058210849761963, + "learning_rate": 3.644019854894783e-05, + "loss": 0.3192, + "step": 77900 + }, + { + "epoch": 70.52036199095022, + "grad_norm": 4.312135219573975, + "learning_rate": 3.638940776440065e-05, + "loss": 0.3527, + "step": 77925 + }, + { + "epoch": 70.54298642533936, + "grad_norm": 3.8511061668395996, + "learning_rate": 3.6338641060970485e-05, + "loss": 0.5133, + "step": 77950 + }, + { + "epoch": 70.56561085972851, + "grad_norm": 3.1947343349456787, + "learning_rate": 3.628789847032016e-05, + "loss": 0.366, + "step": 77975 + }, + { + "epoch": 70.58823529411765, + "grad_norm": 2.7243242263793945, + "learning_rate": 3.623718002409739e-05, + "loss": 0.3192, + "step": 78000 + }, + { + "epoch": 70.61085972850678, + "grad_norm": 1.0801172256469727, + "learning_rate": 3.61864857539349e-05, + "loss": 0.3447, + "step": 78025 + }, + { + "epoch": 70.63348416289593, + "grad_norm": 2.5636961460113525, + "learning_rate": 3.613581569145025e-05, + "loss": 0.338, + "step": 78050 + }, + { + "epoch": 70.65610859728507, + "grad_norm": 1.186858892440796, + "learning_rate": 3.608516986824601e-05, + "loss": 0.3571, + "step": 78075 + }, + { + "epoch": 70.67873303167421, + "grad_norm": 3.1972029209136963, + "learning_rate": 3.603454831590952e-05, + "loss": 0.3363, + "step": 78100 + }, + { + "epoch": 70.70135746606334, + "grad_norm": 2.716545343399048, + "learning_rate": 3.5983951066013065e-05, + "loss": 
0.383, + "step": 78125 + }, + { + "epoch": 70.72398190045249, + "grad_norm": 2.7895750999450684, + "learning_rate": 3.5933378150113764e-05, + "loss": 0.3357, + "step": 78150 + }, + { + "epoch": 70.74660633484163, + "grad_norm": 2.2795443534851074, + "learning_rate": 3.588282959975348e-05, + "loss": 0.3294, + "step": 78175 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 3.1738946437835693, + "learning_rate": 3.583230544645902e-05, + "loss": 0.3669, + "step": 78200 + }, + { + "epoch": 70.7918552036199, + "grad_norm": 1.987891435623169, + "learning_rate": 3.578180572174181e-05, + "loss": 0.3827, + "step": 78225 + }, + { + "epoch": 70.81447963800905, + "grad_norm": 2.462768793106079, + "learning_rate": 3.573133045709819e-05, + "loss": 0.4561, + "step": 78250 + }, + { + "epoch": 70.83710407239819, + "grad_norm": 2.8346848487854004, + "learning_rate": 3.568087968400913e-05, + "loss": 0.3556, + "step": 78275 + }, + { + "epoch": 70.85972850678733, + "grad_norm": 2.2082462310791016, + "learning_rate": 3.563045343394042e-05, + "loss": 0.37, + "step": 78300 + }, + { + "epoch": 70.88235294117646, + "grad_norm": 2.6408658027648926, + "learning_rate": 3.558005173834245e-05, + "loss": 0.3757, + "step": 78325 + }, + { + "epoch": 70.90497737556561, + "grad_norm": 3.0608766078948975, + "learning_rate": 3.552967462865042e-05, + "loss": 0.3892, + "step": 78350 + }, + { + "epoch": 70.92760180995475, + "grad_norm": 2.8581387996673584, + "learning_rate": 3.547932213628407e-05, + "loss": 0.392, + "step": 78375 + }, + { + "epoch": 70.9502262443439, + "grad_norm": 1.155606746673584, + "learning_rate": 3.5428994292647884e-05, + "loss": 0.4871, + "step": 78400 + }, + { + "epoch": 70.97285067873302, + "grad_norm": 2.484496593475342, + "learning_rate": 3.537869112913091e-05, + "loss": 0.3925, + "step": 78425 + }, + { + "epoch": 70.99547511312217, + "grad_norm": 2.2866179943084717, + "learning_rate": 3.532841267710686e-05, + "loss": 0.4169, + "step": 78450 + }, + { + "epoch": 
71.01809954751131, + "grad_norm": 2.280975341796875, + "learning_rate": 3.527815896793396e-05, + "loss": 0.443, + "step": 78475 + }, + { + "epoch": 71.04072398190046, + "grad_norm": 4.163336277008057, + "learning_rate": 3.5227930032955095e-05, + "loss": 0.3605, + "step": 78500 + }, + { + "epoch": 71.0633484162896, + "grad_norm": 2.618426561355591, + "learning_rate": 3.5177725903497595e-05, + "loss": 0.3193, + "step": 78525 + }, + { + "epoch": 71.08597285067873, + "grad_norm": 2.807340621948242, + "learning_rate": 3.512754661087343e-05, + "loss": 0.2815, + "step": 78550 + }, + { + "epoch": 71.10859728506787, + "grad_norm": 2.9943177700042725, + "learning_rate": 3.507739218637896e-05, + "loss": 0.4231, + "step": 78575 + }, + { + "epoch": 71.13122171945702, + "grad_norm": 3.1198978424072266, + "learning_rate": 3.5027262661295155e-05, + "loss": 0.3435, + "step": 78600 + }, + { + "epoch": 71.15384615384616, + "grad_norm": 3.141416311264038, + "learning_rate": 3.497715806688736e-05, + "loss": 0.3491, + "step": 78625 + }, + { + "epoch": 71.17647058823529, + "grad_norm": 2.8542861938476562, + "learning_rate": 3.492707843440544e-05, + "loss": 0.3876, + "step": 78650 + }, + { + "epoch": 71.19909502262443, + "grad_norm": 1.997887134552002, + "learning_rate": 3.487702379508362e-05, + "loss": 0.3316, + "step": 78675 + }, + { + "epoch": 71.22171945701358, + "grad_norm": 2.8526930809020996, + "learning_rate": 3.482699418014063e-05, + "loss": 0.3033, + "step": 78700 + }, + { + "epoch": 71.24434389140272, + "grad_norm": 1.8499417304992676, + "learning_rate": 3.477698962077949e-05, + "loss": 0.3892, + "step": 78725 + }, + { + "epoch": 71.26696832579185, + "grad_norm": 1.9071060419082642, + "learning_rate": 3.4727010148187684e-05, + "loss": 0.3477, + "step": 78750 + }, + { + "epoch": 71.289592760181, + "grad_norm": 1.4768844842910767, + "learning_rate": 3.467705579353696e-05, + "loss": 0.3652, + "step": 78775 + }, + { + "epoch": 71.31221719457014, + "grad_norm": 3.2199654579162598, + 
"learning_rate": 3.4627126587983516e-05, + "loss": 0.3227, + "step": 78800 + }, + { + "epoch": 71.33484162895928, + "grad_norm": 2.7382895946502686, + "learning_rate": 3.457722256266772e-05, + "loss": 0.3517, + "step": 78825 + }, + { + "epoch": 71.35746606334841, + "grad_norm": 2.2621893882751465, + "learning_rate": 3.452734374871438e-05, + "loss": 0.3345, + "step": 78850 + }, + { + "epoch": 71.38009049773756, + "grad_norm": 3.079010248184204, + "learning_rate": 3.4477490177232465e-05, + "loss": 0.3239, + "step": 78875 + }, + { + "epoch": 71.4027149321267, + "grad_norm": 2.129749298095703, + "learning_rate": 3.44276618793153e-05, + "loss": 0.3056, + "step": 78900 + }, + { + "epoch": 71.42533936651584, + "grad_norm": 2.247183084487915, + "learning_rate": 3.437785888604032e-05, + "loss": 0.2968, + "step": 78925 + }, + { + "epoch": 71.44796380090497, + "grad_norm": 2.2137086391448975, + "learning_rate": 3.432808122846933e-05, + "loss": 0.4876, + "step": 78950 + }, + { + "epoch": 71.47058823529412, + "grad_norm": 2.3155691623687744, + "learning_rate": 3.42783289376482e-05, + "loss": 0.4405, + "step": 78975 + }, + { + "epoch": 71.49321266968326, + "grad_norm": 3.00319766998291, + "learning_rate": 3.4228602044607084e-05, + "loss": 0.3387, + "step": 79000 + }, + { + "epoch": 71.5158371040724, + "grad_norm": 2.0988829135894775, + "learning_rate": 3.4178900580360195e-05, + "loss": 0.2847, + "step": 79025 + }, + { + "epoch": 71.53846153846153, + "grad_norm": 3.216895580291748, + "learning_rate": 3.4129224575905986e-05, + "loss": 0.3739, + "step": 79050 + }, + { + "epoch": 71.56108597285068, + "grad_norm": 3.7864925861358643, + "learning_rate": 3.407957406222693e-05, + "loss": 0.3226, + "step": 79075 + }, + { + "epoch": 71.58371040723982, + "grad_norm": 2.259701728820801, + "learning_rate": 3.402994907028971e-05, + "loss": 0.4087, + "step": 79100 + }, + { + "epoch": 71.60633484162896, + "grad_norm": 3.4178225994110107, + "learning_rate": 3.398034963104499e-05, + "loss": 
0.4035, + "step": 79125 + }, + { + "epoch": 71.6289592760181, + "grad_norm": 1.78024423122406, + "learning_rate": 3.3930775775427596e-05, + "loss": 0.3156, + "step": 79150 + }, + { + "epoch": 71.65158371040724, + "grad_norm": 2.06205153465271, + "learning_rate": 3.3881227534356275e-05, + "loss": 0.3569, + "step": 79175 + }, + { + "epoch": 71.67420814479638, + "grad_norm": 2.405913829803467, + "learning_rate": 3.3831704938733914e-05, + "loss": 0.3027, + "step": 79200 + }, + { + "epoch": 71.69683257918552, + "grad_norm": 3.71380877494812, + "learning_rate": 3.3782208019447373e-05, + "loss": 0.3924, + "step": 79225 + }, + { + "epoch": 71.71945701357465, + "grad_norm": 2.3701257705688477, + "learning_rate": 3.373273680736743e-05, + "loss": 0.3816, + "step": 79250 + }, + { + "epoch": 71.7420814479638, + "grad_norm": 2.361420154571533, + "learning_rate": 3.368329133334895e-05, + "loss": 0.3278, + "step": 79275 + }, + { + "epoch": 71.76470588235294, + "grad_norm": 2.993346691131592, + "learning_rate": 3.363387162823061e-05, + "loss": 0.3009, + "step": 79300 + }, + { + "epoch": 71.78733031674209, + "grad_norm": 2.089348554611206, + "learning_rate": 3.3584477722835156e-05, + "loss": 0.3819, + "step": 79325 + }, + { + "epoch": 71.80995475113122, + "grad_norm": 2.9268131256103516, + "learning_rate": 3.35351096479691e-05, + "loss": 0.421, + "step": 79350 + }, + { + "epoch": 71.83257918552036, + "grad_norm": 2.4692280292510986, + "learning_rate": 3.3485767434422965e-05, + "loss": 0.4297, + "step": 79375 + }, + { + "epoch": 71.8552036199095, + "grad_norm": 3.26350998878479, + "learning_rate": 3.3436451112971064e-05, + "loss": 0.3359, + "step": 79400 + }, + { + "epoch": 71.87782805429865, + "grad_norm": 2.73140811920166, + "learning_rate": 3.3387160714371625e-05, + "loss": 0.3542, + "step": 79425 + }, + { + "epoch": 71.90045248868778, + "grad_norm": 2.5850207805633545, + "learning_rate": 3.333789626936663e-05, + "loss": 0.3099, + "step": 79450 + }, + { + "epoch": 
71.92307692307692, + "grad_norm": 2.3376190662384033, + "learning_rate": 3.3288657808681966e-05, + "loss": 0.3096, + "step": 79475 + }, + { + "epoch": 71.94570135746606, + "grad_norm": 5.441970348358154, + "learning_rate": 3.3239445363027224e-05, + "loss": 0.3586, + "step": 79500 + }, + { + "epoch": 71.96832579185521, + "grad_norm": 3.9859695434570312, + "learning_rate": 3.319025896309585e-05, + "loss": 0.3671, + "step": 79525 + }, + { + "epoch": 71.99095022624434, + "grad_norm": 2.742628812789917, + "learning_rate": 3.314109863956497e-05, + "loss": 0.3583, + "step": 79550 + }, + { + "epoch": 72.01357466063348, + "grad_norm": 3.0470237731933594, + "learning_rate": 3.309196442309552e-05, + "loss": 0.3518, + "step": 79575 + }, + { + "epoch": 72.03619909502262, + "grad_norm": 3.3255622386932373, + "learning_rate": 3.304285634433209e-05, + "loss": 0.3202, + "step": 79600 + }, + { + "epoch": 72.05882352941177, + "grad_norm": 2.460353136062622, + "learning_rate": 3.299377443390302e-05, + "loss": 0.3318, + "step": 79625 + }, + { + "epoch": 72.08144796380091, + "grad_norm": 3.0762221813201904, + "learning_rate": 3.2944718722420265e-05, + "loss": 0.3052, + "step": 79650 + }, + { + "epoch": 72.10407239819004, + "grad_norm": 2.307109832763672, + "learning_rate": 3.289568924047954e-05, + "loss": 0.3359, + "step": 79675 + }, + { + "epoch": 72.12669683257919, + "grad_norm": 1.8705261945724487, + "learning_rate": 3.284668601866006e-05, + "loss": 0.3049, + "step": 79700 + }, + { + "epoch": 72.14932126696833, + "grad_norm": 3.070863962173462, + "learning_rate": 3.2797709087524827e-05, + "loss": 0.2867, + "step": 79725 + }, + { + "epoch": 72.17194570135747, + "grad_norm": 3.732760190963745, + "learning_rate": 3.274875847762028e-05, + "loss": 0.374, + "step": 79750 + }, + { + "epoch": 72.1945701357466, + "grad_norm": 3.1433746814727783, + "learning_rate": 3.26998342194766e-05, + "loss": 0.2753, + "step": 79775 + }, + { + "epoch": 72.21719457013575, + "grad_norm": 1.5535398721694946, 
+ "learning_rate": 3.2650936343607387e-05, + "loss": 0.3896, + "step": 79800 + }, + { + "epoch": 72.23981900452489, + "grad_norm": 2.2897911071777344, + "learning_rate": 3.2602064880509904e-05, + "loss": 0.3285, + "step": 79825 + }, + { + "epoch": 72.26244343891403, + "grad_norm": 2.7750487327575684, + "learning_rate": 3.2553219860664856e-05, + "loss": 0.3311, + "step": 79850 + }, + { + "epoch": 72.28506787330316, + "grad_norm": 3.684419631958008, + "learning_rate": 3.2504401314536526e-05, + "loss": 0.3472, + "step": 79875 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 2.5782241821289062, + "learning_rate": 3.2455609272572606e-05, + "loss": 0.3646, + "step": 79900 + }, + { + "epoch": 72.33031674208145, + "grad_norm": 2.6745152473449707, + "learning_rate": 3.240684376520435e-05, + "loss": 0.3156, + "step": 79925 + }, + { + "epoch": 72.3529411764706, + "grad_norm": 1.8315614461898804, + "learning_rate": 3.2358104822846364e-05, + "loss": 0.3267, + "step": 79950 + }, + { + "epoch": 72.37556561085972, + "grad_norm": 2.0482234954833984, + "learning_rate": 3.2309392475896785e-05, + "loss": 0.3241, + "step": 79975 + }, + { + "epoch": 72.39819004524887, + "grad_norm": 2.1007161140441895, + "learning_rate": 3.226070675473707e-05, + "loss": 0.2776, + "step": 80000 + }, + { + "epoch": 72.42081447963801, + "grad_norm": 0.8560355305671692, + "learning_rate": 3.221204768973215e-05, + "loss": 0.3009, + "step": 80025 + }, + { + "epoch": 72.44343891402715, + "grad_norm": 1.1884980201721191, + "learning_rate": 3.2163415311230254e-05, + "loss": 0.3478, + "step": 80050 + }, + { + "epoch": 72.46606334841628, + "grad_norm": 2.090597152709961, + "learning_rate": 3.211480964956306e-05, + "loss": 0.3158, + "step": 80075 + }, + { + "epoch": 72.48868778280543, + "grad_norm": 3.4095449447631836, + "learning_rate": 3.206623073504547e-05, + "loss": 0.3402, + "step": 80100 + }, + { + "epoch": 72.51131221719457, + "grad_norm": 2.9183309078216553, + "learning_rate": 3.201767859797582e-05, + 
"loss": 0.3328, + "step": 80125 + }, + { + "epoch": 72.53393665158372, + "grad_norm": 2.8395659923553467, + "learning_rate": 3.1971093766721136e-05, + "loss": 0.4211, + "step": 80150 + }, + { + "epoch": 72.55656108597285, + "grad_norm": 3.2014007568359375, + "learning_rate": 3.1922594201274654e-05, + "loss": 0.2866, + "step": 80175 + }, + { + "epoch": 72.57918552036199, + "grad_norm": 2.333317995071411, + "learning_rate": 3.187412150286107e-05, + "loss": 0.2989, + "step": 80200 + }, + { + "epoch": 72.60180995475113, + "grad_norm": 2.2161149978637695, + "learning_rate": 3.182567570171236e-05, + "loss": 0.3393, + "step": 80225 + }, + { + "epoch": 72.62443438914028, + "grad_norm": 2.343919277191162, + "learning_rate": 3.177725682804383e-05, + "loss": 0.4342, + "step": 80250 + }, + { + "epoch": 72.6470588235294, + "grad_norm": 4.018833637237549, + "learning_rate": 3.172886491205391e-05, + "loss": 0.3837, + "step": 80275 + }, + { + "epoch": 72.66968325791855, + "grad_norm": 4.137818813323975, + "learning_rate": 3.1680499983924286e-05, + "loss": 0.3435, + "step": 80300 + }, + { + "epoch": 72.6923076923077, + "grad_norm": 3.6315526962280273, + "learning_rate": 3.163216207381973e-05, + "loss": 0.4356, + "step": 80325 + }, + { + "epoch": 72.71493212669684, + "grad_norm": 3.445065498352051, + "learning_rate": 3.158385121188826e-05, + "loss": 0.4355, + "step": 80350 + }, + { + "epoch": 72.73755656108597, + "grad_norm": 1.8939542770385742, + "learning_rate": 3.1535567428260915e-05, + "loss": 0.3259, + "step": 80375 + }, + { + "epoch": 72.76018099547511, + "grad_norm": 3.711698532104492, + "learning_rate": 3.148731075305197e-05, + "loss": 0.3666, + "step": 80400 + }, + { + "epoch": 72.78280542986425, + "grad_norm": 2.810096502304077, + "learning_rate": 3.1439081216358666e-05, + "loss": 0.3143, + "step": 80425 + }, + { + "epoch": 72.8054298642534, + "grad_norm": 3.124650001525879, + "learning_rate": 3.1390878848261434e-05, + "loss": 0.3903, + "step": 80450 + }, + { + "epoch": 
72.82805429864253, + "grad_norm": 2.124650478363037, + "learning_rate": 3.1342703678823644e-05, + "loss": 0.4082, + "step": 80475 + }, + { + "epoch": 72.85067873303167, + "grad_norm": 1.5146489143371582, + "learning_rate": 3.1294555738091826e-05, + "loss": 0.2769, + "step": 80500 + }, + { + "epoch": 72.87330316742081, + "grad_norm": 2.1591391563415527, + "learning_rate": 3.124643505609541e-05, + "loss": 0.2875, + "step": 80525 + }, + { + "epoch": 72.89592760180996, + "grad_norm": 2.8991804122924805, + "learning_rate": 3.119834166284693e-05, + "loss": 0.305, + "step": 80550 + }, + { + "epoch": 72.91855203619909, + "grad_norm": 1.6438605785369873, + "learning_rate": 3.115027558834179e-05, + "loss": 0.3824, + "step": 80575 + }, + { + "epoch": 72.94117647058823, + "grad_norm": 3.8229880332946777, + "learning_rate": 3.1102236862558485e-05, + "loss": 0.3596, + "step": 80600 + }, + { + "epoch": 72.96380090497738, + "grad_norm": 2.8134958744049072, + "learning_rate": 3.105422551545832e-05, + "loss": 0.3102, + "step": 80625 + }, + { + "epoch": 72.98642533936652, + "grad_norm": 1.2967236042022705, + "learning_rate": 3.1006241576985644e-05, + "loss": 0.3728, + "step": 80650 + }, + { + "epoch": 73.00904977375566, + "grad_norm": 1.0272668600082397, + "learning_rate": 3.09582850770676e-05, + "loss": 0.2807, + "step": 80675 + }, + { + "epoch": 73.03167420814479, + "grad_norm": 2.2608537673950195, + "learning_rate": 3.0910356045614324e-05, + "loss": 0.4159, + "step": 80700 + }, + { + "epoch": 73.05429864253394, + "grad_norm": 2.204169273376465, + "learning_rate": 3.0862454512518724e-05, + "loss": 0.3911, + "step": 80725 + }, + { + "epoch": 73.07692307692308, + "grad_norm": 3.5292227268218994, + "learning_rate": 3.081458050765665e-05, + "loss": 0.2254, + "step": 80750 + }, + { + "epoch": 73.09954751131222, + "grad_norm": 2.4946675300598145, + "learning_rate": 3.076673406088667e-05, + "loss": 0.2821, + "step": 80775 + }, + { + "epoch": 73.12217194570135, + "grad_norm": 
3.0582332611083984, + "learning_rate": 3.07189152020503e-05, + "loss": 0.3263, + "step": 80800 + }, + { + "epoch": 73.1447963800905, + "grad_norm": 2.60414981842041, + "learning_rate": 3.067112396097173e-05, + "loss": 0.3567, + "step": 80825 + }, + { + "epoch": 73.16742081447964, + "grad_norm": 3.260399103164673, + "learning_rate": 3.062336036745801e-05, + "loss": 0.3123, + "step": 80850 + }, + { + "epoch": 73.19004524886878, + "grad_norm": 3.269760847091675, + "learning_rate": 3.057562445129886e-05, + "loss": 0.2551, + "step": 80875 + }, + { + "epoch": 73.21266968325791, + "grad_norm": 1.9295631647109985, + "learning_rate": 3.0527916242266825e-05, + "loss": 0.374, + "step": 80900 + }, + { + "epoch": 73.23529411764706, + "grad_norm": 1.468007206916809, + "learning_rate": 3.048023577011716e-05, + "loss": 0.283, + "step": 80925 + }, + { + "epoch": 73.2579185520362, + "grad_norm": 2.1063339710235596, + "learning_rate": 3.0432583064587727e-05, + "loss": 0.304, + "step": 80950 + }, + { + "epoch": 73.28054298642535, + "grad_norm": 2.960327386856079, + "learning_rate": 3.03849581553992e-05, + "loss": 0.3517, + "step": 80975 + }, + { + "epoch": 73.30316742081448, + "grad_norm": 2.941462755203247, + "learning_rate": 3.0337361072254802e-05, + "loss": 0.3285, + "step": 81000 + }, + { + "epoch": 73.32579185520362, + "grad_norm": 2.0421576499938965, + "learning_rate": 3.02897918448405e-05, + "loss": 0.2806, + "step": 81025 + }, + { + "epoch": 73.34841628959276, + "grad_norm": 4.625753879547119, + "learning_rate": 3.0242250502824785e-05, + "loss": 0.2991, + "step": 81050 + }, + { + "epoch": 73.3710407239819, + "grad_norm": 3.131014108657837, + "learning_rate": 3.019473707585887e-05, + "loss": 0.3234, + "step": 81075 + }, + { + "epoch": 73.39366515837104, + "grad_norm": 3.3441922664642334, + "learning_rate": 3.0147251593576445e-05, + "loss": 0.3555, + "step": 81100 + }, + { + "epoch": 73.41628959276018, + "grad_norm": 3.3957202434539795, + "learning_rate": 3.009979408559387e-05, 
+ "loss": 0.3075, + "step": 81125 + }, + { + "epoch": 73.43891402714932, + "grad_norm": 2.288896083831787, + "learning_rate": 3.005236458150997e-05, + "loss": 0.4573, + "step": 81150 + }, + { + "epoch": 73.46153846153847, + "grad_norm": 2.7308273315429688, + "learning_rate": 3.0004963110906195e-05, + "loss": 0.3272, + "step": 81175 + }, + { + "epoch": 73.4841628959276, + "grad_norm": 2.2193171977996826, + "learning_rate": 2.9957589703346404e-05, + "loss": 0.4009, + "step": 81200 + }, + { + "epoch": 73.50678733031674, + "grad_norm": 1.5203183889389038, + "learning_rate": 2.991024438837708e-05, + "loss": 0.2621, + "step": 81225 + }, + { + "epoch": 73.52941176470588, + "grad_norm": 2.995479106903076, + "learning_rate": 2.986292719552705e-05, + "loss": 0.3055, + "step": 81250 + }, + { + "epoch": 73.55203619909503, + "grad_norm": 2.3604586124420166, + "learning_rate": 2.9815638154307726e-05, + "loss": 0.2909, + "step": 81275 + }, + { + "epoch": 73.57466063348416, + "grad_norm": 3.2187516689300537, + "learning_rate": 2.976837729421285e-05, + "loss": 0.3055, + "step": 81300 + }, + { + "epoch": 73.5972850678733, + "grad_norm": 3.0556023120880127, + "learning_rate": 2.9721144644718697e-05, + "loss": 0.3756, + "step": 81325 + }, + { + "epoch": 73.61990950226244, + "grad_norm": 1.973311424255371, + "learning_rate": 2.9673940235283835e-05, + "loss": 0.2639, + "step": 81350 + }, + { + "epoch": 73.64253393665159, + "grad_norm": 2.9156811237335205, + "learning_rate": 2.962676409534933e-05, + "loss": 0.2938, + "step": 81375 + }, + { + "epoch": 73.66515837104072, + "grad_norm": 2.5239579677581787, + "learning_rate": 2.9579616254338513e-05, + "loss": 0.3068, + "step": 81400 + }, + { + "epoch": 73.68778280542986, + "grad_norm": 2.642988681793213, + "learning_rate": 2.9532496741657164e-05, + "loss": 0.2811, + "step": 81425 + }, + { + "epoch": 73.710407239819, + "grad_norm": 3.004598379135132, + "learning_rate": 2.9485405586693296e-05, + "loss": 0.3466, + "step": 81450 + }, + { + 
"epoch": 73.73303167420815, + "grad_norm": 2.9902164936065674, + "learning_rate": 2.943834281881733e-05, + "loss": 0.3031, + "step": 81475 + }, + { + "epoch": 73.75565610859728, + "grad_norm": 2.555826187133789, + "learning_rate": 2.9391308467381894e-05, + "loss": 0.3415, + "step": 81500 + }, + { + "epoch": 73.77828054298642, + "grad_norm": 1.7891961336135864, + "learning_rate": 2.934430256172199e-05, + "loss": 0.3319, + "step": 81525 + }, + { + "epoch": 73.80090497737557, + "grad_norm": 2.137577772140503, + "learning_rate": 2.9297325131154764e-05, + "loss": 0.2743, + "step": 81550 + }, + { + "epoch": 73.82352941176471, + "grad_norm": 1.9066287279129028, + "learning_rate": 2.9250376204979718e-05, + "loss": 0.3188, + "step": 81575 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 2.7708871364593506, + "learning_rate": 2.9203455812478468e-05, + "loss": 0.4532, + "step": 81600 + }, + { + "epoch": 73.86877828054298, + "grad_norm": 1.9949368238449097, + "learning_rate": 2.9156563982914942e-05, + "loss": 0.3868, + "step": 81625 + }, + { + "epoch": 73.89140271493213, + "grad_norm": 3.3274765014648438, + "learning_rate": 2.9109700745535148e-05, + "loss": 0.27, + "step": 81650 + }, + { + "epoch": 73.91402714932127, + "grad_norm": 1.5537223815917969, + "learning_rate": 2.9062866129567357e-05, + "loss": 0.3259, + "step": 81675 + }, + { + "epoch": 73.9366515837104, + "grad_norm": 2.4978432655334473, + "learning_rate": 2.9016060164221903e-05, + "loss": 0.3164, + "step": 81700 + }, + { + "epoch": 73.95927601809954, + "grad_norm": 3.590334177017212, + "learning_rate": 2.8969282878691327e-05, + "loss": 0.311, + "step": 81725 + }, + { + "epoch": 73.98190045248869, + "grad_norm": 2.6827969551086426, + "learning_rate": 2.8922534302150214e-05, + "loss": 0.4615, + "step": 81750 + }, + { + "epoch": 74.00452488687783, + "grad_norm": 4.458113193511963, + "learning_rate": 2.8875814463755313e-05, + "loss": 0.3725, + "step": 81775 + }, + { + "epoch": 74.02714932126698, + "grad_norm": 
3.4580414295196533, + "learning_rate": 2.8829123392645366e-05, + "loss": 0.348, + "step": 81800 + }, + { + "epoch": 74.0497737556561, + "grad_norm": 2.76762056350708, + "learning_rate": 2.878246111794128e-05, + "loss": 0.233, + "step": 81825 + }, + { + "epoch": 74.07239819004525, + "grad_norm": 2.417442798614502, + "learning_rate": 2.873582766874587e-05, + "loss": 0.2785, + "step": 81850 + }, + { + "epoch": 74.09502262443439, + "grad_norm": 2.8274288177490234, + "learning_rate": 2.868922307414412e-05, + "loss": 0.2651, + "step": 81875 + }, + { + "epoch": 74.11764705882354, + "grad_norm": 3.2265682220458984, + "learning_rate": 2.8642647363202874e-05, + "loss": 0.2956, + "step": 81900 + }, + { + "epoch": 74.14027149321267, + "grad_norm": 2.7552247047424316, + "learning_rate": 2.8596100564971093e-05, + "loss": 0.2489, + "step": 81925 + }, + { + "epoch": 74.16289592760181, + "grad_norm": 1.803971290588379, + "learning_rate": 2.854958270847958e-05, + "loss": 0.2792, + "step": 81950 + }, + { + "epoch": 74.18552036199095, + "grad_norm": 1.4962939023971558, + "learning_rate": 2.8503093822741187e-05, + "loss": 0.3283, + "step": 81975 + }, + { + "epoch": 74.2081447963801, + "grad_norm": 2.628507137298584, + "learning_rate": 2.845663393675067e-05, + "loss": 0.2768, + "step": 82000 + }, + { + "epoch": 74.23076923076923, + "grad_norm": 3.1974716186523438, + "learning_rate": 2.8410203079484646e-05, + "loss": 0.3006, + "step": 82025 + }, + { + "epoch": 74.25339366515837, + "grad_norm": 2.3015873432159424, + "learning_rate": 2.8363801279901723e-05, + "loss": 0.2932, + "step": 82050 + }, + { + "epoch": 74.27601809954751, + "grad_norm": 2.5883677005767822, + "learning_rate": 2.8317428566942273e-05, + "loss": 0.3571, + "step": 82075 + }, + { + "epoch": 74.29864253393666, + "grad_norm": 2.075589179992676, + "learning_rate": 2.8271084969528644e-05, + "loss": 0.3607, + "step": 82100 + }, + { + "epoch": 74.32126696832579, + "grad_norm": 2.1009368896484375, + "learning_rate": 
2.82247705165649e-05, + "loss": 0.2569, + "step": 82125 + }, + { + "epoch": 74.34389140271493, + "grad_norm": 5.540664196014404, + "learning_rate": 2.8178485236937067e-05, + "loss": 0.3984, + "step": 82150 + }, + { + "epoch": 74.36651583710407, + "grad_norm": 1.998557686805725, + "learning_rate": 2.8132229159512835e-05, + "loss": 0.3504, + "step": 82175 + }, + { + "epoch": 74.38914027149322, + "grad_norm": 2.714663028717041, + "learning_rate": 2.8086002313141813e-05, + "loss": 0.2711, + "step": 82200 + }, + { + "epoch": 74.41176470588235, + "grad_norm": 3.093409299850464, + "learning_rate": 2.8039804726655254e-05, + "loss": 0.2628, + "step": 82225 + }, + { + "epoch": 74.43438914027149, + "grad_norm": 1.7061655521392822, + "learning_rate": 2.7993636428866274e-05, + "loss": 0.2953, + "step": 82250 + }, + { + "epoch": 74.45701357466064, + "grad_norm": 2.4149158000946045, + "learning_rate": 2.7947497448569626e-05, + "loss": 0.3035, + "step": 82275 + }, + { + "epoch": 74.47963800904978, + "grad_norm": 2.3477773666381836, + "learning_rate": 2.7901387814541866e-05, + "loss": 0.3547, + "step": 82300 + }, + { + "epoch": 74.50226244343891, + "grad_norm": 1.7514317035675049, + "learning_rate": 2.7855307555541154e-05, + "loss": 0.2606, + "step": 82325 + }, + { + "epoch": 74.52488687782805, + "grad_norm": 2.655534029006958, + "learning_rate": 2.7809256700307435e-05, + "loss": 0.3602, + "step": 82350 + }, + { + "epoch": 74.5475113122172, + "grad_norm": 2.45708966255188, + "learning_rate": 2.776323527756221e-05, + "loss": 0.2686, + "step": 82375 + }, + { + "epoch": 74.57013574660634, + "grad_norm": 2.429360866546631, + "learning_rate": 2.7717243316008704e-05, + "loss": 0.288, + "step": 82400 + }, + { + "epoch": 74.59276018099547, + "grad_norm": 2.4920473098754883, + "learning_rate": 2.7671280844331694e-05, + "loss": 0.3735, + "step": 82425 + }, + { + "epoch": 74.61538461538461, + "grad_norm": 2.358638286590576, + "learning_rate": 2.762534789119767e-05, + "loss": 0.2888, + "step": 
82450 + }, + { + "epoch": 74.63800904977376, + "grad_norm": 2.5498902797698975, + "learning_rate": 2.757944448525458e-05, + "loss": 0.3056, + "step": 82475 + }, + { + "epoch": 74.6606334841629, + "grad_norm": 2.0660364627838135, + "learning_rate": 2.7533570655132064e-05, + "loss": 0.384, + "step": 82500 + }, + { + "epoch": 74.68325791855203, + "grad_norm": 5.2555365562438965, + "learning_rate": 2.7487726429441214e-05, + "loss": 0.4108, + "step": 82525 + }, + { + "epoch": 74.70588235294117, + "grad_norm": 0.9547332525253296, + "learning_rate": 2.7441911836774757e-05, + "loss": 0.3115, + "step": 82550 + }, + { + "epoch": 74.72850678733032, + "grad_norm": 2.4335803985595703, + "learning_rate": 2.7396126905706836e-05, + "loss": 0.364, + "step": 82575 + }, + { + "epoch": 74.75113122171946, + "grad_norm": 2.516291618347168, + "learning_rate": 2.73503716647932e-05, + "loss": 0.308, + "step": 82600 + }, + { + "epoch": 74.77375565610859, + "grad_norm": 2.8080978393554688, + "learning_rate": 2.730464614257096e-05, + "loss": 0.3661, + "step": 82625 + }, + { + "epoch": 74.79638009049773, + "grad_norm": 2.376248598098755, + "learning_rate": 2.725895036755883e-05, + "loss": 0.3609, + "step": 82650 + }, + { + "epoch": 74.81900452488688, + "grad_norm": 1.9304112195968628, + "learning_rate": 2.7213284368256836e-05, + "loss": 0.3785, + "step": 82675 + }, + { + "epoch": 74.84162895927602, + "grad_norm": 2.4170520305633545, + "learning_rate": 2.7167648173146557e-05, + "loss": 0.2514, + "step": 82700 + }, + { + "epoch": 74.86425339366515, + "grad_norm": 2.2855734825134277, + "learning_rate": 2.7122041810690847e-05, + "loss": 0.3127, + "step": 82725 + }, + { + "epoch": 74.8868778280543, + "grad_norm": 2.7707996368408203, + "learning_rate": 2.707646530933411e-05, + "loss": 0.3125, + "step": 82750 + }, + { + "epoch": 74.90950226244344, + "grad_norm": 2.6241166591644287, + "learning_rate": 2.7030918697501974e-05, + "loss": 0.2987, + "step": 82775 + }, + { + "epoch": 74.93212669683258, + 
"grad_norm": 2.104856014251709, + "learning_rate": 2.6985402003601557e-05, + "loss": 0.277, + "step": 82800 + }, + { + "epoch": 74.95475113122171, + "grad_norm": 2.8981499671936035, + "learning_rate": 2.6939915256021194e-05, + "loss": 0.3739, + "step": 82825 + }, + { + "epoch": 74.97737556561086, + "grad_norm": 3.1936309337615967, + "learning_rate": 2.689445848313067e-05, + "loss": 0.2896, + "step": 82850 + }, + { + "epoch": 75.0, + "grad_norm": 2.3453421592712402, + "learning_rate": 2.6849031713280924e-05, + "loss": 0.323, + "step": 82875 + }, + { + "epoch": 75.02262443438914, + "grad_norm": 2.4536938667297363, + "learning_rate": 2.6803634974804376e-05, + "loss": 0.254, + "step": 82900 + }, + { + "epoch": 75.04524886877829, + "grad_norm": 1.9860996007919312, + "learning_rate": 2.6758268296014522e-05, + "loss": 0.2524, + "step": 82925 + }, + { + "epoch": 75.06787330316742, + "grad_norm": 2.0787274837493896, + "learning_rate": 2.671293170520626e-05, + "loss": 0.2855, + "step": 82950 + }, + { + "epoch": 75.09049773755656, + "grad_norm": 2.0604352951049805, + "learning_rate": 2.6667625230655603e-05, + "loss": 0.2704, + "step": 82975 + }, + { + "epoch": 75.1131221719457, + "grad_norm": 1.309448480606079, + "learning_rate": 2.662234890061989e-05, + "loss": 0.3004, + "step": 83000 + }, + { + "epoch": 75.13574660633485, + "grad_norm": 2.537576675415039, + "learning_rate": 2.6577102743337536e-05, + "loss": 0.2986, + "step": 83025 + }, + { + "epoch": 75.15837104072398, + "grad_norm": 2.5910840034484863, + "learning_rate": 2.653188678702828e-05, + "loss": 0.2692, + "step": 83050 + }, + { + "epoch": 75.18099547511312, + "grad_norm": 1.1406221389770508, + "learning_rate": 2.648670105989289e-05, + "loss": 0.2639, + "step": 83075 + }, + { + "epoch": 75.20361990950227, + "grad_norm": 1.3294293880462646, + "learning_rate": 2.6441545590113394e-05, + "loss": 0.2789, + "step": 83100 + }, + { + "epoch": 75.22624434389141, + "grad_norm": 2.0877737998962402, + "learning_rate": 
2.639642040585285e-05, + "loss": 0.3101, + "step": 83125 + }, + { + "epoch": 75.24886877828054, + "grad_norm": 2.896243095397949, + "learning_rate": 2.6351325535255527e-05, + "loss": 0.268, + "step": 83150 + }, + { + "epoch": 75.27149321266968, + "grad_norm": 2.6649253368377686, + "learning_rate": 2.630626100644667e-05, + "loss": 0.2688, + "step": 83175 + }, + { + "epoch": 75.29411764705883, + "grad_norm": 3.275167226791382, + "learning_rate": 2.6261226847532727e-05, + "loss": 0.2727, + "step": 83200 + }, + { + "epoch": 75.31674208144797, + "grad_norm": 1.5215011835098267, + "learning_rate": 2.621622308660109e-05, + "loss": 0.3157, + "step": 83225 + }, + { + "epoch": 75.3393665158371, + "grad_norm": 3.882742166519165, + "learning_rate": 2.6171249751720292e-05, + "loss": 0.342, + "step": 83250 + }, + { + "epoch": 75.36199095022624, + "grad_norm": 3.296842575073242, + "learning_rate": 2.6126306870939785e-05, + "loss": 0.2939, + "step": 83275 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 2.2243354320526123, + "learning_rate": 2.6081394472290153e-05, + "loss": 0.2908, + "step": 83300 + }, + { + "epoch": 75.40723981900453, + "grad_norm": 2.889202356338501, + "learning_rate": 2.603651258378283e-05, + "loss": 0.3203, + "step": 83325 + }, + { + "epoch": 75.42986425339366, + "grad_norm": 1.743531346321106, + "learning_rate": 2.5991661233410348e-05, + "loss": 0.2665, + "step": 83350 + }, + { + "epoch": 75.4524886877828, + "grad_norm": 3.992488384246826, + "learning_rate": 2.5946840449146088e-05, + "loss": 0.3554, + "step": 83375 + }, + { + "epoch": 75.47511312217195, + "grad_norm": 2.4225494861602783, + "learning_rate": 2.5902050258944454e-05, + "loss": 0.2776, + "step": 83400 + }, + { + "epoch": 75.49773755656109, + "grad_norm": 1.7738288640975952, + "learning_rate": 2.585729069074069e-05, + "loss": 0.335, + "step": 83425 + }, + { + "epoch": 75.52036199095022, + "grad_norm": 1.690691351890564, + "learning_rate": 2.5812561772451023e-05, + "loss": 0.317, + "step": 
83450 + }, + { + "epoch": 75.54298642533936, + "grad_norm": 1.6452319622039795, + "learning_rate": 2.5767863531972483e-05, + "loss": 0.2665, + "step": 83475 + }, + { + "epoch": 75.56561085972851, + "grad_norm": 2.921520233154297, + "learning_rate": 2.5723195997183033e-05, + "loss": 0.4052, + "step": 83500 + }, + { + "epoch": 75.58823529411765, + "grad_norm": 1.555267095565796, + "learning_rate": 2.5678559195941424e-05, + "loss": 0.3262, + "step": 83525 + }, + { + "epoch": 75.61085972850678, + "grad_norm": 2.8181378841400146, + "learning_rate": 2.563395315608729e-05, + "loss": 0.2583, + "step": 83550 + }, + { + "epoch": 75.63348416289593, + "grad_norm": 3.9380154609680176, + "learning_rate": 2.5589377905441086e-05, + "loss": 0.3063, + "step": 83575 + }, + { + "epoch": 75.65610859728507, + "grad_norm": 2.493354320526123, + "learning_rate": 2.5544833471803992e-05, + "loss": 0.3014, + "step": 83600 + }, + { + "epoch": 75.67873303167421, + "grad_norm": 1.0901517868041992, + "learning_rate": 2.550031988295806e-05, + "loss": 0.2351, + "step": 83625 + }, + { + "epoch": 75.70135746606334, + "grad_norm": 2.2735798358917236, + "learning_rate": 2.5455837166666007e-05, + "loss": 0.2414, + "step": 83650 + }, + { + "epoch": 75.72398190045249, + "grad_norm": 1.5142090320587158, + "learning_rate": 2.54113853506714e-05, + "loss": 0.3092, + "step": 83675 + }, + { + "epoch": 75.74660633484163, + "grad_norm": 2.3500263690948486, + "learning_rate": 2.536696446269843e-05, + "loss": 0.2965, + "step": 83700 + }, + { + "epoch": 75.76923076923077, + "grad_norm": 1.6575660705566406, + "learning_rate": 2.53225745304521e-05, + "loss": 0.3118, + "step": 83725 + }, + { + "epoch": 75.7918552036199, + "grad_norm": 3.3095154762268066, + "learning_rate": 2.5278215581618005e-05, + "loss": 0.3143, + "step": 83750 + }, + { + "epoch": 75.81447963800905, + "grad_norm": 4.247889518737793, + "learning_rate": 2.5233887643862524e-05, + "loss": 0.3356, + "step": 83775 + }, + { + "epoch": 75.83710407239819, + 
"grad_norm": 2.5860421657562256, + "learning_rate": 2.5189590744832567e-05, + "loss": 0.3011, + "step": 83800 + }, + { + "epoch": 75.85972850678733, + "grad_norm": 2.3422882556915283, + "learning_rate": 2.5145324912155822e-05, + "loss": 0.2598, + "step": 83825 + }, + { + "epoch": 75.88235294117646, + "grad_norm": 1.9032371044158936, + "learning_rate": 2.5101090173440477e-05, + "loss": 0.2853, + "step": 83850 + }, + { + "epoch": 75.90497737556561, + "grad_norm": 1.396369457244873, + "learning_rate": 2.5056886556275436e-05, + "loss": 0.3136, + "step": 83875 + }, + { + "epoch": 75.92760180995475, + "grad_norm": 2.703479528427124, + "learning_rate": 2.5012714088230086e-05, + "loss": 0.4706, + "step": 83900 + }, + { + "epoch": 75.9502262443439, + "grad_norm": 4.66386604309082, + "learning_rate": 2.4968572796854495e-05, + "loss": 0.3674, + "step": 83925 + }, + { + "epoch": 75.97285067873302, + "grad_norm": 2.2697360515594482, + "learning_rate": 2.492446270967919e-05, + "loss": 0.3386, + "step": 83950 + }, + { + "epoch": 75.99547511312217, + "grad_norm": 2.354702949523926, + "learning_rate": 2.488214640844009e-05, + "loss": 0.329, + "step": 83975 + }, + { + "epoch": 76.01809954751131, + "grad_norm": 2.0319299697875977, + "learning_rate": 2.483809756128354e-05, + "loss": 0.3337, + "step": 84000 + }, + { + "epoch": 76.04072398190046, + "grad_norm": 1.9685543775558472, + "learning_rate": 2.4794079999703636e-05, + "loss": 0.2498, + "step": 84025 + }, + { + "epoch": 76.0633484162896, + "grad_norm": 1.9180629253387451, + "learning_rate": 2.4750093751153818e-05, + "loss": 0.2594, + "step": 84050 + }, + { + "epoch": 76.08597285067873, + "grad_norm": 1.8504961729049683, + "learning_rate": 2.4706138843067916e-05, + "loss": 0.2618, + "step": 84075 + }, + { + "epoch": 76.10859728506787, + "grad_norm": 1.5177520513534546, + "learning_rate": 2.4662215302860293e-05, + "loss": 0.2552, + "step": 84100 + }, + { + "epoch": 76.13122171945702, + "grad_norm": 1.9649053812026978, + 
"learning_rate": 2.4618323157925665e-05, + "loss": 0.2737, + "step": 84125 + }, + { + "epoch": 76.15384615384616, + "grad_norm": 2.5923967361450195, + "learning_rate": 2.4574462435639256e-05, + "loss": 0.2432, + "step": 84150 + }, + { + "epoch": 76.17647058823529, + "grad_norm": 2.4911556243896484, + "learning_rate": 2.4530633163356605e-05, + "loss": 0.2476, + "step": 84175 + }, + { + "epoch": 76.19909502262443, + "grad_norm": 1.116897463798523, + "learning_rate": 2.4486835368413724e-05, + "loss": 0.2449, + "step": 84200 + }, + { + "epoch": 76.22171945701358, + "grad_norm": 3.128192663192749, + "learning_rate": 2.4443069078126902e-05, + "loss": 0.3248, + "step": 84225 + }, + { + "epoch": 76.24434389140272, + "grad_norm": 3.1830790042877197, + "learning_rate": 2.439933431979286e-05, + "loss": 0.3011, + "step": 84250 + }, + { + "epoch": 76.26696832579185, + "grad_norm": 2.2238335609436035, + "learning_rate": 2.435563112068858e-05, + "loss": 0.3543, + "step": 84275 + }, + { + "epoch": 76.289592760181, + "grad_norm": 1.420111060142517, + "learning_rate": 2.4311959508071437e-05, + "loss": 0.3136, + "step": 84300 + }, + { + "epoch": 76.31221719457014, + "grad_norm": 2.6922364234924316, + "learning_rate": 2.4268319509179024e-05, + "loss": 0.2751, + "step": 84325 + }, + { + "epoch": 76.33484162895928, + "grad_norm": 2.10040283203125, + "learning_rate": 2.422471115122931e-05, + "loss": 0.2464, + "step": 84350 + }, + { + "epoch": 76.35746606334841, + "grad_norm": 1.4247398376464844, + "learning_rate": 2.418113446142042e-05, + "loss": 0.3139, + "step": 84375 + }, + { + "epoch": 76.38009049773756, + "grad_norm": 3.905673027038574, + "learning_rate": 2.4137589466930843e-05, + "loss": 0.3082, + "step": 84400 + }, + { + "epoch": 76.4027149321267, + "grad_norm": 2.093348264694214, + "learning_rate": 2.4094076194919193e-05, + "loss": 0.2518, + "step": 84425 + }, + { + "epoch": 76.42533936651584, + "grad_norm": 2.3145039081573486, + "learning_rate": 2.4050594672524394e-05, + "loss": 
0.3754, + "step": 84450 + }, + { + "epoch": 76.44796380090497, + "grad_norm": 3.8801562786102295, + "learning_rate": 2.4007144926865473e-05, + "loss": 0.2717, + "step": 84475 + }, + { + "epoch": 76.47058823529412, + "grad_norm": 2.3647115230560303, + "learning_rate": 2.3963726985041743e-05, + "loss": 0.3164, + "step": 84500 + }, + { + "epoch": 76.49321266968326, + "grad_norm": 2.593600273132324, + "learning_rate": 2.3920340874132575e-05, + "loss": 0.318, + "step": 84525 + }, + { + "epoch": 76.5158371040724, + "grad_norm": 6.139859199523926, + "learning_rate": 2.387698662119759e-05, + "loss": 0.3166, + "step": 84550 + }, + { + "epoch": 76.53846153846153, + "grad_norm": 2.8768985271453857, + "learning_rate": 2.383366425327644e-05, + "loss": 0.3057, + "step": 84575 + }, + { + "epoch": 76.56108597285068, + "grad_norm": 1.225097894668579, + "learning_rate": 2.3790373797388974e-05, + "loss": 0.2983, + "step": 84600 + }, + { + "epoch": 76.58371040723982, + "grad_norm": 4.126646518707275, + "learning_rate": 2.374711528053508e-05, + "loss": 0.3626, + "step": 84625 + }, + { + "epoch": 76.60633484162896, + "grad_norm": 2.515584707260132, + "learning_rate": 2.3703888729694766e-05, + "loss": 0.3151, + "step": 84650 + }, + { + "epoch": 76.6289592760181, + "grad_norm": 1.9274005889892578, + "learning_rate": 2.366069417182807e-05, + "loss": 0.3131, + "step": 84675 + }, + { + "epoch": 76.65158371040724, + "grad_norm": 2.3965847492218018, + "learning_rate": 2.3617531633875107e-05, + "loss": 0.283, + "step": 84700 + }, + { + "epoch": 76.67420814479638, + "grad_norm": 2.397613763809204, + "learning_rate": 2.357440114275598e-05, + "loss": 0.2835, + "step": 84725 + }, + { + "epoch": 76.69683257918552, + "grad_norm": 2.0233254432678223, + "learning_rate": 2.3531302725370872e-05, + "loss": 0.3035, + "step": 84750 + }, + { + "epoch": 76.71945701357465, + "grad_norm": 3.270297050476074, + "learning_rate": 2.3488236408599876e-05, + "loss": 0.2768, + "step": 84775 + }, + { + "epoch": 
76.7420814479638, + "grad_norm": 2.188873291015625, + "learning_rate": 2.3445202219303124e-05, + "loss": 0.3151, + "step": 84800 + }, + { + "epoch": 76.76470588235294, + "grad_norm": 1.799863338470459, + "learning_rate": 2.3402200184320726e-05, + "loss": 0.387, + "step": 84825 + }, + { + "epoch": 76.78733031674209, + "grad_norm": 1.9361035823822021, + "learning_rate": 2.3359230330472663e-05, + "loss": 0.272, + "step": 84850 + }, + { + "epoch": 76.80995475113122, + "grad_norm": 3.911672592163086, + "learning_rate": 2.3316292684558923e-05, + "loss": 0.3238, + "step": 84875 + }, + { + "epoch": 76.83257918552036, + "grad_norm": 2.8085107803344727, + "learning_rate": 2.3273387273359336e-05, + "loss": 0.3517, + "step": 84900 + }, + { + "epoch": 76.8552036199095, + "grad_norm": 2.42825984954834, + "learning_rate": 2.323051412363371e-05, + "loss": 0.2764, + "step": 84925 + }, + { + "epoch": 76.87782805429865, + "grad_norm": 3.0773284435272217, + "learning_rate": 2.3187673262121634e-05, + "loss": 0.2522, + "step": 84950 + }, + { + "epoch": 76.90045248868778, + "grad_norm": 2.8645083904266357, + "learning_rate": 2.3144864715542653e-05, + "loss": 0.2687, + "step": 84975 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 4.084969520568848, + "learning_rate": 2.3102088510596076e-05, + "loss": 0.2953, + "step": 85000 + }, + { + "epoch": 76.94570135746606, + "grad_norm": 1.894515872001648, + "learning_rate": 2.3059344673961123e-05, + "loss": 0.2792, + "step": 85025 + }, + { + "epoch": 76.96832579185521, + "grad_norm": 3.177055835723877, + "learning_rate": 2.3016633232296725e-05, + "loss": 0.2772, + "step": 85050 + }, + { + "epoch": 76.99095022624434, + "grad_norm": 1.561269760131836, + "learning_rate": 2.297395421224173e-05, + "loss": 0.2439, + "step": 85075 + }, + { + "epoch": 77.01357466063348, + "grad_norm": 1.7832810878753662, + "learning_rate": 2.2931307640414653e-05, + "loss": 0.2548, + "step": 85100 + }, + { + "epoch": 77.03619909502262, + "grad_norm": 
4.043969631195068, + "learning_rate": 2.2888693543413853e-05, + "loss": 0.3584, + "step": 85125 + }, + { + "epoch": 77.05882352941177, + "grad_norm": 2.9569408893585205, + "learning_rate": 2.284611194781736e-05, + "loss": 0.2941, + "step": 85150 + }, + { + "epoch": 77.08144796380091, + "grad_norm": 2.0861144065856934, + "learning_rate": 2.2803562880183027e-05, + "loss": 0.3211, + "step": 85175 + }, + { + "epoch": 77.10407239819004, + "grad_norm": 1.199874997138977, + "learning_rate": 2.2761046367048314e-05, + "loss": 0.2533, + "step": 85200 + }, + { + "epoch": 77.12669683257919, + "grad_norm": 2.356982469558716, + "learning_rate": 2.2718562434930475e-05, + "loss": 0.2732, + "step": 85225 + }, + { + "epoch": 77.14932126696833, + "grad_norm": 2.066962480545044, + "learning_rate": 2.2676111110326354e-05, + "loss": 0.2119, + "step": 85250 + }, + { + "epoch": 77.17194570135747, + "grad_norm": 3.157137155532837, + "learning_rate": 2.263369241971254e-05, + "loss": 0.2906, + "step": 85275 + }, + { + "epoch": 77.1945701357466, + "grad_norm": 1.8582885265350342, + "learning_rate": 2.25913063895452e-05, + "loss": 0.3255, + "step": 85300 + }, + { + "epoch": 77.21719457013575, + "grad_norm": 2.554067611694336, + "learning_rate": 2.25489530462602e-05, + "loss": 0.2571, + "step": 85325 + }, + { + "epoch": 77.23981900452489, + "grad_norm": 2.0254104137420654, + "learning_rate": 2.2506632416272932e-05, + "loss": 0.2846, + "step": 85350 + }, + { + "epoch": 77.26244343891403, + "grad_norm": 2.0758092403411865, + "learning_rate": 2.2464344525978463e-05, + "loss": 0.3366, + "step": 85375 + }, + { + "epoch": 77.28506787330316, + "grad_norm": 2.7301504611968994, + "learning_rate": 2.242208940175141e-05, + "loss": 0.2517, + "step": 85400 + }, + { + "epoch": 77.3076923076923, + "grad_norm": 0.9961033463478088, + "learning_rate": 2.2379867069945967e-05, + "loss": 0.3473, + "step": 85425 + }, + { + "epoch": 77.33031674208145, + "grad_norm": 1.8666002750396729, + "learning_rate": 
2.233767755689583e-05, + "loss": 0.2344, + "step": 85450 + }, + { + "epoch": 77.3529411764706, + "grad_norm": 2.4896492958068848, + "learning_rate": 2.229552088891431e-05, + "loss": 0.2992, + "step": 85475 + }, + { + "epoch": 77.37556561085972, + "grad_norm": 1.919020175933838, + "learning_rate": 2.225339709229412e-05, + "loss": 0.248, + "step": 85500 + }, + { + "epoch": 77.39819004524887, + "grad_norm": 1.4611742496490479, + "learning_rate": 2.2211306193307598e-05, + "loss": 0.2468, + "step": 85525 + }, + { + "epoch": 77.42081447963801, + "grad_norm": 2.4415318965911865, + "learning_rate": 2.2169248218206444e-05, + "loss": 0.2568, + "step": 85550 + }, + { + "epoch": 77.44343891402715, + "grad_norm": 2.6511993408203125, + "learning_rate": 2.2127223193221934e-05, + "loss": 0.2942, + "step": 85575 + }, + { + "epoch": 77.46606334841628, + "grad_norm": 1.9115158319473267, + "learning_rate": 2.2085231144564687e-05, + "loss": 0.2828, + "step": 85600 + }, + { + "epoch": 77.48868778280543, + "grad_norm": 1.9452518224716187, + "learning_rate": 2.2043272098424855e-05, + "loss": 0.2795, + "step": 85625 + }, + { + "epoch": 77.51131221719457, + "grad_norm": 1.4088445901870728, + "learning_rate": 2.200134608097192e-05, + "loss": 0.346, + "step": 85650 + }, + { + "epoch": 77.53393665158372, + "grad_norm": 2.5477261543273926, + "learning_rate": 2.1959453118354833e-05, + "loss": 0.254, + "step": 85675 + }, + { + "epoch": 77.55656108597285, + "grad_norm": 2.778261423110962, + "learning_rate": 2.1917593236701866e-05, + "loss": 0.2644, + "step": 85700 + }, + { + "epoch": 77.57918552036199, + "grad_norm": 2.218679189682007, + "learning_rate": 2.1875766462120734e-05, + "loss": 0.3112, + "step": 85725 + }, + { + "epoch": 77.60180995475113, + "grad_norm": 1.5461026430130005, + "learning_rate": 2.1833972820698417e-05, + "loss": 0.3407, + "step": 85750 + }, + { + "epoch": 77.62443438914028, + "grad_norm": 2.2722394466400146, + "learning_rate": 2.1792212338501316e-05, + "loss": 0.2668, + 
"step": 85775 + }, + { + "epoch": 77.6470588235294, + "grad_norm": 1.2408086061477661, + "learning_rate": 2.1750485041575064e-05, + "loss": 0.3041, + "step": 85800 + }, + { + "epoch": 77.66968325791855, + "grad_norm": 2.132417678833008, + "learning_rate": 2.1708790955944698e-05, + "loss": 0.2479, + "step": 85825 + }, + { + "epoch": 77.6923076923077, + "grad_norm": 1.9469751119613647, + "learning_rate": 2.166713010761442e-05, + "loss": 0.3126, + "step": 85850 + }, + { + "epoch": 77.71493212669684, + "grad_norm": 0.7266249060630798, + "learning_rate": 2.162550252256784e-05, + "loss": 0.2829, + "step": 85875 + }, + { + "epoch": 77.73755656108597, + "grad_norm": 2.63224196434021, + "learning_rate": 2.1583908226767675e-05, + "loss": 0.2298, + "step": 85900 + }, + { + "epoch": 77.76018099547511, + "grad_norm": 1.7881295680999756, + "learning_rate": 2.1542347246156015e-05, + "loss": 0.2689, + "step": 85925 + }, + { + "epoch": 77.78280542986425, + "grad_norm": 1.8670376539230347, + "learning_rate": 2.1500819606654065e-05, + "loss": 0.2701, + "step": 85950 + }, + { + "epoch": 77.8054298642534, + "grad_norm": 1.5502375364303589, + "learning_rate": 2.145932533416232e-05, + "loss": 0.2444, + "step": 85975 + }, + { + "epoch": 77.82805429864253, + "grad_norm": 4.497215747833252, + "learning_rate": 2.1417864454560386e-05, + "loss": 0.3827, + "step": 86000 + }, + { + "epoch": 77.85067873303167, + "grad_norm": 1.7891716957092285, + "learning_rate": 2.1376436993707127e-05, + "loss": 0.2445, + "step": 86025 + }, + { + "epoch": 77.87330316742081, + "grad_norm": 1.0871717929840088, + "learning_rate": 2.133504297744047e-05, + "loss": 0.2506, + "step": 86050 + }, + { + "epoch": 77.89592760180996, + "grad_norm": 2.207763433456421, + "learning_rate": 2.1293682431577578e-05, + "loss": 0.2627, + "step": 86075 + }, + { + "epoch": 77.91855203619909, + "grad_norm": 1.6225380897521973, + "learning_rate": 2.125235538191464e-05, + "loss": 0.2886, + "step": 86100 + }, + { + "epoch": 
77.94117647058823, + "grad_norm": 2.372548818588257, + "learning_rate": 2.1211061854227067e-05, + "loss": 0.269, + "step": 86125 + }, + { + "epoch": 77.96380090497738, + "grad_norm": 3.087933301925659, + "learning_rate": 2.1169801874269242e-05, + "loss": 0.3022, + "step": 86150 + }, + { + "epoch": 77.98642533936652, + "grad_norm": 0.7303369641304016, + "learning_rate": 2.112857546777473e-05, + "loss": 0.2829, + "step": 86175 + }, + { + "epoch": 78.00904977375566, + "grad_norm": 2.1290323734283447, + "learning_rate": 2.1087382660456077e-05, + "loss": 0.2893, + "step": 86200 + }, + { + "epoch": 78.03167420814479, + "grad_norm": 2.1010098457336426, + "learning_rate": 2.1046223478004936e-05, + "loss": 0.2467, + "step": 86225 + }, + { + "epoch": 78.05429864253394, + "grad_norm": 1.8762445449829102, + "learning_rate": 2.100674232095631e-05, + "loss": 0.2446, + "step": 86250 + }, + { + "epoch": 78.07692307692308, + "grad_norm": 1.819108009338379, + "learning_rate": 2.0965649117691385e-05, + "loss": 0.3258, + "step": 86275 + }, + { + "epoch": 78.09954751131222, + "grad_norm": 1.1745537519454956, + "learning_rate": 2.0924589615218227e-05, + "loss": 0.2444, + "step": 86300 + }, + { + "epoch": 78.12217194570135, + "grad_norm": 2.1086909770965576, + "learning_rate": 2.0883563839145248e-05, + "loss": 0.2723, + "step": 86325 + }, + { + "epoch": 78.1447963800905, + "grad_norm": 1.8595943450927734, + "learning_rate": 2.0842571815059967e-05, + "loss": 0.3271, + "step": 86350 + }, + { + "epoch": 78.16742081447964, + "grad_norm": 2.4486351013183594, + "learning_rate": 2.080161356852872e-05, + "loss": 0.2213, + "step": 86375 + }, + { + "epoch": 78.19004524886878, + "grad_norm": 1.9297285079956055, + "learning_rate": 2.076068912509688e-05, + "loss": 0.3219, + "step": 86400 + }, + { + "epoch": 78.21266968325791, + "grad_norm": 2.3453269004821777, + "learning_rate": 2.0719798510288666e-05, + "loss": 0.2324, + "step": 86425 + }, + { + "epoch": 78.23529411764706, + "grad_norm": 
2.9171552658081055, + "learning_rate": 2.0678941749607253e-05, + "loss": 0.2939, + "step": 86450 + }, + { + "epoch": 78.2579185520362, + "grad_norm": 1.9287010431289673, + "learning_rate": 2.0638118868534642e-05, + "loss": 0.2399, + "step": 86475 + }, + { + "epoch": 78.28054298642535, + "grad_norm": 2.80836820602417, + "learning_rate": 2.059732989253175e-05, + "loss": 0.2329, + "step": 86500 + }, + { + "epoch": 78.30316742081448, + "grad_norm": 0.6510598063468933, + "learning_rate": 2.055657484703837e-05, + "loss": 0.2462, + "step": 86525 + }, + { + "epoch": 78.32579185520362, + "grad_norm": 1.468997597694397, + "learning_rate": 2.0515853757473046e-05, + "loss": 0.237, + "step": 86550 + }, + { + "epoch": 78.34841628959276, + "grad_norm": 2.5640623569488525, + "learning_rate": 2.0475166649233238e-05, + "loss": 0.234, + "step": 86575 + }, + { + "epoch": 78.3710407239819, + "grad_norm": 1.8810259103775024, + "learning_rate": 2.0434513547695123e-05, + "loss": 0.3246, + "step": 86600 + }, + { + "epoch": 78.39366515837104, + "grad_norm": 1.4488884210586548, + "learning_rate": 2.0393894478213767e-05, + "loss": 0.3463, + "step": 86625 + }, + { + "epoch": 78.41628959276018, + "grad_norm": 2.9062445163726807, + "learning_rate": 2.0353309466122903e-05, + "loss": 0.2456, + "step": 86650 + }, + { + "epoch": 78.43891402714932, + "grad_norm": 1.876059889793396, + "learning_rate": 2.0312758536735137e-05, + "loss": 0.2249, + "step": 86675 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 1.7976325750350952, + "learning_rate": 2.0272241715341704e-05, + "loss": 0.2466, + "step": 86700 + }, + { + "epoch": 78.4841628959276, + "grad_norm": 1.2220265865325928, + "learning_rate": 2.0231759027212657e-05, + "loss": 0.2529, + "step": 86725 + }, + { + "epoch": 78.50678733031674, + "grad_norm": 1.8671175241470337, + "learning_rate": 2.0191310497596696e-05, + "loss": 0.231, + "step": 86750 + }, + { + "epoch": 78.52941176470588, + "grad_norm": 1.1508766412734985, + "learning_rate": 
2.015089615172129e-05, + "loss": 0.2309, + "step": 86775 + }, + { + "epoch": 78.55203619909503, + "grad_norm": 2.886613130569458, + "learning_rate": 2.0110516014792486e-05, + "loss": 0.2588, + "step": 86800 + }, + { + "epoch": 78.57466063348416, + "grad_norm": 1.6978250741958618, + "learning_rate": 2.00701701119951e-05, + "loss": 0.2535, + "step": 86825 + }, + { + "epoch": 78.5972850678733, + "grad_norm": 5.23097562789917, + "learning_rate": 2.002985846849251e-05, + "loss": 0.4008, + "step": 86850 + }, + { + "epoch": 78.61990950226244, + "grad_norm": 1.906355619430542, + "learning_rate": 1.9989581109426808e-05, + "loss": 0.3319, + "step": 86875 + }, + { + "epoch": 78.64253393665159, + "grad_norm": 1.8507574796676636, + "learning_rate": 1.994933805991861e-05, + "loss": 0.4066, + "step": 86900 + }, + { + "epoch": 78.66515837104072, + "grad_norm": 2.7209722995758057, + "learning_rate": 1.9909129345067228e-05, + "loss": 0.2455, + "step": 86925 + }, + { + "epoch": 78.68778280542986, + "grad_norm": 1.5386555194854736, + "learning_rate": 1.9868954989950472e-05, + "loss": 0.2444, + "step": 86950 + }, + { + "epoch": 78.710407239819, + "grad_norm": 1.6125478744506836, + "learning_rate": 1.982881501962482e-05, + "loss": 0.3259, + "step": 86975 + }, + { + "epoch": 78.73303167420815, + "grad_norm": 5.142017364501953, + "learning_rate": 1.9788709459125194e-05, + "loss": 0.3714, + "step": 87000 + }, + { + "epoch": 78.75565610859728, + "grad_norm": 2.0047316551208496, + "learning_rate": 1.974863833346515e-05, + "loss": 0.2944, + "step": 87025 + }, + { + "epoch": 78.77828054298642, + "grad_norm": 2.330984354019165, + "learning_rate": 1.97086016676367e-05, + "loss": 0.2309, + "step": 87050 + }, + { + "epoch": 78.80090497737557, + "grad_norm": 1.5318078994750977, + "learning_rate": 1.9668599486610427e-05, + "loss": 0.2182, + "step": 87075 + }, + { + "epoch": 78.82352941176471, + "grad_norm": 2.0248022079467773, + "learning_rate": 1.962863181533533e-05, + "loss": 0.2288, + "step": 
87100 + }, + { + "epoch": 78.84615384615384, + "grad_norm": 2.2620387077331543, + "learning_rate": 1.958869867873897e-05, + "loss": 0.3758, + "step": 87125 + }, + { + "epoch": 78.86877828054298, + "grad_norm": 1.064523458480835, + "learning_rate": 1.954880010172727e-05, + "loss": 0.2542, + "step": 87150 + }, + { + "epoch": 78.89140271493213, + "grad_norm": 1.792374610900879, + "learning_rate": 1.9508936109184713e-05, + "loss": 0.2257, + "step": 87175 + }, + { + "epoch": 78.91402714932127, + "grad_norm": 2.4855358600616455, + "learning_rate": 1.9469106725974086e-05, + "loss": 0.2449, + "step": 87200 + }, + { + "epoch": 78.9366515837104, + "grad_norm": 1.8008933067321777, + "learning_rate": 1.9429311976936726e-05, + "loss": 0.2285, + "step": 87225 + }, + { + "epoch": 78.95927601809954, + "grad_norm": 1.6819531917572021, + "learning_rate": 1.938955188689224e-05, + "loss": 0.3144, + "step": 87250 + }, + { + "epoch": 78.98190045248869, + "grad_norm": 0.9282886981964111, + "learning_rate": 1.934982648063872e-05, + "loss": 0.2331, + "step": 87275 + }, + { + "epoch": 79.00452488687783, + "grad_norm": 0.7602487802505493, + "learning_rate": 1.9310135782952542e-05, + "loss": 0.3145, + "step": 87300 + }, + { + "epoch": 79.02714932126698, + "grad_norm": 0.8442087769508362, + "learning_rate": 1.9270479818588528e-05, + "loss": 0.2277, + "step": 87325 + }, + { + "epoch": 79.0497737556561, + "grad_norm": 1.41585111618042, + "learning_rate": 1.9230858612279735e-05, + "loss": 0.3091, + "step": 87350 + }, + { + "epoch": 79.07239819004525, + "grad_norm": 1.8606550693511963, + "learning_rate": 1.919127218873763e-05, + "loss": 0.2291, + "step": 87375 + }, + { + "epoch": 79.09502262443439, + "grad_norm": 1.2281155586242676, + "learning_rate": 1.9151720572651914e-05, + "loss": 0.2361, + "step": 87400 + }, + { + "epoch": 79.11764705882354, + "grad_norm": 2.0486700534820557, + "learning_rate": 1.9112203788690654e-05, + "loss": 0.2459, + "step": 87425 + }, + { + "epoch": 79.14027149321267, + 
"grad_norm": 1.0638536214828491, + "learning_rate": 1.907272186150011e-05, + "loss": 0.2291, + "step": 87450 + }, + { + "epoch": 79.16289592760181, + "grad_norm": 1.740169644355774, + "learning_rate": 1.903327481570488e-05, + "loss": 0.2477, + "step": 87475 + }, + { + "epoch": 79.18552036199095, + "grad_norm": 1.0604002475738525, + "learning_rate": 1.8993862675907725e-05, + "loss": 0.2555, + "step": 87500 + }, + { + "epoch": 79.2081447963801, + "grad_norm": 2.515486717224121, + "learning_rate": 1.895448546668974e-05, + "loss": 0.2041, + "step": 87525 + }, + { + "epoch": 79.23076923076923, + "grad_norm": 2.8553245067596436, + "learning_rate": 1.8915143212610117e-05, + "loss": 0.2326, + "step": 87550 + }, + { + "epoch": 79.25339366515837, + "grad_norm": 2.4957714080810547, + "learning_rate": 1.8875835938206332e-05, + "loss": 0.2342, + "step": 87575 + }, + { + "epoch": 79.27601809954751, + "grad_norm": 4.493040084838867, + "learning_rate": 1.8836563667994042e-05, + "loss": 0.317, + "step": 87600 + }, + { + "epoch": 79.29864253393666, + "grad_norm": 2.3935322761535645, + "learning_rate": 1.8797326426466996e-05, + "loss": 0.3032, + "step": 87625 + }, + { + "epoch": 79.32126696832579, + "grad_norm": 2.9496877193450928, + "learning_rate": 1.8758124238097202e-05, + "loss": 0.338, + "step": 87650 + }, + { + "epoch": 79.34389140271493, + "grad_norm": 1.6749485731124878, + "learning_rate": 1.8718957127334694e-05, + "loss": 0.2714, + "step": 87675 + }, + { + "epoch": 79.36651583710407, + "grad_norm": 1.2983139753341675, + "learning_rate": 1.8679825118607732e-05, + "loss": 0.2615, + "step": 87700 + }, + { + "epoch": 79.38914027149322, + "grad_norm": 2.107964038848877, + "learning_rate": 1.8640728236322598e-05, + "loss": 0.2624, + "step": 87725 + }, + { + "epoch": 79.41176470588235, + "grad_norm": 1.2432149648666382, + "learning_rate": 1.8601666504863748e-05, + "loss": 0.2805, + "step": 87750 + }, + { + "epoch": 79.43438914027149, + "grad_norm": 1.972983956336975, + 
"learning_rate": 1.8562639948593625e-05, + "loss": 0.3174, + "step": 87775 + }, + { + "epoch": 79.45701357466064, + "grad_norm": 1.1237883567810059, + "learning_rate": 1.8523648591852818e-05, + "loss": 0.2342, + "step": 87800 + }, + { + "epoch": 79.47963800904978, + "grad_norm": 4.015199184417725, + "learning_rate": 1.8484692458959884e-05, + "loss": 0.2771, + "step": 87825 + }, + { + "epoch": 79.50226244343891, + "grad_norm": 1.4740439653396606, + "learning_rate": 1.8445771574211478e-05, + "loss": 0.2238, + "step": 87850 + }, + { + "epoch": 79.52488687782805, + "grad_norm": 1.226570963859558, + "learning_rate": 1.840688596188226e-05, + "loss": 0.2233, + "step": 87875 + }, + { + "epoch": 79.5475113122172, + "grad_norm": 5.07127046585083, + "learning_rate": 1.8368035646224834e-05, + "loss": 0.2866, + "step": 87900 + }, + { + "epoch": 79.57013574660634, + "grad_norm": 2.449209451675415, + "learning_rate": 1.8329220651469874e-05, + "loss": 0.3408, + "step": 87925 + }, + { + "epoch": 79.59276018099547, + "grad_norm": 0.795286238193512, + "learning_rate": 1.829044100182593e-05, + "loss": 0.2508, + "step": 87950 + }, + { + "epoch": 79.61538461538461, + "grad_norm": 0.9944939613342285, + "learning_rate": 1.825169672147962e-05, + "loss": 0.2122, + "step": 87975 + }, + { + "epoch": 79.63800904977376, + "grad_norm": 1.448467493057251, + "learning_rate": 1.8212987834595377e-05, + "loss": 0.2648, + "step": 88000 + }, + { + "epoch": 79.6606334841629, + "grad_norm": 2.312659978866577, + "learning_rate": 1.8174314365315683e-05, + "loss": 0.2904, + "step": 88025 + }, + { + "epoch": 79.68325791855203, + "grad_norm": 1.334312081336975, + "learning_rate": 1.8135676337760823e-05, + "loss": 0.3041, + "step": 88050 + }, + { + "epoch": 79.70588235294117, + "grad_norm": 1.571508765220642, + "learning_rate": 1.8097073776029038e-05, + "loss": 0.2758, + "step": 88075 + }, + { + "epoch": 79.72850678733032, + "grad_norm": 2.2298097610473633, + "learning_rate": 1.805850670419647e-05, + "loss": 
0.276, + "step": 88100 + }, + { + "epoch": 79.75113122171946, + "grad_norm": 2.40057635307312, + "learning_rate": 1.8019975146317042e-05, + "loss": 0.3084, + "step": 88125 + }, + { + "epoch": 79.77375565610859, + "grad_norm": 4.272204399108887, + "learning_rate": 1.7981479126422625e-05, + "loss": 0.2568, + "step": 88150 + }, + { + "epoch": 79.79638009049773, + "grad_norm": 5.12802267074585, + "learning_rate": 1.7943018668522834e-05, + "loss": 0.3572, + "step": 88175 + }, + { + "epoch": 79.81900452488688, + "grad_norm": 2.5201570987701416, + "learning_rate": 1.7904593796605195e-05, + "loss": 0.2335, + "step": 88200 + }, + { + "epoch": 79.84162895927602, + "grad_norm": 1.8251371383666992, + "learning_rate": 1.786620453463494e-05, + "loss": 0.2235, + "step": 88225 + }, + { + "epoch": 79.86425339366515, + "grad_norm": 1.3936398029327393, + "learning_rate": 1.7827850906555195e-05, + "loss": 0.2314, + "step": 88250 + }, + { + "epoch": 79.8868778280543, + "grad_norm": 2.0077548027038574, + "learning_rate": 1.7789532936286775e-05, + "loss": 0.2696, + "step": 88275 + }, + { + "epoch": 79.90950226244344, + "grad_norm": 1.215086817741394, + "learning_rate": 1.7751250647728314e-05, + "loss": 0.2898, + "step": 88300 + }, + { + "epoch": 79.93212669683258, + "grad_norm": 0.969950795173645, + "learning_rate": 1.771300406475614e-05, + "loss": 0.2935, + "step": 88325 + }, + { + "epoch": 79.95475113122171, + "grad_norm": 1.9991028308868408, + "learning_rate": 1.7674793211224382e-05, + "loss": 0.2279, + "step": 88350 + }, + { + "epoch": 79.97737556561086, + "grad_norm": 3.2695038318634033, + "learning_rate": 1.7636618110964798e-05, + "loss": 0.2357, + "step": 88375 + }, + { + "epoch": 80.0, + "grad_norm": 1.568069577217102, + "learning_rate": 1.759847878778693e-05, + "loss": 0.3049, + "step": 88400 + }, + { + "epoch": 80.02262443438914, + "grad_norm": 2.0564796924591064, + "learning_rate": 1.7560375265477937e-05, + "loss": 0.266, + "step": 88425 + }, + { + "epoch": 80.04524886877829, 
+ "grad_norm": 1.5851367712020874, + "learning_rate": 1.7522307567802714e-05, + "loss": 0.2405, + "step": 88450 + }, + { + "epoch": 80.06787330316742, + "grad_norm": 3.8050899505615234, + "learning_rate": 1.7484275718503744e-05, + "loss": 0.3095, + "step": 88475 + }, + { + "epoch": 80.09049773755656, + "grad_norm": 2.199955940246582, + "learning_rate": 1.744627974130122e-05, + "loss": 0.2136, + "step": 88500 + }, + { + "epoch": 80.1131221719457, + "grad_norm": 3.5844807624816895, + "learning_rate": 1.7408319659892896e-05, + "loss": 0.3086, + "step": 88525 + }, + { + "epoch": 80.13574660633485, + "grad_norm": 1.3600096702575684, + "learning_rate": 1.7370395497954205e-05, + "loss": 0.252, + "step": 88550 + }, + { + "epoch": 80.15837104072398, + "grad_norm": 1.3867491483688354, + "learning_rate": 1.7332507279138105e-05, + "loss": 0.2444, + "step": 88575 + }, + { + "epoch": 80.18099547511312, + "grad_norm": 1.8291349411010742, + "learning_rate": 1.7294655027075207e-05, + "loss": 0.3002, + "step": 88600 + }, + { + "epoch": 80.20361990950227, + "grad_norm": 2.0560073852539062, + "learning_rate": 1.7256838765373636e-05, + "loss": 0.3013, + "step": 88625 + }, + { + "epoch": 80.22624434389141, + "grad_norm": 1.6388047933578491, + "learning_rate": 1.721905851761911e-05, + "loss": 0.2177, + "step": 88650 + }, + { + "epoch": 80.24886877828054, + "grad_norm": 2.622504711151123, + "learning_rate": 1.7181314307374846e-05, + "loss": 0.2814, + "step": 88675 + }, + { + "epoch": 80.27149321266968, + "grad_norm": 2.2391486167907715, + "learning_rate": 1.7143606158181637e-05, + "loss": 0.2466, + "step": 88700 + }, + { + "epoch": 80.29411764705883, + "grad_norm": 1.8025130033493042, + "learning_rate": 1.7105934093557708e-05, + "loss": 0.2138, + "step": 88725 + }, + { + "epoch": 80.31674208144797, + "grad_norm": 1.6778080463409424, + "learning_rate": 1.7068298136998867e-05, + "loss": 0.2396, + "step": 88750 + }, + { + "epoch": 80.3393665158371, + "grad_norm": 1.2616379261016846, + 
"learning_rate": 1.7030698311978322e-05, + "loss": 0.2711, + "step": 88775 + }, + { + "epoch": 80.36199095022624, + "grad_norm": 1.9824016094207764, + "learning_rate": 1.699313464194682e-05, + "loss": 0.2512, + "step": 88800 + }, + { + "epoch": 80.38461538461539, + "grad_norm": 4.605892181396484, + "learning_rate": 1.6955607150332488e-05, + "loss": 0.325, + "step": 88825 + }, + { + "epoch": 80.40723981900453, + "grad_norm": 1.984059453010559, + "learning_rate": 1.691811586054095e-05, + "loss": 0.2753, + "step": 88850 + }, + { + "epoch": 80.42986425339366, + "grad_norm": 1.7115203142166138, + "learning_rate": 1.6880660795955193e-05, + "loss": 0.233, + "step": 88875 + }, + { + "epoch": 80.4524886877828, + "grad_norm": 1.6921839714050293, + "learning_rate": 1.6843241979935677e-05, + "loss": 0.257, + "step": 88900 + }, + { + "epoch": 80.47511312217195, + "grad_norm": 1.532244086265564, + "learning_rate": 1.6805859435820175e-05, + "loss": 0.2257, + "step": 88925 + }, + { + "epoch": 80.49773755656109, + "grad_norm": 1.4685120582580566, + "learning_rate": 1.6768513186923918e-05, + "loss": 0.2219, + "step": 88950 + }, + { + "epoch": 80.52036199095022, + "grad_norm": 2.3729798793792725, + "learning_rate": 1.6731203256539437e-05, + "loss": 0.2435, + "step": 88975 + }, + { + "epoch": 80.54298642533936, + "grad_norm": 1.9455311298370361, + "learning_rate": 1.6693929667936662e-05, + "loss": 0.3169, + "step": 89000 + }, + { + "epoch": 80.56561085972851, + "grad_norm": 4.638451099395752, + "learning_rate": 1.6656692444362792e-05, + "loss": 0.3016, + "step": 89025 + }, + { + "epoch": 80.58823529411765, + "grad_norm": 2.428246259689331, + "learning_rate": 1.6619491609042433e-05, + "loss": 0.2505, + "step": 89050 + }, + { + "epoch": 80.61085972850678, + "grad_norm": 1.4879069328308105, + "learning_rate": 1.658232718517741e-05, + "loss": 0.2251, + "step": 89075 + }, + { + "epoch": 80.63348416289593, + "grad_norm": 2.2105343341827393, + "learning_rate": 1.6545199195946903e-05, + 
"loss": 0.2347, + "step": 89100 + }, + { + "epoch": 80.65610859728507, + "grad_norm": 2.0314412117004395, + "learning_rate": 1.650810766450731e-05, + "loss": 0.2961, + "step": 89125 + }, + { + "epoch": 80.67873303167421, + "grad_norm": 1.6765267848968506, + "learning_rate": 1.6471052613992345e-05, + "loss": 0.2472, + "step": 89150 + }, + { + "epoch": 80.70135746606334, + "grad_norm": 1.2480494976043701, + "learning_rate": 1.643403406751296e-05, + "loss": 0.3061, + "step": 89175 + }, + { + "epoch": 80.72398190045249, + "grad_norm": 1.6919770240783691, + "learning_rate": 1.6397052048157287e-05, + "loss": 0.2119, + "step": 89200 + }, + { + "epoch": 80.74660633484163, + "grad_norm": 2.1331253051757812, + "learning_rate": 1.6360106578990753e-05, + "loss": 0.2444, + "step": 89225 + }, + { + "epoch": 80.76923076923077, + "grad_norm": 0.9414656162261963, + "learning_rate": 1.632319768305592e-05, + "loss": 0.2422, + "step": 89250 + }, + { + "epoch": 80.7918552036199, + "grad_norm": 1.2934455871582031, + "learning_rate": 1.6286325383372606e-05, + "loss": 0.2695, + "step": 89275 + }, + { + "epoch": 80.81447963800905, + "grad_norm": 1.551058292388916, + "learning_rate": 1.624948970293772e-05, + "loss": 0.2508, + "step": 89300 + }, + { + "epoch": 80.83710407239819, + "grad_norm": 1.4792729616165161, + "learning_rate": 1.6212690664725437e-05, + "loss": 0.226, + "step": 89325 + }, + { + "epoch": 80.85972850678733, + "grad_norm": 1.9807225465774536, + "learning_rate": 1.6175928291686968e-05, + "loss": 0.2861, + "step": 89350 + }, + { + "epoch": 80.88235294117646, + "grad_norm": 2.2029078006744385, + "learning_rate": 1.6139202606750756e-05, + "loss": 0.2617, + "step": 89375 + }, + { + "epoch": 80.90497737556561, + "grad_norm": 0.8517450094223022, + "learning_rate": 1.6102513632822285e-05, + "loss": 0.2689, + "step": 89400 + }, + { + "epoch": 80.92760180995475, + "grad_norm": 1.0860334634780884, + "learning_rate": 1.6065861392784195e-05, + "loss": 0.2503, + "step": 89425 + }, + { + 
"epoch": 80.9502262443439, + "grad_norm": 2.202310562133789, + "learning_rate": 1.6029245909496174e-05, + "loss": 0.2197, + "step": 89450 + }, + { + "epoch": 80.97285067873302, + "grad_norm": 2.39774751663208, + "learning_rate": 1.5992667205795037e-05, + "loss": 0.2722, + "step": 89475 + }, + { + "epoch": 80.99547511312217, + "grad_norm": 1.3892974853515625, + "learning_rate": 1.5956125304494585e-05, + "loss": 0.2617, + "step": 89500 + }, + { + "epoch": 81.01809954751131, + "grad_norm": 1.5049279928207397, + "learning_rate": 1.5919620228385755e-05, + "loss": 0.2354, + "step": 89525 + }, + { + "epoch": 81.04072398190046, + "grad_norm": 3.5445001125335693, + "learning_rate": 1.5883152000236423e-05, + "loss": 0.2787, + "step": 89550 + }, + { + "epoch": 81.0633484162896, + "grad_norm": 1.3553639650344849, + "learning_rate": 1.5846720642791582e-05, + "loss": 0.1943, + "step": 89575 + }, + { + "epoch": 81.08597285067873, + "grad_norm": 4.648387432098389, + "learning_rate": 1.5810326178773132e-05, + "loss": 0.2349, + "step": 89600 + }, + { + "epoch": 81.10859728506787, + "grad_norm": 1.2038744688034058, + "learning_rate": 1.5773968630880044e-05, + "loss": 0.2052, + "step": 89625 + }, + { + "epoch": 81.13122171945702, + "grad_norm": 1.6837557554244995, + "learning_rate": 1.573764802178819e-05, + "loss": 0.2153, + "step": 89650 + }, + { + "epoch": 81.15384615384616, + "grad_norm": 1.333583116531372, + "learning_rate": 1.5701364374150492e-05, + "loss": 0.2624, + "step": 89675 + }, + { + "epoch": 81.17647058823529, + "grad_norm": 3.7715420722961426, + "learning_rate": 1.5665117710596726e-05, + "loss": 0.2495, + "step": 89700 + }, + { + "epoch": 81.19909502262443, + "grad_norm": 0.7759903073310852, + "learning_rate": 1.5628908053733684e-05, + "loss": 0.2014, + "step": 89725 + }, + { + "epoch": 81.22171945701358, + "grad_norm": 1.1528794765472412, + "learning_rate": 1.5592735426145e-05, + "loss": 0.2359, + "step": 89750 + }, + { + "epoch": 81.24434389140272, + "grad_norm": 
2.053586483001709, + "learning_rate": 1.555659985039129e-05, + "loss": 0.2627, + "step": 89775 + }, + { + "epoch": 81.26696832579185, + "grad_norm": 1.9309684038162231, + "learning_rate": 1.552050134900998e-05, + "loss": 0.2415, + "step": 89800 + }, + { + "epoch": 81.289592760181, + "grad_norm": 1.7584835290908813, + "learning_rate": 1.5484439944515462e-05, + "loss": 0.321, + "step": 89825 + }, + { + "epoch": 81.31221719457014, + "grad_norm": 1.91073739528656, + "learning_rate": 1.5448415659398907e-05, + "loss": 0.2044, + "step": 89850 + }, + { + "epoch": 81.33484162895928, + "grad_norm": 2.160158395767212, + "learning_rate": 1.541242851612841e-05, + "loss": 0.2069, + "step": 89875 + }, + { + "epoch": 81.35746606334841, + "grad_norm": 2.3774430751800537, + "learning_rate": 1.5376478537148817e-05, + "loss": 0.2548, + "step": 89900 + }, + { + "epoch": 81.38009049773756, + "grad_norm": 1.7145510911941528, + "learning_rate": 1.53405657448819e-05, + "loss": 0.2984, + "step": 89925 + }, + { + "epoch": 81.4027149321267, + "grad_norm": 1.8368686437606812, + "learning_rate": 1.5304690161726117e-05, + "loss": 0.2372, + "step": 89950 + }, + { + "epoch": 81.42533936651584, + "grad_norm": 1.3777281045913696, + "learning_rate": 1.5268851810056833e-05, + "loss": 0.2717, + "step": 89975 + }, + { + "epoch": 81.44796380090497, + "grad_norm": 2.33797025680542, + "learning_rate": 1.5233050712226108e-05, + "loss": 0.296, + "step": 90000 + }, + { + "epoch": 81.47058823529412, + "grad_norm": 1.3664602041244507, + "learning_rate": 1.5197286890562827e-05, + "loss": 0.2516, + "step": 90025 + }, + { + "epoch": 81.49321266968326, + "grad_norm": 1.3721119165420532, + "learning_rate": 1.5161560367372571e-05, + "loss": 0.2688, + "step": 90050 + }, + { + "epoch": 81.5158371040724, + "grad_norm": 1.7181849479675293, + "learning_rate": 1.5125871164937719e-05, + "loss": 0.2863, + "step": 90075 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 1.4487202167510986, + "learning_rate": 
1.5090219305517298e-05, + "loss": 0.2202, + "step": 90100 + }, + { + "epoch": 81.56108597285068, + "grad_norm": 1.9481704235076904, + "learning_rate": 1.505460481134713e-05, + "loss": 0.2709, + "step": 90125 + }, + { + "epoch": 81.58371040723982, + "grad_norm": 1.483481526374817, + "learning_rate": 1.5019027704639652e-05, + "loss": 0.2657, + "step": 90150 + }, + { + "epoch": 81.60633484162896, + "grad_norm": 0.8282179832458496, + "learning_rate": 1.498348800758406e-05, + "loss": 0.2389, + "step": 90175 + }, + { + "epoch": 81.6289592760181, + "grad_norm": 2.205792188644409, + "learning_rate": 1.494798574234613e-05, + "loss": 0.2383, + "step": 90200 + }, + { + "epoch": 81.65158371040724, + "grad_norm": 1.9597299098968506, + "learning_rate": 1.4912520931068375e-05, + "loss": 0.2259, + "step": 90225 + }, + { + "epoch": 81.67420814479638, + "grad_norm": 1.1000136137008667, + "learning_rate": 1.4877093595869927e-05, + "loss": 0.2828, + "step": 90250 + }, + { + "epoch": 81.69683257918552, + "grad_norm": 1.8393522500991821, + "learning_rate": 1.4841703758846484e-05, + "loss": 0.2638, + "step": 90275 + }, + { + "epoch": 81.71945701357465, + "grad_norm": 1.8651313781738281, + "learning_rate": 1.4806351442070453e-05, + "loss": 0.1863, + "step": 90300 + }, + { + "epoch": 81.7420814479638, + "grad_norm": 1.395439863204956, + "learning_rate": 1.4771036667590749e-05, + "loss": 0.2088, + "step": 90325 + }, + { + "epoch": 81.76470588235294, + "grad_norm": 0.7056885361671448, + "learning_rate": 1.473575945743295e-05, + "loss": 0.2418, + "step": 90350 + }, + { + "epoch": 81.78733031674209, + "grad_norm": 1.6459146738052368, + "learning_rate": 1.4700519833599136e-05, + "loss": 0.2634, + "step": 90375 + }, + { + "epoch": 81.80995475113122, + "grad_norm": 2.4437522888183594, + "learning_rate": 1.4665317818068012e-05, + "loss": 0.2991, + "step": 90400 + }, + { + "epoch": 81.83257918552036, + "grad_norm": 2.822815418243408, + "learning_rate": 1.4631559285429537e-05, + "loss": 0.327, + 
"step": 90425 + }, + { + "epoch": 81.8552036199095, + "grad_norm": 2.050520658493042, + "learning_rate": 1.4596431045837553e-05, + "loss": 0.215, + "step": 90450 + }, + { + "epoch": 81.87782805429865, + "grad_norm": 1.3490715026855469, + "learning_rate": 1.4561340479467562e-05, + "loss": 0.2292, + "step": 90475 + }, + { + "epoch": 81.90045248868778, + "grad_norm": 2.234866142272949, + "learning_rate": 1.4526287608205314e-05, + "loss": 0.2813, + "step": 90500 + }, + { + "epoch": 81.92307692307692, + "grad_norm": 1.5313230752944946, + "learning_rate": 1.4491272453912964e-05, + "loss": 0.2411, + "step": 90525 + }, + { + "epoch": 81.94570135746606, + "grad_norm": 1.1357070207595825, + "learning_rate": 1.4456295038429216e-05, + "loss": 0.2521, + "step": 90550 + }, + { + "epoch": 81.96832579185521, + "grad_norm": 1.6652240753173828, + "learning_rate": 1.4421355383569172e-05, + "loss": 0.3507, + "step": 90575 + }, + { + "epoch": 81.99095022624434, + "grad_norm": 0.9113162159919739, + "learning_rate": 1.438645351112444e-05, + "loss": 0.262, + "step": 90600 + }, + { + "epoch": 82.01357466063348, + "grad_norm": 1.4021083116531372, + "learning_rate": 1.4351589442863018e-05, + "loss": 0.2673, + "step": 90625 + }, + { + "epoch": 82.03619909502262, + "grad_norm": 1.5596468448638916, + "learning_rate": 1.4316763200529377e-05, + "loss": 0.2682, + "step": 90650 + }, + { + "epoch": 82.05882352941177, + "grad_norm": 1.2058978080749512, + "learning_rate": 1.428336561468464e-05, + "loss": 0.2488, + "step": 90675 + }, + { + "epoch": 82.08144796380091, + "grad_norm": 1.6812113523483276, + "learning_rate": 1.4248613574155315e-05, + "loss": 0.2568, + "step": 90700 + }, + { + "epoch": 82.10407239819004, + "grad_norm": 2.0603604316711426, + "learning_rate": 1.4213899423778998e-05, + "loss": 0.2337, + "step": 90725 + }, + { + "epoch": 82.12669683257919, + "grad_norm": 1.0920335054397583, + "learning_rate": 1.4179223185206579e-05, + "loss": 0.1952, + "step": 90750 + }, + { + "epoch": 
82.14932126696833, + "grad_norm": 1.583495020866394, + "learning_rate": 1.4144584880065395e-05, + "loss": 0.2322, + "step": 90775 + }, + { + "epoch": 82.17194570135747, + "grad_norm": 1.7024261951446533, + "learning_rate": 1.4109984529959045e-05, + "loss": 0.1983, + "step": 90800 + }, + { + "epoch": 82.1945701357466, + "grad_norm": 1.6076480150222778, + "learning_rate": 1.4075422156467522e-05, + "loss": 0.2192, + "step": 90825 + }, + { + "epoch": 82.21719457013575, + "grad_norm": 2.3282856941223145, + "learning_rate": 1.4040897781147067e-05, + "loss": 0.277, + "step": 90850 + }, + { + "epoch": 82.23981900452489, + "grad_norm": 1.092956304550171, + "learning_rate": 1.400641142553029e-05, + "loss": 0.2378, + "step": 90875 + }, + { + "epoch": 82.26244343891403, + "grad_norm": 1.6435421705245972, + "learning_rate": 1.3971963111126025e-05, + "loss": 0.2142, + "step": 90900 + }, + { + "epoch": 82.28506787330316, + "grad_norm": 1.6375032663345337, + "learning_rate": 1.3937552859419438e-05, + "loss": 0.2553, + "step": 90925 + }, + { + "epoch": 82.3076923076923, + "grad_norm": 1.4579874277114868, + "learning_rate": 1.3903180691871885e-05, + "loss": 0.24, + "step": 90950 + }, + { + "epoch": 82.33031674208145, + "grad_norm": 1.8350778818130493, + "learning_rate": 1.3868846629921068e-05, + "loss": 0.1939, + "step": 90975 + }, + { + "epoch": 82.3529411764706, + "grad_norm": 1.1116398572921753, + "learning_rate": 1.3834550694980817e-05, + "loss": 0.262, + "step": 91000 + }, + { + "epoch": 82.37556561085972, + "grad_norm": 1.8121416568756104, + "learning_rate": 1.3800292908441246e-05, + "loss": 0.2183, + "step": 91025 + }, + { + "epoch": 82.39819004524887, + "grad_norm": 1.6931511163711548, + "learning_rate": 1.3766073291668688e-05, + "loss": 0.2559, + "step": 91050 + }, + { + "epoch": 82.42081447963801, + "grad_norm": 1.3180493116378784, + "learning_rate": 1.3731891866005615e-05, + "loss": 0.2303, + "step": 91075 + }, + { + "epoch": 82.44343891402715, + "grad_norm": 
4.452065944671631, + "learning_rate": 1.369774865277072e-05, + "loss": 0.2706, + "step": 91100 + }, + { + "epoch": 82.46606334841628, + "grad_norm": 1.5095728635787964, + "learning_rate": 1.3663643673258839e-05, + "loss": 0.2605, + "step": 91125 + }, + { + "epoch": 82.48868778280543, + "grad_norm": 4.638199329376221, + "learning_rate": 1.3629576948741006e-05, + "loss": 0.2598, + "step": 91150 + }, + { + "epoch": 82.51131221719457, + "grad_norm": 1.861236572265625, + "learning_rate": 1.3595548500464315e-05, + "loss": 0.2515, + "step": 91175 + }, + { + "epoch": 82.53393665158372, + "grad_norm": 1.0149219036102295, + "learning_rate": 1.3561558349652091e-05, + "loss": 0.1994, + "step": 91200 + }, + { + "epoch": 82.55656108597285, + "grad_norm": 1.626465916633606, + "learning_rate": 1.3527606517503667e-05, + "loss": 0.3791, + "step": 91225 + }, + { + "epoch": 82.57918552036199, + "grad_norm": 1.5020828247070312, + "learning_rate": 1.3493693025194572e-05, + "loss": 0.2321, + "step": 91250 + }, + { + "epoch": 82.60180995475113, + "grad_norm": 2.61862850189209, + "learning_rate": 1.3459817893876344e-05, + "loss": 0.2618, + "step": 91275 + }, + { + "epoch": 82.62443438914028, + "grad_norm": 1.5377072095870972, + "learning_rate": 1.3425981144676664e-05, + "loss": 0.2238, + "step": 91300 + }, + { + "epoch": 82.6470588235294, + "grad_norm": 1.8476402759552002, + "learning_rate": 1.3392182798699214e-05, + "loss": 0.2467, + "step": 91325 + }, + { + "epoch": 82.66968325791855, + "grad_norm": 1.638711929321289, + "learning_rate": 1.3358422877023778e-05, + "loss": 0.3005, + "step": 91350 + }, + { + "epoch": 82.6923076923077, + "grad_norm": 1.5282971858978271, + "learning_rate": 1.3324701400706106e-05, + "loss": 0.2854, + "step": 91375 + }, + { + "epoch": 82.71493212669684, + "grad_norm": 1.1351838111877441, + "learning_rate": 1.3291018390778065e-05, + "loss": 0.2075, + "step": 91400 + }, + { + "epoch": 82.73755656108597, + "grad_norm": 2.479787588119507, + "learning_rate": 
1.3257373868247437e-05, + "loss": 0.3039, + "step": 91425 + }, + { + "epoch": 82.76018099547511, + "grad_norm": 1.5777153968811035, + "learning_rate": 1.3223767854098075e-05, + "loss": 0.271, + "step": 91450 + }, + { + "epoch": 82.78280542986425, + "grad_norm": 0.48745808005332947, + "learning_rate": 1.3190200369289739e-05, + "loss": 0.2293, + "step": 91475 + }, + { + "epoch": 82.8054298642534, + "grad_norm": 1.8441256284713745, + "learning_rate": 1.3156671434758249e-05, + "loss": 0.1951, + "step": 91500 + }, + { + "epoch": 82.82805429864253, + "grad_norm": 1.2741081714630127, + "learning_rate": 1.3123181071415292e-05, + "loss": 0.2211, + "step": 91525 + }, + { + "epoch": 82.85067873303167, + "grad_norm": 2.198697805404663, + "learning_rate": 1.3089729300148571e-05, + "loss": 0.2804, + "step": 91550 + }, + { + "epoch": 82.87330316742081, + "grad_norm": 1.9699496030807495, + "learning_rate": 1.3056316141821655e-05, + "loss": 0.2583, + "step": 91575 + }, + { + "epoch": 82.89592760180996, + "grad_norm": 2.259859085083008, + "learning_rate": 1.3022941617274096e-05, + "loss": 0.3086, + "step": 91600 + }, + { + "epoch": 82.91855203619909, + "grad_norm": 0.7027202248573303, + "learning_rate": 1.298960574732128e-05, + "loss": 0.1961, + "step": 91625 + }, + { + "epoch": 82.94117647058823, + "grad_norm": 1.2334860563278198, + "learning_rate": 1.2956308552754574e-05, + "loss": 0.264, + "step": 91650 + }, + { + "epoch": 82.96380090497738, + "grad_norm": 1.6832187175750732, + "learning_rate": 1.2923050054341116e-05, + "loss": 0.225, + "step": 91675 + }, + { + "epoch": 82.98642533936652, + "grad_norm": 1.0022525787353516, + "learning_rate": 1.2889830272824015e-05, + "loss": 0.2152, + "step": 91700 + }, + { + "epoch": 83.00904977375566, + "grad_norm": 1.7496888637542725, + "learning_rate": 1.2856649228922128e-05, + "loss": 0.1937, + "step": 91725 + }, + { + "epoch": 83.03167420814479, + "grad_norm": 1.1650596857070923, + "learning_rate": 1.2823506943330261e-05, + "loss": 0.2486, 
+ "step": 91750 + }, + { + "epoch": 83.05429864253394, + "grad_norm": 3.7929768562316895, + "learning_rate": 1.2790403436718955e-05, + "loss": 0.2547, + "step": 91775 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 1.5414960384368896, + "learning_rate": 1.2757338729734627e-05, + "loss": 0.2198, + "step": 91800 + }, + { + "epoch": 83.09954751131222, + "grad_norm": 1.1925899982452393, + "learning_rate": 1.2724312842999438e-05, + "loss": 0.2075, + "step": 91825 + }, + { + "epoch": 83.12217194570135, + "grad_norm": 0.9208041429519653, + "learning_rate": 1.2691325797111412e-05, + "loss": 0.2762, + "step": 91850 + }, + { + "epoch": 83.1447963800905, + "grad_norm": 1.5031992197036743, + "learning_rate": 1.2658377612644261e-05, + "loss": 0.2388, + "step": 91875 + }, + { + "epoch": 83.16742081447964, + "grad_norm": 1.0860973596572876, + "learning_rate": 1.2625468310147543e-05, + "loss": 0.1943, + "step": 91900 + }, + { + "epoch": 83.19004524886878, + "grad_norm": 1.8405532836914062, + "learning_rate": 1.2592597910146484e-05, + "loss": 0.2026, + "step": 91925 + }, + { + "epoch": 83.21266968325791, + "grad_norm": 1.0833250284194946, + "learning_rate": 1.2559766433142136e-05, + "loss": 0.2159, + "step": 91950 + }, + { + "epoch": 83.23529411764706, + "grad_norm": 1.1138516664505005, + "learning_rate": 1.252697389961118e-05, + "loss": 0.2332, + "step": 91975 + }, + { + "epoch": 83.2579185520362, + "grad_norm": 4.483831882476807, + "learning_rate": 1.2494220330006106e-05, + "loss": 0.2463, + "step": 92000 + }, + { + "epoch": 83.28054298642535, + "grad_norm": 1.4626716375350952, + "learning_rate": 1.2461505744755008e-05, + "loss": 0.241, + "step": 92025 + }, + { + "epoch": 83.30316742081448, + "grad_norm": 1.9593162536621094, + "learning_rate": 1.242883016426175e-05, + "loss": 0.2296, + "step": 92050 + }, + { + "epoch": 83.32579185520362, + "grad_norm": 5.334970474243164, + "learning_rate": 1.2396193608905788e-05, + "loss": 0.2694, + "step": 92075 + }, + { + "epoch": 
83.34841628959276, + "grad_norm": 1.8293943405151367, + "learning_rate": 1.2363596099042308e-05, + "loss": 0.2916, + "step": 92100 + }, + { + "epoch": 83.3710407239819, + "grad_norm": 1.431554913520813, + "learning_rate": 1.2331037655002129e-05, + "loss": 0.2393, + "step": 92125 + }, + { + "epoch": 83.39366515837104, + "grad_norm": 1.992844581604004, + "learning_rate": 1.229851829709165e-05, + "loss": 0.2334, + "step": 92150 + }, + { + "epoch": 83.41628959276018, + "grad_norm": 1.039581537246704, + "learning_rate": 1.226603804559298e-05, + "loss": 0.2296, + "step": 92175 + }, + { + "epoch": 83.43891402714932, + "grad_norm": 1.2727681398391724, + "learning_rate": 1.2233596920763747e-05, + "loss": 0.2359, + "step": 92200 + }, + { + "epoch": 83.46153846153847, + "grad_norm": 1.4922980070114136, + "learning_rate": 1.2201194942837259e-05, + "loss": 0.2194, + "step": 92225 + }, + { + "epoch": 83.4841628959276, + "grad_norm": 2.502060890197754, + "learning_rate": 1.216883213202234e-05, + "loss": 0.3302, + "step": 92250 + }, + { + "epoch": 83.50678733031674, + "grad_norm": 1.0203526020050049, + "learning_rate": 1.213650850850344e-05, + "loss": 0.1877, + "step": 92275 + }, + { + "epoch": 83.52941176470588, + "grad_norm": 1.7818883657455444, + "learning_rate": 1.210422409244052e-05, + "loss": 0.177, + "step": 92300 + }, + { + "epoch": 83.55203619909503, + "grad_norm": 1.2579689025878906, + "learning_rate": 1.2071978903969142e-05, + "loss": 0.2073, + "step": 92325 + }, + { + "epoch": 83.57466063348416, + "grad_norm": 1.957848072052002, + "learning_rate": 1.2039772963200344e-05, + "loss": 0.2609, + "step": 92350 + }, + { + "epoch": 83.5972850678733, + "grad_norm": 1.2671171426773071, + "learning_rate": 1.2007606290220733e-05, + "loss": 0.2539, + "step": 92375 + }, + { + "epoch": 83.61990950226244, + "grad_norm": 1.2481365203857422, + "learning_rate": 1.1975478905092379e-05, + "loss": 0.2584, + "step": 92400 + }, + { + "epoch": 83.64253393665159, + "grad_norm": 
1.514829397201538, + "learning_rate": 1.1943390827852917e-05, + "loss": 0.2208, + "step": 92425 + }, + { + "epoch": 83.66515837104072, + "grad_norm": 1.7578043937683105, + "learning_rate": 1.1911342078515374e-05, + "loss": 0.2245, + "step": 92450 + }, + { + "epoch": 83.68778280542986, + "grad_norm": 1.1728429794311523, + "learning_rate": 1.1879332677068335e-05, + "loss": 0.1893, + "step": 92475 + }, + { + "epoch": 83.710407239819, + "grad_norm": 2.2865982055664062, + "learning_rate": 1.1847362643475789e-05, + "loss": 0.2877, + "step": 92500 + }, + { + "epoch": 83.73303167420815, + "grad_norm": 1.3461461067199707, + "learning_rate": 1.1815431997677201e-05, + "loss": 0.1979, + "step": 92525 + }, + { + "epoch": 83.75565610859728, + "grad_norm": 1.475647211074829, + "learning_rate": 1.1783540759587445e-05, + "loss": 0.2905, + "step": 92550 + }, + { + "epoch": 83.77828054298642, + "grad_norm": 2.1079046726226807, + "learning_rate": 1.1751688949096857e-05, + "loss": 0.2425, + "step": 92575 + }, + { + "epoch": 83.80090497737557, + "grad_norm": 1.268576979637146, + "learning_rate": 1.1719876586071114e-05, + "loss": 0.2529, + "step": 92600 + }, + { + "epoch": 83.82352941176471, + "grad_norm": 1.504587173461914, + "learning_rate": 1.1688103690351377e-05, + "loss": 0.2188, + "step": 92625 + }, + { + "epoch": 83.84615384615384, + "grad_norm": 1.9538213014602661, + "learning_rate": 1.1656370281754113e-05, + "loss": 0.3516, + "step": 92650 + }, + { + "epoch": 83.86877828054298, + "grad_norm": 1.2477388381958008, + "learning_rate": 1.162467638007122e-05, + "loss": 0.2276, + "step": 92675 + }, + { + "epoch": 83.89140271493213, + "grad_norm": 1.0654138326644897, + "learning_rate": 1.1593022005069908e-05, + "loss": 0.227, + "step": 92700 + }, + { + "epoch": 83.91402714932127, + "grad_norm": 1.1622322797775269, + "learning_rate": 1.156140717649277e-05, + "loss": 0.2651, + "step": 92725 + }, + { + "epoch": 83.9366515837104, + "grad_norm": 1.1602331399917603, + "learning_rate": 
1.1529831914057713e-05, + "loss": 0.2379, + "step": 92750 + }, + { + "epoch": 83.95927601809954, + "grad_norm": 2.367241382598877, + "learning_rate": 1.1498296237458e-05, + "loss": 0.2841, + "step": 92775 + }, + { + "epoch": 83.98190045248869, + "grad_norm": 2.114567518234253, + "learning_rate": 1.1466800166362136e-05, + "loss": 0.246, + "step": 92800 + }, + { + "epoch": 84.00452488687783, + "grad_norm": 1.3881651163101196, + "learning_rate": 1.1435343720413986e-05, + "loss": 0.2544, + "step": 92825 + }, + { + "epoch": 84.02714932126698, + "grad_norm": 1.0981236696243286, + "learning_rate": 1.1403926919232706e-05, + "loss": 0.2271, + "step": 92850 + }, + { + "epoch": 84.0497737556561, + "grad_norm": 1.800406575202942, + "learning_rate": 1.1372549782412696e-05, + "loss": 0.2432, + "step": 92875 + }, + { + "epoch": 84.07239819004525, + "grad_norm": 1.1198346614837646, + "learning_rate": 1.1341212329523594e-05, + "loss": 0.3307, + "step": 92900 + }, + { + "epoch": 84.09502262443439, + "grad_norm": 0.9892085194587708, + "learning_rate": 1.1309914580110367e-05, + "loss": 0.1931, + "step": 92925 + }, + { + "epoch": 84.11764705882354, + "grad_norm": 1.205314040184021, + "learning_rate": 1.1278656553693122e-05, + "loss": 0.234, + "step": 92950 + }, + { + "epoch": 84.14027149321267, + "grad_norm": 0.7759678363800049, + "learning_rate": 1.1247438269767275e-05, + "loss": 0.1786, + "step": 92975 + }, + { + "epoch": 84.16289592760181, + "grad_norm": 1.9085394144058228, + "learning_rate": 1.1216259747803394e-05, + "loss": 0.1732, + "step": 93000 + }, + { + "epoch": 84.18552036199095, + "grad_norm": 4.513044834136963, + "learning_rate": 1.1185121007247305e-05, + "loss": 0.2084, + "step": 93025 + }, + { + "epoch": 84.2081447963801, + "grad_norm": 1.1749680042266846, + "learning_rate": 1.115402206751995e-05, + "loss": 0.2534, + "step": 93050 + }, + { + "epoch": 84.23076923076923, + "grad_norm": 1.0119569301605225, + "learning_rate": 1.1122962948017528e-05, + "loss": 0.1841, + 
"step": 93075 + }, + { + "epoch": 84.25339366515837, + "grad_norm": 0.9673476219177246, + "learning_rate": 1.1091943668111327e-05, + "loss": 0.2651, + "step": 93100 + }, + { + "epoch": 84.27601809954751, + "grad_norm": 1.5067884922027588, + "learning_rate": 1.1060964247147857e-05, + "loss": 0.2389, + "step": 93125 + }, + { + "epoch": 84.29864253393666, + "grad_norm": 1.245119571685791, + "learning_rate": 1.1030024704448703e-05, + "loss": 0.1927, + "step": 93150 + }, + { + "epoch": 84.32126696832579, + "grad_norm": 1.9086949825286865, + "learning_rate": 1.0999125059310646e-05, + "loss": 0.2167, + "step": 93175 + }, + { + "epoch": 84.34389140271493, + "grad_norm": 1.1651722192764282, + "learning_rate": 1.0968265331005511e-05, + "loss": 0.1841, + "step": 93200 + }, + { + "epoch": 84.36651583710407, + "grad_norm": 1.0608320236206055, + "learning_rate": 1.0937445538780293e-05, + "loss": 0.1861, + "step": 93225 + }, + { + "epoch": 84.38914027149322, + "grad_norm": 1.3786742687225342, + "learning_rate": 1.0906665701857017e-05, + "loss": 0.2703, + "step": 93250 + }, + { + "epoch": 84.41176470588235, + "grad_norm": 1.4715839624404907, + "learning_rate": 1.0875925839432862e-05, + "loss": 0.2952, + "step": 93275 + }, + { + "epoch": 84.43438914027149, + "grad_norm": 1.4558757543563843, + "learning_rate": 1.0845225970679989e-05, + "loss": 0.2203, + "step": 93300 + }, + { + "epoch": 84.45701357466064, + "grad_norm": 1.2345609664916992, + "learning_rate": 1.0814566114745698e-05, + "loss": 0.2048, + "step": 93325 + }, + { + "epoch": 84.47963800904978, + "grad_norm": 1.4966411590576172, + "learning_rate": 1.0783946290752271e-05, + "loss": 0.2378, + "step": 93350 + }, + { + "epoch": 84.50226244343891, + "grad_norm": 1.184018850326538, + "learning_rate": 1.0753366517797071e-05, + "loss": 0.2707, + "step": 93375 + }, + { + "epoch": 84.52488687782805, + "grad_norm": 1.4262800216674805, + "learning_rate": 1.0722826814952418e-05, + "loss": 0.2474, + "step": 93400 + }, + { + "epoch": 
84.5475113122172, + "grad_norm": 2.6090216636657715, + "learning_rate": 1.0692327201265724e-05, + "loss": 0.2004, + "step": 93425 + }, + { + "epoch": 84.57013574660634, + "grad_norm": 1.0995943546295166, + "learning_rate": 1.0661867695759324e-05, + "loss": 0.2601, + "step": 93450 + }, + { + "epoch": 84.59276018099547, + "grad_norm": 1.1398271322250366, + "learning_rate": 1.0631448317430589e-05, + "loss": 0.2098, + "step": 93475 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 2.0760931968688965, + "learning_rate": 1.0601069085251816e-05, + "loss": 0.2365, + "step": 93500 + }, + { + "epoch": 84.63800904977376, + "grad_norm": 1.293813705444336, + "learning_rate": 1.0570730018170314e-05, + "loss": 0.2195, + "step": 93525 + }, + { + "epoch": 84.6606334841629, + "grad_norm": 5.195942401885986, + "learning_rate": 1.0540431135108294e-05, + "loss": 0.2578, + "step": 93550 + }, + { + "epoch": 84.68325791855203, + "grad_norm": 1.6340610980987549, + "learning_rate": 1.0510172454962951e-05, + "loss": 0.2413, + "step": 93575 + }, + { + "epoch": 84.70588235294117, + "grad_norm": 1.8091683387756348, + "learning_rate": 1.0479953996606358e-05, + "loss": 0.2124, + "step": 93600 + }, + { + "epoch": 84.72850678733032, + "grad_norm": 1.0600144863128662, + "learning_rate": 1.0449775778885538e-05, + "loss": 0.2983, + "step": 93625 + }, + { + "epoch": 84.75113122171946, + "grad_norm": 1.701952338218689, + "learning_rate": 1.0419637820622394e-05, + "loss": 0.2389, + "step": 93650 + }, + { + "epoch": 84.77375565610859, + "grad_norm": 1.4062515497207642, + "learning_rate": 1.0389540140613733e-05, + "loss": 0.2156, + "step": 93675 + }, + { + "epoch": 84.79638009049773, + "grad_norm": 1.0955379009246826, + "learning_rate": 1.0359482757631263e-05, + "loss": 0.2286, + "step": 93700 + }, + { + "epoch": 84.81900452488688, + "grad_norm": 1.2862584590911865, + "learning_rate": 1.0329465690421488e-05, + "loss": 0.2216, + "step": 93725 + }, + { + "epoch": 84.84162895927602, + "grad_norm": 
2.141054153442383, + "learning_rate": 1.0299488957705848e-05, + "loss": 0.2311, + "step": 93750 + }, + { + "epoch": 84.86425339366515, + "grad_norm": 1.131123423576355, + "learning_rate": 1.0269552578180564e-05, + "loss": 0.2346, + "step": 93775 + }, + { + "epoch": 84.8868778280543, + "grad_norm": 1.3921465873718262, + "learning_rate": 1.0239656570516752e-05, + "loss": 0.2685, + "step": 93800 + }, + { + "epoch": 84.90950226244344, + "grad_norm": 5.968398571014404, + "learning_rate": 1.0209800953360261e-05, + "loss": 0.3258, + "step": 93825 + }, + { + "epoch": 84.93212669683258, + "grad_norm": 1.6810258626937866, + "learning_rate": 1.0179985745331856e-05, + "loss": 0.2198, + "step": 93850 + }, + { + "epoch": 84.95475113122171, + "grad_norm": 4.591853618621826, + "learning_rate": 1.0150210965026995e-05, + "loss": 0.2331, + "step": 93875 + }, + { + "epoch": 84.97737556561086, + "grad_norm": 1.592495322227478, + "learning_rate": 1.0120476631016005e-05, + "loss": 0.2732, + "step": 93900 + }, + { + "epoch": 85.0, + "grad_norm": 1.7209784984588623, + "learning_rate": 1.0090782761843929e-05, + "loss": 0.2067, + "step": 93925 + }, + { + "epoch": 85.02262443438914, + "grad_norm": 1.0648268461227417, + "learning_rate": 1.006112937603062e-05, + "loss": 0.1947, + "step": 93950 + }, + { + "epoch": 85.04524886877829, + "grad_norm": 1.4495179653167725, + "learning_rate": 1.003151649207062e-05, + "loss": 0.2439, + "step": 93975 + }, + { + "epoch": 85.06787330316742, + "grad_norm": 1.446929931640625, + "learning_rate": 1.0001944128433287e-05, + "loss": 0.1935, + "step": 94000 + }, + { + "epoch": 85.09049773755656, + "grad_norm": 1.1472259759902954, + "learning_rate": 9.97241230356263e-06, + "loss": 0.2541, + "step": 94025 + }, + { + "epoch": 85.1131221719457, + "grad_norm": 2.1452972888946533, + "learning_rate": 9.94292103587745e-06, + "loss": 0.1793, + "step": 94050 + }, + { + "epoch": 85.13574660633485, + "grad_norm": 1.1351040601730347, + "learning_rate": 9.913470343771182e-06, + 
"loss": 0.2169, + "step": 94075 + }, + { + "epoch": 85.15837104072398, + "grad_norm": 1.1573127508163452, + "learning_rate": 9.88406024561201e-06, + "loss": 0.2461, + "step": 94100 + }, + { + "epoch": 85.18099547511312, + "grad_norm": 1.3857795000076294, + "learning_rate": 9.854690759742761e-06, + "loss": 0.2407, + "step": 94125 + }, + { + "epoch": 85.20361990950227, + "grad_norm": 1.405233383178711, + "learning_rate": 9.825361904480957e-06, + "loss": 0.24, + "step": 94150 + }, + { + "epoch": 85.22624434389141, + "grad_norm": 1.58432137966156, + "learning_rate": 9.796073698118758e-06, + "loss": 0.1896, + "step": 94175 + }, + { + "epoch": 85.24886877828054, + "grad_norm": 1.1714897155761719, + "learning_rate": 9.76682615892301e-06, + "loss": 0.2228, + "step": 94200 + }, + { + "epoch": 85.27149321266968, + "grad_norm": 1.204953670501709, + "learning_rate": 9.73761930513513e-06, + "loss": 0.2495, + "step": 94225 + }, + { + "epoch": 85.29411764705883, + "grad_norm": 1.3959544897079468, + "learning_rate": 9.708453154971237e-06, + "loss": 0.2837, + "step": 94250 + }, + { + "epoch": 85.31674208144797, + "grad_norm": 0.8167849779129028, + "learning_rate": 9.679327726621999e-06, + "loss": 0.1963, + "step": 94275 + }, + { + "epoch": 85.3393665158371, + "grad_norm": 0.5334164500236511, + "learning_rate": 9.650243038252725e-06, + "loss": 0.2422, + "step": 94300 + }, + { + "epoch": 85.36199095022624, + "grad_norm": 0.9496262073516846, + "learning_rate": 9.621199108003288e-06, + "loss": 0.2173, + "step": 94325 + }, + { + "epoch": 85.38461538461539, + "grad_norm": 0.8168772459030151, + "learning_rate": 9.592195953988191e-06, + "loss": 0.228, + "step": 94350 + }, + { + "epoch": 85.40723981900453, + "grad_norm": 1.048490285873413, + "learning_rate": 9.563233594296412e-06, + "loss": 0.2297, + "step": 94375 + }, + { + "epoch": 85.42986425339366, + "grad_norm": 3.6872718334198, + "learning_rate": 9.534312046991596e-06, + "loss": 0.244, + "step": 94400 + }, + { + "epoch": 
85.4524886877828, + "grad_norm": 1.9084551334381104, + "learning_rate": 9.505431330111845e-06, + "loss": 0.2154, + "step": 94425 + }, + { + "epoch": 85.47511312217195, + "grad_norm": 0.738849401473999, + "learning_rate": 9.476591461669852e-06, + "loss": 0.2178, + "step": 94450 + }, + { + "epoch": 85.49773755656109, + "grad_norm": 0.7861320376396179, + "learning_rate": 9.447792459652794e-06, + "loss": 0.2324, + "step": 94475 + }, + { + "epoch": 85.52036199095022, + "grad_norm": 0.9632349014282227, + "learning_rate": 9.419034342022406e-06, + "loss": 0.2168, + "step": 94500 + }, + { + "epoch": 85.54298642533936, + "grad_norm": 1.1395058631896973, + "learning_rate": 9.390317126714852e-06, + "loss": 0.23, + "step": 94525 + }, + { + "epoch": 85.56561085972851, + "grad_norm": 0.8488622307777405, + "learning_rate": 9.361640831640876e-06, + "loss": 0.2729, + "step": 94550 + }, + { + "epoch": 85.58823529411765, + "grad_norm": 1.4476090669631958, + "learning_rate": 9.333005474685621e-06, + "loss": 0.2602, + "step": 94575 + }, + { + "epoch": 85.61085972850678, + "grad_norm": 4.248432159423828, + "learning_rate": 9.304411073708748e-06, + "loss": 0.2542, + "step": 94600 + }, + { + "epoch": 85.63348416289593, + "grad_norm": 1.1603845357894897, + "learning_rate": 9.275857646544336e-06, + "loss": 0.1941, + "step": 94625 + }, + { + "epoch": 85.65610859728507, + "grad_norm": 1.3284860849380493, + "learning_rate": 9.247345211000954e-06, + "loss": 0.248, + "step": 94650 + }, + { + "epoch": 85.67873303167421, + "grad_norm": 1.5702111721038818, + "learning_rate": 9.218873784861544e-06, + "loss": 0.2863, + "step": 94675 + }, + { + "epoch": 85.70135746606334, + "grad_norm": 0.984173059463501, + "learning_rate": 9.191579813898661e-06, + "loss": 0.321, + "step": 94700 + }, + { + "epoch": 85.72398190045249, + "grad_norm": 1.5825644731521606, + "learning_rate": 9.163188817678019e-06, + "loss": 0.2568, + "step": 94725 + }, + { + "epoch": 85.74660633484163, + "grad_norm": 0.6896769404411316, + 
"learning_rate": 9.134838883349022e-06, + "loss": 0.2086, + "step": 94750 + }, + { + "epoch": 85.76923076923077, + "grad_norm": 1.1233223676681519, + "learning_rate": 9.106530028593325e-06, + "loss": 0.3007, + "step": 94775 + }, + { + "epoch": 85.7918552036199, + "grad_norm": 1.124239444732666, + "learning_rate": 9.078262271066916e-06, + "loss": 0.2402, + "step": 94800 + }, + { + "epoch": 85.81447963800905, + "grad_norm": 1.208046317100525, + "learning_rate": 9.05003562840019e-06, + "loss": 0.2751, + "step": 94825 + }, + { + "epoch": 85.83710407239819, + "grad_norm": 1.4039021730422974, + "learning_rate": 9.021850118197848e-06, + "loss": 0.2098, + "step": 94850 + }, + { + "epoch": 85.85972850678733, + "grad_norm": 1.041251540184021, + "learning_rate": 8.993705758039004e-06, + "loss": 0.2017, + "step": 94875 + }, + { + "epoch": 85.88235294117646, + "grad_norm": 1.353350281715393, + "learning_rate": 8.965602565477025e-06, + "loss": 0.1931, + "step": 94900 + }, + { + "epoch": 85.90497737556561, + "grad_norm": 1.6761797666549683, + "learning_rate": 8.937540558039675e-06, + "loss": 0.2041, + "step": 94925 + }, + { + "epoch": 85.92760180995475, + "grad_norm": 1.4602274894714355, + "learning_rate": 8.909519753229016e-06, + "loss": 0.2115, + "step": 94950 + }, + { + "epoch": 85.9502262443439, + "grad_norm": 1.3357871770858765, + "learning_rate": 8.881540168521364e-06, + "loss": 0.1892, + "step": 94975 + }, + { + "epoch": 85.97285067873302, + "grad_norm": 1.0203659534454346, + "learning_rate": 8.8536018213674e-06, + "loss": 0.2226, + "step": 95000 + }, + { + "epoch": 85.99547511312217, + "grad_norm": 0.8663904666900635, + "learning_rate": 8.825704729192013e-06, + "loss": 0.1884, + "step": 95025 + }, + { + "epoch": 86.01809954751131, + "grad_norm": 1.6109756231307983, + "learning_rate": 8.797848909394422e-06, + "loss": 0.1958, + "step": 95050 + }, + { + "epoch": 86.04072398190046, + "grad_norm": 0.881666898727417, + "learning_rate": 8.77003437934806e-06, + "loss": 0.2421, + 
"step": 95075 + }, + { + "epoch": 86.0633484162896, + "grad_norm": 1.2642569541931152, + "learning_rate": 8.742261156400645e-06, + "loss": 0.2097, + "step": 95100 + }, + { + "epoch": 86.08597285067873, + "grad_norm": 0.5287483334541321, + "learning_rate": 8.714529257874084e-06, + "loss": 0.2161, + "step": 95125 + }, + { + "epoch": 86.10859728506787, + "grad_norm": 1.08829665184021, + "learning_rate": 8.68683870106458e-06, + "loss": 0.2238, + "step": 95150 + }, + { + "epoch": 86.13122171945702, + "grad_norm": 0.4798315465450287, + "learning_rate": 8.659189503242469e-06, + "loss": 0.2238, + "step": 95175 + }, + { + "epoch": 86.15384615384616, + "grad_norm": 1.7319557666778564, + "learning_rate": 8.631581681652375e-06, + "loss": 0.2451, + "step": 95200 + }, + { + "epoch": 86.17647058823529, + "grad_norm": 0.795026421546936, + "learning_rate": 8.604015253513038e-06, + "loss": 0.2117, + "step": 95225 + }, + { + "epoch": 86.19909502262443, + "grad_norm": 1.3550655841827393, + "learning_rate": 8.57649023601745e-06, + "loss": 0.2975, + "step": 95250 + }, + { + "epoch": 86.22171945701358, + "grad_norm": 0.6398485898971558, + "learning_rate": 8.549006646332709e-06, + "loss": 0.2096, + "step": 95275 + }, + { + "epoch": 86.24434389140272, + "grad_norm": 4.678731441497803, + "learning_rate": 8.521564501600156e-06, + "loss": 0.2141, + "step": 95300 + }, + { + "epoch": 86.26696832579185, + "grad_norm": 1.8914018869400024, + "learning_rate": 8.49416381893519e-06, + "loss": 0.2474, + "step": 95325 + }, + { + "epoch": 86.289592760181, + "grad_norm": 0.6787257194519043, + "learning_rate": 8.466804615427425e-06, + "loss": 0.2422, + "step": 95350 + }, + { + "epoch": 86.31221719457014, + "grad_norm": 1.2384531497955322, + "learning_rate": 8.439486908140562e-06, + "loss": 0.2872, + "step": 95375 + }, + { + "epoch": 86.33484162895928, + "grad_norm": 1.7806782722473145, + "learning_rate": 8.41221071411246e-06, + "loss": 0.2452, + "step": 95400 + }, + { + "epoch": 86.35746606334841, + 
"grad_norm": 0.5884113311767578, + "learning_rate": 8.384976050355041e-06, + "loss": 0.267, + "step": 95425 + }, + { + "epoch": 86.38009049773756, + "grad_norm": 1.2176893949508667, + "learning_rate": 8.357782933854357e-06, + "loss": 0.3127, + "step": 95450 + }, + { + "epoch": 86.4027149321267, + "grad_norm": 1.3576347827911377, + "learning_rate": 8.330631381570524e-06, + "loss": 0.2059, + "step": 95475 + }, + { + "epoch": 86.42533936651584, + "grad_norm": 1.3243180513381958, + "learning_rate": 8.303521410437772e-06, + "loss": 0.2054, + "step": 95500 + }, + { + "epoch": 86.44796380090497, + "grad_norm": 2.1407980918884277, + "learning_rate": 8.276453037364342e-06, + "loss": 0.2466, + "step": 95525 + }, + { + "epoch": 86.47058823529412, + "grad_norm": 1.1910250186920166, + "learning_rate": 8.249426279232587e-06, + "loss": 0.2047, + "step": 95550 + }, + { + "epoch": 86.49321266968326, + "grad_norm": 1.1588889360427856, + "learning_rate": 8.222441152898859e-06, + "loss": 0.2252, + "step": 95575 + }, + { + "epoch": 86.5158371040724, + "grad_norm": 0.9993530511856079, + "learning_rate": 8.195497675193586e-06, + "loss": 0.2225, + "step": 95600 + }, + { + "epoch": 86.53846153846153, + "grad_norm": 1.0689810514450073, + "learning_rate": 8.168595862921174e-06, + "loss": 0.1916, + "step": 95625 + }, + { + "epoch": 86.56108597285068, + "grad_norm": 1.3326655626296997, + "learning_rate": 8.141735732860102e-06, + "loss": 0.1635, + "step": 95650 + }, + { + "epoch": 86.58371040723982, + "grad_norm": 1.4869006872177124, + "learning_rate": 8.11491730176278e-06, + "loss": 0.2113, + "step": 95675 + }, + { + "epoch": 86.60633484162896, + "grad_norm": 1.290174126625061, + "learning_rate": 8.088140586355677e-06, + "loss": 0.2214, + "step": 95700 + }, + { + "epoch": 86.6289592760181, + "grad_norm": 1.172241449356079, + "learning_rate": 8.061405603339199e-06, + "loss": 0.2019, + "step": 95725 + }, + { + "epoch": 86.65158371040724, + "grad_norm": 1.8281601667404175, + "learning_rate": 
8.034712369387752e-06, + "loss": 0.2456, + "step": 95750 + }, + { + "epoch": 86.67420814479638, + "grad_norm": 1.2891125679016113, + "learning_rate": 8.008060901149685e-06, + "loss": 0.1795, + "step": 95775 + }, + { + "epoch": 86.69683257918552, + "grad_norm": 1.448681354522705, + "learning_rate": 7.981451215247317e-06, + "loss": 0.2167, + "step": 95800 + }, + { + "epoch": 86.71945701357465, + "grad_norm": 1.2627946138381958, + "learning_rate": 7.954883328276864e-06, + "loss": 0.1876, + "step": 95825 + }, + { + "epoch": 86.7420814479638, + "grad_norm": 0.7485878467559814, + "learning_rate": 7.928357256808549e-06, + "loss": 0.247, + "step": 95850 + }, + { + "epoch": 86.76470588235294, + "grad_norm": 1.0917359590530396, + "learning_rate": 7.901873017386435e-06, + "loss": 0.2737, + "step": 95875 + }, + { + "epoch": 86.78733031674209, + "grad_norm": 1.3244069814682007, + "learning_rate": 7.87543062652856e-06, + "loss": 0.2113, + "step": 95900 + }, + { + "epoch": 86.80995475113122, + "grad_norm": 1.586942195892334, + "learning_rate": 7.849030100726789e-06, + "loss": 0.2813, + "step": 95925 + }, + { + "epoch": 86.83257918552036, + "grad_norm": 0.8971759080886841, + "learning_rate": 7.822671456446961e-06, + "loss": 0.235, + "step": 95950 + }, + { + "epoch": 86.8552036199095, + "grad_norm": 1.3623031377792358, + "learning_rate": 7.796354710128724e-06, + "loss": 0.2643, + "step": 95975 + }, + { + "epoch": 86.87782805429865, + "grad_norm": 1.6224130392074585, + "learning_rate": 7.77007987818565e-06, + "loss": 0.2527, + "step": 96000 + }, + { + "epoch": 86.90045248868778, + "grad_norm": 1.5212302207946777, + "learning_rate": 7.743846977005097e-06, + "loss": 0.1836, + "step": 96025 + }, + { + "epoch": 86.92307692307692, + "grad_norm": 0.9528728127479553, + "learning_rate": 7.717656022948367e-06, + "loss": 0.1959, + "step": 96050 + }, + { + "epoch": 86.94570135746606, + "grad_norm": 1.2689321041107178, + "learning_rate": 7.691507032350506e-06, + "loss": 0.1851, + "step": 96075 
+ }, + { + "epoch": 86.96832579185521, + "grad_norm": 0.9713883996009827, + "learning_rate": 7.665400021520454e-06, + "loss": 0.2318, + "step": 96100 + }, + { + "epoch": 86.99095022624434, + "grad_norm": 1.0001325607299805, + "learning_rate": 7.639335006740924e-06, + "loss": 0.2098, + "step": 96125 + }, + { + "epoch": 87.01357466063348, + "grad_norm": 1.2248291969299316, + "learning_rate": 7.613312004268484e-06, + "loss": 0.1827, + "step": 96150 + }, + { + "epoch": 87.03619909502262, + "grad_norm": 3.51912784576416, + "learning_rate": 7.587331030333454e-06, + "loss": 0.2468, + "step": 96175 + }, + { + "epoch": 87.05882352941177, + "grad_norm": 4.302508354187012, + "learning_rate": 7.561392101139971e-06, + "loss": 0.2708, + "step": 96200 + }, + { + "epoch": 87.08144796380091, + "grad_norm": 1.4801913499832153, + "learning_rate": 7.53549523286591e-06, + "loss": 0.2225, + "step": 96225 + }, + { + "epoch": 87.10407239819004, + "grad_norm": 1.3829647302627563, + "learning_rate": 7.509640441662976e-06, + "loss": 0.1742, + "step": 96250 + }, + { + "epoch": 87.12669683257919, + "grad_norm": 1.7717469930648804, + "learning_rate": 7.483827743656571e-06, + "loss": 0.1917, + "step": 96275 + }, + { + "epoch": 87.14932126696833, + "grad_norm": 1.1577367782592773, + "learning_rate": 7.458057154945882e-06, + "loss": 0.2072, + "step": 96300 + }, + { + "epoch": 87.17194570135747, + "grad_norm": 1.3073642253875732, + "learning_rate": 7.432328691603803e-06, + "loss": 0.2031, + "step": 96325 + }, + { + "epoch": 87.1945701357466, + "grad_norm": 0.7133212089538574, + "learning_rate": 7.4066423696769905e-06, + "loss": 0.272, + "step": 96350 + }, + { + "epoch": 87.21719457013575, + "grad_norm": 1.3051916360855103, + "learning_rate": 7.380998205185778e-06, + "loss": 0.1908, + "step": 96375 + }, + { + "epoch": 87.23981900452489, + "grad_norm": 1.0187506675720215, + "learning_rate": 7.355396214124249e-06, + "loss": 0.2285, + "step": 96400 + }, + { + "epoch": 87.26244343891403, + "grad_norm": 
0.6291978359222412, + "learning_rate": 7.329836412460127e-06, + "loss": 0.2747, + "step": 96425 + }, + { + "epoch": 87.28506787330316, + "grad_norm": 1.2138255834579468, + "learning_rate": 7.30431881613488e-06, + "loss": 0.2397, + "step": 96450 + }, + { + "epoch": 87.3076923076923, + "grad_norm": 1.0286800861358643, + "learning_rate": 7.278843441063633e-06, + "loss": 0.2344, + "step": 96475 + }, + { + "epoch": 87.33031674208145, + "grad_norm": 1.1416869163513184, + "learning_rate": 7.253410303135154e-06, + "loss": 0.2007, + "step": 96500 + }, + { + "epoch": 87.3529411764706, + "grad_norm": 1.5559656620025635, + "learning_rate": 7.228019418211903e-06, + "loss": 0.2306, + "step": 96525 + }, + { + "epoch": 87.37556561085972, + "grad_norm": 1.1798025369644165, + "learning_rate": 7.202670802129954e-06, + "loss": 0.1642, + "step": 96550 + }, + { + "epoch": 87.39819004524887, + "grad_norm": 1.2939826250076294, + "learning_rate": 7.17736447069906e-06, + "loss": 0.2186, + "step": 96575 + }, + { + "epoch": 87.42081447963801, + "grad_norm": 1.5505955219268799, + "learning_rate": 7.152100439702555e-06, + "loss": 0.1967, + "step": 96600 + }, + { + "epoch": 87.44343891402715, + "grad_norm": 1.1835687160491943, + "learning_rate": 7.126878724897434e-06, + "loss": 0.1904, + "step": 96625 + }, + { + "epoch": 87.46606334841628, + "grad_norm": 1.1613070964813232, + "learning_rate": 7.101699342014247e-06, + "loss": 0.1898, + "step": 96650 + }, + { + "epoch": 87.48868778280543, + "grad_norm": 0.9732732176780701, + "learning_rate": 7.076562306757208e-06, + "loss": 0.2852, + "step": 96675 + }, + { + "epoch": 87.51131221719457, + "grad_norm": 1.5406293869018555, + "learning_rate": 7.051467634804059e-06, + "loss": 0.2073, + "step": 96700 + }, + { + "epoch": 87.53393665158372, + "grad_norm": 1.4113398790359497, + "learning_rate": 7.0264153418061634e-06, + "loss": 0.2592, + "step": 96725 + }, + { + "epoch": 87.55656108597285, + "grad_norm": 1.0180559158325195, + "learning_rate": 
7.001405443388422e-06, + "loss": 0.2635, + "step": 96750 + }, + { + "epoch": 87.57918552036199, + "grad_norm": 0.9436126351356506, + "learning_rate": 6.977435840208026e-06, + "loss": 0.1937, + "step": 96775 + }, + { + "epoch": 87.60180995475113, + "grad_norm": 3.945801258087158, + "learning_rate": 6.9525090803909055e-06, + "loss": 0.225, + "step": 96800 + }, + { + "epoch": 87.62443438914028, + "grad_norm": 0.9749020934104919, + "learning_rate": 6.927624761248676e-06, + "loss": 0.2477, + "step": 96825 + }, + { + "epoch": 87.6470588235294, + "grad_norm": 1.1191080808639526, + "learning_rate": 6.902782898301515e-06, + "loss": 0.2031, + "step": 96850 + }, + { + "epoch": 87.66968325791855, + "grad_norm": 6.480875015258789, + "learning_rate": 6.8779835070430695e-06, + "loss": 0.3124, + "step": 96875 + }, + { + "epoch": 87.6923076923077, + "grad_norm": 1.1927406787872314, + "learning_rate": 6.853226602940534e-06, + "loss": 0.2204, + "step": 96900 + }, + { + "epoch": 87.71493212669684, + "grad_norm": 1.3960765600204468, + "learning_rate": 6.828512201434574e-06, + "loss": 0.207, + "step": 96925 + }, + { + "epoch": 87.73755656108597, + "grad_norm": 1.835037350654602, + "learning_rate": 6.8038403179394015e-06, + "loss": 0.173, + "step": 96950 + }, + { + "epoch": 87.76018099547511, + "grad_norm": 1.1787928342819214, + "learning_rate": 6.779210967842624e-06, + "loss": 0.1785, + "step": 96975 + }, + { + "epoch": 87.78280542986425, + "grad_norm": 1.1594642400741577, + "learning_rate": 6.754624166505412e-06, + "loss": 0.1793, + "step": 97000 + }, + { + "epoch": 87.8054298642534, + "grad_norm": 1.0816947221755981, + "learning_rate": 6.730079929262325e-06, + "loss": 0.1877, + "step": 97025 + }, + { + "epoch": 87.82805429864253, + "grad_norm": 1.2309519052505493, + "learning_rate": 6.7055782714214415e-06, + "loss": 0.2552, + "step": 97050 + }, + { + "epoch": 87.85067873303167, + "grad_norm": 1.8382105827331543, + "learning_rate": 6.6811192082642045e-06, + "loss": 0.2752, + "step": 
97075 + }, + { + "epoch": 87.87330316742081, + "grad_norm": 1.8699984550476074, + "learning_rate": 6.656702755045579e-06, + "loss": 0.2458, + "step": 97100 + }, + { + "epoch": 87.89592760180996, + "grad_norm": 1.124491810798645, + "learning_rate": 6.632328926993874e-06, + "loss": 0.2592, + "step": 97125 + }, + { + "epoch": 87.91855203619909, + "grad_norm": 1.0307016372680664, + "learning_rate": 6.607997739310889e-06, + "loss": 0.1954, + "step": 97150 + }, + { + "epoch": 87.94117647058823, + "grad_norm": 1.102413535118103, + "learning_rate": 6.58370920717175e-06, + "loss": 0.1964, + "step": 97175 + }, + { + "epoch": 87.96380090497738, + "grad_norm": 1.0357909202575684, + "learning_rate": 6.559463345725058e-06, + "loss": 0.2379, + "step": 97200 + }, + { + "epoch": 87.98642533936652, + "grad_norm": 1.1607322692871094, + "learning_rate": 6.535260170092732e-06, + "loss": 0.2297, + "step": 97225 + }, + { + "epoch": 88.00904977375566, + "grad_norm": 1.5806924104690552, + "learning_rate": 6.5110996953701225e-06, + "loss": 0.2144, + "step": 97250 + }, + { + "epoch": 88.03167420814479, + "grad_norm": 1.0098795890808105, + "learning_rate": 6.486981936625901e-06, + "loss": 0.2351, + "step": 97275 + }, + { + "epoch": 88.05429864253394, + "grad_norm": 0.9517509341239929, + "learning_rate": 6.462906908902143e-06, + "loss": 0.1783, + "step": 97300 + }, + { + "epoch": 88.07692307692308, + "grad_norm": 1.001928687095642, + "learning_rate": 6.43887462721423e-06, + "loss": 0.1693, + "step": 97325 + }, + { + "epoch": 88.09954751131222, + "grad_norm": 1.1728754043579102, + "learning_rate": 6.414885106550929e-06, + "loss": 0.1852, + "step": 97350 + }, + { + "epoch": 88.12217194570135, + "grad_norm": 1.1057087182998657, + "learning_rate": 6.390938361874282e-06, + "loss": 0.2166, + "step": 97375 + }, + { + "epoch": 88.1447963800905, + "grad_norm": 1.3694144487380981, + "learning_rate": 6.367034408119706e-06, + "loss": 0.1823, + "step": 97400 + }, + { + "epoch": 88.16742081447964, + 
"grad_norm": 1.2324395179748535, + "learning_rate": 6.343173260195885e-06, + "loss": 0.2074, + "step": 97425 + }, + { + "epoch": 88.19004524886878, + "grad_norm": 1.6150360107421875, + "learning_rate": 6.319354932984849e-06, + "loss": 0.2048, + "step": 97450 + }, + { + "epoch": 88.21266968325791, + "grad_norm": 1.3420321941375732, + "learning_rate": 6.295579441341872e-06, + "loss": 0.2302, + "step": 97475 + }, + { + "epoch": 88.23529411764706, + "grad_norm": 2.5951383113861084, + "learning_rate": 6.2718468000955675e-06, + "loss": 0.2142, + "step": 97500 + }, + { + "epoch": 88.2579185520362, + "grad_norm": 1.2469367980957031, + "learning_rate": 6.248157024047762e-06, + "loss": 0.2217, + "step": 97525 + }, + { + "epoch": 88.28054298642535, + "grad_norm": 1.5069005489349365, + "learning_rate": 6.224510127973603e-06, + "loss": 0.215, + "step": 97550 + }, + { + "epoch": 88.30316742081448, + "grad_norm": 1.5617077350616455, + "learning_rate": 6.20090612662146e-06, + "loss": 0.3176, + "step": 97575 + }, + { + "epoch": 88.32579185520362, + "grad_norm": 0.839979887008667, + "learning_rate": 6.177345034712966e-06, + "loss": 0.1975, + "step": 97600 + }, + { + "epoch": 88.34841628959276, + "grad_norm": 1.1441539525985718, + "learning_rate": 6.1538268669429655e-06, + "loss": 0.1989, + "step": 97625 + }, + { + "epoch": 88.3710407239819, + "grad_norm": 0.4892381429672241, + "learning_rate": 6.130351637979583e-06, + "loss": 0.2742, + "step": 97650 + }, + { + "epoch": 88.39366515837104, + "grad_norm": 1.1934547424316406, + "learning_rate": 6.106919362464099e-06, + "loss": 0.1969, + "step": 97675 + }, + { + "epoch": 88.41628959276018, + "grad_norm": 0.8226586580276489, + "learning_rate": 6.083530055011048e-06, + "loss": 0.1789, + "step": 97700 + }, + { + "epoch": 88.43891402714932, + "grad_norm": 0.802721381187439, + "learning_rate": 6.060183730208171e-06, + "loss": 0.2685, + "step": 97725 + }, + { + "epoch": 88.46153846153847, + "grad_norm": 1.128164291381836, + "learning_rate": 
6.036880402616359e-06, + "loss": 0.2312, + "step": 97750 + }, + { + "epoch": 88.4841628959276, + "grad_norm": 1.6216875314712524, + "learning_rate": 6.01362008676973e-06, + "loss": 0.1797, + "step": 97775 + }, + { + "epoch": 88.50678733031674, + "grad_norm": 1.7495800256729126, + "learning_rate": 5.990402797175537e-06, + "loss": 0.1974, + "step": 97800 + }, + { + "epoch": 88.52941176470588, + "grad_norm": 0.7272179126739502, + "learning_rate": 5.967228548314229e-06, + "loss": 0.1775, + "step": 97825 + }, + { + "epoch": 88.55203619909503, + "grad_norm": 2.3626515865325928, + "learning_rate": 5.944097354639405e-06, + "loss": 0.2977, + "step": 97850 + }, + { + "epoch": 88.57466063348416, + "grad_norm": 4.555837631225586, + "learning_rate": 5.921009230577797e-06, + "loss": 0.2158, + "step": 97875 + }, + { + "epoch": 88.5972850678733, + "grad_norm": 1.622931957244873, + "learning_rate": 5.897964190529289e-06, + "loss": 0.183, + "step": 97900 + }, + { + "epoch": 88.61990950226244, + "grad_norm": 1.069495439529419, + "learning_rate": 5.874962248866874e-06, + "loss": 0.2131, + "step": 97925 + }, + { + "epoch": 88.64253393665159, + "grad_norm": 1.374558687210083, + "learning_rate": 5.852003419936693e-06, + "loss": 0.2064, + "step": 97950 + }, + { + "epoch": 88.66515837104072, + "grad_norm": 0.7070494294166565, + "learning_rate": 5.8290877180579755e-06, + "loss": 0.2297, + "step": 97975 + }, + { + "epoch": 88.68778280542986, + "grad_norm": 2.116999387741089, + "learning_rate": 5.806215157523073e-06, + "loss": 0.2706, + "step": 98000 + }, + { + "epoch": 88.710407239819, + "grad_norm": 2.178647994995117, + "learning_rate": 5.783385752597397e-06, + "loss": 0.205, + "step": 98025 + }, + { + "epoch": 88.73303167420815, + "grad_norm": 0.8619604110717773, + "learning_rate": 5.760599517519493e-06, + "loss": 0.215, + "step": 98050 + }, + { + "epoch": 88.75565610859728, + "grad_norm": 1.0493172407150269, + "learning_rate": 5.7378564665009175e-06, + "loss": 0.3104, + "step": 98075 + }, 
+ { + "epoch": 88.77828054298642, + "grad_norm": 1.0355833768844604, + "learning_rate": 5.7151566137263655e-06, + "loss": 0.1816, + "step": 98100 + }, + { + "epoch": 88.80090497737557, + "grad_norm": 1.2814608812332153, + "learning_rate": 5.692499973353529e-06, + "loss": 0.2369, + "step": 98125 + }, + { + "epoch": 88.82352941176471, + "grad_norm": 1.3461711406707764, + "learning_rate": 5.669886559513187e-06, + "loss": 0.1955, + "step": 98150 + }, + { + "epoch": 88.84615384615384, + "grad_norm": 4.495723247528076, + "learning_rate": 5.647316386309126e-06, + "loss": 0.2299, + "step": 98175 + }, + { + "epoch": 88.86877828054298, + "grad_norm": 5.8133721351623535, + "learning_rate": 5.6247894678182e-06, + "loss": 0.2424, + "step": 98200 + }, + { + "epoch": 88.89140271493213, + "grad_norm": 1.4363712072372437, + "learning_rate": 5.602305818090272e-06, + "loss": 0.1946, + "step": 98225 + }, + { + "epoch": 88.91402714932127, + "grad_norm": 1.0300897359848022, + "learning_rate": 5.579865451148191e-06, + "loss": 0.2983, + "step": 98250 + }, + { + "epoch": 88.9366515837104, + "grad_norm": 0.7913947105407715, + "learning_rate": 5.557468380987862e-06, + "loss": 0.2199, + "step": 98275 + }, + { + "epoch": 88.95927601809954, + "grad_norm": 0.6860991716384888, + "learning_rate": 5.535114621578126e-06, + "loss": 0.2499, + "step": 98300 + }, + { + "epoch": 88.98190045248869, + "grad_norm": 1.259525179862976, + "learning_rate": 5.512804186860883e-06, + "loss": 0.1944, + "step": 98325 + }, + { + "epoch": 89.00452488687783, + "grad_norm": 1.7142527103424072, + "learning_rate": 5.490537090750935e-06, + "loss": 0.2269, + "step": 98350 + }, + { + "epoch": 89.02714932126698, + "grad_norm": 0.5371774435043335, + "learning_rate": 5.468313347136113e-06, + "loss": 0.1918, + "step": 98375 + }, + { + "epoch": 89.0497737556561, + "grad_norm": 0.699518084526062, + "learning_rate": 5.446132969877181e-06, + "loss": 0.2079, + "step": 98400 + }, + { + "epoch": 89.07239819004525, + "grad_norm": 
1.345686912536621, + "learning_rate": 5.423995972807866e-06, + "loss": 0.2125, + "step": 98425 + }, + { + "epoch": 89.09502262443439, + "grad_norm": 1.0948538780212402, + "learning_rate": 5.4019023697348285e-06, + "loss": 0.2294, + "step": 98450 + }, + { + "epoch": 89.11764705882354, + "grad_norm": 1.389125108718872, + "learning_rate": 5.379852174437682e-06, + "loss": 0.2028, + "step": 98475 + }, + { + "epoch": 89.14027149321267, + "grad_norm": 1.8768160343170166, + "learning_rate": 5.357845400668942e-06, + "loss": 0.2449, + "step": 98500 + }, + { + "epoch": 89.16289592760181, + "grad_norm": 1.5371594429016113, + "learning_rate": 5.335882062154079e-06, + "loss": 0.2419, + "step": 98525 + }, + { + "epoch": 89.18552036199095, + "grad_norm": 1.790648102760315, + "learning_rate": 5.313962172591427e-06, + "loss": 0.2064, + "step": 98550 + }, + { + "epoch": 89.2081447963801, + "grad_norm": 1.6184557676315308, + "learning_rate": 5.292085745652266e-06, + "loss": 0.1847, + "step": 98575 + }, + { + "epoch": 89.23076923076923, + "grad_norm": 0.9454947710037231, + "learning_rate": 5.2702527949807335e-06, + "loss": 0.2475, + "step": 98600 + }, + { + "epoch": 89.25339366515837, + "grad_norm": 1.2674115896224976, + "learning_rate": 5.248463334193878e-06, + "loss": 0.196, + "step": 98625 + }, + { + "epoch": 89.27601809954751, + "grad_norm": 1.6565049886703491, + "learning_rate": 5.226717376881595e-06, + "loss": 0.1967, + "step": 98650 + }, + { + "epoch": 89.29864253393666, + "grad_norm": 4.183839321136475, + "learning_rate": 5.205014936606686e-06, + "loss": 0.2212, + "step": 98675 + }, + { + "epoch": 89.32126696832579, + "grad_norm": 1.141318917274475, + "learning_rate": 5.183356026904764e-06, + "loss": 0.1983, + "step": 98700 + }, + { + "epoch": 89.34389140271493, + "grad_norm": 0.9870153665542603, + "learning_rate": 5.16174066128435e-06, + "loss": 0.2428, + "step": 98725 + }, + { + "epoch": 89.36651583710407, + "grad_norm": 0.5465179681777954, + "learning_rate": 
5.140168853226734e-06, + "loss": 0.1734, + "step": 98750 + }, + { + "epoch": 89.38914027149322, + "grad_norm": 1.515248417854309, + "learning_rate": 5.118640616186121e-06, + "loss": 0.1716, + "step": 98775 + }, + { + "epoch": 89.41176470588235, + "grad_norm": NaN, + "learning_rate": 5.09801451270383e-06, + "loss": 0.2216, + "step": 98800 + }, + { + "epoch": 89.43438914027149, + "grad_norm": 1.1756689548492432, + "learning_rate": 5.076571713780264e-06, + "loss": 0.2696, + "step": 98825 + }, + { + "epoch": 89.45701357466064, + "grad_norm": 1.3918015956878662, + "learning_rate": 5.055172525538723e-06, + "loss": 0.2155, + "step": 98850 + }, + { + "epoch": 89.47963800904978, + "grad_norm": 0.7016647458076477, + "learning_rate": 5.033816961325671e-06, + "loss": 0.1772, + "step": 98875 + }, + { + "epoch": 89.50226244343891, + "grad_norm": 0.8793459534645081, + "learning_rate": 5.0125050344604455e-06, + "loss": 0.202, + "step": 98900 + }, + { + "epoch": 89.52488687782805, + "grad_norm": 1.3844480514526367, + "learning_rate": 4.99123675823509e-06, + "loss": 0.1997, + "step": 98925 + }, + { + "epoch": 89.5475113122172, + "grad_norm": 0.7166454195976257, + "learning_rate": 4.970012145914484e-06, + "loss": 0.211, + "step": 98950 + }, + { + "epoch": 89.57013574660634, + "grad_norm": 1.519817590713501, + "learning_rate": 4.9488312107362235e-06, + "loss": 0.286, + "step": 98975 + }, + { + "epoch": 89.59276018099547, + "grad_norm": 1.4116744995117188, + "learning_rate": 4.927693965910706e-06, + "loss": 0.216, + "step": 99000 + }, + { + "epoch": 89.61538461538461, + "grad_norm": 1.1981618404388428, + "learning_rate": 4.906600424621054e-06, + "loss": 0.2009, + "step": 99025 + }, + { + "epoch": 89.63800904977376, + "grad_norm": 0.9614788293838501, + "learning_rate": 4.885550600023153e-06, + "loss": 0.2241, + "step": 99050 + }, + { + "epoch": 89.6606334841629, + "grad_norm": 0.9208125472068787, + "learning_rate": 4.8645445052455825e-06, + "loss": 0.2626, + "step": 99075 + }, + { + 
"epoch": 89.68325791855203, + "grad_norm": 0.6827859878540039, + "learning_rate": 4.843582153389705e-06, + "loss": 0.2556, + "step": 99100 + }, + { + "epoch": 89.70588235294117, + "grad_norm": 1.1373286247253418, + "learning_rate": 4.822663557529555e-06, + "loss": 0.2558, + "step": 99125 + }, + { + "epoch": 89.72850678733032, + "grad_norm": 1.5220346450805664, + "learning_rate": 4.801788730711903e-06, + "loss": 0.1929, + "step": 99150 + }, + { + "epoch": 89.75113122171946, + "grad_norm": 0.7927146553993225, + "learning_rate": 4.780957685956194e-06, + "loss": 0.2164, + "step": 99175 + }, + { + "epoch": 89.77375565610859, + "grad_norm": 1.3103749752044678, + "learning_rate": 4.760170436254601e-06, + "loss": 0.2094, + "step": 99200 + }, + { + "epoch": 89.79638009049773, + "grad_norm": 4.157220363616943, + "learning_rate": 4.739426994571954e-06, + "loss": 0.2671, + "step": 99225 + }, + { + "epoch": 89.81900452488688, + "grad_norm": 1.3247969150543213, + "learning_rate": 4.718727373845787e-06, + "loss": 0.2106, + "step": 99250 + }, + { + "epoch": 89.84162895927602, + "grad_norm": 4.084348201751709, + "learning_rate": 4.698071586986266e-06, + "loss": 0.2294, + "step": 99275 + }, + { + "epoch": 89.86425339366515, + "grad_norm": 1.5951694250106812, + "learning_rate": 4.677459646876267e-06, + "loss": 0.2371, + "step": 99300 + }, + { + "epoch": 89.8868778280543, + "grad_norm": 1.3263107538223267, + "learning_rate": 4.656891566371257e-06, + "loss": 0.2168, + "step": 99325 + }, + { + "epoch": 89.90950226244344, + "grad_norm": 2.5932395458221436, + "learning_rate": 4.636367358299417e-06, + "loss": 0.1729, + "step": 99350 + }, + { + "epoch": 89.93212669683258, + "grad_norm": 1.5805531740188599, + "learning_rate": 4.615887035461499e-06, + "loss": 0.2018, + "step": 99375 + }, + { + "epoch": 89.95475113122171, + "grad_norm": 2.333526849746704, + "learning_rate": 4.595450610630952e-06, + "loss": 0.2279, + "step": 99400 + }, + { + "epoch": 89.97737556561086, + "grad_norm": 
1.2010316848754883, + "learning_rate": 4.575058096553772e-06, + "loss": 0.1759, + "step": 99425 + }, + { + "epoch": 90.0, + "grad_norm": 1.7871593236923218, + "learning_rate": 4.5547095059486335e-06, + "loss": 0.2108, + "step": 99450 + }, + { + "epoch": 90.02262443438914, + "grad_norm": 1.0759422779083252, + "learning_rate": 4.5344048515067875e-06, + "loss": 0.1616, + "step": 99475 + }, + { + "epoch": 90.04524886877829, + "grad_norm": 0.7481698393821716, + "learning_rate": 4.5141441458920765e-06, + "loss": 0.1975, + "step": 99500 + }, + { + "epoch": 90.06787330316742, + "grad_norm": 1.1689400672912598, + "learning_rate": 4.493927401740943e-06, + "loss": 0.2479, + "step": 99525 + }, + { + "epoch": 90.09049773755656, + "grad_norm": 1.1692843437194824, + "learning_rate": 4.47375463166241e-06, + "loss": 0.1792, + "step": 99550 + }, + { + "epoch": 90.1131221719457, + "grad_norm": 4.002111911773682, + "learning_rate": 4.453625848238071e-06, + "loss": 0.2316, + "step": 99575 + }, + { + "epoch": 90.13574660633485, + "grad_norm": 0.9596063494682312, + "learning_rate": 4.433541064022084e-06, + "loss": 0.2341, + "step": 99600 + }, + { + "epoch": 90.15837104072398, + "grad_norm": 1.192742943763733, + "learning_rate": 4.413500291541169e-06, + "loss": 0.2055, + "step": 99625 + }, + { + "epoch": 90.18099547511312, + "grad_norm": 4.47230339050293, + "learning_rate": 4.3935035432945966e-06, + "loss": 0.2587, + "step": 99650 + }, + { + "epoch": 90.20361990950227, + "grad_norm": 0.7736957669258118, + "learning_rate": 4.373550831754189e-06, + "loss": 0.1501, + "step": 99675 + }, + { + "epoch": 90.22624434389141, + "grad_norm": 1.5000149011611938, + "learning_rate": 4.353642169364266e-06, + "loss": 0.2109, + "step": 99700 + }, + { + "epoch": 90.24886877828054, + "grad_norm": 4.5788092613220215, + "learning_rate": 4.333777568541738e-06, + "loss": 0.2648, + "step": 99725 + }, + { + "epoch": 90.27149321266968, + "grad_norm": 1.0441960096359253, + "learning_rate": 4.313957041675953e-06, + 
"loss": 0.2013, + "step": 99750 + }, + { + "epoch": 90.29411764705883, + "grad_norm": 1.4179902076721191, + "learning_rate": 4.294180601128855e-06, + "loss": 0.1717, + "step": 99775 + }, + { + "epoch": 90.31674208144797, + "grad_norm": 0.7733138799667358, + "learning_rate": 4.274448259234828e-06, + "loss": 0.1884, + "step": 99800 + }, + { + "epoch": 90.3393665158371, + "grad_norm": 1.196804165840149, + "learning_rate": 4.254760028300794e-06, + "loss": 0.1869, + "step": 99825 + }, + { + "epoch": 90.36199095022624, + "grad_norm": 1.052259087562561, + "learning_rate": 4.2351159206061135e-06, + "loss": 0.1821, + "step": 99850 + }, + { + "epoch": 90.38461538461539, + "grad_norm": 4.576720714569092, + "learning_rate": 4.215515948402695e-06, + "loss": 0.2633, + "step": 99875 + }, + { + "epoch": 90.40723981900453, + "grad_norm": 0.6688747406005859, + "learning_rate": 4.1959601239148596e-06, + "loss": 0.1988, + "step": 99900 + }, + { + "epoch": 90.42986425339366, + "grad_norm": 0.7618852257728577, + "learning_rate": 4.176448459339435e-06, + "loss": 0.1855, + "step": 99925 + }, + { + "epoch": 90.4524886877828, + "grad_norm": 0.901997447013855, + "learning_rate": 4.156980966845669e-06, + "loss": 0.1848, + "step": 99950 + }, + { + "epoch": 90.47511312217195, + "grad_norm": 1.5723624229431152, + "learning_rate": 4.137557658575299e-06, + "loss": 0.1667, + "step": 99975 + }, + { + "epoch": 90.49773755656109, + "grad_norm": 1.2151317596435547, + "learning_rate": 4.118178546642478e-06, + "loss": 0.1989, + "step": 100000 + }, + { + "epoch": 90.52036199095022, + "grad_norm": 0.9620048999786377, + "learning_rate": 4.09884364313382e-06, + "loss": 0.2964, + "step": 100025 + }, + { + "epoch": 90.54298642533936, + "grad_norm": 1.3927428722381592, + "learning_rate": 4.079552960108321e-06, + "loss": 0.2111, + "step": 100050 + }, + { + "epoch": 90.56561085972851, + "grad_norm": 0.8601452112197876, + "learning_rate": 4.060306509597447e-06, + "loss": 0.2963, + "step": 100075 + }, + { + 
"epoch": 90.58823529411765, + "grad_norm": 1.4434726238250732, + "learning_rate": 4.041104303605047e-06, + "loss": 0.1968, + "step": 100100 + }, + { + "epoch": 90.61085972850678, + "grad_norm": 1.012083649635315, + "learning_rate": 4.021946354107383e-06, + "loss": 0.2221, + "step": 100125 + }, + { + "epoch": 90.63348416289593, + "grad_norm": 0.8624517917633057, + "learning_rate": 4.0028326730531135e-06, + "loss": 0.2395, + "step": 100150 + }, + { + "epoch": 90.65610859728507, + "grad_norm": 0.7817290425300598, + "learning_rate": 3.983763272363302e-06, + "loss": 0.2174, + "step": 100175 + }, + { + "epoch": 90.67873303167421, + "grad_norm": 1.2338742017745972, + "learning_rate": 3.9647381639313525e-06, + "loss": 0.2204, + "step": 100200 + }, + { + "epoch": 90.70135746606334, + "grad_norm": 1.4460539817810059, + "learning_rate": 3.945757359623106e-06, + "loss": 0.2336, + "step": 100225 + }, + { + "epoch": 90.72398190045249, + "grad_norm": 1.0861200094223022, + "learning_rate": 3.9268208712767005e-06, + "loss": 0.2327, + "step": 100250 + }, + { + "epoch": 90.74660633484163, + "grad_norm": 0.6357161402702332, + "learning_rate": 3.907928710702715e-06, + "loss": 0.22, + "step": 100275 + }, + { + "epoch": 90.76923076923077, + "grad_norm": 1.9994890689849854, + "learning_rate": 3.88908088968399e-06, + "loss": 0.2103, + "step": 100300 + }, + { + "epoch": 90.7918552036199, + "grad_norm": 1.4905165433883667, + "learning_rate": 3.8702774199758145e-06, + "loss": 0.245, + "step": 100325 + }, + { + "epoch": 90.81447963800905, + "grad_norm": 1.434935450553894, + "learning_rate": 3.8515183133057155e-06, + "loss": 0.2043, + "step": 100350 + }, + { + "epoch": 90.83710407239819, + "grad_norm": 1.1512585878372192, + "learning_rate": 3.832803581373633e-06, + "loss": 0.2167, + "step": 100375 + }, + { + "epoch": 90.85972850678733, + "grad_norm": 0.6571705341339111, + "learning_rate": 3.8141332358517657e-06, + "loss": 0.1661, + "step": 100400 + }, + { + "epoch": 90.88235294117646, + 
"grad_norm": 1.545005202293396, + "learning_rate": 3.7955072883846805e-06, + "loss": 0.2381, + "step": 100425 + }, + { + "epoch": 90.90497737556561, + "grad_norm": 1.2027240991592407, + "learning_rate": 3.776925750589219e-06, + "loss": 0.1711, + "step": 100450 + }, + { + "epoch": 90.92760180995475, + "grad_norm": 1.356946587562561, + "learning_rate": 3.7583886340545514e-06, + "loss": 0.2117, + "step": 100475 + }, + { + "epoch": 90.9502262443439, + "grad_norm": 1.0545406341552734, + "learning_rate": 3.739895950342106e-06, + "loss": 0.2191, + "step": 100500 + }, + { + "epoch": 90.97285067873302, + "grad_norm": 1.6496835947036743, + "learning_rate": 3.7214477109856394e-06, + "loss": 0.2234, + "step": 100525 + }, + { + "epoch": 90.99547511312217, + "grad_norm": 0.896088182926178, + "learning_rate": 3.7030439274911645e-06, + "loss": 0.1973, + "step": 100550 + }, + { + "epoch": 91.01809954751131, + "grad_norm": 0.8725895881652832, + "learning_rate": 3.6846846113369745e-06, + "loss": 0.2494, + "step": 100575 + }, + { + "epoch": 91.04072398190046, + "grad_norm": 1.522818922996521, + "learning_rate": 3.6663697739736264e-06, + "loss": 0.1876, + "step": 100600 + }, + { + "epoch": 91.0633484162896, + "grad_norm": 1.0002983808517456, + "learning_rate": 3.64809942682395e-06, + "loss": 0.1711, + "step": 100625 + }, + { + "epoch": 91.08597285067873, + "grad_norm": 0.6117151379585266, + "learning_rate": 3.629873581282988e-06, + "loss": 0.2083, + "step": 100650 + }, + { + "epoch": 91.10859728506787, + "grad_norm": 1.174191951751709, + "learning_rate": 3.6116922487180814e-06, + "loss": 0.2017, + "step": 100675 + }, + { + "epoch": 91.13122171945702, + "grad_norm": 1.1282929182052612, + "learning_rate": 3.5935554404687594e-06, + "loss": 0.2245, + "step": 100700 + }, + { + "epoch": 91.15384615384616, + "grad_norm": 1.034684419631958, + "learning_rate": 3.5754631678468316e-06, + "loss": 0.1539, + "step": 100725 + }, + { + "epoch": 91.17647058823529, + "grad_norm": 0.6202672123908997, + 
"learning_rate": 3.5574154421362714e-06, + "loss": 0.2047, + "step": 100750 + }, + { + "epoch": 91.19909502262443, + "grad_norm": 1.0714919567108154, + "learning_rate": 3.5394122745933256e-06, + "loss": 0.2335, + "step": 100775 + }, + { + "epoch": 91.22171945701358, + "grad_norm": 1.7276630401611328, + "learning_rate": 3.5214536764464035e-06, + "loss": 0.2676, + "step": 100800 + }, + { + "epoch": 91.24434389140272, + "grad_norm": 1.0312620401382446, + "learning_rate": 3.503539658896162e-06, + "loss": 0.2036, + "step": 100825 + }, + { + "epoch": 91.26696832579185, + "grad_norm": 1.6993221044540405, + "learning_rate": 3.4856702331154144e-06, + "loss": 0.2018, + "step": 100850 + }, + { + "epoch": 91.289592760181, + "grad_norm": 0.8954603672027588, + "learning_rate": 3.467845410249187e-06, + "loss": 0.2035, + "step": 100875 + }, + { + "epoch": 91.31221719457014, + "grad_norm": 1.3395525217056274, + "learning_rate": 3.4507755530394504e-06, + "loss": 0.2032, + "step": 100900 + }, + { + "epoch": 91.33484162895928, + "grad_norm": 0.5515787601470947, + "learning_rate": 3.4330381841086163e-06, + "loss": 0.1682, + "step": 100925 + }, + { + "epoch": 91.35746606334841, + "grad_norm": 0.8045617341995239, + "learning_rate": 3.415345450918494e-06, + "loss": 0.177, + "step": 100950 + }, + { + "epoch": 91.38009049773756, + "grad_norm": 0.974839448928833, + "learning_rate": 3.3976973645038735e-06, + "loss": 0.2034, + "step": 100975 + }, + { + "epoch": 91.4027149321267, + "grad_norm": 1.4894038438796997, + "learning_rate": 3.3800939358717584e-06, + "loss": 0.1659, + "step": 101000 + }, + { + "epoch": 91.42533936651584, + "grad_norm": 0.8471037745475769, + "learning_rate": 3.362535176001249e-06, + "loss": 0.1936, + "step": 101025 + }, + { + "epoch": 91.44796380090497, + "grad_norm": 0.7326423525810242, + "learning_rate": 3.345021095843603e-06, + "loss": 0.1763, + "step": 101050 + }, + { + "epoch": 91.47058823529412, + "grad_norm": 1.0680112838745117, + "learning_rate": 
3.3275517063222067e-06, + "loss": 0.2155, + "step": 101075 + }, + { + "epoch": 91.49321266968326, + "grad_norm": 1.4577637910842896, + "learning_rate": 3.310127018332595e-06, + "loss": 0.2404, + "step": 101100 + }, + { + "epoch": 91.5158371040724, + "grad_norm": 1.1680512428283691, + "learning_rate": 3.2927470427423914e-06, + "loss": 0.2226, + "step": 101125 + }, + { + "epoch": 91.53846153846153, + "grad_norm": 0.7334850430488586, + "learning_rate": 3.2754117903913498e-06, + "loss": 0.2151, + "step": 101150 + }, + { + "epoch": 91.56108597285068, + "grad_norm": 1.0506032705307007, + "learning_rate": 3.2581212720913464e-06, + "loss": 0.2141, + "step": 101175 + }, + { + "epoch": 91.58371040723982, + "grad_norm": 0.7149414420127869, + "learning_rate": 3.240875498626305e-06, + "loss": 0.2182, + "step": 101200 + }, + { + "epoch": 91.60633484162896, + "grad_norm": 0.6269099116325378, + "learning_rate": 3.2236744807523058e-06, + "loss": 0.2246, + "step": 101225 + }, + { + "epoch": 91.6289592760181, + "grad_norm": 0.9679759740829468, + "learning_rate": 3.2065182291974744e-06, + "loss": 0.1789, + "step": 101250 + }, + { + "epoch": 91.65158371040724, + "grad_norm": 0.9241008758544922, + "learning_rate": 3.189406754662027e-06, + "loss": 0.2289, + "step": 101275 + }, + { + "epoch": 91.67420814479638, + "grad_norm": 1.6055413484573364, + "learning_rate": 3.172340067818252e-06, + "loss": 0.1855, + "step": 101300 + }, + { + "epoch": 91.69683257918552, + "grad_norm": 0.8490349650382996, + "learning_rate": 3.1553181793105092e-06, + "loss": 0.2092, + "step": 101325 + }, + { + "epoch": 91.71945701357465, + "grad_norm": 1.3778477907180786, + "learning_rate": 3.1383410997552067e-06, + "loss": 0.1682, + "step": 101350 + }, + { + "epoch": 91.7420814479638, + "grad_norm": 0.8499693870544434, + "learning_rate": 3.1214088397408327e-06, + "loss": 0.2396, + "step": 101375 + }, + { + "epoch": 91.76470588235294, + "grad_norm": 1.6106404066085815, + "learning_rate": 3.104521409827873e-06, + 
"loss": 0.2095, + "step": 101400 + }, + { + "epoch": 91.78733031674209, + "grad_norm": 1.2034767866134644, + "learning_rate": 3.087678820548911e-06, + "loss": 0.1982, + "step": 101425 + }, + { + "epoch": 91.80995475113122, + "grad_norm": 0.6358681321144104, + "learning_rate": 3.0708810824085107e-06, + "loss": 0.26, + "step": 101450 + }, + { + "epoch": 91.83257918552036, + "grad_norm": 1.65291428565979, + "learning_rate": 3.054128205883308e-06, + "loss": 0.2222, + "step": 101475 + }, + { + "epoch": 91.8552036199095, + "grad_norm": 1.0159928798675537, + "learning_rate": 3.037420201421911e-06, + "loss": 0.2816, + "step": 101500 + }, + { + "epoch": 91.87782805429865, + "grad_norm": 1.630953073501587, + "learning_rate": 3.020757079445002e-06, + "loss": 0.1828, + "step": 101525 + }, + { + "epoch": 91.90045248868778, + "grad_norm": 0.5932508707046509, + "learning_rate": 3.0041388503452e-06, + "loss": 0.2103, + "step": 101550 + }, + { + "epoch": 91.92307692307692, + "grad_norm": 0.9708763957023621, + "learning_rate": 2.9875655244871984e-06, + "loss": 0.2741, + "step": 101575 + }, + { + "epoch": 91.94570135746606, + "grad_norm": 1.2914602756500244, + "learning_rate": 2.971037112207619e-06, + "loss": 0.2661, + "step": 101600 + }, + { + "epoch": 91.96832579185521, + "grad_norm": 1.1061433553695679, + "learning_rate": 2.9545536238151172e-06, + "loss": 0.1836, + "step": 101625 + }, + { + "epoch": 91.99095022624434, + "grad_norm": 1.001975178718567, + "learning_rate": 2.9381150695902937e-06, + "loss": 0.2847, + "step": 101650 + }, + { + "epoch": 92.01357466063348, + "grad_norm": 1.711601972579956, + "learning_rate": 2.9217214597857725e-06, + "loss": 0.1761, + "step": 101675 + }, + { + "epoch": 92.03619909502262, + "grad_norm": 0.42898303270339966, + "learning_rate": 2.9053728046260825e-06, + "loss": 0.183, + "step": 101700 + }, + { + "epoch": 92.05882352941177, + "grad_norm": 1.3710592985153198, + "learning_rate": 2.889069114307785e-06, + "loss": 0.2273, + "step": 101725 + }, + 
{ + "epoch": 92.08144796380091, + "grad_norm": 0.9163620471954346, + "learning_rate": 2.8728103989993283e-06, + "loss": 0.2118, + "step": 101750 + }, + { + "epoch": 92.10407239819004, + "grad_norm": 1.4323467016220093, + "learning_rate": 2.85659666884116e-06, + "loss": 0.1969, + "step": 101775 + }, + { + "epoch": 92.12669683257919, + "grad_norm": 0.9811010956764221, + "learning_rate": 2.840427933945649e-06, + "loss": 0.1962, + "step": 101800 + }, + { + "epoch": 92.14932126696833, + "grad_norm": 3.9413015842437744, + "learning_rate": 2.8243042043971126e-06, + "loss": 0.2551, + "step": 101825 + }, + { + "epoch": 92.17194570135747, + "grad_norm": 1.1939562559127808, + "learning_rate": 2.808225490251781e-06, + "loss": 0.254, + "step": 101850 + }, + { + "epoch": 92.1945701357466, + "grad_norm": 0.7034265398979187, + "learning_rate": 2.7921918015378324e-06, + "loss": 0.1436, + "step": 101875 + }, + { + "epoch": 92.21719457013575, + "grad_norm": 1.3420288562774658, + "learning_rate": 2.776203148255335e-06, + "loss": 0.1856, + "step": 101900 + }, + { + "epoch": 92.23981900452489, + "grad_norm": 1.0718244314193726, + "learning_rate": 2.7602595403762946e-06, + "loss": 0.2213, + "step": 101925 + }, + { + "epoch": 92.26244343891403, + "grad_norm": 0.8896125555038452, + "learning_rate": 2.744360987844599e-06, + "loss": 0.1921, + "step": 101950 + }, + { + "epoch": 92.28506787330316, + "grad_norm": 1.383668303489685, + "learning_rate": 2.728507500576074e-06, + "loss": 0.2313, + "step": 101975 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 0.961441695690155, + "learning_rate": 2.712699088458378e-06, + "loss": 0.1863, + "step": 102000 + }, + { + "epoch": 92.33031674208145, + "grad_norm": 1.8158527612686157, + "learning_rate": 2.696935761351124e-06, + "loss": 0.1983, + "step": 102025 + }, + { + "epoch": 92.3529411764706, + "grad_norm": 0.8649600148200989, + "learning_rate": 2.6812175290857466e-06, + "loss": 0.2421, + "step": 102050 + }, + { + "epoch": 92.37556561085972, + 
"grad_norm": 1.4010592699050903, + "learning_rate": 2.665544401465597e-06, + "loss": 0.2068, + "step": 102075 + }, + { + "epoch": 92.39819004524887, + "grad_norm": 0.8777248859405518, + "learning_rate": 2.6499163882658713e-06, + "loss": 0.2213, + "step": 102100 + }, + { + "epoch": 92.42081447963801, + "grad_norm": 1.2792974710464478, + "learning_rate": 2.6343334992336485e-06, + "loss": 0.2488, + "step": 102125 + }, + { + "epoch": 92.44343891402715, + "grad_norm": 0.7411015033721924, + "learning_rate": 2.618795744087829e-06, + "loss": 0.1788, + "step": 102150 + }, + { + "epoch": 92.46606334841628, + "grad_norm": 0.7130656838417053, + "learning_rate": 2.603303132519219e-06, + "loss": 0.2025, + "step": 102175 + }, + { + "epoch": 92.48868778280543, + "grad_norm": 1.1229000091552734, + "learning_rate": 2.587855674190398e-06, + "loss": 0.2559, + "step": 102200 + }, + { + "epoch": 92.51131221719457, + "grad_norm": 1.2911707162857056, + "learning_rate": 2.572453378735842e-06, + "loss": 0.2007, + "step": 102225 + }, + { + "epoch": 92.53393665158372, + "grad_norm": 0.9338110685348511, + "learning_rate": 2.5570962557618508e-06, + "loss": 0.206, + "step": 102250 + }, + { + "epoch": 92.55656108597285, + "grad_norm": 1.5435631275177002, + "learning_rate": 2.541784314846512e-06, + "loss": 0.1792, + "step": 102275 + }, + { + "epoch": 92.57918552036199, + "grad_norm": 1.8651821613311768, + "learning_rate": 2.526517565539796e-06, + "loss": 0.2104, + "step": 102300 + }, + { + "epoch": 92.60180995475113, + "grad_norm": 1.2022444009780884, + "learning_rate": 2.5112960173634096e-06, + "loss": 0.2904, + "step": 102325 + }, + { + "epoch": 92.62443438914028, + "grad_norm": 0.6954711079597473, + "learning_rate": 2.496119679810943e-06, + "loss": 0.2138, + "step": 102350 + }, + { + "epoch": 92.6470588235294, + "grad_norm": 0.6750567555427551, + "learning_rate": 2.480988562347741e-06, + "loss": 0.2131, + "step": 102375 + }, + { + "epoch": 92.66968325791855, + "grad_norm": 0.5733470320701599, + 
"learning_rate": 2.4659026744109716e-06, + "loss": 0.1843, + "step": 102400 + }, + { + "epoch": 92.6923076923077, + "grad_norm": 0.8600246906280518, + "learning_rate": 2.4508620254095666e-06, + "loss": 0.2016, + "step": 102425 + }, + { + "epoch": 92.71493212669684, + "grad_norm": 1.2240346670150757, + "learning_rate": 2.4358666247242724e-06, + "loss": 0.2224, + "step": 102450 + }, + { + "epoch": 92.73755656108597, + "grad_norm": 1.3388067483901978, + "learning_rate": 2.420916481707591e-06, + "loss": 0.2066, + "step": 102475 + }, + { + "epoch": 92.76018099547511, + "grad_norm": 0.9193198680877686, + "learning_rate": 2.4060116056838135e-06, + "loss": 0.2052, + "step": 102500 + }, + { + "epoch": 92.78280542986425, + "grad_norm": 1.349044919013977, + "learning_rate": 2.3911520059489792e-06, + "loss": 0.2023, + "step": 102525 + }, + { + "epoch": 92.8054298642534, + "grad_norm": 1.1145248413085938, + "learning_rate": 2.376337691770924e-06, + "loss": 0.2149, + "step": 102550 + }, + { + "epoch": 92.82805429864253, + "grad_norm": 0.8239882588386536, + "learning_rate": 2.3615686723891996e-06, + "loss": 0.1715, + "step": 102575 + }, + { + "epoch": 92.85067873303167, + "grad_norm": 1.3007310628890991, + "learning_rate": 2.346844957015129e-06, + "loss": 0.1835, + "step": 102600 + }, + { + "epoch": 92.87330316742081, + "grad_norm": 0.9309889674186707, + "learning_rate": 2.332166554831774e-06, + "loss": 0.196, + "step": 102625 + }, + { + "epoch": 92.89592760180996, + "grad_norm": 0.5905202627182007, + "learning_rate": 2.317533474993938e-06, + "loss": 0.1931, + "step": 102650 + }, + { + "epoch": 92.91855203619909, + "grad_norm": 0.9990751147270203, + "learning_rate": 2.3029457266281525e-06, + "loss": 0.2116, + "step": 102675 + }, + { + "epoch": 92.94117647058823, + "grad_norm": 0.7694403529167175, + "learning_rate": 2.288403318832699e-06, + "loss": 0.2259, + "step": 102700 + }, + { + "epoch": 92.96380090497738, + "grad_norm": 0.9315254092216492, + "learning_rate": 
2.2739062606775215e-06, + "loss": 0.2116, + "step": 102725 + }, + { + "epoch": 92.98642533936652, + "grad_norm": 1.2433048486709595, + "learning_rate": 2.259454561204363e-06, + "loss": 0.2442, + "step": 102750 + }, + { + "epoch": 93.00904977375566, + "grad_norm": 4.522818088531494, + "learning_rate": 2.2450482294265883e-06, + "loss": 0.1914, + "step": 102775 + }, + { + "epoch": 93.03167420814479, + "grad_norm": 1.2282065153121948, + "learning_rate": 2.2306872743293513e-06, + "loss": 0.2266, + "step": 102800 + }, + { + "epoch": 93.05429864253394, + "grad_norm": 0.9609178900718689, + "learning_rate": 2.2163717048694377e-06, + "loss": 0.1945, + "step": 102825 + }, + { + "epoch": 93.07692307692308, + "grad_norm": 0.8740372061729431, + "learning_rate": 2.202101529975381e-06, + "loss": 0.2537, + "step": 102850 + }, + { + "epoch": 93.09954751131222, + "grad_norm": 0.8128977417945862, + "learning_rate": 2.1878767585473358e-06, + "loss": 0.1931, + "step": 102875 + }, + { + "epoch": 93.12217194570135, + "grad_norm": 1.2922780513763428, + "learning_rate": 2.173697399457222e-06, + "loss": 0.1717, + "step": 102900 + }, + { + "epoch": 93.1447963800905, + "grad_norm": 0.6901913285255432, + "learning_rate": 2.1595634615485495e-06, + "loss": 0.1513, + "step": 102925 + }, + { + "epoch": 93.16742081447964, + "grad_norm": 0.951432466506958, + "learning_rate": 2.145474953636575e-06, + "loss": 0.2563, + "step": 102950 + }, + { + "epoch": 93.19004524886878, + "grad_norm": 0.9803032279014587, + "learning_rate": 2.1319927347387108e-06, + "loss": 0.2106, + "step": 102975 + }, + { + "epoch": 93.21266968325791, + "grad_norm": 1.376770257949829, + "learning_rate": 2.1179932950829315e-06, + "loss": 0.189, + "step": 103000 + }, + { + "epoch": 93.23529411764706, + "grad_norm": 0.7428179979324341, + "learning_rate": 2.1040393113508356e-06, + "loss": 0.1792, + "step": 103025 + }, + { + "epoch": 93.2579185520362, + "grad_norm": 0.7498146295547485, + "learning_rate": 2.0901307922453787e-06, + "loss": 
0.1746, + "step": 103050 + }, + { + "epoch": 93.28054298642535, + "grad_norm": 1.1039135456085205, + "learning_rate": 2.0762677464412127e-06, + "loss": 0.1834, + "step": 103075 + }, + { + "epoch": 93.30316742081448, + "grad_norm": 3.8603408336639404, + "learning_rate": 2.0624501825845964e-06, + "loss": 0.2071, + "step": 103100 + }, + { + "epoch": 93.32579185520362, + "grad_norm": 0.9220117926597595, + "learning_rate": 2.048678109293453e-06, + "loss": 0.2253, + "step": 103125 + }, + { + "epoch": 93.34841628959276, + "grad_norm": 1.001235008239746, + "learning_rate": 2.0349515351572865e-06, + "loss": 0.2116, + "step": 103150 + }, + { + "epoch": 93.3710407239819, + "grad_norm": 1.4093916416168213, + "learning_rate": 2.0212704687372736e-06, + "loss": 0.1671, + "step": 103175 + }, + { + "epoch": 93.39366515837104, + "grad_norm": 1.044776439666748, + "learning_rate": 2.007634918566173e-06, + "loss": 0.2012, + "step": 103200 + }, + { + "epoch": 93.41628959276018, + "grad_norm": 1.1573245525360107, + "learning_rate": 1.9940448931483803e-06, + "loss": 0.2196, + "step": 103225 + }, + { + "epoch": 93.43891402714932, + "grad_norm": 0.8817508220672607, + "learning_rate": 1.980500400959875e-06, + "loss": 0.2047, + "step": 103250 + }, + { + "epoch": 93.46153846153847, + "grad_norm": 1.0110949277877808, + "learning_rate": 1.967001450448258e-06, + "loss": 0.1694, + "step": 103275 + }, + { + "epoch": 93.4841628959276, + "grad_norm": 1.1973719596862793, + "learning_rate": 1.953548050032694e-06, + "loss": 0.2134, + "step": 103300 + }, + { + "epoch": 93.50678733031674, + "grad_norm": 1.0478057861328125, + "learning_rate": 1.940140208103996e-06, + "loss": 0.2134, + "step": 103325 + }, + { + "epoch": 93.52941176470588, + "grad_norm": 0.9741623401641846, + "learning_rate": 1.9267779330244926e-06, + "loss": 0.2097, + "step": 103350 + }, + { + "epoch": 93.55203619909503, + "grad_norm": 4.4383392333984375, + "learning_rate": 1.913461233128158e-06, + "loss": 0.2352, + "step": 103375 + }, + { 
+ "epoch": 93.57466063348416, + "grad_norm": 0.9028875827789307, + "learning_rate": 1.900190116720482e-06, + "loss": 0.218, + "step": 103400 + }, + { + "epoch": 93.5972850678733, + "grad_norm": 0.6334171295166016, + "learning_rate": 1.8869645920785854e-06, + "loss": 0.187, + "step": 103425 + }, + { + "epoch": 93.61990950226244, + "grad_norm": 1.2982796430587769, + "learning_rate": 1.8737846674510947e-06, + "loss": 0.2124, + "step": 103450 + }, + { + "epoch": 93.64253393665159, + "grad_norm": 0.9085620641708374, + "learning_rate": 1.8606503510582348e-06, + "loss": 0.1869, + "step": 103475 + }, + { + "epoch": 93.66515837104072, + "grad_norm": 1.6037293672561646, + "learning_rate": 1.8475616510917695e-06, + "loss": 0.1942, + "step": 103500 + }, + { + "epoch": 93.68778280542986, + "grad_norm": 1.0532147884368896, + "learning_rate": 1.8345185757150355e-06, + "loss": 0.2109, + "step": 103525 + }, + { + "epoch": 93.710407239819, + "grad_norm": 1.4603524208068848, + "learning_rate": 1.8215211330628587e-06, + "loss": 0.2755, + "step": 103550 + }, + { + "epoch": 93.73303167420815, + "grad_norm": 1.0279241800308228, + "learning_rate": 1.8085693312416716e-06, + "loss": 0.2452, + "step": 103575 + }, + { + "epoch": 93.75565610859728, + "grad_norm": 0.8479742407798767, + "learning_rate": 1.7956631783293873e-06, + "loss": 0.215, + "step": 103600 + }, + { + "epoch": 93.77828054298642, + "grad_norm": 1.001603603363037, + "learning_rate": 1.7828026823754921e-06, + "loss": 0.2412, + "step": 103625 + }, + { + "epoch": 93.80090497737557, + "grad_norm": 1.0904514789581299, + "learning_rate": 1.769987851400953e-06, + "loss": 0.178, + "step": 103650 + }, + { + "epoch": 93.82352941176471, + "grad_norm": 1.0738730430603027, + "learning_rate": 1.7572186933982936e-06, + "loss": 0.2173, + "step": 103675 + }, + { + "epoch": 93.84615384615384, + "grad_norm": 0.7459914088249207, + "learning_rate": 1.7444952163315179e-06, + "loss": 0.1983, + "step": 103700 + }, + { + "epoch": 93.86877828054298, + 
"grad_norm": 0.5410562753677368, + "learning_rate": 1.7318174281361785e-06, + "loss": 0.2264, + "step": 103725 + }, + { + "epoch": 93.89140271493213, + "grad_norm": 0.8717644214630127, + "learning_rate": 1.719185336719292e-06, + "loss": 0.2407, + "step": 103750 + }, + { + "epoch": 93.91402714932127, + "grad_norm": 0.7874726057052612, + "learning_rate": 1.7065989499594063e-06, + "loss": 0.2323, + "step": 103775 + }, + { + "epoch": 93.9366515837104, + "grad_norm": 0.8878973126411438, + "learning_rate": 1.6940582757065334e-06, + "loss": 0.1853, + "step": 103800 + }, + { + "epoch": 93.95927601809954, + "grad_norm": 1.1652942895889282, + "learning_rate": 1.6815633217822088e-06, + "loss": 0.205, + "step": 103825 + }, + { + "epoch": 93.98190045248869, + "grad_norm": 0.7771767377853394, + "learning_rate": 1.6691140959794153e-06, + "loss": 0.185, + "step": 103850 + }, + { + "epoch": 94.00452488687783, + "grad_norm": 1.6624658107757568, + "learning_rate": 1.6567106060626583e-06, + "loss": 0.226, + "step": 103875 + }, + { + "epoch": 94.02714932126698, + "grad_norm": 0.8755984902381897, + "learning_rate": 1.6443528597678835e-06, + "loss": 0.1643, + "step": 103900 + }, + { + "epoch": 94.0497737556561, + "grad_norm": 1.9576427936553955, + "learning_rate": 1.6320408648025085e-06, + "loss": 0.2071, + "step": 103925 + }, + { + "epoch": 94.07239819004525, + "grad_norm": 1.1330572366714478, + "learning_rate": 1.6197746288454494e-06, + "loss": 0.2119, + "step": 103950 + }, + { + "epoch": 94.09502262443439, + "grad_norm": 1.2345455884933472, + "learning_rate": 1.6075541595470364e-06, + "loss": 0.2549, + "step": 103975 + }, + { + "epoch": 94.11764705882354, + "grad_norm": 0.7058464884757996, + "learning_rate": 1.595379464529098e-06, + "loss": 0.1734, + "step": 104000 + }, + { + "epoch": 94.14027149321267, + "grad_norm": 0.5983518958091736, + "learning_rate": 1.5832505513848763e-06, + "loss": 0.1905, + "step": 104025 + }, + { + "epoch": 94.16289592760181, + "grad_norm": 
0.9150748252868652, + "learning_rate": 1.571167427679096e-06, + "loss": 0.1965, + "step": 104050 + }, + { + "epoch": 94.18552036199095, + "grad_norm": 1.6826242208480835, + "learning_rate": 1.5591301009478779e-06, + "loss": 0.1778, + "step": 104075 + }, + { + "epoch": 94.2081447963801, + "grad_norm": 1.0981590747833252, + "learning_rate": 1.5471385786988339e-06, + "loss": 0.2164, + "step": 104100 + }, + { + "epoch": 94.23076923076923, + "grad_norm": 1.714074730873108, + "learning_rate": 1.5351928684109644e-06, + "loss": 0.1888, + "step": 104125 + }, + { + "epoch": 94.25339366515837, + "grad_norm": 0.9475539922714233, + "learning_rate": 1.523292977534718e-06, + "loss": 0.1931, + "step": 104150 + }, + { + "epoch": 94.27601809954751, + "grad_norm": 1.521440029144287, + "learning_rate": 1.511438913491958e-06, + "loss": 0.2342, + "step": 104175 + }, + { + "epoch": 94.29864253393666, + "grad_norm": 1.6483838558197021, + "learning_rate": 1.4996306836759787e-06, + "loss": 0.2582, + "step": 104200 + }, + { + "epoch": 94.32126696832579, + "grad_norm": 1.1247897148132324, + "learning_rate": 1.4878682954514637e-06, + "loss": 0.1604, + "step": 104225 + }, + { + "epoch": 94.34389140271493, + "grad_norm": 0.9208066463470459, + "learning_rate": 1.4761517561545283e-06, + "loss": 0.1853, + "step": 104250 + }, + { + "epoch": 94.36651583710407, + "grad_norm": 3.9537439346313477, + "learning_rate": 1.4644810730926853e-06, + "loss": 0.2231, + "step": 104275 + }, + { + "epoch": 94.38914027149322, + "grad_norm": 0.6742361187934875, + "learning_rate": 1.4528562535448456e-06, + "loss": 0.1867, + "step": 104300 + }, + { + "epoch": 94.41176470588235, + "grad_norm": 1.2406797409057617, + "learning_rate": 1.441277304761318e-06, + "loss": 0.1764, + "step": 104325 + }, + { + "epoch": 94.43438914027149, + "grad_norm": 0.8202566504478455, + "learning_rate": 1.429744233963792e-06, + "loss": 0.2323, + "step": 104350 + }, + { + "epoch": 94.45701357466064, + "grad_norm": 1.2112210988998413, + 
"learning_rate": 1.418257048345356e-06, + "loss": 0.1882, + "step": 104375 + }, + { + "epoch": 94.47963800904978, + "grad_norm": 1.0226188898086548, + "learning_rate": 1.4068157550704868e-06, + "loss": 0.2021, + "step": 104400 + }, + { + "epoch": 94.50226244343891, + "grad_norm": 1.8163559436798096, + "learning_rate": 1.3954203612750014e-06, + "loss": 0.1935, + "step": 104425 + }, + { + "epoch": 94.52488687782805, + "grad_norm": 1.208831548690796, + "learning_rate": 1.3840708740661482e-06, + "loss": 0.2259, + "step": 104450 + }, + { + "epoch": 94.5475113122172, + "grad_norm": 1.374042272567749, + "learning_rate": 1.3727673005224815e-06, + "loss": 0.2432, + "step": 104475 + }, + { + "epoch": 94.57013574660634, + "grad_norm": 1.003490924835205, + "learning_rate": 1.3615096476939702e-06, + "loss": 0.1726, + "step": 104500 + }, + { + "epoch": 94.59276018099547, + "grad_norm": 0.8769525289535522, + "learning_rate": 1.3502979226019062e-06, + "loss": 0.2423, + "step": 104525 + }, + { + "epoch": 94.61538461538461, + "grad_norm": 1.1709707975387573, + "learning_rate": 1.3391321322389708e-06, + "loss": 0.1983, + "step": 104550 + }, + { + "epoch": 94.63800904977376, + "grad_norm": 0.6149175763130188, + "learning_rate": 1.3280122835691604e-06, + "loss": 0.1944, + "step": 104575 + }, + { + "epoch": 94.6606334841629, + "grad_norm": 1.0593194961547852, + "learning_rate": 1.3169383835278435e-06, + "loss": 0.2416, + "step": 104600 + }, + { + "epoch": 94.68325791855203, + "grad_norm": 1.1648855209350586, + "learning_rate": 1.3059104390217206e-06, + "loss": 0.2337, + "step": 104625 + }, + { + "epoch": 94.70588235294117, + "grad_norm": 0.9153949022293091, + "learning_rate": 1.2949284569288398e-06, + "loss": 0.2037, + "step": 104650 + }, + { + "epoch": 94.72850678733032, + "grad_norm": 0.9934732913970947, + "learning_rate": 1.2839924440985722e-06, + "loss": 0.1683, + "step": 104675 + }, + { + "epoch": 94.75113122171946, + "grad_norm": 0.7712545990943909, + "learning_rate": 
1.2731024073516117e-06, + "loss": 0.2012, + "step": 104700 + }, + { + "epoch": 94.77375565610859, + "grad_norm": 0.9374544024467468, + "learning_rate": 1.2622583534800002e-06, + "loss": 0.1571, + "step": 104725 + }, + { + "epoch": 94.79638009049773, + "grad_norm": 1.30306875705719, + "learning_rate": 1.2514602892470777e-06, + "loss": 0.2407, + "step": 104750 + }, + { + "epoch": 94.81900452488688, + "grad_norm": 1.0117579698562622, + "learning_rate": 1.2407082213875069e-06, + "loss": 0.2223, + "step": 104775 + }, + { + "epoch": 94.84162895927602, + "grad_norm": 0.8421927094459534, + "learning_rate": 1.2300021566072905e-06, + "loss": 0.1925, + "step": 104800 + }, + { + "epoch": 94.86425339366515, + "grad_norm": 1.0489987134933472, + "learning_rate": 1.2193421015836869e-06, + "loss": 0.2146, + "step": 104825 + }, + { + "epoch": 94.8868778280543, + "grad_norm": 0.6136251091957092, + "learning_rate": 1.2087280629653028e-06, + "loss": 0.2087, + "step": 104850 + }, + { + "epoch": 94.90950226244344, + "grad_norm": 1.6450775861740112, + "learning_rate": 1.1981600473720182e-06, + "loss": 0.241, + "step": 104875 + }, + { + "epoch": 94.93212669683258, + "grad_norm": 1.319272756576538, + "learning_rate": 1.1876380613950271e-06, + "loss": 0.1899, + "step": 104900 + }, + { + "epoch": 94.95475113122171, + "grad_norm": 0.9890087842941284, + "learning_rate": 1.177162111596805e-06, + "loss": 0.2008, + "step": 104925 + }, + { + "epoch": 94.97737556561086, + "grad_norm": 1.0454882383346558, + "learning_rate": 1.166732204511134e-06, + "loss": 0.2533, + "step": 104950 + }, + { + "epoch": 95.0, + "grad_norm": 1.0466210842132568, + "learning_rate": 1.156348346643035e-06, + "loss": 0.1704, + "step": 104975 + }, + { + "epoch": 95.02262443438914, + "grad_norm": 1.2732752561569214, + "learning_rate": 1.1460105444688533e-06, + "loss": 0.1997, + "step": 105000 + }, + { + "epoch": 95.04524886877829, + "grad_norm": 0.7997806072235107, + "learning_rate": 1.1357188044361976e-06, + "loss": 0.1741, + 
"step": 105025 + }, + { + "epoch": 95.06787330316742, + "grad_norm": 0.9983102679252625, + "learning_rate": 1.1258820752262033e-06, + "loss": 0.1968, + "step": 105050 + }, + { + "epoch": 95.09049773755656, + "grad_norm": 1.702331304550171, + "learning_rate": 1.115680635584129e-06, + "loss": 0.2113, + "step": 105075 + }, + { + "epoch": 95.1131221719457, + "grad_norm": 1.430219054222107, + "learning_rate": 1.105525277000091e-06, + "loss": 0.1525, + "step": 105100 + }, + { + "epoch": 95.13574660633485, + "grad_norm": 1.2468103170394897, + "learning_rate": 1.0954160058079143e-06, + "loss": 0.269, + "step": 105125 + }, + { + "epoch": 95.15837104072398, + "grad_norm": 0.8768815398216248, + "learning_rate": 1.0853528283126634e-06, + "loss": 0.3236, + "step": 105150 + }, + { + "epoch": 95.18099547511312, + "grad_norm": 0.8579899072647095, + "learning_rate": 1.075735548693593e-06, + "loss": 0.2739, + "step": 105175 + }, + { + "epoch": 95.20361990950227, + "grad_norm": 1.2456624507904053, + "learning_rate": 1.0657627330239893e-06, + "loss": 0.1553, + "step": 105200 + }, + { + "epoch": 95.22624434389141, + "grad_norm": 1.5865308046340942, + "learning_rate": 1.0558360295458173e-06, + "loss": 0.1892, + "step": 105225 + }, + { + "epoch": 95.24886877828054, + "grad_norm": 0.8442748785018921, + "learning_rate": 1.0459554444502998e-06, + "loss": 0.1483, + "step": 105250 + }, + { + "epoch": 95.27149321266968, + "grad_norm": 0.8328303694725037, + "learning_rate": 1.0361209838998574e-06, + "loss": 0.1628, + "step": 105275 + }, + { + "epoch": 95.29411764705883, + "grad_norm": 1.0149391889572144, + "learning_rate": 1.0263326540281752e-06, + "loss": 0.2416, + "step": 105300 + }, + { + "epoch": 95.31674208144797, + "grad_norm": 1.6882009506225586, + "learning_rate": 1.0165904609401533e-06, + "loss": 0.2808, + "step": 105325 + }, + { + "epoch": 95.3393665158371, + "grad_norm": 1.6237668991088867, + "learning_rate": 1.0068944107119226e-06, + "loss": 0.191, + "step": 105350 + }, + { + 
"epoch": 95.36199095022624, + "grad_norm": 0.9608161449432373, + "learning_rate": 9.97244509390821e-07, + "loss": 0.1574, + "step": 105375 + }, + { + "epoch": 95.38461538461539, + "grad_norm": 0.7665073275566101, + "learning_rate": 9.87640762995434e-07, + "loss": 0.1816, + "step": 105400 + }, + { + "epoch": 95.40723981900453, + "grad_norm": 1.0347704887390137, + "learning_rate": 9.780831775155206e-07, + "loss": 0.228, + "step": 105425 + }, + { + "epoch": 95.42986425339366, + "grad_norm": 1.0157124996185303, + "learning_rate": 9.685717589120874e-07, + "loss": 0.2145, + "step": 105450 + }, + { + "epoch": 95.4524886877828, + "grad_norm": 1.4060900211334229, + "learning_rate": 9.59106513117322e-07, + "loss": 0.1951, + "step": 105475 + }, + { + "epoch": 95.47511312217195, + "grad_norm": 0.8281680941581726, + "learning_rate": 9.496874460346276e-07, + "loss": 0.1727, + "step": 105500 + }, + { + "epoch": 95.49773755656109, + "grad_norm": 1.2150906324386597, + "learning_rate": 9.403145635385884e-07, + "loss": 0.2419, + "step": 105525 + }, + { + "epoch": 95.52036199095022, + "grad_norm": 0.8985528349876404, + "learning_rate": 9.309878714750113e-07, + "loss": 0.146, + "step": 105550 + }, + { + "epoch": 95.54298642533936, + "grad_norm": 0.9663586616516113, + "learning_rate": 9.2170737566086e-07, + "loss": 0.2395, + "step": 105575 + }, + { + "epoch": 95.56561085972851, + "grad_norm": 0.9380387663841248, + "learning_rate": 9.124730818843295e-07, + "loss": 0.1957, + "step": 105600 + }, + { + "epoch": 95.58823529411765, + "grad_norm": 0.9491564035415649, + "learning_rate": 9.032849959047544e-07, + "loss": 0.2082, + "step": 105625 + }, + { + "epoch": 95.61085972850678, + "grad_norm": 0.9426025152206421, + "learning_rate": 8.941431234526925e-07, + "loss": 0.189, + "step": 105650 + }, + { + "epoch": 95.63348416289593, + "grad_norm": 1.1982818841934204, + "learning_rate": 8.850474702298327e-07, + "loss": 0.1886, + "step": 105675 + }, + { + "epoch": 95.65610859728507, + "grad_norm": 
1.0873892307281494, + "learning_rate": 8.759980419090706e-07, + "loss": 0.1591, + "step": 105700 + }, + { + "epoch": 95.67873303167421, + "grad_norm": 1.0176407098770142, + "learning_rate": 8.669948441344665e-07, + "loss": 0.2202, + "step": 105725 + }, + { + "epoch": 95.70135746606334, + "grad_norm": 0.5090556144714355, + "learning_rate": 8.580378825212369e-07, + "loss": 0.2594, + "step": 105750 + }, + { + "epoch": 95.72398190045249, + "grad_norm": 1.0242820978164673, + "learning_rate": 8.491271626557716e-07, + "loss": 0.203, + "step": 105775 + }, + { + "epoch": 95.74660633484163, + "grad_norm": 0.6912797093391418, + "learning_rate": 8.402626900956083e-07, + "loss": 0.1706, + "step": 105800 + }, + { + "epoch": 95.76923076923077, + "grad_norm": 1.1983668804168701, + "learning_rate": 8.314444703694495e-07, + "loss": 0.1723, + "step": 105825 + }, + { + "epoch": 95.7918552036199, + "grad_norm": 0.9142023324966431, + "learning_rate": 8.226725089771541e-07, + "loss": 0.2184, + "step": 105850 + }, + { + "epoch": 95.81447963800905, + "grad_norm": 1.0519486665725708, + "learning_rate": 8.139468113897291e-07, + "loss": 0.1831, + "step": 105875 + }, + { + "epoch": 95.83710407239819, + "grad_norm": 1.305741310119629, + "learning_rate": 8.052673830493045e-07, + "loss": 0.2034, + "step": 105900 + }, + { + "epoch": 95.85972850678733, + "grad_norm": 0.7995584011077881, + "learning_rate": 7.966342293691835e-07, + "loss": 0.1994, + "step": 105925 + }, + { + "epoch": 95.88235294117646, + "grad_norm": 1.2197544574737549, + "learning_rate": 7.880473557337841e-07, + "loss": 0.1965, + "step": 105950 + }, + { + "epoch": 95.90497737556561, + "grad_norm": 1.1358721256256104, + "learning_rate": 7.795067674986805e-07, + "loss": 0.2104, + "step": 105975 + }, + { + "epoch": 95.92760180995475, + "grad_norm": 1.3733657598495483, + "learning_rate": 7.710124699905618e-07, + "loss": 0.2225, + "step": 106000 + }, + { + "epoch": 95.9502262443439, + "grad_norm": 0.9826918840408325, + "learning_rate": 
7.625644685072651e-07, + "loss": 0.2207, + "step": 106025 + }, + { + "epoch": 95.97285067873302, + "grad_norm": 1.742278814315796, + "learning_rate": 7.541627683177343e-07, + "loss": 0.2163, + "step": 106050 + }, + { + "epoch": 95.99547511312217, + "grad_norm": 0.9497316479682922, + "learning_rate": 7.458073746620357e-07, + "loss": 0.203, + "step": 106075 + }, + { + "epoch": 96.01809954751131, + "grad_norm": 1.1039804220199585, + "learning_rate": 7.374982927513679e-07, + "loss": 0.2386, + "step": 106100 + }, + { + "epoch": 96.04072398190046, + "grad_norm": 0.8502349257469177, + "learning_rate": 7.292355277680434e-07, + "loss": 0.2104, + "step": 106125 + }, + { + "epoch": 96.0633484162896, + "grad_norm": 1.3425025939941406, + "learning_rate": 7.210190848654734e-07, + "loss": 0.1992, + "step": 106150 + }, + { + "epoch": 96.08597285067873, + "grad_norm": 0.8537681698799133, + "learning_rate": 7.128489691681921e-07, + "loss": 0.1741, + "step": 106175 + }, + { + "epoch": 96.10859728506787, + "grad_norm": 0.8187534809112549, + "learning_rate": 7.0472518577184e-07, + "loss": 0.2135, + "step": 106200 + }, + { + "epoch": 96.13122171945702, + "grad_norm": 3.6513512134552, + "learning_rate": 6.966477397431475e-07, + "loss": 0.208, + "step": 106225 + }, + { + "epoch": 96.15384615384616, + "grad_norm": 0.49796590209007263, + "learning_rate": 6.886166361199514e-07, + "loss": 0.1867, + "step": 106250 + }, + { + "epoch": 96.17647058823529, + "grad_norm": 0.8673714399337769, + "learning_rate": 6.806318799111949e-07, + "loss": 0.2391, + "step": 106275 + }, + { + "epoch": 96.19909502262443, + "grad_norm": 1.4049699306488037, + "learning_rate": 6.726934760968944e-07, + "loss": 0.2303, + "step": 106300 + }, + { + "epoch": 96.22171945701358, + "grad_norm": 1.2969492673873901, + "learning_rate": 6.648014296281895e-07, + "loss": 0.1813, + "step": 106325 + }, + { + "epoch": 96.24434389140272, + "grad_norm": 0.7551602125167847, + "learning_rate": 6.569557454272595e-07, + "loss": 0.2027, + 
"step": 106350 + }, + { + "epoch": 96.26696832579185, + "grad_norm": 1.1955355405807495, + "learning_rate": 6.491564283874234e-07, + "loss": 0.2105, + "step": 106375 + }, + { + "epoch": 96.289592760181, + "grad_norm": 1.5181422233581543, + "learning_rate": 6.41403483373032e-07, + "loss": 0.1833, + "step": 106400 + }, + { + "epoch": 96.31221719457014, + "grad_norm": 1.6462416648864746, + "learning_rate": 6.33696915219542e-07, + "loss": 0.2627, + "step": 106425 + }, + { + "epoch": 96.33484162895928, + "grad_norm": 1.1370958089828491, + "learning_rate": 6.260367287334755e-07, + "loss": 0.1615, + "step": 106450 + }, + { + "epoch": 96.35746606334841, + "grad_norm": 1.0515254735946655, + "learning_rate": 6.184229286924358e-07, + "loss": 0.1848, + "step": 106475 + }, + { + "epoch": 96.38009049773756, + "grad_norm": 0.6994163990020752, + "learning_rate": 6.108555198450826e-07, + "loss": 0.2235, + "step": 106500 + }, + { + "epoch": 96.4027149321267, + "grad_norm": 1.143445372581482, + "learning_rate": 6.033345069111489e-07, + "loss": 0.1824, + "step": 106525 + }, + { + "epoch": 96.42533936651584, + "grad_norm": 1.1587834358215332, + "learning_rate": 5.958598945814325e-07, + "loss": 0.1896, + "step": 106550 + }, + { + "epoch": 96.44796380090497, + "grad_norm": 0.7910786271095276, + "learning_rate": 5.884316875177958e-07, + "loss": 0.1526, + "step": 106575 + }, + { + "epoch": 96.47058823529412, + "grad_norm": 1.0434627532958984, + "learning_rate": 5.81049890353133e-07, + "loss": 0.209, + "step": 106600 + }, + { + "epoch": 96.49321266968326, + "grad_norm": 1.4372351169586182, + "learning_rate": 5.737145076914279e-07, + "loss": 0.2571, + "step": 106625 + }, + { + "epoch": 96.5158371040724, + "grad_norm": 0.6785518527030945, + "learning_rate": 5.664255441076959e-07, + "loss": 0.2259, + "step": 106650 + }, + { + "epoch": 96.53846153846153, + "grad_norm": 1.279801368713379, + "learning_rate": 5.591830041480089e-07, + "loss": 0.19, + "step": 106675 + }, + { + "epoch": 
96.56108597285068, + "grad_norm": 1.3192423582077026, + "learning_rate": 5.519868923294702e-07, + "loss": 0.2327, + "step": 106700 + }, + { + "epoch": 96.58371040723982, + "grad_norm": 1.2175381183624268, + "learning_rate": 5.448372131402479e-07, + "loss": 0.1903, + "step": 106725 + }, + { + "epoch": 96.60633484162896, + "grad_norm": 0.8464931845664978, + "learning_rate": 5.377339710395334e-07, + "loss": 0.1747, + "step": 106750 + }, + { + "epoch": 96.6289592760181, + "grad_norm": 0.9121562838554382, + "learning_rate": 5.306771704575663e-07, + "loss": 0.2012, + "step": 106775 + }, + { + "epoch": 96.65158371040724, + "grad_norm": 0.6595601439476013, + "learning_rate": 5.236668157956092e-07, + "loss": 0.1959, + "step": 106800 + }, + { + "epoch": 96.67420814479638, + "grad_norm": 0.7561436295509338, + "learning_rate": 5.16702911425973e-07, + "loss": 0.2023, + "step": 106825 + }, + { + "epoch": 96.69683257918552, + "grad_norm": 0.8651442527770996, + "learning_rate": 5.097854616919833e-07, + "loss": 0.2495, + "step": 106850 + }, + { + "epoch": 96.71945701357465, + "grad_norm": 0.8870030045509338, + "learning_rate": 5.029144709080057e-07, + "loss": 0.1939, + "step": 106875 + }, + { + "epoch": 96.7420814479638, + "grad_norm": 0.929315447807312, + "learning_rate": 4.960899433594123e-07, + "loss": 0.2965, + "step": 106900 + }, + { + "epoch": 96.76470588235294, + "grad_norm": 1.3893049955368042, + "learning_rate": 4.893118833026066e-07, + "loss": 0.2217, + "step": 106925 + }, + { + "epoch": 96.78733031674209, + "grad_norm": 1.6950517892837524, + "learning_rate": 4.825802949650237e-07, + "loss": 0.1845, + "step": 106950 + }, + { + "epoch": 96.80995475113122, + "grad_norm": 0.5185062289237976, + "learning_rate": 4.7589518254508017e-07, + "loss": 0.1639, + "step": 106975 + }, + { + "epoch": 96.83257918552036, + "grad_norm": 0.9957413673400879, + "learning_rate": 4.692565502122492e-07, + "loss": 0.1754, + "step": 107000 + }, + { + "epoch": 96.8552036199095, + "grad_norm": 
1.1682478189468384, + "learning_rate": 4.6266440210697695e-07, + "loss": 0.2075, + "step": 107025 + }, + { + "epoch": 96.87782805429865, + "grad_norm": 0.8752449154853821, + "learning_rate": 4.561187423407414e-07, + "loss": 0.1964, + "step": 107050 + }, + { + "epoch": 96.90045248868778, + "grad_norm": 0.8976783156394958, + "learning_rate": 4.496195749960102e-07, + "loss": 0.2129, + "step": 107075 + }, + { + "epoch": 96.92307692307692, + "grad_norm": 0.6009721159934998, + "learning_rate": 4.431669041262742e-07, + "loss": 0.212, + "step": 107100 + }, + { + "epoch": 96.94570135746606, + "grad_norm": 0.8184754848480225, + "learning_rate": 4.367607337559975e-07, + "loss": 0.1688, + "step": 107125 + }, + { + "epoch": 96.96832579185521, + "grad_norm": 0.6784803867340088, + "learning_rate": 4.304010678806674e-07, + "loss": 0.1856, + "step": 107150 + }, + { + "epoch": 96.99095022624434, + "grad_norm": 1.0374619960784912, + "learning_rate": 4.240879104667611e-07, + "loss": 0.2438, + "step": 107175 + }, + { + "epoch": 97.01357466063348, + "grad_norm": 1.0512750148773193, + "learning_rate": 4.1782126545172876e-07, + "loss": 0.1997, + "step": 107200 + }, + { + "epoch": 97.03619909502262, + "grad_norm": 0.7069447040557861, + "learning_rate": 4.1160113674404417e-07, + "loss": 0.2011, + "step": 107225 + }, + { + "epoch": 97.05882352941177, + "grad_norm": 0.6853822469711304, + "learning_rate": 4.0542752822312894e-07, + "loss": 0.1959, + "step": 107250 + }, + { + "epoch": 97.08144796380091, + "grad_norm": 0.6463684439659119, + "learning_rate": 3.9930044373943647e-07, + "loss": 0.245, + "step": 107275 + }, + { + "epoch": 97.10407239819004, + "grad_norm": 1.1984963417053223, + "learning_rate": 3.932198871143682e-07, + "loss": 0.187, + "step": 107300 + }, + { + "epoch": 97.12669683257919, + "grad_norm": 0.9013722538948059, + "learning_rate": 3.8718586214033233e-07, + "loss": 0.2416, + "step": 107325 + }, + { + "epoch": 97.14932126696833, + "grad_norm": 1.4150621891021729, + 
"learning_rate": 3.8119837258067663e-07, + "loss": 0.2377, + "step": 107350 + }, + { + "epoch": 97.17194570135747, + "grad_norm": 1.1563483476638794, + "learning_rate": 3.7525742216976404e-07, + "loss": 0.1711, + "step": 107375 + }, + { + "epoch": 97.1945701357466, + "grad_norm": 0.775921642780304, + "learning_rate": 3.693630146129306e-07, + "loss": 0.2397, + "step": 107400 + }, + { + "epoch": 97.21719457013575, + "grad_norm": 0.9456844329833984, + "learning_rate": 3.635151535864522e-07, + "loss": 0.192, + "step": 107425 + }, + { + "epoch": 97.23981900452489, + "grad_norm": 0.8060914278030396, + "learning_rate": 3.5771384273760307e-07, + "loss": 0.2544, + "step": 107450 + }, + { + "epoch": 97.26244343891403, + "grad_norm": 1.0496222972869873, + "learning_rate": 3.5195908568460554e-07, + "loss": 0.1845, + "step": 107475 + }, + { + "epoch": 97.28506787330316, + "grad_norm": 1.110653042793274, + "learning_rate": 3.462508860166635e-07, + "loss": 0.2204, + "step": 107500 + }, + { + "epoch": 97.3076923076923, + "grad_norm": 1.4529675245285034, + "learning_rate": 3.4058924729392925e-07, + "loss": 0.2007, + "step": 107525 + }, + { + "epoch": 97.33031674208145, + "grad_norm": 0.9726166129112244, + "learning_rate": 3.3497417304752806e-07, + "loss": 0.1843, + "step": 107550 + }, + { + "epoch": 97.3529411764706, + "grad_norm": 1.0995749235153198, + "learning_rate": 3.294056667795336e-07, + "loss": 0.2261, + "step": 107575 + }, + { + "epoch": 97.37556561085972, + "grad_norm": 0.6885856986045837, + "learning_rate": 3.2388373196297613e-07, + "loss": 0.1681, + "step": 107600 + }, + { + "epoch": 97.39819004524887, + "grad_norm": 1.1119883060455322, + "learning_rate": 3.1840837204184234e-07, + "loss": 0.1899, + "step": 107625 + }, + { + "epoch": 97.42081447963801, + "grad_norm": 0.9760034680366516, + "learning_rate": 3.129795904310839e-07, + "loss": 0.2193, + "step": 107650 + }, + { + "epoch": 97.44343891402715, + "grad_norm": 0.9522729516029358, + "learning_rate": 
3.075973905165674e-07, + "loss": 0.1967, + "step": 107675 + }, + { + "epoch": 97.46606334841628, + "grad_norm": 1.1178126335144043, + "learning_rate": 3.0226177565514096e-07, + "loss": 0.1555, + "step": 107700 + }, + { + "epoch": 97.48868778280543, + "grad_norm": 0.8215828537940979, + "learning_rate": 2.9697274917457604e-07, + "loss": 0.2184, + "step": 107725 + }, + { + "epoch": 97.51131221719457, + "grad_norm": 1.2026300430297852, + "learning_rate": 2.917303143736088e-07, + "loss": 0.1963, + "step": 107750 + }, + { + "epoch": 97.53393665158372, + "grad_norm": 1.8742423057556152, + "learning_rate": 2.865344745218906e-07, + "loss": 0.1814, + "step": 107775 + }, + { + "epoch": 97.55656108597285, + "grad_norm": 1.6945271492004395, + "learning_rate": 2.8138523286003747e-07, + "loss": 0.1677, + "step": 107800 + }, + { + "epoch": 97.57918552036199, + "grad_norm": 0.8677001595497131, + "learning_rate": 2.762825925995721e-07, + "loss": 0.2478, + "step": 107825 + }, + { + "epoch": 97.60180995475113, + "grad_norm": 1.41805100440979, + "learning_rate": 2.7122655692299875e-07, + "loss": 0.2183, + "step": 107850 + }, + { + "epoch": 97.62443438914028, + "grad_norm": 1.0568068027496338, + "learning_rate": 2.6621712898369506e-07, + "loss": 0.1626, + "step": 107875 + }, + { + "epoch": 97.6470588235294, + "grad_norm": 0.5560912489891052, + "learning_rate": 2.6125431190602006e-07, + "loss": 0.1507, + "step": 107900 + }, + { + "epoch": 97.66968325791855, + "grad_norm": 4.1539483070373535, + "learning_rate": 2.563381087852395e-07, + "loss": 0.22, + "step": 107925 + }, + { + "epoch": 97.6923076923077, + "grad_norm": 0.9042114615440369, + "learning_rate": 2.5146852268755067e-07, + "loss": 0.2123, + "step": 107950 + }, + { + "epoch": 97.71493212669684, + "grad_norm": 0.9128357172012329, + "learning_rate": 2.466455566500658e-07, + "loss": 0.1648, + "step": 107975 + }, + { + "epoch": 97.73755656108597, + "grad_norm": 2.1743557453155518, + "learning_rate": 2.4186921368084533e-07, + "loss": 
0.2671, + "step": 108000 + }, + { + "epoch": 97.76018099547511, + "grad_norm": 1.140045404434204, + "learning_rate": 2.3713949675884802e-07, + "loss": 0.1961, + "step": 108025 + }, + { + "epoch": 97.78280542986425, + "grad_norm": 1.2659268379211426, + "learning_rate": 2.324564088339642e-07, + "loss": 0.2415, + "step": 108050 + }, + { + "epoch": 97.8054298642534, + "grad_norm": 1.0270200967788696, + "learning_rate": 2.2781995282699085e-07, + "loss": 0.198, + "step": 108075 + }, + { + "epoch": 97.82805429864253, + "grad_norm": 1.1628632545471191, + "learning_rate": 2.232301316296481e-07, + "loss": 0.1938, + "step": 108100 + }, + { + "epoch": 97.85067873303167, + "grad_norm": 0.7492268681526184, + "learning_rate": 2.1868694810457943e-07, + "loss": 0.1708, + "step": 108125 + }, + { + "epoch": 97.87330316742081, + "grad_norm": 1.1845602989196777, + "learning_rate": 2.1419040508533492e-07, + "loss": 0.1817, + "step": 108150 + }, + { + "epoch": 97.89592760180996, + "grad_norm": 0.9418588280677795, + "learning_rate": 2.0974050537635456e-07, + "loss": 0.2071, + "step": 108175 + }, + { + "epoch": 97.91855203619909, + "grad_norm": 0.934147834777832, + "learning_rate": 2.0533725175302663e-07, + "loss": 0.2131, + "step": 108200 + }, + { + "epoch": 97.94117647058823, + "grad_norm": 1.1434376239776611, + "learning_rate": 2.0098064696160432e-07, + "loss": 0.2049, + "step": 108225 + }, + { + "epoch": 97.96380090497738, + "grad_norm": 0.6958593130111694, + "learning_rate": 1.966706937192808e-07, + "loss": 0.1928, + "step": 108250 + }, + { + "epoch": 97.98642533936652, + "grad_norm": 0.9885338544845581, + "learning_rate": 1.9240739471413913e-07, + "loss": 0.2062, + "step": 108275 + }, + { + "epoch": 98.00904977375566, + "grad_norm": 1.6239970922470093, + "learning_rate": 1.881907526051607e-07, + "loss": 0.2045, + "step": 108300 + }, + { + "epoch": 98.03167420814479, + "grad_norm": 1.2024998664855957, + "learning_rate": 1.8402077002222516e-07, + "loss": 0.2421, + "step": 108325 + }, + 
{ + "epoch": 98.05429864253394, + "grad_norm": 0.8262038826942444, + "learning_rate": 1.7989744956613538e-07, + "loss": 0.2003, + "step": 108350 + }, + { + "epoch": 98.07692307692308, + "grad_norm": 0.7141467332839966, + "learning_rate": 1.7582079380855096e-07, + "loss": 0.239, + "step": 108375 + }, + { + "epoch": 98.09954751131222, + "grad_norm": 0.7989129424095154, + "learning_rate": 1.7179080529207135e-07, + "loss": 0.2318, + "step": 108400 + }, + { + "epoch": 98.12217194570135, + "grad_norm": 1.194916009902954, + "learning_rate": 1.6780748653015263e-07, + "loss": 0.1894, + "step": 108425 + }, + { + "epoch": 98.1447963800905, + "grad_norm": 0.6949662566184998, + "learning_rate": 1.6387084000716587e-07, + "loss": 0.2095, + "step": 108450 + }, + { + "epoch": 98.16742081447964, + "grad_norm": 0.8797706961631775, + "learning_rate": 1.5998086817835542e-07, + "loss": 0.2249, + "step": 108475 + }, + { + "epoch": 98.19004524886878, + "grad_norm": 1.238031268119812, + "learning_rate": 1.5613757346988055e-07, + "loss": 0.2747, + "step": 108500 + }, + { + "epoch": 98.21266968325791, + "grad_norm": 1.2970422506332397, + "learning_rate": 1.523409582787738e-07, + "loss": 0.1667, + "step": 108525 + }, + { + "epoch": 98.23529411764706, + "grad_norm": 0.8342011570930481, + "learning_rate": 1.4859102497293274e-07, + "loss": 0.1889, + "step": 108550 + }, + { + "epoch": 98.2579185520362, + "grad_norm": 0.9561188220977783, + "learning_rate": 1.448877758911865e-07, + "loss": 0.2635, + "step": 108575 + }, + { + "epoch": 98.28054298642535, + "grad_norm": 1.4334338903427124, + "learning_rate": 1.4123121334319587e-07, + "loss": 0.2002, + "step": 108600 + }, + { + "epoch": 98.30316742081448, + "grad_norm": 1.6493507623672485, + "learning_rate": 1.3762133960955323e-07, + "loss": 0.1708, + "step": 108625 + }, + { + "epoch": 98.32579185520362, + "grad_norm": 1.9994558095932007, + "learning_rate": 1.3405815694169931e-07, + "loss": 0.2021, + "step": 108650 + }, + { + "epoch": 
98.34841628959276, + "grad_norm": 0.6810872554779053, + "learning_rate": 1.305416675619647e-07, + "loss": 0.1983, + "step": 108675 + }, + { + "epoch": 98.3710407239819, + "grad_norm": 0.8623680472373962, + "learning_rate": 1.270718736635451e-07, + "loss": 0.1699, + "step": 108700 + }, + { + "epoch": 98.39366515837104, + "grad_norm": 0.7404358386993408, + "learning_rate": 1.2364877741053435e-07, + "loss": 0.2153, + "step": 108725 + }, + { + "epoch": 98.41628959276018, + "grad_norm": 1.1512694358825684, + "learning_rate": 1.2027238093788306e-07, + "loss": 0.1662, + "step": 108750 + }, + { + "epoch": 98.43891402714932, + "grad_norm": 0.794710099697113, + "learning_rate": 1.1694268635142335e-07, + "loss": 0.1841, + "step": 108775 + }, + { + "epoch": 98.46153846153847, + "grad_norm": 1.0078766345977783, + "learning_rate": 1.1365969572786904e-07, + "loss": 0.2332, + "step": 108800 + }, + { + "epoch": 98.4841628959276, + "grad_norm": 1.046234369277954, + "learning_rate": 1.1042341111478226e-07, + "loss": 0.1803, + "step": 108825 + }, + { + "epoch": 98.50678733031674, + "grad_norm": 4.2476725578308105, + "learning_rate": 1.0723383453061507e-07, + "loss": 0.2169, + "step": 108850 + }, + { + "epoch": 98.52941176470588, + "grad_norm": 0.9719073176383972, + "learning_rate": 1.0409096796468453e-07, + "loss": 0.149, + "step": 108875 + }, + { + "epoch": 98.55203619909503, + "grad_norm": 0.9626606702804565, + "learning_rate": 1.0099481337715599e-07, + "loss": 0.2009, + "step": 108900 + }, + { + "epoch": 98.57466063348416, + "grad_norm": 0.4916283190250397, + "learning_rate": 9.794537269909308e-08, + "loss": 0.2229, + "step": 108925 + }, + { + "epoch": 98.5972850678733, + "grad_norm": 0.7384562492370605, + "learning_rate": 9.494264783239947e-08, + "loss": 0.1724, + "step": 108950 + }, + { + "epoch": 98.61990950226244, + "grad_norm": 0.9659861922264099, + "learning_rate": 9.198664064985206e-08, + "loss": 0.1845, + "step": 108975 + }, + { + "epoch": 98.64253393665159, + "grad_norm": 
1.0783486366271973, + "learning_rate": 8.907735299508445e-08, + "loss": 0.1884, + "step": 109000 + }, + { + "epoch": 98.66515837104072, + "grad_norm": 0.7933319211006165, + "learning_rate": 8.621478668260351e-08, + "loss": 0.2171, + "step": 109025 + }, + { + "epoch": 98.68778280542986, + "grad_norm": 0.7159486413002014, + "learning_rate": 8.339894349776444e-08, + "loss": 0.2953, + "step": 109050 + }, + { + "epoch": 98.710407239819, + "grad_norm": 0.9898947477340698, + "learning_rate": 8.06298251967874e-08, + "loss": 0.2242, + "step": 109075 + }, + { + "epoch": 98.73303167420815, + "grad_norm": 1.0413763523101807, + "learning_rate": 7.790743350674922e-08, + "loss": 0.1767, + "step": 109100 + }, + { + "epoch": 98.75565610859728, + "grad_norm": 1.061669945716858, + "learning_rate": 7.523177012559167e-08, + "loss": 0.2004, + "step": 109125 + }, + { + "epoch": 98.77828054298642, + "grad_norm": 4.033838748931885, + "learning_rate": 7.260283672208822e-08, + "loss": 0.2846, + "step": 109150 + }, + { + "epoch": 98.80090497737557, + "grad_norm": 0.962801456451416, + "learning_rate": 7.002063493588562e-08, + "loss": 0.1659, + "step": 109175 + }, + { + "epoch": 98.82352941176471, + "grad_norm": 0.941548228263855, + "learning_rate": 6.748516637749556e-08, + "loss": 0.1697, + "step": 109200 + }, + { + "epoch": 98.84615384615384, + "grad_norm": 1.4694151878356934, + "learning_rate": 6.499643262826149e-08, + "loss": 0.1457, + "step": 109225 + }, + { + "epoch": 98.86877828054298, + "grad_norm": 0.8219665288925171, + "learning_rate": 6.255443524039172e-08, + "loss": 0.2182, + "step": 109250 + }, + { + "epoch": 98.89140271493213, + "grad_norm": 1.0226620435714722, + "learning_rate": 6.025408873085724e-08, + "loss": 0.1969, + "step": 109275 + }, + { + "epoch": 98.91402714932127, + "grad_norm": 0.899225115776062, + "learning_rate": 5.790369900227076e-08, + "loss": 0.1797, + "step": 109300 + }, + { + "epoch": 98.9366515837104, + "grad_norm": 0.7753010988235474, + "learning_rate": 
5.560005005871837e-08, + "loss": 0.19, + "step": 109325 + }, + { + "epoch": 98.95927601809954, + "grad_norm": 0.923560619354248, + "learning_rate": 5.334314333697576e-08, + "loss": 0.194, + "step": 109350 + }, + { + "epoch": 98.98190045248869, + "grad_norm": 1.2471297979354858, + "learning_rate": 5.1132980244658617e-08, + "loss": 0.1879, + "step": 109375 + }, + { + "epoch": 99.00452488687783, + "grad_norm": 1.3618764877319336, + "learning_rate": 4.896956216023096e-08, + "loss": 0.1803, + "step": 109400 + }, + { + "epoch": 99.02714932126698, + "grad_norm": 1.0565627813339233, + "learning_rate": 4.6852890432988453e-08, + "loss": 0.1534, + "step": 109425 + }, + { + "epoch": 99.0497737556561, + "grad_norm": 1.0808826684951782, + "learning_rate": 4.4782966383091754e-08, + "loss": 0.2064, + "step": 109450 + }, + { + "epoch": 99.07239819004525, + "grad_norm": 1.2194515466690063, + "learning_rate": 4.275979130153318e-08, + "loss": 0.2306, + "step": 109475 + }, + { + "epoch": 99.09502262443439, + "grad_norm": 3.7945005893707275, + "learning_rate": 4.078336645014502e-08, + "loss": 0.196, + "step": 109500 + }, + { + "epoch": 99.11764705882354, + "grad_norm": 1.1586612462997437, + "learning_rate": 3.885369306161623e-08, + "loss": 0.2183, + "step": 109525 + }, + { + "epoch": 99.14027149321267, + "grad_norm": 0.8019810318946838, + "learning_rate": 3.697077233946743e-08, + "loss": 0.1617, + "step": 109550 + }, + { + "epoch": 99.16289592760181, + "grad_norm": 1.1393961906433105, + "learning_rate": 3.513460545805091e-08, + "loss": 0.2624, + "step": 109575 + }, + { + "epoch": 99.18552036199095, + "grad_norm": 1.6677390336990356, + "learning_rate": 3.3345193562583915e-08, + "loss": 0.1994, + "step": 109600 + }, + { + "epoch": 99.2081447963801, + "grad_norm": 1.3894984722137451, + "learning_rate": 3.160253776909871e-08, + "loss": 0.2197, + "step": 109625 + }, + { + "epoch": 99.23076923076923, + "grad_norm": 1.0854603052139282, + "learning_rate": 2.990663916447589e-08, + "loss": 
0.1929, + "step": 109650 + }, + { + "epoch": 99.25339366515837, + "grad_norm": 1.320836067199707, + "learning_rate": 2.8257498806444345e-08, + "loss": 0.1708, + "step": 109675 + }, + { + "epoch": 99.27601809954751, + "grad_norm": 0.7589247226715088, + "learning_rate": 2.6655117723548005e-08, + "loss": 0.2757, + "step": 109700 + }, + { + "epoch": 99.29864253393666, + "grad_norm": 0.6867343783378601, + "learning_rate": 2.5099496915179095e-08, + "loss": 0.2192, + "step": 109725 + }, + { + "epoch": 99.32126696832579, + "grad_norm": 1.2005285024642944, + "learning_rate": 2.3590637351569852e-08, + "loss": 0.2282, + "step": 109750 + }, + { + "epoch": 99.34389140271493, + "grad_norm": 0.9429746866226196, + "learning_rate": 2.212853997379249e-08, + "loss": 0.193, + "step": 109775 + }, + { + "epoch": 99.36651583710407, + "grad_norm": 1.5281111001968384, + "learning_rate": 2.071320569372592e-08, + "loss": 0.1732, + "step": 109800 + }, + { + "epoch": 99.38914027149322, + "grad_norm": 0.6826324462890625, + "learning_rate": 1.9344635394122344e-08, + "loss": 0.2163, + "step": 109825 + }, + { + "epoch": 99.41176470588235, + "grad_norm": 1.1552412509918213, + "learning_rate": 1.8022829928532324e-08, + "loss": 0.1768, + "step": 109850 + }, + { + "epoch": 99.43438914027149, + "grad_norm": 0.668086051940918, + "learning_rate": 1.674779012136307e-08, + "loss": 0.1905, + "step": 109875 + }, + { + "epoch": 99.45701357466064, + "grad_norm": 1.4751818180084229, + "learning_rate": 1.5519516767853456e-08, + "loss": 0.1776, + "step": 109900 + }, + { + "epoch": 99.47963800904978, + "grad_norm": 1.344252586364746, + "learning_rate": 1.4338010634049046e-08, + "loss": 0.1896, + "step": 109925 + }, + { + "epoch": 99.50226244343891, + "grad_norm": 0.7122328877449036, + "learning_rate": 1.3203272456868697e-08, + "loss": 0.2223, + "step": 109950 + }, + { + "epoch": 99.52488687782805, + "grad_norm": 3.9881479740142822, + "learning_rate": 1.2115302944021298e-08, + "loss": 0.2235, + "step": 109975 + }, 
+ { + "epoch": 99.5475113122172, + "grad_norm": 1.2549389600753784, + "learning_rate": 1.1074102774072391e-08, + "loss": 0.1651, + "step": 110000 + }, + { + "epoch": 99.57013574660634, + "grad_norm": 1.056302547454834, + "learning_rate": 1.0079672596402522e-08, + "loss": 0.2073, + "step": 110025 + }, + { + "epoch": 99.59276018099547, + "grad_norm": 0.7567543387413025, + "learning_rate": 9.132013031248886e-09, + "loss": 0.1968, + "step": 110050 + }, + { + "epoch": 99.61538461538461, + "grad_norm": 0.9951006770133972, + "learning_rate": 8.23112466963871e-09, + "loss": 0.218, + "step": 110075 + }, + { + "epoch": 99.63800904977376, + "grad_norm": 1.0877238512039185, + "learning_rate": 7.37700807345587e-09, + "loss": 0.1514, + "step": 110100 + }, + { + "epoch": 99.6606334841629, + "grad_norm": 0.8827500939369202, + "learning_rate": 6.5696637754075744e-09, + "loss": 0.1804, + "step": 110125 + }, + { + "epoch": 99.68325791855203, + "grad_norm": 0.8063337802886963, + "learning_rate": 5.809092279032701e-09, + "loss": 0.1606, + "step": 110150 + }, + { + "epoch": 99.70588235294117, + "grad_norm": 0.7970460653305054, + "learning_rate": 5.095294058676813e-09, + "loss": 0.2114, + "step": 110175 + }, + { + "epoch": 99.72850678733032, + "grad_norm": 0.9552301168441772, + "learning_rate": 4.428269559550446e-09, + "loss": 0.1904, + "step": 110200 + }, + { + "epoch": 99.75113122171946, + "grad_norm": 0.9689047336578369, + "learning_rate": 3.808019197662493e-09, + "loss": 0.2306, + "step": 110225 + }, + { + "epoch": 99.77375565610859, + "grad_norm": 1.2445021867752075, + "learning_rate": 3.234543359853514e-09, + "loss": 0.2111, + "step": 110250 + }, + { + "epoch": 99.79638009049773, + "grad_norm": 1.4182652235031128, + "learning_rate": 2.7078424038040613e-09, + "loss": 0.2064, + "step": 110275 + }, + { + "epoch": 99.81900452488688, + "grad_norm": 1.0118160247802734, + "learning_rate": 2.2279166580096987e-09, + "loss": 0.2341, + "step": 110300 + }, + { + "epoch": 99.84162895927602, + 
"grad_norm": 1.0100202560424805, + "learning_rate": 1.7947664217976553e-09, + "loss": 0.1898, + "step": 110325 + }, + { + "epoch": 99.86425339366515, + "grad_norm": 0.5291355848312378, + "learning_rate": 1.4083919653101738e-09, + "loss": 0.222, + "step": 110350 + }, + { + "epoch": 99.8868778280543, + "grad_norm": 0.5837073922157288, + "learning_rate": 1.0687935295461413e-09, + "loss": 0.18, + "step": 110375 + }, + { + "epoch": 99.90950226244344, + "grad_norm": 1.0217236280441284, + "learning_rate": 7.759713262861512e-10, + "loss": 0.195, + "step": 110400 + }, + { + "epoch": 99.93212669683258, + "grad_norm": 0.9842627048492432, + "learning_rate": 5.29925538184095e-10, + "loss": 0.2016, + "step": 110425 + }, + { + "epoch": 99.95475113122171, + "grad_norm": 1.3540014028549194, + "learning_rate": 3.306563186838973e-10, + "loss": 0.3034, + "step": 110450 + }, + { + "epoch": 99.97737556561086, + "grad_norm": 1.082549810409546, + "learning_rate": 1.781637920694745e-10, + "loss": 0.186, + "step": 110475 + }, + { + "epoch": 100.0, + "grad_norm": 0.7581382393836975, + "learning_rate": 7.244805344808202e-11, + "loss": 0.1697, + "step": 110500 + } + ], + "logging_steps": 25, + "max_steps": 110500, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.44639913934848e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}