{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 0.7542400501913732, "learning_rate": 3.5087719298245616e-07, "loss": 1.6751, "step": 1 }, { "epoch": 0.0032, "grad_norm": 0.742305937368562, "learning_rate": 7.017543859649123e-07, "loss": 1.6555, "step": 2 }, { "epoch": 0.0048, "grad_norm": 0.6980748485515275, "learning_rate": 1.0526315789473685e-06, "loss": 1.6802, "step": 3 }, { "epoch": 0.0064, "grad_norm": 0.8118757008619297, "learning_rate": 1.4035087719298246e-06, "loss": 1.7025, "step": 4 }, { "epoch": 0.008, "grad_norm": 0.7050523685798155, "learning_rate": 1.7543859649122807e-06, "loss": 1.6962, "step": 5 }, { "epoch": 0.0096, "grad_norm": 0.711727455005183, "learning_rate": 2.105263157894737e-06, "loss": 1.697, "step": 6 }, { "epoch": 0.0112, "grad_norm": 0.7160671118334767, "learning_rate": 2.456140350877193e-06, "loss": 1.6868, "step": 7 }, { "epoch": 0.0128, "grad_norm": 0.7821002979119394, "learning_rate": 2.8070175438596493e-06, "loss": 1.6639, "step": 8 }, { "epoch": 0.0144, "grad_norm": 0.7451098951855825, "learning_rate": 3.157894736842105e-06, "loss": 1.6825, "step": 9 }, { "epoch": 0.016, "grad_norm": 0.7363068641172336, "learning_rate": 3.5087719298245615e-06, "loss": 1.6668, "step": 10 }, { "epoch": 0.0176, "grad_norm": 0.7116123257106387, "learning_rate": 3.859649122807018e-06, "loss": 1.665, "step": 11 }, { "epoch": 0.0192, "grad_norm": 0.692542211487462, "learning_rate": 4.210526315789474e-06, "loss": 1.6172, "step": 12 }, { "epoch": 0.0208, "grad_norm": 0.7049109164414815, "learning_rate": 4.56140350877193e-06, "loss": 1.6232, "step": 13 }, { "epoch": 0.0224, "grad_norm": 0.8451468378901278, "learning_rate": 4.912280701754386e-06, "loss": 1.6017, "step": 14 }, { "epoch": 0.024, "grad_norm": 0.8451468378901278, "learning_rate": 4.912280701754386e-06, "loss": 1.6056, "step": 15 }, { "epoch": 0.0256, "grad_norm": 0.7169606297094776, "learning_rate": 5.263157894736842e-06, "loss": 1.6057, "step": 16 }, { "epoch": 0.0272, "grad_norm": 0.7135705482908589, "learning_rate": 5.6140350877192985e-06, "loss": 1.6443, "step": 17 }, { "epoch": 0.0288, "grad_norm": 0.7167917758950816, "learning_rate": 5.964912280701755e-06, "loss": 1.5457, "step": 18 }, { "epoch": 0.0304, "grad_norm": 0.7764053685064782, "learning_rate": 6.31578947368421e-06, "loss": 1.5726, "step": 19 }, { "epoch": 0.032, "grad_norm": 0.9473759206115546, "learning_rate": 6.666666666666667e-06, "loss": 1.5574, "step": 20 }, { "epoch": 0.0336, "grad_norm": 0.7523094343855393, "learning_rate": 7.017543859649123e-06, "loss": 1.5088, "step": 21 }, { "epoch": 0.0352, "grad_norm": 0.7105882407110944, "learning_rate": 7.368421052631579e-06, "loss": 1.5274, "step": 22 }, { "epoch": 0.0368, "grad_norm": 0.6879803795744842, "learning_rate": 7.719298245614036e-06, "loss": 1.5775, "step": 23 }, { "epoch": 0.0384, "grad_norm": 0.6570506808253213, "learning_rate": 8.070175438596492e-06, "loss": 1.5383, "step": 24 }, { "epoch": 0.04, "grad_norm": 0.6790633921201122, "learning_rate": 8.421052631578948e-06, "loss": 1.4843, "step": 25 }, { "epoch": 0.0416, "grad_norm": 0.7507734448320031, "learning_rate": 8.771929824561405e-06, "loss": 1.4724, "step": 26 }, { "epoch": 0.0432, "grad_norm": 0.6260435863513739, "learning_rate": 9.12280701754386e-06, "loss": 1.5053, "step": 27 }, { "epoch": 0.0448, "grad_norm": 0.6344279227660959, "learning_rate": 9.473684210526315e-06, "loss": 1.4264, "step": 28 }, { "epoch": 0.0464, "grad_norm": 0.6373882465568406, "learning_rate": 9.824561403508772e-06, "loss": 1.4379, "step": 29 }, { "epoch": 0.048, "grad_norm": 0.7332703557822935, "learning_rate": 1.017543859649123e-05, "loss": 1.3561, "step": 30 }, { "epoch": 0.0496, "grad_norm": 0.6037687269257381, "learning_rate": 1.0526315789473684e-05, "loss": 1.3818, "step": 31 }, { "epoch": 0.0512, "grad_norm": 0.6152316972590014, "learning_rate": 1.0877192982456142e-05, "loss": 1.3158, "step": 32 }, { "epoch": 0.0528, "grad_norm": 0.6574478270840337, "learning_rate": 1.1228070175438597e-05, "loss": 1.3933, "step": 33 }, { "epoch": 0.0544, "grad_norm": 0.5990701570969205, "learning_rate": 1.1578947368421053e-05, "loss": 1.3143, "step": 34 }, { "epoch": 0.056, "grad_norm": 0.6450886782633305, "learning_rate": 1.192982456140351e-05, "loss": 1.3073, "step": 35 }, { "epoch": 0.0576, "grad_norm": 0.5773418755615032, "learning_rate": 1.2280701754385966e-05, "loss": 1.2652, "step": 36 }, { "epoch": 0.0592, "grad_norm": 0.6757255156889946, "learning_rate": 1.263157894736842e-05, "loss": 1.2634, "step": 37 }, { "epoch": 0.0608, "grad_norm": 0.6363505050303085, "learning_rate": 1.2982456140350879e-05, "loss": 1.2551, "step": 38 }, { "epoch": 0.0624, "grad_norm": 0.5900310001990254, "learning_rate": 1.3333333333333333e-05, "loss": 1.2004, "step": 39 }, { "epoch": 0.064, "grad_norm": 0.5947705278540787, "learning_rate": 1.3684210526315791e-05, "loss": 1.1485, "step": 40 }, { "epoch": 0.0656, "grad_norm": 0.6511869039207653, "learning_rate": 1.4035087719298246e-05, "loss": 1.1496, "step": 41 }, { "epoch": 0.0672, "grad_norm": 0.5968807693876552, "learning_rate": 1.4385964912280704e-05, "loss": 1.1163, "step": 42 }, { "epoch": 0.0688, "grad_norm": 0.6768227590842741, "learning_rate": 1.4736842105263159e-05, "loss": 1.1251, "step": 43 }, { "epoch": 0.0704, "grad_norm": 1.331031279479116, "learning_rate": 1.5087719298245615e-05, "loss": 1.072, "step": 44 }, { "epoch": 0.072, "grad_norm": 0.6420589576746323, "learning_rate": 1.543859649122807e-05, "loss": 1.0612, "step": 45 }, { "epoch": 0.0736, "grad_norm": 0.637279753966264, "learning_rate": 1.578947368421053e-05, "loss": 0.9885, "step": 46 }, { "epoch": 0.0752, "grad_norm": 0.6285715183385797, "learning_rate": 1.6140350877192984e-05, "loss": 0.9694, "step": 47 }, { "epoch": 0.0768, "grad_norm": 0.6861440187647094, "learning_rate": 1.649122807017544e-05, "loss": 0.9652, "step": 48 }, { "epoch": 0.0784, "grad_norm": 0.7538552449135, "learning_rate": 1.6842105263157896e-05, "loss": 0.9091, "step": 49 }, { "epoch": 0.08, "grad_norm": 0.7139497393950953, "learning_rate": 1.719298245614035e-05, "loss": 0.8914, "step": 50 }, { "epoch": 0.0816, "grad_norm": 0.6987917832417296, "learning_rate": 1.754385964912281e-05, "loss": 0.8326, "step": 51 }, { "epoch": 0.0832, "grad_norm": 0.6871197658295636, "learning_rate": 1.7894736842105264e-05, "loss": 0.7873, "step": 52 }, { "epoch": 0.0848, "grad_norm": 0.7292306074949482, "learning_rate": 1.824561403508772e-05, "loss": 0.7752, "step": 53 }, { "epoch": 0.0864, "grad_norm": 0.7750209058554545, "learning_rate": 1.8596491228070176e-05, "loss": 0.7384, "step": 54 }, { "epoch": 0.088, "grad_norm": 0.8563870585217198, "learning_rate": 1.894736842105263e-05, "loss": 0.6838, "step": 55 }, { "epoch": 0.0896, "grad_norm": 0.8700867511623959, "learning_rate": 1.929824561403509e-05, "loss": 0.6253, "step": 56 }, { "epoch": 0.0912, "grad_norm": 0.8715974869514921, "learning_rate": 1.9649122807017544e-05, "loss": 0.577, "step": 57 }, { "epoch": 0.0928, "grad_norm": 0.8853465018763553, "learning_rate": 2e-05, "loss": 0.5009, "step": 58 }, { "epoch": 0.0944, "grad_norm": 1.0984224579924668, "learning_rate": 1.9999985069241058e-05, "loss": 0.5193, "step": 59 }, { "epoch": 0.096, "grad_norm": 1.4561780667877906, "learning_rate": 1.9999940277008807e-05, "loss": 0.4672, "step": 60 }, { "epoch": 0.0976, "grad_norm": 1.4370299342188482, "learning_rate": 1.9999865623437014e-05, "loss": 0.404, "step": 61 }, { "epoch": 0.0992, "grad_norm": 1.0143477041895286, "learning_rate": 1.99997611087486e-05, "loss": 0.3898, "step": 62 }, { "epoch": 0.1008, "grad_norm": 0.8105824534934927, "learning_rate": 1.9999626733255662e-05, "loss": 0.3629, "step": 63 }, { "epoch": 0.1024, "grad_norm": 0.7390096173444612, "learning_rate": 1.9999462497359468e-05, "loss": 0.3122, "step": 64 }, { "epoch": 0.104, "grad_norm": 0.7115518497609002, "learning_rate": 1.9999268401550445e-05, "loss": 0.3047, "step": 65 }, { "epoch": 0.1056, "grad_norm": 0.7207830753076017, "learning_rate": 1.9999044446408203e-05, "loss": 0.2692, "step": 66 }, { "epoch": 0.1072, "grad_norm": 0.7856619231156039, "learning_rate": 1.9998790632601496e-05, "loss": 0.2416, "step": 67 }, { "epoch": 0.1088, "grad_norm": 0.6475630242734959, "learning_rate": 1.9998506960888258e-05, "loss": 0.2092, "step": 68 }, { "epoch": 0.1104, "grad_norm": 0.5711599547520134, "learning_rate": 1.999819343211557e-05, "loss": 0.2103, "step": 69 }, { "epoch": 0.112, "grad_norm": 0.7075381593815743, "learning_rate": 1.999785004721968e-05, "loss": 0.2024, "step": 70 }, { "epoch": 0.1136, "grad_norm": 0.7278916815227956, "learning_rate": 1.9997476807225987e-05, "loss": 0.1923, "step": 71 }, { "epoch": 0.1152, "grad_norm": 0.7400146380176278, "learning_rate": 1.999707371324904e-05, "loss": 0.1719, "step": 72 }, { "epoch": 0.1168, "grad_norm": 0.731122455083543, "learning_rate": 1.9996640766492542e-05, "loss": 0.1562, "step": 73 }, { "epoch": 0.1184, "grad_norm": 0.6656914514197745, "learning_rate": 1.9996177968249336e-05, "loss": 0.1392, "step": 74 }, { "epoch": 0.12, "grad_norm": 0.6552350171809493, "learning_rate": 1.999568531990141e-05, "loss": 0.147, "step": 75 }, { "epoch": 0.1216, "grad_norm": 0.5189005591063778, "learning_rate": 1.999516282291988e-05, "loss": 0.1282, "step": 76 }, { "epoch": 0.1232, "grad_norm": 0.4733125299687125, "learning_rate": 1.9994610478865012e-05, "loss": 0.1088, "step": 77 }, { "epoch": 0.1248, "grad_norm": 0.5579587554542267, "learning_rate": 1.999402828938618e-05, "loss": 0.1233, "step": 78 }, { "epoch": 0.1264, "grad_norm": 0.5092352653843509, "learning_rate": 1.9993416256221894e-05, "loss": 0.0934, "step": 79 }, { "epoch": 0.128, "grad_norm": 0.519808016741284, "learning_rate": 1.999277438119978e-05, "loss": 0.0914, "step": 80 }, { "epoch": 0.1296, "grad_norm": 0.6408642013945355, "learning_rate": 1.9992102666236567e-05, "loss": 0.1004, "step": 81 }, { "epoch": 0.1312, "grad_norm": 0.6438889392476188, "learning_rate": 1.9991401113338103e-05, "loss": 0.0912, "step": 82 }, { "epoch": 0.1328, "grad_norm": 0.5376033248254236, "learning_rate": 1.9990669724599336e-05, "loss": 0.0876, "step": 83 }, { "epoch": 0.1344, "grad_norm": 0.470270008076486, "learning_rate": 1.9989908502204295e-05, "loss": 0.0856, "step": 84 }, { "epoch": 0.136, "grad_norm": 0.5326255229735967, "learning_rate": 1.998911744842611e-05, "loss": 0.0812, "step": 85 }, { "epoch": 0.1376, "grad_norm": 0.9375418947984391, "learning_rate": 1.9988296565626988e-05, "loss": 0.0864, "step": 86 }, { "epoch": 0.1392, "grad_norm": 0.9653364692897405, "learning_rate": 1.9987445856258208e-05, "loss": 0.0775, "step": 87 }, { "epoch": 0.1408, "grad_norm": 0.510837102874891, "learning_rate": 1.9986565322860117e-05, "loss": 0.0758, "step": 88 }, { "epoch": 0.1424, "grad_norm": 0.46503877822117407, "learning_rate": 1.9985654968062122e-05, "loss": 0.0754, "step": 89 }, { "epoch": 0.144, "grad_norm": 0.4771621635835423, "learning_rate": 1.9984714794582682e-05, "loss": 0.0739, "step": 90 }, { "epoch": 0.1456, "grad_norm": 0.3851199281048143, "learning_rate": 1.9983744805229296e-05, "loss": 0.0741, "step": 91 }, { "epoch": 0.1472, "grad_norm": 0.5702319110120412, "learning_rate": 1.99827450028985e-05, "loss": 0.076, "step": 92 }, { "epoch": 0.1488, "grad_norm": 0.5111184671166494, "learning_rate": 1.998171539057586e-05, "loss": 0.0741, "step": 93 }, { "epoch": 0.1504, "grad_norm": 0.41546583111467167, "learning_rate": 1.9980655971335944e-05, "loss": 0.0609, "step": 94 }, { "epoch": 0.152, "grad_norm": 0.49956020283473984, "learning_rate": 1.9979566748342348e-05, "loss": 0.0638, "step": 95 }, { "epoch": 0.1536, "grad_norm": 0.3594293136690318, "learning_rate": 1.9978447724847655e-05, "loss": 0.0659, "step": 96 }, { "epoch": 0.1552, "grad_norm": 0.31456158706187765, "learning_rate": 1.9977298904193438e-05, "loss": 0.0688, "step": 97 }, { "epoch": 0.1568, "grad_norm": 0.31372590519120896, "learning_rate": 1.9976120289810247e-05, "loss": 0.0612, "step": 98 }, { "epoch": 0.1584, "grad_norm": 0.35164770270724616, "learning_rate": 1.997491188521761e-05, "loss": 0.0623, "step": 99 }, { "epoch": 0.16, "grad_norm": 0.3678653054533639, "learning_rate": 1.9973673694024002e-05, "loss": 0.0591, "step": 100 }, { "epoch": 0.1616, "grad_norm": 0.2830216452554222, "learning_rate": 1.997240571992685e-05, "loss": 0.0554, "step": 101 }, { "epoch": 0.1632, "grad_norm": 0.31115685625855377, "learning_rate": 1.9971107966712518e-05, "loss": 0.0581, "step": 102 }, { "epoch": 0.1648, "grad_norm": 0.6027402944866155, "learning_rate": 1.9969780438256295e-05, "loss": 0.0561, "step": 103 }, { "epoch": 0.1664, "grad_norm": 0.3062620836300149, "learning_rate": 1.9968423138522382e-05, "loss": 0.0561, "step": 104 }, { "epoch": 0.168, "grad_norm": 0.30374482823732385, "learning_rate": 1.9967036071563878e-05, "loss": 0.054, "step": 105 }, { "epoch": 0.1696, "grad_norm": 0.3215386574382714, "learning_rate": 1.996561924152278e-05, "loss": 0.055, "step": 106 }, { "epoch": 0.1712, "grad_norm": 0.3179166891991341, "learning_rate": 1.996417265262996e-05, "loss": 0.0526, "step": 107 }, { "epoch": 0.1728, "grad_norm": 0.2819886680887226, "learning_rate": 1.9962696309205146e-05, "loss": 0.0509, "step": 108 }, { "epoch": 0.1744, "grad_norm": 0.3863087049989975, "learning_rate": 1.996119021565693e-05, "loss": 0.0564, "step": 109 }, { "epoch": 0.176, "grad_norm": 0.38183766944400965, "learning_rate": 1.995965437648273e-05, "loss": 0.0541, "step": 110 }, { "epoch": 0.1776, "grad_norm": 0.3150541173369872, "learning_rate": 1.9958088796268794e-05, "loss": 0.054, "step": 111 }, { "epoch": 0.1792, "grad_norm": 0.3142985896656338, "learning_rate": 1.995649347969019e-05, "loss": 0.0493, "step": 112 }, { "epoch": 0.1808, "grad_norm": 0.2863614451714051, "learning_rate": 1.9954868431510764e-05, "loss": 0.0465, "step": 113 }, { "epoch": 0.1824, "grad_norm": 0.33168986501926395, "learning_rate": 1.995321365658317e-05, "loss": 0.0569, "step": 114 }, { "epoch": 0.184, "grad_norm": 0.2936366801022514, "learning_rate": 1.9951529159848805e-05, "loss": 0.0504, "step": 115 }, { "epoch": 0.1856, "grad_norm": 0.35989897726851766, "learning_rate": 1.994981494633784e-05, "loss": 0.0474, "step": 116 }, { "epoch": 0.1872, "grad_norm": 0.3118496134874041, "learning_rate": 1.9948071021169176e-05, "loss": 0.0471, "step": 117 }, { "epoch": 0.1888, "grad_norm": 0.3301667250792525, "learning_rate": 1.9946297389550433e-05, "loss": 0.0506, "step": 118 }, { "epoch": 0.1904, "grad_norm": 0.29854367027467466, "learning_rate": 1.9944494056777945e-05, "loss": 0.0491, "step": 119 }, { "epoch": 0.192, "grad_norm": 0.2639233963987313, "learning_rate": 1.9942661028236746e-05, "loss": 0.0478, "step": 120 }, { "epoch": 0.1936, "grad_norm": 0.31310526044955456, "learning_rate": 1.9940798309400527e-05, "loss": 0.0454, "step": 121 }, { "epoch": 0.1952, "grad_norm": 0.30036459286759426, "learning_rate": 1.9938905905831657e-05, "loss": 0.0512, "step": 122 }, { "epoch": 0.1968, "grad_norm": 0.274927744652201, "learning_rate": 1.9936983823181132e-05, "loss": 0.049, "step": 123 }, { "epoch": 0.1984, "grad_norm": 0.3077251417327128, "learning_rate": 1.993503206718859e-05, "loss": 0.0448, "step": 124 }, { "epoch": 0.2, "grad_norm": 0.2984318065517011, "learning_rate": 1.993305064368227e-05, "loss": 0.0447, "step": 125 }, { "epoch": 0.2016, "grad_norm": 0.2562044510094792, "learning_rate": 1.9931039558578997e-05, "loss": 0.0422, "step": 126 }, { "epoch": 0.2032, "grad_norm": 0.22049955284817202, "learning_rate": 1.9928998817884185e-05, "loss": 0.0462, "step": 127 }, { "epoch": 0.2048, "grad_norm": 0.2402367528446714, "learning_rate": 1.9926928427691788e-05, "loss": 0.0416, "step": 128 }, { "epoch": 0.2064, "grad_norm": 0.2530328053587009, "learning_rate": 1.9924828394184308e-05, "loss": 0.0444, "step": 129 }, { "epoch": 0.208, "grad_norm": 0.23402020878255644, "learning_rate": 1.992269872363277e-05, "loss": 0.0454, "step": 130 }, { "epoch": 0.2096, "grad_norm": 0.27231943284727395, "learning_rate": 1.992053942239668e-05, "loss": 0.0432, "step": 131 }, { "epoch": 0.2112, "grad_norm": 0.30162603611918526, "learning_rate": 1.991835049692405e-05, "loss": 0.0461, "step": 132 }, { "epoch": 0.2128, "grad_norm": 0.5875657609066812, "learning_rate": 1.9916131953751342e-05, "loss": 0.0496, "step": 133 }, { "epoch": 0.2144, "grad_norm": 0.23444266640024533, "learning_rate": 1.991388379950346e-05, "loss": 0.0448, "step": 134 }, { "epoch": 0.216, "grad_norm": 0.24100730008249682, "learning_rate": 1.9911606040893742e-05, "loss": 0.0449, "step": 135 }, { "epoch": 0.2176, "grad_norm": 0.27409831637006027, "learning_rate": 1.9909298684723905e-05, "loss": 0.0429, "step": 136 }, { "epoch": 0.2192, "grad_norm": 0.21807084455312617, "learning_rate": 1.990696173788408e-05, "loss": 0.042, "step": 137 }, { "epoch": 0.2208, "grad_norm": 0.2859994449145259, "learning_rate": 1.9904595207352736e-05, "loss": 0.0431, "step": 138 }, { "epoch": 0.2224, "grad_norm": 0.20183362855681072, "learning_rate": 1.9902199100196697e-05, "loss": 0.0385, "step": 139 }, { "epoch": 0.224, "grad_norm": 0.3453138382705126, "learning_rate": 1.9899773423571102e-05, "loss": 0.0424, "step": 140 }, { "epoch": 0.2256, "grad_norm": 0.5953531319319878, "learning_rate": 1.9897318184719386e-05, "loss": 0.0466, "step": 141 }, { "epoch": 0.2272, "grad_norm": 0.2458303919799798, "learning_rate": 1.9894833390973266e-05, "loss": 0.042, "step": 142 }, { "epoch": 0.2288, "grad_norm": 0.242002979674024, "learning_rate": 1.989231904975272e-05, "loss": 0.045, "step": 143 }, { "epoch": 0.2304, "grad_norm": 0.2312660284492851, "learning_rate": 1.9889775168565942e-05, "loss": 0.0409, "step": 144 }, { "epoch": 0.232, "grad_norm": 0.24766915847032975, "learning_rate": 1.9887201755009358e-05, "loss": 0.0407, "step": 145 }, { "epoch": 0.2336, "grad_norm": 0.3333321837420313, "learning_rate": 1.9884598816767563e-05, "loss": 0.0421, "step": 146 }, { "epoch": 0.2352, "grad_norm": 0.25937507599864257, "learning_rate": 1.988196636161333e-05, "loss": 0.0382, "step": 147 }, { "epoch": 0.2368, "grad_norm": 0.2644808268622511, "learning_rate": 1.987930439740757e-05, "loss": 0.0382, "step": 148 }, { "epoch": 0.2384, "grad_norm": 0.4004716853916845, "learning_rate": 1.987661293209931e-05, "loss": 0.0396, "step": 149 }, { "epoch": 0.24, "grad_norm": 0.28747417262335284, "learning_rate": 1.9873891973725673e-05, "loss": 0.0379, "step": 150 }, { "epoch": 0.2416, "grad_norm": 1.1173736301665167, "learning_rate": 1.9871141530411854e-05, "loss": 0.0459, "step": 151 }, { "epoch": 0.2432, "grad_norm": 0.3332516879243943, "learning_rate": 1.98683616103711e-05, "loss": 0.0393, "step": 152 }, { "epoch": 0.2448, "grad_norm": 0.2922136524879718, "learning_rate": 1.986555222190467e-05, "loss": 0.04, "step": 153 }, { "epoch": 0.2464, "grad_norm": 0.29721043057775826, "learning_rate": 1.986271337340182e-05, "loss": 0.041, "step": 154 }, { "epoch": 0.248, "grad_norm": 0.29914012628358855, "learning_rate": 1.9859845073339788e-05, "loss": 0.0428, "step": 155 }, { "epoch": 0.2496, "grad_norm": 0.4076615698415316, "learning_rate": 1.9856947330283752e-05, "loss": 0.0418, "step": 156 }, { "epoch": 0.2512, "grad_norm": 0.48998089711242176, "learning_rate": 1.9854020152886816e-05, "loss": 0.046, "step": 157 }, { "epoch": 0.2528, "grad_norm": 0.3520972303139779, "learning_rate": 1.985106354988997e-05, "loss": 0.0383, "step": 158 }, { "epoch": 0.2544, "grad_norm": 0.2416225857243467, "learning_rate": 1.9848077530122083e-05, "loss": 0.0406, "step": 159 }, { "epoch": 0.256, "grad_norm": 0.3351339395051045, "learning_rate": 1.984506210249986e-05, "loss": 0.0443, "step": 160 }, { "epoch": 0.2576, "grad_norm": 0.4373154457475244, "learning_rate": 1.984201727602783e-05, "loss": 0.0377, "step": 161 }, { "epoch": 0.2592, "grad_norm": 0.32347937035340885, "learning_rate": 1.9838943059798305e-05, "loss": 0.0404, "step": 162 }, { "epoch": 0.2608, "grad_norm": 0.1931293350449502, "learning_rate": 1.983583946299136e-05, "loss": 0.0361, "step": 163 }, { "epoch": 0.2624, "grad_norm": 0.23801575923761545, "learning_rate": 1.9832706494874812e-05, "loss": 0.0359, "step": 164 }, { "epoch": 0.264, "grad_norm": 0.2732645051219492, "learning_rate": 1.9829544164804172e-05, "loss": 0.0369, "step": 165 }, { "epoch": 0.2656, "grad_norm": 0.27126891539245734, "learning_rate": 1.982635248222264e-05, "loss": 0.0341, "step": 166 }, { "epoch": 0.2672, "grad_norm": 0.21752650221622016, "learning_rate": 1.9823131456661064e-05, "loss": 0.0318, "step": 167 }, { "epoch": 0.2688, "grad_norm": 0.2349754020387302, "learning_rate": 1.9819881097737917e-05, "loss": 0.0323, "step": 168 }, { "epoch": 0.2704, "grad_norm": 0.4203489034506969, "learning_rate": 1.9816601415159266e-05, "loss": 0.0431, "step": 169 }, { "epoch": 0.272, "grad_norm": 0.4203489034506969, "learning_rate": 1.9816601415159266e-05, "loss": 0.0348, "step": 170 }, { "epoch": 0.2736, "grad_norm": 0.44413902157418056, "learning_rate": 1.9813292418718734e-05, "loss": 0.0363, "step": 171 }, { "epoch": 0.2752, "grad_norm": 0.30897582190180134, "learning_rate": 1.980995411829749e-05, "loss": 0.0333, "step": 172 }, { "epoch": 0.2768, "grad_norm": 0.2564689775545095, "learning_rate": 1.9806586523864212e-05, "loss": 0.0341, "step": 173 }, { "epoch": 0.2784, "grad_norm": 0.2359324424976115, "learning_rate": 1.980318964547504e-05, "loss": 0.0371, "step": 174 }, { "epoch": 0.28, "grad_norm": 0.38324646806929435, "learning_rate": 1.9799763493273572e-05, "loss": 0.0337, "step": 175 }, { "epoch": 0.2816, "grad_norm": 0.2989848276810454, "learning_rate": 1.9796308077490817e-05, "loss": 0.0356, "step": 176 }, { "epoch": 0.2832, "grad_norm": 0.26092774592942874, "learning_rate": 1.9792823408445173e-05, "loss": 0.0337, "step": 177 }, { "epoch": 0.2848, "grad_norm": 0.24958450769694013, "learning_rate": 1.978930949654239e-05, "loss": 0.0335, "step": 178 }, { "epoch": 0.2864, "grad_norm": 0.27590569127811576, "learning_rate": 1.978576635227554e-05, "loss": 0.0348, "step": 179 }, { "epoch": 0.288, "grad_norm": 0.3213869696488523, "learning_rate": 1.9782193986224997e-05, "loss": 0.0325, "step": 180 }, { "epoch": 0.2896, "grad_norm": 0.3128042058548295, "learning_rate": 1.9778592409058376e-05, "loss": 0.0375, "step": 181 }, { "epoch": 0.2912, "grad_norm": 1.0075076172124091, "learning_rate": 1.9774961631530543e-05, "loss": 0.0349, "step": 182 }, { "epoch": 0.2928, "grad_norm": 0.2446969181592618, "learning_rate": 1.9771301664483548e-05, "loss": 0.0323, "step": 183 }, { "epoch": 0.2944, "grad_norm": 0.2617986459213487, "learning_rate": 1.976761251884661e-05, "loss": 0.0351, "step": 184 }, { "epoch": 0.296, "grad_norm": 0.45657092656538206, "learning_rate": 1.976389420563607e-05, "loss": 0.0365, "step": 185 }, { "epoch": 0.2976, "grad_norm": 0.24512426907244425, "learning_rate": 1.9760146735955388e-05, "loss": 0.0278, "step": 186 }, { "epoch": 0.2992, "grad_norm": 0.24913183105659079, "learning_rate": 1.975637012099507e-05, "loss": 0.0342, "step": 187 }, { "epoch": 0.3008, "grad_norm": 0.2824975570969569, "learning_rate": 1.9752564372032655e-05, "loss": 0.0298, "step": 188 }, { "epoch": 0.3024, "grad_norm": 0.31008595066638983, "learning_rate": 1.97487295004327e-05, "loss": 0.0318, "step": 189 }, { "epoch": 0.304, "grad_norm": 0.3387799766405644, "learning_rate": 1.974486551764671e-05, "loss": 0.0339, "step": 190 }, { "epoch": 0.3056, "grad_norm": 0.4995885536634404, "learning_rate": 1.9740972435213114e-05, "loss": 0.0359, "step": 191 }, { "epoch": 0.3072, "grad_norm": 0.25349932173180195, "learning_rate": 1.973705026475726e-05, "loss": 0.0312, "step": 192 }, { "epoch": 0.3088, "grad_norm": 0.32648409714588955, "learning_rate": 1.9733099017991342e-05, "loss": 0.0315, "step": 193 }, { "epoch": 0.3104, "grad_norm": 0.3440565315834147, "learning_rate": 1.9729118706714377e-05, "loss": 0.0342, "step": 194 }, { "epoch": 0.312, "grad_norm": 0.2677942023605695, "learning_rate": 1.972510934281218e-05, "loss": 0.0326, "step": 195 }, { "epoch": 0.3136, "grad_norm": 0.386774474399475, "learning_rate": 1.9721070938257326e-05, "loss": 0.0361, "step": 196 }, { "epoch": 0.3152, "grad_norm": 0.2530848402763779, "learning_rate": 1.9717003505109097e-05, "loss": 0.0315, "step": 197 }, { "epoch": 0.3168, "grad_norm": 0.4090054430468855, "learning_rate": 1.971290705551347e-05, "loss": 0.0331, "step": 198 }, { "epoch": 0.3184, "grad_norm": 0.4367545289940036, "learning_rate": 1.9708781601703066e-05, "loss": 0.0316, "step": 199 }, { "epoch": 0.32, "grad_norm": 0.19063531216095272, "learning_rate": 1.970462715599711e-05, "loss": 0.0325, "step": 200 }, { "epoch": 0.3216, "grad_norm": 0.36599672109503767, "learning_rate": 1.9700443730801412e-05, "loss": 0.0372, "step": 201 }, { "epoch": 0.3232, "grad_norm": 0.250894690133702, "learning_rate": 1.9696231338608317e-05, "loss": 0.0319, "step": 202 }, { "epoch": 0.3248, "grad_norm": 0.35665383810945905, "learning_rate": 1.9691989991996663e-05, "loss": 0.0299, "step": 203 }, { "epoch": 0.3264, "grad_norm": 0.26890697010330017, "learning_rate": 1.9687719703631757e-05, "loss": 0.0325, "step": 204 }, { "epoch": 0.328, "grad_norm": 0.25896028779850616, "learning_rate": 1.9683420486265328e-05, "loss": 0.0304, "step": 205 }, { "epoch": 0.3296, "grad_norm": 0.3009648507964905, "learning_rate": 1.967909235273549e-05, "loss": 0.032, "step": 206 }, { "epoch": 0.3312, "grad_norm": 0.20628455614138974, "learning_rate": 1.967473531596671e-05, "loss": 0.0294, "step": 207 }, { "epoch": 0.3328, "grad_norm": 0.3014924152402286, "learning_rate": 1.9670349388969758e-05, "loss": 0.0287, "step": 208 }, { "epoch": 0.3344, "grad_norm": 0.23648053994541607, "learning_rate": 1.966593458484168e-05, "loss": 0.0304, "step": 209 }, { "epoch": 0.336, "grad_norm": 0.3306354577588259, "learning_rate": 1.9661490916765752e-05, "loss": 0.029, "step": 210 }, { "epoch": 0.3376, "grad_norm": 0.3438378127543805, "learning_rate": 1.9657018398011435e-05, "loss": 0.0278, "step": 211 }, { "epoch": 0.3392, "grad_norm": 0.2168801651038601, "learning_rate": 1.9652517041934357e-05, "loss": 0.0258, "step": 212 }, { "epoch": 0.3408, "grad_norm": 0.26209554081420433, "learning_rate": 1.9647986861976246e-05, "loss": 0.028, "step": 213 }, { "epoch": 0.3424, "grad_norm": 0.29361599815797396, "learning_rate": 1.9643427871664912e-05, "loss": 0.0309, "step": 214 }, { "epoch": 0.344, "grad_norm": 0.2362773483998967, "learning_rate": 1.9638840084614182e-05, "loss": 0.0259, "step": 215 }, { "epoch": 0.3456, "grad_norm": 0.35454665238094957, "learning_rate": 1.963422351452389e-05, "loss": 0.0323, "step": 216 }, { "epoch": 0.3472, "grad_norm": 0.21029650807600475, "learning_rate": 1.9629578175179823e-05, "loss": 0.0271, "step": 217 }, { "epoch": 0.3488, "grad_norm": 0.26629914086279055, "learning_rate": 1.9624904080453656e-05, "loss": 0.0284, "step": 218 }, { "epoch": 0.3504, "grad_norm": 0.18941616218847454, "learning_rate": 1.9620201244302952e-05, "loss": 0.0276, "step": 219 }, { "epoch": 0.352, "grad_norm": 0.19426874809023298, "learning_rate": 1.9615469680771097e-05, "loss": 0.0282, "step": 220 }, { "epoch": 0.3536, "grad_norm": 0.2624611759558848, "learning_rate": 1.9610709403987248e-05, "loss": 0.029, "step": 221 }, { "epoch": 0.3552, "grad_norm": 0.325503693832948, "learning_rate": 1.960592042816632e-05, "loss": 0.026, "step": 222 }, { "epoch": 0.3568, "grad_norm": 0.20446996096247685, "learning_rate": 1.9601102767608924e-05, "loss": 0.0271, "step": 223 }, { "epoch": 0.3584, "grad_norm": 0.26119031688067684, "learning_rate": 1.9596256436701324e-05, "loss": 0.0261, "step": 224 }, { "epoch": 0.36, "grad_norm": 0.17955547518671727, "learning_rate": 1.95913814499154e-05, "loss": 0.0276, "step": 225 }, { "epoch": 0.3616, "grad_norm": 0.24063273200380345, "learning_rate": 1.9586477821808597e-05, "loss": 0.0265, "step": 226 }, { "epoch": 0.3632, "grad_norm": 0.2557128126599869, "learning_rate": 1.95815455670239e-05, "loss": 0.0277, "step": 227 }, { "epoch": 0.3648, "grad_norm": 0.16440851052959135, "learning_rate": 1.957658470028977e-05, "loss": 0.0254, "step": 228 }, { "epoch": 0.3664, "grad_norm": 0.37790528555642366, "learning_rate": 1.9571595236420103e-05, "loss": 0.0251, "step": 229 }, { "epoch": 0.368, "grad_norm": 0.17331729000725077, "learning_rate": 1.95665771903142e-05, "loss": 0.025, "step": 230 }, { "epoch": 0.3696, "grad_norm": 0.22255498342727273, "learning_rate": 1.9561530576956703e-05, "loss": 0.026, "step": 231 }, { "epoch": 0.3712, "grad_norm": 0.13851311457314286, "learning_rate": 1.9556455411417575e-05, "loss": 0.0257, "step": 232 }, { "epoch": 0.3728, "grad_norm": 0.2180209443564617, "learning_rate": 1.955135170885202e-05, "loss": 0.0266, "step": 233 }, { "epoch": 0.3744, "grad_norm": 0.18936063079669935, "learning_rate": 1.9546219484500475e-05, "loss": 0.0254, "step": 234 }, { "epoch": 0.376, "grad_norm": 0.3323779097048608, "learning_rate": 1.9541058753688538e-05, "loss": 0.0302, "step": 235 }, { "epoch": 0.3776, "grad_norm": 0.3245826088945582, "learning_rate": 1.9535869531826938e-05, "loss": 0.028, "step": 236 }, { "epoch": 0.3792, "grad_norm": 0.19574917790356397, "learning_rate": 1.9530651834411477e-05, "loss": 0.0253, "step": 237 }, { "epoch": 0.3808, "grad_norm": 0.2537992886761661, "learning_rate": 1.952540567702299e-05, "loss": 0.0292, "step": 238 }, { "epoch": 0.3824, "grad_norm": 0.4225817760415451, "learning_rate": 1.95201310753273e-05, "loss": 0.0309, "step": 239 }, { "epoch": 0.384, "grad_norm": 0.2038343453442293, "learning_rate": 1.951482804507517e-05, "loss": 0.0251, "step": 240 }, { "epoch": 0.3856, "grad_norm": 0.49544587227207304, "learning_rate": 1.9509496602102253e-05, "loss": 0.0239, "step": 241 }, { "epoch": 0.3872, "grad_norm": 0.3482229630924854, "learning_rate": 1.9504136762329046e-05, "loss": 0.0273, "step": 242 }, { "epoch": 0.3888, "grad_norm": 0.42426491159244356, "learning_rate": 1.9498748541760845e-05, "loss": 0.0256, "step": 243 }, { "epoch": 0.3904, "grad_norm": 0.2564993321314424, "learning_rate": 1.949333195648769e-05, "loss": 0.0265, "step": 244 }, { "epoch": 0.392, "grad_norm": 0.2147695548639086, "learning_rate": 1.9487887022684336e-05, "loss": 0.0256, "step": 245 }, { "epoch": 0.3936, "grad_norm": 0.16683905641566563, "learning_rate": 1.9482413756610175e-05, "loss": 0.0237, "step": 246 }, { "epoch": 0.3952, "grad_norm": 0.18124224889405904, "learning_rate": 1.947691217460921e-05, "loss": 0.0255, "step": 247 }, { "epoch": 0.3968, "grad_norm": 0.22738342310219078, "learning_rate": 1.9471382293110004e-05, "loss": 0.0282, "step": 248 }, { "epoch": 0.3984, "grad_norm": 0.15315087071959668, "learning_rate": 1.946582412862562e-05, "loss": 0.0256, "step": 249 }, { "epoch": 0.4, "grad_norm": 0.17131894088568825, "learning_rate": 1.9460237697753577e-05, "loss": 0.0253, "step": 250 }, { "epoch": 0.4016, "grad_norm": 0.2736617070066449, "learning_rate": 1.9454623017175814e-05, "loss": 0.0267, "step": 251 }, { "epoch": 0.4032, "grad_norm": 0.3657647944090099, "learning_rate": 1.9448980103658613e-05, "loss": 0.026, "step": 252 }, { "epoch": 0.4048, "grad_norm": 0.2876716836389149, "learning_rate": 1.9443308974052574e-05, "loss": 0.0251, "step": 253 }, { "epoch": 0.4064, "grad_norm": 0.25445616948155586, "learning_rate": 1.943760964529255e-05, "loss": 0.0271, "step": 254 }, { "epoch": 0.408, "grad_norm": 0.1854945634725827, "learning_rate": 1.9431882134397596e-05, "loss": 0.0237, "step": 255 }, { "epoch": 0.4096, "grad_norm": 0.20873308410010488, "learning_rate": 1.9426126458470936e-05, "loss": 0.0249, "step": 256 }, { "epoch": 0.4112, "grad_norm": 0.23299332351744328, "learning_rate": 1.9420342634699893e-05, "loss": 0.0279, "step": 257 }, { "epoch": 0.4128, "grad_norm": 0.19319678676101346, "learning_rate": 1.9414530680355837e-05, "loss": 0.0261, "step": 258 }, { "epoch": 0.4144, "grad_norm": 0.18150742033137932, "learning_rate": 1.9408690612794146e-05, "loss": 0.0257, "step": 259 }, { "epoch": 0.416, "grad_norm": 0.26277783543834804, "learning_rate": 1.9402822449454154e-05, "loss": 0.0278, "step": 260 }, { "epoch": 0.4176, "grad_norm": 0.24540077426443008, "learning_rate": 1.9396926207859085e-05, "loss": 0.0255, "step": 261 }, { "epoch": 0.4192, "grad_norm": 0.24639773880399232, "learning_rate": 1.939100190561601e-05, "loss": 0.0255, "step": 262 }, { "epoch": 0.4208, "grad_norm": 0.18519198438379944, "learning_rate": 1.9385049560415794e-05, "loss": 0.024, "step": 263 }, { "epoch": 0.4224, "grad_norm": 0.17894539537653406, "learning_rate": 1.9379069190033042e-05, "loss": 0.0249, "step": 264 }, { "epoch": 0.424, "grad_norm": 0.32960028213367215, "learning_rate": 1.9373060812326053e-05, "loss": 0.0264, "step": 265 }, { "epoch": 0.4256, "grad_norm": 0.18565514288989918, "learning_rate": 1.936702444523675e-05, "loss": 0.0258, "step": 266 }, { "epoch": 0.4272, "grad_norm": 0.21328083113489418, "learning_rate": 1.9360960106790645e-05, "loss": 0.0267, "step": 267 }, { "epoch": 0.4288, "grad_norm": 0.7772355439669246, "learning_rate": 1.9354867815096772e-05, "loss": 0.0254, "step": 268 }, { "epoch": 0.4304, "grad_norm": 0.2870524330818712, "learning_rate": 1.9348747588347637e-05, "loss": 0.0237, "step": 269 }, { "epoch": 0.432, "grad_norm": 0.3739445599859368, "learning_rate": 1.9342599444819167e-05, "loss": 0.0273, "step": 270 }, { "epoch": 0.4336, "grad_norm": 0.2511413022176609, "learning_rate": 1.9336423402870655e-05, "loss": 0.025, "step": 271 }, { "epoch": 0.4352, "grad_norm": 0.2720251884441693, "learning_rate": 1.9330219480944693e-05, "loss": 0.0237, "step": 272 }, { "epoch": 0.4368, "grad_norm": 0.19155647163622136, "learning_rate": 1.932398769756714e-05, "loss": 0.0225, "step": 273 }, { "epoch": 0.4384, "grad_norm": 0.22998262921006735, "learning_rate": 1.931772807134704e-05, "loss": 0.0248, "step": 274 }, { "epoch": 0.44, "grad_norm": 0.26222229165931915, "learning_rate": 1.9311440620976597e-05, "loss": 0.0249, "step": 275 }, { "epoch": 0.4416, "grad_norm": 0.2409259694452907, "learning_rate": 1.9305125365231087e-05, "loss": 0.0254, "step": 276 }, { "epoch": 0.4432, "grad_norm": 0.18412201144034487, "learning_rate": 1.9298782322968817e-05, "loss": 0.0242, "step": 277 }, { "epoch": 0.4448, "grad_norm": 0.23927348676565244, "learning_rate": 1.929241151313108e-05, "loss": 0.0243, "step": 278 }, { "epoch": 0.4464, "grad_norm": 0.3135936679260672, "learning_rate": 1.9286012954742078e-05, "loss": 0.0264, "step": 279 }, { "epoch": 0.448, "grad_norm": 0.32007604599511913, "learning_rate": 1.9279586666908886e-05, "loss": 0.0252, "step": 280 }, { "epoch": 0.4496, "grad_norm": 0.3096904351422991, "learning_rate": 1.9273132668821363e-05, "loss": 0.0253, "step": 281 }, { "epoch": 0.4512, "grad_norm": 0.23490286212024764, "learning_rate": 1.9266650979752137e-05, "loss": 0.0257, "step": 282 }, { "epoch": 0.4528, "grad_norm": 0.3190306995921017, "learning_rate": 1.9260141619056507e-05, "loss": 0.0242, "step": 283 }, { "epoch": 0.4544, "grad_norm": 0.1460060583985359, "learning_rate": 1.925360460617242e-05, "loss": 0.0244, "step": 284 }, { "epoch": 0.456, "grad_norm": 0.40457351297947347, "learning_rate": 1.924703996062038e-05, "loss": 0.0273, "step": 285 }, { "epoch": 0.4576, "grad_norm": 0.21777864726658375, "learning_rate": 1.9240447702003422e-05, "loss": 0.0249, "step": 286 }, { "epoch": 0.4592, "grad_norm": 0.13886632445514155, "learning_rate": 1.9233827850007028e-05, "loss": 0.0231, "step": 287 }, { "epoch": 0.4608, "grad_norm": 0.23268263869417316, "learning_rate": 1.9227180424399082e-05, "loss": 0.0253, "step": 288 }, { "epoch": 0.4624, "grad_norm": 0.24274828301682744, "learning_rate": 1.9220505445029803e-05, "loss": 0.0268, "step": 289 }, { "epoch": 0.464, "grad_norm": 0.20450417450718378, "learning_rate": 1.9213802931831697e-05, "loss": 0.0232, "step": 290 }, { "epoch": 0.4656, "grad_norm": 0.16078185374649792, "learning_rate": 1.9207072904819484e-05, "loss": 0.0232, "step": 291 }, { "epoch": 0.4672, "grad_norm": 0.3118234085979711, "learning_rate": 1.9200315384090045e-05, "loss": 0.0268, "step": 292 }, { "epoch": 0.4688, "grad_norm": 0.1657397801331871, "learning_rate": 1.9193530389822364e-05, "loss": 0.0234, "step": 293 }, { "epoch": 0.4704, "grad_norm": 0.24790087220099571, "learning_rate": 1.9186717942277466e-05, "loss": 0.0249, "step": 294 }, { "epoch": 0.472, "grad_norm": 0.18698610147821218, "learning_rate": 1.9179878061798347e-05, "loss": 0.0242, "step": 295 }, { "epoch": 0.4736, "grad_norm": 0.20122110461948367, "learning_rate": 1.9173010768809934e-05, "loss": 0.0253, "step": 296 }, { "epoch": 0.4752, "grad_norm": 0.16444390761754338, "learning_rate": 1.9166116083819002e-05, "loss": 0.0239, "step": 297 }, { "epoch": 0.4768, "grad_norm": 0.2096031816916971, "learning_rate": 1.915919402741413e-05, "loss": 0.0237, "step": 298 }, { "epoch": 0.4784, "grad_norm": 0.2871382876650298, "learning_rate": 1.915224462026563e-05, "loss": 0.0247, "step": 299 }, { "epoch": 0.48, "grad_norm": 0.23396068054992797, "learning_rate": 1.9145267883125483e-05, "loss": 0.0226, "step": 300 }, { "epoch": 0.4816, "grad_norm": 0.17806677822199216, "learning_rate": 1.913826383682729e-05, "loss": 0.0228, "step": 301 }, { "epoch": 0.4832, "grad_norm": 0.22421810228175576, "learning_rate": 1.913123250228619e-05, "loss": 0.0222, "step": 302 }, { "epoch": 0.4848, "grad_norm": 0.2671737148491569, "learning_rate": 1.912417390049882e-05, "loss": 0.0258, "step": 303 }, { "epoch": 0.4864, "grad_norm": 0.22521421563326308, "learning_rate": 1.9117088052543233e-05, "loss": 0.0259, "step": 304 }, { "epoch": 0.488, "grad_norm": 0.2512719570632976, "learning_rate": 1.9109974979578852e-05, "loss": 0.0223, "step": 305 }, { "epoch": 0.4896, "grad_norm": 0.200102823046399, "learning_rate": 1.9102834702846387e-05, "loss": 0.0244, "step": 306 }, { "epoch": 0.4912, "grad_norm": 0.2322391139619468, "learning_rate": 1.909566724366779e-05, "loss": 0.0248, "step": 307 }, { "epoch": 0.4928, "grad_norm": 0.33073866828097187, "learning_rate": 1.9088472623446182e-05, "loss": 0.026, "step": 308 }, { "epoch": 0.4944, "grad_norm": 0.1893349038318749, "learning_rate": 1.9081250863665794e-05, "loss": 0.0235, "step": 309 }, { "epoch": 0.496, "grad_norm": 0.29277990880028976, "learning_rate": 1.9074001985891893e-05, "loss": 0.0265, "step": 310 }, { "epoch": 0.4976, "grad_norm": 0.19477738061872754, "learning_rate": 1.9066726011770725e-05, "loss": 0.0228, "step": 311 }, { "epoch": 0.4992, "grad_norm": 0.22915444168212584, "learning_rate": 1.9059422963029464e-05, "loss": 0.0256, "step": 312 }, { "epoch": 0.5008, "grad_norm": 0.13982238568398425, "learning_rate": 1.905209286147611e-05, "loss": 0.022, "step": 313 }, { "epoch": 0.5024, "grad_norm": 0.30311950923241815, "learning_rate": 1.9044735728999472e-05, "loss": 0.0261, "step": 314 }, { "epoch": 0.504, "grad_norm": 0.2124747140890154, "learning_rate": 1.903735158756905e-05, "loss": 0.0205, "step": 315 }, { "epoch": 0.5056, "grad_norm": 0.31912333300382856, "learning_rate": 1.902994045923502e-05, "loss": 0.0236, "step": 316 }, { "epoch": 0.5072, "grad_norm": 0.205258171466095, "learning_rate": 1.9022502366128136e-05, "loss": 0.0249, "step": 317 }, { "epoch": 0.5088, "grad_norm": 0.25082631668722966, "learning_rate": 1.901503733045967e-05, "loss": 0.0249, "step": 318 }, { "epoch": 0.5104, "grad_norm": 0.293085694954146, "learning_rate": 1.9007545374521354e-05, "loss": 0.0238, "step": 319 }, { "epoch": 0.512, "grad_norm": 0.2181476235450802, "learning_rate": 1.90000265206853e-05, "loss": 0.0233, "step": 320 }, { "epoch": 0.5136, "grad_norm": 0.16868250926306455, "learning_rate": 1.8992480791403957e-05, "loss": 0.0211, "step": 321 }, { "epoch": 0.5152, "grad_norm": 0.1452284105599233, "learning_rate": 1.898490820921001e-05, "loss": 0.0235, "step": 322 }, { "epoch": 0.5168, "grad_norm": 0.16961367231195754, "learning_rate": 1.897730879671634e-05, "loss": 0.0217, "step": 323 }, { "epoch": 0.5184, "grad_norm": 0.2427685545953266, "learning_rate": 1.8969682576615947e-05, "loss": 0.0238, "step": 324 }, { "epoch": 0.52, "grad_norm": 0.2560872454036068, "learning_rate": 1.8962029571681887e-05, "loss": 0.024, "step": 325 }, { "epoch": 0.5216, "grad_norm": 0.42672805528036134, "learning_rate": 1.8954349804767185e-05, "loss": 0.0227, "step": 326 }, { "epoch": 0.5232, "grad_norm": 0.25995213776532555, "learning_rate": 1.8946643298804794e-05, "loss": 0.0221, "step": 327 }, { "epoch": 0.5248, "grad_norm": 0.2995897648125354, "learning_rate": 1.8938910076807514e-05, "loss": 0.0262, "step": 328 }, { "epoch": 0.5264, "grad_norm": 0.2088551905545114, "learning_rate": 1.8931150161867917e-05, "loss": 0.0206, "step": 329 }, { "epoch": 0.528, "grad_norm": 0.43085085565275305, "learning_rate": 1.892336357715829e-05, "loss": 0.0252, "step": 330 }, { "epoch": 0.5296, "grad_norm": 0.36257688507162905, "learning_rate": 1.891555034593055e-05, "loss": 0.0235, "step": 331 }, { "epoch": 0.5312, "grad_norm": 0.20771270465771838, "learning_rate": 1.8907710491516197e-05, "loss": 0.022, "step": 332 }, { "epoch": 0.5328, "grad_norm": 0.17509810477386462, "learning_rate": 1.8899844037326227e-05, "loss": 0.0211, "step": 333 }, { "epoch": 0.5344, "grad_norm": 0.12538653447060122, "learning_rate": 1.889195100685106e-05, "loss": 0.0193, "step": 334 }, { "epoch": 0.536, "grad_norm": 0.1980666975815131, "learning_rate": 1.8884031423660492e-05, "loss": 0.0209, "step": 335 }, { "epoch": 0.5376, "grad_norm": 0.26669488594459173, "learning_rate": 1.8876085311403592e-05, "loss": 0.0223, "step": 336 }, { "epoch": 0.5392, "grad_norm": 0.21213789151153697, "learning_rate": 1.8868112693808664e-05, "loss": 0.0227, "step": 337 }, { "epoch": 0.5408, "grad_norm": 0.3009564209037192, "learning_rate": 1.8860113594683148e-05, "loss": 0.0243, "step": 338 }, { "epoch": 0.5424, "grad_norm": 0.16434507191527442, "learning_rate": 1.8852088037913577e-05, "loss": 0.0193, "step": 339 }, { "epoch": 0.544, "grad_norm": 0.25856557410366665, "learning_rate": 1.884403604746547e-05, "loss": 0.0235, "step": 340 }, { "epoch": 0.5456, "grad_norm": 0.2753428966537922, "learning_rate": 1.8835957647383304e-05, "loss": 0.0243, "step": 341 }, { "epoch": 0.5472, "grad_norm": 0.23000290993852823, "learning_rate": 1.8827852861790398e-05, "loss": 0.0225, "step": 342 }, { "epoch": 0.5488, "grad_norm": 0.18518777898446165, "learning_rate": 1.8819721714888878e-05, "loss": 0.0226, "step": 343 }, { "epoch": 0.5504, "grad_norm": 0.22708845342130995, "learning_rate": 1.8811564230959585e-05, "loss": 0.0222, "step": 344 }, { "epoch": 0.552, "grad_norm": 0.22327313120164785, "learning_rate": 1.8803380434362e-05, "loss": 0.0223, "step": 345 }, { "epoch": 0.5536, "grad_norm": 0.23016725124648443, "learning_rate": 1.879517034953418e-05, "loss": 0.0219, "step": 346 }, { "epoch": 0.5552, "grad_norm": 0.12311146499918242, "learning_rate": 1.878693400099269e-05, "loss": 0.0188, "step": 347 }, { "epoch": 0.5568, "grad_norm": 0.25520263166702184, "learning_rate": 1.8778671413332513e-05, "loss": 0.0217, "step": 348 }, { "epoch": 0.5584, "grad_norm": 0.17375942859806887, "learning_rate": 1.877038261122699e-05, "loss": 0.0227, "step": 349 }, { "epoch": 0.56, "grad_norm": 0.2868786417041136, "learning_rate": 1.8762067619427745e-05, "loss": 0.0232, "step": 350 }, { "epoch": 0.5616, "grad_norm": 0.22263963887687188, "learning_rate": 1.87537264627646e-05, "loss": 0.0205, "step": 351 }, { "epoch": 0.5632, "grad_norm": 0.3216611897383196, "learning_rate": 1.8745359166145526e-05, "loss": 0.0222, "step": 352 }, { "epoch": 0.5648, "grad_norm": 0.348533541354947, "learning_rate": 1.8736965754556527e-05, "loss": 0.024, "step": 353 }, { "epoch": 0.5664, "grad_norm": 0.14815375140096462, "learning_rate": 1.8728546253061614e-05, "loss": 0.0189, "step": 354 }, { "epoch": 0.568, "grad_norm": 0.276884259305941, "learning_rate": 1.8720100686802693e-05, "loss": 0.0196, "step": 355 }, { "epoch": 0.5696, "grad_norm": 0.24410384498418092, "learning_rate": 1.8711629080999506e-05, "loss": 0.0222, "step": 356 }, { "epoch": 0.5712, "grad_norm": 0.33067569013923137, "learning_rate": 1.8703131460949555e-05, "loss": 0.022, "step": 357 }, { "epoch": 0.5728, "grad_norm": 0.22661205766856518, "learning_rate": 1.869460785202802e-05, "loss": 0.0185, "step": 358 }, { "epoch": 0.5744, "grad_norm": 0.503638421558941, "learning_rate": 1.86860582796877e-05, "loss": 0.0263, "step": 359 }, { "epoch": 0.576, "grad_norm": 0.30972726099984216, "learning_rate": 1.8677482769458905e-05, "loss": 0.022, "step": 360 }, { "epoch": 0.5776, "grad_norm": 0.3154407238669633, "learning_rate": 1.866888134694942e-05, "loss": 0.0214, "step": 361 }, { "epoch": 0.5792, "grad_norm": 0.3588250103478784, "learning_rate": 1.866025403784439e-05, "loss": 0.024, "step": 362 }, { "epoch": 0.5808, "grad_norm": 0.2103162608999855, "learning_rate": 1.865160086790627e-05, "loss": 0.0201, "step": 363 }, { "epoch": 0.5824, "grad_norm": 0.24144473483802972, "learning_rate": 1.8642921862974742e-05, "loss": 0.0197, "step": 364 }, { "epoch": 0.584, "grad_norm": 0.8072550367254895, "learning_rate": 1.8634217048966638e-05, "loss": 0.0248, "step": 365 }, { "epoch": 0.5856, "grad_norm": 0.27443717495137954, "learning_rate": 1.8625486451875843e-05, "loss": 0.0229, "step": 366 }, { "epoch": 0.5872, "grad_norm": 0.3238915337976546, "learning_rate": 1.861673009777325e-05, "loss": 0.021, "step": 367 }, { "epoch": 0.5888, "grad_norm": 0.2769206307667872, "learning_rate": 1.8607948012806664e-05, "loss": 0.0254, "step": 368 }, { "epoch": 0.5904, "grad_norm": 0.35901812150739465, "learning_rate": 1.8599140223200716e-05, "loss": 0.0235, "step": 369 }, { "epoch": 0.592, "grad_norm": 0.4209880875878923, "learning_rate": 1.859030675525681e-05, "loss": 0.0228, "step": 370 }, { "epoch": 0.5936, "grad_norm": 0.2743855954190781, "learning_rate": 1.858144763535302e-05, "loss": 0.0213, "step": 371 }, { "epoch": 0.5952, "grad_norm": 0.4891783966635455, "learning_rate": 1.857256288994402e-05, "loss": 0.0239, "step": 372 }, { "epoch": 0.5968, "grad_norm": 0.3334065452478207, "learning_rate": 1.8563652545561014e-05, "loss": 0.0226, "step": 373 }, { "epoch": 0.5984, "grad_norm": 0.22921939143541975, "learning_rate": 1.855471662881164e-05, "loss": 0.0211, "step": 374 }, { "epoch": 0.6, "grad_norm": 0.2083502675972401, "learning_rate": 1.8545755166379898e-05, "loss": 0.0188, "step": 375 }, { "epoch": 0.6016, "grad_norm": 0.1738970990934248, "learning_rate": 1.8536768185026085e-05, "loss": 0.0208, "step": 376 }, { "epoch": 0.6032, "grad_norm": 0.2288134807094265, "learning_rate": 1.852775571158668e-05, "loss": 0.0226, "step": 377 }, { "epoch": 0.6048, "grad_norm": 0.21350233078025635, "learning_rate": 1.85187177729743e-05, "loss": 0.0231, "step": 378 }, { "epoch": 0.6064, "grad_norm": 0.24932344963951814, "learning_rate": 1.850965439617761e-05, "loss": 0.0232, "step": 379 }, { "epoch": 0.608, "grad_norm": 0.2973275026012781, "learning_rate": 1.8500565608261215e-05, "loss": 0.022, "step": 380 }, { "epoch": 0.6096, "grad_norm": 0.31199357328500743, "learning_rate": 1.8491451436365628e-05, "loss": 0.022, "step": 381 }, { "epoch": 0.6112, "grad_norm": 0.19071566443042118, "learning_rate": 1.848231190770714e-05, "loss": 0.02, "step": 382 }, { "epoch": 0.6128, "grad_norm": 0.13039451500974714, "learning_rate": 1.8473147049577777e-05, "loss": 0.0194, "step": 383 }, { "epoch": 0.6144, "grad_norm": 0.1709393072560162, "learning_rate": 1.8463956889345195e-05, "loss": 0.0209, "step": 384 }, { "epoch": 0.616, "grad_norm": 0.4326244410727375, "learning_rate": 1.8454741454452604e-05, "loss": 0.0243, "step": 385 }, { "epoch": 0.6176, "grad_norm": 0.29733583141209047, "learning_rate": 1.8445500772418697e-05, "loss": 0.0229, "step": 386 }, { "epoch": 0.6192, "grad_norm": 0.18634249572348707, "learning_rate": 1.843623487083755e-05, "loss": 0.0198, "step": 387 }, { "epoch": 0.6208, "grad_norm": 0.17262948208342496, "learning_rate": 1.842694377737855e-05, "loss": 0.0169, "step": 388 }, { "epoch": 0.6224, "grad_norm": 0.21827100172380481, "learning_rate": 1.8417627519786317e-05, "loss": 0.0212, "step": 389 }, { "epoch": 0.624, "grad_norm": 0.25942130447905226, "learning_rate": 1.8408286125880605e-05, "loss": 0.0224, "step": 390 }, { "epoch": 0.6256, "grad_norm": 0.40627656684977626, "learning_rate": 1.839891962355624e-05, "loss": 0.0196, "step": 391 }, { "epoch": 0.6272, "grad_norm": 0.16973658798713487, "learning_rate": 1.8389528040783014e-05, "loss": 0.0207, "step": 392 }, { "epoch": 0.6288, "grad_norm": 0.21438004340509942, "learning_rate": 1.838011140560562e-05, "loss": 0.0198, "step": 393 }, { "epoch": 0.6304, "grad_norm": 0.31232945857171035, "learning_rate": 1.8370669746143566e-05, "loss": 0.0203, "step": 394 }, { "epoch": 0.632, "grad_norm": 0.2146851485289993, "learning_rate": 1.836120309059107e-05, "loss": 0.0195, "step": 395 }, { "epoch": 0.6336, "grad_norm": 0.24251036655589733, "learning_rate": 1.835171146721701e-05, "loss": 0.0172, "step": 396 }, { "epoch": 0.6352, "grad_norm": 0.3042791151359387, "learning_rate": 1.8342194904364815e-05, "loss": 0.0183, "step": 397 }, { "epoch": 0.6368, "grad_norm": 0.26113605341230206, "learning_rate": 1.8332653430452375e-05, "loss": 0.0202, "step": 398 }, { "epoch": 0.6384, "grad_norm": 0.34152736814393164, "learning_rate": 1.8323087073971996e-05, "loss": 0.0221, "step": 399 }, { "epoch": 0.64, "grad_norm": 0.22308120700113318, "learning_rate": 1.831349586349026e-05, "loss": 0.0183, "step": 400 }, { "epoch": 0.6416, "grad_norm": 0.3146848923252582, "learning_rate": 1.8303879827647977e-05, "loss": 0.019, "step": 401 }, { "epoch": 0.6432, "grad_norm": 0.3138495171429804, "learning_rate": 1.8294238995160093e-05, "loss": 0.0179, "step": 402 }, { "epoch": 0.6448, "grad_norm": 0.34576002225133945, "learning_rate": 1.8284573394815596e-05, "loss": 0.0146, "step": 403 }, { "epoch": 0.6464, "grad_norm": 0.32694901216997213, "learning_rate": 1.8274883055477436e-05, "loss": 0.0212, "step": 404 }, { "epoch": 0.648, "grad_norm": 0.35767715776140246, "learning_rate": 1.826516800608244e-05, "loss": 0.0222, "step": 405 }, { "epoch": 0.6496, "grad_norm": 0.2689394197658266, "learning_rate": 1.8255428275641212e-05, "loss": 0.017, "step": 406 }, { "epoch": 0.6512, "grad_norm": 0.3106575036482928, "learning_rate": 1.8245663893238075e-05, "loss": 0.0253, "step": 407 }, { "epoch": 0.6528, "grad_norm": 0.27604524063461516, "learning_rate": 1.823587488803095e-05, "loss": 0.0152, "step": 408 }, { "epoch": 0.6544, "grad_norm": 0.47681734881426474, "learning_rate": 1.8226061289251297e-05, "loss": 0.0205, "step": 409 }, { "epoch": 0.656, "grad_norm": 0.21430013633514908, "learning_rate": 1.821622312620401e-05, "loss": 0.0182, "step": 410 }, { "epoch": 0.6576, "grad_norm": 0.30350010790826815, "learning_rate": 1.8206360428267332e-05, "loss": 0.0191, "step": 411 }, { "epoch": 0.6592, "grad_norm": 0.2860070509652396, "learning_rate": 1.8196473224892784e-05, "loss": 0.019, "step": 412 }, { "epoch": 0.6608, "grad_norm": 0.28196191886430455, "learning_rate": 1.8186561545605055e-05, "loss": 0.0206, "step": 413 }, { "epoch": 0.6624, "grad_norm": 0.3079688364968603, "learning_rate": 1.817662542000192e-05, "loss": 0.0184, "step": 414 }, { "epoch": 0.664, "grad_norm": 0.3122700677645408, "learning_rate": 1.816666487775416e-05, "loss": 0.0178, "step": 415 }, { "epoch": 0.6656, "grad_norm": 0.2487569400300341, "learning_rate": 1.815667994860547e-05, "loss": 0.0194, "step": 416 }, { "epoch": 0.6672, "grad_norm": 0.2155913058486151, "learning_rate": 1.8146670662372353e-05, "loss": 0.0186, "step": 417 }, { "epoch": 0.6688, "grad_norm": 0.3581498362161358, "learning_rate": 1.813663704894407e-05, "loss": 0.0194, "step": 418 }, { "epoch": 0.6704, "grad_norm": 0.25611597657771756, "learning_rate": 1.8126579138282502e-05, "loss": 0.0173, "step": 419 }, { "epoch": 0.672, "grad_norm": 0.33612713879076267, "learning_rate": 1.8116496960422108e-05, "loss": 0.0171, "step": 420 }, { "epoch": 0.6736, "grad_norm": 0.26310467780127467, "learning_rate": 1.8106390545469797e-05, "loss": 0.0195, "step": 421 }, { "epoch": 0.6752, "grad_norm": 0.28730353462201863, "learning_rate": 1.809625992360485e-05, "loss": 0.0195, "step": 422 }, { "epoch": 0.6768, "grad_norm": 0.35988954674299883, "learning_rate": 1.8086105125078858e-05, "loss": 0.0195, "step": 423 }, { "epoch": 0.6784, "grad_norm": 0.2730538724372587, "learning_rate": 1.8075926180215576e-05, "loss": 0.0175, "step": 424 }, { "epoch": 0.68, "grad_norm": 0.23158565001568174, "learning_rate": 1.8065723119410885e-05, "loss": 0.0156, "step": 425 }, { "epoch": 0.6816, "grad_norm": 0.34307170718940355, "learning_rate": 1.805549597313267e-05, "loss": 0.0169, "step": 426 }, { "epoch": 0.6832, "grad_norm": 0.3044701690296964, "learning_rate": 1.804524477192075e-05, "loss": 0.021, "step": 427 }, { "epoch": 0.6848, "grad_norm": 0.3284070104407129, "learning_rate": 1.803496954638676e-05, "loss": 0.0196, "step": 428 }, { "epoch": 0.6864, "grad_norm": 0.5072657066644937, "learning_rate": 1.8024670327214084e-05, "loss": 0.0225, "step": 429 }, { "epoch": 0.688, "grad_norm": 0.2941198460671475, "learning_rate": 1.8014347145157757e-05, "loss": 0.0188, "step": 430 }, { "epoch": 0.6896, "grad_norm": 0.25643271150318425, "learning_rate": 1.8004000031044363e-05, "loss": 0.0159, "step": 431 }, { "epoch": 0.6912, "grad_norm": 0.372899223911401, "learning_rate": 1.799362901577196e-05, "loss": 0.0183, "step": 432 }, { "epoch": 0.6928, "grad_norm": 0.335477210653575, "learning_rate": 1.798323413030997e-05, "loss": 0.019, "step": 433 }, { "epoch": 0.6944, "grad_norm": 0.34141551328982883, "learning_rate": 1.7972815405699105e-05, "loss": 0.0168, "step": 434 }, { "epoch": 0.696, "grad_norm": 0.3393022535549739, "learning_rate": 1.796237287305125e-05, "loss": 0.0204, "step": 435 }, { "epoch": 0.6976, "grad_norm": 0.2215319109318955, "learning_rate": 1.7951906563549397e-05, "loss": 0.0177, "step": 436 }, { "epoch": 0.6992, "grad_norm": 0.33171031082369407, "learning_rate": 1.7941416508447537e-05, "loss": 0.0173, "step": 437 }, { "epoch": 0.7008, "grad_norm": 0.47259061929176016, "learning_rate": 1.793090273907056e-05, "loss": 0.0162, "step": 438 }, { "epoch": 0.7024, "grad_norm": 0.17497052475464156, "learning_rate": 1.792036528681418e-05, "loss": 0.0158, "step": 439 }, { "epoch": 0.704, "grad_norm": 0.3207435066493352, "learning_rate": 1.7909804183144837e-05, "loss": 0.0189, "step": 440 }, { "epoch": 0.7056, "grad_norm": 0.3768624074051605, "learning_rate": 1.789921945959958e-05, "loss": 0.0166, "step": 441 }, { "epoch": 0.7072, "grad_norm": 0.29665002817002356, "learning_rate": 1.7888611147786003e-05, "loss": 0.0159, "step": 442 }, { "epoch": 0.7088, "grad_norm": 0.3207389922464664, "learning_rate": 1.7877979279382135e-05, "loss": 0.0203, "step": 443 }, { "epoch": 0.7104, "grad_norm": 0.18890591065624907, "learning_rate": 1.786732388613635e-05, "loss": 0.0165, "step": 444 }, { "epoch": 0.712, "grad_norm": 0.23594625568492894, "learning_rate": 1.7856644999867264e-05, "loss": 0.0204, "step": 445 }, { "epoch": 0.7136, "grad_norm": 0.2552483310094428, "learning_rate": 1.784594265246366e-05, "loss": 0.0168, "step": 446 }, { "epoch": 0.7152, "grad_norm": 0.24151614992913603, "learning_rate": 1.783521687588437e-05, "loss": 0.0167, "step": 447 }, { "epoch": 0.7168, "grad_norm": 0.3604766090360732, "learning_rate": 1.782446770215819e-05, "loss": 0.022, "step": 448 }, { "epoch": 0.7184, "grad_norm": 0.24693074827496292, "learning_rate": 1.781369516338378e-05, "loss": 0.0188, "step": 449 }, { "epoch": 0.72, "grad_norm": 0.51301858757728, "learning_rate": 1.7802899291729585e-05, "loss": 0.0188, "step": 450 }, { "epoch": 0.7216, "grad_norm": 0.44328837920573166, "learning_rate": 1.779208011943371e-05, "loss": 0.0171, "step": 451 }, { "epoch": 0.7232, "grad_norm": 0.3028936714557896, "learning_rate": 1.7781237678803845e-05, "loss": 0.0187, "step": 452 }, { "epoch": 0.7248, "grad_norm": 0.19178441580412345, "learning_rate": 1.777037200221717e-05, "loss": 0.0165, "step": 453 }, { "epoch": 0.7264, "grad_norm": 0.1723409498307265, "learning_rate": 1.775948312212024e-05, "loss": 0.0164, "step": 454 }, { "epoch": 0.728, "grad_norm": 0.5766940297412076, "learning_rate": 1.77485710710289e-05, "loss": 0.0152, "step": 455 }, { "epoch": 0.7296, "grad_norm": 0.3736278036944696, "learning_rate": 1.7737635881528198e-05, "loss": 0.0192, "step": 456 }, { "epoch": 0.7312, "grad_norm": 0.37918511783155534, "learning_rate": 1.7726677586272263e-05, "loss": 0.0185, "step": 457 }, { "epoch": 0.7328, "grad_norm": 0.31748017727601047, "learning_rate": 1.7715696217984233e-05, "loss": 0.0174, "step": 458 }, { "epoch": 0.7344, "grad_norm": 0.28283710719929595, "learning_rate": 1.7704691809456142e-05, "loss": 0.017, "step": 459 }, { "epoch": 0.736, "grad_norm": 0.21831749033244646, "learning_rate": 1.7693664393548822e-05, "loss": 0.0136, "step": 460 }, { "epoch": 0.7376, "grad_norm": 0.3209663077765436, "learning_rate": 1.7682614003191807e-05, "loss": 0.0164, "step": 461 }, { "epoch": 0.7392, "grad_norm": 0.3648279664943445, "learning_rate": 1.7671540671383245e-05, "loss": 0.0222, "step": 462 }, { "epoch": 0.7408, "grad_norm": 0.3650059458944533, "learning_rate": 1.766044443118978e-05, "loss": 0.0178, "step": 463 }, { "epoch": 0.7424, "grad_norm": 0.23141197197187555, "learning_rate": 1.764932531574648e-05, "loss": 0.0177, "step": 464 }, { "epoch": 0.744, "grad_norm": 0.24848492713296702, "learning_rate": 1.76381833582567e-05, "loss": 0.0152, "step": 465 }, { "epoch": 0.7456, "grad_norm": 0.212058059906063, "learning_rate": 1.762701859199202e-05, "loss": 0.016, "step": 466 }, { "epoch": 0.7472, "grad_norm": 0.22424171520312197, "learning_rate": 1.761583105029213e-05, "loss": 0.0151, "step": 467 }, { "epoch": 0.7488, "grad_norm": 0.3352178009928058, "learning_rate": 1.7604620766564725e-05, "loss": 0.0196, "step": 468 }, { "epoch": 0.7504, "grad_norm": 0.26322572301360064, "learning_rate": 1.7593387774285412e-05, "loss": 0.015, "step": 469 }, { "epoch": 0.752, "grad_norm": 0.2814098104445483, "learning_rate": 1.7582132106997615e-05, "loss": 0.0184, "step": 470 }, { "epoch": 0.7536, "grad_norm": 0.307432575270684, "learning_rate": 1.7570853798312462e-05, "loss": 0.0164, "step": 471 }, { "epoch": 0.7552, "grad_norm": 0.2966570504116292, "learning_rate": 1.7559552881908698e-05, "loss": 0.0167, "step": 472 }, { "epoch": 0.7568, "grad_norm": 0.4866919307321116, "learning_rate": 1.7548229391532572e-05, "loss": 0.0201, "step": 473 }, { "epoch": 0.7584, "grad_norm": 0.2717271876657307, "learning_rate": 1.7536883360997743e-05, "loss": 0.0149, "step": 474 }, { "epoch": 0.76, "grad_norm": 0.23215274345701958, "learning_rate": 1.7525514824185187e-05, "loss": 0.0157, "step": 475 }, { "epoch": 0.7616, "grad_norm": 0.2733866519645903, "learning_rate": 1.7514123815043073e-05, "loss": 0.0155, "step": 476 }, { "epoch": 0.7632, "grad_norm": 0.26453774848889056, "learning_rate": 1.750271036758669e-05, "loss": 0.0174, "step": 477 }, { "epoch": 0.7648, "grad_norm": 0.23059650092556158, "learning_rate": 1.749127451589832e-05, "loss": 0.0151, "step": 478 }, { "epoch": 0.7664, "grad_norm": 0.3304249529546193, "learning_rate": 1.747981629412715e-05, "loss": 0.0174, "step": 479 }, { "epoch": 0.768, "grad_norm": 0.2270056832127085, "learning_rate": 1.7468335736489177e-05, "loss": 0.0163, "step": 480 }, { "epoch": 0.7696, "grad_norm": 0.29069700663640613, "learning_rate": 1.7456832877267083e-05, "loss": 0.0158, "step": 481 }, { "epoch": 0.7712, "grad_norm": 0.3752070968464264, "learning_rate": 1.7445307750810153e-05, "loss": 0.0159, "step": 482 }, { "epoch": 0.7728, "grad_norm": 0.23327504768427246, "learning_rate": 1.7433760391534166e-05, "loss": 0.0167, "step": 483 }, { "epoch": 0.7744, "grad_norm": 0.2239594613519126, "learning_rate": 1.7422190833921284e-05, "loss": 0.0156, "step": 484 }, { "epoch": 0.776, "grad_norm": 0.1777337204794238, "learning_rate": 1.741059911251997e-05, "loss": 0.0147, "step": 485 }, { "epoch": 0.7776, "grad_norm": 0.30193231609851146, "learning_rate": 1.7398985261944857e-05, "loss": 0.0155, "step": 486 }, { "epoch": 0.7792, "grad_norm": 0.3705440433007421, "learning_rate": 1.7387349316876668e-05, "loss": 0.016, "step": 487 }, { "epoch": 0.7808, "grad_norm": 0.3036805749121437, "learning_rate": 1.7375691312062102e-05, "loss": 0.0159, "step": 488 }, { "epoch": 0.7824, "grad_norm": 0.22418435070009263, "learning_rate": 1.7364011282313732e-05, "loss": 0.0151, "step": 489 }, { "epoch": 0.784, "grad_norm": 0.2447819237603347, "learning_rate": 1.7352309262509894e-05, "loss": 0.0157, "step": 490 }, { "epoch": 0.7856, "grad_norm": 0.2563576164383423, "learning_rate": 1.7340585287594605e-05, "loss": 0.0167, "step": 491 }, { "epoch": 0.7872, "grad_norm": 0.25731657995972784, "learning_rate": 1.7328839392577422e-05, "loss": 0.0181, "step": 492 }, { "epoch": 0.7888, "grad_norm": 0.33470461643316934, "learning_rate": 1.731707161253338e-05, "loss": 0.0154, "step": 493 }, { "epoch": 0.7904, "grad_norm": 0.2771412691165488, "learning_rate": 1.730528198260285e-05, "loss": 0.0145, "step": 494 }, { "epoch": 0.792, "grad_norm": 0.2709908554160519, "learning_rate": 1.7293470537991463e-05, "loss": 0.0179, "step": 495 }, { "epoch": 0.7936, "grad_norm": 0.2558694357144735, "learning_rate": 1.728163731396998e-05, "loss": 0.0138, "step": 496 }, { "epoch": 0.7952, "grad_norm": 0.25559177407084555, "learning_rate": 1.7269782345874204e-05, "loss": 0.021, "step": 497 }, { "epoch": 0.7968, "grad_norm": 0.33197746334294914, "learning_rate": 1.7257905669104874e-05, "loss": 0.0163, "step": 498 }, { "epoch": 0.7984, "grad_norm": 0.24632128756768099, "learning_rate": 1.7246007319127547e-05, "loss": 0.0144, "step": 499 }, { "epoch": 0.8, "grad_norm": 0.32033168801101264, "learning_rate": 1.72340873314725e-05, "loss": 0.0165, "step": 500 }, { "epoch": 0.8016, "grad_norm": 0.3268250789735254, "learning_rate": 1.7222145741734625e-05, "loss": 0.0164, "step": 501 }, { "epoch": 0.8032, "grad_norm": 0.2232688071161065, "learning_rate": 1.721018258557333e-05, "loss": 0.0153, "step": 502 }, { "epoch": 0.8048, "grad_norm": 0.4371826628599994, "learning_rate": 1.7198197898712402e-05, "loss": 0.0158, "step": 503 }, { "epoch": 0.8064, "grad_norm": 0.272064986688738, "learning_rate": 1.7186191716939946e-05, "loss": 0.016, "step": 504 }, { "epoch": 0.808, "grad_norm": 0.2999734487686739, "learning_rate": 1.717416407610824e-05, "loss": 0.0165, "step": 505 }, { "epoch": 0.8096, "grad_norm": 0.20381910185537508, "learning_rate": 1.7162115012133643e-05, "loss": 0.0122, "step": 506 }, { "epoch": 0.8112, "grad_norm": 0.2514599572754894, "learning_rate": 1.7150044560996488e-05, "loss": 0.0165, "step": 507 }, { "epoch": 0.8128, "grad_norm": 0.24878584344864693, "learning_rate": 1.713795275874098e-05, "loss": 0.0141, "step": 508 }, { "epoch": 0.8144, "grad_norm": 0.2495924620006895, "learning_rate": 1.7125839641475074e-05, "loss": 0.0136, "step": 509 }, { "epoch": 0.816, "grad_norm": 0.2983231959518722, "learning_rate": 1.711370524537037e-05, "loss": 0.0153, "step": 510 }, { "epoch": 0.8176, "grad_norm": 0.32363209088505135, "learning_rate": 1.7101549606662025e-05, "loss": 0.0162, "step": 511 }, { "epoch": 0.8192, "grad_norm": 0.23024824037626007, "learning_rate": 1.7089372761648617e-05, "loss": 0.0129, "step": 512 }, { "epoch": 0.8208, "grad_norm": 0.3099805526735493, "learning_rate": 1.7077174746692054e-05, "loss": 0.0146, "step": 513 }, { "epoch": 0.8224, "grad_norm": 0.2958790504323679, "learning_rate": 1.7064955598217463e-05, "loss": 0.0144, "step": 514 }, { "epoch": 0.824, "grad_norm": 0.32180951645652084, "learning_rate": 1.7052715352713076e-05, "loss": 0.0124, "step": 515 }, { "epoch": 0.8256, "grad_norm": 0.3161408048838766, "learning_rate": 1.7040454046730118e-05, "loss": 0.0122, "step": 516 }, { "epoch": 0.8272, "grad_norm": 0.4359972092447916, "learning_rate": 1.7028171716882714e-05, "loss": 0.016, "step": 517 }, { "epoch": 0.8288, "grad_norm": 0.6070067239235338, "learning_rate": 1.7015868399847768e-05, "loss": 0.0157, "step": 518 }, { "epoch": 0.8304, "grad_norm": 0.30921833575080176, "learning_rate": 1.7003544132364847e-05, "loss": 0.0175, "step": 519 }, { "epoch": 0.832, "grad_norm": 0.24855531102712775, "learning_rate": 1.6991198951236088e-05, "loss": 0.0135, "step": 520 }, { "epoch": 0.8336, "grad_norm": 0.3990339989018151, "learning_rate": 1.6978832893326074e-05, "loss": 0.0156, "step": 521 }, { "epoch": 0.8352, "grad_norm": 0.4165012367115773, "learning_rate": 1.696644599556173e-05, "loss": 0.016, "step": 522 }, { "epoch": 0.8368, "grad_norm": 0.43412052801627804, "learning_rate": 1.6954038294932215e-05, "loss": 0.0201, "step": 523 }, { "epoch": 0.8384, "grad_norm": 0.47790373526083657, "learning_rate": 1.6941609828488806e-05, "loss": 0.0125, "step": 524 }, { "epoch": 0.84, "grad_norm": 0.2970205406935243, "learning_rate": 1.692916063334479e-05, "loss": 0.0138, "step": 525 }, { "epoch": 0.8416, "grad_norm": 0.4618090240290649, "learning_rate": 1.691669074667535e-05, "loss": 0.0188, "step": 526 }, { "epoch": 0.8432, "grad_norm": 0.34029536024377827, "learning_rate": 1.690420020571747e-05, "loss": 0.018, "step": 527 }, { "epoch": 0.8448, "grad_norm": 0.3439089692946025, "learning_rate": 1.689168904776979e-05, "loss": 0.017, "step": 528 }, { "epoch": 0.8464, "grad_norm": 0.30043383316215877, "learning_rate": 1.6879157310192537e-05, "loss": 0.0179, "step": 529 }, { "epoch": 0.848, "grad_norm": 0.4054893693102873, "learning_rate": 1.686660503040737e-05, "loss": 0.0159, "step": 530 }, { "epoch": 0.8496, "grad_norm": 0.2557418982826781, "learning_rate": 1.685403224589731e-05, "loss": 0.0159, "step": 531 }, { "epoch": 0.8512, "grad_norm": 0.3221299446889479, "learning_rate": 1.6841438994206597e-05, "loss": 0.0139, "step": 532 }, { "epoch": 0.8528, "grad_norm": 0.20671619932779295, "learning_rate": 1.6828825312940594e-05, "loss": 0.0133, "step": 533 }, { "epoch": 0.8544, "grad_norm": 0.395621658771693, "learning_rate": 1.6816191239765668e-05, "loss": 0.015, "step": 534 }, { "epoch": 0.856, "grad_norm": 0.18571913914672067, "learning_rate": 1.6803536812409077e-05, "loss": 0.0133, "step": 535 }, { "epoch": 0.8576, "grad_norm": 0.4219460617356573, "learning_rate": 1.6790862068658863e-05, "loss": 0.0167, "step": 536 }, { "epoch": 0.8592, "grad_norm": 0.4313099517984012, "learning_rate": 1.6778167046363735e-05, "loss": 0.016, "step": 537 }, { "epoch": 0.8608, "grad_norm": 0.24685981400311702, "learning_rate": 1.6765451783432953e-05, "loss": 0.0129, "step": 538 }, { "epoch": 0.8624, "grad_norm": 0.36926713742823264, "learning_rate": 1.675271631783623e-05, "loss": 0.0176, "step": 539 }, { "epoch": 0.864, "grad_norm": 0.27147011307048174, "learning_rate": 1.6739960687603592e-05, "loss": 0.0193, "step": 540 }, { "epoch": 0.8656, "grad_norm": 0.30418888127452126, "learning_rate": 1.672718493082529e-05, "loss": 0.0187, "step": 541 }, { "epoch": 0.8672, "grad_norm": 0.31714375660907473, "learning_rate": 1.671438908565167e-05, "loss": 0.0139, "step": 542 }, { "epoch": 0.8688, "grad_norm": 0.44713073525701746, "learning_rate": 1.6701573190293076e-05, "loss": 0.0154, "step": 543 }, { "epoch": 0.8704, "grad_norm": 0.32977461143617176, "learning_rate": 1.6688737283019708e-05, "loss": 0.0172, "step": 544 }, { "epoch": 0.872, "grad_norm": 0.22999075617827208, "learning_rate": 1.667588140216154e-05, "loss": 0.0169, "step": 545 }, { "epoch": 0.8736, "grad_norm": 0.23997918810025493, "learning_rate": 1.6663005586108175e-05, "loss": 0.0166, "step": 546 }, { "epoch": 0.8752, "grad_norm": 0.43197070145474226, "learning_rate": 1.6650109873308763e-05, "loss": 0.0156, "step": 547 }, { "epoch": 0.8768, "grad_norm": 0.4741914350797042, "learning_rate": 1.663719430227186e-05, "loss": 0.0191, "step": 548 }, { "epoch": 0.8784, "grad_norm": 0.43115846393700075, "learning_rate": 1.6624258911565312e-05, "loss": 0.0157, "step": 549 }, { "epoch": 0.88, "grad_norm": 0.2965223549899331, "learning_rate": 1.661130373981617e-05, "loss": 0.0116, "step": 550 }, { "epoch": 0.8816, "grad_norm": 0.24489453090945398, "learning_rate": 1.6598328825710536e-05, "loss": 0.0141, "step": 551 }, { "epoch": 0.8832, "grad_norm": 0.28538146507727113, "learning_rate": 1.6585334207993475e-05, "loss": 0.0139, "step": 552 }, { "epoch": 0.8848, "grad_norm": 0.35788977059608296, "learning_rate": 1.6572319925468892e-05, "loss": 0.0147, "step": 553 }, { "epoch": 0.8864, "grad_norm": 0.28960712882823075, "learning_rate": 1.65592860169994e-05, "loss": 0.0147, "step": 554 }, { "epoch": 0.888, "grad_norm": 0.31950119423116574, "learning_rate": 1.654623252150624e-05, "loss": 0.0135, "step": 555 }, { "epoch": 0.8896, "grad_norm": 0.3807738816010361, "learning_rate": 1.6533159477969122e-05, "loss": 0.0133, "step": 556 }, { "epoch": 0.8912, "grad_norm": 0.31788894397034834, "learning_rate": 1.6520066925426146e-05, "loss": 0.0163, "step": 557 }, { "epoch": 0.8928, "grad_norm": 0.29489370953447575, "learning_rate": 1.6506954902973657e-05, "loss": 0.0118, "step": 558 }, { "epoch": 0.8944, "grad_norm": 0.32915478563644357, "learning_rate": 1.6493823449766137e-05, "loss": 0.0145, "step": 559 }, { "epoch": 0.896, "grad_norm": 0.2496157180299605, "learning_rate": 1.648067260501611e-05, "loss": 0.0105, "step": 560 }, { "epoch": 0.8976, "grad_norm": 0.23022392976252964, "learning_rate": 1.6467502407993995e-05, "loss": 0.0153, "step": 561 }, { "epoch": 0.8992, "grad_norm": 0.2741504250149933, "learning_rate": 1.6454312898027992e-05, "loss": 0.0129, "step": 562 }, { "epoch": 0.9008, "grad_norm": 0.37673992433131875, "learning_rate": 1.644110411450398e-05, "loss": 0.0145, "step": 563 }, { "epoch": 0.9024, "grad_norm": 0.24683345688818692, "learning_rate": 1.6427876096865394e-05, "loss": 0.0132, "step": 564 }, { "epoch": 0.904, "grad_norm": 0.21155010306282332, "learning_rate": 1.6414628884613106e-05, "loss": 0.0124, "step": 565 }, { "epoch": 0.9056, "grad_norm": 0.2282821732438531, "learning_rate": 1.6401362517305296e-05, "loss": 0.0121, "step": 566 }, { "epoch": 0.9072, "grad_norm": 0.5087710551373635, "learning_rate": 1.6388077034557355e-05, "loss": 0.0172, "step": 567 }, { "epoch": 0.9088, "grad_norm": 0.38875972717524937, "learning_rate": 1.637477247604175e-05, "loss": 0.0143, "step": 568 }, { "epoch": 0.9104, "grad_norm": 0.33977547058849117, "learning_rate": 1.6361448881487913e-05, "loss": 0.0144, "step": 569 }, { "epoch": 0.912, "grad_norm": 0.39719043920205616, "learning_rate": 1.6348106290682117e-05, "loss": 0.0135, "step": 570 }, { "epoch": 0.9136, "grad_norm": 0.41966092792866705, "learning_rate": 1.6334744743467366e-05, "loss": 0.0148, "step": 571 }, { "epoch": 0.9152, "grad_norm": 0.5310320819623283, "learning_rate": 1.6321364279743267e-05, "loss": 0.0148, "step": 572 }, { "epoch": 0.9168, "grad_norm": 0.36975329288920533, "learning_rate": 1.6307964939465914e-05, "loss": 0.0142, "step": 573 }, { "epoch": 0.9184, "grad_norm": 0.24157258569983317, "learning_rate": 1.6294546762647775e-05, "loss": 0.0091, "step": 574 }, { "epoch": 0.92, "grad_norm": 0.4828289939576407, "learning_rate": 1.628110978935756e-05, "loss": 0.0141, "step": 575 }, { "epoch": 0.9216, "grad_norm": 0.4861418486779714, "learning_rate": 1.626765405972011e-05, "loss": 0.0159, "step": 576 }, { "epoch": 0.9232, "grad_norm": 0.3805881534396026, "learning_rate": 1.625417961391628e-05, "loss": 0.0143, "step": 577 }, { "epoch": 0.9248, "grad_norm": 0.2824880755489354, "learning_rate": 1.6240686492182806e-05, "loss": 0.0113, "step": 578 }, { "epoch": 0.9264, "grad_norm": 0.47408413957904083, "learning_rate": 1.62271747348122e-05, "loss": 0.0168, "step": 579 }, { "epoch": 0.928, "grad_norm": 0.5431515431501727, "learning_rate": 1.621364438215262e-05, "loss": 0.0177, "step": 580 }, { "epoch": 0.9296, "grad_norm": 0.4281155633808909, "learning_rate": 1.6200095474607753e-05, "loss": 0.0137, "step": 581 }, { "epoch": 0.9312, "grad_norm": 0.2893495598259243, "learning_rate": 1.6186528052636692e-05, "loss": 0.0126, "step": 582 }, { "epoch": 0.9328, "grad_norm": 0.31186922217219265, "learning_rate": 1.6172942156753822e-05, "loss": 0.0138, "step": 583 }, { "epoch": 0.9344, "grad_norm": 0.16779597630347912, "learning_rate": 1.6159337827528686e-05, "loss": 0.0111, "step": 584 }, { "epoch": 0.936, "grad_norm": 0.47723171668240166, "learning_rate": 1.614571510558588e-05, "loss": 0.0156, "step": 585 }, { "epoch": 0.9376, "grad_norm": 0.4819625853916828, "learning_rate": 1.6132074031604917e-05, "loss": 0.0142, "step": 586 }, { "epoch": 0.9392, "grad_norm": 0.4558378441295942, "learning_rate": 1.6118414646320115e-05, "loss": 0.0149, "step": 587 }, { "epoch": 0.9408, "grad_norm": 0.3157472452703629, "learning_rate": 1.6104736990520468e-05, "loss": 0.0116, "step": 588 }, { "epoch": 0.9424, "grad_norm": 0.1989816586608992, "learning_rate": 1.6091041105049542e-05, "loss": 0.0112, "step": 589 }, { "epoch": 0.944, "grad_norm": 0.34300136848839013, "learning_rate": 1.6077327030805318e-05, "loss": 0.0129, "step": 590 }, { "epoch": 0.9456, "grad_norm": 0.1805561810136135, "learning_rate": 1.6063594808740112e-05, "loss": 0.0108, "step": 591 }, { "epoch": 0.9472, "grad_norm": 0.18712891674307838, "learning_rate": 1.604984447986042e-05, "loss": 0.0135, "step": 592 }, { "epoch": 0.9488, "grad_norm": 0.3067750461036351, "learning_rate": 1.6036076085226813e-05, "loss": 0.0133, "step": 593 }, { "epoch": 0.9504, "grad_norm": 0.1812977796323941, "learning_rate": 1.602228966595381e-05, "loss": 0.0099, "step": 594 }, { "epoch": 0.952, "grad_norm": 0.2480339618306651, "learning_rate": 1.6008485263209742e-05, "loss": 0.0102, "step": 595 }, { "epoch": 0.9536, "grad_norm": 0.24450667954263358, "learning_rate": 1.599466291821666e-05, "loss": 0.0134, "step": 596 }, { "epoch": 0.9552, "grad_norm": 0.29278728917338376, "learning_rate": 1.598082267225018e-05, "loss": 0.0125, "step": 597 }, { "epoch": 0.9568, "grad_norm": 0.3086905154384183, "learning_rate": 1.596696456663938e-05, "loss": 0.0121, "step": 598 }, { "epoch": 0.9584, "grad_norm": 0.2766246945781701, "learning_rate": 1.595308864276666e-05, "loss": 0.0107, "step": 599 }, { "epoch": 0.96, "grad_norm": 0.23271296887264653, "learning_rate": 1.5939194942067647e-05, "loss": 0.0118, "step": 600 }, { "epoch": 0.9616, "grad_norm": 0.21834384775961552, "learning_rate": 1.592528350603103e-05, "loss": 0.0095, "step": 601 }, { "epoch": 0.9632, "grad_norm": 0.31082618893772185, "learning_rate": 1.5911354376198468e-05, "loss": 0.0139, "step": 602 }, { "epoch": 0.9648, "grad_norm": 0.3716195934094933, "learning_rate": 1.5897407594164468e-05, "loss": 0.0121, "step": 603 }, { "epoch": 0.9664, "grad_norm": 0.46949516504477684, "learning_rate": 1.5883443201576225e-05, "loss": 0.0122, "step": 604 }, { "epoch": 0.968, "grad_norm": 0.35059424145377427, "learning_rate": 1.586946124013354e-05, "loss": 0.0116, "step": 605 }, { "epoch": 0.9696, "grad_norm": 0.40688113982199503, "learning_rate": 1.585546175158868e-05, "loss": 0.014, "step": 606 }, { "epoch": 0.9712, "grad_norm": 0.4156616285372754, "learning_rate": 1.5841444777746232e-05, "loss": 0.013, "step": 607 }, { "epoch": 0.9728, "grad_norm": 0.21046745034986, "learning_rate": 1.582741036046301e-05, "loss": 0.0088, "step": 608 }, { "epoch": 0.9744, "grad_norm": 0.36941300422007295, "learning_rate": 1.5813358541647915e-05, "loss": 0.0106, "step": 609 }, { "epoch": 0.976, "grad_norm": 0.24297535036589946, "learning_rate": 1.5799289363261815e-05, "loss": 0.0115, "step": 610 }, { "epoch": 0.9776, "grad_norm": 0.303837166125424, "learning_rate": 1.578520286731741e-05, "loss": 0.0106, "step": 611 }, { "epoch": 0.9792, "grad_norm": 0.2803480753538006, "learning_rate": 1.5771099095879108e-05, "loss": 0.0096, "step": 612 }, { "epoch": 0.9808, "grad_norm": 0.31074041433807803, "learning_rate": 1.575697809106292e-05, "loss": 0.0118, "step": 613 }, { "epoch": 0.9824, "grad_norm": 0.49870271318985004, "learning_rate": 1.5742839895036305e-05, "loss": 0.0165, "step": 614 }, { "epoch": 0.984, "grad_norm": 0.3044978450333263, "learning_rate": 1.5728684550018066e-05, "loss": 0.0126, "step": 615 }, { "epoch": 0.9856, "grad_norm": 0.3493681227793422, "learning_rate": 1.571451209827821e-05, "loss": 0.014, "step": 616 }, { "epoch": 0.9872, "grad_norm": 0.37956769866427376, "learning_rate": 1.570032258213783e-05, "loss": 0.0134, "step": 617 }, { "epoch": 0.9888, "grad_norm": 0.37799510952249216, "learning_rate": 1.5686116043968975e-05, "loss": 0.0111, "step": 618 }, { "epoch": 0.9904, "grad_norm": 0.39548477772844104, "learning_rate": 1.5671892526194515e-05, "loss": 0.0153, "step": 619 }, { "epoch": 0.992, "grad_norm": 0.4257260663819077, "learning_rate": 1.565765207128805e-05, "loss": 0.0131, "step": 620 }, { "epoch": 0.9936, "grad_norm": 0.30794139713595914, "learning_rate": 1.564339472177373e-05, "loss": 0.0117, "step": 621 }, { "epoch": 0.9952, "grad_norm": 0.278719510483356, "learning_rate": 1.5629120520226163e-05, "loss": 0.0119, "step": 622 }, { "epoch": 0.9968, "grad_norm": 0.297554543587557, "learning_rate": 1.561482950927029e-05, "loss": 0.0112, "step": 623 }, { "epoch": 0.9984, "grad_norm": 0.24695394521400738, "learning_rate": 1.560052173158123e-05, "loss": 0.0125, "step": 624 }, { "epoch": 1.0, "grad_norm": 0.40345363184405636, "learning_rate": 1.5586197229884185e-05, "loss": 0.0133, "step": 625 }, { "epoch": 1.0016, "grad_norm": 0.417857034308898, "learning_rate": 1.5571856046954284e-05, "loss": 0.0114, "step": 626 }, { "epoch": 1.0032, "grad_norm": 0.21833831910708582, "learning_rate": 1.5557498225616488e-05, "loss": 0.0125, "step": 627 }, { "epoch": 1.0048, "grad_norm": 0.2743191504104528, "learning_rate": 1.5543123808745418e-05, "loss": 0.0132, "step": 628 }, { "epoch": 1.0064, "grad_norm": 0.30618341671798766, "learning_rate": 1.5528732839265272e-05, "loss": 0.0124, "step": 629 }, { "epoch": 1.008, "grad_norm": 0.3689469002067123, "learning_rate": 1.5514325360149668e-05, "loss": 0.0147, "step": 630 }, { "epoch": 1.0096, "grad_norm": 0.3088756615860901, "learning_rate": 1.549990141442153e-05, "loss": 0.0122, "step": 631 }, { "epoch": 1.0112, "grad_norm": 0.19294285505406303, "learning_rate": 1.5485461045152937e-05, "loss": 0.012, "step": 632 }, { "epoch": 1.0128, "grad_norm": 0.3001012276511453, "learning_rate": 1.5471004295465034e-05, "loss": 0.0132, "step": 633 }, { "epoch": 1.0144, "grad_norm": 0.1921528171891681, "learning_rate": 1.5456531208527868e-05, "loss": 0.0093, "step": 634 }, { "epoch": 1.016, "grad_norm": 0.16268529604637008, "learning_rate": 1.5442041827560274e-05, "loss": 0.009, "step": 635 }, { "epoch": 1.0176, "grad_norm": 0.30072233263249754, "learning_rate": 1.542753619582974e-05, "loss": 0.0101, "step": 636 }, { "epoch": 1.0192, "grad_norm": 0.25149533409567965, "learning_rate": 1.5413014356652287e-05, "loss": 0.0115, "step": 637 }, { "epoch": 1.0208, "grad_norm": 0.27926618346757304, "learning_rate": 1.5398476353392323e-05, "loss": 0.0128, "step": 638 }, { "epoch": 1.0224, "grad_norm": 0.23840478861268233, "learning_rate": 1.538392222946255e-05, "loss": 0.0125, "step": 639 }, { "epoch": 1.024, "grad_norm": 0.2748477006078572, "learning_rate": 1.5369352028323773e-05, "loss": 0.0114, "step": 640 }, { "epoch": 1.0256, "grad_norm": 0.36023776554838094, "learning_rate": 1.5354765793484834e-05, "loss": 0.0097, "step": 641 }, { "epoch": 1.0272, "grad_norm": 0.36739400395113647, "learning_rate": 1.534016356850244e-05, "loss": 0.0119, "step": 642 }, { "epoch": 1.0288, "grad_norm": 0.32947192475025666, "learning_rate": 1.5325545396981053e-05, "loss": 0.0087, "step": 643 }, { "epoch": 1.0304, "grad_norm": 0.4587397886694395, "learning_rate": 1.531091132257275e-05, "loss": 0.0119, "step": 644 }, { "epoch": 1.032, "grad_norm": 0.2852871073646235, "learning_rate": 1.5296261388977107e-05, "loss": 0.0102, "step": 645 }, { "epoch": 1.0336, "grad_norm": 0.44917912534081555, "learning_rate": 1.528159563994104e-05, "loss": 0.0111, "step": 646 }, { "epoch": 1.0352, "grad_norm": 0.47157149649612523, "learning_rate": 1.52669141192587e-05, "loss": 0.0139, "step": 647 }, { "epoch": 1.0368, "grad_norm": 0.3528186945026468, "learning_rate": 1.5252216870771345e-05, "loss": 0.0138, "step": 648 }, { "epoch": 1.0384, "grad_norm": 0.3986037607641987, "learning_rate": 1.5237503938367186e-05, "loss": 0.0144, "step": 649 }, { "epoch": 1.04, "grad_norm": 0.5600923250750897, "learning_rate": 1.5222775365981272e-05, "loss": 0.013, "step": 650 }, { "epoch": 1.0416, "grad_norm": 0.421695944273333, "learning_rate": 1.5208031197595357e-05, "loss": 0.011, "step": 651 }, { "epoch": 1.0432, "grad_norm": 0.43277467796608626, "learning_rate": 1.5193271477237761e-05, "loss": 0.0139, "step": 652 }, { "epoch": 1.0448, "grad_norm": 0.33407654281757615, "learning_rate": 1.5178496248983254e-05, "loss": 0.0121, "step": 653 }, { "epoch": 1.0464, "grad_norm": 0.3645641843666089, "learning_rate": 1.5163705556952912e-05, "loss": 0.0107, "step": 654 }, { "epoch": 1.048, "grad_norm": 0.3621632764026968, "learning_rate": 1.5148899445313983e-05, "loss": 0.0142, "step": 655 }, { "epoch": 1.0496, "grad_norm": 0.32624268869617895, "learning_rate": 1.5134077958279764e-05, "loss": 0.0133, "step": 656 }, { "epoch": 1.0512, "grad_norm": 0.29284343589574013, "learning_rate": 1.5119241140109466e-05, "loss": 0.0095, "step": 657 }, { "epoch": 1.0528, "grad_norm": 0.41739711667469, "learning_rate": 1.5104389035108078e-05, "loss": 0.0121, "step": 658 }, { "epoch": 1.0544, "grad_norm": 0.32747130295385996, "learning_rate": 1.5089521687626243e-05, "loss": 0.0135, "step": 659 }, { "epoch": 1.056, "grad_norm": 0.3291067637296192, "learning_rate": 1.5074639142060119e-05, "loss": 0.0094, "step": 660 }, { "epoch": 1.0576, "grad_norm": 0.31460572785375496, "learning_rate": 1.505974144285124e-05, "loss": 0.0104, "step": 661 }, { "epoch": 1.0592, "grad_norm": 0.5394369508399102, "learning_rate": 1.50448286344864e-05, "loss": 0.0126, "step": 662 }, { "epoch": 1.0608, "grad_norm": 0.6173037121141223, "learning_rate": 1.5029900761497507e-05, "loss": 0.0158, "step": 663 }, { "epoch": 1.0624, "grad_norm": 0.3429474313068254, "learning_rate": 1.501495786846146e-05, "loss": 0.0134, "step": 664 }, { "epoch": 1.064, "grad_norm": 0.2944391342591949, "learning_rate": 1.5000000000000002e-05, "loss": 0.0085, "step": 665 }, { "epoch": 1.0656, "grad_norm": 0.4287258380040374, "learning_rate": 1.4985027200779599e-05, "loss": 0.0144, "step": 666 }, { "epoch": 1.0672, "grad_norm": 0.3770767536307347, "learning_rate": 1.4970039515511303e-05, "loss": 0.0125, "step": 667 }, { "epoch": 1.0688, "grad_norm": 0.5615733955806762, "learning_rate": 1.4955036988950617e-05, "loss": 0.015, "step": 668 }, { "epoch": 1.0704, "grad_norm": 0.23511834985253832, "learning_rate": 1.4940019665897363e-05, "loss": 0.0106, "step": 669 }, { "epoch": 1.072, "grad_norm": 0.30497553069081007, "learning_rate": 1.4924987591195548e-05, "loss": 0.0113, "step": 670 }, { "epoch": 1.0735999999999999, "grad_norm": 0.35650652604214367, "learning_rate": 1.4909940809733223e-05, "loss": 0.0122, "step": 671 }, { "epoch": 1.0752, "grad_norm": 0.3308671401210887, "learning_rate": 1.489487936644237e-05, "loss": 0.0118, "step": 672 }, { "epoch": 1.0768, "grad_norm": 0.5047129564910963, "learning_rate": 1.4879803306298736e-05, "loss": 0.0105, "step": 673 }, { "epoch": 1.0784, "grad_norm": 0.2888826727326239, "learning_rate": 1.4864712674321733e-05, "loss": 0.0107, "step": 674 }, { "epoch": 1.08, "grad_norm": 0.22439723192462555, "learning_rate": 1.4849607515574276e-05, "loss": 0.0097, "step": 675 }, { "epoch": 1.0816, "grad_norm": 0.351953190025847, "learning_rate": 1.4834487875162657e-05, "loss": 0.01, "step": 676 }, { "epoch": 1.0832, "grad_norm": 0.29334503002848533, "learning_rate": 1.4819353798236427e-05, "loss": 0.0097, "step": 677 }, { "epoch": 1.0848, "grad_norm": 0.35188988523350234, "learning_rate": 1.4804205329988226e-05, "loss": 0.0122, "step": 678 }, { "epoch": 1.0864, "grad_norm": 0.24505321145577785, "learning_rate": 1.4789042515653687e-05, "loss": 0.0129, "step": 679 }, { "epoch": 1.088, "grad_norm": 0.25008455950445413, "learning_rate": 1.477386540051127e-05, "loss": 0.0102, "step": 680 }, { "epoch": 1.0896, "grad_norm": 0.35576816544038325, "learning_rate": 1.4758674029882152e-05, "loss": 0.0116, "step": 681 }, { "epoch": 1.0912, "grad_norm": 0.33614223269913607, "learning_rate": 1.4743468449130065e-05, "loss": 0.0133, "step": 682 }, { "epoch": 1.0928, "grad_norm": 0.22442180749880328, "learning_rate": 1.4728248703661183e-05, "loss": 0.01, "step": 683 }, { "epoch": 1.0944, "grad_norm": 0.28219807599355523, "learning_rate": 1.4713014838923975e-05, "loss": 0.0117, "step": 684 }, { "epoch": 1.096, "grad_norm": 0.2650287871574999, "learning_rate": 1.4697766900409076e-05, "loss": 0.0088, "step": 685 }, { "epoch": 1.0976, "grad_norm": 0.2766787747844056, "learning_rate": 1.4682504933649144e-05, "loss": 0.0086, "step": 686 }, { "epoch": 1.0992, "grad_norm": 0.3620818365197564, "learning_rate": 1.466722898421873e-05, "loss": 0.0079, "step": 687 }, { "epoch": 1.1008, "grad_norm": 0.37691673416789456, "learning_rate": 1.4651939097734132e-05, "loss": 0.0144, "step": 688 }, { "epoch": 1.1024, "grad_norm": 0.29790859515349194, "learning_rate": 1.4636635319853274e-05, "loss": 0.0101, "step": 689 }, { "epoch": 1.104, "grad_norm": 0.25829554558104056, "learning_rate": 1.4621317696275563e-05, "loss": 0.0116, "step": 690 }, { "epoch": 1.1056, "grad_norm": 0.26790109240876864, "learning_rate": 1.4605986272741748e-05, "loss": 0.0094, "step": 691 }, { "epoch": 1.1072, "grad_norm": 0.42271209243570657, "learning_rate": 1.4590641095033786e-05, "loss": 0.0092, "step": 692 }, { "epoch": 1.1088, "grad_norm": 0.4486221873645635, "learning_rate": 1.4575282208974704e-05, "loss": 0.0143, "step": 693 }, { "epoch": 1.1104, "grad_norm": 0.214394965355476, "learning_rate": 1.4559909660428469e-05, "loss": 0.0089, "step": 694 }, { "epoch": 1.112, "grad_norm": 0.38287545940258694, "learning_rate": 1.4544523495299843e-05, "loss": 0.0122, "step": 695 }, { "epoch": 1.1136, "grad_norm": 0.4091777970517161, "learning_rate": 1.4529123759534253e-05, "loss": 0.0096, "step": 696 }, { "epoch": 1.1152, "grad_norm": 0.3551233469036031, "learning_rate": 1.4513710499117648e-05, "loss": 0.0106, "step": 697 }, { "epoch": 1.1168, "grad_norm": 0.35675441570055094, "learning_rate": 1.4498283760076362e-05, "loss": 0.008, "step": 698 }, { "epoch": 1.1184, "grad_norm": 0.4761855955476139, "learning_rate": 1.4482843588476976e-05, "loss": 0.0113, "step": 699 }, { "epoch": 1.12, "grad_norm": 0.3129622710980147, "learning_rate": 1.4467390030426187e-05, "loss": 0.0096, "step": 700 }, { "epoch": 1.1216, "grad_norm": 0.36202464550526253, "learning_rate": 1.445192313207067e-05, "loss": 0.0084, "step": 701 }, { "epoch": 1.1232, "grad_norm": 0.2751302476380293, "learning_rate": 1.443644293959693e-05, "loss": 0.0089, "step": 702 }, { "epoch": 1.1248, "grad_norm": 0.44668265606662855, "learning_rate": 1.4420949499231172e-05, "loss": 0.0119, "step": 703 }, { "epoch": 1.1264, "grad_norm": 0.6115934261309857, "learning_rate": 1.4405442857239151e-05, "loss": 0.0126, "step": 704 }, { "epoch": 1.1280000000000001, "grad_norm": 0.3178399997249513, "learning_rate": 1.4389923059926064e-05, "loss": 0.012, "step": 705 }, { "epoch": 1.1296, "grad_norm": 0.22483155185595363, "learning_rate": 1.437439015363638e-05, "loss": 0.0108, "step": 706 }, { "epoch": 1.1312, "grad_norm": 0.27827436859396276, "learning_rate": 1.4358844184753713e-05, "loss": 0.0103, "step": 707 }, { "epoch": 1.1328, "grad_norm": 0.40643842085489884, "learning_rate": 1.4343285199700685e-05, "loss": 0.008, "step": 708 }, { "epoch": 1.1344, "grad_norm": 0.17923007058434895, "learning_rate": 1.432771324493879e-05, "loss": 0.0084, "step": 709 }, { "epoch": 1.1360000000000001, "grad_norm": 0.36990937482255815, "learning_rate": 1.4312128366968244e-05, "loss": 0.0112, "step": 710 }, { "epoch": 1.1376, "grad_norm": 0.30074565334559583, "learning_rate": 1.4296530612327864e-05, "loss": 0.0126, "step": 711 }, { "epoch": 1.1392, "grad_norm": 0.32382377558034564, "learning_rate": 1.428092002759491e-05, "loss": 0.01, "step": 712 }, { "epoch": 1.1408, "grad_norm": 0.3153553781926328, "learning_rate": 1.4265296659384956e-05, "loss": 0.0123, "step": 713 }, { "epoch": 1.1424, "grad_norm": 0.3566687628901846, "learning_rate": 1.4249660554351752e-05, "loss": 0.011, "step": 714 }, { "epoch": 1.144, "grad_norm": 0.29709278445318327, "learning_rate": 1.4234011759187084e-05, "loss": 0.0105, "step": 715 }, { "epoch": 1.1456, "grad_norm": 0.43818116357185727, "learning_rate": 1.4218350320620625e-05, "loss": 0.0105, "step": 716 }, { "epoch": 1.1472, "grad_norm": 0.34110667072697853, "learning_rate": 1.4202676285419811e-05, "loss": 0.0083, "step": 717 }, { "epoch": 1.1488, "grad_norm": 0.5110187407254071, "learning_rate": 1.4186989700389689e-05, "loss": 0.0099, "step": 718 }, { "epoch": 1.1504, "grad_norm": 0.6325184130355656, "learning_rate": 1.4171290612372781e-05, "loss": 0.0133, "step": 719 }, { "epoch": 1.152, "grad_norm": 0.225304624696556, "learning_rate": 1.4155579068248951e-05, "loss": 0.0106, "step": 720 }, { "epoch": 1.1536, "grad_norm": 0.2884387125345757, "learning_rate": 1.4139855114935253e-05, "loss": 0.0086, "step": 721 }, { "epoch": 1.1552, "grad_norm": 0.15131765398328514, "learning_rate": 1.4124118799385797e-05, "loss": 0.0095, "step": 722 }, { "epoch": 1.1568, "grad_norm": 0.2563227551597657, "learning_rate": 1.410837016859161e-05, "loss": 0.0082, "step": 723 }, { "epoch": 1.1584, "grad_norm": 0.43243462497917795, "learning_rate": 1.4092609269580498e-05, "loss": 0.0108, "step": 724 }, { "epoch": 1.16, "grad_norm": 0.30652501002357746, "learning_rate": 1.4076836149416889e-05, "loss": 0.0129, "step": 725 }, { "epoch": 1.1616, "grad_norm": 0.2600559640710353, "learning_rate": 1.4061050855201723e-05, "loss": 0.0122, "step": 726 }, { "epoch": 1.1632, "grad_norm": 0.2751909063838169, "learning_rate": 1.4045253434072278e-05, "loss": 0.0095, "step": 727 }, { "epoch": 1.1648, "grad_norm": 0.2677495381514669, "learning_rate": 1.4029443933202059e-05, "loss": 0.0087, "step": 728 }, { "epoch": 1.1663999999999999, "grad_norm": 0.329153787441638, "learning_rate": 1.4013622399800628e-05, "loss": 0.0118, "step": 729 }, { "epoch": 1.168, "grad_norm": 0.3954741610104781, "learning_rate": 1.399778888111349e-05, "loss": 0.0096, "step": 730 }, { "epoch": 1.1696, "grad_norm": 0.39323399280494176, "learning_rate": 1.3981943424421932e-05, "loss": 0.0112, "step": 731 }, { "epoch": 1.1712, "grad_norm": 0.2878151028081222, "learning_rate": 1.3966086077042891e-05, "loss": 0.0083, "step": 732 }, { "epoch": 1.1728, "grad_norm": 0.505797224428351, "learning_rate": 1.3950216886328818e-05, "loss": 0.014, "step": 733 }, { "epoch": 1.1743999999999999, "grad_norm": 0.2790678628939786, "learning_rate": 1.3934335899667526e-05, "loss": 0.0107, "step": 734 }, { "epoch": 1.176, "grad_norm": 0.3277167621850828, "learning_rate": 1.3918443164482048e-05, "loss": 0.0159, "step": 735 }, { "epoch": 1.1776, "grad_norm": 0.600242811130678, "learning_rate": 1.3902538728230502e-05, "loss": 0.0106, "step": 736 }, { "epoch": 1.1792, "grad_norm": 0.391801850174501, "learning_rate": 1.3886622638405953e-05, "loss": 0.011, "step": 737 }, { "epoch": 1.1808, "grad_norm": 0.2984427536737696, "learning_rate": 1.387069494253626e-05, "loss": 0.0133, "step": 738 }, { "epoch": 1.1824, "grad_norm": 0.19798876151872777, "learning_rate": 1.3854755688183941e-05, "loss": 0.0108, "step": 739 }, { "epoch": 1.184, "grad_norm": 0.3120296494732397, "learning_rate": 1.3838804922946027e-05, "loss": 0.0099, "step": 740 }, { "epoch": 1.1856, "grad_norm": 0.5741221288191662, "learning_rate": 1.3822842694453923e-05, "loss": 0.0143, "step": 741 }, { "epoch": 1.1872, "grad_norm": 0.5326283032765481, "learning_rate": 1.380686905037327e-05, "loss": 0.0116, "step": 742 }, { "epoch": 1.1888, "grad_norm": 0.311440966690227, "learning_rate": 1.3790884038403796e-05, "loss": 0.0116, "step": 743 }, { "epoch": 1.1904, "grad_norm": 0.425289385624851, "learning_rate": 1.3774887706279165e-05, "loss": 0.0094, "step": 744 }, { "epoch": 1.192, "grad_norm": 0.26822941847179554, "learning_rate": 1.375888010176686e-05, "loss": 0.0133, "step": 745 }, { "epoch": 1.1936, "grad_norm": 0.23449670744636084, "learning_rate": 1.374286127266801e-05, "loss": 0.0081, "step": 746 }, { "epoch": 1.1952, "grad_norm": 0.24419781837141116, "learning_rate": 1.3726831266817278e-05, "loss": 0.0111, "step": 747 }, { "epoch": 1.1968, "grad_norm": 0.2503170694406675, "learning_rate": 1.3710790132082693e-05, "loss": 0.0098, "step": 748 }, { "epoch": 1.1984, "grad_norm": 0.2924145590049586, "learning_rate": 1.3694737916365517e-05, "loss": 0.0107, "step": 749 }, { "epoch": 1.2, "grad_norm": 0.413991031494989, "learning_rate": 1.3678674667600102e-05, "loss": 0.0124, "step": 750 }, { "epoch": 1.2016, "grad_norm": 0.2641875819723108, "learning_rate": 1.3662600433753746e-05, "loss": 0.0095, "step": 751 }, { "epoch": 1.2032, "grad_norm": 0.2530973588426655, "learning_rate": 1.3646515262826551e-05, "loss": 0.0095, "step": 752 }, { "epoch": 1.2048, "grad_norm": 0.13083092986175077, "learning_rate": 1.3630419202851287e-05, "loss": 0.0068, "step": 753 }, { "epoch": 1.2064, "grad_norm": 0.2826431152828746, "learning_rate": 1.3614312301893222e-05, "loss": 0.0091, "step": 754 }, { "epoch": 1.208, "grad_norm": 0.4859207050471821, "learning_rate": 1.3598194608050011e-05, "loss": 0.0109, "step": 755 }, { "epoch": 1.2096, "grad_norm": 0.3528244903100441, "learning_rate": 1.3582066169451535e-05, "loss": 0.0081, "step": 756 }, { "epoch": 1.2112, "grad_norm": 0.23735550136193914, "learning_rate": 1.3565927034259757e-05, "loss": 0.0072, "step": 757 }, { "epoch": 1.2128, "grad_norm": 0.26982430126630624, "learning_rate": 1.354977725066859e-05, "loss": 0.013, "step": 758 }, { "epoch": 1.2144, "grad_norm": 0.5291448926408648, "learning_rate": 1.3533616866903736e-05, "loss": 0.0127, "step": 759 }, { "epoch": 1.216, "grad_norm": 0.28414696636536013, "learning_rate": 1.351744593122255e-05, "loss": 0.012, "step": 760 }, { "epoch": 1.2176, "grad_norm": 0.4694574939461318, "learning_rate": 1.3501264491913909e-05, "loss": 0.0121, "step": 761 }, { "epoch": 1.2192, "grad_norm": 0.35695596199339735, "learning_rate": 1.3485072597298038e-05, "loss": 0.0085, "step": 762 }, { "epoch": 1.2208, "grad_norm": 0.19716908170889477, "learning_rate": 1.3468870295726399e-05, "loss": 0.0075, "step": 763 }, { "epoch": 1.2224, "grad_norm": 0.4402510192967036, "learning_rate": 1.3452657635581521e-05, "loss": 0.0097, "step": 764 }, { "epoch": 1.224, "grad_norm": 0.4574247653241456, "learning_rate": 1.3436434665276865e-05, "loss": 0.0097, "step": 765 }, { "epoch": 1.2256, "grad_norm": 0.31040237291608896, "learning_rate": 1.342020143325669e-05, "loss": 0.011, "step": 766 }, { "epoch": 1.2272, "grad_norm": 0.310907702558819, "learning_rate": 1.3403957987995884e-05, "loss": 0.0095, "step": 767 }, { "epoch": 1.2288000000000001, "grad_norm": 0.3833100006565884, "learning_rate": 1.3387704377999842e-05, "loss": 0.0113, "step": 768 }, { "epoch": 1.2304, "grad_norm": 0.4376843592039469, "learning_rate": 1.3371440651804313e-05, "loss": 0.0105, "step": 769 }, { "epoch": 1.232, "grad_norm": 0.3461601821191171, "learning_rate": 1.335516685797525e-05, "loss": 0.0105, "step": 770 }, { "epoch": 1.2336, "grad_norm": 0.4206013244953963, "learning_rate": 1.3338883045108674e-05, "loss": 0.01, "step": 771 }, { "epoch": 1.2352, "grad_norm": 0.3969044404932267, "learning_rate": 1.3322589261830517e-05, "loss": 0.0078, "step": 772 }, { "epoch": 1.2368000000000001, "grad_norm": 0.29849669127198464, "learning_rate": 1.3306285556796494e-05, "loss": 0.0091, "step": 773 }, { "epoch": 1.2384, "grad_norm": 0.36916239678537005, "learning_rate": 1.328997197869194e-05, "loss": 0.0081, "step": 774 }, { "epoch": 1.24, "grad_norm": 0.42480431464993185, "learning_rate": 1.327364857623168e-05, "loss": 0.0109, "step": 775 }, { "epoch": 1.2416, "grad_norm": 0.3006130760508486, "learning_rate": 1.3257315398159865e-05, "loss": 0.0107, "step": 776 }, { "epoch": 1.2432, "grad_norm": 0.3883539027864325, "learning_rate": 1.3240972493249846e-05, "loss": 0.0096, "step": 777 }, { "epoch": 1.2448, "grad_norm": 0.1962408454725349, "learning_rate": 1.3224619910304019e-05, "loss": 0.0072, "step": 778 }, { "epoch": 1.2464, "grad_norm": 0.45417639089173384, "learning_rate": 1.3208257698153677e-05, "loss": 0.014, "step": 779 }, { "epoch": 1.248, "grad_norm": 0.3029083926751555, "learning_rate": 1.3191885905658873e-05, "loss": 0.0085, "step": 780 }, { "epoch": 1.2496, "grad_norm": 0.36776562477996894, "learning_rate": 1.3175504581708261e-05, "loss": 0.0106, "step": 781 }, { "epoch": 1.2511999999999999, "grad_norm": 0.3492474614748319, "learning_rate": 1.3159113775218963e-05, "loss": 0.0128, "step": 782 }, { "epoch": 1.2528000000000001, "grad_norm": 0.404422451829511, "learning_rate": 1.3142713535136413e-05, "loss": 0.0132, "step": 783 }, { "epoch": 1.2544, "grad_norm": 0.4650420812195629, "learning_rate": 1.3126303910434215e-05, "loss": 0.0086, "step": 784 }, { "epoch": 1.256, "grad_norm": 0.29560248543685946, "learning_rate": 1.3109884950114007e-05, "loss": 0.0095, "step": 785 }, { "epoch": 1.2576, "grad_norm": 0.17946868198520566, "learning_rate": 1.309345670320529e-05, "loss": 0.0071, "step": 786 }, { "epoch": 1.2591999999999999, "grad_norm": 0.4780712178705736, "learning_rate": 1.3077019218765306e-05, "loss": 0.0084, "step": 787 }, { "epoch": 1.2608, "grad_norm": 0.4451640294949449, "learning_rate": 1.3060572545878875e-05, "loss": 0.0137, "step": 788 }, { "epoch": 1.2624, "grad_norm": 0.42930639864800635, "learning_rate": 1.3044116733658261e-05, "loss": 0.0109, "step": 789 }, { "epoch": 1.264, "grad_norm": 0.23600539351190586, "learning_rate": 1.302765183124302e-05, "loss": 0.0068, "step": 790 }, { "epoch": 1.2656, "grad_norm": 0.37736631357521966, "learning_rate": 1.3011177887799846e-05, "loss": 0.0115, "step": 791 }, { "epoch": 1.2671999999999999, "grad_norm": 0.2326939370182424, "learning_rate": 1.2994694952522435e-05, "loss": 0.0089, "step": 792 }, { "epoch": 1.2688, "grad_norm": 0.2437980610117518, "learning_rate": 1.2978203074631335e-05, "loss": 0.008, "step": 793 }, { "epoch": 1.2704, "grad_norm": 0.42895841526592327, "learning_rate": 1.2961702303373795e-05, "loss": 0.0084, "step": 794 }, { "epoch": 1.272, "grad_norm": 0.4639404626719677, "learning_rate": 1.2945192688023625e-05, "loss": 0.0112, "step": 795 }, { "epoch": 1.2736, "grad_norm": 0.33281863236309694, "learning_rate": 1.2928674277881041e-05, "loss": 0.0071, "step": 796 }, { "epoch": 1.2752, "grad_norm": 0.5574126754068867, "learning_rate": 1.2912147122272523e-05, "loss": 0.0109, "step": 797 }, { "epoch": 1.2768, "grad_norm": 0.2797917785944588, "learning_rate": 1.2895611270550666e-05, "loss": 0.0067, "step": 798 }, { "epoch": 1.2784, "grad_norm": 0.3907726779123967, "learning_rate": 1.287906677209403e-05, "loss": 0.0084, "step": 799 }, { "epoch": 1.28, "grad_norm": 0.3629313864103953, "learning_rate": 1.2862513676307009e-05, "loss": 0.0099, "step": 800 }, { "epoch": 1.2816, "grad_norm": 0.2278749102527171, "learning_rate": 1.2845952032619651e-05, "loss": 0.0082, "step": 801 }, { "epoch": 1.2832, "grad_norm": 0.32473684811473386, "learning_rate": 1.2829381890487536e-05, "loss": 0.0072, "step": 802 }, { "epoch": 1.2848, "grad_norm": 0.3133256938184833, "learning_rate": 1.2812803299391629e-05, "loss": 0.0082, "step": 803 }, { "epoch": 1.2864, "grad_norm": 0.24712866401984404, "learning_rate": 1.2796216308838116e-05, "loss": 0.009, "step": 804 }, { "epoch": 1.288, "grad_norm": 0.329561597917042, "learning_rate": 1.2779620968358276e-05, "loss": 0.0099, "step": 805 }, { "epoch": 1.2896, "grad_norm": 0.3674812322915868, "learning_rate": 1.2763017327508304e-05, "loss": 0.0086, "step": 806 }, { "epoch": 1.2912, "grad_norm": 0.5298508650102428, "learning_rate": 1.2746405435869198e-05, "loss": 0.0096, "step": 807 }, { "epoch": 1.2928, "grad_norm": 0.36287308162787724, "learning_rate": 1.2729785343046587e-05, "loss": 0.0104, "step": 808 }, { "epoch": 1.2944, "grad_norm": 0.4298281989328197, "learning_rate": 1.271315709867059e-05, "loss": 0.012, "step": 809 }, { "epoch": 1.296, "grad_norm": 0.35227509950172525, "learning_rate": 1.2696520752395671e-05, "loss": 0.0149, "step": 810 }, { "epoch": 1.2976, "grad_norm": 0.29161432482983723, "learning_rate": 1.2679876353900482e-05, "loss": 0.0069, "step": 811 }, { "epoch": 1.2992, "grad_norm": 0.39437309067730203, "learning_rate": 1.2663223952887724e-05, "loss": 0.0092, "step": 812 }, { "epoch": 1.3008, "grad_norm": 0.44244490142285775, "learning_rate": 1.2646563599083997e-05, "loss": 0.0129, "step": 813 }, { "epoch": 1.3024, "grad_norm": 0.25192147580518404, "learning_rate": 1.2629895342239643e-05, "loss": 0.0083, "step": 814 }, { "epoch": 1.304, "grad_norm": 0.3250667811497033, "learning_rate": 1.2613219232128608e-05, "loss": 0.0068, "step": 815 }, { "epoch": 1.3056, "grad_norm": 0.23326374829818386, "learning_rate": 1.2596535318548288e-05, "loss": 0.0088, "step": 816 }, { "epoch": 1.3072, "grad_norm": 0.32973641631747075, "learning_rate": 1.2579843651319382e-05, "loss": 0.0112, "step": 817 }, { "epoch": 1.3088, "grad_norm": 0.5578728814053906, "learning_rate": 1.2563144280285742e-05, "loss": 0.0102, "step": 818 }, { "epoch": 1.3104, "grad_norm": 0.30049060516878323, "learning_rate": 1.2546437255314223e-05, "loss": 0.0081, "step": 819 }, { "epoch": 1.312, "grad_norm": 0.5041765014830494, "learning_rate": 1.252972262629454e-05, "loss": 0.0125, "step": 820 }, { "epoch": 1.3136, "grad_norm": 0.2746317455129088, "learning_rate": 1.2513000443139112e-05, "loss": 0.0079, "step": 821 }, { "epoch": 1.3152, "grad_norm": 0.22655587379623343, "learning_rate": 1.2496270755782913e-05, "loss": 0.0061, "step": 822 }, { "epoch": 1.3168, "grad_norm": 0.43052405453673703, "learning_rate": 1.2479533614183334e-05, "loss": 0.012, "step": 823 }, { "epoch": 1.3184, "grad_norm": 0.5377323132322557, "learning_rate": 1.2462789068320016e-05, "loss": 0.0142, "step": 824 }, { "epoch": 1.32, "grad_norm": 0.1647183059052667, "learning_rate": 1.2446037168194716e-05, "loss": 0.0062, "step": 825 }, { "epoch": 1.3216, "grad_norm": 0.32227842625423486, "learning_rate": 1.2429277963831147e-05, "loss": 0.0075, "step": 826 }, { "epoch": 1.3232, "grad_norm": 0.37578055706478664, "learning_rate": 1.2412511505274845e-05, "loss": 0.0097, "step": 827 }, { "epoch": 1.3248, "grad_norm": 0.34278180839303224, "learning_rate": 1.2395737842592997e-05, "loss": 0.0092, "step": 828 }, { "epoch": 1.3264, "grad_norm": 0.44306859364790885, "learning_rate": 1.23789570258743e-05, "loss": 0.0079, "step": 829 }, { "epoch": 1.328, "grad_norm": 0.24536713994929546, "learning_rate": 1.2362169105228828e-05, "loss": 0.0066, "step": 830 }, { "epoch": 1.3296000000000001, "grad_norm": 0.37085850581722773, "learning_rate": 1.2345374130787855e-05, "loss": 0.0104, "step": 831 }, { "epoch": 1.3312, "grad_norm": 0.37085850581722773, "learning_rate": 1.2345374130787855e-05, "loss": 0.0066, "step": 832 }, { "epoch": 1.3328, "grad_norm": 0.30848218442545855, "learning_rate": 1.2328572152703726e-05, "loss": 0.0103, "step": 833 }, { "epoch": 1.3344, "grad_norm": 0.3627469324704478, "learning_rate": 1.23117632211497e-05, "loss": 0.0099, "step": 834 }, { "epoch": 1.336, "grad_norm": 0.3160131608223671, "learning_rate": 1.2294947386319793e-05, "loss": 0.0122, "step": 835 }, { "epoch": 1.3376000000000001, "grad_norm": 0.2648370818046249, "learning_rate": 1.2278124698428643e-05, "loss": 0.0085, "step": 836 }, { "epoch": 1.3392, "grad_norm": 0.22951803230834383, "learning_rate": 1.2261295207711347e-05, "loss": 0.0082, "step": 837 }, { "epoch": 1.3408, "grad_norm": 0.2807748471689655, "learning_rate": 1.2244458964423328e-05, "loss": 0.0064, "step": 838 }, { "epoch": 1.3424, "grad_norm": 0.424777188861044, "learning_rate": 1.2227616018840154e-05, "loss": 0.0105, "step": 839 }, { "epoch": 1.3439999999999999, "grad_norm": 0.4438776391511739, "learning_rate": 1.221076642125742e-05, "loss": 0.0106, "step": 840 }, { "epoch": 1.3456000000000001, "grad_norm": 0.4737141697531989, "learning_rate": 1.2193910221990582e-05, "loss": 0.0091, "step": 841 }, { "epoch": 1.3472, "grad_norm": 0.22483995906080545, "learning_rate": 1.2177047471374808e-05, "loss": 0.0066, "step": 842 }, { "epoch": 1.3488, "grad_norm": 0.2966432665008213, "learning_rate": 1.2160178219764838e-05, "loss": 0.006, "step": 843 }, { "epoch": 1.3504, "grad_norm": 0.2922693904592179, "learning_rate": 1.214330251753481e-05, "loss": 0.0096, "step": 844 }, { "epoch": 1.3519999999999999, "grad_norm": 0.3977189742221552, "learning_rate": 1.2126420415078133e-05, "loss": 0.0118, "step": 845 }, { "epoch": 1.3536000000000001, "grad_norm": 0.3505353141532119, "learning_rate": 1.2109531962807333e-05, "loss": 0.0076, "step": 846 }, { "epoch": 1.3552, "grad_norm": 0.23791890889957615, "learning_rate": 1.2092637211153885e-05, "loss": 0.0061, "step": 847 }, { "epoch": 1.3568, "grad_norm": 0.3527591437661224, "learning_rate": 1.207573621056809e-05, "loss": 0.0079, "step": 848 }, { "epoch": 1.3584, "grad_norm": 0.40075379763225033, "learning_rate": 1.2058829011518896e-05, "loss": 0.0083, "step": 849 }, { "epoch": 1.3599999999999999, "grad_norm": 0.29395282358595143, "learning_rate": 1.2041915664493763e-05, "loss": 0.0093, "step": 850 }, { "epoch": 1.3616, "grad_norm": 0.38626848636739086, "learning_rate": 1.2024996219998517e-05, "loss": 0.0089, "step": 851 }, { "epoch": 1.3632, "grad_norm": 0.362595249446703, "learning_rate": 1.2008070728557186e-05, "loss": 0.0112, "step": 852 }, { "epoch": 1.3648, "grad_norm": 0.34489646540213964, "learning_rate": 1.1991139240711857e-05, "loss": 0.0067, "step": 853 }, { "epoch": 1.3664, "grad_norm": 0.20804277398188623, "learning_rate": 1.1974201807022525e-05, "loss": 0.0053, "step": 854 }, { "epoch": 1.3679999999999999, "grad_norm": 0.629465566724322, "learning_rate": 1.195725847806693e-05, "loss": 0.0122, "step": 855 }, { "epoch": 1.3696, "grad_norm": 0.29739632175228264, "learning_rate": 1.1940309304440434e-05, "loss": 0.0057, "step": 856 }, { "epoch": 1.3712, "grad_norm": 0.36866457031721195, "learning_rate": 1.1923354336755835e-05, "loss": 0.0099, "step": 857 }, { "epoch": 1.3728, "grad_norm": 0.2528063106828151, "learning_rate": 1.1906393625643244e-05, "loss": 0.0068, "step": 858 }, { "epoch": 1.3744, "grad_norm": 0.5200816773004503, "learning_rate": 1.1889427221749916e-05, "loss": 0.0085, "step": 859 }, { "epoch": 1.376, "grad_norm": 0.4688297926594098, "learning_rate": 1.1872455175740111e-05, "loss": 0.0086, "step": 860 }, { "epoch": 1.3776, "grad_norm": 0.42639147595917204, "learning_rate": 1.1855477538294934e-05, "loss": 0.0115, "step": 861 }, { "epoch": 1.3792, "grad_norm": 0.36162926831552683, "learning_rate": 1.1838494360112185e-05, "loss": 0.0075, "step": 862 }, { "epoch": 1.3808, "grad_norm": 0.377285170693332, "learning_rate": 1.1821505691906216e-05, "loss": 0.008, "step": 863 }, { "epoch": 1.3824, "grad_norm": 0.26461382490495844, "learning_rate": 1.1804511584407763e-05, "loss": 0.0057, "step": 864 }, { "epoch": 1.384, "grad_norm": 0.5373846258454587, "learning_rate": 1.1787512088363817e-05, "loss": 0.01, "step": 865 }, { "epoch": 1.3856, "grad_norm": 0.2500826668207825, "learning_rate": 1.1770507254537454e-05, "loss": 0.006, "step": 866 }, { "epoch": 1.3872, "grad_norm": 0.3530484206619875, "learning_rate": 1.1753497133707678e-05, "loss": 0.0076, "step": 867 }, { "epoch": 1.3888, "grad_norm": 0.580708563505262, "learning_rate": 1.1736481776669307e-05, "loss": 0.0077, "step": 868 }, { "epoch": 1.3904, "grad_norm": 0.39933089144151734, "learning_rate": 1.1719461234232765e-05, "loss": 0.0096, "step": 869 }, { "epoch": 1.392, "grad_norm": 0.418183007425909, "learning_rate": 1.1702435557223988e-05, "loss": 0.0083, "step": 870 }, { "epoch": 1.3936, "grad_norm": 0.36181371076167196, "learning_rate": 1.1685404796484226e-05, "loss": 0.0077, "step": 871 }, { "epoch": 1.3952, "grad_norm": 0.5490040008065915, "learning_rate": 1.1668369002869912e-05, "loss": 0.01, "step": 872 }, { "epoch": 1.3968, "grad_norm": 0.5014599138442379, "learning_rate": 1.1651328227252516e-05, "loss": 0.0123, "step": 873 }, { "epoch": 1.3984, "grad_norm": 0.35423008795715893, "learning_rate": 1.1634282520518382e-05, "loss": 0.0089, "step": 874 }, { "epoch": 1.4, "grad_norm": 0.2803589900876237, "learning_rate": 1.1617231933568579e-05, "loss": 0.0067, "step": 875 }, { "epoch": 1.4016, "grad_norm": 0.4036687277196133, "learning_rate": 1.1600176517318742e-05, "loss": 0.0091, "step": 876 }, { "epoch": 1.4032, "grad_norm": 0.5007461160934455, "learning_rate": 1.1583116322698936e-05, "loss": 0.0095, "step": 877 }, { "epoch": 1.4048, "grad_norm": 0.38342918619428096, "learning_rate": 1.1566051400653486e-05, "loss": 0.0129, "step": 878 }, { "epoch": 1.4064, "grad_norm": 0.305523477144802, "learning_rate": 1.1548981802140849e-05, "loss": 0.0082, "step": 879 }, { "epoch": 1.408, "grad_norm": 0.34758149839035163, "learning_rate": 1.153190757813343e-05, "loss": 0.01, "step": 880 }, { "epoch": 1.4096, "grad_norm": 0.29249731470991963, "learning_rate": 1.151482877961746e-05, "loss": 0.0074, "step": 881 }, { "epoch": 1.4112, "grad_norm": 0.26005852708222554, "learning_rate": 1.1497745457592817e-05, "loss": 0.0068, "step": 882 }, { "epoch": 1.4128, "grad_norm": 0.37864495517615604, "learning_rate": 1.1480657663072896e-05, "loss": 0.0095, "step": 883 }, { "epoch": 1.4144, "grad_norm": 0.26262297208151797, "learning_rate": 1.1463565447084446e-05, "loss": 0.0053, "step": 884 }, { "epoch": 1.416, "grad_norm": 0.457891954315086, "learning_rate": 1.1446468860667422e-05, "loss": 0.0106, "step": 885 }, { "epoch": 1.4176, "grad_norm": 0.3154968911620306, "learning_rate": 1.142936795487482e-05, "loss": 0.0091, "step": 886 }, { "epoch": 1.4192, "grad_norm": 0.34682416254437193, "learning_rate": 1.141226278077254e-05, "loss": 0.0076, "step": 887 }, { "epoch": 1.4208, "grad_norm": 0.2687387433904091, "learning_rate": 1.1395153389439232e-05, "loss": 0.0095, "step": 888 }, { "epoch": 1.4224, "grad_norm": 0.29596337803682293, "learning_rate": 1.1378039831966134e-05, "loss": 0.007, "step": 889 }, { "epoch": 1.424, "grad_norm": 0.36130970492428094, "learning_rate": 1.1360922159456929e-05, "loss": 0.0079, "step": 890 }, { "epoch": 1.4256, "grad_norm": 0.31624384251425164, "learning_rate": 1.1343800423027583e-05, "loss": 0.0118, "step": 891 }, { "epoch": 1.4272, "grad_norm": 0.36534736351875546, "learning_rate": 1.1326674673806195e-05, "loss": 0.0128, "step": 892 }, { "epoch": 1.4288, "grad_norm": 0.2851068872408835, "learning_rate": 1.1309544962932861e-05, "loss": 0.0073, "step": 893 }, { "epoch": 1.4304000000000001, "grad_norm": 0.3145959788278407, "learning_rate": 1.129241134155949e-05, "loss": 0.0077, "step": 894 }, { "epoch": 1.432, "grad_norm": 0.29982859274561346, "learning_rate": 1.1275273860849684e-05, "loss": 0.0079, "step": 895 }, { "epoch": 1.4336, "grad_norm": 0.36762779188609057, "learning_rate": 1.1258132571978555e-05, "loss": 0.0099, "step": 896 }, { "epoch": 1.4352, "grad_norm": 0.3316833150262433, "learning_rate": 1.1240987526132595e-05, "loss": 0.0092, "step": 897 }, { "epoch": 1.4368, "grad_norm": 0.3136355214354166, "learning_rate": 1.1223838774509515e-05, "loss": 0.0078, "step": 898 }, { "epoch": 1.4384000000000001, "grad_norm": 0.17982547977128643, "learning_rate": 1.1206686368318087e-05, "loss": 0.006, "step": 899 }, { "epoch": 1.44, "grad_norm": 0.33169527081978134, "learning_rate": 1.1189530358778005e-05, "loss": 0.0066, "step": 900 }, { "epoch": 1.4416, "grad_norm": 0.3896441580222428, "learning_rate": 1.1172370797119711e-05, "loss": 0.0078, "step": 901 }, { "epoch": 1.4432, "grad_norm": 0.5813066018410238, "learning_rate": 1.1155207734584264e-05, "loss": 0.0102, "step": 902 }, { "epoch": 1.4447999999999999, "grad_norm": 0.4291550297920064, "learning_rate": 1.1138041222423177e-05, "loss": 0.0093, "step": 903 }, { "epoch": 1.4464000000000001, "grad_norm": 0.25388174158389226, "learning_rate": 1.1120871311898254e-05, "loss": 0.0084, "step": 904 }, { "epoch": 1.448, "grad_norm": 0.3056076978259771, "learning_rate": 1.110369805428146e-05, "loss": 0.0072, "step": 905 }, { "epoch": 1.4496, "grad_norm": 0.3181572825323978, "learning_rate": 1.1086521500854746e-05, "loss": 0.008, "step": 906 }, { "epoch": 1.4512, "grad_norm": 0.3642763841578639, "learning_rate": 1.106934170290991e-05, "loss": 0.0085, "step": 907 }, { "epoch": 1.4527999999999999, "grad_norm": 0.2851410982407962, "learning_rate": 1.1052158711748435e-05, "loss": 0.0082, "step": 908 }, { "epoch": 1.4544000000000001, "grad_norm": 0.3791614811930034, "learning_rate": 1.1034972578681338e-05, "loss": 0.0119, "step": 909 }, { "epoch": 1.456, "grad_norm": 0.3177905578295756, "learning_rate": 1.1017783355029027e-05, "loss": 0.0066, "step": 910 }, { "epoch": 1.4576, "grad_norm": 0.47774781874768774, "learning_rate": 1.1000591092121126e-05, "loss": 0.0113, "step": 911 }, { "epoch": 1.4592, "grad_norm": 0.3274200811499365, "learning_rate": 1.0983395841296349e-05, "loss": 0.0088, "step": 912 }, { "epoch": 1.4607999999999999, "grad_norm": 0.356859262010724, "learning_rate": 1.0966197653902319e-05, "loss": 0.0077, "step": 913 }, { "epoch": 1.4624, "grad_norm": 0.3265891416269014, "learning_rate": 1.0948996581295437e-05, "loss": 0.0096, "step": 914 }, { "epoch": 1.464, "grad_norm": 0.43926335025409535, "learning_rate": 1.0931792674840718e-05, "loss": 0.0083, "step": 915 }, { "epoch": 1.4656, "grad_norm": 0.46204903984552365, "learning_rate": 1.0914585985911632e-05, "loss": 0.0073, "step": 916 }, { "epoch": 1.4672, "grad_norm": 0.3547883499422264, "learning_rate": 1.0897376565889972e-05, "loss": 0.0089, "step": 917 }, { "epoch": 1.4687999999999999, "grad_norm": 0.31694588750563024, "learning_rate": 1.0880164466165675e-05, "loss": 0.006, "step": 918 }, { "epoch": 1.4704, "grad_norm": 0.3493040167389327, "learning_rate": 1.0862949738136682e-05, "loss": 0.01, "step": 919 }, { "epoch": 1.472, "grad_norm": 0.2912440674322754, "learning_rate": 1.084573243320878e-05, "loss": 0.0078, "step": 920 }, { "epoch": 1.4736, "grad_norm": 0.26374265472204816, "learning_rate": 1.0828512602795462e-05, "loss": 0.0072, "step": 921 }, { "epoch": 1.4752, "grad_norm": 0.5666887421121791, "learning_rate": 1.0811290298317755e-05, "loss": 0.0092, "step": 922 }, { "epoch": 1.4768, "grad_norm": 0.6282685860166906, "learning_rate": 1.0794065571204073e-05, "loss": 0.0118, "step": 923 }, { "epoch": 1.4784, "grad_norm": 0.3907357745744409, "learning_rate": 1.0776838472890065e-05, "loss": 0.0071, "step": 924 }, { "epoch": 1.48, "grad_norm": 0.24739569338370557, "learning_rate": 1.0759609054818459e-05, "loss": 0.0075, "step": 925 }, { "epoch": 1.4816, "grad_norm": 0.5111206065703442, "learning_rate": 1.0742377368438915e-05, "loss": 0.0085, "step": 926 }, { "epoch": 1.4832, "grad_norm": 0.37201673466090657, "learning_rate": 1.0725143465207868e-05, "loss": 0.0069, "step": 927 }, { "epoch": 1.4848, "grad_norm": 0.3401601865747001, "learning_rate": 1.0707907396588362e-05, "loss": 0.0075, "step": 928 }, { "epoch": 1.4864, "grad_norm": 0.28814263010972685, "learning_rate": 1.069066921404992e-05, "loss": 0.0076, "step": 929 }, { "epoch": 1.488, "grad_norm": 0.44674818094500446, "learning_rate": 1.0673428969068365e-05, "loss": 0.0081, "step": 930 }, { "epoch": 1.4896, "grad_norm": 0.39018171835308646, "learning_rate": 1.065618671312569e-05, "loss": 0.0079, "step": 931 }, { "epoch": 1.4912, "grad_norm": 0.425351217131543, "learning_rate": 1.063894249770989e-05, "loss": 0.0063, "step": 932 }, { "epoch": 1.4928, "grad_norm": 0.24044988708296725, "learning_rate": 1.0621696374314807e-05, "loss": 0.005, "step": 933 }, { "epoch": 1.4944, "grad_norm": 0.39605196566520534, "learning_rate": 1.0604448394439983e-05, "loss": 0.0082, "step": 934 }, { "epoch": 1.496, "grad_norm": 0.2975298766800573, "learning_rate": 1.0587198609590505e-05, "loss": 0.0053, "step": 935 }, { "epoch": 1.4976, "grad_norm": 0.1918028503772937, "learning_rate": 1.0569947071276847e-05, "loss": 0.0056, "step": 936 }, { "epoch": 1.4992, "grad_norm": 0.5225196763397442, "learning_rate": 1.0552693831014726e-05, "loss": 0.0073, "step": 937 }, { "epoch": 1.5008, "grad_norm": 0.4740052207484797, "learning_rate": 1.053543894032493e-05, "loss": 0.0086, "step": 938 }, { "epoch": 1.5024, "grad_norm": 0.38194720723505593, "learning_rate": 1.0518182450733185e-05, "loss": 0.0065, "step": 939 }, { "epoch": 1.504, "grad_norm": 0.5756849630159504, "learning_rate": 1.0500924413769988e-05, "loss": 0.0128, "step": 940 }, { "epoch": 1.5056, "grad_norm": 0.510189764918798, "learning_rate": 1.0483664880970456e-05, "loss": 0.008, "step": 941 }, { "epoch": 1.5072, "grad_norm": 0.3040717534673568, "learning_rate": 1.0466403903874176e-05, "loss": 0.0063, "step": 942 }, { "epoch": 1.5088, "grad_norm": 0.3804735334346367, "learning_rate": 1.0449141534025044e-05, "loss": 0.0085, "step": 943 }, { "epoch": 1.5104, "grad_norm": 0.3791484229367134, "learning_rate": 1.0431877822971118e-05, "loss": 0.0056, "step": 944 }, { "epoch": 1.512, "grad_norm": 0.5429868292228728, "learning_rate": 1.0414612822264457e-05, "loss": 0.0111, "step": 945 }, { "epoch": 1.5135999999999998, "grad_norm": 0.48848710775177834, "learning_rate": 1.0397346583460972e-05, "loss": 0.0141, "step": 946 }, { "epoch": 1.5152, "grad_norm": 0.41318570074900424, "learning_rate": 1.038007915812028e-05, "loss": 0.0078, "step": 947 }, { "epoch": 1.5168, "grad_norm": 0.2347124585806924, "learning_rate": 1.0362810597805526e-05, "loss": 0.0073, "step": 948 }, { "epoch": 1.5184, "grad_norm": 0.2584944658144249, "learning_rate": 1.034554095408326e-05, "loss": 0.0061, "step": 949 }, { "epoch": 1.52, "grad_norm": 0.3665948347221613, "learning_rate": 1.0328270278523256e-05, "loss": 0.0083, "step": 950 }, { "epoch": 1.5215999999999998, "grad_norm": 0.27856771161565724, "learning_rate": 1.031099862269837e-05, "loss": 0.0066, "step": 951 }, { "epoch": 1.5232, "grad_norm": 0.5144509420873353, "learning_rate": 1.0293726038184393e-05, "loss": 0.0097, "step": 952 }, { "epoch": 1.5248, "grad_norm": 0.44385552507763787, "learning_rate": 1.0276452576559878e-05, "loss": 0.0073, "step": 953 }, { "epoch": 1.5264, "grad_norm": 0.22407647544607326, "learning_rate": 1.0259178289406011e-05, "loss": 0.0061, "step": 954 }, { "epoch": 1.528, "grad_norm": 0.2805711214308104, "learning_rate": 1.024190322830643e-05, "loss": 0.0058, "step": 955 }, { "epoch": 1.5295999999999998, "grad_norm": 0.404703550361594, "learning_rate": 1.022462744484709e-05, "loss": 0.0076, "step": 956 }, { "epoch": 1.5312000000000001, "grad_norm": 0.2694424159798581, "learning_rate": 1.0207350990616107e-05, "loss": 0.0083, "step": 957 }, { "epoch": 1.5328, "grad_norm": 0.21143467606914215, "learning_rate": 1.019007391720359e-05, "loss": 0.0044, "step": 958 }, { "epoch": 1.5344, "grad_norm": 0.5338154828197106, "learning_rate": 1.0172796276201504e-05, "loss": 0.0083, "step": 959 }, { "epoch": 1.536, "grad_norm": 0.2364234305089969, "learning_rate": 1.0155518119203511e-05, "loss": 0.0062, "step": 960 }, { "epoch": 1.5375999999999999, "grad_norm": 0.4413487129601474, "learning_rate": 1.0138239497804804e-05, "loss": 0.0068, "step": 961 }, { "epoch": 1.5392000000000001, "grad_norm": 0.5534092282345944, "learning_rate": 1.0120960463601977e-05, "loss": 0.0124, "step": 962 }, { "epoch": 1.5408, "grad_norm": 0.17683301558606265, "learning_rate": 1.0103681068192845e-05, "loss": 0.0042, "step": 963 }, { "epoch": 1.5424, "grad_norm": 0.36698442517605695, "learning_rate": 1.0086401363176306e-05, "loss": 0.0065, "step": 964 }, { "epoch": 1.544, "grad_norm": 0.49359052658995684, "learning_rate": 1.0069121400152182e-05, "loss": 0.0105, "step": 965 }, { "epoch": 1.5455999999999999, "grad_norm": 0.3855375904953211, "learning_rate": 1.0051841230721065e-05, "loss": 0.0084, "step": 966 }, { "epoch": 1.5472000000000001, "grad_norm": 0.2950845647855933, "learning_rate": 1.0034560906484161e-05, "loss": 0.0048, "step": 967 }, { "epoch": 1.5488, "grad_norm": 0.8075864977538052, "learning_rate": 1.0017280479043148e-05, "loss": 0.0096, "step": 968 }, { "epoch": 1.5504, "grad_norm": 0.6355076338574505, "learning_rate": 1e-05, "loss": 0.0144, "step": 969 }, { "epoch": 1.552, "grad_norm": 0.3166150019684947, "learning_rate": 9.982719520956856e-06, "loss": 0.0095, "step": 970 }, { "epoch": 1.5535999999999999, "grad_norm": 0.5387976539923057, "learning_rate": 9.965439093515842e-06, "loss": 0.012, "step": 971 }, { "epoch": 1.5552000000000001, "grad_norm": 0.3613434605845047, "learning_rate": 9.948158769278939e-06, "loss": 0.0077, "step": 972 }, { "epoch": 1.5568, "grad_norm": 0.31729947652365154, "learning_rate": 9.930878599847822e-06, "loss": 0.0082, "step": 973 }, { "epoch": 1.5584, "grad_norm": 0.404159685281789, "learning_rate": 9.913598636823694e-06, "loss": 0.007, "step": 974 }, { "epoch": 1.56, "grad_norm": 0.38660573811621596, "learning_rate": 9.896318931807155e-06, "loss": 0.0079, "step": 975 }, { "epoch": 1.5615999999999999, "grad_norm": 0.3594232944435872, "learning_rate": 9.879039536398023e-06, "loss": 0.0069, "step": 976 }, { "epoch": 1.5632000000000001, "grad_norm": 0.40316798951088684, "learning_rate": 9.861760502195197e-06, "loss": 0.0075, "step": 977 }, { "epoch": 1.5648, "grad_norm": 0.27917997843188214, "learning_rate": 9.844481880796492e-06, "loss": 0.0066, "step": 978 }, { "epoch": 1.5664, "grad_norm": 0.2498848300811543, "learning_rate": 9.827203723798498e-06, "loss": 0.0071, "step": 979 }, { "epoch": 1.568, "grad_norm": 0.31125100047983123, "learning_rate": 9.809926082796415e-06, "loss": 0.0065, "step": 980 }, { "epoch": 1.5695999999999999, "grad_norm": 0.3937536406425979, "learning_rate": 9.7926490093839e-06, "loss": 0.0063, "step": 981 }, { "epoch": 1.5712000000000002, "grad_norm": 0.3458932138437726, "learning_rate": 9.775372555152912e-06, "loss": 0.0078, "step": 982 }, { "epoch": 1.5728, "grad_norm": 0.34898050713560863, "learning_rate": 9.758096771693574e-06, "loss": 0.0077, "step": 983 }, { "epoch": 1.5744, "grad_norm": 0.2970444107821782, "learning_rate": 9.740821710593989e-06, "loss": 0.0055, "step": 984 }, { "epoch": 1.576, "grad_norm": 0.15653928019538896, "learning_rate": 9.723547423440122e-06, "loss": 0.0051, "step": 985 }, { "epoch": 1.5776, "grad_norm": 0.28161861859072923, "learning_rate": 9.70627396181561e-06, "loss": 0.0064, "step": 986 }, { "epoch": 1.5792000000000002, "grad_norm": 0.4613769443728261, "learning_rate": 9.689001377301634e-06, "loss": 0.0091, "step": 987 }, { "epoch": 1.5808, "grad_norm": 0.4574576544358834, "learning_rate": 9.671729721476747e-06, "loss": 0.0099, "step": 988 }, { "epoch": 1.5824, "grad_norm": 0.33530952880607473, "learning_rate": 9.654459045916743e-06, "loss": 0.0078, "step": 989 }, { "epoch": 1.584, "grad_norm": 0.28242677597079807, "learning_rate": 9.637189402194477e-06, "loss": 0.009, "step": 990 }, { "epoch": 1.5856, "grad_norm": 0.19422839526194863, "learning_rate": 9.619920841879726e-06, "loss": 0.0056, "step": 991 }, { "epoch": 1.5872000000000002, "grad_norm": 0.3148931713956359, "learning_rate": 9.602653416539031e-06, "loss": 0.0049, "step": 992 }, { "epoch": 1.5888, "grad_norm": 0.399269780526929, "learning_rate": 9.585387177735548e-06, "loss": 0.0077, "step": 993 }, { "epoch": 1.5904, "grad_norm": 0.19831642838835764, "learning_rate": 9.568122177028884e-06, "loss": 0.0054, "step": 994 }, { "epoch": 1.592, "grad_norm": 0.31601962588213334, "learning_rate": 9.550858465974958e-06, "loss": 0.0052, "step": 995 }, { "epoch": 1.5936, "grad_norm": 0.2582794553323895, "learning_rate": 9.533596096125826e-06, "loss": 0.0048, "step": 996 }, { "epoch": 1.5952, "grad_norm": 0.3926592333535472, "learning_rate": 9.516335119029547e-06, "loss": 0.0094, "step": 997 }, { "epoch": 1.5968, "grad_norm": 0.4871992079292709, "learning_rate": 9.499075586230014e-06, "loss": 0.0091, "step": 998 }, { "epoch": 1.5984, "grad_norm": 0.3472080830513595, "learning_rate": 9.481817549266817e-06, "loss": 0.0089, "step": 999 }, { "epoch": 1.6, "grad_norm": 0.32112872839677564, "learning_rate": 9.464561059675073e-06, "loss": 0.0088, "step": 1000 }, { "epoch": 1.6016, "grad_norm": 0.3858214720607361, "learning_rate": 9.44730616898528e-06, "loss": 0.0091, "step": 1001 }, { "epoch": 1.6032, "grad_norm": 0.24614269707964617, "learning_rate": 9.430052928723153e-06, "loss": 0.0043, "step": 1002 }, { "epoch": 1.6048, "grad_norm": 0.4562912736776646, "learning_rate": 9.412801390409496e-06, "loss": 0.0056, "step": 1003 }, { "epoch": 1.6064, "grad_norm": 0.32457799252073727, "learning_rate": 9.395551605560018e-06, "loss": 0.0044, "step": 1004 }, { "epoch": 1.608, "grad_norm": 0.548586587859223, "learning_rate": 9.378303625685196e-06, "loss": 0.0078, "step": 1005 }, { "epoch": 1.6096, "grad_norm": 0.18743960053979214, "learning_rate": 9.361057502290112e-06, "loss": 0.0044, "step": 1006 }, { "epoch": 1.6112, "grad_norm": 0.2672443254835976, "learning_rate": 9.343813286874312e-06, "loss": 0.0047, "step": 1007 }, { "epoch": 1.6128, "grad_norm": 0.31131607843452186, "learning_rate": 9.326571030931636e-06, "loss": 0.0079, "step": 1008 }, { "epoch": 1.6143999999999998, "grad_norm": 0.344647486890118, "learning_rate": 9.309330785950086e-06, "loss": 0.0075, "step": 1009 }, { "epoch": 1.616, "grad_norm": 0.5556824055660701, "learning_rate": 9.292092603411642e-06, "loss": 0.0099, "step": 1010 }, { "epoch": 1.6176, "grad_norm": 0.4352851050232399, "learning_rate": 9.274856534792138e-06, "loss": 0.0063, "step": 1011 }, { "epoch": 1.6192, "grad_norm": 0.23656950577275335, "learning_rate": 9.257622631561085e-06, "loss": 0.0041, "step": 1012 }, { "epoch": 1.6208, "grad_norm": 0.29931781512890643, "learning_rate": 9.240390945181543e-06, "loss": 0.007, "step": 1013 }, { "epoch": 1.6223999999999998, "grad_norm": 0.2650588085569112, "learning_rate": 9.223161527109938e-06, "loss": 0.0051, "step": 1014 }, { "epoch": 1.624, "grad_norm": 0.2714204527615275, "learning_rate": 9.205934428795929e-06, "loss": 0.0073, "step": 1015 }, { "epoch": 1.6256, "grad_norm": 0.3800341050458012, "learning_rate": 9.188709701682246e-06, "loss": 0.0096, "step": 1016 }, { "epoch": 1.6272, "grad_norm": 0.3126363797153251, "learning_rate": 9.17148739720454e-06, "loss": 0.0051, "step": 1017 }, { "epoch": 1.6288, "grad_norm": 0.3386373119408399, "learning_rate": 9.154267566791224e-06, "loss": 0.0056, "step": 1018 }, { "epoch": 1.6303999999999998, "grad_norm": 0.45698087996637005, "learning_rate": 9.137050261863323e-06, "loss": 0.0089, "step": 1019 }, { "epoch": 1.6320000000000001, "grad_norm": 0.342596772259811, "learning_rate": 9.119835533834332e-06, "loss": 0.0059, "step": 1020 }, { "epoch": 1.6336, "grad_norm": 0.4333113205185105, "learning_rate": 9.102623434110028e-06, "loss": 0.0081, "step": 1021 }, { "epoch": 1.6352, "grad_norm": 0.330647644777723, "learning_rate": 9.085414014088368e-06, "loss": 0.0081, "step": 1022 }, { "epoch": 1.6368, "grad_norm": 0.3686712495111263, "learning_rate": 9.068207325159285e-06, "loss": 0.0069, "step": 1023 }, { "epoch": 1.6383999999999999, "grad_norm": 0.42138138717299284, "learning_rate": 9.051003418704566e-06, "loss": 0.0062, "step": 1024 }, { "epoch": 1.6400000000000001, "grad_norm": 0.3040381923635361, "learning_rate": 9.033802346097683e-06, "loss": 0.0062, "step": 1025 }, { "epoch": 1.6416, "grad_norm": 0.24752072470183697, "learning_rate": 9.016604158703654e-06, "loss": 0.0073, "step": 1026 }, { "epoch": 1.6432, "grad_norm": 0.40757468787887463, "learning_rate": 8.999408907878877e-06, "loss": 0.0061, "step": 1027 }, { "epoch": 1.6448, "grad_norm": 0.4507691465257879, "learning_rate": 8.982216644970978e-06, "loss": 0.0073, "step": 1028 }, { "epoch": 1.6463999999999999, "grad_norm": 0.2753376263149583, "learning_rate": 8.965027421318666e-06, "loss": 0.0053, "step": 1029 }, { "epoch": 1.6480000000000001, "grad_norm": 0.2693047821995353, "learning_rate": 8.947841288251568e-06, "loss": 0.0056, "step": 1030 }, { "epoch": 1.6496, "grad_norm": 0.2931505194078567, "learning_rate": 8.930658297090092e-06, "loss": 0.0077, "step": 1031 }, { "epoch": 1.6512, "grad_norm": 0.39489564425611573, "learning_rate": 8.913478499145255e-06, "loss": 0.0071, "step": 1032 }, { "epoch": 1.6528, "grad_norm": 0.31392814320075263, "learning_rate": 8.896301945718541e-06, "loss": 0.0054, "step": 1033 }, { "epoch": 1.6543999999999999, "grad_norm": 0.36157939324989835, "learning_rate": 8.879128688101749e-06, "loss": 0.0071, "step": 1034 }, { "epoch": 1.6560000000000001, "grad_norm": 0.5234260678164566, "learning_rate": 8.861958777576826e-06, "loss": 0.0056, "step": 1035 }, { "epoch": 1.6576, "grad_norm": 0.5320931361265643, "learning_rate": 8.844792265415738e-06, "loss": 0.0129, "step": 1036 }, { "epoch": 1.6592, "grad_norm": 0.3858984330716074, "learning_rate": 8.827629202880294e-06, "loss": 0.0074, "step": 1037 }, { "epoch": 1.6608, "grad_norm": 0.29373407740042495, "learning_rate": 8.810469641222001e-06, "loss": 0.0073, "step": 1038 }, { "epoch": 1.6623999999999999, "grad_norm": 0.37216888978843427, "learning_rate": 8.793313631681915e-06, "loss": 0.006, "step": 1039 }, { "epoch": 1.6640000000000001, "grad_norm": 0.3994723130190849, "learning_rate": 8.776161225490488e-06, "loss": 0.0071, "step": 1040 }, { "epoch": 1.6656, "grad_norm": 0.3248017020873982, "learning_rate": 8.759012473867407e-06, "loss": 0.0095, "step": 1041 }, { "epoch": 1.6672, "grad_norm": 0.4572658899164843, "learning_rate": 8.741867428021447e-06, "loss": 0.0089, "step": 1042 }, { "epoch": 1.6688, "grad_norm": 0.29580859378645913, "learning_rate": 8.72472613915032e-06, "loss": 0.0048, "step": 1043 }, { "epoch": 1.6703999999999999, "grad_norm": 0.20178460492639025, "learning_rate": 8.707588658440511e-06, "loss": 0.0047, "step": 1044 }, { "epoch": 1.6720000000000002, "grad_norm": 0.2449233525366233, "learning_rate": 8.690455037067142e-06, "loss": 0.0066, "step": 1045 }, { "epoch": 1.6736, "grad_norm": 0.2636928045909801, "learning_rate": 8.673325326193806e-06, "loss": 0.0057, "step": 1046 }, { "epoch": 1.6752, "grad_norm": 0.3635025762512276, "learning_rate": 8.656199576972424e-06, "loss": 0.0077, "step": 1047 }, { "epoch": 1.6768, "grad_norm": 0.4118276246432996, "learning_rate": 8.639077840543078e-06, "loss": 0.0085, "step": 1048 }, { "epoch": 1.6784, "grad_norm": 0.30512670055196606, "learning_rate": 8.621960168033868e-06, "loss": 0.0068, "step": 1049 }, { "epoch": 1.6800000000000002, "grad_norm": 0.38760281582430434, "learning_rate": 8.604846610560771e-06, "loss": 0.0069, "step": 1050 }, { "epoch": 1.6816, "grad_norm": 0.7300646784773884, "learning_rate": 8.587737219227462e-06, "loss": 0.0167, "step": 1051 }, { "epoch": 1.6832, "grad_norm": 0.3130205403934896, "learning_rate": 8.570632045125185e-06, "loss": 0.005, "step": 1052 }, { "epoch": 1.6848, "grad_norm": 0.35157751233453793, "learning_rate": 8.553531139332583e-06, "loss": 0.0052, "step": 1053 }, { "epoch": 1.6864, "grad_norm": 0.3671764642336765, "learning_rate": 8.536434552915555e-06, "loss": 0.0102, "step": 1054 }, { "epoch": 1.688, "grad_norm": 0.40735028939010787, "learning_rate": 8.519342336927106e-06, "loss": 0.0079, "step": 1055 }, { "epoch": 1.6896, "grad_norm": 0.20121176475177047, "learning_rate": 8.502254542407186e-06, "loss": 0.0041, "step": 1056 }, { "epoch": 1.6912, "grad_norm": 0.15124424016459737, "learning_rate": 8.485171220382545e-06, "loss": 0.0041, "step": 1057 }, { "epoch": 1.6928, "grad_norm": 1.221110816048998, "learning_rate": 8.468092421866575e-06, "loss": 0.0083, "step": 1058 }, { "epoch": 1.6944, "grad_norm": 0.32146297311032346, "learning_rate": 8.451018197859153e-06, "loss": 0.0065, "step": 1059 }, { "epoch": 1.696, "grad_norm": 0.19352312694489443, "learning_rate": 8.433948599346516e-06, "loss": 0.0047, "step": 1060 }, { "epoch": 1.6976, "grad_norm": 0.2887348899685335, "learning_rate": 8.41688367730107e-06, "loss": 0.0056, "step": 1061 }, { "epoch": 1.6992, "grad_norm": 0.25308885584787627, "learning_rate": 8.399823482681263e-06, "loss": 0.0055, "step": 1062 }, { "epoch": 1.7008, "grad_norm": 0.4536632873651025, "learning_rate": 8.382768066431427e-06, "loss": 0.0114, "step": 1063 }, { "epoch": 1.7024, "grad_norm": 0.3334783544636052, "learning_rate": 8.36571747948162e-06, "loss": 0.0077, "step": 1064 }, { "epoch": 1.704, "grad_norm": 0.5080509457816318, "learning_rate": 8.348671772747488e-06, "loss": 0.0064, "step": 1065 }, { "epoch": 1.7056, "grad_norm": 0.21655922542846942, "learning_rate": 8.331630997130091e-06, "loss": 0.0051, "step": 1066 }, { "epoch": 1.7072, "grad_norm": 0.32059563902166266, "learning_rate": 8.314595203515781e-06, "loss": 0.0065, "step": 1067 }, { "epoch": 1.7088, "grad_norm": 0.19953685775494243, "learning_rate": 8.297564442776014e-06, "loss": 0.005, "step": 1068 }, { "epoch": 1.7104, "grad_norm": 0.3299807282834272, "learning_rate": 8.280538765767236e-06, "loss": 0.0068, "step": 1069 }, { "epoch": 1.712, "grad_norm": 0.400352453437766, "learning_rate": 8.263518223330698e-06, "loss": 0.0062, "step": 1070 }, { "epoch": 1.7136, "grad_norm": 0.2856335747990602, "learning_rate": 8.246502866292324e-06, "loss": 0.0056, "step": 1071 }, { "epoch": 1.7151999999999998, "grad_norm": 0.2527366785060869, "learning_rate": 8.229492745462551e-06, "loss": 0.0072, "step": 1072 }, { "epoch": 1.7168, "grad_norm": 0.24010903888241203, "learning_rate": 8.212487911636185e-06, "loss": 0.0056, "step": 1073 }, { "epoch": 1.7184, "grad_norm": 0.23896142647226756, "learning_rate": 8.195488415592238e-06, "loss": 0.0044, "step": 1074 }, { "epoch": 1.72, "grad_norm": 0.44078832518865635, "learning_rate": 8.17849430809379e-06, "loss": 0.011, "step": 1075 }, { "epoch": 1.7216, "grad_norm": 0.2928645606679149, "learning_rate": 8.161505639887818e-06, "loss": 0.0051, "step": 1076 }, { "epoch": 1.7231999999999998, "grad_norm": 0.22448380268567095, "learning_rate": 8.144522461705067e-06, "loss": 0.0053, "step": 1077 }, { "epoch": 1.7248, "grad_norm": 0.3144154252736782, "learning_rate": 8.12754482425989e-06, "loss": 0.0045, "step": 1078 }, { "epoch": 1.7264, "grad_norm": 0.39472540262238637, "learning_rate": 8.110572778250086e-06, "loss": 0.0064, "step": 1079 }, { "epoch": 1.728, "grad_norm": 0.22917733716831562, "learning_rate": 8.09360637435676e-06, "loss": 0.0031, "step": 1080 }, { "epoch": 1.7296, "grad_norm": 0.3201761452865257, "learning_rate": 8.076645663244168e-06, "loss": 0.0071, "step": 1081 }, { "epoch": 1.7311999999999999, "grad_norm": 0.3584742117015553, "learning_rate": 8.05969069555957e-06, "loss": 0.0056, "step": 1082 }, { "epoch": 1.7328000000000001, "grad_norm": 0.7920760407669944, "learning_rate": 8.042741521933071e-06, "loss": 0.0083, "step": 1083 }, { "epoch": 1.7344, "grad_norm": 0.4012984458746515, "learning_rate": 8.025798192977482e-06, "loss": 0.0035, "step": 1084 }, { "epoch": 1.736, "grad_norm": 0.3498836585904627, "learning_rate": 8.008860759288148e-06, "loss": 0.0059, "step": 1085 }, { "epoch": 1.7376, "grad_norm": 0.23419600252095124, "learning_rate": 7.991929271442817e-06, "loss": 0.003, "step": 1086 }, { "epoch": 1.7391999999999999, "grad_norm": 0.15684536876271404, "learning_rate": 7.975003780001486e-06, "loss": 0.0031, "step": 1087 }, { "epoch": 1.7408000000000001, "grad_norm": 0.27088599771134847, "learning_rate": 7.958084335506239e-06, "loss": 0.0051, "step": 1088 }, { "epoch": 1.7424, "grad_norm": 0.2713339506776891, "learning_rate": 7.941170988481108e-06, "loss": 0.0033, "step": 1089 }, { "epoch": 1.744, "grad_norm": 0.444222794720414, "learning_rate": 7.924263789431913e-06, "loss": 0.0073, "step": 1090 }, { "epoch": 1.7456, "grad_norm": 0.3699148307486384, "learning_rate": 7.907362788846116e-06, "loss": 0.0057, "step": 1091 }, { "epoch": 1.7471999999999999, "grad_norm": 0.5512431022822822, "learning_rate": 7.89046803719267e-06, "loss": 0.0068, "step": 1092 }, { "epoch": 1.7488000000000001, "grad_norm": 0.2578013990883215, "learning_rate": 7.873579584921869e-06, "loss": 0.0041, "step": 1093 }, { "epoch": 1.7504, "grad_norm": 0.3321770414037301, "learning_rate": 7.856697482465195e-06, "loss": 0.0058, "step": 1094 }, { "epoch": 1.752, "grad_norm": 0.32220117094247036, "learning_rate": 7.839821780235168e-06, "loss": 0.0034, "step": 1095 }, { "epoch": 1.7536, "grad_norm": 0.6863037206004958, "learning_rate": 7.822952528625192e-06, "loss": 0.0088, "step": 1096 }, { "epoch": 1.7551999999999999, "grad_norm": 0.24408042299840918, "learning_rate": 7.806089778009421e-06, "loss": 0.0049, "step": 1097 }, { "epoch": 1.7568000000000001, "grad_norm": 0.5797822426576273, "learning_rate": 7.789233578742583e-06, "loss": 0.0089, "step": 1098 }, { "epoch": 1.7584, "grad_norm": 0.5290109447804685, "learning_rate": 7.77238398115985e-06, "loss": 0.0109, "step": 1099 }, { "epoch": 1.76, "grad_norm": 0.41951629898132015, "learning_rate": 7.755541035576677e-06, "loss": 0.0058, "step": 1100 }, { "epoch": 1.7616, "grad_norm": 0.4877789394560355, "learning_rate": 7.738704792288654e-06, "loss": 0.0059, "step": 1101 }, { "epoch": 1.7631999999999999, "grad_norm": 0.5335087112253809, "learning_rate": 7.721875301571359e-06, "loss": 0.0097, "step": 1102 }, { "epoch": 1.7648000000000001, "grad_norm": 0.47224661389922745, "learning_rate": 7.705052613680212e-06, "loss": 0.0118, "step": 1103 }, { "epoch": 1.7664, "grad_norm": 0.41273094189268117, "learning_rate": 7.688236778850307e-06, "loss": 0.0058, "step": 1104 }, { "epoch": 1.768, "grad_norm": 0.6097866252752729, "learning_rate": 7.671427847296274e-06, "loss": 0.0084, "step": 1105 }, { "epoch": 1.7696, "grad_norm": 0.39561876270305457, "learning_rate": 7.654625869212147e-06, "loss": 0.0069, "step": 1106 }, { "epoch": 1.7711999999999999, "grad_norm": 0.5717886178279439, "learning_rate": 7.637830894771176e-06, "loss": 0.0089, "step": 1107 }, { "epoch": 1.7728000000000002, "grad_norm": 0.5155788333858049, "learning_rate": 7.621042974125701e-06, "loss": 0.0058, "step": 1108 }, { "epoch": 1.7744, "grad_norm": 0.3477078857840802, "learning_rate": 7.604262157407008e-06, "loss": 0.0039, "step": 1109 }, { "epoch": 1.776, "grad_norm": 0.23435713399109273, "learning_rate": 7.587488494725157e-06, "loss": 0.0052, "step": 1110 }, { "epoch": 1.7776, "grad_norm": 0.5578341526611292, "learning_rate": 7.570722036168855e-06, "loss": 0.0098, "step": 1111 }, { "epoch": 1.7792, "grad_norm": 0.655412385837667, "learning_rate": 7.553962831805291e-06, "loss": 0.0085, "step": 1112 }, { "epoch": 1.7808000000000002, "grad_norm": 0.5802910339710066, "learning_rate": 7.537210931679988e-06, "loss": 0.0076, "step": 1113 }, { "epoch": 1.7824, "grad_norm": 0.6665626528983685, "learning_rate": 7.520466385816672e-06, "loss": 0.0087, "step": 1114 }, { "epoch": 1.784, "grad_norm": 0.33197254843442225, "learning_rate": 7.5037292442170865e-06, "loss": 0.0039, "step": 1115 }, { "epoch": 1.7856, "grad_norm": 0.5695030088373062, "learning_rate": 7.48699955686089e-06, "loss": 0.0098, "step": 1116 }, { "epoch": 1.7872, "grad_norm": 0.33307819834062863, "learning_rate": 7.470277373705461e-06, "loss": 0.0067, "step": 1117 }, { "epoch": 1.7888, "grad_norm": 0.47288221727470514, "learning_rate": 7.453562744685779e-06, "loss": 0.005, "step": 1118 }, { "epoch": 1.7904, "grad_norm": 0.3960654098825163, "learning_rate": 7.4368557197142596e-06, "loss": 0.0065, "step": 1119 }, { "epoch": 1.792, "grad_norm": 0.3449626116345571, "learning_rate": 7.420156348680621e-06, "loss": 0.0067, "step": 1120 }, { "epoch": 1.7936, "grad_norm": 0.4244578336020816, "learning_rate": 7.4034646814517155e-06, "loss": 0.0065, "step": 1121 }, { "epoch": 1.7952, "grad_norm": 0.4169819548917528, "learning_rate": 7.3867807678713965e-06, "loss": 0.0069, "step": 1122 }, { "epoch": 1.7968, "grad_norm": 0.32289014794153065, "learning_rate": 7.3701046577603605e-06, "loss": 0.0085, "step": 1123 }, { "epoch": 1.7984, "grad_norm": 0.472551606096138, "learning_rate": 7.353436400916006e-06, "loss": 0.0063, "step": 1124 }, { "epoch": 1.8, "grad_norm": 0.33829886594761394, "learning_rate": 7.336776047112277e-06, "loss": 0.0045, "step": 1125 }, { "epoch": 1.8016, "grad_norm": 0.479782119071177, "learning_rate": 7.32012364609952e-06, "loss": 0.0084, "step": 1126 }, { "epoch": 1.8032, "grad_norm": 0.2724659089240648, "learning_rate": 7.303479247604333e-06, "loss": 0.0036, "step": 1127 }, { "epoch": 1.8048, "grad_norm": 0.3327499456295506, "learning_rate": 7.286842901329413e-06, "loss": 0.0054, "step": 1128 }, { "epoch": 1.8064, "grad_norm": 0.33234081270148413, "learning_rate": 7.270214656953415e-06, "loss": 0.0047, "step": 1129 }, { "epoch": 1.808, "grad_norm": 0.5063193642113647, "learning_rate": 7.253594564130804e-06, "loss": 0.0067, "step": 1130 }, { "epoch": 1.8096, "grad_norm": 0.24443282410498932, "learning_rate": 7.236982672491699e-06, "loss": 0.0056, "step": 1131 }, { "epoch": 1.8112, "grad_norm": 0.47491223334505844, "learning_rate": 7.22037903164173e-06, "loss": 0.0056, "step": 1132 }, { "epoch": 1.8128, "grad_norm": 0.190968013316911, "learning_rate": 7.203783691161883e-06, "loss": 0.0043, "step": 1133 }, { "epoch": 1.8144, "grad_norm": 0.5665726247730363, "learning_rate": 7.187196700608373e-06, "loss": 0.0079, "step": 1134 }, { "epoch": 1.8159999999999998, "grad_norm": 0.3578735453323942, "learning_rate": 7.170618109512465e-06, "loss": 0.0074, "step": 1135 }, { "epoch": 1.8176, "grad_norm": 0.26292486166299694, "learning_rate": 7.154047967380353e-06, "loss": 0.0055, "step": 1136 }, { "epoch": 1.8192, "grad_norm": 0.3287946223540181, "learning_rate": 7.137486323692994e-06, "loss": 0.0045, "step": 1137 }, { "epoch": 1.8208, "grad_norm": 0.4821439032277905, "learning_rate": 7.120933227905971e-06, "loss": 0.0151, "step": 1138 }, { "epoch": 1.8224, "grad_norm": 0.34486469359537525, "learning_rate": 7.104388729449338e-06, "loss": 0.006, "step": 1139 }, { "epoch": 1.8239999999999998, "grad_norm": 0.2618247231236417, "learning_rate": 7.0878528777274814e-06, "loss": 0.0051, "step": 1140 }, { "epoch": 1.8256000000000001, "grad_norm": 0.4052418483681234, "learning_rate": 7.0713257221189635e-06, "loss": 0.0057, "step": 1141 }, { "epoch": 1.8272, "grad_norm": 0.45680366105962966, "learning_rate": 7.05480731197638e-06, "loss": 0.0053, "step": 1142 }, { "epoch": 1.8288, "grad_norm": 0.3486466160337022, "learning_rate": 7.0382976966262065e-06, "loss": 0.0061, "step": 1143 }, { "epoch": 1.8304, "grad_norm": 0.18148153052257054, "learning_rate": 7.021796925368667e-06, "loss": 0.0035, "step": 1144 }, { "epoch": 1.8319999999999999, "grad_norm": 0.30417352075414195, "learning_rate": 7.005305047477566e-06, "loss": 0.0073, "step": 1145 }, { "epoch": 1.8336000000000001, "grad_norm": 0.3316981633490078, "learning_rate": 6.988822112200157e-06, "loss": 0.0073, "step": 1146 }, { "epoch": 1.8352, "grad_norm": 0.3646192862545398, "learning_rate": 6.9723481687569836e-06, "loss": 0.0042, "step": 1147 }, { "epoch": 1.8368, "grad_norm": 0.6013002629062065, "learning_rate": 6.955883266341741e-06, "loss": 0.0042, "step": 1148 }, { "epoch": 1.8384, "grad_norm": 0.29652681955969035, "learning_rate": 6.939427454121128e-06, "loss": 0.0061, "step": 1149 }, { "epoch": 1.8399999999999999, "grad_norm": 0.34240071160797086, "learning_rate": 6.9229807812346985e-06, "loss": 0.0096, "step": 1150 }, { "epoch": 1.8416000000000001, "grad_norm": 0.17463564552842203, "learning_rate": 6.9065432967947145e-06, "loss": 0.004, "step": 1151 }, { "epoch": 1.8432, "grad_norm": 0.38200139192617427, "learning_rate": 6.890115049885995e-06, "loss": 0.0077, "step": 1152 }, { "epoch": 1.8448, "grad_norm": 0.4949151787689372, "learning_rate": 6.8736960895657854e-06, "loss": 0.0108, "step": 1153 }, { "epoch": 1.8464, "grad_norm": 0.27918929940467996, "learning_rate": 6.85728646486359e-06, "loss": 0.0042, "step": 1154 }, { "epoch": 1.8479999999999999, "grad_norm": 0.2761223100165974, "learning_rate": 6.840886224781039e-06, "loss": 0.0067, "step": 1155 }, { "epoch": 1.8496000000000001, "grad_norm": 0.3091314244102595, "learning_rate": 6.824495418291741e-06, "loss": 0.008, "step": 1156 }, { "epoch": 1.8512, "grad_norm": 0.4653825842474054, "learning_rate": 6.8081140943411296e-06, "loss": 0.0072, "step": 1157 }, { "epoch": 1.8528, "grad_norm": 0.31221517262585574, "learning_rate": 6.791742301846325e-06, "loss": 0.0059, "step": 1158 }, { "epoch": 1.8544, "grad_norm": 0.36726590327954306, "learning_rate": 6.775380089695986e-06, "loss": 0.0046, "step": 1159 }, { "epoch": 1.8559999999999999, "grad_norm": 0.3242158265475158, "learning_rate": 6.759027506750159e-06, "loss": 0.0064, "step": 1160 }, { "epoch": 1.8576000000000001, "grad_norm": 0.31247436508664383, "learning_rate": 6.742684601840142e-06, "loss": 0.0049, "step": 1161 }, { "epoch": 1.8592, "grad_norm": 0.20875497548440738, "learning_rate": 6.726351423768323e-06, "loss": 0.0027, "step": 1162 }, { "epoch": 1.8608, "grad_norm": 0.33207426145196084, "learning_rate": 6.710028021308061e-06, "loss": 0.0049, "step": 1163 }, { "epoch": 1.8624, "grad_norm": 0.40260970705133703, "learning_rate": 6.693714443203507e-06, "loss": 0.0047, "step": 1164 }, { "epoch": 1.8639999999999999, "grad_norm": 0.30987382333167335, "learning_rate": 6.677410738169485e-06, "loss": 0.0056, "step": 1165 }, { "epoch": 1.8656000000000001, "grad_norm": 0.4130657667581579, "learning_rate": 6.661116954891329e-06, "loss": 0.0055, "step": 1166 }, { "epoch": 1.8672, "grad_norm": 0.32794991888696495, "learning_rate": 6.644833142024752e-06, "loss": 0.0042, "step": 1167 }, { "epoch": 1.8688, "grad_norm": 0.522406636948986, "learning_rate": 6.62855934819569e-06, "loss": 0.0065, "step": 1168 }, { "epoch": 1.8704, "grad_norm": 0.3902449243177916, "learning_rate": 6.612295622000162e-06, "loss": 0.0088, "step": 1169 }, { "epoch": 1.8719999999999999, "grad_norm": 0.213505902046021, "learning_rate": 6.59604201200412e-06, "loss": 0.005, "step": 1170 }, { "epoch": 1.8736000000000002, "grad_norm": 0.2757910385921279, "learning_rate": 6.579798566743314e-06, "loss": 0.006, "step": 1171 }, { "epoch": 1.8752, "grad_norm": 0.2563207989287682, "learning_rate": 6.563565334723134e-06, "loss": 0.004, "step": 1172 }, { "epoch": 1.8768, "grad_norm": 0.3429964571294477, "learning_rate": 6.547342364418482e-06, "loss": 0.0047, "step": 1173 }, { "epoch": 1.8784, "grad_norm": 0.17895805114654723, "learning_rate": 6.5311297042736046e-06, "loss": 0.0031, "step": 1174 }, { "epoch": 1.88, "grad_norm": 0.3836987346654857, "learning_rate": 6.514927402701965e-06, "loss": 0.0063, "step": 1175 }, { "epoch": 1.8816000000000002, "grad_norm": 0.45059071335600803, "learning_rate": 6.498735508086094e-06, "loss": 0.008, "step": 1176 }, { "epoch": 1.8832, "grad_norm": 0.33214015016144555, "learning_rate": 6.482554068777451e-06, "loss": 0.0042, "step": 1177 }, { "epoch": 1.8848, "grad_norm": 0.4595121731217701, "learning_rate": 6.466383133096268e-06, "loss": 0.0063, "step": 1178 }, { "epoch": 1.8864, "grad_norm": 0.42499357639905216, "learning_rate": 6.450222749331414e-06, "loss": 0.0063, "step": 1179 }, { "epoch": 1.888, "grad_norm": 0.3630709788454369, "learning_rate": 6.4340729657402424e-06, "loss": 0.0056, "step": 1180 }, { "epoch": 1.8896, "grad_norm": 0.32849173455121033, "learning_rate": 6.4179338305484675e-06, "loss": 0.0042, "step": 1181 }, { "epoch": 1.8912, "grad_norm": 0.47697285205391693, "learning_rate": 6.40180539194999e-06, "loss": 0.0053, "step": 1182 }, { "epoch": 1.8928, "grad_norm": 0.214089479958329, "learning_rate": 6.385687698106781e-06, "loss": 0.004, "step": 1183 }, { "epoch": 1.8944, "grad_norm": 0.18666853565973923, "learning_rate": 6.3695807971487175e-06, "loss": 0.0044, "step": 1184 }, { "epoch": 1.896, "grad_norm": 0.37642768242623026, "learning_rate": 6.35348473717345e-06, "loss": 0.0063, "step": 1185 }, { "epoch": 1.8976, "grad_norm": 0.27832937638872407, "learning_rate": 6.337399566246257e-06, "loss": 0.0073, "step": 1186 }, { "epoch": 1.8992, "grad_norm": 0.3143922382058477, "learning_rate": 6.321325332399904e-06, "loss": 0.0034, "step": 1187 }, { "epoch": 1.9008, "grad_norm": 0.33682375227488676, "learning_rate": 6.305262083634488e-06, "loss": 0.0039, "step": 1188 }, { "epoch": 1.9024, "grad_norm": 0.4044456913665351, "learning_rate": 6.289209867917312e-06, "loss": 0.0059, "step": 1189 }, { "epoch": 1.904, "grad_norm": 0.6500081303210257, "learning_rate": 6.2731687331827214e-06, "loss": 0.0068, "step": 1190 }, { "epoch": 1.9056, "grad_norm": 0.5839569583167884, "learning_rate": 6.2571387273319905e-06, "loss": 0.0072, "step": 1191 }, { "epoch": 1.9072, "grad_norm": 0.29813248946273585, "learning_rate": 6.2411198982331435e-06, "loss": 0.0059, "step": 1192 }, { "epoch": 1.9088, "grad_norm": 0.395552190000539, "learning_rate": 6.225112293720836e-06, "loss": 0.0056, "step": 1193 }, { "epoch": 1.9104, "grad_norm": 0.28649578761298766, "learning_rate": 6.209115961596208e-06, "loss": 0.0032, "step": 1194 }, { "epoch": 1.912, "grad_norm": 0.33399463181181227, "learning_rate": 6.193130949626731e-06, "loss": 0.0076, "step": 1195 }, { "epoch": 1.9136, "grad_norm": 0.31191182808084145, "learning_rate": 6.177157305546077e-06, "loss": 0.006, "step": 1196 }, { "epoch": 1.9152, "grad_norm": 0.28527792964216175, "learning_rate": 6.1611950770539766e-06, "loss": 0.0046, "step": 1197 }, { "epoch": 1.9167999999999998, "grad_norm": 0.4323818678849387, "learning_rate": 6.145244311816063e-06, "loss": 0.0155, "step": 1198 }, { "epoch": 1.9184, "grad_norm": 0.26984275027804744, "learning_rate": 6.129305057463741e-06, "loss": 0.0039, "step": 1199 }, { "epoch": 1.92, "grad_norm": 0.3258817250599572, "learning_rate": 6.113377361594048e-06, "loss": 0.0066, "step": 1200 }, { "epoch": 1.9216, "grad_norm": 0.40964896634709624, "learning_rate": 6.0974612717695e-06, "loss": 0.0054, "step": 1201 }, { "epoch": 1.9232, "grad_norm": 0.36917883995698975, "learning_rate": 6.081556835517955e-06, "loss": 0.0052, "step": 1202 }, { "epoch": 1.9247999999999998, "grad_norm": 0.2358165396682365, "learning_rate": 6.065664100332478e-06, "loss": 0.0037, "step": 1203 }, { "epoch": 1.9264000000000001, "grad_norm": 0.36764085459176077, "learning_rate": 6.049783113671184e-06, "loss": 0.0043, "step": 1204 }, { "epoch": 1.928, "grad_norm": 0.41252331455261154, "learning_rate": 6.033913922957112e-06, "loss": 0.0043, "step": 1205 }, { "epoch": 1.9296, "grad_norm": 0.18441031104306468, "learning_rate": 6.018056575578075e-06, "loss": 0.0033, "step": 1206 }, { "epoch": 1.9312, "grad_norm": 0.22195801965442705, "learning_rate": 6.002211118886514e-06, "loss": 0.0027, "step": 1207 }, { "epoch": 1.9327999999999999, "grad_norm": 0.4037529762288358, "learning_rate": 5.986377600199371e-06, "loss": 0.0064, "step": 1208 }, { "epoch": 1.9344000000000001, "grad_norm": 0.3270423790951842, "learning_rate": 5.970556066797941e-06, "loss": 0.0055, "step": 1209 }, { "epoch": 1.936, "grad_norm": 0.3849544612859768, "learning_rate": 5.9547465659277215e-06, "loss": 0.005, "step": 1210 }, { "epoch": 1.9376, "grad_norm": 0.19885859640069262, "learning_rate": 5.93894914479828e-06, "loss": 0.004, "step": 1211 }, { "epoch": 1.9392, "grad_norm": 0.2511154473625223, "learning_rate": 5.923163850583114e-06, "loss": 0.0032, "step": 1212 }, { "epoch": 1.9407999999999999, "grad_norm": 0.3459117674365608, "learning_rate": 5.907390730419506e-06, "loss": 0.0065, "step": 1213 }, { "epoch": 1.9424000000000001, "grad_norm": 0.3154974281342526, "learning_rate": 5.891629831408392e-06, "loss": 0.0037, "step": 1214 }, { "epoch": 1.944, "grad_norm": 0.35432322451329235, "learning_rate": 5.875881200614208e-06, "loss": 0.0057, "step": 1215 }, { "epoch": 1.9456, "grad_norm": 0.27041268208410507, "learning_rate": 5.8601448850647515e-06, "loss": 0.0047, "step": 1216 }, { "epoch": 1.9472, "grad_norm": 0.571194663835566, "learning_rate": 5.8444209317510515e-06, "loss": 0.0061, "step": 1217 }, { "epoch": 1.9487999999999999, "grad_norm": 0.6282359016961699, "learning_rate": 5.828709387627219e-06, "loss": 0.0127, "step": 1218 }, { "epoch": 1.9504000000000001, "grad_norm": 0.34402025436940703, "learning_rate": 5.813010299610313e-06, "loss": 0.0084, "step": 1219 }, { "epoch": 1.952, "grad_norm": 0.3480649745220345, "learning_rate": 5.797323714580192e-06, "loss": 0.0039, "step": 1220 }, { "epoch": 1.9536, "grad_norm": 0.4691213849698909, "learning_rate": 5.781649679379379e-06, "loss": 0.0059, "step": 1221 }, { "epoch": 1.9552, "grad_norm": 0.21565957592354734, "learning_rate": 5.7659882408129204e-06, "loss": 0.0038, "step": 1222 }, { "epoch": 1.9567999999999999, "grad_norm": 0.44118541711284726, "learning_rate": 5.750339445648252e-06, "loss": 0.0094, "step": 1223 }, { "epoch": 1.9584000000000001, "grad_norm": 0.4957908051911234, "learning_rate": 5.7347033406150494e-06, "loss": 0.0072, "step": 1224 }, { "epoch": 1.96, "grad_norm": 0.2937561406067032, "learning_rate": 5.7190799724050924e-06, "loss": 0.0038, "step": 1225 }, { "epoch": 1.9616, "grad_norm": 0.7194650498636356, "learning_rate": 5.703469387672138e-06, "loss": 0.0071, "step": 1226 }, { "epoch": 1.9632, "grad_norm": 0.4604277229426392, "learning_rate": 5.687871633031754e-06, "loss": 0.0062, "step": 1227 }, { "epoch": 1.9647999999999999, "grad_norm": 0.5455836140271865, "learning_rate": 5.672286755061212e-06, "loss": 0.0087, "step": 1228 }, { "epoch": 1.9664000000000001, "grad_norm": 0.4672016066833358, "learning_rate": 5.656714800299317e-06, "loss": 0.0072, "step": 1229 }, { "epoch": 1.968, "grad_norm": 0.21241233709878102, "learning_rate": 5.64115581524629e-06, "loss": 0.0053, "step": 1230 }, { "epoch": 1.9696, "grad_norm": 0.34005297435351134, "learning_rate": 5.625609846363622e-06, "loss": 0.003, "step": 1231 }, { "epoch": 1.9712, "grad_norm": 0.42052160854254317, "learning_rate": 5.610076940073939e-06, "loss": 0.0069, "step": 1232 }, { "epoch": 1.9727999999999999, "grad_norm": 0.3944203490898746, "learning_rate": 5.594557142760853e-06, "loss": 0.0048, "step": 1233 }, { "epoch": 1.9744000000000002, "grad_norm": 0.21058513543549798, "learning_rate": 5.579050500768837e-06, "loss": 0.0047, "step": 1234 }, { "epoch": 1.976, "grad_norm": 0.26725426932475405, "learning_rate": 5.563557060403071e-06, "loss": 0.0037, "step": 1235 }, { "epoch": 1.9776, "grad_norm": 0.33849781411866225, "learning_rate": 5.548076867929331e-06, "loss": 0.0067, "step": 1236 }, { "epoch": 1.9792, "grad_norm": 0.29629666107985536, "learning_rate": 5.53260996957381e-06, "loss": 0.0062, "step": 1237 }, { "epoch": 1.9808, "grad_norm": 0.42531169332494806, "learning_rate": 5.517156411523026e-06, "loss": 0.0065, "step": 1238 }, { "epoch": 1.9824000000000002, "grad_norm": 0.4259366912829729, "learning_rate": 5.501716239923642e-06, "loss": 0.0072, "step": 1239 }, { "epoch": 1.984, "grad_norm": 0.6024658910530383, "learning_rate": 5.486289500882355e-06, "loss": 0.0132, "step": 1240 }, { "epoch": 1.9856, "grad_norm": 0.30824009184805684, "learning_rate": 5.47087624046575e-06, "loss": 0.004, "step": 1241 }, { "epoch": 1.9872, "grad_norm": 0.22725954434890408, "learning_rate": 5.455476504700161e-06, "loss": 0.0042, "step": 1242 }, { "epoch": 1.9888, "grad_norm": 0.4197519865595487, "learning_rate": 5.440090339571537e-06, "loss": 0.0064, "step": 1243 }, { "epoch": 1.9904, "grad_norm": 0.44957031880341963, "learning_rate": 5.424717791025302e-06, "loss": 0.0078, "step": 1244 }, { "epoch": 1.992, "grad_norm": 0.35123516194158083, "learning_rate": 5.4093589049662175e-06, "loss": 0.0073, "step": 1245 }, { "epoch": 1.9936, "grad_norm": 0.282971493471759, "learning_rate": 5.3940137272582534e-06, "loss": 0.0052, "step": 1246 }, { "epoch": 1.9952, "grad_norm": 0.38016149664339494, "learning_rate": 5.378682303724435e-06, "loss": 0.0038, "step": 1247 }, { "epoch": 1.9968, "grad_norm": 0.5098739325476022, "learning_rate": 5.3633646801467255e-06, "loss": 0.007, "step": 1248 }, { "epoch": 1.9984, "grad_norm": 0.6140692262656893, "learning_rate": 5.348060902265871e-06, "loss": 0.0075, "step": 1249 }, { "epoch": 2.0, "grad_norm": 0.3501279716540246, "learning_rate": 5.332771015781275e-06, "loss": 0.0064, "step": 1250 }, { "epoch": 2.0016, "grad_norm": 0.27946530367808153, "learning_rate": 5.31749506635086e-06, "loss": 0.0064, "step": 1251 }, { "epoch": 2.0032, "grad_norm": 0.30440282450736217, "learning_rate": 5.302233099590928e-06, "loss": 0.0046, "step": 1252 }, { "epoch": 2.0048, "grad_norm": 0.44397766833706465, "learning_rate": 5.286985161076029e-06, "loss": 0.0062, "step": 1253 }, { "epoch": 2.0064, "grad_norm": 0.478560493359205, "learning_rate": 5.271751296338823e-06, "loss": 0.0061, "step": 1254 }, { "epoch": 2.008, "grad_norm": 0.6615866501216844, "learning_rate": 5.2565315508699374e-06, "loss": 0.0063, "step": 1255 }, { "epoch": 2.0096, "grad_norm": 0.2802755002339857, "learning_rate": 5.241325970117851e-06, "loss": 0.0042, "step": 1256 }, { "epoch": 2.0112, "grad_norm": 0.4109368046645932, "learning_rate": 5.226134599488728e-06, "loss": 0.0053, "step": 1257 }, { "epoch": 2.0128, "grad_norm": 0.3481720552299007, "learning_rate": 5.210957484346314e-06, "loss": 0.0082, "step": 1258 }, { "epoch": 2.0144, "grad_norm": 0.3029754374591977, "learning_rate": 5.195794670011775e-06, "loss": 0.0062, "step": 1259 }, { "epoch": 2.016, "grad_norm": 0.5502723232435336, "learning_rate": 5.1806462017635775e-06, "loss": 0.0052, "step": 1260 }, { "epoch": 2.0176, "grad_norm": 0.330718899882051, "learning_rate": 5.165512124837344e-06, "loss": 0.007, "step": 1261 }, { "epoch": 2.0192, "grad_norm": 0.48764245936206446, "learning_rate": 5.150392484425728e-06, "loss": 0.0049, "step": 1262 }, { "epoch": 2.0208, "grad_norm": 0.3342124162357661, "learning_rate": 5.135287325678271e-06, "loss": 0.0066, "step": 1263 }, { "epoch": 2.0224, "grad_norm": 0.4202488105628078, "learning_rate": 5.120196693701267e-06, "loss": 0.0042, "step": 1264 }, { "epoch": 2.024, "grad_norm": 0.40577032886305797, "learning_rate": 5.105120633557634e-06, "loss": 0.0072, "step": 1265 }, { "epoch": 2.0256, "grad_norm": 0.26383658827264916, "learning_rate": 5.090059190266779e-06, "loss": 0.0036, "step": 1266 }, { "epoch": 2.0272, "grad_norm": 0.37091203157656494, "learning_rate": 5.075012408804458e-06, "loss": 0.0034, "step": 1267 }, { "epoch": 2.0288, "grad_norm": 0.353219243476177, "learning_rate": 5.059980334102637e-06, "loss": 0.0047, "step": 1268 }, { "epoch": 2.0304, "grad_norm": 0.23227630423010162, "learning_rate": 5.044963011049384e-06, "loss": 0.0025, "step": 1269 }, { "epoch": 2.032, "grad_norm": 0.2737992961702637, "learning_rate": 5.0299604844886985e-06, "loss": 0.0058, "step": 1270 }, { "epoch": 2.0336, "grad_norm": 0.49954303192035526, "learning_rate": 5.0149727992204034e-06, "loss": 0.0058, "step": 1271 }, { "epoch": 2.0352, "grad_norm": 0.34112852678238753, "learning_rate": 5.000000000000003e-06, "loss": 0.0068, "step": 1272 }, { "epoch": 2.0368, "grad_norm": 0.5429394939769813, "learning_rate": 4.985042131538545e-06, "loss": 0.008, "step": 1273 }, { "epoch": 2.0384, "grad_norm": 0.35273691884209735, "learning_rate": 4.970099238502494e-06, "loss": 0.0045, "step": 1274 }, { "epoch": 2.04, "grad_norm": 0.41262386203301665, "learning_rate": 4.955171365513603e-06, "loss": 0.0057, "step": 1275 }, { "epoch": 2.0416, "grad_norm": 0.261370128060724, "learning_rate": 4.940258557148765e-06, "loss": 0.0054, "step": 1276 }, { "epoch": 2.0432, "grad_norm": 0.627636888025991, "learning_rate": 4.925360857939886e-06, "loss": 0.0073, "step": 1277 }, { "epoch": 2.0448, "grad_norm": 0.2226146536129725, "learning_rate": 4.910478312373757e-06, "loss": 0.0035, "step": 1278 }, { "epoch": 2.0464, "grad_norm": 0.2127764817558698, "learning_rate": 4.895610964891923e-06, "loss": 0.0036, "step": 1279 }, { "epoch": 2.048, "grad_norm": 0.25214117233057504, "learning_rate": 4.8807588598905364e-06, "loss": 0.0037, "step": 1280 }, { "epoch": 2.0496, "grad_norm": 0.31500180906439734, "learning_rate": 4.865922041720239e-06, "loss": 0.0067, "step": 1281 }, { "epoch": 2.0512, "grad_norm": 0.2577048344610542, "learning_rate": 4.8511005546860214e-06, "loss": 0.005, "step": 1282 }, { "epoch": 2.0528, "grad_norm": 0.25911780144857743, "learning_rate": 4.836294443047088e-06, "loss": 0.0043, "step": 1283 }, { "epoch": 2.0544, "grad_norm": 0.24808506280824796, "learning_rate": 4.821503751016746e-06, "loss": 0.004, "step": 1284 }, { "epoch": 2.056, "grad_norm": 0.19062806653577596, "learning_rate": 4.806728522762241e-06, "loss": 0.0042, "step": 1285 }, { "epoch": 2.0576, "grad_norm": 0.1565566885639028, "learning_rate": 4.791968802404648e-06, "loss": 0.0035, "step": 1286 }, { "epoch": 2.0592, "grad_norm": 0.3309828031131677, "learning_rate": 4.777224634018732e-06, "loss": 0.006, "step": 1287 }, { "epoch": 2.0608, "grad_norm": 0.23958773308236364, "learning_rate": 4.762496061632814e-06, "loss": 0.0034, "step": 1288 }, { "epoch": 2.0624, "grad_norm": 0.23239980452297915, "learning_rate": 4.7477831292286555e-06, "loss": 0.0041, "step": 1289 }, { "epoch": 2.064, "grad_norm": 0.3107452959938814, "learning_rate": 4.733085880741301e-06, "loss": 0.0049, "step": 1290 }, { "epoch": 2.0656, "grad_norm": 0.21814879569393347, "learning_rate": 4.7184043600589655e-06, "loss": 0.0038, "step": 1291 }, { "epoch": 2.0672, "grad_norm": 0.3002218278383945, "learning_rate": 4.703738611022899e-06, "loss": 0.0038, "step": 1292 }, { "epoch": 2.0688, "grad_norm": 0.17988878791249202, "learning_rate": 4.689088677427249e-06, "loss": 0.0028, "step": 1293 }, { "epoch": 2.0704, "grad_norm": 0.1918665097047004, "learning_rate": 4.674454603018949e-06, "loss": 0.0029, "step": 1294 }, { "epoch": 2.072, "grad_norm": 0.2138417184341068, "learning_rate": 4.659836431497563e-06, "loss": 0.0039, "step": 1295 }, { "epoch": 2.0736, "grad_norm": 0.17185071863166368, "learning_rate": 4.645234206515171e-06, "loss": 0.0022, "step": 1296 }, { "epoch": 2.0752, "grad_norm": 0.3512312119574598, "learning_rate": 4.630647971676232e-06, "loss": 0.0051, "step": 1297 }, { "epoch": 2.0768, "grad_norm": 0.1967479613686531, "learning_rate": 4.616077770537453e-06, "loss": 0.0031, "step": 1298 }, { "epoch": 2.0784, "grad_norm": 0.30824071063703085, "learning_rate": 4.601523646607675e-06, "loss": 0.0034, "step": 1299 }, { "epoch": 2.08, "grad_norm": 0.2695497026040452, "learning_rate": 4.586985643347716e-06, "loss": 0.0039, "step": 1300 }, { "epoch": 2.0816, "grad_norm": 0.4598031790299854, "learning_rate": 4.572463804170263e-06, "loss": 0.0042, "step": 1301 }, { "epoch": 2.0832, "grad_norm": 0.22011607876674072, "learning_rate": 4.557958172439726e-06, "loss": 0.0019, "step": 1302 }, { "epoch": 2.0848, "grad_norm": 0.3459979438376907, "learning_rate": 4.543468791472131e-06, "loss": 0.0034, "step": 1303 }, { "epoch": 2.0864, "grad_norm": 0.29612633919353937, "learning_rate": 4.5289957045349655e-06, "loss": 0.0032, "step": 1304 }, { "epoch": 2.088, "grad_norm": 0.33327682757677773, "learning_rate": 4.5145389548470645e-06, "loss": 0.0054, "step": 1305 }, { "epoch": 2.0896, "grad_norm": 0.2522281062899099, "learning_rate": 4.500098585578475e-06, "loss": 0.0035, "step": 1306 }, { "epoch": 2.0912, "grad_norm": 0.3154381874423506, "learning_rate": 4.485674639850334e-06, "loss": 0.0033, "step": 1307 }, { "epoch": 2.0928, "grad_norm": 0.7806681135321053, "learning_rate": 4.471267160734731e-06, "loss": 0.0082, "step": 1308 }, { "epoch": 2.0944, "grad_norm": 0.20924011580648477, "learning_rate": 4.456876191254582e-06, "loss": 0.002, "step": 1309 }, { "epoch": 2.096, "grad_norm": 0.27343581761958907, "learning_rate": 4.4425017743835155e-06, "loss": 0.0042, "step": 1310 }, { "epoch": 2.0976, "grad_norm": 0.43634599723703343, "learning_rate": 4.4281439530457174e-06, "loss": 0.0053, "step": 1311 }, { "epoch": 2.0992, "grad_norm": 0.27198385642671596, "learning_rate": 4.413802770115816e-06, "loss": 0.0033, "step": 1312 }, { "epoch": 2.1008, "grad_norm": 0.23484207052201267, "learning_rate": 4.399478268418771e-06, "loss": 0.0018, "step": 1313 }, { "epoch": 2.1024, "grad_norm": 0.17246406743748, "learning_rate": 4.385170490729712e-06, "loss": 0.0019, "step": 1314 }, { "epoch": 2.104, "grad_norm": 0.2028269985930605, "learning_rate": 4.370879479773837e-06, "loss": 0.0027, "step": 1315 }, { "epoch": 2.1056, "grad_norm": 0.26292786274161534, "learning_rate": 4.356605278226274e-06, "loss": 0.0029, "step": 1316 }, { "epoch": 2.1072, "grad_norm": 0.1867765717999834, "learning_rate": 4.342347928711953e-06, "loss": 0.0023, "step": 1317 }, { "epoch": 2.1088, "grad_norm": 0.5299678610445645, "learning_rate": 4.328107473805487e-06, "loss": 0.0067, "step": 1318 }, { "epoch": 2.1104, "grad_norm": 0.3684095285885515, "learning_rate": 4.313883956031031e-06, "loss": 0.006, "step": 1319 }, { "epoch": 2.112, "grad_norm": 0.21733768699759115, "learning_rate": 4.299677417862174e-06, "loss": 0.0024, "step": 1320 }, { "epoch": 2.1136, "grad_norm": 0.339185518787034, "learning_rate": 4.28548790172179e-06, "loss": 0.0038, "step": 1321 }, { "epoch": 2.1152, "grad_norm": 0.20615281771551228, "learning_rate": 4.2713154499819345e-06, "loss": 0.0027, "step": 1322 }, { "epoch": 2.1168, "grad_norm": 0.35306789971032443, "learning_rate": 4.257160104963695e-06, "loss": 0.0043, "step": 1323 }, { "epoch": 2.1184, "grad_norm": 0.42183599454548737, "learning_rate": 4.243021908937083e-06, "loss": 0.0051, "step": 1324 }, { "epoch": 2.12, "grad_norm": 0.3939500329035106, "learning_rate": 4.228900904120895e-06, "loss": 0.0044, "step": 1325 }, { "epoch": 2.1216, "grad_norm": 0.6441545692827979, "learning_rate": 4.214797132682597e-06, "loss": 0.0057, "step": 1326 }, { "epoch": 2.1232, "grad_norm": 0.46152235798316077, "learning_rate": 4.200710636738189e-06, "loss": 0.0054, "step": 1327 }, { "epoch": 2.1248, "grad_norm": 0.40725285654205506, "learning_rate": 4.186641458352088e-06, "loss": 0.0059, "step": 1328 }, { "epoch": 2.1264, "grad_norm": 0.24332009358464296, "learning_rate": 4.172589639536992e-06, "loss": 0.0043, "step": 1329 }, { "epoch": 2.128, "grad_norm": 0.41815713704336355, "learning_rate": 4.158555222253772e-06, "loss": 0.0058, "step": 1330 }, { "epoch": 2.1296, "grad_norm": 0.4771775277273845, "learning_rate": 4.144538248411321e-06, "loss": 0.004, "step": 1331 }, { "epoch": 2.1312, "grad_norm": 0.33752406994186707, "learning_rate": 4.130538759866457e-06, "loss": 0.0059, "step": 1332 }, { "epoch": 2.1328, "grad_norm": 0.4575903127414404, "learning_rate": 4.116556798423776e-06, "loss": 0.0052, "step": 1333 }, { "epoch": 2.1344, "grad_norm": 0.23976033424310222, "learning_rate": 4.102592405835536e-06, "loss": 0.0045, "step": 1334 }, { "epoch": 2.136, "grad_norm": 0.4474444698419231, "learning_rate": 4.088645623801534e-06, "loss": 0.0046, "step": 1335 }, { "epoch": 2.1376, "grad_norm": 0.7501073711538025, "learning_rate": 4.074716493968976e-06, "loss": 0.01, "step": 1336 }, { "epoch": 2.1391999999999998, "grad_norm": 0.4176418804505741, "learning_rate": 4.060805057932359e-06, "loss": 0.008, "step": 1337 }, { "epoch": 2.1408, "grad_norm": 0.41124851299783816, "learning_rate": 4.046911357233343e-06, "loss": 0.005, "step": 1338 }, { "epoch": 2.1424, "grad_norm": 0.10633178217740394, "learning_rate": 4.033035433360624e-06, "loss": 0.0012, "step": 1339 }, { "epoch": 2.144, "grad_norm": 0.5906271452129809, "learning_rate": 4.019177327749822e-06, "loss": 0.0064, "step": 1340 }, { "epoch": 2.1456, "grad_norm": 0.42624283355835463, "learning_rate": 4.00533708178334e-06, "loss": 0.0062, "step": 1341 }, { "epoch": 2.1471999999999998, "grad_norm": 0.3947213443096026, "learning_rate": 3.991514736790259e-06, "loss": 0.0067, "step": 1342 }, { "epoch": 2.1488, "grad_norm": 0.2380767675553356, "learning_rate": 3.977710334046193e-06, "loss": 0.0028, "step": 1343 }, { "epoch": 2.1504, "grad_norm": 0.3204449051112425, "learning_rate": 3.9639239147731865e-06, "loss": 0.003, "step": 1344 }, { "epoch": 2.152, "grad_norm": 0.2398994875704844, "learning_rate": 3.950155520139581e-06, "loss": 0.005, "step": 1345 }, { "epoch": 2.1536, "grad_norm": 0.17312681016523704, "learning_rate": 3.936405191259891e-06, "loss": 0.0027, "step": 1346 }, { "epoch": 2.1552, "grad_norm": 0.5201426071877979, "learning_rate": 3.9226729691946865e-06, "loss": 0.0043, "step": 1347 }, { "epoch": 2.1568, "grad_norm": 0.26525210748144445, "learning_rate": 3.908958894950465e-06, "loss": 0.0038, "step": 1348 }, { "epoch": 2.1584, "grad_norm": 0.3930269687027248, "learning_rate": 3.895263009479534e-06, "loss": 0.0033, "step": 1349 }, { "epoch": 2.16, "grad_norm": 0.41311565597600647, "learning_rate": 3.881585353679891e-06, "loss": 0.0055, "step": 1350 }, { "epoch": 2.1616, "grad_norm": 0.1639356228958999, "learning_rate": 3.867925968395085e-06, "loss": 0.0022, "step": 1351 }, { "epoch": 2.1632, "grad_norm": 0.2127497575095144, "learning_rate": 3.854284894414122e-06, "loss": 0.0028, "step": 1352 }, { "epoch": 2.1648, "grad_norm": 0.35107268142876874, "learning_rate": 3.840662172471315e-06, "loss": 0.0061, "step": 1353 }, { "epoch": 2.1664, "grad_norm": 0.4737783292996846, "learning_rate": 3.827057843246181e-06, "loss": 0.0057, "step": 1354 }, { "epoch": 2.168, "grad_norm": 0.30491471805866227, "learning_rate": 3.8134719473633098e-06, "loss": 0.0039, "step": 1355 }, { "epoch": 2.1696, "grad_norm": 0.30209230149364336, "learning_rate": 3.799904525392251e-06, "loss": 0.0042, "step": 1356 }, { "epoch": 2.1712, "grad_norm": 0.5563014157232996, "learning_rate": 3.786355617847385e-06, "loss": 0.0064, "step": 1357 }, { "epoch": 2.1728, "grad_norm": 0.5629153450463303, "learning_rate": 3.7728252651878018e-06, "loss": 0.0042, "step": 1358 }, { "epoch": 2.1744, "grad_norm": 0.206585291044635, "learning_rate": 3.759313507817196e-06, "loss": 0.0017, "step": 1359 }, { "epoch": 2.176, "grad_norm": 0.3337582064935557, "learning_rate": 3.745820386083724e-06, "loss": 0.004, "step": 1360 }, { "epoch": 2.1776, "grad_norm": 0.29096226358428184, "learning_rate": 3.7323459402798936e-06, "loss": 0.0046, "step": 1361 }, { "epoch": 2.1792, "grad_norm": 0.39433867019101887, "learning_rate": 3.718890210642442e-06, "loss": 0.0043, "step": 1362 }, { "epoch": 2.1808, "grad_norm": 0.2635655806134531, "learning_rate": 3.705453237352227e-06, "loss": 0.0033, "step": 1363 }, { "epoch": 2.1824, "grad_norm": 0.3111796237224152, "learning_rate": 3.6920350605340883e-06, "loss": 0.0036, "step": 1364 }, { "epoch": 2.184, "grad_norm": 0.3144976699122512, "learning_rate": 3.6786357202567367e-06, "loss": 0.0031, "step": 1365 }, { "epoch": 2.1856, "grad_norm": 0.4266913195390291, "learning_rate": 3.6652552565326382e-06, "loss": 0.0068, "step": 1366 }, { "epoch": 2.1872, "grad_norm": 0.2717717217634583, "learning_rate": 3.6518937093178873e-06, "loss": 0.004, "step": 1367 }, { "epoch": 2.1888, "grad_norm": 0.508523412760655, "learning_rate": 3.638551118512089e-06, "loss": 0.0061, "step": 1368 }, { "epoch": 2.1904, "grad_norm": 0.273214646716238, "learning_rate": 3.6252275239582522e-06, "loss": 0.0047, "step": 1369 }, { "epoch": 2.192, "grad_norm": 0.3764530890741371, "learning_rate": 3.611922965442648e-06, "loss": 0.004, "step": 1370 }, { "epoch": 2.1936, "grad_norm": 0.24772481640114252, "learning_rate": 3.5986374826947067e-06, "loss": 0.0022, "step": 1371 }, { "epoch": 2.1952, "grad_norm": 0.30233854902217266, "learning_rate": 3.5853711153868962e-06, "loss": 0.0035, "step": 1372 }, { "epoch": 2.1968, "grad_norm": 0.4774706382490888, "learning_rate": 3.5721239031346067e-06, "loss": 0.0088, "step": 1373 }, { "epoch": 2.1984, "grad_norm": 0.2645838272243305, "learning_rate": 3.558895885496023e-06, "loss": 0.0029, "step": 1374 }, { "epoch": 2.2, "grad_norm": 0.310741726897126, "learning_rate": 3.545687101972013e-06, "loss": 0.0031, "step": 1375 }, { "epoch": 2.2016, "grad_norm": 0.2528778399021895, "learning_rate": 3.53249759200601e-06, "loss": 0.0031, "step": 1376 }, { "epoch": 2.2032, "grad_norm": 0.3168647834878494, "learning_rate": 3.519327394983888e-06, "loss": 0.0038, "step": 1377 }, { "epoch": 2.2048, "grad_norm": 0.3048180293845282, "learning_rate": 3.506176550233863e-06, "loss": 0.0044, "step": 1378 }, { "epoch": 2.2064, "grad_norm": 0.5099909796721364, "learning_rate": 3.4930450970263485e-06, "loss": 0.0062, "step": 1379 }, { "epoch": 2.208, "grad_norm": 0.37016840677232016, "learning_rate": 3.479933074573858e-06, "loss": 0.004, "step": 1380 }, { "epoch": 2.2096, "grad_norm": 0.3865414949322783, "learning_rate": 3.4668405220308797e-06, "loss": 0.0065, "step": 1381 }, { "epoch": 2.2112, "grad_norm": 0.3237177904121443, "learning_rate": 3.453767478493761e-06, "loss": 0.0052, "step": 1382 }, { "epoch": 2.2128, "grad_norm": 0.17672305270254257, "learning_rate": 3.440713983000601e-06, "loss": 0.0018, "step": 1383 }, { "epoch": 2.2144, "grad_norm": 0.37446742117760956, "learning_rate": 3.4276800745311135e-06, "loss": 0.0039, "step": 1384 }, { "epoch": 2.216, "grad_norm": 0.37271036327618084, "learning_rate": 3.4146657920065286e-06, "loss": 0.0043, "step": 1385 }, { "epoch": 2.2176, "grad_norm": 0.24469565507957217, "learning_rate": 3.401671174289469e-06, "loss": 0.0034, "step": 1386 }, { "epoch": 2.2192, "grad_norm": 0.2746037035440998, "learning_rate": 3.3886962601838327e-06, "loss": 0.0028, "step": 1387 }, { "epoch": 2.2208, "grad_norm": 0.3230434848869537, "learning_rate": 3.37574108843469e-06, "loss": 0.0044, "step": 1388 }, { "epoch": 2.2224, "grad_norm": 0.3526293628919445, "learning_rate": 3.3628056977281456e-06, "loss": 0.0059, "step": 1389 }, { "epoch": 2.224, "grad_norm": 0.2789922220782755, "learning_rate": 3.3498901266912397e-06, "loss": 0.0029, "step": 1390 }, { "epoch": 2.2256, "grad_norm": 0.3246182344755697, "learning_rate": 3.3369944138918286e-06, "loss": 0.0055, "step": 1391 }, { "epoch": 2.2272, "grad_norm": 0.26651265660656137, "learning_rate": 3.3241185978384636e-06, "loss": 0.0026, "step": 1392 }, { "epoch": 2.2288, "grad_norm": 0.43057393091251184, "learning_rate": 3.3112627169802948e-06, "loss": 0.0058, "step": 1393 }, { "epoch": 2.2304, "grad_norm": 0.31773344106889506, "learning_rate": 3.2984268097069284e-06, "loss": 0.0054, "step": 1394 }, { "epoch": 2.232, "grad_norm": 0.39789740348144614, "learning_rate": 3.2856109143483316e-06, "loss": 0.0048, "step": 1395 }, { "epoch": 2.2336, "grad_norm": 0.3131342354025925, "learning_rate": 3.2728150691747117e-06, "loss": 0.0033, "step": 1396 }, { "epoch": 2.2352, "grad_norm": 0.20721870152903424, "learning_rate": 3.2600393123964114e-06, "loss": 0.0022, "step": 1397 }, { "epoch": 2.2368, "grad_norm": 0.311970033505521, "learning_rate": 3.2472836821637744e-06, "loss": 0.0033, "step": 1398 }, { "epoch": 2.2384, "grad_norm": 0.3479014445667459, "learning_rate": 3.2345482165670493e-06, "loss": 0.0047, "step": 1399 }, { "epoch": 2.24, "grad_norm": 0.3117659770003126, "learning_rate": 3.22183295363627e-06, "loss": 0.0044, "step": 1400 }, { "epoch": 2.2416, "grad_norm": 0.36393325077159877, "learning_rate": 3.209137931341143e-06, "loss": 0.0037, "step": 1401 }, { "epoch": 2.2432, "grad_norm": 0.47793755851724995, "learning_rate": 3.196463187590929e-06, "loss": 0.0047, "step": 1402 }, { "epoch": 2.2448, "grad_norm": 0.42910510869632124, "learning_rate": 3.183808760234335e-06, "loss": 0.0051, "step": 1403 }, { "epoch": 2.2464, "grad_norm": 0.2169222373711283, "learning_rate": 3.1711746870594083e-06, "loss": 0.0036, "step": 1404 }, { "epoch": 2.248, "grad_norm": 0.3028100781268857, "learning_rate": 3.1585610057934022e-06, "loss": 0.0032, "step": 1405 }, { "epoch": 2.2496, "grad_norm": 0.5514473213221643, "learning_rate": 3.145967754102691e-06, "loss": 0.0055, "step": 1406 }, { "epoch": 2.2512, "grad_norm": 0.4049785329070876, "learning_rate": 3.1333949695926323e-06, "loss": 0.0046, "step": 1407 }, { "epoch": 2.2528, "grad_norm": 0.40058983163715545, "learning_rate": 3.1208426898074685e-06, "loss": 0.0067, "step": 1408 }, { "epoch": 2.2544, "grad_norm": 0.30103543975515684, "learning_rate": 3.1083109522302124e-06, "loss": 0.0026, "step": 1409 }, { "epoch": 2.2560000000000002, "grad_norm": 0.35093667823401603, "learning_rate": 3.0957997942825337e-06, "loss": 0.0054, "step": 1410 }, { "epoch": 2.2576, "grad_norm": 0.46305339166405457, "learning_rate": 3.083309253324651e-06, "loss": 0.0032, "step": 1411 }, { "epoch": 2.2592, "grad_norm": 0.3253390462209622, "learning_rate": 3.070839366655215e-06, "loss": 0.0061, "step": 1412 }, { "epoch": 2.2608, "grad_norm": 0.4677775103626948, "learning_rate": 3.0583901715111965e-06, "loss": 0.0037, "step": 1413 }, { "epoch": 2.2624, "grad_norm": 0.4040927546843886, "learning_rate": 3.045961705067787e-06, "loss": 0.0047, "step": 1414 }, { "epoch": 2.2640000000000002, "grad_norm": 0.37462510699491497, "learning_rate": 3.0335540044382693e-06, "loss": 0.0059, "step": 1415 }, { "epoch": 2.2656, "grad_norm": 0.36590491327213365, "learning_rate": 3.021167106673928e-06, "loss": 0.0039, "step": 1416 }, { "epoch": 2.2672, "grad_norm": 0.4126162047296427, "learning_rate": 3.008801048763914e-06, "loss": 0.0086, "step": 1417 }, { "epoch": 2.2688, "grad_norm": 0.19225835760207746, "learning_rate": 2.996455867635155e-06, "loss": 0.0019, "step": 1418 }, { "epoch": 2.2704, "grad_norm": 0.2745973327167802, "learning_rate": 2.9841316001522345e-06, "loss": 0.0026, "step": 1419 }, { "epoch": 2.2720000000000002, "grad_norm": 0.4391947025341379, "learning_rate": 2.9718282831172885e-06, "loss": 0.0043, "step": 1420 }, { "epoch": 2.2736, "grad_norm": 0.37677518935738363, "learning_rate": 2.9595459532698854e-06, "loss": 0.0055, "step": 1421 }, { "epoch": 2.2752, "grad_norm": 0.20114083698483004, "learning_rate": 2.94728464728693e-06, "loss": 0.0025, "step": 1422 }, { "epoch": 2.2768, "grad_norm": 0.5013196354873555, "learning_rate": 2.9350444017825385e-06, "loss": 0.0049, "step": 1423 }, { "epoch": 2.2784, "grad_norm": 0.21358665618820144, "learning_rate": 2.922825253307947e-06, "loss": 0.0032, "step": 1424 }, { "epoch": 2.2800000000000002, "grad_norm": 0.3422376824346592, "learning_rate": 2.910627238351383e-06, "loss": 0.0057, "step": 1425 }, { "epoch": 2.2816, "grad_norm": 0.22579646148322313, "learning_rate": 2.898450393337977e-06, "loss": 0.0031, "step": 1426 }, { "epoch": 2.2832, "grad_norm": 0.48227216495460973, "learning_rate": 2.886294754629632e-06, "loss": 0.0108, "step": 1427 }, { "epoch": 2.2848, "grad_norm": 0.44696733275716655, "learning_rate": 2.8741603585249312e-06, "loss": 0.0045, "step": 1428 }, { "epoch": 2.2864, "grad_norm": 0.3841451203939599, "learning_rate": 2.8620472412590227e-06, "loss": 0.0057, "step": 1429 }, { "epoch": 2.288, "grad_norm": 0.29580487326759286, "learning_rate": 2.8499554390035144e-06, "loss": 0.0024, "step": 1430 }, { "epoch": 2.2896, "grad_norm": 0.27885549878850224, "learning_rate": 2.837884987866363e-06, "loss": 0.0036, "step": 1431 }, { "epoch": 2.2912, "grad_norm": 0.25723153018368067, "learning_rate": 2.8258359238917665e-06, "loss": 0.0047, "step": 1432 }, { "epoch": 2.2928, "grad_norm": 0.3956945495465071, "learning_rate": 2.8138082830600556e-06, "loss": 0.0037, "step": 1433 }, { "epoch": 2.2944, "grad_norm": 0.3694973738962662, "learning_rate": 2.8018021012875994e-06, "loss": 0.0062, "step": 1434 }, { "epoch": 2.296, "grad_norm": 0.25758641245570874, "learning_rate": 2.789817414426673e-06, "loss": 0.0038, "step": 1435 }, { "epoch": 2.2976, "grad_norm": 0.4575486542931288, "learning_rate": 2.7778542582653746e-06, "loss": 0.0098, "step": 1436 }, { "epoch": 2.2992, "grad_norm": 0.1979079546178225, "learning_rate": 2.7659126685275028e-06, "loss": 0.0031, "step": 1437 }, { "epoch": 2.3008, "grad_norm": 0.2243893806489553, "learning_rate": 2.753992680872457e-06, "loss": 0.0029, "step": 1438 }, { "epoch": 2.3024, "grad_norm": 0.3084199013963842, "learning_rate": 2.7420943308951287e-06, "loss": 0.0065, "step": 1439 }, { "epoch": 2.304, "grad_norm": 0.4512154129946682, "learning_rate": 2.7302176541257984e-06, "loss": 0.0089, "step": 1440 }, { "epoch": 2.3056, "grad_norm": 0.33548598217031145, "learning_rate": 2.718362686030025e-06, "loss": 0.0086, "step": 1441 }, { "epoch": 2.3072, "grad_norm": 0.19048531501627833, "learning_rate": 2.7065294620085425e-06, "loss": 0.002, "step": 1442 }, { "epoch": 2.3088, "grad_norm": 0.12113369849410853, "learning_rate": 2.694718017397151e-06, "loss": 0.0017, "step": 1443 }, { "epoch": 2.3104, "grad_norm": 0.2889296533710736, "learning_rate": 2.6829283874666236e-06, "loss": 0.0042, "step": 1444 }, { "epoch": 2.312, "grad_norm": 0.19654119828654645, "learning_rate": 2.6711606074225783e-06, "loss": 0.0027, "step": 1445 }, { "epoch": 2.3136, "grad_norm": 0.21111769539052078, "learning_rate": 2.6594147124053983e-06, "loss": 0.0027, "step": 1446 }, { "epoch": 2.3152, "grad_norm": 0.2984545484710624, "learning_rate": 2.6476907374901062e-06, "loss": 0.0028, "step": 1447 }, { "epoch": 2.3168, "grad_norm": 0.27245977028805857, "learning_rate": 2.635988717686272e-06, "loss": 0.0033, "step": 1448 }, { "epoch": 2.3184, "grad_norm": 0.16091645943495417, "learning_rate": 2.6243086879379e-06, "loss": 0.0021, "step": 1449 }, { "epoch": 2.32, "grad_norm": 0.31031957579565217, "learning_rate": 2.6126506831233343e-06, "loss": 0.0036, "step": 1450 }, { "epoch": 2.3216, "grad_norm": 0.30000907332964405, "learning_rate": 2.6010147380551474e-06, "loss": 0.0047, "step": 1451 }, { "epoch": 2.3232, "grad_norm": 0.4743264477032393, "learning_rate": 2.5894008874800323e-06, "loss": 0.0067, "step": 1452 }, { "epoch": 2.3247999999999998, "grad_norm": 0.31529185585749747, "learning_rate": 2.577809166078716e-06, "loss": 0.0031, "step": 1453 }, { "epoch": 2.3264, "grad_norm": 0.2339476450070033, "learning_rate": 2.5662396084658383e-06, "loss": 0.0021, "step": 1454 }, { "epoch": 2.328, "grad_norm": 0.3713041003942751, "learning_rate": 2.5546922491898497e-06, "loss": 0.0074, "step": 1455 }, { "epoch": 2.3296, "grad_norm": 0.2339589802070238, "learning_rate": 2.543167122732918e-06, "loss": 0.0031, "step": 1456 }, { "epoch": 2.3312, "grad_norm": 0.5717784940488866, "learning_rate": 2.5316642635108247e-06, "loss": 0.0081, "step": 1457 }, { "epoch": 2.3327999999999998, "grad_norm": 0.22448609320259003, "learning_rate": 2.5201837058728506e-06, "loss": 0.0032, "step": 1458 }, { "epoch": 2.3344, "grad_norm": 0.25023791492443204, "learning_rate": 2.508725484101684e-06, "loss": 0.0029, "step": 1459 }, { "epoch": 2.336, "grad_norm": 0.20091058624312233, "learning_rate": 2.4972896324133143e-06, "loss": 0.0031, "step": 1460 }, { "epoch": 2.3376, "grad_norm": 0.24096170695328817, "learning_rate": 2.485876184956928e-06, "loss": 0.002, "step": 1461 }, { "epoch": 2.3392, "grad_norm": 0.16494454119077795, "learning_rate": 2.474485175814816e-06, "loss": 0.0028, "step": 1462 }, { "epoch": 2.3407999999999998, "grad_norm": 0.3569701654101745, "learning_rate": 2.4631166390022574e-06, "loss": 0.0076, "step": 1463 }, { "epoch": 2.3424, "grad_norm": 0.3297034067855959, "learning_rate": 2.451770608467432e-06, "loss": 0.0044, "step": 1464 }, { "epoch": 2.344, "grad_norm": 0.23806871388529877, "learning_rate": 2.440447118091306e-06, "loss": 0.0024, "step": 1465 }, { "epoch": 2.3456, "grad_norm": 0.2863711800753721, "learning_rate": 2.429146201687538e-06, "loss": 0.005, "step": 1466 }, { "epoch": 2.3472, "grad_norm": 0.2957686438641965, "learning_rate": 2.417867893002387e-06, "loss": 0.0022, "step": 1467 }, { "epoch": 2.3487999999999998, "grad_norm": 0.2722652520341511, "learning_rate": 2.4066122257145898e-06, "loss": 0.003, "step": 1468 }, { "epoch": 2.3504, "grad_norm": 0.2559967992120655, "learning_rate": 2.3953792334352787e-06, "loss": 0.0027, "step": 1469 }, { "epoch": 2.352, "grad_norm": 0.19374332836986866, "learning_rate": 2.3841689497078746e-06, "loss": 0.0015, "step": 1470 }, { "epoch": 2.3536, "grad_norm": 0.35084170140341775, "learning_rate": 2.3729814080079815e-06, "loss": 0.0038, "step": 1471 }, { "epoch": 2.3552, "grad_norm": 0.28875005932803777, "learning_rate": 2.361816641743303e-06, "loss": 0.0031, "step": 1472 }, { "epoch": 2.3568, "grad_norm": 0.2639017927454099, "learning_rate": 2.3506746842535244e-06, "loss": 0.0032, "step": 1473 }, { "epoch": 2.3584, "grad_norm": 0.3922182807822737, "learning_rate": 2.339555568810221e-06, "loss": 0.0053, "step": 1474 }, { "epoch": 2.36, "grad_norm": 0.475777040855533, "learning_rate": 2.328459328616759e-06, "loss": 0.0037, "step": 1475 }, { "epoch": 2.3616, "grad_norm": 0.26708534180003957, "learning_rate": 2.317385996808195e-06, "loss": 0.0033, "step": 1476 }, { "epoch": 2.3632, "grad_norm": 0.4313635677032356, "learning_rate": 2.306335606451181e-06, "loss": 0.0059, "step": 1477 }, { "epoch": 2.3648, "grad_norm": 0.3618264943721742, "learning_rate": 2.295308190543859e-06, "loss": 0.0043, "step": 1478 }, { "epoch": 2.3664, "grad_norm": 0.12036963531136798, "learning_rate": 2.2843037820157678e-06, "loss": 0.0014, "step": 1479 }, { "epoch": 2.368, "grad_norm": 0.36595545738178586, "learning_rate": 2.2733224137277366e-06, "loss": 0.0054, "step": 1480 }, { "epoch": 2.3696, "grad_norm": 0.26787082388335076, "learning_rate": 2.2623641184718048e-06, "loss": 0.0035, "step": 1481 }, { "epoch": 2.3712, "grad_norm": 0.28522072693267997, "learning_rate": 2.251428928971102e-06, "loss": 0.0043, "step": 1482 }, { "epoch": 2.3728, "grad_norm": 0.5210748077944257, "learning_rate": 2.240516877879765e-06, "loss": 0.0055, "step": 1483 }, { "epoch": 2.3744, "grad_norm": 0.5219477323930597, "learning_rate": 2.229627997782834e-06, "loss": 0.006, "step": 1484 }, { "epoch": 2.376, "grad_norm": 0.33335896703198864, "learning_rate": 2.218762321196156e-06, "loss": 0.0026, "step": 1485 }, { "epoch": 2.3776, "grad_norm": 0.37122145250464333, "learning_rate": 2.2079198805662917e-06, "loss": 0.0031, "step": 1486 }, { "epoch": 2.3792, "grad_norm": 0.311804841509584, "learning_rate": 2.1971007082704167e-06, "loss": 0.003, "step": 1487 }, { "epoch": 2.3808, "grad_norm": 0.33656546755716665, "learning_rate": 2.186304836616221e-06, "loss": 0.0034, "step": 1488 }, { "epoch": 2.3824, "grad_norm": 0.20226762823370856, "learning_rate": 2.1755322978418134e-06, "loss": 0.0022, "step": 1489 }, { "epoch": 2.384, "grad_norm": 0.2478589945954889, "learning_rate": 2.1647831241156304e-06, "loss": 0.0027, "step": 1490 }, { "epoch": 2.3856, "grad_norm": 0.41468702245089384, "learning_rate": 2.1540573475363402e-06, "loss": 0.0035, "step": 1491 }, { "epoch": 2.3872, "grad_norm": 0.3191295821954367, "learning_rate": 2.1433550001327376e-06, "loss": 0.0036, "step": 1492 }, { "epoch": 2.3888, "grad_norm": 0.36544274076642663, "learning_rate": 2.1326761138636555e-06, "loss": 0.0082, "step": 1493 }, { "epoch": 2.3904, "grad_norm": 0.27671618926189717, "learning_rate": 2.122020720617869e-06, "loss": 0.002, "step": 1494 }, { "epoch": 2.392, "grad_norm": 0.30829653757205117, "learning_rate": 2.111388852214001e-06, "loss": 0.0032, "step": 1495 }, { "epoch": 2.3936, "grad_norm": 0.24967904868047844, "learning_rate": 2.1007805404004247e-06, "loss": 0.0032, "step": 1496 }, { "epoch": 2.3952, "grad_norm": 0.34782125592838925, "learning_rate": 2.090195816855164e-06, "loss": 0.0044, "step": 1497 }, { "epoch": 2.3968, "grad_norm": 0.41020445003946276, "learning_rate": 2.0796347131858187e-06, "loss": 0.0045, "step": 1498 }, { "epoch": 2.3984, "grad_norm": 0.34054741896721913, "learning_rate": 2.069097260929439e-06, "loss": 0.007, "step": 1499 }, { "epoch": 2.4, "grad_norm": 0.33057868952454816, "learning_rate": 2.058583491552465e-06, "loss": 0.0049, "step": 1500 }, { "epoch": 2.4016, "grad_norm": 0.20827812995867526, "learning_rate": 2.048093436450603e-06, "loss": 0.0019, "step": 1501 }, { "epoch": 2.4032, "grad_norm": 0.5795964386944271, "learning_rate": 2.037627126948751e-06, "loss": 0.0109, "step": 1502 }, { "epoch": 2.4048, "grad_norm": 0.2061432574165304, "learning_rate": 2.0271845943008984e-06, "loss": 0.0024, "step": 1503 }, { "epoch": 2.4064, "grad_norm": 0.30377799341148626, "learning_rate": 2.0167658696900317e-06, "loss": 0.0036, "step": 1504 }, { "epoch": 2.408, "grad_norm": 0.43473012373512426, "learning_rate": 2.006370984228043e-06, "loss": 0.0039, "step": 1505 }, { "epoch": 2.4096, "grad_norm": 0.3505689933040014, "learning_rate": 1.9959999689556407e-06, "loss": 0.0043, "step": 1506 }, { "epoch": 2.4112, "grad_norm": 0.5849372527808584, "learning_rate": 1.985652854842247e-06, "loss": 0.0052, "step": 1507 }, { "epoch": 2.4128, "grad_norm": 0.4538261118657633, "learning_rate": 1.9753296727859195e-06, "loss": 0.0038, "step": 1508 }, { "epoch": 2.4144, "grad_norm": 0.5685819220855983, "learning_rate": 1.9650304536132426e-06, "loss": 0.0106, "step": 1509 }, { "epoch": 2.416, "grad_norm": 0.3575602209191756, "learning_rate": 1.9547552280792528e-06, "loss": 0.0027, "step": 1510 }, { "epoch": 2.4176, "grad_norm": 0.30509551541333807, "learning_rate": 1.9445040268673297e-06, "loss": 0.0049, "step": 1511 }, { "epoch": 2.4192, "grad_norm": 0.3524991773850427, "learning_rate": 1.9342768805891176e-06, "loss": 0.0035, "step": 1512 }, { "epoch": 2.4208, "grad_norm": 0.17541571972789952, "learning_rate": 1.924073819784428e-06, "loss": 0.0016, "step": 1513 }, { "epoch": 2.4224, "grad_norm": 0.2290188668830978, "learning_rate": 1.9138948749211473e-06, "loss": 0.0023, "step": 1514 }, { "epoch": 2.424, "grad_norm": 0.3169526663891749, "learning_rate": 1.9037400763951508e-06, "loss": 0.0031, "step": 1515 }, { "epoch": 2.4256, "grad_norm": 0.26479670963805474, "learning_rate": 1.8936094545302098e-06, "loss": 0.0055, "step": 1516 }, { "epoch": 2.4272, "grad_norm": 0.18324366456774377, "learning_rate": 1.8835030395778941e-06, "loss": 0.002, "step": 1517 }, { "epoch": 2.4288, "grad_norm": 0.4203047399682966, "learning_rate": 1.8734208617174986e-06, "loss": 0.0038, "step": 1518 }, { "epoch": 2.4304, "grad_norm": 0.20604756226606, "learning_rate": 1.8633629510559315e-06, "loss": 0.0025, "step": 1519 }, { "epoch": 2.432, "grad_norm": 0.2766323590013184, "learning_rate": 1.8533293376276473e-06, "loss": 0.0042, "step": 1520 }, { "epoch": 2.4336, "grad_norm": 0.44343747501893677, "learning_rate": 1.8433200513945338e-06, "loss": 0.0055, "step": 1521 }, { "epoch": 2.4352, "grad_norm": 0.24799125963285085, "learning_rate": 1.8333351222458407e-06, "loss": 0.0047, "step": 1522 }, { "epoch": 2.4368, "grad_norm": 0.2526019202655489, "learning_rate": 1.8233745799980818e-06, "loss": 0.0038, "step": 1523 }, { "epoch": 2.4384, "grad_norm": 0.2039656728483351, "learning_rate": 1.813438454394948e-06, "loss": 0.0029, "step": 1524 }, { "epoch": 2.44, "grad_norm": 0.4609040856616715, "learning_rate": 1.8035267751072172e-06, "loss": 0.004, "step": 1525 }, { "epoch": 2.4416, "grad_norm": 0.3248920744267325, "learning_rate": 1.7936395717326705e-06, "loss": 0.0057, "step": 1526 }, { "epoch": 2.4432, "grad_norm": 0.22369747227456252, "learning_rate": 1.7837768737959937e-06, "loss": 0.0028, "step": 1527 }, { "epoch": 2.4448, "grad_norm": 0.19174459869622357, "learning_rate": 1.773938710748706e-06, "loss": 0.002, "step": 1528 }, { "epoch": 2.4464, "grad_norm": 0.3870861370113197, "learning_rate": 1.7641251119690505e-06, "loss": 0.0057, "step": 1529 }, { "epoch": 2.448, "grad_norm": 0.40104713267802156, "learning_rate": 1.7543361067619269e-06, "loss": 0.0106, "step": 1530 }, { "epoch": 2.4496, "grad_norm": 0.33739972760230735, "learning_rate": 1.7445717243587889e-06, "loss": 0.0032, "step": 1531 }, { "epoch": 2.4512, "grad_norm": 0.3016213026287297, "learning_rate": 1.734831993917564e-06, "loss": 0.0027, "step": 1532 }, { "epoch": 2.4528, "grad_norm": 0.1849836736276717, "learning_rate": 1.7251169445225658e-06, "loss": 0.0014, "step": 1533 }, { "epoch": 2.4544, "grad_norm": 0.3882689943555003, "learning_rate": 1.715426605184407e-06, "loss": 0.0068, "step": 1534 }, { "epoch": 2.456, "grad_norm": 0.3403722116164076, "learning_rate": 1.705761004839911e-06, "loss": 0.0041, "step": 1535 }, { "epoch": 2.4576000000000002, "grad_norm": 0.3656401880020418, "learning_rate": 1.6961201723520248e-06, "loss": 0.0055, "step": 1536 }, { "epoch": 2.4592, "grad_norm": 0.35146384136990116, "learning_rate": 1.6865041365097434e-06, "loss": 0.003, "step": 1537 }, { "epoch": 2.4608, "grad_norm": 0.6592852769443976, "learning_rate": 1.676912926028007e-06, "loss": 0.0079, "step": 1538 }, { "epoch": 2.4624, "grad_norm": 0.5307332952929912, "learning_rate": 1.6673465695476233e-06, "loss": 0.0062, "step": 1539 }, { "epoch": 2.464, "grad_norm": 0.18065197635713737, "learning_rate": 1.6578050956351887e-06, "loss": 0.0024, "step": 1540 }, { "epoch": 2.4656000000000002, "grad_norm": 0.25899370366356406, "learning_rate": 1.6482885327829912e-06, "loss": 0.0033, "step": 1541 }, { "epoch": 2.4672, "grad_norm": 0.45720416015892273, "learning_rate": 1.6387969094089318e-06, "loss": 0.005, "step": 1542 }, { "epoch": 2.4688, "grad_norm": 0.14764278603001516, "learning_rate": 1.6293302538564381e-06, "loss": 0.0018, "step": 1543 }, { "epoch": 2.4704, "grad_norm": 0.41132663149020565, "learning_rate": 1.619888594394382e-06, "loss": 0.0068, "step": 1544 }, { "epoch": 2.472, "grad_norm": 0.5567151556235029, "learning_rate": 1.6104719592169905e-06, "loss": 0.0032, "step": 1545 }, { "epoch": 2.4736000000000002, "grad_norm": 0.3841223040297522, "learning_rate": 1.6010803764437633e-06, "loss": 0.007, "step": 1546 }, { "epoch": 2.4752, "grad_norm": 0.2533837036175254, "learning_rate": 1.5917138741193972e-06, "loss": 0.0035, "step": 1547 }, { "epoch": 2.4768, "grad_norm": 0.41208188391712325, "learning_rate": 1.5823724802136863e-06, "loss": 0.0057, "step": 1548 }, { "epoch": 2.4784, "grad_norm": 0.333231114609289, "learning_rate": 1.5730562226214529e-06, "loss": 0.0035, "step": 1549 }, { "epoch": 2.48, "grad_norm": 0.3460601045673392, "learning_rate": 1.5637651291624522e-06, "loss": 0.0048, "step": 1550 }, { "epoch": 2.4816, "grad_norm": 0.4561451554139878, "learning_rate": 1.5544992275813053e-06, "loss": 0.006, "step": 1551 }, { "epoch": 2.4832, "grad_norm": 0.2600217071914026, "learning_rate": 1.545258545547398e-06, "loss": 0.0033, "step": 1552 }, { "epoch": 2.4848, "grad_norm": 0.26522188048489387, "learning_rate": 1.536043110654809e-06, "loss": 0.0046, "step": 1553 }, { "epoch": 2.4864, "grad_norm": 0.26254243975243546, "learning_rate": 1.5268529504222262e-06, "loss": 0.0029, "step": 1554 }, { "epoch": 2.488, "grad_norm": 0.3105340678608335, "learning_rate": 1.5176880922928615e-06, "loss": 0.0046, "step": 1555 }, { "epoch": 2.4896, "grad_norm": 0.3592223753091298, "learning_rate": 1.5085485636343755e-06, "loss": 0.004, "step": 1556 }, { "epoch": 2.4912, "grad_norm": 0.27150401965803533, "learning_rate": 1.4994343917387854e-06, "loss": 0.0037, "step": 1557 }, { "epoch": 2.4928, "grad_norm": 0.5551493870366635, "learning_rate": 1.4903456038223941e-06, "loss": 0.0132, "step": 1558 }, { "epoch": 2.4944, "grad_norm": 0.34588012387295763, "learning_rate": 1.481282227025701e-06, "loss": 0.0064, "step": 1559 }, { "epoch": 2.496, "grad_norm": 0.23905683065560415, "learning_rate": 1.4722442884133214e-06, "loss": 0.0025, "step": 1560 }, { "epoch": 2.4976, "grad_norm": 0.397704021336595, "learning_rate": 1.4632318149739177e-06, "loss": 0.0049, "step": 1561 }, { "epoch": 2.4992, "grad_norm": 0.3374482968206889, "learning_rate": 1.4542448336201021e-06, "loss": 0.0033, "step": 1562 }, { "epoch": 2.5008, "grad_norm": 0.34762474001471755, "learning_rate": 1.4452833711883629e-06, "loss": 0.0087, "step": 1563 }, { "epoch": 2.5023999999999997, "grad_norm": 0.2282977695631175, "learning_rate": 1.4363474544389876e-06, "loss": 0.0026, "step": 1564 }, { "epoch": 2.504, "grad_norm": 0.42541342818139793, "learning_rate": 1.4274371100559792e-06, "loss": 0.0048, "step": 1565 }, { "epoch": 2.5056000000000003, "grad_norm": 0.4863614084250172, "learning_rate": 1.4185523646469822e-06, "loss": 0.0044, "step": 1566 }, { "epoch": 2.5072, "grad_norm": 0.12498832532587154, "learning_rate": 1.409693244743192e-06, "loss": 0.0019, "step": 1567 }, { "epoch": 2.5088, "grad_norm": 0.4225063695409753, "learning_rate": 1.4008597767992872e-06, "loss": 0.0047, "step": 1568 }, { "epoch": 2.5103999999999997, "grad_norm": 0.230938913837018, "learning_rate": 1.3920519871933425e-06, "loss": 0.0026, "step": 1569 }, { "epoch": 2.512, "grad_norm": 0.2688014101429334, "learning_rate": 1.3832699022267516e-06, "loss": 0.0047, "step": 1570 }, { "epoch": 2.5136, "grad_norm": 0.5189916763612203, "learning_rate": 1.3745135481241602e-06, "loss": 0.0056, "step": 1571 }, { "epoch": 2.5152, "grad_norm": 0.2823365163678064, "learning_rate": 1.3657829510333653e-06, "loss": 0.0043, "step": 1572 }, { "epoch": 2.5168, "grad_norm": 0.2323500911953732, "learning_rate": 1.3570781370252584e-06, "loss": 0.0026, "step": 1573 }, { "epoch": 2.5183999999999997, "grad_norm": 0.26700063611671265, "learning_rate": 1.3483991320937307e-06, "loss": 0.0031, "step": 1574 }, { "epoch": 2.52, "grad_norm": 0.2532235049322369, "learning_rate": 1.339745962155613e-06, "loss": 0.0035, "step": 1575 }, { "epoch": 2.5216, "grad_norm": 0.3169413186063335, "learning_rate": 1.3311186530505838e-06, "loss": 0.0045, "step": 1576 }, { "epoch": 2.5232, "grad_norm": 0.25250982744404826, "learning_rate": 1.322517230541096e-06, "loss": 0.0051, "step": 1577 }, { "epoch": 2.5248, "grad_norm": 0.35861267443225797, "learning_rate": 1.313941720312303e-06, "loss": 0.0047, "step": 1578 }, { "epoch": 2.5263999999999998, "grad_norm": 0.33250067415436657, "learning_rate": 1.30539214797198e-06, "loss": 0.003, "step": 1579 }, { "epoch": 2.528, "grad_norm": 0.26320164189665274, "learning_rate": 1.2968685390504465e-06, "loss": 0.0029, "step": 1580 }, { "epoch": 2.5296, "grad_norm": 0.33401526006983745, "learning_rate": 1.2883709190004956e-06, "loss": 0.0039, "step": 1581 }, { "epoch": 2.5312, "grad_norm": 0.30630453343895253, "learning_rate": 1.2798993131973093e-06, "loss": 0.0037, "step": 1582 }, { "epoch": 2.5328, "grad_norm": 0.1999894111806715, "learning_rate": 1.2714537469383858e-06, "loss": 0.002, "step": 1583 }, { "epoch": 2.5343999999999998, "grad_norm": 0.32881800262495675, "learning_rate": 1.263034245443473e-06, "loss": 0.0037, "step": 1584 }, { "epoch": 2.536, "grad_norm": 0.21174416417483852, "learning_rate": 1.254640833854477e-06, "loss": 0.0037, "step": 1585 }, { "epoch": 2.5376, "grad_norm": 0.08888564725084476, "learning_rate": 1.2462735372353996e-06, "loss": 0.0013, "step": 1586 }, { "epoch": 2.5392, "grad_norm": 0.38680815178140715, "learning_rate": 1.2379323805722575e-06, "loss": 0.0075, "step": 1587 }, { "epoch": 2.5408, "grad_norm": 0.2271389553814667, "learning_rate": 1.2296173887730122e-06, "loss": 0.0034, "step": 1588 }, { "epoch": 2.5423999999999998, "grad_norm": 0.43773514969335675, "learning_rate": 1.2213285866674908e-06, "loss": 0.0049, "step": 1589 }, { "epoch": 2.544, "grad_norm": 0.29155850382909776, "learning_rate": 1.2130659990073146e-06, "loss": 0.0029, "step": 1590 }, { "epoch": 2.5456, "grad_norm": 0.2653316292247236, "learning_rate": 1.2048296504658208e-06, "loss": 0.003, "step": 1591 }, { "epoch": 2.5472, "grad_norm": 0.36255738638210777, "learning_rate": 1.196619565638003e-06, "loss": 0.0033, "step": 1592 }, { "epoch": 2.5488, "grad_norm": 0.28846739314243985, "learning_rate": 1.1884357690404157e-06, "loss": 0.0042, "step": 1593 }, { "epoch": 2.5504, "grad_norm": 0.28643151052329563, "learning_rate": 1.1802782851111206e-06, "loss": 0.0034, "step": 1594 }, { "epoch": 2.552, "grad_norm": 0.3566538575233225, "learning_rate": 1.1721471382096028e-06, "loss": 0.0068, "step": 1595 }, { "epoch": 2.5536, "grad_norm": 0.6056236455845999, "learning_rate": 1.1640423526166987e-06, "loss": 0.0052, "step": 1596 }, { "epoch": 2.5552, "grad_norm": 0.3432641368965244, "learning_rate": 1.1559639525345313e-06, "loss": 0.0034, "step": 1597 }, { "epoch": 2.5568, "grad_norm": 0.314694297610856, "learning_rate": 1.1479119620864277e-06, "loss": 0.0063, "step": 1598 }, { "epoch": 2.5584, "grad_norm": 0.16891172033085503, "learning_rate": 1.1398864053168534e-06, "loss": 0.0021, "step": 1599 }, { "epoch": 2.56, "grad_norm": 0.28250641261356624, "learning_rate": 1.1318873061913405e-06, "loss": 0.004, "step": 1600 }, { "epoch": 2.5616, "grad_norm": 0.3870674371758313, "learning_rate": 1.123914688596409e-06, "loss": 0.0042, "step": 1601 }, { "epoch": 2.5632, "grad_norm": 0.1787916187396734, "learning_rate": 1.1159685763395113e-06, "loss": 0.0021, "step": 1602 }, { "epoch": 2.5648, "grad_norm": 0.09943497923667746, "learning_rate": 1.108048993148939e-06, "loss": 0.0011, "step": 1603 }, { "epoch": 2.5664, "grad_norm": 0.3028880170947417, "learning_rate": 1.1001559626737757e-06, "loss": 0.0032, "step": 1604 }, { "epoch": 2.568, "grad_norm": 0.27090689294158915, "learning_rate": 1.0922895084838036e-06, "loss": 0.0039, "step": 1605 }, { "epoch": 2.5696, "grad_norm": 0.5076518218659594, "learning_rate": 1.0844496540694515e-06, "loss": 0.0056, "step": 1606 }, { "epoch": 2.5712, "grad_norm": 0.5329209114392545, "learning_rate": 1.0766364228417148e-06, "loss": 0.0085, "step": 1607 }, { "epoch": 2.5728, "grad_norm": 0.37775231134155207, "learning_rate": 1.0688498381320855e-06, "loss": 0.0051, "step": 1608 }, { "epoch": 2.5744, "grad_norm": 0.27772476593874784, "learning_rate": 1.0610899231924887e-06, "loss": 0.003, "step": 1609 }, { "epoch": 2.576, "grad_norm": 0.19838487249698816, "learning_rate": 1.0533567011952094e-06, "loss": 0.0027, "step": 1610 }, { "epoch": 2.5776, "grad_norm": 0.25085948073206776, "learning_rate": 1.0456501952328191e-06, "loss": 0.0034, "step": 1611 }, { "epoch": 2.5792, "grad_norm": 0.39908536394921706, "learning_rate": 1.037970428318118e-06, "loss": 0.0041, "step": 1612 }, { "epoch": 2.5808, "grad_norm": 0.6323558270607047, "learning_rate": 1.0303174233840529e-06, "loss": 0.0064, "step": 1613 }, { "epoch": 2.5824, "grad_norm": 0.18981437980393318, "learning_rate": 1.022691203283661e-06, "loss": 0.0029, "step": 1614 }, { "epoch": 2.584, "grad_norm": 0.22420449384561897, "learning_rate": 1.0150917907899926e-06, "loss": 0.0028, "step": 1615 }, { "epoch": 2.5856, "grad_norm": 0.22817326728027199, "learning_rate": 1.0075192085960451e-06, "loss": 0.0023, "step": 1616 }, { "epoch": 2.5872, "grad_norm": 0.24630772747490712, "learning_rate": 9.999734793146998e-07, "loss": 0.0038, "step": 1617 }, { "epoch": 2.5888, "grad_norm": 0.31054400155061074, "learning_rate": 9.924546254786493e-07, "loss": 0.0034, "step": 1618 }, { "epoch": 2.5904, "grad_norm": 0.15441061679808213, "learning_rate": 9.849626695403326e-07, "loss": 0.0029, "step": 1619 }, { "epoch": 2.592, "grad_norm": 0.28853886186684, "learning_rate": 9.77497633871868e-07, "loss": 0.0031, "step": 1620 }, { "epoch": 2.5936, "grad_norm": 0.13359951082571278, "learning_rate": 9.700595407649805e-07, "loss": 0.0013, "step": 1621 }, { "epoch": 2.5952, "grad_norm": 0.25004584541730634, "learning_rate": 9.62648412430951e-07, "loss": 0.004, "step": 1622 }, { "epoch": 2.5968, "grad_norm": 0.39526327537043676, "learning_rate": 9.5526427100053e-07, "loss": 0.0035, "step": 1623 }, { "epoch": 2.5984, "grad_norm": 0.24772152883498377, "learning_rate": 9.479071385238892e-07, "loss": 0.0033, "step": 1624 }, { "epoch": 2.6, "grad_norm": 0.3254513376690307, "learning_rate": 9.40577036970538e-07, "loss": 0.0029, "step": 1625 }, { "epoch": 2.6016, "grad_norm": 0.34472744884953915, "learning_rate": 9.332739882292752e-07, "loss": 0.0035, "step": 1626 }, { "epoch": 2.6032, "grad_norm": 0.37230591882644276, "learning_rate": 9.259980141081115e-07, "loss": 0.0033, "step": 1627 }, { "epoch": 2.6048, "grad_norm": 0.3720842210650768, "learning_rate": 9.187491363342094e-07, "loss": 0.0046, "step": 1628 }, { "epoch": 2.6064, "grad_norm": 0.4182576528710122, "learning_rate": 9.115273765538202e-07, "loss": 0.0089, "step": 1629 }, { "epoch": 2.608, "grad_norm": 0.5352513477811385, "learning_rate": 9.043327563322113e-07, "loss": 0.0076, "step": 1630 }, { "epoch": 2.6096, "grad_norm": 0.316514214802117, "learning_rate": 8.971652971536149e-07, "loss": 0.0046, "step": 1631 }, { "epoch": 2.6112, "grad_norm": 0.23125374471918014, "learning_rate": 8.900250204211513e-07, "loss": 0.0034, "step": 1632 }, { "epoch": 2.6128, "grad_norm": 0.290009493417907, "learning_rate": 8.829119474567672e-07, "loss": 0.0036, "step": 1633 }, { "epoch": 2.6144, "grad_norm": 0.22741470652410545, "learning_rate": 8.758260995011825e-07, "loss": 0.0022, "step": 1634 }, { "epoch": 2.616, "grad_norm": 0.44988819712172906, "learning_rate": 8.687674977138116e-07, "loss": 0.0067, "step": 1635 }, { "epoch": 2.6176, "grad_norm": 0.40645278584597383, "learning_rate": 8.617361631727139e-07, "loss": 0.0051, "step": 1636 }, { "epoch": 2.6192, "grad_norm": 0.39458796436952626, "learning_rate": 8.547321168745192e-07, "loss": 0.004, "step": 1637 }, { "epoch": 2.6208, "grad_norm": 0.21891152487583124, "learning_rate": 8.477553797343729e-07, "loss": 0.0027, "step": 1638 }, { "epoch": 2.6224, "grad_norm": 0.39473639258139465, "learning_rate": 8.40805972585872e-07, "loss": 0.0034, "step": 1639 }, { "epoch": 2.624, "grad_norm": 0.4043861845152941, "learning_rate": 8.338839161809997e-07, "loss": 0.0028, "step": 1640 }, { "epoch": 2.6256, "grad_norm": 0.27864258890756977, "learning_rate": 8.269892311900696e-07, "loss": 0.0031, "step": 1641 }, { "epoch": 2.6272, "grad_norm": 0.26557343737536915, "learning_rate": 8.201219382016556e-07, "loss": 0.0027, "step": 1642 }, { "epoch": 2.6288, "grad_norm": 0.24467885630190223, "learning_rate": 8.132820577225386e-07, "loss": 0.0034, "step": 1643 }, { "epoch": 2.6304, "grad_norm": 0.3069297710916183, "learning_rate": 8.06469610177636e-07, "loss": 0.004, "step": 1644 }, { "epoch": 2.632, "grad_norm": 0.2345896763838344, "learning_rate": 7.996846159099558e-07, "loss": 0.0026, "step": 1645 }, { "epoch": 2.6336, "grad_norm": 0.2687014085198747, "learning_rate": 7.92927095180518e-07, "loss": 0.0023, "step": 1646 }, { "epoch": 2.6352, "grad_norm": 0.2509517187635115, "learning_rate": 7.861970681683051e-07, "loss": 0.0029, "step": 1647 }, { "epoch": 2.6368, "grad_norm": 0.3224143429555162, "learning_rate": 7.794945549701993e-07, "loss": 0.003, "step": 1648 }, { "epoch": 2.6384, "grad_norm": 0.34475283717693306, "learning_rate": 7.728195756009204e-07, "loss": 0.0028, "step": 1649 }, { "epoch": 2.64, "grad_norm": 0.4283927293710006, "learning_rate": 7.661721499929753e-07, "loss": 0.0114, "step": 1650 }, { "epoch": 2.6416, "grad_norm": 0.23359871179380523, "learning_rate": 7.595522979965819e-07, "loss": 0.0024, "step": 1651 }, { "epoch": 2.6432, "grad_norm": 0.28872326379249535, "learning_rate": 7.529600393796232e-07, "loss": 0.0027, "step": 1652 }, { "epoch": 2.6448, "grad_norm": 0.20751657884631444, "learning_rate": 7.463953938275859e-07, "loss": 0.0039, "step": 1653 }, { "epoch": 2.6464, "grad_norm": 0.1548793088215341, "learning_rate": 7.398583809434944e-07, "loss": 0.0017, "step": 1654 }, { "epoch": 2.648, "grad_norm": 0.18302776241554822, "learning_rate": 7.333490202478666e-07, "loss": 0.0029, "step": 1655 }, { "epoch": 2.6496, "grad_norm": 0.2665263815329152, "learning_rate": 7.268673311786378e-07, "loss": 0.003, "step": 1656 }, { "epoch": 2.6512000000000002, "grad_norm": 0.2852673473046866, "learning_rate": 7.204133330911179e-07, "loss": 0.0026, "step": 1657 }, { "epoch": 2.6528, "grad_norm": 0.4359148892423722, "learning_rate": 7.1398704525792e-07, "loss": 0.0042, "step": 1658 }, { "epoch": 2.6544, "grad_norm": 0.22052120673606213, "learning_rate": 7.07588486868922e-07, "loss": 0.0024, "step": 1659 }, { "epoch": 2.656, "grad_norm": 0.1597290951375419, "learning_rate": 7.012176770311863e-07, "loss": 0.002, "step": 1660 }, { "epoch": 2.6576, "grad_norm": 0.17961582222811928, "learning_rate": 6.948746347689184e-07, "loss": 0.0018, "step": 1661 }, { "epoch": 2.6592000000000002, "grad_norm": 0.27243437680930604, "learning_rate": 6.885593790234057e-07, "loss": 0.0019, "step": 1662 }, { "epoch": 2.6608, "grad_norm": 0.18828994729054635, "learning_rate": 6.8227192865296e-07, "loss": 0.0051, "step": 1663 }, { "epoch": 2.6624, "grad_norm": 0.2779927024355395, "learning_rate": 6.760123024328624e-07, "loss": 0.0031, "step": 1664 }, { "epoch": 2.664, "grad_norm": 0.1688514914160169, "learning_rate": 6.697805190553086e-07, "loss": 0.002, "step": 1665 }, { "epoch": 2.6656, "grad_norm": 0.28104834926529215, "learning_rate": 6.635765971293484e-07, "loss": 0.0033, "step": 1666 }, { "epoch": 2.6672000000000002, "grad_norm": 0.3832135221159943, "learning_rate": 6.574005551808338e-07, "loss": 0.0058, "step": 1667 }, { "epoch": 2.6688, "grad_norm": 0.18272617424296622, "learning_rate": 6.512524116523633e-07, "loss": 0.0025, "step": 1668 }, { "epoch": 2.6704, "grad_norm": 0.2412288002713958, "learning_rate": 6.451321849032289e-07, "loss": 0.0026, "step": 1669 }, { "epoch": 2.672, "grad_norm": 0.4097149189369184, "learning_rate": 6.390398932093555e-07, "loss": 0.0043, "step": 1670 }, { "epoch": 2.6736, "grad_norm": 0.27431070134136, "learning_rate": 6.329755547632499e-07, "loss": 0.0033, "step": 1671 }, { "epoch": 2.6752000000000002, "grad_norm": 0.23379952328211157, "learning_rate": 6.269391876739494e-07, "loss": 0.002, "step": 1672 }, { "epoch": 2.6768, "grad_norm": 0.4225055935864826, "learning_rate": 6.209308099669598e-07, "loss": 0.0069, "step": 1673 }, { "epoch": 2.6784, "grad_norm": 0.17709261279424496, "learning_rate": 6.149504395842087e-07, "loss": 0.0016, "step": 1674 }, { "epoch": 2.68, "grad_norm": 0.310095475370071, "learning_rate": 6.089980943839924e-07, "loss": 0.0035, "step": 1675 }, { "epoch": 2.6816, "grad_norm": 0.6092157006990844, "learning_rate": 6.030737921409169e-07, "loss": 0.0043, "step": 1676 }, { "epoch": 2.6832000000000003, "grad_norm": 0.4616010603664959, "learning_rate": 5.971775505458444e-07, "loss": 0.005, "step": 1677 }, { "epoch": 2.6848, "grad_norm": 0.3561329460517029, "learning_rate": 5.913093872058528e-07, "loss": 0.0053, "step": 1678 }, { "epoch": 2.6864, "grad_norm": 0.2757271718152053, "learning_rate": 5.854693196441641e-07, "loss": 0.0034, "step": 1679 }, { "epoch": 2.6879999999999997, "grad_norm": 0.3106683728908216, "learning_rate": 5.796573653001091e-07, "loss": 0.0039, "step": 1680 }, { "epoch": 2.6896, "grad_norm": 0.3744773202118949, "learning_rate": 5.738735415290642e-07, "loss": 0.0037, "step": 1681 }, { "epoch": 2.6912000000000003, "grad_norm": 0.42079205797207647, "learning_rate": 5.681178656024055e-07, "loss": 0.0043, "step": 1682 }, { "epoch": 2.6928, "grad_norm": 0.22350666390277044, "learning_rate": 5.62390354707455e-07, "loss": 0.0022, "step": 1683 }, { "epoch": 2.6944, "grad_norm": 0.2899058545688104, "learning_rate": 5.56691025947429e-07, "loss": 0.0027, "step": 1684 }, { "epoch": 2.6959999999999997, "grad_norm": 0.2199050514310044, "learning_rate": 5.510198963413882e-07, "loss": 0.0033, "step": 1685 }, { "epoch": 2.6976, "grad_norm": 0.23732426830995915, "learning_rate": 5.453769828241872e-07, "loss": 0.0025, "step": 1686 }, { "epoch": 2.6992000000000003, "grad_norm": 0.4124371839877955, "learning_rate": 5.397623022464227e-07, "loss": 0.004, "step": 1687 }, { "epoch": 2.7008, "grad_norm": 0.334224332684429, "learning_rate": 5.341758713743828e-07, "loss": 0.004, "step": 1688 }, { "epoch": 2.7024, "grad_norm": 0.420696843510133, "learning_rate": 5.286177068899989e-07, "loss": 0.0047, "step": 1689 }, { "epoch": 2.7039999999999997, "grad_norm": 0.4228631696533493, "learning_rate": 5.230878253907911e-07, "loss": 0.0056, "step": 1690 }, { "epoch": 2.7056, "grad_norm": 0.2897142564004242, "learning_rate": 5.175862433898282e-07, "loss": 0.004, "step": 1691 }, { "epoch": 2.7072000000000003, "grad_norm": 0.3487682996311858, "learning_rate": 5.121129773156663e-07, "loss": 0.0048, "step": 1692 }, { "epoch": 2.7088, "grad_norm": 0.13490441372806583, "learning_rate": 5.066680435123106e-07, "loss": 0.0019, "step": 1693 }, { "epoch": 2.7104, "grad_norm": 0.15332938659980572, "learning_rate": 5.012514582391592e-07, "loss": 0.0017, "step": 1694 }, { "epoch": 2.7119999999999997, "grad_norm": 0.47657162395392066, "learning_rate": 4.95863237670956e-07, "loss": 0.0045, "step": 1695 }, { "epoch": 2.7136, "grad_norm": 0.3755987585052613, "learning_rate": 4.905033978977492e-07, "loss": 0.0045, "step": 1696 }, { "epoch": 2.7152, "grad_norm": 0.34581872345178366, "learning_rate": 4.851719549248301e-07, "loss": 0.0033, "step": 1697 }, { "epoch": 2.7168, "grad_norm": 0.3155072867452202, "learning_rate": 4.798689246727006e-07, "loss": 0.0033, "step": 1698 }, { "epoch": 2.7184, "grad_norm": 0.3187100892007427, "learning_rate": 4.7459432297701224e-07, "loss": 0.0059, "step": 1699 }, { "epoch": 2.7199999999999998, "grad_norm": 0.2505391599969973, "learning_rate": 4.693481655885257e-07, "loss": 0.0026, "step": 1700 }, { "epoch": 2.7216, "grad_norm": 0.2971684776745727, "learning_rate": 4.6413046817306404e-07, "loss": 0.0022, "step": 1701 }, { "epoch": 2.7232, "grad_norm": 0.4266222875524458, "learning_rate": 4.58941246311464e-07, "loss": 0.0042, "step": 1702 }, { "epoch": 2.7248, "grad_norm": 0.2588915506106927, "learning_rate": 4.5378051549952783e-07, "loss": 0.003, "step": 1703 }, { "epoch": 2.7264, "grad_norm": 0.29728353306995725, "learning_rate": 4.4864829114798394e-07, "loss": 0.003, "step": 1704 }, { "epoch": 2.7279999999999998, "grad_norm": 0.35270506873865676, "learning_rate": 4.4354458858242857e-07, "loss": 0.0042, "step": 1705 }, { "epoch": 2.7296, "grad_norm": 0.35164139940299416, "learning_rate": 4.384694230432984e-07, "loss": 0.005, "step": 1706 }, { "epoch": 2.7312, "grad_norm": 0.303702081177274, "learning_rate": 4.3342280968580287e-07, "loss": 0.0043, "step": 1707 }, { "epoch": 2.7328, "grad_norm": 0.4782687537873158, "learning_rate": 4.2840476357989825e-07, "loss": 0.0062, "step": 1708 }, { "epoch": 2.7344, "grad_norm": 0.22788466574501093, "learning_rate": 4.2341529971023253e-07, "loss": 0.0025, "step": 1709 }, { "epoch": 2.7359999999999998, "grad_norm": 0.3466322894758412, "learning_rate": 4.184544329761009e-07, "loss": 0.0036, "step": 1710 }, { "epoch": 2.7376, "grad_norm": 0.1948126746096502, "learning_rate": 4.1352217819140337e-07, "loss": 0.0023, "step": 1711 }, { "epoch": 2.7392, "grad_norm": 0.10559120452233281, "learning_rate": 4.0861855008460403e-07, "loss": 0.0013, "step": 1712 }, { "epoch": 2.7408, "grad_norm": 0.2509838023826191, "learning_rate": 4.037435632986786e-07, "loss": 0.0027, "step": 1713 }, { "epoch": 2.7424, "grad_norm": 0.432675046135986, "learning_rate": 3.988972323910778e-07, "loss": 0.0092, "step": 1714 }, { "epoch": 2.7439999999999998, "grad_norm": 0.4779228597178864, "learning_rate": 3.9407957183368093e-07, "loss": 0.0109, "step": 1715 }, { "epoch": 2.7456, "grad_norm": 0.1786314616239119, "learning_rate": 3.8929059601275463e-07, "loss": 0.0029, "step": 1716 }, { "epoch": 2.7472, "grad_norm": 0.31884951510899023, "learning_rate": 3.845303192289074e-07, "loss": 0.0035, "step": 1717 }, { "epoch": 2.7488, "grad_norm": 0.2333984870236776, "learning_rate": 3.797987556970495e-07, "loss": 0.0068, "step": 1718 }, { "epoch": 2.7504, "grad_norm": 0.22299594984545076, "learning_rate": 3.750959195463466e-07, "loss": 0.0028, "step": 1719 }, { "epoch": 2.752, "grad_norm": 0.3939060564745656, "learning_rate": 3.7042182482018074e-07, "loss": 0.0049, "step": 1720 }, { "epoch": 2.7536, "grad_norm": 0.47879957382724625, "learning_rate": 3.6577648547611033e-07, "loss": 0.0033, "step": 1721 }, { "epoch": 2.7552, "grad_norm": 0.24671324801670064, "learning_rate": 3.611599153858214e-07, "loss": 0.0029, "step": 1722 }, { "epoch": 2.7568, "grad_norm": 0.24115774832630246, "learning_rate": 3.5657212833509313e-07, "loss": 0.0023, "step": 1723 }, { "epoch": 2.7584, "grad_norm": 0.20611777074711515, "learning_rate": 3.520131380237546e-07, "loss": 0.0017, "step": 1724 }, { "epoch": 2.76, "grad_norm": 0.2860716910321781, "learning_rate": 3.474829580656436e-07, "loss": 0.0032, "step": 1725 }, { "epoch": 2.7616, "grad_norm": 0.6103961839691567, "learning_rate": 3.429816019885657e-07, "loss": 0.0068, "step": 1726 }, { "epoch": 2.7632, "grad_norm": 0.18070944610210354, "learning_rate": 3.385090832342497e-07, "loss": 0.0015, "step": 1727 }, { "epoch": 2.7648, "grad_norm": 0.3073672440385467, "learning_rate": 3.3406541515832e-07, "loss": 0.0031, "step": 1728 }, { "epoch": 2.7664, "grad_norm": 0.23741522353964717, "learning_rate": 3.296506110302422e-07, "loss": 0.0026, "step": 1729 }, { "epoch": 2.768, "grad_norm": 0.29542755115257086, "learning_rate": 3.252646840332918e-07, "loss": 0.0037, "step": 1730 }, { "epoch": 2.7696, "grad_norm": 0.183430451355198, "learning_rate": 3.209076472645112e-07, "loss": 0.0018, "step": 1731 }, { "epoch": 2.7712, "grad_norm": 0.3426702018465107, "learning_rate": 3.16579513734675e-07, "loss": 0.003, "step": 1732 }, { "epoch": 2.7728, "grad_norm": 0.505331823250815, "learning_rate": 3.1228029636824477e-07, "loss": 0.0048, "step": 1733 }, { "epoch": 2.7744, "grad_norm": 0.19787976338126043, "learning_rate": 3.080100080033388e-07, "loss": 0.002, "step": 1734 }, { "epoch": 2.776, "grad_norm": 0.6953127204583504, "learning_rate": 3.037686613916857e-07, "loss": 0.0093, "step": 1735 }, { "epoch": 2.7776, "grad_norm": 0.32483452624924114, "learning_rate": 2.995562691985898e-07, "loss": 0.0028, "step": 1736 }, { "epoch": 2.7792, "grad_norm": 0.3534645641928358, "learning_rate": 2.9537284400289354e-07, "loss": 0.0033, "step": 1737 }, { "epoch": 2.7808, "grad_norm": 0.3465996865866739, "learning_rate": 2.9121839829693857e-07, "loss": 0.0067, "step": 1738 }, { "epoch": 2.7824, "grad_norm": 0.3650640452288819, "learning_rate": 2.8709294448653223e-07, "loss": 0.0041, "step": 1739 }, { "epoch": 2.784, "grad_norm": 0.27385479392048906, "learning_rate": 2.829964948909048e-07, "loss": 0.0037, "step": 1740 }, { "epoch": 2.7856, "grad_norm": 0.23809407069874067, "learning_rate": 2.7892906174267653e-07, "loss": 0.002, "step": 1741 }, { "epoch": 2.7872, "grad_norm": 0.20794968591921323, "learning_rate": 2.748906571878207e-07, "loss": 0.0024, "step": 1742 }, { "epoch": 2.7888, "grad_norm": 0.5666314290271737, "learning_rate": 2.708812932856253e-07, "loss": 0.0041, "step": 1743 }, { "epoch": 2.7904, "grad_norm": 0.4194997875269323, "learning_rate": 2.6690098200866097e-07, "loss": 0.0055, "step": 1744 }, { "epoch": 2.792, "grad_norm": 0.2758212918383284, "learning_rate": 2.6294973524274127e-07, "loss": 0.0028, "step": 1745 }, { "epoch": 2.7936, "grad_norm": 0.3627580297339781, "learning_rate": 2.5902756478688674e-07, "loss": 0.0051, "step": 1746 }, { "epoch": 2.7952, "grad_norm": 0.3392983160782683, "learning_rate": 2.551344823532964e-07, "loss": 0.0036, "step": 1747 }, { "epoch": 2.7968, "grad_norm": 0.4412941334882733, "learning_rate": 2.5127049956730207e-07, "loss": 0.009, "step": 1748 }, { "epoch": 2.7984, "grad_norm": 0.28087146012137015, "learning_rate": 2.474356279673462e-07, "loss": 0.003, "step": 1749 }, { "epoch": 2.8, "grad_norm": 0.23433563341090205, "learning_rate": 2.436298790049363e-07, "loss": 0.0022, "step": 1750 }, { "epoch": 2.8016, "grad_norm": 0.38528540201664624, "learning_rate": 2.398532640446161e-07, "loss": 0.0065, "step": 1751 }, { "epoch": 2.8032, "grad_norm": 0.3196802362593276, "learning_rate": 2.3610579436392999e-07, "loss": 0.004, "step": 1752 }, { "epoch": 2.8048, "grad_norm": 0.35865307250947054, "learning_rate": 2.3238748115339327e-07, "loss": 0.0035, "step": 1753 }, { "epoch": 2.8064, "grad_norm": 0.2520640853349629, "learning_rate": 2.2869833551645293e-07, "loss": 0.0024, "step": 1754 }, { "epoch": 2.808, "grad_norm": 0.31570000493873085, "learning_rate": 2.2503836846945792e-07, "loss": 0.0032, "step": 1755 }, { "epoch": 2.8096, "grad_norm": 0.2822847318578194, "learning_rate": 2.2140759094162468e-07, "loss": 0.0036, "step": 1756 }, { "epoch": 2.8112, "grad_norm": 0.2574367021470435, "learning_rate": 2.178060137750071e-07, "loss": 0.0038, "step": 1757 }, { "epoch": 2.8128, "grad_norm": 0.3104942552175079, "learning_rate": 2.1423364772445886e-07, "loss": 0.0033, "step": 1758 }, { "epoch": 2.8144, "grad_norm": 0.19123508252467655, "learning_rate": 2.106905034576112e-07, "loss": 0.0027, "step": 1759 }, { "epoch": 2.816, "grad_norm": 0.31883454012133566, "learning_rate": 2.071765915548274e-07, "loss": 0.0045, "step": 1760 }, { "epoch": 2.8176, "grad_norm": 0.1846107453424149, "learning_rate": 2.036919225091827e-07, "loss": 0.0031, "step": 1761 }, { "epoch": 2.8192, "grad_norm": 0.25595674556604037, "learning_rate": 2.002365067264289e-07, "loss": 0.0026, "step": 1762 }, { "epoch": 2.8208, "grad_norm": 0.2890342090918847, "learning_rate": 1.9681035452496112e-07, "loss": 0.0042, "step": 1763 }, { "epoch": 2.8224, "grad_norm": 0.23682918205933798, "learning_rate": 1.9341347613579086e-07, "loss": 0.0039, "step": 1764 }, { "epoch": 2.824, "grad_norm": 0.3958425374941724, "learning_rate": 1.900458817025097e-07, "loss": 0.0034, "step": 1765 }, { "epoch": 2.8256, "grad_norm": 0.16606697252502298, "learning_rate": 1.867075812812691e-07, "loss": 0.0019, "step": 1766 }, { "epoch": 2.8272, "grad_norm": 0.24363351954827045, "learning_rate": 1.8339858484073935e-07, "loss": 0.002, "step": 1767 }, { "epoch": 2.8288, "grad_norm": 0.44218824058715006, "learning_rate": 1.8011890226208527e-07, "loss": 0.0055, "step": 1768 }, { "epoch": 2.8304, "grad_norm": 0.5250979322757067, "learning_rate": 1.7686854333893833e-07, "loss": 0.0073, "step": 1769 }, { "epoch": 2.832, "grad_norm": 0.3967675739130118, "learning_rate": 1.7364751777736334e-07, "loss": 0.0043, "step": 1770 }, { "epoch": 2.8336, "grad_norm": 0.235277710668951, "learning_rate": 1.7045583519583075e-07, "loss": 0.0023, "step": 1771 }, { "epoch": 2.8352, "grad_norm": 0.24237221288449026, "learning_rate": 1.6729350512519006e-07, "loss": 0.0019, "step": 1772 }, { "epoch": 2.8368, "grad_norm": 0.21388220559863358, "learning_rate": 1.6416053700863965e-07, "loss": 0.0026, "step": 1773 }, { "epoch": 2.8384, "grad_norm": 0.22403363096859452, "learning_rate": 1.6105694020169594e-07, "loss": 0.003, "step": 1774 }, { "epoch": 2.84, "grad_norm": 0.2578079621489086, "learning_rate": 1.5798272397217097e-07, "loss": 0.0025, "step": 1775 }, { "epoch": 2.8416, "grad_norm": 0.22062263793996525, "learning_rate": 1.5493789750014032e-07, "loss": 0.0019, "step": 1776 }, { "epoch": 2.8432, "grad_norm": 0.3146290994127335, "learning_rate": 1.519224698779198e-07, "loss": 0.004, "step": 1777 }, { "epoch": 2.8448, "grad_norm": 0.18928296544669773, "learning_rate": 1.489364501100332e-07, "loss": 0.0017, "step": 1778 }, { "epoch": 2.8464, "grad_norm": 0.45541486559528266, "learning_rate": 1.459798471131868e-07, "loss": 0.0095, "step": 1779 }, { "epoch": 2.848, "grad_norm": 0.3656356148384636, "learning_rate": 1.430526697162482e-07, "loss": 0.0036, "step": 1780 }, { "epoch": 2.8496, "grad_norm": 0.29965165814152733, "learning_rate": 1.4015492666021313e-07, "loss": 0.0058, "step": 1781 }, { "epoch": 2.8512, "grad_norm": 0.14044524917062876, "learning_rate": 1.3728662659818205e-07, "loss": 0.0013, "step": 1782 }, { "epoch": 2.8528000000000002, "grad_norm": 0.3106923022270456, "learning_rate": 1.344477780953346e-07, "loss": 0.0025, "step": 1783 }, { "epoch": 2.8544, "grad_norm": 0.4012983707274509, "learning_rate": 1.3163838962890196e-07, "loss": 0.0035, "step": 1784 }, { "epoch": 2.856, "grad_norm": 0.23270638672389216, "learning_rate": 1.2885846958814673e-07, "loss": 0.0027, "step": 1785 }, { "epoch": 2.8576, "grad_norm": 0.39470098855170954, "learning_rate": 1.2610802627432972e-07, "loss": 0.0051, "step": 1786 }, { "epoch": 2.8592, "grad_norm": 0.2895920263603846, "learning_rate": 1.2338706790069433e-07, "loss": 0.0041, "step": 1787 }, { "epoch": 2.8608000000000002, "grad_norm": 0.455922258530345, "learning_rate": 1.206956025924333e-07, "loss": 0.0078, "step": 1788 }, { "epoch": 2.8624, "grad_norm": 0.38409334305045495, "learning_rate": 1.1803363838667092e-07, "loss": 0.0031, "step": 1789 }, { "epoch": 2.864, "grad_norm": 0.3160169405210605, "learning_rate": 1.1540118323243866e-07, "loss": 0.0032, "step": 1790 }, { "epoch": 2.8656, "grad_norm": 0.33398026700734806, "learning_rate": 1.1279824499064396e-07, "loss": 0.0052, "step": 1791 }, { "epoch": 2.8672, "grad_norm": 0.31053508960210263, "learning_rate": 1.1022483143405705e-07, "loss": 0.0041, "step": 1792 }, { "epoch": 2.8688000000000002, "grad_norm": 0.2165345269428243, "learning_rate": 1.0768095024728309e-07, "loss": 0.0023, "step": 1793 }, { "epoch": 2.8704, "grad_norm": 0.294364409308341, "learning_rate": 1.0516660902673448e-07, "loss": 0.0026, "step": 1794 }, { "epoch": 2.872, "grad_norm": 0.31868904443652213, "learning_rate": 1.0268181528061749e-07, "loss": 0.0036, "step": 1795 }, { "epoch": 2.8736, "grad_norm": 0.27411807331233073, "learning_rate": 1.0022657642890232e-07, "loss": 0.0021, "step": 1796 }, { "epoch": 2.8752, "grad_norm": 0.22469651822799042, "learning_rate": 9.780089980330643e-08, "loss": 0.002, "step": 1797 }, { "epoch": 2.8768000000000002, "grad_norm": 0.5504478236454433, "learning_rate": 9.540479264726676e-08, "loss": 0.0065, "step": 1798 }, { "epoch": 2.8784, "grad_norm": 0.1794435496363775, "learning_rate": 9.303826211592316e-08, "loss": 0.0016, "step": 1799 }, { "epoch": 2.88, "grad_norm": 0.2654843469205127, "learning_rate": 9.070131527609604e-08, "loss": 0.0037, "step": 1800 }, { "epoch": 2.8816, "grad_norm": 0.24419522322704548, "learning_rate": 8.839395910626214e-08, "loss": 0.0025, "step": 1801 }, { "epoch": 2.8832, "grad_norm": 0.22986470580386867, "learning_rate": 8.61162004965388e-08, "loss": 0.0023, "step": 1802 }, { "epoch": 2.8848000000000003, "grad_norm": 0.16419135289067946, "learning_rate": 8.386804624865851e-08, "loss": 0.0017, "step": 1803 }, { "epoch": 2.8864, "grad_norm": 0.15006603379648195, "learning_rate": 8.16495030759501e-08, "loss": 0.002, "step": 1804 }, { "epoch": 2.888, "grad_norm": 0.2881652846675583, "learning_rate": 7.946057760332193e-08, "loss": 0.0037, "step": 1805 }, { "epoch": 2.8895999999999997, "grad_norm": 0.4238189600001012, "learning_rate": 7.730127636723539e-08, "loss": 0.008, "step": 1806 }, { "epoch": 2.8912, "grad_norm": 0.5138563588810117, "learning_rate": 7.517160581569371e-08, "loss": 0.0051, "step": 1807 }, { "epoch": 2.8928000000000003, "grad_norm": 0.5308687748429286, "learning_rate": 7.307157230821426e-08, "loss": 0.0081, "step": 1808 }, { "epoch": 2.8944, "grad_norm": 0.5921440444783744, "learning_rate": 7.100118211581852e-08, "loss": 0.0049, "step": 1809 }, { "epoch": 2.896, "grad_norm": 0.31771549916104036, "learning_rate": 6.896044142100433e-08, "loss": 0.0039, "step": 1810 }, { "epoch": 2.8975999999999997, "grad_norm": 0.3052563999741175, "learning_rate": 6.694935631773259e-08, "loss": 0.0035, "step": 1811 }, { "epoch": 2.8992, "grad_norm": 0.3725902993944454, "learning_rate": 6.496793281141056e-08, "loss": 0.0063, "step": 1812 }, { "epoch": 2.9008000000000003, "grad_norm": 0.3102151511513018, "learning_rate": 6.301617681886863e-08, "loss": 0.0038, "step": 1813 }, { "epoch": 2.9024, "grad_norm": 0.2182676130139072, "learning_rate": 6.109409416834689e-08, "loss": 0.0023, "step": 1814 }, { "epoch": 2.904, "grad_norm": 0.260922445485721, "learning_rate": 5.920169059947412e-08, "loss": 0.0028, "step": 1815 }, { "epoch": 2.9055999999999997, "grad_norm": 0.27559308235578495, "learning_rate": 5.7338971763256646e-08, "loss": 0.0038, "step": 1816 }, { "epoch": 2.9072, "grad_norm": 0.35145438615648, "learning_rate": 5.5505943222055046e-08, "loss": 0.005, "step": 1817 }, { "epoch": 2.9088000000000003, "grad_norm": 0.35854604442106486, "learning_rate": 5.37026104495697e-08, "loss": 0.0042, "step": 1818 }, { "epoch": 2.9104, "grad_norm": 0.37939974541440513, "learning_rate": 5.192897883082748e-08, "loss": 0.0044, "step": 1819 }, { "epoch": 2.912, "grad_norm": 0.21722654227864255, "learning_rate": 5.0185053662161756e-08, "loss": 0.0025, "step": 1820 }, { "epoch": 2.9135999999999997, "grad_norm": 0.35596655693883184, "learning_rate": 4.8470840151195745e-08, "loss": 0.0049, "step": 1821 }, { "epoch": 2.9152, "grad_norm": 0.25434248568517853, "learning_rate": 4.678634341683252e-08, "loss": 0.0025, "step": 1822 }, { "epoch": 2.9168, "grad_norm": 0.24723308450126122, "learning_rate": 4.513156848923616e-08, "loss": 0.0018, "step": 1823 }, { "epoch": 2.9184, "grad_norm": 0.3699984409531007, "learning_rate": 4.350652030981395e-08, "loss": 0.0046, "step": 1824 }, { "epoch": 2.92, "grad_norm": 0.8110138980784226, "learning_rate": 4.19112037312075e-08, "loss": 0.0049, "step": 1825 }, { "epoch": 2.9215999999999998, "grad_norm": 0.4837641134180625, "learning_rate": 4.0345623517273894e-08, "loss": 0.0055, "step": 1826 }, { "epoch": 2.9232, "grad_norm": 0.21993881647963598, "learning_rate": 3.8809784343072364e-08, "loss": 0.0024, "step": 1827 }, { "epoch": 2.9248, "grad_norm": 0.36956005696857025, "learning_rate": 3.7303690794854296e-08, "loss": 0.0027, "step": 1828 }, { "epoch": 2.9264, "grad_norm": 0.2819053942346473, "learning_rate": 3.582734737004101e-08, "loss": 0.0042, "step": 1829 }, { "epoch": 2.928, "grad_norm": 0.2761653957431238, "learning_rate": 3.438075847721933e-08, "loss": 0.0037, "step": 1830 }, { "epoch": 2.9295999999999998, "grad_norm": 0.4670080908820644, "learning_rate": 3.2963928436122726e-08, "loss": 0.008, "step": 1831 }, { "epoch": 2.9312, "grad_norm": 0.2297603417125354, "learning_rate": 3.157686147762129e-08, "loss": 0.0045, "step": 1832 }, { "epoch": 2.9328, "grad_norm": 0.38004580779980107, "learning_rate": 3.0219561743707326e-08, "loss": 0.0027, "step": 1833 }, { "epoch": 2.9344, "grad_norm": 0.3030335150073986, "learning_rate": 2.8892033287484245e-08, "loss": 0.0025, "step": 1834 }, { "epoch": 2.936, "grad_norm": 0.28940707302564167, "learning_rate": 2.7594280073152123e-08, "loss": 0.0051, "step": 1835 }, { "epoch": 2.9375999999999998, "grad_norm": 0.2527949134917213, "learning_rate": 2.6326305976001054e-08, "loss": 0.0031, "step": 1836 }, { "epoch": 2.9392, "grad_norm": 0.4091815899064453, "learning_rate": 2.508811478239226e-08, "loss": 0.005, "step": 1837 }, { "epoch": 2.9408, "grad_norm": 0.28684768340282324, "learning_rate": 2.3879710189753657e-08, "loss": 0.0031, "step": 1838 }, { "epoch": 2.9424, "grad_norm": 0.3642277160058468, "learning_rate": 2.2701095806565432e-08, "loss": 0.0049, "step": 1839 }, { "epoch": 2.944, "grad_norm": 0.28489682761878427, "learning_rate": 2.1552275152346702e-08, "loss": 0.0026, "step": 1840 }, { "epoch": 2.9455999999999998, "grad_norm": 0.3919995396630444, "learning_rate": 2.0433251657653307e-08, "loss": 0.0077, "step": 1841 }, { "epoch": 2.9472, "grad_norm": 0.2654161369122685, "learning_rate": 1.9344028664056715e-08, "loss": 0.0026, "step": 1842 }, { "epoch": 2.9488, "grad_norm": 0.11667868222023793, "learning_rate": 1.8284609424142897e-08, "loss": 0.0012, "step": 1843 }, { "epoch": 2.9504, "grad_norm": 0.2708149125559501, "learning_rate": 1.7254997101500137e-08, "loss": 0.0058, "step": 1844 }, { "epoch": 2.952, "grad_norm": 0.22510298505002913, "learning_rate": 1.6255194770704586e-08, "loss": 0.0023, "step": 1845 }, { "epoch": 2.9536, "grad_norm": 0.47317677070689074, "learning_rate": 1.528520541731915e-08, "loss": 0.0109, "step": 1846 }, { "epoch": 2.9552, "grad_norm": 0.3557026006553359, "learning_rate": 1.4345031937879061e-08, "loss": 0.0032, "step": 1847 }, { "epoch": 2.9568, "grad_norm": 0.23021052365253492, "learning_rate": 1.3434677139885222e-08, "loss": 0.003, "step": 1848 }, { "epoch": 2.9584, "grad_norm": 0.35060407682918143, "learning_rate": 1.2554143741795311e-08, "loss": 0.0047, "step": 1849 }, { "epoch": 2.96, "grad_norm": 0.21884835184223417, "learning_rate": 1.170343437301491e-08, "loss": 0.0025, "step": 1850 }, { "epoch": 2.9616, "grad_norm": 0.29393714548985944, "learning_rate": 1.0882551573891953e-08, "loss": 0.0072, "step": 1851 }, { "epoch": 2.9632, "grad_norm": 0.422476275688548, "learning_rate": 1.0091497795706728e-08, "loss": 0.005, "step": 1852 }, { "epoch": 2.9648, "grad_norm": 0.23338307459392, "learning_rate": 9.330275400666334e-09, "loss": 0.0031, "step": 1853 }, { "epoch": 2.9664, "grad_norm": 0.2891855387233942, "learning_rate": 8.59888666189579e-09, "loss": 0.0038, "step": 1854 }, { "epoch": 2.968, "grad_norm": 0.41158431954259306, "learning_rate": 7.897333763433601e-09, "loss": 0.0067, "step": 1855 }, { "epoch": 2.9696, "grad_norm": 0.4170226848139635, "learning_rate": 7.225618800222878e-09, "loss": 0.0028, "step": 1856 }, { "epoch": 2.9712, "grad_norm": 0.20532199418262084, "learning_rate": 6.583743778106888e-09, "loss": 0.0025, "step": 1857 }, { "epoch": 2.9728, "grad_norm": 0.4447779548881662, "learning_rate": 5.971710613821291e-09, "loss": 0.0048, "step": 1858 }, { "epoch": 2.9744, "grad_norm": 0.31124710271313133, "learning_rate": 5.3895211349896946e-09, "loss": 0.0027, "step": 1859 }, { "epoch": 2.976, "grad_norm": 0.342272098849646, "learning_rate": 4.837177080119215e-09, "loss": 0.005, "step": 1860 }, { "epoch": 2.9776, "grad_norm": 0.32407596246883896, "learning_rate": 4.314680098592705e-09, "loss": 0.0037, "step": 1861 }, { "epoch": 2.9792, "grad_norm": 0.2816315966016677, "learning_rate": 3.8220317506654226e-09, "loss": 0.0034, "step": 1862 }, { "epoch": 2.9808, "grad_norm": 0.2839306857760757, "learning_rate": 3.3592335074594805e-09, "loss": 0.003, "step": 1863 }, { "epoch": 2.9824, "grad_norm": 0.2245985260963881, "learning_rate": 2.9262867509605164e-09, "loss": 0.0028, "step": 1864 }, { "epoch": 2.984, "grad_norm": 0.17332393727737602, "learning_rate": 2.5231927740154705e-09, "loss": 0.0019, "step": 1865 }, { "epoch": 2.9856, "grad_norm": 0.30104904584694364, "learning_rate": 2.149952780321485e-09, "loss": 0.0026, "step": 1866 }, { "epoch": 2.9872, "grad_norm": 0.26045479648076253, "learning_rate": 1.8065678844314538e-09, "loss": 0.0026, "step": 1867 }, { "epoch": 2.9888, "grad_norm": 0.2864632662273618, "learning_rate": 1.4930391117451427e-09, "loss": 0.0033, "step": 1868 }, { "epoch": 2.9904, "grad_norm": 0.1659935085129691, "learning_rate": 1.209367398504746e-09, "loss": 0.0018, "step": 1869 }, { "epoch": 2.992, "grad_norm": 0.26353924511099674, "learning_rate": 9.555535917993297e-10, "loss": 0.0028, "step": 1870 }, { "epoch": 2.9936, "grad_norm": 0.21139113799467085, "learning_rate": 7.315984495548378e-10, "loss": 0.002, "step": 1871 }, { "epoch": 2.9952, "grad_norm": 0.4251269053335163, "learning_rate": 5.375026405352035e-10, "loss": 0.0039, "step": 1872 }, { "epoch": 2.9968, "grad_norm": 0.3840411035653194, "learning_rate": 3.732667443390181e-10, "loss": 0.0036, "step": 1873 }, { "epoch": 2.9984, "grad_norm": 0.3308770694702444, "learning_rate": 2.388912514017516e-10, "loss": 0.0037, "step": 1874 }, { "epoch": 3.0, "grad_norm": 0.38326237286825415, "learning_rate": 1.3437656298687096e-10, "loss": 0.0038, "step": 1875 }, { "epoch": 3.0, "step": 1875, "total_flos": 75543577763840.0, "train_loss": 0.05507048086337745, "train_runtime": 6924.8564, "train_samples_per_second": 4.332, "train_steps_per_second": 0.271 } ], "logging_steps": 1.0, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000, "total_flos": 75543577763840.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }