diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4838 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1370, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00291970802919708, + "grad_norm": 4.875, + "learning_rate": 7.246376811594204e-08, + "loss": 1.320786714553833, + "step": 2 + }, + { + "epoch": 0.00583941605839416, + "grad_norm": 26.375, + "learning_rate": 2.173913043478261e-07, + "loss": 2.3353517055511475, + "step": 4 + }, + { + "epoch": 0.008759124087591242, + "grad_norm": 5.125, + "learning_rate": 3.623188405797102e-07, + "loss": 1.9446890354156494, + "step": 6 + }, + { + "epoch": 0.01167883211678832, + "grad_norm": 2.234375, + "learning_rate": 5.072463768115942e-07, + "loss": 1.6843594312667847, + "step": 8 + }, + { + "epoch": 0.014598540145985401, + "grad_norm": 8.8125, + "learning_rate": 6.521739130434783e-07, + "loss": 1.8062303066253662, + "step": 10 + }, + { + "epoch": 0.017518248175182483, + "grad_norm": 5.0, + "learning_rate": 7.971014492753623e-07, + "loss": 1.9280399084091187, + "step": 12 + }, + { + "epoch": 0.020437956204379562, + "grad_norm": 3.015625, + "learning_rate": 9.420289855072465e-07, + "loss": 1.570988655090332, + "step": 14 + }, + { + "epoch": 0.02335766423357664, + "grad_norm": 11.25, + "learning_rate": 1.0869565217391306e-06, + "loss": 1.7710015773773193, + "step": 16 + }, + { + "epoch": 0.026277372262773723, + "grad_norm": 4.53125, + "learning_rate": 1.2318840579710147e-06, + "loss": 1.9166163206100464, + "step": 18 + }, + { + "epoch": 0.029197080291970802, + "grad_norm": 23.5, + "learning_rate": 1.3768115942028987e-06, + "loss": 1.9079008102416992, + "step": 20 + }, + { + "epoch": 0.032116788321167884, + "grad_norm": 6.15625, + "learning_rate": 1.521739130434783e-06, + "loss": 1.9891327619552612, + "step": 22 + }, + { + "epoch": 0.035036496350364967, + "grad_norm": 8.6875, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.8731980323791504, + "step": 24 + }, + { + "epoch": 0.03795620437956204, + "grad_norm": 41.5, + "learning_rate": 1.8115942028985508e-06, + "loss": 1.996793508529663, + "step": 26 + }, + { + "epoch": 0.040875912408759124, + "grad_norm": 16.125, + "learning_rate": 1.956521739130435e-06, + "loss": 2.4439406394958496, + "step": 28 + }, + { + "epoch": 0.043795620437956206, + "grad_norm": 4.78125, + "learning_rate": 2.101449275362319e-06, + "loss": 1.4941191673278809, + "step": 30 + }, + { + "epoch": 0.04671532846715328, + "grad_norm": 5.71875, + "learning_rate": 2.246376811594203e-06, + "loss": 1.9384567737579346, + "step": 32 + }, + { + "epoch": 0.049635036496350364, + "grad_norm": 3.140625, + "learning_rate": 2.391304347826087e-06, + "loss": 2.106153964996338, + "step": 34 + }, + { + "epoch": 0.052554744525547446, + "grad_norm": 25.875, + "learning_rate": 2.5362318840579714e-06, + "loss": 2.235496997833252, + "step": 36 + }, + { + "epoch": 0.05547445255474453, + "grad_norm": 6.46875, + "learning_rate": 2.6811594202898555e-06, + "loss": 2.4106810092926025, + "step": 38 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 4.375, + "learning_rate": 2.8260869565217393e-06, + "loss": 1.6466758251190186, + "step": 40 + }, + { + "epoch": 0.061313868613138686, + "grad_norm": 95.5, + "learning_rate": 2.9710144927536235e-06, + "loss": 1.9993230104446411, + "step": 42 + }, + { + "epoch": 0.06423357664233577, + "grad_norm": 3.953125, + "learning_rate": 3.1159420289855073e-06, + "loss": 1.7203528881072998, + "step": 44 + }, + { + "epoch": 0.06715328467153285, + "grad_norm": 13.5625, + "learning_rate": 3.2608695652173914e-06, + "loss": 2.5018796920776367, + "step": 46 + }, + { + "epoch": 0.07007299270072993, + "grad_norm": 12.6875, + "learning_rate": 3.4057971014492756e-06, + "loss": 1.935620903968811, + "step": 48 + }, + { + "epoch": 0.072992700729927, + "grad_norm": 4.125, + "learning_rate": 3.55072463768116e-06, + "loss": 1.9458433389663696, + "step": 50 + }, + { + "epoch": 0.07591240875912408, + "grad_norm": 2.171875, + "learning_rate": 3.6956521739130436e-06, + "loss": 1.321602702140808, + "step": 52 + }, + { + "epoch": 0.07883211678832117, + "grad_norm": 3.578125, + "learning_rate": 3.840579710144928e-06, + "loss": 2.0101318359375, + "step": 54 + }, + { + "epoch": 0.08175182481751825, + "grad_norm": 5.625, + "learning_rate": 3.9855072463768115e-06, + "loss": 2.0588250160217285, + "step": 56 + }, + { + "epoch": 0.08467153284671533, + "grad_norm": 5.3125, + "learning_rate": 4.130434782608696e-06, + "loss": 1.860298752784729, + "step": 58 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 5.9375, + "learning_rate": 4.27536231884058e-06, + "loss": 1.9684100151062012, + "step": 60 + }, + { + "epoch": 0.0905109489051095, + "grad_norm": 9.375, + "learning_rate": 4.4202898550724645e-06, + "loss": 1.980459213256836, + "step": 62 + }, + { + "epoch": 0.09343065693430656, + "grad_norm": 4.90625, + "learning_rate": 4.565217391304348e-06, + "loss": 1.8493075370788574, + "step": 64 + }, + { + "epoch": 0.09635036496350365, + "grad_norm": 2.609375, + "learning_rate": 4.710144927536232e-06, + "loss": 1.5537524223327637, + "step": 66 + }, + { + "epoch": 0.09927007299270073, + "grad_norm": 4.46875, + "learning_rate": 4.855072463768117e-06, + "loss": 1.8475682735443115, + "step": 68 + }, + { + "epoch": 0.10218978102189781, + "grad_norm": 3.734375, + "learning_rate": 5e-06, + "loss": 1.7411353588104248, + "step": 70 + }, + { + "epoch": 0.10510948905109489, + "grad_norm": 29.875, + "learning_rate": 4.999973760423467e-06, + "loss": 2.0845284461975098, + "step": 72 + }, + { + "epoch": 0.10802919708029197, + "grad_norm": 6.21875, + "learning_rate": 4.99989504230588e-06, + "loss": 1.5018064975738525, + "step": 74 + }, + { + "epoch": 0.11094890510948906, + "grad_norm": 2.21875, + "learning_rate": 4.999763847483267e-06, + "loss": 1.464540958404541, + "step": 76 + }, + { + "epoch": 0.11386861313868613, + "grad_norm": 4.53125, + "learning_rate": 4.999580179015625e-06, + "loss": 1.8232789039611816, + "step": 78 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 1.7578125, + "learning_rate": 4.999344041186848e-06, + "loss": 1.096325159072876, + "step": 80 + }, + { + "epoch": 0.11970802919708029, + "grad_norm": 3.328125, + "learning_rate": 4.999055439504633e-06, + "loss": 1.8037409782409668, + "step": 82 + }, + { + "epoch": 0.12262773722627737, + "grad_norm": 3.84375, + "learning_rate": 4.998714380700345e-06, + "loss": 1.5575973987579346, + "step": 84 + }, + { + "epoch": 0.12554744525547445, + "grad_norm": 4.1875, + "learning_rate": 4.998320872728862e-06, + "loss": 1.8613684177398682, + "step": 86 + }, + { + "epoch": 0.12846715328467154, + "grad_norm": 5.15625, + "learning_rate": 4.9978749247683895e-06, + "loss": 1.732508897781372, + "step": 88 + }, + { + "epoch": 0.13138686131386862, + "grad_norm": 2.59375, + "learning_rate": 4.99737654722025e-06, + "loss": 1.3435773849487305, + "step": 90 + }, + { + "epoch": 0.1343065693430657, + "grad_norm": 3.25, + "learning_rate": 4.996825751708635e-06, + "loss": 1.7478176355361938, + "step": 92 + }, + { + "epoch": 0.13722627737226278, + "grad_norm": 2.03125, + "learning_rate": 4.996222551080337e-06, + "loss": 1.4358994960784912, + "step": 94 + }, + { + "epoch": 0.14014598540145987, + "grad_norm": 5.4375, + "learning_rate": 4.9955669594044466e-06, + "loss": 1.870757818222046, + "step": 96 + }, + { + "epoch": 0.14306569343065692, + "grad_norm": 3.671875, + "learning_rate": 4.994858991972031e-06, + "loss": 1.6408865451812744, + "step": 98 + }, + { + "epoch": 0.145985401459854, + "grad_norm": 3.375, + "learning_rate": 4.994098665295768e-06, + "loss": 1.4728097915649414, + "step": 100 + }, + { + "epoch": 0.14890510948905109, + "grad_norm": 7.4375, + "learning_rate": 4.9932859971095705e-06, + "loss": 1.7583755254745483, + "step": 102 + }, + { + "epoch": 0.15182481751824817, + "grad_norm": 3.25, + "learning_rate": 4.992421006368166e-06, + "loss": 1.6836040019989014, + "step": 104 + }, + { + "epoch": 0.15474452554744525, + "grad_norm": 26.25, + "learning_rate": 4.991503713246659e-06, + "loss": 1.9515830278396606, + "step": 106 + }, + { + "epoch": 0.15766423357664233, + "grad_norm": 62.25, + "learning_rate": 4.990534139140055e-06, + "loss": 2.0257816314697266, + "step": 108 + }, + { + "epoch": 0.16058394160583941, + "grad_norm": 2.640625, + "learning_rate": 4.989512306662767e-06, + "loss": 1.4182727336883545, + "step": 110 + }, + { + "epoch": 0.1635036496350365, + "grad_norm": 6.6875, + "learning_rate": 4.988438239648084e-06, + "loss": 1.70530366897583, + "step": 112 + }, + { + "epoch": 0.16642335766423358, + "grad_norm": 3.5625, + "learning_rate": 4.98731196314762e-06, + "loss": 1.5088133811950684, + "step": 114 + }, + { + "epoch": 0.16934306569343066, + "grad_norm": 3.078125, + "learning_rate": 4.986133503430724e-06, + "loss": 1.6265062093734741, + "step": 116 + }, + { + "epoch": 0.17226277372262774, + "grad_norm": 6.4375, + "learning_rate": 4.98490288798387e-06, + "loss": 1.402962327003479, + "step": 118 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 4.125, + "learning_rate": 4.983620145510017e-06, + "loss": 1.8057794570922852, + "step": 120 + }, + { + "epoch": 0.1781021897810219, + "grad_norm": 6.875, + "learning_rate": 4.982285305927937e-06, + "loss": 1.9605462551116943, + "step": 122 + }, + { + "epoch": 0.181021897810219, + "grad_norm": 3.625, + "learning_rate": 4.980898400371521e-06, + "loss": 1.8519611358642578, + "step": 124 + }, + { + "epoch": 0.18394160583941604, + "grad_norm": 10.0625, + "learning_rate": 4.9794594611890465e-06, + "loss": 1.6692755222320557, + "step": 126 + }, + { + "epoch": 0.18686131386861313, + "grad_norm": 6.1875, + "learning_rate": 4.977968521942429e-06, + "loss": 1.8997008800506592, + "step": 128 + }, + { + "epoch": 0.1897810218978102, + "grad_norm": 1.8515625, + "learning_rate": 4.97642561740644e-06, + "loss": 1.8168402910232544, + "step": 130 + }, + { + "epoch": 0.1927007299270073, + "grad_norm": 16.375, + "learning_rate": 4.974830783567886e-06, + "loss": 1.4727129936218262, + "step": 132 + }, + { + "epoch": 0.19562043795620437, + "grad_norm": 7.71875, + "learning_rate": 4.973184057624781e-06, + "loss": 1.6138420104980469, + "step": 134 + }, + { + "epoch": 0.19854014598540146, + "grad_norm": 3.5, + "learning_rate": 4.971485477985474e-06, + "loss": 1.6893023252487183, + "step": 136 + }, + { + "epoch": 0.20145985401459854, + "grad_norm": 1.421875, + "learning_rate": 4.969735084267752e-06, + "loss": 1.3670828342437744, + "step": 138 + }, + { + "epoch": 0.20437956204379562, + "grad_norm": 8.4375, + "learning_rate": 4.967932917297915e-06, + "loss": 1.6938685178756714, + "step": 140 + }, + { + "epoch": 0.2072992700729927, + "grad_norm": 4.0625, + "learning_rate": 4.966079019109831e-06, + "loss": 2.2959558963775635, + "step": 142 + }, + { + "epoch": 0.21021897810218979, + "grad_norm": 3.328125, + "learning_rate": 4.964173432943946e-06, + "loss": 1.6218578815460205, + "step": 144 + }, + { + "epoch": 0.21313868613138687, + "grad_norm": 9.0625, + "learning_rate": 4.962216203246281e-06, + "loss": 2.592639446258545, + "step": 146 + }, + { + "epoch": 0.21605839416058395, + "grad_norm": 3.3125, + "learning_rate": 4.960207375667396e-06, + "loss": 1.5585392713546753, + "step": 148 + }, + { + "epoch": 0.21897810218978103, + "grad_norm": 3.96875, + "learning_rate": 4.958146997061319e-06, + "loss": 1.6422696113586426, + "step": 150 + }, + { + "epoch": 0.22189781021897811, + "grad_norm": 5.59375, + "learning_rate": 4.956035115484465e-06, + "loss": 1.7883186340332031, + "step": 152 + }, + { + "epoch": 0.22481751824817517, + "grad_norm": 2.140625, + "learning_rate": 4.953871780194501e-06, + "loss": 1.657930612564087, + "step": 154 + }, + { + "epoch": 0.22773722627737225, + "grad_norm": 24.125, + "learning_rate": 4.951657041649206e-06, + "loss": 1.7987116575241089, + "step": 156 + }, + { + "epoch": 0.23065693430656933, + "grad_norm": 12.0, + "learning_rate": 4.9493909515052944e-06, + "loss": 2.016146659851074, + "step": 158 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 7.90625, + "learning_rate": 4.947073562617206e-06, + "loss": 1.3612116575241089, + "step": 160 + }, + { + "epoch": 0.2364963503649635, + "grad_norm": 3.8125, + "learning_rate": 4.944704929035877e-06, + "loss": 1.7367652654647827, + "step": 162 + }, + { + "epoch": 0.23941605839416058, + "grad_norm": 2.875, + "learning_rate": 4.942285106007477e-06, + "loss": 1.3203725814819336, + "step": 164 + }, + { + "epoch": 0.24233576642335766, + "grad_norm": 11.25, + "learning_rate": 4.9398141499721246e-06, + "loss": 1.7288057804107666, + "step": 166 + }, + { + "epoch": 0.24525547445255474, + "grad_norm": 1.5625, + "learning_rate": 4.937292118562566e-06, + "loss": 1.383696436882019, + "step": 168 + }, + { + "epoch": 0.24817518248175183, + "grad_norm": 12.5625, + "learning_rate": 4.934719070602833e-06, + "loss": 1.6433072090148926, + "step": 170 + }, + { + "epoch": 0.2510948905109489, + "grad_norm": 3.109375, + "learning_rate": 4.932095066106872e-06, + "loss": 1.4025721549987793, + "step": 172 + }, + { + "epoch": 0.25401459854014596, + "grad_norm": 4.1875, + "learning_rate": 4.929420166277141e-06, + "loss": 1.6988599300384521, + "step": 174 + }, + { + "epoch": 0.2569343065693431, + "grad_norm": 3.65625, + "learning_rate": 4.926694433503186e-06, + "loss": 1.6042873859405518, + "step": 176 + }, + { + "epoch": 0.25985401459854013, + "grad_norm": 1.6484375, + "learning_rate": 4.923917931360185e-06, + "loss": 1.2862474918365479, + "step": 178 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 6.65625, + "learning_rate": 4.9210907246074615e-06, + "loss": 1.7310783863067627, + "step": 180 + }, + { + "epoch": 0.2656934306569343, + "grad_norm": 4.5625, + "learning_rate": 4.9182128791869796e-06, + "loss": 1.5482988357543945, + "step": 182 + }, + { + "epoch": 0.2686131386861314, + "grad_norm": 1.5078125, + "learning_rate": 4.9152844622218e-06, + "loss": 1.2439241409301758, + "step": 184 + }, + { + "epoch": 0.27153284671532846, + "grad_norm": 4.3125, + "learning_rate": 4.91230554201452e-06, + "loss": 1.5766255855560303, + "step": 186 + }, + { + "epoch": 0.27445255474452557, + "grad_norm": 3.90625, + "learning_rate": 4.9092761880456764e-06, + "loss": 1.311848759651184, + "step": 188 + }, + { + "epoch": 0.2773722627737226, + "grad_norm": 39.75, + "learning_rate": 4.906196470972128e-06, + "loss": 1.5088813304901123, + "step": 190 + }, + { + "epoch": 0.28029197080291973, + "grad_norm": 6.40625, + "learning_rate": 4.903066462625405e-06, + "loss": 1.6081913709640503, + "step": 192 + }, + { + "epoch": 0.2832116788321168, + "grad_norm": 6.125, + "learning_rate": 4.899886236010036e-06, + "loss": 1.7471773624420166, + "step": 194 + }, + { + "epoch": 0.28613138686131384, + "grad_norm": 4.09375, + "learning_rate": 4.896655865301842e-06, + "loss": 1.6127898693084717, + "step": 196 + }, + { + "epoch": 0.28905109489051095, + "grad_norm": 3.1875, + "learning_rate": 4.893375425846209e-06, + "loss": 1.6075236797332764, + "step": 198 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 3.53125, + "learning_rate": 4.890044994156331e-06, + "loss": 1.712640643119812, + "step": 200 + }, + { + "epoch": 0.2948905109489051, + "grad_norm": 3.84375, + "learning_rate": 4.886664647911422e-06, + "loss": 1.5669183731079102, + "step": 202 + }, + { + "epoch": 0.29781021897810217, + "grad_norm": 5.6875, + "learning_rate": 4.883234465954909e-06, + "loss": 1.7576971054077148, + "step": 204 + }, + { + "epoch": 0.3007299270072993, + "grad_norm": 2.515625, + "learning_rate": 4.879754528292588e-06, + "loss": 1.5543663501739502, + "step": 206 + }, + { + "epoch": 0.30364963503649633, + "grad_norm": 2.921875, + "learning_rate": 4.876224916090762e-06, + "loss": 1.9160549640655518, + "step": 208 + }, + { + "epoch": 0.30656934306569344, + "grad_norm": 4.34375, + "learning_rate": 4.872645711674348e-06, + "loss": 1.646159291267395, + "step": 210 + }, + { + "epoch": 0.3094890510948905, + "grad_norm": 1.625, + "learning_rate": 4.8690169985249516e-06, + "loss": 1.1048507690429688, + "step": 212 + }, + { + "epoch": 0.3124087591240876, + "grad_norm": 1.5625, + "learning_rate": 4.865338861278925e-06, + "loss": 1.0736052989959717, + "step": 214 + }, + { + "epoch": 0.31532846715328466, + "grad_norm": 3.59375, + "learning_rate": 4.8616113857253925e-06, + "loss": 1.2035229206085205, + "step": 216 + }, + { + "epoch": 0.3182481751824818, + "grad_norm": 23.625, + "learning_rate": 4.857834658804247e-06, + "loss": 1.137906789779663, + "step": 218 + }, + { + "epoch": 0.32116788321167883, + "grad_norm": 4.5625, + "learning_rate": 4.8540087686041234e-06, + "loss": 1.7008376121520996, + "step": 220 + }, + { + "epoch": 0.32408759124087594, + "grad_norm": 8.75, + "learning_rate": 4.850133804360346e-06, + "loss": 1.6337850093841553, + "step": 222 + }, + { + "epoch": 0.327007299270073, + "grad_norm": 3.984375, + "learning_rate": 4.8462098564528455e-06, + "loss": 1.1808865070343018, + "step": 224 + }, + { + "epoch": 0.32992700729927005, + "grad_norm": 3.59375, + "learning_rate": 4.842237016404048e-06, + "loss": 1.5622849464416504, + "step": 226 + }, + { + "epoch": 0.33284671532846716, + "grad_norm": 1.1875, + "learning_rate": 4.838215376876744e-06, + "loss": 1.1768817901611328, + "step": 228 + }, + { + "epoch": 0.3357664233576642, + "grad_norm": 6.0, + "learning_rate": 4.834145031671931e-06, + "loss": 1.3726277351379395, + "step": 230 + }, + { + "epoch": 0.3386861313868613, + "grad_norm": 28.375, + "learning_rate": 4.830026075726615e-06, + "loss": 1.1469438076019287, + "step": 232 + }, + { + "epoch": 0.3416058394160584, + "grad_norm": 3.421875, + "learning_rate": 4.8258586051116045e-06, + "loss": 1.5012977123260498, + "step": 234 + }, + { + "epoch": 0.3445255474452555, + "grad_norm": 12.9375, + "learning_rate": 4.821642717029269e-06, + "loss": 1.6817822456359863, + "step": 236 + }, + { + "epoch": 0.34744525547445254, + "grad_norm": 5.0625, + "learning_rate": 4.8173785098112675e-06, + "loss": 1.525681495666504, + "step": 238 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 15.4375, + "learning_rate": 4.81306608291626e-06, + "loss": 2.0758631229400635, + "step": 240 + }, + { + "epoch": 0.3532846715328467, + "grad_norm": 3.25, + "learning_rate": 4.808705536927586e-06, + "loss": 1.4310352802276611, + "step": 242 + }, + { + "epoch": 0.3562043795620438, + "grad_norm": 3.28125, + "learning_rate": 4.804296973550915e-06, + "loss": 1.6908133029937744, + "step": 244 + }, + { + "epoch": 0.35912408759124087, + "grad_norm": 3.15625, + "learning_rate": 4.799840495611879e-06, + "loss": 1.2480230331420898, + "step": 246 + }, + { + "epoch": 0.362043795620438, + "grad_norm": 2.75, + "learning_rate": 4.795336207053674e-06, + "loss": 1.5943894386291504, + "step": 248 + }, + { + "epoch": 0.36496350364963503, + "grad_norm": 3.953125, + "learning_rate": 4.790784212934631e-06, + "loss": 1.1932544708251953, + "step": 250 + }, + { + "epoch": 0.3678832116788321, + "grad_norm": 5.53125, + "learning_rate": 4.786184619425773e-06, + "loss": 1.4538475275039673, + "step": 252 + }, + { + "epoch": 0.3708029197080292, + "grad_norm": 6.1875, + "learning_rate": 4.781537533808331e-06, + "loss": 1.7138783931732178, + "step": 254 + }, + { + "epoch": 0.37372262773722625, + "grad_norm": 1.609375, + "learning_rate": 4.7768430644712435e-06, + "loss": 1.37872314453125, + "step": 256 + }, + { + "epoch": 0.37664233576642336, + "grad_norm": 6.25, + "learning_rate": 4.772101320908636e-06, + "loss": 1.4937684535980225, + "step": 258 + }, + { + "epoch": 0.3795620437956204, + "grad_norm": 8.8125, + "learning_rate": 4.767312413717256e-06, + "loss": 1.4460338354110718, + "step": 260 + }, + { + "epoch": 0.38248175182481753, + "grad_norm": 4.28125, + "learning_rate": 4.7624764545939015e-06, + "loss": 1.4206737279891968, + "step": 262 + }, + { + "epoch": 0.3854014598540146, + "grad_norm": 2.671875, + "learning_rate": 4.757593556332811e-06, + "loss": 1.3555597066879272, + "step": 264 + }, + { + "epoch": 0.3883211678832117, + "grad_norm": 3.1875, + "learning_rate": 4.752663832823038e-06, + "loss": 1.6055470705032349, + "step": 266 + }, + { + "epoch": 0.39124087591240875, + "grad_norm": 4.09375, + "learning_rate": 4.747687399045787e-06, + "loss": 1.3127577304840088, + "step": 268 + }, + { + "epoch": 0.39416058394160586, + "grad_norm": 5.40625, + "learning_rate": 4.7426643710717386e-06, + "loss": 1.6612601280212402, + "step": 270 + }, + { + "epoch": 0.3970802919708029, + "grad_norm": 5.34375, + "learning_rate": 4.737594866058339e-06, + "loss": 1.2799599170684814, + "step": 272 + }, + { + "epoch": 0.4, + "grad_norm": 6.71875, + "learning_rate": 4.7324790022470675e-06, + "loss": 1.9163275957107544, + "step": 274 + }, + { + "epoch": 0.4029197080291971, + "grad_norm": 2.328125, + "learning_rate": 4.727316898960681e-06, + "loss": 1.4439561367034912, + "step": 276 + }, + { + "epoch": 0.4058394160583942, + "grad_norm": 10.6875, + "learning_rate": 4.722108676600427e-06, + "loss": 1.2920876741409302, + "step": 278 + }, + { + "epoch": 0.40875912408759124, + "grad_norm": 3.671875, + "learning_rate": 4.7168544566432365e-06, + "loss": 1.691207766532898, + "step": 280 + }, + { + "epoch": 0.4116788321167883, + "grad_norm": 3.21875, + "learning_rate": 4.711554361638896e-06, + "loss": 1.527019739151001, + "step": 282 + }, + { + "epoch": 0.4145985401459854, + "grad_norm": 3.1875, + "learning_rate": 4.70620851520718e-06, + "loss": 1.4309567213058472, + "step": 284 + }, + { + "epoch": 0.41751824817518246, + "grad_norm": 2.390625, + "learning_rate": 4.7008170420349746e-06, + "loss": 1.2672343254089355, + "step": 286 + }, + { + "epoch": 0.42043795620437957, + "grad_norm": 1.765625, + "learning_rate": 4.695380067873368e-06, + "loss": 1.3927721977233887, + "step": 288 + }, + { + "epoch": 0.4233576642335766, + "grad_norm": 2.75, + "learning_rate": 4.689897719534715e-06, + "loss": 1.5347919464111328, + "step": 290 + }, + { + "epoch": 0.42627737226277373, + "grad_norm": 4.5625, + "learning_rate": 4.68437012488968e-06, + "loss": 1.2839910984039307, + "step": 292 + }, + { + "epoch": 0.4291970802919708, + "grad_norm": 48.25, + "learning_rate": 4.678797412864258e-06, + "loss": 1.3073639869689941, + "step": 294 + }, + { + "epoch": 0.4321167883211679, + "grad_norm": 4.1875, + "learning_rate": 4.673179713436762e-06, + "loss": 1.5608128309249878, + "step": 296 + }, + { + "epoch": 0.43503649635036495, + "grad_norm": 2.875, + "learning_rate": 4.667517157634797e-06, + "loss": 1.6924610137939453, + "step": 298 + }, + { + "epoch": 0.43795620437956206, + "grad_norm": 3.515625, + "learning_rate": 4.6618098775322e-06, + "loss": 1.218139886856079, + "step": 300 + }, + { + "epoch": 0.4408759124087591, + "grad_norm": 5.34375, + "learning_rate": 4.656058006245959e-06, + "loss": 1.4968738555908203, + "step": 302 + }, + { + "epoch": 0.44379562043795623, + "grad_norm": 6.59375, + "learning_rate": 4.650261677933111e-06, + "loss": 1.522092580795288, + "step": 304 + }, + { + "epoch": 0.4467153284671533, + "grad_norm": 3.109375, + "learning_rate": 4.644421027787614e-06, + "loss": 1.15757155418396, + "step": 306 + }, + { + "epoch": 0.44963503649635034, + "grad_norm": 2.5, + "learning_rate": 4.638536192037186e-06, + "loss": 1.0606379508972168, + "step": 308 + }, + { + "epoch": 0.45255474452554745, + "grad_norm": 10.375, + "learning_rate": 4.63260730794014e-06, + "loss": 1.674492597579956, + "step": 310 + }, + { + "epoch": 0.4554744525547445, + "grad_norm": 3.421875, + "learning_rate": 4.62663451378217e-06, + "loss": 1.4489834308624268, + "step": 312 + }, + { + "epoch": 0.4583941605839416, + "grad_norm": 1.6640625, + "learning_rate": 4.620617948873133e-06, + "loss": 1.4036529064178467, + "step": 314 + }, + { + "epoch": 0.46131386861313867, + "grad_norm": 4.21875, + "learning_rate": 4.6145577535438004e-06, + "loss": 1.482384204864502, + "step": 316 + }, + { + "epoch": 0.4642335766423358, + "grad_norm": 2.8125, + "learning_rate": 4.608454069142578e-06, + "loss": 1.4590518474578857, + "step": 318 + }, + { + "epoch": 0.46715328467153283, + "grad_norm": 4.53125, + "learning_rate": 4.602307038032216e-06, + "loss": 1.7169837951660156, + "step": 320 + }, + { + "epoch": 0.47007299270072994, + "grad_norm": 4.75, + "learning_rate": 4.596116803586487e-06, + "loss": 1.5060232877731323, + "step": 322 + }, + { + "epoch": 0.472992700729927, + "grad_norm": 2.828125, + "learning_rate": 4.5898835101868415e-06, + "loss": 1.4886112213134766, + "step": 324 + }, + { + "epoch": 0.4759124087591241, + "grad_norm": 1.7265625, + "learning_rate": 4.583607303219037e-06, + "loss": 1.4076815843582153, + "step": 326 + }, + { + "epoch": 0.47883211678832116, + "grad_norm": 10.4375, + "learning_rate": 4.577288329069753e-06, + "loss": 1.5618150234222412, + "step": 328 + }, + { + "epoch": 0.48175182481751827, + "grad_norm": 4.75, + "learning_rate": 4.570926735123171e-06, + "loss": 1.274332046508789, + "step": 330 + }, + { + "epoch": 0.4846715328467153, + "grad_norm": 4.4375, + "learning_rate": 4.564522669757543e-06, + "loss": 1.4747687578201294, + "step": 332 + }, + { + "epoch": 0.48759124087591244, + "grad_norm": 6.40625, + "learning_rate": 4.558076282341723e-06, + "loss": 1.653844952583313, + "step": 334 + }, + { + "epoch": 0.4905109489051095, + "grad_norm": 39.5, + "learning_rate": 4.551587723231692e-06, + "loss": 1.0735116004943848, + "step": 336 + }, + { + "epoch": 0.49343065693430654, + "grad_norm": 36.0, + "learning_rate": 4.545057143767042e-06, + "loss": 1.6714699268341064, + "step": 338 + }, + { + "epoch": 0.49635036496350365, + "grad_norm": 4.15625, + "learning_rate": 4.538484696267453e-06, + "loss": 1.4629170894622803, + "step": 340 + }, + { + "epoch": 0.4992700729927007, + "grad_norm": 10.3125, + "learning_rate": 4.5318705340291394e-06, + "loss": 1.5702762603759766, + "step": 342 + }, + { + "epoch": 0.5021897810218978, + "grad_norm": 4.96875, + "learning_rate": 4.525214811321269e-06, + "loss": 1.5001425743103027, + "step": 344 + }, + { + "epoch": 0.5051094890510949, + "grad_norm": 5.0625, + "learning_rate": 4.518517683382373e-06, + "loss": 1.4789342880249023, + "step": 346 + }, + { + "epoch": 0.5080291970802919, + "grad_norm": 4.15625, + "learning_rate": 4.511779306416716e-06, + "loss": 1.4476077556610107, + "step": 348 + }, + { + "epoch": 0.5109489051094891, + "grad_norm": 1.703125, + "learning_rate": 4.504999837590665e-06, + "loss": 1.1996196508407593, + "step": 350 + }, + { + "epoch": 0.5138686131386861, + "grad_norm": 4.1875, + "learning_rate": 4.49817943502901e-06, + "loss": 1.532009482383728, + "step": 352 + }, + { + "epoch": 0.5167883211678832, + "grad_norm": 1.65625, + "learning_rate": 4.4913182578112815e-06, + "loss": 1.2889015674591064, + "step": 354 + }, + { + "epoch": 0.5197080291970803, + "grad_norm": 1.640625, + "learning_rate": 4.484416465968049e-06, + "loss": 1.3533192873001099, + "step": 356 + }, + { + "epoch": 0.5226277372262774, + "grad_norm": 4.3125, + "learning_rate": 4.477474220477172e-06, + "loss": 1.4686871767044067, + "step": 358 + }, + { + "epoch": 0.5255474452554745, + "grad_norm": 3.78125, + "learning_rate": 4.470491683260056e-06, + "loss": 1.4659610986709595, + "step": 360 + }, + { + "epoch": 0.5284671532846715, + "grad_norm": 2.46875, + "learning_rate": 4.463469017177876e-06, + "loss": 1.487034797668457, + "step": 362 + }, + { + "epoch": 0.5313868613138686, + "grad_norm": 3.3125, + "learning_rate": 4.456406386027772e-06, + "loss": 1.1844420433044434, + "step": 364 + }, + { + "epoch": 0.5343065693430656, + "grad_norm": 7.34375, + "learning_rate": 4.4493039545390345e-06, + "loss": 1.5557405948638916, + "step": 366 + }, + { + "epoch": 0.5372262773722628, + "grad_norm": 2.984375, + "learning_rate": 4.442161888369258e-06, + "loss": 1.3480842113494873, + "step": 368 + }, + { + "epoch": 0.5401459854014599, + "grad_norm": 2.90625, + "learning_rate": 4.43498035410048e-06, + "loss": 1.2928515672683716, + "step": 370 + }, + { + "epoch": 0.5430656934306569, + "grad_norm": 7.3125, + "learning_rate": 4.427759519235294e-06, + "loss": 1.7453609704971313, + "step": 372 + }, + { + "epoch": 0.545985401459854, + "grad_norm": 2.640625, + "learning_rate": 4.420499552192944e-06, + "loss": 1.4482967853546143, + "step": 374 + }, + { + "epoch": 0.5489051094890511, + "grad_norm": 2.0, + "learning_rate": 4.413200622305395e-06, + "loss": 1.6135839223861694, + "step": 376 + }, + { + "epoch": 0.5518248175182482, + "grad_norm": 13.9375, + "learning_rate": 4.405862899813384e-06, + "loss": 1.570212483406067, + "step": 378 + }, + { + "epoch": 0.5547445255474452, + "grad_norm": 1.3671875, + "learning_rate": 4.398486555862451e-06, + "loss": 1.298504114151001, + "step": 380 + }, + { + "epoch": 0.5576642335766423, + "grad_norm": 7.8125, + "learning_rate": 4.391071762498941e-06, + "loss": 1.4520879983901978, + "step": 382 + }, + { + "epoch": 0.5605839416058395, + "grad_norm": 14.8125, + "learning_rate": 4.383618692666002e-06, + "loss": 1.3408211469650269, + "step": 384 + }, + { + "epoch": 0.5635036496350365, + "grad_norm": 3.375, + "learning_rate": 4.376127520199541e-06, + "loss": 1.4031929969787598, + "step": 386 + }, + { + "epoch": 0.5664233576642336, + "grad_norm": 4.03125, + "learning_rate": 4.3685984198241735e-06, + "loss": 1.5412940979003906, + "step": 388 + }, + { + "epoch": 0.5693430656934306, + "grad_norm": 6.78125, + "learning_rate": 4.361031567149149e-06, + "loss": 1.3730320930480957, + "step": 390 + }, + { + "epoch": 0.5722627737226277, + "grad_norm": 7.28125, + "learning_rate": 4.353427138664254e-06, + "loss": 1.3442788124084473, + "step": 392 + }, + { + "epoch": 0.5751824817518248, + "grad_norm": 6.90625, + "learning_rate": 4.345785311735698e-06, + "loss": 1.4140475988388062, + "step": 394 + }, + { + "epoch": 0.5781021897810219, + "grad_norm": 6.25, + "learning_rate": 4.3381062646019676e-06, + "loss": 1.5376839637756348, + "step": 396 + }, + { + "epoch": 0.581021897810219, + "grad_norm": 4.25, + "learning_rate": 4.330390176369685e-06, + "loss": 1.5938429832458496, + "step": 398 + }, + { + "epoch": 0.583941605839416, + "grad_norm": 1.546875, + "learning_rate": 4.322637227009414e-06, + "loss": 1.1486091613769531, + "step": 400 + }, + { + "epoch": 0.5868613138686132, + "grad_norm": 3.578125, + "learning_rate": 4.314847597351475e-06, + "loss": 1.452984094619751, + "step": 402 + }, + { + "epoch": 0.5897810218978102, + "grad_norm": 3.953125, + "learning_rate": 4.3070214690817195e-06, + "loss": 1.4647376537322998, + "step": 404 + }, + { + "epoch": 0.5927007299270073, + "grad_norm": 2.203125, + "learning_rate": 4.299159024737295e-06, + "loss": 1.2110595703125, + "step": 406 + }, + { + "epoch": 0.5956204379562043, + "grad_norm": 4.1875, + "learning_rate": 4.291260447702389e-06, + "loss": 1.3485263586044312, + "step": 408 + }, + { + "epoch": 0.5985401459854015, + "grad_norm": 5.25, + "learning_rate": 4.283325922203949e-06, + "loss": 1.3334099054336548, + "step": 410 + }, + { + "epoch": 0.6014598540145986, + "grad_norm": 2.0625, + "learning_rate": 4.2753556333073875e-06, + "loss": 1.2992541790008545, + "step": 412 + }, + { + "epoch": 0.6043795620437956, + "grad_norm": 8.3125, + "learning_rate": 4.267349766912266e-06, + "loss": 1.3331689834594727, + "step": 414 + }, + { + "epoch": 0.6072992700729927, + "grad_norm": 3.71875, + "learning_rate": 4.259308509747955e-06, + "loss": 1.4391039609909058, + "step": 416 + }, + { + "epoch": 0.6102189781021898, + "grad_norm": 9.6875, + "learning_rate": 4.251232049369287e-06, + "loss": 1.145450472831726, + "step": 418 + }, + { + "epoch": 0.6131386861313869, + "grad_norm": 10.875, + "learning_rate": 4.243120574152169e-06, + "loss": 1.5916063785552979, + "step": 420 + }, + { + "epoch": 0.6160583941605839, + "grad_norm": 4.75, + "learning_rate": 4.234974273289204e-06, + "loss": 1.619133710861206, + "step": 422 + }, + { + "epoch": 0.618978102189781, + "grad_norm": 4.375, + "learning_rate": 4.226793336785265e-06, + "loss": 1.4133093357086182, + "step": 424 + }, + { + "epoch": 0.621897810218978, + "grad_norm": 6.03125, + "learning_rate": 4.218577955453074e-06, + "loss": 1.253399133682251, + "step": 426 + }, + { + "epoch": 0.6248175182481752, + "grad_norm": 4.6875, + "learning_rate": 4.210328320908744e-06, + "loss": 1.4635814428329468, + "step": 428 + }, + { + "epoch": 0.6277372262773723, + "grad_norm": 2.875, + "learning_rate": 4.20204462556731e-06, + "loss": 1.3652441501617432, + "step": 430 + }, + { + "epoch": 0.6306569343065693, + "grad_norm": 8.9375, + "learning_rate": 4.193727062638247e-06, + "loss": 1.5560953617095947, + "step": 432 + }, + { + "epoch": 0.6335766423357664, + "grad_norm": 3.53125, + "learning_rate": 4.18537582612096e-06, + "loss": 1.4227533340454102, + "step": 434 + }, + { + "epoch": 0.6364963503649635, + "grad_norm": 3.265625, + "learning_rate": 4.176991110800256e-06, + "loss": 1.2683900594711304, + "step": 436 + }, + { + "epoch": 0.6394160583941606, + "grad_norm": 14.1875, + "learning_rate": 4.168573112241805e-06, + "loss": 1.2102452516555786, + "step": 438 + }, + { + "epoch": 0.6423357664233577, + "grad_norm": 4.84375, + "learning_rate": 4.16012202678758e-06, + "loss": 1.2587625980377197, + "step": 440 + }, + { + "epoch": 0.6452554744525547, + "grad_norm": 5.46875, + "learning_rate": 4.1516380515512705e-06, + "loss": 1.410897970199585, + "step": 442 + }, + { + "epoch": 0.6481751824817519, + "grad_norm": 1.78125, + "learning_rate": 4.143121384413695e-06, + "loss": 1.4373693466186523, + "step": 444 + }, + { + "epoch": 0.6510948905109489, + "grad_norm": 2.78125, + "learning_rate": 4.134572224018176e-06, + "loss": 1.4430195093154907, + "step": 446 + }, + { + "epoch": 0.654014598540146, + "grad_norm": 7.90625, + "learning_rate": 4.125990769765911e-06, + "loss": 1.4238855838775635, + "step": 448 + }, + { + "epoch": 0.656934306569343, + "grad_norm": 2.25, + "learning_rate": 4.117377221811324e-06, + "loss": 1.4734668731689453, + "step": 450 + }, + { + "epoch": 0.6598540145985401, + "grad_norm": 2.734375, + "learning_rate": 4.108731781057393e-06, + "loss": 1.5210154056549072, + "step": 452 + }, + { + "epoch": 0.6627737226277373, + "grad_norm": 1.25, + "learning_rate": 4.100054649150967e-06, + "loss": 1.237725019454956, + "step": 454 + }, + { + "epoch": 0.6656934306569343, + "grad_norm": 3.953125, + "learning_rate": 4.091346028478059e-06, + "loss": 1.4640438556671143, + "step": 456 + }, + { + "epoch": 0.6686131386861314, + "grad_norm": 9.0, + "learning_rate": 4.0826061221591326e-06, + "loss": 1.105014681816101, + "step": 458 + }, + { + "epoch": 0.6715328467153284, + "grad_norm": 42.25, + "learning_rate": 4.073835134044356e-06, + "loss": 1.4338090419769287, + "step": 460 + }, + { + "epoch": 0.6744525547445256, + "grad_norm": 5.90625, + "learning_rate": 4.065033268708854e-06, + "loss": 1.3917622566223145, + "step": 462 + }, + { + "epoch": 0.6773722627737226, + "grad_norm": 3.359375, + "learning_rate": 4.056200731447929e-06, + "loss": 1.0591514110565186, + "step": 464 + }, + { + "epoch": 0.6802919708029197, + "grad_norm": 4.625, + "learning_rate": 4.0473377282722845e-06, + "loss": 1.4084625244140625, + "step": 466 + }, + { + "epoch": 0.6832116788321168, + "grad_norm": 3.734375, + "learning_rate": 4.038444465903208e-06, + "loss": 1.4596691131591797, + "step": 468 + }, + { + "epoch": 0.6861313868613139, + "grad_norm": 11.125, + "learning_rate": 4.029521151767757e-06, + "loss": 1.2422056198120117, + "step": 470 + }, + { + "epoch": 0.689051094890511, + "grad_norm": 4.4375, + "learning_rate": 4.0205679939939164e-06, + "loss": 1.33591628074646, + "step": 472 + }, + { + "epoch": 0.691970802919708, + "grad_norm": 2.21875, + "learning_rate": 4.011585201405747e-06, + "loss": 1.2504942417144775, + "step": 474 + }, + { + "epoch": 0.6948905109489051, + "grad_norm": 3.6875, + "learning_rate": 4.002572983518515e-06, + "loss": 1.2631410360336304, + "step": 476 + }, + { + "epoch": 0.6978102189781021, + "grad_norm": 5.8125, + "learning_rate": 3.993531550533804e-06, + "loss": 1.3914625644683838, + "step": 478 + }, + { + "epoch": 0.7007299270072993, + "grad_norm": 20.0, + "learning_rate": 3.98446111333461e-06, + "loss": 1.288975715637207, + "step": 480 + }, + { + "epoch": 0.7036496350364964, + "grad_norm": 3.234375, + "learning_rate": 3.9753618834804295e-06, + "loss": 1.4152731895446777, + "step": 482 + }, + { + "epoch": 0.7065693430656934, + "grad_norm": 5.71875, + "learning_rate": 3.966234073202316e-06, + "loss": 1.316530466079712, + "step": 484 + }, + { + "epoch": 0.7094890510948905, + "grad_norm": 56.5, + "learning_rate": 3.957077895397941e-06, + "loss": 1.3749709129333496, + "step": 486 + }, + { + "epoch": 0.7124087591240876, + "grad_norm": 1.734375, + "learning_rate": 3.947893563626615e-06, + "loss": 1.2120707035064697, + "step": 488 + }, + { + "epoch": 0.7153284671532847, + "grad_norm": 3.546875, + "learning_rate": 3.93868129210432e-06, + "loss": 1.4016718864440918, + "step": 490 + }, + { + "epoch": 0.7182481751824817, + "grad_norm": 8.8125, + "learning_rate": 3.929441295698702e-06, + "loss": 1.154693841934204, + "step": 492 + }, + { + "epoch": 0.7211678832116788, + "grad_norm": 3.640625, + "learning_rate": 3.920173789924065e-06, + "loss": 1.334530234336853, + "step": 494 + }, + { + "epoch": 0.724087591240876, + "grad_norm": 1.921875, + "learning_rate": 3.910878990936346e-06, + "loss": 1.3103371858596802, + "step": 496 + }, + { + "epoch": 0.727007299270073, + "grad_norm": 2.84375, + "learning_rate": 3.901557115528069e-06, + "loss": 1.244321584701538, + "step": 498 + }, + { + "epoch": 0.7299270072992701, + "grad_norm": 4.40625, + "learning_rate": 3.892208381123289e-06, + "loss": 1.4268873929977417, + "step": 500 + }, + { + "epoch": 0.7328467153284671, + "grad_norm": 1.4765625, + "learning_rate": 3.8828330057725225e-06, + "loss": 1.3552806377410889, + "step": 502 + }, + { + "epoch": 0.7357664233576642, + "grad_norm": 3.65625, + "learning_rate": 3.873431208147664e-06, + "loss": 1.6077991724014282, + "step": 504 + }, + { + "epoch": 0.7386861313868613, + "grad_norm": 2.21875, + "learning_rate": 3.864003207536879e-06, + "loss": 1.2244906425476074, + "step": 506 + }, + { + "epoch": 0.7416058394160584, + "grad_norm": 2.265625, + "learning_rate": 3.854549223839497e-06, + "loss": 1.0374276638031006, + "step": 508 + }, + { + "epoch": 0.7445255474452555, + "grad_norm": 6.96875, + "learning_rate": 3.845069477560876e-06, + "loss": 1.547581434249878, + "step": 510 + }, + { + "epoch": 0.7474452554744525, + "grad_norm": 2.203125, + "learning_rate": 3.835564189807263e-06, + "loss": 1.225568175315857, + "step": 512 + }, + { + "epoch": 0.7503649635036497, + "grad_norm": 4.09375, + "learning_rate": 3.826033582280635e-06, + "loss": 1.2825735807418823, + "step": 514 + }, + { + "epoch": 0.7532846715328467, + "grad_norm": 2.96875, + "learning_rate": 3.816477877273533e-06, + "loss": 1.430619716644287, + "step": 516 + }, + { + "epoch": 0.7562043795620438, + "grad_norm": 10.9375, + "learning_rate": 3.8068972976638703e-06, + "loss": 1.489488124847412, + "step": 518 + }, + { + "epoch": 0.7591240875912408, + "grad_norm": 4.3125, + "learning_rate": 3.797292066909734e-06, + "loss": 0.8555082082748413, + "step": 520 + }, + { + "epoch": 0.762043795620438, + "grad_norm": 3.703125, + "learning_rate": 3.787662409044184e-06, + "loss": 1.3753139972686768, + "step": 522 + }, + { + "epoch": 0.7649635036496351, + "grad_norm": 8.0, + "learning_rate": 3.7780085486700126e-06, + "loss": 1.6844412088394165, + "step": 524 + }, + { + "epoch": 0.7678832116788321, + "grad_norm": 5.25, + "learning_rate": 3.768330710954517e-06, + "loss": 1.592594027519226, + "step": 526 + }, + { + "epoch": 0.7708029197080292, + "grad_norm": 1.5, + "learning_rate": 3.7586291216242433e-06, + "loss": 1.2550559043884277, + "step": 528 + }, + { + "epoch": 0.7737226277372263, + "grad_norm": 3.953125, + "learning_rate": 3.748904006959719e-06, + "loss": 1.1512435674667358, + "step": 530 + }, + { + "epoch": 0.7766423357664234, + "grad_norm": 10.375, + "learning_rate": 3.739155593790182e-06, + "loss": 1.5256032943725586, + "step": 532 + }, + { + "epoch": 0.7795620437956204, + "grad_norm": 10.75, + "learning_rate": 3.729384109488282e-06, + "loss": 1.6810424327850342, + "step": 534 + }, + { + "epoch": 0.7824817518248175, + "grad_norm": 3.734375, + "learning_rate": 3.719589781964787e-06, + "loss": 1.4392688274383545, + "step": 536 + }, + { + "epoch": 0.7854014598540145, + "grad_norm": 4.125, + "learning_rate": 3.7097728396632555e-06, + "loss": 1.4172781705856323, + "step": 538 + }, + { + "epoch": 0.7883211678832117, + "grad_norm": 4.125, + "learning_rate": 3.6999335115547185e-06, + "loss": 1.401853322982788, + "step": 540 + }, + { + "epoch": 0.7912408759124088, + "grad_norm": 6.375, + "learning_rate": 3.690072027132335e-06, + "loss": 1.534106731414795, + "step": 542 + }, + { + "epoch": 0.7941605839416058, + "grad_norm": 5.0, + "learning_rate": 3.680188616406037e-06, + "loss": 1.629064679145813, + "step": 544 + }, + { + "epoch": 0.7970802919708029, + "grad_norm": 3.5625, + "learning_rate": 3.6702835098971706e-06, + "loss": 1.5794017314910889, + "step": 546 + }, + { + "epoch": 0.8, + "grad_norm": 7.90625, + "learning_rate": 3.6603569386331122e-06, + "loss": 1.556319236755371, + "step": 548 + }, + { + "epoch": 0.8029197080291971, + "grad_norm": 5.125, + "learning_rate": 3.6504091341418853e-06, + "loss": 1.5984359979629517, + "step": 550 + }, + { + "epoch": 0.8058394160583942, + "grad_norm": 4.5, + "learning_rate": 3.640440328446759e-06, + "loss": 1.5283421277999878, + "step": 552 + }, + { + "epoch": 0.8087591240875912, + "grad_norm": 6.75, + "learning_rate": 3.6304507540608357e-06, + "loss": 1.383811116218567, + "step": 554 + }, + { + "epoch": 0.8116788321167884, + "grad_norm": 3.640625, + "learning_rate": 3.620440643981629e-06, + "loss": 1.3146003484725952, + "step": 556 + }, + { + "epoch": 0.8145985401459854, + "grad_norm": 4.125, + "learning_rate": 3.6104102316856255e-06, + "loss": 1.4131672382354736, + "step": 558 + }, + { + "epoch": 0.8175182481751825, + "grad_norm": 13.25, + "learning_rate": 3.600359751122845e-06, + "loss": 1.549619197845459, + "step": 560 + }, + { + "epoch": 0.8204379562043795, + "grad_norm": 2.796875, + "learning_rate": 3.590289436711379e-06, + "loss": 1.5269279479980469, + "step": 562 + }, + { + "epoch": 0.8233576642335766, + "grad_norm": 3.046875, + "learning_rate": 3.5801995233319265e-06, + "loss": 1.3862372636795044, + "step": 564 + }, + { + "epoch": 0.8262773722627738, + "grad_norm": 2.484375, + "learning_rate": 3.5700902463223137e-06, + "loss": 1.2330877780914307, + "step": 566 + }, + { + "epoch": 0.8291970802919708, + "grad_norm": 7.125, + "learning_rate": 3.559961841472005e-06, + "loss": 1.4884552955627441, + "step": 568 + }, + { + "epoch": 0.8321167883211679, + "grad_norm": 3.28125, + "learning_rate": 3.5498145450166057e-06, + "loss": 1.3787778615951538, + "step": 570 + }, + { + "epoch": 0.8350364963503649, + "grad_norm": 3.609375, + "learning_rate": 3.5396485936323456e-06, + "loss": 1.3882396221160889, + "step": 572 + }, + { + "epoch": 0.8379562043795621, + "grad_norm": 3.15625, + "learning_rate": 3.529464224430568e-06, + "loss": 1.3656411170959473, + "step": 574 + }, + { + "epoch": 0.8408759124087591, + "grad_norm": 5.65625, + "learning_rate": 3.5192616749521942e-06, + "loss": 1.5140806436538696, + "step": 576 + }, + { + "epoch": 0.8437956204379562, + "grad_norm": 4.5, + "learning_rate": 3.5090411831621803e-06, + "loss": 1.5188113451004028, + "step": 578 + }, + { + "epoch": 0.8467153284671532, + "grad_norm": 2.671875, + "learning_rate": 3.498802987443974e-06, + "loss": 1.3665883541107178, + "step": 580 + }, + { + "epoch": 0.8496350364963504, + "grad_norm": 5.25, + "learning_rate": 3.4885473265939464e-06, + "loss": 1.383296012878418, + "step": 582 + }, + { + "epoch": 0.8525547445255475, + "grad_norm": 2.71875, + "learning_rate": 3.478274439815831e-06, + "loss": 1.2266430854797363, + "step": 584 + }, + { + "epoch": 0.8554744525547445, + "grad_norm": 3.9375, + "learning_rate": 3.467984566715137e-06, + "loss": 1.5247292518615723, + "step": 586 + }, + { + "epoch": 0.8583941605839416, + "grad_norm": 4.125, + "learning_rate": 3.4576779472935644e-06, + "loss": 1.4203873872756958, + "step": 588 + }, + { + "epoch": 0.8613138686131386, + "grad_norm": 2.46875, + "learning_rate": 3.447354821943407e-06, + "loss": 1.222019076347351, + "step": 590 + }, + { + "epoch": 0.8642335766423358, + "grad_norm": 4.8125, + "learning_rate": 3.4370154314419395e-06, + "loss": 1.2593979835510254, + "step": 592 + }, + { + "epoch": 0.8671532846715329, + "grad_norm": 3.21875, + "learning_rate": 3.4266600169458135e-06, + "loss": 1.22776460647583, + "step": 594 + }, + { + "epoch": 0.8700729927007299, + "grad_norm": 2.703125, + "learning_rate": 3.4162888199854182e-06, + "loss": 1.2717225551605225, + "step": 596 + }, + { + "epoch": 0.872992700729927, + "grad_norm": 1.2890625, + "learning_rate": 3.405902082459259e-06, + "loss": 1.0713449716567993, + "step": 598 + }, + { + "epoch": 0.8759124087591241, + "grad_norm": 3.453125, + "learning_rate": 3.3955000466283073e-06, + "loss": 1.2096487283706665, + "step": 600 + }, + { + "epoch": 0.8788321167883212, + "grad_norm": 2.03125, + "learning_rate": 3.385082955110355e-06, + "loss": 1.2699155807495117, + "step": 602 + }, + { + "epoch": 0.8817518248175182, + "grad_norm": 2.328125, + "learning_rate": 3.3746510508743533e-06, + "loss": 1.3786303997039795, + "step": 604 + }, + { + "epoch": 0.8846715328467153, + "grad_norm": 5.53125, + "learning_rate": 3.3642045772347453e-06, + "loss": 1.3685808181762695, + "step": 606 + }, + { + "epoch": 0.8875912408759125, + "grad_norm": 9.0625, + "learning_rate": 3.353743777845795e-06, + "loss": 1.178727626800537, + "step": 608 + }, + { + "epoch": 0.8905109489051095, + "grad_norm": 4.1875, + "learning_rate": 3.343268896695897e-06, + "loss": 1.383094310760498, + "step": 610 + }, + { + "epoch": 0.8934306569343066, + "grad_norm": 3.359375, + "learning_rate": 3.3327801781018925e-06, + "loss": 1.4056508541107178, + "step": 612 + }, + { + "epoch": 0.8963503649635036, + "grad_norm": 4.65625, + "learning_rate": 3.322277866703367e-06, + "loss": 1.5974513292312622, + "step": 614 + }, + { + "epoch": 0.8992700729927007, + "grad_norm": 1.1875, + "learning_rate": 3.3117622074569476e-06, + "loss": 1.1610685586929321, + "step": 616 + }, + { + "epoch": 0.9021897810218978, + "grad_norm": 10.75, + "learning_rate": 3.3012334456305846e-06, + "loss": 0.901719331741333, + "step": 618 + }, + { + "epoch": 0.9051094890510949, + "grad_norm": 8.3125, + "learning_rate": 3.2906918267978355e-06, + "loss": 1.2409268617630005, + "step": 620 + }, + { + "epoch": 0.908029197080292, + "grad_norm": 3.453125, + "learning_rate": 3.2801375968321355e-06, + "loss": 1.4349682331085205, + "step": 622 + }, + { + "epoch": 0.910948905109489, + "grad_norm": 6.875, + "learning_rate": 3.269571001901061e-06, + "loss": 1.3277549743652344, + "step": 624 + }, + { + "epoch": 0.9138686131386862, + "grad_norm": 5.1875, + "learning_rate": 3.2589922884605924e-06, + "loss": 1.3614181280136108, + "step": 626 + }, + { + "epoch": 0.9167883211678832, + "grad_norm": 9.125, + "learning_rate": 3.2484017032493615e-06, + "loss": 1.705947756767273, + "step": 628 + }, + { + "epoch": 0.9197080291970803, + "grad_norm": 4.0, + "learning_rate": 3.237799493282897e-06, + "loss": 1.3996449708938599, + "step": 630 + }, + { + "epoch": 0.9226277372262773, + "grad_norm": 2.75, + "learning_rate": 3.2271859058478666e-06, + "loss": 1.4013357162475586, + "step": 632 + }, + { + "epoch": 0.9255474452554745, + "grad_norm": 3.46875, + "learning_rate": 3.2165611884963055e-06, + "loss": 1.2193137407302856, + "step": 634 + }, + { + "epoch": 0.9284671532846716, + "grad_norm": 2.421875, + "learning_rate": 3.2059255890398445e-06, + "loss": 0.9855245351791382, + "step": 636 + }, + { + "epoch": 0.9313868613138686, + "grad_norm": 3.59375, + "learning_rate": 3.1952793555439276e-06, + "loss": 1.4272806644439697, + "step": 638 + }, + { + "epoch": 0.9343065693430657, + "grad_norm": 3.421875, + "learning_rate": 3.18462273632203e-06, + "loss": 1.1866121292114258, + "step": 640 + }, + { + "epoch": 0.9372262773722628, + "grad_norm": 6.84375, + "learning_rate": 3.173955979929863e-06, + "loss": 1.385930061340332, + "step": 642 + }, + { + "epoch": 0.9401459854014599, + "grad_norm": 1.8125, + "learning_rate": 3.163279335159578e-06, + "loss": 1.283376932144165, + "step": 644 + }, + { + "epoch": 0.9430656934306569, + "grad_norm": 5.0625, + "learning_rate": 3.152593051033966e-06, + "loss": 1.368044376373291, + "step": 646 + }, + { + "epoch": 0.945985401459854, + "grad_norm": 14.0625, + "learning_rate": 3.1418973768006424e-06, + "loss": 0.6849503517150879, + "step": 648 + }, + { + "epoch": 0.948905109489051, + "grad_norm": 2.140625, + "learning_rate": 3.1311925619262417e-06, + "loss": 1.3481240272521973, + "step": 650 + }, + { + "epoch": 0.9518248175182482, + "grad_norm": 3.234375, + "learning_rate": 3.1204788560905935e-06, + "loss": 1.390141248703003, + "step": 652 + }, + { + "epoch": 0.9547445255474453, + "grad_norm": 8.8125, + "learning_rate": 3.1097565091809033e-06, + "loss": 1.3187050819396973, + "step": 654 + }, + { + "epoch": 0.9576642335766423, + "grad_norm": 12.125, + "learning_rate": 3.0990257712859184e-06, + "loss": 1.3746651411056519, + "step": 656 + }, + { + "epoch": 0.9605839416058394, + "grad_norm": 7.09375, + "learning_rate": 3.0882868926901e-06, + "loss": 1.2352771759033203, + "step": 658 + }, + { + "epoch": 0.9635036496350365, + "grad_norm": 3.46875, + "learning_rate": 3.077540123867783e-06, + "loss": 1.328325629234314, + "step": 660 + }, + { + "epoch": 0.9664233576642336, + "grad_norm": 3.46875, + "learning_rate": 3.066785715477334e-06, + "loss": 1.2275207042694092, + "step": 662 + }, + { + "epoch": 0.9693430656934306, + "grad_norm": 2.4375, + "learning_rate": 3.056023918355307e-06, + "loss": 1.335202693939209, + "step": 664 + }, + { + "epoch": 0.9722627737226277, + "grad_norm": 6.5, + "learning_rate": 3.0452549835105895e-06, + "loss": 1.4829626083374023, + "step": 666 + }, + { + "epoch": 0.9751824817518249, + "grad_norm": 34.0, + "learning_rate": 3.03447916211855e-06, + "loss": 1.5850169658660889, + "step": 668 + }, + { + "epoch": 0.9781021897810219, + "grad_norm": 6.5, + "learning_rate": 3.0236967055151804e-06, + "loss": 1.671141266822815, + "step": 670 + }, + { + "epoch": 0.981021897810219, + "grad_norm": 23.125, + "learning_rate": 3.0129078651912317e-06, + "loss": 1.300727128982544, + "step": 672 + }, + { + "epoch": 0.983941605839416, + "grad_norm": 8.875, + "learning_rate": 3.00211289278635e-06, + "loss": 1.4001004695892334, + "step": 674 + }, + { + "epoch": 0.9868613138686131, + "grad_norm": 8.875, + "learning_rate": 2.991312040083206e-06, + "loss": 0.47176289558410645, + "step": 676 + }, + { + "epoch": 0.9897810218978103, + "grad_norm": 2.875, + "learning_rate": 2.9805055590016225e-06, + "loss": 1.2891722917556763, + "step": 678 + }, + { + "epoch": 0.9927007299270073, + "grad_norm": 4.1875, + "learning_rate": 2.9696937015926995e-06, + "loss": 1.365147352218628, + "step": 680 + }, + { + "epoch": 0.9956204379562044, + "grad_norm": 1.8828125, + "learning_rate": 2.9588767200329348e-06, + "loss": 1.2809860706329346, + "step": 682 + }, + { + "epoch": 0.9985401459854014, + "grad_norm": 8.25, + "learning_rate": 2.9480548666183427e-06, + "loss": 1.6904196739196777, + "step": 684 + }, + { + "epoch": 1.0014598540145985, + "grad_norm": 2.21875, + "learning_rate": 2.9372283937585675e-06, + "loss": 1.3279258012771606, + "step": 686 + }, + { + "epoch": 1.0043795620437956, + "grad_norm": 4.34375, + "learning_rate": 2.926397553970999e-06, + "loss": 1.277381181716919, + "step": 688 + }, + { + "epoch": 1.0072992700729928, + "grad_norm": 5.84375, + "learning_rate": 2.915562599874882e-06, + "loss": 1.500443935394287, + "step": 690 + }, + { + "epoch": 1.0102189781021897, + "grad_norm": 9.875, + "learning_rate": 2.904723784185422e-06, + "loss": 1.2994956970214844, + "step": 692 + }, + { + "epoch": 1.013138686131387, + "grad_norm": 10.6875, + "learning_rate": 2.893881359707894e-06, + "loss": 1.227457046508789, + "step": 694 + }, + { + "epoch": 1.0160583941605839, + "grad_norm": 2.984375, + "learning_rate": 2.883035579331744e-06, + "loss": 1.2923262119293213, + "step": 696 + }, + { + "epoch": 1.018978102189781, + "grad_norm": 4.0, + "learning_rate": 2.8721866960246912e-06, + "loss": 1.445424199104309, + "step": 698 + }, + { + "epoch": 1.0218978102189782, + "grad_norm": 2.1875, + "learning_rate": 2.861334962826828e-06, + "loss": 1.1312172412872314, + "step": 700 + }, + { + "epoch": 1.0248175182481751, + "grad_norm": 3.734375, + "learning_rate": 2.8504806328447177e-06, + "loss": 1.4891958236694336, + "step": 702 + }, + { + "epoch": 1.0277372262773723, + "grad_norm": 3.734375, + "learning_rate": 2.8396239592454914e-06, + "loss": 1.4066648483276367, + "step": 704 + }, + { + "epoch": 1.0306569343065692, + "grad_norm": 4.21875, + "learning_rate": 2.828765195250942e-06, + "loss": 1.4027667045593262, + "step": 706 + }, + { + "epoch": 1.0335766423357664, + "grad_norm": 3.828125, + "learning_rate": 2.8179045941316214e-06, + "loss": 1.3984425067901611, + "step": 708 + }, + { + "epoch": 1.0364963503649636, + "grad_norm": 37.25, + "learning_rate": 2.8070424092009264e-06, + "loss": 1.5881340503692627, + "step": 710 + }, + { + "epoch": 1.0394160583941605, + "grad_norm": 5.21875, + "learning_rate": 2.7961788938091994e-06, + "loss": 1.3652167320251465, + "step": 712 + }, + { + "epoch": 1.0423357664233577, + "grad_norm": 9.0, + "learning_rate": 2.785314301337811e-06, + "loss": 1.4395644664764404, + "step": 714 + }, + { + "epoch": 1.0452554744525548, + "grad_norm": 4.125, + "learning_rate": 2.7744488851932568e-06, + "loss": 1.3807083368301392, + "step": 716 + }, + { + "epoch": 1.0481751824817518, + "grad_norm": 16.625, + "learning_rate": 2.76358289880124e-06, + "loss": 1.2562787532806396, + "step": 718 + }, + { + "epoch": 1.051094890510949, + "grad_norm": 4.03125, + "learning_rate": 2.752716595600768e-06, + "loss": 1.2394318580627441, + "step": 720 + }, + { + "epoch": 1.054014598540146, + "grad_norm": 8.625, + "learning_rate": 2.7418502290382352e-06, + "loss": 1.1047321557998657, + "step": 722 + }, + { + "epoch": 1.056934306569343, + "grad_norm": 4.46875, + "learning_rate": 2.7309840525615146e-06, + "loss": 1.5514793395996094, + "step": 724 + }, + { + "epoch": 1.0598540145985402, + "grad_norm": 3.234375, + "learning_rate": 2.720118319614047e-06, + "loss": 1.2009215354919434, + "step": 726 + }, + { + "epoch": 1.0627737226277372, + "grad_norm": 2.65625, + "learning_rate": 2.709253283628924e-06, + "loss": 1.2573150396347046, + "step": 728 + }, + { + "epoch": 1.0656934306569343, + "grad_norm": 8.9375, + "learning_rate": 2.698389198022987e-06, + "loss": 1.624213457107544, + "step": 730 + }, + { + "epoch": 1.0686131386861315, + "grad_norm": 5.375, + "learning_rate": 2.6875263161909054e-06, + "loss": 1.3574187755584717, + "step": 732 + }, + { + "epoch": 1.0715328467153284, + "grad_norm": 7.4375, + "learning_rate": 2.676664891499275e-06, + "loss": 1.2222844362258911, + "step": 734 + }, + { + "epoch": 1.0744525547445256, + "grad_norm": 1.640625, + "learning_rate": 2.6658051772807046e-06, + "loss": 1.2617628574371338, + "step": 736 + }, + { + "epoch": 1.0773722627737226, + "grad_norm": 8.0, + "learning_rate": 2.6549474268279074e-06, + "loss": 1.3748055696487427, + "step": 738 + }, + { + "epoch": 1.0802919708029197, + "grad_norm": 8.5625, + "learning_rate": 2.644091893387793e-06, + "loss": 1.4741809368133545, + "step": 740 + }, + { + "epoch": 1.0832116788321169, + "grad_norm": 7.1875, + "learning_rate": 2.6332388301555615e-06, + "loss": 1.3683550357818604, + "step": 742 + }, + { + "epoch": 1.0861313868613138, + "grad_norm": 23.125, + "learning_rate": 2.622388490268799e-06, + "loss": 1.4302444458007812, + "step": 744 + }, + { + "epoch": 1.089051094890511, + "grad_norm": 2.875, + "learning_rate": 2.6115411268015716e-06, + "loss": 1.3794375658035278, + "step": 746 + }, + { + "epoch": 1.091970802919708, + "grad_norm": 3.5, + "learning_rate": 2.6006969927585214e-06, + "loss": 1.6521217823028564, + "step": 748 + }, + { + "epoch": 1.094890510948905, + "grad_norm": 4.09375, + "learning_rate": 2.589856341068969e-06, + "loss": 1.380043625831604, + "step": 750 + }, + { + "epoch": 1.0978102189781023, + "grad_norm": 2.84375, + "learning_rate": 2.5790194245810125e-06, + "loss": 1.2655432224273682, + "step": 752 + }, + { + "epoch": 1.1007299270072992, + "grad_norm": 8.6875, + "learning_rate": 2.568186496055628e-06, + "loss": 1.4429633617401123, + "step": 754 + }, + { + "epoch": 1.1036496350364964, + "grad_norm": 2.34375, + "learning_rate": 2.5573578081607793e-06, + "loss": 1.1212751865386963, + "step": 756 + }, + { + "epoch": 1.1065693430656935, + "grad_norm": 2.71875, + "learning_rate": 2.546533613465518e-06, + "loss": 0.9118128418922424, + "step": 758 + }, + { + "epoch": 1.1094890510948905, + "grad_norm": 2.9375, + "learning_rate": 2.5357141644340966e-06, + "loss": 1.3533203601837158, + "step": 760 + }, + { + "epoch": 1.1124087591240877, + "grad_norm": 5.625, + "learning_rate": 2.5248997134200833e-06, + "loss": 1.2528855800628662, + "step": 762 + }, + { + "epoch": 1.1153284671532846, + "grad_norm": 2.5, + "learning_rate": 2.5140905126604677e-06, + "loss": 1.244079351425171, + "step": 764 + }, + { + "epoch": 1.1182481751824818, + "grad_norm": 5.71875, + "learning_rate": 2.503286814269783e-06, + "loss": 1.3053560256958008, + "step": 766 + }, + { + "epoch": 1.121167883211679, + "grad_norm": 1.5546875, + "learning_rate": 2.4924888702342266e-06, + "loss": 1.2007651329040527, + "step": 768 + }, + { + "epoch": 1.1240875912408759, + "grad_norm": 5.5625, + "learning_rate": 2.481696932405779e-06, + "loss": 1.3610585927963257, + "step": 770 + }, + { + "epoch": 1.127007299270073, + "grad_norm": 2.59375, + "learning_rate": 2.4709112524963326e-06, + "loss": 1.3990166187286377, + "step": 772 + }, + { + "epoch": 1.12992700729927, + "grad_norm": 3.484375, + "learning_rate": 2.4601320820718196e-06, + "loss": 1.3095015287399292, + "step": 774 + }, + { + "epoch": 1.1328467153284671, + "grad_norm": 2.84375, + "learning_rate": 2.4493596725463435e-06, + "loss": 1.2231605052947998, + "step": 776 + }, + { + "epoch": 1.1357664233576643, + "grad_norm": 5.875, + "learning_rate": 2.438594275176318e-06, + "loss": 1.3952467441558838, + "step": 778 + }, + { + "epoch": 1.1386861313868613, + "grad_norm": 5.09375, + "learning_rate": 2.4278361410546027e-06, + "loss": 1.2288057804107666, + "step": 780 + }, + { + "epoch": 1.1416058394160584, + "grad_norm": 7.15625, + "learning_rate": 2.41708552110465e-06, + "loss": 1.46846342086792, + "step": 782 + }, + { + "epoch": 1.1445255474452556, + "grad_norm": 3.421875, + "learning_rate": 2.4063426660746517e-06, + "loss": 1.3782763481140137, + "step": 784 + }, + { + "epoch": 1.1474452554744525, + "grad_norm": 9.375, + "learning_rate": 2.3956078265316883e-06, + "loss": 1.2458666563034058, + "step": 786 + }, + { + "epoch": 1.1503649635036497, + "grad_norm": 3.59375, + "learning_rate": 2.3848812528558887e-06, + "loss": 1.2981244325637817, + "step": 788 + }, + { + "epoch": 1.1532846715328466, + "grad_norm": 5.96875, + "learning_rate": 2.374163195234586e-06, + "loss": 1.3579144477844238, + "step": 790 + }, + { + "epoch": 1.1562043795620438, + "grad_norm": 1.4765625, + "learning_rate": 2.3634539036564853e-06, + "loss": 1.2424495220184326, + "step": 792 + }, + { + "epoch": 1.159124087591241, + "grad_norm": 3.78125, + "learning_rate": 2.352753627905833e-06, + "loss": 1.6642348766326904, + "step": 794 + }, + { + "epoch": 1.162043795620438, + "grad_norm": 5.90625, + "learning_rate": 2.3420626175565877e-06, + "loss": 1.1931509971618652, + "step": 796 + }, + { + "epoch": 1.164963503649635, + "grad_norm": 3.75, + "learning_rate": 2.331381121966603e-06, + "loss": 1.3377602100372314, + "step": 798 + }, + { + "epoch": 1.167883211678832, + "grad_norm": 3.640625, + "learning_rate": 2.3207093902718066e-06, + "loss": 1.2145559787750244, + "step": 800 + }, + { + "epoch": 1.1708029197080292, + "grad_norm": 2.078125, + "learning_rate": 2.3100476713803967e-06, + "loss": 1.1511560678482056, + "step": 802 + }, + { + "epoch": 1.1737226277372264, + "grad_norm": 4.75, + "learning_rate": 2.2993962139670292e-06, + "loss": 1.5985954999923706, + "step": 804 + }, + { + "epoch": 1.1766423357664233, + "grad_norm": 6.71875, + "learning_rate": 2.288755266467022e-06, + "loss": 1.4606941938400269, + "step": 806 + }, + { + "epoch": 1.1795620437956205, + "grad_norm": 7.75, + "learning_rate": 2.2781250770705575e-06, + "loss": 1.5486199855804443, + "step": 808 + }, + { + "epoch": 1.1824817518248176, + "grad_norm": 4.1875, + "learning_rate": 2.267505893716898e-06, + "loss": 1.3502545356750488, + "step": 810 + }, + { + "epoch": 1.1854014598540146, + "grad_norm": 7.40625, + "learning_rate": 2.2568979640885964e-06, + "loss": 1.5650737285614014, + "step": 812 + }, + { + "epoch": 1.1883211678832117, + "grad_norm": 7.96875, + "learning_rate": 2.246301535605726e-06, + "loss": 1.6433610916137695, + "step": 814 + }, + { + "epoch": 1.1912408759124087, + "grad_norm": 3.78125, + "learning_rate": 2.2357168554201066e-06, + "loss": 1.0836632251739502, + "step": 816 + }, + { + "epoch": 1.1941605839416058, + "grad_norm": 3.796875, + "learning_rate": 2.225144170409537e-06, + "loss": 1.1502854824066162, + "step": 818 + }, + { + "epoch": 1.197080291970803, + "grad_norm": 3.015625, + "learning_rate": 2.2145837271720433e-06, + "loss": 1.6808114051818848, + "step": 820 + }, + { + "epoch": 1.2, + "grad_norm": 3.296875, + "learning_rate": 2.204035772020121e-06, + "loss": 1.3705600500106812, + "step": 822 + }, + { + "epoch": 1.2029197080291971, + "grad_norm": 2.78125, + "learning_rate": 2.1935005509749933e-06, + "loss": 1.1946570873260498, + "step": 824 + }, + { + "epoch": 1.205839416058394, + "grad_norm": 17.75, + "learning_rate": 2.182978309760874e-06, + "loss": 1.5363470315933228, + "step": 826 + }, + { + "epoch": 1.2087591240875912, + "grad_norm": 3.78125, + "learning_rate": 2.1724692937992313e-06, + "loss": 1.4042502641677856, + "step": 828 + }, + { + "epoch": 1.2116788321167884, + "grad_norm": 17.25, + "learning_rate": 2.16197374820307e-06, + "loss": 1.2589643001556396, + "step": 830 + }, + { + "epoch": 1.2145985401459853, + "grad_norm": 3.359375, + "learning_rate": 2.1514919177712085e-06, + "loss": 1.6056280136108398, + "step": 832 + }, + { + "epoch": 1.2175182481751825, + "grad_norm": 4.3125, + "learning_rate": 2.141024046982573e-06, + "loss": 1.3564906120300293, + "step": 834 + }, + { + "epoch": 1.2204379562043797, + "grad_norm": 11.625, + "learning_rate": 2.1305703799904947e-06, + "loss": 0.9380712509155273, + "step": 836 + }, + { + "epoch": 1.2233576642335766, + "grad_norm": 8.75, + "learning_rate": 2.120131160617013e-06, + "loss": 1.0530650615692139, + "step": 838 + }, + { + "epoch": 1.2262773722627738, + "grad_norm": 8.4375, + "learning_rate": 2.1097066323471897e-06, + "loss": 0.7292347550392151, + "step": 840 + }, + { + "epoch": 1.2291970802919707, + "grad_norm": 8.125, + "learning_rate": 2.0992970383234336e-06, + "loss": 0.9691898226737976, + "step": 842 + }, + { + "epoch": 1.2321167883211679, + "grad_norm": 1.796875, + "learning_rate": 2.088902621339823e-06, + "loss": 1.152883768081665, + "step": 844 + }, + { + "epoch": 1.235036496350365, + "grad_norm": 6.3125, + "learning_rate": 2.078523623836446e-06, + "loss": 1.4850080013275146, + "step": 846 + }, + { + "epoch": 1.237956204379562, + "grad_norm": 7.3125, + "learning_rate": 2.0681602878937472e-06, + "loss": 1.3769371509552002, + "step": 848 + }, + { + "epoch": 1.2408759124087592, + "grad_norm": 3.53125, + "learning_rate": 2.057812855226879e-06, + "loss": 1.103143334388733, + "step": 850 + }, + { + "epoch": 1.243795620437956, + "grad_norm": 3.578125, + "learning_rate": 2.0474815671800644e-06, + "loss": 1.4019992351531982, + "step": 852 + }, + { + "epoch": 1.2467153284671533, + "grad_norm": 5.40625, + "learning_rate": 2.0371666647209694e-06, + "loss": 1.1963081359863281, + "step": 854 + }, + { + "epoch": 1.2496350364963504, + "grad_norm": 3.0625, + "learning_rate": 2.0268683884350803e-06, + "loss": 1.1888788938522339, + "step": 856 + }, + { + "epoch": 1.2525547445255474, + "grad_norm": 13.6875, + "learning_rate": 2.0165869785200938e-06, + "loss": 1.2623980045318604, + "step": 858 + }, + { + "epoch": 1.2554744525547445, + "grad_norm": 6.4375, + "learning_rate": 2.0063226747803143e-06, + "loss": 1.2596468925476074, + "step": 860 + }, + { + "epoch": 1.2583941605839417, + "grad_norm": 3.859375, + "learning_rate": 1.9960757166210596e-06, + "loss": 1.333680272102356, + "step": 862 + }, + { + "epoch": 1.2613138686131387, + "grad_norm": 3.71875, + "learning_rate": 1.9858463430430807e-06, + "loss": 1.1413600444793701, + "step": 864 + }, + { + "epoch": 1.2642335766423358, + "grad_norm": 5.5625, + "learning_rate": 1.9756347926369813e-06, + "loss": 1.3728548288345337, + "step": 866 + }, + { + "epoch": 1.2671532846715328, + "grad_norm": 4.15625, + "learning_rate": 1.9654413035776585e-06, + "loss": 1.449355125427246, + "step": 868 + }, + { + "epoch": 1.27007299270073, + "grad_norm": 4.09375, + "learning_rate": 1.9552661136187444e-06, + "loss": 1.1183695793151855, + "step": 870 + }, + { + "epoch": 1.2729927007299269, + "grad_norm": 4.40625, + "learning_rate": 1.945109460087061e-06, + "loss": 1.1493186950683594, + "step": 872 + }, + { + "epoch": 1.275912408759124, + "grad_norm": 2.640625, + "learning_rate": 1.934971579877088e-06, + "loss": 1.3397104740142822, + "step": 874 + }, + { + "epoch": 1.2788321167883212, + "grad_norm": 4.3125, + "learning_rate": 1.9248527094454316e-06, + "loss": 1.3082889318466187, + "step": 876 + }, + { + "epoch": 1.2817518248175181, + "grad_norm": 8.4375, + "learning_rate": 1.9147530848053152e-06, + "loss": 1.563565731048584, + "step": 878 + }, + { + "epoch": 1.2846715328467153, + "grad_norm": 8.25, + "learning_rate": 1.9046729415210686e-06, + "loss": 1.4606716632843018, + "step": 880 + }, + { + "epoch": 1.2875912408759125, + "grad_norm": 4.65625, + "learning_rate": 1.8946125147026427e-06, + "loss": 1.3690614700317383, + "step": 882 + }, + { + "epoch": 1.2905109489051094, + "grad_norm": 7.8125, + "learning_rate": 1.8845720390001154e-06, + "loss": 1.6756688356399536, + "step": 884 + }, + { + "epoch": 1.2934306569343066, + "grad_norm": 3.21875, + "learning_rate": 1.874551748598226e-06, + "loss": 1.2701613903045654, + "step": 886 + }, + { + "epoch": 1.2963503649635038, + "grad_norm": 4.78125, + "learning_rate": 1.8645518772109077e-06, + "loss": 1.5865097045898438, + "step": 888 + }, + { + "epoch": 1.2992700729927007, + "grad_norm": 3.921875, + "learning_rate": 1.8545726580758428e-06, + "loss": 1.401726484298706, + "step": 890 + }, + { + "epoch": 1.3021897810218979, + "grad_norm": 7.78125, + "learning_rate": 1.8446143239490168e-06, + "loss": 1.6153247356414795, + "step": 892 + }, + { + "epoch": 1.305109489051095, + "grad_norm": 6.125, + "learning_rate": 1.8346771070992914e-06, + "loss": 1.4763232469558716, + "step": 894 + }, + { + "epoch": 1.308029197080292, + "grad_norm": 1.984375, + "learning_rate": 1.82476123930299e-06, + "loss": 1.2044928073883057, + "step": 896 + }, + { + "epoch": 1.310948905109489, + "grad_norm": 1.4296875, + "learning_rate": 1.8148669518384862e-06, + "loss": 1.0226365327835083, + "step": 898 + }, + { + "epoch": 1.313868613138686, + "grad_norm": 2.1875, + "learning_rate": 1.804994475480815e-06, + "loss": 1.0369101762771606, + "step": 900 + }, + { + "epoch": 1.3167883211678832, + "grad_norm": 1.6875, + "learning_rate": 1.7951440404962856e-06, + "loss": 1.1433358192443848, + "step": 902 + }, + { + "epoch": 1.3197080291970802, + "grad_norm": 5.3125, + "learning_rate": 1.7853158766371143e-06, + "loss": 1.1160844564437866, + "step": 904 + }, + { + "epoch": 1.3226277372262774, + "grad_norm": 10.1875, + "learning_rate": 1.7755102131360639e-06, + "loss": 1.3365674018859863, + "step": 906 + }, + { + "epoch": 1.3255474452554745, + "grad_norm": 2.21875, + "learning_rate": 1.7657272787010967e-06, + "loss": 1.3394170999526978, + "step": 908 + }, + { + "epoch": 1.3284671532846715, + "grad_norm": 14.0625, + "learning_rate": 1.7559673015100405e-06, + "loss": 1.2542470693588257, + "step": 910 + }, + { + "epoch": 1.3313868613138686, + "grad_norm": 1.9453125, + "learning_rate": 1.7462305092052676e-06, + "loss": 1.2083182334899902, + "step": 912 + }, + { + "epoch": 1.3343065693430658, + "grad_norm": 2.234375, + "learning_rate": 1.7365171288883841e-06, + "loss": 1.0745160579681396, + "step": 914 + }, + { + "epoch": 1.3372262773722627, + "grad_norm": 5.5, + "learning_rate": 1.7268273871149335e-06, + "loss": 1.4868173599243164, + "step": 916 + }, + { + "epoch": 1.34014598540146, + "grad_norm": 5.96875, + "learning_rate": 1.7171615098891117e-06, + "loss": 0.7804101705551147, + "step": 918 + }, + { + "epoch": 1.343065693430657, + "grad_norm": 3.65625, + "learning_rate": 1.7075197226584969e-06, + "loss": 1.3761916160583496, + "step": 920 + }, + { + "epoch": 1.345985401459854, + "grad_norm": 2.640625, + "learning_rate": 1.6979022503087905e-06, + "loss": 1.413581132888794, + "step": 922 + }, + { + "epoch": 1.348905109489051, + "grad_norm": 8.125, + "learning_rate": 1.688309317158572e-06, + "loss": 1.6476316452026367, + "step": 924 + }, + { + "epoch": 1.3518248175182481, + "grad_norm": 5.4375, + "learning_rate": 1.6787411469540677e-06, + "loss": 1.5541059970855713, + "step": 926 + }, + { + "epoch": 1.3547445255474453, + "grad_norm": 6.125, + "learning_rate": 1.6691979628639281e-06, + "loss": 1.5634403228759766, + "step": 928 + }, + { + "epoch": 1.3576642335766422, + "grad_norm": 2.65625, + "learning_rate": 1.6596799874740294e-06, + "loss": 1.2540359497070312, + "step": 930 + }, + { + "epoch": 1.3605839416058394, + "grad_norm": 5.59375, + "learning_rate": 1.6501874427822767e-06, + "loss": 1.4849543571472168, + "step": 932 + }, + { + "epoch": 1.3635036496350366, + "grad_norm": 6.40625, + "learning_rate": 1.6407205501934285e-06, + "loss": 1.141026496887207, + "step": 934 + }, + { + "epoch": 1.3664233576642335, + "grad_norm": 2.375, + "learning_rate": 1.6312795305139328e-06, + "loss": 0.9827671647071838, + "step": 936 + }, + { + "epoch": 1.3693430656934307, + "grad_norm": 5.5, + "learning_rate": 1.6218646039467725e-06, + "loss": 1.4801573753356934, + "step": 938 + }, + { + "epoch": 1.3722627737226278, + "grad_norm": 3.109375, + "learning_rate": 1.6124759900863365e-06, + "loss": 1.6479110717773438, + "step": 940 + }, + { + "epoch": 1.3751824817518248, + "grad_norm": 7.25, + "learning_rate": 1.6031139079132933e-06, + "loss": 1.2483787536621094, + "step": 942 + }, + { + "epoch": 1.378102189781022, + "grad_norm": 1.453125, + "learning_rate": 1.593778575789484e-06, + "loss": 1.2027292251586914, + "step": 944 + }, + { + "epoch": 1.3810218978102191, + "grad_norm": 3.859375, + "learning_rate": 1.5844702114528315e-06, + "loss": 1.5109983682632446, + "step": 946 + }, + { + "epoch": 1.383941605839416, + "grad_norm": 5.34375, + "learning_rate": 1.5751890320122568e-06, + "loss": 1.3143746852874756, + "step": 948 + }, + { + "epoch": 1.3868613138686132, + "grad_norm": 8.25, + "learning_rate": 1.5659352539426215e-06, + "loss": 1.2749611139297485, + "step": 950 + }, + { + "epoch": 1.3897810218978102, + "grad_norm": 2.125, + "learning_rate": 1.5567090930796746e-06, + "loss": 1.244338035583496, + "step": 952 + }, + { + "epoch": 1.3927007299270073, + "grad_norm": 4.3125, + "learning_rate": 1.5475107646150203e-06, + "loss": 1.3380858898162842, + "step": 954 + }, + { + "epoch": 1.3956204379562043, + "grad_norm": 1.15625, + "learning_rate": 1.5383404830910981e-06, + "loss": 1.4054020643234253, + "step": 956 + }, + { + "epoch": 1.3985401459854014, + "grad_norm": 10.5625, + "learning_rate": 1.529198462396175e-06, + "loss": 1.4239089488983154, + "step": 958 + }, + { + "epoch": 1.4014598540145986, + "grad_norm": 9.25, + "learning_rate": 1.5200849157593666e-06, + "loss": 1.610469102859497, + "step": 960 + }, + { + "epoch": 1.4043795620437955, + "grad_norm": 1.71875, + "learning_rate": 1.5110000557456542e-06, + "loss": 1.1694961786270142, + "step": 962 + }, + { + "epoch": 1.4072992700729927, + "grad_norm": 5.625, + "learning_rate": 1.5019440942509312e-06, + "loss": 1.5139713287353516, + "step": 964 + }, + { + "epoch": 1.4102189781021899, + "grad_norm": 3.953125, + "learning_rate": 1.4929172424970576e-06, + "loss": 1.376784324645996, + "step": 966 + }, + { + "epoch": 1.4131386861313868, + "grad_norm": 2.34375, + "learning_rate": 1.483919711026939e-06, + "loss": 1.3103041648864746, + "step": 968 + }, + { + "epoch": 1.416058394160584, + "grad_norm": 1.328125, + "learning_rate": 1.4749517096996116e-06, + "loss": 1.2476757764816284, + "step": 970 + }, + { + "epoch": 1.4189781021897812, + "grad_norm": 3.703125, + "learning_rate": 1.4660134476853485e-06, + "loss": 1.3406193256378174, + "step": 972 + }, + { + "epoch": 1.421897810218978, + "grad_norm": 4.375, + "learning_rate": 1.4571051334607813e-06, + "loss": 1.2700021266937256, + "step": 974 + }, + { + "epoch": 1.4248175182481753, + "grad_norm": 2.90625, + "learning_rate": 1.4482269748040358e-06, + "loss": 1.2266380786895752, + "step": 976 + }, + { + "epoch": 1.4277372262773722, + "grad_norm": 3.3125, + "learning_rate": 1.4393791787898896e-06, + "loss": 1.189935564994812, + "step": 978 + }, + { + "epoch": 1.4306569343065694, + "grad_norm": 4.8125, + "learning_rate": 1.430561951784938e-06, + "loss": 1.4163111448287964, + "step": 980 + }, + { + "epoch": 1.4335766423357663, + "grad_norm": 7.125, + "learning_rate": 1.4217754994427844e-06, + "loss": 1.6390494108200073, + "step": 982 + }, + { + "epoch": 1.4364963503649635, + "grad_norm": 1.6640625, + "learning_rate": 1.4130200266992408e-06, + "loss": 1.1357786655426025, + "step": 984 + }, + { + "epoch": 1.4394160583941606, + "grad_norm": 3.5625, + "learning_rate": 1.4042957377675484e-06, + "loss": 1.2841823101043701, + "step": 986 + }, + { + "epoch": 1.4423357664233576, + "grad_norm": 7.34375, + "learning_rate": 1.395602836133616e-06, + "loss": 1.3807730674743652, + "step": 988 + }, + { + "epoch": 1.4452554744525548, + "grad_norm": 1.421875, + "learning_rate": 1.386941524551273e-06, + "loss": 1.135375738143921, + "step": 990 + }, + { + "epoch": 1.448175182481752, + "grad_norm": 2.875, + "learning_rate": 1.37831200503754e-06, + "loss": 1.1764510869979858, + "step": 992 + }, + { + "epoch": 1.4510948905109489, + "grad_norm": 8.9375, + "learning_rate": 1.3697144788679174e-06, + "loss": 1.2467272281646729, + "step": 994 + }, + { + "epoch": 1.454014598540146, + "grad_norm": 5.90625, + "learning_rate": 1.3611491465716898e-06, + "loss": 1.4708714485168457, + "step": 996 + }, + { + "epoch": 1.4569343065693432, + "grad_norm": 3.71875, + "learning_rate": 1.3526162079272495e-06, + "loss": 1.402409553527832, + "step": 998 + }, + { + "epoch": 1.4598540145985401, + "grad_norm": 4.59375, + "learning_rate": 1.34411586195744e-06, + "loss": 1.2477829456329346, + "step": 1000 + }, + { + "epoch": 1.4627737226277373, + "grad_norm": 4.0625, + "learning_rate": 1.3356483069249088e-06, + "loss": 1.3877084255218506, + "step": 1002 + }, + { + "epoch": 1.4656934306569342, + "grad_norm": 7.875, + "learning_rate": 1.3272137403274844e-06, + "loss": 1.555393934249878, + "step": 1004 + }, + { + "epoch": 1.4686131386861314, + "grad_norm": 3.671875, + "learning_rate": 1.318812358893572e-06, + "loss": 1.3621551990509033, + "step": 1006 + }, + { + "epoch": 1.4715328467153284, + "grad_norm": 4.59375, + "learning_rate": 1.3104443585775642e-06, + "loss": 1.3545817136764526, + "step": 1008 + }, + { + "epoch": 1.4744525547445255, + "grad_norm": 3.9375, + "learning_rate": 1.3021099345552695e-06, + "loss": 1.4017988443374634, + "step": 1010 + }, + { + "epoch": 1.4773722627737227, + "grad_norm": 6.21875, + "learning_rate": 1.2938092812193615e-06, + "loss": 1.3940372467041016, + "step": 1012 + }, + { + "epoch": 1.4802919708029196, + "grad_norm": 3.1875, + "learning_rate": 1.285542592174842e-06, + "loss": 1.1765646934509277, + "step": 1014 + }, + { + "epoch": 1.4832116788321168, + "grad_norm": 6.0, + "learning_rate": 1.277310060234529e-06, + "loss": 1.385852336883545, + "step": 1016 + }, + { + "epoch": 1.486131386861314, + "grad_norm": 4.8125, + "learning_rate": 1.2691118774145577e-06, + "loss": 1.395111322402954, + "step": 1018 + }, + { + "epoch": 1.489051094890511, + "grad_norm": 1.640625, + "learning_rate": 1.2609482349299021e-06, + "loss": 1.325355052947998, + "step": 1020 + }, + { + "epoch": 1.491970802919708, + "grad_norm": 3.515625, + "learning_rate": 1.2528193231899156e-06, + "loss": 1.2050141096115112, + "step": 1022 + }, + { + "epoch": 1.4948905109489052, + "grad_norm": 4.03125, + "learning_rate": 1.2447253317938871e-06, + "loss": 1.6511290073394775, + "step": 1024 + }, + { + "epoch": 1.4978102189781022, + "grad_norm": 3.609375, + "learning_rate": 1.236666449526623e-06, + "loss": 1.28155517578125, + "step": 1026 + }, + { + "epoch": 1.5007299270072991, + "grad_norm": 3.734375, + "learning_rate": 1.2286428643540418e-06, + "loss": 1.4207556247711182, + "step": 1028 + }, + { + "epoch": 1.5036496350364965, + "grad_norm": 3.359375, + "learning_rate": 1.22065476341879e-06, + "loss": 1.3519251346588135, + "step": 1030 + }, + { + "epoch": 1.5065693430656935, + "grad_norm": 5.84375, + "learning_rate": 1.2127023330358777e-06, + "loss": 1.396289587020874, + "step": 1032 + }, + { + "epoch": 1.5094890510948904, + "grad_norm": 2.65625, + "learning_rate": 1.204785758688331e-06, + "loss": 1.3400771617889404, + "step": 1034 + }, + { + "epoch": 1.5124087591240876, + "grad_norm": 31.25, + "learning_rate": 1.1969052250228683e-06, + "loss": 1.1934255361557007, + "step": 1036 + }, + { + "epoch": 1.5153284671532847, + "grad_norm": 4.90625, + "learning_rate": 1.1890609158455949e-06, + "loss": 1.4513096809387207, + "step": 1038 + }, + { + "epoch": 1.5182481751824817, + "grad_norm": 2.625, + "learning_rate": 1.181253014117711e-06, + "loss": 1.1264418363571167, + "step": 1040 + }, + { + "epoch": 1.5211678832116788, + "grad_norm": 1.65625, + "learning_rate": 1.1734817019512465e-06, + "loss": 1.1497807502746582, + "step": 1042 + }, + { + "epoch": 1.524087591240876, + "grad_norm": 7.8125, + "learning_rate": 1.1657471606048157e-06, + "loss": 1.6058242321014404, + "step": 1044 + }, + { + "epoch": 1.527007299270073, + "grad_norm": 22.25, + "learning_rate": 1.1580495704793874e-06, + "loss": 1.4766197204589844, + "step": 1046 + }, + { + "epoch": 1.5299270072992701, + "grad_norm": 3.75, + "learning_rate": 1.1503891111140767e-06, + "loss": 1.2432148456573486, + "step": 1048 + }, + { + "epoch": 1.5328467153284673, + "grad_norm": 28.25, + "learning_rate": 1.1427659611819604e-06, + "loss": 1.1451390981674194, + "step": 1050 + }, + { + "epoch": 1.5357664233576642, + "grad_norm": 3.734375, + "learning_rate": 1.1351802984859045e-06, + "loss": 1.3471091985702515, + "step": 1052 + }, + { + "epoch": 1.5386861313868612, + "grad_norm": 1.640625, + "learning_rate": 1.127632299954423e-06, + "loss": 1.1958954334259033, + "step": 1054 + }, + { + "epoch": 1.5416058394160586, + "grad_norm": 10.8125, + "learning_rate": 1.1201221416375456e-06, + "loss": 1.3556766510009766, + "step": 1056 + }, + { + "epoch": 1.5445255474452555, + "grad_norm": 4.75, + "learning_rate": 1.1126499987027172e-06, + "loss": 1.6111273765563965, + "step": 1058 + }, + { + "epoch": 1.5474452554744524, + "grad_norm": 12.5, + "learning_rate": 1.1052160454307085e-06, + "loss": 1.5189365148544312, + "step": 1060 + }, + { + "epoch": 1.5503649635036496, + "grad_norm": 3.96875, + "learning_rate": 1.0978204552115493e-06, + "loss": 1.3763346672058105, + "step": 1062 + }, + { + "epoch": 1.5532846715328468, + "grad_norm": 4.375, + "learning_rate": 1.0904634005404902e-06, + "loss": 1.450345754623413, + "step": 1064 + }, + { + "epoch": 1.5562043795620437, + "grad_norm": 4.09375, + "learning_rate": 1.0831450530139747e-06, + "loss": 1.2109770774841309, + "step": 1066 + }, + { + "epoch": 1.5591240875912409, + "grad_norm": 7.0, + "learning_rate": 1.0758655833256381e-06, + "loss": 1.2681195735931396, + "step": 1068 + }, + { + "epoch": 1.562043795620438, + "grad_norm": 2.640625, + "learning_rate": 1.0686251612623277e-06, + "loss": 1.2694846391677856, + "step": 1070 + }, + { + "epoch": 1.564963503649635, + "grad_norm": 8.6875, + "learning_rate": 1.0614239557001389e-06, + "loss": 1.5101749897003174, + "step": 1072 + }, + { + "epoch": 1.5678832116788322, + "grad_norm": 3.171875, + "learning_rate": 1.0542621346004806e-06, + "loss": 1.313795566558838, + "step": 1074 + }, + { + "epoch": 1.5708029197080293, + "grad_norm": 9.0, + "learning_rate": 1.047139865006155e-06, + "loss": 1.1664808988571167, + "step": 1076 + }, + { + "epoch": 1.5737226277372263, + "grad_norm": 2.03125, + "learning_rate": 1.0400573130374641e-06, + "loss": 1.203639030456543, + "step": 1078 + }, + { + "epoch": 1.5766423357664232, + "grad_norm": 3.265625, + "learning_rate": 1.0330146438883304e-06, + "loss": 1.5285131931304932, + "step": 1080 + }, + { + "epoch": 1.5795620437956206, + "grad_norm": 6.5625, + "learning_rate": 1.0260120218224485e-06, + "loss": 1.516188144683838, + "step": 1082 + }, + { + "epoch": 1.5824817518248175, + "grad_norm": 6.9375, + "learning_rate": 1.019049610169452e-06, + "loss": 1.3165411949157715, + "step": 1084 + }, + { + "epoch": 1.5854014598540145, + "grad_norm": 4.6875, + "learning_rate": 1.012127571321104e-06, + "loss": 1.1730577945709229, + "step": 1086 + }, + { + "epoch": 1.5883211678832116, + "grad_norm": 4.46875, + "learning_rate": 1.0052460667275102e-06, + "loss": 1.3837532997131348, + "step": 1088 + }, + { + "epoch": 1.5912408759124088, + "grad_norm": 4.71875, + "learning_rate": 9.984052568933507e-07, + "loss": 1.342604398727417, + "step": 1090 + }, + { + "epoch": 1.5941605839416058, + "grad_norm": 1.8046875, + "learning_rate": 9.916053013741396e-07, + "loss": 1.0345500707626343, + "step": 1092 + }, + { + "epoch": 1.597080291970803, + "grad_norm": 3.578125, + "learning_rate": 9.848463587725024e-07, + "loss": 1.3031237125396729, + "step": 1094 + }, + { + "epoch": 1.6, + "grad_norm": 2.6875, + "learning_rate": 9.78128586734476e-07, + "loss": 1.4126646518707275, + "step": 1096 + }, + { + "epoch": 1.602919708029197, + "grad_norm": 2.796875, + "learning_rate": 9.714521419458333e-07, + "loss": 1.2036532163619995, + "step": 1098 + }, + { + "epoch": 1.6058394160583942, + "grad_norm": 5.34375, + "learning_rate": 9.648171801284254e-07, + "loss": 1.3445477485656738, + "step": 1100 + }, + { + "epoch": 1.6087591240875914, + "grad_norm": 6.875, + "learning_rate": 9.582238560365534e-07, + "loss": 1.4824466705322266, + "step": 1102 + }, + { + "epoch": 1.6116788321167883, + "grad_norm": 2.171875, + "learning_rate": 9.516723234533573e-07, + "loss": 0.6945338845252991, + "step": 1104 + }, + { + "epoch": 1.6145985401459853, + "grad_norm": 4.375, + "learning_rate": 9.451627351872289e-07, + "loss": 1.691240906715393, + "step": 1106 + }, + { + "epoch": 1.6175182481751826, + "grad_norm": 5.0625, + "learning_rate": 9.386952430682478e-07, + "loss": 1.6143536567687988, + "step": 1108 + }, + { + "epoch": 1.6204379562043796, + "grad_norm": 4.90625, + "learning_rate": 9.322699979446395e-07, + "loss": 1.0810116529464722, + "step": 1110 + }, + { + "epoch": 1.6233576642335765, + "grad_norm": 3.953125, + "learning_rate": 9.25887149679259e-07, + "loss": 1.3443822860717773, + "step": 1112 + }, + { + "epoch": 1.6262773722627737, + "grad_norm": 2.5, + "learning_rate": 9.19546847146093e-07, + "loss": 1.392272710800171, + "step": 1114 + }, + { + "epoch": 1.6291970802919709, + "grad_norm": 2.890625, + "learning_rate": 9.132492382267895e-07, + "loss": 1.2860863208770752, + "step": 1116 + }, + { + "epoch": 1.6321167883211678, + "grad_norm": 6.03125, + "learning_rate": 9.069944698072071e-07, + "loss": 1.4681463241577148, + "step": 1118 + }, + { + "epoch": 1.635036496350365, + "grad_norm": 1.828125, + "learning_rate": 9.0078268777399e-07, + "loss": 1.1984715461730957, + "step": 1120 + }, + { + "epoch": 1.6379562043795621, + "grad_norm": 3.328125, + "learning_rate": 8.946140370111651e-07, + "loss": 1.3620171546936035, + "step": 1122 + }, + { + "epoch": 1.640875912408759, + "grad_norm": 2.5625, + "learning_rate": 8.884886613967625e-07, + "loss": 1.0197124481201172, + "step": 1124 + }, + { + "epoch": 1.6437956204379562, + "grad_norm": 2.96875, + "learning_rate": 8.824067037994597e-07, + "loss": 1.2507963180541992, + "step": 1126 + }, + { + "epoch": 1.6467153284671534, + "grad_norm": 7.59375, + "learning_rate": 8.763683060752492e-07, + "loss": 1.5034403800964355, + "step": 1128 + }, + { + "epoch": 1.6496350364963503, + "grad_norm": 3.703125, + "learning_rate": 8.703736090641302e-07, + "loss": 1.250478744506836, + "step": 1130 + }, + { + "epoch": 1.6525547445255473, + "grad_norm": 2.921875, + "learning_rate": 8.644227525868238e-07, + "loss": 1.2682870626449585, + "step": 1132 + }, + { + "epoch": 1.6554744525547447, + "grad_norm": 8.5, + "learning_rate": 8.585158754415114e-07, + "loss": 1.5448431968688965, + "step": 1134 + }, + { + "epoch": 1.6583941605839416, + "grad_norm": 5.65625, + "learning_rate": 8.52653115400598e-07, + "loss": 1.3879718780517578, + "step": 1136 + }, + { + "epoch": 1.6613138686131386, + "grad_norm": 3.3125, + "learning_rate": 8.468346092074961e-07, + "loss": 1.3755671977996826, + "step": 1138 + }, + { + "epoch": 1.6642335766423357, + "grad_norm": 2.75, + "learning_rate": 8.410604925734411e-07, + "loss": 1.1513915061950684, + "step": 1140 + }, + { + "epoch": 1.667153284671533, + "grad_norm": 14.1875, + "learning_rate": 8.35330900174322e-07, + "loss": 1.5474663972854614, + "step": 1142 + }, + { + "epoch": 1.6700729927007298, + "grad_norm": 3.515625, + "learning_rate": 8.296459656475413e-07, + "loss": 0.8504141569137573, + "step": 1144 + }, + { + "epoch": 1.672992700729927, + "grad_norm": 5.78125, + "learning_rate": 8.240058215888998e-07, + "loss": 1.3289515972137451, + "step": 1146 + }, + { + "epoch": 1.6759124087591242, + "grad_norm": 6.9375, + "learning_rate": 8.184105995494998e-07, + "loss": 0.9470740556716919, + "step": 1148 + }, + { + "epoch": 1.6788321167883211, + "grad_norm": 3.359375, + "learning_rate": 8.128604300326812e-07, + "loss": 1.352350115776062, + "step": 1150 + }, + { + "epoch": 1.6817518248175183, + "grad_norm": 4.78125, + "learning_rate": 8.073554424909755e-07, + "loss": 1.3660526275634766, + "step": 1152 + }, + { + "epoch": 1.6846715328467154, + "grad_norm": 2.46875, + "learning_rate": 8.01895765323087e-07, + "loss": 1.2722463607788086, + "step": 1154 + }, + { + "epoch": 1.6875912408759124, + "grad_norm": 6.21875, + "learning_rate": 7.964815258708971e-07, + "loss": 1.13301420211792, + "step": 1156 + }, + { + "epoch": 1.6905109489051093, + "grad_norm": 2.03125, + "learning_rate": 7.911128504164947e-07, + "loss": 1.3945411443710327, + "step": 1158 + }, + { + "epoch": 1.6934306569343067, + "grad_norm": 1.7421875, + "learning_rate": 7.857898641792322e-07, + "loss": 1.1629891395568848, + "step": 1160 + }, + { + "epoch": 1.6963503649635037, + "grad_norm": 2.09375, + "learning_rate": 7.805126913128018e-07, + "loss": 1.1993281841278076, + "step": 1162 + }, + { + "epoch": 1.6992700729927006, + "grad_norm": 3.0625, + "learning_rate": 7.752814549023437e-07, + "loss": 1.4611374139785767, + "step": 1164 + }, + { + "epoch": 1.7021897810218978, + "grad_norm": 4.625, + "learning_rate": 7.700962769615704e-07, + "loss": 1.1919968128204346, + "step": 1166 + }, + { + "epoch": 1.705109489051095, + "grad_norm": 2.515625, + "learning_rate": 7.649572784299255e-07, + "loss": 1.2250781059265137, + "step": 1168 + }, + { + "epoch": 1.7080291970802919, + "grad_norm": 8.1875, + "learning_rate": 7.598645791697601e-07, + "loss": 1.3479260206222534, + "step": 1170 + }, + { + "epoch": 1.710948905109489, + "grad_norm": 4.25, + "learning_rate": 7.548182979635389e-07, + "loss": 1.3197946548461914, + "step": 1172 + }, + { + "epoch": 1.7138686131386862, + "grad_norm": 8.6875, + "learning_rate": 7.49818552511068e-07, + "loss": 1.1691796779632568, + "step": 1174 + }, + { + "epoch": 1.7167883211678832, + "grad_norm": 3.203125, + "learning_rate": 7.448654594267496e-07, + "loss": 1.2978925704956055, + "step": 1176 + }, + { + "epoch": 1.7197080291970803, + "grad_norm": 2.96875, + "learning_rate": 7.399591342368644e-07, + "loss": 1.174210786819458, + "step": 1178 + }, + { + "epoch": 1.7226277372262775, + "grad_norm": 4.625, + "learning_rate": 7.350996913768743e-07, + "loss": 1.2740840911865234, + "step": 1180 + }, + { + "epoch": 1.7255474452554744, + "grad_norm": 8.0625, + "learning_rate": 7.302872441887562e-07, + "loss": 1.1019668579101562, + "step": 1182 + }, + { + "epoch": 1.7284671532846714, + "grad_norm": 2.84375, + "learning_rate": 7.255219049183552e-07, + "loss": 1.3885023593902588, + "step": 1184 + }, + { + "epoch": 1.7313868613138688, + "grad_norm": 5.625, + "learning_rate": 7.208037847127683e-07, + "loss": 1.5192725658416748, + "step": 1186 + }, + { + "epoch": 1.7343065693430657, + "grad_norm": 6.625, + "learning_rate": 7.161329936177522e-07, + "loss": 1.3260494470596313, + "step": 1188 + }, + { + "epoch": 1.7372262773722627, + "grad_norm": 3.375, + "learning_rate": 7.115096405751567e-07, + "loss": 1.3762927055358887, + "step": 1190 + }, + { + "epoch": 1.7401459854014598, + "grad_norm": 1.8515625, + "learning_rate": 7.069338334203818e-07, + "loss": 1.0026099681854248, + "step": 1192 + }, + { + "epoch": 1.743065693430657, + "grad_norm": 1.1015625, + "learning_rate": 7.024056788798658e-07, + "loss": 1.1264629364013672, + "step": 1194 + }, + { + "epoch": 1.745985401459854, + "grad_norm": 16.75, + "learning_rate": 6.979252825685927e-07, + "loss": 1.5443601608276367, + "step": 1196 + }, + { + "epoch": 1.748905109489051, + "grad_norm": 1.8671875, + "learning_rate": 6.934927489876312e-07, + "loss": 1.0794442892074585, + "step": 1198 + }, + { + "epoch": 1.7518248175182483, + "grad_norm": 6.90625, + "learning_rate": 6.891081815216958e-07, + "loss": 1.348907470703125, + "step": 1200 + }, + { + "epoch": 1.7547445255474452, + "grad_norm": 3.140625, + "learning_rate": 6.847716824367369e-07, + "loss": 1.3414909839630127, + "step": 1202 + }, + { + "epoch": 1.7576642335766424, + "grad_norm": 4.59375, + "learning_rate": 6.804833528775531e-07, + "loss": 1.4073083400726318, + "step": 1204 + }, + { + "epoch": 1.7605839416058395, + "grad_norm": 3.671875, + "learning_rate": 6.762432928654358e-07, + "loss": 0.8366962671279907, + "step": 1206 + }, + { + "epoch": 1.7635036496350365, + "grad_norm": 5.53125, + "learning_rate": 6.720516012958325e-07, + "loss": 1.3547214269638062, + "step": 1208 + }, + { + "epoch": 1.7664233576642334, + "grad_norm": 5.21875, + "learning_rate": 6.679083759360433e-07, + "loss": 1.6114599704742432, + "step": 1210 + }, + { + "epoch": 1.7693430656934308, + "grad_norm": 4.5, + "learning_rate": 6.638137134229375e-07, + "loss": 1.5248315334320068, + "step": 1212 + }, + { + "epoch": 1.7722627737226277, + "grad_norm": 3.6875, + "learning_rate": 6.597677092607025e-07, + "loss": 1.093032956123352, + "step": 1214 + }, + { + "epoch": 1.7751824817518247, + "grad_norm": 4.5, + "learning_rate": 6.557704578186146e-07, + "loss": 1.408461093902588, + "step": 1216 + }, + { + "epoch": 1.7781021897810219, + "grad_norm": 9.9375, + "learning_rate": 6.518220523288382e-07, + "loss": 1.3268358707427979, + "step": 1218 + }, + { + "epoch": 1.781021897810219, + "grad_norm": 4.75, + "learning_rate": 6.479225848842523e-07, + "loss": 1.544386386871338, + "step": 1220 + }, + { + "epoch": 1.783941605839416, + "grad_norm": 5.9375, + "learning_rate": 6.440721464362998e-07, + "loss": 1.4272065162658691, + "step": 1222 + }, + { + "epoch": 1.7868613138686131, + "grad_norm": 3.515625, + "learning_rate": 6.402708267928694e-07, + "loss": 1.3150466680526733, + "step": 1224 + }, + { + "epoch": 1.7897810218978103, + "grad_norm": 5.0, + "learning_rate": 6.365187146161991e-07, + "loss": 1.2979998588562012, + "step": 1226 + }, + { + "epoch": 1.7927007299270072, + "grad_norm": 4.75, + "learning_rate": 6.32815897420809e-07, + "loss": 1.6841963529586792, + "step": 1228 + }, + { + "epoch": 1.7956204379562044, + "grad_norm": 5.0, + "learning_rate": 6.29162461571459e-07, + "loss": 1.6227900981903076, + "step": 1230 + }, + { + "epoch": 1.7985401459854016, + "grad_norm": 11.6875, + "learning_rate": 6.25558492281135e-07, + "loss": 1.4919426441192627, + "step": 1232 + }, + { + "epoch": 1.8014598540145985, + "grad_norm": 4.8125, + "learning_rate": 6.220040736090617e-07, + "loss": 1.3797836303710938, + "step": 1234 + }, + { + "epoch": 1.8043795620437955, + "grad_norm": 4.09375, + "learning_rate": 6.18499288458743e-07, + "loss": 1.6902371644973755, + "step": 1236 + }, + { + "epoch": 1.8072992700729928, + "grad_norm": 2.453125, + "learning_rate": 6.150442185760258e-07, + "loss": 1.2298048734664917, + "step": 1238 + }, + { + "epoch": 1.8102189781021898, + "grad_norm": 4.53125, + "learning_rate": 6.116389445471948e-07, + "loss": 1.3514063358306885, + "step": 1240 + }, + { + "epoch": 1.8131386861313867, + "grad_norm": 3.828125, + "learning_rate": 6.082835457970935e-07, + "loss": 1.3649213314056396, + "step": 1242 + }, + { + "epoch": 1.816058394160584, + "grad_norm": 4.15625, + "learning_rate": 6.0497810058727e-07, + "loss": 1.3873786926269531, + "step": 1244 + }, + { + "epoch": 1.818978102189781, + "grad_norm": 5.21875, + "learning_rate": 6.017226860141535e-07, + "loss": 1.6073391437530518, + "step": 1246 + }, + { + "epoch": 1.821897810218978, + "grad_norm": 2.90625, + "learning_rate": 5.985173780072558e-07, + "loss": 1.333566427230835, + "step": 1248 + }, + { + "epoch": 1.8248175182481752, + "grad_norm": 3.0625, + "learning_rate": 5.953622513273977e-07, + "loss": 1.3585089445114136, + "step": 1250 + }, + { + "epoch": 1.8277372262773723, + "grad_norm": 3.953125, + "learning_rate": 5.92257379564969e-07, + "loss": 1.195847749710083, + "step": 1252 + }, + { + "epoch": 1.8306569343065693, + "grad_norm": 4.84375, + "learning_rate": 5.892028351382101e-07, + "loss": 1.4418195486068726, + "step": 1254 + }, + { + "epoch": 1.8335766423357664, + "grad_norm": 4.09375, + "learning_rate": 5.861986892915227e-07, + "loss": 1.384018063545227, + "step": 1256 + }, + { + "epoch": 1.8364963503649636, + "grad_norm": 9.4375, + "learning_rate": 5.832450120938093e-07, + "loss": 1.3380024433135986, + "step": 1258 + }, + { + "epoch": 1.8394160583941606, + "grad_norm": 6.46875, + "learning_rate": 5.803418724368373e-07, + "loss": 1.3088436126708984, + "step": 1260 + }, + { + "epoch": 1.8423357664233575, + "grad_norm": 9.9375, + "learning_rate": 5.774893380336338e-07, + "loss": 1.5858633518218994, + "step": 1262 + }, + { + "epoch": 1.845255474452555, + "grad_norm": 6.375, + "learning_rate": 5.746874754169053e-07, + "loss": 1.5293078422546387, + "step": 1264 + }, + { + "epoch": 1.8481751824817518, + "grad_norm": 2.921875, + "learning_rate": 5.719363499374861e-07, + "loss": 1.1518256664276123, + "step": 1266 + }, + { + "epoch": 1.8510948905109488, + "grad_norm": 7.6875, + "learning_rate": 5.692360257628144e-07, + "loss": 1.3224802017211914, + "step": 1268 + }, + { + "epoch": 1.854014598540146, + "grad_norm": 4.28125, + "learning_rate": 5.665865658754341e-07, + "loss": 1.2233679294586182, + "step": 1270 + }, + { + "epoch": 1.856934306569343, + "grad_norm": 6.34375, + "learning_rate": 5.639880320715284e-07, + "loss": 1.4993672370910645, + "step": 1272 + }, + { + "epoch": 1.85985401459854, + "grad_norm": 3.703125, + "learning_rate": 5.614404849594762e-07, + "loss": 1.3802194595336914, + "step": 1274 + }, + { + "epoch": 1.8627737226277372, + "grad_norm": 2.5625, + "learning_rate": 5.589439839584404e-07, + "loss": 1.0489559173583984, + "step": 1276 + }, + { + "epoch": 1.8656934306569344, + "grad_norm": 1.40625, + "learning_rate": 5.564985872969791e-07, + "loss": 1.2326107025146484, + "step": 1278 + }, + { + "epoch": 1.8686131386861313, + "grad_norm": 5.4375, + "learning_rate": 5.541043520116912e-07, + "loss": 1.1945993900299072, + "step": 1280 + }, + { + "epoch": 1.8715328467153285, + "grad_norm": 2.625, + "learning_rate": 5.517613339458832e-07, + "loss": 1.2813007831573486, + "step": 1282 + }, + { + "epoch": 1.8744525547445257, + "grad_norm": 4.46875, + "learning_rate": 5.494695877482676e-07, + "loss": 1.1684314012527466, + "step": 1284 + }, + { + "epoch": 1.8773722627737226, + "grad_norm": 3.71875, + "learning_rate": 5.472291668716893e-07, + "loss": 1.222388505935669, + "step": 1286 + }, + { + "epoch": 1.8802919708029195, + "grad_norm": 2.984375, + "learning_rate": 5.450401235718762e-07, + "loss": 1.2156729698181152, + "step": 1288 + }, + { + "epoch": 1.883211678832117, + "grad_norm": 5.96875, + "learning_rate": 5.42902508906224e-07, + "loss": 1.311574935913086, + "step": 1290 + }, + { + "epoch": 1.8861313868613139, + "grad_norm": 7.96875, + "learning_rate": 5.408163727326021e-07, + "loss": 1.34036123752594, + "step": 1292 + }, + { + "epoch": 1.8890510948905108, + "grad_norm": 3.640625, + "learning_rate": 5.387817637081928e-07, + "loss": 1.1132798194885254, + "step": 1294 + }, + { + "epoch": 1.891970802919708, + "grad_norm": 3.359375, + "learning_rate": 5.367987292883554e-07, + "loss": 1.3646128177642822, + "step": 1296 + }, + { + "epoch": 1.8948905109489051, + "grad_norm": 5.1875, + "learning_rate": 5.348673157255195e-07, + "loss": 1.4554338455200195, + "step": 1298 + }, + { + "epoch": 1.897810218978102, + "grad_norm": 3.96875, + "learning_rate": 5.329875680681065e-07, + "loss": 1.4109296798706055, + "step": 1300 + }, + { + "epoch": 1.9007299270072993, + "grad_norm": 4.875, + "learning_rate": 5.311595301594783e-07, + "loss": 1.1961219310760498, + "step": 1302 + }, + { + "epoch": 1.9036496350364964, + "grad_norm": 2.921875, + "learning_rate": 5.293832446369158e-07, + "loss": 0.6657427549362183, + "step": 1304 + }, + { + "epoch": 1.9065693430656934, + "grad_norm": 10.4375, + "learning_rate": 5.276587529306236e-07, + "loss": 1.397131323814392, + "step": 1306 + }, + { + "epoch": 1.9094890510948905, + "grad_norm": 6.5, + "learning_rate": 5.25986095262763e-07, + "loss": 1.323398470878601, + "step": 1308 + }, + { + "epoch": 1.9124087591240877, + "grad_norm": 3.203125, + "learning_rate": 5.243653106465157e-07, + "loss": 1.3060777187347412, + "step": 1310 + }, + { + "epoch": 1.9153284671532846, + "grad_norm": 5.71875, + "learning_rate": 5.227964368851721e-07, + "loss": 1.5433318614959717, + "step": 1312 + }, + { + "epoch": 1.9182481751824818, + "grad_norm": 3.359375, + "learning_rate": 5.212795105712508e-07, + "loss": 1.4788509607315063, + "step": 1314 + }, + { + "epoch": 1.921167883211679, + "grad_norm": 4.8125, + "learning_rate": 5.198145670856438e-07, + "loss": 1.3976120948791504, + "step": 1316 + }, + { + "epoch": 1.924087591240876, + "grad_norm": 2.0625, + "learning_rate": 5.184016405967931e-07, + "loss": 1.1872693300247192, + "step": 1318 + }, + { + "epoch": 1.9270072992700729, + "grad_norm": 2.296875, + "learning_rate": 5.170407640598921e-07, + "loss": 1.1601970195770264, + "step": 1320 + }, + { + "epoch": 1.92992700729927, + "grad_norm": 3.5625, + "learning_rate": 5.157319692161178e-07, + "loss": 1.205195426940918, + "step": 1322 + }, + { + "epoch": 1.9328467153284672, + "grad_norm": 3.734375, + "learning_rate": 5.144752865918901e-07, + "loss": 1.1591906547546387, + "step": 1324 + }, + { + "epoch": 1.9357664233576641, + "grad_norm": 3.421875, + "learning_rate": 5.132707454981602e-07, + "loss": 1.3498120307922363, + "step": 1326 + }, + { + "epoch": 1.9386861313868613, + "grad_norm": 3.796875, + "learning_rate": 5.121183740297261e-07, + "loss": 1.3916034698486328, + "step": 1328 + }, + { + "epoch": 1.9416058394160585, + "grad_norm": 17.375, + "learning_rate": 5.110181990645788e-07, + "loss": 1.2117153406143188, + "step": 1330 + }, + { + "epoch": 1.9445255474452554, + "grad_norm": 1.734375, + "learning_rate": 5.099702462632737e-07, + "loss": 1.19834566116333, + "step": 1332 + }, + { + "epoch": 1.9474452554744526, + "grad_norm": 10.0625, + "learning_rate": 5.089745400683333e-07, + "loss": 0.8368179798126221, + "step": 1334 + }, + { + "epoch": 1.9503649635036497, + "grad_norm": 5.625, + "learning_rate": 5.080311037036767e-07, + "loss": 1.314239263534546, + "step": 1336 + }, + { + "epoch": 1.9532846715328467, + "grad_norm": 1.65625, + "learning_rate": 5.071399591740777e-07, + "loss": 1.216627597808838, + "step": 1338 + }, + { + "epoch": 1.9562043795620438, + "grad_norm": 6.375, + "learning_rate": 5.063011272646521e-07, + "loss": 1.2274556159973145, + "step": 1340 + }, + { + "epoch": 1.959124087591241, + "grad_norm": 2.546875, + "learning_rate": 5.055146275403725e-07, + "loss": 1.4812201261520386, + "step": 1342 + }, + { + "epoch": 1.962043795620438, + "grad_norm": 5.71875, + "learning_rate": 5.047804783456117e-07, + "loss": 1.215821623802185, + "step": 1344 + }, + { + "epoch": 1.964963503649635, + "grad_norm": 4.71875, + "learning_rate": 5.040986968037157e-07, + "loss": 1.318119764328003, + "step": 1346 + }, + { + "epoch": 1.967883211678832, + "grad_norm": 2.953125, + "learning_rate": 5.034692988166033e-07, + "loss": 1.2136964797973633, + "step": 1348 + }, + { + "epoch": 1.9708029197080292, + "grad_norm": 4.125, + "learning_rate": 5.028922990643963e-07, + "loss": 1.3341786861419678, + "step": 1350 + }, + { + "epoch": 1.9737226277372262, + "grad_norm": 3.75, + "learning_rate": 5.023677110050759e-07, + "loss": 1.4188188314437866, + "step": 1352 + }, + { + "epoch": 1.9766423357664233, + "grad_norm": 3.421875, + "learning_rate": 5.018955468741701e-07, + "loss": 1.608628511428833, + "step": 1354 + }, + { + "epoch": 1.9795620437956205, + "grad_norm": 3.359375, + "learning_rate": 5.014758176844665e-07, + "loss": 1.5936325788497925, + "step": 1356 + }, + { + "epoch": 1.9824817518248175, + "grad_norm": 2.796875, + "learning_rate": 5.011085332257579e-07, + "loss": 1.178612232208252, + "step": 1358 + }, + { + "epoch": 1.9854014598540146, + "grad_norm": 7.1875, + "learning_rate": 5.007937020646117e-07, + "loss": 1.1231637001037598, + "step": 1360 + }, + { + "epoch": 1.9883211678832118, + "grad_norm": 1.90625, + "learning_rate": 5.005313315441716e-07, + "loss": 0.6363063454627991, + "step": 1362 + }, + { + "epoch": 1.9912408759124087, + "grad_norm": 5.5, + "learning_rate": 5.003214277839851e-07, + "loss": 1.3855026960372925, + "step": 1364 + }, + { + "epoch": 1.994160583941606, + "grad_norm": 5.6875, + "learning_rate": 5.00163995679862e-07, + "loss": 1.346792459487915, + "step": 1366 + }, + { + "epoch": 1.997080291970803, + "grad_norm": 8.1875, + "learning_rate": 5.000590389037593e-07, + "loss": 1.3148702383041382, + "step": 1368 + }, + { + "epoch": 2.0, + "grad_norm": 4.0625, + "learning_rate": 5.00006559903696e-07, + "loss": 1.6425683498382568, + "step": 1370 + }, + { + "epoch": 2.0, + "step": 1370, + "total_flos": 1.984544544032555e+18, + "train_loss": 1.409229011779284, + "train_runtime": 8212.4061, + "train_samples_per_second": 2.669, + "train_steps_per_second": 0.167 + } + ], + "logging_steps": 2, + "max_steps": 1370, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.984544544032555e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}