{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 116.68827514412096, "learning_rate": 5.91715976331361e-08, "loss": 1.459, "step": 1 }, { "epoch": 0.0, "grad_norm": 137.05559220326518, "learning_rate": 1.183431952662722e-07, "loss": 1.5293, "step": 2 }, { "epoch": 0.0, "grad_norm": 96.59057528089231, "learning_rate": 1.775147928994083e-07, "loss": 1.4766, "step": 3 }, { "epoch": 0.0, "grad_norm": 66.60254143032729, "learning_rate": 2.366863905325444e-07, "loss": 1.2988, "step": 4 }, { "epoch": 0.0, "grad_norm": 103.34499410914529, "learning_rate": 2.958579881656805e-07, "loss": 1.3887, "step": 5 }, { "epoch": 0.0, "grad_norm": 52.76380439002071, "learning_rate": 3.550295857988166e-07, "loss": 1.1865, "step": 6 }, { "epoch": 0.0, "grad_norm": 48.37017508979888, "learning_rate": 4.1420118343195276e-07, "loss": 1.3906, "step": 7 }, { "epoch": 0.01, "grad_norm": 94.36297438079606, "learning_rate": 4.733727810650888e-07, "loss": 1.3477, "step": 8 }, { "epoch": 0.01, "grad_norm": 45.10300787040884, "learning_rate": 5.32544378698225e-07, "loss": 1.2451, "step": 9 }, { "epoch": 0.01, "grad_norm": 103.24939665836075, "learning_rate": 5.91715976331361e-07, "loss": 1.3477, "step": 10 }, { "epoch": 0.01, "grad_norm": 102.10130791881714, "learning_rate": 6.50887573964497e-07, "loss": 1.2109, "step": 11 }, { "epoch": 0.01, "grad_norm": 29.836605877501068, "learning_rate": 7.100591715976332e-07, "loss": 1.1523, "step": 12 }, { "epoch": 0.01, "grad_norm": 35.91418906814854, "learning_rate": 7.692307692307694e-07, "loss": 1.2031, "step": 13 }, { "epoch": 0.01, "grad_norm": 55.18879990724547, "learning_rate": 8.284023668639055e-07, "loss": 1.1562, "step": 14 }, { "epoch": 0.01, "grad_norm": 111.28485143610422, "learning_rate": 8.875739644970415e-07, "loss": 1.2539, "step": 15 }, { "epoch": 0.01, "grad_norm": 64.53655153369745, "learning_rate": 9.467455621301776e-07, "loss": 1.1113, "step": 16 }, { "epoch": 0.01, "grad_norm": 49.81258432944396, "learning_rate": 1.0059171597633138e-06, "loss": 1.0557, "step": 17 }, { "epoch": 0.01, "grad_norm": 81.3821602984751, "learning_rate": 1.06508875739645e-06, "loss": 1.1436, "step": 18 }, { "epoch": 0.01, "grad_norm": 77.89797760634401, "learning_rate": 1.1242603550295859e-06, "loss": 1.0498, "step": 19 }, { "epoch": 0.01, "grad_norm": 103.51657421815158, "learning_rate": 1.183431952662722e-06, "loss": 1.1523, "step": 20 }, { "epoch": 0.01, "grad_norm": 34.98384966020216, "learning_rate": 1.242603550295858e-06, "loss": 0.8809, "step": 21 }, { "epoch": 0.02, "grad_norm": 17.909290919081947, "learning_rate": 1.301775147928994e-06, "loss": 0.8525, "step": 22 }, { "epoch": 0.02, "grad_norm": 17.7361005439023, "learning_rate": 1.3609467455621303e-06, "loss": 0.8418, "step": 23 }, { "epoch": 0.02, "grad_norm": 26.264367353380752, "learning_rate": 1.4201183431952664e-06, "loss": 0.876, "step": 24 }, { "epoch": 0.02, "grad_norm": 35.89694409386303, "learning_rate": 1.4792899408284026e-06, "loss": 0.8018, "step": 25 }, { "epoch": 0.02, "grad_norm": 52.88511927832449, "learning_rate": 1.5384615384615387e-06, "loss": 0.8203, "step": 26 }, { "epoch": 0.02, "grad_norm": 20.430991723939343, "learning_rate": 1.5976331360946749e-06, "loss": 0.7646, "step": 27 }, { "epoch": 0.02, "grad_norm": 40.02223588407742, "learning_rate": 1.656804733727811e-06, "loss": 0.7539, "step": 28 }, { "epoch": 0.02, "grad_norm": 23.523542799757628, "learning_rate": 1.7159763313609468e-06, "loss": 0.8047, "step": 29 }, { "epoch": 0.02, "grad_norm": 58.78591457424737, "learning_rate": 1.775147928994083e-06, "loss": 0.8398, "step": 30 }, { "epoch": 0.02, "grad_norm": 15.018173530627244, "learning_rate": 1.834319526627219e-06, "loss": 0.7764, "step": 31 }, { "epoch": 0.02, "grad_norm": 15.224774251543503, "learning_rate": 1.8934911242603552e-06, "loss": 0.7041, "step": 32 }, { "epoch": 0.02, "grad_norm": 16.267158783210224, "learning_rate": 1.952662721893491e-06, "loss": 0.6865, "step": 33 }, { "epoch": 0.02, "grad_norm": 19.71449038710854, "learning_rate": 2.0118343195266275e-06, "loss": 0.7051, "step": 34 }, { "epoch": 0.02, "grad_norm": 19.808645253515795, "learning_rate": 2.0710059171597635e-06, "loss": 0.7783, "step": 35 }, { "epoch": 0.03, "grad_norm": 36.10291312881198, "learning_rate": 2.1301775147929e-06, "loss": 0.6826, "step": 36 }, { "epoch": 0.03, "grad_norm": 25.047754674174154, "learning_rate": 2.1893491124260358e-06, "loss": 0.6885, "step": 37 }, { "epoch": 0.03, "grad_norm": 14.948882559824707, "learning_rate": 2.2485207100591717e-06, "loss": 0.6152, "step": 38 }, { "epoch": 0.03, "grad_norm": 26.159147372349974, "learning_rate": 2.307692307692308e-06, "loss": 0.7139, "step": 39 }, { "epoch": 0.03, "grad_norm": 14.488442848627141, "learning_rate": 2.366863905325444e-06, "loss": 0.6602, "step": 40 }, { "epoch": 0.03, "grad_norm": 12.239851284814467, "learning_rate": 2.42603550295858e-06, "loss": 0.6924, "step": 41 }, { "epoch": 0.03, "grad_norm": 12.408441779673309, "learning_rate": 2.485207100591716e-06, "loss": 0.6289, "step": 42 }, { "epoch": 0.03, "grad_norm": 21.76752777666436, "learning_rate": 2.5443786982248527e-06, "loss": 0.6436, "step": 43 }, { "epoch": 0.03, "grad_norm": 11.405290702619705, "learning_rate": 2.603550295857988e-06, "loss": 0.6064, "step": 44 }, { "epoch": 0.03, "grad_norm": 22.56239327853637, "learning_rate": 2.6627218934911246e-06, "loss": 0.6279, "step": 45 }, { "epoch": 0.03, "grad_norm": 25.96378836265653, "learning_rate": 2.7218934911242605e-06, "loss": 0.6484, "step": 46 }, { "epoch": 0.03, "grad_norm": 11.297707654533205, "learning_rate": 2.7810650887573965e-06, "loss": 0.5977, "step": 47 }, { "epoch": 0.03, "grad_norm": 21.368931517016037, "learning_rate": 2.840236686390533e-06, "loss": 0.6211, "step": 48 }, { "epoch": 0.03, "grad_norm": 17.055034780572917, "learning_rate": 2.8994082840236688e-06, "loss": 0.6504, "step": 49 }, { "epoch": 0.04, "grad_norm": 14.170299121299626, "learning_rate": 2.958579881656805e-06, "loss": 0.5391, "step": 50 }, { "epoch": 0.04, "grad_norm": 16.65980083079512, "learning_rate": 3.017751479289941e-06, "loss": 0.6562, "step": 51 }, { "epoch": 0.04, "grad_norm": 9.141571770193977, "learning_rate": 3.0769230769230774e-06, "loss": 0.5576, "step": 52 }, { "epoch": 0.04, "grad_norm": 38.18938821430388, "learning_rate": 3.1360946745562134e-06, "loss": 0.6465, "step": 53 }, { "epoch": 0.04, "grad_norm": 36.485584647459994, "learning_rate": 3.1952662721893497e-06, "loss": 0.6182, "step": 54 }, { "epoch": 0.04, "grad_norm": 26.1172270983848, "learning_rate": 3.2544378698224853e-06, "loss": 0.6304, "step": 55 }, { "epoch": 0.04, "grad_norm": 10.35522270180552, "learning_rate": 3.313609467455622e-06, "loss": 0.5791, "step": 56 }, { "epoch": 0.04, "grad_norm": 8.919799862854324, "learning_rate": 3.3727810650887576e-06, "loss": 0.6064, "step": 57 }, { "epoch": 0.04, "grad_norm": 37.86337212965342, "learning_rate": 3.4319526627218935e-06, "loss": 0.6719, "step": 58 }, { "epoch": 0.04, "grad_norm": 32.4241795097629, "learning_rate": 3.49112426035503e-06, "loss": 0.6816, "step": 59 }, { "epoch": 0.04, "grad_norm": 12.843367212342677, "learning_rate": 3.550295857988166e-06, "loss": 0.5415, "step": 60 }, { "epoch": 0.04, "grad_norm": 11.033086617607612, "learning_rate": 3.609467455621302e-06, "loss": 0.5898, "step": 61 }, { "epoch": 0.04, "grad_norm": 11.78012764506264, "learning_rate": 3.668639053254438e-06, "loss": 0.5967, "step": 62 }, { "epoch": 0.04, "grad_norm": 11.240783033272136, "learning_rate": 3.7278106508875745e-06, "loss": 0.5645, "step": 63 }, { "epoch": 0.05, "grad_norm": 9.123512981508057, "learning_rate": 3.7869822485207104e-06, "loss": 0.5703, "step": 64 }, { "epoch": 0.05, "grad_norm": 27.660633641874856, "learning_rate": 3.846153846153847e-06, "loss": 0.6182, "step": 65 }, { "epoch": 0.05, "grad_norm": 14.27816226916257, "learning_rate": 3.905325443786982e-06, "loss": 0.623, "step": 66 }, { "epoch": 0.05, "grad_norm": 16.991620078648673, "learning_rate": 3.964497041420119e-06, "loss": 0.5547, "step": 67 }, { "epoch": 0.05, "grad_norm": 29.73306262944105, "learning_rate": 4.023668639053255e-06, "loss": 0.624, "step": 68 }, { "epoch": 0.05, "grad_norm": 19.983350773990825, "learning_rate": 4.0828402366863906e-06, "loss": 0.5938, "step": 69 }, { "epoch": 0.05, "grad_norm": 11.203249085331949, "learning_rate": 4.142011834319527e-06, "loss": 0.6152, "step": 70 }, { "epoch": 0.05, "grad_norm": 18.903248382781342, "learning_rate": 4.201183431952663e-06, "loss": 0.5425, "step": 71 }, { "epoch": 0.05, "grad_norm": 16.743327563970325, "learning_rate": 4.2603550295858e-06, "loss": 0.5518, "step": 72 }, { "epoch": 0.05, "grad_norm": 13.530873380507195, "learning_rate": 4.319526627218935e-06, "loss": 0.5859, "step": 73 }, { "epoch": 0.05, "grad_norm": 10.306285659968713, "learning_rate": 4.3786982248520715e-06, "loss": 0.585, "step": 74 }, { "epoch": 0.05, "grad_norm": 16.30837385866365, "learning_rate": 4.437869822485207e-06, "loss": 0.7109, "step": 75 }, { "epoch": 0.05, "grad_norm": 22.972681142302918, "learning_rate": 4.497041420118343e-06, "loss": 0.5781, "step": 76 }, { "epoch": 0.05, "grad_norm": 11.69935251821666, "learning_rate": 4.55621301775148e-06, "loss": 0.5889, "step": 77 }, { "epoch": 0.06, "grad_norm": 30.74542232405432, "learning_rate": 4.615384615384616e-06, "loss": 0.5996, "step": 78 }, { "epoch": 0.06, "grad_norm": 13.053844021932429, "learning_rate": 4.674556213017752e-06, "loss": 0.6523, "step": 79 }, { "epoch": 0.06, "grad_norm": 18.660654525701556, "learning_rate": 4.733727810650888e-06, "loss": 0.5903, "step": 80 }, { "epoch": 0.06, "grad_norm": 33.06855879264672, "learning_rate": 4.792899408284024e-06, "loss": 0.5947, "step": 81 }, { "epoch": 0.06, "grad_norm": 18.027708084710582, "learning_rate": 4.85207100591716e-06, "loss": 0.5249, "step": 82 }, { "epoch": 0.06, "grad_norm": 9.927455268454475, "learning_rate": 4.911242603550296e-06, "loss": 0.5024, "step": 83 }, { "epoch": 0.06, "grad_norm": 13.12037453816752, "learning_rate": 4.970414201183432e-06, "loss": 0.5884, "step": 84 }, { "epoch": 0.06, "grad_norm": 15.758818975299697, "learning_rate": 5.029585798816569e-06, "loss": 0.5601, "step": 85 }, { "epoch": 0.06, "grad_norm": 10.52708954325852, "learning_rate": 5.088757396449705e-06, "loss": 0.4854, "step": 86 }, { "epoch": 0.06, "grad_norm": 16.86444076854706, "learning_rate": 5.14792899408284e-06, "loss": 0.4858, "step": 87 }, { "epoch": 0.06, "grad_norm": 9.60677062559464, "learning_rate": 5.207100591715976e-06, "loss": 0.5625, "step": 88 }, { "epoch": 0.06, "grad_norm": 18.10508123587462, "learning_rate": 5.266272189349113e-06, "loss": 0.5068, "step": 89 }, { "epoch": 0.06, "grad_norm": 15.276312006219614, "learning_rate": 5.325443786982249e-06, "loss": 0.6006, "step": 90 }, { "epoch": 0.06, "grad_norm": 16.278189599056375, "learning_rate": 5.384615384615385e-06, "loss": 0.5049, "step": 91 }, { "epoch": 0.07, "grad_norm": 24.04098331248457, "learning_rate": 5.443786982248521e-06, "loss": 0.4922, "step": 92 }, { "epoch": 0.07, "grad_norm": 29.519688923776314, "learning_rate": 5.502958579881657e-06, "loss": 0.5532, "step": 93 }, { "epoch": 0.07, "grad_norm": 11.073075096267969, "learning_rate": 5.562130177514793e-06, "loss": 0.5015, "step": 94 }, { "epoch": 0.07, "grad_norm": 23.04049428861296, "learning_rate": 5.621301775147929e-06, "loss": 0.4731, "step": 95 }, { "epoch": 0.07, "grad_norm": 15.43611431386209, "learning_rate": 5.680473372781066e-06, "loss": 0.5938, "step": 96 }, { "epoch": 0.07, "grad_norm": 20.37496355488797, "learning_rate": 5.739644970414202e-06, "loss": 0.6074, "step": 97 }, { "epoch": 0.07, "grad_norm": 23.4534463120769, "learning_rate": 5.7988165680473375e-06, "loss": 0.5552, "step": 98 }, { "epoch": 0.07, "grad_norm": 15.224551348565953, "learning_rate": 5.857988165680474e-06, "loss": 0.5259, "step": 99 }, { "epoch": 0.07, "grad_norm": 15.043175008582129, "learning_rate": 5.91715976331361e-06, "loss": 0.5615, "step": 100 }, { "epoch": 0.07, "grad_norm": 13.886760864897116, "learning_rate": 5.976331360946747e-06, "loss": 0.564, "step": 101 }, { "epoch": 0.07, "grad_norm": 10.62984639534972, "learning_rate": 6.035502958579882e-06, "loss": 0.5703, "step": 102 }, { "epoch": 0.07, "grad_norm": 13.366791130831217, "learning_rate": 6.0946745562130185e-06, "loss": 0.4766, "step": 103 }, { "epoch": 0.07, "grad_norm": 19.11640373494133, "learning_rate": 6.153846153846155e-06, "loss": 0.5801, "step": 104 }, { "epoch": 0.07, "grad_norm": 13.344028697162539, "learning_rate": 6.21301775147929e-06, "loss": 0.6074, "step": 105 }, { "epoch": 0.08, "grad_norm": 24.171423052067137, "learning_rate": 6.272189349112427e-06, "loss": 0.5859, "step": 106 }, { "epoch": 0.08, "grad_norm": 10.52701822263934, "learning_rate": 6.331360946745563e-06, "loss": 0.5063, "step": 107 }, { "epoch": 0.08, "grad_norm": 12.13390826805979, "learning_rate": 6.3905325443786995e-06, "loss": 0.5391, "step": 108 }, { "epoch": 0.08, "grad_norm": 24.893616746652306, "learning_rate": 6.449704142011834e-06, "loss": 0.6562, "step": 109 }, { "epoch": 0.08, "grad_norm": 10.267811806709746, "learning_rate": 6.5088757396449705e-06, "loss": 0.5327, "step": 110 }, { "epoch": 0.08, "grad_norm": 9.573660726227825, "learning_rate": 6.568047337278107e-06, "loss": 0.501, "step": 111 }, { "epoch": 0.08, "grad_norm": 9.424349082496683, "learning_rate": 6.627218934911244e-06, "loss": 0.479, "step": 112 }, { "epoch": 0.08, "grad_norm": 19.516504502014815, "learning_rate": 6.686390532544379e-06, "loss": 0.7539, "step": 113 }, { "epoch": 0.08, "grad_norm": 10.53470669412648, "learning_rate": 6.745562130177515e-06, "loss": 0.4805, "step": 114 }, { "epoch": 0.08, "grad_norm": 13.690074282712688, "learning_rate": 6.8047337278106515e-06, "loss": 0.5947, "step": 115 }, { "epoch": 0.08, "grad_norm": 11.491351335831942, "learning_rate": 6.863905325443787e-06, "loss": 0.5752, "step": 116 }, { "epoch": 0.08, "grad_norm": 16.709408023616227, "learning_rate": 6.923076923076923e-06, "loss": 0.5063, "step": 117 }, { "epoch": 0.08, "grad_norm": 17.782856613289166, "learning_rate": 6.98224852071006e-06, "loss": 0.5957, "step": 118 }, { "epoch": 0.08, "grad_norm": 19.968430706562753, "learning_rate": 7.041420118343196e-06, "loss": 0.6045, "step": 119 }, { "epoch": 0.09, "grad_norm": 23.252540842800897, "learning_rate": 7.100591715976332e-06, "loss": 0.5908, "step": 120 }, { "epoch": 0.09, "grad_norm": 10.269334605120518, "learning_rate": 7.159763313609468e-06, "loss": 0.5405, "step": 121 }, { "epoch": 0.09, "grad_norm": 14.810048379769524, "learning_rate": 7.218934911242604e-06, "loss": 0.5518, "step": 122 }, { "epoch": 0.09, "grad_norm": 10.265067489943311, "learning_rate": 7.278106508875741e-06, "loss": 0.5259, "step": 123 }, { "epoch": 0.09, "grad_norm": 13.5951034364997, "learning_rate": 7.337278106508876e-06, "loss": 0.5137, "step": 124 }, { "epoch": 0.09, "grad_norm": 22.46844506700403, "learning_rate": 7.396449704142013e-06, "loss": 0.4946, "step": 125 }, { "epoch": 0.09, "grad_norm": 16.353931220439996, "learning_rate": 7.455621301775149e-06, "loss": 0.5303, "step": 126 }, { "epoch": 0.09, "grad_norm": 18.291482953488973, "learning_rate": 7.5147928994082845e-06, "loss": 0.5142, "step": 127 }, { "epoch": 0.09, "grad_norm": 10.676486048527362, "learning_rate": 7.573964497041421e-06, "loss": 0.519, "step": 128 }, { "epoch": 0.09, "grad_norm": 18.100948739888956, "learning_rate": 7.633136094674556e-06, "loss": 0.5303, "step": 129 }, { "epoch": 0.09, "grad_norm": 21.616985829007, "learning_rate": 7.692307692307694e-06, "loss": 0.4478, "step": 130 }, { "epoch": 0.09, "grad_norm": 17.999044255166673, "learning_rate": 7.751479289940829e-06, "loss": 0.6455, "step": 131 }, { "epoch": 0.09, "grad_norm": 50.80837956748365, "learning_rate": 7.810650887573965e-06, "loss": 0.6377, "step": 132 }, { "epoch": 0.09, "grad_norm": 14.987804243664623, "learning_rate": 7.869822485207102e-06, "loss": 0.6377, "step": 133 }, { "epoch": 0.1, "grad_norm": 14.857441145744092, "learning_rate": 7.928994082840237e-06, "loss": 0.6055, "step": 134 }, { "epoch": 0.1, "grad_norm": 31.853850755139643, "learning_rate": 7.988165680473373e-06, "loss": 0.5396, "step": 135 }, { "epoch": 0.1, "grad_norm": 26.922060020459558, "learning_rate": 8.04733727810651e-06, "loss": 0.6792, "step": 136 }, { "epoch": 0.1, "grad_norm": 8.884182601970089, "learning_rate": 8.106508875739646e-06, "loss": 0.5957, "step": 137 }, { "epoch": 0.1, "grad_norm": 20.414653577595885, "learning_rate": 8.165680473372781e-06, "loss": 0.5635, "step": 138 }, { "epoch": 0.1, "grad_norm": 17.83898607971713, "learning_rate": 8.224852071005918e-06, "loss": 0.6191, "step": 139 }, { "epoch": 0.1, "grad_norm": 15.238831882725698, "learning_rate": 8.284023668639054e-06, "loss": 0.5503, "step": 140 }, { "epoch": 0.1, "grad_norm": 13.273476773493702, "learning_rate": 8.343195266272191e-06, "loss": 0.5918, "step": 141 }, { "epoch": 0.1, "grad_norm": 19.25326134871138, "learning_rate": 8.402366863905327e-06, "loss": 0.6064, "step": 142 }, { "epoch": 0.1, "grad_norm": 26.52884271993335, "learning_rate": 8.461538461538462e-06, "loss": 0.6025, "step": 143 }, { "epoch": 0.1, "grad_norm": 9.41160283896373, "learning_rate": 8.5207100591716e-06, "loss": 0.4663, "step": 144 }, { "epoch": 0.1, "grad_norm": 35.434570167809206, "learning_rate": 8.579881656804735e-06, "loss": 0.6113, "step": 145 }, { "epoch": 0.1, "grad_norm": 11.227787194473562, "learning_rate": 8.63905325443787e-06, "loss": 0.5303, "step": 146 }, { "epoch": 0.1, "grad_norm": 19.767727771237986, "learning_rate": 8.698224852071006e-06, "loss": 0.6484, "step": 147 }, { "epoch": 0.11, "grad_norm": 8.707533062635017, "learning_rate": 8.757396449704143e-06, "loss": 0.5186, "step": 148 }, { "epoch": 0.11, "grad_norm": 13.412761457820203, "learning_rate": 8.816568047337279e-06, "loss": 0.5659, "step": 149 }, { "epoch": 0.11, "grad_norm": 8.467634594644565, "learning_rate": 8.875739644970414e-06, "loss": 0.5151, "step": 150 }, { "epoch": 0.11, "grad_norm": 13.925519538332352, "learning_rate": 8.934911242603551e-06, "loss": 0.5879, "step": 151 }, { "epoch": 0.11, "grad_norm": 12.71700779047759, "learning_rate": 8.994082840236687e-06, "loss": 0.5835, "step": 152 }, { "epoch": 0.11, "grad_norm": 17.84943528345089, "learning_rate": 9.053254437869822e-06, "loss": 0.5264, "step": 153 }, { "epoch": 0.11, "grad_norm": 17.688901386150174, "learning_rate": 9.11242603550296e-06, "loss": 0.6367, "step": 154 }, { "epoch": 0.11, "grad_norm": 13.867220435355714, "learning_rate": 9.171597633136095e-06, "loss": 0.6777, "step": 155 }, { "epoch": 0.11, "grad_norm": 26.02685451565874, "learning_rate": 9.230769230769232e-06, "loss": 0.5967, "step": 156 }, { "epoch": 0.11, "grad_norm": 15.482134608562367, "learning_rate": 9.289940828402368e-06, "loss": 0.5615, "step": 157 }, { "epoch": 0.11, "grad_norm": 10.88805449508632, "learning_rate": 9.349112426035503e-06, "loss": 0.5684, "step": 158 }, { "epoch": 0.11, "grad_norm": 18.661463282963666, "learning_rate": 9.40828402366864e-06, "loss": 0.6445, "step": 159 }, { "epoch": 0.11, "grad_norm": 13.35562044235245, "learning_rate": 9.467455621301776e-06, "loss": 0.5557, "step": 160 }, { "epoch": 0.11, "grad_norm": 19.980779661002774, "learning_rate": 9.526627218934912e-06, "loss": 0.5474, "step": 161 }, { "epoch": 0.12, "grad_norm": 12.268903766575969, "learning_rate": 9.585798816568049e-06, "loss": 0.5225, "step": 162 }, { "epoch": 0.12, "grad_norm": 21.12465799994179, "learning_rate": 9.644970414201184e-06, "loss": 0.5547, "step": 163 }, { "epoch": 0.12, "grad_norm": 7.6737374134887215, "learning_rate": 9.70414201183432e-06, "loss": 0.5859, "step": 164 }, { "epoch": 0.12, "grad_norm": 18.06983956812275, "learning_rate": 9.763313609467457e-06, "loss": 0.5649, "step": 165 }, { "epoch": 0.12, "grad_norm": 12.848185642792462, "learning_rate": 9.822485207100593e-06, "loss": 0.5591, "step": 166 }, { "epoch": 0.12, "grad_norm": 21.283247757964315, "learning_rate": 9.88165680473373e-06, "loss": 0.5698, "step": 167 }, { "epoch": 0.12, "grad_norm": 11.36449736428342, "learning_rate": 9.940828402366864e-06, "loss": 0.5366, "step": 168 }, { "epoch": 0.12, "grad_norm": 12.504342411667235, "learning_rate": 1e-05, "loss": 0.5571, "step": 169 }, { "epoch": 0.12, "grad_norm": 28.526999718293393, "learning_rate": 9.999999164703534e-06, "loss": 0.6133, "step": 170 }, { "epoch": 0.12, "grad_norm": 8.676287862832698, "learning_rate": 9.999996658814406e-06, "loss": 0.4307, "step": 171 }, { "epoch": 0.12, "grad_norm": 12.27838137396804, "learning_rate": 9.999992482333461e-06, "loss": 0.4771, "step": 172 }, { "epoch": 0.12, "grad_norm": 17.098596330705657, "learning_rate": 9.99998663526209e-06, "loss": 0.4785, "step": 173 }, { "epoch": 0.12, "grad_norm": 10.364350114790247, "learning_rate": 9.99997911760225e-06, "loss": 0.5928, "step": 174 }, { "epoch": 0.12, "grad_norm": 32.439748171009725, "learning_rate": 9.99996992935645e-06, "loss": 0.6206, "step": 175 }, { "epoch": 0.13, "grad_norm": 10.869970364498535, "learning_rate": 9.99995907052776e-06, "loss": 0.5176, "step": 176 }, { "epoch": 0.13, "grad_norm": 28.853561378084798, "learning_rate": 9.99994654111981e-06, "loss": 0.5718, "step": 177 }, { "epoch": 0.13, "grad_norm": 30.030018985404727, "learning_rate": 9.999932341136785e-06, "loss": 0.6338, "step": 178 }, { "epoch": 0.13, "grad_norm": 10.898208289891516, "learning_rate": 9.999916470583429e-06, "loss": 0.5049, "step": 179 }, { "epoch": 0.13, "grad_norm": 29.836549153691703, "learning_rate": 9.999898929465047e-06, "loss": 0.5605, "step": 180 }, { "epoch": 0.13, "grad_norm": 31.456036267942125, "learning_rate": 9.999879717787495e-06, "loss": 0.6118, "step": 181 }, { "epoch": 0.13, "grad_norm": 19.965440018657493, "learning_rate": 9.999858835557197e-06, "loss": 0.5356, "step": 182 }, { "epoch": 0.13, "grad_norm": 10.241499025434392, "learning_rate": 9.999836282781128e-06, "loss": 0.5215, "step": 183 }, { "epoch": 0.13, "grad_norm": 10.343720644680257, "learning_rate": 9.999812059466825e-06, "loss": 0.6118, "step": 184 }, { "epoch": 0.13, "grad_norm": 28.654780626684733, "learning_rate": 9.999786165622379e-06, "loss": 0.6016, "step": 185 }, { "epoch": 0.13, "grad_norm": 24.17157492958299, "learning_rate": 9.999758601256441e-06, "loss": 0.624, "step": 186 }, { "epoch": 0.13, "grad_norm": 17.176557440775614, "learning_rate": 9.999729366378224e-06, "loss": 0.5527, "step": 187 }, { "epoch": 0.13, "grad_norm": 11.253723481448025, "learning_rate": 9.999698460997493e-06, "loss": 0.5601, "step": 188 }, { "epoch": 0.13, "grad_norm": 13.43944022216274, "learning_rate": 9.999665885124577e-06, "loss": 0.5273, "step": 189 }, { "epoch": 0.14, "grad_norm": 22.913171126499616, "learning_rate": 9.99963163877036e-06, "loss": 0.6587, "step": 190 }, { "epoch": 0.14, "grad_norm": 22.224898845103755, "learning_rate": 9.99959572194628e-06, "loss": 0.666, "step": 191 }, { "epoch": 0.14, "grad_norm": 8.83510331844381, "learning_rate": 9.999558134664342e-06, "loss": 0.6099, "step": 192 }, { "epoch": 0.14, "grad_norm": 8.477618783981828, "learning_rate": 9.999518876937102e-06, "loss": 0.4771, "step": 193 }, { "epoch": 0.14, "grad_norm": 23.778400243235488, "learning_rate": 9.999477948777678e-06, "loss": 0.5562, "step": 194 }, { "epoch": 0.14, "grad_norm": 7.459065251806433, "learning_rate": 9.999435350199745e-06, "loss": 0.5342, "step": 195 }, { "epoch": 0.14, "grad_norm": 21.538105722452126, "learning_rate": 9.999391081217536e-06, "loss": 0.627, "step": 196 }, { "epoch": 0.14, "grad_norm": 18.7688376682692, "learning_rate": 9.999345141845842e-06, "loss": 0.5293, "step": 197 }, { "epoch": 0.14, "grad_norm": 10.99862969600929, "learning_rate": 9.99929753210001e-06, "loss": 0.5005, "step": 198 }, { "epoch": 0.14, "grad_norm": 13.055362447650138, "learning_rate": 9.999248251995951e-06, "loss": 0.5659, "step": 199 }, { "epoch": 0.14, "grad_norm": 10.011136685984399, "learning_rate": 9.999197301550127e-06, "loss": 0.5586, "step": 200 }, { "epoch": 0.14, "grad_norm": 13.565468854396421, "learning_rate": 9.999144680779564e-06, "loss": 0.5127, "step": 201 }, { "epoch": 0.14, "grad_norm": 10.322706038393164, "learning_rate": 9.999090389701844e-06, "loss": 0.5396, "step": 202 }, { "epoch": 0.14, "grad_norm": 11.689469240544318, "learning_rate": 9.999034428335103e-06, "loss": 0.5366, "step": 203 }, { "epoch": 0.15, "grad_norm": 13.174468406049195, "learning_rate": 9.998976796698043e-06, "loss": 0.6064, "step": 204 }, { "epoch": 0.15, "grad_norm": 13.26408461209251, "learning_rate": 9.998917494809917e-06, "loss": 0.5181, "step": 205 }, { "epoch": 0.15, "grad_norm": 21.482605171650068, "learning_rate": 9.998856522690538e-06, "loss": 0.6626, "step": 206 }, { "epoch": 0.15, "grad_norm": 10.98758204165801, "learning_rate": 9.998793880360283e-06, "loss": 0.48, "step": 207 }, { "epoch": 0.15, "grad_norm": 22.66544425395466, "learning_rate": 9.998729567840077e-06, "loss": 0.6836, "step": 208 }, { "epoch": 0.15, "grad_norm": 18.309350432593693, "learning_rate": 9.998663585151409e-06, "loss": 0.5674, "step": 209 }, { "epoch": 0.15, "grad_norm": 27.286620280815722, "learning_rate": 9.998595932316327e-06, "loss": 0.6514, "step": 210 }, { "epoch": 0.15, "grad_norm": 14.739890462945858, "learning_rate": 9.998526609357432e-06, "loss": 0.5947, "step": 211 }, { "epoch": 0.15, "grad_norm": 6.551376294742977, "learning_rate": 9.998455616297889e-06, "loss": 0.5879, "step": 212 }, { "epoch": 0.15, "grad_norm": 21.631161156381427, "learning_rate": 9.998382953161417e-06, "loss": 0.6865, "step": 213 }, { "epoch": 0.15, "grad_norm": 18.33142486833869, "learning_rate": 9.998308619972292e-06, "loss": 0.6357, "step": 214 }, { "epoch": 0.15, "grad_norm": 10.60392425953813, "learning_rate": 9.998232616755354e-06, "loss": 0.5732, "step": 215 }, { "epoch": 0.15, "grad_norm": 5.976381604704594, "learning_rate": 9.998154943535996e-06, "loss": 0.5645, "step": 216 }, { "epoch": 0.15, "grad_norm": 6.10651009987485, "learning_rate": 9.998075600340166e-06, "loss": 0.582, "step": 217 }, { "epoch": 0.16, "grad_norm": 6.196857537217384, "learning_rate": 9.997994587194381e-06, "loss": 0.564, "step": 218 }, { "epoch": 0.16, "grad_norm": 21.280398372848552, "learning_rate": 9.997911904125704e-06, "loss": 0.6353, "step": 219 }, { "epoch": 0.16, "grad_norm": 9.816198078589931, "learning_rate": 9.997827551161762e-06, "loss": 0.5684, "step": 220 }, { "epoch": 0.16, "grad_norm": 12.183296652188321, "learning_rate": 9.997741528330739e-06, "loss": 0.5449, "step": 221 }, { "epoch": 0.16, "grad_norm": 11.185998935163878, "learning_rate": 9.997653835661376e-06, "loss": 0.5967, "step": 222 }, { "epoch": 0.16, "grad_norm": 12.288601461686234, "learning_rate": 9.997564473182976e-06, "loss": 0.5698, "step": 223 }, { "epoch": 0.16, "grad_norm": 9.321856011834956, "learning_rate": 9.997473440925394e-06, "loss": 0.5771, "step": 224 }, { "epoch": 0.16, "grad_norm": 6.311288966424027, "learning_rate": 9.997380738919045e-06, "loss": 0.5259, "step": 225 }, { "epoch": 0.16, "grad_norm": 21.481033073870574, "learning_rate": 9.997286367194903e-06, "loss": 0.6689, "step": 226 }, { "epoch": 0.16, "grad_norm": 14.550467772007261, "learning_rate": 9.9971903257845e-06, "loss": 0.5928, "step": 227 }, { "epoch": 0.16, "grad_norm": 13.702124607152298, "learning_rate": 9.997092614719926e-06, "loss": 0.5181, "step": 228 }, { "epoch": 0.16, "grad_norm": 17.241256236619744, "learning_rate": 9.996993234033826e-06, "loss": 0.5918, "step": 229 }, { "epoch": 0.16, "grad_norm": 10.044871821417644, "learning_rate": 9.996892183759407e-06, "loss": 0.5811, "step": 230 }, { "epoch": 0.16, "grad_norm": 16.21185115842485, "learning_rate": 9.99678946393043e-06, "loss": 0.5791, "step": 231 }, { "epoch": 0.17, "grad_norm": 17.301100574961, "learning_rate": 9.996685074581216e-06, "loss": 0.5332, "step": 232 }, { "epoch": 0.17, "grad_norm": 11.335320291463777, "learning_rate": 9.996579015746645e-06, "loss": 0.5742, "step": 233 }, { "epoch": 0.17, "grad_norm": 6.8646367315459935, "learning_rate": 9.996471287462151e-06, "loss": 0.5376, "step": 234 }, { "epoch": 0.17, "grad_norm": 5.966657193641781, "learning_rate": 9.99636188976373e-06, "loss": 0.5762, "step": 235 }, { "epoch": 0.17, "grad_norm": 17.07991289086175, "learning_rate": 9.996250822687932e-06, "loss": 0.5405, "step": 236 }, { "epoch": 0.17, "grad_norm": 13.807407534670729, "learning_rate": 9.996138086271869e-06, "loss": 0.585, "step": 237 }, { "epoch": 0.17, "grad_norm": 5.991884688831597, "learning_rate": 9.996023680553204e-06, "loss": 0.5181, "step": 238 }, { "epoch": 0.17, "grad_norm": 7.4817825314537645, "learning_rate": 9.995907605570167e-06, "loss": 0.5957, "step": 239 }, { "epoch": 0.17, "grad_norm": 24.175799986215807, "learning_rate": 9.995789861361538e-06, "loss": 0.6895, "step": 240 }, { "epoch": 0.17, "grad_norm": 7.737883477841022, "learning_rate": 9.995670447966658e-06, "loss": 0.4727, "step": 241 }, { "epoch": 0.17, "grad_norm": 13.747945457840865, "learning_rate": 9.995549365425426e-06, "loss": 0.5635, "step": 242 }, { "epoch": 0.17, "grad_norm": 15.188385247219632, "learning_rate": 9.995426613778297e-06, "loss": 0.6445, "step": 243 }, { "epoch": 0.17, "grad_norm": 5.735363655951899, "learning_rate": 9.995302193066286e-06, "loss": 0.5112, "step": 244 }, { "epoch": 0.17, "grad_norm": 9.187157770296569, "learning_rate": 9.995176103330962e-06, "loss": 0.5776, "step": 245 }, { "epoch": 0.18, "grad_norm": 7.366215466061506, "learning_rate": 9.995048344614455e-06, "loss": 0.6016, "step": 246 }, { "epoch": 0.18, "grad_norm": 11.101901236927242, "learning_rate": 9.994918916959453e-06, "loss": 0.5952, "step": 247 }, { "epoch": 0.18, "grad_norm": 5.494534788687338, "learning_rate": 9.994787820409198e-06, "loss": 0.5625, "step": 248 }, { "epoch": 0.18, "grad_norm": 7.350350461703404, "learning_rate": 9.994655055007491e-06, "loss": 0.5278, "step": 249 }, { "epoch": 0.18, "grad_norm": 5.815623310077835, "learning_rate": 9.994520620798696e-06, "loss": 0.5273, "step": 250 }, { "epoch": 0.18, "grad_norm": 24.04445898352151, "learning_rate": 9.994384517827726e-06, "loss": 0.6157, "step": 251 }, { "epoch": 0.18, "grad_norm": 8.217464632831893, "learning_rate": 9.994246746140057e-06, "loss": 0.5576, "step": 252 }, { "epoch": 0.18, "grad_norm": 17.624589818254435, "learning_rate": 9.99410730578172e-06, "loss": 0.5503, "step": 253 }, { "epoch": 0.18, "grad_norm": 8.459911000864924, "learning_rate": 9.993966196799304e-06, "loss": 0.5166, "step": 254 }, { "epoch": 0.18, "grad_norm": 16.972917737075516, "learning_rate": 9.993823419239959e-06, "loss": 0.6016, "step": 255 }, { "epoch": 0.18, "grad_norm": 11.26491066381988, "learning_rate": 9.993678973151388e-06, "loss": 0.4448, "step": 256 }, { "epoch": 0.18, "grad_norm": 13.959824992641648, "learning_rate": 9.993532858581853e-06, "loss": 0.6025, "step": 257 }, { "epoch": 0.18, "grad_norm": 8.17714303025146, "learning_rate": 9.993385075580173e-06, "loss": 0.605, "step": 258 }, { "epoch": 0.18, "grad_norm": 15.583037935905157, "learning_rate": 9.993235624195728e-06, "loss": 0.5659, "step": 259 }, { "epoch": 0.19, "grad_norm": 19.251932963533232, "learning_rate": 9.993084504478448e-06, "loss": 0.5811, "step": 260 }, { "epoch": 0.19, "grad_norm": 16.8640641134105, "learning_rate": 9.99293171647883e-06, "loss": 0.4863, "step": 261 }, { "epoch": 0.19, "grad_norm": 7.539968969637762, "learning_rate": 9.992777260247916e-06, "loss": 0.5469, "step": 262 }, { "epoch": 0.19, "grad_norm": 21.333336259322376, "learning_rate": 9.99262113583732e-06, "loss": 0.5479, "step": 263 }, { "epoch": 0.19, "grad_norm": 31.227206000118187, "learning_rate": 9.992463343299203e-06, "loss": 0.6367, "step": 264 }, { "epoch": 0.19, "grad_norm": 16.373876295931044, "learning_rate": 9.992303882686288e-06, "loss": 0.5479, "step": 265 }, { "epoch": 0.19, "grad_norm": 6.169578673800072, "learning_rate": 9.99214275405185e-06, "loss": 0.5327, "step": 266 }, { "epoch": 0.19, "grad_norm": 19.948032240297028, "learning_rate": 9.991979957449729e-06, "loss": 0.6313, "step": 267 }, { "epoch": 0.19, "grad_norm": 28.97323801726882, "learning_rate": 9.991815492934318e-06, "loss": 0.6411, "step": 268 }, { "epoch": 0.19, "grad_norm": 19.141849461327332, "learning_rate": 9.991649360560565e-06, "loss": 0.6318, "step": 269 }, { "epoch": 0.19, "grad_norm": 6.920257209617739, "learning_rate": 9.99148156038398e-06, "loss": 0.5425, "step": 270 }, { "epoch": 0.19, "grad_norm": 10.115581788358424, "learning_rate": 9.991312092460626e-06, "loss": 0.4868, "step": 271 }, { "epoch": 0.19, "grad_norm": 7.252432772054797, "learning_rate": 9.991140956847128e-06, "loss": 0.6235, "step": 272 }, { "epoch": 0.19, "grad_norm": 22.131525985269647, "learning_rate": 9.990968153600664e-06, "loss": 0.6006, "step": 273 }, { "epoch": 0.2, "grad_norm": 9.41445550986435, "learning_rate": 9.990793682778973e-06, "loss": 0.52, "step": 274 }, { "epoch": 0.2, "grad_norm": 17.317553621436222, "learning_rate": 9.990617544440346e-06, "loss": 0.5083, "step": 275 }, { "epoch": 0.2, "grad_norm": 6.54955596823522, "learning_rate": 9.990439738643635e-06, "loss": 0.5161, "step": 276 }, { "epoch": 0.2, "grad_norm": 5.964194414552132, "learning_rate": 9.99026026544825e-06, "loss": 0.5083, "step": 277 }, { "epoch": 0.2, "grad_norm": 16.720177088068684, "learning_rate": 9.990079124914156e-06, "loss": 0.522, "step": 278 }, { "epoch": 0.2, "grad_norm": 9.459906145881508, "learning_rate": 9.989896317101873e-06, "loss": 0.4951, "step": 279 }, { "epoch": 0.2, "grad_norm": 16.765981248006945, "learning_rate": 9.989711842072482e-06, "loss": 0.6133, "step": 280 }, { "epoch": 0.2, "grad_norm": 11.835633957228747, "learning_rate": 9.989525699887619e-06, "loss": 0.5205, "step": 281 }, { "epoch": 0.2, "grad_norm": 11.649027140963295, "learning_rate": 9.989337890609478e-06, "loss": 0.5625, "step": 282 }, { "epoch": 0.2, "grad_norm": 10.40438475972807, "learning_rate": 9.98914841430081e-06, "loss": 0.4858, "step": 283 }, { "epoch": 0.2, "grad_norm": 19.573834463528204, "learning_rate": 9.988957271024922e-06, "loss": 0.542, "step": 284 }, { "epoch": 0.2, "grad_norm": 8.091426545143989, "learning_rate": 9.988764460845676e-06, "loss": 0.5542, "step": 285 }, { "epoch": 0.2, "grad_norm": 6.36487501871403, "learning_rate": 9.9885699838275e-06, "loss": 0.4185, "step": 286 }, { "epoch": 0.2, "grad_norm": 14.691682917775056, "learning_rate": 9.988373840035366e-06, "loss": 0.541, "step": 287 }, { "epoch": 0.21, "grad_norm": 16.560245915679797, "learning_rate": 9.988176029534814e-06, "loss": 0.543, "step": 288 }, { "epoch": 0.21, "grad_norm": 9.393486733090167, "learning_rate": 9.987976552391933e-06, "loss": 0.4878, "step": 289 }, { "epoch": 0.21, "grad_norm": 10.696902281023876, "learning_rate": 9.987775408673373e-06, "loss": 0.603, "step": 290 }, { "epoch": 0.21, "grad_norm": 7.542550692108352, "learning_rate": 9.987572598446337e-06, "loss": 0.5083, "step": 291 }, { "epoch": 0.21, "grad_norm": 12.93517301011984, "learning_rate": 9.987368121778594e-06, "loss": 0.4785, "step": 292 }, { "epoch": 0.21, "grad_norm": 9.117485021835481, "learning_rate": 9.98716197873846e-06, "loss": 0.4951, "step": 293 }, { "epoch": 0.21, "grad_norm": 29.587346547838976, "learning_rate": 9.98695416939481e-06, "loss": 0.7002, "step": 294 }, { "epoch": 0.21, "grad_norm": 14.306933072284718, "learning_rate": 9.986744693817077e-06, "loss": 0.561, "step": 295 }, { "epoch": 0.21, "grad_norm": 10.606210464051424, "learning_rate": 9.986533552075252e-06, "loss": 0.5801, "step": 296 }, { "epoch": 0.21, "grad_norm": 10.005787274091094, "learning_rate": 9.986320744239883e-06, "loss": 0.5742, "step": 297 }, { "epoch": 0.21, "grad_norm": 14.512421082684124, "learning_rate": 9.98610627038207e-06, "loss": 0.5532, "step": 298 }, { "epoch": 0.21, "grad_norm": 8.006107344505082, "learning_rate": 9.985890130573474e-06, "loss": 0.5298, "step": 299 }, { "epoch": 0.21, "grad_norm": 7.111526173571763, "learning_rate": 9.98567232488631e-06, "loss": 0.5459, "step": 300 }, { "epoch": 0.21, "grad_norm": 11.911631119108614, "learning_rate": 9.985452853393353e-06, "loss": 0.5425, "step": 301 }, { "epoch": 0.22, "grad_norm": 8.375016686561212, "learning_rate": 9.985231716167933e-06, "loss": 0.5298, "step": 302 }, { "epoch": 0.22, "grad_norm": 8.179532269489528, "learning_rate": 9.985008913283933e-06, "loss": 0.5459, "step": 303 }, { "epoch": 0.22, "grad_norm": 9.104521872821598, "learning_rate": 9.984784444815799e-06, "loss": 0.6201, "step": 304 }, { "epoch": 0.22, "grad_norm": 12.983836084666084, "learning_rate": 9.984558310838528e-06, "loss": 0.5645, "step": 305 }, { "epoch": 0.22, "grad_norm": 9.16177018185428, "learning_rate": 9.984330511427676e-06, "loss": 0.5693, "step": 306 }, { "epoch": 0.22, "grad_norm": 11.784575497017391, "learning_rate": 9.984101046659353e-06, "loss": 0.4595, "step": 307 }, { "epoch": 0.22, "grad_norm": 8.33312354424948, "learning_rate": 9.983869916610232e-06, "loss": 0.4668, "step": 308 }, { "epoch": 0.22, "grad_norm": 11.786060507815215, "learning_rate": 9.983637121357534e-06, "loss": 0.5557, "step": 309 }, { "epoch": 0.22, "grad_norm": 14.611215764264122, "learning_rate": 9.983402660979042e-06, "loss": 0.6064, "step": 310 }, { "epoch": 0.22, "grad_norm": 10.263169097847555, "learning_rate": 9.983166535553093e-06, "loss": 0.5977, "step": 311 }, { "epoch": 0.22, "grad_norm": 11.022215974074236, "learning_rate": 9.98292874515858e-06, "loss": 0.5137, "step": 312 }, { "epoch": 0.22, "grad_norm": 10.936834299449774, "learning_rate": 9.982689289874956e-06, "loss": 0.5898, "step": 313 }, { "epoch": 0.22, "grad_norm": 6.511642609064424, "learning_rate": 9.982448169782226e-06, "loss": 0.5967, "step": 314 }, { "epoch": 0.22, "grad_norm": 11.598567525889859, "learning_rate": 9.98220538496095e-06, "loss": 0.5234, "step": 315 }, { "epoch": 0.23, "grad_norm": 7.979819387251359, "learning_rate": 9.98196093549225e-06, "loss": 0.5054, "step": 316 }, { "epoch": 0.23, "grad_norm": 8.574667202148518, "learning_rate": 9.9817148214578e-06, "loss": 0.5801, "step": 317 }, { "epoch": 0.23, "grad_norm": 9.56598373183688, "learning_rate": 9.981467042939833e-06, "loss": 0.5732, "step": 318 }, { "epoch": 0.23, "grad_norm": 11.776851149154552, "learning_rate": 9.981217600021133e-06, "loss": 0.5469, "step": 319 }, { "epoch": 0.23, "grad_norm": 8.737203552004415, "learning_rate": 9.980966492785048e-06, "loss": 0.5742, "step": 320 }, { "epoch": 0.23, "grad_norm": 8.915557318934983, "learning_rate": 9.980713721315473e-06, "loss": 0.4888, "step": 321 }, { "epoch": 0.23, "grad_norm": 9.65537704300492, "learning_rate": 9.98045928569687e-06, "loss": 0.5425, "step": 322 }, { "epoch": 0.23, "grad_norm": 7.172596910986449, "learning_rate": 9.98020318601424e-06, "loss": 0.4824, "step": 323 }, { "epoch": 0.23, "grad_norm": 10.440835376005527, "learning_rate": 9.97994542235316e-06, "loss": 0.5522, "step": 324 }, { "epoch": 0.23, "grad_norm": 19.118269130078797, "learning_rate": 9.979685994799753e-06, "loss": 0.6069, "step": 325 }, { "epoch": 0.23, "grad_norm": 23.200575202995402, "learning_rate": 9.979424903440695e-06, "loss": 0.5405, "step": 326 }, { "epoch": 0.23, "grad_norm": 8.163978783652977, "learning_rate": 9.979162148363222e-06, "loss": 0.5332, "step": 327 }, { "epoch": 0.23, "grad_norm": 10.535886529238546, "learning_rate": 9.978897729655127e-06, "loss": 0.5405, "step": 328 }, { "epoch": 0.23, "grad_norm": 38.988384289584985, "learning_rate": 9.978631647404755e-06, "loss": 0.6826, "step": 329 }, { "epoch": 0.24, "grad_norm": 18.249997402471777, "learning_rate": 9.97836390170101e-06, "loss": 0.5259, "step": 330 }, { "epoch": 0.24, "grad_norm": 12.01736287591341, "learning_rate": 9.978094492633353e-06, "loss": 0.5601, "step": 331 }, { "epoch": 0.24, "grad_norm": 12.939324146161377, "learning_rate": 9.977823420291796e-06, "loss": 0.5688, "step": 332 }, { "epoch": 0.24, "grad_norm": 14.361987880781351, "learning_rate": 9.97755068476691e-06, "loss": 0.5605, "step": 333 }, { "epoch": 0.24, "grad_norm": 22.600799019220247, "learning_rate": 9.977276286149821e-06, "loss": 0.6226, "step": 334 }, { "epoch": 0.24, "grad_norm": 15.773235886574865, "learning_rate": 9.977000224532211e-06, "loss": 0.5332, "step": 335 }, { "epoch": 0.24, "grad_norm": 10.06416768362281, "learning_rate": 9.976722500006318e-06, "loss": 0.6416, "step": 336 }, { "epoch": 0.24, "grad_norm": 9.827713447093007, "learning_rate": 9.976443112664932e-06, "loss": 0.5957, "step": 337 }, { "epoch": 0.24, "grad_norm": 10.629610673442508, "learning_rate": 9.976162062601407e-06, "loss": 0.5527, "step": 338 }, { "epoch": 0.24, "grad_norm": 14.528967178159732, "learning_rate": 9.97587934990964e-06, "loss": 0.5762, "step": 339 }, { "epoch": 0.24, "grad_norm": 12.608082935381697, "learning_rate": 9.975594974684096e-06, "loss": 0.5659, "step": 340 }, { "epoch": 0.24, "grad_norm": 9.717599293321035, "learning_rate": 9.975308937019787e-06, "loss": 0.5278, "step": 341 }, { "epoch": 0.24, "grad_norm": 7.812143947109021, "learning_rate": 9.975021237012286e-06, "loss": 0.5552, "step": 342 }, { "epoch": 0.24, "grad_norm": 11.925371312954011, "learning_rate": 9.974731874757717e-06, "loss": 0.5596, "step": 343 }, { "epoch": 0.25, "grad_norm": 8.339309116508288, "learning_rate": 9.974440850352762e-06, "loss": 0.582, "step": 344 }, { "epoch": 0.25, "grad_norm": 16.614761768586927, "learning_rate": 9.974148163894658e-06, "loss": 0.5303, "step": 345 }, { "epoch": 0.25, "grad_norm": 8.71369707758011, "learning_rate": 9.973853815481196e-06, "loss": 0.5601, "step": 346 }, { "epoch": 0.25, "grad_norm": 12.093172429150728, "learning_rate": 9.973557805210724e-06, "loss": 0.5283, "step": 347 }, { "epoch": 0.25, "grad_norm": 7.19951189726805, "learning_rate": 9.973260133182145e-06, "loss": 0.5615, "step": 348 }, { "epoch": 0.25, "grad_norm": 11.218505188713452, "learning_rate": 9.972960799494915e-06, "loss": 0.5591, "step": 349 }, { "epoch": 0.25, "grad_norm": 9.343689711468452, "learning_rate": 9.972659804249047e-06, "loss": 0.4824, "step": 350 }, { "epoch": 0.25, "grad_norm": 16.095683927102183, "learning_rate": 9.972357147545113e-06, "loss": 0.5591, "step": 351 }, { "epoch": 0.25, "grad_norm": 18.28619957306297, "learning_rate": 9.972052829484231e-06, "loss": 0.5586, "step": 352 }, { "epoch": 0.25, "grad_norm": 13.927219563114457, "learning_rate": 9.971746850168084e-06, "loss": 0.543, "step": 353 }, { "epoch": 0.25, "grad_norm": 19.070239504065867, "learning_rate": 9.971439209698902e-06, "loss": 0.6523, "step": 354 }, { "epoch": 0.25, "grad_norm": 10.674351013034489, "learning_rate": 9.971129908179474e-06, "loss": 0.541, "step": 355 }, { "epoch": 0.25, "grad_norm": 12.105592852551204, "learning_rate": 9.970818945713145e-06, "loss": 0.5659, "step": 356 }, { "epoch": 0.25, "grad_norm": 7.0981327588150025, "learning_rate": 9.970506322403813e-06, "loss": 0.4458, "step": 357 }, { "epoch": 0.26, "grad_norm": 31.65511002299402, "learning_rate": 9.970192038355928e-06, "loss": 0.6401, "step": 358 }, { "epoch": 0.26, "grad_norm": 7.640887314827702, "learning_rate": 9.969876093674502e-06, "loss": 0.5005, "step": 359 }, { "epoch": 0.26, "grad_norm": 9.615326153238845, "learning_rate": 9.969558488465097e-06, "loss": 0.4995, "step": 360 }, { "epoch": 0.26, "grad_norm": 25.0121028825566, "learning_rate": 9.969239222833829e-06, "loss": 0.5254, "step": 361 }, { "epoch": 0.26, "grad_norm": 8.942638756744063, "learning_rate": 9.968918296887374e-06, "loss": 0.48, "step": 362 }, { "epoch": 0.26, "grad_norm": 7.8607942912242965, "learning_rate": 9.968595710732955e-06, "loss": 0.5239, "step": 363 }, { "epoch": 0.26, "grad_norm": 9.584219880853102, "learning_rate": 9.968271464478357e-06, "loss": 0.6064, "step": 364 }, { "epoch": 0.26, "grad_norm": 18.74816464727213, "learning_rate": 9.967945558231917e-06, "loss": 0.624, "step": 365 }, { "epoch": 0.26, "grad_norm": 10.18424589657994, "learning_rate": 9.967617992102526e-06, "loss": 0.5688, "step": 366 }, { "epoch": 0.26, "grad_norm": 14.963994900078255, "learning_rate": 9.967288766199628e-06, "loss": 0.5151, "step": 367 }, { "epoch": 0.26, "grad_norm": 15.107368415735328, "learning_rate": 9.966957880633225e-06, "loss": 0.5117, "step": 368 }, { "epoch": 0.26, "grad_norm": 9.726462628184319, "learning_rate": 9.966625335513873e-06, "loss": 0.5464, "step": 369 }, { "epoch": 0.26, "grad_norm": 25.415412889353508, "learning_rate": 9.96629113095268e-06, "loss": 0.6455, "step": 370 }, { "epoch": 0.26, "grad_norm": 13.29743204783179, "learning_rate": 9.965955267061309e-06, "loss": 0.564, "step": 371 }, { "epoch": 0.27, "grad_norm": 7.177772883312987, "learning_rate": 9.965617743951982e-06, "loss": 0.4883, "step": 372 }, { "epoch": 0.27, "grad_norm": 6.407931454753083, "learning_rate": 9.965278561737466e-06, "loss": 0.4746, "step": 373 }, { "epoch": 0.27, "grad_norm": 9.137420154130437, "learning_rate": 9.964937720531094e-06, "loss": 0.5532, "step": 374 }, { "epoch": 0.27, "grad_norm": 15.056581957504337, "learning_rate": 9.964595220446744e-06, "loss": 0.5771, "step": 375 }, { "epoch": 0.27, "grad_norm": 8.993928512302421, "learning_rate": 9.964251061598853e-06, "loss": 0.583, "step": 376 }, { "epoch": 0.27, "grad_norm": 8.30189244870148, "learning_rate": 9.96390524410241e-06, "loss": 0.5552, "step": 377 }, { "epoch": 0.27, "grad_norm": 16.305556797808645, "learning_rate": 9.96355776807296e-06, "loss": 0.5098, "step": 378 }, { "epoch": 0.27, "grad_norm": 9.335808377185364, "learning_rate": 9.9632086336266e-06, "loss": 0.5747, "step": 379 }, { "epoch": 0.27, "grad_norm": 8.797481794532704, "learning_rate": 9.962857840879983e-06, "loss": 0.5664, "step": 380 }, { "epoch": 0.27, "grad_norm": 10.172910648343924, "learning_rate": 9.962505389950317e-06, "loss": 0.6455, "step": 381 }, { "epoch": 0.27, "grad_norm": 6.971801104641452, "learning_rate": 9.962151280955359e-06, "loss": 0.5317, "step": 382 }, { "epoch": 0.27, "grad_norm": 14.019462783189976, "learning_rate": 9.961795514013424e-06, "loss": 0.6611, "step": 383 }, { "epoch": 0.27, "grad_norm": 13.015095631990192, "learning_rate": 9.961438089243384e-06, "loss": 0.54, "step": 384 }, { "epoch": 0.27, "grad_norm": 10.693443657141236, "learning_rate": 9.961079006764659e-06, "loss": 0.6846, "step": 385 }, { "epoch": 0.28, "grad_norm": 8.388009134623115, "learning_rate": 9.960718266697223e-06, "loss": 0.4805, "step": 386 }, { "epoch": 0.28, "grad_norm": 9.305245922345355, "learning_rate": 9.960355869161609e-06, "loss": 0.5625, "step": 387 }, { "epoch": 0.28, "grad_norm": 12.98682854904853, "learning_rate": 9.959991814278898e-06, "loss": 0.5361, "step": 388 }, { "epoch": 0.28, "grad_norm": 30.855897554320187, "learning_rate": 9.95962610217073e-06, "loss": 0.6348, "step": 389 }, { "epoch": 0.28, "grad_norm": 18.186818656200053, "learning_rate": 9.959258732959296e-06, "loss": 0.6367, "step": 390 }, { "epoch": 0.28, "grad_norm": 10.1692607171307, "learning_rate": 9.958889706767341e-06, "loss": 0.6035, "step": 391 }, { "epoch": 0.28, "grad_norm": 19.89752794089645, "learning_rate": 9.95851902371816e-06, "loss": 0.6279, "step": 392 }, { "epoch": 0.28, "grad_norm": 18.94232313274701, "learning_rate": 9.95814668393561e-06, "loss": 0.625, "step": 393 }, { "epoch": 0.28, "grad_norm": 11.263112378321651, "learning_rate": 9.957772687544094e-06, "loss": 0.6211, "step": 394 }, { "epoch": 0.28, "grad_norm": 18.694064854132726, "learning_rate": 9.95739703466857e-06, "loss": 0.6113, "step": 395 }, { "epoch": 0.28, "grad_norm": 14.054888000787919, "learning_rate": 9.957019725434554e-06, "loss": 0.6055, "step": 396 }, { "epoch": 0.28, "grad_norm": 6.852195125531498, "learning_rate": 9.956640759968111e-06, "loss": 0.4897, "step": 397 }, { "epoch": 0.28, "grad_norm": 10.66681348604124, "learning_rate": 9.956260138395857e-06, "loss": 0.5479, "step": 398 }, { "epoch": 0.28, "grad_norm": 20.557192888856104, "learning_rate": 9.955877860844969e-06, "loss": 0.6069, "step": 399 }, { "epoch": 0.29, "grad_norm": 8.942620031874748, "learning_rate": 9.955493927443171e-06, "loss": 0.4883, "step": 400 }, { "epoch": 0.29, "grad_norm": 16.005501795532552, "learning_rate": 9.955108338318743e-06, "loss": 0.6094, "step": 401 }, { "epoch": 0.29, "grad_norm": 9.342699104317559, "learning_rate": 9.954721093600517e-06, "loss": 0.541, "step": 402 }, { "epoch": 0.29, "grad_norm": 10.090322647328717, "learning_rate": 9.95433219341788e-06, "loss": 0.5225, "step": 403 }, { "epoch": 0.29, "grad_norm": 16.150636904880326, "learning_rate": 9.953941637900769e-06, "loss": 0.666, "step": 404 }, { "epoch": 0.29, "grad_norm": 15.523930157702077, "learning_rate": 9.953549427179676e-06, "loss": 0.5566, "step": 405 }, { "epoch": 0.29, "grad_norm": 6.643994089560804, "learning_rate": 9.953155561385646e-06, "loss": 0.5015, "step": 406 }, { "epoch": 0.29, "grad_norm": 9.03294877321269, "learning_rate": 9.952760040650278e-06, "loss": 0.562, "step": 407 }, { "epoch": 0.29, "grad_norm": 10.801467808368454, "learning_rate": 9.95236286510572e-06, "loss": 0.519, "step": 408 }, { "epoch": 0.29, "grad_norm": 9.80918459345022, "learning_rate": 9.95196403488468e-06, "loss": 0.5747, "step": 409 }, { "epoch": 0.29, "grad_norm": 9.774285807444773, "learning_rate": 9.951563550120412e-06, "loss": 0.5752, "step": 410 }, { "epoch": 0.29, "grad_norm": 17.463568068842072, "learning_rate": 9.951161410946725e-06, "loss": 0.5527, "step": 411 }, { "epoch": 0.29, "grad_norm": 16.73368659445093, "learning_rate": 9.950757617497983e-06, "loss": 0.4585, "step": 412 }, { "epoch": 0.29, "grad_norm": 14.942045154978281, "learning_rate": 9.950352169909101e-06, "loss": 0.4893, "step": 413 }, { "epoch": 0.3, "grad_norm": 9.393249457904501, "learning_rate": 9.949945068315544e-06, "loss": 0.5684, "step": 414 }, { "epoch": 0.3, "grad_norm": 13.911557122462217, "learning_rate": 9.949536312853334e-06, "loss": 0.5786, "step": 415 }, { "epoch": 0.3, "grad_norm": 13.653566902215857, "learning_rate": 9.949125903659042e-06, "loss": 0.6289, "step": 416 }, { "epoch": 0.3, "grad_norm": 8.471009617251026, "learning_rate": 9.948713840869797e-06, "loss": 0.5283, "step": 417 }, { "epoch": 0.3, "grad_norm": 8.083527199424468, "learning_rate": 9.948300124623274e-06, "loss": 0.4492, "step": 418 }, { "epoch": 0.3, "grad_norm": 8.197608537282099, "learning_rate": 9.947884755057703e-06, "loss": 0.5186, "step": 419 }, { "epoch": 0.3, "grad_norm": 13.630148938077044, "learning_rate": 9.947467732311868e-06, "loss": 0.5669, "step": 420 }, { "epoch": 0.3, "grad_norm": 11.24383405339626, "learning_rate": 9.947049056525104e-06, "loss": 0.5068, "step": 421 }, { "epoch": 0.3, "grad_norm": 11.288937926215693, "learning_rate": 9.9466287278373e-06, "loss": 0.5103, "step": 422 }, { "epoch": 0.3, "grad_norm": 8.534922729005983, "learning_rate": 9.946206746388892e-06, "loss": 0.5278, "step": 423 }, { "epoch": 0.3, "grad_norm": 13.99533118513947, "learning_rate": 9.94578311232087e-06, "loss": 0.585, "step": 424 }, { "epoch": 0.3, "grad_norm": 13.150688092265762, "learning_rate": 9.945357825774786e-06, "loss": 0.5933, "step": 425 }, { "epoch": 0.3, "grad_norm": 10.850098763453946, "learning_rate": 9.944930886892731e-06, "loss": 0.5488, "step": 426 }, { "epoch": 0.3, "grad_norm": 7.389796398632541, "learning_rate": 9.944502295817353e-06, "loss": 0.5278, "step": 427 }, { "epoch": 0.31, "grad_norm": 14.254074622509817, "learning_rate": 9.944072052691853e-06, "loss": 0.5723, "step": 428 }, { "epoch": 0.31, "grad_norm": 13.682353391544783, "learning_rate": 9.943640157659984e-06, "loss": 0.4854, "step": 429 }, { "epoch": 0.31, "grad_norm": 8.532300495721698, "learning_rate": 9.94320661086605e-06, "loss": 0.4614, "step": 430 }, { "epoch": 0.31, "grad_norm": 23.899338345320164, "learning_rate": 9.942771412454906e-06, "loss": 0.749, "step": 431 }, { "epoch": 0.31, "grad_norm": 8.323403099794728, "learning_rate": 9.942334562571961e-06, "loss": 0.5317, "step": 432 }, { "epoch": 0.31, "grad_norm": 15.057160250544284, "learning_rate": 9.941896061363173e-06, "loss": 0.5894, "step": 433 }, { "epoch": 0.31, "grad_norm": 19.87921251443284, "learning_rate": 9.941455908975054e-06, "loss": 0.5293, "step": 434 }, { "epoch": 0.31, "grad_norm": 10.374070788814263, "learning_rate": 9.941014105554668e-06, "loss": 0.6357, "step": 435 }, { "epoch": 0.31, "grad_norm": 8.07075036921681, "learning_rate": 9.94057065124963e-06, "loss": 0.5859, "step": 436 }, { "epoch": 0.31, "grad_norm": 12.935502918801745, "learning_rate": 9.940125546208107e-06, "loss": 0.4937, "step": 437 }, { "epoch": 0.31, "grad_norm": 10.849912769259404, "learning_rate": 9.939678790578813e-06, "loss": 0.5679, "step": 438 }, { "epoch": 0.31, "grad_norm": 12.909255099305636, "learning_rate": 9.93923038451102e-06, "loss": 0.5547, "step": 439 }, { "epoch": 0.31, "grad_norm": 18.00963416001379, "learning_rate": 9.938780328154549e-06, "loss": 0.6104, "step": 440 }, { "epoch": 0.31, "grad_norm": 7.4109051578140575, "learning_rate": 9.938328621659775e-06, "loss": 0.5923, "step": 441 }, { "epoch": 0.32, "grad_norm": 12.359334459897687, "learning_rate": 9.937875265177615e-06, "loss": 0.5879, "step": 442 }, { "epoch": 0.32, "grad_norm": 19.83227778183207, "learning_rate": 9.937420258859547e-06, "loss": 0.563, "step": 443 }, { "epoch": 0.32, "grad_norm": 5.415148747386903, "learning_rate": 9.9369636028576e-06, "loss": 0.5352, "step": 444 }, { "epoch": 0.32, "grad_norm": 7.775852096977613, "learning_rate": 9.936505297324346e-06, "loss": 0.5283, "step": 445 }, { "epoch": 0.32, "grad_norm": 13.923034553787746, "learning_rate": 9.936045342412917e-06, "loss": 0.5732, "step": 446 }, { "epoch": 0.32, "grad_norm": 6.630684830040368, "learning_rate": 9.93558373827699e-06, "loss": 0.5078, "step": 447 }, { "epoch": 0.32, "grad_norm": 7.677385634372398, "learning_rate": 9.935120485070799e-06, "loss": 0.5557, "step": 448 }, { "epoch": 0.32, "grad_norm": 8.267582350250366, "learning_rate": 9.934655582949123e-06, "loss": 0.4868, "step": 449 }, { "epoch": 0.32, "grad_norm": 9.591276642156359, "learning_rate": 9.934189032067296e-06, "loss": 0.5791, "step": 450 }, { "epoch": 0.32, "grad_norm": 13.200236779301449, "learning_rate": 9.933720832581197e-06, "loss": 0.5679, "step": 451 }, { "epoch": 0.32, "grad_norm": 11.316615652129823, "learning_rate": 9.933250984647266e-06, "loss": 0.5435, "step": 452 }, { "epoch": 0.32, "grad_norm": 18.91748370377269, "learning_rate": 9.932779488422484e-06, "loss": 0.5562, "step": 453 }, { "epoch": 0.32, "grad_norm": 15.369736880224455, "learning_rate": 9.93230634406439e-06, "loss": 0.5498, "step": 454 }, { "epoch": 0.32, "grad_norm": 11.79281460550446, "learning_rate": 9.931831551731067e-06, "loss": 0.585, "step": 455 }, { "epoch": 0.33, "grad_norm": 17.320334157458905, "learning_rate": 9.931355111581154e-06, "loss": 0.6392, "step": 456 }, { "epoch": 0.33, "grad_norm": 10.65106240403786, "learning_rate": 9.930877023773837e-06, "loss": 0.5015, "step": 457 }, { "epoch": 0.33, "grad_norm": 12.87208988766515, "learning_rate": 9.930397288468853e-06, "loss": 0.5522, "step": 458 }, { "epoch": 0.33, "grad_norm": 6.81270031499954, "learning_rate": 9.929915905826494e-06, "loss": 0.4966, "step": 459 }, { "epoch": 0.33, "grad_norm": 15.623328763405382, "learning_rate": 9.9294328760076e-06, "loss": 0.5083, "step": 460 }, { "epoch": 0.33, "grad_norm": 10.176704828042569, "learning_rate": 9.928948199173552e-06, "loss": 0.5142, "step": 461 }, { "epoch": 0.33, "grad_norm": 18.452044947551833, "learning_rate": 9.928461875486297e-06, "loss": 0.4746, "step": 462 }, { "epoch": 0.33, "grad_norm": 13.515116021434167, "learning_rate": 9.927973905108323e-06, "loss": 0.5566, "step": 463 }, { "epoch": 0.33, "grad_norm": 8.279657241609938, "learning_rate": 9.927484288202671e-06, "loss": 0.5566, "step": 464 }, { "epoch": 0.33, "grad_norm": 23.899242007190043, "learning_rate": 9.926993024932929e-06, "loss": 0.5767, "step": 465 }, { "epoch": 0.33, "grad_norm": 13.739763048191918, "learning_rate": 9.926500115463238e-06, "loss": 0.5396, "step": 466 }, { "epoch": 0.33, "grad_norm": 13.322105279702368, "learning_rate": 9.926005559958287e-06, "loss": 0.5317, "step": 467 }, { "epoch": 0.33, "grad_norm": 13.685933746852832, "learning_rate": 9.925509358583319e-06, "loss": 0.5044, "step": 468 }, { "epoch": 0.33, "grad_norm": 7.819644131934337, "learning_rate": 9.92501151150412e-06, "loss": 0.5361, "step": 469 }, { "epoch": 0.34, "grad_norm": 10.096851486127841, "learning_rate": 9.924512018887036e-06, "loss": 0.54, "step": 470 }, { "epoch": 0.34, "grad_norm": 18.4073975042839, "learning_rate": 9.924010880898952e-06, "loss": 0.5059, "step": 471 }, { "epoch": 0.34, "grad_norm": 16.34808552011236, "learning_rate": 9.923508097707306e-06, "loss": 0.6025, "step": 472 }, { "epoch": 0.34, "grad_norm": 9.169077335799752, "learning_rate": 9.923003669480094e-06, "loss": 0.562, "step": 473 }, { "epoch": 0.34, "grad_norm": 12.814349393155531, "learning_rate": 9.922497596385848e-06, "loss": 0.5376, "step": 474 }, { "epoch": 0.34, "grad_norm": 11.91476877486239, "learning_rate": 9.92198987859366e-06, "loss": 0.458, "step": 475 }, { "epoch": 0.34, "grad_norm": 15.796586778125109, "learning_rate": 9.921480516273168e-06, "loss": 0.5645, "step": 476 }, { "epoch": 0.34, "grad_norm": 24.08880736913912, "learning_rate": 9.920969509594558e-06, "loss": 0.5273, "step": 477 }, { "epoch": 0.34, "grad_norm": 8.015029790773587, "learning_rate": 9.920456858728567e-06, "loss": 0.4678, "step": 478 }, { "epoch": 0.34, "grad_norm": 8.165530523776276, "learning_rate": 9.919942563846482e-06, "loss": 0.4663, "step": 479 }, { "epoch": 0.34, "grad_norm": 8.515499096006561, "learning_rate": 9.919426625120137e-06, "loss": 0.5649, "step": 480 }, { "epoch": 0.34, "grad_norm": 7.393807256918857, "learning_rate": 9.918909042721918e-06, "loss": 0.5576, "step": 481 }, { "epoch": 0.34, "grad_norm": 16.408074858410433, "learning_rate": 9.918389816824759e-06, "loss": 0.6514, "step": 482 }, { "epoch": 0.34, "grad_norm": 18.273121385573273, "learning_rate": 9.917868947602144e-06, "loss": 0.6157, "step": 483 }, { "epoch": 0.35, "grad_norm": 12.733233581807566, "learning_rate": 9.917346435228102e-06, "loss": 0.6221, "step": 484 }, { "epoch": 0.35, "grad_norm": 12.1906994483687, "learning_rate": 9.916822279877217e-06, "loss": 0.4849, "step": 485 }, { "epoch": 0.35, "grad_norm": 16.21592645313153, "learning_rate": 9.91629648172462e-06, "loss": 0.5352, "step": 486 }, { "epoch": 0.35, "grad_norm": 9.57311751283234, "learning_rate": 9.915769040945984e-06, "loss": 0.54, "step": 487 }, { "epoch": 0.35, "grad_norm": 6.283261788372716, "learning_rate": 9.915239957717542e-06, "loss": 0.5034, "step": 488 }, { "epoch": 0.35, "grad_norm": 6.597381341995393, "learning_rate": 9.91470923221607e-06, "loss": 0.5898, "step": 489 }, { "epoch": 0.35, "grad_norm": 8.86577970600292, "learning_rate": 9.914176864618891e-06, "loss": 0.5303, "step": 490 }, { "epoch": 0.35, "grad_norm": 12.003835054098607, "learning_rate": 9.913642855103881e-06, "loss": 0.4561, "step": 491 }, { "epoch": 0.35, "grad_norm": 20.025050964061947, "learning_rate": 9.913107203849464e-06, "loss": 0.5947, "step": 492 }, { "epoch": 0.35, "grad_norm": 20.663144493285422, "learning_rate": 9.912569911034607e-06, "loss": 0.6509, "step": 493 }, { "epoch": 0.35, "grad_norm": 15.22745250393881, "learning_rate": 9.912030976838832e-06, "loss": 0.5649, "step": 494 }, { "epoch": 0.35, "grad_norm": 19.50368431179958, "learning_rate": 9.911490401442205e-06, "loss": 0.5723, "step": 495 }, { "epoch": 0.35, "grad_norm": 18.650046429480334, "learning_rate": 9.910948185025345e-06, "loss": 0.582, "step": 496 }, { "epoch": 0.35, "grad_norm": 13.16731458869406, "learning_rate": 9.910404327769414e-06, "loss": 0.5161, "step": 497 }, { "epoch": 0.36, "grad_norm": 16.48953220742883, "learning_rate": 9.909858829856127e-06, "loss": 0.5527, "step": 498 }, { "epoch": 0.36, "grad_norm": 9.6203128320521, "learning_rate": 9.909311691467744e-06, "loss": 0.479, "step": 499 }, { "epoch": 0.36, "grad_norm": 8.61392255485728, "learning_rate": 9.908762912787073e-06, "loss": 0.5513, "step": 500 }, { "epoch": 0.36, "eval_avg_AUC": 0.7617813721540047, "eval_avg_Accuracy": 0.688204575596817, "eval_avg_Accuracy-right": 0.8745924090256946, "eval_avg_Accuracy-wrong": 0.3632021833068001, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6387441987762204, "eval_last_AUC": 0.7801619916516469, "eval_last_Accuracy": 0.7156415782493368, "eval_last_Accuracy-right": 0.7742924220686057, "eval_last_Accuracy-wrong": 0.6133727541505573, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6543256471238833, "eval_max_AUC": 0.6908342239463399, "eval_max_Accuracy": 0.6420755968169761, "eval_max_Accuracy-right": 0.9715012390765619, "eval_max_Accuracy-wrong": 0.06765976802365249, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.5464614776782707, "eval_min_AUC": 0.7800757977941195, "eval_min_Accuracy": 0.712823275862069, "eval_min_Accuracy-right": 0.7380331289943916, "eval_min_Accuracy-wrong": 0.6688651353195361, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6532406424802264, "eval_prod_AUC": 0.7709256779799856, "eval_prod_Accuracy": 0.6040285145888594, "eval_prod_Accuracy-right": 0.41932959436546235, "eval_prod_Accuracy-wrong": 0.9260859677052535, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6340698606601375, "eval_runtime": 247.7022, "eval_samples_per_second": 97.407, "eval_steps_per_second": 3.044, "eval_sum_AUC": 0.6279793611726269, "eval_sum_Accuracy": 0.6357758620689655, "eval_sum_Accuracy-right": 0.9995434981087779, "eval_sum_Accuracy-wrong": 0.0014782806458949283, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6170558343639955, "step": 500 }, { "epoch": 0.36, "grad_norm": 18.540498504399512, "learning_rate": 9.908212493997473e-06, "loss": 0.5596, "step": 501 }, { "epoch": 0.36, "grad_norm": 9.128203532313956, "learning_rate": 9.90766043528285e-06, "loss": 0.5674, "step": 502 }, { "epoch": 0.36, "grad_norm": 21.257665582917742, "learning_rate": 9.907106736827654e-06, "loss": 0.5996, "step": 503 }, { "epoch": 0.36, "grad_norm": 6.247378361884045, "learning_rate": 9.906551398816886e-06, "loss": 0.4883, "step": 504 }, { "epoch": 0.36, "grad_norm": 7.880394950980597, "learning_rate": 9.9059944214361e-06, "loss": 0.543, "step": 505 }, { "epoch": 0.36, "grad_norm": 11.949181718064018, "learning_rate": 9.905435804871387e-06, "loss": 0.5933, "step": 506 }, { "epoch": 0.36, "grad_norm": 20.010979155843657, "learning_rate": 9.904875549309391e-06, "loss": 0.6396, "step": 507 }, { "epoch": 0.36, "grad_norm": 15.747061457011934, "learning_rate": 9.904313654937308e-06, "loss": 0.6533, "step": 508 }, { "epoch": 0.36, "grad_norm": 16.93160977054565, "learning_rate": 9.903750121942873e-06, "loss": 0.5938, "step": 509 }, { "epoch": 0.36, "grad_norm": 4.696205755954405, "learning_rate": 9.903184950514378e-06, "loss": 0.5122, "step": 510 }, { "epoch": 0.36, "grad_norm": 7.53643132677427, "learning_rate": 9.90261814084065e-06, "loss": 0.5781, "step": 511 }, { "epoch": 0.37, "grad_norm": 17.83447427750056, "learning_rate": 9.902049693111077e-06, "loss": 0.6719, "step": 512 }, { "epoch": 0.37, "grad_norm": 10.7557972910997, "learning_rate": 9.901479607515587e-06, "loss": 0.5342, "step": 513 }, { "epoch": 0.37, "grad_norm": 4.403858483082648, "learning_rate": 9.900907884244654e-06, "loss": 0.5684, "step": 514 }, { "epoch": 0.37, "grad_norm": 10.25591218016626, "learning_rate": 9.900334523489303e-06, "loss": 0.5972, "step": 515 }, { "epoch": 0.37, "grad_norm": 8.831414048462934, "learning_rate": 9.899759525441101e-06, "loss": 0.5625, "step": 516 }, { "epoch": 0.37, "grad_norm": 9.09169003682686, "learning_rate": 9.899182890292171e-06, "loss": 0.5913, "step": 517 }, { "epoch": 0.37, "grad_norm": 17.221486795019896, "learning_rate": 9.898604618235175e-06, "loss": 0.6055, "step": 518 }, { "epoch": 0.37, "grad_norm": 19.080050148413697, "learning_rate": 9.898024709463322e-06, "loss": 0.6074, "step": 519 }, { "epoch": 0.37, "grad_norm": 10.676674384714692, "learning_rate": 9.897443164170375e-06, "loss": 0.5405, "step": 520 }, { "epoch": 0.37, "grad_norm": 16.304810818457433, "learning_rate": 9.896859982550636e-06, "loss": 0.4937, "step": 521 }, { "epoch": 0.37, "grad_norm": 7.777058423733208, "learning_rate": 9.89627516479896e-06, "loss": 0.5742, "step": 522 }, { "epoch": 0.37, "grad_norm": 10.391882166187722, "learning_rate": 9.895688711110739e-06, "loss": 0.5264, "step": 523 }, { "epoch": 0.37, "grad_norm": 11.925774976971661, "learning_rate": 9.895100621681923e-06, "loss": 0.666, "step": 524 }, { "epoch": 0.37, "grad_norm": 5.998754301379017, "learning_rate": 9.894510896709003e-06, "loss": 0.5176, "step": 525 }, { "epoch": 0.38, "grad_norm": 5.840708276690297, "learning_rate": 9.893919536389017e-06, "loss": 0.5288, "step": 526 }, { "epoch": 0.38, "grad_norm": 6.687147803937831, "learning_rate": 9.89332654091955e-06, "loss": 0.5239, "step": 527 }, { "epoch": 0.38, "grad_norm": 6.768270858341022, "learning_rate": 9.892731910498731e-06, "loss": 0.5415, "step": 528 }, { "epoch": 0.38, "grad_norm": 5.905157062520896, "learning_rate": 9.892135645325238e-06, "loss": 0.5664, "step": 529 }, { "epoch": 0.38, "grad_norm": 14.120351808706744, "learning_rate": 9.891537745598293e-06, "loss": 0.5811, "step": 530 }, { "epoch": 0.38, "grad_norm": 14.407988555074818, "learning_rate": 9.89093821151767e-06, "loss": 0.5762, "step": 531 }, { "epoch": 0.38, "grad_norm": 5.3711923602355105, "learning_rate": 9.89033704328368e-06, "loss": 0.5137, "step": 532 }, { "epoch": 0.38, "grad_norm": 7.0687402397265675, "learning_rate": 9.889734241097186e-06, "loss": 0.5273, "step": 533 }, { "epoch": 0.38, "grad_norm": 11.00724274100732, "learning_rate": 9.889129805159595e-06, "loss": 0.5386, "step": 534 }, { "epoch": 0.38, "grad_norm": 14.481688049622676, "learning_rate": 9.888523735672861e-06, "loss": 0.519, "step": 535 }, { "epoch": 0.38, "grad_norm": 7.6367957136746245, "learning_rate": 9.887916032839482e-06, "loss": 0.498, "step": 536 }, { "epoch": 0.38, "grad_norm": 5.466046619158893, "learning_rate": 9.887306696862504e-06, "loss": 0.4902, "step": 537 }, { "epoch": 0.38, "grad_norm": 8.116707420357166, "learning_rate": 9.886695727945515e-06, "loss": 0.5771, "step": 538 }, { "epoch": 0.38, "grad_norm": 8.725720682821406, "learning_rate": 9.886083126292655e-06, "loss": 0.5659, "step": 539 }, { "epoch": 0.39, "grad_norm": 13.590126217196405, "learning_rate": 9.885468892108603e-06, "loss": 0.5752, "step": 540 }, { "epoch": 0.39, "grad_norm": 12.37036153224896, "learning_rate": 9.884853025598587e-06, "loss": 0.5225, "step": 541 }, { "epoch": 0.39, "grad_norm": 17.911884760434983, "learning_rate": 9.884235526968377e-06, "loss": 0.5542, "step": 542 }, { "epoch": 0.39, "grad_norm": 13.973277734720734, "learning_rate": 9.883616396424294e-06, "loss": 0.5732, "step": 543 }, { "epoch": 0.39, "grad_norm": 12.445859091078967, "learning_rate": 9.8829956341732e-06, "loss": 0.5679, "step": 544 }, { "epoch": 0.39, "grad_norm": 14.059475523503908, "learning_rate": 9.882373240422503e-06, "loss": 0.5146, "step": 545 }, { "epoch": 0.39, "grad_norm": 24.837290624700998, "learning_rate": 9.881749215380156e-06, "loss": 0.5596, "step": 546 }, { "epoch": 0.39, "grad_norm": 9.213469746306272, "learning_rate": 9.881123559254658e-06, "loss": 0.5791, "step": 547 }, { "epoch": 0.39, "grad_norm": 19.571221832522987, "learning_rate": 9.880496272255053e-06, "loss": 0.5654, "step": 548 }, { "epoch": 0.39, "grad_norm": 6.884368751248441, "learning_rate": 9.879867354590926e-06, "loss": 0.5015, "step": 549 }, { "epoch": 0.39, "grad_norm": 7.867690100404052, "learning_rate": 9.879236806472414e-06, "loss": 0.5454, "step": 550 }, { "epoch": 0.39, "grad_norm": 12.247957723804134, "learning_rate": 9.878604628110194e-06, "loss": 0.5864, "step": 551 }, { "epoch": 0.39, "grad_norm": 17.43931135081461, "learning_rate": 9.877970819715485e-06, "loss": 0.561, "step": 552 }, { "epoch": 0.39, "grad_norm": 9.78533565974865, "learning_rate": 9.87733538150006e-06, "loss": 0.5269, "step": 553 }, { "epoch": 0.4, "grad_norm": 9.031541245747464, "learning_rate": 9.876698313676225e-06, "loss": 0.5073, "step": 554 }, { "epoch": 0.4, "grad_norm": 15.535835835682214, "learning_rate": 9.876059616456842e-06, "loss": 0.5723, "step": 555 }, { "epoch": 0.4, "grad_norm": 8.31605031845537, "learning_rate": 9.875419290055305e-06, "loss": 0.4766, "step": 556 }, { "epoch": 0.4, "grad_norm": 10.670021692503195, "learning_rate": 9.874777334685565e-06, "loss": 0.6074, "step": 557 }, { "epoch": 0.4, "grad_norm": 7.219201299200287, "learning_rate": 9.874133750562108e-06, "loss": 0.5947, "step": 558 }, { "epoch": 0.4, "grad_norm": 9.904120927324652, "learning_rate": 9.873488537899967e-06, "loss": 0.626, "step": 559 }, { "epoch": 0.4, "grad_norm": 8.496631368652434, "learning_rate": 9.872841696914721e-06, "loss": 0.5391, "step": 560 }, { "epoch": 0.4, "grad_norm": 7.536188576252165, "learning_rate": 9.872193227822492e-06, "loss": 0.5508, "step": 561 }, { "epoch": 0.4, "grad_norm": 7.528143767413258, "learning_rate": 9.871543130839944e-06, "loss": 0.5732, "step": 562 }, { "epoch": 0.4, "grad_norm": 9.58970570036274, "learning_rate": 9.870891406184288e-06, "loss": 0.5674, "step": 563 }, { "epoch": 0.4, "grad_norm": 11.800186998521902, "learning_rate": 9.870238054073275e-06, "loss": 0.5566, "step": 564 }, { "epoch": 0.4, "grad_norm": 13.911555208685854, "learning_rate": 9.869583074725206e-06, "loss": 0.6074, "step": 565 }, { "epoch": 0.4, "grad_norm": 8.231300969398722, "learning_rate": 9.868926468358919e-06, "loss": 0.4829, "step": 566 }, { "epoch": 0.4, "grad_norm": 7.709909484735207, "learning_rate": 9.868268235193796e-06, "loss": 0.502, "step": 567 }, { "epoch": 0.41, "grad_norm": 9.178709235999069, "learning_rate": 9.867608375449772e-06, "loss": 0.5166, "step": 568 }, { "epoch": 0.41, "grad_norm": 12.354410293206907, "learning_rate": 9.866946889347311e-06, "loss": 0.5703, "step": 569 }, { "epoch": 0.41, "grad_norm": 10.915312310012048, "learning_rate": 9.866283777107432e-06, "loss": 0.5254, "step": 570 }, { "epoch": 0.41, "grad_norm": 20.60559354889048, "learning_rate": 9.865619038951692e-06, "loss": 0.5996, "step": 571 }, { "epoch": 0.41, "grad_norm": 11.26569614452492, "learning_rate": 9.864952675102193e-06, "loss": 0.5723, "step": 572 }, { "epoch": 0.41, "grad_norm": 16.002089457228422, "learning_rate": 9.864284685781578e-06, "loss": 0.5864, "step": 573 }, { "epoch": 0.41, "grad_norm": 23.044339027647595, "learning_rate": 9.863615071213036e-06, "loss": 0.5591, "step": 574 }, { "epoch": 0.41, "grad_norm": 9.59838106899753, "learning_rate": 9.862943831620298e-06, "loss": 0.4917, "step": 575 }, { "epoch": 0.41, "grad_norm": 8.247430329121268, "learning_rate": 9.862270967227636e-06, "loss": 0.5332, "step": 576 }, { "epoch": 0.41, "grad_norm": 8.838445091054146, "learning_rate": 9.861596478259869e-06, "loss": 0.5576, "step": 577 }, { "epoch": 0.41, "grad_norm": 10.4489106721379, "learning_rate": 9.860920364942353e-06, "loss": 0.5679, "step": 578 }, { "epoch": 0.41, "grad_norm": 12.347633105018176, "learning_rate": 9.860242627500994e-06, "loss": 0.5747, "step": 579 }, { "epoch": 0.41, "grad_norm": 12.918155502814725, "learning_rate": 9.859563266162231e-06, "loss": 0.5552, "step": 580 }, { "epoch": 0.41, "grad_norm": 7.352511123265138, "learning_rate": 9.858882281153058e-06, "loss": 0.4307, "step": 581 }, { "epoch": 0.42, "grad_norm": 7.102252433561581, "learning_rate": 9.858199672701e-06, "loss": 0.499, "step": 582 }, { "epoch": 0.42, "grad_norm": 10.073596010036917, "learning_rate": 9.85751544103413e-06, "loss": 0.4551, "step": 583 }, { "epoch": 0.42, "grad_norm": 12.478615776414374, "learning_rate": 9.856829586381065e-06, "loss": 0.542, "step": 584 }, { "epoch": 0.42, "grad_norm": 31.02955657240596, "learning_rate": 9.856142108970958e-06, "loss": 0.6479, "step": 585 }, { "epoch": 0.42, "grad_norm": 14.277538328288841, "learning_rate": 9.855453009033512e-06, "loss": 0.5327, "step": 586 }, { "epoch": 0.42, "grad_norm": 8.683057931417592, "learning_rate": 9.854762286798965e-06, "loss": 0.5645, "step": 587 }, { "epoch": 0.42, "grad_norm": 25.215793586089497, "learning_rate": 9.854069942498102e-06, "loss": 0.8555, "step": 588 }, { "epoch": 0.42, "grad_norm": 9.562234064634753, "learning_rate": 9.853375976362245e-06, "loss": 0.5874, "step": 589 }, { "epoch": 0.42, "grad_norm": 8.721200107181694, "learning_rate": 9.852680388623266e-06, "loss": 0.5298, "step": 590 }, { "epoch": 0.42, "grad_norm": 22.093284898328076, "learning_rate": 9.85198317951357e-06, "loss": 0.6309, "step": 591 }, { "epoch": 0.42, "grad_norm": 18.346628659885774, "learning_rate": 9.851284349266107e-06, "loss": 0.6416, "step": 592 }, { "epoch": 0.42, "grad_norm": 13.619550827391471, "learning_rate": 9.850583898114372e-06, "loss": 0.6084, "step": 593 }, { "epoch": 0.42, "grad_norm": 9.504830357240213, "learning_rate": 9.849881826292399e-06, "loss": 0.5742, "step": 594 }, { "epoch": 0.42, "grad_norm": 21.655717476878046, "learning_rate": 9.84917813403476e-06, "loss": 0.6416, "step": 595 }, { "epoch": 0.43, "grad_norm": 16.761291695955354, "learning_rate": 9.848472821576572e-06, "loss": 0.5938, "step": 596 }, { "epoch": 0.43, "grad_norm": 5.531377595805906, "learning_rate": 9.847765889153497e-06, "loss": 0.5601, "step": 597 }, { "epoch": 0.43, "grad_norm": 5.367401037379296, "learning_rate": 9.847057337001731e-06, "loss": 0.6104, "step": 598 }, { "epoch": 0.43, "grad_norm": 13.851863355122642, "learning_rate": 9.846347165358014e-06, "loss": 0.5552, "step": 599 }, { "epoch": 0.43, "grad_norm": 10.152768776233245, "learning_rate": 9.84563537445963e-06, "loss": 0.5654, "step": 600 }, { "epoch": 0.43, "grad_norm": 5.288895704846823, "learning_rate": 9.844921964544398e-06, "loss": 0.542, "step": 601 }, { "epoch": 0.43, "grad_norm": 10.57924848248388, "learning_rate": 9.844206935850687e-06, "loss": 0.585, "step": 602 }, { "epoch": 0.43, "grad_norm": 6.139809939859182, "learning_rate": 9.843490288617397e-06, "loss": 0.5186, "step": 603 }, { "epoch": 0.43, "grad_norm": 9.68694326416687, "learning_rate": 9.842772023083972e-06, "loss": 0.4717, "step": 604 }, { "epoch": 0.43, "grad_norm": 16.74113624317417, "learning_rate": 9.842052139490403e-06, "loss": 0.5591, "step": 605 }, { "epoch": 0.43, "grad_norm": 9.814314683170982, "learning_rate": 9.841330638077213e-06, "loss": 0.5303, "step": 606 }, { "epoch": 0.43, "grad_norm": 8.263159508477482, "learning_rate": 9.840607519085467e-06, "loss": 0.5293, "step": 607 }, { "epoch": 0.43, "grad_norm": 8.917568543252393, "learning_rate": 9.839882782756778e-06, "loss": 0.5425, "step": 608 }, { "epoch": 0.43, "grad_norm": 13.506441410922555, "learning_rate": 9.839156429333291e-06, "loss": 0.5986, "step": 609 }, { "epoch": 0.44, "grad_norm": 8.668039530691972, "learning_rate": 9.838428459057694e-06, "loss": 0.5171, "step": 610 }, { "epoch": 0.44, "grad_norm": 7.4133141642419424, "learning_rate": 9.837698872173214e-06, "loss": 0.4961, "step": 611 }, { "epoch": 0.44, "grad_norm": 13.506554715753216, "learning_rate": 9.836967668923623e-06, "loss": 0.564, "step": 612 }, { "epoch": 0.44, "grad_norm": 9.580418265887094, "learning_rate": 9.836234849553228e-06, "loss": 0.4878, "step": 613 }, { "epoch": 0.44, "grad_norm": 13.792067921572484, "learning_rate": 9.835500414306875e-06, "loss": 0.5903, "step": 614 }, { "epoch": 0.44, "grad_norm": 7.889230874692596, "learning_rate": 9.834764363429956e-06, "loss": 0.501, "step": 615 }, { "epoch": 0.44, "grad_norm": 13.876917354950514, "learning_rate": 9.8340266971684e-06, "loss": 0.5684, "step": 616 }, { "epoch": 0.44, "grad_norm": 9.194977979042537, "learning_rate": 9.83328741576867e-06, "loss": 0.4902, "step": 617 }, { "epoch": 0.44, "grad_norm": 16.105001380677454, "learning_rate": 9.832546519477778e-06, "loss": 0.5854, "step": 618 }, { "epoch": 0.44, "grad_norm": 21.476338145150045, "learning_rate": 9.831804008543271e-06, "loss": 0.6064, "step": 619 }, { "epoch": 0.44, "grad_norm": 9.759650991581609, "learning_rate": 9.831059883213234e-06, "loss": 0.6025, "step": 620 }, { "epoch": 0.44, "grad_norm": 10.320154908129068, "learning_rate": 9.830314143736292e-06, "loss": 0.4985, "step": 621 }, { "epoch": 0.44, "grad_norm": 18.063611986524947, "learning_rate": 9.829566790361615e-06, "loss": 0.5317, "step": 622 }, { "epoch": 0.44, "grad_norm": 6.8381031832744865, "learning_rate": 9.828817823338903e-06, "loss": 0.5234, "step": 623 }, { "epoch": 0.45, "grad_norm": 12.028721530123617, "learning_rate": 9.828067242918402e-06, "loss": 0.5938, "step": 624 }, { "epoch": 0.45, "grad_norm": 13.738098966810448, "learning_rate": 9.827315049350895e-06, "loss": 0.5249, "step": 625 }, { "epoch": 0.45, "grad_norm": 11.549683197853444, "learning_rate": 9.826561242887704e-06, "loss": 0.5049, "step": 626 }, { "epoch": 0.45, "grad_norm": 6.847026094764958, "learning_rate": 9.825805823780687e-06, "loss": 0.562, "step": 627 }, { "epoch": 0.45, "grad_norm": 7.1475623479236985, "learning_rate": 9.825048792282247e-06, "loss": 0.5259, "step": 628 }, { "epoch": 0.45, "grad_norm": 18.57915719785624, "learning_rate": 9.824290148645322e-06, "loss": 0.5098, "step": 629 }, { "epoch": 0.45, "grad_norm": 7.748425573675469, "learning_rate": 9.823529893123384e-06, "loss": 0.5869, "step": 630 }, { "epoch": 0.45, "grad_norm": 7.059327133724575, "learning_rate": 9.822768025970456e-06, "loss": 0.4497, "step": 631 }, { "epoch": 0.45, "grad_norm": 7.293876053511848, "learning_rate": 9.822004547441088e-06, "loss": 0.6064, "step": 632 }, { "epoch": 0.45, "grad_norm": 18.38956537939008, "learning_rate": 9.821239457790373e-06, "loss": 0.624, "step": 633 }, { "epoch": 0.45, "grad_norm": 13.857454287932656, "learning_rate": 9.82047275727394e-06, "loss": 0.5557, "step": 634 }, { "epoch": 0.45, "grad_norm": 14.129814430870212, "learning_rate": 9.81970444614796e-06, "loss": 0.5376, "step": 635 }, { "epoch": 0.45, "grad_norm": 7.239828802765227, "learning_rate": 9.81893452466914e-06, "loss": 0.5459, "step": 636 }, { "epoch": 0.45, "grad_norm": 15.759535502268271, "learning_rate": 9.818162993094724e-06, "loss": 0.5229, "step": 637 }, { "epoch": 0.46, "grad_norm": 8.269358730497334, "learning_rate": 9.817389851682494e-06, "loss": 0.5811, "step": 638 }, { "epoch": 0.46, "grad_norm": 9.532561868563791, "learning_rate": 9.816615100690773e-06, "loss": 0.5068, "step": 639 }, { "epoch": 0.46, "grad_norm": 10.542345953735255, "learning_rate": 9.81583874037842e-06, "loss": 0.5195, "step": 640 }, { "epoch": 0.46, "grad_norm": 17.295525138598315, "learning_rate": 9.815060771004831e-06, "loss": 0.5811, "step": 641 }, { "epoch": 0.46, "grad_norm": 10.216808140877509, "learning_rate": 9.81428119282994e-06, "loss": 0.5483, "step": 642 }, { "epoch": 0.46, "grad_norm": 11.054796365751393, "learning_rate": 9.813500006114216e-06, "loss": 0.5225, "step": 643 }, { "epoch": 0.46, "grad_norm": 11.532068190094899, "learning_rate": 9.812717211118673e-06, "loss": 0.5259, "step": 644 }, { "epoch": 0.46, "grad_norm": 12.617232952388049, "learning_rate": 9.811932808104852e-06, "loss": 0.583, "step": 645 }, { "epoch": 0.46, "grad_norm": 17.53476537868423, "learning_rate": 9.811146797334838e-06, "loss": 0.5986, "step": 646 }, { "epoch": 0.46, "grad_norm": 9.459867876200462, "learning_rate": 9.810359179071255e-06, "loss": 0.4854, "step": 647 }, { "epoch": 0.46, "grad_norm": 13.442128811771815, "learning_rate": 9.809569953577258e-06, "loss": 0.5283, "step": 648 }, { "epoch": 0.46, "grad_norm": 10.05485902012583, "learning_rate": 9.808779121116542e-06, "loss": 0.5869, "step": 649 }, { "epoch": 0.46, "grad_norm": 14.614481323202112, "learning_rate": 9.807986681953341e-06, "loss": 0.5005, "step": 650 }, { "epoch": 0.46, "grad_norm": 9.302158068476116, "learning_rate": 9.807192636352422e-06, "loss": 0.5518, "step": 651 }, { "epoch": 0.47, "grad_norm": 7.246031731872442, "learning_rate": 9.80639698457909e-06, "loss": 0.5, "step": 652 }, { "epoch": 0.47, "grad_norm": 14.741614003843505, "learning_rate": 9.805599726899188e-06, "loss": 0.5195, "step": 653 }, { "epoch": 0.47, "grad_norm": 9.914045939553276, "learning_rate": 9.804800863579094e-06, "loss": 0.5059, "step": 654 }, { "epoch": 0.47, "grad_norm": 8.602058773881305, "learning_rate": 9.804000394885723e-06, "loss": 0.5132, "step": 655 }, { "epoch": 0.47, "grad_norm": 17.395330548720693, "learning_rate": 9.803198321086527e-06, "loss": 0.6504, "step": 656 }, { "epoch": 0.47, "grad_norm": 9.082779853562936, "learning_rate": 9.802394642449494e-06, "loss": 0.5503, "step": 657 }, { "epoch": 0.47, "grad_norm": 16.05035071434153, "learning_rate": 9.801589359243147e-06, "loss": 0.5625, "step": 658 }, { "epoch": 0.47, "grad_norm": 8.74752221065336, "learning_rate": 9.800782471736547e-06, "loss": 0.5024, "step": 659 }, { "epoch": 0.47, "grad_norm": 11.93020384218078, "learning_rate": 9.799973980199288e-06, "loss": 0.5679, "step": 660 }, { "epoch": 0.47, "grad_norm": 23.58689659447437, "learning_rate": 9.799163884901506e-06, "loss": 0.5435, "step": 661 }, { "epoch": 0.47, "grad_norm": 9.583439466485594, "learning_rate": 9.798352186113867e-06, "loss": 0.5093, "step": 662 }, { "epoch": 0.47, "grad_norm": 10.423178186172265, "learning_rate": 9.797538884107574e-06, "loss": 0.6123, "step": 663 }, { "epoch": 0.47, "grad_norm": 12.533199007539238, "learning_rate": 9.796723979154366e-06, "loss": 0.5229, "step": 664 }, { "epoch": 0.47, "grad_norm": 8.971226162816325, "learning_rate": 9.795907471526518e-06, "loss": 0.5547, "step": 665 }, { "epoch": 0.48, "grad_norm": 13.783730829396234, "learning_rate": 9.79508936149684e-06, "loss": 0.4644, "step": 666 }, { "epoch": 0.48, "grad_norm": 13.03312528136628, "learning_rate": 9.79426964933868e-06, "loss": 0.543, "step": 667 }, { "epoch": 0.48, "grad_norm": 8.283132104994282, "learning_rate": 9.793448335325919e-06, "loss": 0.4917, "step": 668 }, { "epoch": 0.48, "grad_norm": 13.20800855604922, "learning_rate": 9.792625419732969e-06, "loss": 0.5264, "step": 669 }, { "epoch": 0.48, "grad_norm": 9.415376972801536, "learning_rate": 9.791800902834787e-06, "loss": 0.4824, "step": 670 }, { "epoch": 0.48, "grad_norm": 10.273820442326011, "learning_rate": 9.790974784906855e-06, "loss": 0.501, "step": 671 }, { "epoch": 0.48, "grad_norm": 7.671344611499754, "learning_rate": 9.790147066225198e-06, "loss": 0.605, "step": 672 }, { "epoch": 0.48, "grad_norm": 14.088843769717563, "learning_rate": 9.789317747066369e-06, "loss": 0.5586, "step": 673 }, { "epoch": 0.48, "grad_norm": 10.751038170132446, "learning_rate": 9.788486827707462e-06, "loss": 0.6519, "step": 674 }, { "epoch": 0.48, "grad_norm": 9.207892997115179, "learning_rate": 9.7876543084261e-06, "loss": 0.5347, "step": 675 }, { "epoch": 0.48, "grad_norm": 8.857264061196519, "learning_rate": 9.786820189500443e-06, "loss": 0.5464, "step": 676 }, { "epoch": 0.48, "grad_norm": 13.830984057445567, "learning_rate": 9.785984471209186e-06, "loss": 0.5273, "step": 677 }, { "epoch": 0.48, "grad_norm": 6.60468250187857, "learning_rate": 9.785147153831562e-06, "loss": 0.5459, "step": 678 }, { "epoch": 0.48, "grad_norm": 7.593019502309981, "learning_rate": 9.784308237647329e-06, "loss": 0.5757, "step": 679 }, { "epoch": 0.49, "grad_norm": 9.955889286248375, "learning_rate": 9.783467722936786e-06, "loss": 0.583, "step": 680 }, { "epoch": 0.49, "grad_norm": 12.939295729656601, "learning_rate": 9.782625609980767e-06, "loss": 0.5991, "step": 681 }, { "epoch": 0.49, "grad_norm": 22.514876614342388, "learning_rate": 9.781781899060635e-06, "loss": 0.5703, "step": 682 }, { "epoch": 0.49, "grad_norm": 13.553266494196397, "learning_rate": 9.78093659045829e-06, "loss": 0.5435, "step": 683 }, { "epoch": 0.49, "grad_norm": 5.263926473907152, "learning_rate": 9.780089684456164e-06, "loss": 0.5229, "step": 684 }, { "epoch": 0.49, "grad_norm": 5.635255900891267, "learning_rate": 9.779241181337228e-06, "loss": 0.5176, "step": 685 }, { "epoch": 0.49, "grad_norm": 10.608059375423723, "learning_rate": 9.778391081384979e-06, "loss": 0.5645, "step": 686 }, { "epoch": 0.49, "grad_norm": 19.41849821192734, "learning_rate": 9.777539384883453e-06, "loss": 0.4922, "step": 687 }, { "epoch": 0.49, "grad_norm": 17.159527005843714, "learning_rate": 9.776686092117216e-06, "loss": 0.5977, "step": 688 }, { "epoch": 0.49, "grad_norm": 20.36894102930875, "learning_rate": 9.775831203371371e-06, "loss": 0.5693, "step": 689 }, { "epoch": 0.49, "grad_norm": 16.091636163606054, "learning_rate": 9.774974718931551e-06, "loss": 0.6221, "step": 690 }, { "epoch": 0.49, "grad_norm": 17.969604571791074, "learning_rate": 9.774116639083923e-06, "loss": 0.5854, "step": 691 }, { "epoch": 0.49, "grad_norm": 19.88504893165957, "learning_rate": 9.773256964115189e-06, "loss": 0.5049, "step": 692 }, { "epoch": 0.49, "grad_norm": 11.758445464297264, "learning_rate": 9.772395694312583e-06, "loss": 0.603, "step": 693 }, { "epoch": 0.5, "grad_norm": 14.955201215215652, "learning_rate": 9.771532829963865e-06, "loss": 0.5571, "step": 694 }, { "epoch": 0.5, "grad_norm": 5.013451757180628, "learning_rate": 9.770668371357344e-06, "loss": 0.4849, "step": 695 }, { "epoch": 0.5, "grad_norm": 5.6991625858998365, "learning_rate": 9.769802318781842e-06, "loss": 0.5337, "step": 696 }, { "epoch": 0.5, "grad_norm": 19.575556491861107, "learning_rate": 9.76893467252673e-06, "loss": 0.5771, "step": 697 }, { "epoch": 0.5, "grad_norm": 23.83427689960832, "learning_rate": 9.768065432881903e-06, "loss": 0.5859, "step": 698 }, { "epoch": 0.5, "grad_norm": 10.23151806170552, "learning_rate": 9.767194600137789e-06, "loss": 0.4951, "step": 699 }, { "epoch": 0.5, "grad_norm": 9.185321738716956, "learning_rate": 9.766322174585347e-06, "loss": 0.5342, "step": 700 }, { "epoch": 0.5, "grad_norm": 5.443299162915134, "learning_rate": 9.765448156516077e-06, "loss": 0.4678, "step": 701 }, { "epoch": 0.5, "grad_norm": 9.19817487137709, "learning_rate": 9.764572546222e-06, "loss": 0.5684, "step": 702 }, { "epoch": 0.5, "grad_norm": 6.21262946620503, "learning_rate": 9.763695343995674e-06, "loss": 0.5308, "step": 703 }, { "epoch": 0.5, "grad_norm": 13.192980873552312, "learning_rate": 9.762816550130192e-06, "loss": 0.5112, "step": 704 }, { "epoch": 0.5, "grad_norm": 9.960657984729142, "learning_rate": 9.76193616491917e-06, "loss": 0.6094, "step": 705 }, { "epoch": 0.5, "grad_norm": 6.679940313177855, "learning_rate": 9.761054188656766e-06, "loss": 0.5415, "step": 706 }, { "epoch": 0.5, "grad_norm": 8.09877092756067, "learning_rate": 9.760170621637661e-06, "loss": 0.5601, "step": 707 }, { "epoch": 0.51, "grad_norm": 11.479043385831767, "learning_rate": 9.759285464157073e-06, "loss": 0.5474, "step": 708 }, { "epoch": 0.51, "grad_norm": 9.234613065252855, "learning_rate": 9.758398716510751e-06, "loss": 0.501, "step": 709 }, { "epoch": 0.51, "grad_norm": 13.64323110344669, "learning_rate": 9.75751037899497e-06, "loss": 0.5635, "step": 710 }, { "epoch": 0.51, "grad_norm": 13.041571595123274, "learning_rate": 9.756620451906543e-06, "loss": 0.5952, "step": 711 }, { "epoch": 0.51, "grad_norm": 12.629287618121229, "learning_rate": 9.75572893554281e-06, "loss": 0.5493, "step": 712 }, { "epoch": 0.51, "grad_norm": 7.5855291941636755, "learning_rate": 9.754835830201645e-06, "loss": 0.5557, "step": 713 }, { "epoch": 0.51, "grad_norm": 4.8782598567148705, "learning_rate": 9.753941136181448e-06, "loss": 0.5273, "step": 714 }, { "epoch": 0.51, "grad_norm": 11.85114790600747, "learning_rate": 9.753044853781155e-06, "loss": 0.5078, "step": 715 }, { "epoch": 0.51, "grad_norm": 9.657250772447208, "learning_rate": 9.75214698330023e-06, "loss": 0.6157, "step": 716 }, { "epoch": 0.51, "grad_norm": 17.576065249418974, "learning_rate": 9.751247525038669e-06, "loss": 0.4863, "step": 717 }, { "epoch": 0.51, "grad_norm": 33.781222697734435, "learning_rate": 9.750346479296998e-06, "loss": 0.6094, "step": 718 }, { "epoch": 0.51, "grad_norm": 185.99349497355314, "learning_rate": 9.74944384637627e-06, "loss": 0.625, "step": 719 }, { "epoch": 0.51, "grad_norm": 7.364273036024208, "learning_rate": 9.748539626578076e-06, "loss": 0.4727, "step": 720 }, { "epoch": 0.51, "grad_norm": 7.47582915048441, "learning_rate": 9.747633820204527e-06, "loss": 0.4775, "step": 721 }, { "epoch": 0.52, "grad_norm": 160.92451037044142, "learning_rate": 9.746726427558276e-06, "loss": 0.583, "step": 722 }, { "epoch": 0.52, "grad_norm": 14.29556717471361, "learning_rate": 9.745817448942496e-06, "loss": 0.6426, "step": 723 }, { "epoch": 0.52, "grad_norm": 7.845209139079835, "learning_rate": 9.744906884660894e-06, "loss": 0.584, "step": 724 }, { "epoch": 0.52, "grad_norm": 11.487953481735268, "learning_rate": 9.743994735017708e-06, "loss": 0.4824, "step": 725 }, { "epoch": 0.52, "grad_norm": 16.209235710267066, "learning_rate": 9.743081000317703e-06, "loss": 0.6045, "step": 726 }, { "epoch": 0.52, "grad_norm": 8.112078657361787, "learning_rate": 9.742165680866173e-06, "loss": 0.5244, "step": 727 }, { "epoch": 0.52, "grad_norm": 12.970186411419759, "learning_rate": 9.741248776968947e-06, "loss": 0.5825, "step": 728 }, { "epoch": 0.52, "grad_norm": 10.315186088300615, "learning_rate": 9.740330288932379e-06, "loss": 0.5918, "step": 729 }, { "epoch": 0.52, "grad_norm": 11.457437824827904, "learning_rate": 9.73941021706335e-06, "loss": 0.5781, "step": 730 }, { "epoch": 0.52, "grad_norm": 10.793356765325614, "learning_rate": 9.738488561669272e-06, "loss": 0.5474, "step": 731 }, { "epoch": 0.52, "grad_norm": 7.112475758078757, "learning_rate": 9.737565323058094e-06, "loss": 0.5337, "step": 732 }, { "epoch": 0.52, "grad_norm": 7.954732042734927, "learning_rate": 9.736640501538281e-06, "loss": 0.5552, "step": 733 }, { "epoch": 0.52, "grad_norm": 9.682917509984245, "learning_rate": 9.735714097418835e-06, "loss": 0.5811, "step": 734 }, { "epoch": 0.52, "grad_norm": 8.954785296883502, "learning_rate": 9.734786111009287e-06, "loss": 0.5283, "step": 735 }, { "epoch": 0.53, "grad_norm": 7.187461253271188, "learning_rate": 9.73385654261969e-06, "loss": 0.5161, "step": 736 }, { "epoch": 0.53, "grad_norm": 11.105803753991632, "learning_rate": 9.732925392560634e-06, "loss": 0.5781, "step": 737 }, { "epoch": 0.53, "grad_norm": 6.780060124597431, "learning_rate": 9.731992661143233e-06, "loss": 0.5137, "step": 738 }, { "epoch": 0.53, "grad_norm": 7.149918598961957, "learning_rate": 9.731058348679128e-06, "loss": 0.5459, "step": 739 }, { "epoch": 0.53, "grad_norm": 10.600033304087317, "learning_rate": 9.73012245548049e-06, "loss": 0.6167, "step": 740 }, { "epoch": 0.53, "grad_norm": 8.959608925181033, "learning_rate": 9.729184981860023e-06, "loss": 0.5547, "step": 741 }, { "epoch": 0.53, "grad_norm": 18.421233340277922, "learning_rate": 9.728245928130949e-06, "loss": 0.4907, "step": 742 }, { "epoch": 0.53, "grad_norm": 13.131878096040454, "learning_rate": 9.727305294607024e-06, "loss": 0.5337, "step": 743 }, { "epoch": 0.53, "grad_norm": 5.933583907781678, "learning_rate": 9.726363081602532e-06, "loss": 0.4868, "step": 744 }, { "epoch": 0.53, "grad_norm": 8.773000311800002, "learning_rate": 9.725419289432287e-06, "loss": 0.5586, "step": 745 }, { "epoch": 0.53, "grad_norm": 11.30603969977933, "learning_rate": 9.724473918411624e-06, "loss": 0.5532, "step": 746 }, { "epoch": 0.53, "grad_norm": 7.45632748446621, "learning_rate": 9.723526968856408e-06, "loss": 0.4321, "step": 747 }, { "epoch": 0.53, "grad_norm": 10.567004859337702, "learning_rate": 9.722578441083035e-06, "loss": 0.4902, "step": 748 }, { "epoch": 0.53, "grad_norm": 14.086491004356668, "learning_rate": 9.721628335408423e-06, "loss": 0.5205, "step": 749 }, { "epoch": 0.54, "grad_norm": 18.63422639061617, "learning_rate": 9.720676652150025e-06, "loss": 0.5288, "step": 750 }, { "epoch": 0.54, "grad_norm": 11.428381354480067, "learning_rate": 9.719723391625813e-06, "loss": 0.6455, "step": 751 }, { "epoch": 0.54, "grad_norm": 14.967731907649567, "learning_rate": 9.718768554154287e-06, "loss": 0.6621, "step": 752 }, { "epoch": 0.54, "grad_norm": 10.977453743909852, "learning_rate": 9.717812140054479e-06, "loss": 0.5083, "step": 753 }, { "epoch": 0.54, "grad_norm": 19.976246002857728, "learning_rate": 9.716854149645945e-06, "loss": 0.5532, "step": 754 }, { "epoch": 0.54, "grad_norm": 10.694789061118618, "learning_rate": 9.715894583248764e-06, "loss": 0.5239, "step": 755 }, { "epoch": 0.54, "grad_norm": 6.615602095338727, "learning_rate": 9.714933441183549e-06, "loss": 0.4473, "step": 756 }, { "epoch": 0.54, "grad_norm": 8.216340319991541, "learning_rate": 9.713970723771432e-06, "loss": 0.479, "step": 757 }, { "epoch": 0.54, "grad_norm": 10.167611058044052, "learning_rate": 9.713006431334076e-06, "loss": 0.5361, "step": 758 }, { "epoch": 0.54, "grad_norm": 6.666468658089926, "learning_rate": 9.71204056419367e-06, "loss": 0.5303, "step": 759 }, { "epoch": 0.54, "grad_norm": 10.66660693531791, "learning_rate": 9.711073122672928e-06, "loss": 0.5464, "step": 760 }, { "epoch": 0.54, "grad_norm": 29.673553271376097, "learning_rate": 9.71010410709509e-06, "loss": 0.6133, "step": 761 }, { "epoch": 0.54, "grad_norm": 14.889123197476716, "learning_rate": 9.70913351778392e-06, "loss": 0.5479, "step": 762 }, { "epoch": 0.54, "grad_norm": 11.884074672070827, "learning_rate": 9.708161355063714e-06, "loss": 0.623, "step": 763 }, { "epoch": 0.55, "grad_norm": 11.200359924204133, "learning_rate": 9.707187619259286e-06, "loss": 0.46, "step": 764 }, { "epoch": 0.55, "grad_norm": 17.9354480427811, "learning_rate": 9.706212310695981e-06, "loss": 0.5781, "step": 765 }, { "epoch": 0.55, "grad_norm": 29.381896018391878, "learning_rate": 9.705235429699666e-06, "loss": 0.6304, "step": 766 }, { "epoch": 0.55, "grad_norm": 16.16877662873349, "learning_rate": 9.704256976596737e-06, "loss": 0.5361, "step": 767 }, { "epoch": 0.55, "grad_norm": 6.227617088641859, "learning_rate": 9.703276951714114e-06, "loss": 0.4468, "step": 768 }, { "epoch": 0.55, "grad_norm": 11.762967619147144, "learning_rate": 9.70229535537924e-06, "loss": 0.4521, "step": 769 }, { "epoch": 0.55, "grad_norm": 25.62549106013863, "learning_rate": 9.701312187920084e-06, "loss": 0.7197, "step": 770 }, { "epoch": 0.55, "grad_norm": 25.332401107525445, "learning_rate": 9.700327449665143e-06, "loss": 0.6289, "step": 771 }, { "epoch": 0.55, "grad_norm": 19.306780662436637, "learning_rate": 9.699341140943434e-06, "loss": 0.5376, "step": 772 }, { "epoch": 0.55, "grad_norm": 9.721571034550369, "learning_rate": 9.698353262084501e-06, "loss": 0.4839, "step": 773 }, { "epoch": 0.55, "grad_norm": 8.264526568674992, "learning_rate": 9.697363813418414e-06, "loss": 0.5029, "step": 774 }, { "epoch": 0.55, "grad_norm": 28.02052415350964, "learning_rate": 9.696372795275766e-06, "loss": 0.6436, "step": 775 }, { "epoch": 0.55, "grad_norm": 22.973871491555215, "learning_rate": 9.695380207987675e-06, "loss": 0.5459, "step": 776 }, { "epoch": 0.55, "grad_norm": 22.86541152325245, "learning_rate": 9.69438605188578e-06, "loss": 0.6797, "step": 777 }, { "epoch": 0.56, "grad_norm": 13.430529034939772, "learning_rate": 9.69339032730225e-06, "loss": 0.5947, "step": 778 }, { "epoch": 0.56, "grad_norm": 21.136839582171906, "learning_rate": 9.692393034569776e-06, "loss": 0.5117, "step": 779 }, { "epoch": 0.56, "grad_norm": 11.567653375487124, "learning_rate": 9.69139417402157e-06, "loss": 0.4937, "step": 780 }, { "epoch": 0.56, "grad_norm": 38.10236907533684, "learning_rate": 9.690393745991368e-06, "loss": 0.6172, "step": 781 }, { "epoch": 0.56, "grad_norm": 40.793834029889545, "learning_rate": 9.689391750813436e-06, "loss": 0.8066, "step": 782 }, { "epoch": 0.56, "grad_norm": 24.16046506654703, "learning_rate": 9.688388188822556e-06, "loss": 0.7109, "step": 783 }, { "epoch": 0.56, "grad_norm": 16.209686684667055, "learning_rate": 9.687383060354038e-06, "loss": 0.623, "step": 784 }, { "epoch": 0.56, "grad_norm": 7.029930794357556, "learning_rate": 9.686376365743714e-06, "loss": 0.5557, "step": 785 }, { "epoch": 0.56, "grad_norm": 5.702750453811597, "learning_rate": 9.685368105327938e-06, "loss": 0.5537, "step": 786 }, { "epoch": 0.56, "grad_norm": 23.2055662093521, "learning_rate": 9.684358279443593e-06, "loss": 0.5771, "step": 787 }, { "epoch": 0.56, "grad_norm": 7.310900740988059, "learning_rate": 9.683346888428074e-06, "loss": 0.4946, "step": 788 }, { "epoch": 0.56, "grad_norm": 10.032483835960253, "learning_rate": 9.68233393261931e-06, "loss": 0.5244, "step": 789 }, { "epoch": 0.56, "grad_norm": 5.215648584728573, "learning_rate": 9.681319412355748e-06, "loss": 0.5078, "step": 790 }, { "epoch": 0.56, "grad_norm": 8.062322777960844, "learning_rate": 9.680303327976356e-06, "loss": 0.5327, "step": 791 }, { "epoch": 0.57, "grad_norm": 6.167802022767416, "learning_rate": 9.679285679820628e-06, "loss": 0.501, "step": 792 }, { "epoch": 0.57, "grad_norm": 8.46849010922445, "learning_rate": 9.67826646822858e-06, "loss": 0.5239, "step": 793 }, { "epoch": 0.57, "grad_norm": 16.653596919202503, "learning_rate": 9.677245693540749e-06, "loss": 0.5176, "step": 794 }, { "epoch": 0.57, "grad_norm": 7.021405696985937, "learning_rate": 9.676223356098194e-06, "loss": 0.4546, "step": 795 }, { "epoch": 0.57, "grad_norm": 13.864820200236489, "learning_rate": 9.675199456242499e-06, "loss": 0.5151, "step": 796 }, { "epoch": 0.57, "grad_norm": 9.137355785760509, "learning_rate": 9.674173994315764e-06, "loss": 0.6768, "step": 797 }, { "epoch": 0.57, "grad_norm": 8.117836656138127, "learning_rate": 9.67314697066062e-06, "loss": 0.542, "step": 798 }, { "epoch": 0.57, "grad_norm": 7.298051983963956, "learning_rate": 9.672118385620209e-06, "loss": 0.4927, "step": 799 }, { "epoch": 0.57, "grad_norm": 24.163165582397156, "learning_rate": 9.671088239538204e-06, "loss": 0.7588, "step": 800 }, { "epoch": 0.57, "grad_norm": 9.215200611680789, "learning_rate": 9.670056532758798e-06, "loss": 0.5474, "step": 801 }, { "epoch": 0.57, "grad_norm": 10.42334654362743, "learning_rate": 9.669023265626698e-06, "loss": 0.6289, "step": 802 }, { "epoch": 0.57, "grad_norm": 14.141287911310176, "learning_rate": 9.66798843848714e-06, "loss": 0.4873, "step": 803 }, { "epoch": 0.57, "grad_norm": 14.552342787875665, "learning_rate": 9.666952051685882e-06, "loss": 0.4731, "step": 804 }, { "epoch": 0.57, "grad_norm": 9.755001186494166, "learning_rate": 9.665914105569196e-06, "loss": 0.5591, "step": 805 }, { "epoch": 0.58, "grad_norm": 11.425217131212628, "learning_rate": 9.664874600483883e-06, "loss": 0.5239, "step": 806 }, { "epoch": 0.58, "grad_norm": 13.638565826764706, "learning_rate": 9.663833536777256e-06, "loss": 0.5005, "step": 807 }, { "epoch": 0.58, "grad_norm": 8.967345411360103, "learning_rate": 9.662790914797158e-06, "loss": 0.6133, "step": 808 }, { "epoch": 0.58, "grad_norm": 16.774853818643642, "learning_rate": 9.661746734891947e-06, "loss": 0.5449, "step": 809 }, { "epoch": 0.58, "grad_norm": 12.358167390535703, "learning_rate": 9.6607009974105e-06, "loss": 0.5386, "step": 810 }, { "epoch": 0.58, "grad_norm": 7.443902186847719, "learning_rate": 9.659653702702223e-06, "loss": 0.519, "step": 811 }, { "epoch": 0.58, "grad_norm": 20.581607014220506, "learning_rate": 9.658604851117032e-06, "loss": 0.6064, "step": 812 }, { "epoch": 0.58, "grad_norm": 20.10227881796404, "learning_rate": 9.65755444300537e-06, "loss": 0.5649, "step": 813 }, { "epoch": 0.58, "grad_norm": 13.936458119810688, "learning_rate": 9.656502478718197e-06, "loss": 0.5459, "step": 814 }, { "epoch": 0.58, "grad_norm": 19.720426589157082, "learning_rate": 9.655448958606994e-06, "loss": 0.6309, "step": 815 }, { "epoch": 0.58, "grad_norm": 9.062061943723108, "learning_rate": 9.654393883023763e-06, "loss": 0.5693, "step": 816 }, { "epoch": 0.58, "grad_norm": 11.539927314427118, "learning_rate": 9.653337252321023e-06, "loss": 0.584, "step": 817 }, { "epoch": 0.58, "grad_norm": 20.01962400655284, "learning_rate": 9.652279066851811e-06, "loss": 0.5146, "step": 818 }, { "epoch": 0.58, "grad_norm": 6.278364471511755, "learning_rate": 9.651219326969694e-06, "loss": 0.4888, "step": 819 }, { "epoch": 0.59, "grad_norm": 11.862188944826272, "learning_rate": 9.650158033028743e-06, "loss": 0.5386, "step": 820 }, { "epoch": 0.59, "grad_norm": 9.55019661530497, "learning_rate": 9.64909518538356e-06, "loss": 0.4985, "step": 821 }, { "epoch": 0.59, "grad_norm": 6.736899314710898, "learning_rate": 9.648030784389264e-06, "loss": 0.5303, "step": 822 }, { "epoch": 0.59, "grad_norm": 5.366946707310364, "learning_rate": 9.646964830401487e-06, "loss": 0.6001, "step": 823 }, { "epoch": 0.59, "grad_norm": 10.13088503009922, "learning_rate": 9.645897323776386e-06, "loss": 0.5889, "step": 824 }, { "epoch": 0.59, "grad_norm": 14.30858943174471, "learning_rate": 9.644828264870634e-06, "loss": 0.5371, "step": 825 }, { "epoch": 0.59, "grad_norm": 12.315173886504947, "learning_rate": 9.643757654041423e-06, "loss": 0.5508, "step": 826 }, { "epoch": 0.59, "grad_norm": 6.679272046818679, "learning_rate": 9.642685491646467e-06, "loss": 0.481, "step": 827 }, { "epoch": 0.59, "grad_norm": 7.93076938329635, "learning_rate": 9.641611778043992e-06, "loss": 0.5361, "step": 828 }, { "epoch": 0.59, "grad_norm": 11.131118051844421, "learning_rate": 9.64053651359275e-06, "loss": 0.4561, "step": 829 }, { "epoch": 0.59, "grad_norm": 16.845626382905177, "learning_rate": 9.639459698652e-06, "loss": 0.5811, "step": 830 }, { "epoch": 0.59, "grad_norm": 7.452226118644797, "learning_rate": 9.63838133358153e-06, "loss": 0.5049, "step": 831 }, { "epoch": 0.59, "grad_norm": 13.715195201767449, "learning_rate": 9.637301418741643e-06, "loss": 0.5732, "step": 832 }, { "epoch": 0.59, "grad_norm": 5.948105431930284, "learning_rate": 9.636219954493157e-06, "loss": 0.5264, "step": 833 }, { "epoch": 0.6, "grad_norm": 6.493455666198669, "learning_rate": 9.635136941197409e-06, "loss": 0.5498, "step": 834 }, { "epoch": 0.6, "grad_norm": 6.154843832131087, "learning_rate": 9.634052379216256e-06, "loss": 0.5889, "step": 835 }, { "epoch": 0.6, "grad_norm": 5.9733407722182035, "learning_rate": 9.632966268912067e-06, "loss": 0.5796, "step": 836 }, { "epoch": 0.6, "grad_norm": 13.393897880638866, "learning_rate": 9.631878610647734e-06, "loss": 0.5762, "step": 837 }, { "epoch": 0.6, "grad_norm": 9.346535492939477, "learning_rate": 9.630789404786664e-06, "loss": 0.5845, "step": 838 }, { "epoch": 0.6, "grad_norm": 7.414814582454439, "learning_rate": 9.629698651692779e-06, "loss": 0.519, "step": 839 }, { "epoch": 0.6, "grad_norm": 7.027989103111617, "learning_rate": 9.62860635173052e-06, "loss": 0.5229, "step": 840 }, { "epoch": 0.6, "grad_norm": 9.691852570057074, "learning_rate": 9.627512505264847e-06, "loss": 0.54, "step": 841 }, { "epoch": 0.6, "grad_norm": 13.672266244948228, "learning_rate": 9.626417112661233e-06, "loss": 0.501, "step": 842 }, { "epoch": 0.6, "grad_norm": 14.046157418465752, "learning_rate": 9.62532017428567e-06, "loss": 0.5464, "step": 843 }, { "epoch": 0.6, "grad_norm": 5.894528680541082, "learning_rate": 9.624221690504663e-06, "loss": 0.5454, "step": 844 }, { "epoch": 0.6, "grad_norm": 19.359046304477772, "learning_rate": 9.623121661685239e-06, "loss": 0.5347, "step": 845 }, { "epoch": 0.6, "grad_norm": 9.931630197988305, "learning_rate": 9.622020088194934e-06, "loss": 0.5518, "step": 846 }, { "epoch": 0.6, "grad_norm": 7.126175546600982, "learning_rate": 9.62091697040181e-06, "loss": 0.5073, "step": 847 }, { "epoch": 0.61, "grad_norm": 6.424497735342813, "learning_rate": 9.619812308674434e-06, "loss": 0.5981, "step": 848 }, { "epoch": 0.61, "grad_norm": 13.429709124263976, "learning_rate": 9.618706103381896e-06, "loss": 0.5327, "step": 849 }, { "epoch": 0.61, "grad_norm": 6.370766522473178, "learning_rate": 9.6175983548938e-06, "loss": 0.5356, "step": 850 }, { "epoch": 0.61, "grad_norm": 7.936351360877747, "learning_rate": 9.616489063580265e-06, "loss": 0.6279, "step": 851 }, { "epoch": 0.61, "grad_norm": 13.565406309781352, "learning_rate": 9.615378229811927e-06, "loss": 0.498, "step": 852 }, { "epoch": 0.61, "grad_norm": 14.85975243185665, "learning_rate": 9.614265853959935e-06, "loss": 0.5449, "step": 853 }, { "epoch": 0.61, "grad_norm": 9.376637510131717, "learning_rate": 9.613151936395952e-06, "loss": 0.6953, "step": 854 }, { "epoch": 0.61, "grad_norm": 5.639153075645197, "learning_rate": 9.612036477492163e-06, "loss": 0.5469, "step": 855 }, { "epoch": 0.61, "grad_norm": 23.55427631393824, "learning_rate": 9.610919477621262e-06, "loss": 0.5225, "step": 856 }, { "epoch": 0.61, "grad_norm": 16.272759938860226, "learning_rate": 9.609800937156459e-06, "loss": 0.6147, "step": 857 }, { "epoch": 0.61, "grad_norm": 5.7353396751586585, "learning_rate": 9.60868085647148e-06, "loss": 0.6284, "step": 858 }, { "epoch": 0.61, "grad_norm": 8.677161643210795, "learning_rate": 9.607559235940562e-06, "loss": 0.4409, "step": 859 }, { "epoch": 0.61, "grad_norm": 5.674513654587285, "learning_rate": 9.60643607593846e-06, "loss": 0.5229, "step": 860 }, { "epoch": 0.61, "grad_norm": 5.208902038096937, "learning_rate": 9.605311376840446e-06, "loss": 0.4419, "step": 861 }, { "epoch": 0.62, "grad_norm": 7.200913020665042, "learning_rate": 9.604185139022302e-06, "loss": 0.5039, "step": 862 }, { "epoch": 0.62, "grad_norm": 13.88962045733014, "learning_rate": 9.603057362860323e-06, "loss": 0.6357, "step": 863 }, { "epoch": 0.62, "grad_norm": 15.58000300790741, "learning_rate": 9.60192804873132e-06, "loss": 0.4673, "step": 864 }, { "epoch": 0.62, "grad_norm": 5.975171350200098, "learning_rate": 9.60079719701262e-06, "loss": 0.5605, "step": 865 }, { "epoch": 0.62, "grad_norm": 5.2559604366098025, "learning_rate": 9.599664808082058e-06, "loss": 0.5229, "step": 866 }, { "epoch": 0.62, "grad_norm": 30.607460221946777, "learning_rate": 9.598530882317992e-06, "loss": 0.7324, "step": 867 }, { "epoch": 0.62, "grad_norm": 16.12508471498375, "learning_rate": 9.59739542009928e-06, "loss": 0.6279, "step": 868 }, { "epoch": 0.62, "grad_norm": 7.213276965176858, "learning_rate": 9.596258421805306e-06, "loss": 0.54, "step": 869 }, { "epoch": 0.62, "grad_norm": 8.939321698210003, "learning_rate": 9.595119887815962e-06, "loss": 0.5654, "step": 870 }, { "epoch": 0.62, "grad_norm": 7.54597463715274, "learning_rate": 9.593979818511655e-06, "loss": 0.5391, "step": 871 }, { "epoch": 0.62, "grad_norm": 8.796494601908567, "learning_rate": 9.592838214273298e-06, "loss": 0.6221, "step": 872 }, { "epoch": 0.62, "grad_norm": 16.350816902914385, "learning_rate": 9.591695075482326e-06, "loss": 0.5195, "step": 873 }, { "epoch": 0.62, "grad_norm": 16.356102942266837, "learning_rate": 9.590550402520683e-06, "loss": 0.5361, "step": 874 }, { "epoch": 0.62, "grad_norm": 4.949881570510586, "learning_rate": 9.589404195770821e-06, "loss": 0.5151, "step": 875 }, { "epoch": 0.63, "grad_norm": 7.926146321546089, "learning_rate": 9.588256455615716e-06, "loss": 0.5801, "step": 876 }, { "epoch": 0.63, "grad_norm": 6.859921527227033, "learning_rate": 9.587107182438846e-06, "loss": 0.5, "step": 877 }, { "epoch": 0.63, "grad_norm": 11.050613799522687, "learning_rate": 9.585956376624204e-06, "loss": 0.5527, "step": 878 }, { "epoch": 0.63, "grad_norm": 14.002907445909361, "learning_rate": 9.584804038556297e-06, "loss": 0.6289, "step": 879 }, { "epoch": 0.63, "grad_norm": 6.0542998877576375, "learning_rate": 9.58365016862014e-06, "loss": 0.4893, "step": 880 }, { "epoch": 0.63, "grad_norm": 16.56578174324824, "learning_rate": 9.582494767201265e-06, "loss": 0.5493, "step": 881 }, { "epoch": 0.63, "grad_norm": 5.301104366990973, "learning_rate": 9.581337834685713e-06, "loss": 0.4365, "step": 882 }, { "epoch": 0.63, "grad_norm": 21.230580902888516, "learning_rate": 9.580179371460034e-06, "loss": 0.5376, "step": 883 }, { "epoch": 0.63, "grad_norm": 9.115363175931497, "learning_rate": 9.579019377911296e-06, "loss": 0.5518, "step": 884 }, { "epoch": 0.63, "grad_norm": 8.063043768232241, "learning_rate": 9.57785785442707e-06, "loss": 0.5142, "step": 885 }, { "epoch": 0.63, "grad_norm": 8.200092331472336, "learning_rate": 9.576694801395447e-06, "loss": 0.4912, "step": 886 }, { "epoch": 0.63, "grad_norm": 16.048503793582384, "learning_rate": 9.57553021920502e-06, "loss": 0.708, "step": 887 }, { "epoch": 0.63, "grad_norm": 7.183349614718361, "learning_rate": 9.574364108244903e-06, "loss": 0.459, "step": 888 }, { "epoch": 0.63, "grad_norm": 7.786011428361001, "learning_rate": 9.573196468904711e-06, "loss": 0.5586, "step": 889 }, { "epoch": 0.64, "grad_norm": 12.666551982517106, "learning_rate": 9.572027301574576e-06, "loss": 0.6201, "step": 890 }, { "epoch": 0.64, "grad_norm": 9.989372858349427, "learning_rate": 9.570856606645139e-06, "loss": 0.543, "step": 891 }, { "epoch": 0.64, "grad_norm": 6.978420031263653, "learning_rate": 9.569684384507547e-06, "loss": 0.4585, "step": 892 }, { "epoch": 0.64, "grad_norm": 6.494441221850212, "learning_rate": 9.568510635553466e-06, "loss": 0.5176, "step": 893 }, { "epoch": 0.64, "grad_norm": 7.496868914068109, "learning_rate": 9.567335360175065e-06, "loss": 0.5283, "step": 894 }, { "epoch": 0.64, "grad_norm": 19.82256439758004, "learning_rate": 9.566158558765026e-06, "loss": 0.6777, "step": 895 }, { "epoch": 0.64, "grad_norm": 8.172352317808183, "learning_rate": 9.564980231716541e-06, "loss": 0.5371, "step": 896 }, { "epoch": 0.64, "grad_norm": 7.641915716101109, "learning_rate": 9.56380037942331e-06, "loss": 0.5103, "step": 897 }, { "epoch": 0.64, "grad_norm": 10.279754257232414, "learning_rate": 9.562619002279541e-06, "loss": 0.5737, "step": 898 }, { "epoch": 0.64, "grad_norm": 10.850853767539375, "learning_rate": 9.561436100679959e-06, "loss": 0.5244, "step": 899 }, { "epoch": 0.64, "grad_norm": 7.349447369826582, "learning_rate": 9.56025167501979e-06, "loss": 0.4907, "step": 900 }, { "epoch": 0.64, "grad_norm": 15.819552722595972, "learning_rate": 9.559065725694775e-06, "loss": 0.6021, "step": 901 }, { "epoch": 0.64, "grad_norm": 18.09721976041659, "learning_rate": 9.55787825310116e-06, "loss": 0.5762, "step": 902 }, { "epoch": 0.64, "grad_norm": 6.611541858548501, "learning_rate": 9.5566892576357e-06, "loss": 0.4741, "step": 903 }, { "epoch": 0.65, "grad_norm": 11.17797250965005, "learning_rate": 9.555498739695665e-06, "loss": 0.6016, "step": 904 }, { "epoch": 0.65, "grad_norm": 15.107430166339677, "learning_rate": 9.554306699678827e-06, "loss": 0.5166, "step": 905 }, { "epoch": 0.65, "grad_norm": 7.571816321971257, "learning_rate": 9.553113137983467e-06, "loss": 0.5151, "step": 906 }, { "epoch": 0.65, "grad_norm": 7.8343904045849095, "learning_rate": 9.551918055008378e-06, "loss": 0.5376, "step": 907 }, { "epoch": 0.65, "grad_norm": 11.669597503208232, "learning_rate": 9.55072145115286e-06, "loss": 0.5171, "step": 908 }, { "epoch": 0.65, "grad_norm": 9.787777933655681, "learning_rate": 9.54952332681672e-06, "loss": 0.5527, "step": 909 }, { "epoch": 0.65, "grad_norm": 7.9497521719036, "learning_rate": 9.54832368240027e-06, "loss": 0.4326, "step": 910 }, { "epoch": 0.65, "grad_norm": 10.51362496368161, "learning_rate": 9.54712251830434e-06, "loss": 0.4834, "step": 911 }, { "epoch": 0.65, "grad_norm": 17.840346790600776, "learning_rate": 9.545919834930257e-06, "loss": 0.4854, "step": 912 }, { "epoch": 0.65, "grad_norm": 13.48508364031533, "learning_rate": 9.54471563267986e-06, "loss": 0.6523, "step": 913 }, { "epoch": 0.65, "grad_norm": 12.107481431790621, "learning_rate": 9.543509911955497e-06, "loss": 0.5186, "step": 914 }, { "epoch": 0.65, "grad_norm": 9.554933322462405, "learning_rate": 9.542302673160021e-06, "loss": 0.5552, "step": 915 }, { "epoch": 0.65, "grad_norm": 9.359795062769347, "learning_rate": 9.541093916696793e-06, "loss": 0.6514, "step": 916 }, { "epoch": 0.65, "grad_norm": 7.933210596647848, "learning_rate": 9.539883642969681e-06, "loss": 0.4824, "step": 917 }, { "epoch": 0.66, "grad_norm": 12.030646602797862, "learning_rate": 9.53867185238306e-06, "loss": 0.5791, "step": 918 }, { "epoch": 0.66, "grad_norm": 10.531996572790305, "learning_rate": 9.53745854534181e-06, "loss": 0.542, "step": 919 }, { "epoch": 0.66, "grad_norm": 18.737722994826893, "learning_rate": 9.536243722251321e-06, "loss": 0.6357, "step": 920 }, { "epoch": 0.66, "grad_norm": 11.176392594106526, "learning_rate": 9.53502738351749e-06, "loss": 0.5479, "step": 921 }, { "epoch": 0.66, "grad_norm": 9.791505085435352, "learning_rate": 9.533809529546716e-06, "loss": 0.5146, "step": 922 }, { "epoch": 0.66, "grad_norm": 11.194713334314685, "learning_rate": 9.532590160745906e-06, "loss": 0.5542, "step": 923 }, { "epoch": 0.66, "grad_norm": 21.304141717636043, "learning_rate": 9.531369277522475e-06, "loss": 0.4966, "step": 924 }, { "epoch": 0.66, "grad_norm": 8.781951810922592, "learning_rate": 9.530146880284343e-06, "loss": 0.5664, "step": 925 }, { "epoch": 0.66, "grad_norm": 11.045579974558065, "learning_rate": 9.528922969439935e-06, "loss": 0.5073, "step": 926 }, { "epoch": 0.66, "grad_norm": 9.68357233374754, "learning_rate": 9.527697545398183e-06, "loss": 0.5542, "step": 927 }, { "epoch": 0.66, "grad_norm": 15.122021239823766, "learning_rate": 9.526470608568521e-06, "loss": 0.5415, "step": 928 }, { "epoch": 0.66, "grad_norm": 6.454918266617198, "learning_rate": 9.525242159360897e-06, "loss": 0.5361, "step": 929 }, { "epoch": 0.66, "grad_norm": 8.493825899265097, "learning_rate": 9.524012198185755e-06, "loss": 0.7109, "step": 930 }, { "epoch": 0.66, "grad_norm": 5.636191715860241, "learning_rate": 9.522780725454048e-06, "loss": 0.5635, "step": 931 }, { "epoch": 0.67, "grad_norm": 8.967666658359168, "learning_rate": 9.521547741577232e-06, "loss": 0.5757, "step": 932 }, { "epoch": 0.67, "grad_norm": 5.728302127565175, "learning_rate": 9.520313246967277e-06, "loss": 0.6177, "step": 933 }, { "epoch": 0.67, "grad_norm": 9.518191584124642, "learning_rate": 9.519077242036643e-06, "loss": 0.6094, "step": 934 }, { "epoch": 0.67, "grad_norm": 7.997331318486447, "learning_rate": 9.517839727198306e-06, "loss": 0.5352, "step": 935 }, { "epoch": 0.67, "grad_norm": 9.08940793882954, "learning_rate": 9.516600702865742e-06, "loss": 0.5396, "step": 936 }, { "epoch": 0.67, "grad_norm": 6.3323126968670005, "learning_rate": 9.51536016945293e-06, "loss": 0.5122, "step": 937 }, { "epoch": 0.67, "grad_norm": 7.266058320003383, "learning_rate": 9.514118127374358e-06, "loss": 0.5044, "step": 938 }, { "epoch": 0.67, "grad_norm": 7.08730151307726, "learning_rate": 9.512874577045016e-06, "loss": 0.5518, "step": 939 }, { "epoch": 0.67, "grad_norm": 13.676561438718075, "learning_rate": 9.511629518880394e-06, "loss": 0.5298, "step": 940 }, { "epoch": 0.67, "grad_norm": 17.411348317275475, "learning_rate": 9.510382953296492e-06, "loss": 0.4536, "step": 941 }, { "epoch": 0.67, "grad_norm": 44.62122780743628, "learning_rate": 9.50913488070981e-06, "loss": 0.6172, "step": 942 }, { "epoch": 0.67, "grad_norm": 228.46722688523354, "learning_rate": 9.50788530153735e-06, "loss": 0.8828, "step": 943 }, { "epoch": 0.67, "grad_norm": 109.64473438367058, "learning_rate": 9.506634216196621e-06, "loss": 0.9453, "step": 944 }, { "epoch": 0.67, "grad_norm": 78.65828191559729, "learning_rate": 9.505381625105636e-06, "loss": 0.9453, "step": 945 }, { "epoch": 0.68, "grad_norm": 51.832204433461676, "learning_rate": 9.504127528682907e-06, "loss": 0.6997, "step": 946 }, { "epoch": 0.68, "grad_norm": 31.13063318378493, "learning_rate": 9.502871927347452e-06, "loss": 0.6172, "step": 947 }, { "epoch": 0.68, "grad_norm": 14.291137372507908, "learning_rate": 9.501614821518789e-06, "loss": 0.5537, "step": 948 }, { "epoch": 0.68, "grad_norm": 23.46365835435606, "learning_rate": 9.500356211616941e-06, "loss": 0.6475, "step": 949 }, { "epoch": 0.68, "grad_norm": 32.29740862656474, "learning_rate": 9.499096098062435e-06, "loss": 0.6709, "step": 950 }, { "epoch": 0.68, "grad_norm": 63.097784255629676, "learning_rate": 9.497834481276293e-06, "loss": 0.6182, "step": 951 }, { "epoch": 0.68, "grad_norm": 23.902799208094248, "learning_rate": 9.496571361680052e-06, "loss": 0.5908, "step": 952 }, { "epoch": 0.68, "grad_norm": 16.627462349415925, "learning_rate": 9.495306739695738e-06, "loss": 0.6055, "step": 953 }, { "epoch": 0.68, "grad_norm": 8.838239735724807, "learning_rate": 9.494040615745887e-06, "loss": 0.5063, "step": 954 }, { "epoch": 0.68, "grad_norm": 37.579178855565374, "learning_rate": 9.492772990253535e-06, "loss": 0.6553, "step": 955 }, { "epoch": 0.68, "grad_norm": 12.534070316127274, "learning_rate": 9.49150386364222e-06, "loss": 0.5283, "step": 956 }, { "epoch": 0.68, "grad_norm": 18.81207673614865, "learning_rate": 9.490233236335977e-06, "loss": 0.5747, "step": 957 }, { "epoch": 0.68, "grad_norm": 35.2501349954626, "learning_rate": 9.488961108759349e-06, "loss": 0.5806, "step": 958 }, { "epoch": 0.68, "grad_norm": 24.779299324475687, "learning_rate": 9.487687481337377e-06, "loss": 0.6221, "step": 959 }, { "epoch": 0.69, "grad_norm": 10.734399861098689, "learning_rate": 9.486412354495605e-06, "loss": 0.563, "step": 960 }, { "epoch": 0.69, "grad_norm": 14.688681103139775, "learning_rate": 9.485135728660073e-06, "loss": 0.5747, "step": 961 }, { "epoch": 0.69, "grad_norm": 15.39443389421036, "learning_rate": 9.48385760425733e-06, "loss": 0.5088, "step": 962 }, { "epoch": 0.69, "grad_norm": 18.663199374468476, "learning_rate": 9.482577981714417e-06, "loss": 0.6211, "step": 963 }, { "epoch": 0.69, "grad_norm": 12.818151185370693, "learning_rate": 9.481296861458881e-06, "loss": 0.5361, "step": 964 }, { "epoch": 0.69, "grad_norm": 21.38553722751277, "learning_rate": 9.480014243918769e-06, "loss": 0.6396, "step": 965 }, { "epoch": 0.69, "grad_norm": 11.402379274239996, "learning_rate": 9.478730129522627e-06, "loss": 0.5635, "step": 966 }, { "epoch": 0.69, "grad_norm": 12.858863875511425, "learning_rate": 9.477444518699501e-06, "loss": 0.6465, "step": 967 }, { "epoch": 0.69, "grad_norm": 15.572804809224767, "learning_rate": 9.476157411878937e-06, "loss": 0.5537, "step": 968 }, { "epoch": 0.69, "grad_norm": 30.627062162450095, "learning_rate": 9.474868809490984e-06, "loss": 0.6309, "step": 969 }, { "epoch": 0.69, "grad_norm": 19.284249577261523, "learning_rate": 9.473578711966185e-06, "loss": 0.6641, "step": 970 }, { "epoch": 0.69, "grad_norm": 9.635613200387112, "learning_rate": 9.472287119735588e-06, "loss": 0.5928, "step": 971 }, { "epoch": 0.69, "grad_norm": 16.198639315955063, "learning_rate": 9.470994033230735e-06, "loss": 0.521, "step": 972 }, { "epoch": 0.69, "grad_norm": 17.11429585253634, "learning_rate": 9.469699452883672e-06, "loss": 0.623, "step": 973 }, { "epoch": 0.7, "grad_norm": 58.34165469496508, "learning_rate": 9.468403379126943e-06, "loss": 0.6406, "step": 974 }, { "epoch": 0.7, "grad_norm": 25.195532099505844, "learning_rate": 9.46710581239359e-06, "loss": 0.6553, "step": 975 }, { "epoch": 0.7, "grad_norm": 23.180951301910945, "learning_rate": 9.465806753117153e-06, "loss": 0.6494, "step": 976 }, { "epoch": 0.7, "grad_norm": 14.047197342311845, "learning_rate": 9.464506201731674e-06, "loss": 0.6182, "step": 977 }, { "epoch": 0.7, "grad_norm": 19.207255751064135, "learning_rate": 9.463204158671687e-06, "loss": 0.561, "step": 978 }, { "epoch": 0.7, "grad_norm": 14.684448206775564, "learning_rate": 9.461900624372233e-06, "loss": 0.6177, "step": 979 }, { "epoch": 0.7, "grad_norm": 20.50000538321963, "learning_rate": 9.460595599268848e-06, "loss": 0.6416, "step": 980 }, { "epoch": 0.7, "grad_norm": 14.660593164014767, "learning_rate": 9.45928908379756e-06, "loss": 0.5166, "step": 981 }, { "epoch": 0.7, "grad_norm": 21.206357340067083, "learning_rate": 9.457981078394905e-06, "loss": 0.6357, "step": 982 }, { "epoch": 0.7, "grad_norm": 11.40715091937378, "learning_rate": 9.45667158349791e-06, "loss": 0.5601, "step": 983 }, { "epoch": 0.7, "grad_norm": 36.68503954323254, "learning_rate": 9.4553605995441e-06, "loss": 0.6162, "step": 984 }, { "epoch": 0.7, "grad_norm": 15.636828128912919, "learning_rate": 9.4540481269715e-06, "loss": 0.6436, "step": 985 }, { "epoch": 0.7, "grad_norm": 18.970530099676225, "learning_rate": 9.452734166218635e-06, "loss": 0.6279, "step": 986 }, { "epoch": 0.7, "grad_norm": 11.10424986097931, "learning_rate": 9.451418717724518e-06, "loss": 0.5771, "step": 987 }, { "epoch": 0.71, "grad_norm": 19.04133879032038, "learning_rate": 9.45010178192867e-06, "loss": 0.5747, "step": 988 }, { "epoch": 0.71, "grad_norm": 11.337405957274404, "learning_rate": 9.448783359271102e-06, "loss": 0.6104, "step": 989 }, { "epoch": 0.71, "grad_norm": 13.704100149151028, "learning_rate": 9.44746345019232e-06, "loss": 0.5547, "step": 990 }, { "epoch": 0.71, "grad_norm": 23.243738749349486, "learning_rate": 9.446142055133333e-06, "loss": 0.7139, "step": 991 }, { "epoch": 0.71, "grad_norm": 23.327598910169833, "learning_rate": 9.444819174535647e-06, "loss": 0.7104, "step": 992 }, { "epoch": 0.71, "grad_norm": 10.328880835372676, "learning_rate": 9.443494808841255e-06, "loss": 0.5815, "step": 993 }, { "epoch": 0.71, "grad_norm": 20.53709813505537, "learning_rate": 9.442168958492657e-06, "loss": 0.6484, "step": 994 }, { "epoch": 0.71, "grad_norm": 9.46766757628474, "learning_rate": 9.44084162393284e-06, "loss": 0.5952, "step": 995 }, { "epoch": 0.71, "grad_norm": 15.838152911509205, "learning_rate": 9.439512805605294e-06, "loss": 0.5791, "step": 996 }, { "epoch": 0.71, "grad_norm": 10.976704798054344, "learning_rate": 9.438182503954002e-06, "loss": 0.6211, "step": 997 }, { "epoch": 0.71, "grad_norm": 13.574758990726995, "learning_rate": 9.43685071942344e-06, "loss": 0.5728, "step": 998 }, { "epoch": 0.71, "grad_norm": 10.175433896531516, "learning_rate": 9.435517452458584e-06, "loss": 0.5, "step": 999 }, { "epoch": 0.71, "grad_norm": 23.327701848585804, "learning_rate": 9.434182703504904e-06, "loss": 0.6045, "step": 1000 }, { "epoch": 0.71, "eval_avg_AUC": 0.6339471196716615, "eval_avg_Accuracy": 0.6359830901856764, "eval_avg_Accuracy-right": 0.9985000652145559, "eval_avg_Accuracy-wrong": 0.003866272458494428, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.5146999788919214, "eval_last_AUC": 0.6325870765603462, "eval_last_Accuracy": 0.6397546419098143, "eval_last_Accuracy-right": 0.9924351115168906, "eval_last_Accuracy-wrong": 0.024789629292699566, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.5155307886197232, "eval_max_AUC": 0.6025879470270927, "eval_max_Accuracy": 0.6355271883289124, "eval_max_Accuracy-right": 0.9996087126646668, "eval_max_Accuracy-wrong": 0.0006822833750284285, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.5068662445848651, "eval_min_AUC": 0.6212461473800316, "eval_min_Accuracy": 0.640376326259947, "eval_min_Accuracy-right": 0.9918481805138907, "eval_min_Accuracy-wrong": 0.027518762792813282, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.5093913468479994, "eval_prod_AUC": 0.5978007803736609, "eval_prod_Accuracy": 0.38884283819628646, "eval_prod_Accuracy-right": 0.04995434981087779, "eval_prod_Accuracy-wrong": 0.9797589265408233, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.5021156954591185, "eval_runtime": 249.2562, "eval_samples_per_second": 96.8, "eval_steps_per_second": 3.025, "eval_sum_AUC": 0.4675143189480929, "eval_sum_Accuracy": 0.635651525198939, "eval_sum_Accuracy-right": 1.0, "eval_sum_Accuracy-wrong": 0.0003411416875142142, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.5161108860078178, "step": 1000 }, { "epoch": 0.71, "grad_norm": 23.82364706060358, "learning_rate": 9.432846473008363e-06, "loss": 0.5806, "step": 1001 }, { "epoch": 0.72, "grad_norm": 27.37121267396446, "learning_rate": 9.431508761415422e-06, "loss": 0.6816, "step": 1002 }, { "epoch": 0.72, "grad_norm": 13.853525294894828, "learning_rate": 9.430169569173034e-06, "loss": 0.583, "step": 1003 }, { "epoch": 0.72, "grad_norm": 15.775963932541249, "learning_rate": 9.428828896728645e-06, "loss": 0.5537, "step": 1004 }, { "epoch": 0.72, "grad_norm": 9.72952999413491, "learning_rate": 9.427486744530205e-06, "loss": 0.5576, "step": 1005 }, { "epoch": 0.72, "grad_norm": 20.721816684636078, "learning_rate": 9.426143113026147e-06, "loss": 0.6729, "step": 1006 }, { "epoch": 0.72, "grad_norm": 12.544564691174461, "learning_rate": 9.424798002665405e-06, "loss": 0.666, "step": 1007 }, { "epoch": 0.72, "grad_norm": 41.719641221253966, "learning_rate": 9.423451413897406e-06, "loss": 0.6758, "step": 1008 }, { "epoch": 0.72, "grad_norm": 17.0766374680692, "learning_rate": 9.42210334717207e-06, "loss": 0.6533, "step": 1009 }, { "epoch": 0.72, "grad_norm": 19.964130971436024, "learning_rate": 9.42075380293981e-06, "loss": 0.6338, "step": 1010 }, { "epoch": 0.72, "grad_norm": 16.093738265546637, "learning_rate": 9.419402781651537e-06, "loss": 0.5977, "step": 1011 }, { "epoch": 0.72, "grad_norm": 16.29527691970946, "learning_rate": 9.418050283758647e-06, "loss": 0.6133, "step": 1012 }, { "epoch": 0.72, "grad_norm": 14.094752942831983, "learning_rate": 9.416696309713038e-06, "loss": 0.6084, "step": 1013 }, { "epoch": 0.72, "grad_norm": 19.37532956112354, "learning_rate": 9.415340859967099e-06, "loss": 0.6182, "step": 1014 }, { "epoch": 0.72, "grad_norm": 13.71031567566946, "learning_rate": 9.413983934973709e-06, "loss": 0.5996, "step": 1015 }, { "epoch": 0.73, "grad_norm": 15.269730904625384, "learning_rate": 9.412625535186242e-06, "loss": 0.5771, "step": 1016 }, { "epoch": 0.73, "grad_norm": 28.656034482685637, "learning_rate": 9.411265661058565e-06, "loss": 0.6465, "step": 1017 }, { "epoch": 0.73, "grad_norm": 42.68689422622165, "learning_rate": 9.409904313045038e-06, "loss": 0.6816, "step": 1018 }, { "epoch": 0.73, "grad_norm": 9.650685008374277, "learning_rate": 9.408541491600511e-06, "loss": 0.5723, "step": 1019 }, { "epoch": 0.73, "grad_norm": 18.730637303407523, "learning_rate": 9.407177197180328e-06, "loss": 0.5962, "step": 1020 }, { "epoch": 0.73, "grad_norm": 26.648761689749307, "learning_rate": 9.405811430240329e-06, "loss": 0.6924, "step": 1021 }, { "epoch": 0.73, "grad_norm": 14.798406394080896, "learning_rate": 9.404444191236837e-06, "loss": 0.5791, "step": 1022 }, { "epoch": 0.73, "grad_norm": 14.329130838909766, "learning_rate": 9.403075480626674e-06, "loss": 0.5547, "step": 1023 }, { "epoch": 0.73, "grad_norm": 14.655850010883231, "learning_rate": 9.401705298867151e-06, "loss": 0.5957, "step": 1024 }, { "epoch": 0.73, "grad_norm": 17.47268243968311, "learning_rate": 9.400333646416073e-06, "loss": 0.6396, "step": 1025 }, { "epoch": 0.73, "grad_norm": 26.170896114421865, "learning_rate": 9.398960523731735e-06, "loss": 0.6133, "step": 1026 }, { "epoch": 0.73, "grad_norm": 23.953028885048408, "learning_rate": 9.397585931272919e-06, "loss": 0.5537, "step": 1027 }, { "epoch": 0.73, "grad_norm": 10.88531583467331, "learning_rate": 9.396209869498905e-06, "loss": 0.5498, "step": 1028 }, { "epoch": 0.73, "grad_norm": 9.586608224218812, "learning_rate": 9.39483233886946e-06, "loss": 0.4663, "step": 1029 }, { "epoch": 0.74, "grad_norm": 11.459922262332075, "learning_rate": 9.393453339844842e-06, "loss": 0.5684, "step": 1030 }, { "epoch": 0.74, "grad_norm": 18.9938727234055, "learning_rate": 9.392072872885802e-06, "loss": 0.5859, "step": 1031 }, { "epoch": 0.74, "grad_norm": 17.95231020980077, "learning_rate": 9.39069093845358e-06, "loss": 0.5669, "step": 1032 }, { "epoch": 0.74, "grad_norm": 13.451010224992702, "learning_rate": 9.389307537009902e-06, "loss": 0.6187, "step": 1033 }, { "epoch": 0.74, "grad_norm": 15.835554267305948, "learning_rate": 9.387922669016992e-06, "loss": 0.6094, "step": 1034 }, { "epoch": 0.74, "grad_norm": 13.225942002568258, "learning_rate": 9.386536334937557e-06, "loss": 0.5293, "step": 1035 }, { "epoch": 0.74, "grad_norm": 19.28571736427067, "learning_rate": 9.385148535234799e-06, "loss": 0.5771, "step": 1036 }, { "epoch": 0.74, "grad_norm": 21.11918969171961, "learning_rate": 9.383759270372408e-06, "loss": 0.6396, "step": 1037 }, { "epoch": 0.74, "grad_norm": 10.987795594520582, "learning_rate": 9.382368540814563e-06, "loss": 0.5493, "step": 1038 }, { "epoch": 0.74, "grad_norm": 13.332762133791691, "learning_rate": 9.380976347025932e-06, "loss": 0.5781, "step": 1039 }, { "epoch": 0.74, "grad_norm": 13.833649187371895, "learning_rate": 9.379582689471671e-06, "loss": 0.5078, "step": 1040 }, { "epoch": 0.74, "grad_norm": 29.85600810006891, "learning_rate": 9.378187568617431e-06, "loss": 0.668, "step": 1041 }, { "epoch": 0.74, "grad_norm": 18.86118062820929, "learning_rate": 9.376790984929348e-06, "loss": 0.5869, "step": 1042 }, { "epoch": 0.74, "grad_norm": 28.051207820211953, "learning_rate": 9.37539293887404e-06, "loss": 0.6113, "step": 1043 }, { "epoch": 0.75, "grad_norm": 32.11996073913673, "learning_rate": 9.373993430918626e-06, "loss": 0.5068, "step": 1044 }, { "epoch": 0.75, "grad_norm": 17.79452829191096, "learning_rate": 9.372592461530708e-06, "loss": 0.5225, "step": 1045 }, { "epoch": 0.75, "grad_norm": 11.681808797361166, "learning_rate": 9.371190031178372e-06, "loss": 0.582, "step": 1046 }, { "epoch": 0.75, "grad_norm": 25.230590380011453, "learning_rate": 9.369786140330198e-06, "loss": 0.5239, "step": 1047 }, { "epoch": 0.75, "grad_norm": 42.558740499704996, "learning_rate": 9.368380789455251e-06, "loss": 0.519, "step": 1048 }, { "epoch": 0.75, "grad_norm": 13.297934374384019, "learning_rate": 9.36697397902309e-06, "loss": 0.5977, "step": 1049 }, { "epoch": 0.75, "grad_norm": 12.086643595178904, "learning_rate": 9.365565709503748e-06, "loss": 0.5469, "step": 1050 }, { "epoch": 0.75, "grad_norm": 10.056732327995272, "learning_rate": 9.364155981367761e-06, "loss": 0.4849, "step": 1051 }, { "epoch": 0.75, "grad_norm": 8.381781397718909, "learning_rate": 9.36274479508614e-06, "loss": 0.519, "step": 1052 }, { "epoch": 0.75, "grad_norm": 16.39451781815997, "learning_rate": 9.361332151130396e-06, "loss": 0.5249, "step": 1053 }, { "epoch": 0.75, "grad_norm": 15.967285956708139, "learning_rate": 9.359918049972512e-06, "loss": 0.5493, "step": 1054 }, { "epoch": 0.75, "grad_norm": 24.743475345880128, "learning_rate": 9.358502492084969e-06, "loss": 0.5654, "step": 1055 }, { "epoch": 0.75, "grad_norm": 13.646634416378777, "learning_rate": 9.35708547794073e-06, "loss": 0.5703, "step": 1056 }, { "epoch": 0.75, "grad_norm": 12.719559112795636, "learning_rate": 9.355667008013249e-06, "loss": 0.5825, "step": 1057 }, { "epoch": 0.76, "grad_norm": 21.823118073564967, "learning_rate": 9.354247082776459e-06, "loss": 0.5981, "step": 1058 }, { "epoch": 0.76, "grad_norm": 19.487103900742174, "learning_rate": 9.352825702704784e-06, "loss": 0.5532, "step": 1059 }, { "epoch": 0.76, "grad_norm": 8.077491438600466, "learning_rate": 9.351402868273136e-06, "loss": 0.5513, "step": 1060 }, { "epoch": 0.76, "grad_norm": 10.897696554119129, "learning_rate": 9.349978579956908e-06, "loss": 0.5938, "step": 1061 }, { "epoch": 0.76, "grad_norm": 7.206740424557369, "learning_rate": 9.348552838231983e-06, "loss": 0.5547, "step": 1062 }, { "epoch": 0.76, "grad_norm": 11.745893827810162, "learning_rate": 9.347125643574726e-06, "loss": 0.5117, "step": 1063 }, { "epoch": 0.76, "grad_norm": 6.609051578212978, "learning_rate": 9.345696996461992e-06, "loss": 0.5918, "step": 1064 }, { "epoch": 0.76, "grad_norm": 10.368231522386932, "learning_rate": 9.344266897371114e-06, "loss": 0.519, "step": 1065 }, { "epoch": 0.76, "grad_norm": 20.399140300952528, "learning_rate": 9.34283534677992e-06, "loss": 0.5483, "step": 1066 }, { "epoch": 0.76, "grad_norm": 12.568335263277644, "learning_rate": 9.341402345166714e-06, "loss": 0.6382, "step": 1067 }, { "epoch": 0.76, "grad_norm": 6.28361345060809, "learning_rate": 9.33996789301029e-06, "loss": 0.626, "step": 1068 }, { "epoch": 0.76, "grad_norm": 6.034588447871568, "learning_rate": 9.338531990789926e-06, "loss": 0.4995, "step": 1069 }, { "epoch": 0.76, "grad_norm": 13.260306318615566, "learning_rate": 9.33709463898538e-06, "loss": 0.5732, "step": 1070 }, { "epoch": 0.76, "grad_norm": 12.271709992114081, "learning_rate": 9.335655838076902e-06, "loss": 0.4492, "step": 1071 }, { "epoch": 0.77, "grad_norm": 7.56028285693857, "learning_rate": 9.33421558854522e-06, "loss": 0.5234, "step": 1072 }, { "epoch": 0.77, "grad_norm": 12.782154263459084, "learning_rate": 9.332773890871548e-06, "loss": 0.582, "step": 1073 }, { "epoch": 0.77, "grad_norm": 8.094570074636316, "learning_rate": 9.331330745537586e-06, "loss": 0.5386, "step": 1074 }, { "epoch": 0.77, "grad_norm": 20.10166588549232, "learning_rate": 9.329886153025513e-06, "loss": 0.5449, "step": 1075 }, { "epoch": 0.77, "grad_norm": 6.192408971954997, "learning_rate": 9.328440113817995e-06, "loss": 0.5259, "step": 1076 }, { "epoch": 0.77, "grad_norm": 12.208683818696056, "learning_rate": 9.326992628398182e-06, "loss": 0.5278, "step": 1077 }, { "epoch": 0.77, "grad_norm": 17.34674029850454, "learning_rate": 9.325543697249706e-06, "loss": 0.6328, "step": 1078 }, { "epoch": 0.77, "grad_norm": 11.912080486024484, "learning_rate": 9.324093320856679e-06, "loss": 0.5576, "step": 1079 }, { "epoch": 0.77, "grad_norm": 8.692872107879284, "learning_rate": 9.3226414997037e-06, "loss": 0.5107, "step": 1080 }, { "epoch": 0.77, "grad_norm": 12.965283753527137, "learning_rate": 9.32118823427585e-06, "loss": 0.6133, "step": 1081 }, { "epoch": 0.77, "grad_norm": 9.088812339526163, "learning_rate": 9.319733525058694e-06, "loss": 0.5312, "step": 1082 }, { "epoch": 0.77, "grad_norm": 8.69717168681488, "learning_rate": 9.318277372538274e-06, "loss": 0.5225, "step": 1083 }, { "epoch": 0.77, "grad_norm": 6.757166076166586, "learning_rate": 9.316819777201119e-06, "loss": 0.5303, "step": 1084 }, { "epoch": 0.77, "grad_norm": 14.268054362533457, "learning_rate": 9.315360739534235e-06, "loss": 0.6626, "step": 1085 }, { "epoch": 0.78, "grad_norm": 12.75302943618852, "learning_rate": 9.313900260025121e-06, "loss": 0.4951, "step": 1086 }, { "epoch": 0.78, "grad_norm": 13.058324474577528, "learning_rate": 9.312438339161746e-06, "loss": 0.5337, "step": 1087 }, { "epoch": 0.78, "grad_norm": 9.186156422831758, "learning_rate": 9.310974977432565e-06, "loss": 0.5122, "step": 1088 }, { "epoch": 0.78, "grad_norm": 19.810759557849348, "learning_rate": 9.309510175326515e-06, "loss": 0.5986, "step": 1089 }, { "epoch": 0.78, "grad_norm": 7.352639000106736, "learning_rate": 9.308043933333012e-06, "loss": 0.54, "step": 1090 }, { "epoch": 0.78, "grad_norm": 6.18764295150179, "learning_rate": 9.306576251941957e-06, "loss": 0.5889, "step": 1091 }, { "epoch": 0.78, "grad_norm": 23.066681900530504, "learning_rate": 9.305107131643729e-06, "loss": 0.6221, "step": 1092 }, { "epoch": 0.78, "grad_norm": 5.405377306247158, "learning_rate": 9.303636572929188e-06, "loss": 0.5723, "step": 1093 }, { "epoch": 0.78, "grad_norm": 18.5355497270562, "learning_rate": 9.302164576289674e-06, "loss": 0.5957, "step": 1094 }, { "epoch": 0.78, "grad_norm": 11.64764367778218, "learning_rate": 9.30069114221701e-06, "loss": 0.6143, "step": 1095 }, { "epoch": 0.78, "grad_norm": 8.446851167336094, "learning_rate": 9.299216271203498e-06, "loss": 0.6006, "step": 1096 }, { "epoch": 0.78, "grad_norm": 7.594638602419203, "learning_rate": 9.297739963741918e-06, "loss": 0.5596, "step": 1097 }, { "epoch": 0.78, "grad_norm": 12.822830235951766, "learning_rate": 9.296262220325535e-06, "loss": 0.5557, "step": 1098 }, { "epoch": 0.78, "grad_norm": 11.680693513430095, "learning_rate": 9.294783041448088e-06, "loss": 0.5522, "step": 1099 }, { "epoch": 0.79, "grad_norm": 10.487159518778437, "learning_rate": 9.293302427603796e-06, "loss": 0.5518, "step": 1100 }, { "epoch": 0.79, "grad_norm": 6.5217580901104935, "learning_rate": 9.291820379287364e-06, "loss": 0.5269, "step": 1101 }, { "epoch": 0.79, "grad_norm": 10.594198953311631, "learning_rate": 9.29033689699397e-06, "loss": 0.5562, "step": 1102 }, { "epoch": 0.79, "grad_norm": 5.991686017035859, "learning_rate": 9.288851981219273e-06, "loss": 0.5503, "step": 1103 }, { "epoch": 0.79, "grad_norm": 10.804601642168482, "learning_rate": 9.28736563245941e-06, "loss": 0.5806, "step": 1104 }, { "epoch": 0.79, "grad_norm": 12.916446670933475, "learning_rate": 9.285877851210999e-06, "loss": 0.5259, "step": 1105 }, { "epoch": 0.79, "grad_norm": 9.312528006718804, "learning_rate": 9.284388637971136e-06, "loss": 0.563, "step": 1106 }, { "epoch": 0.79, "grad_norm": 17.16927949883247, "learning_rate": 9.282897993237392e-06, "loss": 0.6025, "step": 1107 }, { "epoch": 0.79, "grad_norm": 15.748674092022494, "learning_rate": 9.281405917507824e-06, "loss": 0.5723, "step": 1108 }, { "epoch": 0.79, "grad_norm": 20.416556061970823, "learning_rate": 9.279912411280958e-06, "loss": 0.5889, "step": 1109 }, { "epoch": 0.79, "grad_norm": 9.2599868177948, "learning_rate": 9.278417475055803e-06, "loss": 0.5654, "step": 1110 }, { "epoch": 0.79, "grad_norm": 8.138906667061427, "learning_rate": 9.276921109331845e-06, "loss": 0.5127, "step": 1111 }, { "epoch": 0.79, "grad_norm": 9.142722431903195, "learning_rate": 9.275423314609049e-06, "loss": 0.5771, "step": 1112 }, { "epoch": 0.79, "grad_norm": 14.367984402900843, "learning_rate": 9.273924091387855e-06, "loss": 0.5132, "step": 1113 }, { "epoch": 0.8, "grad_norm": 8.460138534290378, "learning_rate": 9.272423440169181e-06, "loss": 0.5537, "step": 1114 }, { "epoch": 0.8, "grad_norm": 11.885508556319811, "learning_rate": 9.270921361454424e-06, "loss": 0.5698, "step": 1115 }, { "epoch": 0.8, "grad_norm": 14.642226342988765, "learning_rate": 9.269417855745453e-06, "loss": 0.5547, "step": 1116 }, { "epoch": 0.8, "grad_norm": 16.93459268058794, "learning_rate": 9.267912923544621e-06, "loss": 0.6357, "step": 1117 }, { "epoch": 0.8, "grad_norm": 19.61687681956424, "learning_rate": 9.266406565354753e-06, "loss": 0.5869, "step": 1118 }, { "epoch": 0.8, "grad_norm": 12.948478273276576, "learning_rate": 9.26489878167915e-06, "loss": 0.5884, "step": 1119 }, { "epoch": 0.8, "grad_norm": 9.224943637865685, "learning_rate": 9.263389573021592e-06, "loss": 0.5391, "step": 1120 }, { "epoch": 0.8, "grad_norm": 16.53281858669421, "learning_rate": 9.261878939886332e-06, "loss": 0.5391, "step": 1121 }, { "epoch": 0.8, "grad_norm": 6.571223439001684, "learning_rate": 9.2603668827781e-06, "loss": 0.5869, "step": 1122 }, { "epoch": 0.8, "grad_norm": 10.996971954221829, "learning_rate": 9.258853402202106e-06, "loss": 0.6143, "step": 1123 }, { "epoch": 0.8, "grad_norm": 5.562877307679958, "learning_rate": 9.25733849866403e-06, "loss": 0.54, "step": 1124 }, { "epoch": 0.8, "grad_norm": 25.681379456692, "learning_rate": 9.255822172670028e-06, "loss": 0.5957, "step": 1125 }, { "epoch": 0.8, "grad_norm": 11.240913002501355, "learning_rate": 9.254304424726734e-06, "loss": 0.48, "step": 1126 }, { "epoch": 0.8, "grad_norm": 5.171552813788881, "learning_rate": 9.252785255341256e-06, "loss": 0.5161, "step": 1127 }, { "epoch": 0.81, "grad_norm": 10.921769965960761, "learning_rate": 9.251264665021178e-06, "loss": 0.5864, "step": 1128 }, { "epoch": 0.81, "grad_norm": 6.1627582262793785, "learning_rate": 9.249742654274554e-06, "loss": 0.5283, "step": 1129 }, { "epoch": 0.81, "grad_norm": 11.70067037372327, "learning_rate": 9.24821922360992e-06, "loss": 0.5396, "step": 1130 }, { "epoch": 0.81, "grad_norm": 25.93057220073389, "learning_rate": 9.246694373536277e-06, "loss": 0.6123, "step": 1131 }, { "epoch": 0.81, "grad_norm": 19.705923665031527, "learning_rate": 9.245168104563112e-06, "loss": 0.6152, "step": 1132 }, { "epoch": 0.81, "grad_norm": 10.636045888986658, "learning_rate": 9.243640417200376e-06, "loss": 0.5352, "step": 1133 }, { "epoch": 0.81, "grad_norm": 11.505774334822952, "learning_rate": 9.242111311958502e-06, "loss": 0.521, "step": 1134 }, { "epoch": 0.81, "grad_norm": 7.567202760806081, "learning_rate": 9.240580789348385e-06, "loss": 0.6143, "step": 1135 }, { "epoch": 0.81, "grad_norm": 11.503439518116549, "learning_rate": 9.23904884988141e-06, "loss": 0.5908, "step": 1136 }, { "epoch": 0.81, "grad_norm": 15.221857770793607, "learning_rate": 9.237515494069417e-06, "loss": 0.5098, "step": 1137 }, { "epoch": 0.81, "grad_norm": 18.552085086378824, "learning_rate": 9.235980722424737e-06, "loss": 0.5034, "step": 1138 }, { "epoch": 0.81, "grad_norm": 16.725639961202933, "learning_rate": 9.234444535460161e-06, "loss": 0.4692, "step": 1139 }, { "epoch": 0.81, "grad_norm": 11.41674940985237, "learning_rate": 9.232906933688959e-06, "loss": 0.5, "step": 1140 }, { "epoch": 0.81, "grad_norm": 5.7960411684762825, "learning_rate": 9.231367917624872e-06, "loss": 0.5225, "step": 1141 }, { "epoch": 0.82, "grad_norm": 12.017568946472704, "learning_rate": 9.229827487782115e-06, "loss": 0.5518, "step": 1142 }, { "epoch": 0.82, "grad_norm": 31.05573818747122, "learning_rate": 9.228285644675372e-06, "loss": 0.7061, "step": 1143 }, { "epoch": 0.82, "grad_norm": 14.083495042416361, "learning_rate": 9.226742388819804e-06, "loss": 0.5938, "step": 1144 }, { "epoch": 0.82, "grad_norm": 7.823638652711353, "learning_rate": 9.225197720731039e-06, "loss": 0.606, "step": 1145 }, { "epoch": 0.82, "grad_norm": 7.519289143758317, "learning_rate": 9.223651640925181e-06, "loss": 0.5732, "step": 1146 }, { "epoch": 0.82, "grad_norm": 8.367894727037841, "learning_rate": 9.222104149918804e-06, "loss": 0.585, "step": 1147 }, { "epoch": 0.82, "grad_norm": 21.655844816080055, "learning_rate": 9.220555248228954e-06, "loss": 0.5459, "step": 1148 }, { "epoch": 0.82, "grad_norm": 19.43974033304336, "learning_rate": 9.219004936373146e-06, "loss": 0.6084, "step": 1149 }, { "epoch": 0.82, "grad_norm": 9.847797651330511, "learning_rate": 9.217453214869368e-06, "loss": 0.5312, "step": 1150 }, { "epoch": 0.82, "grad_norm": 12.437081911431463, "learning_rate": 9.21590008423608e-06, "loss": 0.5049, "step": 1151 }, { "epoch": 0.82, "grad_norm": 6.470652434232925, "learning_rate": 9.214345544992214e-06, "loss": 0.5342, "step": 1152 }, { "epoch": 0.82, "grad_norm": 10.034516507112599, "learning_rate": 9.212789597657167e-06, "loss": 0.5249, "step": 1153 }, { "epoch": 0.82, "grad_norm": 9.99367571691055, "learning_rate": 9.21123224275081e-06, "loss": 0.6074, "step": 1154 }, { "epoch": 0.82, "grad_norm": 8.864377559517447, "learning_rate": 9.209673480793486e-06, "loss": 0.5376, "step": 1155 }, { "epoch": 0.83, "grad_norm": 6.346477449728334, "learning_rate": 9.208113312306006e-06, "loss": 0.478, "step": 1156 }, { "epoch": 0.83, "grad_norm": 8.006532830891896, "learning_rate": 9.206551737809653e-06, "loss": 0.6025, "step": 1157 }, { "epoch": 0.83, "grad_norm": 12.611614451718726, "learning_rate": 9.204988757826173e-06, "loss": 0.5278, "step": 1158 }, { "epoch": 0.83, "grad_norm": 7.714149559119864, "learning_rate": 9.203424372877791e-06, "loss": 0.519, "step": 1159 }, { "epoch": 0.83, "grad_norm": 10.767330554783726, "learning_rate": 9.201858583487195e-06, "loss": 0.5977, "step": 1160 }, { "epoch": 0.83, "grad_norm": 6.727023867341504, "learning_rate": 9.200291390177546e-06, "loss": 0.4941, "step": 1161 }, { "epoch": 0.83, "grad_norm": 12.08139287351435, "learning_rate": 9.198722793472471e-06, "loss": 0.4712, "step": 1162 }, { "epoch": 0.83, "grad_norm": 19.361690233383793, "learning_rate": 9.197152793896068e-06, "loss": 0.5293, "step": 1163 }, { "epoch": 0.83, "grad_norm": 12.659319120735523, "learning_rate": 9.195581391972903e-06, "loss": 0.6079, "step": 1164 }, { "epoch": 0.83, "grad_norm": 14.574233768699525, "learning_rate": 9.194008588228011e-06, "loss": 0.5083, "step": 1165 }, { "epoch": 0.83, "grad_norm": 12.875647206303185, "learning_rate": 9.192434383186894e-06, "loss": 0.5625, "step": 1166 }, { "epoch": 0.83, "grad_norm": 10.59800269890822, "learning_rate": 9.190858777375523e-06, "loss": 0.5361, "step": 1167 }, { "epoch": 0.83, "grad_norm": 8.150351861232854, "learning_rate": 9.18928177132034e-06, "loss": 0.5273, "step": 1168 }, { "epoch": 0.83, "grad_norm": 15.189230279097433, "learning_rate": 9.187703365548248e-06, "loss": 0.5054, "step": 1169 }, { "epoch": 0.84, "grad_norm": 16.844111824576274, "learning_rate": 9.186123560586623e-06, "loss": 0.7607, "step": 1170 }, { "epoch": 0.84, "grad_norm": 10.745982102578823, "learning_rate": 9.18454235696331e-06, "loss": 0.5322, "step": 1171 }, { "epoch": 0.84, "grad_norm": 9.782839753489112, "learning_rate": 9.182959755206613e-06, "loss": 0.5449, "step": 1172 }, { "epoch": 0.84, "grad_norm": 10.418717825018716, "learning_rate": 9.181375755845314e-06, "loss": 0.5337, "step": 1173 }, { "epoch": 0.84, "grad_norm": 6.352862983177861, "learning_rate": 9.179790359408655e-06, "loss": 0.5342, "step": 1174 }, { "epoch": 0.84, "grad_norm": 8.998049401916107, "learning_rate": 9.178203566426344e-06, "loss": 0.5479, "step": 1175 }, { "epoch": 0.84, "grad_norm": 5.420625471547234, "learning_rate": 9.176615377428563e-06, "loss": 0.5527, "step": 1176 }, { "epoch": 0.84, "grad_norm": 16.578151600737712, "learning_rate": 9.175025792945951e-06, "loss": 0.5308, "step": 1177 }, { "epoch": 0.84, "grad_norm": 12.490365222019324, "learning_rate": 9.173434813509618e-06, "loss": 0.4888, "step": 1178 }, { "epoch": 0.84, "grad_norm": 5.223972990178867, "learning_rate": 9.171842439651143e-06, "loss": 0.5366, "step": 1179 }, { "epoch": 0.84, "grad_norm": 4.921385835164377, "learning_rate": 9.170248671902565e-06, "loss": 0.4946, "step": 1180 }, { "epoch": 0.84, "grad_norm": 16.07766786230304, "learning_rate": 9.168653510796392e-06, "loss": 0.7295, "step": 1181 }, { "epoch": 0.84, "grad_norm": 14.593698266134565, "learning_rate": 9.167056956865596e-06, "loss": 0.5977, "step": 1182 }, { "epoch": 0.84, "grad_norm": 16.775199062594883, "learning_rate": 9.165459010643618e-06, "loss": 0.5488, "step": 1183 }, { "epoch": 0.85, "grad_norm": 7.663762375167531, "learning_rate": 9.16385967266436e-06, "loss": 0.5054, "step": 1184 }, { "epoch": 0.85, "grad_norm": 5.215806260743865, "learning_rate": 9.16225894346219e-06, "loss": 0.5186, "step": 1185 }, { "epoch": 0.85, "grad_norm": 12.282697942390582, "learning_rate": 9.160656823571942e-06, "loss": 0.5898, "step": 1186 }, { "epoch": 0.85, "grad_norm": 17.51121958562781, "learning_rate": 9.159053313528913e-06, "loss": 0.5605, "step": 1187 }, { "epoch": 0.85, "grad_norm": 17.268837485274915, "learning_rate": 9.15744841386887e-06, "loss": 0.5498, "step": 1188 }, { "epoch": 0.85, "grad_norm": 10.072700613697913, "learning_rate": 9.155842125128033e-06, "loss": 0.5215, "step": 1189 }, { "epoch": 0.85, "grad_norm": 11.958687423245578, "learning_rate": 9.154234447843098e-06, "loss": 0.5063, "step": 1190 }, { "epoch": 0.85, "grad_norm": 6.237034674269332, "learning_rate": 9.152625382551217e-06, "loss": 0.4717, "step": 1191 }, { "epoch": 0.85, "grad_norm": 20.153622241744905, "learning_rate": 9.15101492979001e-06, "loss": 0.5029, "step": 1192 }, { "epoch": 0.85, "grad_norm": 21.47771817765446, "learning_rate": 9.149403090097557e-06, "loss": 0.5664, "step": 1193 }, { "epoch": 0.85, "grad_norm": 9.451169764840035, "learning_rate": 9.147789864012408e-06, "loss": 0.6226, "step": 1194 }, { "epoch": 0.85, "grad_norm": 10.500389478462552, "learning_rate": 9.146175252073568e-06, "loss": 0.5518, "step": 1195 }, { "epoch": 0.85, "grad_norm": 9.352519920650707, "learning_rate": 9.144559254820511e-06, "loss": 0.4795, "step": 1196 }, { "epoch": 0.85, "grad_norm": 7.579403203990102, "learning_rate": 9.14294187279317e-06, "loss": 0.5386, "step": 1197 }, { "epoch": 0.86, "grad_norm": 16.402304104093776, "learning_rate": 9.141323106531943e-06, "loss": 0.5537, "step": 1198 }, { "epoch": 0.86, "grad_norm": 11.906202099251876, "learning_rate": 9.139702956577693e-06, "loss": 0.5342, "step": 1199 }, { "epoch": 0.86, "grad_norm": 6.994868615336212, "learning_rate": 9.138081423471736e-06, "loss": 0.5054, "step": 1200 }, { "epoch": 0.86, "grad_norm": 10.708925086407962, "learning_rate": 9.136458507755862e-06, "loss": 0.5317, "step": 1201 }, { "epoch": 0.86, "grad_norm": 22.74881444619516, "learning_rate": 9.134834209972314e-06, "loss": 0.5034, "step": 1202 }, { "epoch": 0.86, "grad_norm": 10.191911807224606, "learning_rate": 9.133208530663801e-06, "loss": 0.4849, "step": 1203 }, { "epoch": 0.86, "grad_norm": 8.95443208118561, "learning_rate": 9.131581470373495e-06, "loss": 0.5337, "step": 1204 }, { "epoch": 0.86, "grad_norm": 15.525395622875937, "learning_rate": 9.129953029645022e-06, "loss": 0.4604, "step": 1205 }, { "epoch": 0.86, "grad_norm": 11.79694996087205, "learning_rate": 9.128323209022478e-06, "loss": 0.5122, "step": 1206 }, { "epoch": 0.86, "grad_norm": 11.755997307705414, "learning_rate": 9.126692009050415e-06, "loss": 0.4463, "step": 1207 }, { "epoch": 0.86, "grad_norm": 26.80591783623869, "learning_rate": 9.125059430273848e-06, "loss": 0.5796, "step": 1208 }, { "epoch": 0.86, "grad_norm": 18.852441160925146, "learning_rate": 9.123425473238253e-06, "loss": 0.5015, "step": 1209 }, { "epoch": 0.86, "grad_norm": 12.625338450562605, "learning_rate": 9.121790138489564e-06, "loss": 0.5117, "step": 1210 }, { "epoch": 0.86, "grad_norm": 13.080326610727653, "learning_rate": 9.120153426574177e-06, "loss": 0.5244, "step": 1211 }, { "epoch": 0.87, "grad_norm": 10.88918740584613, "learning_rate": 9.118515338038947e-06, "loss": 0.48, "step": 1212 }, { "epoch": 0.87, "grad_norm": 9.68959162364965, "learning_rate": 9.11687587343119e-06, "loss": 0.5259, "step": 1213 }, { "epoch": 0.87, "grad_norm": 19.726892232656102, "learning_rate": 9.115235033298682e-06, "loss": 0.5601, "step": 1214 }, { "epoch": 0.87, "grad_norm": 8.708377102841364, "learning_rate": 9.113592818189661e-06, "loss": 0.5708, "step": 1215 }, { "epoch": 0.87, "grad_norm": 20.856805732469365, "learning_rate": 9.111949228652816e-06, "loss": 0.5498, "step": 1216 }, { "epoch": 0.87, "grad_norm": 12.19921968394708, "learning_rate": 9.110304265237304e-06, "loss": 0.5098, "step": 1217 }, { "epoch": 0.87, "grad_norm": 5.873996190520849, "learning_rate": 9.10865792849274e-06, "loss": 0.4893, "step": 1218 }, { "epoch": 0.87, "grad_norm": 13.856928395460525, "learning_rate": 9.107010218969191e-06, "loss": 0.6113, "step": 1219 }, { "epoch": 0.87, "grad_norm": 7.008354271851358, "learning_rate": 9.10536113721719e-06, "loss": 0.5874, "step": 1220 }, { "epoch": 0.87, "grad_norm": 21.27467515244621, "learning_rate": 9.103710683787728e-06, "loss": 0.4824, "step": 1221 }, { "epoch": 0.87, "grad_norm": 6.838954506974407, "learning_rate": 9.102058859232247e-06, "loss": 0.5596, "step": 1222 }, { "epoch": 0.87, "grad_norm": 5.3653312710855285, "learning_rate": 9.100405664102656e-06, "loss": 0.5063, "step": 1223 }, { "epoch": 0.87, "grad_norm": 17.92988348312749, "learning_rate": 9.098751098951317e-06, "loss": 0.6328, "step": 1224 }, { "epoch": 0.87, "grad_norm": 8.868600191790884, "learning_rate": 9.09709516433105e-06, "loss": 0.5679, "step": 1225 }, { "epoch": 0.88, "grad_norm": 11.654596470733907, "learning_rate": 9.095437860795138e-06, "loss": 0.5215, "step": 1226 }, { "epoch": 0.88, "grad_norm": 11.054939394702243, "learning_rate": 9.09377918889731e-06, "loss": 0.5317, "step": 1227 }, { "epoch": 0.88, "grad_norm": 7.809295897500469, "learning_rate": 9.092119149191765e-06, "loss": 0.5142, "step": 1228 }, { "epoch": 0.88, "grad_norm": 7.752863951527627, "learning_rate": 9.090457742233152e-06, "loss": 0.4692, "step": 1229 }, { "epoch": 0.88, "grad_norm": 7.080450373075989, "learning_rate": 9.088794968576575e-06, "loss": 0.5771, "step": 1230 }, { "epoch": 0.88, "grad_norm": 6.273931732814395, "learning_rate": 9.087130828777598e-06, "loss": 0.5811, "step": 1231 }, { "epoch": 0.88, "grad_norm": 14.801947879251854, "learning_rate": 9.085465323392243e-06, "loss": 0.5923, "step": 1232 }, { "epoch": 0.88, "grad_norm": 11.904080105524493, "learning_rate": 9.083798452976988e-06, "loss": 0.584, "step": 1233 }, { "epoch": 0.88, "grad_norm": 12.329611523622987, "learning_rate": 9.082130218088762e-06, "loss": 0.5698, "step": 1234 }, { "epoch": 0.88, "grad_norm": 7.2037432427127, "learning_rate": 9.080460619284954e-06, "loss": 0.5283, "step": 1235 }, { "epoch": 0.88, "grad_norm": 12.783777108155, "learning_rate": 9.07878965712341e-06, "loss": 0.6377, "step": 1236 }, { "epoch": 0.88, "grad_norm": 5.5497103145234625, "learning_rate": 9.077117332162427e-06, "loss": 0.5791, "step": 1237 }, { "epoch": 0.88, "grad_norm": 7.356999836172582, "learning_rate": 9.075443644960761e-06, "loss": 0.5063, "step": 1238 }, { "epoch": 0.88, "grad_norm": 9.158396146128032, "learning_rate": 9.07376859607762e-06, "loss": 0.5376, "step": 1239 }, { "epoch": 0.89, "grad_norm": 9.510823350991721, "learning_rate": 9.072092186072675e-06, "loss": 0.6494, "step": 1240 }, { "epoch": 0.89, "grad_norm": 15.757079666813102, "learning_rate": 9.070414415506038e-06, "loss": 0.6143, "step": 1241 }, { "epoch": 0.89, "grad_norm": 8.23137401597286, "learning_rate": 9.068735284938288e-06, "loss": 0.4785, "step": 1242 }, { "epoch": 0.89, "grad_norm": 5.909470075068411, "learning_rate": 9.067054794930452e-06, "loss": 0.4731, "step": 1243 }, { "epoch": 0.89, "grad_norm": 6.924869803084112, "learning_rate": 9.065372946044014e-06, "loss": 0.606, "step": 1244 }, { "epoch": 0.89, "grad_norm": 8.833436716234493, "learning_rate": 9.063689738840911e-06, "loss": 0.5996, "step": 1245 }, { "epoch": 0.89, "grad_norm": 11.76560707751374, "learning_rate": 9.06200517388353e-06, "loss": 0.6333, "step": 1246 }, { "epoch": 0.89, "grad_norm": 6.385881684068905, "learning_rate": 9.060319251734723e-06, "loss": 0.5283, "step": 1247 }, { "epoch": 0.89, "grad_norm": 5.674391394289866, "learning_rate": 9.058631972957783e-06, "loss": 0.5977, "step": 1248 }, { "epoch": 0.89, "grad_norm": 7.351792465763545, "learning_rate": 9.056943338116461e-06, "loss": 0.5024, "step": 1249 }, { "epoch": 0.89, "grad_norm": 9.53319179062876, "learning_rate": 9.055253347774961e-06, "loss": 0.5386, "step": 1250 }, { "epoch": 0.89, "grad_norm": 11.528600979838268, "learning_rate": 9.053562002497943e-06, "loss": 0.5737, "step": 1251 }, { "epoch": 0.89, "grad_norm": 7.959305596996799, "learning_rate": 9.051869302850515e-06, "loss": 0.4683, "step": 1252 }, { "epoch": 0.89, "grad_norm": 8.58161660458802, "learning_rate": 9.05017524939824e-06, "loss": 0.5527, "step": 1253 }, { "epoch": 0.9, "grad_norm": 12.243428609193199, "learning_rate": 9.048479842707132e-06, "loss": 0.5894, "step": 1254 }, { "epoch": 0.9, "grad_norm": 5.236285055544797, "learning_rate": 9.046783083343657e-06, "loss": 0.4614, "step": 1255 }, { "epoch": 0.9, "grad_norm": 7.591178916038821, "learning_rate": 9.045084971874738e-06, "loss": 0.5181, "step": 1256 }, { "epoch": 0.9, "grad_norm": 10.84688810132272, "learning_rate": 9.043385508867741e-06, "loss": 0.4771, "step": 1257 }, { "epoch": 0.9, "grad_norm": 8.615688659064489, "learning_rate": 9.041684694890492e-06, "loss": 0.4736, "step": 1258 }, { "epoch": 0.9, "grad_norm": 12.846075910748137, "learning_rate": 9.03998253051126e-06, "loss": 0.6909, "step": 1259 }, { "epoch": 0.9, "grad_norm": 11.612951792166978, "learning_rate": 9.038279016298773e-06, "loss": 0.4756, "step": 1260 }, { "epoch": 0.9, "grad_norm": 8.350725640206349, "learning_rate": 9.036574152822206e-06, "loss": 0.5273, "step": 1261 }, { "epoch": 0.9, "grad_norm": 6.878946181687478, "learning_rate": 9.034867940651186e-06, "loss": 0.5635, "step": 1262 }, { "epoch": 0.9, "grad_norm": 5.333503190009848, "learning_rate": 9.033160380355789e-06, "loss": 0.4512, "step": 1263 }, { "epoch": 0.9, "grad_norm": 9.830536532606617, "learning_rate": 9.031451472506544e-06, "loss": 0.5039, "step": 1264 }, { "epoch": 0.9, "grad_norm": 9.00597678487424, "learning_rate": 9.029741217674428e-06, "loss": 0.5776, "step": 1265 }, { "epoch": 0.9, "grad_norm": 9.365161480301378, "learning_rate": 9.02802961643087e-06, "loss": 0.5635, "step": 1266 }, { "epoch": 0.9, "grad_norm": 13.387833721542348, "learning_rate": 9.026316669347747e-06, "loss": 0.5137, "step": 1267 }, { "epoch": 0.91, "grad_norm": 10.41425867798381, "learning_rate": 9.024602376997387e-06, "loss": 0.5454, "step": 1268 }, { "epoch": 0.91, "grad_norm": 10.311986546252852, "learning_rate": 9.022886739952565e-06, "loss": 0.4316, "step": 1269 }, { "epoch": 0.91, "grad_norm": 14.496984433352765, "learning_rate": 9.02116975878651e-06, "loss": 0.5542, "step": 1270 }, { "epoch": 0.91, "grad_norm": 11.783629489842369, "learning_rate": 9.019451434072894e-06, "loss": 0.5654, "step": 1271 }, { "epoch": 0.91, "grad_norm": 7.656169059630007, "learning_rate": 9.017731766385844e-06, "loss": 0.5093, "step": 1272 }, { "epoch": 0.91, "grad_norm": 7.338346102975163, "learning_rate": 9.016010756299934e-06, "loss": 0.4619, "step": 1273 }, { "epoch": 0.91, "grad_norm": 8.6546009929447, "learning_rate": 9.014288404390182e-06, "loss": 0.5293, "step": 1274 }, { "epoch": 0.91, "grad_norm": 14.49900328718962, "learning_rate": 9.012564711232059e-06, "loss": 0.4985, "step": 1275 }, { "epoch": 0.91, "grad_norm": 8.799568957162913, "learning_rate": 9.010839677401484e-06, "loss": 0.5776, "step": 1276 }, { "epoch": 0.91, "grad_norm": 6.442440495388319, "learning_rate": 9.009113303474822e-06, "loss": 0.4966, "step": 1277 }, { "epoch": 0.91, "grad_norm": 8.780144748137946, "learning_rate": 9.007385590028887e-06, "loss": 0.4404, "step": 1278 }, { "epoch": 0.91, "grad_norm": 13.298026166100321, "learning_rate": 9.005656537640942e-06, "loss": 0.5498, "step": 1279 }, { "epoch": 0.91, "grad_norm": 5.495108785802219, "learning_rate": 9.003926146888691e-06, "loss": 0.4897, "step": 1280 }, { "epoch": 0.91, "grad_norm": 6.611746701232708, "learning_rate": 9.002194418350291e-06, "loss": 0.6392, "step": 1281 }, { "epoch": 0.92, "grad_norm": 11.718656093085405, "learning_rate": 9.000461352604349e-06, "loss": 0.5972, "step": 1282 }, { "epoch": 0.92, "grad_norm": 9.384919907374725, "learning_rate": 8.99872695022991e-06, "loss": 0.5142, "step": 1283 }, { "epoch": 0.92, "grad_norm": 8.111422961254767, "learning_rate": 8.996991211806471e-06, "loss": 0.5176, "step": 1284 }, { "epoch": 0.92, "grad_norm": 6.61897079404457, "learning_rate": 8.995254137913977e-06, "loss": 0.5859, "step": 1285 }, { "epoch": 0.92, "grad_norm": 7.626370903659802, "learning_rate": 8.99351572913281e-06, "loss": 0.5083, "step": 1286 }, { "epoch": 0.92, "grad_norm": 6.1278822351560756, "learning_rate": 8.991775986043814e-06, "loss": 0.4365, "step": 1287 }, { "epoch": 0.92, "grad_norm": 5.807804903096628, "learning_rate": 8.990034909228262e-06, "loss": 0.5439, "step": 1288 }, { "epoch": 0.92, "grad_norm": 5.912336119687683, "learning_rate": 8.988292499267885e-06, "loss": 0.4189, "step": 1289 }, { "epoch": 0.92, "grad_norm": 16.556716686548828, "learning_rate": 8.986548756744852e-06, "loss": 0.4966, "step": 1290 }, { "epoch": 0.92, "grad_norm": 7.882441819426605, "learning_rate": 8.98480368224178e-06, "loss": 0.5439, "step": 1291 }, { "epoch": 0.92, "grad_norm": 10.43392907102242, "learning_rate": 8.98305727634173e-06, "loss": 0.4883, "step": 1292 }, { "epoch": 0.92, "grad_norm": 7.656944181948202, "learning_rate": 8.981309539628212e-06, "loss": 0.5811, "step": 1293 }, { "epoch": 0.92, "grad_norm": 6.359227028201466, "learning_rate": 8.979560472685174e-06, "loss": 0.4385, "step": 1294 }, { "epoch": 0.92, "grad_norm": 8.356588723580822, "learning_rate": 8.977810076097013e-06, "loss": 0.4492, "step": 1295 }, { "epoch": 0.93, "grad_norm": 9.82540714825025, "learning_rate": 8.97605835044857e-06, "loss": 0.647, "step": 1296 }, { "epoch": 0.93, "grad_norm": 6.504595450793861, "learning_rate": 8.974305296325125e-06, "loss": 0.4238, "step": 1297 }, { "epoch": 0.93, "grad_norm": 6.877558211686209, "learning_rate": 8.97255091431241e-06, "loss": 0.4917, "step": 1298 }, { "epoch": 0.93, "grad_norm": 10.50916493277105, "learning_rate": 8.970795204996597e-06, "loss": 0.4795, "step": 1299 }, { "epoch": 0.93, "grad_norm": 9.204769262934178, "learning_rate": 8.969038168964298e-06, "loss": 0.5645, "step": 1300 }, { "epoch": 0.93, "grad_norm": 16.851121613107946, "learning_rate": 8.967279806802576e-06, "loss": 0.5483, "step": 1301 }, { "epoch": 0.93, "grad_norm": 8.587881444493236, "learning_rate": 8.965520119098926e-06, "loss": 0.478, "step": 1302 }, { "epoch": 0.93, "grad_norm": 22.027251162811073, "learning_rate": 8.9637591064413e-06, "loss": 0.624, "step": 1303 }, { "epoch": 0.93, "grad_norm": 21.099260672728388, "learning_rate": 8.961996769418077e-06, "loss": 0.5215, "step": 1304 }, { "epoch": 0.93, "grad_norm": 12.415671378119475, "learning_rate": 8.960233108618092e-06, "loss": 0.5791, "step": 1305 }, { "epoch": 0.93, "grad_norm": 7.662223641152644, "learning_rate": 8.958468124630617e-06, "loss": 0.5718, "step": 1306 }, { "epoch": 0.93, "grad_norm": 6.366844981796646, "learning_rate": 8.956701818045363e-06, "loss": 0.4946, "step": 1307 }, { "epoch": 0.93, "grad_norm": 18.096606101285385, "learning_rate": 8.954934189452489e-06, "loss": 0.4512, "step": 1308 }, { "epoch": 0.93, "grad_norm": 12.714222025011097, "learning_rate": 8.953165239442589e-06, "loss": 0.5986, "step": 1309 }, { "epoch": 0.94, "grad_norm": 14.336782119481711, "learning_rate": 8.951394968606704e-06, "loss": 0.5625, "step": 1310 }, { "epoch": 0.94, "grad_norm": 18.32913837235048, "learning_rate": 8.949623377536314e-06, "loss": 0.5757, "step": 1311 }, { "epoch": 0.94, "grad_norm": 11.84859426270501, "learning_rate": 8.947850466823343e-06, "loss": 0.4834, "step": 1312 }, { "epoch": 0.94, "grad_norm": 8.789293590180087, "learning_rate": 8.946076237060148e-06, "loss": 0.5361, "step": 1313 }, { "epoch": 0.94, "grad_norm": 5.977729367014027, "learning_rate": 8.944300688839538e-06, "loss": 0.5249, "step": 1314 }, { "epoch": 0.94, "grad_norm": 11.763893603900193, "learning_rate": 8.942523822754751e-06, "loss": 0.5415, "step": 1315 }, { "epoch": 0.94, "grad_norm": 4.7524094218657575, "learning_rate": 8.940745639399477e-06, "loss": 0.5156, "step": 1316 }, { "epoch": 0.94, "grad_norm": 5.7945367399217975, "learning_rate": 8.938966139367837e-06, "loss": 0.5059, "step": 1317 }, { "epoch": 0.94, "grad_norm": 7.0240203752149535, "learning_rate": 8.937185323254395e-06, "loss": 0.5151, "step": 1318 }, { "epoch": 0.94, "grad_norm": 7.481981866447117, "learning_rate": 8.935403191654155e-06, "loss": 0.4126, "step": 1319 }, { "epoch": 0.94, "grad_norm": 9.449159435304624, "learning_rate": 8.933619745162559e-06, "loss": 0.5938, "step": 1320 }, { "epoch": 0.94, "grad_norm": 8.242575178472086, "learning_rate": 8.931834984375492e-06, "loss": 0.4771, "step": 1321 }, { "epoch": 0.94, "grad_norm": 8.949440636644377, "learning_rate": 8.930048909889272e-06, "loss": 0.5474, "step": 1322 }, { "epoch": 0.94, "grad_norm": 7.20984477728289, "learning_rate": 8.928261522300665e-06, "loss": 0.5073, "step": 1323 }, { "epoch": 0.95, "grad_norm": 5.653104358487502, "learning_rate": 8.926472822206869e-06, "loss": 0.4878, "step": 1324 }, { "epoch": 0.95, "grad_norm": 15.750277767714117, "learning_rate": 8.924682810205519e-06, "loss": 0.5728, "step": 1325 }, { "epoch": 0.95, "grad_norm": 6.545984784195176, "learning_rate": 8.922891486894692e-06, "loss": 0.4961, "step": 1326 }, { "epoch": 0.95, "grad_norm": 6.327138896740525, "learning_rate": 8.921098852872904e-06, "loss": 0.4985, "step": 1327 }, { "epoch": 0.95, "grad_norm": 7.5493735701358435, "learning_rate": 8.919304908739106e-06, "loss": 0.5244, "step": 1328 }, { "epoch": 0.95, "grad_norm": 8.695313836532721, "learning_rate": 8.917509655092691e-06, "loss": 0.5732, "step": 1329 }, { "epoch": 0.95, "grad_norm": 6.7706710902436695, "learning_rate": 8.915713092533483e-06, "loss": 0.4644, "step": 1330 }, { "epoch": 0.95, "grad_norm": 10.650752650381953, "learning_rate": 8.913915221661748e-06, "loss": 0.5396, "step": 1331 }, { "epoch": 0.95, "grad_norm": 13.714879719273181, "learning_rate": 8.912116043078188e-06, "loss": 0.5454, "step": 1332 }, { "epoch": 0.95, "grad_norm": 8.739222984938651, "learning_rate": 8.910315557383944e-06, "loss": 0.6167, "step": 1333 }, { "epoch": 0.95, "grad_norm": 14.128982922295334, "learning_rate": 8.90851376518059e-06, "loss": 0.5786, "step": 1334 }, { "epoch": 0.95, "grad_norm": 8.459299957556542, "learning_rate": 8.906710667070136e-06, "loss": 0.624, "step": 1335 }, { "epoch": 0.95, "grad_norm": 6.598499948785349, "learning_rate": 8.904906263655036e-06, "loss": 0.6475, "step": 1336 }, { "epoch": 0.95, "grad_norm": 6.194095670682739, "learning_rate": 8.903100555538169e-06, "loss": 0.5264, "step": 1337 }, { "epoch": 0.96, "grad_norm": 11.260818438840108, "learning_rate": 8.90129354332286e-06, "loss": 0.5459, "step": 1338 }, { "epoch": 0.96, "grad_norm": 10.831921547703958, "learning_rate": 8.899485227612865e-06, "loss": 0.5386, "step": 1339 }, { "epoch": 0.96, "grad_norm": 10.993655493393854, "learning_rate": 8.897675609012372e-06, "loss": 0.5488, "step": 1340 }, { "epoch": 0.96, "grad_norm": 7.371509471270881, "learning_rate": 8.895864688126013e-06, "loss": 0.5415, "step": 1341 }, { "epoch": 0.96, "grad_norm": 12.289776737421548, "learning_rate": 8.894052465558846e-06, "loss": 0.5205, "step": 1342 }, { "epoch": 0.96, "grad_norm": 6.368680291839942, "learning_rate": 8.892238941916372e-06, "loss": 0.5693, "step": 1343 }, { "epoch": 0.96, "grad_norm": 6.862113997301074, "learning_rate": 8.890424117804522e-06, "loss": 0.5518, "step": 1344 }, { "epoch": 0.96, "grad_norm": 6.249598220460352, "learning_rate": 8.88860799382966e-06, "loss": 0.6641, "step": 1345 }, { "epoch": 0.96, "grad_norm": 9.675919356373685, "learning_rate": 8.88679057059859e-06, "loss": 0.5454, "step": 1346 }, { "epoch": 0.96, "grad_norm": 7.730236623835078, "learning_rate": 8.884971848718544e-06, "loss": 0.5562, "step": 1347 }, { "epoch": 0.96, "grad_norm": 11.74713854214673, "learning_rate": 8.883151828797194e-06, "loss": 0.5557, "step": 1348 }, { "epoch": 0.96, "grad_norm": 8.820048698580294, "learning_rate": 8.88133051144264e-06, "loss": 0.5674, "step": 1349 }, { "epoch": 0.96, "grad_norm": 6.657133326905244, "learning_rate": 8.87950789726342e-06, "loss": 0.5859, "step": 1350 }, { "epoch": 0.96, "grad_norm": 7.081312677292772, "learning_rate": 8.8776839868685e-06, "loss": 0.5039, "step": 1351 }, { "epoch": 0.97, "grad_norm": 10.681121735697019, "learning_rate": 8.875858780867286e-06, "loss": 0.5093, "step": 1352 }, { "epoch": 0.97, "grad_norm": 16.68941777386891, "learning_rate": 8.87403227986961e-06, "loss": 0.6191, "step": 1353 }, { "epoch": 0.97, "grad_norm": 22.743290633026398, "learning_rate": 8.872204484485743e-06, "loss": 0.5903, "step": 1354 }, { "epoch": 0.97, "grad_norm": 7.595883157216245, "learning_rate": 8.870375395326384e-06, "loss": 0.4712, "step": 1355 }, { "epoch": 0.97, "grad_norm": 11.431960114700615, "learning_rate": 8.868545013002665e-06, "loss": 0.4814, "step": 1356 }, { "epoch": 0.97, "grad_norm": 24.738428919658247, "learning_rate": 8.866713338126152e-06, "loss": 0.6064, "step": 1357 }, { "epoch": 0.97, "grad_norm": 16.483334098623065, "learning_rate": 8.86488037130884e-06, "loss": 0.542, "step": 1358 }, { "epoch": 0.97, "grad_norm": 28.654635831705043, "learning_rate": 8.863046113163158e-06, "loss": 0.5166, "step": 1359 }, { "epoch": 0.97, "grad_norm": 8.446905888828201, "learning_rate": 8.861210564301967e-06, "loss": 0.5576, "step": 1360 }, { "epoch": 0.97, "grad_norm": 6.786577684525464, "learning_rate": 8.859373725338558e-06, "loss": 0.5571, "step": 1361 }, { "epoch": 0.97, "grad_norm": 19.77386586403384, "learning_rate": 8.857535596886652e-06, "loss": 0.5259, "step": 1362 }, { "epoch": 0.97, "grad_norm": 27.49805900372077, "learning_rate": 8.855696179560402e-06, "loss": 0.6602, "step": 1363 }, { "epoch": 0.97, "grad_norm": 7.273355043162183, "learning_rate": 8.85385547397439e-06, "loss": 0.4653, "step": 1364 }, { "epoch": 0.97, "grad_norm": 20.217716781867352, "learning_rate": 8.852013480743632e-06, "loss": 0.6235, "step": 1365 }, { "epoch": 0.98, "grad_norm": 9.843685680691951, "learning_rate": 8.850170200483573e-06, "loss": 0.4951, "step": 1366 }, { "epoch": 0.98, "grad_norm": 13.271803428940668, "learning_rate": 8.848325633810083e-06, "loss": 0.498, "step": 1367 }, { "epoch": 0.98, "grad_norm": 11.37198649024497, "learning_rate": 8.84647978133947e-06, "loss": 0.5361, "step": 1368 }, { "epoch": 0.98, "grad_norm": 17.854819527941277, "learning_rate": 8.844632643688467e-06, "loss": 0.6172, "step": 1369 }, { "epoch": 0.98, "grad_norm": 7.139410040675058, "learning_rate": 8.842784221474237e-06, "loss": 0.5098, "step": 1370 }, { "epoch": 0.98, "grad_norm": 10.702123104062924, "learning_rate": 8.840934515314372e-06, "loss": 0.6406, "step": 1371 }, { "epoch": 0.98, "grad_norm": 8.327060121802434, "learning_rate": 8.839083525826893e-06, "loss": 0.562, "step": 1372 }, { "epoch": 0.98, "grad_norm": 12.967346146693572, "learning_rate": 8.837231253630247e-06, "loss": 0.5122, "step": 1373 }, { "epoch": 0.98, "grad_norm": 9.222045501981935, "learning_rate": 8.835377699343318e-06, "loss": 0.5391, "step": 1374 }, { "epoch": 0.98, "grad_norm": 5.7150367345669855, "learning_rate": 8.83352286358541e-06, "loss": 0.501, "step": 1375 }, { "epoch": 0.98, "grad_norm": 10.34388020346308, "learning_rate": 8.83166674697626e-06, "loss": 0.5498, "step": 1376 }, { "epoch": 0.98, "grad_norm": 6.994275061577608, "learning_rate": 8.829809350136027e-06, "loss": 0.5469, "step": 1377 }, { "epoch": 0.98, "grad_norm": 6.286509681654963, "learning_rate": 8.827950673685306e-06, "loss": 0.5586, "step": 1378 }, { "epoch": 0.98, "grad_norm": 7.615667447671237, "learning_rate": 8.826090718245112e-06, "loss": 0.5747, "step": 1379 }, { "epoch": 0.99, "grad_norm": 8.045180625435941, "learning_rate": 8.824229484436894e-06, "loss": 0.5361, "step": 1380 }, { "epoch": 0.99, "grad_norm": 16.914235609616878, "learning_rate": 8.822366972882523e-06, "loss": 0.6753, "step": 1381 }, { "epoch": 0.99, "grad_norm": 6.65556839435706, "learning_rate": 8.820503184204299e-06, "loss": 0.5171, "step": 1382 }, { "epoch": 0.99, "grad_norm": 17.818279504647876, "learning_rate": 8.818638119024949e-06, "loss": 0.5488, "step": 1383 }, { "epoch": 0.99, "grad_norm": 29.01882884415183, "learning_rate": 8.816771777967623e-06, "loss": 0.6357, "step": 1384 }, { "epoch": 0.99, "grad_norm": 5.102254652825774, "learning_rate": 8.814904161655904e-06, "loss": 0.5728, "step": 1385 }, { "epoch": 0.99, "grad_norm": 9.028759055391328, "learning_rate": 8.813035270713796e-06, "loss": 0.4946, "step": 1386 }, { "epoch": 0.99, "grad_norm": 5.481350770883132, "learning_rate": 8.811165105765732e-06, "loss": 0.5146, "step": 1387 }, { "epoch": 0.99, "grad_norm": 4.584523783116981, "learning_rate": 8.809293667436565e-06, "loss": 0.5498, "step": 1388 }, { "epoch": 0.99, "grad_norm": 7.670757323199173, "learning_rate": 8.80742095635158e-06, "loss": 0.5869, "step": 1389 }, { "epoch": 0.99, "grad_norm": 13.789379257935748, "learning_rate": 8.805546973136481e-06, "loss": 0.5391, "step": 1390 }, { "epoch": 0.99, "grad_norm": 9.92942354858489, "learning_rate": 8.803671718417407e-06, "loss": 0.5396, "step": 1391 }, { "epoch": 0.99, "grad_norm": 10.025556704510787, "learning_rate": 8.80179519282091e-06, "loss": 0.4453, "step": 1392 }, { "epoch": 0.99, "grad_norm": 9.863285419079764, "learning_rate": 8.799917396973976e-06, "loss": 0.5576, "step": 1393 }, { "epoch": 1.0, "grad_norm": 7.510372608172871, "learning_rate": 8.798038331504008e-06, "loss": 0.499, "step": 1394 }, { "epoch": 1.0, "grad_norm": 10.339190789992044, "learning_rate": 8.79615799703884e-06, "loss": 0.5347, "step": 1395 }, { "epoch": 1.0, "grad_norm": 9.228994238966333, "learning_rate": 8.794276394206722e-06, "loss": 0.4858, "step": 1396 }, { "epoch": 1.0, "grad_norm": 10.720067062308178, "learning_rate": 8.792393523636337e-06, "loss": 0.5122, "step": 1397 }, { "epoch": 1.0, "grad_norm": 9.959990759829996, "learning_rate": 8.790509385956784e-06, "loss": 0.6104, "step": 1398 }, { "epoch": 1.0, "grad_norm": 7.7350338881120315, "learning_rate": 8.788623981797592e-06, "loss": 0.5569, "step": 1399 }, { "epoch": 1.0, "grad_norm": 6.122908029947811, "learning_rate": 8.786737311788708e-06, "loss": 0.5083, "step": 1400 }, { "epoch": 1.0, "grad_norm": 7.249960081112658, "learning_rate": 8.784849376560503e-06, "loss": 0.4473, "step": 1401 }, { "epoch": 1.0, "grad_norm": 5.9378761564742, "learning_rate": 8.78296017674377e-06, "loss": 0.3892, "step": 1402 }, { "epoch": 1.0, "grad_norm": 7.947182755307541, "learning_rate": 8.781069712969726e-06, "loss": 0.4663, "step": 1403 }, { "epoch": 1.0, "grad_norm": 7.735615644483297, "learning_rate": 8.779177985870012e-06, "loss": 0.4985, "step": 1404 }, { "epoch": 1.0, "grad_norm": 6.221979827272879, "learning_rate": 8.77728499607669e-06, "loss": 0.3682, "step": 1405 }, { "epoch": 1.0, "grad_norm": 7.51360598274658, "learning_rate": 8.775390744222238e-06, "loss": 0.4927, "step": 1406 }, { "epoch": 1.0, "grad_norm": 9.824576130882553, "learning_rate": 8.773495230939567e-06, "loss": 0.439, "step": 1407 }, { "epoch": 1.0, "grad_norm": 7.542083031305357, "learning_rate": 8.771598456861998e-06, "loss": 0.407, "step": 1408 }, { "epoch": 1.01, "grad_norm": 6.74463891725652, "learning_rate": 8.769700422623283e-06, "loss": 0.3843, "step": 1409 }, { "epoch": 1.01, "grad_norm": 7.052978396859257, "learning_rate": 8.767801128857588e-06, "loss": 0.4321, "step": 1410 }, { "epoch": 1.01, "grad_norm": 11.845362886453344, "learning_rate": 8.765900576199502e-06, "loss": 0.4565, "step": 1411 }, { "epoch": 1.01, "grad_norm": 11.17409389016001, "learning_rate": 8.763998765284036e-06, "loss": 0.4888, "step": 1412 }, { "epoch": 1.01, "grad_norm": 17.04399663972258, "learning_rate": 8.76209569674662e-06, "loss": 0.4199, "step": 1413 }, { "epoch": 1.01, "grad_norm": 10.796687567190697, "learning_rate": 8.760191371223104e-06, "loss": 0.4346, "step": 1414 }, { "epoch": 1.01, "grad_norm": 13.742485700753212, "learning_rate": 8.758285789349759e-06, "loss": 0.479, "step": 1415 }, { "epoch": 1.01, "grad_norm": 10.549981447404763, "learning_rate": 8.756378951763277e-06, "loss": 0.4429, "step": 1416 }, { "epoch": 1.01, "grad_norm": 10.523418901631306, "learning_rate": 8.754470859100765e-06, "loss": 0.3989, "step": 1417 }, { "epoch": 1.01, "grad_norm": 17.12913542529696, "learning_rate": 8.752561511999754e-06, "loss": 0.5083, "step": 1418 }, { "epoch": 1.01, "grad_norm": 15.040083536035315, "learning_rate": 8.750650911098193e-06, "loss": 0.4619, "step": 1419 }, { "epoch": 1.01, "grad_norm": 14.958198799461423, "learning_rate": 8.748739057034447e-06, "loss": 0.457, "step": 1420 }, { "epoch": 1.01, "grad_norm": 34.28258733668816, "learning_rate": 8.746825950447302e-06, "loss": 0.3999, "step": 1421 }, { "epoch": 1.01, "grad_norm": 18.46394179525724, "learning_rate": 8.744911591975967e-06, "loss": 0.4434, "step": 1422 }, { "epoch": 1.02, "grad_norm": 17.570102832172687, "learning_rate": 8.742995982260059e-06, "loss": 0.4307, "step": 1423 }, { "epoch": 1.02, "grad_norm": 20.157526748035593, "learning_rate": 8.741079121939621e-06, "loss": 0.4961, "step": 1424 }, { "epoch": 1.02, "grad_norm": 10.562647415324578, "learning_rate": 8.739161011655113e-06, "loss": 0.458, "step": 1425 }, { "epoch": 1.02, "grad_norm": 11.069738473837281, "learning_rate": 8.737241652047408e-06, "loss": 0.603, "step": 1426 }, { "epoch": 1.02, "grad_norm": 11.918982204133165, "learning_rate": 8.735321043757805e-06, "loss": 0.4688, "step": 1427 }, { "epoch": 1.02, "grad_norm": 10.46229012344293, "learning_rate": 8.73339918742801e-06, "loss": 0.4341, "step": 1428 }, { "epoch": 1.02, "grad_norm": 12.84418719877701, "learning_rate": 8.731476083700154e-06, "loss": 0.4683, "step": 1429 }, { "epoch": 1.02, "grad_norm": 11.59064803571736, "learning_rate": 8.729551733216779e-06, "loss": 0.4229, "step": 1430 }, { "epoch": 1.02, "grad_norm": 10.516324309495134, "learning_rate": 8.727626136620848e-06, "loss": 0.4502, "step": 1431 }, { "epoch": 1.02, "grad_norm": 26.410113834424802, "learning_rate": 8.725699294555739e-06, "loss": 0.5132, "step": 1432 }, { "epoch": 1.02, "grad_norm": 20.34794795255431, "learning_rate": 8.723771207665245e-06, "loss": 0.5312, "step": 1433 }, { "epoch": 1.02, "grad_norm": 13.382266025888864, "learning_rate": 8.721841876593576e-06, "loss": 0.4482, "step": 1434 }, { "epoch": 1.02, "grad_norm": 12.996872293754613, "learning_rate": 8.719911301985355e-06, "loss": 0.4189, "step": 1435 }, { "epoch": 1.02, "grad_norm": 13.616870720209013, "learning_rate": 8.717979484485628e-06, "loss": 0.3623, "step": 1436 }, { "epoch": 1.03, "grad_norm": 18.78460022966816, "learning_rate": 8.716046424739845e-06, "loss": 0.5029, "step": 1437 }, { "epoch": 1.03, "grad_norm": 27.875048810799026, "learning_rate": 8.714112123393882e-06, "loss": 0.5117, "step": 1438 }, { "epoch": 1.03, "grad_norm": 11.085152704221128, "learning_rate": 8.712176581094025e-06, "loss": 0.438, "step": 1439 }, { "epoch": 1.03, "grad_norm": 20.939014408876105, "learning_rate": 8.710239798486972e-06, "loss": 0.5273, "step": 1440 }, { "epoch": 1.03, "grad_norm": 36.2593949826461, "learning_rate": 8.708301776219838e-06, "loss": 0.4185, "step": 1441 }, { "epoch": 1.03, "grad_norm": 112.11982714132783, "learning_rate": 8.706362514940153e-06, "loss": 0.4478, "step": 1442 }, { "epoch": 1.03, "grad_norm": 277.4382397294177, "learning_rate": 8.704422015295861e-06, "loss": 0.5786, "step": 1443 }, { "epoch": 1.03, "grad_norm": 181.53470646624706, "learning_rate": 8.702480277935319e-06, "loss": 0.6011, "step": 1444 }, { "epoch": 1.03, "grad_norm": 15.931282079030481, "learning_rate": 8.700537303507298e-06, "loss": 0.3828, "step": 1445 }, { "epoch": 1.03, "grad_norm": 22.345280510200695, "learning_rate": 8.69859309266098e-06, "loss": 0.54, "step": 1446 }, { "epoch": 1.03, "grad_norm": 16.92976281404644, "learning_rate": 8.696647646045962e-06, "loss": 0.4468, "step": 1447 }, { "epoch": 1.03, "grad_norm": 15.584050288102251, "learning_rate": 8.694700964312257e-06, "loss": 0.499, "step": 1448 }, { "epoch": 1.03, "grad_norm": 9.782706492066882, "learning_rate": 8.692753048110283e-06, "loss": 0.5635, "step": 1449 }, { "epoch": 1.03, "grad_norm": 17.982712514408526, "learning_rate": 8.690803898090878e-06, "loss": 0.4897, "step": 1450 }, { "epoch": 1.04, "grad_norm": 10.009874311456212, "learning_rate": 8.68885351490529e-06, "loss": 0.3774, "step": 1451 }, { "epoch": 1.04, "grad_norm": 16.187289386392372, "learning_rate": 8.686901899205177e-06, "loss": 0.4077, "step": 1452 }, { "epoch": 1.04, "grad_norm": 22.74330679007414, "learning_rate": 8.684949051642609e-06, "loss": 0.4907, "step": 1453 }, { "epoch": 1.04, "grad_norm": 8.533483611225794, "learning_rate": 8.68299497287007e-06, "loss": 0.4238, "step": 1454 }, { "epoch": 1.04, "grad_norm": 22.93205428371546, "learning_rate": 8.681039663540454e-06, "loss": 0.4351, "step": 1455 }, { "epoch": 1.04, "grad_norm": 11.711598620907111, "learning_rate": 8.679083124307064e-06, "loss": 0.321, "step": 1456 }, { "epoch": 1.04, "grad_norm": 20.3063051905801, "learning_rate": 8.67712535582362e-06, "loss": 0.4761, "step": 1457 }, { "epoch": 1.04, "grad_norm": 15.81069964942567, "learning_rate": 8.675166358744247e-06, "loss": 0.4497, "step": 1458 }, { "epoch": 1.04, "grad_norm": 25.172668334190924, "learning_rate": 8.67320613372348e-06, "loss": 0.6162, "step": 1459 }, { "epoch": 1.04, "grad_norm": 13.868299776664779, "learning_rate": 8.67124468141627e-06, "loss": 0.4697, "step": 1460 }, { "epoch": 1.04, "grad_norm": 17.65557462571263, "learning_rate": 8.669282002477975e-06, "loss": 0.5439, "step": 1461 }, { "epoch": 1.04, "grad_norm": 19.986941627654915, "learning_rate": 8.66731809756436e-06, "loss": 0.4077, "step": 1462 }, { "epoch": 1.04, "grad_norm": 13.481751253152993, "learning_rate": 8.665352967331604e-06, "loss": 0.4507, "step": 1463 }, { "epoch": 1.04, "grad_norm": 10.69597798278669, "learning_rate": 8.66338661243629e-06, "loss": 0.4185, "step": 1464 }, { "epoch": 1.05, "grad_norm": 21.12105825698134, "learning_rate": 8.661419033535419e-06, "loss": 0.4966, "step": 1465 }, { "epoch": 1.05, "grad_norm": 15.364831925795514, "learning_rate": 8.659450231286392e-06, "loss": 0.4619, "step": 1466 }, { "epoch": 1.05, "grad_norm": 23.36575882497529, "learning_rate": 8.657480206347024e-06, "loss": 0.478, "step": 1467 }, { "epoch": 1.05, "grad_norm": 19.20617981129387, "learning_rate": 8.655508959375536e-06, "loss": 0.458, "step": 1468 }, { "epoch": 1.05, "grad_norm": 12.332507150136136, "learning_rate": 8.653536491030559e-06, "loss": 0.4453, "step": 1469 }, { "epoch": 1.05, "grad_norm": 8.375737589014147, "learning_rate": 8.651562801971131e-06, "loss": 0.4199, "step": 1470 }, { "epoch": 1.05, "grad_norm": 11.622191769632364, "learning_rate": 8.649587892856698e-06, "loss": 0.4438, "step": 1471 }, { "epoch": 1.05, "grad_norm": 15.028816389671348, "learning_rate": 8.647611764347114e-06, "loss": 0.4634, "step": 1472 }, { "epoch": 1.05, "grad_norm": 51.19574362423425, "learning_rate": 8.64563441710264e-06, "loss": 0.7012, "step": 1473 }, { "epoch": 1.05, "grad_norm": 8.972205154177201, "learning_rate": 8.643655851783947e-06, "loss": 0.3843, "step": 1474 }, { "epoch": 1.05, "grad_norm": 8.72003218429071, "learning_rate": 8.641676069052104e-06, "loss": 0.4072, "step": 1475 }, { "epoch": 1.05, "grad_norm": 14.009750309222879, "learning_rate": 8.639695069568602e-06, "loss": 0.4717, "step": 1476 }, { "epoch": 1.05, "grad_norm": 16.704206269175184, "learning_rate": 8.637712853995324e-06, "loss": 0.4814, "step": 1477 }, { "epoch": 1.05, "grad_norm": 9.870396068272045, "learning_rate": 8.635729422994566e-06, "loss": 0.4634, "step": 1478 }, { "epoch": 1.06, "grad_norm": 11.55132356224423, "learning_rate": 8.633744777229029e-06, "loss": 0.4365, "step": 1479 }, { "epoch": 1.06, "grad_norm": 12.263513038793375, "learning_rate": 8.63175891736182e-06, "loss": 0.3862, "step": 1480 }, { "epoch": 1.06, "grad_norm": 10.735243962302292, "learning_rate": 8.629771844056452e-06, "loss": 0.3691, "step": 1481 }, { "epoch": 1.06, "grad_norm": 11.39389573007216, "learning_rate": 8.627783557976846e-06, "loss": 0.4902, "step": 1482 }, { "epoch": 1.06, "grad_norm": 9.633505897881955, "learning_rate": 8.62579405978732e-06, "loss": 0.4678, "step": 1483 }, { "epoch": 1.06, "grad_norm": 10.95154363916375, "learning_rate": 8.623803350152606e-06, "loss": 0.4326, "step": 1484 }, { "epoch": 1.06, "grad_norm": 12.629591334063445, "learning_rate": 8.621811429737837e-06, "loss": 0.4819, "step": 1485 }, { "epoch": 1.06, "grad_norm": 13.321230859689711, "learning_rate": 8.619818299208548e-06, "loss": 0.3994, "step": 1486 }, { "epoch": 1.06, "grad_norm": 12.680762867215169, "learning_rate": 8.617823959230683e-06, "loss": 0.5298, "step": 1487 }, { "epoch": 1.06, "grad_norm": 14.312614949546656, "learning_rate": 8.615828410470589e-06, "loss": 0.5034, "step": 1488 }, { "epoch": 1.06, "grad_norm": 15.386409700658552, "learning_rate": 8.613831653595013e-06, "loss": 0.5503, "step": 1489 }, { "epoch": 1.06, "grad_norm": 16.553167562663948, "learning_rate": 8.61183368927111e-06, "loss": 0.3765, "step": 1490 }, { "epoch": 1.06, "grad_norm": 11.97547090065287, "learning_rate": 8.609834518166439e-06, "loss": 0.4897, "step": 1491 }, { "epoch": 1.06, "grad_norm": 12.51487310954413, "learning_rate": 8.607834140948958e-06, "loss": 0.4663, "step": 1492 }, { "epoch": 1.07, "grad_norm": 15.020553039764543, "learning_rate": 8.60583255828703e-06, "loss": 0.416, "step": 1493 }, { "epoch": 1.07, "grad_norm": 12.540414928748616, "learning_rate": 8.603829770849421e-06, "loss": 0.5366, "step": 1494 }, { "epoch": 1.07, "grad_norm": 14.386118227859209, "learning_rate": 8.601825779305302e-06, "loss": 0.4199, "step": 1495 }, { "epoch": 1.07, "grad_norm": 15.19089679469395, "learning_rate": 8.59982058432424e-06, "loss": 0.4824, "step": 1496 }, { "epoch": 1.07, "grad_norm": 12.744882941104166, "learning_rate": 8.597814186576212e-06, "loss": 0.4531, "step": 1497 }, { "epoch": 1.07, "grad_norm": 12.998468653111926, "learning_rate": 8.595806586731589e-06, "loss": 0.4771, "step": 1498 }, { "epoch": 1.07, "grad_norm": 21.331660088580012, "learning_rate": 8.59379778546115e-06, "loss": 0.5522, "step": 1499 }, { "epoch": 1.07, "grad_norm": 18.388963075279165, "learning_rate": 8.591787783436073e-06, "loss": 0.4834, "step": 1500 }, { "epoch": 1.07, "eval_avg_AUC": 0.7334799976115187, "eval_avg_Accuracy": 0.6713776525198939, "eval_avg_Accuracy-right": 0.87544019825225, "eval_avg_Accuracy-wrong": 0.31555606095064814, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6231060873343826, "eval_last_AUC": 0.7557780548881416, "eval_last_Accuracy": 0.6933438328912467, "eval_last_Accuracy-right": 0.8278987870092605, "eval_last_Accuracy-wrong": 0.45872185581078007, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6509237397272707, "eval_max_AUC": 0.7050180895122209, "eval_max_Accuracy": 0.6380968169761273, "eval_max_Accuracy-right": 0.947045780618234, "eval_max_Accuracy-wrong": 0.09938594496247441, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.5989895777078, "eval_min_AUC": 0.7259579465041305, "eval_min_Accuracy": 0.663834549071618, "eval_min_Accuracy-right": 0.7148819616538411, "eval_min_Accuracy-wrong": 0.574823743461451, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6227692066316574, "eval_prod_AUC": 0.7346491876627528, "eval_prod_Accuracy": 0.6056863395225465, "eval_prod_Accuracy-right": 0.46087126646667537, "eval_prod_Accuracy-wrong": 0.858198771889925, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6170978703493736, "eval_runtime": 251.2444, "eval_samples_per_second": 96.034, "eval_steps_per_second": 3.001, "eval_sum_AUC": 0.6155480756158993, "eval_sum_Accuracy": 0.6353199602122016, "eval_sum_Accuracy-right": 0.9979783487674448, "eval_sum_Accuracy-wrong": 0.0029565612917898565, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6239868444598957, "step": 1500 }, { "epoch": 1.07, "grad_norm": 18.466477606369253, "learning_rate": 8.589776581327936e-06, "loss": 0.5674, "step": 1501 }, { "epoch": 1.07, "grad_norm": 13.612158852318972, "learning_rate": 8.587764179808716e-06, "loss": 0.4512, "step": 1502 }, { "epoch": 1.07, "grad_norm": 15.617095094049752, "learning_rate": 8.5857505795508e-06, "loss": 0.4932, "step": 1503 }, { "epoch": 1.07, "grad_norm": 20.816769693684805, "learning_rate": 8.583735781226964e-06, "loss": 0.4648, "step": 1504 }, { "epoch": 1.07, "grad_norm": 12.235810035295055, "learning_rate": 8.581719785510391e-06, "loss": 0.4233, "step": 1505 }, { "epoch": 1.07, "grad_norm": 14.94140620816876, "learning_rate": 8.579702593074666e-06, "loss": 0.5186, "step": 1506 }, { "epoch": 1.08, "grad_norm": 14.10601483533534, "learning_rate": 8.577684204593767e-06, "loss": 0.5, "step": 1507 }, { "epoch": 1.08, "grad_norm": 16.588955613794905, "learning_rate": 8.575664620742073e-06, "loss": 0.4282, "step": 1508 }, { "epoch": 1.08, "grad_norm": 16.445563006577682, "learning_rate": 8.57364384219437e-06, "loss": 0.4819, "step": 1509 }, { "epoch": 1.08, "grad_norm": 10.081966627737616, "learning_rate": 8.571621869625835e-06, "loss": 0.4707, "step": 1510 }, { "epoch": 1.08, "grad_norm": 14.676101669447284, "learning_rate": 8.569598703712045e-06, "loss": 0.4351, "step": 1511 }, { "epoch": 1.08, "grad_norm": 11.660950506422358, "learning_rate": 8.56757434512898e-06, "loss": 0.5244, "step": 1512 }, { "epoch": 1.08, "grad_norm": 11.673297608958263, "learning_rate": 8.565548794553016e-06, "loss": 0.4507, "step": 1513 }, { "epoch": 1.08, "grad_norm": 10.604090359301853, "learning_rate": 8.563522052660925e-06, "loss": 0.5532, "step": 1514 }, { "epoch": 1.08, "grad_norm": 17.711609470198233, "learning_rate": 8.561494120129878e-06, "loss": 0.5186, "step": 1515 }, { "epoch": 1.08, "grad_norm": 8.518970050965722, "learning_rate": 8.55946499763745e-06, "loss": 0.4624, "step": 1516 }, { "epoch": 1.08, "grad_norm": 24.622164675000597, "learning_rate": 8.557434685861604e-06, "loss": 0.5537, "step": 1517 }, { "epoch": 1.08, "grad_norm": 10.341729976026523, "learning_rate": 8.555403185480706e-06, "loss": 0.5444, "step": 1518 }, { "epoch": 1.08, "grad_norm": 14.951861865208672, "learning_rate": 8.553370497173518e-06, "loss": 0.499, "step": 1519 }, { "epoch": 1.08, "grad_norm": 6.798380638701046, "learning_rate": 8.551336621619202e-06, "loss": 0.4722, "step": 1520 }, { "epoch": 1.09, "grad_norm": 10.1744411262223, "learning_rate": 8.549301559497309e-06, "loss": 0.4937, "step": 1521 }, { "epoch": 1.09, "grad_norm": 14.356907776973655, "learning_rate": 8.547265311487794e-06, "loss": 0.4932, "step": 1522 }, { "epoch": 1.09, "grad_norm": 11.570800954915105, "learning_rate": 8.545227878271004e-06, "loss": 0.5479, "step": 1523 }, { "epoch": 1.09, "grad_norm": 8.945217800371761, "learning_rate": 8.543189260527685e-06, "loss": 0.4419, "step": 1524 }, { "epoch": 1.09, "grad_norm": 15.810431743115078, "learning_rate": 8.541149458938972e-06, "loss": 0.5332, "step": 1525 }, { "epoch": 1.09, "grad_norm": 15.488127989967904, "learning_rate": 8.539108474186408e-06, "loss": 0.4951, "step": 1526 }, { "epoch": 1.09, "grad_norm": 7.5250746108148165, "learning_rate": 8.53706630695192e-06, "loss": 0.4365, "step": 1527 }, { "epoch": 1.09, "grad_norm": 10.090581047201338, "learning_rate": 8.535022957917833e-06, "loss": 0.4536, "step": 1528 }, { "epoch": 1.09, "grad_norm": 12.713255870212436, "learning_rate": 8.53297842776687e-06, "loss": 0.5649, "step": 1529 }, { "epoch": 1.09, "grad_norm": 14.611298003245253, "learning_rate": 8.530932717182148e-06, "loss": 0.5117, "step": 1530 }, { "epoch": 1.09, "grad_norm": 9.796700782725793, "learning_rate": 8.528885826847173e-06, "loss": 0.4463, "step": 1531 }, { "epoch": 1.09, "grad_norm": 10.108308604467041, "learning_rate": 8.52683775744585e-06, "loss": 0.416, "step": 1532 }, { "epoch": 1.09, "grad_norm": 13.550989790783058, "learning_rate": 8.524788509662478e-06, "loss": 0.4971, "step": 1533 }, { "epoch": 1.09, "grad_norm": 13.558016423382215, "learning_rate": 8.522738084181749e-06, "loss": 0.5479, "step": 1534 }, { "epoch": 1.1, "grad_norm": 9.741126259708553, "learning_rate": 8.52068648168875e-06, "loss": 0.4404, "step": 1535 }, { "epoch": 1.1, "grad_norm": 13.33677197442007, "learning_rate": 8.518633702868955e-06, "loss": 0.4131, "step": 1536 }, { "epoch": 1.1, "grad_norm": 12.431400081236161, "learning_rate": 8.516579748408237e-06, "loss": 0.4629, "step": 1537 }, { "epoch": 1.1, "grad_norm": 11.038672006100304, "learning_rate": 8.514524618992864e-06, "loss": 0.4155, "step": 1538 }, { "epoch": 1.1, "grad_norm": 14.732435386054672, "learning_rate": 8.51246831530949e-06, "loss": 0.4648, "step": 1539 }, { "epoch": 1.1, "grad_norm": 18.788487706437273, "learning_rate": 8.510410838045165e-06, "loss": 0.4658, "step": 1540 }, { "epoch": 1.1, "grad_norm": 13.614847193707238, "learning_rate": 8.508352187887329e-06, "loss": 0.4868, "step": 1541 }, { "epoch": 1.1, "grad_norm": 9.621493961264449, "learning_rate": 8.506292365523816e-06, "loss": 0.4014, "step": 1542 }, { "epoch": 1.1, "grad_norm": 13.908663513591653, "learning_rate": 8.504231371642852e-06, "loss": 0.5405, "step": 1543 }, { "epoch": 1.1, "grad_norm": 13.103135984089345, "learning_rate": 8.502169206933053e-06, "loss": 0.4414, "step": 1544 }, { "epoch": 1.1, "grad_norm": 19.697544626949995, "learning_rate": 8.500105872083424e-06, "loss": 0.4463, "step": 1545 }, { "epoch": 1.1, "grad_norm": 19.462066133716373, "learning_rate": 8.498041367783367e-06, "loss": 0.4731, "step": 1546 }, { "epoch": 1.1, "grad_norm": 11.778716367020898, "learning_rate": 8.49597569472267e-06, "loss": 0.5264, "step": 1547 }, { "epoch": 1.1, "grad_norm": 9.104157085710243, "learning_rate": 8.493908853591515e-06, "loss": 0.4722, "step": 1548 }, { "epoch": 1.11, "grad_norm": 16.793898810134827, "learning_rate": 8.491840845080467e-06, "loss": 0.6045, "step": 1549 }, { "epoch": 1.11, "grad_norm": 13.705523280095832, "learning_rate": 8.489771669880489e-06, "loss": 0.5137, "step": 1550 }, { "epoch": 1.11, "grad_norm": 11.332670535302874, "learning_rate": 8.487701328682932e-06, "loss": 0.4795, "step": 1551 }, { "epoch": 1.11, "grad_norm": 8.589426733756543, "learning_rate": 8.485629822179533e-06, "loss": 0.4575, "step": 1552 }, { "epoch": 1.11, "grad_norm": 13.353226183033948, "learning_rate": 8.483557151062423e-06, "loss": 0.4497, "step": 1553 }, { "epoch": 1.11, "grad_norm": 9.18956399500516, "learning_rate": 8.481483316024117e-06, "loss": 0.4678, "step": 1554 }, { "epoch": 1.11, "grad_norm": 16.368944548115262, "learning_rate": 8.479408317757525e-06, "loss": 0.5283, "step": 1555 }, { "epoch": 1.11, "grad_norm": 10.054711315944372, "learning_rate": 8.477332156955942e-06, "loss": 0.5078, "step": 1556 }, { "epoch": 1.11, "grad_norm": 10.511879702200282, "learning_rate": 8.475254834313051e-06, "loss": 0.5015, "step": 1557 }, { "epoch": 1.11, "grad_norm": 8.3864182168499, "learning_rate": 8.473176350522925e-06, "loss": 0.4126, "step": 1558 }, { "epoch": 1.11, "grad_norm": 10.192652705648785, "learning_rate": 8.471096706280022e-06, "loss": 0.5127, "step": 1559 }, { "epoch": 1.11, "grad_norm": 11.215018804441314, "learning_rate": 8.469015902279191e-06, "loss": 0.4111, "step": 1560 }, { "epoch": 1.11, "grad_norm": 8.060201902069851, "learning_rate": 8.466933939215669e-06, "loss": 0.4883, "step": 1561 }, { "epoch": 1.11, "grad_norm": 11.696202616513581, "learning_rate": 8.464850817785075e-06, "loss": 0.4199, "step": 1562 }, { "epoch": 1.12, "grad_norm": 10.514532333226486, "learning_rate": 8.462766538683422e-06, "loss": 0.4536, "step": 1563 }, { "epoch": 1.12, "grad_norm": 9.329536103759395, "learning_rate": 8.460681102607106e-06, "loss": 0.4072, "step": 1564 }, { "epoch": 1.12, "grad_norm": 18.684227028359807, "learning_rate": 8.45859451025291e-06, "loss": 0.6074, "step": 1565 }, { "epoch": 1.12, "grad_norm": 25.38421632073626, "learning_rate": 8.456506762317998e-06, "loss": 0.7139, "step": 1566 }, { "epoch": 1.12, "grad_norm": 14.286401468541102, "learning_rate": 8.454417859499932e-06, "loss": 0.5562, "step": 1567 }, { "epoch": 1.12, "grad_norm": 10.597284316583334, "learning_rate": 8.45232780249665e-06, "loss": 0.4604, "step": 1568 }, { "epoch": 1.12, "grad_norm": 16.314962157168296, "learning_rate": 8.450236592006481e-06, "loss": 0.4844, "step": 1569 }, { "epoch": 1.12, "grad_norm": 10.11769377664577, "learning_rate": 8.448144228728135e-06, "loss": 0.4971, "step": 1570 }, { "epoch": 1.12, "grad_norm": 15.673405541056715, "learning_rate": 8.446050713360711e-06, "loss": 0.4473, "step": 1571 }, { "epoch": 1.12, "grad_norm": 12.223436085969732, "learning_rate": 8.443956046603692e-06, "loss": 0.54, "step": 1572 }, { "epoch": 1.12, "grad_norm": 8.801140278815467, "learning_rate": 8.441860229156944e-06, "loss": 0.4429, "step": 1573 }, { "epoch": 1.12, "grad_norm": 15.623164408153567, "learning_rate": 8.439763261720716e-06, "loss": 0.6367, "step": 1574 }, { "epoch": 1.12, "grad_norm": 12.285922227520546, "learning_rate": 8.43766514499565e-06, "loss": 0.54, "step": 1575 }, { "epoch": 1.12, "grad_norm": 10.746100363903965, "learning_rate": 8.435565879682759e-06, "loss": 0.5107, "step": 1576 }, { "epoch": 1.13, "grad_norm": 9.979137821257687, "learning_rate": 8.433465466483452e-06, "loss": 0.5464, "step": 1577 }, { "epoch": 1.13, "grad_norm": 7.598799151468013, "learning_rate": 8.431363906099513e-06, "loss": 0.4863, "step": 1578 }, { "epoch": 1.13, "grad_norm": 8.105699816627178, "learning_rate": 8.429261199233114e-06, "loss": 0.4521, "step": 1579 }, { "epoch": 1.13, "grad_norm": 16.9180976781052, "learning_rate": 8.427157346586807e-06, "loss": 0.4756, "step": 1580 }, { "epoch": 1.13, "grad_norm": 15.86495838313817, "learning_rate": 8.42505234886353e-06, "loss": 0.4922, "step": 1581 }, { "epoch": 1.13, "grad_norm": 8.545267152082388, "learning_rate": 8.422946206766598e-06, "loss": 0.4888, "step": 1582 }, { "epoch": 1.13, "grad_norm": 7.713434930793355, "learning_rate": 8.420838920999718e-06, "loss": 0.416, "step": 1583 }, { "epoch": 1.13, "grad_norm": 16.76577412131441, "learning_rate": 8.418730492266968e-06, "loss": 0.5044, "step": 1584 }, { "epoch": 1.13, "grad_norm": 7.502799153950787, "learning_rate": 8.416620921272818e-06, "loss": 0.4326, "step": 1585 }, { "epoch": 1.13, "grad_norm": 15.461253439409639, "learning_rate": 8.414510208722111e-06, "loss": 0.5684, "step": 1586 }, { "epoch": 1.13, "grad_norm": 19.10348435047004, "learning_rate": 8.412398355320078e-06, "loss": 0.5229, "step": 1587 }, { "epoch": 1.13, "grad_norm": 12.506878119981076, "learning_rate": 8.410285361772328e-06, "loss": 0.4419, "step": 1588 }, { "epoch": 1.13, "grad_norm": 9.761222151604644, "learning_rate": 8.408171228784847e-06, "loss": 0.4199, "step": 1589 }, { "epoch": 1.13, "grad_norm": 11.759892100205057, "learning_rate": 8.406055957064014e-06, "loss": 0.4717, "step": 1590 }, { "epoch": 1.14, "grad_norm": 15.051203228423255, "learning_rate": 8.403939547316576e-06, "loss": 0.4541, "step": 1591 }, { "epoch": 1.14, "grad_norm": 8.735328118613461, "learning_rate": 8.401822000249661e-06, "loss": 0.4087, "step": 1592 }, { "epoch": 1.14, "grad_norm": 9.260653736059055, "learning_rate": 8.399703316570788e-06, "loss": 0.4463, "step": 1593 }, { "epoch": 1.14, "grad_norm": 18.10976108191569, "learning_rate": 8.397583496987846e-06, "loss": 0.519, "step": 1594 }, { "epoch": 1.14, "grad_norm": 12.768171024664671, "learning_rate": 8.395462542209106e-06, "loss": 0.4648, "step": 1595 }, { "epoch": 1.14, "grad_norm": 9.026676798546331, "learning_rate": 8.393340452943219e-06, "loss": 0.501, "step": 1596 }, { "epoch": 1.14, "grad_norm": 12.16743911627819, "learning_rate": 8.391217229899211e-06, "loss": 0.5093, "step": 1597 }, { "epoch": 1.14, "grad_norm": 7.830636260029298, "learning_rate": 8.389092873786495e-06, "loss": 0.375, "step": 1598 }, { "epoch": 1.14, "grad_norm": 12.20546214650608, "learning_rate": 8.386967385314857e-06, "loss": 0.4756, "step": 1599 }, { "epoch": 1.14, "grad_norm": 8.439896508899231, "learning_rate": 8.384840765194458e-06, "loss": 0.4346, "step": 1600 }, { "epoch": 1.14, "grad_norm": 19.327934158649416, "learning_rate": 8.382713014135846e-06, "loss": 0.6797, "step": 1601 }, { "epoch": 1.14, "grad_norm": 9.885276182041746, "learning_rate": 8.38058413284994e-06, "loss": 0.5352, "step": 1602 }, { "epoch": 1.14, "grad_norm": 10.596961361196003, "learning_rate": 8.37845412204804e-06, "loss": 0.4497, "step": 1603 }, { "epoch": 1.14, "grad_norm": 11.019827177632346, "learning_rate": 8.376322982441821e-06, "loss": 0.4795, "step": 1604 }, { "epoch": 1.15, "grad_norm": 10.122204287350154, "learning_rate": 8.374190714743338e-06, "loss": 0.3926, "step": 1605 }, { "epoch": 1.15, "grad_norm": 13.991030964363688, "learning_rate": 8.37205731966502e-06, "loss": 0.4463, "step": 1606 }, { "epoch": 1.15, "grad_norm": 8.075319370068696, "learning_rate": 8.369922797919672e-06, "loss": 0.395, "step": 1607 }, { "epoch": 1.15, "grad_norm": 11.044817706502199, "learning_rate": 8.367787150220481e-06, "loss": 0.4814, "step": 1608 }, { "epoch": 1.15, "grad_norm": 9.41670683342002, "learning_rate": 8.365650377281004e-06, "loss": 0.4272, "step": 1609 }, { "epoch": 1.15, "grad_norm": 15.173314309019913, "learning_rate": 8.36351247981518e-06, "loss": 0.4678, "step": 1610 }, { "epoch": 1.15, "grad_norm": 11.550970965767231, "learning_rate": 8.361373458537316e-06, "loss": 0.374, "step": 1611 }, { "epoch": 1.15, "grad_norm": 11.034925091044531, "learning_rate": 8.359233314162102e-06, "loss": 0.439, "step": 1612 }, { "epoch": 1.15, "grad_norm": 8.425295339263013, "learning_rate": 8.357092047404598e-06, "loss": 0.3662, "step": 1613 }, { "epoch": 1.15, "grad_norm": 10.31408262861963, "learning_rate": 8.354949658980243e-06, "loss": 0.4409, "step": 1614 }, { "epoch": 1.15, "grad_norm": 12.260734469839917, "learning_rate": 8.352806149604847e-06, "loss": 0.3794, "step": 1615 }, { "epoch": 1.15, "grad_norm": 14.468850196741588, "learning_rate": 8.350661519994596e-06, "loss": 0.6748, "step": 1616 }, { "epoch": 1.15, "grad_norm": 21.792508635270057, "learning_rate": 8.348515770866051e-06, "loss": 0.564, "step": 1617 }, { "epoch": 1.15, "grad_norm": 10.705560827563696, "learning_rate": 8.346368902936149e-06, "loss": 0.5049, "step": 1618 }, { "epoch": 1.16, "grad_norm": 10.253537791784684, "learning_rate": 8.344220916922195e-06, "loss": 0.4131, "step": 1619 }, { "epoch": 1.16, "grad_norm": 9.961687115125363, "learning_rate": 8.342071813541873e-06, "loss": 0.3879, "step": 1620 }, { "epoch": 1.16, "grad_norm": 21.258364878364446, "learning_rate": 8.339921593513239e-06, "loss": 0.5815, "step": 1621 }, { "epoch": 1.16, "grad_norm": 8.105088701540607, "learning_rate": 8.337770257554721e-06, "loss": 0.4336, "step": 1622 }, { "epoch": 1.16, "grad_norm": 10.40871548874701, "learning_rate": 8.335617806385119e-06, "loss": 0.3545, "step": 1623 }, { "epoch": 1.16, "grad_norm": 8.767330324784009, "learning_rate": 8.333464240723608e-06, "loss": 0.4673, "step": 1624 }, { "epoch": 1.16, "grad_norm": 11.648199165290603, "learning_rate": 8.331309561289734e-06, "loss": 0.4395, "step": 1625 }, { "epoch": 1.16, "grad_norm": 11.32741559592417, "learning_rate": 8.329153768803415e-06, "loss": 0.4731, "step": 1626 }, { "epoch": 1.16, "grad_norm": 12.778161659116726, "learning_rate": 8.326996863984942e-06, "loss": 0.5933, "step": 1627 }, { "epoch": 1.16, "grad_norm": 12.348733791355947, "learning_rate": 8.324838847554976e-06, "loss": 0.4395, "step": 1628 }, { "epoch": 1.16, "grad_norm": 12.740098579497312, "learning_rate": 8.322679720234553e-06, "loss": 0.3857, "step": 1629 }, { "epoch": 1.16, "grad_norm": 23.715818168299126, "learning_rate": 8.320519482745076e-06, "loss": 0.4917, "step": 1630 }, { "epoch": 1.16, "grad_norm": 11.55961555884494, "learning_rate": 8.31835813580832e-06, "loss": 0.416, "step": 1631 }, { "epoch": 1.16, "grad_norm": 17.584012997946722, "learning_rate": 8.316195680146431e-06, "loss": 0.5322, "step": 1632 }, { "epoch": 1.17, "grad_norm": 20.513206719088476, "learning_rate": 8.314032116481927e-06, "loss": 0.5117, "step": 1633 }, { "epoch": 1.17, "grad_norm": 15.048893376656899, "learning_rate": 8.311867445537694e-06, "loss": 0.4272, "step": 1634 }, { "epoch": 1.17, "grad_norm": 9.270871090848454, "learning_rate": 8.30970166803699e-06, "loss": 0.4766, "step": 1635 }, { "epoch": 1.17, "grad_norm": 10.569630501193533, "learning_rate": 8.307534784703438e-06, "loss": 0.4082, "step": 1636 }, { "epoch": 1.17, "grad_norm": 12.423256634220007, "learning_rate": 8.305366796261036e-06, "loss": 0.4019, "step": 1637 }, { "epoch": 1.17, "grad_norm": 19.689591987413117, "learning_rate": 8.303197703434151e-06, "loss": 0.5371, "step": 1638 }, { "epoch": 1.17, "grad_norm": 16.2646267025812, "learning_rate": 8.301027506947516e-06, "loss": 0.5225, "step": 1639 }, { "epoch": 1.17, "grad_norm": 10.004304702541242, "learning_rate": 8.298856207526234e-06, "loss": 0.5005, "step": 1640 }, { "epoch": 1.17, "grad_norm": 9.275278524089247, "learning_rate": 8.296683805895777e-06, "loss": 0.4683, "step": 1641 }, { "epoch": 1.17, "grad_norm": 15.349760402537493, "learning_rate": 8.294510302781984e-06, "loss": 0.4644, "step": 1642 }, { "epoch": 1.17, "grad_norm": 14.34559489538677, "learning_rate": 8.29233569891106e-06, "loss": 0.4131, "step": 1643 }, { "epoch": 1.17, "grad_norm": 15.09744536399142, "learning_rate": 8.290159995009586e-06, "loss": 0.4858, "step": 1644 }, { "epoch": 1.17, "grad_norm": 8.573866063978919, "learning_rate": 8.2879831918045e-06, "loss": 0.4844, "step": 1645 }, { "epoch": 1.17, "grad_norm": 15.655351070066807, "learning_rate": 8.285805290023119e-06, "loss": 0.4937, "step": 1646 }, { "epoch": 1.18, "grad_norm": 8.922336465989796, "learning_rate": 8.283626290393112e-06, "loss": 0.5044, "step": 1647 }, { "epoch": 1.18, "grad_norm": 8.600737753837139, "learning_rate": 8.28144619364253e-06, "loss": 0.4478, "step": 1648 }, { "epoch": 1.18, "grad_norm": 21.9710210367027, "learning_rate": 8.279265000499783e-06, "loss": 0.5781, "step": 1649 }, { "epoch": 1.18, "grad_norm": 9.960701863701635, "learning_rate": 8.277082711693645e-06, "loss": 0.5278, "step": 1650 }, { "epoch": 1.18, "grad_norm": 11.16288939323412, "learning_rate": 8.274899327953261e-06, "loss": 0.4927, "step": 1651 }, { "epoch": 1.18, "grad_norm": 28.608511125234113, "learning_rate": 8.272714850008142e-06, "loss": 0.6494, "step": 1652 }, { "epoch": 1.18, "grad_norm": 7.7112136057185765, "learning_rate": 8.270529278588158e-06, "loss": 0.4153, "step": 1653 }, { "epoch": 1.18, "grad_norm": 8.677455868257633, "learning_rate": 8.268342614423553e-06, "loss": 0.4663, "step": 1654 }, { "epoch": 1.18, "grad_norm": 17.614598532387653, "learning_rate": 8.26615485824493e-06, "loss": 0.4663, "step": 1655 }, { "epoch": 1.18, "grad_norm": 8.086291528498144, "learning_rate": 8.263966010783259e-06, "loss": 0.4448, "step": 1656 }, { "epoch": 1.18, "grad_norm": 7.52793141998646, "learning_rate": 8.261776072769878e-06, "loss": 0.4453, "step": 1657 }, { "epoch": 1.18, "grad_norm": 10.71379532658703, "learning_rate": 8.259585044936484e-06, "loss": 0.4429, "step": 1658 }, { "epoch": 1.18, "grad_norm": 11.139946386227027, "learning_rate": 8.257392928015138e-06, "loss": 0.4644, "step": 1659 }, { "epoch": 1.18, "grad_norm": 12.658585729760876, "learning_rate": 8.25519972273827e-06, "loss": 0.4727, "step": 1660 }, { "epoch": 1.19, "grad_norm": 10.794428336250439, "learning_rate": 8.253005429838667e-06, "loss": 0.4209, "step": 1661 }, { "epoch": 1.19, "grad_norm": 10.775021748055948, "learning_rate": 8.250810050049488e-06, "loss": 0.4678, "step": 1662 }, { "epoch": 1.19, "grad_norm": 14.921459024373188, "learning_rate": 8.248613584104245e-06, "loss": 0.4731, "step": 1663 }, { "epoch": 1.19, "grad_norm": 10.454721792025683, "learning_rate": 8.246416032736824e-06, "loss": 0.4658, "step": 1664 }, { "epoch": 1.19, "grad_norm": 8.732194232724195, "learning_rate": 8.244217396681461e-06, "loss": 0.3638, "step": 1665 }, { "epoch": 1.19, "grad_norm": 12.820566534030801, "learning_rate": 8.242017676672766e-06, "loss": 0.4893, "step": 1666 }, { "epoch": 1.19, "grad_norm": 20.00873769637777, "learning_rate": 8.239816873445705e-06, "loss": 0.4873, "step": 1667 }, { "epoch": 1.19, "grad_norm": 10.979873575388726, "learning_rate": 8.237614987735607e-06, "loss": 0.3708, "step": 1668 }, { "epoch": 1.19, "grad_norm": 13.872748317763673, "learning_rate": 8.235412020278164e-06, "loss": 0.397, "step": 1669 }, { "epoch": 1.19, "grad_norm": 16.271402719404538, "learning_rate": 8.233207971809427e-06, "loss": 0.3921, "step": 1670 }, { "epoch": 1.19, "grad_norm": 10.329697166946847, "learning_rate": 8.23100284306581e-06, "loss": 0.3176, "step": 1671 }, { "epoch": 1.19, "grad_norm": 18.026001535157672, "learning_rate": 8.228796634784086e-06, "loss": 0.5127, "step": 1672 }, { "epoch": 1.19, "grad_norm": 16.380805016305548, "learning_rate": 8.226589347701396e-06, "loss": 0.3657, "step": 1673 }, { "epoch": 1.19, "grad_norm": 38.080808210850535, "learning_rate": 8.224380982555226e-06, "loss": 0.4443, "step": 1674 }, { "epoch": 1.2, "grad_norm": 18.3361844592776, "learning_rate": 8.222171540083442e-06, "loss": 0.5322, "step": 1675 }, { "epoch": 1.2, "grad_norm": 13.113199868710224, "learning_rate": 8.219961021024251e-06, "loss": 0.4336, "step": 1676 }, { "epoch": 1.2, "grad_norm": 12.666400780810209, "learning_rate": 8.217749426116238e-06, "loss": 0.5059, "step": 1677 }, { "epoch": 1.2, "grad_norm": 8.936998665809853, "learning_rate": 8.215536756098327e-06, "loss": 0.4058, "step": 1678 }, { "epoch": 1.2, "grad_norm": 17.828959837607, "learning_rate": 8.21332301170982e-06, "loss": 0.4595, "step": 1679 }, { "epoch": 1.2, "grad_norm": 10.325958282764173, "learning_rate": 8.211108193690369e-06, "loss": 0.4141, "step": 1680 }, { "epoch": 1.2, "grad_norm": 12.793987371835103, "learning_rate": 8.208892302779982e-06, "loss": 0.5151, "step": 1681 }, { "epoch": 1.2, "grad_norm": 13.843795834098652, "learning_rate": 8.206675339719034e-06, "loss": 0.4849, "step": 1682 }, { "epoch": 1.2, "grad_norm": 15.74955973168439, "learning_rate": 8.204457305248253e-06, "loss": 0.499, "step": 1683 }, { "epoch": 1.2, "grad_norm": 12.444677514125921, "learning_rate": 8.202238200108721e-06, "loss": 0.5122, "step": 1684 }, { "epoch": 1.2, "grad_norm": 11.718027155024345, "learning_rate": 8.200018025041887e-06, "loss": 0.501, "step": 1685 }, { "epoch": 1.2, "grad_norm": 13.98702758856593, "learning_rate": 8.19779678078955e-06, "loss": 0.4941, "step": 1686 }, { "epoch": 1.2, "grad_norm": 20.583545079481684, "learning_rate": 8.195574468093872e-06, "loss": 0.4937, "step": 1687 }, { "epoch": 1.2, "grad_norm": 10.339228146259074, "learning_rate": 8.193351087697366e-06, "loss": 0.4468, "step": 1688 }, { "epoch": 1.21, "grad_norm": 14.124375880053021, "learning_rate": 8.191126640342906e-06, "loss": 0.4336, "step": 1689 }, { "epoch": 1.21, "grad_norm": 11.822649187564727, "learning_rate": 8.18890112677372e-06, "loss": 0.4316, "step": 1690 }, { "epoch": 1.21, "grad_norm": 12.84834184460974, "learning_rate": 8.186674547733398e-06, "loss": 0.5522, "step": 1691 }, { "epoch": 1.21, "grad_norm": 20.928740644270853, "learning_rate": 8.184446903965875e-06, "loss": 0.4897, "step": 1692 }, { "epoch": 1.21, "grad_norm": 9.767092152016358, "learning_rate": 8.182218196215452e-06, "loss": 0.5, "step": 1693 }, { "epoch": 1.21, "grad_norm": 11.47958841486727, "learning_rate": 8.17998842522678e-06, "loss": 0.417, "step": 1694 }, { "epoch": 1.21, "grad_norm": 11.010687951365263, "learning_rate": 8.17775759174487e-06, "loss": 0.5488, "step": 1695 }, { "epoch": 1.21, "grad_norm": 18.742980213365875, "learning_rate": 8.17552569651508e-06, "loss": 0.4692, "step": 1696 }, { "epoch": 1.21, "grad_norm": 13.073207754471264, "learning_rate": 8.173292740283135e-06, "loss": 0.48, "step": 1697 }, { "epoch": 1.21, "grad_norm": 10.048458414659237, "learning_rate": 8.171058723795097e-06, "loss": 0.4868, "step": 1698 }, { "epoch": 1.21, "grad_norm": 18.883107102405994, "learning_rate": 8.168823647797401e-06, "loss": 0.5278, "step": 1699 }, { "epoch": 1.21, "grad_norm": 12.307579695529716, "learning_rate": 8.166587513036826e-06, "loss": 0.5342, "step": 1700 }, { "epoch": 1.21, "grad_norm": 14.099245148537669, "learning_rate": 8.164350320260502e-06, "loss": 0.3953, "step": 1701 }, { "epoch": 1.21, "grad_norm": 20.941909833565685, "learning_rate": 8.16211207021592e-06, "loss": 0.5112, "step": 1702 }, { "epoch": 1.22, "grad_norm": 7.820604772453076, "learning_rate": 8.15987276365092e-06, "loss": 0.4077, "step": 1703 }, { "epoch": 1.22, "grad_norm": 9.473817682607748, "learning_rate": 8.157632401313696e-06, "loss": 0.479, "step": 1704 }, { "epoch": 1.22, "grad_norm": 10.74411226039115, "learning_rate": 8.155390983952795e-06, "loss": 0.5112, "step": 1705 }, { "epoch": 1.22, "grad_norm": 7.227050478751384, "learning_rate": 8.153148512317117e-06, "loss": 0.3857, "step": 1706 }, { "epoch": 1.22, "grad_norm": 15.200446672036492, "learning_rate": 8.150904987155911e-06, "loss": 0.5029, "step": 1707 }, { "epoch": 1.22, "grad_norm": 17.4110528802431, "learning_rate": 8.148660409218786e-06, "loss": 0.5918, "step": 1708 }, { "epoch": 1.22, "grad_norm": 8.448382094255253, "learning_rate": 8.146414779255689e-06, "loss": 0.4707, "step": 1709 }, { "epoch": 1.22, "grad_norm": 7.909316780039679, "learning_rate": 8.144168098016933e-06, "loss": 0.4331, "step": 1710 }, { "epoch": 1.22, "grad_norm": 12.433238461814103, "learning_rate": 8.141920366253173e-06, "loss": 0.5918, "step": 1711 }, { "epoch": 1.22, "grad_norm": 10.771308440459347, "learning_rate": 8.139671584715419e-06, "loss": 0.5146, "step": 1712 }, { "epoch": 1.22, "grad_norm": 9.892728317342609, "learning_rate": 8.137421754155031e-06, "loss": 0.5664, "step": 1713 }, { "epoch": 1.22, "grad_norm": 12.318633500646037, "learning_rate": 8.13517087532372e-06, "loss": 0.4043, "step": 1714 }, { "epoch": 1.22, "grad_norm": 9.17077151093477, "learning_rate": 8.132918948973543e-06, "loss": 0.4736, "step": 1715 }, { "epoch": 1.22, "grad_norm": 20.996362868045267, "learning_rate": 8.130665975856913e-06, "loss": 0.5215, "step": 1716 }, { "epoch": 1.23, "grad_norm": 11.616605183075755, "learning_rate": 8.128411956726592e-06, "loss": 0.5415, "step": 1717 }, { "epoch": 1.23, "grad_norm": 9.643006691319318, "learning_rate": 8.126156892335686e-06, "loss": 0.4834, "step": 1718 }, { "epoch": 1.23, "grad_norm": 11.195645679300869, "learning_rate": 8.123900783437655e-06, "loss": 0.5327, "step": 1719 }, { "epoch": 1.23, "grad_norm": 13.905175950266107, "learning_rate": 8.121643630786308e-06, "loss": 0.5674, "step": 1720 }, { "epoch": 1.23, "grad_norm": 11.886407787375669, "learning_rate": 8.1193854351358e-06, "loss": 0.4756, "step": 1721 }, { "epoch": 1.23, "grad_norm": 13.66213829319637, "learning_rate": 8.11712619724064e-06, "loss": 0.4854, "step": 1722 }, { "epoch": 1.23, "grad_norm": 11.919219445746402, "learning_rate": 8.114865917855676e-06, "loss": 0.4976, "step": 1723 }, { "epoch": 1.23, "grad_norm": 11.518986982659731, "learning_rate": 8.112604597736113e-06, "loss": 0.4453, "step": 1724 }, { "epoch": 1.23, "grad_norm": 8.744337753421034, "learning_rate": 8.110342237637501e-06, "loss": 0.4009, "step": 1725 }, { "epoch": 1.23, "grad_norm": 14.907567080982792, "learning_rate": 8.108078838315732e-06, "loss": 0.4307, "step": 1726 }, { "epoch": 1.23, "grad_norm": 14.419935864909833, "learning_rate": 8.105814400527052e-06, "loss": 0.415, "step": 1727 }, { "epoch": 1.23, "grad_norm": 13.496398651006333, "learning_rate": 8.103548925028054e-06, "loss": 0.5171, "step": 1728 }, { "epoch": 1.23, "grad_norm": 11.775365355328299, "learning_rate": 8.101282412575673e-06, "loss": 0.4819, "step": 1729 }, { "epoch": 1.23, "grad_norm": 10.67720477465964, "learning_rate": 8.099014863927192e-06, "loss": 0.3726, "step": 1730 }, { "epoch": 1.24, "grad_norm": 10.790063693984372, "learning_rate": 8.096746279840245e-06, "loss": 0.4429, "step": 1731 }, { "epoch": 1.24, "grad_norm": 15.660047686678814, "learning_rate": 8.094476661072806e-06, "loss": 0.4814, "step": 1732 }, { "epoch": 1.24, "grad_norm": 20.596433199232035, "learning_rate": 8.092206008383195e-06, "loss": 0.5518, "step": 1733 }, { "epoch": 1.24, "grad_norm": 16.474511824070156, "learning_rate": 8.089934322530082e-06, "loss": 0.5264, "step": 1734 }, { "epoch": 1.24, "grad_norm": 14.071167596382951, "learning_rate": 8.087661604272477e-06, "loss": 0.5366, "step": 1735 }, { "epoch": 1.24, "grad_norm": 10.404014720227183, "learning_rate": 8.08538785436974e-06, "loss": 0.5059, "step": 1736 }, { "epoch": 1.24, "grad_norm": 15.235368526664493, "learning_rate": 8.08311307358157e-06, "loss": 0.5933, "step": 1737 }, { "epoch": 1.24, "grad_norm": 11.232707612320379, "learning_rate": 8.080837262668017e-06, "loss": 0.4341, "step": 1738 }, { "epoch": 1.24, "grad_norm": 12.513187841921981, "learning_rate": 8.078560422389472e-06, "loss": 0.4668, "step": 1739 }, { "epoch": 1.24, "grad_norm": 8.808341804249373, "learning_rate": 8.076282553506664e-06, "loss": 0.4595, "step": 1740 }, { "epoch": 1.24, "grad_norm": 8.722292004822773, "learning_rate": 8.074003656780678e-06, "loss": 0.5083, "step": 1741 }, { "epoch": 1.24, "grad_norm": 9.773723618455808, "learning_rate": 8.071723732972933e-06, "loss": 0.499, "step": 1742 }, { "epoch": 1.24, "grad_norm": 18.009490592834382, "learning_rate": 8.069442782845191e-06, "loss": 0.6309, "step": 1743 }, { "epoch": 1.24, "grad_norm": 9.228749331770816, "learning_rate": 8.067160807159566e-06, "loss": 0.5264, "step": 1744 }, { "epoch": 1.25, "grad_norm": 7.760608376818505, "learning_rate": 8.064877806678504e-06, "loss": 0.4751, "step": 1745 }, { "epoch": 1.25, "grad_norm": 9.86883350705853, "learning_rate": 8.062593782164798e-06, "loss": 0.5444, "step": 1746 }, { "epoch": 1.25, "grad_norm": 12.526705575487213, "learning_rate": 8.060308734381585e-06, "loss": 0.52, "step": 1747 }, { "epoch": 1.25, "grad_norm": 7.766194944671241, "learning_rate": 8.05802266409234e-06, "loss": 0.4355, "step": 1748 }, { "epoch": 1.25, "grad_norm": 21.32642846265708, "learning_rate": 8.055735572060883e-06, "loss": 0.4399, "step": 1749 }, { "epoch": 1.25, "grad_norm": 13.768520958766539, "learning_rate": 8.053447459051374e-06, "loss": 0.4658, "step": 1750 }, { "epoch": 1.25, "grad_norm": 7.417625812427548, "learning_rate": 8.051158325828315e-06, "loss": 0.4585, "step": 1751 }, { "epoch": 1.25, "grad_norm": 7.241606084022609, "learning_rate": 8.048868173156546e-06, "loss": 0.4858, "step": 1752 }, { "epoch": 1.25, "grad_norm": 7.691430849772492, "learning_rate": 8.046577001801248e-06, "loss": 0.4819, "step": 1753 }, { "epoch": 1.25, "grad_norm": 22.879141711500264, "learning_rate": 8.044284812527949e-06, "loss": 0.5547, "step": 1754 }, { "epoch": 1.25, "grad_norm": 21.480942295074424, "learning_rate": 8.041991606102507e-06, "loss": 0.6348, "step": 1755 }, { "epoch": 1.25, "grad_norm": 9.43827233604463, "learning_rate": 8.039697383291127e-06, "loss": 0.4663, "step": 1756 }, { "epoch": 1.25, "grad_norm": 8.948768279992116, "learning_rate": 8.037402144860353e-06, "loss": 0.4976, "step": 1757 }, { "epoch": 1.25, "grad_norm": 7.369467383801608, "learning_rate": 8.035105891577064e-06, "loss": 0.4478, "step": 1758 }, { "epoch": 1.26, "grad_norm": 9.310600570034742, "learning_rate": 8.032808624208485e-06, "loss": 0.4702, "step": 1759 }, { "epoch": 1.26, "grad_norm": 19.0022124954022, "learning_rate": 8.030510343522172e-06, "loss": 0.4883, "step": 1760 }, { "epoch": 1.26, "grad_norm": 11.2828868419992, "learning_rate": 8.02821105028602e-06, "loss": 0.4268, "step": 1761 }, { "epoch": 1.26, "grad_norm": 10.03094557514591, "learning_rate": 8.025910745268276e-06, "loss": 0.5122, "step": 1762 }, { "epoch": 1.26, "grad_norm": 13.600050046217289, "learning_rate": 8.023609429237504e-06, "loss": 0.3862, "step": 1763 }, { "epoch": 1.26, "grad_norm": 7.4892743731139815, "learning_rate": 8.021307102962623e-06, "loss": 0.4751, "step": 1764 }, { "epoch": 1.26, "grad_norm": 17.988405551016548, "learning_rate": 8.019003767212881e-06, "loss": 0.4453, "step": 1765 }, { "epoch": 1.26, "grad_norm": 25.122334044245886, "learning_rate": 8.016699422757865e-06, "loss": 0.5967, "step": 1766 }, { "epoch": 1.26, "grad_norm": 19.211595252663454, "learning_rate": 8.014394070367499e-06, "loss": 0.5542, "step": 1767 }, { "epoch": 1.26, "grad_norm": 8.735010608127107, "learning_rate": 8.012087710812047e-06, "loss": 0.4873, "step": 1768 }, { "epoch": 1.26, "grad_norm": 11.903117170564997, "learning_rate": 8.009780344862101e-06, "loss": 0.5122, "step": 1769 }, { "epoch": 1.26, "grad_norm": 12.873539053988267, "learning_rate": 8.0074719732886e-06, "loss": 0.4502, "step": 1770 }, { "epoch": 1.26, "grad_norm": 16.28213540501501, "learning_rate": 8.005162596862812e-06, "loss": 0.4248, "step": 1771 }, { "epoch": 1.26, "grad_norm": 22.486287226040226, "learning_rate": 8.002852216356343e-06, "loss": 0.5264, "step": 1772 }, { "epoch": 1.27, "grad_norm": 12.996115895199278, "learning_rate": 8.000540832541132e-06, "loss": 0.4854, "step": 1773 }, { "epoch": 1.27, "grad_norm": 11.038573416312534, "learning_rate": 7.99822844618946e-06, "loss": 0.4604, "step": 1774 }, { "epoch": 1.27, "grad_norm": 10.159547013032094, "learning_rate": 7.995915058073933e-06, "loss": 0.5234, "step": 1775 }, { "epoch": 1.27, "grad_norm": 20.659676859984405, "learning_rate": 7.9936006689675e-06, "loss": 0.4878, "step": 1776 }, { "epoch": 1.27, "grad_norm": 11.634718738069445, "learning_rate": 7.99128527964344e-06, "loss": 0.5059, "step": 1777 }, { "epoch": 1.27, "grad_norm": 18.586703189973363, "learning_rate": 7.988968890875368e-06, "loss": 0.479, "step": 1778 }, { "epoch": 1.27, "grad_norm": 10.900105375633403, "learning_rate": 7.986651503437233e-06, "loss": 0.4824, "step": 1779 }, { "epoch": 1.27, "grad_norm": 8.814400149182877, "learning_rate": 7.984333118103318e-06, "loss": 0.4585, "step": 1780 }, { "epoch": 1.27, "grad_norm": 7.089228994356707, "learning_rate": 7.982013735648235e-06, "loss": 0.3984, "step": 1781 }, { "epoch": 1.27, "grad_norm": 11.909003750271049, "learning_rate": 7.979693356846937e-06, "loss": 0.6475, "step": 1782 }, { "epoch": 1.27, "grad_norm": 20.987506759661084, "learning_rate": 7.977371982474705e-06, "loss": 0.5854, "step": 1783 }, { "epoch": 1.27, "grad_norm": 9.953186291279971, "learning_rate": 7.975049613307151e-06, "loss": 0.5356, "step": 1784 }, { "epoch": 1.27, "grad_norm": 10.353919440549886, "learning_rate": 7.972726250120225e-06, "loss": 0.4849, "step": 1785 }, { "epoch": 1.27, "grad_norm": 10.610513184195263, "learning_rate": 7.970401893690202e-06, "loss": 0.5479, "step": 1786 }, { "epoch": 1.28, "grad_norm": 12.063317949232149, "learning_rate": 7.968076544793696e-06, "loss": 0.4551, "step": 1787 }, { "epoch": 1.28, "grad_norm": 8.971498295810411, "learning_rate": 7.965750204207647e-06, "loss": 0.418, "step": 1788 }, { "epoch": 1.28, "grad_norm": 8.315924369534915, "learning_rate": 7.96342287270933e-06, "loss": 0.4507, "step": 1789 }, { "epoch": 1.28, "grad_norm": 7.953940518033675, "learning_rate": 7.96109455107635e-06, "loss": 0.3809, "step": 1790 }, { "epoch": 1.28, "grad_norm": 22.05948822429463, "learning_rate": 7.958765240086639e-06, "loss": 0.5605, "step": 1791 }, { "epoch": 1.28, "grad_norm": 12.700046157464467, "learning_rate": 7.956434940518468e-06, "loss": 0.4351, "step": 1792 }, { "epoch": 1.28, "grad_norm": 10.454966939327795, "learning_rate": 7.954103653150432e-06, "loss": 0.5576, "step": 1793 }, { "epoch": 1.28, "grad_norm": 9.619801280870984, "learning_rate": 7.951771378761455e-06, "loss": 0.418, "step": 1794 }, { "epoch": 1.28, "grad_norm": 16.520532675801153, "learning_rate": 7.949438118130797e-06, "loss": 0.6309, "step": 1795 }, { "epoch": 1.28, "grad_norm": 16.82962380647751, "learning_rate": 7.94710387203804e-06, "loss": 0.5132, "step": 1796 }, { "epoch": 1.28, "grad_norm": 15.229460325358946, "learning_rate": 7.944768641263101e-06, "loss": 0.438, "step": 1797 }, { "epoch": 1.28, "grad_norm": 18.18358063129885, "learning_rate": 7.942432426586224e-06, "loss": 0.479, "step": 1798 }, { "epoch": 1.28, "grad_norm": 10.038226449592317, "learning_rate": 7.94009522878798e-06, "loss": 0.458, "step": 1799 }, { "epoch": 1.28, "grad_norm": 12.23538850601863, "learning_rate": 7.937757048649274e-06, "loss": 0.5488, "step": 1800 }, { "epoch": 1.29, "grad_norm": 19.148183869442587, "learning_rate": 7.935417886951332e-06, "loss": 0.5342, "step": 1801 }, { "epoch": 1.29, "grad_norm": 9.068638115217762, "learning_rate": 7.933077744475713e-06, "loss": 0.4331, "step": 1802 }, { "epoch": 1.29, "grad_norm": 15.673804499921573, "learning_rate": 7.930736622004301e-06, "loss": 0.4766, "step": 1803 }, { "epoch": 1.29, "grad_norm": 16.854117836669452, "learning_rate": 7.928394520319311e-06, "loss": 0.5518, "step": 1804 }, { "epoch": 1.29, "grad_norm": 9.307530919938724, "learning_rate": 7.926051440203278e-06, "loss": 0.4248, "step": 1805 }, { "epoch": 1.29, "grad_norm": 9.980958338000624, "learning_rate": 7.923707382439073e-06, "loss": 0.436, "step": 1806 }, { "epoch": 1.29, "grad_norm": 8.178450156206857, "learning_rate": 7.921362347809888e-06, "loss": 0.4692, "step": 1807 }, { "epoch": 1.29, "grad_norm": 9.204168068608656, "learning_rate": 7.919016337099242e-06, "loss": 0.4541, "step": 1808 }, { "epoch": 1.29, "grad_norm": 8.081522614205248, "learning_rate": 7.916669351090981e-06, "loss": 0.4487, "step": 1809 }, { "epoch": 1.29, "grad_norm": 13.916600263727217, "learning_rate": 7.914321390569278e-06, "loss": 0.4282, "step": 1810 }, { "epoch": 1.29, "grad_norm": 7.60473063045003, "learning_rate": 7.911972456318629e-06, "loss": 0.437, "step": 1811 }, { "epoch": 1.29, "grad_norm": 9.650718167681877, "learning_rate": 7.909622549123855e-06, "loss": 0.4644, "step": 1812 }, { "epoch": 1.29, "grad_norm": 11.491205118287587, "learning_rate": 7.907271669770107e-06, "loss": 0.4053, "step": 1813 }, { "epoch": 1.29, "grad_norm": 9.07315325683419, "learning_rate": 7.904919819042855e-06, "loss": 0.4756, "step": 1814 }, { "epoch": 1.3, "grad_norm": 13.101956566078766, "learning_rate": 7.902566997727896e-06, "loss": 0.4263, "step": 1815 }, { "epoch": 1.3, "grad_norm": 12.528361439531825, "learning_rate": 7.900213206611353e-06, "loss": 0.4673, "step": 1816 }, { "epoch": 1.3, "grad_norm": 11.82005333902504, "learning_rate": 7.897858446479672e-06, "loss": 0.4355, "step": 1817 }, { "epoch": 1.3, "grad_norm": 24.150479129083248, "learning_rate": 7.895502718119618e-06, "loss": 0.5068, "step": 1818 }, { "epoch": 1.3, "grad_norm": 8.074857103355468, "learning_rate": 7.89314602231829e-06, "loss": 0.4043, "step": 1819 }, { "epoch": 1.3, "grad_norm": 10.774025110056453, "learning_rate": 7.8907883598631e-06, "loss": 0.4058, "step": 1820 }, { "epoch": 1.3, "grad_norm": 16.169418137670785, "learning_rate": 7.888429731541784e-06, "loss": 0.5068, "step": 1821 }, { "epoch": 1.3, "grad_norm": 12.638373472077571, "learning_rate": 7.886070138142407e-06, "loss": 0.4453, "step": 1822 }, { "epoch": 1.3, "grad_norm": 14.724664699868582, "learning_rate": 7.883709580453354e-06, "loss": 0.5068, "step": 1823 }, { "epoch": 1.3, "grad_norm": 14.422813863600243, "learning_rate": 7.88134805926333e-06, "loss": 0.4424, "step": 1824 }, { "epoch": 1.3, "grad_norm": 11.591815386989563, "learning_rate": 7.878985575361362e-06, "loss": 0.4644, "step": 1825 }, { "epoch": 1.3, "grad_norm": 12.579472910793811, "learning_rate": 7.876622129536801e-06, "loss": 0.4619, "step": 1826 }, { "epoch": 1.3, "grad_norm": 12.59809088506851, "learning_rate": 7.874257722579319e-06, "loss": 0.5039, "step": 1827 }, { "epoch": 1.3, "grad_norm": 18.124095864166325, "learning_rate": 7.871892355278906e-06, "loss": 0.4302, "step": 1828 }, { "epoch": 1.31, "grad_norm": 8.067465357679238, "learning_rate": 7.869526028425878e-06, "loss": 0.5254, "step": 1829 }, { "epoch": 1.31, "grad_norm": 10.94843976061978, "learning_rate": 7.867158742810866e-06, "loss": 0.4839, "step": 1830 }, { "epoch": 1.31, "grad_norm": 8.151059746712043, "learning_rate": 7.864790499224825e-06, "loss": 0.437, "step": 1831 }, { "epoch": 1.31, "grad_norm": 11.408199801575625, "learning_rate": 7.86242129845903e-06, "loss": 0.458, "step": 1832 }, { "epoch": 1.31, "grad_norm": 17.645230899000442, "learning_rate": 7.860051141305074e-06, "loss": 0.5107, "step": 1833 }, { "epoch": 1.31, "grad_norm": 8.561675138435755, "learning_rate": 7.857680028554873e-06, "loss": 0.4189, "step": 1834 }, { "epoch": 1.31, "grad_norm": 8.925281089125331, "learning_rate": 7.855307961000656e-06, "loss": 0.4893, "step": 1835 }, { "epoch": 1.31, "grad_norm": 11.26652117905419, "learning_rate": 7.852934939434977e-06, "loss": 0.4399, "step": 1836 }, { "epoch": 1.31, "grad_norm": 12.505458999460473, "learning_rate": 7.850560964650707e-06, "loss": 0.6421, "step": 1837 }, { "epoch": 1.31, "grad_norm": 11.372565753801249, "learning_rate": 7.848186037441035e-06, "loss": 0.6289, "step": 1838 }, { "epoch": 1.31, "grad_norm": 6.881114480690239, "learning_rate": 7.845810158599467e-06, "loss": 0.4624, "step": 1839 }, { "epoch": 1.31, "grad_norm": 13.626691743280656, "learning_rate": 7.84343332891983e-06, "loss": 0.4785, "step": 1840 }, { "epoch": 1.31, "grad_norm": 7.687233987923879, "learning_rate": 7.841055549196267e-06, "loss": 0.521, "step": 1841 }, { "epoch": 1.31, "grad_norm": 9.195815391563308, "learning_rate": 7.838676820223234e-06, "loss": 0.3867, "step": 1842 }, { "epoch": 1.32, "grad_norm": 11.299770030769283, "learning_rate": 7.836297142795515e-06, "loss": 0.4546, "step": 1843 }, { "epoch": 1.32, "grad_norm": 6.8699709812237515, "learning_rate": 7.833916517708203e-06, "loss": 0.3662, "step": 1844 }, { "epoch": 1.32, "grad_norm": 17.559884344433186, "learning_rate": 7.831534945756703e-06, "loss": 0.54, "step": 1845 }, { "epoch": 1.32, "grad_norm": 9.525388260025515, "learning_rate": 7.82915242773675e-06, "loss": 0.4473, "step": 1846 }, { "epoch": 1.32, "grad_norm": 8.9216506801838, "learning_rate": 7.826768964444384e-06, "loss": 0.4189, "step": 1847 }, { "epoch": 1.32, "grad_norm": 22.147593629165257, "learning_rate": 7.824384556675966e-06, "loss": 0.5132, "step": 1848 }, { "epoch": 1.32, "grad_norm": 12.171992879631954, "learning_rate": 7.821999205228168e-06, "loss": 0.4893, "step": 1849 }, { "epoch": 1.32, "grad_norm": 21.79619518512073, "learning_rate": 7.819612910897985e-06, "loss": 0.541, "step": 1850 }, { "epoch": 1.32, "grad_norm": 17.026117401917226, "learning_rate": 7.817225674482717e-06, "loss": 0.5181, "step": 1851 }, { "epoch": 1.32, "grad_norm": 7.809802504536831, "learning_rate": 7.814837496779988e-06, "loss": 0.3872, "step": 1852 }, { "epoch": 1.32, "grad_norm": 9.661493775692145, "learning_rate": 7.812448378587731e-06, "loss": 0.4951, "step": 1853 }, { "epoch": 1.32, "grad_norm": 14.532780424040105, "learning_rate": 7.810058320704194e-06, "loss": 0.5244, "step": 1854 }, { "epoch": 1.32, "grad_norm": 9.622050906657064, "learning_rate": 7.807667323927941e-06, "loss": 0.3711, "step": 1855 }, { "epoch": 1.32, "grad_norm": 12.137416584677046, "learning_rate": 7.80527538905785e-06, "loss": 0.4214, "step": 1856 }, { "epoch": 1.33, "grad_norm": 9.531652456122275, "learning_rate": 7.802882516893106e-06, "loss": 0.3865, "step": 1857 }, { "epoch": 1.33, "grad_norm": 14.761181122860886, "learning_rate": 7.800488708233219e-06, "loss": 0.458, "step": 1858 }, { "epoch": 1.33, "grad_norm": 10.986448218756069, "learning_rate": 7.798093963877998e-06, "loss": 0.3965, "step": 1859 }, { "epoch": 1.33, "grad_norm": 14.525624462916241, "learning_rate": 7.795698284627575e-06, "loss": 0.5312, "step": 1860 }, { "epoch": 1.33, "grad_norm": 9.56296095061409, "learning_rate": 7.793301671282391e-06, "loss": 0.4253, "step": 1861 }, { "epoch": 1.33, "grad_norm": 9.813705657660236, "learning_rate": 7.7909041246432e-06, "loss": 0.4346, "step": 1862 }, { "epoch": 1.33, "grad_norm": 8.044009444296448, "learning_rate": 7.788505645511065e-06, "loss": 0.4697, "step": 1863 }, { "epoch": 1.33, "grad_norm": 11.756948527676359, "learning_rate": 7.786106234687362e-06, "loss": 0.4351, "step": 1864 }, { "epoch": 1.33, "grad_norm": 15.838558191606422, "learning_rate": 7.783705892973782e-06, "loss": 0.5356, "step": 1865 }, { "epoch": 1.33, "grad_norm": 10.97549632585959, "learning_rate": 7.78130462117232e-06, "loss": 0.4238, "step": 1866 }, { "epoch": 1.33, "grad_norm": 14.230138714195853, "learning_rate": 7.778902420085289e-06, "loss": 0.4741, "step": 1867 }, { "epoch": 1.33, "grad_norm": 7.33800229572185, "learning_rate": 7.776499290515304e-06, "loss": 0.3765, "step": 1868 }, { "epoch": 1.33, "grad_norm": 12.435836369297808, "learning_rate": 7.7740952332653e-06, "loss": 0.4629, "step": 1869 }, { "epoch": 1.33, "grad_norm": 9.644876077277099, "learning_rate": 7.771690249138517e-06, "loss": 0.4565, "step": 1870 }, { "epoch": 1.34, "grad_norm": 9.755377787104848, "learning_rate": 7.769284338938502e-06, "loss": 0.4541, "step": 1871 }, { "epoch": 1.34, "grad_norm": 19.27972086793472, "learning_rate": 7.766877503469117e-06, "loss": 0.5132, "step": 1872 }, { "epoch": 1.34, "grad_norm": 13.760972849630273, "learning_rate": 7.764469743534529e-06, "loss": 0.4404, "step": 1873 }, { "epoch": 1.34, "grad_norm": 10.940065164124238, "learning_rate": 7.762061059939214e-06, "loss": 0.4536, "step": 1874 }, { "epoch": 1.34, "grad_norm": 7.751090211455104, "learning_rate": 7.759651453487963e-06, "loss": 0.3506, "step": 1875 }, { "epoch": 1.34, "grad_norm": 23.25806719313458, "learning_rate": 7.757240924985866e-06, "loss": 0.6182, "step": 1876 }, { "epoch": 1.34, "grad_norm": 9.896852124606287, "learning_rate": 7.754829475238323e-06, "loss": 0.4888, "step": 1877 }, { "epoch": 1.34, "grad_norm": 13.942142350442522, "learning_rate": 7.752417105051051e-06, "loss": 0.5947, "step": 1878 }, { "epoch": 1.34, "grad_norm": 14.63242801319795, "learning_rate": 7.750003815230062e-06, "loss": 0.4194, "step": 1879 }, { "epoch": 1.34, "grad_norm": 11.915779656542515, "learning_rate": 7.747589606581686e-06, "loss": 0.3735, "step": 1880 }, { "epoch": 1.34, "grad_norm": 7.6414314110252075, "learning_rate": 7.745174479912551e-06, "loss": 0.4844, "step": 1881 }, { "epoch": 1.34, "grad_norm": 9.279768255216817, "learning_rate": 7.742758436029596e-06, "loss": 0.4561, "step": 1882 }, { "epoch": 1.34, "grad_norm": 8.118475626162798, "learning_rate": 7.740341475740068e-06, "loss": 0.4819, "step": 1883 }, { "epoch": 1.34, "grad_norm": 11.986938122701325, "learning_rate": 7.737923599851519e-06, "loss": 0.4072, "step": 1884 }, { "epoch": 1.35, "grad_norm": 7.3213451137330825, "learning_rate": 7.735504809171801e-06, "loss": 0.4595, "step": 1885 }, { "epoch": 1.35, "grad_norm": 10.386651252542793, "learning_rate": 7.733085104509084e-06, "loss": 0.4014, "step": 1886 }, { "epoch": 1.35, "grad_norm": 8.548276619740427, "learning_rate": 7.730664486671831e-06, "loss": 0.4585, "step": 1887 }, { "epoch": 1.35, "grad_norm": 10.158689999749116, "learning_rate": 7.72824295646882e-06, "loss": 0.3936, "step": 1888 }, { "epoch": 1.35, "grad_norm": 12.499012467824802, "learning_rate": 7.725820514709124e-06, "loss": 0.4233, "step": 1889 }, { "epoch": 1.35, "grad_norm": 12.070340859207697, "learning_rate": 7.723397162202128e-06, "loss": 0.5679, "step": 1890 }, { "epoch": 1.35, "grad_norm": 11.183380030500457, "learning_rate": 7.720972899757522e-06, "loss": 0.4019, "step": 1891 }, { "epoch": 1.35, "grad_norm": 19.63336525431284, "learning_rate": 7.718547728185293e-06, "loss": 0.5845, "step": 1892 }, { "epoch": 1.35, "grad_norm": 8.930649239400196, "learning_rate": 7.716121648295738e-06, "loss": 0.4297, "step": 1893 }, { "epoch": 1.35, "grad_norm": 7.936977513469789, "learning_rate": 7.713694660899455e-06, "loss": 0.3965, "step": 1894 }, { "epoch": 1.35, "grad_norm": 15.880197223026865, "learning_rate": 7.711266766807345e-06, "loss": 0.478, "step": 1895 }, { "epoch": 1.35, "grad_norm": 25.38985775819577, "learning_rate": 7.708837966830615e-06, "loss": 0.5176, "step": 1896 }, { "epoch": 1.35, "grad_norm": 11.073264625514588, "learning_rate": 7.706408261780769e-06, "loss": 0.4155, "step": 1897 }, { "epoch": 1.35, "grad_norm": 13.734620428534315, "learning_rate": 7.703977652469618e-06, "loss": 0.4585, "step": 1898 }, { "epoch": 1.36, "grad_norm": 10.71268865147328, "learning_rate": 7.701546139709272e-06, "loss": 0.4351, "step": 1899 }, { "epoch": 1.36, "grad_norm": 8.690569659507187, "learning_rate": 7.69911372431215e-06, "loss": 0.4819, "step": 1900 }, { "epoch": 1.36, "grad_norm": 11.038069885853988, "learning_rate": 7.696680407090962e-06, "loss": 0.4741, "step": 1901 }, { "epoch": 1.36, "grad_norm": 19.166802662393746, "learning_rate": 7.694246188858726e-06, "loss": 0.4458, "step": 1902 }, { "epoch": 1.36, "grad_norm": 12.652531412656241, "learning_rate": 7.691811070428758e-06, "loss": 0.4409, "step": 1903 }, { "epoch": 1.36, "grad_norm": 13.775786518184024, "learning_rate": 7.689375052614681e-06, "loss": 0.5034, "step": 1904 }, { "epoch": 1.36, "grad_norm": 10.730043609598852, "learning_rate": 7.686938136230408e-06, "loss": 0.4878, "step": 1905 }, { "epoch": 1.36, "grad_norm": 11.070741977627472, "learning_rate": 7.684500322090162e-06, "loss": 0.4946, "step": 1906 }, { "epoch": 1.36, "grad_norm": 21.66702912722839, "learning_rate": 7.68206161100846e-06, "loss": 0.4512, "step": 1907 }, { "epoch": 1.36, "grad_norm": 16.18598427399989, "learning_rate": 7.679622003800122e-06, "loss": 0.5322, "step": 1908 }, { "epoch": 1.36, "grad_norm": 18.592185583152887, "learning_rate": 7.677181501280266e-06, "loss": 0.384, "step": 1909 }, { "epoch": 1.36, "grad_norm": 10.483647129362556, "learning_rate": 7.674740104264308e-06, "loss": 0.4438, "step": 1910 }, { "epoch": 1.36, "grad_norm": 19.56330359346813, "learning_rate": 7.672297813567968e-06, "loss": 0.6631, "step": 1911 }, { "epoch": 1.36, "grad_norm": 11.750687177326649, "learning_rate": 7.669854630007257e-06, "loss": 0.4619, "step": 1912 }, { "epoch": 1.37, "grad_norm": 10.801460674280925, "learning_rate": 7.667410554398486e-06, "loss": 0.5146, "step": 1913 }, { "epoch": 1.37, "grad_norm": 10.654003604266663, "learning_rate": 7.664965587558271e-06, "loss": 0.5215, "step": 1914 }, { "epoch": 1.37, "grad_norm": 7.252336343356576, "learning_rate": 7.662519730303517e-06, "loss": 0.4146, "step": 1915 }, { "epoch": 1.37, "grad_norm": 17.96405723828609, "learning_rate": 7.660072983451433e-06, "loss": 0.481, "step": 1916 }, { "epoch": 1.37, "grad_norm": 10.053400748038461, "learning_rate": 7.657625347819522e-06, "loss": 0.4648, "step": 1917 }, { "epoch": 1.37, "grad_norm": 10.665072797465664, "learning_rate": 7.655176824225582e-06, "loss": 0.4395, "step": 1918 }, { "epoch": 1.37, "grad_norm": 14.43877602912696, "learning_rate": 7.652727413487716e-06, "loss": 0.5225, "step": 1919 }, { "epoch": 1.37, "grad_norm": 11.53955384603415, "learning_rate": 7.650277116424313e-06, "loss": 0.4473, "step": 1920 }, { "epoch": 1.37, "grad_norm": 17.42532980284874, "learning_rate": 7.647825933854063e-06, "loss": 0.5376, "step": 1921 }, { "epoch": 1.37, "grad_norm": 10.36636988005007, "learning_rate": 7.645373866595953e-06, "loss": 0.4521, "step": 1922 }, { "epoch": 1.37, "grad_norm": 9.525600543992109, "learning_rate": 7.642920915469265e-06, "loss": 0.4688, "step": 1923 }, { "epoch": 1.37, "grad_norm": 17.106931536030515, "learning_rate": 7.640467081293573e-06, "loss": 0.5918, "step": 1924 }, { "epoch": 1.37, "grad_norm": 11.348679176976812, "learning_rate": 7.638012364888751e-06, "loss": 0.4907, "step": 1925 }, { "epoch": 1.37, "grad_norm": 11.55252362508795, "learning_rate": 7.635556767074965e-06, "loss": 0.4731, "step": 1926 }, { "epoch": 1.38, "grad_norm": 12.419504634208726, "learning_rate": 7.633100288672674e-06, "loss": 0.481, "step": 1927 }, { "epoch": 1.38, "grad_norm": 12.121962467884394, "learning_rate": 7.630642930502634e-06, "loss": 0.418, "step": 1928 }, { "epoch": 1.38, "grad_norm": 10.289195654821805, "learning_rate": 7.628184693385896e-06, "loss": 0.4648, "step": 1929 }, { "epoch": 1.38, "grad_norm": 7.9227069619546375, "learning_rate": 7.625725578143801e-06, "loss": 0.4712, "step": 1930 }, { "epoch": 1.38, "grad_norm": 9.229125735331845, "learning_rate": 7.6232655855979844e-06, "loss": 0.5029, "step": 1931 }, { "epoch": 1.38, "grad_norm": 13.049117828200913, "learning_rate": 7.620804716570376e-06, "loss": 0.4653, "step": 1932 }, { "epoch": 1.38, "grad_norm": 15.576958024182636, "learning_rate": 7.618342971883199e-06, "loss": 0.5605, "step": 1933 }, { "epoch": 1.38, "grad_norm": 12.779629505768929, "learning_rate": 7.615880352358967e-06, "loss": 0.4014, "step": 1934 }, { "epoch": 1.38, "grad_norm": 18.50645646388632, "learning_rate": 7.613416858820486e-06, "loss": 0.4214, "step": 1935 }, { "epoch": 1.38, "grad_norm": 25.52806005097817, "learning_rate": 7.6109524920908575e-06, "loss": 0.5288, "step": 1936 }, { "epoch": 1.38, "grad_norm": 13.933316977057585, "learning_rate": 7.608487252993471e-06, "loss": 0.5151, "step": 1937 }, { "epoch": 1.38, "grad_norm": 13.6592436046183, "learning_rate": 7.6060211423520095e-06, "loss": 0.4072, "step": 1938 }, { "epoch": 1.38, "grad_norm": 21.814161825521396, "learning_rate": 7.6035541609904425e-06, "loss": 0.606, "step": 1939 }, { "epoch": 1.38, "grad_norm": 15.66882526072226, "learning_rate": 7.60108630973304e-06, "loss": 0.458, "step": 1940 }, { "epoch": 1.39, "grad_norm": 17.85672505413872, "learning_rate": 7.598617589404354e-06, "loss": 0.4565, "step": 1941 }, { "epoch": 1.39, "grad_norm": 16.858479644628172, "learning_rate": 7.596148000829229e-06, "loss": 0.6187, "step": 1942 }, { "epoch": 1.39, "grad_norm": 13.16162358033872, "learning_rate": 7.593677544832802e-06, "loss": 0.5293, "step": 1943 }, { "epoch": 1.39, "grad_norm": 9.524117704758424, "learning_rate": 7.5912062222404965e-06, "loss": 0.3533, "step": 1944 }, { "epoch": 1.39, "grad_norm": 20.818889018640792, "learning_rate": 7.588734033878031e-06, "loss": 0.5693, "step": 1945 }, { "epoch": 1.39, "grad_norm": 16.522766326382442, "learning_rate": 7.586260980571407e-06, "loss": 0.5547, "step": 1946 }, { "epoch": 1.39, "grad_norm": 11.564225336554951, "learning_rate": 7.5837870631469165e-06, "loss": 0.6562, "step": 1947 }, { "epoch": 1.39, "grad_norm": 5.668952257315175, "learning_rate": 7.581312282431143e-06, "loss": 0.3936, "step": 1948 }, { "epoch": 1.39, "grad_norm": 11.336521625375687, "learning_rate": 7.578836639250958e-06, "loss": 0.5151, "step": 1949 }, { "epoch": 1.39, "grad_norm": 8.918797922477602, "learning_rate": 7.576360134433517e-06, "loss": 0.4668, "step": 1950 }, { "epoch": 1.39, "grad_norm": 7.6676020443715975, "learning_rate": 7.5738827688062676e-06, "loss": 0.4961, "step": 1951 }, { "epoch": 1.39, "grad_norm": 7.713872938147478, "learning_rate": 7.571404543196943e-06, "loss": 0.4824, "step": 1952 }, { "epoch": 1.39, "grad_norm": 6.834704519149373, "learning_rate": 7.568925458433567e-06, "loss": 0.4819, "step": 1953 }, { "epoch": 1.39, "grad_norm": 12.91835327063667, "learning_rate": 7.566445515344445e-06, "loss": 0.4233, "step": 1954 }, { "epoch": 1.4, "grad_norm": 7.599974116531055, "learning_rate": 7.563964714758172e-06, "loss": 0.4458, "step": 1955 }, { "epoch": 1.4, "grad_norm": 10.201329272152451, "learning_rate": 7.561483057503632e-06, "loss": 0.5386, "step": 1956 }, { "epoch": 1.4, "grad_norm": 10.350442516482657, "learning_rate": 7.559000544409991e-06, "loss": 0.4849, "step": 1957 }, { "epoch": 1.4, "grad_norm": 12.802291496242244, "learning_rate": 7.556517176306704e-06, "loss": 0.5127, "step": 1958 }, { "epoch": 1.4, "grad_norm": 14.032857918894152, "learning_rate": 7.554032954023508e-06, "loss": 0.479, "step": 1959 }, { "epoch": 1.4, "grad_norm": 11.170707738509645, "learning_rate": 7.55154787839043e-06, "loss": 0.5752, "step": 1960 }, { "epoch": 1.4, "grad_norm": 14.33356640527272, "learning_rate": 7.5490619502377805e-06, "loss": 0.562, "step": 1961 }, { "epoch": 1.4, "grad_norm": 10.3400026057796, "learning_rate": 7.546575170396153e-06, "loss": 0.4961, "step": 1962 }, { "epoch": 1.4, "grad_norm": 11.41020505136803, "learning_rate": 7.544087539696427e-06, "loss": 0.5356, "step": 1963 }, { "epoch": 1.4, "grad_norm": 15.384398968489197, "learning_rate": 7.541599058969766e-06, "loss": 0.5757, "step": 1964 }, { "epoch": 1.4, "grad_norm": 26.85010786013174, "learning_rate": 7.539109729047619e-06, "loss": 0.5869, "step": 1965 }, { "epoch": 1.4, "grad_norm": 13.678164293750017, "learning_rate": 7.5366195507617155e-06, "loss": 0.5024, "step": 1966 }, { "epoch": 1.4, "grad_norm": 11.760310761103758, "learning_rate": 7.534128524944071e-06, "loss": 0.4077, "step": 1967 }, { "epoch": 1.4, "grad_norm": 15.324608405481133, "learning_rate": 7.531636652426985e-06, "loss": 0.4927, "step": 1968 }, { "epoch": 1.41, "grad_norm": 12.427863874423563, "learning_rate": 7.529143934043036e-06, "loss": 0.562, "step": 1969 }, { "epoch": 1.41, "grad_norm": 9.721667941829319, "learning_rate": 7.526650370625088e-06, "loss": 0.6045, "step": 1970 }, { "epoch": 1.41, "grad_norm": 11.373487140290482, "learning_rate": 7.5241559630062896e-06, "loss": 0.5576, "step": 1971 }, { "epoch": 1.41, "grad_norm": 10.699156015321359, "learning_rate": 7.5216607120200655e-06, "loss": 0.4937, "step": 1972 }, { "epoch": 1.41, "grad_norm": 9.49661605079921, "learning_rate": 7.519164618500127e-06, "loss": 0.4209, "step": 1973 }, { "epoch": 1.41, "grad_norm": 14.772198775491923, "learning_rate": 7.5166676832804655e-06, "loss": 0.4731, "step": 1974 }, { "epoch": 1.41, "grad_norm": 8.746747420450404, "learning_rate": 7.514169907195352e-06, "loss": 0.457, "step": 1975 }, { "epoch": 1.41, "grad_norm": 11.152389759194886, "learning_rate": 7.511671291079342e-06, "loss": 0.3955, "step": 1976 }, { "epoch": 1.41, "grad_norm": 12.057173209748903, "learning_rate": 7.509171835767268e-06, "loss": 0.4814, "step": 1977 }, { "epoch": 1.41, "grad_norm": 15.473228863127407, "learning_rate": 7.506671542094246e-06, "loss": 0.4976, "step": 1978 }, { "epoch": 1.41, "grad_norm": 10.727530037186542, "learning_rate": 7.504170410895668e-06, "loss": 0.4204, "step": 1979 }, { "epoch": 1.41, "grad_norm": 7.985145065845938, "learning_rate": 7.501668443007212e-06, "loss": 0.4468, "step": 1980 }, { "epoch": 1.41, "grad_norm": 14.866585730814796, "learning_rate": 7.499165639264828e-06, "loss": 0.5054, "step": 1981 }, { "epoch": 1.41, "grad_norm": 13.54865900787404, "learning_rate": 7.496662000504752e-06, "loss": 0.5615, "step": 1982 }, { "epoch": 1.42, "grad_norm": 10.596067173111546, "learning_rate": 7.4941575275634945e-06, "loss": 0.4961, "step": 1983 }, { "epoch": 1.42, "grad_norm": 18.522213226674722, "learning_rate": 7.49165222127785e-06, "loss": 0.4829, "step": 1984 }, { "epoch": 1.42, "grad_norm": 9.827635155968103, "learning_rate": 7.489146082484882e-06, "loss": 0.3564, "step": 1985 }, { "epoch": 1.42, "grad_norm": 9.292648628242718, "learning_rate": 7.486639112021944e-06, "loss": 0.4268, "step": 1986 }, { "epoch": 1.42, "grad_norm": 10.52207584728679, "learning_rate": 7.484131310726658e-06, "loss": 0.3796, "step": 1987 }, { "epoch": 1.42, "grad_norm": 14.540889479395993, "learning_rate": 7.481622679436929e-06, "loss": 0.4463, "step": 1988 }, { "epoch": 1.42, "grad_norm": 8.775521996498332, "learning_rate": 7.479113218990934e-06, "loss": 0.3867, "step": 1989 }, { "epoch": 1.42, "grad_norm": 11.41497161064518, "learning_rate": 7.4766029302271335e-06, "loss": 0.5225, "step": 1990 }, { "epoch": 1.42, "grad_norm": 7.663185183957201, "learning_rate": 7.474091813984261e-06, "loss": 0.3784, "step": 1991 }, { "epoch": 1.42, "grad_norm": 10.438508094185057, "learning_rate": 7.471579871101326e-06, "loss": 0.4409, "step": 1992 }, { "epoch": 1.42, "grad_norm": 12.101689467991465, "learning_rate": 7.4690671024176165e-06, "loss": 0.5542, "step": 1993 }, { "epoch": 1.42, "grad_norm": 13.147337868681644, "learning_rate": 7.466553508772695e-06, "loss": 0.4072, "step": 1994 }, { "epoch": 1.42, "grad_norm": 13.211891837722657, "learning_rate": 7.4640390910064e-06, "loss": 0.4326, "step": 1995 }, { "epoch": 1.42, "grad_norm": 14.383061627960748, "learning_rate": 7.461523849958845e-06, "loss": 0.5654, "step": 1996 }, { "epoch": 1.43, "grad_norm": 11.337379991199462, "learning_rate": 7.459007786470418e-06, "loss": 0.3994, "step": 1997 }, { "epoch": 1.43, "grad_norm": 10.488871378983164, "learning_rate": 7.4564909013817845e-06, "loss": 0.501, "step": 1998 }, { "epoch": 1.43, "grad_norm": 12.83370126958352, "learning_rate": 7.45397319553388e-06, "loss": 0.354, "step": 1999 }, { "epoch": 1.43, "grad_norm": 12.89565539397935, "learning_rate": 7.451454669767919e-06, "loss": 0.4561, "step": 2000 }, { "epoch": 1.43, "eval_avg_AUC": 0.7710454060906093, "eval_avg_Accuracy": 0.6791694297082228, "eval_avg_Accuracy-right": 0.9080474761966871, "eval_avg_Accuracy-wrong": 0.2800773254491699, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.626217598019666, "eval_last_AUC": 0.7805472536484926, "eval_last_Accuracy": 0.705238726790451, "eval_last_Accuracy-right": 0.851506456241033, "eval_last_Accuracy-wrong": 0.4501933136229247, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6337350556153214, "eval_max_AUC": 0.7253894400592504, "eval_max_Accuracy": 0.6378481432360743, "eval_max_Accuracy-right": 0.9711099517412286, "eval_max_Accuracy-wrong": 0.056743234023197635, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6102256988583293, "eval_min_AUC": 0.7744442531826913, "eval_min_Accuracy": 0.707145225464191, "eval_min_Accuracy-right": 0.7947697926177123, "eval_min_Accuracy-wrong": 0.5543552422105982, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6233897492201612, "eval_prod_AUC": 0.7756218453367024, "eval_prod_Accuracy": 0.6703000663129973, "eval_prod_Accuracy-right": 0.5567366636233207, "eval_prod_Accuracy-wrong": 0.8683193086195133, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6229021173701285, "eval_runtime": 246.9142, "eval_samples_per_second": 97.718, "eval_steps_per_second": 3.054, "eval_sum_AUC": 0.6216167742931785, "eval_sum_Accuracy": 0.6359001989389921, "eval_sum_Accuracy-right": 0.9986304943263337, "eval_sum_Accuracy-wrong": 0.0034114168751421424, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6193609469118248, "step": 2000 }, { "epoch": 1.43, "grad_norm": 14.915881897626125, "learning_rate": 7.448935324925386e-06, "loss": 0.3965, "step": 2001 }, { "epoch": 1.43, "grad_norm": 12.292176761992444, "learning_rate": 7.446415161848043e-06, "loss": 0.4697, "step": 2002 }, { "epoch": 1.43, "grad_norm": 12.41157340253059, "learning_rate": 7.443894181377921e-06, "loss": 0.4683, "step": 2003 }, { "epoch": 1.43, "grad_norm": 11.703353976942797, "learning_rate": 7.441372384357328e-06, "loss": 0.4531, "step": 2004 }, { "epoch": 1.43, "grad_norm": 16.728696147279525, "learning_rate": 7.438849771628844e-06, "loss": 0.4814, "step": 2005 }, { "epoch": 1.43, "grad_norm": 9.964762875434154, "learning_rate": 7.43632634403532e-06, "loss": 0.4409, "step": 2006 }, { "epoch": 1.43, "grad_norm": 19.767032870416173, "learning_rate": 7.433802102419878e-06, "loss": 0.4531, "step": 2007 }, { "epoch": 1.43, "grad_norm": 8.973473512937122, "learning_rate": 7.431277047625918e-06, "loss": 0.4146, "step": 2008 }, { "epoch": 1.43, "grad_norm": 14.382184093999804, "learning_rate": 7.428751180497104e-06, "loss": 0.5166, "step": 2009 }, { "epoch": 1.43, "grad_norm": 17.177950650233484, "learning_rate": 7.426224501877376e-06, "loss": 0.5278, "step": 2010 }, { "epoch": 1.44, "grad_norm": 12.821415821919816, "learning_rate": 7.423697012610947e-06, "loss": 0.4058, "step": 2011 }, { "epoch": 1.44, "grad_norm": 10.12631349817775, "learning_rate": 7.421168713542294e-06, "loss": 0.4873, "step": 2012 }, { "epoch": 1.44, "grad_norm": 12.189855739113973, "learning_rate": 7.418639605516172e-06, "loss": 0.5205, "step": 2013 }, { "epoch": 1.44, "grad_norm": 10.654050014113183, "learning_rate": 7.416109689377603e-06, "loss": 0.5635, "step": 2014 }, { "epoch": 1.44, "grad_norm": 25.246721169792306, "learning_rate": 7.413578965971876e-06, "loss": 0.4805, "step": 2015 }, { "epoch": 1.44, "grad_norm": 10.730269042581263, "learning_rate": 7.411047436144556e-06, "loss": 0.4165, "step": 2016 }, { "epoch": 1.44, "grad_norm": 12.059699400483384, "learning_rate": 7.408515100741471e-06, "loss": 0.4656, "step": 2017 }, { "epoch": 1.44, "grad_norm": 12.604554128594822, "learning_rate": 7.405981960608725e-06, "loss": 0.4453, "step": 2018 }, { "epoch": 1.44, "grad_norm": 9.398182516011172, "learning_rate": 7.403448016592685e-06, "loss": 0.4951, "step": 2019 }, { "epoch": 1.44, "grad_norm": 13.429871973957303, "learning_rate": 7.400913269539988e-06, "loss": 0.5908, "step": 2020 }, { "epoch": 1.44, "grad_norm": 11.90665542864224, "learning_rate": 7.398377720297541e-06, "loss": 0.4917, "step": 2021 }, { "epoch": 1.44, "grad_norm": 18.435262236737888, "learning_rate": 7.39584136971252e-06, "loss": 0.4473, "step": 2022 }, { "epoch": 1.44, "grad_norm": 25.241789540294814, "learning_rate": 7.393304218632364e-06, "loss": 0.4507, "step": 2023 }, { "epoch": 1.44, "grad_norm": 12.58272231581781, "learning_rate": 7.390766267904783e-06, "loss": 0.3755, "step": 2024 }, { "epoch": 1.45, "grad_norm": 16.95465858537594, "learning_rate": 7.3882275183777554e-06, "loss": 0.5312, "step": 2025 }, { "epoch": 1.45, "grad_norm": 18.639047246847788, "learning_rate": 7.385687970899523e-06, "loss": 0.5078, "step": 2026 }, { "epoch": 1.45, "grad_norm": 14.953213267741265, "learning_rate": 7.3831476263185965e-06, "loss": 0.4751, "step": 2027 }, { "epoch": 1.45, "grad_norm": 14.322106542062425, "learning_rate": 7.380606485483751e-06, "loss": 0.4009, "step": 2028 }, { "epoch": 1.45, "grad_norm": 43.45347427250669, "learning_rate": 7.378064549244031e-06, "loss": 0.5298, "step": 2029 }, { "epoch": 1.45, "grad_norm": 34.42523425392354, "learning_rate": 7.375521818448741e-06, "loss": 0.5371, "step": 2030 }, { "epoch": 1.45, "grad_norm": 55.77740354141135, "learning_rate": 7.372978293947459e-06, "loss": 0.4761, "step": 2031 }, { "epoch": 1.45, "grad_norm": 49.09132676927039, "learning_rate": 7.3704339765900205e-06, "loss": 0.5615, "step": 2032 }, { "epoch": 1.45, "grad_norm": 34.9256636465853, "learning_rate": 7.367888867226531e-06, "loss": 0.4976, "step": 2033 }, { "epoch": 1.45, "grad_norm": 15.312677064646966, "learning_rate": 7.365342966707359e-06, "loss": 0.4487, "step": 2034 }, { "epoch": 1.45, "grad_norm": 12.813978743326054, "learning_rate": 7.362796275883135e-06, "loss": 0.4038, "step": 2035 }, { "epoch": 1.45, "grad_norm": 12.643306134854747, "learning_rate": 7.360248795604758e-06, "loss": 0.4575, "step": 2036 }, { "epoch": 1.45, "grad_norm": 12.868389092088139, "learning_rate": 7.3577005267233885e-06, "loss": 0.429, "step": 2037 }, { "epoch": 1.45, "grad_norm": 9.982700342237198, "learning_rate": 7.355151470090449e-06, "loss": 0.4712, "step": 2038 }, { "epoch": 1.46, "grad_norm": 9.034999940641313, "learning_rate": 7.352601626557628e-06, "loss": 0.4663, "step": 2039 }, { "epoch": 1.46, "grad_norm": 20.88549792075512, "learning_rate": 7.350050996976875e-06, "loss": 0.5669, "step": 2040 }, { "epoch": 1.46, "grad_norm": 9.208945274018559, "learning_rate": 7.347499582200404e-06, "loss": 0.4097, "step": 2041 }, { "epoch": 1.46, "grad_norm": 11.404862265993508, "learning_rate": 7.344947383080687e-06, "loss": 0.4634, "step": 2042 }, { "epoch": 1.46, "grad_norm": 8.366856522102745, "learning_rate": 7.342394400470463e-06, "loss": 0.3926, "step": 2043 }, { "epoch": 1.46, "grad_norm": 9.362697922089307, "learning_rate": 7.339840635222732e-06, "loss": 0.4116, "step": 2044 }, { "epoch": 1.46, "grad_norm": 10.008373884805586, "learning_rate": 7.337286088190754e-06, "loss": 0.4399, "step": 2045 }, { "epoch": 1.46, "grad_norm": 18.955581395387284, "learning_rate": 7.334730760228049e-06, "loss": 0.5054, "step": 2046 }, { "epoch": 1.46, "grad_norm": 18.736021901140624, "learning_rate": 7.332174652188401e-06, "loss": 0.5195, "step": 2047 }, { "epoch": 1.46, "grad_norm": 12.418786675988692, "learning_rate": 7.329617764925853e-06, "loss": 0.5605, "step": 2048 }, { "epoch": 1.46, "grad_norm": 10.156432856945289, "learning_rate": 7.32706009929471e-06, "loss": 0.5474, "step": 2049 }, { "epoch": 1.46, "grad_norm": 15.360135291080843, "learning_rate": 7.324501656149532e-06, "loss": 0.4775, "step": 2050 }, { "epoch": 1.46, "grad_norm": 6.446127423689205, "learning_rate": 7.321942436345146e-06, "loss": 0.324, "step": 2051 }, { "epoch": 1.46, "grad_norm": 12.489691101046363, "learning_rate": 7.319382440736632e-06, "loss": 0.5054, "step": 2052 }, { "epoch": 1.47, "grad_norm": 6.514088999847083, "learning_rate": 7.316821670179335e-06, "loss": 0.4668, "step": 2053 }, { "epoch": 1.47, "grad_norm": 11.159539585109048, "learning_rate": 7.314260125528854e-06, "loss": 0.6167, "step": 2054 }, { "epoch": 1.47, "grad_norm": 25.094579873479073, "learning_rate": 7.311697807641048e-06, "loss": 0.3867, "step": 2055 }, { "epoch": 1.47, "grad_norm": 8.846673523885213, "learning_rate": 7.3091347173720386e-06, "loss": 0.4932, "step": 2056 }, { "epoch": 1.47, "grad_norm": 6.2914105450018125, "learning_rate": 7.3065708555781986e-06, "loss": 0.4727, "step": 2057 }, { "epoch": 1.47, "grad_norm": 19.595181627149866, "learning_rate": 7.304006223116162e-06, "loss": 0.5098, "step": 2058 }, { "epoch": 1.47, "grad_norm": 19.44544649111758, "learning_rate": 7.301440820842822e-06, "loss": 0.4707, "step": 2059 }, { "epoch": 1.47, "grad_norm": 13.018485991705333, "learning_rate": 7.298874649615327e-06, "loss": 0.6582, "step": 2060 }, { "epoch": 1.47, "grad_norm": 5.661937937304063, "learning_rate": 7.29630771029108e-06, "loss": 0.3862, "step": 2061 }, { "epoch": 1.47, "grad_norm": 7.548593437781802, "learning_rate": 7.293740003727745e-06, "loss": 0.4663, "step": 2062 }, { "epoch": 1.47, "grad_norm": 17.2212974848326, "learning_rate": 7.291171530783241e-06, "loss": 0.4976, "step": 2063 }, { "epoch": 1.47, "grad_norm": 11.235414814450403, "learning_rate": 7.288602292315742e-06, "loss": 0.5596, "step": 2064 }, { "epoch": 1.47, "grad_norm": 15.313344290622826, "learning_rate": 7.286032289183679e-06, "loss": 0.5317, "step": 2065 }, { "epoch": 1.47, "grad_norm": 14.94697386715504, "learning_rate": 7.283461522245736e-06, "loss": 0.5137, "step": 2066 }, { "epoch": 1.48, "grad_norm": 10.081682104162239, "learning_rate": 7.280889992360856e-06, "loss": 0.5156, "step": 2067 }, { "epoch": 1.48, "grad_norm": 7.742609851819826, "learning_rate": 7.278317700388232e-06, "loss": 0.4551, "step": 2068 }, { "epoch": 1.48, "grad_norm": 16.514579824634136, "learning_rate": 7.275744647187318e-06, "loss": 0.5801, "step": 2069 }, { "epoch": 1.48, "grad_norm": 8.69700237796884, "learning_rate": 7.273170833617818e-06, "loss": 0.4678, "step": 2070 }, { "epoch": 1.48, "grad_norm": 11.058671415569465, "learning_rate": 7.2705962605396895e-06, "loss": 0.521, "step": 2071 }, { "epoch": 1.48, "grad_norm": 18.321934672500923, "learning_rate": 7.268020928813147e-06, "loss": 0.4917, "step": 2072 }, { "epoch": 1.48, "grad_norm": 16.94345204999045, "learning_rate": 7.265444839298656e-06, "loss": 0.4526, "step": 2073 }, { "epoch": 1.48, "grad_norm": 12.541712902362422, "learning_rate": 7.262867992856934e-06, "loss": 0.4409, "step": 2074 }, { "epoch": 1.48, "grad_norm": 19.3187862663539, "learning_rate": 7.260290390348956e-06, "loss": 0.5542, "step": 2075 }, { "epoch": 1.48, "grad_norm": 9.647404924361794, "learning_rate": 7.257712032635946e-06, "loss": 0.5205, "step": 2076 }, { "epoch": 1.48, "grad_norm": 9.207651531426404, "learning_rate": 7.255132920579382e-06, "loss": 0.4556, "step": 2077 }, { "epoch": 1.48, "grad_norm": 11.087482473274372, "learning_rate": 7.252553055040991e-06, "loss": 0.4121, "step": 2078 }, { "epoch": 1.48, "grad_norm": 25.25159930562991, "learning_rate": 7.249972436882756e-06, "loss": 0.5176, "step": 2079 }, { "epoch": 1.48, "grad_norm": 17.702457354457117, "learning_rate": 7.247391066966909e-06, "loss": 0.4609, "step": 2080 }, { "epoch": 1.49, "grad_norm": 7.603868415722688, "learning_rate": 7.244808946155933e-06, "loss": 0.438, "step": 2081 }, { "epoch": 1.49, "grad_norm": 11.349525316850295, "learning_rate": 7.242226075312564e-06, "loss": 0.437, "step": 2082 }, { "epoch": 1.49, "grad_norm": 10.177336362256062, "learning_rate": 7.239642455299787e-06, "loss": 0.4385, "step": 2083 }, { "epoch": 1.49, "grad_norm": 23.495636394435223, "learning_rate": 7.237058086980835e-06, "loss": 0.6406, "step": 2084 }, { "epoch": 1.49, "grad_norm": 12.827798850107106, "learning_rate": 7.234472971219197e-06, "loss": 0.4165, "step": 2085 }, { "epoch": 1.49, "grad_norm": 17.754753037992685, "learning_rate": 7.231887108878606e-06, "loss": 0.5586, "step": 2086 }, { "epoch": 1.49, "grad_norm": 11.180399436654513, "learning_rate": 7.229300500823047e-06, "loss": 0.4966, "step": 2087 }, { "epoch": 1.49, "grad_norm": 21.98964058960827, "learning_rate": 7.226713147916754e-06, "loss": 0.4678, "step": 2088 }, { "epoch": 1.49, "grad_norm": 12.21132602807958, "learning_rate": 7.22412505102421e-06, "loss": 0.4282, "step": 2089 }, { "epoch": 1.49, "grad_norm": 8.175190157330336, "learning_rate": 7.221536211010147e-06, "loss": 0.3574, "step": 2090 }, { "epoch": 1.49, "grad_norm": 10.902417216507908, "learning_rate": 7.2189466287395425e-06, "loss": 0.4727, "step": 2091 }, { "epoch": 1.49, "grad_norm": 8.995076716425732, "learning_rate": 7.216356305077625e-06, "loss": 0.4937, "step": 2092 }, { "epoch": 1.49, "grad_norm": 19.69763566486368, "learning_rate": 7.21376524088987e-06, "loss": 0.5181, "step": 2093 }, { "epoch": 1.49, "grad_norm": 10.925797313775647, "learning_rate": 7.211173437042001e-06, "loss": 0.3823, "step": 2094 }, { "epoch": 1.5, "grad_norm": 9.692483691370395, "learning_rate": 7.208580894399986e-06, "loss": 0.4917, "step": 2095 }, { "epoch": 1.5, "grad_norm": 19.59837794732827, "learning_rate": 7.205987613830043e-06, "loss": 0.5283, "step": 2096 }, { "epoch": 1.5, "grad_norm": 14.027224117019413, "learning_rate": 7.203393596198635e-06, "loss": 0.4348, "step": 2097 }, { "epoch": 1.5, "grad_norm": 9.277539316575139, "learning_rate": 7.200798842372472e-06, "loss": 0.5938, "step": 2098 }, { "epoch": 1.5, "grad_norm": 8.158219347531539, "learning_rate": 7.198203353218508e-06, "loss": 0.4829, "step": 2099 }, { "epoch": 1.5, "grad_norm": 12.788815769871793, "learning_rate": 7.195607129603946e-06, "loss": 0.4785, "step": 2100 }, { "epoch": 1.5, "grad_norm": 13.303724332755069, "learning_rate": 7.19301017239623e-06, "loss": 0.4536, "step": 2101 }, { "epoch": 1.5, "grad_norm": 8.88556880734833, "learning_rate": 7.190412482463054e-06, "loss": 0.4185, "step": 2102 }, { "epoch": 1.5, "grad_norm": 9.39510497345665, "learning_rate": 7.187814060672354e-06, "loss": 0.499, "step": 2103 }, { "epoch": 1.5, "grad_norm": 7.517777067379724, "learning_rate": 7.1852149078923105e-06, "loss": 0.4316, "step": 2104 }, { "epoch": 1.5, "grad_norm": 11.90115946127883, "learning_rate": 7.1826150249913495e-06, "loss": 0.5176, "step": 2105 }, { "epoch": 1.5, "grad_norm": 9.267334886302498, "learning_rate": 7.18001441283814e-06, "loss": 0.3643, "step": 2106 }, { "epoch": 1.5, "grad_norm": 22.066236031166735, "learning_rate": 7.1774130723015955e-06, "loss": 0.6748, "step": 2107 }, { "epoch": 1.5, "grad_norm": 6.466559423365932, "learning_rate": 7.17481100425087e-06, "loss": 0.3584, "step": 2108 }, { "epoch": 1.51, "grad_norm": 11.920295137461906, "learning_rate": 7.172208209555365e-06, "loss": 0.4668, "step": 2109 }, { "epoch": 1.51, "grad_norm": 10.95986098611621, "learning_rate": 7.1696046890847206e-06, "loss": 0.4946, "step": 2110 }, { "epoch": 1.51, "grad_norm": 16.786202212897535, "learning_rate": 7.167000443708823e-06, "loss": 0.5708, "step": 2111 }, { "epoch": 1.51, "grad_norm": 6.838902041674624, "learning_rate": 7.164395474297798e-06, "loss": 0.3911, "step": 2112 }, { "epoch": 1.51, "grad_norm": 9.176921095103253, "learning_rate": 7.161789781722016e-06, "loss": 0.4888, "step": 2113 }, { "epoch": 1.51, "grad_norm": 15.90810755723991, "learning_rate": 7.159183366852085e-06, "loss": 0.522, "step": 2114 }, { "epoch": 1.51, "grad_norm": 25.715900600535367, "learning_rate": 7.156576230558859e-06, "loss": 0.5347, "step": 2115 }, { "epoch": 1.51, "grad_norm": 28.28368775228431, "learning_rate": 7.153968373713429e-06, "loss": 0.5601, "step": 2116 }, { "epoch": 1.51, "grad_norm": 11.980625504053775, "learning_rate": 7.1513597971871295e-06, "loss": 0.4326, "step": 2117 }, { "epoch": 1.51, "grad_norm": 12.501120633814393, "learning_rate": 7.148750501851532e-06, "loss": 0.4717, "step": 2118 }, { "epoch": 1.51, "grad_norm": 7.712009993731347, "learning_rate": 7.1461404885784545e-06, "loss": 0.4873, "step": 2119 }, { "epoch": 1.51, "grad_norm": 13.12024516319098, "learning_rate": 7.1435297582399475e-06, "loss": 0.4927, "step": 2120 }, { "epoch": 1.51, "grad_norm": 7.745722717197969, "learning_rate": 7.140918311708306e-06, "loss": 0.459, "step": 2121 }, { "epoch": 1.51, "grad_norm": 8.688287138210573, "learning_rate": 7.138306149856062e-06, "loss": 0.3975, "step": 2122 }, { "epoch": 1.52, "grad_norm": 7.090721711186, "learning_rate": 7.1356932735559905e-06, "loss": 0.4312, "step": 2123 }, { "epoch": 1.52, "grad_norm": 13.510222651737925, "learning_rate": 7.133079683681099e-06, "loss": 0.5864, "step": 2124 }, { "epoch": 1.52, "grad_norm": 8.147602862513805, "learning_rate": 7.130465381104635e-06, "loss": 0.3774, "step": 2125 }, { "epoch": 1.52, "grad_norm": 8.962162623356548, "learning_rate": 7.1278503667000885e-06, "loss": 0.4297, "step": 2126 }, { "epoch": 1.52, "grad_norm": 7.176462569931615, "learning_rate": 7.125234641341185e-06, "loss": 0.4458, "step": 2127 }, { "epoch": 1.52, "grad_norm": 7.786769472257886, "learning_rate": 7.1226182059018835e-06, "loss": 0.397, "step": 2128 }, { "epoch": 1.52, "grad_norm": 10.022347014680557, "learning_rate": 7.120001061256387e-06, "loss": 0.3726, "step": 2129 }, { "epoch": 1.52, "grad_norm": 16.58498208142081, "learning_rate": 7.1173832082791294e-06, "loss": 0.5288, "step": 2130 }, { "epoch": 1.52, "grad_norm": 14.626054474790065, "learning_rate": 7.114764647844788e-06, "loss": 0.4883, "step": 2131 }, { "epoch": 1.52, "grad_norm": 15.442528730947034, "learning_rate": 7.112145380828267e-06, "loss": 0.5249, "step": 2132 }, { "epoch": 1.52, "grad_norm": 12.78478336790942, "learning_rate": 7.109525408104717e-06, "loss": 0.5713, "step": 2133 }, { "epoch": 1.52, "grad_norm": 14.13171845697349, "learning_rate": 7.106904730549517e-06, "loss": 0.4658, "step": 2134 }, { "epoch": 1.52, "grad_norm": 7.812889203251429, "learning_rate": 7.104283349038285e-06, "loss": 0.4365, "step": 2135 }, { "epoch": 1.52, "grad_norm": 10.244931909540133, "learning_rate": 7.101661264446875e-06, "loss": 0.4551, "step": 2136 }, { "epoch": 1.53, "grad_norm": 6.42786049799438, "learning_rate": 7.099038477651371e-06, "loss": 0.3369, "step": 2137 }, { "epoch": 1.53, "grad_norm": 7.446564970154658, "learning_rate": 7.096414989528095e-06, "loss": 0.4336, "step": 2138 }, { "epoch": 1.53, "grad_norm": 11.95393484819928, "learning_rate": 7.093790800953606e-06, "loss": 0.5459, "step": 2139 }, { "epoch": 1.53, "grad_norm": 8.975724248095739, "learning_rate": 7.091165912804693e-06, "loss": 0.4619, "step": 2140 }, { "epoch": 1.53, "grad_norm": 8.463205000165559, "learning_rate": 7.088540325958379e-06, "loss": 0.4702, "step": 2141 }, { "epoch": 1.53, "grad_norm": 8.668064646053395, "learning_rate": 7.085914041291921e-06, "loss": 0.4897, "step": 2142 }, { "epoch": 1.53, "grad_norm": 17.501799641654888, "learning_rate": 7.08328705968281e-06, "loss": 0.5454, "step": 2143 }, { "epoch": 1.53, "grad_norm": 7.3244754880169625, "learning_rate": 7.080659382008772e-06, "loss": 0.4458, "step": 2144 }, { "epoch": 1.53, "grad_norm": 11.920592534439697, "learning_rate": 7.078031009147759e-06, "loss": 0.6465, "step": 2145 }, { "epoch": 1.53, "grad_norm": 7.120473562340443, "learning_rate": 7.075401941977961e-06, "loss": 0.4741, "step": 2146 }, { "epoch": 1.53, "grad_norm": 8.506397076951163, "learning_rate": 7.072772181377798e-06, "loss": 0.4634, "step": 2147 }, { "epoch": 1.53, "grad_norm": 16.898055505594296, "learning_rate": 7.070141728225922e-06, "loss": 0.5186, "step": 2148 }, { "epoch": 1.53, "grad_norm": 14.770324812358789, "learning_rate": 7.067510583401217e-06, "loss": 0.4585, "step": 2149 }, { "epoch": 1.53, "grad_norm": 13.618834750606382, "learning_rate": 7.0648787477827965e-06, "loss": 0.4229, "step": 2150 }, { "epoch": 1.54, "grad_norm": 13.901946072429947, "learning_rate": 7.062246222250005e-06, "loss": 0.4673, "step": 2151 }, { "epoch": 1.54, "grad_norm": 11.322546513477945, "learning_rate": 7.05961300768242e-06, "loss": 0.4419, "step": 2152 }, { "epoch": 1.54, "grad_norm": 21.827793100279667, "learning_rate": 7.056979104959847e-06, "loss": 0.6724, "step": 2153 }, { "epoch": 1.54, "grad_norm": 9.850403835999005, "learning_rate": 7.054344514962319e-06, "loss": 0.4351, "step": 2154 }, { "epoch": 1.54, "grad_norm": 11.28905672300358, "learning_rate": 7.051709238570106e-06, "loss": 0.5864, "step": 2155 }, { "epoch": 1.54, "grad_norm": 11.027979265939116, "learning_rate": 7.0490732766637e-06, "loss": 0.4814, "step": 2156 }, { "epoch": 1.54, "grad_norm": 14.594956263785003, "learning_rate": 7.046436630123826e-06, "loss": 0.5908, "step": 2157 }, { "epoch": 1.54, "grad_norm": 7.349980848273088, "learning_rate": 7.043799299831438e-06, "loss": 0.4062, "step": 2158 }, { "epoch": 1.54, "grad_norm": 11.422633603478822, "learning_rate": 7.041161286667713e-06, "loss": 0.4761, "step": 2159 }, { "epoch": 1.54, "grad_norm": 7.42059519880841, "learning_rate": 7.038522591514061e-06, "loss": 0.4302, "step": 2160 }, { "epoch": 1.54, "grad_norm": 14.084142584061851, "learning_rate": 7.035883215252123e-06, "loss": 0.4736, "step": 2161 }, { "epoch": 1.54, "grad_norm": 16.355044115107113, "learning_rate": 7.03324315876376e-06, "loss": 0.5054, "step": 2162 }, { "epoch": 1.54, "grad_norm": 7.3052309454934035, "learning_rate": 7.030602422931065e-06, "loss": 0.4629, "step": 2163 }, { "epoch": 1.54, "grad_norm": 10.739648409543541, "learning_rate": 7.027961008636359e-06, "loss": 0.4648, "step": 2164 }, { "epoch": 1.55, "grad_norm": 8.804729154765832, "learning_rate": 7.025318916762185e-06, "loss": 0.3853, "step": 2165 }, { "epoch": 1.55, "grad_norm": 16.13762113342716, "learning_rate": 7.022676148191315e-06, "loss": 0.604, "step": 2166 }, { "epoch": 1.55, "grad_norm": 15.74981032671273, "learning_rate": 7.020032703806748e-06, "loss": 0.4409, "step": 2167 }, { "epoch": 1.55, "grad_norm": 9.4004788730862, "learning_rate": 7.017388584491709e-06, "loss": 0.4077, "step": 2168 }, { "epoch": 1.55, "grad_norm": 17.52904134228692, "learning_rate": 7.014743791129644e-06, "loss": 0.5288, "step": 2169 }, { "epoch": 1.55, "grad_norm": 9.05836662525096, "learning_rate": 7.012098324604231e-06, "loss": 0.396, "step": 2170 }, { "epoch": 1.55, "grad_norm": 11.211986374727667, "learning_rate": 7.009452185799368e-06, "loss": 0.5439, "step": 2171 }, { "epoch": 1.55, "grad_norm": 13.01994379164931, "learning_rate": 7.00680537559918e-06, "loss": 0.5039, "step": 2172 }, { "epoch": 1.55, "grad_norm": 13.969090071115868, "learning_rate": 7.0041578948880155e-06, "loss": 0.4111, "step": 2173 }, { "epoch": 1.55, "grad_norm": 15.659257826403904, "learning_rate": 7.001509744550446e-06, "loss": 0.543, "step": 2174 }, { "epoch": 1.55, "grad_norm": 21.720135778440106, "learning_rate": 6.998860925471267e-06, "loss": 0.5352, "step": 2175 }, { "epoch": 1.55, "grad_norm": 10.39302176847214, "learning_rate": 6.9962114385355e-06, "loss": 0.4092, "step": 2176 }, { "epoch": 1.55, "grad_norm": 13.659365134233374, "learning_rate": 6.993561284628388e-06, "loss": 0.5156, "step": 2177 }, { "epoch": 1.55, "grad_norm": 7.458849658112828, "learning_rate": 6.990910464635395e-06, "loss": 0.4067, "step": 2178 }, { "epoch": 1.56, "grad_norm": 11.086418567695002, "learning_rate": 6.9882589794422105e-06, "loss": 0.418, "step": 2179 }, { "epoch": 1.56, "grad_norm": 12.32462739243237, "learning_rate": 6.9856068299347455e-06, "loss": 0.4932, "step": 2180 }, { "epoch": 1.56, "grad_norm": 8.227065489328242, "learning_rate": 6.98295401699913e-06, "loss": 0.4438, "step": 2181 }, { "epoch": 1.56, "grad_norm": 8.53687999777279, "learning_rate": 6.980300541521721e-06, "loss": 0.4766, "step": 2182 }, { "epoch": 1.56, "grad_norm": 7.091176799000983, "learning_rate": 6.977646404389092e-06, "loss": 0.3911, "step": 2183 }, { "epoch": 1.56, "grad_norm": 18.60903069175726, "learning_rate": 6.9749916064880404e-06, "loss": 0.6152, "step": 2184 }, { "epoch": 1.56, "grad_norm": 14.532470312845014, "learning_rate": 6.972336148705583e-06, "loss": 0.4365, "step": 2185 }, { "epoch": 1.56, "grad_norm": 13.726765751198604, "learning_rate": 6.969680031928959e-06, "loss": 0.4414, "step": 2186 }, { "epoch": 1.56, "grad_norm": 10.729404027235125, "learning_rate": 6.967023257045624e-06, "loss": 0.4883, "step": 2187 }, { "epoch": 1.56, "grad_norm": 11.006117166133711, "learning_rate": 6.96436582494326e-06, "loss": 0.375, "step": 2188 }, { "epoch": 1.56, "grad_norm": 10.304932091242616, "learning_rate": 6.961707736509759e-06, "loss": 0.5664, "step": 2189 }, { "epoch": 1.56, "grad_norm": 12.158180976223932, "learning_rate": 6.959048992633241e-06, "loss": 0.4287, "step": 2190 }, { "epoch": 1.56, "grad_norm": 7.951351287243968, "learning_rate": 6.956389594202041e-06, "loss": 0.4077, "step": 2191 }, { "epoch": 1.56, "grad_norm": 11.814399269265236, "learning_rate": 6.953729542104713e-06, "loss": 0.4473, "step": 2192 }, { "epoch": 1.57, "grad_norm": 9.228592101191628, "learning_rate": 6.951068837230032e-06, "loss": 0.6001, "step": 2193 }, { "epoch": 1.57, "grad_norm": 8.684761092197428, "learning_rate": 6.9484074804669865e-06, "loss": 0.4868, "step": 2194 }, { "epoch": 1.57, "grad_norm": 13.022339608408158, "learning_rate": 6.945745472704786e-06, "loss": 0.4446, "step": 2195 }, { "epoch": 1.57, "grad_norm": 15.806399945358521, "learning_rate": 6.943082814832858e-06, "loss": 0.439, "step": 2196 }, { "epoch": 1.57, "grad_norm": 7.733376328962677, "learning_rate": 6.940419507740843e-06, "loss": 0.5063, "step": 2197 }, { "epoch": 1.57, "grad_norm": 9.045131247868405, "learning_rate": 6.937755552318606e-06, "loss": 0.4028, "step": 2198 }, { "epoch": 1.57, "grad_norm": 8.45991812912647, "learning_rate": 6.935090949456219e-06, "loss": 0.4683, "step": 2199 }, { "epoch": 1.57, "grad_norm": 10.754774822400977, "learning_rate": 6.93242570004398e-06, "loss": 0.4248, "step": 2200 }, { "epoch": 1.57, "grad_norm": 9.026980227662335, "learning_rate": 6.929759804972394e-06, "loss": 0.4004, "step": 2201 }, { "epoch": 1.57, "grad_norm": 7.231964685107052, "learning_rate": 6.92709326513219e-06, "loss": 0.3926, "step": 2202 }, { "epoch": 1.57, "grad_norm": 8.292860309744759, "learning_rate": 6.924426081414305e-06, "loss": 0.4873, "step": 2203 }, { "epoch": 1.57, "grad_norm": 8.757286190878336, "learning_rate": 6.921758254709897e-06, "loss": 0.3643, "step": 2204 }, { "epoch": 1.57, "grad_norm": 11.606419829333902, "learning_rate": 6.919089785910336e-06, "loss": 0.4116, "step": 2205 }, { "epoch": 1.57, "grad_norm": 8.234839110113752, "learning_rate": 6.916420675907207e-06, "loss": 0.4482, "step": 2206 }, { "epoch": 1.58, "grad_norm": 11.023740197910566, "learning_rate": 6.9137509255923085e-06, "loss": 0.4917, "step": 2207 }, { "epoch": 1.58, "grad_norm": 9.280209803610372, "learning_rate": 6.911080535857655e-06, "loss": 0.4019, "step": 2208 }, { "epoch": 1.58, "grad_norm": 10.746299427986333, "learning_rate": 6.908409507595472e-06, "loss": 0.5361, "step": 2209 }, { "epoch": 1.58, "grad_norm": 10.906230626644021, "learning_rate": 6.905737841698201e-06, "loss": 0.4429, "step": 2210 }, { "epoch": 1.58, "grad_norm": 10.155722381902946, "learning_rate": 6.903065539058496e-06, "loss": 0.4624, "step": 2211 }, { "epoch": 1.58, "grad_norm": 9.196138695564693, "learning_rate": 6.900392600569219e-06, "loss": 0.3521, "step": 2212 }, { "epoch": 1.58, "grad_norm": 10.280936832961778, "learning_rate": 6.897719027123451e-06, "loss": 0.4634, "step": 2213 }, { "epoch": 1.58, "grad_norm": 9.371583647428105, "learning_rate": 6.895044819614484e-06, "loss": 0.3848, "step": 2214 }, { "epoch": 1.58, "grad_norm": 8.266982874224437, "learning_rate": 6.8923699789358185e-06, "loss": 0.3877, "step": 2215 }, { "epoch": 1.58, "grad_norm": 17.175344791502628, "learning_rate": 6.88969450598117e-06, "loss": 0.4609, "step": 2216 }, { "epoch": 1.58, "grad_norm": 20.899917847127078, "learning_rate": 6.887018401644463e-06, "loss": 0.5835, "step": 2217 }, { "epoch": 1.58, "grad_norm": 14.085073006810147, "learning_rate": 6.884341666819832e-06, "loss": 0.4443, "step": 2218 }, { "epoch": 1.58, "grad_norm": 10.194755959365828, "learning_rate": 6.881664302401626e-06, "loss": 0.5088, "step": 2219 }, { "epoch": 1.58, "grad_norm": 28.041590639227383, "learning_rate": 6.878986309284401e-06, "loss": 0.4272, "step": 2220 }, { "epoch": 1.59, "grad_norm": 18.858404710323796, "learning_rate": 6.876307688362925e-06, "loss": 0.5181, "step": 2221 }, { "epoch": 1.59, "grad_norm": 10.769923044069692, "learning_rate": 6.873628440532175e-06, "loss": 0.4094, "step": 2222 }, { "epoch": 1.59, "grad_norm": 10.760643999959065, "learning_rate": 6.8709485666873375e-06, "loss": 0.4131, "step": 2223 }, { "epoch": 1.59, "grad_norm": 10.059077802767067, "learning_rate": 6.868268067723808e-06, "loss": 0.498, "step": 2224 }, { "epoch": 1.59, "grad_norm": 16.067818695297724, "learning_rate": 6.86558694453719e-06, "loss": 0.4736, "step": 2225 }, { "epoch": 1.59, "grad_norm": 11.23251193640447, "learning_rate": 6.8629051980233e-06, "loss": 0.415, "step": 2226 }, { "epoch": 1.59, "grad_norm": 7.114819340426512, "learning_rate": 6.860222829078156e-06, "loss": 0.3423, "step": 2227 }, { "epoch": 1.59, "grad_norm": 12.762589126480844, "learning_rate": 6.857539838597987e-06, "loss": 0.3765, "step": 2228 }, { "epoch": 1.59, "grad_norm": 9.926047987180356, "learning_rate": 6.8548562274792325e-06, "loss": 0.4263, "step": 2229 }, { "epoch": 1.59, "grad_norm": 13.231555849492707, "learning_rate": 6.8521719966185355e-06, "loss": 0.498, "step": 2230 }, { "epoch": 1.59, "grad_norm": 14.73486658686708, "learning_rate": 6.8494871469127474e-06, "loss": 0.5493, "step": 2231 }, { "epoch": 1.59, "grad_norm": 12.05707560464292, "learning_rate": 6.846801679258926e-06, "loss": 0.5015, "step": 2232 }, { "epoch": 1.59, "grad_norm": 10.28340838759031, "learning_rate": 6.844115594554338e-06, "loss": 0.3784, "step": 2233 }, { "epoch": 1.59, "grad_norm": 21.26492682141842, "learning_rate": 6.841428893696453e-06, "loss": 0.521, "step": 2234 }, { "epoch": 1.6, "grad_norm": 8.269713147364001, "learning_rate": 6.838741577582946e-06, "loss": 0.4575, "step": 2235 }, { "epoch": 1.6, "grad_norm": 12.193115329733613, "learning_rate": 6.836053647111701e-06, "loss": 0.4976, "step": 2236 }, { "epoch": 1.6, "grad_norm": 8.42556116246502, "learning_rate": 6.833365103180806e-06, "loss": 0.4404, "step": 2237 }, { "epoch": 1.6, "grad_norm": 11.312625877201313, "learning_rate": 6.830675946688552e-06, "loss": 0.4473, "step": 2238 }, { "epoch": 1.6, "grad_norm": 11.467050902572582, "learning_rate": 6.827986178533437e-06, "loss": 0.5322, "step": 2239 }, { "epoch": 1.6, "grad_norm": 7.750353718427244, "learning_rate": 6.825295799614163e-06, "loss": 0.4478, "step": 2240 }, { "epoch": 1.6, "grad_norm": 8.352234008790848, "learning_rate": 6.822604810829634e-06, "loss": 0.4609, "step": 2241 }, { "epoch": 1.6, "grad_norm": 8.559244440826264, "learning_rate": 6.819913213078961e-06, "loss": 0.3923, "step": 2242 }, { "epoch": 1.6, "grad_norm": 14.88840732071109, "learning_rate": 6.817221007261456e-06, "loss": 0.4204, "step": 2243 }, { "epoch": 1.6, "grad_norm": 10.030745063332024, "learning_rate": 6.814528194276636e-06, "loss": 0.4292, "step": 2244 }, { "epoch": 1.6, "grad_norm": 11.309812809426399, "learning_rate": 6.811834775024219e-06, "loss": 0.5493, "step": 2245 }, { "epoch": 1.6, "grad_norm": 9.66642562016404, "learning_rate": 6.809140750404127e-06, "loss": 0.4292, "step": 2246 }, { "epoch": 1.6, "grad_norm": 10.012313894198071, "learning_rate": 6.8064461213164825e-06, "loss": 0.4946, "step": 2247 }, { "epoch": 1.6, "grad_norm": 9.857244202004253, "learning_rate": 6.803750888661611e-06, "loss": 0.4478, "step": 2248 }, { "epoch": 1.61, "grad_norm": 13.033407840690604, "learning_rate": 6.8010550533400425e-06, "loss": 0.438, "step": 2249 }, { "epoch": 1.61, "grad_norm": 10.267146635510022, "learning_rate": 6.798358616252503e-06, "loss": 0.4214, "step": 2250 }, { "epoch": 1.61, "grad_norm": 18.114380153302204, "learning_rate": 6.795661578299924e-06, "loss": 0.4097, "step": 2251 }, { "epoch": 1.61, "grad_norm": 15.6843245806793, "learning_rate": 6.792963940383436e-06, "loss": 0.5952, "step": 2252 }, { "epoch": 1.61, "grad_norm": 16.510405242918026, "learning_rate": 6.790265703404368e-06, "loss": 0.4707, "step": 2253 }, { "epoch": 1.61, "grad_norm": 8.494076326644194, "learning_rate": 6.787566868264253e-06, "loss": 0.4829, "step": 2254 }, { "epoch": 1.61, "grad_norm": 17.25444248319594, "learning_rate": 6.7848674358648195e-06, "loss": 0.438, "step": 2255 }, { "epoch": 1.61, "grad_norm": 13.54816942321224, "learning_rate": 6.782167407108001e-06, "loss": 0.5273, "step": 2256 }, { "epoch": 1.61, "grad_norm": 15.11514410545727, "learning_rate": 6.779466782895926e-06, "loss": 0.4658, "step": 2257 }, { "epoch": 1.61, "grad_norm": 16.18230982436298, "learning_rate": 6.7767655641309234e-06, "loss": 0.5889, "step": 2258 }, { "epoch": 1.61, "grad_norm": 11.47295733121618, "learning_rate": 6.7740637517155205e-06, "loss": 0.5142, "step": 2259 }, { "epoch": 1.61, "grad_norm": 11.585772419117465, "learning_rate": 6.771361346552445e-06, "loss": 0.4607, "step": 2260 }, { "epoch": 1.61, "grad_norm": 24.554032105944135, "learning_rate": 6.7686583495446164e-06, "loss": 0.4375, "step": 2261 }, { "epoch": 1.61, "grad_norm": 14.428829908613086, "learning_rate": 6.765954761595161e-06, "loss": 0.5117, "step": 2262 }, { "epoch": 1.62, "grad_norm": 6.54475905891446, "learning_rate": 6.763250583607392e-06, "loss": 0.3823, "step": 2263 }, { "epoch": 1.62, "grad_norm": 15.52379110881373, "learning_rate": 6.7605458164848316e-06, "loss": 0.4619, "step": 2264 }, { "epoch": 1.62, "grad_norm": 13.02231162018557, "learning_rate": 6.75784046113119e-06, "loss": 0.5483, "step": 2265 }, { "epoch": 1.62, "grad_norm": 10.998871840978227, "learning_rate": 6.755134518450377e-06, "loss": 0.502, "step": 2266 }, { "epoch": 1.62, "grad_norm": 21.813653765158957, "learning_rate": 6.752427989346497e-06, "loss": 0.4629, "step": 2267 }, { "epoch": 1.62, "grad_norm": 18.768658187986116, "learning_rate": 6.749720874723854e-06, "loss": 0.4678, "step": 2268 }, { "epoch": 1.62, "grad_norm": 8.821712753766564, "learning_rate": 6.747013175486944e-06, "loss": 0.4683, "step": 2269 }, { "epoch": 1.62, "grad_norm": 16.14702978647226, "learning_rate": 6.74430489254046e-06, "loss": 0.542, "step": 2270 }, { "epoch": 1.62, "grad_norm": 7.591931087130397, "learning_rate": 6.741596026789288e-06, "loss": 0.5176, "step": 2271 }, { "epoch": 1.62, "grad_norm": 9.99774676180277, "learning_rate": 6.7388865791385124e-06, "loss": 0.4536, "step": 2272 }, { "epoch": 1.62, "grad_norm": 10.404000893575382, "learning_rate": 6.736176550493411e-06, "loss": 0.5005, "step": 2273 }, { "epoch": 1.62, "grad_norm": 14.67200743832191, "learning_rate": 6.7334659417594514e-06, "loss": 0.5234, "step": 2274 }, { "epoch": 1.62, "grad_norm": 13.823394969175009, "learning_rate": 6.730754753842303e-06, "loss": 0.4229, "step": 2275 }, { "epoch": 1.62, "grad_norm": 9.011366696228007, "learning_rate": 6.728042987647818e-06, "loss": 0.3921, "step": 2276 }, { "epoch": 1.63, "grad_norm": 16.774675579908692, "learning_rate": 6.725330644082054e-06, "loss": 0.5049, "step": 2277 }, { "epoch": 1.63, "grad_norm": 8.935065734139199, "learning_rate": 6.7226177240512516e-06, "loss": 0.4927, "step": 2278 }, { "epoch": 1.63, "grad_norm": 8.554489129584331, "learning_rate": 6.7199042284618484e-06, "loss": 0.4419, "step": 2279 }, { "epoch": 1.63, "grad_norm": 10.926780112484742, "learning_rate": 6.717190158220475e-06, "loss": 0.5508, "step": 2280 }, { "epoch": 1.63, "grad_norm": 9.840068980238367, "learning_rate": 6.714475514233951e-06, "loss": 0.4165, "step": 2281 }, { "epoch": 1.63, "grad_norm": 20.501441251965698, "learning_rate": 6.71176029740929e-06, "loss": 0.4741, "step": 2282 }, { "epoch": 1.63, "grad_norm": 14.55482426352725, "learning_rate": 6.709044508653697e-06, "loss": 0.4609, "step": 2283 }, { "epoch": 1.63, "grad_norm": 11.114294673812632, "learning_rate": 6.706328148874568e-06, "loss": 0.4961, "step": 2284 }, { "epoch": 1.63, "grad_norm": 16.349083200716997, "learning_rate": 6.703611218979488e-06, "loss": 0.5098, "step": 2285 }, { "epoch": 1.63, "grad_norm": 11.011989880992953, "learning_rate": 6.700893719876234e-06, "loss": 0.4868, "step": 2286 }, { "epoch": 1.63, "grad_norm": 10.689757524975285, "learning_rate": 6.698175652472774e-06, "loss": 0.4512, "step": 2287 }, { "epoch": 1.63, "grad_norm": 12.898403684293179, "learning_rate": 6.695457017677263e-06, "loss": 0.4131, "step": 2288 }, { "epoch": 1.63, "grad_norm": 10.240922912512271, "learning_rate": 6.692737816398048e-06, "loss": 0.5112, "step": 2289 }, { "epoch": 1.63, "grad_norm": 16.147572604976922, "learning_rate": 6.6900180495436664e-06, "loss": 0.3979, "step": 2290 }, { "epoch": 1.64, "grad_norm": 19.700085377690556, "learning_rate": 6.68729771802284e-06, "loss": 0.4995, "step": 2291 }, { "epoch": 1.64, "grad_norm": 8.978659517104596, "learning_rate": 6.6845768227444855e-06, "loss": 0.3945, "step": 2292 }, { "epoch": 1.64, "grad_norm": 11.810366816874936, "learning_rate": 6.681855364617702e-06, "loss": 0.4082, "step": 2293 }, { "epoch": 1.64, "grad_norm": 11.532079335607683, "learning_rate": 6.67913334455178e-06, "loss": 0.4424, "step": 2294 }, { "epoch": 1.64, "grad_norm": 15.749828735552693, "learning_rate": 6.676410763456197e-06, "loss": 0.4722, "step": 2295 }, { "epoch": 1.64, "grad_norm": 9.918601367919049, "learning_rate": 6.673687622240619e-06, "loss": 0.4126, "step": 2296 }, { "epoch": 1.64, "grad_norm": 14.349256190704107, "learning_rate": 6.670963921814896e-06, "loss": 0.5859, "step": 2297 }, { "epoch": 1.64, "grad_norm": 26.856607295110678, "learning_rate": 6.668239663089069e-06, "loss": 0.542, "step": 2298 }, { "epoch": 1.64, "grad_norm": 13.00766965784754, "learning_rate": 6.665514846973361e-06, "loss": 0.4756, "step": 2299 }, { "epoch": 1.64, "grad_norm": 14.099268884344017, "learning_rate": 6.662789474378186e-06, "loss": 0.5103, "step": 2300 }, { "epoch": 1.64, "grad_norm": 13.330890563879269, "learning_rate": 6.6600635462141415e-06, "loss": 0.355, "step": 2301 }, { "epoch": 1.64, "grad_norm": 11.992856781553082, "learning_rate": 6.657337063392011e-06, "loss": 0.4316, "step": 2302 }, { "epoch": 1.64, "grad_norm": 6.711043536387954, "learning_rate": 6.654610026822761e-06, "loss": 0.3696, "step": 2303 }, { "epoch": 1.64, "grad_norm": 11.942728892170798, "learning_rate": 6.651882437417546e-06, "loss": 0.4727, "step": 2304 }, { "epoch": 1.65, "grad_norm": 11.25898618932495, "learning_rate": 6.649154296087705e-06, "loss": 0.5059, "step": 2305 }, { "epoch": 1.65, "grad_norm": 10.960042275216363, "learning_rate": 6.646425603744759e-06, "loss": 0.4067, "step": 2306 }, { "epoch": 1.65, "grad_norm": 11.117852544611928, "learning_rate": 6.643696361300418e-06, "loss": 0.5503, "step": 2307 }, { "epoch": 1.65, "grad_norm": 19.347126680847555, "learning_rate": 6.6409665696665715e-06, "loss": 0.4541, "step": 2308 }, { "epoch": 1.65, "grad_norm": 14.520385793005548, "learning_rate": 6.638236229755292e-06, "loss": 0.5381, "step": 2309 }, { "epoch": 1.65, "grad_norm": 7.949529629176509, "learning_rate": 6.635505342478838e-06, "loss": 0.4204, "step": 2310 }, { "epoch": 1.65, "grad_norm": 10.147732369032411, "learning_rate": 6.632773908749649e-06, "loss": 0.4448, "step": 2311 }, { "epoch": 1.65, "grad_norm": 9.58683877473919, "learning_rate": 6.630041929480349e-06, "loss": 0.436, "step": 2312 }, { "epoch": 1.65, "grad_norm": 14.967781744974912, "learning_rate": 6.627309405583741e-06, "loss": 0.4839, "step": 2313 }, { "epoch": 1.65, "grad_norm": 9.185548630716545, "learning_rate": 6.624576337972815e-06, "loss": 0.4331, "step": 2314 }, { "epoch": 1.65, "grad_norm": 9.782057188604156, "learning_rate": 6.621842727560737e-06, "loss": 0.415, "step": 2315 }, { "epoch": 1.65, "grad_norm": 11.673091390131038, "learning_rate": 6.6191085752608575e-06, "loss": 0.4946, "step": 2316 }, { "epoch": 1.65, "grad_norm": 10.697402257685303, "learning_rate": 6.616373881986708e-06, "loss": 0.5723, "step": 2317 }, { "epoch": 1.65, "grad_norm": 7.647812348945917, "learning_rate": 6.613638648652002e-06, "loss": 0.4097, "step": 2318 }, { "epoch": 1.66, "grad_norm": 6.549329240009999, "learning_rate": 6.610902876170631e-06, "loss": 0.4482, "step": 2319 }, { "epoch": 1.66, "grad_norm": 10.337967711639493, "learning_rate": 6.608166565456666e-06, "loss": 0.4434, "step": 2320 }, { "epoch": 1.66, "grad_norm": 17.036210608178823, "learning_rate": 6.605429717424359e-06, "loss": 0.4116, "step": 2321 }, { "epoch": 1.66, "grad_norm": 10.4673537904469, "learning_rate": 6.602692332988143e-06, "loss": 0.4302, "step": 2322 }, { "epoch": 1.66, "grad_norm": 9.325826026833447, "learning_rate": 6.5999544130626305e-06, "loss": 0.4438, "step": 2323 }, { "epoch": 1.66, "grad_norm": 12.040149249921235, "learning_rate": 6.597215958562608e-06, "loss": 0.46, "step": 2324 }, { "epoch": 1.66, "grad_norm": 8.656646325351767, "learning_rate": 6.5944769704030465e-06, "loss": 0.4453, "step": 2325 }, { "epoch": 1.66, "grad_norm": 11.170279267084911, "learning_rate": 6.591737449499092e-06, "loss": 0.4639, "step": 2326 }, { "epoch": 1.66, "grad_norm": 9.82839224846332, "learning_rate": 6.58899739676607e-06, "loss": 0.4546, "step": 2327 }, { "epoch": 1.66, "grad_norm": 9.496186973092405, "learning_rate": 6.586256813119482e-06, "loss": 0.4648, "step": 2328 }, { "epoch": 1.66, "grad_norm": 8.873342997647923, "learning_rate": 6.583515699475009e-06, "loss": 0.4561, "step": 2329 }, { "epoch": 1.66, "grad_norm": 10.344690040103952, "learning_rate": 6.580774056748508e-06, "loss": 0.4336, "step": 2330 }, { "epoch": 1.66, "grad_norm": 11.872733176778468, "learning_rate": 6.578031885856011e-06, "loss": 0.4167, "step": 2331 }, { "epoch": 1.66, "grad_norm": 8.83642693495639, "learning_rate": 6.575289187713731e-06, "loss": 0.3911, "step": 2332 }, { "epoch": 1.67, "grad_norm": 17.856781054644795, "learning_rate": 6.572545963238053e-06, "loss": 0.4536, "step": 2333 }, { "epoch": 1.67, "grad_norm": 11.347855910699995, "learning_rate": 6.569802213345537e-06, "loss": 0.5303, "step": 2334 }, { "epoch": 1.67, "grad_norm": 11.646592360652438, "learning_rate": 6.5670579389529255e-06, "loss": 0.4077, "step": 2335 }, { "epoch": 1.67, "grad_norm": 7.438467277076052, "learning_rate": 6.56431314097713e-06, "loss": 0.3088, "step": 2336 }, { "epoch": 1.67, "grad_norm": 11.023040745678406, "learning_rate": 6.561567820335236e-06, "loss": 0.3955, "step": 2337 }, { "epoch": 1.67, "grad_norm": 10.033813975390757, "learning_rate": 6.558821977944508e-06, "loss": 0.4468, "step": 2338 }, { "epoch": 1.67, "grad_norm": 16.110205229735936, "learning_rate": 6.556075614722383e-06, "loss": 0.5103, "step": 2339 }, { "epoch": 1.67, "grad_norm": 11.807452959562594, "learning_rate": 6.553328731586473e-06, "loss": 0.5112, "step": 2340 }, { "epoch": 1.67, "grad_norm": 11.161963178049959, "learning_rate": 6.550581329454561e-06, "loss": 0.437, "step": 2341 }, { "epoch": 1.67, "grad_norm": 8.447457606510927, "learning_rate": 6.547833409244606e-06, "loss": 0.3843, "step": 2342 }, { "epoch": 1.67, "grad_norm": 9.377025255192994, "learning_rate": 6.545084971874738e-06, "loss": 0.4229, "step": 2343 }, { "epoch": 1.67, "grad_norm": 7.855103606469367, "learning_rate": 6.542336018263262e-06, "loss": 0.3804, "step": 2344 }, { "epoch": 1.67, "grad_norm": 27.46634631545949, "learning_rate": 6.539586549328656e-06, "loss": 0.7871, "step": 2345 }, { "epoch": 1.67, "grad_norm": 11.359851062684688, "learning_rate": 6.536836565989565e-06, "loss": 0.3911, "step": 2346 }, { "epoch": 1.68, "grad_norm": 11.053055914900025, "learning_rate": 6.534086069164813e-06, "loss": 0.4321, "step": 2347 }, { "epoch": 1.68, "grad_norm": 14.018450662610983, "learning_rate": 6.531335059773392e-06, "loss": 0.4824, "step": 2348 }, { "epoch": 1.68, "grad_norm": 11.550512997935256, "learning_rate": 6.528583538734463e-06, "loss": 0.5078, "step": 2349 }, { "epoch": 1.68, "grad_norm": 13.133602547357903, "learning_rate": 6.525831506967361e-06, "loss": 0.4575, "step": 2350 }, { "epoch": 1.68, "grad_norm": 8.318668444691664, "learning_rate": 6.523078965391592e-06, "loss": 0.4258, "step": 2351 }, { "epoch": 1.68, "grad_norm": 9.446059276582055, "learning_rate": 6.520325914926831e-06, "loss": 0.4482, "step": 2352 }, { "epoch": 1.68, "grad_norm": 16.682194650341806, "learning_rate": 6.517572356492922e-06, "loss": 0.4624, "step": 2353 }, { "epoch": 1.68, "grad_norm": 20.23938157311803, "learning_rate": 6.514818291009881e-06, "loss": 0.498, "step": 2354 }, { "epoch": 1.68, "grad_norm": 17.820982474486428, "learning_rate": 6.512063719397894e-06, "loss": 0.5381, "step": 2355 }, { "epoch": 1.68, "grad_norm": 13.993613065974447, "learning_rate": 6.5093086425773126e-06, "loss": 0.5732, "step": 2356 }, { "epoch": 1.68, "grad_norm": 8.37289512006142, "learning_rate": 6.506553061468659e-06, "loss": 0.4253, "step": 2357 }, { "epoch": 1.68, "grad_norm": 11.600077704313506, "learning_rate": 6.5037969769926256e-06, "loss": 0.4316, "step": 2358 }, { "epoch": 1.68, "grad_norm": 14.425829865085978, "learning_rate": 6.501040390070071e-06, "loss": 0.4639, "step": 2359 }, { "epoch": 1.68, "grad_norm": 6.9917751899752485, "learning_rate": 6.498283301622022e-06, "loss": 0.3745, "step": 2360 }, { "epoch": 1.69, "grad_norm": 11.202380861543828, "learning_rate": 6.495525712569673e-06, "loss": 0.4907, "step": 2361 }, { "epoch": 1.69, "grad_norm": 10.609299484513734, "learning_rate": 6.492767623834385e-06, "loss": 0.4478, "step": 2362 }, { "epoch": 1.69, "grad_norm": 10.853753914008019, "learning_rate": 6.490009036337687e-06, "loss": 0.4463, "step": 2363 }, { "epoch": 1.69, "grad_norm": 12.183051932609295, "learning_rate": 6.487249951001276e-06, "loss": 0.501, "step": 2364 }, { "epoch": 1.69, "grad_norm": 11.744349364358902, "learning_rate": 6.484490368747012e-06, "loss": 0.4519, "step": 2365 }, { "epoch": 1.69, "grad_norm": 14.03256193597822, "learning_rate": 6.4817302904969226e-06, "loss": 0.5122, "step": 2366 }, { "epoch": 1.69, "grad_norm": 10.807691344764418, "learning_rate": 6.4789697171732024e-06, "loss": 0.5269, "step": 2367 }, { "epoch": 1.69, "grad_norm": 7.611847017024495, "learning_rate": 6.476208649698209e-06, "loss": 0.4209, "step": 2368 }, { "epoch": 1.69, "grad_norm": 13.295897556272315, "learning_rate": 6.473447088994467e-06, "loss": 0.3936, "step": 2369 }, { "epoch": 1.69, "grad_norm": 14.443824455010702, "learning_rate": 6.470685035984667e-06, "loss": 0.4585, "step": 2370 }, { "epoch": 1.69, "grad_norm": 9.843529569578008, "learning_rate": 6.467922491591658e-06, "loss": 0.3989, "step": 2371 }, { "epoch": 1.69, "grad_norm": 7.301042404680986, "learning_rate": 6.465159456738461e-06, "loss": 0.4258, "step": 2372 }, { "epoch": 1.69, "grad_norm": 9.457468991739518, "learning_rate": 6.462395932348257e-06, "loss": 0.437, "step": 2373 }, { "epoch": 1.69, "grad_norm": 9.706783737244205, "learning_rate": 6.459631919344389e-06, "loss": 0.4785, "step": 2374 }, { "epoch": 1.7, "grad_norm": 11.469484126644588, "learning_rate": 6.456867418650366e-06, "loss": 0.4321, "step": 2375 }, { "epoch": 1.7, "grad_norm": 14.561227461724823, "learning_rate": 6.454102431189859e-06, "loss": 0.4399, "step": 2376 }, { "epoch": 1.7, "grad_norm": 16.39246937193766, "learning_rate": 6.4513369578867026e-06, "loss": 0.521, "step": 2377 }, { "epoch": 1.7, "grad_norm": 12.29432435609855, "learning_rate": 6.448570999664894e-06, "loss": 0.4775, "step": 2378 }, { "epoch": 1.7, "grad_norm": 17.171427132716822, "learning_rate": 6.4458045574485875e-06, "loss": 0.5679, "step": 2379 }, { "epoch": 1.7, "grad_norm": 23.83379330955623, "learning_rate": 6.443037632162104e-06, "loss": 0.5278, "step": 2380 }, { "epoch": 1.7, "grad_norm": 14.84513216881372, "learning_rate": 6.440270224729927e-06, "loss": 0.5034, "step": 2381 }, { "epoch": 1.7, "grad_norm": 11.928504744206203, "learning_rate": 6.437502336076695e-06, "loss": 0.5376, "step": 2382 }, { "epoch": 1.7, "grad_norm": 8.857986910329018, "learning_rate": 6.4347339671272155e-06, "loss": 0.3999, "step": 2383 }, { "epoch": 1.7, "grad_norm": 16.81608671529195, "learning_rate": 6.431965118806449e-06, "loss": 0.4619, "step": 2384 }, { "epoch": 1.7, "grad_norm": 27.17987199179481, "learning_rate": 6.42919579203952e-06, "loss": 0.5947, "step": 2385 }, { "epoch": 1.7, "grad_norm": 11.90191014571275, "learning_rate": 6.4264259877517124e-06, "loss": 0.5737, "step": 2386 }, { "epoch": 1.7, "grad_norm": 8.486777872297138, "learning_rate": 6.423655706868468e-06, "loss": 0.4072, "step": 2387 }, { "epoch": 1.7, "grad_norm": 7.62100655773416, "learning_rate": 6.4208849503153915e-06, "loss": 0.4175, "step": 2388 }, { "epoch": 1.71, "grad_norm": 12.205713284193536, "learning_rate": 6.418113719018242e-06, "loss": 0.541, "step": 2389 }, { "epoch": 1.71, "grad_norm": 8.27010808597464, "learning_rate": 6.415342013902939e-06, "loss": 0.4458, "step": 2390 }, { "epoch": 1.71, "grad_norm": 21.96766320470673, "learning_rate": 6.412569835895562e-06, "loss": 0.4741, "step": 2391 }, { "epoch": 1.71, "grad_norm": 7.6282420344549635, "learning_rate": 6.409797185922349e-06, "loss": 0.4624, "step": 2392 }, { "epoch": 1.71, "grad_norm": 16.407715638452274, "learning_rate": 6.40702406490969e-06, "loss": 0.438, "step": 2393 }, { "epoch": 1.71, "grad_norm": 6.853759038300608, "learning_rate": 6.404250473784138e-06, "loss": 0.4116, "step": 2394 }, { "epoch": 1.71, "grad_norm": 10.122502455763344, "learning_rate": 6.401476413472404e-06, "loss": 0.4565, "step": 2395 }, { "epoch": 1.71, "grad_norm": 7.738275465556727, "learning_rate": 6.398701884901348e-06, "loss": 0.4673, "step": 2396 }, { "epoch": 1.71, "grad_norm": 7.403912357064693, "learning_rate": 6.3959268889979956e-06, "loss": 0.4712, "step": 2397 }, { "epoch": 1.71, "grad_norm": 7.246073860215684, "learning_rate": 6.393151426689522e-06, "loss": 0.4727, "step": 2398 }, { "epoch": 1.71, "grad_norm": 12.726839635078996, "learning_rate": 6.390375498903263e-06, "loss": 0.5, "step": 2399 }, { "epoch": 1.71, "grad_norm": 9.077518387969189, "learning_rate": 6.387599106566705e-06, "loss": 0.3665, "step": 2400 }, { "epoch": 1.71, "grad_norm": 13.115234070495553, "learning_rate": 6.384822250607495e-06, "loss": 0.5576, "step": 2401 }, { "epoch": 1.71, "grad_norm": 10.158381190711447, "learning_rate": 6.382044931953431e-06, "loss": 0.4087, "step": 2402 }, { "epoch": 1.72, "grad_norm": 9.67290912672018, "learning_rate": 6.379267151532467e-06, "loss": 0.543, "step": 2403 }, { "epoch": 1.72, "grad_norm": 7.817381759789311, "learning_rate": 6.376488910272709e-06, "loss": 0.4165, "step": 2404 }, { "epoch": 1.72, "grad_norm": 10.705521476960307, "learning_rate": 6.373710209102423e-06, "loss": 0.4487, "step": 2405 }, { "epoch": 1.72, "grad_norm": 19.270499479688624, "learning_rate": 6.370931048950022e-06, "loss": 0.4756, "step": 2406 }, { "epoch": 1.72, "grad_norm": 15.738719945653308, "learning_rate": 6.368151430744075e-06, "loss": 0.4893, "step": 2407 }, { "epoch": 1.72, "grad_norm": 10.66405350297458, "learning_rate": 6.365371355413306e-06, "loss": 0.4688, "step": 2408 }, { "epoch": 1.72, "grad_norm": 9.542128682731759, "learning_rate": 6.362590823886588e-06, "loss": 0.4131, "step": 2409 }, { "epoch": 1.72, "grad_norm": 12.231576688229438, "learning_rate": 6.359809837092947e-06, "loss": 0.4248, "step": 2410 }, { "epoch": 1.72, "grad_norm": 13.471150104035912, "learning_rate": 6.357028395961566e-06, "loss": 0.4961, "step": 2411 }, { "epoch": 1.72, "grad_norm": 12.518938039693966, "learning_rate": 6.354246501421777e-06, "loss": 0.5054, "step": 2412 }, { "epoch": 1.72, "grad_norm": 17.325117897931076, "learning_rate": 6.3514641544030575e-06, "loss": 0.5117, "step": 2413 }, { "epoch": 1.72, "grad_norm": 11.695654010917847, "learning_rate": 6.348681355835043e-06, "loss": 0.4731, "step": 2414 }, { "epoch": 1.72, "grad_norm": 10.353958500414972, "learning_rate": 6.345898106647521e-06, "loss": 0.4497, "step": 2415 }, { "epoch": 1.72, "grad_norm": 9.125134539120891, "learning_rate": 6.3431144077704245e-06, "loss": 0.5361, "step": 2416 }, { "epoch": 1.73, "grad_norm": 12.29982044993289, "learning_rate": 6.340330260133839e-06, "loss": 0.5303, "step": 2417 }, { "epoch": 1.73, "grad_norm": 11.635927861507614, "learning_rate": 6.337545664668001e-06, "loss": 0.4492, "step": 2418 }, { "epoch": 1.73, "grad_norm": 8.450720986263002, "learning_rate": 6.334760622303294e-06, "loss": 0.4526, "step": 2419 }, { "epoch": 1.73, "grad_norm": 9.96898407515457, "learning_rate": 6.331975133970255e-06, "loss": 0.374, "step": 2420 }, { "epoch": 1.73, "grad_norm": 22.99713545008352, "learning_rate": 6.329189200599566e-06, "loss": 0.5244, "step": 2421 }, { "epoch": 1.73, "grad_norm": 7.164058078222877, "learning_rate": 6.326402823122059e-06, "loss": 0.3335, "step": 2422 }, { "epoch": 1.73, "grad_norm": 8.821666072907615, "learning_rate": 6.3236160024687134e-06, "loss": 0.4614, "step": 2423 }, { "epoch": 1.73, "grad_norm": 8.86789061992421, "learning_rate": 6.3208287395706595e-06, "loss": 0.4541, "step": 2424 }, { "epoch": 1.73, "grad_norm": 14.767069143692144, "learning_rate": 6.3180410353591735e-06, "loss": 0.4414, "step": 2425 }, { "epoch": 1.73, "grad_norm": 12.27317813903384, "learning_rate": 6.315252890765678e-06, "loss": 0.502, "step": 2426 }, { "epoch": 1.73, "grad_norm": 7.0533891840444864, "learning_rate": 6.312464306721745e-06, "loss": 0.4478, "step": 2427 }, { "epoch": 1.73, "grad_norm": 8.284405823285676, "learning_rate": 6.309675284159093e-06, "loss": 0.4644, "step": 2428 }, { "epoch": 1.73, "grad_norm": 10.273614279657444, "learning_rate": 6.306885824009585e-06, "loss": 0.3833, "step": 2429 }, { "epoch": 1.73, "grad_norm": 8.094247190767463, "learning_rate": 6.3040959272052315e-06, "loss": 0.3955, "step": 2430 }, { "epoch": 1.74, "grad_norm": 8.05210332824741, "learning_rate": 6.301305594678189e-06, "loss": 0.4409, "step": 2431 }, { "epoch": 1.74, "grad_norm": 7.070593419557918, "learning_rate": 6.2985148273607586e-06, "loss": 0.394, "step": 2432 }, { "epoch": 1.74, "grad_norm": 7.162689408282903, "learning_rate": 6.29572362618539e-06, "loss": 0.3916, "step": 2433 }, { "epoch": 1.74, "grad_norm": 18.341208257731296, "learning_rate": 6.292931992084672e-06, "loss": 0.6147, "step": 2434 }, { "epoch": 1.74, "grad_norm": 12.475007541185304, "learning_rate": 6.290139925991345e-06, "loss": 0.5015, "step": 2435 }, { "epoch": 1.74, "grad_norm": 11.472537023920362, "learning_rate": 6.287347428838289e-06, "loss": 0.3416, "step": 2436 }, { "epoch": 1.74, "grad_norm": 12.81968592719629, "learning_rate": 6.2845545015585275e-06, "loss": 0.5249, "step": 2437 }, { "epoch": 1.74, "grad_norm": 11.127563834698211, "learning_rate": 6.281761145085232e-06, "loss": 0.4546, "step": 2438 }, { "epoch": 1.74, "grad_norm": 6.2131478626402, "learning_rate": 6.278967360351712e-06, "loss": 0.291, "step": 2439 }, { "epoch": 1.74, "grad_norm": 13.014830861323171, "learning_rate": 6.276173148291425e-06, "loss": 0.4507, "step": 2440 }, { "epoch": 1.74, "grad_norm": 8.370157635005995, "learning_rate": 6.273378509837969e-06, "loss": 0.3735, "step": 2441 }, { "epoch": 1.74, "grad_norm": 19.4882237053066, "learning_rate": 6.2705834459250825e-06, "loss": 0.4019, "step": 2442 }, { "epoch": 1.74, "grad_norm": 10.073520612528323, "learning_rate": 6.2677879574866515e-06, "loss": 0.4048, "step": 2443 }, { "epoch": 1.74, "grad_norm": 10.132342613326061, "learning_rate": 6.264992045456699e-06, "loss": 0.4619, "step": 2444 }, { "epoch": 1.75, "grad_norm": 11.160719007717073, "learning_rate": 6.262195710769391e-06, "loss": 0.3857, "step": 2445 }, { "epoch": 1.75, "grad_norm": 19.472439966811617, "learning_rate": 6.259398954359037e-06, "loss": 0.4429, "step": 2446 }, { "epoch": 1.75, "grad_norm": 15.427769712154046, "learning_rate": 6.256601777160082e-06, "loss": 0.6123, "step": 2447 }, { "epoch": 1.75, "grad_norm": 13.639713264184406, "learning_rate": 6.253804180107116e-06, "loss": 0.4785, "step": 2448 }, { "epoch": 1.75, "grad_norm": 19.158135673228788, "learning_rate": 6.2510061641348695e-06, "loss": 0.5244, "step": 2449 }, { "epoch": 1.75, "grad_norm": 10.730810111574574, "learning_rate": 6.248207730178211e-06, "loss": 0.4771, "step": 2450 }, { "epoch": 1.75, "grad_norm": 8.479475483349935, "learning_rate": 6.245408879172148e-06, "loss": 0.4209, "step": 2451 }, { "epoch": 1.75, "grad_norm": 12.545307163598485, "learning_rate": 6.24260961205183e-06, "loss": 0.4536, "step": 2452 }, { "epoch": 1.75, "grad_norm": 7.433203777453386, "learning_rate": 6.239809929752544e-06, "loss": 0.3677, "step": 2453 }, { "epoch": 1.75, "grad_norm": 10.724851412359518, "learning_rate": 6.237009833209715e-06, "loss": 0.4668, "step": 2454 }, { "epoch": 1.75, "grad_norm": 24.535444127877504, "learning_rate": 6.2342093233589095e-06, "loss": 0.6094, "step": 2455 }, { "epoch": 1.75, "grad_norm": 10.131949690353993, "learning_rate": 6.231408401135828e-06, "loss": 0.4727, "step": 2456 }, { "epoch": 1.75, "grad_norm": 12.95385057418741, "learning_rate": 6.228607067476311e-06, "loss": 0.425, "step": 2457 }, { "epoch": 1.75, "grad_norm": 11.652779199575274, "learning_rate": 6.225805323316336e-06, "loss": 0.5317, "step": 2458 }, { "epoch": 1.76, "grad_norm": 13.756602418041059, "learning_rate": 6.223003169592018e-06, "loss": 0.4741, "step": 2459 }, { "epoch": 1.76, "grad_norm": 13.349749974546135, "learning_rate": 6.220200607239609e-06, "loss": 0.4736, "step": 2460 }, { "epoch": 1.76, "grad_norm": 9.203291816830776, "learning_rate": 6.217397637195497e-06, "loss": 0.4951, "step": 2461 }, { "epoch": 1.76, "grad_norm": 10.149381480669541, "learning_rate": 6.214594260396206e-06, "loss": 0.4688, "step": 2462 }, { "epoch": 1.76, "grad_norm": 14.733718301343428, "learning_rate": 6.211790477778399e-06, "loss": 0.4492, "step": 2463 }, { "epoch": 1.76, "grad_norm": 15.776778166448288, "learning_rate": 6.208986290278866e-06, "loss": 0.3716, "step": 2464 }, { "epoch": 1.76, "grad_norm": 9.981056473195446, "learning_rate": 6.206181698834544e-06, "loss": 0.5264, "step": 2465 }, { "epoch": 1.76, "grad_norm": 12.177267724597133, "learning_rate": 6.2033767043824955e-06, "loss": 0.4902, "step": 2466 }, { "epoch": 1.76, "grad_norm": 9.497702259870394, "learning_rate": 6.200571307859923e-06, "loss": 0.4185, "step": 2467 }, { "epoch": 1.76, "grad_norm": 19.49048950885061, "learning_rate": 6.197765510204161e-06, "loss": 0.5205, "step": 2468 }, { "epoch": 1.76, "grad_norm": 8.508146338639264, "learning_rate": 6.19495931235268e-06, "loss": 0.4531, "step": 2469 }, { "epoch": 1.76, "grad_norm": 14.260708359404394, "learning_rate": 6.19215271524308e-06, "loss": 0.4541, "step": 2470 }, { "epoch": 1.76, "grad_norm": 13.1175023636757, "learning_rate": 6.189345719813099e-06, "loss": 0.4873, "step": 2471 }, { "epoch": 1.76, "grad_norm": 11.651326204773177, "learning_rate": 6.186538327000609e-06, "loss": 0.4849, "step": 2472 }, { "epoch": 1.77, "grad_norm": 9.573726463990967, "learning_rate": 6.183730537743607e-06, "loss": 0.4097, "step": 2473 }, { "epoch": 1.77, "grad_norm": 16.017710357353614, "learning_rate": 6.18092235298023e-06, "loss": 0.4702, "step": 2474 }, { "epoch": 1.77, "grad_norm": 9.84022010023255, "learning_rate": 6.178113773648745e-06, "loss": 0.3994, "step": 2475 }, { "epoch": 1.77, "grad_norm": 9.022407957306385, "learning_rate": 6.175304800687551e-06, "loss": 0.4067, "step": 2476 }, { "epoch": 1.77, "grad_norm": 11.250357096874497, "learning_rate": 6.172495435035176e-06, "loss": 0.4756, "step": 2477 }, { "epoch": 1.77, "grad_norm": 8.921552289632256, "learning_rate": 6.169685677630284e-06, "loss": 0.4336, "step": 2478 }, { "epoch": 1.77, "grad_norm": 7.198045788857288, "learning_rate": 6.1668755294116655e-06, "loss": 0.3325, "step": 2479 }, { "epoch": 1.77, "grad_norm": 14.228855338016636, "learning_rate": 6.1640649913182436e-06, "loss": 0.4209, "step": 2480 }, { "epoch": 1.77, "grad_norm": 8.828383315352083, "learning_rate": 6.161254064289072e-06, "loss": 0.4023, "step": 2481 }, { "epoch": 1.77, "grad_norm": 11.423877993770597, "learning_rate": 6.158442749263332e-06, "loss": 0.4683, "step": 2482 }, { "epoch": 1.77, "grad_norm": 14.780261625052676, "learning_rate": 6.155631047180337e-06, "loss": 0.4595, "step": 2483 }, { "epoch": 1.77, "grad_norm": 11.04469853406095, "learning_rate": 6.152818958979529e-06, "loss": 0.5542, "step": 2484 }, { "epoch": 1.77, "grad_norm": 15.285094723692712, "learning_rate": 6.1500064856004796e-06, "loss": 0.4995, "step": 2485 }, { "epoch": 1.77, "grad_norm": 8.325731810799802, "learning_rate": 6.147193627982887e-06, "loss": 0.3689, "step": 2486 }, { "epoch": 1.78, "grad_norm": 12.60771444277801, "learning_rate": 6.144380387066581e-06, "loss": 0.4771, "step": 2487 }, { "epoch": 1.78, "grad_norm": 8.951542508375255, "learning_rate": 6.141566763791518e-06, "loss": 0.4243, "step": 2488 }, { "epoch": 1.78, "grad_norm": 10.53628405098184, "learning_rate": 6.138752759097778e-06, "loss": 0.4272, "step": 2489 }, { "epoch": 1.78, "grad_norm": 10.97888607653731, "learning_rate": 6.135938373925576e-06, "loss": 0.4653, "step": 2490 }, { "epoch": 1.78, "grad_norm": 10.48936310841184, "learning_rate": 6.133123609215249e-06, "loss": 0.4019, "step": 2491 }, { "epoch": 1.78, "grad_norm": 10.096587824152824, "learning_rate": 6.130308465907263e-06, "loss": 0.396, "step": 2492 }, { "epoch": 1.78, "grad_norm": 9.227096202950863, "learning_rate": 6.127492944942209e-06, "loss": 0.4258, "step": 2493 }, { "epoch": 1.78, "grad_norm": 12.942411840921341, "learning_rate": 6.124677047260805e-06, "loss": 0.3965, "step": 2494 }, { "epoch": 1.78, "grad_norm": 9.00521904473558, "learning_rate": 6.121860773803895e-06, "loss": 0.4102, "step": 2495 }, { "epoch": 1.78, "grad_norm": 9.245096943439378, "learning_rate": 6.119044125512447e-06, "loss": 0.3967, "step": 2496 }, { "epoch": 1.78, "grad_norm": 9.70014640452883, "learning_rate": 6.116227103327559e-06, "loss": 0.4951, "step": 2497 }, { "epoch": 1.78, "grad_norm": 15.83092193293532, "learning_rate": 6.113409708190447e-06, "loss": 0.5278, "step": 2498 }, { "epoch": 1.78, "grad_norm": 18.447966368255546, "learning_rate": 6.1105919410424566e-06, "loss": 0.5225, "step": 2499 }, { "epoch": 1.78, "grad_norm": 16.135937978938692, "learning_rate": 6.107773802825055e-06, "loss": 0.52, "step": 2500 }, { "epoch": 1.78, "eval_avg_AUC": 0.7652931989776042, "eval_avg_Accuracy": 0.6897380636604774, "eval_avg_Accuracy-right": 0.8790922133820269, "eval_avg_Accuracy-wrong": 0.3595633386399818, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6470409856535183, "eval_last_AUC": 0.7821256327791941, "eval_last_Accuracy": 0.7235162466843501, "eval_last_Accuracy-right": 0.8033128994391548, "eval_last_Accuracy-wrong": 0.584375710711849, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6545993525396961, "eval_max_AUC": 0.7132634593306635, "eval_max_Accuracy": 0.6386770557029178, "eval_max_Accuracy-right": 0.9663492891613408, "eval_max_Accuracy-wrong": 0.06731862633613828, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.5960403684660525, "eval_min_AUC": 0.7740940679255949, "eval_min_Accuracy": 0.7144396551724138, "eval_min_Accuracy-right": 0.7332724664145037, "eval_min_Accuracy-wrong": 0.6816010916534, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6455547810166508, "eval_prod_AUC": 0.774109644816048, "eval_prod_Accuracy": 0.6302635941644562, "eval_prod_Accuracy-right": 0.4789356984478936, "eval_prod_Accuracy-wrong": 0.8941323629747555, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6440620351725057, "eval_runtime": 247.2806, "eval_samples_per_second": 97.573, "eval_steps_per_second": 3.049, "eval_sum_AUC": 0.6276356610336933, "eval_sum_Accuracy": 0.6374336870026526, "eval_sum_Accuracy-right": 0.9956306247554454, "eval_sum_Accuracy-wrong": 0.01284967022970207, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6313597803483167, "step": 2500 }, { "epoch": 1.79, "grad_norm": 10.414355809239556, "learning_rate": 6.1049552944798355e-06, "loss": 0.4023, "step": 2501 }, { "epoch": 1.79, "grad_norm": 9.539305998342051, "learning_rate": 6.102136416948513e-06, "loss": 0.4678, "step": 2502 }, { "epoch": 1.79, "grad_norm": 8.768527333824066, "learning_rate": 6.099317171172929e-06, "loss": 0.4272, "step": 2503 }, { "epoch": 1.79, "grad_norm": 11.846773419479307, "learning_rate": 6.0964975580950445e-06, "loss": 0.4639, "step": 2504 }, { "epoch": 1.79, "grad_norm": 11.851344453902673, "learning_rate": 6.093677578656946e-06, "loss": 0.5596, "step": 2505 }, { "epoch": 1.79, "grad_norm": 17.964802832185107, "learning_rate": 6.090857233800839e-06, "loss": 0.4653, "step": 2506 }, { "epoch": 1.79, "grad_norm": 10.580109191519524, "learning_rate": 6.0880365244690546e-06, "loss": 0.4565, "step": 2507 }, { "epoch": 1.79, "grad_norm": 11.766966924313547, "learning_rate": 6.085215451604044e-06, "loss": 0.5005, "step": 2508 }, { "epoch": 1.79, "grad_norm": 8.585939267007543, "learning_rate": 6.082394016148379e-06, "loss": 0.4629, "step": 2509 }, { "epoch": 1.79, "grad_norm": 10.256079889521589, "learning_rate": 6.079572219044755e-06, "loss": 0.4443, "step": 2510 }, { "epoch": 1.79, "grad_norm": 8.962206455993298, "learning_rate": 6.076750061235985e-06, "loss": 0.4058, "step": 2511 }, { "epoch": 1.79, "grad_norm": 11.047853996411353, "learning_rate": 6.073927543665008e-06, "loss": 0.519, "step": 2512 }, { "epoch": 1.79, "grad_norm": 12.508769620004047, "learning_rate": 6.071104667274875e-06, "loss": 0.5142, "step": 2513 }, { "epoch": 1.79, "grad_norm": 18.47411144644649, "learning_rate": 6.068281433008765e-06, "loss": 0.5996, "step": 2514 }, { "epoch": 1.8, "grad_norm": 9.586741408151706, "learning_rate": 6.0654578418099715e-06, "loss": 0.5146, "step": 2515 }, { "epoch": 1.8, "grad_norm": 7.649138506555311, "learning_rate": 6.062633894621909e-06, "loss": 0.4038, "step": 2516 }, { "epoch": 1.8, "grad_norm": 17.188289282459092, "learning_rate": 6.0598095923881105e-06, "loss": 0.5435, "step": 2517 }, { "epoch": 1.8, "grad_norm": 11.818692335136738, "learning_rate": 6.056984936052229e-06, "loss": 0.4629, "step": 2518 }, { "epoch": 1.8, "grad_norm": 13.234001156083075, "learning_rate": 6.054159926558033e-06, "loss": 0.5342, "step": 2519 }, { "epoch": 1.8, "grad_norm": 10.075130769946417, "learning_rate": 6.051334564849413e-06, "loss": 0.4712, "step": 2520 }, { "epoch": 1.8, "grad_norm": 8.956699388387689, "learning_rate": 6.048508851870372e-06, "loss": 0.4111, "step": 2521 }, { "epoch": 1.8, "grad_norm": 12.649995024246259, "learning_rate": 6.045682788565036e-06, "loss": 0.3521, "step": 2522 }, { "epoch": 1.8, "grad_norm": 14.193089965908905, "learning_rate": 6.042856375877644e-06, "loss": 0.5518, "step": 2523 }, { "epoch": 1.8, "grad_norm": 9.405344544727377, "learning_rate": 6.040029614752551e-06, "loss": 0.4873, "step": 2524 }, { "epoch": 1.8, "grad_norm": 17.872507537818283, "learning_rate": 6.037202506134234e-06, "loss": 0.502, "step": 2525 }, { "epoch": 1.8, "grad_norm": 20.056217550868038, "learning_rate": 6.03437505096728e-06, "loss": 0.457, "step": 2526 }, { "epoch": 1.8, "grad_norm": 8.745479176249967, "learning_rate": 6.0315472501963955e-06, "loss": 0.4609, "step": 2527 }, { "epoch": 1.8, "grad_norm": 9.223610098312774, "learning_rate": 6.028719104766402e-06, "loss": 0.4082, "step": 2528 }, { "epoch": 1.81, "grad_norm": 8.50042655333063, "learning_rate": 6.025890615622233e-06, "loss": 0.5039, "step": 2529 }, { "epoch": 1.81, "grad_norm": 7.476170718540832, "learning_rate": 6.023061783708941e-06, "loss": 0.4048, "step": 2530 }, { "epoch": 1.81, "grad_norm": 11.520249627385631, "learning_rate": 6.020232609971694e-06, "loss": 0.439, "step": 2531 }, { "epoch": 1.81, "grad_norm": 18.589519698746084, "learning_rate": 6.017403095355766e-06, "loss": 0.5166, "step": 2532 }, { "epoch": 1.81, "grad_norm": 10.950886743696067, "learning_rate": 6.014573240806553e-06, "loss": 0.4604, "step": 2533 }, { "epoch": 1.81, "grad_norm": 9.551260453439275, "learning_rate": 6.011743047269563e-06, "loss": 0.4204, "step": 2534 }, { "epoch": 1.81, "grad_norm": 19.017128679094064, "learning_rate": 6.008912515690415e-06, "loss": 0.4873, "step": 2535 }, { "epoch": 1.81, "grad_norm": 8.121011247353195, "learning_rate": 6.006081647014842e-06, "loss": 0.4297, "step": 2536 }, { "epoch": 1.81, "grad_norm": 28.47281369820584, "learning_rate": 6.00325044218869e-06, "loss": 0.5586, "step": 2537 }, { "epoch": 1.81, "grad_norm": 9.714417153244161, "learning_rate": 6.000418902157919e-06, "loss": 0.5317, "step": 2538 }, { "epoch": 1.81, "grad_norm": 7.720034449703754, "learning_rate": 5.997587027868598e-06, "loss": 0.4829, "step": 2539 }, { "epoch": 1.81, "grad_norm": 6.914782644486152, "learning_rate": 5.994754820266908e-06, "loss": 0.3906, "step": 2540 }, { "epoch": 1.81, "grad_norm": 10.814365270699001, "learning_rate": 5.991922280299143e-06, "loss": 0.3979, "step": 2541 }, { "epoch": 1.81, "grad_norm": 16.20707607734891, "learning_rate": 5.989089408911706e-06, "loss": 0.4653, "step": 2542 }, { "epoch": 1.82, "grad_norm": 10.049185460575597, "learning_rate": 5.986256207051113e-06, "loss": 0.48, "step": 2543 }, { "epoch": 1.82, "grad_norm": 13.672521854401475, "learning_rate": 5.98342267566399e-06, "loss": 0.5356, "step": 2544 }, { "epoch": 1.82, "grad_norm": 12.541254903832398, "learning_rate": 5.9805888156970714e-06, "loss": 0.4609, "step": 2545 }, { "epoch": 1.82, "grad_norm": 10.284454916773962, "learning_rate": 5.977754628097203e-06, "loss": 0.4688, "step": 2546 }, { "epoch": 1.82, "grad_norm": 8.23610205729337, "learning_rate": 5.97492011381134e-06, "loss": 0.3687, "step": 2547 }, { "epoch": 1.82, "grad_norm": 7.11706014179341, "learning_rate": 5.972085273786547e-06, "loss": 0.4453, "step": 2548 }, { "epoch": 1.82, "grad_norm": 8.756513019937023, "learning_rate": 5.969250108969995e-06, "loss": 0.4448, "step": 2549 }, { "epoch": 1.82, "grad_norm": 7.098290335839529, "learning_rate": 5.966414620308965e-06, "loss": 0.4639, "step": 2550 }, { "epoch": 1.82, "grad_norm": 9.29559974589973, "learning_rate": 5.9635788087508474e-06, "loss": 0.438, "step": 2551 }, { "epoch": 1.82, "grad_norm": 8.829092992423524, "learning_rate": 5.960742675243139e-06, "loss": 0.3999, "step": 2552 }, { "epoch": 1.82, "grad_norm": 8.468196516461774, "learning_rate": 5.957906220733447e-06, "loss": 0.416, "step": 2553 }, { "epoch": 1.82, "grad_norm": 6.859111236122798, "learning_rate": 5.9550694461694806e-06, "loss": 0.4062, "step": 2554 }, { "epoch": 1.82, "grad_norm": 8.371396705917446, "learning_rate": 5.95223235249906e-06, "loss": 0.4697, "step": 2555 }, { "epoch": 1.82, "grad_norm": 9.74581839772436, "learning_rate": 5.949394940670112e-06, "loss": 0.4634, "step": 2556 }, { "epoch": 1.83, "grad_norm": 10.11191421574571, "learning_rate": 5.946557211630667e-06, "loss": 0.5122, "step": 2557 }, { "epoch": 1.83, "grad_norm": 8.41215503415198, "learning_rate": 5.943719166328864e-06, "loss": 0.4316, "step": 2558 }, { "epoch": 1.83, "grad_norm": 8.922238946122983, "learning_rate": 5.940880805712945e-06, "loss": 0.3711, "step": 2559 }, { "epoch": 1.83, "grad_norm": 11.556285820468634, "learning_rate": 5.938042130731262e-06, "loss": 0.4712, "step": 2560 }, { "epoch": 1.83, "grad_norm": 21.602935307175088, "learning_rate": 5.935203142332267e-06, "loss": 0.5796, "step": 2561 }, { "epoch": 1.83, "grad_norm": 10.172490980473096, "learning_rate": 5.932363841464519e-06, "loss": 0.3892, "step": 2562 }, { "epoch": 1.83, "grad_norm": 12.979141458728968, "learning_rate": 5.9295242290766805e-06, "loss": 0.4556, "step": 2563 }, { "epoch": 1.83, "grad_norm": 10.830863705403445, "learning_rate": 5.9266843061175216e-06, "loss": 0.4551, "step": 2564 }, { "epoch": 1.83, "grad_norm": 23.815492103215934, "learning_rate": 5.92384407353591e-06, "loss": 0.5635, "step": 2565 }, { "epoch": 1.83, "grad_norm": 13.015347774380794, "learning_rate": 5.921003532280822e-06, "loss": 0.4316, "step": 2566 }, { "epoch": 1.83, "grad_norm": 18.625830475732585, "learning_rate": 5.918162683301336e-06, "loss": 0.5039, "step": 2567 }, { "epoch": 1.83, "grad_norm": 11.919505793902855, "learning_rate": 5.91532152754663e-06, "loss": 0.5859, "step": 2568 }, { "epoch": 1.83, "grad_norm": 7.891844235140119, "learning_rate": 5.91248006596599e-06, "loss": 0.4126, "step": 2569 }, { "epoch": 1.83, "grad_norm": 15.943605729070335, "learning_rate": 5.909638299508798e-06, "loss": 0.4131, "step": 2570 }, { "epoch": 1.84, "grad_norm": 14.256434625761717, "learning_rate": 5.906796229124543e-06, "loss": 0.4639, "step": 2571 }, { "epoch": 1.84, "grad_norm": 20.457753839967175, "learning_rate": 5.903953855762812e-06, "loss": 0.4829, "step": 2572 }, { "epoch": 1.84, "grad_norm": 20.1694335296129, "learning_rate": 5.901111180373298e-06, "loss": 0.5698, "step": 2573 }, { "epoch": 1.84, "grad_norm": 10.654685001614778, "learning_rate": 5.898268203905788e-06, "loss": 0.4927, "step": 2574 }, { "epoch": 1.84, "grad_norm": 12.94449663637914, "learning_rate": 5.895424927310174e-06, "loss": 0.478, "step": 2575 }, { "epoch": 1.84, "grad_norm": 7.653556967960311, "learning_rate": 5.89258135153645e-06, "loss": 0.4351, "step": 2576 }, { "epoch": 1.84, "grad_norm": 8.724275472225052, "learning_rate": 5.889737477534704e-06, "loss": 0.3901, "step": 2577 }, { "epoch": 1.84, "grad_norm": 10.683542432644515, "learning_rate": 5.886893306255129e-06, "loss": 0.4609, "step": 2578 }, { "epoch": 1.84, "grad_norm": 8.145435964386623, "learning_rate": 5.884048838648017e-06, "loss": 0.5005, "step": 2579 }, { "epoch": 1.84, "grad_norm": 8.871255725801072, "learning_rate": 5.881204075663755e-06, "loss": 0.4761, "step": 2580 }, { "epoch": 1.84, "grad_norm": 24.214634872213015, "learning_rate": 5.878359018252831e-06, "loss": 0.6128, "step": 2581 }, { "epoch": 1.84, "grad_norm": 9.406317683172762, "learning_rate": 5.8755136673658365e-06, "loss": 0.4609, "step": 2582 }, { "epoch": 1.84, "grad_norm": 15.554115156101938, "learning_rate": 5.872668023953449e-06, "loss": 0.5054, "step": 2583 }, { "epoch": 1.84, "grad_norm": 11.952532962810603, "learning_rate": 5.869822088966455e-06, "loss": 0.4531, "step": 2584 }, { "epoch": 1.85, "grad_norm": 8.834444713741934, "learning_rate": 5.866975863355734e-06, "loss": 0.4854, "step": 2585 }, { "epoch": 1.85, "grad_norm": 8.54691819321447, "learning_rate": 5.864129348072261e-06, "loss": 0.5293, "step": 2586 }, { "epoch": 1.85, "grad_norm": 12.582713115393668, "learning_rate": 5.861282544067112e-06, "loss": 0.4829, "step": 2587 }, { "epoch": 1.85, "grad_norm": 14.239317398524113, "learning_rate": 5.8584354522914555e-06, "loss": 0.5029, "step": 2588 }, { "epoch": 1.85, "grad_norm": 7.723468978865414, "learning_rate": 5.855588073696559e-06, "loss": 0.4668, "step": 2589 }, { "epoch": 1.85, "grad_norm": 7.930051826297984, "learning_rate": 5.852740409233785e-06, "loss": 0.4092, "step": 2590 }, { "epoch": 1.85, "grad_norm": 9.019117275735518, "learning_rate": 5.849892459854588e-06, "loss": 0.3613, "step": 2591 }, { "epoch": 1.85, "grad_norm": 12.473807935463805, "learning_rate": 5.847044226510524e-06, "loss": 0.4814, "step": 2592 }, { "epoch": 1.85, "grad_norm": 10.961602975113415, "learning_rate": 5.84419571015324e-06, "loss": 0.4736, "step": 2593 }, { "epoch": 1.85, "grad_norm": 8.145353957021218, "learning_rate": 5.8413469117344766e-06, "loss": 0.4971, "step": 2594 }, { "epoch": 1.85, "grad_norm": 9.251775681976962, "learning_rate": 5.838497832206074e-06, "loss": 0.4351, "step": 2595 }, { "epoch": 1.85, "grad_norm": 10.772584386631225, "learning_rate": 5.835648472519958e-06, "loss": 0.4829, "step": 2596 }, { "epoch": 1.85, "grad_norm": 19.526504254371492, "learning_rate": 5.832798833628156e-06, "loss": 0.4814, "step": 2597 }, { "epoch": 1.85, "grad_norm": 7.52349073531977, "learning_rate": 5.829948916482784e-06, "loss": 0.4419, "step": 2598 }, { "epoch": 1.86, "grad_norm": 10.240904448794703, "learning_rate": 5.827098722036053e-06, "loss": 0.4404, "step": 2599 }, { "epoch": 1.86, "grad_norm": 11.162623628027008, "learning_rate": 5.824248251240265e-06, "loss": 0.437, "step": 2600 }, { "epoch": 1.86, "grad_norm": 13.063102295836982, "learning_rate": 5.8213975050478155e-06, "loss": 0.4668, "step": 2601 }, { "epoch": 1.86, "grad_norm": 9.197499697597129, "learning_rate": 5.818546484411191e-06, "loss": 0.4873, "step": 2602 }, { "epoch": 1.86, "grad_norm": 10.957695255120187, "learning_rate": 5.815695190282974e-06, "loss": 0.5273, "step": 2603 }, { "epoch": 1.86, "grad_norm": 11.149765327635293, "learning_rate": 5.81284362361583e-06, "loss": 0.5435, "step": 2604 }, { "epoch": 1.86, "grad_norm": 11.897344437580472, "learning_rate": 5.809991785362525e-06, "loss": 0.4624, "step": 2605 }, { "epoch": 1.86, "grad_norm": 14.41339345815525, "learning_rate": 5.8071396764759065e-06, "loss": 0.4155, "step": 2606 }, { "epoch": 1.86, "grad_norm": 12.588859240549306, "learning_rate": 5.804287297908923e-06, "loss": 0.4224, "step": 2607 }, { "epoch": 1.86, "grad_norm": 11.051689248011026, "learning_rate": 5.801434650614601e-06, "loss": 0.4731, "step": 2608 }, { "epoch": 1.86, "grad_norm": 12.164254844968275, "learning_rate": 5.798581735546066e-06, "loss": 0.4878, "step": 2609 }, { "epoch": 1.86, "grad_norm": 11.618691567900477, "learning_rate": 5.79572855365653e-06, "loss": 0.3784, "step": 2610 }, { "epoch": 1.86, "grad_norm": 12.41055691629168, "learning_rate": 5.792875105899294e-06, "loss": 0.5732, "step": 2611 }, { "epoch": 1.86, "grad_norm": 9.670181143911773, "learning_rate": 5.790021393227747e-06, "loss": 0.4133, "step": 2612 }, { "epoch": 1.87, "grad_norm": 11.575788308226384, "learning_rate": 5.787167416595369e-06, "loss": 0.4673, "step": 2613 }, { "epoch": 1.87, "grad_norm": 10.142364245518445, "learning_rate": 5.784313176955726e-06, "loss": 0.4351, "step": 2614 }, { "epoch": 1.87, "grad_norm": 9.236307058725137, "learning_rate": 5.781458675262472e-06, "loss": 0.3555, "step": 2615 }, { "epoch": 1.87, "grad_norm": 8.169804367113855, "learning_rate": 5.778603912469349e-06, "loss": 0.4067, "step": 2616 }, { "epoch": 1.87, "grad_norm": 13.194556650327089, "learning_rate": 5.775748889530187e-06, "loss": 0.5103, "step": 2617 }, { "epoch": 1.87, "grad_norm": 6.8031354287890995, "learning_rate": 5.772893607398901e-06, "loss": 0.3564, "step": 2618 }, { "epoch": 1.87, "grad_norm": 10.611694361969873, "learning_rate": 5.770038067029496e-06, "loss": 0.4175, "step": 2619 }, { "epoch": 1.87, "grad_norm": 11.88527859419395, "learning_rate": 5.76718226937606e-06, "loss": 0.3125, "step": 2620 }, { "epoch": 1.87, "grad_norm": 9.739686500701513, "learning_rate": 5.764326215392768e-06, "loss": 0.418, "step": 2621 }, { "epoch": 1.87, "grad_norm": 11.698752773439546, "learning_rate": 5.761469906033879e-06, "loss": 0.3662, "step": 2622 }, { "epoch": 1.87, "grad_norm": 10.707736166224288, "learning_rate": 5.758613342253743e-06, "loss": 0.374, "step": 2623 }, { "epoch": 1.87, "grad_norm": 20.383000470021422, "learning_rate": 5.7557565250067896e-06, "loss": 0.4565, "step": 2624 }, { "epoch": 1.87, "grad_norm": 10.55652728383383, "learning_rate": 5.752899455247532e-06, "loss": 0.3955, "step": 2625 }, { "epoch": 1.87, "grad_norm": 14.66660044282398, "learning_rate": 5.750042133930571e-06, "loss": 0.4761, "step": 2626 }, { "epoch": 1.88, "grad_norm": 13.605013174142563, "learning_rate": 5.7471845620105925e-06, "loss": 0.4524, "step": 2627 }, { "epoch": 1.88, "grad_norm": 13.512664823617058, "learning_rate": 5.744326740442364e-06, "loss": 0.4385, "step": 2628 }, { "epoch": 1.88, "grad_norm": 21.55939086501065, "learning_rate": 5.741468670180737e-06, "loss": 0.5186, "step": 2629 }, { "epoch": 1.88, "grad_norm": 20.560690542811283, "learning_rate": 5.738610352180645e-06, "loss": 0.5356, "step": 2630 }, { "epoch": 1.88, "grad_norm": 12.948898188809688, "learning_rate": 5.735751787397106e-06, "loss": 0.3574, "step": 2631 }, { "epoch": 1.88, "grad_norm": 24.160391432425694, "learning_rate": 5.732892976785218e-06, "loss": 0.4609, "step": 2632 }, { "epoch": 1.88, "grad_norm": 10.426468726837316, "learning_rate": 5.730033921300166e-06, "loss": 0.3936, "step": 2633 }, { "epoch": 1.88, "grad_norm": 9.517440332362233, "learning_rate": 5.7271746218972105e-06, "loss": 0.4478, "step": 2634 }, { "epoch": 1.88, "grad_norm": 7.545964981138855, "learning_rate": 5.724315079531697e-06, "loss": 0.4224, "step": 2635 }, { "epoch": 1.88, "grad_norm": 11.167790368619306, "learning_rate": 5.721455295159053e-06, "loss": 0.4131, "step": 2636 }, { "epoch": 1.88, "grad_norm": 15.35330099624201, "learning_rate": 5.7185952697347844e-06, "loss": 0.5435, "step": 2637 }, { "epoch": 1.88, "grad_norm": 17.478710612288964, "learning_rate": 5.71573500421448e-06, "loss": 0.4561, "step": 2638 }, { "epoch": 1.88, "grad_norm": 24.345386420729913, "learning_rate": 5.712874499553807e-06, "loss": 0.6011, "step": 2639 }, { "epoch": 1.88, "grad_norm": 14.103831958980157, "learning_rate": 5.710013756708513e-06, "loss": 0.5371, "step": 2640 }, { "epoch": 1.89, "grad_norm": 8.723792228428197, "learning_rate": 5.707152776634427e-06, "loss": 0.4746, "step": 2641 }, { "epoch": 1.89, "grad_norm": 11.135521772087765, "learning_rate": 5.704291560287454e-06, "loss": 0.5806, "step": 2642 }, { "epoch": 1.89, "grad_norm": 9.61923834918151, "learning_rate": 5.701430108623578e-06, "loss": 0.5034, "step": 2643 }, { "epoch": 1.89, "grad_norm": 11.595901720364187, "learning_rate": 5.698568422598867e-06, "loss": 0.4658, "step": 2644 }, { "epoch": 1.89, "grad_norm": 8.033670237094597, "learning_rate": 5.69570650316946e-06, "loss": 0.3877, "step": 2645 }, { "epoch": 1.89, "grad_norm": 10.311323300972322, "learning_rate": 5.69284435129158e-06, "loss": 0.5474, "step": 2646 }, { "epoch": 1.89, "grad_norm": 17.16920875171198, "learning_rate": 5.689981967921523e-06, "loss": 0.4785, "step": 2647 }, { "epoch": 1.89, "grad_norm": 11.505309016080973, "learning_rate": 5.6871193540156666e-06, "loss": 0.5347, "step": 2648 }, { "epoch": 1.89, "grad_norm": 9.375726119567252, "learning_rate": 5.684256510530461e-06, "loss": 0.5317, "step": 2649 }, { "epoch": 1.89, "grad_norm": 7.956873091796398, "learning_rate": 5.68139343842244e-06, "loss": 0.4897, "step": 2650 }, { "epoch": 1.89, "grad_norm": 6.598497045655397, "learning_rate": 5.678530138648204e-06, "loss": 0.3809, "step": 2651 }, { "epoch": 1.89, "grad_norm": 8.743508614000168, "learning_rate": 5.675666612164436e-06, "loss": 0.4536, "step": 2652 }, { "epoch": 1.89, "grad_norm": 16.871449050564767, "learning_rate": 5.672802859927895e-06, "loss": 0.4248, "step": 2653 }, { "epoch": 1.89, "grad_norm": 14.838442238490044, "learning_rate": 5.669938882895412e-06, "loss": 0.4878, "step": 2654 }, { "epoch": 1.9, "grad_norm": 12.731771693748918, "learning_rate": 5.667074682023896e-06, "loss": 0.4346, "step": 2655 }, { "epoch": 1.9, "grad_norm": 11.31273234375937, "learning_rate": 5.664210258270331e-06, "loss": 0.5474, "step": 2656 }, { "epoch": 1.9, "grad_norm": 6.388124322189932, "learning_rate": 5.661345612591771e-06, "loss": 0.3623, "step": 2657 }, { "epoch": 1.9, "grad_norm": 11.206485885563511, "learning_rate": 5.6584807459453515e-06, "loss": 0.4312, "step": 2658 }, { "epoch": 1.9, "grad_norm": 8.641694644438422, "learning_rate": 5.655615659288274e-06, "loss": 0.4653, "step": 2659 }, { "epoch": 1.9, "grad_norm": 9.129600359571274, "learning_rate": 5.652750353577818e-06, "loss": 0.4902, "step": 2660 }, { "epoch": 1.9, "grad_norm": 25.692674590295074, "learning_rate": 5.649884829771337e-06, "loss": 0.5063, "step": 2661 }, { "epoch": 1.9, "grad_norm": 13.54868839498731, "learning_rate": 5.6470190888262545e-06, "loss": 0.457, "step": 2662 }, { "epoch": 1.9, "grad_norm": 14.289852844856199, "learning_rate": 5.644153131700067e-06, "loss": 0.4634, "step": 2663 }, { "epoch": 1.9, "grad_norm": 15.249877303141151, "learning_rate": 5.6412869593503476e-06, "loss": 0.4956, "step": 2664 }, { "epoch": 1.9, "grad_norm": 6.960042402355725, "learning_rate": 5.638420572734733e-06, "loss": 0.457, "step": 2665 }, { "epoch": 1.9, "grad_norm": 8.138114527336878, "learning_rate": 5.63555397281094e-06, "loss": 0.4009, "step": 2666 }, { "epoch": 1.9, "grad_norm": 8.009079366365885, "learning_rate": 5.632687160536751e-06, "loss": 0.4043, "step": 2667 }, { "epoch": 1.9, "grad_norm": 11.217920799149297, "learning_rate": 5.629820136870022e-06, "loss": 0.4946, "step": 2668 }, { "epoch": 1.91, "grad_norm": 18.941849965542726, "learning_rate": 5.626952902768678e-06, "loss": 0.5039, "step": 2669 }, { "epoch": 1.91, "grad_norm": 6.464934606221962, "learning_rate": 5.624085459190717e-06, "loss": 0.3403, "step": 2670 }, { "epoch": 1.91, "grad_norm": 6.790559734697411, "learning_rate": 5.621217807094202e-06, "loss": 0.353, "step": 2671 }, { "epoch": 1.91, "grad_norm": 15.647088939509905, "learning_rate": 5.618349947437272e-06, "loss": 0.4565, "step": 2672 }, { "epoch": 1.91, "grad_norm": 11.789802694379299, "learning_rate": 5.615481881178132e-06, "loss": 0.4419, "step": 2673 }, { "epoch": 1.91, "grad_norm": 8.442548561932774, "learning_rate": 5.612613609275054e-06, "loss": 0.4175, "step": 2674 }, { "epoch": 1.91, "grad_norm": 15.641515924971305, "learning_rate": 5.609745132686383e-06, "loss": 0.5254, "step": 2675 }, { "epoch": 1.91, "grad_norm": 15.432218412288657, "learning_rate": 5.60687645237053e-06, "loss": 0.4419, "step": 2676 }, { "epoch": 1.91, "grad_norm": 12.15077877053977, "learning_rate": 5.604007569285973e-06, "loss": 0.5625, "step": 2677 }, { "epoch": 1.91, "grad_norm": 11.468525979946765, "learning_rate": 5.6011384843912605e-06, "loss": 0.4912, "step": 2678 }, { "epoch": 1.91, "grad_norm": 22.363832033779826, "learning_rate": 5.598269198645008e-06, "loss": 0.4634, "step": 2679 }, { "epoch": 1.91, "grad_norm": 11.383705824219753, "learning_rate": 5.5953997130058945e-06, "loss": 0.5581, "step": 2680 }, { "epoch": 1.91, "grad_norm": 11.042518750173171, "learning_rate": 5.5925300284326715e-06, "loss": 0.5088, "step": 2681 }, { "epoch": 1.91, "grad_norm": 8.76598841043737, "learning_rate": 5.5896601458841505e-06, "loss": 0.4141, "step": 2682 }, { "epoch": 1.92, "grad_norm": 13.862820427563568, "learning_rate": 5.586790066319217e-06, "loss": 0.4126, "step": 2683 }, { "epoch": 1.92, "grad_norm": 12.59617718975847, "learning_rate": 5.583919790696814e-06, "loss": 0.4648, "step": 2684 }, { "epoch": 1.92, "grad_norm": 9.54969966751114, "learning_rate": 5.581049319975957e-06, "loss": 0.437, "step": 2685 }, { "epoch": 1.92, "grad_norm": 20.884177236890935, "learning_rate": 5.57817865511572e-06, "loss": 0.5962, "step": 2686 }, { "epoch": 1.92, "grad_norm": 9.066486791899976, "learning_rate": 5.575307797075249e-06, "loss": 0.3447, "step": 2687 }, { "epoch": 1.92, "grad_norm": 10.359577468848478, "learning_rate": 5.572436746813748e-06, "loss": 0.48, "step": 2688 }, { "epoch": 1.92, "grad_norm": 8.3029594415371, "learning_rate": 5.5695655052904905e-06, "loss": 0.4507, "step": 2689 }, { "epoch": 1.92, "grad_norm": 15.98486194284849, "learning_rate": 5.566694073464812e-06, "loss": 0.4419, "step": 2690 }, { "epoch": 1.92, "grad_norm": 12.087720324088506, "learning_rate": 5.56382245229611e-06, "loss": 0.4771, "step": 2691 }, { "epoch": 1.92, "grad_norm": 9.174292843227011, "learning_rate": 5.560950642743847e-06, "loss": 0.4883, "step": 2692 }, { "epoch": 1.92, "grad_norm": 16.926555916096607, "learning_rate": 5.558078645767547e-06, "loss": 0.4019, "step": 2693 }, { "epoch": 1.92, "grad_norm": 13.536627620327389, "learning_rate": 5.5552064623267986e-06, "loss": 0.5322, "step": 2694 }, { "epoch": 1.92, "grad_norm": 8.76012508956878, "learning_rate": 5.5523340933812505e-06, "loss": 0.4253, "step": 2695 }, { "epoch": 1.92, "grad_norm": 8.90679754193909, "learning_rate": 5.549461539890616e-06, "loss": 0.4507, "step": 2696 }, { "epoch": 1.93, "grad_norm": 8.420670976992822, "learning_rate": 5.546588802814669e-06, "loss": 0.4375, "step": 2697 }, { "epoch": 1.93, "grad_norm": 8.727787659346541, "learning_rate": 5.543715883113241e-06, "loss": 0.3931, "step": 2698 }, { "epoch": 1.93, "grad_norm": 11.086024809133871, "learning_rate": 5.540842781746231e-06, "loss": 0.3979, "step": 2699 }, { "epoch": 1.93, "grad_norm": 13.011343075606366, "learning_rate": 5.537969499673598e-06, "loss": 0.522, "step": 2700 }, { "epoch": 1.93, "grad_norm": 9.15708782937107, "learning_rate": 5.535096037855353e-06, "loss": 0.459, "step": 2701 }, { "epoch": 1.93, "grad_norm": 11.044008252312308, "learning_rate": 5.532222397251576e-06, "loss": 0.4487, "step": 2702 }, { "epoch": 1.93, "grad_norm": 9.232084819714416, "learning_rate": 5.529348578822403e-06, "loss": 0.5, "step": 2703 }, { "epoch": 1.93, "grad_norm": 12.659043205460982, "learning_rate": 5.526474583528032e-06, "loss": 0.5312, "step": 2704 }, { "epoch": 1.93, "grad_norm": 21.00752567623559, "learning_rate": 5.523600412328716e-06, "loss": 0.5352, "step": 2705 }, { "epoch": 1.93, "grad_norm": 11.033115280436297, "learning_rate": 5.520726066184769e-06, "loss": 0.5396, "step": 2706 }, { "epoch": 1.93, "grad_norm": 8.80140343593761, "learning_rate": 5.517851546056566e-06, "loss": 0.3618, "step": 2707 }, { "epoch": 1.93, "grad_norm": 8.135541690096266, "learning_rate": 5.5149768529045355e-06, "loss": 0.3765, "step": 2708 }, { "epoch": 1.93, "grad_norm": 9.033113241944184, "learning_rate": 5.512101987689168e-06, "loss": 0.3994, "step": 2709 }, { "epoch": 1.93, "grad_norm": 8.091847973750784, "learning_rate": 5.509226951371006e-06, "loss": 0.4431, "step": 2710 }, { "epoch": 1.94, "grad_norm": 10.550320506061626, "learning_rate": 5.506351744910654e-06, "loss": 0.4248, "step": 2711 }, { "epoch": 1.94, "grad_norm": 7.907476430192875, "learning_rate": 5.503476369268773e-06, "loss": 0.4434, "step": 2712 }, { "epoch": 1.94, "grad_norm": 11.662632331178548, "learning_rate": 5.50060082540608e-06, "loss": 0.3901, "step": 2713 }, { "epoch": 1.94, "grad_norm": 11.946005732308972, "learning_rate": 5.4977251142833445e-06, "loss": 0.5063, "step": 2714 }, { "epoch": 1.94, "grad_norm": 7.764860363419759, "learning_rate": 5.494849236861397e-06, "loss": 0.3701, "step": 2715 }, { "epoch": 1.94, "grad_norm": 10.57151587816436, "learning_rate": 5.491973194101122e-06, "loss": 0.4678, "step": 2716 }, { "epoch": 1.94, "grad_norm": 10.496999220354681, "learning_rate": 5.4890969869634606e-06, "loss": 0.4072, "step": 2717 }, { "epoch": 1.94, "grad_norm": 21.955739348741126, "learning_rate": 5.486220616409403e-06, "loss": 0.417, "step": 2718 }, { "epoch": 1.94, "grad_norm": 9.480253591668859, "learning_rate": 5.4833440834e-06, "loss": 0.4419, "step": 2719 }, { "epoch": 1.94, "grad_norm": 13.417238840494207, "learning_rate": 5.480467388896353e-06, "loss": 0.4951, "step": 2720 }, { "epoch": 1.94, "grad_norm": 9.778607214982058, "learning_rate": 5.477590533859623e-06, "loss": 0.4058, "step": 2721 }, { "epoch": 1.94, "grad_norm": 15.095377405354045, "learning_rate": 5.474713519251018e-06, "loss": 0.501, "step": 2722 }, { "epoch": 1.94, "grad_norm": 9.109288662149696, "learning_rate": 5.471836346031802e-06, "loss": 0.4067, "step": 2723 }, { "epoch": 1.94, "grad_norm": 10.606747867678797, "learning_rate": 5.468959015163293e-06, "loss": 0.4321, "step": 2724 }, { "epoch": 1.95, "grad_norm": 10.521320145257938, "learning_rate": 5.46608152760686e-06, "loss": 0.3298, "step": 2725 }, { "epoch": 1.95, "grad_norm": 13.610539194603652, "learning_rate": 5.463203884323926e-06, "loss": 0.541, "step": 2726 }, { "epoch": 1.95, "grad_norm": 13.273515699596409, "learning_rate": 5.460326086275964e-06, "loss": 0.5078, "step": 2727 }, { "epoch": 1.95, "grad_norm": 9.111661385131107, "learning_rate": 5.4574481344245015e-06, "loss": 0.4756, "step": 2728 }, { "epoch": 1.95, "grad_norm": 8.977805736536428, "learning_rate": 5.454570029731115e-06, "loss": 0.4663, "step": 2729 }, { "epoch": 1.95, "grad_norm": 13.975752561343054, "learning_rate": 5.451691773157431e-06, "loss": 0.4971, "step": 2730 }, { "epoch": 1.95, "grad_norm": 17.32339975667838, "learning_rate": 5.448813365665129e-06, "loss": 0.5049, "step": 2731 }, { "epoch": 1.95, "grad_norm": 11.315123889810822, "learning_rate": 5.44593480821594e-06, "loss": 0.5132, "step": 2732 }, { "epoch": 1.95, "grad_norm": 9.29119410355911, "learning_rate": 5.443056101771643e-06, "loss": 0.4316, "step": 2733 }, { "epoch": 1.95, "grad_norm": 11.289099728879002, "learning_rate": 5.44017724729407e-06, "loss": 0.3882, "step": 2734 }, { "epoch": 1.95, "grad_norm": 8.888644836261584, "learning_rate": 5.437298245745093e-06, "loss": 0.4331, "step": 2735 }, { "epoch": 1.95, "grad_norm": 48.21181573416906, "learning_rate": 5.434419098086645e-06, "loss": 0.5977, "step": 2736 }, { "epoch": 1.95, "grad_norm": 9.641134447756468, "learning_rate": 5.431539805280702e-06, "loss": 0.3945, "step": 2737 }, { "epoch": 1.95, "grad_norm": 10.965847623472408, "learning_rate": 5.428660368289289e-06, "loss": 0.4741, "step": 2738 }, { "epoch": 1.96, "grad_norm": 11.10664209523203, "learning_rate": 5.42578078807448e-06, "loss": 0.4512, "step": 2739 }, { "epoch": 1.96, "grad_norm": 10.681304900537418, "learning_rate": 5.422901065598395e-06, "loss": 0.4297, "step": 2740 }, { "epoch": 1.96, "grad_norm": 8.739932626389907, "learning_rate": 5.4200212018232024e-06, "loss": 0.3513, "step": 2741 }, { "epoch": 1.96, "grad_norm": 8.846041535480271, "learning_rate": 5.41714119771112e-06, "loss": 0.4302, "step": 2742 }, { "epoch": 1.96, "grad_norm": 11.984598238079338, "learning_rate": 5.414261054224412e-06, "loss": 0.4033, "step": 2743 }, { "epoch": 1.96, "grad_norm": 18.933328673227695, "learning_rate": 5.411380772325383e-06, "loss": 0.5703, "step": 2744 }, { "epoch": 1.96, "grad_norm": 18.854711059795555, "learning_rate": 5.408500352976392e-06, "loss": 0.5151, "step": 2745 }, { "epoch": 1.96, "grad_norm": 10.111285712834931, "learning_rate": 5.40561979713984e-06, "loss": 0.4084, "step": 2746 }, { "epoch": 1.96, "grad_norm": 11.242923858855155, "learning_rate": 5.402739105778175e-06, "loss": 0.4956, "step": 2747 }, { "epoch": 1.96, "grad_norm": 9.954715434651323, "learning_rate": 5.399858279853889e-06, "loss": 0.4229, "step": 2748 }, { "epoch": 1.96, "grad_norm": 7.3157730974535315, "learning_rate": 5.39697732032952e-06, "loss": 0.3398, "step": 2749 }, { "epoch": 1.96, "grad_norm": 10.265906166274753, "learning_rate": 5.394096228167648e-06, "loss": 0.4565, "step": 2750 }, { "epoch": 1.96, "grad_norm": 12.198284629775742, "learning_rate": 5.391215004330903e-06, "loss": 0.3813, "step": 2751 }, { "epoch": 1.96, "grad_norm": 12.31796639425881, "learning_rate": 5.388333649781951e-06, "loss": 0.4683, "step": 2752 }, { "epoch": 1.97, "grad_norm": 10.01789404392901, "learning_rate": 5.3854521654835105e-06, "loss": 0.4502, "step": 2753 }, { "epoch": 1.97, "grad_norm": 11.306465904258715, "learning_rate": 5.3825705523983366e-06, "loss": 0.4351, "step": 2754 }, { "epoch": 1.97, "grad_norm": 11.519624094511274, "learning_rate": 5.37968881148923e-06, "loss": 0.4707, "step": 2755 }, { "epoch": 1.97, "grad_norm": 13.226114092265847, "learning_rate": 5.376806943719033e-06, "loss": 0.4814, "step": 2756 }, { "epoch": 1.97, "grad_norm": 9.840072523952024, "learning_rate": 5.373924950050633e-06, "loss": 0.4194, "step": 2757 }, { "epoch": 1.97, "grad_norm": 19.039602192178293, "learning_rate": 5.371042831446957e-06, "loss": 0.5571, "step": 2758 }, { "epoch": 1.97, "grad_norm": 10.416861116474312, "learning_rate": 5.3681605888709755e-06, "loss": 0.4858, "step": 2759 }, { "epoch": 1.97, "grad_norm": 15.51902810805772, "learning_rate": 5.365278223285698e-06, "loss": 0.7158, "step": 2760 }, { "epoch": 1.97, "grad_norm": 8.508514981825446, "learning_rate": 5.362395735654175e-06, "loss": 0.5264, "step": 2761 }, { "epoch": 1.97, "grad_norm": 11.292618872818796, "learning_rate": 5.3595131269395015e-06, "loss": 0.4668, "step": 2762 }, { "epoch": 1.97, "grad_norm": 8.600159324345311, "learning_rate": 5.356630398104811e-06, "loss": 0.4302, "step": 2763 }, { "epoch": 1.97, "grad_norm": 8.912551709797079, "learning_rate": 5.353747550113274e-06, "loss": 0.3501, "step": 2764 }, { "epoch": 1.97, "grad_norm": 10.525523170385298, "learning_rate": 5.350864583928106e-06, "loss": 0.4648, "step": 2765 }, { "epoch": 1.97, "grad_norm": 8.191292892938462, "learning_rate": 5.347981500512558e-06, "loss": 0.3613, "step": 2766 }, { "epoch": 1.98, "grad_norm": 19.57847648591052, "learning_rate": 5.345098300829924e-06, "loss": 0.5903, "step": 2767 }, { "epoch": 1.98, "grad_norm": 13.335634989474457, "learning_rate": 5.342214985843534e-06, "loss": 0.4829, "step": 2768 }, { "epoch": 1.98, "grad_norm": 21.001861539425636, "learning_rate": 5.339331556516755e-06, "loss": 0.4902, "step": 2769 }, { "epoch": 1.98, "grad_norm": 10.013265462647908, "learning_rate": 5.336448013812996e-06, "loss": 0.4912, "step": 2770 }, { "epoch": 1.98, "grad_norm": 6.809090114106525, "learning_rate": 5.333564358695701e-06, "loss": 0.4336, "step": 2771 }, { "epoch": 1.98, "grad_norm": 16.281886038205364, "learning_rate": 5.330680592128355e-06, "loss": 0.4937, "step": 2772 }, { "epoch": 1.98, "grad_norm": 8.150545229018581, "learning_rate": 5.3277967150744755e-06, "loss": 0.4189, "step": 2773 }, { "epoch": 1.98, "grad_norm": 7.345814588398108, "learning_rate": 5.324912728497621e-06, "loss": 0.4009, "step": 2774 }, { "epoch": 1.98, "grad_norm": 20.091474666204597, "learning_rate": 5.322028633361386e-06, "loss": 0.562, "step": 2775 }, { "epoch": 1.98, "grad_norm": 11.754059247137123, "learning_rate": 5.319144430629397e-06, "loss": 0.4702, "step": 2776 }, { "epoch": 1.98, "grad_norm": 7.586444942071105, "learning_rate": 5.316260121265323e-06, "loss": 0.4595, "step": 2777 }, { "epoch": 1.98, "grad_norm": 7.6680666422755275, "learning_rate": 5.313375706232864e-06, "loss": 0.4072, "step": 2778 }, { "epoch": 1.98, "grad_norm": 12.091604704175747, "learning_rate": 5.310491186495757e-06, "loss": 0.4297, "step": 2779 }, { "epoch": 1.98, "grad_norm": 6.434600968392432, "learning_rate": 5.307606563017772e-06, "loss": 0.3491, "step": 2780 }, { "epoch": 1.99, "grad_norm": 7.651832210814672, "learning_rate": 5.304721836762717e-06, "loss": 0.4087, "step": 2781 }, { "epoch": 1.99, "grad_norm": 7.123238252935426, "learning_rate": 5.301837008694433e-06, "loss": 0.418, "step": 2782 }, { "epoch": 1.99, "grad_norm": 10.623468647704522, "learning_rate": 5.298952079776794e-06, "loss": 0.4336, "step": 2783 }, { "epoch": 1.99, "grad_norm": 9.961097932950663, "learning_rate": 5.296067050973709e-06, "loss": 0.4009, "step": 2784 }, { "epoch": 1.99, "grad_norm": 10.004467440225236, "learning_rate": 5.29318192324912e-06, "loss": 0.4385, "step": 2785 }, { "epoch": 1.99, "grad_norm": 12.937849862725015, "learning_rate": 5.290296697566999e-06, "loss": 0.4731, "step": 2786 }, { "epoch": 1.99, "grad_norm": 10.851718115774863, "learning_rate": 5.287411374891356e-06, "loss": 0.4741, "step": 2787 }, { "epoch": 1.99, "grad_norm": 10.247981728977928, "learning_rate": 5.284525956186231e-06, "loss": 0.4355, "step": 2788 }, { "epoch": 1.99, "grad_norm": 10.992715571855904, "learning_rate": 5.281640442415695e-06, "loss": 0.5229, "step": 2789 }, { "epoch": 1.99, "grad_norm": 12.344871517460058, "learning_rate": 5.278754834543852e-06, "loss": 0.4722, "step": 2790 }, { "epoch": 1.99, "grad_norm": 13.839020938381688, "learning_rate": 5.275869133534838e-06, "loss": 0.4785, "step": 2791 }, { "epoch": 1.99, "grad_norm": 18.33203373375532, "learning_rate": 5.272983340352818e-06, "loss": 0.5005, "step": 2792 }, { "epoch": 1.99, "grad_norm": 9.957846186020241, "learning_rate": 5.270097455961991e-06, "loss": 0.4048, "step": 2793 }, { "epoch": 1.99, "grad_norm": 8.793496740544484, "learning_rate": 5.267211481326584e-06, "loss": 0.3716, "step": 2794 }, { "epoch": 2.0, "grad_norm": 10.547713987824872, "learning_rate": 5.264325417410854e-06, "loss": 0.3901, "step": 2795 }, { "epoch": 2.0, "grad_norm": 10.683395761109471, "learning_rate": 5.261439265179089e-06, "loss": 0.4375, "step": 2796 }, { "epoch": 2.0, "grad_norm": 15.88713456122952, "learning_rate": 5.258553025595605e-06, "loss": 0.4854, "step": 2797 }, { "epoch": 2.0, "grad_norm": 13.15203401175118, "learning_rate": 5.255666699624749e-06, "loss": 0.4604, "step": 2798 }, { "epoch": 2.0, "grad_norm": 8.439586255360073, "learning_rate": 5.252780288230899e-06, "loss": 0.4316, "step": 2799 }, { "epoch": 2.0, "grad_norm": 11.690633931887922, "learning_rate": 5.249893792378454e-06, "loss": 0.4546, "step": 2800 }, { "epoch": 2.0, "grad_norm": 7.5972252881605, "learning_rate": 5.24700721303185e-06, "loss": 0.3784, "step": 2801 }, { "epoch": 2.0, "grad_norm": 8.09788081335953, "learning_rate": 5.244120551155544e-06, "loss": 0.4463, "step": 2802 }, { "epoch": 2.0, "grad_norm": 6.183682722164911, "learning_rate": 5.241233807714024e-06, "loss": 0.2798, "step": 2803 }, { "epoch": 2.0, "grad_norm": 7.492711477273929, "learning_rate": 5.238346983671805e-06, "loss": 0.321, "step": 2804 }, { "epoch": 2.0, "grad_norm": 8.78545087675637, "learning_rate": 5.235460079993429e-06, "loss": 0.3223, "step": 2805 }, { "epoch": 2.0, "grad_norm": 6.184465239974276, "learning_rate": 5.232573097643462e-06, "loss": 0.2476, "step": 2806 }, { "epoch": 2.0, "grad_norm": 7.38970510024269, "learning_rate": 5.229686037586502e-06, "loss": 0.3081, "step": 2807 }, { "epoch": 2.0, "grad_norm": 6.337615749238382, "learning_rate": 5.226798900787167e-06, "loss": 0.3176, "step": 2808 }, { "epoch": 2.0, "grad_norm": 8.065520523779682, "learning_rate": 5.223911688210104e-06, "loss": 0.2876, "step": 2809 }, { "epoch": 2.01, "grad_norm": 13.050626203183986, "learning_rate": 5.221024400819983e-06, "loss": 0.3157, "step": 2810 }, { "epoch": 2.01, "grad_norm": 10.158021790081161, "learning_rate": 5.218137039581504e-06, "loss": 0.2568, "step": 2811 }, { "epoch": 2.01, "grad_norm": 11.581251837695142, "learning_rate": 5.215249605459382e-06, "loss": 0.3867, "step": 2812 }, { "epoch": 2.01, "grad_norm": 8.194547291558267, "learning_rate": 5.212362099418369e-06, "loss": 0.26, "step": 2813 }, { "epoch": 2.01, "grad_norm": 10.028079795632454, "learning_rate": 5.2094745224232306e-06, "loss": 0.2729, "step": 2814 }, { "epoch": 2.01, "grad_norm": 16.016565575074754, "learning_rate": 5.206586875438759e-06, "loss": 0.4033, "step": 2815 }, { "epoch": 2.01, "grad_norm": 9.122323867559514, "learning_rate": 5.203699159429773e-06, "loss": 0.2603, "step": 2816 }, { "epoch": 2.01, "grad_norm": 11.258210540115515, "learning_rate": 5.200811375361112e-06, "loss": 0.3042, "step": 2817 }, { "epoch": 2.01, "grad_norm": 7.530978379942863, "learning_rate": 5.197923524197639e-06, "loss": 0.2639, "step": 2818 }, { "epoch": 2.01, "grad_norm": 11.244056210394618, "learning_rate": 5.195035606904237e-06, "loss": 0.3047, "step": 2819 }, { "epoch": 2.01, "grad_norm": 12.976152241105012, "learning_rate": 5.1921476244458135e-06, "loss": 0.2971, "step": 2820 }, { "epoch": 2.01, "grad_norm": 12.62736935340582, "learning_rate": 5.189259577787297e-06, "loss": 0.2849, "step": 2821 }, { "epoch": 2.01, "grad_norm": 8.841839115792192, "learning_rate": 5.186371467893638e-06, "loss": 0.2444, "step": 2822 }, { "epoch": 2.01, "grad_norm": 11.733355341593247, "learning_rate": 5.1834832957298075e-06, "loss": 0.2522, "step": 2823 }, { "epoch": 2.02, "grad_norm": 10.20366363366365, "learning_rate": 5.180595062260797e-06, "loss": 0.1924, "step": 2824 }, { "epoch": 2.02, "grad_norm": 12.665148545436182, "learning_rate": 5.177706768451619e-06, "loss": 0.2285, "step": 2825 }, { "epoch": 2.02, "grad_norm": 8.249918360609486, "learning_rate": 5.174818415267308e-06, "loss": 0.1785, "step": 2826 }, { "epoch": 2.02, "grad_norm": 12.068358421491178, "learning_rate": 5.1719300036729135e-06, "loss": 0.2644, "step": 2827 }, { "epoch": 2.02, "grad_norm": 11.250338443568086, "learning_rate": 5.169041534633511e-06, "loss": 0.2855, "step": 2828 }, { "epoch": 2.02, "grad_norm": 11.981055766027776, "learning_rate": 5.166153009114188e-06, "loss": 0.2559, "step": 2829 }, { "epoch": 2.02, "grad_norm": 16.58373133194685, "learning_rate": 5.163264428080057e-06, "loss": 0.229, "step": 2830 }, { "epoch": 2.02, "grad_norm": 11.426613456253506, "learning_rate": 5.160375792496246e-06, "loss": 0.2656, "step": 2831 }, { "epoch": 2.02, "grad_norm": 28.624808178448106, "learning_rate": 5.157487103327901e-06, "loss": 0.4053, "step": 2832 }, { "epoch": 2.02, "grad_norm": 9.440233476330917, "learning_rate": 5.1545983615401885e-06, "loss": 0.2058, "step": 2833 }, { "epoch": 2.02, "grad_norm": 14.486881898609434, "learning_rate": 5.151709568098289e-06, "loss": 0.3364, "step": 2834 }, { "epoch": 2.02, "grad_norm": 8.482819056787882, "learning_rate": 5.1488207239674036e-06, "loss": 0.2424, "step": 2835 }, { "epoch": 2.02, "grad_norm": 9.817322942492323, "learning_rate": 5.145931830112748e-06, "loss": 0.2766, "step": 2836 }, { "epoch": 2.02, "grad_norm": 9.581341500866515, "learning_rate": 5.1430428874995554e-06, "loss": 0.2036, "step": 2837 }, { "epoch": 2.03, "grad_norm": 9.851916462924994, "learning_rate": 5.140153897093076e-06, "loss": 0.2317, "step": 2838 }, { "epoch": 2.03, "grad_norm": 13.85859461783392, "learning_rate": 5.1372648598585725e-06, "loss": 0.3672, "step": 2839 }, { "epoch": 2.03, "grad_norm": 7.51481537393519, "learning_rate": 5.134375776761329e-06, "loss": 0.2026, "step": 2840 }, { "epoch": 2.03, "grad_norm": 12.011970306520274, "learning_rate": 5.131486648766642e-06, "loss": 0.2827, "step": 2841 }, { "epoch": 2.03, "grad_norm": 8.623869309435559, "learning_rate": 5.1285974768398205e-06, "loss": 0.2432, "step": 2842 }, { "epoch": 2.03, "grad_norm": 13.971151752756175, "learning_rate": 5.125708261946192e-06, "loss": 0.2737, "step": 2843 }, { "epoch": 2.03, "grad_norm": 12.16313895403444, "learning_rate": 5.122819005051096e-06, "loss": 0.2595, "step": 2844 }, { "epoch": 2.03, "grad_norm": 10.834344804916485, "learning_rate": 5.119929707119889e-06, "loss": 0.3093, "step": 2845 }, { "epoch": 2.03, "grad_norm": 10.59230099518939, "learning_rate": 5.117040369117937e-06, "loss": 0.269, "step": 2846 }, { "epoch": 2.03, "grad_norm": 9.261755241964481, "learning_rate": 5.114150992010621e-06, "loss": 0.2363, "step": 2847 }, { "epoch": 2.03, "grad_norm": 17.845223713454207, "learning_rate": 5.1112615767633385e-06, "loss": 0.3608, "step": 2848 }, { "epoch": 2.03, "grad_norm": 11.952115168213247, "learning_rate": 5.108372124341494e-06, "loss": 0.2449, "step": 2849 }, { "epoch": 2.03, "grad_norm": 10.962593059947745, "learning_rate": 5.105482635710509e-06, "loss": 0.281, "step": 2850 }, { "epoch": 2.03, "grad_norm": 11.47356055701605, "learning_rate": 5.102593111835815e-06, "loss": 0.303, "step": 2851 }, { "epoch": 2.04, "grad_norm": 13.090099328504023, "learning_rate": 5.099703553682854e-06, "loss": 0.2979, "step": 2852 }, { "epoch": 2.04, "grad_norm": 11.384435155829522, "learning_rate": 5.096813962217086e-06, "loss": 0.2117, "step": 2853 }, { "epoch": 2.04, "grad_norm": 12.707780782866086, "learning_rate": 5.093924338403971e-06, "loss": 0.3604, "step": 2854 }, { "epoch": 2.04, "grad_norm": 11.41635335383991, "learning_rate": 5.091034683208988e-06, "loss": 0.3076, "step": 2855 }, { "epoch": 2.04, "grad_norm": 10.542784265367086, "learning_rate": 5.088144997597627e-06, "loss": 0.2373, "step": 2856 }, { "epoch": 2.04, "grad_norm": 16.300403445154583, "learning_rate": 5.085255282535383e-06, "loss": 0.3362, "step": 2857 }, { "epoch": 2.04, "grad_norm": 18.820500247221656, "learning_rate": 5.082365538987765e-06, "loss": 0.3486, "step": 2858 }, { "epoch": 2.04, "grad_norm": 10.963174706002892, "learning_rate": 5.079475767920289e-06, "loss": 0.2498, "step": 2859 }, { "epoch": 2.04, "grad_norm": 15.924691223131479, "learning_rate": 5.076585970298481e-06, "loss": 0.3442, "step": 2860 }, { "epoch": 2.04, "grad_norm": 10.618086517328626, "learning_rate": 5.073696147087878e-06, "loss": 0.2406, "step": 2861 }, { "epoch": 2.04, "grad_norm": 11.971342596280728, "learning_rate": 5.070806299254023e-06, "loss": 0.207, "step": 2862 }, { "epoch": 2.04, "grad_norm": 13.74822806936663, "learning_rate": 5.067916427762466e-06, "loss": 0.2971, "step": 2863 }, { "epoch": 2.04, "grad_norm": 14.841852154556609, "learning_rate": 5.0650265335787685e-06, "loss": 0.3123, "step": 2864 }, { "epoch": 2.04, "grad_norm": 12.028441837620969, "learning_rate": 5.062136617668497e-06, "loss": 0.2405, "step": 2865 }, { "epoch": 2.05, "grad_norm": 10.77779210531338, "learning_rate": 5.059246680997228e-06, "loss": 0.2537, "step": 2866 }, { "epoch": 2.05, "grad_norm": 16.536986749042004, "learning_rate": 5.05635672453054e-06, "loss": 0.3887, "step": 2867 }, { "epoch": 2.05, "grad_norm": 12.088559869194436, "learning_rate": 5.053466749234023e-06, "loss": 0.2421, "step": 2868 }, { "epoch": 2.05, "grad_norm": 17.184815205436934, "learning_rate": 5.050576756073272e-06, "loss": 0.3806, "step": 2869 }, { "epoch": 2.05, "grad_norm": 9.23293466912918, "learning_rate": 5.047686746013888e-06, "loss": 0.2493, "step": 2870 }, { "epoch": 2.05, "grad_norm": 11.369014628236714, "learning_rate": 5.044796720021474e-06, "loss": 0.2585, "step": 2871 }, { "epoch": 2.05, "grad_norm": 14.817280948904253, "learning_rate": 5.041906679061643e-06, "loss": 0.2686, "step": 2872 }, { "epoch": 2.05, "grad_norm": 14.400824784637889, "learning_rate": 5.039016624100013e-06, "loss": 0.3796, "step": 2873 }, { "epoch": 2.05, "grad_norm": 14.411506271031973, "learning_rate": 5.036126556102202e-06, "loss": 0.2939, "step": 2874 }, { "epoch": 2.05, "grad_norm": 8.73804711630787, "learning_rate": 5.033236476033838e-06, "loss": 0.2456, "step": 2875 }, { "epoch": 2.05, "grad_norm": 8.075734103259556, "learning_rate": 5.0303463848605495e-06, "loss": 0.2654, "step": 2876 }, { "epoch": 2.05, "grad_norm": 21.79768975539953, "learning_rate": 5.027456283547969e-06, "loss": 0.2686, "step": 2877 }, { "epoch": 2.05, "grad_norm": 10.816952929731688, "learning_rate": 5.0245661730617344e-06, "loss": 0.2668, "step": 2878 }, { "epoch": 2.05, "grad_norm": 15.601781932049883, "learning_rate": 5.0216760543674855e-06, "loss": 0.28, "step": 2879 }, { "epoch": 2.06, "grad_norm": 15.279934779358646, "learning_rate": 5.0187859284308635e-06, "loss": 0.3567, "step": 2880 }, { "epoch": 2.06, "grad_norm": 11.09264143614305, "learning_rate": 5.015895796217514e-06, "loss": 0.2632, "step": 2881 }, { "epoch": 2.06, "grad_norm": 6.72906261801957, "learning_rate": 5.013005658693083e-06, "loss": 0.2432, "step": 2882 }, { "epoch": 2.06, "grad_norm": 14.904243447527767, "learning_rate": 5.01011551682322e-06, "loss": 0.3123, "step": 2883 }, { "epoch": 2.06, "grad_norm": 9.56158421894508, "learning_rate": 5.007225371573573e-06, "loss": 0.2267, "step": 2884 }, { "epoch": 2.06, "grad_norm": 13.556632021145733, "learning_rate": 5.004335223909797e-06, "loss": 0.3115, "step": 2885 }, { "epoch": 2.06, "grad_norm": 9.697069853449166, "learning_rate": 5.0014450747975416e-06, "loss": 0.2358, "step": 2886 }, { "epoch": 2.06, "grad_norm": 10.361640909885745, "learning_rate": 4.998554925202459e-06, "loss": 0.2517, "step": 2887 }, { "epoch": 2.06, "grad_norm": 18.22972936773402, "learning_rate": 4.995664776090204e-06, "loss": 0.4097, "step": 2888 }, { "epoch": 2.06, "grad_norm": 9.143973237259363, "learning_rate": 4.9927746284264275e-06, "loss": 0.2427, "step": 2889 }, { "epoch": 2.06, "grad_norm": 7.457619103227681, "learning_rate": 4.9898844831767826e-06, "loss": 0.2324, "step": 2890 }, { "epoch": 2.06, "grad_norm": 11.882434995590293, "learning_rate": 4.98699434130692e-06, "loss": 0.2408, "step": 2891 }, { "epoch": 2.06, "grad_norm": 12.593705192329265, "learning_rate": 4.984104203782488e-06, "loss": 0.3152, "step": 2892 }, { "epoch": 2.06, "grad_norm": 7.4272420996640856, "learning_rate": 4.981214071569139e-06, "loss": 0.2415, "step": 2893 }, { "epoch": 2.07, "grad_norm": 8.71270569735412, "learning_rate": 4.978323945632515e-06, "loss": 0.2395, "step": 2894 }, { "epoch": 2.07, "grad_norm": 10.869809390534899, "learning_rate": 4.975433826938267e-06, "loss": 0.2932, "step": 2895 }, { "epoch": 2.07, "grad_norm": 12.64672924292059, "learning_rate": 4.972543716452031e-06, "loss": 0.2837, "step": 2896 }, { "epoch": 2.07, "grad_norm": 11.846800699280617, "learning_rate": 4.969653615139452e-06, "loss": 0.2664, "step": 2897 }, { "epoch": 2.07, "grad_norm": 20.340053708726927, "learning_rate": 4.966763523966163e-06, "loss": 0.4248, "step": 2898 }, { "epoch": 2.07, "grad_norm": 8.986583690207297, "learning_rate": 4.963873443897799e-06, "loss": 0.2932, "step": 2899 }, { "epoch": 2.07, "grad_norm": 8.877813445906238, "learning_rate": 4.96098337589999e-06, "loss": 0.2527, "step": 2900 }, { "epoch": 2.07, "grad_norm": 8.751716315394958, "learning_rate": 4.958093320938358e-06, "loss": 0.2856, "step": 2901 }, { "epoch": 2.07, "grad_norm": 10.259213554000896, "learning_rate": 4.955203279978529e-06, "loss": 0.312, "step": 2902 }, { "epoch": 2.07, "grad_norm": 10.869350826007125, "learning_rate": 4.952313253986114e-06, "loss": 0.302, "step": 2903 }, { "epoch": 2.07, "grad_norm": 10.818949638241829, "learning_rate": 4.9494232439267296e-06, "loss": 0.2224, "step": 2904 }, { "epoch": 2.07, "grad_norm": 9.567323569857239, "learning_rate": 4.946533250765977e-06, "loss": 0.2588, "step": 2905 }, { "epoch": 2.07, "grad_norm": 17.916886883895785, "learning_rate": 4.943643275469461e-06, "loss": 0.2678, "step": 2906 }, { "epoch": 2.07, "grad_norm": 8.444466859746376, "learning_rate": 4.940753319002773e-06, "loss": 0.2598, "step": 2907 }, { "epoch": 2.08, "grad_norm": 12.782074101523976, "learning_rate": 4.937863382331504e-06, "loss": 0.3218, "step": 2908 }, { "epoch": 2.08, "grad_norm": 11.596863218172778, "learning_rate": 4.934973466421234e-06, "loss": 0.3345, "step": 2909 }, { "epoch": 2.08, "grad_norm": 8.650320442975373, "learning_rate": 4.932083572237535e-06, "loss": 0.2859, "step": 2910 }, { "epoch": 2.08, "grad_norm": 10.743783621991485, "learning_rate": 4.92919370074598e-06, "loss": 0.2761, "step": 2911 }, { "epoch": 2.08, "grad_norm": 14.118954150902352, "learning_rate": 4.926303852912123e-06, "loss": 0.3367, "step": 2912 }, { "epoch": 2.08, "grad_norm": 11.571991535980812, "learning_rate": 4.9234140297015204e-06, "loss": 0.2288, "step": 2913 }, { "epoch": 2.08, "grad_norm": 14.628440666155255, "learning_rate": 4.920524232079712e-06, "loss": 0.3997, "step": 2914 }, { "epoch": 2.08, "grad_norm": 17.12851494373273, "learning_rate": 4.917634461012238e-06, "loss": 0.3601, "step": 2915 }, { "epoch": 2.08, "grad_norm": 12.147043201032695, "learning_rate": 4.914744717464617e-06, "loss": 0.3708, "step": 2916 }, { "epoch": 2.08, "grad_norm": 16.657303874351186, "learning_rate": 4.911855002402375e-06, "loss": 0.3149, "step": 2917 }, { "epoch": 2.08, "grad_norm": 6.918834616338623, "learning_rate": 4.908965316791014e-06, "loss": 0.1868, "step": 2918 }, { "epoch": 2.08, "grad_norm": 7.338228828504006, "learning_rate": 4.906075661596031e-06, "loss": 0.2456, "step": 2919 }, { "epoch": 2.08, "grad_norm": 9.717111899061246, "learning_rate": 4.903186037782917e-06, "loss": 0.2471, "step": 2920 }, { "epoch": 2.08, "grad_norm": 12.048131071947918, "learning_rate": 4.900296446317146e-06, "loss": 0.2698, "step": 2921 }, { "epoch": 2.09, "grad_norm": 16.548570881430447, "learning_rate": 4.897406888164187e-06, "loss": 0.3094, "step": 2922 }, { "epoch": 2.09, "grad_norm": 14.795066171226647, "learning_rate": 4.8945173642894915e-06, "loss": 0.2437, "step": 2923 }, { "epoch": 2.09, "grad_norm": 11.874111871990417, "learning_rate": 4.8916278756585074e-06, "loss": 0.2549, "step": 2924 }, { "epoch": 2.09, "grad_norm": 10.369183327295028, "learning_rate": 4.888738423236664e-06, "loss": 0.2812, "step": 2925 }, { "epoch": 2.09, "grad_norm": 15.705046693449054, "learning_rate": 4.88584900798938e-06, "loss": 0.2434, "step": 2926 }, { "epoch": 2.09, "grad_norm": 17.69346857544719, "learning_rate": 4.882959630882066e-06, "loss": 0.2947, "step": 2927 }, { "epoch": 2.09, "grad_norm": 8.467959400291866, "learning_rate": 4.8800702928801124e-06, "loss": 0.2712, "step": 2928 }, { "epoch": 2.09, "grad_norm": 21.203239862503867, "learning_rate": 4.8771809949489056e-06, "loss": 0.3223, "step": 2929 }, { "epoch": 2.09, "grad_norm": 21.718444322534985, "learning_rate": 4.874291738053809e-06, "loss": 0.479, "step": 2930 }, { "epoch": 2.09, "grad_norm": 9.328069951779979, "learning_rate": 4.871402523160181e-06, "loss": 0.2515, "step": 2931 }, { "epoch": 2.09, "grad_norm": 12.698570030098262, "learning_rate": 4.868513351233359e-06, "loss": 0.3232, "step": 2932 }, { "epoch": 2.09, "grad_norm": 12.399731299784895, "learning_rate": 4.865624223238672e-06, "loss": 0.3057, "step": 2933 }, { "epoch": 2.09, "grad_norm": 14.15982457143095, "learning_rate": 4.862735140141428e-06, "loss": 0.3362, "step": 2934 }, { "epoch": 2.09, "grad_norm": 12.120995256901526, "learning_rate": 4.859846102906927e-06, "loss": 0.2119, "step": 2935 }, { "epoch": 2.1, "grad_norm": 11.790485443872855, "learning_rate": 4.856957112500446e-06, "loss": 0.26, "step": 2936 }, { "epoch": 2.1, "grad_norm": 9.804955003772951, "learning_rate": 4.854068169887254e-06, "loss": 0.2522, "step": 2937 }, { "epoch": 2.1, "grad_norm": 9.67276784524615, "learning_rate": 4.851179276032598e-06, "loss": 0.2461, "step": 2938 }, { "epoch": 2.1, "grad_norm": 13.029967703322102, "learning_rate": 4.848290431901712e-06, "loss": 0.2825, "step": 2939 }, { "epoch": 2.1, "grad_norm": 13.701298043559392, "learning_rate": 4.845401638459813e-06, "loss": 0.3257, "step": 2940 }, { "epoch": 2.1, "grad_norm": 24.527517306096676, "learning_rate": 4.8425128966721e-06, "loss": 0.3032, "step": 2941 }, { "epoch": 2.1, "grad_norm": 10.060243434857231, "learning_rate": 4.8396242075037555e-06, "loss": 0.2424, "step": 2942 }, { "epoch": 2.1, "grad_norm": 11.110048308538667, "learning_rate": 4.836735571919946e-06, "loss": 0.2317, "step": 2943 }, { "epoch": 2.1, "grad_norm": 13.270855038860887, "learning_rate": 4.833846990885813e-06, "loss": 0.2925, "step": 2944 }, { "epoch": 2.1, "grad_norm": 13.546059086949068, "learning_rate": 4.830958465366492e-06, "loss": 0.3115, "step": 2945 }, { "epoch": 2.1, "grad_norm": 15.147060376074828, "learning_rate": 4.828069996327088e-06, "loss": 0.3071, "step": 2946 }, { "epoch": 2.1, "grad_norm": 21.89733878398577, "learning_rate": 4.825181584732695e-06, "loss": 0.354, "step": 2947 }, { "epoch": 2.1, "grad_norm": 12.098924759848195, "learning_rate": 4.822293231548382e-06, "loss": 0.3223, "step": 2948 }, { "epoch": 2.1, "grad_norm": 14.59459222498052, "learning_rate": 4.819404937739205e-06, "loss": 0.2988, "step": 2949 }, { "epoch": 2.11, "grad_norm": 7.9228442130931, "learning_rate": 4.816516704270194e-06, "loss": 0.1917, "step": 2950 }, { "epoch": 2.11, "grad_norm": 11.304262104161308, "learning_rate": 4.813628532106363e-06, "loss": 0.2844, "step": 2951 }, { "epoch": 2.11, "grad_norm": 10.91478464087612, "learning_rate": 4.810740422212705e-06, "loss": 0.2534, "step": 2952 }, { "epoch": 2.11, "grad_norm": 13.369008662112533, "learning_rate": 4.807852375554188e-06, "loss": 0.248, "step": 2953 }, { "epoch": 2.11, "grad_norm": 9.194937078896238, "learning_rate": 4.804964393095765e-06, "loss": 0.2935, "step": 2954 }, { "epoch": 2.11, "grad_norm": 11.543922616402899, "learning_rate": 4.802076475802362e-06, "loss": 0.2605, "step": 2955 }, { "epoch": 2.11, "grad_norm": 10.955492140429202, "learning_rate": 4.799188624638889e-06, "loss": 0.207, "step": 2956 }, { "epoch": 2.11, "grad_norm": 11.777530819834624, "learning_rate": 4.796300840570227e-06, "loss": 0.2734, "step": 2957 }, { "epoch": 2.11, "grad_norm": 11.600327023654208, "learning_rate": 4.793413124561243e-06, "loss": 0.2236, "step": 2958 }, { "epoch": 2.11, "grad_norm": 9.515292072213855, "learning_rate": 4.790525477576773e-06, "loss": 0.2729, "step": 2959 }, { "epoch": 2.11, "grad_norm": 9.024728170748826, "learning_rate": 4.7876379005816325e-06, "loss": 0.192, "step": 2960 }, { "epoch": 2.11, "grad_norm": 8.964190783912358, "learning_rate": 4.784750394540619e-06, "loss": 0.2013, "step": 2961 }, { "epoch": 2.11, "grad_norm": 10.784790275659192, "learning_rate": 4.781862960418498e-06, "loss": 0.2595, "step": 2962 }, { "epoch": 2.11, "grad_norm": 21.133720980277968, "learning_rate": 4.778975599180019e-06, "loss": 0.3374, "step": 2963 }, { "epoch": 2.12, "grad_norm": 14.406518179067787, "learning_rate": 4.776088311789897e-06, "loss": 0.2964, "step": 2964 }, { "epoch": 2.12, "grad_norm": 8.188477238179528, "learning_rate": 4.773201099212835e-06, "loss": 0.1699, "step": 2965 }, { "epoch": 2.12, "grad_norm": 9.483178808062954, "learning_rate": 4.770313962413499e-06, "loss": 0.2429, "step": 2966 }, { "epoch": 2.12, "grad_norm": 11.54657457090472, "learning_rate": 4.767426902356539e-06, "loss": 0.2224, "step": 2967 }, { "epoch": 2.12, "grad_norm": 14.491438595730546, "learning_rate": 4.7645399200065745e-06, "loss": 0.3599, "step": 2968 }, { "epoch": 2.12, "grad_norm": 10.84370673100882, "learning_rate": 4.761653016328197e-06, "loss": 0.2827, "step": 2969 }, { "epoch": 2.12, "grad_norm": 9.737199358136072, "learning_rate": 4.758766192285979e-06, "loss": 0.1892, "step": 2970 }, { "epoch": 2.12, "grad_norm": 15.368842569814777, "learning_rate": 4.755879448844458e-06, "loss": 0.3108, "step": 2971 }, { "epoch": 2.12, "grad_norm": 11.722321883118072, "learning_rate": 4.752992786968153e-06, "loss": 0.2773, "step": 2972 }, { "epoch": 2.12, "grad_norm": 14.416553515826045, "learning_rate": 4.750106207621546e-06, "loss": 0.2751, "step": 2973 }, { "epoch": 2.12, "grad_norm": 13.678335643158764, "learning_rate": 4.747219711769103e-06, "loss": 0.2622, "step": 2974 }, { "epoch": 2.12, "grad_norm": 14.178387254968198, "learning_rate": 4.74433330037525e-06, "loss": 0.2866, "step": 2975 }, { "epoch": 2.12, "grad_norm": 8.722021862525153, "learning_rate": 4.741446974404396e-06, "loss": 0.2549, "step": 2976 }, { "epoch": 2.12, "grad_norm": 8.939812320748437, "learning_rate": 4.738560734820914e-06, "loss": 0.2354, "step": 2977 }, { "epoch": 2.13, "grad_norm": 10.457539144367189, "learning_rate": 4.735674582589147e-06, "loss": 0.2371, "step": 2978 }, { "epoch": 2.13, "grad_norm": 11.031971470356236, "learning_rate": 4.732788518673418e-06, "loss": 0.283, "step": 2979 }, { "epoch": 2.13, "grad_norm": 13.290069870349917, "learning_rate": 4.729902544038009e-06, "loss": 0.262, "step": 2980 }, { "epoch": 2.13, "grad_norm": 9.646079051872375, "learning_rate": 4.7270166596471825e-06, "loss": 0.2522, "step": 2981 }, { "epoch": 2.13, "grad_norm": 9.643876500840022, "learning_rate": 4.724130866465163e-06, "loss": 0.26, "step": 2982 }, { "epoch": 2.13, "grad_norm": 10.46389033776257, "learning_rate": 4.721245165456149e-06, "loss": 0.2505, "step": 2983 }, { "epoch": 2.13, "grad_norm": 10.041824511182282, "learning_rate": 4.7183595575843055e-06, "loss": 0.2754, "step": 2984 }, { "epoch": 2.13, "grad_norm": 9.441670864552433, "learning_rate": 4.715474043813771e-06, "loss": 0.2422, "step": 2985 }, { "epoch": 2.13, "grad_norm": 13.272432898291353, "learning_rate": 4.712588625108645e-06, "loss": 0.2637, "step": 2986 }, { "epoch": 2.13, "grad_norm": 13.77655861565552, "learning_rate": 4.709703302433003e-06, "loss": 0.2734, "step": 2987 }, { "epoch": 2.13, "grad_norm": 12.616444934585852, "learning_rate": 4.706818076750883e-06, "loss": 0.3889, "step": 2988 }, { "epoch": 2.13, "grad_norm": 13.01033899307128, "learning_rate": 4.703932949026291e-06, "loss": 0.2466, "step": 2989 }, { "epoch": 2.13, "grad_norm": 11.858289061563696, "learning_rate": 4.701047920223207e-06, "loss": 0.293, "step": 2990 }, { "epoch": 2.13, "grad_norm": 16.866643961827666, "learning_rate": 4.6981629913055674e-06, "loss": 0.3655, "step": 2991 }, { "epoch": 2.14, "grad_norm": 12.132085368005846, "learning_rate": 4.695278163237284e-06, "loss": 0.1934, "step": 2992 }, { "epoch": 2.14, "grad_norm": 15.039198152601585, "learning_rate": 4.692393436982229e-06, "loss": 0.23, "step": 2993 }, { "epoch": 2.14, "grad_norm": 11.3107040441794, "learning_rate": 4.689508813504246e-06, "loss": 0.2344, "step": 2994 }, { "epoch": 2.14, "grad_norm": 8.629823411245509, "learning_rate": 4.686624293767138e-06, "loss": 0.2749, "step": 2995 }, { "epoch": 2.14, "grad_norm": 15.154888487363252, "learning_rate": 4.683739878734678e-06, "loss": 0.2816, "step": 2996 }, { "epoch": 2.14, "grad_norm": 11.03652875316943, "learning_rate": 4.6808555693706045e-06, "loss": 0.2156, "step": 2997 }, { "epoch": 2.14, "grad_norm": 11.487451590380616, "learning_rate": 4.677971366638616e-06, "loss": 0.2493, "step": 2998 }, { "epoch": 2.14, "grad_norm": 9.49534880905934, "learning_rate": 4.67508727150238e-06, "loss": 0.25, "step": 2999 }, { "epoch": 2.14, "grad_norm": 7.968859660999014, "learning_rate": 4.672203284925525e-06, "loss": 0.2004, "step": 3000 }, { "epoch": 2.14, "eval_avg_AUC": 0.7804411891516083, "eval_avg_Accuracy": 0.6809101458885941, "eval_avg_Accuracy-right": 0.8848963088561367, "eval_avg_Accuracy-wrong": 0.32522174209688426, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6521728732768947, "eval_last_AUC": 0.798815075101747, "eval_last_Accuracy": 0.7263759946949602, "eval_last_Accuracy-right": 0.8329855223685927, "eval_last_Accuracy-wrong": 0.5404821469183534, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6680679021503793, "eval_max_AUC": 0.7486281620568513, "eval_max_Accuracy": 0.6368534482758621, "eval_max_Accuracy-right": 0.9711751662971175, "eval_max_Accuracy-wrong": 0.05390038662724585, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.60539630409024, "eval_min_AUC": 0.7853322845511744, "eval_min_Accuracy": 0.7209051724137931, "eval_min_Accuracy-right": 0.7549889135254989, "eval_min_Accuracy-wrong": 0.6614737320900614, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6588892732875934, "eval_prod_AUC": 0.7847266249422049, "eval_prod_Accuracy": 0.706440649867374, "eval_prod_Accuracy-right": 0.6447763140733012, "eval_prod_Accuracy-wrong": 0.8139640664089152, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6510440752440944, "eval_runtime": 252.474, "eval_samples_per_second": 95.566, "eval_steps_per_second": 2.986, "eval_sum_AUC": 0.6452340173243202, "eval_sum_Accuracy": 0.6382211538461539, "eval_sum_Accuracy-right": 0.9940654754141124, "eval_sum_Accuracy-wrong": 0.01773936775073914, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6368181458868507, "step": 3000 }, { "epoch": 2.14, "grad_norm": 9.719355683834616, "learning_rate": 4.669319407871647e-06, "loss": 0.2515, "step": 3001 }, { "epoch": 2.14, "grad_norm": 14.170964090776609, "learning_rate": 4.666435641304301e-06, "loss": 0.2522, "step": 3002 }, { "epoch": 2.14, "grad_norm": 9.923468867083184, "learning_rate": 4.663551986187006e-06, "loss": 0.2493, "step": 3003 }, { "epoch": 2.14, "grad_norm": 11.30112917716831, "learning_rate": 4.660668443483248e-06, "loss": 0.3486, "step": 3004 }, { "epoch": 2.14, "grad_norm": 10.14288841228628, "learning_rate": 4.657785014156468e-06, "loss": 0.2489, "step": 3005 }, { "epoch": 2.15, "grad_norm": 11.713009972335938, "learning_rate": 4.654901699170077e-06, "loss": 0.3506, "step": 3006 }, { "epoch": 2.15, "grad_norm": 14.161879396772502, "learning_rate": 4.652018499487442e-06, "loss": 0.2977, "step": 3007 }, { "epoch": 2.15, "grad_norm": 12.191379054117588, "learning_rate": 4.649135416071896e-06, "loss": 0.2581, "step": 3008 }, { "epoch": 2.15, "grad_norm": 20.471445742521716, "learning_rate": 4.646252449886727e-06, "loss": 0.3132, "step": 3009 }, { "epoch": 2.15, "grad_norm": 14.975686497998877, "learning_rate": 4.6433696018951915e-06, "loss": 0.3794, "step": 3010 }, { "epoch": 2.15, "grad_norm": 16.4897021990898, "learning_rate": 4.640486873060501e-06, "loss": 0.3147, "step": 3011 }, { "epoch": 2.15, "grad_norm": 9.363266401118915, "learning_rate": 4.6376042643458254e-06, "loss": 0.2346, "step": 3012 }, { "epoch": 2.15, "grad_norm": 9.753203788882743, "learning_rate": 4.634721776714305e-06, "loss": 0.2937, "step": 3013 }, { "epoch": 2.15, "grad_norm": 13.363526616262392, "learning_rate": 4.631839411129025e-06, "loss": 0.3018, "step": 3014 }, { "epoch": 2.15, "grad_norm": 11.110221054974863, "learning_rate": 4.628957168553044e-06, "loss": 0.2727, "step": 3015 }, { "epoch": 2.15, "grad_norm": 16.632277373728602, "learning_rate": 4.6260750499493665e-06, "loss": 0.4019, "step": 3016 }, { "epoch": 2.15, "grad_norm": 14.087285690858906, "learning_rate": 4.623193056280968e-06, "loss": 0.3335, "step": 3017 }, { "epoch": 2.15, "grad_norm": 8.025810609472378, "learning_rate": 4.6203111885107735e-06, "loss": 0.2375, "step": 3018 }, { "epoch": 2.15, "grad_norm": 11.067682340287535, "learning_rate": 4.617429447601665e-06, "loss": 0.2476, "step": 3019 }, { "epoch": 2.16, "grad_norm": 8.772321631454632, "learning_rate": 4.614547834516492e-06, "loss": 0.2551, "step": 3020 }, { "epoch": 2.16, "grad_norm": 16.09704435421464, "learning_rate": 4.6116663502180495e-06, "loss": 0.3059, "step": 3021 }, { "epoch": 2.16, "grad_norm": 8.204814449759008, "learning_rate": 4.6087849956691e-06, "loss": 0.2561, "step": 3022 }, { "epoch": 2.16, "grad_norm": 11.234181879339191, "learning_rate": 4.605903771832353e-06, "loss": 0.2791, "step": 3023 }, { "epoch": 2.16, "grad_norm": 12.496146964582882, "learning_rate": 4.603022679670482e-06, "loss": 0.2939, "step": 3024 }, { "epoch": 2.16, "grad_norm": 20.489599009018622, "learning_rate": 4.6001417201461114e-06, "loss": 0.2979, "step": 3025 }, { "epoch": 2.16, "grad_norm": 12.091464787515235, "learning_rate": 4.597260894221826e-06, "loss": 0.3162, "step": 3026 }, { "epoch": 2.16, "grad_norm": 16.839579410632815, "learning_rate": 4.594380202860162e-06, "loss": 0.3357, "step": 3027 }, { "epoch": 2.16, "grad_norm": 10.013418973388266, "learning_rate": 4.5914996470236094e-06, "loss": 0.3015, "step": 3028 }, { "epoch": 2.16, "grad_norm": 12.220647672670827, "learning_rate": 4.588619227674619e-06, "loss": 0.2327, "step": 3029 }, { "epoch": 2.16, "grad_norm": 11.062502514019542, "learning_rate": 4.58573894577559e-06, "loss": 0.322, "step": 3030 }, { "epoch": 2.16, "grad_norm": 10.599524075966587, "learning_rate": 4.5828588022888815e-06, "loss": 0.2974, "step": 3031 }, { "epoch": 2.16, "grad_norm": 11.933286676593458, "learning_rate": 4.5799787981767975e-06, "loss": 0.2861, "step": 3032 }, { "epoch": 2.16, "grad_norm": 10.79407778155579, "learning_rate": 4.577098934401607e-06, "loss": 0.2773, "step": 3033 }, { "epoch": 2.17, "grad_norm": 12.25668831567693, "learning_rate": 4.57421921192552e-06, "loss": 0.2288, "step": 3034 }, { "epoch": 2.17, "grad_norm": 8.332016300630848, "learning_rate": 4.5713396317107115e-06, "loss": 0.2136, "step": 3035 }, { "epoch": 2.17, "grad_norm": 8.56505719334519, "learning_rate": 4.568460194719299e-06, "loss": 0.1902, "step": 3036 }, { "epoch": 2.17, "grad_norm": 9.736750372255026, "learning_rate": 4.565580901913356e-06, "loss": 0.2759, "step": 3037 }, { "epoch": 2.17, "grad_norm": 13.30006734127021, "learning_rate": 4.562701754254909e-06, "loss": 0.29, "step": 3038 }, { "epoch": 2.17, "grad_norm": 12.077457240517598, "learning_rate": 4.559822752705933e-06, "loss": 0.3496, "step": 3039 }, { "epoch": 2.17, "grad_norm": 8.698125787141345, "learning_rate": 4.556943898228358e-06, "loss": 0.2375, "step": 3040 }, { "epoch": 2.17, "grad_norm": 20.038157371173256, "learning_rate": 4.55406519178406e-06, "loss": 0.2932, "step": 3041 }, { "epoch": 2.17, "grad_norm": 10.500307715120192, "learning_rate": 4.551186634334873e-06, "loss": 0.2507, "step": 3042 }, { "epoch": 2.17, "grad_norm": 17.239595721895437, "learning_rate": 4.54830822684257e-06, "loss": 0.2397, "step": 3043 }, { "epoch": 2.17, "grad_norm": 26.518571980335143, "learning_rate": 4.545429970268888e-06, "loss": 0.5195, "step": 3044 }, { "epoch": 2.17, "grad_norm": 21.63226226043342, "learning_rate": 4.542551865575499e-06, "loss": 0.314, "step": 3045 }, { "epoch": 2.17, "grad_norm": 10.077102386147187, "learning_rate": 4.539673913724037e-06, "loss": 0.2119, "step": 3046 }, { "epoch": 2.17, "grad_norm": 10.394200369375191, "learning_rate": 4.5367961156760745e-06, "loss": 0.3079, "step": 3047 }, { "epoch": 2.18, "grad_norm": 29.417594716851383, "learning_rate": 4.533918472393141e-06, "loss": 0.5259, "step": 3048 }, { "epoch": 2.18, "grad_norm": 11.457438246959063, "learning_rate": 4.531040984836708e-06, "loss": 0.2346, "step": 3049 }, { "epoch": 2.18, "grad_norm": 10.204113553765206, "learning_rate": 4.5281636539682e-06, "loss": 0.3069, "step": 3050 }, { "epoch": 2.18, "grad_norm": 8.049228198201712, "learning_rate": 4.5252864807489836e-06, "loss": 0.2656, "step": 3051 }, { "epoch": 2.18, "grad_norm": 10.606648655895928, "learning_rate": 4.522409466140379e-06, "loss": 0.2593, "step": 3052 }, { "epoch": 2.18, "grad_norm": 15.556356018045541, "learning_rate": 4.5195326111036475e-06, "loss": 0.3882, "step": 3053 }, { "epoch": 2.18, "grad_norm": 11.99828379110033, "learning_rate": 4.5166559166000035e-06, "loss": 0.3203, "step": 3054 }, { "epoch": 2.18, "grad_norm": 22.263085046077336, "learning_rate": 4.513779383590599e-06, "loss": 0.3269, "step": 3055 }, { "epoch": 2.18, "grad_norm": 9.933203567646347, "learning_rate": 4.510903013036542e-06, "loss": 0.3042, "step": 3056 }, { "epoch": 2.18, "grad_norm": 8.491586252287485, "learning_rate": 4.508026805898878e-06, "loss": 0.2708, "step": 3057 }, { "epoch": 2.18, "grad_norm": 9.372992871527948, "learning_rate": 4.505150763138604e-06, "loss": 0.2822, "step": 3058 }, { "epoch": 2.18, "grad_norm": 9.457990662995527, "learning_rate": 4.502274885716656e-06, "loss": 0.2791, "step": 3059 }, { "epoch": 2.18, "grad_norm": 9.701114569175404, "learning_rate": 4.499399174593923e-06, "loss": 0.239, "step": 3060 }, { "epoch": 2.18, "grad_norm": 11.167254709611056, "learning_rate": 4.496523630731229e-06, "loss": 0.3047, "step": 3061 }, { "epoch": 2.19, "grad_norm": 9.111502148347535, "learning_rate": 4.493648255089347e-06, "loss": 0.249, "step": 3062 }, { "epoch": 2.19, "grad_norm": 9.384302535844311, "learning_rate": 4.490773048628997e-06, "loss": 0.2898, "step": 3063 }, { "epoch": 2.19, "grad_norm": 9.800690735712097, "learning_rate": 4.487898012310834e-06, "loss": 0.2688, "step": 3064 }, { "epoch": 2.19, "grad_norm": 17.264879768298673, "learning_rate": 4.485023147095466e-06, "loss": 0.5088, "step": 3065 }, { "epoch": 2.19, "grad_norm": 10.264233050955779, "learning_rate": 4.482148453943434e-06, "loss": 0.3213, "step": 3066 }, { "epoch": 2.19, "grad_norm": 12.516757746284293, "learning_rate": 4.479273933815232e-06, "loss": 0.3726, "step": 3067 }, { "epoch": 2.19, "grad_norm": 20.358627634880012, "learning_rate": 4.476399587671285e-06, "loss": 0.4409, "step": 3068 }, { "epoch": 2.19, "grad_norm": 14.196044087714956, "learning_rate": 4.47352541647197e-06, "loss": 0.2632, "step": 3069 }, { "epoch": 2.19, "grad_norm": 10.996167525812037, "learning_rate": 4.470651421177599e-06, "loss": 0.2305, "step": 3070 }, { "epoch": 2.19, "grad_norm": 8.349338044623135, "learning_rate": 4.467777602748425e-06, "loss": 0.1885, "step": 3071 }, { "epoch": 2.19, "grad_norm": 13.05700812773356, "learning_rate": 4.4649039621446495e-06, "loss": 0.3647, "step": 3072 }, { "epoch": 2.19, "grad_norm": 14.042404398136627, "learning_rate": 4.462030500326403e-06, "loss": 0.2471, "step": 3073 }, { "epoch": 2.19, "grad_norm": 11.157726115560735, "learning_rate": 4.459157218253769e-06, "loss": 0.3259, "step": 3074 }, { "epoch": 2.19, "grad_norm": 12.746593922157988, "learning_rate": 4.456284116886758e-06, "loss": 0.2834, "step": 3075 }, { "epoch": 2.2, "grad_norm": 10.324680514897194, "learning_rate": 4.453411197185334e-06, "loss": 0.2542, "step": 3076 }, { "epoch": 2.2, "grad_norm": 14.104263762334465, "learning_rate": 4.450538460109384e-06, "loss": 0.314, "step": 3077 }, { "epoch": 2.2, "grad_norm": 10.666298882817523, "learning_rate": 4.447665906618751e-06, "loss": 0.2043, "step": 3078 }, { "epoch": 2.2, "grad_norm": 10.855428365451152, "learning_rate": 4.444793537673204e-06, "loss": 0.2446, "step": 3079 }, { "epoch": 2.2, "grad_norm": 12.617290032096342, "learning_rate": 4.441921354232455e-06, "loss": 0.3394, "step": 3080 }, { "epoch": 2.2, "grad_norm": 15.767718908396866, "learning_rate": 4.439049357256156e-06, "loss": 0.2822, "step": 3081 }, { "epoch": 2.2, "grad_norm": 16.04967813853941, "learning_rate": 4.436177547703891e-06, "loss": 0.2839, "step": 3082 }, { "epoch": 2.2, "grad_norm": 16.124779669352307, "learning_rate": 4.433305926535189e-06, "loss": 0.3984, "step": 3083 }, { "epoch": 2.2, "grad_norm": 12.063769370375915, "learning_rate": 4.430434494709509e-06, "loss": 0.2358, "step": 3084 }, { "epoch": 2.2, "grad_norm": 7.918550212828764, "learning_rate": 4.427563253186253e-06, "loss": 0.1887, "step": 3085 }, { "epoch": 2.2, "grad_norm": 5.958914847278019, "learning_rate": 4.424692202924754e-06, "loss": 0.1517, "step": 3086 }, { "epoch": 2.2, "grad_norm": 7.898577580549401, "learning_rate": 4.421821344884281e-06, "loss": 0.2023, "step": 3087 }, { "epoch": 2.2, "grad_norm": 9.715953044475873, "learning_rate": 4.418950680024046e-06, "loss": 0.2214, "step": 3088 }, { "epoch": 2.2, "grad_norm": 9.016811447032085, "learning_rate": 4.416080209303187e-06, "loss": 0.2229, "step": 3089 }, { "epoch": 2.21, "grad_norm": 11.39587616677386, "learning_rate": 4.413209933680786e-06, "loss": 0.2915, "step": 3090 }, { "epoch": 2.21, "grad_norm": 13.239323594842487, "learning_rate": 4.410339854115849e-06, "loss": 0.3066, "step": 3091 }, { "epoch": 2.21, "grad_norm": 11.047328873557003, "learning_rate": 4.407469971567331e-06, "loss": 0.3145, "step": 3092 }, { "epoch": 2.21, "grad_norm": 12.89869317299782, "learning_rate": 4.4046002869941055e-06, "loss": 0.2461, "step": 3093 }, { "epoch": 2.21, "grad_norm": 18.751841757753965, "learning_rate": 4.401730801354994e-06, "loss": 0.3765, "step": 3094 }, { "epoch": 2.21, "grad_norm": 13.149602274951638, "learning_rate": 4.39886151560874e-06, "loss": 0.2488, "step": 3095 }, { "epoch": 2.21, "grad_norm": 13.324113331094987, "learning_rate": 4.395992430714028e-06, "loss": 0.2659, "step": 3096 }, { "epoch": 2.21, "grad_norm": 11.07732938634878, "learning_rate": 4.393123547629472e-06, "loss": 0.2417, "step": 3097 }, { "epoch": 2.21, "grad_norm": 13.730825721640787, "learning_rate": 4.390254867313619e-06, "loss": 0.2175, "step": 3098 }, { "epoch": 2.21, "grad_norm": 11.498964036214774, "learning_rate": 4.387386390724947e-06, "loss": 0.2483, "step": 3099 }, { "epoch": 2.21, "grad_norm": 9.819080003077305, "learning_rate": 4.38451811882187e-06, "loss": 0.2152, "step": 3100 }, { "epoch": 2.21, "grad_norm": 17.016314180753497, "learning_rate": 4.3816500525627284e-06, "loss": 0.2668, "step": 3101 }, { "epoch": 2.21, "grad_norm": 9.737209247832773, "learning_rate": 4.3787821929057985e-06, "loss": 0.2485, "step": 3102 }, { "epoch": 2.21, "grad_norm": 11.21172272431426, "learning_rate": 4.3759145408092855e-06, "loss": 0.2683, "step": 3103 }, { "epoch": 2.22, "grad_norm": 13.720873434696868, "learning_rate": 4.373047097231324e-06, "loss": 0.281, "step": 3104 }, { "epoch": 2.22, "grad_norm": 14.82444071744457, "learning_rate": 4.370179863129979e-06, "loss": 0.3105, "step": 3105 }, { "epoch": 2.22, "grad_norm": 16.67477919882557, "learning_rate": 4.367312839463251e-06, "loss": 0.3049, "step": 3106 }, { "epoch": 2.22, "grad_norm": 11.979444272208877, "learning_rate": 4.3644460271890614e-06, "loss": 0.2878, "step": 3107 }, { "epoch": 2.22, "grad_norm": 11.93302818612876, "learning_rate": 4.361579427265268e-06, "loss": 0.2407, "step": 3108 }, { "epoch": 2.22, "grad_norm": 12.835216618647086, "learning_rate": 4.358713040649654e-06, "loss": 0.2849, "step": 3109 }, { "epoch": 2.22, "grad_norm": 8.05126265344838, "learning_rate": 4.3558468682999336e-06, "loss": 0.2156, "step": 3110 }, { "epoch": 2.22, "grad_norm": 12.93526267989827, "learning_rate": 4.352980911173747e-06, "loss": 0.3201, "step": 3111 }, { "epoch": 2.22, "grad_norm": 10.299964275506714, "learning_rate": 4.350115170228664e-06, "loss": 0.2556, "step": 3112 }, { "epoch": 2.22, "grad_norm": 12.497945099673855, "learning_rate": 4.3472496464221845e-06, "loss": 0.3054, "step": 3113 }, { "epoch": 2.22, "grad_norm": 6.59759493393717, "learning_rate": 4.344384340711728e-06, "loss": 0.1531, "step": 3114 }, { "epoch": 2.22, "grad_norm": 8.031330972624689, "learning_rate": 4.341519254054651e-06, "loss": 0.1885, "step": 3115 }, { "epoch": 2.22, "grad_norm": 10.543383964941674, "learning_rate": 4.338654387408229e-06, "loss": 0.229, "step": 3116 }, { "epoch": 2.22, "grad_norm": 10.767553783865127, "learning_rate": 4.335789741729671e-06, "loss": 0.2268, "step": 3117 }, { "epoch": 2.23, "grad_norm": 10.926314350733149, "learning_rate": 4.332925317976104e-06, "loss": 0.2434, "step": 3118 }, { "epoch": 2.23, "grad_norm": 10.66200156018986, "learning_rate": 4.330061117104589e-06, "loss": 0.2202, "step": 3119 }, { "epoch": 2.23, "grad_norm": 12.672090923776402, "learning_rate": 4.327197140072108e-06, "loss": 0.2769, "step": 3120 }, { "epoch": 2.23, "grad_norm": 18.31880580033002, "learning_rate": 4.324333387835565e-06, "loss": 0.2791, "step": 3121 }, { "epoch": 2.23, "grad_norm": 12.507999522819803, "learning_rate": 4.321469861351799e-06, "loss": 0.2307, "step": 3122 }, { "epoch": 2.23, "grad_norm": 18.389475444013744, "learning_rate": 4.318606561577562e-06, "loss": 0.3716, "step": 3123 }, { "epoch": 2.23, "grad_norm": 11.701663568377832, "learning_rate": 4.31574348946954e-06, "loss": 0.2278, "step": 3124 }, { "epoch": 2.23, "grad_norm": 10.68598335546021, "learning_rate": 4.312880645984334e-06, "loss": 0.228, "step": 3125 }, { "epoch": 2.23, "grad_norm": 9.815726426115686, "learning_rate": 4.310018032078479e-06, "loss": 0.2471, "step": 3126 }, { "epoch": 2.23, "grad_norm": 19.09950822431688, "learning_rate": 4.307155648708421e-06, "loss": 0.3633, "step": 3127 }, { "epoch": 2.23, "grad_norm": 18.393836294451905, "learning_rate": 4.304293496830542e-06, "loss": 0.4065, "step": 3128 }, { "epoch": 2.23, "grad_norm": 14.434362345596266, "learning_rate": 4.301431577401136e-06, "loss": 0.377, "step": 3129 }, { "epoch": 2.23, "grad_norm": 9.846364671226091, "learning_rate": 4.298569891376423e-06, "loss": 0.2998, "step": 3130 }, { "epoch": 2.23, "grad_norm": 11.059306524932216, "learning_rate": 4.2957084397125496e-06, "loss": 0.3047, "step": 3131 }, { "epoch": 2.24, "grad_norm": 10.741521017522524, "learning_rate": 4.292847223365574e-06, "loss": 0.2766, "step": 3132 }, { "epoch": 2.24, "grad_norm": 13.152323226909704, "learning_rate": 4.289986243291488e-06, "loss": 0.2642, "step": 3133 }, { "epoch": 2.24, "grad_norm": 12.623268724924818, "learning_rate": 4.287125500446193e-06, "loss": 0.248, "step": 3134 }, { "epoch": 2.24, "grad_norm": 9.09056413640535, "learning_rate": 4.284264995785521e-06, "loss": 0.2935, "step": 3135 }, { "epoch": 2.24, "grad_norm": 9.216703895780089, "learning_rate": 4.2814047302652155e-06, "loss": 0.24, "step": 3136 }, { "epoch": 2.24, "grad_norm": 22.248993121674086, "learning_rate": 4.278544704840948e-06, "loss": 0.373, "step": 3137 }, { "epoch": 2.24, "grad_norm": 15.845301439748043, "learning_rate": 4.275684920468306e-06, "loss": 0.272, "step": 3138 }, { "epoch": 2.24, "grad_norm": 9.246046989550374, "learning_rate": 4.272825378102791e-06, "loss": 0.2622, "step": 3139 }, { "epoch": 2.24, "grad_norm": 10.234508413168031, "learning_rate": 4.269966078699836e-06, "loss": 0.2419, "step": 3140 }, { "epoch": 2.24, "grad_norm": 9.587106775142328, "learning_rate": 4.267107023214782e-06, "loss": 0.2439, "step": 3141 }, { "epoch": 2.24, "grad_norm": 9.882611698235124, "learning_rate": 4.264248212602896e-06, "loss": 0.2478, "step": 3142 }, { "epoch": 2.24, "grad_norm": 24.969669318687767, "learning_rate": 4.261389647819355e-06, "loss": 0.3423, "step": 3143 }, { "epoch": 2.24, "grad_norm": 13.55812135577993, "learning_rate": 4.258531329819264e-06, "loss": 0.3176, "step": 3144 }, { "epoch": 2.24, "grad_norm": 10.263892512075122, "learning_rate": 4.255673259557636e-06, "loss": 0.239, "step": 3145 }, { "epoch": 2.25, "grad_norm": 13.911486938832663, "learning_rate": 4.252815437989408e-06, "loss": 0.3022, "step": 3146 }, { "epoch": 2.25, "grad_norm": 14.005026801764096, "learning_rate": 4.24995786606943e-06, "loss": 0.377, "step": 3147 }, { "epoch": 2.25, "grad_norm": 11.225242396822619, "learning_rate": 4.24710054475247e-06, "loss": 0.2725, "step": 3148 }, { "epoch": 2.25, "grad_norm": 18.15552040543596, "learning_rate": 4.244243474993214e-06, "loss": 0.2903, "step": 3149 }, { "epoch": 2.25, "grad_norm": 10.337215646854586, "learning_rate": 4.241386657746257e-06, "loss": 0.2544, "step": 3150 }, { "epoch": 2.25, "grad_norm": 13.69049114869754, "learning_rate": 4.2385300939661215e-06, "loss": 0.3018, "step": 3151 }, { "epoch": 2.25, "grad_norm": 12.674673511882043, "learning_rate": 4.2356737846072326e-06, "loss": 0.2776, "step": 3152 }, { "epoch": 2.25, "grad_norm": 9.409534549944462, "learning_rate": 4.232817730623941e-06, "loss": 0.272, "step": 3153 }, { "epoch": 2.25, "grad_norm": 7.195418655205769, "learning_rate": 4.229961932970505e-06, "loss": 0.1794, "step": 3154 }, { "epoch": 2.25, "grad_norm": 14.817548675323104, "learning_rate": 4.2271063926010995e-06, "loss": 0.2317, "step": 3155 }, { "epoch": 2.25, "grad_norm": 10.93728757969069, "learning_rate": 4.224251110469814e-06, "loss": 0.313, "step": 3156 }, { "epoch": 2.25, "grad_norm": 9.095598064837857, "learning_rate": 4.221396087530652e-06, "loss": 0.251, "step": 3157 }, { "epoch": 2.25, "grad_norm": 25.89434606145267, "learning_rate": 4.218541324737529e-06, "loss": 0.3334, "step": 3158 }, { "epoch": 2.25, "grad_norm": 9.844389906885638, "learning_rate": 4.2156868230442756e-06, "loss": 0.2351, "step": 3159 }, { "epoch": 2.26, "grad_norm": 18.609452101643022, "learning_rate": 4.212832583404632e-06, "loss": 0.3489, "step": 3160 }, { "epoch": 2.26, "grad_norm": 20.13110023399353, "learning_rate": 4.2099786067722535e-06, "loss": 0.3137, "step": 3161 }, { "epoch": 2.26, "grad_norm": 13.109251771071031, "learning_rate": 4.207124894100707e-06, "loss": 0.2671, "step": 3162 }, { "epoch": 2.26, "grad_norm": 13.889889573613662, "learning_rate": 4.2042714463434715e-06, "loss": 0.311, "step": 3163 }, { "epoch": 2.26, "grad_norm": 9.05359030193196, "learning_rate": 4.201418264453935e-06, "loss": 0.2292, "step": 3164 }, { "epoch": 2.26, "grad_norm": 29.070179041968114, "learning_rate": 4.198565349385402e-06, "loss": 0.3848, "step": 3165 }, { "epoch": 2.26, "grad_norm": 11.001684670124861, "learning_rate": 4.195712702091079e-06, "loss": 0.3359, "step": 3166 }, { "epoch": 2.26, "grad_norm": 17.578718859825187, "learning_rate": 4.192860323524094e-06, "loss": 0.28, "step": 3167 }, { "epoch": 2.26, "grad_norm": 16.171497366139107, "learning_rate": 4.190008214637476e-06, "loss": 0.2961, "step": 3168 }, { "epoch": 2.26, "grad_norm": 15.876458793899186, "learning_rate": 4.187156376384171e-06, "loss": 0.2766, "step": 3169 }, { "epoch": 2.26, "grad_norm": 11.134458157859976, "learning_rate": 4.184304809717027e-06, "loss": 0.311, "step": 3170 }, { "epoch": 2.26, "grad_norm": 25.636667120923505, "learning_rate": 4.18145351558881e-06, "loss": 0.3188, "step": 3171 }, { "epoch": 2.26, "grad_norm": 14.453629348290125, "learning_rate": 4.178602494952187e-06, "loss": 0.3232, "step": 3172 }, { "epoch": 2.26, "grad_norm": 9.292622999034865, "learning_rate": 4.175751748759737e-06, "loss": 0.2307, "step": 3173 }, { "epoch": 2.27, "grad_norm": 7.461810881057556, "learning_rate": 4.1729012779639495e-06, "loss": 0.1851, "step": 3174 }, { "epoch": 2.27, "grad_norm": 10.995403715034513, "learning_rate": 4.170051083517217e-06, "loss": 0.3142, "step": 3175 }, { "epoch": 2.27, "grad_norm": 28.442476871673804, "learning_rate": 4.167201166371846e-06, "loss": 0.3682, "step": 3176 }, { "epoch": 2.27, "grad_norm": 18.11822897463826, "learning_rate": 4.164351527480042e-06, "loss": 0.2732, "step": 3177 }, { "epoch": 2.27, "grad_norm": 16.21883279375824, "learning_rate": 4.161502167793928e-06, "loss": 0.3286, "step": 3178 }, { "epoch": 2.27, "grad_norm": 27.22564232638533, "learning_rate": 4.1586530882655226e-06, "loss": 0.3633, "step": 3179 }, { "epoch": 2.27, "grad_norm": 9.812336180461106, "learning_rate": 4.155804289846762e-06, "loss": 0.2236, "step": 3180 }, { "epoch": 2.27, "grad_norm": 11.885754005735777, "learning_rate": 4.152955773489479e-06, "loss": 0.3079, "step": 3181 }, { "epoch": 2.27, "grad_norm": 12.559628066175136, "learning_rate": 4.150107540145413e-06, "loss": 0.3069, "step": 3182 }, { "epoch": 2.27, "grad_norm": 13.868335548365271, "learning_rate": 4.147259590766219e-06, "loss": 0.3408, "step": 3183 }, { "epoch": 2.27, "grad_norm": 10.395868967470347, "learning_rate": 4.144411926303442e-06, "loss": 0.241, "step": 3184 }, { "epoch": 2.27, "grad_norm": 8.321970943416959, "learning_rate": 4.141564547708546e-06, "loss": 0.2585, "step": 3185 }, { "epoch": 2.27, "grad_norm": 16.89739295436778, "learning_rate": 4.138717455932888e-06, "loss": 0.3022, "step": 3186 }, { "epoch": 2.27, "grad_norm": 9.404981314367001, "learning_rate": 4.13587065192774e-06, "loss": 0.2917, "step": 3187 }, { "epoch": 2.28, "grad_norm": 17.078981032390395, "learning_rate": 4.133024136644269e-06, "loss": 0.2913, "step": 3188 }, { "epoch": 2.28, "grad_norm": 8.308577953291362, "learning_rate": 4.130177911033546e-06, "loss": 0.2468, "step": 3189 }, { "epoch": 2.28, "grad_norm": 9.135277981344027, "learning_rate": 4.127331976046553e-06, "loss": 0.2318, "step": 3190 }, { "epoch": 2.28, "grad_norm": 8.987586352845991, "learning_rate": 4.124486332634165e-06, "loss": 0.3101, "step": 3191 }, { "epoch": 2.28, "grad_norm": 12.693395458674322, "learning_rate": 4.121640981747169e-06, "loss": 0.2869, "step": 3192 }, { "epoch": 2.28, "grad_norm": 10.221225064372197, "learning_rate": 4.118795924336245e-06, "loss": 0.2749, "step": 3193 }, { "epoch": 2.28, "grad_norm": 9.464738717879746, "learning_rate": 4.115951161351985e-06, "loss": 0.2207, "step": 3194 }, { "epoch": 2.28, "grad_norm": 21.485375562985087, "learning_rate": 4.113106693744871e-06, "loss": 0.3633, "step": 3195 }, { "epoch": 2.28, "grad_norm": 13.928871090786055, "learning_rate": 4.110262522465298e-06, "loss": 0.3142, "step": 3196 }, { "epoch": 2.28, "grad_norm": 12.569062534229731, "learning_rate": 4.107418648463553e-06, "loss": 0.2415, "step": 3197 }, { "epoch": 2.28, "grad_norm": 19.556704268124058, "learning_rate": 4.104575072689827e-06, "loss": 0.4214, "step": 3198 }, { "epoch": 2.28, "grad_norm": 11.309652598968563, "learning_rate": 4.101731796094215e-06, "loss": 0.2314, "step": 3199 }, { "epoch": 2.28, "grad_norm": 11.249740156900172, "learning_rate": 4.098888819626704e-06, "loss": 0.3022, "step": 3200 }, { "epoch": 2.28, "grad_norm": 11.021724075151338, "learning_rate": 4.096046144237189e-06, "loss": 0.2642, "step": 3201 }, { "epoch": 2.29, "grad_norm": 9.964776567576948, "learning_rate": 4.093203770875458e-06, "loss": 0.2451, "step": 3202 }, { "epoch": 2.29, "grad_norm": 7.948673464764161, "learning_rate": 4.090361700491203e-06, "loss": 0.2285, "step": 3203 }, { "epoch": 2.29, "grad_norm": 16.28404893480785, "learning_rate": 4.087519934034011e-06, "loss": 0.3101, "step": 3204 }, { "epoch": 2.29, "grad_norm": 8.473785693501132, "learning_rate": 4.084678472453371e-06, "loss": 0.2549, "step": 3205 }, { "epoch": 2.29, "grad_norm": 24.12057528554086, "learning_rate": 4.081837316698665e-06, "loss": 0.3501, "step": 3206 }, { "epoch": 2.29, "grad_norm": 19.997967895220985, "learning_rate": 4.078996467719179e-06, "loss": 0.3188, "step": 3207 }, { "epoch": 2.29, "grad_norm": 6.614239673097449, "learning_rate": 4.076155926464091e-06, "loss": 0.2056, "step": 3208 }, { "epoch": 2.29, "grad_norm": 17.47178593134632, "learning_rate": 4.07331569388248e-06, "loss": 0.2585, "step": 3209 }, { "epoch": 2.29, "grad_norm": 10.19903305701167, "learning_rate": 4.07047577092332e-06, "loss": 0.3623, "step": 3210 }, { "epoch": 2.29, "grad_norm": 10.950945054310028, "learning_rate": 4.067636158535483e-06, "loss": 0.2402, "step": 3211 }, { "epoch": 2.29, "grad_norm": 10.688010166084021, "learning_rate": 4.064796857667734e-06, "loss": 0.3113, "step": 3212 }, { "epoch": 2.29, "grad_norm": 12.84894387925838, "learning_rate": 4.0619578692687405e-06, "loss": 0.3286, "step": 3213 }, { "epoch": 2.29, "grad_norm": 22.02913425595009, "learning_rate": 4.059119194287056e-06, "loss": 0.3047, "step": 3214 }, { "epoch": 2.29, "grad_norm": 10.256721550325487, "learning_rate": 4.056280833671139e-06, "loss": 0.2534, "step": 3215 }, { "epoch": 2.3, "grad_norm": 9.369809286741337, "learning_rate": 4.053442788369334e-06, "loss": 0.2544, "step": 3216 }, { "epoch": 2.3, "grad_norm": 12.298685517077775, "learning_rate": 4.05060505932989e-06, "loss": 0.3677, "step": 3217 }, { "epoch": 2.3, "grad_norm": 8.898769900347371, "learning_rate": 4.04776764750094e-06, "loss": 0.2266, "step": 3218 }, { "epoch": 2.3, "grad_norm": 10.947460381359758, "learning_rate": 4.04493055383052e-06, "loss": 0.301, "step": 3219 }, { "epoch": 2.3, "grad_norm": 11.74632555217383, "learning_rate": 4.042093779266553e-06, "loss": 0.2559, "step": 3220 }, { "epoch": 2.3, "grad_norm": 9.599283467869368, "learning_rate": 4.0392573247568614e-06, "loss": 0.27, "step": 3221 }, { "epoch": 2.3, "grad_norm": 12.943002421727876, "learning_rate": 4.036421191249155e-06, "loss": 0.2822, "step": 3222 }, { "epoch": 2.3, "grad_norm": 29.579454388849882, "learning_rate": 4.033585379691036e-06, "loss": 0.4604, "step": 3223 }, { "epoch": 2.3, "grad_norm": 11.899580216055918, "learning_rate": 4.030749891030008e-06, "loss": 0.28, "step": 3224 }, { "epoch": 2.3, "grad_norm": 10.093984979965922, "learning_rate": 4.0279147262134534e-06, "loss": 0.281, "step": 3225 }, { "epoch": 2.3, "grad_norm": 18.406878672576628, "learning_rate": 4.025079886188661e-06, "loss": 0.3867, "step": 3226 }, { "epoch": 2.3, "grad_norm": 14.447300544742673, "learning_rate": 4.022245371902796e-06, "loss": 0.3687, "step": 3227 }, { "epoch": 2.3, "grad_norm": 10.587222605286408, "learning_rate": 4.01941118430293e-06, "loss": 0.3228, "step": 3228 }, { "epoch": 2.3, "grad_norm": 8.088881097873506, "learning_rate": 4.0165773243360105e-06, "loss": 0.1971, "step": 3229 }, { "epoch": 2.31, "grad_norm": 10.273170092461974, "learning_rate": 4.0137437929488885e-06, "loss": 0.2725, "step": 3230 }, { "epoch": 2.31, "grad_norm": 13.35424018521676, "learning_rate": 4.010910591088296e-06, "loss": 0.2815, "step": 3231 }, { "epoch": 2.31, "grad_norm": 14.00753162673241, "learning_rate": 4.008077719700859e-06, "loss": 0.3716, "step": 3232 }, { "epoch": 2.31, "grad_norm": 12.541628475729205, "learning_rate": 4.005245179733095e-06, "loss": 0.2659, "step": 3233 }, { "epoch": 2.31, "grad_norm": 8.232635981574267, "learning_rate": 4.002412972131403e-06, "loss": 0.208, "step": 3234 }, { "epoch": 2.31, "grad_norm": 6.224460677900014, "learning_rate": 3.999581097842082e-06, "loss": 0.1646, "step": 3235 }, { "epoch": 2.31, "grad_norm": 8.8840908089983, "learning_rate": 3.99674955781131e-06, "loss": 0.2446, "step": 3236 }, { "epoch": 2.31, "grad_norm": 9.304529312568652, "learning_rate": 3.99391835298516e-06, "loss": 0.2317, "step": 3237 }, { "epoch": 2.31, "grad_norm": 10.502911845939426, "learning_rate": 3.991087484309586e-06, "loss": 0.231, "step": 3238 }, { "epoch": 2.31, "grad_norm": 15.99359587043134, "learning_rate": 3.988256952730439e-06, "loss": 0.3862, "step": 3239 }, { "epoch": 2.31, "grad_norm": 10.178986410442256, "learning_rate": 3.985426759193449e-06, "loss": 0.2451, "step": 3240 }, { "epoch": 2.31, "grad_norm": 8.303906956030472, "learning_rate": 3.982596904644236e-06, "loss": 0.2224, "step": 3241 }, { "epoch": 2.31, "grad_norm": 23.532535397233683, "learning_rate": 3.979767390028309e-06, "loss": 0.2461, "step": 3242 }, { "epoch": 2.31, "grad_norm": 14.192913755233537, "learning_rate": 3.976938216291059e-06, "loss": 0.2808, "step": 3243 }, { "epoch": 2.32, "grad_norm": 11.248098877642407, "learning_rate": 3.974109384377768e-06, "loss": 0.2585, "step": 3244 }, { "epoch": 2.32, "grad_norm": 10.235266597149444, "learning_rate": 3.971280895233599e-06, "loss": 0.2322, "step": 3245 }, { "epoch": 2.32, "grad_norm": 16.562031261403643, "learning_rate": 3.968452749803605e-06, "loss": 0.3599, "step": 3246 }, { "epoch": 2.32, "grad_norm": 19.3566097178524, "learning_rate": 3.965624949032723e-06, "loss": 0.3271, "step": 3247 }, { "epoch": 2.32, "grad_norm": 14.20409787900473, "learning_rate": 3.962797493865767e-06, "loss": 0.2771, "step": 3248 }, { "epoch": 2.32, "grad_norm": 14.070021402214895, "learning_rate": 3.959970385247451e-06, "loss": 0.3074, "step": 3249 }, { "epoch": 2.32, "grad_norm": 25.348359088033686, "learning_rate": 3.957143624122359e-06, "loss": 0.3887, "step": 3250 }, { "epoch": 2.32, "grad_norm": 10.65763701778697, "learning_rate": 3.954317211434966e-06, "loss": 0.2871, "step": 3251 }, { "epoch": 2.32, "grad_norm": 16.067469739234614, "learning_rate": 3.951491148129628e-06, "loss": 0.3311, "step": 3252 }, { "epoch": 2.32, "grad_norm": 13.506402978506092, "learning_rate": 3.948665435150589e-06, "loss": 0.2527, "step": 3253 }, { "epoch": 2.32, "grad_norm": 12.367653284704408, "learning_rate": 3.945840073441967e-06, "loss": 0.2432, "step": 3254 }, { "epoch": 2.32, "grad_norm": 12.529703746228293, "learning_rate": 3.943015063947773e-06, "loss": 0.2793, "step": 3255 }, { "epoch": 2.32, "grad_norm": 28.26985063062385, "learning_rate": 3.940190407611891e-06, "loss": 0.333, "step": 3256 }, { "epoch": 2.32, "grad_norm": 6.659362639715853, "learning_rate": 3.937366105378093e-06, "loss": 0.1985, "step": 3257 }, { "epoch": 2.33, "grad_norm": 9.796592659095309, "learning_rate": 3.93454215819003e-06, "loss": 0.2639, "step": 3258 }, { "epoch": 2.33, "grad_norm": 17.950373980503407, "learning_rate": 3.931718566991236e-06, "loss": 0.4453, "step": 3259 }, { "epoch": 2.33, "grad_norm": 12.59777182080348, "learning_rate": 3.9288953327251265e-06, "loss": 0.3032, "step": 3260 }, { "epoch": 2.33, "grad_norm": 12.217410108136798, "learning_rate": 3.9260724563349935e-06, "loss": 0.2666, "step": 3261 }, { "epoch": 2.33, "grad_norm": 11.026004296899076, "learning_rate": 3.923249938764016e-06, "loss": 0.312, "step": 3262 }, { "epoch": 2.33, "grad_norm": 16.644089760277204, "learning_rate": 3.920427780955247e-06, "loss": 0.3784, "step": 3263 }, { "epoch": 2.33, "grad_norm": 10.92215295788311, "learning_rate": 3.917605983851622e-06, "loss": 0.2981, "step": 3264 }, { "epoch": 2.33, "grad_norm": 14.252312169191091, "learning_rate": 3.914784548395959e-06, "loss": 0.2703, "step": 3265 }, { "epoch": 2.33, "grad_norm": 18.325498594913125, "learning_rate": 3.911963475530948e-06, "loss": 0.3665, "step": 3266 }, { "epoch": 2.33, "grad_norm": 16.00125047262502, "learning_rate": 3.909142766199163e-06, "loss": 0.3264, "step": 3267 }, { "epoch": 2.33, "grad_norm": 8.43569373211537, "learning_rate": 3.906322421343055e-06, "loss": 0.2512, "step": 3268 }, { "epoch": 2.33, "grad_norm": 9.382819352662063, "learning_rate": 3.903502441904956e-06, "loss": 0.2378, "step": 3269 }, { "epoch": 2.33, "grad_norm": 12.338844125815825, "learning_rate": 3.900682828827072e-06, "loss": 0.2664, "step": 3270 }, { "epoch": 2.33, "grad_norm": 14.983592958009579, "learning_rate": 3.897863583051488e-06, "loss": 0.2817, "step": 3271 }, { "epoch": 2.34, "grad_norm": 9.878791410236738, "learning_rate": 3.895044705520167e-06, "loss": 0.2729, "step": 3272 }, { "epoch": 2.34, "grad_norm": 16.09056810345942, "learning_rate": 3.892226197174947e-06, "loss": 0.3022, "step": 3273 }, { "epoch": 2.34, "grad_norm": 17.416882596827744, "learning_rate": 3.889408058957547e-06, "loss": 0.251, "step": 3274 }, { "epoch": 2.34, "grad_norm": 10.418938219681527, "learning_rate": 3.886590291809554e-06, "loss": 0.3281, "step": 3275 }, { "epoch": 2.34, "grad_norm": 11.54374780153606, "learning_rate": 3.883772896672443e-06, "loss": 0.25, "step": 3276 }, { "epoch": 2.34, "grad_norm": 12.475161782689312, "learning_rate": 3.8809558744875534e-06, "loss": 0.3037, "step": 3277 }, { "epoch": 2.34, "grad_norm": 12.217374801851173, "learning_rate": 3.878139226196107e-06, "loss": 0.2986, "step": 3278 }, { "epoch": 2.34, "grad_norm": 14.47972978144969, "learning_rate": 3.875322952739196e-06, "loss": 0.3706, "step": 3279 }, { "epoch": 2.34, "grad_norm": 11.281463354122508, "learning_rate": 3.872507055057793e-06, "loss": 0.2288, "step": 3280 }, { "epoch": 2.34, "grad_norm": 12.610178959550831, "learning_rate": 3.8696915340927395e-06, "loss": 0.2668, "step": 3281 }, { "epoch": 2.34, "grad_norm": 10.2691544838789, "learning_rate": 3.866876390784752e-06, "loss": 0.231, "step": 3282 }, { "epoch": 2.34, "grad_norm": 36.402448271805106, "learning_rate": 3.8640616260744266e-06, "loss": 0.3735, "step": 3283 }, { "epoch": 2.34, "grad_norm": 10.171805634334017, "learning_rate": 3.861247240902223e-06, "loss": 0.2512, "step": 3284 }, { "epoch": 2.34, "grad_norm": 51.24345152155576, "learning_rate": 3.858433236208485e-06, "loss": 0.2886, "step": 3285 }, { "epoch": 2.35, "grad_norm": 8.317004480140044, "learning_rate": 3.85561961293342e-06, "loss": 0.2485, "step": 3286 }, { "epoch": 2.35, "grad_norm": 10.386888418538078, "learning_rate": 3.852806372017115e-06, "loss": 0.2256, "step": 3287 }, { "epoch": 2.35, "grad_norm": 8.24971292042215, "learning_rate": 3.849993514399521e-06, "loss": 0.2556, "step": 3288 }, { "epoch": 2.35, "grad_norm": 15.008978089204213, "learning_rate": 3.847181041020472e-06, "loss": 0.2876, "step": 3289 }, { "epoch": 2.35, "grad_norm": 9.822270914959086, "learning_rate": 3.844368952819666e-06, "loss": 0.2314, "step": 3290 }, { "epoch": 2.35, "grad_norm": 12.175452659496289, "learning_rate": 3.84155725073667e-06, "loss": 0.3372, "step": 3291 }, { "epoch": 2.35, "grad_norm": 8.094646202827287, "learning_rate": 3.838745935710931e-06, "loss": 0.2441, "step": 3292 }, { "epoch": 2.35, "grad_norm": 10.954423265082738, "learning_rate": 3.835935008681757e-06, "loss": 0.3174, "step": 3293 }, { "epoch": 2.35, "grad_norm": 13.370092110407349, "learning_rate": 3.833124470588336e-06, "loss": 0.3279, "step": 3294 }, { "epoch": 2.35, "grad_norm": 10.808483717392717, "learning_rate": 3.830314322369717e-06, "loss": 0.2334, "step": 3295 }, { "epoch": 2.35, "grad_norm": 16.576910490466176, "learning_rate": 3.827504564964825e-06, "loss": 0.2522, "step": 3296 }, { "epoch": 2.35, "grad_norm": 16.48583322025054, "learning_rate": 3.82469519931245e-06, "loss": 0.323, "step": 3297 }, { "epoch": 2.35, "grad_norm": 11.518800925347575, "learning_rate": 3.8218862263512565e-06, "loss": 0.2456, "step": 3298 }, { "epoch": 2.35, "grad_norm": 18.559272871616013, "learning_rate": 3.819077647019772e-06, "loss": 0.3108, "step": 3299 }, { "epoch": 2.36, "grad_norm": 16.96699943610106, "learning_rate": 3.816269462256394e-06, "loss": 0.3784, "step": 3300 }, { "epoch": 2.36, "grad_norm": 12.209507947737528, "learning_rate": 3.813461672999394e-06, "loss": 0.2932, "step": 3301 }, { "epoch": 2.36, "grad_norm": 10.387762465843936, "learning_rate": 3.8106542801869007e-06, "loss": 0.2808, "step": 3302 }, { "epoch": 2.36, "grad_norm": 15.201757791001164, "learning_rate": 3.8078472847569215e-06, "loss": 0.3765, "step": 3303 }, { "epoch": 2.36, "grad_norm": 14.253111845630949, "learning_rate": 3.805040687647321e-06, "loss": 0.3374, "step": 3304 }, { "epoch": 2.36, "grad_norm": 19.82147693696702, "learning_rate": 3.8022344897958402e-06, "loss": 0.332, "step": 3305 }, { "epoch": 2.36, "grad_norm": 11.343136940610577, "learning_rate": 3.799428692140077e-06, "loss": 0.2681, "step": 3306 }, { "epoch": 2.36, "grad_norm": 11.444088175662092, "learning_rate": 3.7966232956175053e-06, "loss": 0.2773, "step": 3307 }, { "epoch": 2.36, "grad_norm": 16.004902990239433, "learning_rate": 3.793818301165457e-06, "loss": 0.3726, "step": 3308 }, { "epoch": 2.36, "grad_norm": 12.09082633279314, "learning_rate": 3.7910137097211345e-06, "loss": 0.3049, "step": 3309 }, { "epoch": 2.36, "grad_norm": 13.587555974439523, "learning_rate": 3.788209522221604e-06, "loss": 0.2961, "step": 3310 }, { "epoch": 2.36, "grad_norm": 9.99106911580416, "learning_rate": 3.7854057396037934e-06, "loss": 0.2881, "step": 3311 }, { "epoch": 2.36, "grad_norm": 8.139850321676342, "learning_rate": 3.7826023628045037e-06, "loss": 0.2412, "step": 3312 }, { "epoch": 2.36, "grad_norm": 10.388827648592175, "learning_rate": 3.779799392760391e-06, "loss": 0.3181, "step": 3313 }, { "epoch": 2.37, "grad_norm": 10.958626380275174, "learning_rate": 3.7769968304079833e-06, "loss": 0.2668, "step": 3314 }, { "epoch": 2.37, "grad_norm": 11.3999162309862, "learning_rate": 3.7741946766836657e-06, "loss": 0.283, "step": 3315 }, { "epoch": 2.37, "grad_norm": 18.78486008620017, "learning_rate": 3.771392932523691e-06, "loss": 0.2568, "step": 3316 }, { "epoch": 2.37, "grad_norm": 10.359568318591949, "learning_rate": 3.768591598864174e-06, "loss": 0.2939, "step": 3317 }, { "epoch": 2.37, "grad_norm": 31.670155201809273, "learning_rate": 3.765790676641092e-06, "loss": 0.3071, "step": 3318 }, { "epoch": 2.37, "grad_norm": 11.970236514238282, "learning_rate": 3.762990166790286e-06, "loss": 0.2551, "step": 3319 }, { "epoch": 2.37, "grad_norm": 18.047255891257112, "learning_rate": 3.760190070247458e-06, "loss": 0.3247, "step": 3320 }, { "epoch": 2.37, "grad_norm": 10.64156184651028, "learning_rate": 3.7573903879481714e-06, "loss": 0.2834, "step": 3321 }, { "epoch": 2.37, "grad_norm": 6.664506406435355, "learning_rate": 3.754591120827854e-06, "loss": 0.2263, "step": 3322 }, { "epoch": 2.37, "grad_norm": 8.453693666344321, "learning_rate": 3.7517922698217914e-06, "loss": 0.2427, "step": 3323 }, { "epoch": 2.37, "grad_norm": 11.250753271812135, "learning_rate": 3.7489938358651334e-06, "loss": 0.2156, "step": 3324 }, { "epoch": 2.37, "grad_norm": 23.8413429198328, "learning_rate": 3.746195819892885e-06, "loss": 0.3213, "step": 3325 }, { "epoch": 2.37, "grad_norm": 8.338062073228624, "learning_rate": 3.7433982228399205e-06, "loss": 0.1901, "step": 3326 }, { "epoch": 2.37, "grad_norm": 11.241923335507073, "learning_rate": 3.7406010456409648e-06, "loss": 0.3037, "step": 3327 }, { "epoch": 2.38, "grad_norm": 9.63418710168968, "learning_rate": 3.73780428923061e-06, "loss": 0.2549, "step": 3328 }, { "epoch": 2.38, "grad_norm": 10.930254467732022, "learning_rate": 3.7350079545433014e-06, "loss": 0.2166, "step": 3329 }, { "epoch": 2.38, "grad_norm": 6.752872103252343, "learning_rate": 3.7322120425133497e-06, "loss": 0.1606, "step": 3330 }, { "epoch": 2.38, "grad_norm": 10.592309135473164, "learning_rate": 3.729416554074917e-06, "loss": 0.2258, "step": 3331 }, { "epoch": 2.38, "grad_norm": 7.525689690264553, "learning_rate": 3.726621490162033e-06, "loss": 0.1725, "step": 3332 }, { "epoch": 2.38, "grad_norm": 12.915876448357972, "learning_rate": 3.7238268517085773e-06, "loss": 0.353, "step": 3333 }, { "epoch": 2.38, "grad_norm": 14.281531039394093, "learning_rate": 3.7210326396482893e-06, "loss": 0.2805, "step": 3334 }, { "epoch": 2.38, "grad_norm": 12.802973255554388, "learning_rate": 3.718238854914771e-06, "loss": 0.3052, "step": 3335 }, { "epoch": 2.38, "grad_norm": 18.4298425578648, "learning_rate": 3.7154454984414733e-06, "loss": 0.3263, "step": 3336 }, { "epoch": 2.38, "grad_norm": 19.220340865499665, "learning_rate": 3.7126525711617135e-06, "loss": 0.3015, "step": 3337 }, { "epoch": 2.38, "grad_norm": 9.281971521389087, "learning_rate": 3.7098600740086555e-06, "loss": 0.2118, "step": 3338 }, { "epoch": 2.38, "grad_norm": 10.39960104281075, "learning_rate": 3.707068007915329e-06, "loss": 0.2546, "step": 3339 }, { "epoch": 2.38, "grad_norm": 13.371386947285565, "learning_rate": 3.704276373814611e-06, "loss": 0.2737, "step": 3340 }, { "epoch": 2.38, "grad_norm": 11.294716255095457, "learning_rate": 3.7014851726392427e-06, "loss": 0.2411, "step": 3341 }, { "epoch": 2.39, "grad_norm": 10.309078229290204, "learning_rate": 3.6986944053218143e-06, "loss": 0.2798, "step": 3342 }, { "epoch": 2.39, "grad_norm": 15.365685779974543, "learning_rate": 3.69590407279477e-06, "loss": 0.3003, "step": 3343 }, { "epoch": 2.39, "grad_norm": 8.837019788680069, "learning_rate": 3.6931141759904175e-06, "loss": 0.293, "step": 3344 }, { "epoch": 2.39, "grad_norm": 13.925370194635887, "learning_rate": 3.6903247158409077e-06, "loss": 0.2639, "step": 3345 }, { "epoch": 2.39, "grad_norm": 16.855847089664, "learning_rate": 3.687535693278256e-06, "loss": 0.3687, "step": 3346 }, { "epoch": 2.39, "grad_norm": 13.239321422376957, "learning_rate": 3.6847471092343225e-06, "loss": 0.2676, "step": 3347 }, { "epoch": 2.39, "grad_norm": 9.481727189698633, "learning_rate": 3.681958964640828e-06, "loss": 0.2578, "step": 3348 }, { "epoch": 2.39, "grad_norm": 9.534587315413534, "learning_rate": 3.679171260429343e-06, "loss": 0.2925, "step": 3349 }, { "epoch": 2.39, "grad_norm": 18.42058638916999, "learning_rate": 3.676383997531288e-06, "loss": 0.3088, "step": 3350 }, { "epoch": 2.39, "grad_norm": 7.855041948096473, "learning_rate": 3.673597176877944e-06, "loss": 0.2554, "step": 3351 }, { "epoch": 2.39, "grad_norm": 7.898301518405677, "learning_rate": 3.670810799400435e-06, "loss": 0.2297, "step": 3352 }, { "epoch": 2.39, "grad_norm": 11.44162249590785, "learning_rate": 3.668024866029747e-06, "loss": 0.2598, "step": 3353 }, { "epoch": 2.39, "grad_norm": 13.992432735538818, "learning_rate": 3.665239377696706e-06, "loss": 0.2859, "step": 3354 }, { "epoch": 2.39, "grad_norm": 9.802186033426322, "learning_rate": 3.6624543353320006e-06, "loss": 0.254, "step": 3355 }, { "epoch": 2.4, "grad_norm": 14.7282741034783, "learning_rate": 3.659669739866162e-06, "loss": 0.2305, "step": 3356 }, { "epoch": 2.4, "grad_norm": 10.747783422915768, "learning_rate": 3.6568855922295776e-06, "loss": 0.3083, "step": 3357 }, { "epoch": 2.4, "grad_norm": 11.203973860796053, "learning_rate": 3.654101893352482e-06, "loss": 0.2449, "step": 3358 }, { "epoch": 2.4, "grad_norm": 14.381045935118614, "learning_rate": 3.651318644164958e-06, "loss": 0.2786, "step": 3359 }, { "epoch": 2.4, "grad_norm": 10.200099711534078, "learning_rate": 3.6485358455969454e-06, "loss": 0.2385, "step": 3360 }, { "epoch": 2.4, "grad_norm": 8.050190434196447, "learning_rate": 3.645753498578225e-06, "loss": 0.1902, "step": 3361 }, { "epoch": 2.4, "grad_norm": 15.029174110851411, "learning_rate": 3.6429716040384346e-06, "loss": 0.2703, "step": 3362 }, { "epoch": 2.4, "grad_norm": 14.962055596198097, "learning_rate": 3.6401901629070524e-06, "loss": 0.3083, "step": 3363 }, { "epoch": 2.4, "grad_norm": 13.313655103966168, "learning_rate": 3.6374091761134147e-06, "loss": 0.301, "step": 3364 }, { "epoch": 2.4, "grad_norm": 10.61602666368444, "learning_rate": 3.6346286445866953e-06, "loss": 0.1937, "step": 3365 }, { "epoch": 2.4, "grad_norm": 8.310994068495816, "learning_rate": 3.6318485692559263e-06, "loss": 0.2715, "step": 3366 }, { "epoch": 2.4, "grad_norm": 20.739251234096173, "learning_rate": 3.62906895104998e-06, "loss": 0.3269, "step": 3367 }, { "epoch": 2.4, "grad_norm": 20.251420055737725, "learning_rate": 3.6262897908975787e-06, "loss": 0.3164, "step": 3368 }, { "epoch": 2.4, "grad_norm": 12.647741659961826, "learning_rate": 3.6235110897272917e-06, "loss": 0.2031, "step": 3369 }, { "epoch": 2.41, "grad_norm": 10.353959481918734, "learning_rate": 3.620732848467535e-06, "loss": 0.2383, "step": 3370 }, { "epoch": 2.41, "grad_norm": 9.086259437715091, "learning_rate": 3.6179550680465703e-06, "loss": 0.2429, "step": 3371 }, { "epoch": 2.41, "grad_norm": 12.140836475782853, "learning_rate": 3.615177749392506e-06, "loss": 0.25, "step": 3372 }, { "epoch": 2.41, "grad_norm": 12.462306422712425, "learning_rate": 3.6124008934332956e-06, "loss": 0.2981, "step": 3373 }, { "epoch": 2.41, "grad_norm": 15.595663064497298, "learning_rate": 3.609624501096739e-06, "loss": 0.2786, "step": 3374 }, { "epoch": 2.41, "grad_norm": 9.6865300704338, "learning_rate": 3.606848573310479e-06, "loss": 0.2834, "step": 3375 }, { "epoch": 2.41, "grad_norm": 10.459999547360587, "learning_rate": 3.6040731110020065e-06, "loss": 0.252, "step": 3376 }, { "epoch": 2.41, "grad_norm": 17.050438349312042, "learning_rate": 3.6012981150986524e-06, "loss": 0.3784, "step": 3377 }, { "epoch": 2.41, "grad_norm": 7.520037630380769, "learning_rate": 3.598523586527599e-06, "loss": 0.207, "step": 3378 }, { "epoch": 2.41, "grad_norm": 11.356693880055909, "learning_rate": 3.595749526215862e-06, "loss": 0.2615, "step": 3379 }, { "epoch": 2.41, "grad_norm": 9.210506673880975, "learning_rate": 3.5929759350903117e-06, "loss": 0.243, "step": 3380 }, { "epoch": 2.41, "grad_norm": 16.890635842665827, "learning_rate": 3.5902028140776524e-06, "loss": 0.3169, "step": 3381 }, { "epoch": 2.41, "grad_norm": 8.508691081951469, "learning_rate": 3.5874301641044386e-06, "loss": 0.2642, "step": 3382 }, { "epoch": 2.41, "grad_norm": 9.55890525274352, "learning_rate": 3.5846579860970632e-06, "loss": 0.2678, "step": 3383 }, { "epoch": 2.42, "grad_norm": 15.731659464137019, "learning_rate": 3.58188628098176e-06, "loss": 0.3152, "step": 3384 }, { "epoch": 2.42, "grad_norm": 8.571055770883325, "learning_rate": 3.579115049684612e-06, "loss": 0.2434, "step": 3385 }, { "epoch": 2.42, "grad_norm": 11.26780059174239, "learning_rate": 3.576344293131533e-06, "loss": 0.2771, "step": 3386 }, { "epoch": 2.42, "grad_norm": 10.9806207910959, "learning_rate": 3.5735740122482896e-06, "loss": 0.2788, "step": 3387 }, { "epoch": 2.42, "grad_norm": 11.608528508998647, "learning_rate": 3.570804207960481e-06, "loss": 0.3105, "step": 3388 }, { "epoch": 2.42, "grad_norm": 8.976436128368848, "learning_rate": 3.5680348811935527e-06, "loss": 0.2446, "step": 3389 }, { "epoch": 2.42, "grad_norm": 11.725885115817704, "learning_rate": 3.565266032872785e-06, "loss": 0.2861, "step": 3390 }, { "epoch": 2.42, "grad_norm": 13.014407863564019, "learning_rate": 3.5624976639233056e-06, "loss": 0.2568, "step": 3391 }, { "epoch": 2.42, "grad_norm": 9.169543696415937, "learning_rate": 3.559729775270076e-06, "loss": 0.2629, "step": 3392 }, { "epoch": 2.42, "grad_norm": 11.368641787713843, "learning_rate": 3.5569623678378972e-06, "loss": 0.3442, "step": 3393 }, { "epoch": 2.42, "grad_norm": 9.144314587647392, "learning_rate": 3.554195442551416e-06, "loss": 0.2119, "step": 3394 }, { "epoch": 2.42, "grad_norm": 9.282894560665648, "learning_rate": 3.551429000335108e-06, "loss": 0.3357, "step": 3395 }, { "epoch": 2.42, "grad_norm": 7.94605752362725, "learning_rate": 3.5486630421132983e-06, "loss": 0.2141, "step": 3396 }, { "epoch": 2.42, "grad_norm": 10.588223525170012, "learning_rate": 3.5458975688101403e-06, "loss": 0.2935, "step": 3397 }, { "epoch": 2.43, "grad_norm": 10.249548737786958, "learning_rate": 3.5431325813496352e-06, "loss": 0.2644, "step": 3398 }, { "epoch": 2.43, "grad_norm": 16.03872538160447, "learning_rate": 3.540368080655612e-06, "loss": 0.3416, "step": 3399 }, { "epoch": 2.43, "grad_norm": 12.263906287807048, "learning_rate": 3.5376040676517443e-06, "loss": 0.3013, "step": 3400 }, { "epoch": 2.43, "grad_norm": 11.90507660616009, "learning_rate": 3.5348405432615407e-06, "loss": 0.2251, "step": 3401 }, { "epoch": 2.43, "grad_norm": 8.652509111149236, "learning_rate": 3.5320775084083425e-06, "loss": 0.1938, "step": 3402 }, { "epoch": 2.43, "grad_norm": 7.948684175375187, "learning_rate": 3.529314964015336e-06, "loss": 0.2017, "step": 3403 }, { "epoch": 2.43, "grad_norm": 9.81378214693452, "learning_rate": 3.526552911005533e-06, "loss": 0.2417, "step": 3404 }, { "epoch": 2.43, "grad_norm": 16.127745326816424, "learning_rate": 3.523791350301793e-06, "loss": 0.2727, "step": 3405 }, { "epoch": 2.43, "grad_norm": 14.96529610492523, "learning_rate": 3.5210302828267984e-06, "loss": 0.2617, "step": 3406 }, { "epoch": 2.43, "grad_norm": 17.61248036670611, "learning_rate": 3.5182697095030795e-06, "loss": 0.3103, "step": 3407 }, { "epoch": 2.43, "grad_norm": 16.296153280775098, "learning_rate": 3.5155096312529913e-06, "loss": 0.3633, "step": 3408 }, { "epoch": 2.43, "grad_norm": 9.295250617043212, "learning_rate": 3.5127500489987252e-06, "loss": 0.2856, "step": 3409 }, { "epoch": 2.43, "grad_norm": 12.24988189484059, "learning_rate": 3.5099909636623148e-06, "loss": 0.3184, "step": 3410 }, { "epoch": 2.43, "grad_norm": 14.376778291796514, "learning_rate": 3.5072323761656163e-06, "loss": 0.3359, "step": 3411 }, { "epoch": 2.44, "grad_norm": 11.142529765067701, "learning_rate": 3.5044742874303297e-06, "loss": 0.3108, "step": 3412 }, { "epoch": 2.44, "grad_norm": 9.349635726849158, "learning_rate": 3.501716698377979e-06, "loss": 0.2485, "step": 3413 }, { "epoch": 2.44, "grad_norm": 7.768764085615702, "learning_rate": 3.4989596099299306e-06, "loss": 0.2454, "step": 3414 }, { "epoch": 2.44, "grad_norm": 17.424964292649417, "learning_rate": 3.496203023007374e-06, "loss": 0.3284, "step": 3415 }, { "epoch": 2.44, "grad_norm": 15.246943837578039, "learning_rate": 3.4934469385313418e-06, "loss": 0.3223, "step": 3416 }, { "epoch": 2.44, "grad_norm": 21.516172225486876, "learning_rate": 3.490691357422689e-06, "loss": 0.25, "step": 3417 }, { "epoch": 2.44, "grad_norm": 8.56752172697326, "learning_rate": 3.487936280602108e-06, "loss": 0.2329, "step": 3418 }, { "epoch": 2.44, "grad_norm": 9.265980094835774, "learning_rate": 3.4851817089901203e-06, "loss": 0.2244, "step": 3419 }, { "epoch": 2.44, "grad_norm": 8.204349860262266, "learning_rate": 3.4824276435070804e-06, "loss": 0.2239, "step": 3420 }, { "epoch": 2.44, "grad_norm": 9.692570947593712, "learning_rate": 3.4796740850731716e-06, "loss": 0.2324, "step": 3421 }, { "epoch": 2.44, "grad_norm": 12.387294099444116, "learning_rate": 3.47692103460841e-06, "loss": 0.2925, "step": 3422 }, { "epoch": 2.44, "grad_norm": 13.421806378326073, "learning_rate": 3.474168493032641e-06, "loss": 0.3445, "step": 3423 }, { "epoch": 2.44, "grad_norm": 16.231543949083306, "learning_rate": 3.4714164612655387e-06, "loss": 0.3259, "step": 3424 }, { "epoch": 2.44, "grad_norm": 11.791636744953154, "learning_rate": 3.468664940226609e-06, "loss": 0.3198, "step": 3425 }, { "epoch": 2.45, "grad_norm": 10.274282803160158, "learning_rate": 3.4659139308351885e-06, "loss": 0.2417, "step": 3426 }, { "epoch": 2.45, "grad_norm": 9.905001077666203, "learning_rate": 3.4631634340104357e-06, "loss": 0.2465, "step": 3427 }, { "epoch": 2.45, "grad_norm": 12.736134980260223, "learning_rate": 3.460413450671346e-06, "loss": 0.2791, "step": 3428 }, { "epoch": 2.45, "grad_norm": 14.233692284060737, "learning_rate": 3.457663981736739e-06, "loss": 0.4175, "step": 3429 }, { "epoch": 2.45, "grad_norm": 8.855372305832327, "learning_rate": 3.4549150281252635e-06, "loss": 0.2551, "step": 3430 }, { "epoch": 2.45, "grad_norm": 10.398405654424193, "learning_rate": 3.4521665907553957e-06, "loss": 0.1948, "step": 3431 }, { "epoch": 2.45, "grad_norm": 11.49001839122159, "learning_rate": 3.4494186705454402e-06, "loss": 0.2893, "step": 3432 }, { "epoch": 2.45, "grad_norm": 13.738087130527903, "learning_rate": 3.446671268413528e-06, "loss": 0.2937, "step": 3433 }, { "epoch": 2.45, "grad_norm": 11.80364763645785, "learning_rate": 3.443924385277617e-06, "loss": 0.2493, "step": 3434 }, { "epoch": 2.45, "grad_norm": 8.451947488762436, "learning_rate": 3.4411780220554937e-06, "loss": 0.2285, "step": 3435 }, { "epoch": 2.45, "grad_norm": 10.469520240369008, "learning_rate": 3.4384321796647645e-06, "loss": 0.3096, "step": 3436 }, { "epoch": 2.45, "grad_norm": 10.22828595409656, "learning_rate": 3.4356868590228727e-06, "loss": 0.2534, "step": 3437 }, { "epoch": 2.45, "grad_norm": 13.115450507117941, "learning_rate": 3.4329420610470745e-06, "loss": 0.2698, "step": 3438 }, { "epoch": 2.45, "grad_norm": 12.542372576236675, "learning_rate": 3.4301977866544634e-06, "loss": 0.2993, "step": 3439 }, { "epoch": 2.46, "grad_norm": 7.09705606272018, "learning_rate": 3.427454036761948e-06, "loss": 0.2233, "step": 3440 }, { "epoch": 2.46, "grad_norm": 13.02634813937635, "learning_rate": 3.4247108122862703e-06, "loss": 0.2429, "step": 3441 }, { "epoch": 2.46, "grad_norm": 9.935522500990654, "learning_rate": 3.4219681141439907e-06, "loss": 0.2246, "step": 3442 }, { "epoch": 2.46, "grad_norm": 17.667359751130242, "learning_rate": 3.4192259432514934e-06, "loss": 0.2803, "step": 3443 }, { "epoch": 2.46, "grad_norm": 10.039200499360646, "learning_rate": 3.4164843005249928e-06, "loss": 0.2092, "step": 3444 }, { "epoch": 2.46, "grad_norm": 10.756691213045567, "learning_rate": 3.413743186880519e-06, "loss": 0.2317, "step": 3445 }, { "epoch": 2.46, "grad_norm": 14.776664463230171, "learning_rate": 3.4110026032339317e-06, "loss": 0.2922, "step": 3446 }, { "epoch": 2.46, "grad_norm": 15.417900806390653, "learning_rate": 3.408262550500908e-06, "loss": 0.3977, "step": 3447 }, { "epoch": 2.46, "grad_norm": 13.178425128085712, "learning_rate": 3.4055230295969556e-06, "loss": 0.2422, "step": 3448 }, { "epoch": 2.46, "grad_norm": 12.742971833222871, "learning_rate": 3.4027840414373924e-06, "loss": 0.344, "step": 3449 }, { "epoch": 2.46, "grad_norm": 9.435224415388793, "learning_rate": 3.4000455869373716e-06, "loss": 0.2715, "step": 3450 }, { "epoch": 2.46, "grad_norm": 10.634275587173237, "learning_rate": 3.397307667011859e-06, "loss": 0.3154, "step": 3451 }, { "epoch": 2.46, "grad_norm": 15.921317196391337, "learning_rate": 3.394570282575642e-06, "loss": 0.2876, "step": 3452 }, { "epoch": 2.46, "grad_norm": 19.62653176049099, "learning_rate": 3.3918334345433367e-06, "loss": 0.3252, "step": 3453 }, { "epoch": 2.47, "grad_norm": 10.047215418020556, "learning_rate": 3.3890971238293703e-06, "loss": 0.3218, "step": 3454 }, { "epoch": 2.47, "grad_norm": 14.006570126629176, "learning_rate": 3.386361351347999e-06, "loss": 0.2898, "step": 3455 }, { "epoch": 2.47, "grad_norm": 10.7530458486607, "learning_rate": 3.3836261180132914e-06, "loss": 0.2742, "step": 3456 }, { "epoch": 2.47, "grad_norm": 9.313128539846193, "learning_rate": 3.3808914247391437e-06, "loss": 0.2656, "step": 3457 }, { "epoch": 2.47, "grad_norm": 10.2334413075271, "learning_rate": 3.3781572724392642e-06, "loss": 0.2427, "step": 3458 }, { "epoch": 2.47, "grad_norm": 9.268830421432604, "learning_rate": 3.3754236620271876e-06, "loss": 0.2834, "step": 3459 }, { "epoch": 2.47, "grad_norm": 7.645365717660876, "learning_rate": 3.3726905944162615e-06, "loss": 0.2603, "step": 3460 }, { "epoch": 2.47, "grad_norm": 13.475867556089085, "learning_rate": 3.3699580705196527e-06, "loss": 0.271, "step": 3461 }, { "epoch": 2.47, "grad_norm": 9.891164807765543, "learning_rate": 3.367226091250353e-06, "loss": 0.2837, "step": 3462 }, { "epoch": 2.47, "grad_norm": 10.147315000935372, "learning_rate": 3.3644946575211634e-06, "loss": 0.2432, "step": 3463 }, { "epoch": 2.47, "grad_norm": 14.722101869142618, "learning_rate": 3.36176377024471e-06, "loss": 0.29, "step": 3464 }, { "epoch": 2.47, "grad_norm": 11.016997359020882, "learning_rate": 3.3590334303334293e-06, "loss": 0.3162, "step": 3465 }, { "epoch": 2.47, "grad_norm": 13.714073886046096, "learning_rate": 3.356303638699583e-06, "loss": 0.302, "step": 3466 }, { "epoch": 2.47, "grad_norm": 19.31122867211997, "learning_rate": 3.35357439625524e-06, "loss": 0.2225, "step": 3467 }, { "epoch": 2.48, "grad_norm": 12.443164765186587, "learning_rate": 3.3508457039122965e-06, "loss": 0.3494, "step": 3468 }, { "epoch": 2.48, "grad_norm": 16.264537881834457, "learning_rate": 3.348117562582457e-06, "loss": 0.3677, "step": 3469 }, { "epoch": 2.48, "grad_norm": 11.05702734122788, "learning_rate": 3.345389973177241e-06, "loss": 0.2539, "step": 3470 }, { "epoch": 2.48, "grad_norm": 12.800434645157752, "learning_rate": 3.342662936607992e-06, "loss": 0.261, "step": 3471 }, { "epoch": 2.48, "grad_norm": 14.088140859929942, "learning_rate": 3.3399364537858594e-06, "loss": 0.2424, "step": 3472 }, { "epoch": 2.48, "grad_norm": 10.233296365055466, "learning_rate": 3.3372105256218153e-06, "loss": 0.3066, "step": 3473 }, { "epoch": 2.48, "grad_norm": 7.936594581867434, "learning_rate": 3.334485153026639e-06, "loss": 0.2, "step": 3474 }, { "epoch": 2.48, "grad_norm": 19.685217490083236, "learning_rate": 3.3317603369109332e-06, "loss": 0.2756, "step": 3475 }, { "epoch": 2.48, "grad_norm": 12.901083967072752, "learning_rate": 3.3290360781851055e-06, "loss": 0.2666, "step": 3476 }, { "epoch": 2.48, "grad_norm": 9.973876232975487, "learning_rate": 3.326312377759383e-06, "loss": 0.2457, "step": 3477 }, { "epoch": 2.48, "grad_norm": 15.582000889219513, "learning_rate": 3.3235892365438038e-06, "loss": 0.2554, "step": 3478 }, { "epoch": 2.48, "grad_norm": 8.7223189804276, "learning_rate": 3.3208666554482216e-06, "loss": 0.2821, "step": 3479 }, { "epoch": 2.48, "grad_norm": 8.830264112686557, "learning_rate": 3.3181446353822997e-06, "loss": 0.2622, "step": 3480 }, { "epoch": 2.48, "grad_norm": 16.7891982910683, "learning_rate": 3.315423177255516e-06, "loss": 0.3813, "step": 3481 }, { "epoch": 2.49, "grad_norm": 11.65036520970705, "learning_rate": 3.312702281977161e-06, "loss": 0.2422, "step": 3482 }, { "epoch": 2.49, "grad_norm": 18.934632301351446, "learning_rate": 3.3099819504563356e-06, "loss": 0.2981, "step": 3483 }, { "epoch": 2.49, "grad_norm": 12.371036590772636, "learning_rate": 3.3072621836019535e-06, "loss": 0.2908, "step": 3484 }, { "epoch": 2.49, "grad_norm": 6.64869466776249, "learning_rate": 3.3045429823227405e-06, "loss": 0.201, "step": 3485 }, { "epoch": 2.49, "grad_norm": 9.730727432132026, "learning_rate": 3.3018243475272282e-06, "loss": 0.2419, "step": 3486 }, { "epoch": 2.49, "grad_norm": 11.465970571860945, "learning_rate": 3.2991062801237683e-06, "loss": 0.2417, "step": 3487 }, { "epoch": 2.49, "grad_norm": 9.95581933840102, "learning_rate": 3.296388781020513e-06, "loss": 0.2732, "step": 3488 }, { "epoch": 2.49, "grad_norm": 17.72636391101251, "learning_rate": 3.293671851125434e-06, "loss": 0.3291, "step": 3489 }, { "epoch": 2.49, "grad_norm": 11.831655869700418, "learning_rate": 3.2909554913463034e-06, "loss": 0.2332, "step": 3490 }, { "epoch": 2.49, "grad_norm": 11.681447470044946, "learning_rate": 3.2882397025907114e-06, "loss": 0.3584, "step": 3491 }, { "epoch": 2.49, "grad_norm": 8.474346090221069, "learning_rate": 3.2855244857660497e-06, "loss": 0.2732, "step": 3492 }, { "epoch": 2.49, "grad_norm": 6.735075386972751, "learning_rate": 3.2828098417795267e-06, "loss": 0.2156, "step": 3493 }, { "epoch": 2.49, "grad_norm": 14.010717824928776, "learning_rate": 3.2800957715381537e-06, "loss": 0.3191, "step": 3494 }, { "epoch": 2.49, "grad_norm": 12.239704900786217, "learning_rate": 3.2773822759487497e-06, "loss": 0.2515, "step": 3495 }, { "epoch": 2.5, "grad_norm": 8.594755050704686, "learning_rate": 3.2746693559179483e-06, "loss": 0.2563, "step": 3496 }, { "epoch": 2.5, "grad_norm": 13.933295020461518, "learning_rate": 3.2719570123521816e-06, "loss": 0.3687, "step": 3497 }, { "epoch": 2.5, "grad_norm": 10.270240218891797, "learning_rate": 3.2692452461576997e-06, "loss": 0.2876, "step": 3498 }, { "epoch": 2.5, "grad_norm": 15.502834749242435, "learning_rate": 3.266534058240548e-06, "loss": 0.2979, "step": 3499 }, { "epoch": 2.5, "grad_norm": 9.296849614190464, "learning_rate": 3.2638234495065903e-06, "loss": 0.2004, "step": 3500 }, { "epoch": 2.5, "eval_avg_AUC": 0.7877963736079466, "eval_avg_Accuracy": 0.6891992705570292, "eval_avg_Accuracy-right": 0.9025694535020217, "eval_avg_Accuracy-wrong": 0.3171480554923812, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6606969481112277, "eval_last_AUC": 0.8078098190488144, "eval_last_Accuracy": 0.7378149867374005, "eval_last_Accuracy-right": 0.852028172688144, "eval_last_Accuracy-wrong": 0.5386627245849442, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6786984038893772, "eval_max_AUC": 0.7524562987575951, "eval_max_Accuracy": 0.6421999336870027, "eval_max_Accuracy-right": 0.981805138907004, "eval_max_Accuracy-wrong": 0.05003411416875142, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6149865113803138, "eval_min_AUC": 0.7926767012641479, "eval_min_Accuracy": 0.7277022546419099, "eval_min_Accuracy-right": 0.7812051649928264, "eval_min_Accuracy-wrong": 0.6344098248806004, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6649279504885879, "eval_prod_AUC": 0.794738394115366, "eval_prod_Accuracy": 0.7094247347480106, "eval_prod_Accuracy-right": 0.6642754662840746, "eval_prod_Accuracy-wrong": 0.788151012053673, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.660230571971425, "eval_runtime": 246.6392, "eval_samples_per_second": 97.827, "eval_steps_per_second": 3.057, "eval_sum_AUC": 0.6402790236735809, "eval_sum_Accuracy": 0.6413710212201591, "eval_sum_Accuracy-right": 0.996869701317334, "eval_sum_Accuracy-wrong": 0.0214919263133955, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6389305505739054, "step": 3500 }, { "epoch": 2.5, "grad_norm": 13.068769827644681, "learning_rate": 3.261113420861487e-06, "loss": 0.3, "step": 3501 }, { "epoch": 2.5, "grad_norm": 9.556338925829841, "learning_rate": 3.258403973210713e-06, "loss": 0.2725, "step": 3502 }, { "epoch": 2.5, "grad_norm": 8.173921559638918, "learning_rate": 3.2556951074595435e-06, "loss": 0.2256, "step": 3503 }, { "epoch": 2.5, "grad_norm": 13.308116193516451, "learning_rate": 3.2529868245130577e-06, "loss": 0.3523, "step": 3504 }, { "epoch": 2.5, "grad_norm": 8.483362795187777, "learning_rate": 3.250279125276148e-06, "loss": 0.1975, "step": 3505 }, { "epoch": 2.5, "grad_norm": 10.818400765853749, "learning_rate": 3.2475720106535036e-06, "loss": 0.2515, "step": 3506 }, { "epoch": 2.5, "grad_norm": 9.579502198721391, "learning_rate": 3.244865481549625e-06, "loss": 0.2383, "step": 3507 }, { "epoch": 2.5, "grad_norm": 14.595647574214624, "learning_rate": 3.24215953886881e-06, "loss": 0.3413, "step": 3508 }, { "epoch": 2.5, "grad_norm": 8.566942539380886, "learning_rate": 3.2394541835151692e-06, "loss": 0.2266, "step": 3509 }, { "epoch": 2.51, "grad_norm": 11.679457066390924, "learning_rate": 3.2367494163926095e-06, "loss": 0.3003, "step": 3510 }, { "epoch": 2.51, "grad_norm": 12.640679927317136, "learning_rate": 3.234045238404841e-06, "loss": 0.3547, "step": 3511 }, { "epoch": 2.51, "grad_norm": 12.18961731562165, "learning_rate": 3.2313416504553852e-06, "loss": 0.3152, "step": 3512 }, { "epoch": 2.51, "grad_norm": 10.22642226732791, "learning_rate": 3.2286386534475568e-06, "loss": 0.2302, "step": 3513 }, { "epoch": 2.51, "grad_norm": 8.438253560959739, "learning_rate": 3.2259362482844803e-06, "loss": 0.2563, "step": 3514 }, { "epoch": 2.51, "grad_norm": 14.27856156365522, "learning_rate": 3.2232344358690765e-06, "loss": 0.2539, "step": 3515 }, { "epoch": 2.51, "grad_norm": 11.93630832767274, "learning_rate": 3.220533217104075e-06, "loss": 0.301, "step": 3516 }, { "epoch": 2.51, "grad_norm": 11.027334258786563, "learning_rate": 3.217832592891999e-06, "loss": 0.2263, "step": 3517 }, { "epoch": 2.51, "grad_norm": 10.754441023455815, "learning_rate": 3.2151325641351817e-06, "loss": 0.2654, "step": 3518 }, { "epoch": 2.51, "grad_norm": 10.74013272575728, "learning_rate": 3.2124331317357506e-06, "loss": 0.2236, "step": 3519 }, { "epoch": 2.51, "grad_norm": 13.383946788535154, "learning_rate": 3.2097342965956334e-06, "loss": 0.2791, "step": 3520 }, { "epoch": 2.51, "grad_norm": 11.32091809219386, "learning_rate": 3.2070360596165667e-06, "loss": 0.2312, "step": 3521 }, { "epoch": 2.51, "grad_norm": 21.730483344555825, "learning_rate": 3.204338421700076e-06, "loss": 0.3027, "step": 3522 }, { "epoch": 2.51, "grad_norm": 16.68177293630166, "learning_rate": 3.201641383747498e-06, "loss": 0.3059, "step": 3523 }, { "epoch": 2.52, "grad_norm": 10.746578325895099, "learning_rate": 3.1989449466599574e-06, "loss": 0.3159, "step": 3524 }, { "epoch": 2.52, "grad_norm": 14.046874777971698, "learning_rate": 3.1962491113383896e-06, "loss": 0.3032, "step": 3525 }, { "epoch": 2.52, "grad_norm": 9.701317007452005, "learning_rate": 3.1935538786835183e-06, "loss": 0.2761, "step": 3526 }, { "epoch": 2.52, "grad_norm": 12.573694027775247, "learning_rate": 3.1908592495958747e-06, "loss": 0.2598, "step": 3527 }, { "epoch": 2.52, "grad_norm": 9.454753496917233, "learning_rate": 3.1881652249757823e-06, "loss": 0.2227, "step": 3528 }, { "epoch": 2.52, "grad_norm": 8.312287431795568, "learning_rate": 3.185471805723365e-06, "loss": 0.2664, "step": 3529 }, { "epoch": 2.52, "grad_norm": 10.584289964239222, "learning_rate": 3.1827789927385444e-06, "loss": 0.2786, "step": 3530 }, { "epoch": 2.52, "grad_norm": 9.770661082389356, "learning_rate": 3.18008678692104e-06, "loss": 0.2793, "step": 3531 }, { "epoch": 2.52, "grad_norm": 11.506383377691748, "learning_rate": 3.1773951891703668e-06, "loss": 0.2068, "step": 3532 }, { "epoch": 2.52, "grad_norm": 10.113353081336689, "learning_rate": 3.1747042003858386e-06, "loss": 0.2349, "step": 3533 }, { "epoch": 2.52, "grad_norm": 8.615187115741376, "learning_rate": 3.1720138214665643e-06, "loss": 0.2405, "step": 3534 }, { "epoch": 2.52, "grad_norm": 20.977315544926444, "learning_rate": 3.1693240533114496e-06, "loss": 0.4512, "step": 3535 }, { "epoch": 2.52, "grad_norm": 14.063179683264558, "learning_rate": 3.1666348968191955e-06, "loss": 0.2495, "step": 3536 }, { "epoch": 2.52, "grad_norm": 8.892504016191015, "learning_rate": 3.1639463528883007e-06, "loss": 0.262, "step": 3537 }, { "epoch": 2.53, "grad_norm": 9.55789315186977, "learning_rate": 3.161258422417055e-06, "loss": 0.2847, "step": 3538 }, { "epoch": 2.53, "grad_norm": 11.550961610557891, "learning_rate": 3.1585711063035496e-06, "loss": 0.2666, "step": 3539 }, { "epoch": 2.53, "grad_norm": 11.899767453488833, "learning_rate": 3.155884405445663e-06, "loss": 0.2334, "step": 3540 }, { "epoch": 2.53, "grad_norm": 8.23350483466534, "learning_rate": 3.153198320741074e-06, "loss": 0.2043, "step": 3541 }, { "epoch": 2.53, "grad_norm": 13.410011852557782, "learning_rate": 3.150512853087253e-06, "loss": 0.325, "step": 3542 }, { "epoch": 2.53, "grad_norm": 8.11365917752667, "learning_rate": 3.1478280033814657e-06, "loss": 0.2322, "step": 3543 }, { "epoch": 2.53, "grad_norm": 9.164175984205736, "learning_rate": 3.14514377252077e-06, "loss": 0.23, "step": 3544 }, { "epoch": 2.53, "grad_norm": 12.426890476211831, "learning_rate": 3.142460161402014e-06, "loss": 0.2512, "step": 3545 }, { "epoch": 2.53, "grad_norm": 12.904224322008433, "learning_rate": 3.139777170921847e-06, "loss": 0.3203, "step": 3546 }, { "epoch": 2.53, "grad_norm": 9.782016167639494, "learning_rate": 3.137094801976701e-06, "loss": 0.2834, "step": 3547 }, { "epoch": 2.53, "grad_norm": 11.154378436965436, "learning_rate": 3.1344130554628104e-06, "loss": 0.2375, "step": 3548 }, { "epoch": 2.53, "grad_norm": 13.41557779364289, "learning_rate": 3.131731932276193e-06, "loss": 0.2896, "step": 3549 }, { "epoch": 2.53, "grad_norm": 17.215143879601154, "learning_rate": 3.129051433312664e-06, "loss": 0.3809, "step": 3550 }, { "epoch": 2.53, "grad_norm": 13.634755228372835, "learning_rate": 3.1263715594678257e-06, "loss": 0.3027, "step": 3551 }, { "epoch": 2.54, "grad_norm": 13.49266302111991, "learning_rate": 3.1236923116370764e-06, "loss": 0.2471, "step": 3552 }, { "epoch": 2.54, "grad_norm": 9.359540843720524, "learning_rate": 3.121013690715601e-06, "loss": 0.1829, "step": 3553 }, { "epoch": 2.54, "grad_norm": 8.828670124176265, "learning_rate": 3.118335697598376e-06, "loss": 0.2185, "step": 3554 }, { "epoch": 2.54, "grad_norm": 12.78806333330499, "learning_rate": 3.1156583331801703e-06, "loss": 0.2986, "step": 3555 }, { "epoch": 2.54, "grad_norm": 8.261812795713473, "learning_rate": 3.1129815983555387e-06, "loss": 0.2212, "step": 3556 }, { "epoch": 2.54, "grad_norm": 27.50178705776172, "learning_rate": 3.1103054940188316e-06, "loss": 0.3633, "step": 3557 }, { "epoch": 2.54, "grad_norm": 13.276709743000696, "learning_rate": 3.1076300210641814e-06, "loss": 0.2769, "step": 3558 }, { "epoch": 2.54, "grad_norm": 9.091825282135263, "learning_rate": 3.1049551803855173e-06, "loss": 0.261, "step": 3559 }, { "epoch": 2.54, "grad_norm": 9.497532460353776, "learning_rate": 3.1022809728765486e-06, "loss": 0.2812, "step": 3560 }, { "epoch": 2.54, "grad_norm": 14.352342983807455, "learning_rate": 3.0996073994307825e-06, "loss": 0.2544, "step": 3561 }, { "epoch": 2.54, "grad_norm": 10.47067050690066, "learning_rate": 3.0969344609415076e-06, "loss": 0.2544, "step": 3562 }, { "epoch": 2.54, "grad_norm": 13.14407357274232, "learning_rate": 3.0942621583017994e-06, "loss": 0.2639, "step": 3563 }, { "epoch": 2.54, "grad_norm": 12.376995841990846, "learning_rate": 3.0915904924045294e-06, "loss": 0.2681, "step": 3564 }, { "epoch": 2.54, "grad_norm": 8.615971289854569, "learning_rate": 3.088919464142346e-06, "loss": 0.2559, "step": 3565 }, { "epoch": 2.55, "grad_norm": 13.374764637160622, "learning_rate": 3.0862490744076928e-06, "loss": 0.3003, "step": 3566 }, { "epoch": 2.55, "grad_norm": 12.754057552940706, "learning_rate": 3.0835793240927937e-06, "loss": 0.3311, "step": 3567 }, { "epoch": 2.55, "grad_norm": 8.835687713457723, "learning_rate": 3.0809102140896652e-06, "loss": 0.2524, "step": 3568 }, { "epoch": 2.55, "grad_norm": 17.100356007256956, "learning_rate": 3.078241745290103e-06, "loss": 0.3794, "step": 3569 }, { "epoch": 2.55, "grad_norm": 13.290591029206796, "learning_rate": 3.075573918585696e-06, "loss": 0.2791, "step": 3570 }, { "epoch": 2.55, "grad_norm": 7.028395777757924, "learning_rate": 3.0729067348678127e-06, "loss": 0.1995, "step": 3571 }, { "epoch": 2.55, "grad_norm": 7.641816702608815, "learning_rate": 3.0702401950276066e-06, "loss": 0.1987, "step": 3572 }, { "epoch": 2.55, "grad_norm": 10.246342226496322, "learning_rate": 3.067574299956022e-06, "loss": 0.2441, "step": 3573 }, { "epoch": 2.55, "grad_norm": 11.844832056141563, "learning_rate": 3.0649090505437804e-06, "loss": 0.2976, "step": 3574 }, { "epoch": 2.55, "grad_norm": 12.680252229465067, "learning_rate": 3.062244447681396e-06, "loss": 0.3022, "step": 3575 }, { "epoch": 2.55, "grad_norm": 8.665615703923262, "learning_rate": 3.0595804922591564e-06, "loss": 0.2463, "step": 3576 }, { "epoch": 2.55, "grad_norm": 10.440754388319283, "learning_rate": 3.0569171851671436e-06, "loss": 0.2668, "step": 3577 }, { "epoch": 2.55, "grad_norm": 11.905524709518772, "learning_rate": 3.054254527295215e-06, "loss": 0.3701, "step": 3578 }, { "epoch": 2.55, "grad_norm": 9.807086250686762, "learning_rate": 3.0515925195330148e-06, "loss": 0.2522, "step": 3579 }, { "epoch": 2.56, "grad_norm": 10.476465301590263, "learning_rate": 3.048931162769969e-06, "loss": 0.2666, "step": 3580 }, { "epoch": 2.56, "grad_norm": 11.107929057524471, "learning_rate": 3.0462704578952874e-06, "loss": 0.2861, "step": 3581 }, { "epoch": 2.56, "grad_norm": 21.986391423429573, "learning_rate": 3.0436104057979604e-06, "loss": 0.2964, "step": 3582 }, { "epoch": 2.56, "grad_norm": 12.90626976949449, "learning_rate": 3.0409510073667602e-06, "loss": 0.2129, "step": 3583 }, { "epoch": 2.56, "grad_norm": 13.164803117431044, "learning_rate": 3.038292263490242e-06, "loss": 0.3071, "step": 3584 }, { "epoch": 2.56, "grad_norm": 15.092854006583256, "learning_rate": 3.035634175056742e-06, "loss": 0.3521, "step": 3585 }, { "epoch": 2.56, "grad_norm": 14.609423145742273, "learning_rate": 3.0329767429543767e-06, "loss": 0.2844, "step": 3586 }, { "epoch": 2.56, "grad_norm": 9.691645964790691, "learning_rate": 3.030319968071043e-06, "loss": 0.2329, "step": 3587 }, { "epoch": 2.56, "grad_norm": 7.81284676046446, "learning_rate": 3.0276638512944177e-06, "loss": 0.2225, "step": 3588 }, { "epoch": 2.56, "grad_norm": 12.769037847281707, "learning_rate": 3.025008393511961e-06, "loss": 0.3105, "step": 3589 }, { "epoch": 2.56, "grad_norm": 15.295061680310575, "learning_rate": 3.022353595610909e-06, "loss": 0.3152, "step": 3590 }, { "epoch": 2.56, "grad_norm": 11.698927211936732, "learning_rate": 3.01969945847828e-06, "loss": 0.293, "step": 3591 }, { "epoch": 2.56, "grad_norm": 11.609293397061075, "learning_rate": 3.017045983000871e-06, "loss": 0.2698, "step": 3592 }, { "epoch": 2.56, "grad_norm": 10.157130344506225, "learning_rate": 3.014393170065256e-06, "loss": 0.312, "step": 3593 }, { "epoch": 2.57, "grad_norm": 13.898830020128793, "learning_rate": 3.0117410205577903e-06, "loss": 0.2737, "step": 3594 }, { "epoch": 2.57, "grad_norm": 7.558372901547542, "learning_rate": 3.0090895353646053e-06, "loss": 0.2512, "step": 3595 }, { "epoch": 2.57, "grad_norm": 8.583363620015973, "learning_rate": 3.006438715371614e-06, "loss": 0.2542, "step": 3596 }, { "epoch": 2.57, "grad_norm": 8.806692519344198, "learning_rate": 3.0037885614645e-06, "loss": 0.2227, "step": 3597 }, { "epoch": 2.57, "grad_norm": 10.779965721273376, "learning_rate": 3.001139074528735e-06, "loss": 0.205, "step": 3598 }, { "epoch": 2.57, "grad_norm": 9.531815564612236, "learning_rate": 2.9984902554495556e-06, "loss": 0.2676, "step": 3599 }, { "epoch": 2.57, "grad_norm": 18.506453906728485, "learning_rate": 2.995842105111987e-06, "loss": 0.2793, "step": 3600 }, { "epoch": 2.57, "grad_norm": 11.15394731501427, "learning_rate": 2.99319462440082e-06, "loss": 0.2083, "step": 3601 }, { "epoch": 2.57, "grad_norm": 10.932997861337693, "learning_rate": 2.990547814200633e-06, "loss": 0.2432, "step": 3602 }, { "epoch": 2.57, "grad_norm": 14.849576244908592, "learning_rate": 2.987901675395771e-06, "loss": 0.2454, "step": 3603 }, { "epoch": 2.57, "grad_norm": 14.12584654833872, "learning_rate": 2.985256208870357e-06, "loss": 0.3494, "step": 3604 }, { "epoch": 2.57, "grad_norm": 10.767528797117537, "learning_rate": 2.982611415508294e-06, "loss": 0.2358, "step": 3605 }, { "epoch": 2.57, "grad_norm": 11.280128754931008, "learning_rate": 2.9799672961932525e-06, "loss": 0.2463, "step": 3606 }, { "epoch": 2.57, "grad_norm": 8.9328456698097, "learning_rate": 2.9773238518086866e-06, "loss": 0.176, "step": 3607 }, { "epoch": 2.58, "grad_norm": 9.21994488542098, "learning_rate": 2.974681083237816e-06, "loss": 0.262, "step": 3608 }, { "epoch": 2.58, "grad_norm": 10.967970336760006, "learning_rate": 2.972038991363643e-06, "loss": 0.2529, "step": 3609 }, { "epoch": 2.58, "grad_norm": 12.528992859617613, "learning_rate": 2.9693975770689344e-06, "loss": 0.2549, "step": 3610 }, { "epoch": 2.58, "grad_norm": 14.244301431577744, "learning_rate": 2.9667568412362415e-06, "loss": 0.2222, "step": 3611 }, { "epoch": 2.58, "grad_norm": 16.112330292266513, "learning_rate": 2.9641167847478797e-06, "loss": 0.2761, "step": 3612 }, { "epoch": 2.58, "grad_norm": 14.774287510401793, "learning_rate": 2.96147740848594e-06, "loss": 0.3308, "step": 3613 }, { "epoch": 2.58, "grad_norm": 14.048586990552954, "learning_rate": 2.9588387133322903e-06, "loss": 0.2192, "step": 3614 }, { "epoch": 2.58, "grad_norm": 13.648117792562253, "learning_rate": 2.9562007001685644e-06, "loss": 0.2556, "step": 3615 }, { "epoch": 2.58, "grad_norm": 12.586600872006583, "learning_rate": 2.9535633698761755e-06, "loss": 0.2891, "step": 3616 }, { "epoch": 2.58, "grad_norm": 14.723776113704208, "learning_rate": 2.9509267233363005e-06, "loss": 0.3936, "step": 3617 }, { "epoch": 2.58, "grad_norm": 8.347490053483744, "learning_rate": 2.948290761429895e-06, "loss": 0.2351, "step": 3618 }, { "epoch": 2.58, "grad_norm": 10.14243876444085, "learning_rate": 2.9456554850376805e-06, "loss": 0.2601, "step": 3619 }, { "epoch": 2.58, "grad_norm": 11.440692510978845, "learning_rate": 2.943020895040155e-06, "loss": 0.25, "step": 3620 }, { "epoch": 2.58, "grad_norm": 29.257448998416535, "learning_rate": 2.940386992317582e-06, "loss": 0.4346, "step": 3621 }, { "epoch": 2.59, "grad_norm": 10.111642278946286, "learning_rate": 2.937753777749996e-06, "loss": 0.2034, "step": 3622 }, { "epoch": 2.59, "grad_norm": 14.479860863761866, "learning_rate": 2.9351212522172056e-06, "loss": 0.3098, "step": 3623 }, { "epoch": 2.59, "grad_norm": 8.382932164941954, "learning_rate": 2.9324894165987837e-06, "loss": 0.2429, "step": 3624 }, { "epoch": 2.59, "grad_norm": 9.11158565572809, "learning_rate": 2.9298582717740797e-06, "loss": 0.2952, "step": 3625 }, { "epoch": 2.59, "grad_norm": 8.281584890559799, "learning_rate": 2.9272278186222025e-06, "loss": 0.2167, "step": 3626 }, { "epoch": 2.59, "grad_norm": 13.801083924218155, "learning_rate": 2.9245980580220405e-06, "loss": 0.2754, "step": 3627 }, { "epoch": 2.59, "grad_norm": 11.920589259305089, "learning_rate": 2.921968990852242e-06, "loss": 0.2957, "step": 3628 }, { "epoch": 2.59, "grad_norm": 8.695498611228306, "learning_rate": 2.9193406179912297e-06, "loss": 0.252, "step": 3629 }, { "epoch": 2.59, "grad_norm": 11.833770314366229, "learning_rate": 2.91671294031719e-06, "loss": 0.2573, "step": 3630 }, { "epoch": 2.59, "grad_norm": 10.9105743148522, "learning_rate": 2.91408595870808e-06, "loss": 0.2749, "step": 3631 }, { "epoch": 2.59, "grad_norm": 9.022981742578796, "learning_rate": 2.9114596740416224e-06, "loss": 0.2517, "step": 3632 }, { "epoch": 2.59, "grad_norm": 10.157774956816326, "learning_rate": 2.908834087195308e-06, "loss": 0.3579, "step": 3633 }, { "epoch": 2.59, "grad_norm": 14.441924468999797, "learning_rate": 2.9062091990463935e-06, "loss": 0.3257, "step": 3634 }, { "epoch": 2.59, "grad_norm": 14.911170610783156, "learning_rate": 2.903585010471904e-06, "loss": 0.3979, "step": 3635 }, { "epoch": 2.6, "grad_norm": 11.658127870248308, "learning_rate": 2.9009615223486297e-06, "loss": 0.3418, "step": 3636 }, { "epoch": 2.6, "grad_norm": 10.556064566768066, "learning_rate": 2.898338735553128e-06, "loss": 0.2759, "step": 3637 }, { "epoch": 2.6, "grad_norm": 10.750443370130997, "learning_rate": 2.895716650961714e-06, "loss": 0.3328, "step": 3638 }, { "epoch": 2.6, "grad_norm": 13.404219192176061, "learning_rate": 2.8930952694504843e-06, "loss": 0.3159, "step": 3639 }, { "epoch": 2.6, "grad_norm": 8.801417766512998, "learning_rate": 2.8904745918952833e-06, "loss": 0.24, "step": 3640 }, { "epoch": 2.6, "grad_norm": 10.893767277763509, "learning_rate": 2.887854619171735e-06, "loss": 0.2925, "step": 3641 }, { "epoch": 2.6, "grad_norm": 8.238072093150642, "learning_rate": 2.8852353521552135e-06, "loss": 0.2283, "step": 3642 }, { "epoch": 2.6, "grad_norm": 12.888970588309743, "learning_rate": 2.8826167917208727e-06, "loss": 0.2354, "step": 3643 }, { "epoch": 2.6, "grad_norm": 11.876703842177745, "learning_rate": 2.8799989387436137e-06, "loss": 0.2683, "step": 3644 }, { "epoch": 2.6, "grad_norm": 8.14852428141835, "learning_rate": 2.8773817940981186e-06, "loss": 0.2678, "step": 3645 }, { "epoch": 2.6, "grad_norm": 10.672569274403406, "learning_rate": 2.8747653586588183e-06, "loss": 0.3386, "step": 3646 }, { "epoch": 2.6, "grad_norm": 11.871670166521756, "learning_rate": 2.872149633299913e-06, "loss": 0.2263, "step": 3647 }, { "epoch": 2.6, "grad_norm": 12.647933339436156, "learning_rate": 2.8695346188953666e-06, "loss": 0.2524, "step": 3648 }, { "epoch": 2.6, "grad_norm": 11.424868547184808, "learning_rate": 2.866920316318904e-06, "loss": 0.3276, "step": 3649 }, { "epoch": 2.61, "grad_norm": 12.588799181559985, "learning_rate": 2.8643067264440116e-06, "loss": 0.3127, "step": 3650 }, { "epoch": 2.61, "grad_norm": 12.677503962337894, "learning_rate": 2.8616938501439384e-06, "loss": 0.2363, "step": 3651 }, { "epoch": 2.61, "grad_norm": 14.00332101941201, "learning_rate": 2.8590816882916948e-06, "loss": 0.2627, "step": 3652 }, { "epoch": 2.61, "grad_norm": 13.14346918330707, "learning_rate": 2.856470241760054e-06, "loss": 0.3857, "step": 3653 }, { "epoch": 2.61, "grad_norm": 13.979054378804332, "learning_rate": 2.8538595114215472e-06, "loss": 0.2969, "step": 3654 }, { "epoch": 2.61, "grad_norm": 10.350457278452481, "learning_rate": 2.8512494981484706e-06, "loss": 0.2629, "step": 3655 }, { "epoch": 2.61, "grad_norm": 13.518490136692607, "learning_rate": 2.848640202812872e-06, "loss": 0.2688, "step": 3656 }, { "epoch": 2.61, "grad_norm": 11.77830597016325, "learning_rate": 2.846031626286574e-06, "loss": 0.2463, "step": 3657 }, { "epoch": 2.61, "grad_norm": 13.232483324704306, "learning_rate": 2.8434237694411414e-06, "loss": 0.2715, "step": 3658 }, { "epoch": 2.61, "grad_norm": 17.486679322201393, "learning_rate": 2.840816633147917e-06, "loss": 0.335, "step": 3659 }, { "epoch": 2.61, "grad_norm": 7.147692850153896, "learning_rate": 2.8382102182779846e-06, "loss": 0.1785, "step": 3660 }, { "epoch": 2.61, "grad_norm": 9.423807909050868, "learning_rate": 2.8356045257022037e-06, "loss": 0.2021, "step": 3661 }, { "epoch": 2.61, "grad_norm": 9.72309990205576, "learning_rate": 2.832999556291177e-06, "loss": 0.2351, "step": 3662 }, { "epoch": 2.61, "grad_norm": 14.42597066218875, "learning_rate": 2.8303953109152815e-06, "loss": 0.3379, "step": 3663 }, { "epoch": 2.62, "grad_norm": 10.89552475555432, "learning_rate": 2.827791790444638e-06, "loss": 0.2471, "step": 3664 }, { "epoch": 2.62, "grad_norm": 21.378000080961417, "learning_rate": 2.8251889957491317e-06, "loss": 0.4006, "step": 3665 }, { "epoch": 2.62, "grad_norm": 10.967603756260177, "learning_rate": 2.822586927698407e-06, "loss": 0.2324, "step": 3666 }, { "epoch": 2.62, "grad_norm": 12.808368358224756, "learning_rate": 2.819985587161861e-06, "loss": 0.2229, "step": 3667 }, { "epoch": 2.62, "grad_norm": 20.91800691344456, "learning_rate": 2.8173849750086513e-06, "loss": 0.3875, "step": 3668 }, { "epoch": 2.62, "grad_norm": 19.77994963136096, "learning_rate": 2.8147850921076903e-06, "loss": 0.2908, "step": 3669 }, { "epoch": 2.62, "grad_norm": 9.899888131646092, "learning_rate": 2.8121859393276475e-06, "loss": 0.2932, "step": 3670 }, { "epoch": 2.62, "grad_norm": 13.500226797134557, "learning_rate": 2.809587517536947e-06, "loss": 0.2898, "step": 3671 }, { "epoch": 2.62, "grad_norm": 13.24302333969292, "learning_rate": 2.806989827603771e-06, "loss": 0.2646, "step": 3672 }, { "epoch": 2.62, "grad_norm": 9.78450213872708, "learning_rate": 2.8043928703960565e-06, "loss": 0.2385, "step": 3673 }, { "epoch": 2.62, "grad_norm": 9.502438761869197, "learning_rate": 2.8017966467814933e-06, "loss": 0.22, "step": 3674 }, { "epoch": 2.62, "grad_norm": 8.757377161825325, "learning_rate": 2.7992011576275295e-06, "loss": 0.2163, "step": 3675 }, { "epoch": 2.62, "grad_norm": 7.72219922258268, "learning_rate": 2.7966064038013657e-06, "loss": 0.1946, "step": 3676 }, { "epoch": 2.62, "grad_norm": 10.007379297576533, "learning_rate": 2.7940123861699577e-06, "loss": 0.2786, "step": 3677 }, { "epoch": 2.63, "grad_norm": 8.593026440283587, "learning_rate": 2.7914191056000147e-06, "loss": 0.2473, "step": 3678 }, { "epoch": 2.63, "grad_norm": 18.03444982249345, "learning_rate": 2.788826562958e-06, "loss": 0.2756, "step": 3679 }, { "epoch": 2.63, "grad_norm": 18.91137019627999, "learning_rate": 2.7862347591101326e-06, "loss": 0.2871, "step": 3680 }, { "epoch": 2.63, "grad_norm": 12.723301260051828, "learning_rate": 2.7836436949223755e-06, "loss": 0.2795, "step": 3681 }, { "epoch": 2.63, "grad_norm": 9.113635889283595, "learning_rate": 2.78105337126046e-06, "loss": 0.2056, "step": 3682 }, { "epoch": 2.63, "grad_norm": 16.904639312214186, "learning_rate": 2.7784637889898534e-06, "loss": 0.3232, "step": 3683 }, { "epoch": 2.63, "grad_norm": 14.888350346451826, "learning_rate": 2.7758749489757914e-06, "loss": 0.3789, "step": 3684 }, { "epoch": 2.63, "grad_norm": 10.763047444352006, "learning_rate": 2.7732868520832455e-06, "loss": 0.2673, "step": 3685 }, { "epoch": 2.63, "grad_norm": 10.351563872031413, "learning_rate": 2.770699499176954e-06, "loss": 0.2411, "step": 3686 }, { "epoch": 2.63, "grad_norm": 12.930545068368955, "learning_rate": 2.768112891121394e-06, "loss": 0.2139, "step": 3687 }, { "epoch": 2.63, "grad_norm": 11.380131399436635, "learning_rate": 2.7655270287808045e-06, "loss": 0.2854, "step": 3688 }, { "epoch": 2.63, "grad_norm": 12.686797655460788, "learning_rate": 2.762941913019166e-06, "loss": 0.2605, "step": 3689 }, { "epoch": 2.63, "grad_norm": 17.096895929111895, "learning_rate": 2.760357544700215e-06, "loss": 0.3394, "step": 3690 }, { "epoch": 2.63, "grad_norm": 10.428775038502973, "learning_rate": 2.757773924687437e-06, "loss": 0.3103, "step": 3691 }, { "epoch": 2.64, "grad_norm": 14.336479750202862, "learning_rate": 2.755191053844068e-06, "loss": 0.3137, "step": 3692 }, { "epoch": 2.64, "grad_norm": 14.401781691538487, "learning_rate": 2.7526089330330925e-06, "loss": 0.301, "step": 3693 }, { "epoch": 2.64, "grad_norm": 12.653098093782502, "learning_rate": 2.7500275631172455e-06, "loss": 0.3079, "step": 3694 }, { "epoch": 2.64, "grad_norm": 7.841260346341729, "learning_rate": 2.74744694495901e-06, "loss": 0.2393, "step": 3695 }, { "epoch": 2.64, "grad_norm": 9.365598543542117, "learning_rate": 2.74486707942062e-06, "loss": 0.2288, "step": 3696 }, { "epoch": 2.64, "grad_norm": 12.161027149177121, "learning_rate": 2.7422879673640552e-06, "loss": 0.2568, "step": 3697 }, { "epoch": 2.64, "grad_norm": 12.575772535414341, "learning_rate": 2.7397096096510467e-06, "loss": 0.3198, "step": 3698 }, { "epoch": 2.64, "grad_norm": 8.191135053850635, "learning_rate": 2.7371320071430674e-06, "loss": 0.183, "step": 3699 }, { "epoch": 2.64, "grad_norm": 11.824506901967847, "learning_rate": 2.7345551607013475e-06, "loss": 0.2175, "step": 3700 }, { "epoch": 2.64, "grad_norm": 11.666492088642991, "learning_rate": 2.7319790711868545e-06, "loss": 0.2837, "step": 3701 }, { "epoch": 2.64, "grad_norm": 14.142683081101087, "learning_rate": 2.7294037394603135e-06, "loss": 0.3069, "step": 3702 }, { "epoch": 2.64, "grad_norm": 9.323056173012265, "learning_rate": 2.7268291663821825e-06, "loss": 0.2463, "step": 3703 }, { "epoch": 2.64, "grad_norm": 8.448469652485555, "learning_rate": 2.7242553528126842e-06, "loss": 0.261, "step": 3704 }, { "epoch": 2.64, "grad_norm": 12.413334200894413, "learning_rate": 2.72168229961177e-06, "loss": 0.2285, "step": 3705 }, { "epoch": 2.65, "grad_norm": 8.372411343210498, "learning_rate": 2.7191100076391473e-06, "loss": 0.28, "step": 3706 }, { "epoch": 2.65, "grad_norm": 13.386689789949346, "learning_rate": 2.716538477754266e-06, "loss": 0.2786, "step": 3707 }, { "epoch": 2.65, "grad_norm": 9.380636299671224, "learning_rate": 2.713967710816323e-06, "loss": 0.2209, "step": 3708 }, { "epoch": 2.65, "grad_norm": 9.743026725229255, "learning_rate": 2.7113977076842597e-06, "loss": 0.262, "step": 3709 }, { "epoch": 2.65, "grad_norm": 8.806331974095787, "learning_rate": 2.7088284692167604e-06, "loss": 0.2461, "step": 3710 }, { "epoch": 2.65, "grad_norm": 10.668378133256628, "learning_rate": 2.7062599962722563e-06, "loss": 0.2358, "step": 3711 }, { "epoch": 2.65, "grad_norm": 8.267929385459727, "learning_rate": 2.703692289708922e-06, "loss": 0.1868, "step": 3712 }, { "epoch": 2.65, "grad_norm": 9.835870231063108, "learning_rate": 2.701125350384676e-06, "loss": 0.2524, "step": 3713 }, { "epoch": 2.65, "grad_norm": 9.563910610196777, "learning_rate": 2.69855917915718e-06, "loss": 0.2437, "step": 3714 }, { "epoch": 2.65, "grad_norm": 12.04255826697796, "learning_rate": 2.695993776883839e-06, "loss": 0.2261, "step": 3715 }, { "epoch": 2.65, "grad_norm": 8.574915744949621, "learning_rate": 2.693429144421803e-06, "loss": 0.2065, "step": 3716 }, { "epoch": 2.65, "grad_norm": 13.575572137934644, "learning_rate": 2.6908652826279623e-06, "loss": 0.3191, "step": 3717 }, { "epoch": 2.65, "grad_norm": 11.921722163471122, "learning_rate": 2.688302192358952e-06, "loss": 0.2988, "step": 3718 }, { "epoch": 2.65, "grad_norm": 14.103930678311931, "learning_rate": 2.6857398744711472e-06, "loss": 0.2549, "step": 3719 }, { "epoch": 2.66, "grad_norm": 12.661523899831044, "learning_rate": 2.683178329820666e-06, "loss": 0.272, "step": 3720 }, { "epoch": 2.66, "grad_norm": 20.55045890604639, "learning_rate": 2.680617559263368e-06, "loss": 0.4014, "step": 3721 }, { "epoch": 2.66, "grad_norm": 10.731468401982053, "learning_rate": 2.6780575636548544e-06, "loss": 0.2571, "step": 3722 }, { "epoch": 2.66, "grad_norm": 10.002086520928234, "learning_rate": 2.67549834385047e-06, "loss": 0.23, "step": 3723 }, { "epoch": 2.66, "grad_norm": 17.74586382467539, "learning_rate": 2.67293990070529e-06, "loss": 0.2725, "step": 3724 }, { "epoch": 2.66, "grad_norm": 15.370350233340924, "learning_rate": 2.6703822350741483e-06, "loss": 0.2493, "step": 3725 }, { "epoch": 2.66, "grad_norm": 13.138445754547114, "learning_rate": 2.6678253478116e-06, "loss": 0.2695, "step": 3726 }, { "epoch": 2.66, "grad_norm": 17.832773166121346, "learning_rate": 2.665269239771953e-06, "loss": 0.3164, "step": 3727 }, { "epoch": 2.66, "grad_norm": 11.550195192135488, "learning_rate": 2.662713911809248e-06, "loss": 0.2651, "step": 3728 }, { "epoch": 2.66, "grad_norm": 9.218101826282929, "learning_rate": 2.6601593647772696e-06, "loss": 0.2422, "step": 3729 }, { "epoch": 2.66, "grad_norm": 7.894184013419036, "learning_rate": 2.657605599529538e-06, "loss": 0.2026, "step": 3730 }, { "epoch": 2.66, "grad_norm": 19.05854846221324, "learning_rate": 2.6550526169193148e-06, "loss": 0.2878, "step": 3731 }, { "epoch": 2.66, "grad_norm": 11.9197967302707, "learning_rate": 2.6525004177995984e-06, "loss": 0.2617, "step": 3732 }, { "epoch": 2.66, "grad_norm": 10.975852184158727, "learning_rate": 2.6499490030231255e-06, "loss": 0.2622, "step": 3733 }, { "epoch": 2.67, "grad_norm": 15.155629799506274, "learning_rate": 2.6473983734423725e-06, "loss": 0.3186, "step": 3734 }, { "epoch": 2.67, "grad_norm": 11.758339565258634, "learning_rate": 2.644848529909552e-06, "loss": 0.2964, "step": 3735 }, { "epoch": 2.67, "grad_norm": 7.913581401033196, "learning_rate": 2.6422994732766124e-06, "loss": 0.2395, "step": 3736 }, { "epoch": 2.67, "grad_norm": 11.148566465671331, "learning_rate": 2.6397512043952422e-06, "loss": 0.2524, "step": 3737 }, { "epoch": 2.67, "grad_norm": 11.657857821556732, "learning_rate": 2.637203724116865e-06, "loss": 0.3242, "step": 3738 }, { "epoch": 2.67, "grad_norm": 10.957623757375671, "learning_rate": 2.634657033292644e-06, "loss": 0.2217, "step": 3739 }, { "epoch": 2.67, "grad_norm": 13.508785111154575, "learning_rate": 2.6321111327734693e-06, "loss": 0.2539, "step": 3740 }, { "epoch": 2.67, "grad_norm": 11.622247025051292, "learning_rate": 2.6295660234099816e-06, "loss": 0.291, "step": 3741 }, { "epoch": 2.67, "grad_norm": 13.422136750776735, "learning_rate": 2.6270217060525416e-06, "loss": 0.2888, "step": 3742 }, { "epoch": 2.67, "grad_norm": 7.53914884344272, "learning_rate": 2.624478181551261e-06, "loss": 0.2026, "step": 3743 }, { "epoch": 2.67, "grad_norm": 10.498572267274076, "learning_rate": 2.62193545075597e-06, "loss": 0.2798, "step": 3744 }, { "epoch": 2.67, "grad_norm": 15.79889320226917, "learning_rate": 2.6193935145162507e-06, "loss": 0.2163, "step": 3745 }, { "epoch": 2.67, "grad_norm": 15.306685142228023, "learning_rate": 2.6168523736814035e-06, "loss": 0.2278, "step": 3746 }, { "epoch": 2.67, "grad_norm": 12.577440943022701, "learning_rate": 2.6143120291004785e-06, "loss": 0.2603, "step": 3747 }, { "epoch": 2.68, "grad_norm": 9.96685761518886, "learning_rate": 2.611772481622246e-06, "loss": 0.2502, "step": 3748 }, { "epoch": 2.68, "grad_norm": 9.149397489215964, "learning_rate": 2.609233732095218e-06, "loss": 0.2128, "step": 3749 }, { "epoch": 2.68, "grad_norm": 14.844697752164103, "learning_rate": 2.6066957813676375e-06, "loss": 0.2615, "step": 3750 }, { "epoch": 2.68, "grad_norm": 19.15670169536182, "learning_rate": 2.604158630287482e-06, "loss": 0.3196, "step": 3751 }, { "epoch": 2.68, "grad_norm": 8.279178479626223, "learning_rate": 2.60162227970246e-06, "loss": 0.2178, "step": 3752 }, { "epoch": 2.68, "grad_norm": 11.751400189966994, "learning_rate": 2.5990867304600136e-06, "loss": 0.2583, "step": 3753 }, { "epoch": 2.68, "grad_norm": 15.652516885894475, "learning_rate": 2.5965519834073172e-06, "loss": 0.3057, "step": 3754 }, { "epoch": 2.68, "grad_norm": 19.546997677748575, "learning_rate": 2.5940180393912767e-06, "loss": 0.2573, "step": 3755 }, { "epoch": 2.68, "grad_norm": 11.260684953345004, "learning_rate": 2.5914848992585293e-06, "loss": 0.2771, "step": 3756 }, { "epoch": 2.68, "grad_norm": 13.383273870807141, "learning_rate": 2.588952563855448e-06, "loss": 0.2675, "step": 3757 }, { "epoch": 2.68, "grad_norm": 25.01046644893153, "learning_rate": 2.5864210340281247e-06, "loss": 0.2979, "step": 3758 }, { "epoch": 2.68, "grad_norm": 9.939204656676262, "learning_rate": 2.5838903106224004e-06, "loss": 0.2478, "step": 3759 }, { "epoch": 2.68, "grad_norm": 12.320199686371675, "learning_rate": 2.5813603944838283e-06, "loss": 0.3015, "step": 3760 }, { "epoch": 2.68, "grad_norm": 14.595923087132736, "learning_rate": 2.578831286457708e-06, "loss": 0.3175, "step": 3761 }, { "epoch": 2.69, "grad_norm": 11.246086611925866, "learning_rate": 2.5763029873890542e-06, "loss": 0.2749, "step": 3762 }, { "epoch": 2.69, "grad_norm": 12.986807659021308, "learning_rate": 2.573775498122626e-06, "loss": 0.2788, "step": 3763 }, { "epoch": 2.69, "grad_norm": 15.310292571942231, "learning_rate": 2.5712488195028972e-06, "loss": 0.3462, "step": 3764 }, { "epoch": 2.69, "grad_norm": 13.136729199970594, "learning_rate": 2.5687229523740852e-06, "loss": 0.282, "step": 3765 }, { "epoch": 2.69, "grad_norm": 9.782342303010028, "learning_rate": 2.566197897580124e-06, "loss": 0.2458, "step": 3766 }, { "epoch": 2.69, "grad_norm": 13.133256291376362, "learning_rate": 2.5636736559646824e-06, "loss": 0.2234, "step": 3767 }, { "epoch": 2.69, "grad_norm": 12.590584644856651, "learning_rate": 2.5611502283711576e-06, "loss": 0.3142, "step": 3768 }, { "epoch": 2.69, "grad_norm": 6.440635231131513, "learning_rate": 2.5586276156426726e-06, "loss": 0.2224, "step": 3769 }, { "epoch": 2.69, "grad_norm": 16.245808625760535, "learning_rate": 2.55610581862208e-06, "loss": 0.4028, "step": 3770 }, { "epoch": 2.69, "grad_norm": 10.723496311621378, "learning_rate": 2.553584838151959e-06, "loss": 0.2771, "step": 3771 }, { "epoch": 2.69, "grad_norm": 10.5810352576066, "learning_rate": 2.5510646750746154e-06, "loss": 0.2427, "step": 3772 }, { "epoch": 2.69, "grad_norm": 10.01714154859388, "learning_rate": 2.548545330232083e-06, "loss": 0.2751, "step": 3773 }, { "epoch": 2.69, "grad_norm": 9.33044917355502, "learning_rate": 2.5460268044661215e-06, "loss": 0.2717, "step": 3774 }, { "epoch": 2.69, "grad_norm": 8.953209080406706, "learning_rate": 2.5435090986182176e-06, "loss": 0.2373, "step": 3775 }, { "epoch": 2.7, "grad_norm": 8.763927527803714, "learning_rate": 2.5409922135295827e-06, "loss": 0.2861, "step": 3776 }, { "epoch": 2.7, "grad_norm": 11.902909129104998, "learning_rate": 2.538476150041156e-06, "loss": 0.2371, "step": 3777 }, { "epoch": 2.7, "grad_norm": 11.972557490659907, "learning_rate": 2.5359609089936006e-06, "loss": 0.3052, "step": 3778 }, { "epoch": 2.7, "grad_norm": 9.45909372522568, "learning_rate": 2.533446491227305e-06, "loss": 0.2371, "step": 3779 }, { "epoch": 2.7, "grad_norm": 15.827756309288517, "learning_rate": 2.5309328975823834e-06, "loss": 0.2866, "step": 3780 }, { "epoch": 2.7, "grad_norm": 11.749836878444942, "learning_rate": 2.5284201288986744e-06, "loss": 0.2866, "step": 3781 }, { "epoch": 2.7, "grad_norm": 8.965182716060708, "learning_rate": 2.5259081860157418e-06, "loss": 0.2061, "step": 3782 }, { "epoch": 2.7, "grad_norm": 24.07848339998907, "learning_rate": 2.5233970697728673e-06, "loss": 0.3752, "step": 3783 }, { "epoch": 2.7, "grad_norm": 9.527044251899435, "learning_rate": 2.520886781009068e-06, "loss": 0.2346, "step": 3784 }, { "epoch": 2.7, "grad_norm": 8.012392512661368, "learning_rate": 2.5183773205630726e-06, "loss": 0.1793, "step": 3785 }, { "epoch": 2.7, "grad_norm": 25.177862422351314, "learning_rate": 2.515868689273344e-06, "loss": 0.3994, "step": 3786 }, { "epoch": 2.7, "grad_norm": 12.33196589779338, "learning_rate": 2.513360887978056e-06, "loss": 0.3093, "step": 3787 }, { "epoch": 2.7, "grad_norm": 11.253538907290611, "learning_rate": 2.510853917515119e-06, "loss": 0.2842, "step": 3788 }, { "epoch": 2.7, "grad_norm": 8.249265295320773, "learning_rate": 2.50834777872215e-06, "loss": 0.2053, "step": 3789 }, { "epoch": 2.71, "grad_norm": 10.582172247591428, "learning_rate": 2.505842472436506e-06, "loss": 0.2583, "step": 3790 }, { "epoch": 2.71, "grad_norm": 10.66633784718915, "learning_rate": 2.5033379994952493e-06, "loss": 0.2407, "step": 3791 }, { "epoch": 2.71, "grad_norm": 12.944642252674152, "learning_rate": 2.5008343607351733e-06, "loss": 0.2534, "step": 3792 }, { "epoch": 2.71, "grad_norm": 12.936903818038365, "learning_rate": 2.4983315569927895e-06, "loss": 0.2915, "step": 3793 }, { "epoch": 2.71, "grad_norm": 14.489379129954239, "learning_rate": 2.495829589104333e-06, "loss": 0.3008, "step": 3794 }, { "epoch": 2.71, "grad_norm": 10.125082832585976, "learning_rate": 2.493328457905755e-06, "loss": 0.2649, "step": 3795 }, { "epoch": 2.71, "grad_norm": 8.20842735541152, "learning_rate": 2.490828164232732e-06, "loss": 0.3149, "step": 3796 }, { "epoch": 2.71, "grad_norm": 7.675386676649888, "learning_rate": 2.4883287089206582e-06, "loss": 0.1863, "step": 3797 }, { "epoch": 2.71, "grad_norm": 20.742830149961485, "learning_rate": 2.48583009280465e-06, "loss": 0.3511, "step": 3798 }, { "epoch": 2.71, "grad_norm": 14.230891927189097, "learning_rate": 2.483332316719535e-06, "loss": 0.2849, "step": 3799 }, { "epoch": 2.71, "grad_norm": 10.848726525249605, "learning_rate": 2.4808353814998747e-06, "loss": 0.2275, "step": 3800 }, { "epoch": 2.71, "grad_norm": 12.124014387257533, "learning_rate": 2.4783392879799345e-06, "loss": 0.2949, "step": 3801 }, { "epoch": 2.71, "grad_norm": 9.7237259687238, "learning_rate": 2.4758440369937125e-06, "loss": 0.2478, "step": 3802 }, { "epoch": 2.71, "grad_norm": 13.256139492796564, "learning_rate": 2.4733496293749116e-06, "loss": 0.2549, "step": 3803 }, { "epoch": 2.72, "grad_norm": 11.982938440215118, "learning_rate": 2.4708560659569665e-06, "loss": 0.2588, "step": 3804 }, { "epoch": 2.72, "grad_norm": 10.720889972293365, "learning_rate": 2.4683633475730158e-06, "loss": 0.2373, "step": 3805 }, { "epoch": 2.72, "grad_norm": 12.082888203055472, "learning_rate": 2.465871475055931e-06, "loss": 0.2601, "step": 3806 }, { "epoch": 2.72, "grad_norm": 11.744412266081433, "learning_rate": 2.4633804492382866e-06, "loss": 0.2532, "step": 3807 }, { "epoch": 2.72, "grad_norm": 8.257941981624063, "learning_rate": 2.460890270952383e-06, "loss": 0.2229, "step": 3808 }, { "epoch": 2.72, "grad_norm": 8.784405215041396, "learning_rate": 2.4584009410302357e-06, "loss": 0.2222, "step": 3809 }, { "epoch": 2.72, "grad_norm": 11.388475216063645, "learning_rate": 2.4559124603035744e-06, "loss": 0.2717, "step": 3810 }, { "epoch": 2.72, "grad_norm": 14.219406174751503, "learning_rate": 2.4534248296038488e-06, "loss": 0.2698, "step": 3811 }, { "epoch": 2.72, "grad_norm": 6.6815938869164615, "learning_rate": 2.4509380497622208e-06, "loss": 0.22, "step": 3812 }, { "epoch": 2.72, "grad_norm": 9.677189639592727, "learning_rate": 2.448452121609571e-06, "loss": 0.2183, "step": 3813 }, { "epoch": 2.72, "grad_norm": 12.373482728475846, "learning_rate": 2.445967045976493e-06, "loss": 0.3013, "step": 3814 }, { "epoch": 2.72, "grad_norm": 11.139596133312253, "learning_rate": 2.443482823693298e-06, "loss": 0.2468, "step": 3815 }, { "epoch": 2.72, "grad_norm": 10.189247114208078, "learning_rate": 2.4409994555900125e-06, "loss": 0.2351, "step": 3816 }, { "epoch": 2.72, "grad_norm": 12.972346774800624, "learning_rate": 2.4385169424963696e-06, "loss": 0.3157, "step": 3817 }, { "epoch": 2.73, "grad_norm": 9.086435159510867, "learning_rate": 2.4360352852418305e-06, "loss": 0.2595, "step": 3818 }, { "epoch": 2.73, "grad_norm": 9.164889104160295, "learning_rate": 2.4335544846555564e-06, "loss": 0.2026, "step": 3819 }, { "epoch": 2.73, "grad_norm": 8.857166435097096, "learning_rate": 2.431074541566436e-06, "loss": 0.2142, "step": 3820 }, { "epoch": 2.73, "grad_norm": 12.925741012441494, "learning_rate": 2.4285954568030566e-06, "loss": 0.3203, "step": 3821 }, { "epoch": 2.73, "grad_norm": 16.356894972943223, "learning_rate": 2.426117231193735e-06, "loss": 0.2651, "step": 3822 }, { "epoch": 2.73, "grad_norm": 10.0580568696073, "learning_rate": 2.4236398655664834e-06, "loss": 0.2673, "step": 3823 }, { "epoch": 2.73, "grad_norm": 12.718078840336174, "learning_rate": 2.4211633607490442e-06, "loss": 0.3003, "step": 3824 }, { "epoch": 2.73, "grad_norm": 10.376049611246835, "learning_rate": 2.4186877175688576e-06, "loss": 0.2839, "step": 3825 }, { "epoch": 2.73, "grad_norm": 10.126606193237182, "learning_rate": 2.4162129368530848e-06, "loss": 0.2422, "step": 3826 }, { "epoch": 2.73, "grad_norm": 15.206258398092292, "learning_rate": 2.413739019428595e-06, "loss": 0.2449, "step": 3827 }, { "epoch": 2.73, "grad_norm": 9.679245970025766, "learning_rate": 2.41126596612197e-06, "loss": 0.2478, "step": 3828 }, { "epoch": 2.73, "grad_norm": 7.765818992891482, "learning_rate": 2.408793777759504e-06, "loss": 0.2175, "step": 3829 }, { "epoch": 2.73, "grad_norm": 7.619739347290504, "learning_rate": 2.4063224551672e-06, "loss": 0.1636, "step": 3830 }, { "epoch": 2.73, "grad_norm": 8.46872304315626, "learning_rate": 2.4038519991707725e-06, "loss": 0.217, "step": 3831 }, { "epoch": 2.74, "grad_norm": 12.19458341714486, "learning_rate": 2.4013824105956483e-06, "loss": 0.2549, "step": 3832 }, { "epoch": 2.74, "grad_norm": 10.625624020708948, "learning_rate": 2.3989136902669614e-06, "loss": 0.2244, "step": 3833 }, { "epoch": 2.74, "grad_norm": 12.101152390439431, "learning_rate": 2.396445839009558e-06, "loss": 0.2581, "step": 3834 }, { "epoch": 2.74, "grad_norm": 10.164102827836336, "learning_rate": 2.3939788576479926e-06, "loss": 0.2493, "step": 3835 }, { "epoch": 2.74, "grad_norm": 14.073936277817232, "learning_rate": 2.39151274700653e-06, "loss": 0.2141, "step": 3836 }, { "epoch": 2.74, "grad_norm": 11.663235322211742, "learning_rate": 2.389047507909143e-06, "loss": 0.2766, "step": 3837 }, { "epoch": 2.74, "grad_norm": 12.427826685566103, "learning_rate": 2.3865831411795137e-06, "loss": 0.2869, "step": 3838 }, { "epoch": 2.74, "grad_norm": 9.972113366082914, "learning_rate": 2.3841196476410337e-06, "loss": 0.2744, "step": 3839 }, { "epoch": 2.74, "grad_norm": 17.7061464713193, "learning_rate": 2.3816570281168016e-06, "loss": 0.2939, "step": 3840 }, { "epoch": 2.74, "grad_norm": 12.571672824051713, "learning_rate": 2.379195283429626e-06, "loss": 0.2527, "step": 3841 }, { "epoch": 2.74, "grad_norm": 12.099134977591973, "learning_rate": 2.3767344144020164e-06, "loss": 0.2732, "step": 3842 }, { "epoch": 2.74, "grad_norm": 11.620004046032994, "learning_rate": 2.374274421856202e-06, "loss": 0.2632, "step": 3843 }, { "epoch": 2.74, "grad_norm": 8.39076395500453, "learning_rate": 2.371815306614104e-06, "loss": 0.2305, "step": 3844 }, { "epoch": 2.74, "grad_norm": 10.1166915100912, "learning_rate": 2.3693570694973673e-06, "loss": 0.2808, "step": 3845 }, { "epoch": 2.75, "grad_norm": 9.41468021588428, "learning_rate": 2.366899711327326e-06, "loss": 0.1987, "step": 3846 }, { "epoch": 2.75, "grad_norm": 11.880151286249157, "learning_rate": 2.3644432329250374e-06, "loss": 0.3016, "step": 3847 }, { "epoch": 2.75, "grad_norm": 18.01327928861911, "learning_rate": 2.3619876351112486e-06, "loss": 0.2637, "step": 3848 }, { "epoch": 2.75, "grad_norm": 12.736270873846344, "learning_rate": 2.3595329187064282e-06, "loss": 0.25, "step": 3849 }, { "epoch": 2.75, "grad_norm": 9.835553229983626, "learning_rate": 2.3570790845307367e-06, "loss": 0.2292, "step": 3850 }, { "epoch": 2.75, "grad_norm": 14.202387368493234, "learning_rate": 2.3546261334040475e-06, "loss": 0.2852, "step": 3851 }, { "epoch": 2.75, "grad_norm": 16.411782936488706, "learning_rate": 2.352174066145938e-06, "loss": 0.3911, "step": 3852 }, { "epoch": 2.75, "grad_norm": 11.67620624008918, "learning_rate": 2.3497228835756887e-06, "loss": 0.2145, "step": 3853 }, { "epoch": 2.75, "grad_norm": 11.462214082378056, "learning_rate": 2.3472725865122854e-06, "loss": 0.2725, "step": 3854 }, { "epoch": 2.75, "grad_norm": 13.87215540966945, "learning_rate": 2.344823175774418e-06, "loss": 0.2715, "step": 3855 }, { "epoch": 2.75, "grad_norm": 9.18779316525026, "learning_rate": 2.3423746521804796e-06, "loss": 0.2561, "step": 3856 }, { "epoch": 2.75, "grad_norm": 12.833742282866638, "learning_rate": 2.339927016548568e-06, "loss": 0.3223, "step": 3857 }, { "epoch": 2.75, "grad_norm": 12.93936635239193, "learning_rate": 2.3374802696964842e-06, "loss": 0.3257, "step": 3858 }, { "epoch": 2.75, "grad_norm": 6.703243223218935, "learning_rate": 2.3350344124417336e-06, "loss": 0.2383, "step": 3859 }, { "epoch": 2.76, "grad_norm": 12.33796827383027, "learning_rate": 2.3325894456015154e-06, "loss": 0.2477, "step": 3860 }, { "epoch": 2.76, "grad_norm": 10.236913282982604, "learning_rate": 2.3301453699927477e-06, "loss": 0.2988, "step": 3861 }, { "epoch": 2.76, "grad_norm": 11.224723971064577, "learning_rate": 2.3277021864320332e-06, "loss": 0.2644, "step": 3862 }, { "epoch": 2.76, "grad_norm": 12.688501246139568, "learning_rate": 2.325259895735693e-06, "loss": 0.335, "step": 3863 }, { "epoch": 2.76, "grad_norm": 8.090819890854913, "learning_rate": 2.322818498719734e-06, "loss": 0.239, "step": 3864 }, { "epoch": 2.76, "grad_norm": 9.057125126226383, "learning_rate": 2.3203779961998795e-06, "loss": 0.2053, "step": 3865 }, { "epoch": 2.76, "grad_norm": 8.958169951613703, "learning_rate": 2.317938388991541e-06, "loss": 0.1948, "step": 3866 }, { "epoch": 2.76, "grad_norm": 14.03811936060471, "learning_rate": 2.3154996779098405e-06, "loss": 0.3455, "step": 3867 }, { "epoch": 2.76, "grad_norm": 10.677363035725163, "learning_rate": 2.313061863769594e-06, "loss": 0.271, "step": 3868 }, { "epoch": 2.76, "grad_norm": 7.65190708066888, "learning_rate": 2.310624947385322e-06, "loss": 0.2233, "step": 3869 }, { "epoch": 2.76, "grad_norm": 10.957316079617797, "learning_rate": 2.3081889295712434e-06, "loss": 0.2454, "step": 3870 }, { "epoch": 2.76, "grad_norm": 14.538888305745177, "learning_rate": 2.3057538111412765e-06, "loss": 0.2864, "step": 3871 }, { "epoch": 2.76, "grad_norm": 8.502555486177886, "learning_rate": 2.3033195929090404e-06, "loss": 0.2903, "step": 3872 }, { "epoch": 2.76, "grad_norm": 10.599925868669361, "learning_rate": 2.300886275687852e-06, "loss": 0.2695, "step": 3873 }, { "epoch": 2.77, "grad_norm": 8.740186546474655, "learning_rate": 2.298453860290728e-06, "loss": 0.2144, "step": 3874 }, { "epoch": 2.77, "grad_norm": 11.863302566675813, "learning_rate": 2.296022347530384e-06, "loss": 0.229, "step": 3875 }, { "epoch": 2.77, "grad_norm": 11.423946125684537, "learning_rate": 2.293591738219233e-06, "loss": 0.2622, "step": 3876 }, { "epoch": 2.77, "grad_norm": 9.70179065557097, "learning_rate": 2.2911620331693867e-06, "loss": 0.2466, "step": 3877 }, { "epoch": 2.77, "grad_norm": 10.731816493491, "learning_rate": 2.2887332331926555e-06, "loss": 0.2634, "step": 3878 }, { "epoch": 2.77, "grad_norm": 9.239045588894083, "learning_rate": 2.2863053391005462e-06, "loss": 0.2148, "step": 3879 }, { "epoch": 2.77, "grad_norm": 9.005638078267589, "learning_rate": 2.2838783517042628e-06, "loss": 0.2544, "step": 3880 }, { "epoch": 2.77, "grad_norm": 13.100939806282982, "learning_rate": 2.281452271814708e-06, "loss": 0.3584, "step": 3881 }, { "epoch": 2.77, "grad_norm": 12.093234947012963, "learning_rate": 2.2790271002424794e-06, "loss": 0.293, "step": 3882 }, { "epoch": 2.77, "grad_norm": 8.795211245967684, "learning_rate": 2.276602837797872e-06, "loss": 0.2092, "step": 3883 }, { "epoch": 2.77, "grad_norm": 13.552937714867474, "learning_rate": 2.274179485290879e-06, "loss": 0.3291, "step": 3884 }, { "epoch": 2.77, "grad_norm": 8.991506238164844, "learning_rate": 2.271757043531184e-06, "loss": 0.2747, "step": 3885 }, { "epoch": 2.77, "grad_norm": 12.652415067342714, "learning_rate": 2.2693355133281706e-06, "loss": 0.2778, "step": 3886 }, { "epoch": 2.77, "grad_norm": 8.559659679329272, "learning_rate": 2.266914895490918e-06, "loss": 0.249, "step": 3887 }, { "epoch": 2.78, "grad_norm": 11.410463329166966, "learning_rate": 2.2644951908282e-06, "loss": 0.2316, "step": 3888 }, { "epoch": 2.78, "grad_norm": 25.89091889726885, "learning_rate": 2.262076400148484e-06, "loss": 0.4292, "step": 3889 }, { "epoch": 2.78, "grad_norm": 15.884730752793354, "learning_rate": 2.2596585242599333e-06, "loss": 0.3232, "step": 3890 }, { "epoch": 2.78, "grad_norm": 10.28933616549401, "learning_rate": 2.257241563970405e-06, "loss": 0.2344, "step": 3891 }, { "epoch": 2.78, "grad_norm": 11.385456564823834, "learning_rate": 2.254825520087451e-06, "loss": 0.2449, "step": 3892 }, { "epoch": 2.78, "grad_norm": 9.305077098961187, "learning_rate": 2.2524103934183154e-06, "loss": 0.3003, "step": 3893 }, { "epoch": 2.78, "grad_norm": 15.773934697174711, "learning_rate": 2.249996184769938e-06, "loss": 0.2681, "step": 3894 }, { "epoch": 2.78, "grad_norm": 20.217611301648915, "learning_rate": 2.2475828949489504e-06, "loss": 0.353, "step": 3895 }, { "epoch": 2.78, "grad_norm": 11.849058612686495, "learning_rate": 2.2451705247616774e-06, "loss": 0.2488, "step": 3896 }, { "epoch": 2.78, "grad_norm": 13.548086863537335, "learning_rate": 2.2427590750141364e-06, "loss": 0.3018, "step": 3897 }, { "epoch": 2.78, "grad_norm": 9.09548543829281, "learning_rate": 2.240348546512039e-06, "loss": 0.2832, "step": 3898 }, { "epoch": 2.78, "grad_norm": 12.482779249189054, "learning_rate": 2.237938940060786e-06, "loss": 0.2422, "step": 3899 }, { "epoch": 2.78, "grad_norm": 10.515407607343274, "learning_rate": 2.235530256465474e-06, "loss": 0.2533, "step": 3900 }, { "epoch": 2.78, "grad_norm": 9.136175867369973, "learning_rate": 2.233122496530884e-06, "loss": 0.2546, "step": 3901 }, { "epoch": 2.79, "grad_norm": 11.726217970795506, "learning_rate": 2.2307156610615e-06, "loss": 0.2715, "step": 3902 }, { "epoch": 2.79, "grad_norm": 8.768133881503765, "learning_rate": 2.2283097508614837e-06, "loss": 0.2378, "step": 3903 }, { "epoch": 2.79, "grad_norm": 8.681822646089104, "learning_rate": 2.225904766734702e-06, "loss": 0.2346, "step": 3904 }, { "epoch": 2.79, "grad_norm": 11.275082284621488, "learning_rate": 2.2235007094846963e-06, "loss": 0.2439, "step": 3905 }, { "epoch": 2.79, "grad_norm": 14.454333835161174, "learning_rate": 2.2210975799147143e-06, "loss": 0.2776, "step": 3906 }, { "epoch": 2.79, "grad_norm": 9.102927596290863, "learning_rate": 2.21869537882768e-06, "loss": 0.2295, "step": 3907 }, { "epoch": 2.79, "grad_norm": 9.406469054773485, "learning_rate": 2.21629410702622e-06, "loss": 0.2034, "step": 3908 }, { "epoch": 2.79, "grad_norm": 6.923690320426484, "learning_rate": 2.2138937653126393e-06, "loss": 0.2527, "step": 3909 }, { "epoch": 2.79, "grad_norm": 8.679522304935842, "learning_rate": 2.2114943544889366e-06, "loss": 0.2437, "step": 3910 }, { "epoch": 2.79, "grad_norm": 7.2130763130063285, "learning_rate": 2.2090958753568013e-06, "loss": 0.2241, "step": 3911 }, { "epoch": 2.79, "grad_norm": 21.126425833535844, "learning_rate": 2.206698328717609e-06, "loss": 0.321, "step": 3912 }, { "epoch": 2.79, "grad_norm": 10.088147801041128, "learning_rate": 2.2043017153724253e-06, "loss": 0.228, "step": 3913 }, { "epoch": 2.79, "grad_norm": 9.08748121795016, "learning_rate": 2.2019060361220036e-06, "loss": 0.249, "step": 3914 }, { "epoch": 2.79, "grad_norm": 24.755148260939105, "learning_rate": 2.199511291766783e-06, "loss": 0.2686, "step": 3915 }, { "epoch": 2.8, "grad_norm": 12.238612933720066, "learning_rate": 2.1971174831068944e-06, "loss": 0.2566, "step": 3916 }, { "epoch": 2.8, "grad_norm": 16.062718282153504, "learning_rate": 2.1947246109421514e-06, "loss": 0.322, "step": 3917 }, { "epoch": 2.8, "grad_norm": 13.06127849954014, "learning_rate": 2.192332676072061e-06, "loss": 0.3306, "step": 3918 }, { "epoch": 2.8, "grad_norm": 10.283758870531178, "learning_rate": 2.189941679295807e-06, "loss": 0.2327, "step": 3919 }, { "epoch": 2.8, "grad_norm": 14.062313929604555, "learning_rate": 2.1875516214122723e-06, "loss": 0.2507, "step": 3920 }, { "epoch": 2.8, "grad_norm": 11.090141149972927, "learning_rate": 2.185162503220013e-06, "loss": 0.2285, "step": 3921 }, { "epoch": 2.8, "grad_norm": 10.824756292791596, "learning_rate": 2.182774325517285e-06, "loss": 0.2534, "step": 3922 }, { "epoch": 2.8, "grad_norm": 10.320492406815534, "learning_rate": 2.180387089102016e-06, "loss": 0.2246, "step": 3923 }, { "epoch": 2.8, "grad_norm": 7.596778425586805, "learning_rate": 2.1780007947718336e-06, "loss": 0.1755, "step": 3924 }, { "epoch": 2.8, "grad_norm": 19.455925151353494, "learning_rate": 2.175615443324035e-06, "loss": 0.3096, "step": 3925 }, { "epoch": 2.8, "grad_norm": 13.420610683613315, "learning_rate": 2.173231035555618e-06, "loss": 0.2738, "step": 3926 }, { "epoch": 2.8, "grad_norm": 12.592246153137603, "learning_rate": 2.170847572263252e-06, "loss": 0.2576, "step": 3927 }, { "epoch": 2.8, "grad_norm": 15.101309193662187, "learning_rate": 2.1684650542432985e-06, "loss": 0.2688, "step": 3928 }, { "epoch": 2.8, "grad_norm": 8.79780471508127, "learning_rate": 2.166083482291801e-06, "loss": 0.2893, "step": 3929 }, { "epoch": 2.81, "grad_norm": 11.65657764794894, "learning_rate": 2.1637028572044867e-06, "loss": 0.2734, "step": 3930 }, { "epoch": 2.81, "grad_norm": 14.391962577247151, "learning_rate": 2.1613231797767668e-06, "loss": 0.2554, "step": 3931 }, { "epoch": 2.81, "grad_norm": 16.977331007596224, "learning_rate": 2.158944450803736e-06, "loss": 0.2844, "step": 3932 }, { "epoch": 2.81, "grad_norm": 10.01757401617662, "learning_rate": 2.1565666710801714e-06, "loss": 0.271, "step": 3933 }, { "epoch": 2.81, "grad_norm": 13.615886862288278, "learning_rate": 2.1541898414005343e-06, "loss": 0.2778, "step": 3934 }, { "epoch": 2.81, "grad_norm": 13.59283742513515, "learning_rate": 2.1518139625589663e-06, "loss": 0.2664, "step": 3935 }, { "epoch": 2.81, "grad_norm": 11.502287160025258, "learning_rate": 2.1494390353492935e-06, "loss": 0.3418, "step": 3936 }, { "epoch": 2.81, "grad_norm": 7.93143738046724, "learning_rate": 2.1470650605650235e-06, "loss": 0.2114, "step": 3937 }, { "epoch": 2.81, "grad_norm": 16.164621540582115, "learning_rate": 2.144692038999345e-06, "loss": 0.2739, "step": 3938 }, { "epoch": 2.81, "grad_norm": 12.845701204985016, "learning_rate": 2.142319971445129e-06, "loss": 0.3232, "step": 3939 }, { "epoch": 2.81, "grad_norm": 10.959776190394157, "learning_rate": 2.139948858694926e-06, "loss": 0.2891, "step": 3940 }, { "epoch": 2.81, "grad_norm": 7.79971040797762, "learning_rate": 2.137578701540971e-06, "loss": 0.2053, "step": 3941 }, { "epoch": 2.81, "grad_norm": 14.638963180765387, "learning_rate": 2.1352095007751754e-06, "loss": 0.3005, "step": 3942 }, { "epoch": 2.81, "grad_norm": 43.00522394014673, "learning_rate": 2.132841257189137e-06, "loss": 0.4937, "step": 3943 }, { "epoch": 2.82, "grad_norm": 10.985232967151337, "learning_rate": 2.1304739715741235e-06, "loss": 0.3257, "step": 3944 }, { "epoch": 2.82, "grad_norm": 8.283080793744645, "learning_rate": 2.128107644721096e-06, "loss": 0.1897, "step": 3945 }, { "epoch": 2.82, "grad_norm": 11.552268713806683, "learning_rate": 2.1257422774206816e-06, "loss": 0.2751, "step": 3946 }, { "epoch": 2.82, "grad_norm": 14.25244065240587, "learning_rate": 2.1233778704632002e-06, "loss": 0.2583, "step": 3947 }, { "epoch": 2.82, "grad_norm": 28.46181807821369, "learning_rate": 2.1210144246386378e-06, "loss": 0.4048, "step": 3948 }, { "epoch": 2.82, "grad_norm": 13.599125276057082, "learning_rate": 2.1186519407366725e-06, "loss": 0.2998, "step": 3949 }, { "epoch": 2.82, "grad_norm": 18.69833834877681, "learning_rate": 2.1162904195466455e-06, "loss": 0.2974, "step": 3950 }, { "epoch": 2.82, "grad_norm": 9.782064797336112, "learning_rate": 2.113929861857594e-06, "loss": 0.2415, "step": 3951 }, { "epoch": 2.82, "grad_norm": 9.05525476633752, "learning_rate": 2.1115702684582177e-06, "loss": 0.2354, "step": 3952 }, { "epoch": 2.82, "grad_norm": 8.488998620156735, "learning_rate": 2.1092116401369033e-06, "loss": 0.2205, "step": 3953 }, { "epoch": 2.82, "grad_norm": 7.6604425215527465, "learning_rate": 2.1068539776817115e-06, "loss": 0.1997, "step": 3954 }, { "epoch": 2.82, "grad_norm": 10.936366716585368, "learning_rate": 2.1044972818803816e-06, "loss": 0.2666, "step": 3955 }, { "epoch": 2.82, "grad_norm": 13.919791218319379, "learning_rate": 2.1021415535203294e-06, "loss": 0.2935, "step": 3956 }, { "epoch": 2.82, "grad_norm": 12.83542796098615, "learning_rate": 2.0997867933886467e-06, "loss": 0.2839, "step": 3957 }, { "epoch": 2.83, "grad_norm": 11.346017326459489, "learning_rate": 2.0974330022721044e-06, "loss": 0.3179, "step": 3958 }, { "epoch": 2.83, "grad_norm": 11.944822762505321, "learning_rate": 2.0950801809571466e-06, "loss": 0.2749, "step": 3959 }, { "epoch": 2.83, "grad_norm": 11.391359360322923, "learning_rate": 2.0927283302298944e-06, "loss": 0.303, "step": 3960 }, { "epoch": 2.83, "grad_norm": 7.032353529389161, "learning_rate": 2.0903774508761477e-06, "loss": 0.2107, "step": 3961 }, { "epoch": 2.83, "grad_norm": 6.485677725607823, "learning_rate": 2.0880275436813726e-06, "loss": 0.1442, "step": 3962 }, { "epoch": 2.83, "grad_norm": 13.077591332146431, "learning_rate": 2.0856786094307247e-06, "loss": 0.2629, "step": 3963 }, { "epoch": 2.83, "grad_norm": 11.819852695103354, "learning_rate": 2.0833306489090186e-06, "loss": 0.356, "step": 3964 }, { "epoch": 2.83, "grad_norm": 9.563988233532719, "learning_rate": 2.08098366290076e-06, "loss": 0.2456, "step": 3965 }, { "epoch": 2.83, "grad_norm": 8.809406976811916, "learning_rate": 2.078637652190112e-06, "loss": 0.2141, "step": 3966 }, { "epoch": 2.83, "grad_norm": 10.610564298049212, "learning_rate": 2.0762926175609287e-06, "loss": 0.2444, "step": 3967 }, { "epoch": 2.83, "grad_norm": 13.367480219472963, "learning_rate": 2.0739485597967237e-06, "loss": 0.3579, "step": 3968 }, { "epoch": 2.83, "grad_norm": 9.40280542559146, "learning_rate": 2.0716054796806916e-06, "loss": 0.24, "step": 3969 }, { "epoch": 2.83, "grad_norm": 18.096437879516504, "learning_rate": 2.0692633779956998e-06, "loss": 0.2996, "step": 3970 }, { "epoch": 2.83, "grad_norm": 11.01349355918468, "learning_rate": 2.0669222555242884e-06, "loss": 0.3135, "step": 3971 }, { "epoch": 2.84, "grad_norm": 11.179708500485882, "learning_rate": 2.064582113048669e-06, "loss": 0.2285, "step": 3972 }, { "epoch": 2.84, "grad_norm": 9.404710632894906, "learning_rate": 2.0622429513507275e-06, "loss": 0.2913, "step": 3973 }, { "epoch": 2.84, "grad_norm": 11.974891410309862, "learning_rate": 2.05990477121202e-06, "loss": 0.3313, "step": 3974 }, { "epoch": 2.84, "grad_norm": 10.45063399330934, "learning_rate": 2.0575675734137773e-06, "loss": 0.2144, "step": 3975 }, { "epoch": 2.84, "grad_norm": 11.698967473996044, "learning_rate": 2.0552313587369003e-06, "loss": 0.2664, "step": 3976 }, { "epoch": 2.84, "grad_norm": 12.174398856640751, "learning_rate": 2.052896127961963e-06, "loss": 0.2932, "step": 3977 }, { "epoch": 2.84, "grad_norm": 15.769798231540381, "learning_rate": 2.050561881869205e-06, "loss": 0.2498, "step": 3978 }, { "epoch": 2.84, "grad_norm": 10.068055612015428, "learning_rate": 2.048228621238547e-06, "loss": 0.2527, "step": 3979 }, { "epoch": 2.84, "grad_norm": 9.886413081007158, "learning_rate": 2.0458963468495692e-06, "loss": 0.2693, "step": 3980 }, { "epoch": 2.84, "grad_norm": 8.972122538335825, "learning_rate": 2.0435650594815338e-06, "loss": 0.179, "step": 3981 }, { "epoch": 2.84, "grad_norm": 16.441213776975122, "learning_rate": 2.0412347599133607e-06, "loss": 0.2732, "step": 3982 }, { "epoch": 2.84, "grad_norm": 12.836617592039206, "learning_rate": 2.0389054489236534e-06, "loss": 0.2695, "step": 3983 }, { "epoch": 2.84, "grad_norm": 11.202241722151822, "learning_rate": 2.03657712729067e-06, "loss": 0.2607, "step": 3984 }, { "epoch": 2.84, "grad_norm": 8.254407875075486, "learning_rate": 2.034249795792355e-06, "loss": 0.177, "step": 3985 }, { "epoch": 2.85, "grad_norm": 10.31910673688725, "learning_rate": 2.031923455206306e-06, "loss": 0.2949, "step": 3986 }, { "epoch": 2.85, "grad_norm": 17.84033377167254, "learning_rate": 2.0295981063098e-06, "loss": 0.3264, "step": 3987 }, { "epoch": 2.85, "grad_norm": 10.599633875942123, "learning_rate": 2.027273749879777e-06, "loss": 0.2413, "step": 3988 }, { "epoch": 2.85, "grad_norm": 10.28616607550584, "learning_rate": 2.02495038669285e-06, "loss": 0.2134, "step": 3989 }, { "epoch": 2.85, "grad_norm": 11.91463101296468, "learning_rate": 2.0226280175252966e-06, "loss": 0.2627, "step": 3990 }, { "epoch": 2.85, "grad_norm": 11.012332249144231, "learning_rate": 2.020306643153063e-06, "loss": 0.2666, "step": 3991 }, { "epoch": 2.85, "grad_norm": 16.681187563001785, "learning_rate": 2.0179862643517657e-06, "loss": 0.3252, "step": 3992 }, { "epoch": 2.85, "grad_norm": 9.817386411445645, "learning_rate": 2.015666881896684e-06, "loss": 0.2441, "step": 3993 }, { "epoch": 2.85, "grad_norm": 11.064995665760662, "learning_rate": 2.0133484965627683e-06, "loss": 0.2886, "step": 3994 }, { "epoch": 2.85, "grad_norm": 14.865028740128867, "learning_rate": 2.0110311091246333e-06, "loss": 0.3228, "step": 3995 }, { "epoch": 2.85, "grad_norm": 10.764784698091306, "learning_rate": 2.0087147203565614e-06, "loss": 0.2949, "step": 3996 }, { "epoch": 2.85, "grad_norm": 10.094254980125088, "learning_rate": 2.0063993310325013e-06, "loss": 0.217, "step": 3997 }, { "epoch": 2.85, "grad_norm": 9.135626121609093, "learning_rate": 2.0040849419260682e-06, "loss": 0.251, "step": 3998 }, { "epoch": 2.85, "grad_norm": 18.33339179679279, "learning_rate": 2.0017715538105416e-06, "loss": 0.2485, "step": 3999 }, { "epoch": 2.86, "grad_norm": 16.79996387979217, "learning_rate": 1.9994591674588677e-06, "loss": 0.2537, "step": 4000 }, { "epoch": 2.86, "eval_avg_AUC": 0.7879958586606428, "eval_avg_Accuracy": 0.6990218832891246, "eval_avg_Accuracy-right": 0.8709403938959176, "eval_avg_Accuracy-wrong": 0.3992494882874687, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6623938351746894, "eval_last_AUC": 0.8070917433092194, "eval_last_Accuracy": 0.7334631962864722, "eval_last_Accuracy-right": 0.8252902047737055, "eval_last_Accuracy-wrong": 0.5733454628155561, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6848945312092272, "eval_max_AUC": 0.7616910032003927, "eval_max_Accuracy": 0.6416611405835544, "eval_max_Accuracy-right": 0.9620451284726751, "eval_max_Accuracy-wrong": 0.08301114396179213, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6150202437158859, "eval_min_AUC": 0.7931598852081432, "eval_min_Accuracy": 0.720407824933687, "eval_min_Accuracy-right": 0.7332072518586148, "eval_min_Accuracy-wrong": 0.6980896065499204, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6685353032201301, "eval_prod_AUC": 0.7945893958373447, "eval_prod_Accuracy": 0.702420424403183, "eval_prod_Accuracy-right": 0.6255380200860832, "eval_prod_Accuracy-wrong": 0.8364794177848534, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6701653120639736, "eval_runtime": 247.3512, "eval_samples_per_second": 97.545, "eval_steps_per_second": 3.048, "eval_sum_AUC": 0.6614690230562896, "eval_sum_Accuracy": 0.6374336870026526, "eval_sum_Accuracy-right": 0.9853919394808921, "eval_sum_Accuracy-wrong": 0.030702751876279282, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6497230981517113, "step": 4000 }, { "epoch": 2.86, "grad_norm": 20.294863167005364, "learning_rate": 1.9971477836436575e-06, "loss": 0.3467, "step": 4001 }, { "epoch": 2.86, "grad_norm": 10.327232865174512, "learning_rate": 1.99483740313719e-06, "loss": 0.2378, "step": 4002 }, { "epoch": 2.86, "grad_norm": 11.22041031697138, "learning_rate": 1.9925280267114e-06, "loss": 0.248, "step": 4003 }, { "epoch": 2.86, "grad_norm": 11.21213166626409, "learning_rate": 1.9902196551379006e-06, "loss": 0.2129, "step": 4004 }, { "epoch": 2.86, "grad_norm": 13.610468088544783, "learning_rate": 1.987912289187954e-06, "loss": 0.2856, "step": 4005 }, { "epoch": 2.86, "grad_norm": 12.763270070414201, "learning_rate": 1.9856059296325027e-06, "loss": 0.3086, "step": 4006 }, { "epoch": 2.86, "grad_norm": 9.34977464133706, "learning_rate": 1.9833005772421354e-06, "loss": 0.1912, "step": 4007 }, { "epoch": 2.86, "grad_norm": 9.99525600692564, "learning_rate": 1.980996232787121e-06, "loss": 0.2854, "step": 4008 }, { "epoch": 2.86, "grad_norm": 8.482697059629118, "learning_rate": 1.978692897037377e-06, "loss": 0.2537, "step": 4009 }, { "epoch": 2.86, "grad_norm": 11.88855064417953, "learning_rate": 1.9763905707624975e-06, "loss": 0.2056, "step": 4010 }, { "epoch": 2.86, "grad_norm": 12.515119687094433, "learning_rate": 1.974089254731727e-06, "loss": 0.3047, "step": 4011 }, { "epoch": 2.86, "grad_norm": 10.460042983380315, "learning_rate": 1.97178894971398e-06, "loss": 0.1714, "step": 4012 }, { "epoch": 2.86, "grad_norm": 11.000891418019652, "learning_rate": 1.9694896564778317e-06, "loss": 0.2881, "step": 4013 }, { "epoch": 2.87, "grad_norm": 10.457581961485896, "learning_rate": 1.9671913757915173e-06, "loss": 0.1953, "step": 4014 }, { "epoch": 2.87, "grad_norm": 13.329358437695408, "learning_rate": 1.964894108422936e-06, "loss": 0.2489, "step": 4015 }, { "epoch": 2.87, "grad_norm": 10.507027263953846, "learning_rate": 1.962597855139648e-06, "loss": 0.2153, "step": 4016 }, { "epoch": 2.87, "grad_norm": 13.10099282908348, "learning_rate": 1.960302616708873e-06, "loss": 0.2883, "step": 4017 }, { "epoch": 2.87, "grad_norm": 8.480233105548061, "learning_rate": 1.9580083938974937e-06, "loss": 0.223, "step": 4018 }, { "epoch": 2.87, "grad_norm": 12.559695185275773, "learning_rate": 1.9557151874720526e-06, "loss": 0.2325, "step": 4019 }, { "epoch": 2.87, "grad_norm": 13.141810003892173, "learning_rate": 1.953422998198754e-06, "loss": 0.1979, "step": 4020 }, { "epoch": 2.87, "grad_norm": 14.982944575808363, "learning_rate": 1.9511318268434554e-06, "loss": 0.3599, "step": 4021 }, { "epoch": 2.87, "grad_norm": 11.211224106691908, "learning_rate": 1.9488416741716877e-06, "loss": 0.2395, "step": 4022 }, { "epoch": 2.87, "grad_norm": 8.569449118407048, "learning_rate": 1.946552540948625e-06, "loss": 0.2422, "step": 4023 }, { "epoch": 2.87, "grad_norm": 13.650414366436912, "learning_rate": 1.944264427939118e-06, "loss": 0.3, "step": 4024 }, { "epoch": 2.87, "grad_norm": 7.739566513501393, "learning_rate": 1.941977335907659e-06, "loss": 0.1785, "step": 4025 }, { "epoch": 2.87, "grad_norm": 18.6977439093024, "learning_rate": 1.939691265618417e-06, "loss": 0.3027, "step": 4026 }, { "epoch": 2.87, "grad_norm": 9.915637048581093, "learning_rate": 1.9374062178352036e-06, "loss": 0.2341, "step": 4027 }, { "epoch": 2.88, "grad_norm": 9.897353050548803, "learning_rate": 1.935122193321499e-06, "loss": 0.2625, "step": 4028 }, { "epoch": 2.88, "grad_norm": 16.08317279527024, "learning_rate": 1.932839192840436e-06, "loss": 0.2683, "step": 4029 }, { "epoch": 2.88, "grad_norm": 12.704675205411846, "learning_rate": 1.930557217154809e-06, "loss": 0.2295, "step": 4030 }, { "epoch": 2.88, "grad_norm": 8.351026891672202, "learning_rate": 1.9282762670270693e-06, "loss": 0.2058, "step": 4031 }, { "epoch": 2.88, "grad_norm": 11.34895226658225, "learning_rate": 1.925996343219323e-06, "loss": 0.3765, "step": 4032 }, { "epoch": 2.88, "grad_norm": 11.597504826581677, "learning_rate": 1.923717446493336e-06, "loss": 0.3281, "step": 4033 }, { "epoch": 2.88, "grad_norm": 11.653477339408518, "learning_rate": 1.9214395776105297e-06, "loss": 0.2778, "step": 4034 }, { "epoch": 2.88, "grad_norm": 8.944620053801449, "learning_rate": 1.919162737331983e-06, "loss": 0.2278, "step": 4035 }, { "epoch": 2.88, "grad_norm": 17.550617411583413, "learning_rate": 1.9168869264184296e-06, "loss": 0.2656, "step": 4036 }, { "epoch": 2.88, "grad_norm": 11.945812481491696, "learning_rate": 1.9146121456302613e-06, "loss": 0.2358, "step": 4037 }, { "epoch": 2.88, "grad_norm": 11.422504829133128, "learning_rate": 1.9123383957275237e-06, "loss": 0.2812, "step": 4038 }, { "epoch": 2.88, "grad_norm": 17.311605530516562, "learning_rate": 1.91006567746992e-06, "loss": 0.2559, "step": 4039 }, { "epoch": 2.88, "grad_norm": 11.786074439406534, "learning_rate": 1.907793991616806e-06, "loss": 0.2878, "step": 4040 }, { "epoch": 2.88, "grad_norm": 9.6922862971148, "learning_rate": 1.9055233389271955e-06, "loss": 0.2671, "step": 4041 }, { "epoch": 2.89, "grad_norm": 10.628962310466495, "learning_rate": 1.9032537201597556e-06, "loss": 0.218, "step": 4042 }, { "epoch": 2.89, "grad_norm": 10.911645743068291, "learning_rate": 1.9009851360728077e-06, "loss": 0.2557, "step": 4043 }, { "epoch": 2.89, "grad_norm": 22.595771054903054, "learning_rate": 1.898717587424328e-06, "loss": 0.3633, "step": 4044 }, { "epoch": 2.89, "grad_norm": 8.518102120271452, "learning_rate": 1.8964510749719484e-06, "loss": 0.2349, "step": 4045 }, { "epoch": 2.89, "grad_norm": 16.22905494698704, "learning_rate": 1.8941855994729497e-06, "loss": 0.29, "step": 4046 }, { "epoch": 2.89, "grad_norm": 9.207905032025637, "learning_rate": 1.8919211616842703e-06, "loss": 0.209, "step": 4047 }, { "epoch": 2.89, "grad_norm": 12.410396427093156, "learning_rate": 1.8896577623625017e-06, "loss": 0.2229, "step": 4048 }, { "epoch": 2.89, "grad_norm": 15.527011086500547, "learning_rate": 1.887395402263888e-06, "loss": 0.3206, "step": 4049 }, { "epoch": 2.89, "grad_norm": 13.672016334197911, "learning_rate": 1.8851340821443248e-06, "loss": 0.2427, "step": 4050 }, { "epoch": 2.89, "grad_norm": 11.125832400500446, "learning_rate": 1.882873802759362e-06, "loss": 0.312, "step": 4051 }, { "epoch": 2.89, "grad_norm": 14.046251858097985, "learning_rate": 1.8806145648642005e-06, "loss": 0.2979, "step": 4052 }, { "epoch": 2.89, "grad_norm": 9.838718306043596, "learning_rate": 1.8783563692136936e-06, "loss": 0.2334, "step": 4053 }, { "epoch": 2.89, "grad_norm": 10.943639533290575, "learning_rate": 1.8760992165623465e-06, "loss": 0.3025, "step": 4054 }, { "epoch": 2.89, "grad_norm": 10.666070064383465, "learning_rate": 1.873843107664316e-06, "loss": 0.2688, "step": 4055 }, { "epoch": 2.9, "grad_norm": 11.583556219560066, "learning_rate": 1.87158804327341e-06, "loss": 0.2915, "step": 4056 }, { "epoch": 2.9, "grad_norm": 7.0617200379471425, "learning_rate": 1.8693340241430874e-06, "loss": 0.1807, "step": 4057 }, { "epoch": 2.9, "grad_norm": 12.72852127215646, "learning_rate": 1.867081051026458e-06, "loss": 0.3193, "step": 4058 }, { "epoch": 2.9, "grad_norm": 10.749413424323619, "learning_rate": 1.8648291246762818e-06, "loss": 0.2511, "step": 4059 }, { "epoch": 2.9, "grad_norm": 12.654719307217244, "learning_rate": 1.8625782458449693e-06, "loss": 0.2336, "step": 4060 }, { "epoch": 2.9, "grad_norm": 10.407631474167662, "learning_rate": 1.860328415284583e-06, "loss": 0.2976, "step": 4061 }, { "epoch": 2.9, "grad_norm": 10.48686271000459, "learning_rate": 1.8580796337468276e-06, "loss": 0.1995, "step": 4062 }, { "epoch": 2.9, "grad_norm": 9.274457610353167, "learning_rate": 1.8558319019830695e-06, "loss": 0.2031, "step": 4063 }, { "epoch": 2.9, "grad_norm": 11.802148591200808, "learning_rate": 1.853585220744311e-06, "loss": 0.2812, "step": 4064 }, { "epoch": 2.9, "grad_norm": 8.092229309343045, "learning_rate": 1.851339590781217e-06, "loss": 0.2485, "step": 4065 }, { "epoch": 2.9, "grad_norm": 21.463031361010216, "learning_rate": 1.8490950128440877e-06, "loss": 0.355, "step": 4066 }, { "epoch": 2.9, "grad_norm": 10.998522968766485, "learning_rate": 1.8468514876828847e-06, "loss": 0.2788, "step": 4067 }, { "epoch": 2.9, "grad_norm": 14.552298244913189, "learning_rate": 1.844609016047204e-06, "loss": 0.3298, "step": 4068 }, { "epoch": 2.9, "grad_norm": 16.40982638450866, "learning_rate": 1.8423675986863054e-06, "loss": 0.3369, "step": 4069 }, { "epoch": 2.91, "grad_norm": 10.645489950141965, "learning_rate": 1.8401272363490818e-06, "loss": 0.21, "step": 4070 }, { "epoch": 2.91, "grad_norm": 13.723345576862725, "learning_rate": 1.8378879297840818e-06, "loss": 0.3018, "step": 4071 }, { "epoch": 2.91, "grad_norm": 10.662657815513155, "learning_rate": 1.8356496797395002e-06, "loss": 0.2375, "step": 4072 }, { "epoch": 2.91, "grad_norm": 13.360949595986845, "learning_rate": 1.8334124869631765e-06, "loss": 0.2791, "step": 4073 }, { "epoch": 2.91, "grad_norm": 9.685132876470785, "learning_rate": 1.8311763522025994e-06, "loss": 0.2039, "step": 4074 }, { "epoch": 2.91, "grad_norm": 9.895276388339477, "learning_rate": 1.828941276204903e-06, "loss": 0.2727, "step": 4075 }, { "epoch": 2.91, "grad_norm": 8.432764560809574, "learning_rate": 1.8267072597168673e-06, "loss": 0.2058, "step": 4076 }, { "epoch": 2.91, "grad_norm": 8.308441456073734, "learning_rate": 1.8244743034849193e-06, "loss": 0.2285, "step": 4077 }, { "epoch": 2.91, "grad_norm": 12.095081699709256, "learning_rate": 1.8222424082551303e-06, "loss": 0.2605, "step": 4078 }, { "epoch": 2.91, "grad_norm": 11.298604259683547, "learning_rate": 1.820011574773221e-06, "loss": 0.2684, "step": 4079 }, { "epoch": 2.91, "grad_norm": 8.019721690601129, "learning_rate": 1.8177818037845485e-06, "loss": 0.2024, "step": 4080 }, { "epoch": 2.91, "grad_norm": 16.13667342258012, "learning_rate": 1.8155530960341273e-06, "loss": 0.2921, "step": 4081 }, { "epoch": 2.91, "grad_norm": 13.865382533021625, "learning_rate": 1.8133254522666033e-06, "loss": 0.293, "step": 4082 }, { "epoch": 2.91, "grad_norm": 11.783714143176804, "learning_rate": 1.8110988732262808e-06, "loss": 0.2706, "step": 4083 }, { "epoch": 2.92, "grad_norm": 9.22742606505741, "learning_rate": 1.8088733596570945e-06, "loss": 0.2429, "step": 4084 }, { "epoch": 2.92, "grad_norm": 11.119724412060055, "learning_rate": 1.806648912302636e-06, "loss": 0.2507, "step": 4085 }, { "epoch": 2.92, "grad_norm": 8.320477068822154, "learning_rate": 1.8044255319061287e-06, "loss": 0.28, "step": 4086 }, { "epoch": 2.92, "grad_norm": 8.759806331612182, "learning_rate": 1.8022032192104517e-06, "loss": 0.2698, "step": 4087 }, { "epoch": 2.92, "grad_norm": 14.35985526948073, "learning_rate": 1.7999819749581154e-06, "loss": 0.3721, "step": 4088 }, { "epoch": 2.92, "grad_norm": 9.828623960795465, "learning_rate": 1.797761799891281e-06, "loss": 0.3364, "step": 4089 }, { "epoch": 2.92, "grad_norm": 11.744365228222355, "learning_rate": 1.7955426947517507e-06, "loss": 0.24, "step": 4090 }, { "epoch": 2.92, "grad_norm": 9.279599538506409, "learning_rate": 1.793324660280968e-06, "loss": 0.2603, "step": 4091 }, { "epoch": 2.92, "grad_norm": 12.284245558123292, "learning_rate": 1.7911076972200193e-06, "loss": 0.2681, "step": 4092 }, { "epoch": 2.92, "grad_norm": 10.759102641777156, "learning_rate": 1.7888918063096334e-06, "loss": 0.2205, "step": 4093 }, { "epoch": 2.92, "grad_norm": 10.414253236529335, "learning_rate": 1.7866769882901814e-06, "loss": 0.2959, "step": 4094 }, { "epoch": 2.92, "grad_norm": 12.100896779951352, "learning_rate": 1.784463243901674e-06, "loss": 0.25, "step": 4095 }, { "epoch": 2.92, "grad_norm": 15.39298134834118, "learning_rate": 1.7822505738837648e-06, "loss": 0.397, "step": 4096 }, { "epoch": 2.92, "grad_norm": 7.320159627540454, "learning_rate": 1.7800389789757483e-06, "loss": 0.2217, "step": 4097 }, { "epoch": 2.93, "grad_norm": 8.705040302239233, "learning_rate": 1.7778284599165597e-06, "loss": 0.2642, "step": 4098 }, { "epoch": 2.93, "grad_norm": 12.866603928279114, "learning_rate": 1.7756190174447734e-06, "loss": 0.2966, "step": 4099 }, { "epoch": 2.93, "grad_norm": 12.300852071369984, "learning_rate": 1.7734106522986061e-06, "loss": 0.2854, "step": 4100 }, { "epoch": 2.93, "grad_norm": 7.0406718245773545, "learning_rate": 1.7712033652159133e-06, "loss": 0.2153, "step": 4101 }, { "epoch": 2.93, "grad_norm": 16.366009476816807, "learning_rate": 1.7689971569341907e-06, "loss": 0.3169, "step": 4102 }, { "epoch": 2.93, "grad_norm": 8.099939081723868, "learning_rate": 1.7667920281905738e-06, "loss": 0.1508, "step": 4103 }, { "epoch": 2.93, "grad_norm": 17.54395094001839, "learning_rate": 1.764587979721838e-06, "loss": 0.3247, "step": 4104 }, { "epoch": 2.93, "grad_norm": 9.355552433889878, "learning_rate": 1.7623850122643926e-06, "loss": 0.1946, "step": 4105 }, { "epoch": 2.93, "grad_norm": 11.451247973760308, "learning_rate": 1.7601831265542968e-06, "loss": 0.2495, "step": 4106 }, { "epoch": 2.93, "grad_norm": 8.38373080535724, "learning_rate": 1.7579823233272337e-06, "loss": 0.2188, "step": 4107 }, { "epoch": 2.93, "grad_norm": 32.26992819582037, "learning_rate": 1.7557826033185404e-06, "loss": 0.3677, "step": 4108 }, { "epoch": 2.93, "grad_norm": 9.31935554317174, "learning_rate": 1.7535839672631772e-06, "loss": 0.3042, "step": 4109 }, { "epoch": 2.93, "grad_norm": 7.333138141169738, "learning_rate": 1.7513864158957556e-06, "loss": 0.1741, "step": 4110 }, { "epoch": 2.93, "grad_norm": 12.064201456722603, "learning_rate": 1.7491899499505122e-06, "loss": 0.2629, "step": 4111 }, { "epoch": 2.94, "grad_norm": 12.246914808812818, "learning_rate": 1.746994570161334e-06, "loss": 0.2664, "step": 4112 }, { "epoch": 2.94, "grad_norm": 16.025286460767063, "learning_rate": 1.7448002772617324e-06, "loss": 0.2764, "step": 4113 }, { "epoch": 2.94, "grad_norm": 9.595517375370573, "learning_rate": 1.7426070719848632e-06, "loss": 0.1829, "step": 4114 }, { "epoch": 2.94, "grad_norm": 9.033101510979936, "learning_rate": 1.7404149550635173e-06, "loss": 0.2468, "step": 4115 }, { "epoch": 2.94, "grad_norm": 15.395760488217663, "learning_rate": 1.7382239272301221e-06, "loss": 0.345, "step": 4116 }, { "epoch": 2.94, "grad_norm": 8.718081017809373, "learning_rate": 1.7360339892167404e-06, "loss": 0.2185, "step": 4117 }, { "epoch": 2.94, "grad_norm": 11.570151019962179, "learning_rate": 1.7338451417550712e-06, "loss": 0.2932, "step": 4118 }, { "epoch": 2.94, "grad_norm": 15.489633738751614, "learning_rate": 1.7316573855764485e-06, "loss": 0.3535, "step": 4119 }, { "epoch": 2.94, "grad_norm": 11.204623438557102, "learning_rate": 1.7294707214118434e-06, "loss": 0.2524, "step": 4120 }, { "epoch": 2.94, "grad_norm": 8.024037924894355, "learning_rate": 1.7272851499918603e-06, "loss": 0.2061, "step": 4121 }, { "epoch": 2.94, "grad_norm": 10.025616633736863, "learning_rate": 1.725100672046741e-06, "loss": 0.2207, "step": 4122 }, { "epoch": 2.94, "grad_norm": 9.237786503227326, "learning_rate": 1.7229172883063556e-06, "loss": 0.2344, "step": 4123 }, { "epoch": 2.94, "grad_norm": 11.400008128488517, "learning_rate": 1.7207349995002192e-06, "loss": 0.2041, "step": 4124 }, { "epoch": 2.94, "grad_norm": 13.04513731466515, "learning_rate": 1.7185538063574692e-06, "loss": 0.3313, "step": 4125 }, { "epoch": 2.95, "grad_norm": 9.065512097775526, "learning_rate": 1.7163737096068883e-06, "loss": 0.198, "step": 4126 }, { "epoch": 2.95, "grad_norm": 11.819709518012287, "learning_rate": 1.7141947099768818e-06, "loss": 0.2305, "step": 4127 }, { "epoch": 2.95, "grad_norm": 6.6082456220174, "learning_rate": 1.7120168081955001e-06, "loss": 0.168, "step": 4128 }, { "epoch": 2.95, "grad_norm": 9.349276591227465, "learning_rate": 1.7098400049904163e-06, "loss": 0.2913, "step": 4129 }, { "epoch": 2.95, "grad_norm": 8.586751031921361, "learning_rate": 1.707664301088941e-06, "loss": 0.2065, "step": 4130 }, { "epoch": 2.95, "grad_norm": 14.717204391015239, "learning_rate": 1.705489697218019e-06, "loss": 0.3105, "step": 4131 }, { "epoch": 2.95, "grad_norm": 9.460015525511492, "learning_rate": 1.7033161941042248e-06, "loss": 0.208, "step": 4132 }, { "epoch": 2.95, "grad_norm": 11.400780736034415, "learning_rate": 1.7011437924737666e-06, "loss": 0.3025, "step": 4133 }, { "epoch": 2.95, "grad_norm": 10.28553264691427, "learning_rate": 1.6989724930524843e-06, "loss": 0.2966, "step": 4134 }, { "epoch": 2.95, "grad_norm": 10.477581081372428, "learning_rate": 1.6968022965658492e-06, "loss": 0.183, "step": 4135 }, { "epoch": 2.95, "grad_norm": 15.331783836914225, "learning_rate": 1.694633203738964e-06, "loss": 0.281, "step": 4136 }, { "epoch": 2.95, "grad_norm": 10.89865181755367, "learning_rate": 1.6924652152965632e-06, "loss": 0.2869, "step": 4137 }, { "epoch": 2.95, "grad_norm": 10.774512612157912, "learning_rate": 1.690298331963014e-06, "loss": 0.2534, "step": 4138 }, { "epoch": 2.95, "grad_norm": 9.468974723185733, "learning_rate": 1.6881325544623067e-06, "loss": 0.2205, "step": 4139 }, { "epoch": 2.96, "grad_norm": 11.15613380148459, "learning_rate": 1.6859678835180749e-06, "loss": 0.2581, "step": 4140 }, { "epoch": 2.96, "grad_norm": 9.938617854657009, "learning_rate": 1.6838043198535693e-06, "loss": 0.2761, "step": 4141 }, { "epoch": 2.96, "grad_norm": 22.526184909674672, "learning_rate": 1.681641864191682e-06, "loss": 0.4287, "step": 4142 }, { "epoch": 2.96, "grad_norm": 10.610954712676994, "learning_rate": 1.6794805172549244e-06, "loss": 0.2856, "step": 4143 }, { "epoch": 2.96, "grad_norm": 11.130277611997542, "learning_rate": 1.6773202797654486e-06, "loss": 0.2515, "step": 4144 }, { "epoch": 2.96, "grad_norm": 11.42493725544239, "learning_rate": 1.6751611524450235e-06, "loss": 0.2637, "step": 4145 }, { "epoch": 2.96, "grad_norm": 10.082805920327193, "learning_rate": 1.6730031360150605e-06, "loss": 0.2311, "step": 4146 }, { "epoch": 2.96, "grad_norm": 8.655752640833025, "learning_rate": 1.670846231196588e-06, "loss": 0.2336, "step": 4147 }, { "epoch": 2.96, "grad_norm": 9.13420076254242, "learning_rate": 1.6686904387102692e-06, "loss": 0.2661, "step": 4148 }, { "epoch": 2.96, "grad_norm": 10.637960833953823, "learning_rate": 1.6665357592763948e-06, "loss": 0.2262, "step": 4149 }, { "epoch": 2.96, "grad_norm": 11.514907798772807, "learning_rate": 1.6643821936148834e-06, "loss": 0.2236, "step": 4150 }, { "epoch": 2.96, "grad_norm": 9.881896754186977, "learning_rate": 1.6622297424452817e-06, "loss": 0.2382, "step": 4151 }, { "epoch": 2.96, "grad_norm": 6.96329529621935, "learning_rate": 1.6600784064867625e-06, "loss": 0.2207, "step": 4152 }, { "epoch": 2.96, "grad_norm": 9.235052105560225, "learning_rate": 1.6579281864581275e-06, "loss": 0.2438, "step": 4153 }, { "epoch": 2.97, "grad_norm": 8.127333202419477, "learning_rate": 1.6557790830778058e-06, "loss": 0.2133, "step": 4154 }, { "epoch": 2.97, "grad_norm": 11.008855273874723, "learning_rate": 1.6536310970638525e-06, "loss": 0.2527, "step": 4155 }, { "epoch": 2.97, "grad_norm": 10.218850473628057, "learning_rate": 1.6514842291339494e-06, "loss": 0.2563, "step": 4156 }, { "epoch": 2.97, "grad_norm": 9.405684231978427, "learning_rate": 1.6493384800054052e-06, "loss": 0.2542, "step": 4157 }, { "epoch": 2.97, "grad_norm": 12.309209754540694, "learning_rate": 1.6471938503951546e-06, "loss": 0.2742, "step": 4158 }, { "epoch": 2.97, "grad_norm": 11.300836718904058, "learning_rate": 1.6450503410197582e-06, "loss": 0.2483, "step": 4159 }, { "epoch": 2.97, "grad_norm": 13.129412391262694, "learning_rate": 1.6429079525954023e-06, "loss": 0.4229, "step": 4160 }, { "epoch": 2.97, "grad_norm": 9.519718325999454, "learning_rate": 1.6407666858378985e-06, "loss": 0.2643, "step": 4161 }, { "epoch": 2.97, "grad_norm": 12.957849553694674, "learning_rate": 1.6386265414626834e-06, "loss": 0.2549, "step": 4162 }, { "epoch": 2.97, "grad_norm": 8.915315627608456, "learning_rate": 1.636487520184822e-06, "loss": 0.2188, "step": 4163 }, { "epoch": 2.97, "grad_norm": 10.583275524212816, "learning_rate": 1.6343496227189948e-06, "loss": 0.2233, "step": 4164 }, { "epoch": 2.97, "grad_norm": 11.15496354912215, "learning_rate": 1.632212849779521e-06, "loss": 0.2312, "step": 4165 }, { "epoch": 2.97, "grad_norm": 9.269917208785433, "learning_rate": 1.630077202080328e-06, "loss": 0.1956, "step": 4166 }, { "epoch": 2.97, "grad_norm": 12.657935965779476, "learning_rate": 1.6279426803349828e-06, "loss": 0.2642, "step": 4167 }, { "epoch": 2.98, "grad_norm": 6.290607785408874, "learning_rate": 1.6258092852566625e-06, "loss": 0.1294, "step": 4168 }, { "epoch": 2.98, "grad_norm": 10.711605990480061, "learning_rate": 1.6236770175581807e-06, "loss": 0.2771, "step": 4169 }, { "epoch": 2.98, "grad_norm": 9.135466780344332, "learning_rate": 1.62154587795196e-06, "loss": 0.2688, "step": 4170 }, { "epoch": 2.98, "grad_norm": 16.83078442721911, "learning_rate": 1.6194158671500616e-06, "loss": 0.214, "step": 4171 }, { "epoch": 2.98, "grad_norm": 10.353277203263982, "learning_rate": 1.6172869858641554e-06, "loss": 0.2915, "step": 4172 }, { "epoch": 2.98, "grad_norm": 9.629289861141057, "learning_rate": 1.6151592348055433e-06, "loss": 0.2556, "step": 4173 }, { "epoch": 2.98, "grad_norm": 8.912948810612065, "learning_rate": 1.6130326146851455e-06, "loss": 0.2493, "step": 4174 }, { "epoch": 2.98, "grad_norm": 12.026915751127602, "learning_rate": 1.6109071262135056e-06, "loss": 0.252, "step": 4175 }, { "epoch": 2.98, "grad_norm": 13.580442426663522, "learning_rate": 1.608782770100789e-06, "loss": 0.2695, "step": 4176 }, { "epoch": 2.98, "grad_norm": 11.492422800213438, "learning_rate": 1.6066595470567825e-06, "loss": 0.2375, "step": 4177 }, { "epoch": 2.98, "grad_norm": 14.85339863796703, "learning_rate": 1.6045374577908944e-06, "loss": 0.3145, "step": 4178 }, { "epoch": 2.98, "grad_norm": 8.415153390378142, "learning_rate": 1.6024165030121542e-06, "loss": 0.1987, "step": 4179 }, { "epoch": 2.98, "grad_norm": 16.294077072163837, "learning_rate": 1.6002966834292116e-06, "loss": 0.3169, "step": 4180 }, { "epoch": 2.98, "grad_norm": 9.594024560968199, "learning_rate": 1.5981779997503405e-06, "loss": 0.2385, "step": 4181 }, { "epoch": 2.99, "grad_norm": 11.105734484271524, "learning_rate": 1.5960604526834266e-06, "loss": 0.2812, "step": 4182 }, { "epoch": 2.99, "grad_norm": 11.49417910592248, "learning_rate": 1.5939440429359888e-06, "loss": 0.2888, "step": 4183 }, { "epoch": 2.99, "grad_norm": 9.633425145280981, "learning_rate": 1.591828771215152e-06, "loss": 0.2197, "step": 4184 }, { "epoch": 2.99, "grad_norm": 11.785768156691029, "learning_rate": 1.5897146382276752e-06, "loss": 0.2354, "step": 4185 }, { "epoch": 2.99, "grad_norm": 15.318466928652242, "learning_rate": 1.587601644679922e-06, "loss": 0.27, "step": 4186 }, { "epoch": 2.99, "grad_norm": 10.41034009289898, "learning_rate": 1.58548979127789e-06, "loss": 0.2205, "step": 4187 }, { "epoch": 2.99, "grad_norm": 9.937568460758499, "learning_rate": 1.5833790787271819e-06, "loss": 0.1968, "step": 4188 }, { "epoch": 2.99, "grad_norm": 9.46241310438527, "learning_rate": 1.5812695077330325e-06, "loss": 0.2495, "step": 4189 }, { "epoch": 2.99, "grad_norm": 16.29389775463895, "learning_rate": 1.5791610790002838e-06, "loss": 0.3013, "step": 4190 }, { "epoch": 2.99, "grad_norm": 13.717352648476366, "learning_rate": 1.577053793233403e-06, "loss": 0.2676, "step": 4191 }, { "epoch": 2.99, "grad_norm": 11.711414459758286, "learning_rate": 1.5749476511364726e-06, "loss": 0.332, "step": 4192 }, { "epoch": 2.99, "grad_norm": 7.705499968975047, "learning_rate": 1.5728426534131946e-06, "loss": 0.1589, "step": 4193 }, { "epoch": 2.99, "grad_norm": 12.787386008766704, "learning_rate": 1.5707388007668877e-06, "loss": 0.3003, "step": 4194 }, { "epoch": 2.99, "grad_norm": 7.667900077969025, "learning_rate": 1.568636093900488e-06, "loss": 0.2053, "step": 4195 }, { "epoch": 3.0, "grad_norm": 15.07311298463385, "learning_rate": 1.5665345335165488e-06, "loss": 0.2358, "step": 4196 }, { "epoch": 3.0, "grad_norm": 11.966148032918056, "learning_rate": 1.5644341203172415e-06, "loss": 0.3, "step": 4197 }, { "epoch": 3.0, "grad_norm": 6.900175323732201, "learning_rate": 1.5623348550043516e-06, "loss": 0.1863, "step": 4198 }, { "epoch": 3.0, "grad_norm": 13.34318877693945, "learning_rate": 1.5602367382792839e-06, "loss": 0.2773, "step": 4199 }, { "epoch": 3.0, "grad_norm": 14.531517790747417, "learning_rate": 1.5581397708430578e-06, "loss": 0.3015, "step": 4200 }, { "epoch": 3.0, "grad_norm": 12.05529215822636, "learning_rate": 1.556043953396309e-06, "loss": 0.23, "step": 4201 }, { "epoch": 3.0, "grad_norm": 10.926738561041796, "learning_rate": 1.5539492866392891e-06, "loss": 0.201, "step": 4202 }, { "epoch": 3.0, "grad_norm": 11.655803933246485, "learning_rate": 1.551855771271865e-06, "loss": 0.2241, "step": 4203 }, { "epoch": 3.0, "grad_norm": 9.49786951614942, "learning_rate": 1.5497634079935198e-06, "loss": 0.2363, "step": 4204 }, { "epoch": 3.0, "grad_norm": 5.562319258107449, "learning_rate": 1.5476721975033498e-06, "loss": 0.1233, "step": 4205 }, { "epoch": 3.0, "grad_norm": 5.707661563901169, "learning_rate": 1.5455821405000703e-06, "loss": 0.1448, "step": 4206 }, { "epoch": 3.0, "grad_norm": 3.98664541474231, "learning_rate": 1.5434932376820039e-06, "loss": 0.1188, "step": 4207 }, { "epoch": 3.0, "grad_norm": 7.560483987455881, "learning_rate": 1.5414054897470942e-06, "loss": 0.1791, "step": 4208 }, { "epoch": 3.0, "grad_norm": 4.315253135542903, "learning_rate": 1.5393188973928957e-06, "loss": 0.1033, "step": 4209 }, { "epoch": 3.0, "grad_norm": 4.611585149366166, "learning_rate": 1.5372334613165784e-06, "loss": 0.1366, "step": 4210 }, { "epoch": 3.01, "grad_norm": 4.021499687105682, "learning_rate": 1.5351491822149255e-06, "loss": 0.1254, "step": 4211 }, { "epoch": 3.01, "grad_norm": 7.3862962459769514, "learning_rate": 1.533066060784333e-06, "loss": 0.1672, "step": 4212 }, { "epoch": 3.01, "grad_norm": 4.011539859613148, "learning_rate": 1.5309840977208096e-06, "loss": 0.1176, "step": 4213 }, { "epoch": 3.01, "grad_norm": 6.460881953756751, "learning_rate": 1.5289032937199793e-06, "loss": 0.151, "step": 4214 }, { "epoch": 3.01, "grad_norm": 5.886640344653612, "learning_rate": 1.5268236494770772e-06, "loss": 0.1545, "step": 4215 }, { "epoch": 3.01, "grad_norm": 8.44915060491121, "learning_rate": 1.5247451656869499e-06, "loss": 0.163, "step": 4216 }, { "epoch": 3.01, "grad_norm": 4.682171952885785, "learning_rate": 1.5226678430440588e-06, "loss": 0.1411, "step": 4217 }, { "epoch": 3.01, "grad_norm": 5.312109950691412, "learning_rate": 1.5205916822424755e-06, "loss": 0.1438, "step": 4218 }, { "epoch": 3.01, "grad_norm": 5.1623724941925175, "learning_rate": 1.5185166839758836e-06, "loss": 0.1252, "step": 4219 }, { "epoch": 3.01, "grad_norm": 6.471874299848998, "learning_rate": 1.5164428489375789e-06, "loss": 0.167, "step": 4220 }, { "epoch": 3.01, "grad_norm": 4.334160602466491, "learning_rate": 1.5143701778204683e-06, "loss": 0.1064, "step": 4221 }, { "epoch": 3.01, "grad_norm": 4.790958055528099, "learning_rate": 1.5122986713170712e-06, "loss": 0.1418, "step": 4222 }, { "epoch": 3.01, "grad_norm": 4.360409064903208, "learning_rate": 1.510228330119512e-06, "loss": 0.1225, "step": 4223 }, { "epoch": 3.01, "grad_norm": 4.105021948656687, "learning_rate": 1.5081591549195357e-06, "loss": 0.1021, "step": 4224 }, { "epoch": 3.02, "grad_norm": 7.345073889890468, "learning_rate": 1.5060911464084864e-06, "loss": 0.1582, "step": 4225 }, { "epoch": 3.02, "grad_norm": 8.936176216847798, "learning_rate": 1.5040243052773312e-06, "loss": 0.1592, "step": 4226 }, { "epoch": 3.02, "grad_norm": 5.858093703881658, "learning_rate": 1.5019586322166323e-06, "loss": 0.1035, "step": 4227 }, { "epoch": 3.02, "grad_norm": 7.983548488354241, "learning_rate": 1.4998941279165773e-06, "loss": 0.1723, "step": 4228 }, { "epoch": 3.02, "grad_norm": 10.277666504790933, "learning_rate": 1.4978307930669483e-06, "loss": 0.1005, "step": 4229 }, { "epoch": 3.02, "grad_norm": 8.547800791086695, "learning_rate": 1.4957686283571498e-06, "loss": 0.1653, "step": 4230 }, { "epoch": 3.02, "grad_norm": 7.992527436033497, "learning_rate": 1.4937076344761858e-06, "loss": 0.1716, "step": 4231 }, { "epoch": 3.02, "grad_norm": 6.936012071215718, "learning_rate": 1.4916478121126732e-06, "loss": 0.1029, "step": 4232 }, { "epoch": 3.02, "grad_norm": 4.3255729830641885, "learning_rate": 1.4895891619548374e-06, "loss": 0.1046, "step": 4233 }, { "epoch": 3.02, "grad_norm": 9.902272124427949, "learning_rate": 1.4875316846905113e-06, "loss": 0.1895, "step": 4234 }, { "epoch": 3.02, "grad_norm": 5.363267661890996, "learning_rate": 1.4854753810071364e-06, "loss": 0.1187, "step": 4235 }, { "epoch": 3.02, "grad_norm": 10.19363846323527, "learning_rate": 1.4834202515917628e-06, "loss": 0.2157, "step": 4236 }, { "epoch": 3.02, "grad_norm": 8.752546304397477, "learning_rate": 1.4813662971310465e-06, "loss": 0.1414, "step": 4237 }, { "epoch": 3.02, "grad_norm": 5.430637602687925, "learning_rate": 1.4793135183112523e-06, "loss": 0.1292, "step": 4238 }, { "epoch": 3.03, "grad_norm": 8.280105835812476, "learning_rate": 1.477261915818251e-06, "loss": 0.2173, "step": 4239 }, { "epoch": 3.03, "grad_norm": 5.619705907816034, "learning_rate": 1.4752114903375243e-06, "loss": 0.0962, "step": 4240 }, { "epoch": 3.03, "grad_norm": 5.991828112437791, "learning_rate": 1.473162242554151e-06, "loss": 0.1031, "step": 4241 }, { "epoch": 3.03, "grad_norm": 5.770685175159459, "learning_rate": 1.47111417315283e-06, "loss": 0.1162, "step": 4242 }, { "epoch": 3.03, "grad_norm": 17.414337641647094, "learning_rate": 1.4690672828178532e-06, "loss": 0.1656, "step": 4243 }, { "epoch": 3.03, "grad_norm": 8.439371331134558, "learning_rate": 1.467021572233131e-06, "loss": 0.1172, "step": 4244 }, { "epoch": 3.03, "grad_norm": 6.740136894772873, "learning_rate": 1.4649770420821663e-06, "loss": 0.1099, "step": 4245 }, { "epoch": 3.03, "grad_norm": 6.614518214652328, "learning_rate": 1.4629336930480813e-06, "loss": 0.1323, "step": 4246 }, { "epoch": 3.03, "grad_norm": 7.219648721235388, "learning_rate": 1.4608915258135914e-06, "loss": 0.1555, "step": 4247 }, { "epoch": 3.03, "grad_norm": 6.825768758767164, "learning_rate": 1.4588505410610283e-06, "loss": 0.141, "step": 4248 }, { "epoch": 3.03, "grad_norm": 11.787094208035665, "learning_rate": 1.4568107394723175e-06, "loss": 0.1892, "step": 4249 }, { "epoch": 3.03, "grad_norm": 7.676246808791587, "learning_rate": 1.4547721217289972e-06, "loss": 0.1515, "step": 4250 }, { "epoch": 3.03, "grad_norm": 8.044534219415103, "learning_rate": 1.4527346885122073e-06, "loss": 0.1312, "step": 4251 }, { "epoch": 3.03, "grad_norm": 9.284106118194046, "learning_rate": 1.450698440502692e-06, "loss": 0.1499, "step": 4252 }, { "epoch": 3.04, "grad_norm": 3.7252324984477916, "learning_rate": 1.4486633783807997e-06, "loss": 0.0759, "step": 4253 }, { "epoch": 3.04, "grad_norm": 7.906025467327889, "learning_rate": 1.4466295028264822e-06, "loss": 0.1543, "step": 4254 }, { "epoch": 3.04, "grad_norm": 6.324242408244137, "learning_rate": 1.4445968145192951e-06, "loss": 0.1219, "step": 4255 }, { "epoch": 3.04, "grad_norm": 5.877864296276188, "learning_rate": 1.4425653141383977e-06, "loss": 0.14, "step": 4256 }, { "epoch": 3.04, "grad_norm": 5.312508541147513, "learning_rate": 1.4405350023625514e-06, "loss": 0.0874, "step": 4257 }, { "epoch": 3.04, "grad_norm": 5.865148996474216, "learning_rate": 1.4385058798701223e-06, "loss": 0.1302, "step": 4258 }, { "epoch": 3.04, "grad_norm": 9.287278930298898, "learning_rate": 1.4364779473390767e-06, "loss": 0.1621, "step": 4259 }, { "epoch": 3.04, "grad_norm": 8.177031611561803, "learning_rate": 1.4344512054469855e-06, "loss": 0.1541, "step": 4260 }, { "epoch": 3.04, "grad_norm": 4.308894066948919, "learning_rate": 1.4324256548710202e-06, "loss": 0.1062, "step": 4261 }, { "epoch": 3.04, "grad_norm": 8.610067458675067, "learning_rate": 1.430401296287955e-06, "loss": 0.1289, "step": 4262 }, { "epoch": 3.04, "grad_norm": 6.510987310154899, "learning_rate": 1.4283781303741662e-06, "loss": 0.1726, "step": 4263 }, { "epoch": 3.04, "grad_norm": 5.778662504339967, "learning_rate": 1.4263561578056307e-06, "loss": 0.1449, "step": 4264 }, { "epoch": 3.04, "grad_norm": 6.4303052868247565, "learning_rate": 1.4243353792579285e-06, "loss": 0.1311, "step": 4265 }, { "epoch": 3.04, "grad_norm": 4.0518291313422985, "learning_rate": 1.4223157954062344e-06, "loss": 0.1038, "step": 4266 }, { "epoch": 3.05, "grad_norm": 8.014311988987807, "learning_rate": 1.4202974069253362e-06, "loss": 0.1282, "step": 4267 }, { "epoch": 3.05, "grad_norm": 6.347104216461915, "learning_rate": 1.418280214489608e-06, "loss": 0.1176, "step": 4268 }, { "epoch": 3.05, "grad_norm": 6.026484655477969, "learning_rate": 1.416264218773038e-06, "loss": 0.1364, "step": 4269 }, { "epoch": 3.05, "grad_norm": 6.206163622677682, "learning_rate": 1.4142494204492007e-06, "loss": 0.1118, "step": 4270 }, { "epoch": 3.05, "grad_norm": 4.449486750709002, "learning_rate": 1.412235820191285e-06, "loss": 0.1207, "step": 4271 }, { "epoch": 3.05, "grad_norm": 7.169597104216024, "learning_rate": 1.4102234186720653e-06, "loss": 0.1562, "step": 4272 }, { "epoch": 3.05, "grad_norm": 7.3213306438478165, "learning_rate": 1.4082122165639285e-06, "loss": 0.1102, "step": 4273 }, { "epoch": 3.05, "grad_norm": 4.644150306624669, "learning_rate": 1.4062022145388503e-06, "loss": 0.0866, "step": 4274 }, { "epoch": 3.05, "grad_norm": 5.509551286371005, "learning_rate": 1.4041934132684116e-06, "loss": 0.1194, "step": 4275 }, { "epoch": 3.05, "grad_norm": 7.0171083888829395, "learning_rate": 1.4021858134237892e-06, "loss": 0.1792, "step": 4276 }, { "epoch": 3.05, "grad_norm": 6.766478418536765, "learning_rate": 1.4001794156757598e-06, "loss": 0.1609, "step": 4277 }, { "epoch": 3.05, "grad_norm": 5.7061716612934505, "learning_rate": 1.398174220694699e-06, "loss": 0.1375, "step": 4278 }, { "epoch": 3.05, "grad_norm": 6.22806612837643, "learning_rate": 1.3961702291505791e-06, "loss": 0.1154, "step": 4279 }, { "epoch": 3.05, "grad_norm": 5.054628039524962, "learning_rate": 1.3941674417129714e-06, "loss": 0.1053, "step": 4280 }, { "epoch": 3.06, "grad_norm": 9.438939415322599, "learning_rate": 1.3921658590510434e-06, "loss": 0.1853, "step": 4281 }, { "epoch": 3.06, "grad_norm": 5.909877879321542, "learning_rate": 1.3901654818335618e-06, "loss": 0.1347, "step": 4282 }, { "epoch": 3.06, "grad_norm": 9.17926909272783, "learning_rate": 1.3881663107288918e-06, "loss": 0.1378, "step": 4283 }, { "epoch": 3.06, "grad_norm": 4.0576033881925015, "learning_rate": 1.386168346404988e-06, "loss": 0.1311, "step": 4284 }, { "epoch": 3.06, "grad_norm": 6.618556358799638, "learning_rate": 1.3841715895294138e-06, "loss": 0.0911, "step": 4285 }, { "epoch": 3.06, "grad_norm": 5.8253753457194595, "learning_rate": 1.3821760407693175e-06, "loss": 0.0886, "step": 4286 }, { "epoch": 3.06, "grad_norm": 6.792261943559342, "learning_rate": 1.3801817007914543e-06, "loss": 0.1329, "step": 4287 }, { "epoch": 3.06, "grad_norm": 10.076624429072998, "learning_rate": 1.3781885702621644e-06, "loss": 0.2051, "step": 4288 }, { "epoch": 3.06, "grad_norm": 5.086198267318938, "learning_rate": 1.3761966498473956e-06, "loss": 0.0788, "step": 4289 }, { "epoch": 3.06, "grad_norm": 8.435317943548803, "learning_rate": 1.3742059402126818e-06, "loss": 0.1439, "step": 4290 }, { "epoch": 3.06, "grad_norm": 5.6714105862417075, "learning_rate": 1.3722164420231565e-06, "loss": 0.1266, "step": 4291 }, { "epoch": 3.06, "grad_norm": 6.760288551086754, "learning_rate": 1.370228155943548e-06, "loss": 0.132, "step": 4292 }, { "epoch": 3.06, "grad_norm": 5.607704087759127, "learning_rate": 1.3682410826381816e-06, "loss": 0.0842, "step": 4293 }, { "epoch": 3.06, "grad_norm": 5.449194178955735, "learning_rate": 1.366255222770973e-06, "loss": 0.1436, "step": 4294 }, { "epoch": 3.07, "grad_norm": 10.9077308370694, "learning_rate": 1.364270577005436e-06, "loss": 0.132, "step": 4295 }, { "epoch": 3.07, "grad_norm": 5.9434830126409635, "learning_rate": 1.3622871460046778e-06, "loss": 0.1117, "step": 4296 }, { "epoch": 3.07, "grad_norm": 5.335529396281524, "learning_rate": 1.3603049304313992e-06, "loss": 0.1206, "step": 4297 }, { "epoch": 3.07, "grad_norm": 6.400241514226316, "learning_rate": 1.3583239309478953e-06, "loss": 0.1349, "step": 4298 }, { "epoch": 3.07, "grad_norm": 7.442955154627488, "learning_rate": 1.3563441482160562e-06, "loss": 0.1261, "step": 4299 }, { "epoch": 3.07, "grad_norm": 11.286798997787256, "learning_rate": 1.35436558289736e-06, "loss": 0.1791, "step": 4300 }, { "epoch": 3.07, "grad_norm": 4.602769205972933, "learning_rate": 1.3523882356528883e-06, "loss": 0.1063, "step": 4301 }, { "epoch": 3.07, "grad_norm": 8.182228061518014, "learning_rate": 1.350412107143303e-06, "loss": 0.1431, "step": 4302 }, { "epoch": 3.07, "grad_norm": 5.4663437903465395, "learning_rate": 1.3484371980288712e-06, "loss": 0.1296, "step": 4303 }, { "epoch": 3.07, "grad_norm": 4.621578796410083, "learning_rate": 1.3464635089694416e-06, "loss": 0.0828, "step": 4304 }, { "epoch": 3.07, "grad_norm": 5.558582314176478, "learning_rate": 1.344491040624466e-06, "loss": 0.1587, "step": 4305 }, { "epoch": 3.07, "grad_norm": 6.181336673297239, "learning_rate": 1.3425197936529766e-06, "loss": 0.1385, "step": 4306 }, { "epoch": 3.07, "grad_norm": 10.412134988285041, "learning_rate": 1.3405497687136098e-06, "loss": 0.1656, "step": 4307 }, { "epoch": 3.07, "grad_norm": 7.582811581572259, "learning_rate": 1.3385809664645827e-06, "loss": 0.1295, "step": 4308 }, { "epoch": 3.08, "grad_norm": 8.150651347785129, "learning_rate": 1.336613387563711e-06, "loss": 0.1396, "step": 4309 }, { "epoch": 3.08, "grad_norm": 5.360245382647963, "learning_rate": 1.3346470326683986e-06, "loss": 0.1394, "step": 4310 }, { "epoch": 3.08, "grad_norm": 5.92768096906367, "learning_rate": 1.3326819024356413e-06, "loss": 0.1395, "step": 4311 }, { "epoch": 3.08, "grad_norm": 8.126264782030967, "learning_rate": 1.3307179975220264e-06, "loss": 0.1683, "step": 4312 }, { "epoch": 3.08, "grad_norm": 7.097276549928228, "learning_rate": 1.3287553185837298e-06, "loss": 0.1547, "step": 4313 }, { "epoch": 3.08, "grad_norm": 6.968139851957762, "learning_rate": 1.3267938662765206e-06, "loss": 0.1147, "step": 4314 }, { "epoch": 3.08, "grad_norm": 4.5357337366094725, "learning_rate": 1.324833641255755e-06, "loss": 0.1165, "step": 4315 }, { "epoch": 3.08, "grad_norm": 6.212179238446433, "learning_rate": 1.3228746441763813e-06, "loss": 0.0844, "step": 4316 }, { "epoch": 3.08, "grad_norm": 5.800820871659498, "learning_rate": 1.3209168756929363e-06, "loss": 0.1323, "step": 4317 }, { "epoch": 3.08, "grad_norm": 8.876523252846415, "learning_rate": 1.3189603364595483e-06, "loss": 0.1721, "step": 4318 }, { "epoch": 3.08, "grad_norm": 6.1759123731714825, "learning_rate": 1.3170050271299316e-06, "loss": 0.1298, "step": 4319 }, { "epoch": 3.08, "grad_norm": 8.861593791331382, "learning_rate": 1.315050948357392e-06, "loss": 0.1289, "step": 4320 }, { "epoch": 3.08, "grad_norm": 8.188259150222102, "learning_rate": 1.3130981007948247e-06, "loss": 0.1427, "step": 4321 }, { "epoch": 3.08, "grad_norm": 6.725476274791131, "learning_rate": 1.3111464850947103e-06, "loss": 0.1479, "step": 4322 }, { "epoch": 3.09, "grad_norm": 7.735183629373757, "learning_rate": 1.3091961019091216e-06, "loss": 0.1288, "step": 4323 }, { "epoch": 3.09, "grad_norm": 5.745017577097906, "learning_rate": 1.3072469518897184e-06, "loss": 0.1412, "step": 4324 }, { "epoch": 3.09, "grad_norm": 8.405808376306096, "learning_rate": 1.3052990356877444e-06, "loss": 0.1046, "step": 4325 }, { "epoch": 3.09, "grad_norm": 8.517500797990728, "learning_rate": 1.3033523539540394e-06, "loss": 0.1934, "step": 4326 }, { "epoch": 3.09, "grad_norm": 5.815632664792931, "learning_rate": 1.3014069073390206e-06, "loss": 0.1445, "step": 4327 }, { "epoch": 3.09, "grad_norm": 6.250647236307092, "learning_rate": 1.2994626964927042e-06, "loss": 0.1066, "step": 4328 }, { "epoch": 3.09, "grad_norm": 5.161414714395807, "learning_rate": 1.2975197220646807e-06, "loss": 0.1146, "step": 4329 }, { "epoch": 3.09, "grad_norm": 8.547923821989846, "learning_rate": 1.29557798470414e-06, "loss": 0.1351, "step": 4330 }, { "epoch": 3.09, "grad_norm": 5.445406438833862, "learning_rate": 1.293637485059847e-06, "loss": 0.1071, "step": 4331 }, { "epoch": 3.09, "grad_norm": 12.910301333668382, "learning_rate": 1.291698223780164e-06, "loss": 0.1795, "step": 4332 }, { "epoch": 3.09, "grad_norm": 5.415868085569591, "learning_rate": 1.2897602015130306e-06, "loss": 0.1145, "step": 4333 }, { "epoch": 3.09, "grad_norm": 7.580899357603486, "learning_rate": 1.287823418905977e-06, "loss": 0.1422, "step": 4334 }, { "epoch": 3.09, "grad_norm": 7.710105285949303, "learning_rate": 1.2858878766061178e-06, "loss": 0.1465, "step": 4335 }, { "epoch": 3.09, "grad_norm": 7.719436922598486, "learning_rate": 1.2839535752601551e-06, "loss": 0.1892, "step": 4336 }, { "epoch": 3.1, "grad_norm": 8.01755051568949, "learning_rate": 1.2820205155143738e-06, "loss": 0.1098, "step": 4337 }, { "epoch": 3.1, "grad_norm": 5.586621697986031, "learning_rate": 1.2800886980146453e-06, "loss": 0.1371, "step": 4338 }, { "epoch": 3.1, "grad_norm": 10.331064240612887, "learning_rate": 1.2781581234064256e-06, "loss": 0.1504, "step": 4339 }, { "epoch": 3.1, "grad_norm": 6.770807504782781, "learning_rate": 1.276228792334756e-06, "loss": 0.1121, "step": 4340 }, { "epoch": 3.1, "grad_norm": 6.086051023648102, "learning_rate": 1.274300705444262e-06, "loss": 0.1239, "step": 4341 }, { "epoch": 3.1, "grad_norm": 8.608147481251128, "learning_rate": 1.2723738633791538e-06, "loss": 0.2008, "step": 4342 }, { "epoch": 3.1, "grad_norm": 4.898067540734754, "learning_rate": 1.2704482667832218e-06, "loss": 0.1062, "step": 4343 }, { "epoch": 3.1, "grad_norm": 7.497203187118496, "learning_rate": 1.2685239162998485e-06, "loss": 0.1232, "step": 4344 }, { "epoch": 3.1, "grad_norm": 19.126104586121958, "learning_rate": 1.2666008125719904e-06, "loss": 0.1772, "step": 4345 }, { "epoch": 3.1, "grad_norm": 8.12284642374614, "learning_rate": 1.2646789562421975e-06, "loss": 0.1339, "step": 4346 }, { "epoch": 3.1, "grad_norm": 8.079010398172034, "learning_rate": 1.2627583479525913e-06, "loss": 0.1396, "step": 4347 }, { "epoch": 3.1, "grad_norm": 4.1620164187548765, "learning_rate": 1.2608389883448896e-06, "loss": 0.1212, "step": 4348 }, { "epoch": 3.1, "grad_norm": 7.859990849012603, "learning_rate": 1.2589208780603795e-06, "loss": 0.1313, "step": 4349 }, { "epoch": 3.1, "grad_norm": 7.106346065393317, "learning_rate": 1.2570040177399435e-06, "loss": 0.1424, "step": 4350 }, { "epoch": 3.11, "grad_norm": 5.06095769329329, "learning_rate": 1.255088408024036e-06, "loss": 0.1101, "step": 4351 }, { "epoch": 3.11, "grad_norm": 5.6470216837778135, "learning_rate": 1.2531740495526989e-06, "loss": 0.1184, "step": 4352 }, { "epoch": 3.11, "grad_norm": 10.158476286310746, "learning_rate": 1.2512609429655553e-06, "loss": 0.1348, "step": 4353 }, { "epoch": 3.11, "grad_norm": 7.516757987633382, "learning_rate": 1.249349088901809e-06, "loss": 0.1202, "step": 4354 }, { "epoch": 3.11, "grad_norm": 5.6383151746155065, "learning_rate": 1.247438488000247e-06, "loss": 0.0861, "step": 4355 }, { "epoch": 3.11, "grad_norm": 6.12153217876157, "learning_rate": 1.245529140899236e-06, "loss": 0.163, "step": 4356 }, { "epoch": 3.11, "grad_norm": 4.4984484754078276, "learning_rate": 1.2436210482367245e-06, "loss": 0.0786, "step": 4357 }, { "epoch": 3.11, "grad_norm": 7.595141893853033, "learning_rate": 1.2417142106502418e-06, "loss": 0.0952, "step": 4358 }, { "epoch": 3.11, "grad_norm": 7.398814994664079, "learning_rate": 1.2398086287768969e-06, "loss": 0.1138, "step": 4359 }, { "epoch": 3.11, "grad_norm": 7.727230258964825, "learning_rate": 1.237904303253381e-06, "loss": 0.1183, "step": 4360 }, { "epoch": 3.11, "grad_norm": 5.279257418876605, "learning_rate": 1.236001234715965e-06, "loss": 0.1007, "step": 4361 }, { "epoch": 3.11, "grad_norm": 6.402366707274822, "learning_rate": 1.2340994238004987e-06, "loss": 0.1294, "step": 4362 }, { "epoch": 3.11, "grad_norm": 5.776850831934081, "learning_rate": 1.2321988711424132e-06, "loss": 0.1176, "step": 4363 }, { "epoch": 3.11, "grad_norm": 6.872878484333298, "learning_rate": 1.2302995773767174e-06, "loss": 0.1392, "step": 4364 }, { "epoch": 3.12, "grad_norm": 10.12314710750376, "learning_rate": 1.2284015431380015e-06, "loss": 0.1311, "step": 4365 }, { "epoch": 3.12, "grad_norm": 8.469770250969509, "learning_rate": 1.2265047690604354e-06, "loss": 0.1562, "step": 4366 }, { "epoch": 3.12, "grad_norm": 7.966300776911517, "learning_rate": 1.2246092557777633e-06, "loss": 0.1357, "step": 4367 }, { "epoch": 3.12, "grad_norm": 7.398989033867051, "learning_rate": 1.2227150039233132e-06, "loss": 0.157, "step": 4368 }, { "epoch": 3.12, "grad_norm": 12.612083517699997, "learning_rate": 1.2208220141299893e-06, "loss": 0.1385, "step": 4369 }, { "epoch": 3.12, "grad_norm": 5.966421949471924, "learning_rate": 1.2189302870302755e-06, "loss": 0.1266, "step": 4370 }, { "epoch": 3.12, "grad_norm": 5.641265236099781, "learning_rate": 1.2170398232562324e-06, "loss": 0.1183, "step": 4371 }, { "epoch": 3.12, "grad_norm": 20.375633559108344, "learning_rate": 1.2151506234395e-06, "loss": 0.2162, "step": 4372 }, { "epoch": 3.12, "grad_norm": 10.58226033610902, "learning_rate": 1.2132626882112935e-06, "loss": 0.1302, "step": 4373 }, { "epoch": 3.12, "grad_norm": 9.180226888360846, "learning_rate": 1.211376018202408e-06, "loss": 0.1749, "step": 4374 }, { "epoch": 3.12, "grad_norm": 6.08472849567162, "learning_rate": 1.2094906140432155e-06, "loss": 0.1486, "step": 4375 }, { "epoch": 3.12, "grad_norm": 5.74121178158958, "learning_rate": 1.2076064763636641e-06, "loss": 0.0946, "step": 4376 }, { "epoch": 3.12, "grad_norm": 8.286183532915876, "learning_rate": 1.205723605793279e-06, "loss": 0.1323, "step": 4377 }, { "epoch": 3.12, "grad_norm": 6.530368951438912, "learning_rate": 1.2038420029611625e-06, "loss": 0.1406, "step": 4378 }, { "epoch": 3.13, "grad_norm": 9.283823696986877, "learning_rate": 1.2019616684959934e-06, "loss": 0.1354, "step": 4379 }, { "epoch": 3.13, "grad_norm": 5.768580237770132, "learning_rate": 1.2000826030260254e-06, "loss": 0.1118, "step": 4380 }, { "epoch": 3.13, "grad_norm": 7.393517881119877, "learning_rate": 1.1982048071790903e-06, "loss": 0.15, "step": 4381 }, { "epoch": 3.13, "grad_norm": 7.074275685681868, "learning_rate": 1.1963282815825938e-06, "loss": 0.135, "step": 4382 }, { "epoch": 3.13, "grad_norm": 5.8588043109415, "learning_rate": 1.194453026863519e-06, "loss": 0.0863, "step": 4383 }, { "epoch": 3.13, "grad_norm": 8.252158205024786, "learning_rate": 1.1925790436484219e-06, "loss": 0.1028, "step": 4384 }, { "epoch": 3.13, "grad_norm": 4.4442688489776465, "learning_rate": 1.1907063325634376e-06, "loss": 0.0718, "step": 4385 }, { "epoch": 3.13, "grad_norm": 6.31765034734141, "learning_rate": 1.1888348942342697e-06, "loss": 0.1121, "step": 4386 }, { "epoch": 3.13, "grad_norm": 8.438393240156737, "learning_rate": 1.1869647292862051e-06, "loss": 0.1555, "step": 4387 }, { "epoch": 3.13, "grad_norm": 11.366731513055292, "learning_rate": 1.1850958383440957e-06, "loss": 0.1699, "step": 4388 }, { "epoch": 3.13, "grad_norm": 5.652295308843194, "learning_rate": 1.183228222032378e-06, "loss": 0.1521, "step": 4389 }, { "epoch": 3.13, "grad_norm": 6.177913643706088, "learning_rate": 1.181361880975052e-06, "loss": 0.1331, "step": 4390 }, { "epoch": 3.13, "grad_norm": 5.520601566917642, "learning_rate": 1.1794968157957026e-06, "loss": 0.0956, "step": 4391 }, { "epoch": 3.13, "grad_norm": 5.836669023955266, "learning_rate": 1.1776330271174786e-06, "loss": 0.1168, "step": 4392 }, { "epoch": 3.14, "grad_norm": 5.603939921185731, "learning_rate": 1.1757705155631072e-06, "loss": 0.1146, "step": 4393 }, { "epoch": 3.14, "grad_norm": 6.98450834534631, "learning_rate": 1.1739092817548887e-06, "loss": 0.1356, "step": 4394 }, { "epoch": 3.14, "grad_norm": 5.048629847874484, "learning_rate": 1.172049326314696e-06, "loss": 0.1296, "step": 4395 }, { "epoch": 3.14, "grad_norm": 6.414733074091697, "learning_rate": 1.1701906498639741e-06, "loss": 0.0904, "step": 4396 }, { "epoch": 3.14, "grad_norm": 6.206320184441604, "learning_rate": 1.1683332530237423e-06, "loss": 0.0958, "step": 4397 }, { "epoch": 3.14, "grad_norm": 8.848301273909941, "learning_rate": 1.1664771364145905e-06, "loss": 0.182, "step": 4398 }, { "epoch": 3.14, "grad_norm": 18.28608439063244, "learning_rate": 1.1646223006566827e-06, "loss": 0.3428, "step": 4399 }, { "epoch": 3.14, "grad_norm": 5.770307322322705, "learning_rate": 1.162768746369753e-06, "loss": 0.1066, "step": 4400 }, { "epoch": 3.14, "grad_norm": 12.372434701004785, "learning_rate": 1.1609164741731105e-06, "loss": 0.2958, "step": 4401 }, { "epoch": 3.14, "grad_norm": 5.22763139963171, "learning_rate": 1.1590654846856291e-06, "loss": 0.1393, "step": 4402 }, { "epoch": 3.14, "grad_norm": 4.248723078530217, "learning_rate": 1.1572157785257643e-06, "loss": 0.0935, "step": 4403 }, { "epoch": 3.14, "grad_norm": 6.694964132694257, "learning_rate": 1.1553673563115325e-06, "loss": 0.1555, "step": 4404 }, { "epoch": 3.14, "grad_norm": 8.188557172066655, "learning_rate": 1.153520218660531e-06, "loss": 0.1984, "step": 4405 }, { "epoch": 3.14, "grad_norm": 6.541895242637127, "learning_rate": 1.1516743661899172e-06, "loss": 0.1124, "step": 4406 }, { "epoch": 3.15, "grad_norm": 10.548845679168716, "learning_rate": 1.1498297995164305e-06, "loss": 0.186, "step": 4407 }, { "epoch": 3.15, "grad_norm": 8.565033160611465, "learning_rate": 1.1479865192563683e-06, "loss": 0.1838, "step": 4408 }, { "epoch": 3.15, "grad_norm": 4.752373240136069, "learning_rate": 1.146144526025612e-06, "loss": 0.0936, "step": 4409 }, { "epoch": 3.15, "grad_norm": 7.942694309672477, "learning_rate": 1.1443038204396007e-06, "loss": 0.1306, "step": 4410 }, { "epoch": 3.15, "grad_norm": 6.89945116790705, "learning_rate": 1.1424644031133502e-06, "loss": 0.1313, "step": 4411 }, { "epoch": 3.15, "grad_norm": 9.26264775999036, "learning_rate": 1.1406262746614433e-06, "loss": 0.1857, "step": 4412 }, { "epoch": 3.15, "grad_norm": 5.4397623406990405, "learning_rate": 1.1387894356980334e-06, "loss": 0.1163, "step": 4413 }, { "epoch": 3.15, "grad_norm": 8.491939243996956, "learning_rate": 1.1369538868368424e-06, "loss": 0.1293, "step": 4414 }, { "epoch": 3.15, "grad_norm": 5.750098964300776, "learning_rate": 1.1351196286911615e-06, "loss": 0.1233, "step": 4415 }, { "epoch": 3.15, "grad_norm": 6.21545036806548, "learning_rate": 1.1332866618738498e-06, "loss": 0.1461, "step": 4416 }, { "epoch": 3.15, "grad_norm": 4.539810649852684, "learning_rate": 1.1314549869973363e-06, "loss": 0.1044, "step": 4417 }, { "epoch": 3.15, "grad_norm": 4.573431615715713, "learning_rate": 1.1296246046736176e-06, "loss": 0.0757, "step": 4418 }, { "epoch": 3.15, "grad_norm": 11.731198672608688, "learning_rate": 1.1277955155142578e-06, "loss": 0.1499, "step": 4419 }, { "epoch": 3.15, "grad_norm": 5.893278961301827, "learning_rate": 1.1259677201303905e-06, "loss": 0.1198, "step": 4420 }, { "epoch": 3.16, "grad_norm": 5.0550091247501125, "learning_rate": 1.1241412191327155e-06, "loss": 0.1017, "step": 4421 }, { "epoch": 3.16, "grad_norm": 6.282672874051473, "learning_rate": 1.1223160131315008e-06, "loss": 0.1163, "step": 4422 }, { "epoch": 3.16, "grad_norm": 12.271735935923063, "learning_rate": 1.1204921027365818e-06, "loss": 0.1686, "step": 4423 }, { "epoch": 3.16, "grad_norm": 6.60070124272832, "learning_rate": 1.1186694885573602e-06, "loss": 0.1394, "step": 4424 }, { "epoch": 3.16, "grad_norm": 6.119314793118176, "learning_rate": 1.1168481712028061e-06, "loss": 0.1188, "step": 4425 }, { "epoch": 3.16, "grad_norm": 5.845012516399945, "learning_rate": 1.115028151281457e-06, "loss": 0.0951, "step": 4426 }, { "epoch": 3.16, "grad_norm": 4.51100338272369, "learning_rate": 1.1132094294014106e-06, "loss": 0.0892, "step": 4427 }, { "epoch": 3.16, "grad_norm": 8.698616071989141, "learning_rate": 1.1113920061703416e-06, "loss": 0.1289, "step": 4428 }, { "epoch": 3.16, "grad_norm": 6.597902292498433, "learning_rate": 1.1095758821954788e-06, "loss": 0.1199, "step": 4429 }, { "epoch": 3.16, "grad_norm": 7.13322096070308, "learning_rate": 1.107761058083629e-06, "loss": 0.1552, "step": 4430 }, { "epoch": 3.16, "grad_norm": 5.3313785556010505, "learning_rate": 1.1059475344411535e-06, "loss": 0.0945, "step": 4431 }, { "epoch": 3.16, "grad_norm": 8.065704209387357, "learning_rate": 1.104135311873989e-06, "loss": 0.1471, "step": 4432 }, { "epoch": 3.16, "grad_norm": 7.85943646738916, "learning_rate": 1.1023243909876275e-06, "loss": 0.1545, "step": 4433 }, { "epoch": 3.16, "grad_norm": 5.066634102847706, "learning_rate": 1.1005147723871374e-06, "loss": 0.0896, "step": 4434 }, { "epoch": 3.17, "grad_norm": 7.8424079666970155, "learning_rate": 1.0987064566771405e-06, "loss": 0.1359, "step": 4435 }, { "epoch": 3.17, "grad_norm": 7.6459659701311695, "learning_rate": 1.0968994444618313e-06, "loss": 0.167, "step": 4436 }, { "epoch": 3.17, "grad_norm": 5.500227738741658, "learning_rate": 1.0950937363449659e-06, "loss": 0.1115, "step": 4437 }, { "epoch": 3.17, "grad_norm": 7.2102276882225445, "learning_rate": 1.0932893329298643e-06, "loss": 0.1222, "step": 4438 }, { "epoch": 3.17, "grad_norm": 7.847799303012751, "learning_rate": 1.0914862348194121e-06, "loss": 0.1068, "step": 4439 }, { "epoch": 3.17, "grad_norm": 6.851543694579042, "learning_rate": 1.0896844426160575e-06, "loss": 0.1298, "step": 4440 }, { "epoch": 3.17, "grad_norm": 6.889551647224606, "learning_rate": 1.0878839569218124e-06, "loss": 0.116, "step": 4441 }, { "epoch": 3.17, "grad_norm": 7.152322142099854, "learning_rate": 1.0860847783382534e-06, "loss": 0.1624, "step": 4442 }, { "epoch": 3.17, "grad_norm": 7.011318397121267, "learning_rate": 1.0842869074665186e-06, "loss": 0.1464, "step": 4443 }, { "epoch": 3.17, "grad_norm": 8.68536297533047, "learning_rate": 1.0824903449073115e-06, "loss": 0.2058, "step": 4444 }, { "epoch": 3.17, "grad_norm": 8.256267124703266, "learning_rate": 1.0806950912608937e-06, "loss": 0.1498, "step": 4445 }, { "epoch": 3.17, "grad_norm": 9.177033572690247, "learning_rate": 1.0789011471270983e-06, "loss": 0.1533, "step": 4446 }, { "epoch": 3.17, "grad_norm": 9.42342228155218, "learning_rate": 1.0771085131053087e-06, "loss": 0.1993, "step": 4447 }, { "epoch": 3.17, "grad_norm": 5.096230095880606, "learning_rate": 1.0753171897944835e-06, "loss": 0.1036, "step": 4448 }, { "epoch": 3.18, "grad_norm": 8.154504511502616, "learning_rate": 1.0735271777931322e-06, "loss": 0.1755, "step": 4449 }, { "epoch": 3.18, "grad_norm": 7.04583975692834, "learning_rate": 1.0717384776993356e-06, "loss": 0.2039, "step": 4450 }, { "epoch": 3.18, "grad_norm": 6.7824108907953224, "learning_rate": 1.069951090110728e-06, "loss": 0.1451, "step": 4451 }, { "epoch": 3.18, "grad_norm": 5.187754014764868, "learning_rate": 1.06816501562451e-06, "loss": 0.0834, "step": 4452 }, { "epoch": 3.18, "grad_norm": 7.278261677229582, "learning_rate": 1.0663802548374424e-06, "loss": 0.1149, "step": 4453 }, { "epoch": 3.18, "grad_norm": 4.340393440241648, "learning_rate": 1.064596808345847e-06, "loss": 0.1205, "step": 4454 }, { "epoch": 3.18, "grad_norm": 7.746103373611341, "learning_rate": 1.0628146767456066e-06, "loss": 0.166, "step": 4455 }, { "epoch": 3.18, "grad_norm": 6.3971573426174295, "learning_rate": 1.061033860632164e-06, "loss": 0.1436, "step": 4456 }, { "epoch": 3.18, "grad_norm": 9.242403232576306, "learning_rate": 1.0592543606005235e-06, "loss": 0.181, "step": 4457 }, { "epoch": 3.18, "grad_norm": 10.503336807115314, "learning_rate": 1.0574761772452486e-06, "loss": 0.2029, "step": 4458 }, { "epoch": 3.18, "grad_norm": 7.744097047628762, "learning_rate": 1.0556993111604635e-06, "loss": 0.1935, "step": 4459 }, { "epoch": 3.18, "grad_norm": 9.014041823353091, "learning_rate": 1.0539237629398536e-06, "loss": 0.1501, "step": 4460 }, { "epoch": 3.18, "grad_norm": 8.4496955488824, "learning_rate": 1.052149533176659e-06, "loss": 0.1421, "step": 4461 }, { "epoch": 3.18, "grad_norm": 6.407043789529695, "learning_rate": 1.050376622463688e-06, "loss": 0.1115, "step": 4462 }, { "epoch": 3.19, "grad_norm": 5.934429129123148, "learning_rate": 1.0486050313932972e-06, "loss": 0.1068, "step": 4463 }, { "epoch": 3.19, "grad_norm": 3.89642147430061, "learning_rate": 1.0468347605574137e-06, "loss": 0.0684, "step": 4464 }, { "epoch": 3.19, "grad_norm": 6.884801342876653, "learning_rate": 1.0450658105475126e-06, "loss": 0.1141, "step": 4465 }, { "epoch": 3.19, "grad_norm": 5.913543630708609, "learning_rate": 1.0432981819546384e-06, "loss": 0.1215, "step": 4466 }, { "epoch": 3.19, "grad_norm": 7.435283990174136, "learning_rate": 1.0415318753693837e-06, "loss": 0.1531, "step": 4467 }, { "epoch": 3.19, "grad_norm": 9.051636352276448, "learning_rate": 1.0397668913819086e-06, "loss": 0.1371, "step": 4468 }, { "epoch": 3.19, "grad_norm": 6.3128900776674755, "learning_rate": 1.0380032305819243e-06, "loss": 0.1064, "step": 4469 }, { "epoch": 3.19, "grad_norm": 8.142204214879126, "learning_rate": 1.0362408935587026e-06, "loss": 0.1516, "step": 4470 }, { "epoch": 3.19, "grad_norm": 7.661871841618407, "learning_rate": 1.0344798809010748e-06, "loss": 0.1476, "step": 4471 }, { "epoch": 3.19, "grad_norm": 11.682682233942165, "learning_rate": 1.0327201931974262e-06, "loss": 0.188, "step": 4472 }, { "epoch": 3.19, "grad_norm": 4.381237196793194, "learning_rate": 1.0309618310357023e-06, "loss": 0.1077, "step": 4473 }, { "epoch": 3.19, "grad_norm": 7.058241319504342, "learning_rate": 1.0292047950034046e-06, "loss": 0.1934, "step": 4474 }, { "epoch": 3.19, "grad_norm": 4.707890326892382, "learning_rate": 1.0274490856875908e-06, "loss": 0.0759, "step": 4475 }, { "epoch": 3.19, "grad_norm": 6.573092415383323, "learning_rate": 1.0256947036748766e-06, "loss": 0.1527, "step": 4476 }, { "epoch": 3.2, "grad_norm": 5.479401416008466, "learning_rate": 1.0239416495514331e-06, "loss": 0.124, "step": 4477 }, { "epoch": 3.2, "grad_norm": 4.404838785894077, "learning_rate": 1.0221899239029887e-06, "loss": 0.0833, "step": 4478 }, { "epoch": 3.2, "grad_norm": 5.3591410165394775, "learning_rate": 1.0204395273148277e-06, "loss": 0.0904, "step": 4479 }, { "epoch": 3.2, "grad_norm": 8.06517124521747, "learning_rate": 1.0186904603717894e-06, "loss": 0.1272, "step": 4480 }, { "epoch": 3.2, "grad_norm": 8.295034878534254, "learning_rate": 1.0169427236582702e-06, "loss": 0.1395, "step": 4481 }, { "epoch": 3.2, "grad_norm": 5.6017148138313075, "learning_rate": 1.0151963177582208e-06, "loss": 0.0892, "step": 4482 }, { "epoch": 3.2, "grad_norm": 5.168505372280764, "learning_rate": 1.0134512432551492e-06, "loss": 0.0732, "step": 4483 }, { "epoch": 3.2, "grad_norm": 4.844662307023592, "learning_rate": 1.0117075007321152e-06, "loss": 0.0717, "step": 4484 }, { "epoch": 3.2, "grad_norm": 5.47237204481135, "learning_rate": 1.009965090771739e-06, "loss": 0.1017, "step": 4485 }, { "epoch": 3.2, "grad_norm": 8.845533470002579, "learning_rate": 1.0082240139561866e-06, "loss": 0.1592, "step": 4486 }, { "epoch": 3.2, "grad_norm": 8.253239344807614, "learning_rate": 1.0064842708671908e-06, "loss": 0.141, "step": 4487 }, { "epoch": 3.2, "grad_norm": 6.3375223471928095, "learning_rate": 1.0047458620860251e-06, "loss": 0.1008, "step": 4488 }, { "epoch": 3.2, "grad_norm": 5.7321531585346435, "learning_rate": 1.0030087881935308e-06, "loss": 0.0897, "step": 4489 }, { "epoch": 3.2, "grad_norm": 7.816882454233154, "learning_rate": 1.0012730497700912e-06, "loss": 0.1459, "step": 4490 }, { "epoch": 3.21, "grad_norm": 11.259126205309581, "learning_rate": 9.995386473956531e-07, "loss": 0.1963, "step": 4491 }, { "epoch": 3.21, "grad_norm": 4.533563851322039, "learning_rate": 9.978055816497084e-07, "loss": 0.0829, "step": 4492 }, { "epoch": 3.21, "grad_norm": 8.880117960445354, "learning_rate": 9.960738531113118e-07, "loss": 0.1495, "step": 4493 }, { "epoch": 3.21, "grad_norm": 4.388909654201851, "learning_rate": 9.94343462359061e-07, "loss": 0.1064, "step": 4494 }, { "epoch": 3.21, "grad_norm": 5.589157030071463, "learning_rate": 9.926144099711138e-07, "loss": 0.1085, "step": 4495 }, { "epoch": 3.21, "grad_norm": 6.486877158713429, "learning_rate": 9.90886696525179e-07, "loss": 0.1301, "step": 4496 }, { "epoch": 3.21, "grad_norm": 9.916504210295873, "learning_rate": 9.89160322598517e-07, "loss": 0.1409, "step": 4497 }, { "epoch": 3.21, "grad_norm": 7.150060234978749, "learning_rate": 9.874352887679416e-07, "loss": 0.1315, "step": 4498 }, { "epoch": 3.21, "grad_norm": 7.447202722521027, "learning_rate": 9.857115956098196e-07, "loss": 0.1461, "step": 4499 }, { "epoch": 3.21, "grad_norm": 7.592805113175272, "learning_rate": 9.839892437000675e-07, "loss": 0.0893, "step": 4500 }, { "epoch": 3.21, "eval_avg_AUC": 0.7918247035704028, "eval_avg_Accuracy": 0.711414124668435, "eval_avg_Accuracy-right": 0.8560062605973653, "eval_avg_Accuracy-wrong": 0.45929042528997044, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6577722838853147, "eval_last_AUC": 0.8121226339774985, "eval_last_Accuracy": 0.7365716180371353, "eval_last_Accuracy-right": 0.7942480761706012, "eval_last_Accuracy-wrong": 0.6360018194223334, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6811064518145272, "eval_max_AUC": 0.766090631206006, "eval_max_Accuracy": 0.6466760610079576, "eval_max_Accuracy-right": 0.9701317334028955, "eval_max_Accuracy-wrong": 0.08267000227427791, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6136546806309608, "eval_min_AUC": 0.7982965177859538, "eval_min_Accuracy": 0.7182940981432361, "eval_min_Accuracy-right": 0.6972088170079562, "eval_min_Accuracy-wrong": 0.7550602683647942, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6616380620169989, "eval_prod_AUC": 0.7992793561684441, "eval_prod_Accuracy": 0.7075596816976127, "eval_prod_Accuracy-right": 0.6415155862788574, "eval_prod_Accuracy-wrong": 0.8227200363884467, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6610387644238402, "eval_runtime": 246.5009, "eval_samples_per_second": 97.882, "eval_steps_per_second": 3.059, "eval_sum_AUC": 0.6820948876089348, "eval_sum_Accuracy": 0.6409565649867374, "eval_sum_Accuracy-right": 0.9834355028042259, "eval_sum_Accuracy-wrong": 0.04377984989765749, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6492510686816373, "step": 4500 }, { "epoch": 3.21, "grad_norm": 6.017970370822423, "learning_rate": 9.822682336141558e-07, "loss": 0.1088, "step": 4501 }, { "epoch": 3.21, "grad_norm": 8.347395956475857, "learning_rate": 9.805485659271064e-07, "loss": 0.1423, "step": 4502 }, { "epoch": 3.21, "grad_norm": 6.147141262858037, "learning_rate": 9.788302412134931e-07, "loss": 0.1143, "step": 4503 }, { "epoch": 3.21, "grad_norm": 5.992507716204142, "learning_rate": 9.77113260047436e-07, "loss": 0.1105, "step": 4504 }, { "epoch": 3.22, "grad_norm": 5.896232384462128, "learning_rate": 9.753976230026158e-07, "loss": 0.1024, "step": 4505 }, { "epoch": 3.22, "grad_norm": 7.730396170031993, "learning_rate": 9.736833306522537e-07, "loss": 0.1816, "step": 4506 }, { "epoch": 3.22, "grad_norm": 9.87791454318401, "learning_rate": 9.719703835691314e-07, "loss": 0.1908, "step": 4507 }, { "epoch": 3.22, "grad_norm": 9.305401921388526, "learning_rate": 9.702587823255715e-07, "loss": 0.1743, "step": 4508 }, { "epoch": 3.22, "grad_norm": 12.725995171162081, "learning_rate": 9.685485274934576e-07, "loss": 0.2882, "step": 4509 }, { "epoch": 3.22, "grad_norm": 9.06907116647378, "learning_rate": 9.66839619644211e-07, "loss": 0.1427, "step": 4510 }, { "epoch": 3.22, "grad_norm": 5.154694219463353, "learning_rate": 9.651320593488162e-07, "loss": 0.1312, "step": 4511 }, { "epoch": 3.22, "grad_norm": 5.18461317267077, "learning_rate": 9.634258471777958e-07, "loss": 0.1302, "step": 4512 }, { "epoch": 3.22, "grad_norm": 8.304539402084123, "learning_rate": 9.617209837012287e-07, "loss": 0.129, "step": 4513 }, { "epoch": 3.22, "grad_norm": 7.210423547955145, "learning_rate": 9.600174694887421e-07, "loss": 0.1196, "step": 4514 }, { "epoch": 3.22, "grad_norm": 6.6024670811500705, "learning_rate": 9.583153051095107e-07, "loss": 0.113, "step": 4515 }, { "epoch": 3.22, "grad_norm": 7.713142607529561, "learning_rate": 9.5661449113226e-07, "loss": 0.1359, "step": 4516 }, { "epoch": 3.22, "grad_norm": 6.483198577305126, "learning_rate": 9.549150281252633e-07, "loss": 0.1104, "step": 4517 }, { "epoch": 3.22, "grad_norm": 7.5373377619248, "learning_rate": 9.532169166563426e-07, "loss": 0.1328, "step": 4518 }, { "epoch": 3.23, "grad_norm": 5.18316197627452, "learning_rate": 9.515201572928689e-07, "loss": 0.0995, "step": 4519 }, { "epoch": 3.23, "grad_norm": 8.12338317679702, "learning_rate": 9.49824750601761e-07, "loss": 0.1083, "step": 4520 }, { "epoch": 3.23, "grad_norm": 7.602592282544346, "learning_rate": 9.481306971494858e-07, "loss": 0.1244, "step": 4521 }, { "epoch": 3.23, "grad_norm": 6.777997231167417, "learning_rate": 9.464379975020576e-07, "loss": 0.1606, "step": 4522 }, { "epoch": 3.23, "grad_norm": 6.124881227801997, "learning_rate": 9.447466522250393e-07, "loss": 0.1326, "step": 4523 }, { "epoch": 3.23, "grad_norm": 5.658499339226998, "learning_rate": 9.430566618835407e-07, "loss": 0.1031, "step": 4524 }, { "epoch": 3.23, "grad_norm": 8.542957014977226, "learning_rate": 9.413680270422187e-07, "loss": 0.1439, "step": 4525 }, { "epoch": 3.23, "grad_norm": 8.15924583194008, "learning_rate": 9.396807482652775e-07, "loss": 0.1235, "step": 4526 }, { "epoch": 3.23, "grad_norm": 5.153176455458262, "learning_rate": 9.3799482611647e-07, "loss": 0.14, "step": 4527 }, { "epoch": 3.23, "grad_norm": 6.857510589303771, "learning_rate": 9.363102611590918e-07, "loss": 0.1151, "step": 4528 }, { "epoch": 3.23, "grad_norm": 6.841608683622035, "learning_rate": 9.346270539559882e-07, "loss": 0.1255, "step": 4529 }, { "epoch": 3.23, "grad_norm": 6.349487702065171, "learning_rate": 9.329452050695497e-07, "loss": 0.1427, "step": 4530 }, { "epoch": 3.23, "grad_norm": 5.185775397201693, "learning_rate": 9.312647150617144e-07, "loss": 0.1322, "step": 4531 }, { "epoch": 3.23, "grad_norm": 5.993422226179697, "learning_rate": 9.295855844939639e-07, "loss": 0.1133, "step": 4532 }, { "epoch": 3.24, "grad_norm": 6.877262751498541, "learning_rate": 9.279078139273279e-07, "loss": 0.1621, "step": 4533 }, { "epoch": 3.24, "grad_norm": 10.9101913835771, "learning_rate": 9.262314039223802e-07, "loss": 0.1537, "step": 4534 }, { "epoch": 3.24, "grad_norm": 8.089060660483195, "learning_rate": 9.245563550392406e-07, "loss": 0.1165, "step": 4535 }, { "epoch": 3.24, "grad_norm": 5.9168601004208625, "learning_rate": 9.22882667837574e-07, "loss": 0.1515, "step": 4536 }, { "epoch": 3.24, "grad_norm": 3.366227246368016, "learning_rate": 9.212103428765912e-07, "loss": 0.0607, "step": 4537 }, { "epoch": 3.24, "grad_norm": 6.537651822483652, "learning_rate": 9.19539380715046e-07, "loss": 0.1354, "step": 4538 }, { "epoch": 3.24, "grad_norm": 5.770155298405159, "learning_rate": 9.178697819112381e-07, "loss": 0.1273, "step": 4539 }, { "epoch": 3.24, "grad_norm": 9.281594784034297, "learning_rate": 9.162015470230123e-07, "loss": 0.1356, "step": 4540 }, { "epoch": 3.24, "grad_norm": 8.028631915325574, "learning_rate": 9.145346766077562e-07, "loss": 0.1834, "step": 4541 }, { "epoch": 3.24, "grad_norm": 4.045279042279176, "learning_rate": 9.128691712224025e-07, "loss": 0.1058, "step": 4542 }, { "epoch": 3.24, "grad_norm": 6.339980352393081, "learning_rate": 9.112050314234272e-07, "loss": 0.1326, "step": 4543 }, { "epoch": 3.24, "grad_norm": 3.1982368923211744, "learning_rate": 9.0954225776685e-07, "loss": 0.0649, "step": 4544 }, { "epoch": 3.24, "grad_norm": 5.253249493987398, "learning_rate": 9.078808508082354e-07, "loss": 0.0817, "step": 4545 }, { "epoch": 3.24, "grad_norm": 7.390707096657996, "learning_rate": 9.06220811102691e-07, "loss": 0.1368, "step": 4546 }, { "epoch": 3.25, "grad_norm": 15.032324120582194, "learning_rate": 9.045621392048637e-07, "loss": 0.2621, "step": 4547 }, { "epoch": 3.25, "grad_norm": 8.212400030385552, "learning_rate": 9.029048356689507e-07, "loss": 0.1085, "step": 4548 }, { "epoch": 3.25, "grad_norm": 7.560047001509493, "learning_rate": 9.012489010486835e-07, "loss": 0.1552, "step": 4549 }, { "epoch": 3.25, "grad_norm": 9.24670417792784, "learning_rate": 8.995943358973463e-07, "loss": 0.134, "step": 4550 }, { "epoch": 3.25, "grad_norm": 8.061594801279545, "learning_rate": 8.979411407677535e-07, "loss": 0.1624, "step": 4551 }, { "epoch": 3.25, "grad_norm": 7.841536214499788, "learning_rate": 8.962893162122749e-07, "loss": 0.1432, "step": 4552 }, { "epoch": 3.25, "grad_norm": 7.80332838186666, "learning_rate": 8.946388627828106e-07, "loss": 0.1351, "step": 4553 }, { "epoch": 3.25, "grad_norm": 5.577103709697863, "learning_rate": 8.929897810308102e-07, "loss": 0.1639, "step": 4554 }, { "epoch": 3.25, "grad_norm": 6.890676018724506, "learning_rate": 8.913420715072619e-07, "loss": 0.1366, "step": 4555 }, { "epoch": 3.25, "grad_norm": 7.615943828764504, "learning_rate": 8.896957347626966e-07, "loss": 0.1583, "step": 4556 }, { "epoch": 3.25, "grad_norm": 5.849886192759192, "learning_rate": 8.880507713471853e-07, "loss": 0.1052, "step": 4557 }, { "epoch": 3.25, "grad_norm": 6.021066135784146, "learning_rate": 8.864071818103415e-07, "loss": 0.1222, "step": 4558 }, { "epoch": 3.25, "grad_norm": 7.485990965887972, "learning_rate": 8.847649667013187e-07, "loss": 0.12, "step": 4559 }, { "epoch": 3.25, "grad_norm": 5.798179875468671, "learning_rate": 8.831241265688112e-07, "loss": 0.145, "step": 4560 }, { "epoch": 3.26, "grad_norm": 7.884685397021309, "learning_rate": 8.814846619610545e-07, "loss": 0.1251, "step": 4561 }, { "epoch": 3.26, "grad_norm": 9.08712853808379, "learning_rate": 8.79846573425826e-07, "loss": 0.1326, "step": 4562 }, { "epoch": 3.26, "grad_norm": 6.151088191408366, "learning_rate": 8.782098615104373e-07, "loss": 0.1555, "step": 4563 }, { "epoch": 3.26, "grad_norm": 7.585973402310389, "learning_rate": 8.765745267617487e-07, "loss": 0.1342, "step": 4564 }, { "epoch": 3.26, "grad_norm": 6.157889175348612, "learning_rate": 8.749405697261515e-07, "loss": 0.1348, "step": 4565 }, { "epoch": 3.26, "grad_norm": 6.384538264495767, "learning_rate": 8.733079909495868e-07, "loss": 0.1028, "step": 4566 }, { "epoch": 3.26, "grad_norm": 6.53422431942535, "learning_rate": 8.716767909775231e-07, "loss": 0.1035, "step": 4567 }, { "epoch": 3.26, "grad_norm": 5.424476627599804, "learning_rate": 8.700469703549802e-07, "loss": 0.0785, "step": 4568 }, { "epoch": 3.26, "grad_norm": 8.037680257428233, "learning_rate": 8.684185296265074e-07, "loss": 0.1572, "step": 4569 }, { "epoch": 3.26, "grad_norm": 6.581140538771417, "learning_rate": 8.667914693362006e-07, "loss": 0.1245, "step": 4570 }, { "epoch": 3.26, "grad_norm": 7.29417094424049, "learning_rate": 8.651657900276878e-07, "loss": 0.1288, "step": 4571 }, { "epoch": 3.26, "grad_norm": 5.612495200486463, "learning_rate": 8.635414922441398e-07, "loss": 0.1091, "step": 4572 }, { "epoch": 3.26, "grad_norm": 5.7330667780604205, "learning_rate": 8.61918576528265e-07, "loss": 0.147, "step": 4573 }, { "epoch": 3.26, "grad_norm": 7.977158518523487, "learning_rate": 8.60297043422309e-07, "loss": 0.1433, "step": 4574 }, { "epoch": 3.27, "grad_norm": 9.135644940015077, "learning_rate": 8.586768934680572e-07, "loss": 0.1904, "step": 4575 }, { "epoch": 3.27, "grad_norm": 6.314051493515507, "learning_rate": 8.570581272068307e-07, "loss": 0.0915, "step": 4576 }, { "epoch": 3.27, "grad_norm": 5.336134974937514, "learning_rate": 8.554407451794905e-07, "loss": 0.0985, "step": 4577 }, { "epoch": 3.27, "grad_norm": 7.694521064289034, "learning_rate": 8.538247479264327e-07, "loss": 0.1609, "step": 4578 }, { "epoch": 3.27, "grad_norm": 7.528554460208515, "learning_rate": 8.522101359875934e-07, "loss": 0.1183, "step": 4579 }, { "epoch": 3.27, "grad_norm": 9.875424450214489, "learning_rate": 8.505969099024436e-07, "loss": 0.1346, "step": 4580 }, { "epoch": 3.27, "grad_norm": 6.967351372296031, "learning_rate": 8.489850702099922e-07, "loss": 0.1478, "step": 4581 }, { "epoch": 3.27, "grad_norm": 5.9745850342812785, "learning_rate": 8.473746174487846e-07, "loss": 0.1324, "step": 4582 }, { "epoch": 3.27, "grad_norm": 9.59845328096796, "learning_rate": 8.457655521569036e-07, "loss": 0.1688, "step": 4583 }, { "epoch": 3.27, "grad_norm": 6.193901791217303, "learning_rate": 8.441578748719676e-07, "loss": 0.1298, "step": 4584 }, { "epoch": 3.27, "grad_norm": 5.411916620325072, "learning_rate": 8.425515861311312e-07, "loss": 0.1118, "step": 4585 }, { "epoch": 3.27, "grad_norm": 11.622734989607526, "learning_rate": 8.409466864710858e-07, "loss": 0.1835, "step": 4586 }, { "epoch": 3.27, "grad_norm": 7.586682013277674, "learning_rate": 8.393431764280591e-07, "loss": 0.123, "step": 4587 }, { "epoch": 3.27, "grad_norm": 4.8412719082832485, "learning_rate": 8.377410565378097e-07, "loss": 0.1078, "step": 4588 }, { "epoch": 3.28, "grad_norm": 8.79076733178493, "learning_rate": 8.361403273356411e-07, "loss": 0.1606, "step": 4589 }, { "epoch": 3.28, "grad_norm": 6.946585563110569, "learning_rate": 8.345409893563816e-07, "loss": 0.1257, "step": 4590 }, { "epoch": 3.28, "grad_norm": 11.064964839267628, "learning_rate": 8.329430431344043e-07, "loss": 0.174, "step": 4591 }, { "epoch": 3.28, "grad_norm": 6.590378379957648, "learning_rate": 8.313464892036083e-07, "loss": 0.0907, "step": 4592 }, { "epoch": 3.28, "grad_norm": 7.76238767588108, "learning_rate": 8.297513280974362e-07, "loss": 0.1501, "step": 4593 }, { "epoch": 3.28, "grad_norm": 6.591967597672873, "learning_rate": 8.281575603488573e-07, "loss": 0.1106, "step": 4594 }, { "epoch": 3.28, "grad_norm": 7.372299842456041, "learning_rate": 8.265651864903823e-07, "loss": 0.1545, "step": 4595 }, { "epoch": 3.28, "grad_norm": 7.960406328039799, "learning_rate": 8.249742070540506e-07, "loss": 0.1243, "step": 4596 }, { "epoch": 3.28, "grad_norm": 10.592265053976755, "learning_rate": 8.233846225714386e-07, "loss": 0.1826, "step": 4597 }, { "epoch": 3.28, "grad_norm": 8.888448431893492, "learning_rate": 8.217964335736556e-07, "loss": 0.1232, "step": 4598 }, { "epoch": 3.28, "grad_norm": 5.383633290511777, "learning_rate": 8.202096405913462e-07, "loss": 0.0842, "step": 4599 }, { "epoch": 3.28, "grad_norm": 7.216804849791828, "learning_rate": 8.186242441546866e-07, "loss": 0.0851, "step": 4600 }, { "epoch": 3.28, "grad_norm": 10.806207876507727, "learning_rate": 8.170402447933873e-07, "loss": 0.1648, "step": 4601 }, { "epoch": 3.28, "grad_norm": 6.586285027658275, "learning_rate": 8.154576430366922e-07, "loss": 0.0989, "step": 4602 }, { "epoch": 3.29, "grad_norm": 7.464315850177548, "learning_rate": 8.13876439413378e-07, "loss": 0.1607, "step": 4603 }, { "epoch": 3.29, "grad_norm": 11.060642250176604, "learning_rate": 8.122966344517536e-07, "loss": 0.1982, "step": 4604 }, { "epoch": 3.29, "grad_norm": 5.741654866635015, "learning_rate": 8.107182286796633e-07, "loss": 0.0873, "step": 4605 }, { "epoch": 3.29, "grad_norm": 7.245045678499837, "learning_rate": 8.091412226244771e-07, "loss": 0.1475, "step": 4606 }, { "epoch": 3.29, "grad_norm": 6.950428201411654, "learning_rate": 8.07565616813108e-07, "loss": 0.145, "step": 4607 }, { "epoch": 3.29, "grad_norm": 10.063829105184517, "learning_rate": 8.059914117719897e-07, "loss": 0.1516, "step": 4608 }, { "epoch": 3.29, "grad_norm": 7.472916859631381, "learning_rate": 8.044186080270983e-07, "loss": 0.1149, "step": 4609 }, { "epoch": 3.29, "grad_norm": 6.777826055951287, "learning_rate": 8.028472061039322e-07, "loss": 0.1243, "step": 4610 }, { "epoch": 3.29, "grad_norm": 10.824941387370806, "learning_rate": 8.012772065275304e-07, "loss": 0.1042, "step": 4611 }, { "epoch": 3.29, "grad_norm": 9.813588704435642, "learning_rate": 7.997086098224555e-07, "loss": 0.1234, "step": 4612 }, { "epoch": 3.29, "grad_norm": 9.105923319287843, "learning_rate": 7.981414165128065e-07, "loss": 0.1295, "step": 4613 }, { "epoch": 3.29, "grad_norm": 6.18452983325367, "learning_rate": 7.965756271222108e-07, "loss": 0.1241, "step": 4614 }, { "epoch": 3.29, "grad_norm": 5.555813455964336, "learning_rate": 7.950112421738282e-07, "loss": 0.101, "step": 4615 }, { "epoch": 3.29, "grad_norm": 7.138099022811065, "learning_rate": 7.934482621903494e-07, "loss": 0.153, "step": 4616 }, { "epoch": 3.3, "grad_norm": 5.029921247555546, "learning_rate": 7.91886687693994e-07, "loss": 0.103, "step": 4617 }, { "epoch": 3.3, "grad_norm": 5.58503608433331, "learning_rate": 7.903265192065141e-07, "loss": 0.0841, "step": 4618 }, { "epoch": 3.3, "grad_norm": 7.5716326952600115, "learning_rate": 7.887677572491903e-07, "loss": 0.1713, "step": 4619 }, { "epoch": 3.3, "grad_norm": 6.81055005735616, "learning_rate": 7.872104023428339e-07, "loss": 0.1244, "step": 4620 }, { "epoch": 3.3, "grad_norm": 4.568296952845966, "learning_rate": 7.856544550077883e-07, "loss": 0.088, "step": 4621 }, { "epoch": 3.3, "grad_norm": 6.472127459738777, "learning_rate": 7.840999157639195e-07, "loss": 0.1244, "step": 4622 }, { "epoch": 3.3, "grad_norm": 20.818327230629205, "learning_rate": 7.825467851306335e-07, "loss": 0.1603, "step": 4623 }, { "epoch": 3.3, "grad_norm": 7.986882546336097, "learning_rate": 7.809950636268554e-07, "loss": 0.2106, "step": 4624 }, { "epoch": 3.3, "grad_norm": 7.765351049579674, "learning_rate": 7.794447517710485e-07, "loss": 0.1481, "step": 4625 }, { "epoch": 3.3, "grad_norm": 13.522443356939034, "learning_rate": 7.778958500811961e-07, "loss": 0.1749, "step": 4626 }, { "epoch": 3.3, "grad_norm": 5.920109695380363, "learning_rate": 7.7634835907482e-07, "loss": 0.1223, "step": 4627 }, { "epoch": 3.3, "grad_norm": 7.115723186086778, "learning_rate": 7.748022792689613e-07, "loss": 0.0972, "step": 4628 }, { "epoch": 3.3, "grad_norm": 4.771496666945104, "learning_rate": 7.732576111801982e-07, "loss": 0.0928, "step": 4629 }, { "epoch": 3.3, "grad_norm": 11.926008165260582, "learning_rate": 7.717143553246298e-07, "loss": 0.1371, "step": 4630 }, { "epoch": 3.31, "grad_norm": 7.99102315789066, "learning_rate": 7.701725122178871e-07, "loss": 0.1481, "step": 4631 }, { "epoch": 3.31, "grad_norm": 12.122142585290227, "learning_rate": 7.686320823751298e-07, "loss": 0.146, "step": 4632 }, { "epoch": 3.31, "grad_norm": 8.408071273100916, "learning_rate": 7.670930663110426e-07, "loss": 0.1442, "step": 4633 }, { "epoch": 3.31, "grad_norm": 6.679326638442814, "learning_rate": 7.655554645398405e-07, "loss": 0.1382, "step": 4634 }, { "epoch": 3.31, "grad_norm": 25.188966590030155, "learning_rate": 7.640192775752647e-07, "loss": 0.129, "step": 4635 }, { "epoch": 3.31, "grad_norm": 7.961221286961985, "learning_rate": 7.624845059305836e-07, "loss": 0.1493, "step": 4636 }, { "epoch": 3.31, "grad_norm": 9.194206304233527, "learning_rate": 7.609511501185929e-07, "loss": 0.139, "step": 4637 }, { "epoch": 3.31, "grad_norm": 6.073454340195965, "learning_rate": 7.594192106516151e-07, "loss": 0.1255, "step": 4638 }, { "epoch": 3.31, "grad_norm": 6.415599395063826, "learning_rate": 7.578886880414999e-07, "loss": 0.1212, "step": 4639 }, { "epoch": 3.31, "grad_norm": 5.17906997606143, "learning_rate": 7.563595827996235e-07, "loss": 0.1142, "step": 4640 }, { "epoch": 3.31, "grad_norm": 9.769899102967289, "learning_rate": 7.548318954368883e-07, "loss": 0.1178, "step": 4641 }, { "epoch": 3.31, "grad_norm": 6.54723718176854, "learning_rate": 7.533056264637228e-07, "loss": 0.1173, "step": 4642 }, { "epoch": 3.31, "grad_norm": 5.078416054162937, "learning_rate": 7.51780776390082e-07, "loss": 0.1028, "step": 4643 }, { "epoch": 3.31, "grad_norm": 10.329231158209321, "learning_rate": 7.50257345725447e-07, "loss": 0.1575, "step": 4644 }, { "epoch": 3.32, "grad_norm": 5.934461236117057, "learning_rate": 7.487353349788234e-07, "loss": 0.1179, "step": 4645 }, { "epoch": 3.32, "grad_norm": 11.00570511198538, "learning_rate": 7.472147446587452e-07, "loss": 0.1691, "step": 4646 }, { "epoch": 3.32, "grad_norm": 6.461797158871515, "learning_rate": 7.456955752732659e-07, "loss": 0.1371, "step": 4647 }, { "epoch": 3.32, "grad_norm": 5.756284807242618, "learning_rate": 7.441778273299738e-07, "loss": 0.0889, "step": 4648 }, { "epoch": 3.32, "grad_norm": 7.33097177974351, "learning_rate": 7.426615013359706e-07, "loss": 0.1659, "step": 4649 }, { "epoch": 3.32, "grad_norm": 5.974735983581375, "learning_rate": 7.411465977978949e-07, "loss": 0.1, "step": 4650 }, { "epoch": 3.32, "grad_norm": 11.534506781082133, "learning_rate": 7.396331172218996e-07, "loss": 0.1552, "step": 4651 }, { "epoch": 3.32, "grad_norm": 5.406433437663972, "learning_rate": 7.381210601136702e-07, "loss": 0.1144, "step": 4652 }, { "epoch": 3.32, "grad_norm": 5.408309239317868, "learning_rate": 7.366104269784086e-07, "loss": 0.1577, "step": 4653 }, { "epoch": 3.32, "grad_norm": 11.29949891157189, "learning_rate": 7.351012183208511e-07, "loss": 0.2051, "step": 4654 }, { "epoch": 3.32, "grad_norm": 7.593810512608232, "learning_rate": 7.335934346452484e-07, "loss": 0.1118, "step": 4655 }, { "epoch": 3.32, "grad_norm": 7.302955662252433, "learning_rate": 7.320870764553795e-07, "loss": 0.1782, "step": 4656 }, { "epoch": 3.32, "grad_norm": 8.661085471989326, "learning_rate": 7.305821442545474e-07, "loss": 0.1105, "step": 4657 }, { "epoch": 3.32, "grad_norm": 6.860609598218582, "learning_rate": 7.290786385455778e-07, "loss": 0.1268, "step": 4658 }, { "epoch": 3.33, "grad_norm": 9.255949888842219, "learning_rate": 7.275765598308199e-07, "loss": 0.1941, "step": 4659 }, { "epoch": 3.33, "grad_norm": 8.14943801715935, "learning_rate": 7.26075908612146e-07, "loss": 0.1467, "step": 4660 }, { "epoch": 3.33, "grad_norm": 8.30112212336164, "learning_rate": 7.245766853909519e-07, "loss": 0.1393, "step": 4661 }, { "epoch": 3.33, "grad_norm": 6.7267328951619785, "learning_rate": 7.230788906681558e-07, "loss": 0.1006, "step": 4662 }, { "epoch": 3.33, "grad_norm": 9.21055385244117, "learning_rate": 7.215825249441982e-07, "loss": 0.1885, "step": 4663 }, { "epoch": 3.33, "grad_norm": 12.814543386192565, "learning_rate": 7.200875887190445e-07, "loss": 0.1476, "step": 4664 }, { "epoch": 3.33, "grad_norm": 9.004947303321043, "learning_rate": 7.185940824921772e-07, "loss": 0.1779, "step": 4665 }, { "epoch": 3.33, "grad_norm": 7.554539232706164, "learning_rate": 7.171020067626089e-07, "loss": 0.1388, "step": 4666 }, { "epoch": 3.33, "grad_norm": 8.517269227462895, "learning_rate": 7.156113620288646e-07, "loss": 0.2025, "step": 4667 }, { "epoch": 3.33, "grad_norm": 6.16280886352939, "learning_rate": 7.141221487890027e-07, "loss": 0.1512, "step": 4668 }, { "epoch": 3.33, "grad_norm": 6.039791042262847, "learning_rate": 7.126343675405905e-07, "loss": 0.084, "step": 4669 }, { "epoch": 3.33, "grad_norm": 7.185117749237396, "learning_rate": 7.111480187807296e-07, "loss": 0.1752, "step": 4670 }, { "epoch": 3.33, "grad_norm": 4.745690408307122, "learning_rate": 7.096631030060308e-07, "loss": 0.0933, "step": 4671 }, { "epoch": 3.33, "grad_norm": 3.9267132250498196, "learning_rate": 7.081796207126373e-07, "loss": 0.0792, "step": 4672 }, { "epoch": 3.34, "grad_norm": 3.914207317636215, "learning_rate": 7.06697572396205e-07, "loss": 0.0667, "step": 4673 }, { "epoch": 3.34, "grad_norm": 5.048328757763309, "learning_rate": 7.052169585519142e-07, "loss": 0.0873, "step": 4674 }, { "epoch": 3.34, "grad_norm": 6.948918663582835, "learning_rate": 7.037377796744666e-07, "loss": 0.1484, "step": 4675 }, { "epoch": 3.34, "grad_norm": 5.111718013258758, "learning_rate": 7.022600362580817e-07, "loss": 0.1198, "step": 4676 }, { "epoch": 3.34, "grad_norm": 4.507791776880594, "learning_rate": 7.007837287965024e-07, "loss": 0.1039, "step": 4677 }, { "epoch": 3.34, "grad_norm": 7.7886653073378005, "learning_rate": 6.993088577829904e-07, "loss": 0.1141, "step": 4678 }, { "epoch": 3.34, "grad_norm": 11.19723147804638, "learning_rate": 6.978354237103264e-07, "loss": 0.1788, "step": 4679 }, { "epoch": 3.34, "grad_norm": 5.665500446115932, "learning_rate": 6.963634270708137e-07, "loss": 0.1144, "step": 4680 }, { "epoch": 3.34, "grad_norm": 4.1211146195001245, "learning_rate": 6.948928683562722e-07, "loss": 0.0544, "step": 4681 }, { "epoch": 3.34, "grad_norm": 5.297345133083735, "learning_rate": 6.934237480580435e-07, "loss": 0.0996, "step": 4682 }, { "epoch": 3.34, "grad_norm": 6.441164589598402, "learning_rate": 6.919560666669889e-07, "loss": 0.1211, "step": 4683 }, { "epoch": 3.34, "grad_norm": 7.607568415682107, "learning_rate": 6.904898246734864e-07, "loss": 0.1151, "step": 4684 }, { "epoch": 3.34, "grad_norm": 7.063782633192273, "learning_rate": 6.890250225674361e-07, "loss": 0.134, "step": 4685 }, { "epoch": 3.34, "grad_norm": 7.207873640293048, "learning_rate": 6.875616608382562e-07, "loss": 0.153, "step": 4686 }, { "epoch": 3.35, "grad_norm": 6.1416823543894346, "learning_rate": 6.860997399748792e-07, "loss": 0.1389, "step": 4687 }, { "epoch": 3.35, "grad_norm": 4.1354245919128685, "learning_rate": 6.846392604657653e-07, "loss": 0.0589, "step": 4688 }, { "epoch": 3.35, "grad_norm": 5.418657085737841, "learning_rate": 6.831802227988843e-07, "loss": 0.0791, "step": 4689 }, { "epoch": 3.35, "grad_norm": 5.212588473079516, "learning_rate": 6.817226274617283e-07, "loss": 0.0958, "step": 4690 }, { "epoch": 3.35, "grad_norm": 5.918837142437118, "learning_rate": 6.802664749413079e-07, "loss": 0.1281, "step": 4691 }, { "epoch": 3.35, "grad_norm": 8.434507021495676, "learning_rate": 6.788117657241506e-07, "loss": 0.1213, "step": 4692 }, { "epoch": 3.35, "grad_norm": 7.487469942524017, "learning_rate": 6.773585002963007e-07, "loss": 0.1587, "step": 4693 }, { "epoch": 3.35, "grad_norm": 7.914773574255319, "learning_rate": 6.759066791433228e-07, "loss": 0.1387, "step": 4694 }, { "epoch": 3.35, "grad_norm": 9.230097938274037, "learning_rate": 6.744563027502959e-07, "loss": 0.139, "step": 4695 }, { "epoch": 3.35, "grad_norm": 7.406132054228652, "learning_rate": 6.730073716018187e-07, "loss": 0.1562, "step": 4696 }, { "epoch": 3.35, "grad_norm": 6.54202900315914, "learning_rate": 6.715598861820055e-07, "loss": 0.1226, "step": 4697 }, { "epoch": 3.35, "grad_norm": 5.922145967208217, "learning_rate": 6.701138469744883e-07, "loss": 0.111, "step": 4698 }, { "epoch": 3.35, "grad_norm": 6.397305802789072, "learning_rate": 6.686692544624157e-07, "loss": 0.1105, "step": 4699 }, { "epoch": 3.35, "grad_norm": 6.660593338022077, "learning_rate": 6.672261091284526e-07, "loss": 0.1178, "step": 4700 }, { "epoch": 3.36, "grad_norm": 8.2651428920971, "learning_rate": 6.657844114547812e-07, "loss": 0.1681, "step": 4701 }, { "epoch": 3.36, "grad_norm": 5.319009621262608, "learning_rate": 6.643441619230989e-07, "loss": 0.094, "step": 4702 }, { "epoch": 3.36, "grad_norm": 6.571037009481898, "learning_rate": 6.629053610146202e-07, "loss": 0.1138, "step": 4703 }, { "epoch": 3.36, "grad_norm": 7.379231512952576, "learning_rate": 6.61468009210075e-07, "loss": 0.1219, "step": 4704 }, { "epoch": 3.36, "grad_norm": 6.991505241069526, "learning_rate": 6.600321069897097e-07, "loss": 0.1466, "step": 4705 }, { "epoch": 3.36, "grad_norm": 11.1500655678584, "learning_rate": 6.585976548332856e-07, "loss": 0.2023, "step": 4706 }, { "epoch": 3.36, "grad_norm": 7.836974094823008, "learning_rate": 6.571646532200815e-07, "loss": 0.1, "step": 4707 }, { "epoch": 3.36, "grad_norm": 3.2192067894980907, "learning_rate": 6.557331026288855e-07, "loss": 0.0592, "step": 4708 }, { "epoch": 3.36, "grad_norm": 4.990968911316154, "learning_rate": 6.543030035380099e-07, "loss": 0.1028, "step": 4709 }, { "epoch": 3.36, "grad_norm": 5.338984666872942, "learning_rate": 6.528743564252737e-07, "loss": 0.1162, "step": 4710 }, { "epoch": 3.36, "grad_norm": 5.645866648570372, "learning_rate": 6.514471617680184e-07, "loss": 0.1099, "step": 4711 }, { "epoch": 3.36, "grad_norm": 4.5825772199128085, "learning_rate": 6.500214200430921e-07, "loss": 0.1029, "step": 4712 }, { "epoch": 3.36, "grad_norm": 5.144375515006759, "learning_rate": 6.485971317268658e-07, "loss": 0.1123, "step": 4713 }, { "epoch": 3.36, "grad_norm": 5.7099658296260705, "learning_rate": 6.471742972952172e-07, "loss": 0.1021, "step": 4714 }, { "epoch": 3.37, "grad_norm": 6.454486821044744, "learning_rate": 6.457529172235427e-07, "loss": 0.1027, "step": 4715 }, { "epoch": 3.37, "grad_norm": 5.6424283925645256, "learning_rate": 6.44332991986753e-07, "loss": 0.1061, "step": 4716 }, { "epoch": 3.37, "grad_norm": 5.0353165182692345, "learning_rate": 6.429145220592703e-07, "loss": 0.1191, "step": 4717 }, { "epoch": 3.37, "grad_norm": 7.423744958784793, "learning_rate": 6.414975079150321e-07, "loss": 0.1349, "step": 4718 }, { "epoch": 3.37, "grad_norm": 7.998700076551767, "learning_rate": 6.400819500274891e-07, "loss": 0.1064, "step": 4719 }, { "epoch": 3.37, "grad_norm": 6.338283445411675, "learning_rate": 6.386678488696057e-07, "loss": 0.1465, "step": 4720 }, { "epoch": 3.37, "grad_norm": 7.025518446998439, "learning_rate": 6.372552049138591e-07, "loss": 0.1173, "step": 4721 }, { "epoch": 3.37, "grad_norm": 6.876253982615516, "learning_rate": 6.358440186322401e-07, "loss": 0.092, "step": 4722 }, { "epoch": 3.37, "grad_norm": 8.410474264470912, "learning_rate": 6.344342904962536e-07, "loss": 0.0841, "step": 4723 }, { "epoch": 3.37, "grad_norm": 5.331845239953185, "learning_rate": 6.330260209769124e-07, "loss": 0.1038, "step": 4724 }, { "epoch": 3.37, "grad_norm": 7.044163364551721, "learning_rate": 6.316192105447499e-07, "loss": 0.1189, "step": 4725 }, { "epoch": 3.37, "grad_norm": 4.123656229134195, "learning_rate": 6.302138596698032e-07, "loss": 0.0798, "step": 4726 }, { "epoch": 3.37, "grad_norm": 7.533338908312375, "learning_rate": 6.288099688216309e-07, "loss": 0.1392, "step": 4727 }, { "epoch": 3.37, "grad_norm": 7.886494343160055, "learning_rate": 6.27407538469294e-07, "loss": 0.1273, "step": 4728 }, { "epoch": 3.38, "grad_norm": 7.451374710261631, "learning_rate": 6.260065690813754e-07, "loss": 0.163, "step": 4729 }, { "epoch": 3.38, "grad_norm": 5.280301091354748, "learning_rate": 6.246070611259603e-07, "loss": 0.0771, "step": 4730 }, { "epoch": 3.38, "grad_norm": 6.950197928743349, "learning_rate": 6.232090150706555e-07, "loss": 0.1183, "step": 4731 }, { "epoch": 3.38, "grad_norm": 7.311848775780406, "learning_rate": 6.218124313825696e-07, "loss": 0.1131, "step": 4732 }, { "epoch": 3.38, "grad_norm": 6.961362394740585, "learning_rate": 6.204173105283295e-07, "loss": 0.1559, "step": 4733 }, { "epoch": 3.38, "grad_norm": 6.77422676116153, "learning_rate": 6.190236529740701e-07, "loss": 0.1699, "step": 4734 }, { "epoch": 3.38, "grad_norm": 8.929780803606354, "learning_rate": 6.176314591854388e-07, "loss": 0.168, "step": 4735 }, { "epoch": 3.38, "grad_norm": 6.220542108817106, "learning_rate": 6.162407296275936e-07, "loss": 0.1192, "step": 4736 }, { "epoch": 3.38, "grad_norm": 20.56705499979564, "learning_rate": 6.148514647652026e-07, "loss": 0.149, "step": 4737 }, { "epoch": 3.38, "grad_norm": 9.716608379449566, "learning_rate": 6.134636650624448e-07, "loss": 0.1365, "step": 4738 }, { "epoch": 3.38, "grad_norm": 5.082791614875223, "learning_rate": 6.120773309830108e-07, "loss": 0.1092, "step": 4739 }, { "epoch": 3.38, "grad_norm": 4.58955029935693, "learning_rate": 6.106924629900996e-07, "loss": 0.1178, "step": 4740 }, { "epoch": 3.38, "grad_norm": 7.309633988207233, "learning_rate": 6.09309061546422e-07, "loss": 0.0879, "step": 4741 }, { "epoch": 3.38, "grad_norm": 8.559534602189101, "learning_rate": 6.079271271141979e-07, "loss": 0.1665, "step": 4742 }, { "epoch": 3.39, "grad_norm": 6.906477266849566, "learning_rate": 6.065466601551578e-07, "loss": 0.1103, "step": 4743 }, { "epoch": 3.39, "grad_norm": 9.231629348906024, "learning_rate": 6.051676611305401e-07, "loss": 0.1409, "step": 4744 }, { "epoch": 3.39, "grad_norm": 5.120806072520015, "learning_rate": 6.037901305010951e-07, "loss": 0.1036, "step": 4745 }, { "epoch": 3.39, "grad_norm": 7.357155083012127, "learning_rate": 6.024140687270813e-07, "loss": 0.1104, "step": 4746 }, { "epoch": 3.39, "grad_norm": 5.399708029837064, "learning_rate": 6.010394762682659e-07, "loss": 0.0915, "step": 4747 }, { "epoch": 3.39, "grad_norm": 5.7543160484824085, "learning_rate": 5.996663535839275e-07, "loss": 0.0948, "step": 4748 }, { "epoch": 3.39, "grad_norm": 10.87274696278645, "learning_rate": 5.982947011328489e-07, "loss": 0.1655, "step": 4749 }, { "epoch": 3.39, "grad_norm": 6.79729828235239, "learning_rate": 5.969245193733275e-07, "loss": 0.0967, "step": 4750 }, { "epoch": 3.39, "grad_norm": 7.446921297838255, "learning_rate": 5.955558087631641e-07, "loss": 0.1556, "step": 4751 }, { "epoch": 3.39, "grad_norm": 6.479713414080842, "learning_rate": 5.941885697596734e-07, "loss": 0.1226, "step": 4752 }, { "epoch": 3.39, "grad_norm": 8.37872487494799, "learning_rate": 5.928228028196714e-07, "loss": 0.1355, "step": 4753 }, { "epoch": 3.39, "grad_norm": 7.936869826658227, "learning_rate": 5.914585083994906e-07, "loss": 0.1339, "step": 4754 }, { "epoch": 3.39, "grad_norm": 6.964322624359045, "learning_rate": 5.900956869549629e-07, "loss": 0.1367, "step": 4755 }, { "epoch": 3.39, "grad_norm": 7.650251277890639, "learning_rate": 5.887343389414363e-07, "loss": 0.1934, "step": 4756 }, { "epoch": 3.4, "grad_norm": 9.021005113566153, "learning_rate": 5.873744648137592e-07, "loss": 0.1357, "step": 4757 }, { "epoch": 3.4, "grad_norm": 11.186719406066896, "learning_rate": 5.860160650262925e-07, "loss": 0.1373, "step": 4758 }, { "epoch": 3.4, "grad_norm": 3.9630385462474855, "learning_rate": 5.846591400329021e-07, "loss": 0.0551, "step": 4759 }, { "epoch": 3.4, "grad_norm": 7.011343925108888, "learning_rate": 5.833036902869626e-07, "loss": 0.0994, "step": 4760 }, { "epoch": 3.4, "grad_norm": 8.396161662969257, "learning_rate": 5.81949716241354e-07, "loss": 0.1549, "step": 4761 }, { "epoch": 3.4, "grad_norm": 7.423724973182227, "learning_rate": 5.805972183484654e-07, "loss": 0.1567, "step": 4762 }, { "epoch": 3.4, "grad_norm": 6.139312833169541, "learning_rate": 5.792461970601903e-07, "loss": 0.1575, "step": 4763 }, { "epoch": 3.4, "grad_norm": 5.275973947423691, "learning_rate": 5.778966528279306e-07, "loss": 0.0974, "step": 4764 }, { "epoch": 3.4, "grad_norm": 6.929297521573184, "learning_rate": 5.765485861025944e-07, "loss": 0.1428, "step": 4765 }, { "epoch": 3.4, "grad_norm": 9.094874672404059, "learning_rate": 5.752019973345963e-07, "loss": 0.1667, "step": 4766 }, { "epoch": 3.4, "grad_norm": 5.088141421563402, "learning_rate": 5.738568869738537e-07, "loss": 0.1113, "step": 4767 }, { "epoch": 3.4, "grad_norm": 6.680419138288556, "learning_rate": 5.725132554697971e-07, "loss": 0.1432, "step": 4768 }, { "epoch": 3.4, "grad_norm": 5.9210508941282285, "learning_rate": 5.711711032713547e-07, "loss": 0.1071, "step": 4769 }, { "epoch": 3.4, "grad_norm": 7.310725061193263, "learning_rate": 5.698304308269686e-07, "loss": 0.1504, "step": 4770 }, { "epoch": 3.41, "grad_norm": 6.741656309899917, "learning_rate": 5.684912385845786e-07, "loss": 0.1337, "step": 4771 }, { "epoch": 3.41, "grad_norm": 8.428815406923771, "learning_rate": 5.671535269916373e-07, "loss": 0.1229, "step": 4772 }, { "epoch": 3.41, "grad_norm": 6.190650489682488, "learning_rate": 5.658172964950953e-07, "loss": 0.1238, "step": 4773 }, { "epoch": 3.41, "grad_norm": 4.782034891009411, "learning_rate": 5.644825475414162e-07, "loss": 0.0731, "step": 4774 }, { "epoch": 3.41, "grad_norm": 6.444261645843906, "learning_rate": 5.631492805765609e-07, "loss": 0.1149, "step": 4775 }, { "epoch": 3.41, "grad_norm": 6.550284656212252, "learning_rate": 5.618174960459999e-07, "loss": 0.1343, "step": 4776 }, { "epoch": 3.41, "grad_norm": 6.641197272560726, "learning_rate": 5.604871943947071e-07, "loss": 0.1244, "step": 4777 }, { "epoch": 3.41, "grad_norm": 16.192125850606384, "learning_rate": 5.591583760671609e-07, "loss": 0.175, "step": 4778 }, { "epoch": 3.41, "grad_norm": 4.770239360221412, "learning_rate": 5.578310415073451e-07, "loss": 0.0812, "step": 4779 }, { "epoch": 3.41, "grad_norm": 9.519992706791493, "learning_rate": 5.565051911587455e-07, "loss": 0.156, "step": 4780 }, { "epoch": 3.41, "grad_norm": 6.899766338731636, "learning_rate": 5.551808254643543e-07, "loss": 0.1325, "step": 4781 }, { "epoch": 3.41, "grad_norm": 5.377638893489806, "learning_rate": 5.538579448666675e-07, "loss": 0.1172, "step": 4782 }, { "epoch": 3.41, "grad_norm": 5.657629714932012, "learning_rate": 5.525365498076807e-07, "loss": 0.0878, "step": 4783 }, { "epoch": 3.41, "grad_norm": 6.999669463350998, "learning_rate": 5.51216640728901e-07, "loss": 0.1361, "step": 4784 }, { "epoch": 3.42, "grad_norm": 7.528022836641468, "learning_rate": 5.498982180713308e-07, "loss": 0.161, "step": 4785 }, { "epoch": 3.42, "grad_norm": 13.981093221771234, "learning_rate": 5.485812822754826e-07, "loss": 0.2415, "step": 4786 }, { "epoch": 3.42, "grad_norm": 6.204609953884755, "learning_rate": 5.472658337813664e-07, "loss": 0.0935, "step": 4787 }, { "epoch": 3.42, "grad_norm": 9.851833696225833, "learning_rate": 5.459518730285007e-07, "loss": 0.1015, "step": 4788 }, { "epoch": 3.42, "grad_norm": 8.172761495859053, "learning_rate": 5.446394004559008e-07, "loss": 0.1106, "step": 4789 }, { "epoch": 3.42, "grad_norm": 4.069756351444801, "learning_rate": 5.43328416502093e-07, "loss": 0.0833, "step": 4790 }, { "epoch": 3.42, "grad_norm": 7.754921719414479, "learning_rate": 5.420189216050969e-07, "loss": 0.1276, "step": 4791 }, { "epoch": 3.42, "grad_norm": 6.097853307500944, "learning_rate": 5.407109162024409e-07, "loss": 0.1288, "step": 4792 }, { "epoch": 3.42, "grad_norm": 7.6596367143990065, "learning_rate": 5.394044007311544e-07, "loss": 0.1259, "step": 4793 }, { "epoch": 3.42, "grad_norm": 9.109859513949639, "learning_rate": 5.380993756277675e-07, "loss": 0.1785, "step": 4794 }, { "epoch": 3.42, "grad_norm": 5.758617786067658, "learning_rate": 5.367958413283141e-07, "loss": 0.1019, "step": 4795 }, { "epoch": 3.42, "grad_norm": 10.073887412643687, "learning_rate": 5.354937982683283e-07, "loss": 0.1434, "step": 4796 }, { "epoch": 3.42, "grad_norm": 20.106138254105133, "learning_rate": 5.341932468828481e-07, "loss": 0.0942, "step": 4797 }, { "epoch": 3.42, "grad_norm": 7.547332850402219, "learning_rate": 5.328941876064114e-07, "loss": 0.1196, "step": 4798 }, { "epoch": 3.43, "grad_norm": 9.800610244619884, "learning_rate": 5.315966208730578e-07, "loss": 0.1415, "step": 4799 }, { "epoch": 3.43, "grad_norm": 5.735215749717641, "learning_rate": 5.30300547116328e-07, "loss": 0.1469, "step": 4800 }, { "epoch": 3.43, "grad_norm": 6.803423115653081, "learning_rate": 5.290059667692655e-07, "loss": 0.1031, "step": 4801 }, { "epoch": 3.43, "grad_norm": 7.114085098454165, "learning_rate": 5.277128802644133e-07, "loss": 0.1191, "step": 4802 }, { "epoch": 3.43, "grad_norm": 10.514855529211154, "learning_rate": 5.264212880338154e-07, "loss": 0.1995, "step": 4803 }, { "epoch": 3.43, "grad_norm": 7.725958167463443, "learning_rate": 5.251311905090167e-07, "loss": 0.0878, "step": 4804 }, { "epoch": 3.43, "grad_norm": 6.131421273398189, "learning_rate": 5.238425881210624e-07, "loss": 0.0955, "step": 4805 }, { "epoch": 3.43, "grad_norm": 9.949154525113872, "learning_rate": 5.225554813004996e-07, "loss": 0.1548, "step": 4806 }, { "epoch": 3.43, "grad_norm": 8.331347956438186, "learning_rate": 5.21269870477375e-07, "loss": 0.1467, "step": 4807 }, { "epoch": 3.43, "grad_norm": 6.448868593975817, "learning_rate": 5.199857560812316e-07, "loss": 0.1307, "step": 4808 }, { "epoch": 3.43, "grad_norm": 9.401074138435595, "learning_rate": 5.187031385411206e-07, "loss": 0.1598, "step": 4809 }, { "epoch": 3.43, "grad_norm": 8.997603436753522, "learning_rate": 5.174220182855844e-07, "loss": 0.1896, "step": 4810 }, { "epoch": 3.43, "grad_norm": 5.9623142347970015, "learning_rate": 5.161423957426725e-07, "loss": 0.1287, "step": 4811 }, { "epoch": 3.43, "grad_norm": 7.338573874367029, "learning_rate": 5.148642713399272e-07, "loss": 0.112, "step": 4812 }, { "epoch": 3.44, "grad_norm": 7.146812781041527, "learning_rate": 5.13587645504397e-07, "loss": 0.1658, "step": 4813 }, { "epoch": 3.44, "grad_norm": 9.006764590662835, "learning_rate": 5.123125186626227e-07, "loss": 0.2671, "step": 4814 }, { "epoch": 3.44, "grad_norm": 6.677054214930086, "learning_rate": 5.110388912406517e-07, "loss": 0.1119, "step": 4815 }, { "epoch": 3.44, "grad_norm": 5.786888005642126, "learning_rate": 5.097667636640241e-07, "loss": 0.117, "step": 4816 }, { "epoch": 3.44, "grad_norm": 8.566924141847709, "learning_rate": 5.084961363577817e-07, "loss": 0.1681, "step": 4817 }, { "epoch": 3.44, "grad_norm": 6.283648205807579, "learning_rate": 5.072270097464649e-07, "loss": 0.1382, "step": 4818 }, { "epoch": 3.44, "grad_norm": 6.70248104170661, "learning_rate": 5.059593842541127e-07, "loss": 0.1403, "step": 4819 }, { "epoch": 3.44, "grad_norm": 4.145804986104878, "learning_rate": 5.04693260304262e-07, "loss": 0.0723, "step": 4820 }, { "epoch": 3.44, "grad_norm": 7.58725175311385, "learning_rate": 5.034286383199488e-07, "loss": 0.1171, "step": 4821 }, { "epoch": 3.44, "grad_norm": 5.057300916102727, "learning_rate": 5.021655187237067e-07, "loss": 0.1077, "step": 4822 }, { "epoch": 3.44, "grad_norm": 4.823300742743953, "learning_rate": 5.009039019375672e-07, "loss": 0.1105, "step": 4823 }, { "epoch": 3.44, "grad_norm": 6.406379850384206, "learning_rate": 4.996437883830596e-07, "loss": 0.1053, "step": 4824 }, { "epoch": 3.44, "grad_norm": 7.476297609589221, "learning_rate": 4.983851784812127e-07, "loss": 0.1573, "step": 4825 }, { "epoch": 3.44, "grad_norm": 10.028708462150261, "learning_rate": 4.97128072652549e-07, "loss": 0.0988, "step": 4826 }, { "epoch": 3.45, "grad_norm": 6.70771060659306, "learning_rate": 4.958724713170943e-07, "loss": 0.121, "step": 4827 }, { "epoch": 3.45, "grad_norm": 7.550234303625725, "learning_rate": 4.946183748943639e-07, "loss": 0.1516, "step": 4828 }, { "epoch": 3.45, "grad_norm": 4.250072350581676, "learning_rate": 4.933657838033795e-07, "loss": 0.0782, "step": 4829 }, { "epoch": 3.45, "grad_norm": 14.901528479240605, "learning_rate": 4.921146984626507e-07, "loss": 0.2083, "step": 4830 }, { "epoch": 3.45, "grad_norm": 8.861570669785552, "learning_rate": 4.908651192901926e-07, "loss": 0.1427, "step": 4831 }, { "epoch": 3.45, "grad_norm": 7.774296537989752, "learning_rate": 4.896170467035089e-07, "loss": 0.1311, "step": 4832 }, { "epoch": 3.45, "grad_norm": 6.097843488270496, "learning_rate": 4.883704811196072e-07, "loss": 0.1015, "step": 4833 }, { "epoch": 3.45, "grad_norm": 5.944954993942699, "learning_rate": 4.871254229549855e-07, "loss": 0.1368, "step": 4834 }, { "epoch": 3.45, "grad_norm": 6.610591286434344, "learning_rate": 4.858818726256425e-07, "loss": 0.1675, "step": 4835 }, { "epoch": 3.45, "grad_norm": 7.955225527923404, "learning_rate": 4.846398305470712e-07, "loss": 0.1721, "step": 4836 }, { "epoch": 3.45, "grad_norm": 10.896382771601361, "learning_rate": 4.833992971342604e-07, "loss": 0.1346, "step": 4837 }, { "epoch": 3.45, "grad_norm": 4.712058505532531, "learning_rate": 4.821602728016955e-07, "loss": 0.0718, "step": 4838 }, { "epoch": 3.45, "grad_norm": 4.889206680905457, "learning_rate": 4.809227579633585e-07, "loss": 0.0939, "step": 4839 }, { "epoch": 3.45, "grad_norm": 6.531400313220989, "learning_rate": 4.796867530327249e-07, "loss": 0.1354, "step": 4840 }, { "epoch": 3.46, "grad_norm": 6.121051370381773, "learning_rate": 4.784522584227675e-07, "loss": 0.1246, "step": 4841 }, { "epoch": 3.46, "grad_norm": 7.447583991391642, "learning_rate": 4.772192745459536e-07, "loss": 0.1349, "step": 4842 }, { "epoch": 3.46, "grad_norm": 7.9717320548571236, "learning_rate": 4.7598780181424666e-07, "loss": 0.1399, "step": 4843 }, { "epoch": 3.46, "grad_norm": 7.189759067456543, "learning_rate": 4.7475784063910404e-07, "loss": 0.1154, "step": 4844 }, { "epoch": 3.46, "grad_norm": 4.200052811630355, "learning_rate": 4.7352939143147927e-07, "loss": 0.1024, "step": 4845 }, { "epoch": 3.46, "grad_norm": 7.327313296684159, "learning_rate": 4.72302454601819e-07, "loss": 0.1127, "step": 4846 }, { "epoch": 3.46, "grad_norm": 4.821690226987369, "learning_rate": 4.7107703056006706e-07, "loss": 0.1036, "step": 4847 }, { "epoch": 3.46, "grad_norm": 4.37306344284546, "learning_rate": 4.6985311971565806e-07, "loss": 0.0723, "step": 4848 }, { "epoch": 3.46, "grad_norm": 5.177107325600691, "learning_rate": 4.6863072247752664e-07, "loss": 0.0664, "step": 4849 }, { "epoch": 3.46, "grad_norm": 6.044564249735391, "learning_rate": 4.67409839254096e-07, "loss": 0.1136, "step": 4850 }, { "epoch": 3.46, "grad_norm": 5.84423169120056, "learning_rate": 4.66190470453286e-07, "loss": 0.1257, "step": 4851 }, { "epoch": 3.46, "grad_norm": 6.516029681667653, "learning_rate": 4.6497261648251134e-07, "loss": 0.095, "step": 4852 }, { "epoch": 3.46, "grad_norm": 5.602034097877229, "learning_rate": 4.6375627774867925e-07, "loss": 0.1377, "step": 4853 }, { "epoch": 3.46, "grad_norm": 8.561475016663382, "learning_rate": 4.6254145465819134e-07, "loss": 0.1226, "step": 4854 }, { "epoch": 3.47, "grad_norm": 6.043790120678902, "learning_rate": 4.6132814761694234e-07, "loss": 0.0833, "step": 4855 }, { "epoch": 3.47, "grad_norm": 5.934699069054085, "learning_rate": 4.6011635703032075e-07, "loss": 0.0867, "step": 4856 }, { "epoch": 3.47, "grad_norm": 8.280932206636068, "learning_rate": 4.589060833032083e-07, "loss": 0.1135, "step": 4857 }, { "epoch": 3.47, "grad_norm": 10.11043341804737, "learning_rate": 4.5769732683997983e-07, "loss": 0.1636, "step": 4858 }, { "epoch": 3.47, "grad_norm": 4.578783922415493, "learning_rate": 4.564900880445039e-07, "loss": 0.1052, "step": 4859 }, { "epoch": 3.47, "grad_norm": 6.231779936962509, "learning_rate": 4.552843673201407e-07, "loss": 0.105, "step": 4860 }, { "epoch": 3.47, "grad_norm": 6.714870031308601, "learning_rate": 4.540801650697446e-07, "loss": 0.176, "step": 4861 }, { "epoch": 3.47, "grad_norm": 5.82521739237652, "learning_rate": 4.528774816956616e-07, "loss": 0.0887, "step": 4862 }, { "epoch": 3.47, "grad_norm": 7.575021134589715, "learning_rate": 4.516763175997302e-07, "loss": 0.1743, "step": 4863 }, { "epoch": 3.47, "grad_norm": 7.637041020100914, "learning_rate": 4.5047667318328215e-07, "loss": 0.0961, "step": 4864 }, { "epoch": 3.47, "grad_norm": 5.555943353432021, "learning_rate": 4.492785488471413e-07, "loss": 0.1068, "step": 4865 }, { "epoch": 3.47, "grad_norm": 6.394841605960666, "learning_rate": 4.480819449916224e-07, "loss": 0.1062, "step": 4866 }, { "epoch": 3.47, "grad_norm": 6.625874447124118, "learning_rate": 4.468868620165334e-07, "loss": 0.1735, "step": 4867 }, { "epoch": 3.47, "grad_norm": 9.268464740985305, "learning_rate": 4.4569330032117496e-07, "loss": 0.1744, "step": 4868 }, { "epoch": 3.48, "grad_norm": 7.161236160543748, "learning_rate": 4.445012603043347e-07, "loss": 0.158, "step": 4869 }, { "epoch": 3.48, "grad_norm": 6.150201890120088, "learning_rate": 4.4331074236430014e-07, "loss": 0.1084, "step": 4870 }, { "epoch": 3.48, "grad_norm": 5.604703088860564, "learning_rate": 4.421217468988409e-07, "loss": 0.1189, "step": 4871 }, { "epoch": 3.48, "grad_norm": 6.470583792177313, "learning_rate": 4.409342743052264e-07, "loss": 0.1427, "step": 4872 }, { "epoch": 3.48, "grad_norm": 6.157097944965909, "learning_rate": 4.3974832498020983e-07, "loss": 0.1149, "step": 4873 }, { "epoch": 3.48, "grad_norm": 7.227392034283128, "learning_rate": 4.385638993200425e-07, "loss": 0.1059, "step": 4874 }, { "epoch": 3.48, "grad_norm": 5.359135702963531, "learning_rate": 4.3738099772045963e-07, "loss": 0.0933, "step": 4875 }, { "epoch": 3.48, "grad_norm": 5.540963344549303, "learning_rate": 4.3619962057669216e-07, "loss": 0.1465, "step": 4876 }, { "epoch": 3.48, "grad_norm": 8.921949783265053, "learning_rate": 4.350197682834606e-07, "loss": 0.1624, "step": 4877 }, { "epoch": 3.48, "grad_norm": 6.8268412987785485, "learning_rate": 4.338414412349745e-07, "loss": 0.1013, "step": 4878 }, { "epoch": 3.48, "grad_norm": 7.046004451519703, "learning_rate": 4.3266463982493566e-07, "loss": 0.1239, "step": 4879 }, { "epoch": 3.48, "grad_norm": 6.6020244320462504, "learning_rate": 4.314893644465351e-07, "loss": 0.1201, "step": 4880 }, { "epoch": 3.48, "grad_norm": 5.97689046629764, "learning_rate": 4.303156154924537e-07, "loss": 0.1025, "step": 4881 }, { "epoch": 3.48, "grad_norm": 16.85592779467006, "learning_rate": 4.291433933548633e-07, "loss": 0.1746, "step": 4882 }, { "epoch": 3.49, "grad_norm": 20.986355936055386, "learning_rate": 4.279726984254251e-07, "loss": 0.2146, "step": 4883 }, { "epoch": 3.49, "grad_norm": 8.537459210196602, "learning_rate": 4.268035310952906e-07, "loss": 0.1295, "step": 4884 }, { "epoch": 3.49, "grad_norm": 3.993488754974167, "learning_rate": 4.256358917550979e-07, "loss": 0.0913, "step": 4885 }, { "epoch": 3.49, "grad_norm": 5.377518655838321, "learning_rate": 4.244697807949805e-07, "loss": 0.0779, "step": 4886 }, { "epoch": 3.49, "grad_norm": 6.679235808102704, "learning_rate": 4.2330519860455446e-07, "loss": 0.1, "step": 4887 }, { "epoch": 3.49, "grad_norm": 6.926326752680834, "learning_rate": 4.2214214557293133e-07, "loss": 0.1694, "step": 4888 }, { "epoch": 3.49, "grad_norm": 10.173919781648312, "learning_rate": 4.209806220887053e-07, "loss": 0.1553, "step": 4889 }, { "epoch": 3.49, "grad_norm": 8.716790283479154, "learning_rate": 4.1982062853996695e-07, "loss": 0.1282, "step": 4890 }, { "epoch": 3.49, "grad_norm": 5.741222580663796, "learning_rate": 4.1866216531428806e-07, "loss": 0.0996, "step": 4891 }, { "epoch": 3.49, "grad_norm": 8.604221773949565, "learning_rate": 4.1750523279873613e-07, "loss": 0.119, "step": 4892 }, { "epoch": 3.49, "grad_norm": 6.081160173256382, "learning_rate": 4.1634983137986083e-07, "loss": 0.1091, "step": 4893 }, { "epoch": 3.49, "grad_norm": 7.791749150013185, "learning_rate": 4.151959614437046e-07, "loss": 0.1576, "step": 4894 }, { "epoch": 3.49, "grad_norm": 7.986445225209414, "learning_rate": 4.1404362337579716e-07, "loss": 0.1707, "step": 4895 }, { "epoch": 3.49, "grad_norm": 8.400952391001237, "learning_rate": 4.128928175611546e-07, "loss": 0.1184, "step": 4896 }, { "epoch": 3.5, "grad_norm": 5.741392906608111, "learning_rate": 4.1174354438428434e-07, "loss": 0.1042, "step": 4897 }, { "epoch": 3.5, "grad_norm": 7.079323754898526, "learning_rate": 4.105958042291791e-07, "loss": 0.1346, "step": 4898 }, { "epoch": 3.5, "grad_norm": 8.395224511829195, "learning_rate": 4.0944959747931945e-07, "loss": 0.1548, "step": 4899 }, { "epoch": 3.5, "grad_norm": 8.677897205722589, "learning_rate": 4.0830492451767566e-07, "loss": 0.1658, "step": 4900 }, { "epoch": 3.5, "grad_norm": 5.826376865813371, "learning_rate": 4.0716178572670405e-07, "loss": 0.1022, "step": 4901 }, { "epoch": 3.5, "grad_norm": 7.570457325496883, "learning_rate": 4.060201814883474e-07, "loss": 0.1015, "step": 4902 }, { "epoch": 3.5, "grad_norm": 6.666909786473492, "learning_rate": 4.0488011218403844e-07, "loss": 0.1423, "step": 4903 }, { "epoch": 3.5, "grad_norm": 10.844528241311131, "learning_rate": 4.0374157819469406e-07, "loss": 0.1428, "step": 4904 }, { "epoch": 3.5, "grad_norm": 5.958351896197906, "learning_rate": 4.0260457990072113e-07, "loss": 0.106, "step": 4905 }, { "epoch": 3.5, "grad_norm": 6.199859560189758, "learning_rate": 4.014691176820107e-07, "loss": 0.1133, "step": 4906 }, { "epoch": 3.5, "grad_norm": 6.634470604062944, "learning_rate": 4.003351919179421e-07, "loss": 0.1403, "step": 4907 }, { "epoch": 3.5, "grad_norm": 7.633691260784369, "learning_rate": 3.9920280298738125e-07, "loss": 0.2009, "step": 4908 }, { "epoch": 3.5, "grad_norm": 5.181243123537604, "learning_rate": 3.980719512686809e-07, "loss": 0.1056, "step": 4909 }, { "epoch": 3.5, "grad_norm": 5.055526650996727, "learning_rate": 3.969426371396773e-07, "loss": 0.0876, "step": 4910 }, { "epoch": 3.51, "grad_norm": 9.38989387103326, "learning_rate": 3.9581486097769905e-07, "loss": 0.1478, "step": 4911 }, { "epoch": 3.51, "grad_norm": 6.137927650461356, "learning_rate": 3.946886231595526e-07, "loss": 0.1445, "step": 4912 }, { "epoch": 3.51, "grad_norm": 4.845052725437297, "learning_rate": 3.935639240615396e-07, "loss": 0.1101, "step": 4913 }, { "epoch": 3.51, "grad_norm": 9.620807970425544, "learning_rate": 3.924407640594391e-07, "loss": 0.1301, "step": 4914 }, { "epoch": 3.51, "grad_norm": 5.003315815491063, "learning_rate": 3.913191435285224e-07, "loss": 0.0854, "step": 4915 }, { "epoch": 3.51, "grad_norm": 10.51854542922581, "learning_rate": 3.9019906284354145e-07, "loss": 0.1211, "step": 4916 }, { "epoch": 3.51, "grad_norm": 11.27692744405923, "learning_rate": 3.8908052237873863e-07, "loss": 0.1339, "step": 4917 }, { "epoch": 3.51, "grad_norm": 6.778719813958779, "learning_rate": 3.879635225078371e-07, "loss": 0.1556, "step": 4918 }, { "epoch": 3.51, "grad_norm": 6.5589312805748445, "learning_rate": 3.868480636040484e-07, "loss": 0.094, "step": 4919 }, { "epoch": 3.51, "grad_norm": 11.230659542738735, "learning_rate": 3.857341460400665e-07, "loss": 0.1584, "step": 4920 }, { "epoch": 3.51, "grad_norm": 6.838176925618872, "learning_rate": 3.846217701880739e-07, "loss": 0.1112, "step": 4921 }, { "epoch": 3.51, "grad_norm": 4.807557680870521, "learning_rate": 3.835109364197348e-07, "loss": 0.0952, "step": 4922 }, { "epoch": 3.51, "grad_norm": 5.99047386603123, "learning_rate": 3.8240164510620017e-07, "loss": 0.0955, "step": 4923 }, { "epoch": 3.51, "grad_norm": 5.921585109056007, "learning_rate": 3.81293896618104e-07, "loss": 0.1046, "step": 4924 }, { "epoch": 3.52, "grad_norm": 4.71667528673715, "learning_rate": 3.8018769132556644e-07, "loss": 0.0726, "step": 4925 }, { "epoch": 3.52, "grad_norm": 6.07125569493634, "learning_rate": 3.790830295981912e-07, "loss": 0.1105, "step": 4926 }, { "epoch": 3.52, "grad_norm": 4.207268748077647, "learning_rate": 3.7797991180506643e-07, "loss": 0.0854, "step": 4927 }, { "epoch": 3.52, "grad_norm": 6.1493317988743845, "learning_rate": 3.768783383147623e-07, "loss": 0.0932, "step": 4928 }, { "epoch": 3.52, "grad_norm": 7.303017501676457, "learning_rate": 3.757783094953382e-07, "loss": 0.1523, "step": 4929 }, { "epoch": 3.52, "grad_norm": 10.291370181214797, "learning_rate": 3.746798257143314e-07, "loss": 0.1628, "step": 4930 }, { "epoch": 3.52, "grad_norm": 4.898999805060367, "learning_rate": 3.735828873387681e-07, "loss": 0.0839, "step": 4931 }, { "epoch": 3.52, "grad_norm": 4.965168117910225, "learning_rate": 3.724874947351531e-07, "loss": 0.1173, "step": 4932 }, { "epoch": 3.52, "grad_norm": 5.063017750866719, "learning_rate": 3.7139364826948077e-07, "loss": 0.0971, "step": 4933 }, { "epoch": 3.52, "grad_norm": 5.753008355329342, "learning_rate": 3.7030134830722207e-07, "loss": 0.0903, "step": 4934 }, { "epoch": 3.52, "grad_norm": 4.717973246272266, "learning_rate": 3.692105952133379e-07, "loss": 0.0975, "step": 4935 }, { "epoch": 3.52, "grad_norm": 8.333430271233516, "learning_rate": 3.681213893522667e-07, "loss": 0.1337, "step": 4936 }, { "epoch": 3.52, "grad_norm": 10.107222632655112, "learning_rate": 3.670337310879335e-07, "loss": 0.1675, "step": 4937 }, { "epoch": 3.52, "grad_norm": 9.419377853290698, "learning_rate": 3.6594762078374536e-07, "loss": 0.1014, "step": 4938 }, { "epoch": 3.53, "grad_norm": 12.43896448622122, "learning_rate": 3.6486305880259085e-07, "loss": 0.2437, "step": 4939 }, { "epoch": 3.53, "grad_norm": 5.616562037480525, "learning_rate": 3.6378004550684355e-07, "loss": 0.0884, "step": 4940 }, { "epoch": 3.53, "grad_norm": 6.338766725755504, "learning_rate": 3.626985812583572e-07, "loss": 0.1049, "step": 4941 }, { "epoch": 3.53, "grad_norm": 5.202111188217427, "learning_rate": 3.6161866641847007e-07, "loss": 0.1033, "step": 4942 }, { "epoch": 3.53, "grad_norm": 7.316758540310851, "learning_rate": 3.6054030134800243e-07, "loss": 0.1517, "step": 4943 }, { "epoch": 3.53, "grad_norm": 7.275565880480988, "learning_rate": 3.594634864072527e-07, "loss": 0.1464, "step": 4944 }, { "epoch": 3.53, "grad_norm": 10.317802181069961, "learning_rate": 3.583882219560092e-07, "loss": 0.2065, "step": 4945 }, { "epoch": 3.53, "grad_norm": 8.125843837992509, "learning_rate": 3.57314508353534e-07, "loss": 0.1593, "step": 4946 }, { "epoch": 3.53, "grad_norm": 5.59942806326855, "learning_rate": 3.5624234595857787e-07, "loss": 0.1151, "step": 4947 }, { "epoch": 3.53, "grad_norm": 7.182502216677451, "learning_rate": 3.551717351293676e-07, "loss": 0.1285, "step": 4948 }, { "epoch": 3.53, "grad_norm": 8.035563717929566, "learning_rate": 3.541026762236166e-07, "loss": 0.1669, "step": 4949 }, { "epoch": 3.53, "grad_norm": 5.790990792314284, "learning_rate": 3.5303516959851405e-07, "loss": 0.1127, "step": 4950 }, { "epoch": 3.53, "grad_norm": 4.881110923975365, "learning_rate": 3.519692156107379e-07, "loss": 0.1031, "step": 4951 }, { "epoch": 3.53, "grad_norm": 9.586597575490307, "learning_rate": 3.509048146164401e-07, "loss": 0.1537, "step": 4952 }, { "epoch": 3.54, "grad_norm": 5.858555298946315, "learning_rate": 3.4984196697125827e-07, "loss": 0.0942, "step": 4953 }, { "epoch": 3.54, "grad_norm": 3.355412324174652, "learning_rate": 3.4878067303030836e-07, "loss": 0.0513, "step": 4954 }, { "epoch": 3.54, "grad_norm": 9.169699928002089, "learning_rate": 3.4772093314818957e-07, "loss": 0.1544, "step": 4955 }, { "epoch": 3.54, "grad_norm": 7.753478969791648, "learning_rate": 3.4666274767897967e-07, "loss": 0.1509, "step": 4956 }, { "epoch": 3.54, "grad_norm": 7.529002063527844, "learning_rate": 3.456061169762392e-07, "loss": 0.1528, "step": 4957 }, { "epoch": 3.54, "grad_norm": 6.793947603721516, "learning_rate": 3.44551041393007e-07, "loss": 0.1267, "step": 4958 }, { "epoch": 3.54, "grad_norm": 7.569499505899291, "learning_rate": 3.434975212818048e-07, "loss": 0.1304, "step": 4959 }, { "epoch": 3.54, "grad_norm": 6.899698816129731, "learning_rate": 3.424455569946317e-07, "loss": 0.1163, "step": 4960 }, { "epoch": 3.54, "grad_norm": 7.341545137728287, "learning_rate": 3.4139514888296975e-07, "loss": 0.1301, "step": 4961 }, { "epoch": 3.54, "grad_norm": 10.0706321470383, "learning_rate": 3.403462972977789e-07, "loss": 0.1243, "step": 4962 }, { "epoch": 3.54, "grad_norm": 6.231489025292407, "learning_rate": 3.392990025895004e-07, "loss": 0.1277, "step": 4963 }, { "epoch": 3.54, "grad_norm": 10.311986109419417, "learning_rate": 3.3825326510805556e-07, "loss": 0.1787, "step": 4964 }, { "epoch": 3.54, "grad_norm": 7.1396184042409105, "learning_rate": 3.372090852028437e-07, "loss": 0.1366, "step": 4965 }, { "epoch": 3.54, "grad_norm": 5.781959199435736, "learning_rate": 3.361664632227446e-07, "loss": 0.0825, "step": 4966 }, { "epoch": 3.55, "grad_norm": 6.2875082133414395, "learning_rate": 3.3512539951611856e-07, "loss": 0.147, "step": 4967 }, { "epoch": 3.55, "grad_norm": 5.914439645723257, "learning_rate": 3.3408589443080395e-07, "loss": 0.1083, "step": 4968 }, { "epoch": 3.55, "grad_norm": 7.325977047729606, "learning_rate": 3.3304794831411804e-07, "loss": 0.1431, "step": 4969 }, { "epoch": 3.55, "grad_norm": 8.27154445640713, "learning_rate": 3.3201156151285994e-07, "loss": 0.1476, "step": 4970 }, { "epoch": 3.55, "grad_norm": 6.2873062920787355, "learning_rate": 3.309767343733028e-07, "loss": 0.0926, "step": 4971 }, { "epoch": 3.55, "grad_norm": 3.7912487931444017, "learning_rate": 3.299434672412044e-07, "loss": 0.0831, "step": 4972 }, { "epoch": 3.55, "grad_norm": 6.907191593542313, "learning_rate": 3.2891176046179583e-07, "loss": 0.1079, "step": 4973 }, { "epoch": 3.55, "grad_norm": 7.634784204009729, "learning_rate": 3.278816143797919e-07, "loss": 0.1184, "step": 4974 }, { "epoch": 3.55, "grad_norm": 8.738231719908597, "learning_rate": 3.2685302933938177e-07, "loss": 0.1627, "step": 4975 }, { "epoch": 3.55, "grad_norm": 7.051208068222329, "learning_rate": 3.2582600568423715e-07, "loss": 0.1112, "step": 4976 }, { "epoch": 3.55, "grad_norm": 5.633258071747942, "learning_rate": 3.2480054375750305e-07, "loss": 0.0843, "step": 4977 }, { "epoch": 3.55, "grad_norm": 6.457146068070933, "learning_rate": 3.237766439018064e-07, "loss": 0.1165, "step": 4978 }, { "epoch": 3.55, "grad_norm": 4.686262096507274, "learning_rate": 3.227543064592514e-07, "loss": 0.0764, "step": 4979 }, { "epoch": 3.55, "grad_norm": 4.730731801506817, "learning_rate": 3.2173353177142044e-07, "loss": 0.0969, "step": 4980 }, { "epoch": 3.56, "grad_norm": 4.936171738164148, "learning_rate": 3.207143201793722e-07, "loss": 0.0854, "step": 4981 }, { "epoch": 3.56, "grad_norm": 5.8137612074069, "learning_rate": 3.1969667202364496e-07, "loss": 0.1029, "step": 4982 }, { "epoch": 3.56, "grad_norm": 9.229282590022793, "learning_rate": 3.1868058764425337e-07, "loss": 0.1616, "step": 4983 }, { "epoch": 3.56, "grad_norm": 7.105331418972408, "learning_rate": 3.1766606738069084e-07, "loss": 0.1034, "step": 4984 }, { "epoch": 3.56, "grad_norm": 7.807312628779206, "learning_rate": 3.166531115719268e-07, "loss": 0.144, "step": 4985 }, { "epoch": 3.56, "grad_norm": 7.869038868344821, "learning_rate": 3.1564172055640994e-07, "loss": 0.0964, "step": 4986 }, { "epoch": 3.56, "grad_norm": 8.875331548761368, "learning_rate": 3.1463189467206166e-07, "loss": 0.1221, "step": 4987 }, { "epoch": 3.56, "grad_norm": 4.938849961784851, "learning_rate": 3.1362363425628763e-07, "loss": 0.0954, "step": 4988 }, { "epoch": 3.56, "grad_norm": 10.180385680278965, "learning_rate": 3.1261693964596275e-07, "loss": 0.1552, "step": 4989 }, { "epoch": 3.56, "grad_norm": 6.039383569321269, "learning_rate": 3.116118111774452e-07, "loss": 0.0889, "step": 4990 }, { "epoch": 3.56, "grad_norm": 5.068556225143177, "learning_rate": 3.106082491865647e-07, "loss": 0.093, "step": 4991 }, { "epoch": 3.56, "grad_norm": 7.284668219658361, "learning_rate": 3.0960625400863253e-07, "loss": 0.1415, "step": 4992 }, { "epoch": 3.56, "grad_norm": 14.454110851656004, "learning_rate": 3.0860582597843137e-07, "loss": 0.2096, "step": 4993 }, { "epoch": 3.56, "grad_norm": 6.243067078626986, "learning_rate": 3.0760696543022496e-07, "loss": 0.0966, "step": 4994 }, { "epoch": 3.57, "grad_norm": 5.398929947949957, "learning_rate": 3.066096726977502e-07, "loss": 0.0952, "step": 4995 }, { "epoch": 3.57, "grad_norm": 6.239239492157707, "learning_rate": 3.056139481142206e-07, "loss": 0.101, "step": 4996 }, { "epoch": 3.57, "grad_norm": 9.023044388173963, "learning_rate": 3.0461979201232674e-07, "loss": 0.1794, "step": 4997 }, { "epoch": 3.57, "grad_norm": 7.570504247937259, "learning_rate": 3.0362720472423503e-07, "loss": 0.1161, "step": 4998 }, { "epoch": 3.57, "grad_norm": 10.355018381146655, "learning_rate": 3.026361865815869e-07, "loss": 0.1808, "step": 4999 }, { "epoch": 3.57, "grad_norm": 6.25080861497107, "learning_rate": 3.016467379154997e-07, "loss": 0.1207, "step": 5000 }, { "epoch": 3.57, "eval_avg_AUC": 0.7939005198150356, "eval_avg_Accuracy": 0.7027105437665783, "eval_avg_Accuracy-right": 0.8833963740706926, "eval_avg_Accuracy-wrong": 0.38765067091198546, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6581912384379904, "eval_last_AUC": 0.8149474201895899, "eval_last_Accuracy": 0.738395225464191, "eval_last_Accuracy-right": 0.8196817529672623, "eval_last_Accuracy-wrong": 0.5966568114623607, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6804592176400829, "eval_max_AUC": 0.7778448578196614, "eval_max_Accuracy": 0.6441893236074271, "eval_max_Accuracy-right": 0.9792617712273379, "eval_max_Accuracy-wrong": 0.05992722310666363, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6216123178884508, "eval_min_AUC": 0.7975842894056173, "eval_min_Accuracy": 0.7238478116710876, "eval_min_Accuracy-right": 0.7295552367288379, "eval_min_Accuracy-wrong": 0.7138958380714123, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6586734622814718, "eval_prod_AUC": 0.8002513229863527, "eval_prod_Accuracy": 0.7148955570291777, "eval_prod_Accuracy-right": 0.6743185078909613, "eval_prod_Accuracy-wrong": 0.7856493063452354, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.661249401851051, "eval_runtime": 251.6637, "eval_samples_per_second": 95.874, "eval_steps_per_second": 2.996, "eval_sum_AUC": 0.6658840128941205, "eval_sum_Accuracy": 0.638967175066313, "eval_sum_Accuracy-right": 0.9868918742663363, "eval_sum_Accuracy-wrong": 0.032294746418012284, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6474363870292106, "step": 5000 }, { "epoch": 3.57, "grad_norm": 8.039699710822118, "learning_rate": 3.0065885905656733e-07, "loss": 0.139, "step": 5001 }, { "epoch": 3.57, "grad_norm": 3.561175020518558, "learning_rate": 2.99672550334858e-07, "loss": 0.0732, "step": 5002 }, { "epoch": 3.57, "grad_norm": 5.489308809958234, "learning_rate": 2.986878120799158e-07, "loss": 0.1164, "step": 5003 }, { "epoch": 3.57, "grad_norm": 7.287913363035142, "learning_rate": 2.977046446207604e-07, "loss": 0.0874, "step": 5004 }, { "epoch": 3.57, "grad_norm": 7.221934125161167, "learning_rate": 2.967230482858863e-07, "loss": 0.1486, "step": 5005 }, { "epoch": 3.57, "grad_norm": 8.007176575502351, "learning_rate": 2.957430234032627e-07, "loss": 0.1381, "step": 5006 }, { "epoch": 3.57, "grad_norm": 9.070326449400396, "learning_rate": 2.947645703003338e-07, "loss": 0.1473, "step": 5007 }, { "epoch": 3.57, "grad_norm": 10.242941718824545, "learning_rate": 2.937876893040209e-07, "loss": 0.1548, "step": 5008 }, { "epoch": 3.58, "grad_norm": 7.051438057132722, "learning_rate": 2.9281238074071463e-07, "loss": 0.1425, "step": 5009 }, { "epoch": 3.58, "grad_norm": 8.348048515530829, "learning_rate": 2.9183864493628756e-07, "loss": 0.1549, "step": 5010 }, { "epoch": 3.58, "grad_norm": 9.348540506356116, "learning_rate": 2.908664822160806e-07, "loss": 0.1504, "step": 5011 }, { "epoch": 3.58, "grad_norm": 6.608695626483558, "learning_rate": 2.898958929049117e-07, "loss": 0.0918, "step": 5012 }, { "epoch": 3.58, "grad_norm": 7.522212380654753, "learning_rate": 2.889268773270731e-07, "loss": 0.166, "step": 5013 }, { "epoch": 3.58, "grad_norm": 6.086064311100961, "learning_rate": 2.879594358063303e-07, "loss": 0.117, "step": 5014 }, { "epoch": 3.58, "grad_norm": 6.2796926492958995, "learning_rate": 2.869935686659248e-07, "loss": 0.0981, "step": 5015 }, { "epoch": 3.58, "grad_norm": 6.291532974607174, "learning_rate": 2.8602927622856935e-07, "loss": 0.1378, "step": 5016 }, { "epoch": 3.58, "grad_norm": 5.3972265354997235, "learning_rate": 2.8506655881645305e-07, "loss": 0.0914, "step": 5017 }, { "epoch": 3.58, "grad_norm": 7.768331165784399, "learning_rate": 2.841054167512369e-07, "loss": 0.1865, "step": 5018 }, { "epoch": 3.58, "grad_norm": 7.9944003757927975, "learning_rate": 2.8314585035405683e-07, "loss": 0.1632, "step": 5019 }, { "epoch": 3.58, "grad_norm": 6.256066146658917, "learning_rate": 2.8218785994552136e-07, "loss": 0.106, "step": 5020 }, { "epoch": 3.58, "grad_norm": 14.01419943071183, "learning_rate": 2.8123144584571326e-07, "loss": 0.1823, "step": 5021 }, { "epoch": 3.58, "grad_norm": 7.421782138203785, "learning_rate": 2.8027660837418813e-07, "loss": 0.1256, "step": 5022 }, { "epoch": 3.59, "grad_norm": 6.145061257805026, "learning_rate": 2.793233478499752e-07, "loss": 0.1068, "step": 5023 }, { "epoch": 3.59, "grad_norm": 7.773542257086725, "learning_rate": 2.7837166459157625e-07, "loss": 0.1125, "step": 5024 }, { "epoch": 3.59, "grad_norm": 4.764175341048938, "learning_rate": 2.77421558916966e-07, "loss": 0.1198, "step": 5025 }, { "epoch": 3.59, "grad_norm": 4.684493454442424, "learning_rate": 2.764730311435931e-07, "loss": 0.0885, "step": 5026 }, { "epoch": 3.59, "grad_norm": 7.440949476639598, "learning_rate": 2.755260815883781e-07, "loss": 0.1259, "step": 5027 }, { "epoch": 3.59, "grad_norm": 9.466050890106064, "learning_rate": 2.745807105677145e-07, "loss": 0.1307, "step": 5028 }, { "epoch": 3.59, "grad_norm": 5.8345255587512765, "learning_rate": 2.736369183974685e-07, "loss": 0.0985, "step": 5029 }, { "epoch": 3.59, "grad_norm": 7.466645169078422, "learning_rate": 2.726947053929768e-07, "loss": 0.1625, "step": 5030 }, { "epoch": 3.59, "grad_norm": 13.413102283651124, "learning_rate": 2.7175407186905367e-07, "loss": 0.1526, "step": 5031 }, { "epoch": 3.59, "grad_norm": 9.913384538686545, "learning_rate": 2.708150181399788e-07, "loss": 0.1227, "step": 5032 }, { "epoch": 3.59, "grad_norm": 29.713012887802034, "learning_rate": 2.698775445195101e-07, "loss": 0.1667, "step": 5033 }, { "epoch": 3.59, "grad_norm": 5.012468307125459, "learning_rate": 2.689416513208726e-07, "loss": 0.0981, "step": 5034 }, { "epoch": 3.59, "grad_norm": 9.259889616367115, "learning_rate": 2.6800733885676833e-07, "loss": 0.1725, "step": 5035 }, { "epoch": 3.59, "grad_norm": 8.444744690254245, "learning_rate": 2.6707460743936653e-07, "loss": 0.1523, "step": 5036 }, { "epoch": 3.6, "grad_norm": 10.368928285325568, "learning_rate": 2.6614345738031014e-07, "loss": 0.1543, "step": 5037 }, { "epoch": 3.6, "grad_norm": 7.8762410419001005, "learning_rate": 2.6521388899071467e-07, "loss": 0.1006, "step": 5038 }, { "epoch": 3.6, "grad_norm": 7.032417525664705, "learning_rate": 2.642859025811656e-07, "loss": 0.1088, "step": 5039 }, { "epoch": 3.6, "grad_norm": 8.871769638082888, "learning_rate": 2.633594984617199e-07, "loss": 0.1471, "step": 5040 }, { "epoch": 3.6, "grad_norm": 6.930037290213703, "learning_rate": 2.624346769419078e-07, "loss": 0.1376, "step": 5041 }, { "epoch": 3.6, "grad_norm": 7.1968598150909955, "learning_rate": 2.6151143833072824e-07, "loss": 0.1517, "step": 5042 }, { "epoch": 3.6, "grad_norm": 5.261682987100089, "learning_rate": 2.605897829366527e-07, "loss": 0.0737, "step": 5043 }, { "epoch": 3.6, "grad_norm": 6.097460825799445, "learning_rate": 2.596697110676233e-07, "loss": 0.0956, "step": 5044 }, { "epoch": 3.6, "grad_norm": 8.016509501397028, "learning_rate": 2.5875122303105403e-07, "loss": 0.1566, "step": 5045 }, { "epoch": 3.6, "grad_norm": 6.711365280336349, "learning_rate": 2.5783431913382673e-07, "loss": 0.1152, "step": 5046 }, { "epoch": 3.6, "grad_norm": 5.2815052281868216, "learning_rate": 2.5691899968229904e-07, "loss": 0.1055, "step": 5047 }, { "epoch": 3.6, "grad_norm": 4.197556171742845, "learning_rate": 2.560052649822925e-07, "loss": 0.0975, "step": 5048 }, { "epoch": 3.6, "grad_norm": 12.246007633872518, "learning_rate": 2.5509311533910674e-07, "loss": 0.1521, "step": 5049 }, { "epoch": 3.6, "grad_norm": 7.104104832857858, "learning_rate": 2.5418255105750465e-07, "loss": 0.1245, "step": 5050 }, { "epoch": 3.61, "grad_norm": 7.889736914067923, "learning_rate": 2.532735724417251e-07, "loss": 0.1339, "step": 5051 }, { "epoch": 3.61, "grad_norm": 9.149482943543847, "learning_rate": 2.52366179795473e-07, "loss": 0.1666, "step": 5052 }, { "epoch": 3.61, "grad_norm": 11.841217531696419, "learning_rate": 2.5146037342192673e-07, "loss": 0.2165, "step": 5053 }, { "epoch": 3.61, "grad_norm": 8.367402162775445, "learning_rate": 2.505561536237311e-07, "loss": 0.158, "step": 5054 }, { "epoch": 3.61, "grad_norm": 4.577698639400431, "learning_rate": 2.496535207030043e-07, "loss": 0.0587, "step": 5055 }, { "epoch": 3.61, "grad_norm": 9.757830561055071, "learning_rate": 2.4875247496133234e-07, "loss": 0.1003, "step": 5056 }, { "epoch": 3.61, "grad_norm": 10.30190449595356, "learning_rate": 2.4785301669977116e-07, "loss": 0.1698, "step": 5057 }, { "epoch": 3.61, "grad_norm": 7.289239425103564, "learning_rate": 2.469551462188463e-07, "loss": 0.1116, "step": 5058 }, { "epoch": 3.61, "grad_norm": 6.916398851164515, "learning_rate": 2.460588638185535e-07, "loss": 0.1403, "step": 5059 }, { "epoch": 3.61, "grad_norm": 8.156390119211379, "learning_rate": 2.45164169798357e-07, "loss": 0.1232, "step": 5060 }, { "epoch": 3.61, "grad_norm": 7.444029203628988, "learning_rate": 2.4427106445719053e-07, "loss": 0.1094, "step": 5061 }, { "epoch": 3.61, "grad_norm": 7.602904335530537, "learning_rate": 2.4337954809345807e-07, "loss": 0.1433, "step": 5062 }, { "epoch": 3.61, "grad_norm": 5.715241288214859, "learning_rate": 2.4248962100503095e-07, "loss": 0.0869, "step": 5063 }, { "epoch": 3.61, "grad_norm": 4.962640828966288, "learning_rate": 2.416012834892506e-07, "loss": 0.1055, "step": 5064 }, { "epoch": 3.62, "grad_norm": 6.660676204308661, "learning_rate": 2.4071453584292693e-07, "loss": 0.1177, "step": 5065 }, { "epoch": 3.62, "grad_norm": 7.962611371474207, "learning_rate": 2.3982937836233954e-07, "loss": 0.1488, "step": 5066 }, { "epoch": 3.62, "grad_norm": 7.174907994976234, "learning_rate": 2.389458113432347e-07, "loss": 0.1106, "step": 5067 }, { "epoch": 3.62, "grad_norm": 5.848634135676804, "learning_rate": 2.380638350808301e-07, "loss": 0.1166, "step": 5068 }, { "epoch": 3.62, "grad_norm": 4.030302418548135, "learning_rate": 2.371834498698089e-07, "loss": 0.0978, "step": 5069 }, { "epoch": 3.62, "grad_norm": 6.902607563471164, "learning_rate": 2.363046560043264e-07, "loss": 0.1177, "step": 5070 }, { "epoch": 3.62, "grad_norm": 4.432084198181119, "learning_rate": 2.3542745377800046e-07, "loss": 0.0757, "step": 5071 }, { "epoch": 3.62, "grad_norm": 4.034887652754706, "learning_rate": 2.3455184348392446e-07, "loss": 0.0648, "step": 5072 }, { "epoch": 3.62, "grad_norm": 6.045491895151377, "learning_rate": 2.3367782541465268e-07, "loss": 0.1531, "step": 5073 }, { "epoch": 3.62, "grad_norm": 10.244042229911464, "learning_rate": 2.3280539986221317e-07, "loss": 0.1196, "step": 5074 }, { "epoch": 3.62, "grad_norm": 4.67069532901348, "learning_rate": 2.3193456711809837e-07, "loss": 0.1213, "step": 5075 }, { "epoch": 3.62, "grad_norm": 5.294449192222114, "learning_rate": 2.3106532747327104e-07, "loss": 0.0883, "step": 5076 }, { "epoch": 3.62, "grad_norm": 9.311678585887437, "learning_rate": 2.3019768121815777e-07, "loss": 0.1554, "step": 5077 }, { "epoch": 3.62, "grad_norm": 20.638914783602576, "learning_rate": 2.2933162864265836e-07, "loss": 0.1319, "step": 5078 }, { "epoch": 3.63, "grad_norm": 9.5037044607945, "learning_rate": 2.2846717003613462e-07, "loss": 0.1365, "step": 5079 }, { "epoch": 3.63, "grad_norm": 5.275015466909017, "learning_rate": 2.2760430568741943e-07, "loss": 0.0837, "step": 5080 }, { "epoch": 3.63, "grad_norm": 8.731705867976217, "learning_rate": 2.2674303588481162e-07, "loss": 0.1273, "step": 5081 }, { "epoch": 3.63, "grad_norm": 9.145995314417187, "learning_rate": 2.258833609160771e-07, "loss": 0.1223, "step": 5082 }, { "epoch": 3.63, "grad_norm": 6.35548689524532, "learning_rate": 2.2502528106845e-07, "loss": 0.1062, "step": 5083 }, { "epoch": 3.63, "grad_norm": 8.025136625142972, "learning_rate": 2.241687966286299e-07, "loss": 0.1373, "step": 5084 }, { "epoch": 3.63, "grad_norm": 4.627138376379124, "learning_rate": 2.233139078827845e-07, "loss": 0.0949, "step": 5085 }, { "epoch": 3.63, "grad_norm": 5.550861440722993, "learning_rate": 2.2246061511654816e-07, "loss": 0.0856, "step": 5086 }, { "epoch": 3.63, "grad_norm": 7.0832967651621646, "learning_rate": 2.2160891861502165e-07, "loss": 0.1456, "step": 5087 }, { "epoch": 3.63, "grad_norm": 5.494887892207766, "learning_rate": 2.2075881866277348e-07, "loss": 0.1138, "step": 5088 }, { "epoch": 3.63, "grad_norm": 5.653845807621733, "learning_rate": 2.199103155438359e-07, "loss": 0.149, "step": 5089 }, { "epoch": 3.63, "grad_norm": 7.533091223076104, "learning_rate": 2.1906340954171212e-07, "loss": 0.1851, "step": 5090 }, { "epoch": 3.63, "grad_norm": 6.574364381103182, "learning_rate": 2.1821810093936636e-07, "loss": 0.1305, "step": 5091 }, { "epoch": 3.63, "grad_norm": 4.576468541271489, "learning_rate": 2.1737439001923488e-07, "loss": 0.0978, "step": 5092 }, { "epoch": 3.64, "grad_norm": 10.54344185381927, "learning_rate": 2.1653227706321388e-07, "loss": 0.2559, "step": 5093 }, { "epoch": 3.64, "grad_norm": 5.76013310329922, "learning_rate": 2.156917623526722e-07, "loss": 0.1378, "step": 5094 }, { "epoch": 3.64, "grad_norm": 6.096648775601011, "learning_rate": 2.1485284616843904e-07, "loss": 0.0791, "step": 5095 }, { "epoch": 3.64, "grad_norm": 7.449715752312049, "learning_rate": 2.140155287908141e-07, "loss": 0.1338, "step": 5096 }, { "epoch": 3.64, "grad_norm": 6.19787476290784, "learning_rate": 2.131798104995586e-07, "loss": 0.1239, "step": 5097 }, { "epoch": 3.64, "grad_norm": 5.365081910281841, "learning_rate": 2.123456915739025e-07, "loss": 0.0817, "step": 5098 }, { "epoch": 3.64, "grad_norm": 5.7839621749109735, "learning_rate": 2.115131722925401e-07, "loss": 0.1301, "step": 5099 }, { "epoch": 3.64, "grad_norm": 7.328264787341496, "learning_rate": 2.1068225293363166e-07, "loss": 0.1644, "step": 5100 }, { "epoch": 3.64, "grad_norm": 5.248602303859766, "learning_rate": 2.0985293377480342e-07, "loss": 0.0967, "step": 5101 }, { "epoch": 3.64, "grad_norm": 6.184726044484055, "learning_rate": 2.0902521509314543e-07, "loss": 0.1346, "step": 5102 }, { "epoch": 3.64, "grad_norm": 6.470435895771347, "learning_rate": 2.0819909716521426e-07, "loss": 0.1361, "step": 5103 }, { "epoch": 3.64, "grad_norm": 6.351606542505066, "learning_rate": 2.0737458026703182e-07, "loss": 0.113, "step": 5104 }, { "epoch": 3.64, "grad_norm": 6.701711181172379, "learning_rate": 2.0655166467408283e-07, "loss": 0.1284, "step": 5105 }, { "epoch": 3.64, "grad_norm": 6.303422896520405, "learning_rate": 2.057303506613212e-07, "loss": 0.1362, "step": 5106 }, { "epoch": 3.65, "grad_norm": 6.235708485944981, "learning_rate": 2.049106385031602e-07, "loss": 0.131, "step": 5107 }, { "epoch": 3.65, "grad_norm": 8.200959008051413, "learning_rate": 2.0409252847348404e-07, "loss": 0.1003, "step": 5108 }, { "epoch": 3.65, "grad_norm": 6.76348332083974, "learning_rate": 2.032760208456358e-07, "loss": 0.0938, "step": 5109 }, { "epoch": 3.65, "grad_norm": 10.725542818280745, "learning_rate": 2.0246111589242835e-07, "loss": 0.1349, "step": 5110 }, { "epoch": 3.65, "grad_norm": 6.858265680340651, "learning_rate": 2.0164781388613386e-07, "loss": 0.1703, "step": 5111 }, { "epoch": 3.65, "grad_norm": 7.221302184416432, "learning_rate": 2.0083611509849443e-07, "loss": 0.0912, "step": 5112 }, { "epoch": 3.65, "grad_norm": 6.110379248791678, "learning_rate": 2.0002601980071145e-07, "loss": 0.1471, "step": 5113 }, { "epoch": 3.65, "grad_norm": 9.264511652438255, "learning_rate": 1.9921752826345397e-07, "loss": 0.1936, "step": 5114 }, { "epoch": 3.65, "grad_norm": 9.3500365337071, "learning_rate": 1.9841064075685367e-07, "loss": 0.1804, "step": 5115 }, { "epoch": 3.65, "grad_norm": 7.0264249501411795, "learning_rate": 1.9760535755050715e-07, "loss": 0.0991, "step": 5116 }, { "epoch": 3.65, "grad_norm": 4.367028898943652, "learning_rate": 1.9680167891347356e-07, "loss": 0.0659, "step": 5117 }, { "epoch": 3.65, "grad_norm": 6.694350207237627, "learning_rate": 1.9599960511427761e-07, "loss": 0.1215, "step": 5118 }, { "epoch": 3.65, "grad_norm": 5.01614665495544, "learning_rate": 1.9519913642090715e-07, "loss": 0.0903, "step": 5119 }, { "epoch": 3.65, "grad_norm": 7.6950343847045115, "learning_rate": 1.9440027310081323e-07, "loss": 0.1492, "step": 5120 }, { "epoch": 3.66, "grad_norm": 13.035091459384203, "learning_rate": 1.9360301542091065e-07, "loss": 0.1222, "step": 5121 }, { "epoch": 3.66, "grad_norm": 8.006794034185171, "learning_rate": 1.9280736364757912e-07, "loss": 0.1417, "step": 5122 }, { "epoch": 3.66, "grad_norm": 3.606990746506003, "learning_rate": 1.9201331804665934e-07, "loss": 0.0756, "step": 5123 }, { "epoch": 3.66, "grad_norm": 6.174082855551437, "learning_rate": 1.9122087888345798e-07, "loss": 0.1083, "step": 5124 }, { "epoch": 3.66, "grad_norm": 6.662992780244101, "learning_rate": 1.9043004642274266e-07, "loss": 0.1226, "step": 5125 }, { "epoch": 3.66, "grad_norm": 7.587789426133568, "learning_rate": 1.896408209287459e-07, "loss": 0.14, "step": 5126 }, { "epoch": 3.66, "grad_norm": 11.678374790231842, "learning_rate": 1.888532026651624e-07, "loss": 0.1396, "step": 5127 }, { "epoch": 3.66, "grad_norm": 3.977244116302907, "learning_rate": 1.880671918951499e-07, "loss": 0.0615, "step": 5128 }, { "epoch": 3.66, "grad_norm": 6.1018897296285175, "learning_rate": 1.8728278888132944e-07, "loss": 0.1375, "step": 5129 }, { "epoch": 3.66, "grad_norm": 7.569430217114574, "learning_rate": 1.864999938857842e-07, "loss": 0.1296, "step": 5130 }, { "epoch": 3.66, "grad_norm": 6.084231927095735, "learning_rate": 1.8571880717006218e-07, "loss": 0.0961, "step": 5131 }, { "epoch": 3.66, "grad_norm": 7.006697709775796, "learning_rate": 1.8493922899516902e-07, "loss": 0.1226, "step": 5132 }, { "epoch": 3.66, "grad_norm": 7.603508544129782, "learning_rate": 1.8416125962157971e-07, "loss": 0.1912, "step": 5133 }, { "epoch": 3.66, "grad_norm": 6.0061495778600555, "learning_rate": 1.8338489930922632e-07, "loss": 0.1002, "step": 5134 }, { "epoch": 3.67, "grad_norm": 6.034224352944826, "learning_rate": 1.8261014831750633e-07, "loss": 0.105, "step": 5135 }, { "epoch": 3.67, "grad_norm": 6.209079940714725, "learning_rate": 1.8183700690527717e-07, "loss": 0.0973, "step": 5136 }, { "epoch": 3.67, "grad_norm": 7.222408389143445, "learning_rate": 1.810654753308616e-07, "loss": 0.1681, "step": 5137 }, { "epoch": 3.67, "grad_norm": 8.624917976457603, "learning_rate": 1.8029555385204067e-07, "loss": 0.1587, "step": 5138 }, { "epoch": 3.67, "grad_norm": 6.111127472026909, "learning_rate": 1.795272427260608e-07, "loss": 0.0766, "step": 5139 }, { "epoch": 3.67, "grad_norm": 5.316408256721683, "learning_rate": 1.7876054220962835e-07, "loss": 0.0811, "step": 5140 }, { "epoch": 3.67, "grad_norm": 6.678820447902406, "learning_rate": 1.779954525589128e-07, "loss": 0.165, "step": 5141 }, { "epoch": 3.67, "grad_norm": 3.925332943931374, "learning_rate": 1.7723197402954419e-07, "loss": 0.0615, "step": 5142 }, { "epoch": 3.67, "grad_norm": 8.13593332874638, "learning_rate": 1.7647010687661558e-07, "loss": 0.101, "step": 5143 }, { "epoch": 3.67, "grad_norm": 5.418752250867253, "learning_rate": 1.757098513546801e-07, "loss": 0.0764, "step": 5144 }, { "epoch": 3.67, "grad_norm": 13.33740238830133, "learning_rate": 1.74951207717754e-07, "loss": 0.1919, "step": 5145 }, { "epoch": 3.67, "grad_norm": 5.192594742264245, "learning_rate": 1.7419417621931388e-07, "loss": 0.0905, "step": 5146 }, { "epoch": 3.67, "grad_norm": 6.938927349266944, "learning_rate": 1.7343875711229864e-07, "loss": 0.1296, "step": 5147 }, { "epoch": 3.67, "grad_norm": 5.150904395129445, "learning_rate": 1.7268495064910574e-07, "loss": 0.0952, "step": 5148 }, { "epoch": 3.68, "grad_norm": 6.645643287522623, "learning_rate": 1.719327570815993e-07, "loss": 0.123, "step": 5149 }, { "epoch": 3.68, "grad_norm": 8.033024279828727, "learning_rate": 1.711821766610977e-07, "loss": 0.1221, "step": 5150 }, { "epoch": 3.68, "grad_norm": 7.151014646433906, "learning_rate": 1.704332096383865e-07, "loss": 0.144, "step": 5151 }, { "epoch": 3.68, "grad_norm": 6.751515538072794, "learning_rate": 1.696858562637077e-07, "loss": 0.1128, "step": 5152 }, { "epoch": 3.68, "grad_norm": 7.3260407768072815, "learning_rate": 1.689401167867677e-07, "loss": 0.137, "step": 5153 }, { "epoch": 3.68, "grad_norm": 6.967392348291816, "learning_rate": 1.6819599145672993e-07, "loss": 0.1198, "step": 5154 }, { "epoch": 3.68, "grad_norm": 9.119755770264783, "learning_rate": 1.674534805222222e-07, "loss": 0.1946, "step": 5155 }, { "epoch": 3.68, "grad_norm": 5.9494333508876265, "learning_rate": 1.667125842313305e-07, "loss": 0.0823, "step": 5156 }, { "epoch": 3.68, "grad_norm": 3.699974499969946, "learning_rate": 1.6597330283160184e-07, "loss": 0.0596, "step": 5157 }, { "epoch": 3.68, "grad_norm": 9.275525343827553, "learning_rate": 1.6523563657004416e-07, "loss": 0.1776, "step": 5158 }, { "epoch": 3.68, "grad_norm": 7.8398354915929325, "learning_rate": 1.644995856931253e-07, "loss": 0.1151, "step": 5159 }, { "epoch": 3.68, "grad_norm": 7.010095295632242, "learning_rate": 1.6376515044677354e-07, "loss": 0.1351, "step": 5160 }, { "epoch": 3.68, "grad_norm": 10.561996804977277, "learning_rate": 1.630323310763776e-07, "loss": 0.14, "step": 5161 }, { "epoch": 3.68, "grad_norm": 6.3885597894295945, "learning_rate": 1.6230112782678608e-07, "loss": 0.1088, "step": 5162 }, { "epoch": 3.69, "grad_norm": 7.464820972006126, "learning_rate": 1.6157154094230744e-07, "loss": 0.1079, "step": 5163 }, { "epoch": 3.69, "grad_norm": 8.489218484166322, "learning_rate": 1.6084357066670997e-07, "loss": 0.1548, "step": 5164 }, { "epoch": 3.69, "grad_norm": 7.166544166806824, "learning_rate": 1.601172172432225e-07, "loss": 0.0991, "step": 5165 }, { "epoch": 3.69, "grad_norm": 6.044756449618914, "learning_rate": 1.5939248091453252e-07, "loss": 0.1147, "step": 5166 }, { "epoch": 3.69, "grad_norm": 7.407020648139834, "learning_rate": 1.5866936192278915e-07, "loss": 0.1702, "step": 5167 }, { "epoch": 3.69, "grad_norm": 6.543503830650654, "learning_rate": 1.5794786050959797e-07, "loss": 0.1196, "step": 5168 }, { "epoch": 3.69, "grad_norm": 8.94521426336034, "learning_rate": 1.5722797691602842e-07, "loss": 0.1217, "step": 5169 }, { "epoch": 3.69, "grad_norm": 5.678678949600126, "learning_rate": 1.5650971138260473e-07, "loss": 0.1138, "step": 5170 }, { "epoch": 3.69, "grad_norm": 6.541004043762944, "learning_rate": 1.5579306414931493e-07, "loss": 0.1263, "step": 5171 }, { "epoch": 3.69, "grad_norm": 8.703199754397636, "learning_rate": 1.5507803545560195e-07, "loss": 0.1287, "step": 5172 }, { "epoch": 3.69, "grad_norm": 6.079487966585593, "learning_rate": 1.543646255403719e-07, "loss": 0.1176, "step": 5173 }, { "epoch": 3.69, "grad_norm": 8.247351544243093, "learning_rate": 1.5365283464198743e-07, "loss": 0.1897, "step": 5174 }, { "epoch": 3.69, "grad_norm": 9.843355071661746, "learning_rate": 1.529426629982711e-07, "loss": 0.1487, "step": 5175 }, { "epoch": 3.69, "grad_norm": 7.88124033472976, "learning_rate": 1.5223411084650476e-07, "loss": 0.1071, "step": 5176 }, { "epoch": 3.7, "grad_norm": 8.117452681694125, "learning_rate": 1.5152717842342845e-07, "loss": 0.1221, "step": 5177 }, { "epoch": 3.7, "grad_norm": 6.399799739093315, "learning_rate": 1.5082186596524218e-07, "loss": 0.124, "step": 5178 }, { "epoch": 3.7, "grad_norm": 8.196187923853193, "learning_rate": 1.501181737076035e-07, "loss": 0.1299, "step": 5179 }, { "epoch": 3.7, "grad_norm": 8.07108956974622, "learning_rate": 1.4941610188562884e-07, "loss": 0.1139, "step": 5180 }, { "epoch": 3.7, "grad_norm": 9.192620314889446, "learning_rate": 1.4871565073389382e-07, "loss": 0.1345, "step": 5181 }, { "epoch": 3.7, "grad_norm": 7.048174028646196, "learning_rate": 1.4801682048643183e-07, "loss": 0.1466, "step": 5182 }, { "epoch": 3.7, "grad_norm": 6.816161354114099, "learning_rate": 1.4731961137673555e-07, "loss": 0.1024, "step": 5183 }, { "epoch": 3.7, "grad_norm": 16.351201920865254, "learning_rate": 1.466240236377553e-07, "loss": 0.1097, "step": 5184 }, { "epoch": 3.7, "grad_norm": 7.299762715915274, "learning_rate": 1.4593005750189958e-07, "loss": 0.1072, "step": 5185 }, { "epoch": 3.7, "grad_norm": 5.167377084530258, "learning_rate": 1.4523771320103574e-07, "loss": 0.0734, "step": 5186 }, { "epoch": 3.7, "grad_norm": 6.275327813941812, "learning_rate": 1.4454699096648873e-07, "loss": 0.1506, "step": 5187 }, { "epoch": 3.7, "grad_norm": 5.996799451568316, "learning_rate": 1.4385789102904168e-07, "loss": 0.111, "step": 5188 }, { "epoch": 3.7, "grad_norm": 7.7803406635382215, "learning_rate": 1.4317041361893546e-07, "loss": 0.1682, "step": 5189 }, { "epoch": 3.7, "grad_norm": 5.665189120267458, "learning_rate": 1.4248455896587022e-07, "loss": 0.0935, "step": 5190 }, { "epoch": 3.71, "grad_norm": 8.005633628165455, "learning_rate": 1.418003272990004e-07, "loss": 0.1013, "step": 5191 }, { "epoch": 3.71, "grad_norm": 6.9506037573191515, "learning_rate": 1.4111771884694315e-07, "loss": 0.1016, "step": 5192 }, { "epoch": 3.71, "grad_norm": 5.8278679045020185, "learning_rate": 1.4043673383776825e-07, "loss": 0.0898, "step": 5193 }, { "epoch": 3.71, "grad_norm": 7.641464266293792, "learning_rate": 1.3975737249900812e-07, "loss": 0.1395, "step": 5194 }, { "epoch": 3.71, "grad_norm": 10.172133880539453, "learning_rate": 1.3907963505764731e-07, "loss": 0.1418, "step": 5195 }, { "epoch": 3.71, "grad_norm": 8.166463290616882, "learning_rate": 1.384035217401325e-07, "loss": 0.1249, "step": 5196 }, { "epoch": 3.71, "grad_norm": 7.385868049756229, "learning_rate": 1.3772903277236404e-07, "loss": 0.1636, "step": 5197 }, { "epoch": 3.71, "grad_norm": 7.952994717184201, "learning_rate": 1.370561683797028e-07, "loss": 0.1356, "step": 5198 }, { "epoch": 3.71, "grad_norm": 10.124774749021338, "learning_rate": 1.363849287869645e-07, "loss": 0.1481, "step": 5199 }, { "epoch": 3.71, "grad_norm": 6.320488348773809, "learning_rate": 1.3571531421842256e-07, "loss": 0.106, "step": 5200 }, { "epoch": 3.71, "grad_norm": 6.804058988616448, "learning_rate": 1.3504732489780849e-07, "loss": 0.114, "step": 5201 }, { "epoch": 3.71, "grad_norm": 7.514633914232519, "learning_rate": 1.3438096104830879e-07, "loss": 0.1368, "step": 5202 }, { "epoch": 3.71, "grad_norm": 5.702232298090348, "learning_rate": 1.3371622289256869e-07, "loss": 0.1162, "step": 5203 }, { "epoch": 3.71, "grad_norm": 4.859181618551624, "learning_rate": 1.3305311065269e-07, "loss": 0.0895, "step": 5204 }, { "epoch": 3.72, "grad_norm": 10.143736353282302, "learning_rate": 1.323916245502299e-07, "loss": 0.1407, "step": 5205 }, { "epoch": 3.72, "grad_norm": 6.899522459445874, "learning_rate": 1.3173176480620442e-07, "loss": 0.1295, "step": 5206 }, { "epoch": 3.72, "grad_norm": 5.406089544140982, "learning_rate": 1.3107353164108273e-07, "loss": 0.094, "step": 5207 }, { "epoch": 3.72, "grad_norm": 6.286668556628162, "learning_rate": 1.3041692527479556e-07, "loss": 0.125, "step": 5208 }, { "epoch": 3.72, "grad_norm": 6.2593888521125, "learning_rate": 1.2976194592672465e-07, "loss": 0.1161, "step": 5209 }, { "epoch": 3.72, "grad_norm": 10.413500581443682, "learning_rate": 1.2910859381571327e-07, "loss": 0.1522, "step": 5210 }, { "epoch": 3.72, "grad_norm": 7.838444174981346, "learning_rate": 1.284568691600563e-07, "loss": 0.1145, "step": 5211 }, { "epoch": 3.72, "grad_norm": 5.995371120181053, "learning_rate": 1.2780677217750949e-07, "loss": 0.1064, "step": 5212 }, { "epoch": 3.72, "grad_norm": 7.572479545934153, "learning_rate": 1.271583030852791e-07, "loss": 0.1183, "step": 5213 }, { "epoch": 3.72, "grad_norm": 6.113754497506733, "learning_rate": 1.2651146210003406e-07, "loss": 0.1029, "step": 5214 }, { "epoch": 3.72, "grad_norm": 5.198270804458642, "learning_rate": 1.2586624943789372e-07, "loss": 0.1324, "step": 5215 }, { "epoch": 3.72, "grad_norm": 7.473074709137638, "learning_rate": 1.2522266531443616e-07, "loss": 0.1241, "step": 5216 }, { "epoch": 3.72, "grad_norm": 13.207193018450559, "learning_rate": 1.245807099446955e-07, "loss": 0.1882, "step": 5217 }, { "epoch": 3.72, "grad_norm": 6.335816731636773, "learning_rate": 1.239403835431602e-07, "loss": 0.0775, "step": 5218 }, { "epoch": 3.73, "grad_norm": 10.620746803482923, "learning_rate": 1.2330168632377514e-07, "loss": 0.1576, "step": 5219 }, { "epoch": 3.73, "grad_norm": 7.84388438837485, "learning_rate": 1.2266461849994138e-07, "loss": 0.1257, "step": 5220 }, { "epoch": 3.73, "grad_norm": 7.3623646823821565, "learning_rate": 1.2202918028451527e-07, "loss": 0.1428, "step": 5221 }, { "epoch": 3.73, "grad_norm": 7.362173666685328, "learning_rate": 1.2139537188980753e-07, "loss": 0.1882, "step": 5222 }, { "epoch": 3.73, "grad_norm": 5.891116663875299, "learning_rate": 1.207631935275866e-07, "loss": 0.1367, "step": 5223 }, { "epoch": 3.73, "grad_norm": 5.184386123808496, "learning_rate": 1.2013264540907455e-07, "loss": 0.1078, "step": 5224 }, { "epoch": 3.73, "grad_norm": 6.019083925984621, "learning_rate": 1.1950372774494846e-07, "loss": 0.0882, "step": 5225 }, { "epoch": 3.73, "grad_norm": 6.246318195836614, "learning_rate": 1.1887644074534244e-07, "loss": 0.1174, "step": 5226 }, { "epoch": 3.73, "grad_norm": 7.847553205535199, "learning_rate": 1.182507846198444e-07, "loss": 0.1487, "step": 5227 }, { "epoch": 3.73, "grad_norm": 11.611646202895807, "learning_rate": 1.1762675957749769e-07, "loss": 0.2279, "step": 5228 }, { "epoch": 3.73, "grad_norm": 5.737661801161129, "learning_rate": 1.1700436582680108e-07, "loss": 0.1067, "step": 5229 }, { "epoch": 3.73, "grad_norm": 8.299874718836872, "learning_rate": 1.1638360357570654e-07, "loss": 0.1802, "step": 5230 }, { "epoch": 3.73, "grad_norm": 7.031326583486857, "learning_rate": 1.157644730316243e-07, "loss": 0.115, "step": 5231 }, { "epoch": 3.73, "grad_norm": 6.615993043157198, "learning_rate": 1.1514697440141498e-07, "loss": 0.116, "step": 5232 }, { "epoch": 3.74, "grad_norm": 6.184865547249952, "learning_rate": 1.1453110789139855e-07, "loss": 0.1007, "step": 5233 }, { "epoch": 3.74, "grad_norm": 6.274144565026902, "learning_rate": 1.1391687370734594e-07, "loss": 0.1247, "step": 5234 }, { "epoch": 3.74, "grad_norm": 8.242541318513114, "learning_rate": 1.1330427205448579e-07, "loss": 0.1362, "step": 5235 }, { "epoch": 3.74, "grad_norm": 12.150162498817286, "learning_rate": 1.1269330313749715e-07, "loss": 0.1766, "step": 5236 }, { "epoch": 3.74, "grad_norm": 6.308670112443291, "learning_rate": 1.1208396716051895e-07, "loss": 0.1266, "step": 5237 }, { "epoch": 3.74, "grad_norm": 6.6392974652541605, "learning_rate": 1.1147626432713943e-07, "loss": 0.0908, "step": 5238 }, { "epoch": 3.74, "grad_norm": 7.564703684509811, "learning_rate": 1.1087019484040562e-07, "loss": 0.1458, "step": 5239 }, { "epoch": 3.74, "grad_norm": 7.470040083902322, "learning_rate": 1.1026575890281443e-07, "loss": 0.1246, "step": 5240 }, { "epoch": 3.74, "grad_norm": 6.086766086218585, "learning_rate": 1.0966295671632043e-07, "loss": 0.1151, "step": 5241 }, { "epoch": 3.74, "grad_norm": 6.4424206168832585, "learning_rate": 1.0906178848233029e-07, "loss": 0.0956, "step": 5242 }, { "epoch": 3.74, "grad_norm": 7.511100671159325, "learning_rate": 1.0846225440170611e-07, "loss": 0.1285, "step": 5243 }, { "epoch": 3.74, "grad_norm": 5.8044703706351815, "learning_rate": 1.0786435467476264e-07, "loss": 0.1116, "step": 5244 }, { "epoch": 3.74, "grad_norm": 5.525857288049917, "learning_rate": 1.072680895012701e-07, "loss": 0.1162, "step": 5245 }, { "epoch": 3.74, "grad_norm": 6.722086445426205, "learning_rate": 1.0667345908045135e-07, "loss": 0.1187, "step": 5246 }, { "epoch": 3.75, "grad_norm": 5.848587661746584, "learning_rate": 1.0608046361098356e-07, "loss": 0.1238, "step": 5247 }, { "epoch": 3.75, "grad_norm": 8.613934378361632, "learning_rate": 1.0548910329099771e-07, "loss": 0.1035, "step": 5248 }, { "epoch": 3.75, "grad_norm": 5.758696950894796, "learning_rate": 1.048993783180785e-07, "loss": 0.1001, "step": 5249 }, { "epoch": 3.75, "grad_norm": 11.99570441487423, "learning_rate": 1.0431128888926222e-07, "loss": 0.1798, "step": 5250 }, { "epoch": 3.75, "grad_norm": 9.68337979709574, "learning_rate": 1.0372483520104337e-07, "loss": 0.2222, "step": 5251 }, { "epoch": 3.75, "grad_norm": 9.159716336335311, "learning_rate": 1.0314001744936409e-07, "loss": 0.1133, "step": 5252 }, { "epoch": 3.75, "grad_norm": 12.181529893720981, "learning_rate": 1.0255683582962583e-07, "loss": 0.1626, "step": 5253 }, { "epoch": 3.75, "grad_norm": 5.799677606632576, "learning_rate": 1.0197529053667721e-07, "loss": 0.0961, "step": 5254 }, { "epoch": 3.75, "grad_norm": 6.57621765664244, "learning_rate": 1.013953817648261e-07, "loss": 0.1383, "step": 5255 }, { "epoch": 3.75, "grad_norm": 6.189186863181362, "learning_rate": 1.008171097078292e-07, "loss": 0.1518, "step": 5256 }, { "epoch": 3.75, "grad_norm": 12.319055223640666, "learning_rate": 1.0024047455889918e-07, "loss": 0.1925, "step": 5257 }, { "epoch": 3.75, "grad_norm": 4.369429730589778, "learning_rate": 9.966547651069913e-08, "loss": 0.0916, "step": 5258 }, { "epoch": 3.75, "grad_norm": 7.524207259807521, "learning_rate": 9.909211575534705e-08, "loss": 0.132, "step": 5259 }, { "epoch": 3.75, "grad_norm": 7.236946977446954, "learning_rate": 9.852039248441414e-08, "loss": 0.1023, "step": 5260 }, { "epoch": 3.76, "grad_norm": 9.47859230035742, "learning_rate": 9.79503068889226e-08, "loss": 0.1814, "step": 5261 }, { "epoch": 3.76, "grad_norm": 8.758267109966468, "learning_rate": 9.738185915935005e-08, "loss": 0.175, "step": 5262 }, { "epoch": 3.76, "grad_norm": 9.27801847977357, "learning_rate": 9.681504948562403e-08, "loss": 0.0741, "step": 5263 }, { "epoch": 3.76, "grad_norm": 5.229611351433055, "learning_rate": 9.624987805712749e-08, "loss": 0.1229, "step": 5264 }, { "epoch": 3.76, "grad_norm": 7.05122561410525, "learning_rate": 9.568634506269381e-08, "loss": 0.1503, "step": 5265 }, { "epoch": 3.76, "grad_norm": 6.437579163294752, "learning_rate": 9.51244506906096e-08, "loss": 0.0978, "step": 5266 }, { "epoch": 3.76, "grad_norm": 4.629315173299733, "learning_rate": 9.45641951286158e-08, "loss": 0.076, "step": 5267 }, { "epoch": 3.76, "grad_norm": 12.414008878645971, "learning_rate": 9.400557856390158e-08, "loss": 0.1239, "step": 5268 }, { "epoch": 3.76, "grad_norm": 7.86659051015309, "learning_rate": 9.344860118311427e-08, "loss": 0.1056, "step": 5269 }, { "epoch": 3.76, "grad_norm": 4.829098697103711, "learning_rate": 9.289326317234726e-08, "loss": 0.114, "step": 5270 }, { "epoch": 3.76, "grad_norm": 5.748563587949899, "learning_rate": 9.23395647171521e-08, "loss": 0.1136, "step": 5271 }, { "epoch": 3.76, "grad_norm": 9.201838202910588, "learning_rate": 9.178750600252695e-08, "loss": 0.1781, "step": 5272 }, { "epoch": 3.76, "grad_norm": 10.329283760051627, "learning_rate": 9.123708721292756e-08, "loss": 0.1503, "step": 5273 }, { "epoch": 3.76, "grad_norm": 5.90976125143801, "learning_rate": 9.06883085322574e-08, "loss": 0.0987, "step": 5274 }, { "epoch": 3.77, "grad_norm": 9.31051888219555, "learning_rate": 9.014117014387424e-08, "loss": 0.1438, "step": 5275 }, { "epoch": 3.77, "grad_norm": 5.3011211156186935, "learning_rate": 8.95956722305874e-08, "loss": 0.1042, "step": 5276 }, { "epoch": 3.77, "grad_norm": 5.25084946851794, "learning_rate": 8.905181497465664e-08, "loss": 0.1144, "step": 5277 }, { "epoch": 3.77, "grad_norm": 12.014268472267695, "learning_rate": 8.850959855779662e-08, "loss": 0.1437, "step": 5278 }, { "epoch": 3.77, "grad_norm": 6.3680562353578, "learning_rate": 8.796902316117018e-08, "loss": 0.0974, "step": 5279 }, { "epoch": 3.77, "grad_norm": 8.104205605705294, "learning_rate": 8.743008896539451e-08, "loss": 0.1185, "step": 5280 }, { "epoch": 3.77, "grad_norm": 5.822397728872039, "learning_rate": 8.68927961505378e-08, "loss": 0.0966, "step": 5281 }, { "epoch": 3.77, "grad_norm": 6.962525356143755, "learning_rate": 8.635714489611868e-08, "loss": 0.1772, "step": 5282 }, { "epoch": 3.77, "grad_norm": 3.4875000040030812, "learning_rate": 8.582313538110898e-08, "loss": 0.0679, "step": 5283 }, { "epoch": 3.77, "grad_norm": 5.751258070965259, "learning_rate": 8.529076778393097e-08, "loss": 0.1108, "step": 5284 }, { "epoch": 3.77, "grad_norm": 22.636631698978753, "learning_rate": 8.476004228245848e-08, "loss": 0.2013, "step": 5285 }, { "epoch": 3.77, "grad_norm": 6.386527686790105, "learning_rate": 8.42309590540169e-08, "loss": 0.1111, "step": 5286 }, { "epoch": 3.77, "grad_norm": 7.245454822317008, "learning_rate": 8.370351827538259e-08, "loss": 0.1567, "step": 5287 }, { "epoch": 3.77, "grad_norm": 7.696534926561681, "learning_rate": 8.317772012278347e-08, "loss": 0.0975, "step": 5288 }, { "epoch": 3.78, "grad_norm": 7.159676601821471, "learning_rate": 8.26535647718979e-08, "loss": 0.1014, "step": 5289 }, { "epoch": 3.78, "grad_norm": 7.323349288176798, "learning_rate": 8.213105239785691e-08, "loss": 0.121, "step": 5290 }, { "epoch": 3.78, "grad_norm": 6.626972303043425, "learning_rate": 8.161018317524139e-08, "loss": 0.1101, "step": 5291 }, { "epoch": 3.78, "grad_norm": 9.572853627923612, "learning_rate": 8.109095727808269e-08, "loss": 0.1646, "step": 5292 }, { "epoch": 3.78, "grad_norm": 6.1048890627998045, "learning_rate": 8.057337487986427e-08, "loss": 0.1073, "step": 5293 }, { "epoch": 3.78, "grad_norm": 8.486123430984657, "learning_rate": 8.005743615352057e-08, "loss": 0.1229, "step": 5294 }, { "epoch": 3.78, "grad_norm": 10.624680701422056, "learning_rate": 7.954314127143481e-08, "loss": 0.1277, "step": 5295 }, { "epoch": 3.78, "grad_norm": 9.68142575432856, "learning_rate": 7.903049040544453e-08, "loss": 0.1707, "step": 5296 }, { "epoch": 3.78, "grad_norm": 5.035336991397328, "learning_rate": 7.851948372683382e-08, "loss": 0.1169, "step": 5297 }, { "epoch": 3.78, "grad_norm": 6.927256106603059, "learning_rate": 7.801012140634167e-08, "loss": 0.1127, "step": 5298 }, { "epoch": 3.78, "grad_norm": 8.320357114365319, "learning_rate": 7.750240361415362e-08, "loss": 0.1432, "step": 5299 }, { "epoch": 3.78, "grad_norm": 8.895191827361518, "learning_rate": 7.69963305199084e-08, "loss": 0.1588, "step": 5300 }, { "epoch": 3.78, "grad_norm": 8.444645977840397, "learning_rate": 7.64919022926941e-08, "loss": 0.1382, "step": 5301 }, { "epoch": 3.78, "grad_norm": 8.356571405819423, "learning_rate": 7.598911910105033e-08, "loss": 0.1312, "step": 5302 }, { "epoch": 3.79, "grad_norm": 9.29974473920337, "learning_rate": 7.548798111296552e-08, "loss": 0.1755, "step": 5303 }, { "epoch": 3.79, "grad_norm": 6.36759012920459, "learning_rate": 7.498848849588015e-08, "loss": 0.1384, "step": 5304 }, { "epoch": 3.79, "grad_norm": 6.686410092064511, "learning_rate": 7.449064141668238e-08, "loss": 0.0949, "step": 5305 }, { "epoch": 3.79, "grad_norm": 6.1045580573583695, "learning_rate": 7.399444004171364e-08, "loss": 0.1318, "step": 5306 }, { "epoch": 3.79, "grad_norm": 6.181571610180925, "learning_rate": 7.349988453676349e-08, "loss": 0.1101, "step": 5307 }, { "epoch": 3.79, "grad_norm": 5.948354119093815, "learning_rate": 7.300697506707254e-08, "loss": 0.1398, "step": 5308 }, { "epoch": 3.79, "grad_norm": 6.1083301561778685, "learning_rate": 7.251571179732963e-08, "loss": 0.1018, "step": 5309 }, { "epoch": 3.79, "grad_norm": 6.193753633726565, "learning_rate": 7.202609489167734e-08, "loss": 0.1401, "step": 5310 }, { "epoch": 3.79, "grad_norm": 6.971897988354667, "learning_rate": 7.153812451370312e-08, "loss": 0.1178, "step": 5311 }, { "epoch": 3.79, "grad_norm": 8.547983835503912, "learning_rate": 7.10518008264488e-08, "loss": 0.1567, "step": 5312 }, { "epoch": 3.79, "grad_norm": 5.901556010148924, "learning_rate": 7.056712399240274e-08, "loss": 0.0923, "step": 5313 }, { "epoch": 3.79, "grad_norm": 5.1229350172421615, "learning_rate": 7.008409417350648e-08, "loss": 0.1046, "step": 5314 }, { "epoch": 3.79, "grad_norm": 21.260270101970896, "learning_rate": 6.960271153114706e-08, "loss": 0.279, "step": 5315 }, { "epoch": 3.79, "grad_norm": 4.805978363368682, "learning_rate": 6.912297622616526e-08, "loss": 0.0815, "step": 5316 }, { "epoch": 3.8, "grad_norm": 11.926823320423209, "learning_rate": 6.864488841884786e-08, "loss": 0.1357, "step": 5317 }, { "epoch": 3.8, "grad_norm": 5.122540845339858, "learning_rate": 6.816844826893431e-08, "loss": 0.1118, "step": 5318 }, { "epoch": 3.8, "grad_norm": 8.90771512298503, "learning_rate": 6.769365593561117e-08, "loss": 0.1603, "step": 5319 }, { "epoch": 3.8, "grad_norm": 5.151048217047981, "learning_rate": 6.722051157751597e-08, "loss": 0.0963, "step": 5320 }, { "epoch": 3.8, "grad_norm": 7.657265995185122, "learning_rate": 6.674901535273448e-08, "loss": 0.101, "step": 5321 }, { "epoch": 3.8, "grad_norm": 6.07689362939208, "learning_rate": 6.627916741880291e-08, "loss": 0.1242, "step": 5322 }, { "epoch": 3.8, "grad_norm": 6.1248540676091565, "learning_rate": 6.581096793270625e-08, "loss": 0.073, "step": 5323 }, { "epoch": 3.8, "grad_norm": 9.295501650534323, "learning_rate": 6.534441705087768e-08, "loss": 0.1558, "step": 5324 }, { "epoch": 3.8, "grad_norm": 14.16116865988068, "learning_rate": 6.487951492920141e-08, "loss": 0.1665, "step": 5325 }, { "epoch": 3.8, "grad_norm": 6.979231240584932, "learning_rate": 6.441626172300986e-08, "loss": 0.149, "step": 5326 }, { "epoch": 3.8, "grad_norm": 8.526966234599504, "learning_rate": 6.395465758708419e-08, "loss": 0.1081, "step": 5327 }, { "epoch": 3.8, "grad_norm": 13.410360679446114, "learning_rate": 6.349470267565549e-08, "loss": 0.1249, "step": 5328 }, { "epoch": 3.8, "grad_norm": 6.87362720959621, "learning_rate": 6.303639714240196e-08, "loss": 0.1234, "step": 5329 }, { "epoch": 3.8, "grad_norm": 5.4425196121684865, "learning_rate": 6.257974114045385e-08, "loss": 0.0964, "step": 5330 }, { "epoch": 3.81, "grad_norm": 5.646662279495886, "learning_rate": 6.212473482238635e-08, "loss": 0.1497, "step": 5331 }, { "epoch": 3.81, "grad_norm": 6.057643625550772, "learning_rate": 6.167137834022785e-08, "loss": 0.0977, "step": 5332 }, { "epoch": 3.81, "grad_norm": 6.539793036897762, "learning_rate": 6.121967184545107e-08, "loss": 0.1141, "step": 5333 }, { "epoch": 3.81, "grad_norm": 5.761339793280051, "learning_rate": 6.076961548898086e-08, "loss": 0.0969, "step": 5334 }, { "epoch": 3.81, "grad_norm": 6.800019097577064, "learning_rate": 6.032120942118858e-08, "loss": 0.1007, "step": 5335 }, { "epoch": 3.81, "grad_norm": 5.289423884200621, "learning_rate": 5.98744537918955e-08, "loss": 0.0994, "step": 5336 }, { "epoch": 3.81, "grad_norm": 6.500424630787519, "learning_rate": 5.9429348750371097e-08, "loss": 0.1061, "step": 5337 }, { "epoch": 3.81, "grad_norm": 7.843475544162, "learning_rate": 5.898589444533254e-08, "loss": 0.146, "step": 5338 }, { "epoch": 3.81, "grad_norm": 11.118806698310935, "learning_rate": 5.85440910249474e-08, "loss": 0.1675, "step": 5339 }, { "epoch": 3.81, "grad_norm": 7.320211335379985, "learning_rate": 5.810393863682873e-08, "loss": 0.1436, "step": 5340 }, { "epoch": 3.81, "grad_norm": 6.079462595694946, "learning_rate": 5.7665437428041096e-08, "loss": 0.1572, "step": 5341 }, { "epoch": 3.81, "grad_norm": 7.477284639355565, "learning_rate": 5.722858754509564e-08, "loss": 0.1337, "step": 5342 }, { "epoch": 3.81, "grad_norm": 5.941412410817457, "learning_rate": 5.679338913395116e-08, "loss": 0.0836, "step": 5343 }, { "epoch": 3.81, "grad_norm": 15.49462238075577, "learning_rate": 5.6359842340016904e-08, "loss": 0.168, "step": 5344 }, { "epoch": 3.82, "grad_norm": 7.814014540138655, "learning_rate": 5.5927947308147545e-08, "loss": 0.0989, "step": 5345 }, { "epoch": 3.82, "grad_norm": 6.40539018080863, "learning_rate": 5.549770418264766e-08, "loss": 0.0836, "step": 5346 }, { "epoch": 3.82, "grad_norm": 8.444026783490271, "learning_rate": 5.5069113107270034e-08, "loss": 0.1371, "step": 5347 }, { "epoch": 3.82, "grad_norm": 6.902013975947065, "learning_rate": 5.464217422521456e-08, "loss": 0.0989, "step": 5348 }, { "epoch": 3.82, "grad_norm": 4.274615043574269, "learning_rate": 5.421688767912936e-08, "loss": 0.0985, "step": 5349 }, { "epoch": 3.82, "grad_norm": 5.836113346579141, "learning_rate": 5.3793253611110206e-08, "loss": 0.0904, "step": 5350 }, { "epoch": 3.82, "grad_norm": 10.277038370643657, "learning_rate": 5.3371272162702214e-08, "loss": 0.1272, "step": 5351 }, { "epoch": 3.82, "grad_norm": 7.427580721132972, "learning_rate": 5.295094347489593e-08, "loss": 0.1472, "step": 5352 }, { "epoch": 3.82, "grad_norm": 5.647665876407714, "learning_rate": 5.253226768813235e-08, "loss": 0.0901, "step": 5353 }, { "epoch": 3.82, "grad_norm": 9.858056079098924, "learning_rate": 5.211524494229736e-08, "loss": 0.1442, "step": 5354 }, { "epoch": 3.82, "grad_norm": 7.289792068781541, "learning_rate": 5.169987537672727e-08, "loss": 0.135, "step": 5355 }, { "epoch": 3.82, "grad_norm": 8.232332580344814, "learning_rate": 5.128615913020385e-08, "loss": 0.1414, "step": 5356 }, { "epoch": 3.82, "grad_norm": 6.759511421442111, "learning_rate": 5.087409634095819e-08, "loss": 0.1025, "step": 5357 }, { "epoch": 3.82, "grad_norm": 9.901571938672475, "learning_rate": 5.046368714666683e-08, "loss": 0.1069, "step": 5358 }, { "epoch": 3.83, "grad_norm": 6.677195791425341, "learning_rate": 5.0054931684457296e-08, "loss": 0.1721, "step": 5359 }, { "epoch": 3.83, "grad_norm": 7.5234884387987, "learning_rate": 4.964783009090035e-08, "loss": 0.1272, "step": 5360 }, { "epoch": 3.83, "grad_norm": 7.1565601746836665, "learning_rate": 4.9242382502017185e-08, "loss": 0.1442, "step": 5361 }, { "epoch": 3.83, "grad_norm": 6.612248876782069, "learning_rate": 4.883858905327499e-08, "loss": 0.1156, "step": 5362 }, { "epoch": 3.83, "grad_norm": 6.653265623648825, "learning_rate": 4.843644987958862e-08, "loss": 0.109, "step": 5363 }, { "epoch": 3.83, "grad_norm": 7.539454475937336, "learning_rate": 4.8035965115320604e-08, "loss": 0.1241, "step": 5364 }, { "epoch": 3.83, "grad_norm": 9.420747332016814, "learning_rate": 4.763713489428001e-08, "loss": 0.121, "step": 5365 }, { "epoch": 3.83, "grad_norm": 5.7408741621782, "learning_rate": 4.723995934972414e-08, "loss": 0.1077, "step": 5366 }, { "epoch": 3.83, "grad_norm": 7.081059176994714, "learning_rate": 4.684443861435572e-08, "loss": 0.1456, "step": 5367 }, { "epoch": 3.83, "grad_norm": 12.657636446362256, "learning_rate": 4.6450572820325727e-08, "loss": 0.1986, "step": 5368 }, { "epoch": 3.83, "grad_norm": 5.913660788091783, "learning_rate": 4.605836209923331e-08, "loss": 0.1097, "step": 5369 }, { "epoch": 3.83, "grad_norm": 7.210402941111909, "learning_rate": 4.566780658212144e-08, "loss": 0.1443, "step": 5370 }, { "epoch": 3.83, "grad_norm": 5.194586966451776, "learning_rate": 4.5278906399483516e-08, "loss": 0.0763, "step": 5371 }, { "epoch": 3.83, "grad_norm": 7.14924387954306, "learning_rate": 4.489166168125725e-08, "loss": 0.1792, "step": 5372 }, { "epoch": 3.84, "grad_norm": 9.147775379876485, "learning_rate": 4.4506072556829704e-08, "loss": 0.1053, "step": 5373 }, { "epoch": 3.84, "grad_norm": 10.730163857110522, "learning_rate": 4.4122139155031717e-08, "loss": 0.0945, "step": 5374 }, { "epoch": 3.84, "grad_norm": 9.80794922773675, "learning_rate": 4.373986160414345e-08, "loss": 0.117, "step": 5375 }, { "epoch": 3.84, "grad_norm": 5.52024439305837, "learning_rate": 4.335924003189107e-08, "loss": 0.0768, "step": 5376 }, { "epoch": 3.84, "grad_norm": 6.642666235134706, "learning_rate": 4.298027456544674e-08, "loss": 0.0894, "step": 5377 }, { "epoch": 3.84, "grad_norm": 8.033102864873747, "learning_rate": 4.260296533143027e-08, "loss": 0.1226, "step": 5378 }, { "epoch": 3.84, "grad_norm": 7.958363936548108, "learning_rate": 4.22273124559075e-08, "loss": 0.1164, "step": 5379 }, { "epoch": 3.84, "grad_norm": 6.064455639298206, "learning_rate": 4.185331606439136e-08, "loss": 0.116, "step": 5380 }, { "epoch": 3.84, "grad_norm": 7.2571709246078715, "learning_rate": 4.148097628184078e-08, "loss": 0.1587, "step": 5381 }, { "epoch": 3.84, "grad_norm": 5.531175996724435, "learning_rate": 4.111029323266125e-08, "loss": 0.1157, "step": 5382 }, { "epoch": 3.84, "grad_norm": 3.8346388590651186, "learning_rate": 4.07412670407048e-08, "loss": 0.0787, "step": 5383 }, { "epoch": 3.84, "grad_norm": 15.590850711055705, "learning_rate": 4.037389782927059e-08, "loss": 0.222, "step": 5384 }, { "epoch": 3.84, "grad_norm": 9.156488520749987, "learning_rate": 4.000818572110265e-08, "loss": 0.1475, "step": 5385 }, { "epoch": 3.84, "grad_norm": 6.095122261021763, "learning_rate": 3.964413083839269e-08, "loss": 0.0922, "step": 5386 }, { "epoch": 3.85, "grad_norm": 7.495658799999846, "learning_rate": 3.9281733302778404e-08, "loss": 0.099, "step": 5387 }, { "epoch": 3.85, "grad_norm": 6.902837764596447, "learning_rate": 3.892099323534293e-08, "loss": 0.1339, "step": 5388 }, { "epoch": 3.85, "grad_norm": 5.998154833344351, "learning_rate": 3.856191075661708e-08, "loss": 0.0916, "step": 5389 }, { "epoch": 3.85, "grad_norm": 8.054604203914527, "learning_rate": 3.8204485986576e-08, "loss": 0.1375, "step": 5390 }, { "epoch": 3.85, "grad_norm": 5.718326581624709, "learning_rate": 3.784871904464249e-08, "loss": 0.1193, "step": 5391 }, { "epoch": 3.85, "grad_norm": 6.0495502960815655, "learning_rate": 3.7494610049684796e-08, "loss": 0.1162, "step": 5392 }, { "epoch": 3.85, "grad_norm": 10.193965675630396, "learning_rate": 3.714215912001773e-08, "loss": 0.1565, "step": 5393 }, { "epoch": 3.85, "grad_norm": 5.735370222498481, "learning_rate": 3.6791366373400974e-08, "loss": 0.0824, "step": 5394 }, { "epoch": 3.85, "grad_norm": 8.796648636559523, "learning_rate": 3.6442231927041324e-08, "loss": 0.118, "step": 5395 }, { "epoch": 3.85, "grad_norm": 8.101682970617409, "learning_rate": 3.609475589759104e-08, "loss": 0.1078, "step": 5396 }, { "epoch": 3.85, "grad_norm": 6.642030857014743, "learning_rate": 3.574893840114835e-08, "loss": 0.1002, "step": 5397 }, { "epoch": 3.85, "grad_norm": 6.008978983404813, "learning_rate": 3.5404779553257494e-08, "loss": 0.1146, "step": 5398 }, { "epoch": 3.85, "grad_norm": 8.787617973801435, "learning_rate": 3.506227946890761e-08, "loss": 0.209, "step": 5399 }, { "epoch": 3.85, "grad_norm": 5.136877676664234, "learning_rate": 3.4721438262534935e-08, "loss": 0.0845, "step": 5400 }, { "epoch": 3.86, "grad_norm": 6.842953334692523, "learning_rate": 3.438225604802115e-08, "loss": 0.1489, "step": 5401 }, { "epoch": 3.86, "grad_norm": 5.907887079400435, "learning_rate": 3.404473293869226e-08, "loss": 0.1051, "step": 5402 }, { "epoch": 3.86, "grad_norm": 8.675482010996637, "learning_rate": 3.370886904732196e-08, "loss": 0.1149, "step": 5403 }, { "epoch": 3.86, "grad_norm": 5.556595931410459, "learning_rate": 3.33746644861288e-08, "loss": 0.0774, "step": 5404 }, { "epoch": 3.86, "grad_norm": 7.278531821366081, "learning_rate": 3.30421193667757e-08, "loss": 0.1306, "step": 5405 }, { "epoch": 3.86, "grad_norm": 5.349133272872112, "learning_rate": 3.271123380037322e-08, "loss": 0.0956, "step": 5406 }, { "epoch": 3.86, "grad_norm": 6.334550501760739, "learning_rate": 3.2382007897475695e-08, "loss": 0.0842, "step": 5407 }, { "epoch": 3.86, "grad_norm": 6.826027471570106, "learning_rate": 3.2054441768083477e-08, "loss": 0.1401, "step": 5408 }, { "epoch": 3.86, "grad_norm": 6.591080668686281, "learning_rate": 3.1728535521643454e-08, "loss": 0.1153, "step": 5409 }, { "epoch": 3.86, "grad_norm": 4.910777250160643, "learning_rate": 3.1404289267046305e-08, "loss": 0.0955, "step": 5410 }, { "epoch": 3.86, "grad_norm": 6.193743146559833, "learning_rate": 3.1081703112628146e-08, "loss": 0.1156, "step": 5411 }, { "epoch": 3.86, "grad_norm": 4.131743099975989, "learning_rate": 3.0760777166172206e-08, "loss": 0.0914, "step": 5412 }, { "epoch": 3.86, "grad_norm": 9.147186309625447, "learning_rate": 3.0441511534904934e-08, "loss": 0.186, "step": 5413 }, { "epoch": 3.86, "grad_norm": 16.89801450783667, "learning_rate": 3.012390632549933e-08, "loss": 0.2162, "step": 5414 }, { "epoch": 3.87, "grad_norm": 7.065640199939804, "learning_rate": 2.9807961644073294e-08, "loss": 0.1641, "step": 5415 }, { "epoch": 3.87, "grad_norm": 5.507514754630227, "learning_rate": 2.9493677596189595e-08, "loss": 0.1292, "step": 5416 }, { "epoch": 3.87, "grad_norm": 6.253875594598688, "learning_rate": 2.9181054286855916e-08, "loss": 0.1044, "step": 5417 }, { "epoch": 3.87, "grad_norm": 9.876290985174975, "learning_rate": 2.887009182052647e-08, "loss": 0.2092, "step": 5418 }, { "epoch": 3.87, "grad_norm": 8.262022252186037, "learning_rate": 2.8560790301098705e-08, "loss": 0.1256, "step": 5419 }, { "epoch": 3.87, "grad_norm": 4.444790652855763, "learning_rate": 2.825314983191718e-08, "loss": 0.0795, "step": 5420 }, { "epoch": 3.87, "grad_norm": 5.034772326384623, "learning_rate": 2.7947170515768562e-08, "loss": 0.1035, "step": 5421 }, { "epoch": 3.87, "grad_norm": 8.398317634313441, "learning_rate": 2.7642852454887736e-08, "loss": 0.1326, "step": 5422 }, { "epoch": 3.87, "grad_norm": 4.212256085984667, "learning_rate": 2.7340195750952813e-08, "loss": 0.0872, "step": 5423 }, { "epoch": 3.87, "grad_norm": 6.414797855257343, "learning_rate": 2.703920050508624e-08, "loss": 0.1196, "step": 5424 }, { "epoch": 3.87, "grad_norm": 6.604779209237305, "learning_rate": 2.673986681785645e-08, "loss": 0.1197, "step": 5425 }, { "epoch": 3.87, "grad_norm": 4.618675757820333, "learning_rate": 2.6442194789277342e-08, "loss": 0.0822, "step": 5426 }, { "epoch": 3.87, "grad_norm": 10.808901861160757, "learning_rate": 2.6146184518804908e-08, "loss": 0.1245, "step": 5427 }, { "epoch": 3.87, "grad_norm": 4.610150315103935, "learning_rate": 2.5851836105343363e-08, "loss": 0.098, "step": 5428 }, { "epoch": 3.88, "grad_norm": 9.495465719612765, "learning_rate": 2.555914964723849e-08, "loss": 0.1179, "step": 5429 }, { "epoch": 3.88, "grad_norm": 6.207059094738852, "learning_rate": 2.5268125242283724e-08, "loss": 0.0848, "step": 5430 }, { "epoch": 3.88, "grad_norm": 6.450983131869945, "learning_rate": 2.4978762987714067e-08, "loss": 0.123, "step": 5431 }, { "epoch": 3.88, "grad_norm": 10.375350232266669, "learning_rate": 2.469106298021273e-08, "loss": 0.165, "step": 5432 }, { "epoch": 3.88, "grad_norm": 6.503010938323237, "learning_rate": 2.4405025315904495e-08, "loss": 0.1229, "step": 5433 }, { "epoch": 3.88, "grad_norm": 8.816882888336334, "learning_rate": 2.412065009036013e-08, "loss": 0.1213, "step": 5434 }, { "epoch": 3.88, "grad_norm": 5.889173798463627, "learning_rate": 2.3837937398594747e-08, "loss": 0.1117, "step": 5435 }, { "epoch": 3.88, "grad_norm": 8.450886362130994, "learning_rate": 2.3556887335067223e-08, "loss": 0.1573, "step": 5436 }, { "epoch": 3.88, "grad_norm": 7.633750300186745, "learning_rate": 2.3277499993682452e-08, "loss": 0.1022, "step": 5437 }, { "epoch": 3.88, "grad_norm": 5.677983573724589, "learning_rate": 2.2999775467788532e-08, "loss": 0.0836, "step": 5438 }, { "epoch": 3.88, "grad_norm": 6.121189112026573, "learning_rate": 2.272371385017902e-08, "loss": 0.1059, "step": 5439 }, { "epoch": 3.88, "grad_norm": 6.797540986482973, "learning_rate": 2.244931523309013e-08, "loss": 0.1263, "step": 5440 }, { "epoch": 3.88, "grad_norm": 6.379117284864385, "learning_rate": 2.2176579708204636e-08, "loss": 0.1324, "step": 5441 }, { "epoch": 3.88, "grad_norm": 10.852701175072118, "learning_rate": 2.190550736664798e-08, "loss": 0.2468, "step": 5442 }, { "epoch": 3.89, "grad_norm": 7.513796502846481, "learning_rate": 2.163609829898994e-08, "loss": 0.1409, "step": 5443 }, { "epoch": 3.89, "grad_norm": 9.610713700051518, "learning_rate": 2.136835259524628e-08, "loss": 0.1302, "step": 5444 }, { "epoch": 3.89, "grad_norm": 7.7294639743377545, "learning_rate": 2.1102270344874887e-08, "loss": 0.1514, "step": 5445 }, { "epoch": 3.89, "grad_norm": 5.143827292567964, "learning_rate": 2.083785163677965e-08, "loss": 0.0825, "step": 5446 }, { "epoch": 3.89, "grad_norm": 5.874115522509679, "learning_rate": 2.0575096559306564e-08, "loss": 0.1707, "step": 5447 }, { "epoch": 3.89, "grad_norm": 5.5660499922054685, "learning_rate": 2.0314005200248178e-08, "loss": 0.0833, "step": 5448 }, { "epoch": 3.89, "grad_norm": 6.291346107187886, "learning_rate": 2.0054577646839156e-08, "loss": 0.104, "step": 5449 }, { "epoch": 3.89, "grad_norm": 7.101207817053639, "learning_rate": 1.979681398575961e-08, "loss": 0.1224, "step": 5450 }, { "epoch": 3.89, "grad_norm": 8.288766121361911, "learning_rate": 1.954071430313287e-08, "loss": 0.1025, "step": 5451 }, { "epoch": 3.89, "grad_norm": 7.137866886658638, "learning_rate": 1.9286278684526593e-08, "loss": 0.1288, "step": 5452 }, { "epoch": 3.89, "grad_norm": 8.577387327531774, "learning_rate": 1.9033507214952784e-08, "loss": 0.0977, "step": 5453 }, { "epoch": 3.89, "grad_norm": 7.789300210080421, "learning_rate": 1.878239997886666e-08, "loss": 0.1188, "step": 5454 }, { "epoch": 3.89, "grad_norm": 8.261173115162729, "learning_rate": 1.853295706016778e-08, "loss": 0.145, "step": 5455 }, { "epoch": 3.89, "grad_norm": 8.57419604447386, "learning_rate": 1.8285178542200022e-08, "loss": 0.1451, "step": 5456 }, { "epoch": 3.9, "grad_norm": 8.624820178787198, "learning_rate": 1.8039064507750503e-08, "loss": 0.0851, "step": 5457 }, { "epoch": 3.9, "grad_norm": 5.3281446819414855, "learning_rate": 1.7794615039050665e-08, "loss": 0.1077, "step": 5458 }, { "epoch": 3.9, "grad_norm": 6.719079268451326, "learning_rate": 1.7551830217775734e-08, "loss": 0.1064, "step": 5459 }, { "epoch": 3.9, "grad_norm": 6.361150677660455, "learning_rate": 1.7310710125044707e-08, "loss": 0.1342, "step": 5460 }, { "epoch": 3.9, "grad_norm": 9.934178727478375, "learning_rate": 1.7071254841419805e-08, "loss": 0.188, "step": 5461 }, { "epoch": 3.9, "grad_norm": 5.961703433504403, "learning_rate": 1.6833464446907588e-08, "loss": 0.126, "step": 5462 }, { "epoch": 3.9, "grad_norm": 4.709967478366248, "learning_rate": 1.6597339020958393e-08, "loss": 0.0972, "step": 5463 }, { "epoch": 3.9, "grad_norm": 7.621860983040733, "learning_rate": 1.6362878642466328e-08, "loss": 0.1342, "step": 5464 }, { "epoch": 3.9, "grad_norm": 10.209473650410427, "learning_rate": 1.6130083389768735e-08, "loss": 0.1364, "step": 5465 }, { "epoch": 3.9, "grad_norm": 6.139084520540914, "learning_rate": 1.5898953340646728e-08, "loss": 0.1218, "step": 5466 }, { "epoch": 3.9, "grad_norm": 9.71994860554082, "learning_rate": 1.5669488572325197e-08, "loss": 0.1554, "step": 5467 }, { "epoch": 3.9, "grad_norm": 12.086414770955265, "learning_rate": 1.5441689161472816e-08, "loss": 0.1787, "step": 5468 }, { "epoch": 3.9, "grad_norm": 24.407807669084864, "learning_rate": 1.521555518420148e-08, "loss": 0.1927, "step": 5469 }, { "epoch": 3.9, "grad_norm": 6.650159393488484, "learning_rate": 1.499108671606686e-08, "loss": 0.1473, "step": 5470 }, { "epoch": 3.91, "grad_norm": 6.438063211825578, "learning_rate": 1.4768283832067853e-08, "loss": 0.1339, "step": 5471 }, { "epoch": 3.91, "grad_norm": 15.259887152827618, "learning_rate": 1.4547146606646578e-08, "loss": 0.2236, "step": 5472 }, { "epoch": 3.91, "grad_norm": 5.182419293842352, "learning_rate": 1.4327675113690598e-08, "loss": 0.0764, "step": 5473 }, { "epoch": 3.91, "grad_norm": 9.2094503251359, "learning_rate": 1.4109869426527368e-08, "loss": 0.1317, "step": 5474 }, { "epoch": 3.91, "grad_norm": 6.283356876259326, "learning_rate": 1.3893729617931451e-08, "loss": 0.1119, "step": 5475 }, { "epoch": 3.91, "grad_norm": 6.905426440583543, "learning_rate": 1.3679255760118415e-08, "loss": 0.1092, "step": 5476 }, { "epoch": 3.91, "grad_norm": 5.641634779840197, "learning_rate": 1.3466447924748716e-08, "loss": 0.099, "step": 5477 }, { "epoch": 3.91, "grad_norm": 4.588743851538649, "learning_rate": 1.3255306182924365e-08, "loss": 0.0688, "step": 5478 }, { "epoch": 3.91, "grad_norm": 8.406396921378816, "learning_rate": 1.3045830605192266e-08, "loss": 0.1873, "step": 5479 }, { "epoch": 3.91, "grad_norm": 5.491021032175865, "learning_rate": 1.2838021261541988e-08, "loss": 0.134, "step": 5480 }, { "epoch": 3.91, "grad_norm": 7.735288437189416, "learning_rate": 1.263187822140688e-08, "loss": 0.1151, "step": 5481 }, { "epoch": 3.91, "grad_norm": 9.538077504415023, "learning_rate": 1.2427401553662955e-08, "loss": 0.1193, "step": 5482 }, { "epoch": 3.91, "grad_norm": 13.617139287693995, "learning_rate": 1.2224591326628898e-08, "loss": 0.2377, "step": 5483 }, { "epoch": 3.91, "grad_norm": 7.037545116460254, "learning_rate": 1.2023447608068283e-08, "loss": 0.1289, "step": 5484 }, { "epoch": 3.92, "grad_norm": 7.051654601622574, "learning_rate": 1.182397046518735e-08, "loss": 0.0864, "step": 5485 }, { "epoch": 3.92, "grad_norm": 7.660167436446467, "learning_rate": 1.1626159964633899e-08, "loss": 0.1224, "step": 5486 }, { "epoch": 3.92, "grad_norm": 5.448552963768357, "learning_rate": 1.1430016172501169e-08, "loss": 0.0796, "step": 5487 }, { "epoch": 3.92, "grad_norm": 5.531596088000746, "learning_rate": 1.1235539154323405e-08, "loss": 0.1172, "step": 5488 }, { "epoch": 3.92, "grad_norm": 7.503280936817476, "learning_rate": 1.1042728975079741e-08, "loss": 0.1206, "step": 5489 }, { "epoch": 3.92, "grad_norm": 13.364423091479154, "learning_rate": 1.0851585699191425e-08, "loss": 0.1788, "step": 5490 }, { "epoch": 3.92, "grad_norm": 5.933243904552928, "learning_rate": 1.0662109390522924e-08, "loss": 0.1199, "step": 5491 }, { "epoch": 3.92, "grad_norm": 7.257081865184139, "learning_rate": 1.047430011238193e-08, "loss": 0.1149, "step": 5492 }, { "epoch": 3.92, "grad_norm": 7.065807785440134, "learning_rate": 1.028815792751936e-08, "loss": 0.11, "step": 5493 }, { "epoch": 3.92, "grad_norm": 4.983139579728731, "learning_rate": 1.0103682898128241e-08, "loss": 0.075, "step": 5494 }, { "epoch": 3.92, "grad_norm": 10.618784716584416, "learning_rate": 9.920875085845383e-09, "loss": 0.1307, "step": 5495 }, { "epoch": 3.92, "grad_norm": 6.388645197421817, "learning_rate": 9.739734551749703e-09, "loss": 0.1406, "step": 5496 }, { "epoch": 3.92, "grad_norm": 10.393540666994594, "learning_rate": 9.560261356364452e-09, "loss": 0.1676, "step": 5497 }, { "epoch": 3.92, "grad_norm": 6.166647278650001, "learning_rate": 9.382455559654446e-09, "loss": 0.1536, "step": 5498 }, { "epoch": 3.93, "grad_norm": 4.81742399868161, "learning_rate": 9.206317221027717e-09, "loss": 0.0848, "step": 5499 }, { "epoch": 3.93, "grad_norm": 7.5698480294633415, "learning_rate": 9.031846399336075e-09, "loss": 0.1356, "step": 5500 }, { "epoch": 3.93, "eval_avg_AUC": 0.7950252002273744, "eval_avg_Accuracy": 0.7066893236074271, "eval_avg_Accuracy-right": 0.8786357114908048, "eval_avg_Accuracy-wrong": 0.4068683193086195, "eval_avg_Num questions with both labels": 523, "eval_avg_Question-wise AUC": 0.6590350719883701, "eval_last_AUC": 0.8161622767447089, "eval_last_Accuracy": 0.741959549071618, "eval_last_Accuracy-right": 0.8153123777227077, "eval_last_Accuracy-wrong": 0.6140550375255857, "eval_last_Num questions with both labels": 523, "eval_last_Question-wise AUC": 0.6823020710938568, "eval_max_AUC": 0.77593042794898, "eval_max_Accuracy": 0.6453083554376657, "eval_max_Accuracy-right": 0.9788704838920047, "eval_max_Accuracy-wrong": 0.06367978166932, "eval_max_Num questions with both labels": 523, "eval_max_Question-wise AUC": 0.6221345468988383, "eval_min_AUC": 0.8001189509346562, "eval_min_Accuracy": 0.7241379310344828, "eval_min_Accuracy-right": 0.7241424285900613, "eval_min_Accuracy-wrong": 0.7241300886968387, "eval_min_Num questions with both labels": 523, "eval_min_Question-wise AUC": 0.6609511576435902, "eval_prod_AUC": 0.802284305563165, "eval_prod_Accuracy": 0.7134864058355438, "eval_prod_Accuracy-right": 0.6670796921872962, "eval_prod_Accuracy-wrong": 0.7944052763247669, "eval_prod_Num questions with both labels": 523, "eval_prod_Question-wise AUC": 0.6623692613743422, "eval_runtime": 248.6606, "eval_samples_per_second": 97.032, "eval_steps_per_second": 3.032, "eval_sum_AUC": 0.669465043974663, "eval_sum_Accuracy": 0.6391329575596817, "eval_sum_Accuracy-right": 0.9867614451545585, "eval_sum_Accuracy-wrong": 0.03297702979304071, "eval_sum_Num questions with both labels": 523, "eval_sum_Question-wise AUC": 0.6489398581558867, "step": 5500 }, { "epoch": 3.93, "grad_norm": 8.429106410555258, "learning_rate": 8.859043152872892e-09, "loss": 0.1237, "step": 5501 }, { "epoch": 3.93, "grad_norm": 8.009167896999422, "learning_rate": 8.687907539375318e-09, "loss": 0.1164, "step": 5502 }, { "epoch": 3.93, "grad_norm": 5.780427122348584, "learning_rate": 8.518439616022057e-09, "loss": 0.1174, "step": 5503 }, { "epoch": 3.93, "grad_norm": 7.547785765991373, "learning_rate": 8.350639439436703e-09, "loss": 0.1143, "step": 5504 }, { "epoch": 3.93, "grad_norm": 6.131647170129058, "learning_rate": 8.184507065683855e-09, "loss": 0.0824, "step": 5505 }, { "epoch": 3.93, "grad_norm": 11.483331590260356, "learning_rate": 8.020042550271889e-09, "loss": 0.1688, "step": 5506 }, { "epoch": 3.93, "grad_norm": 4.2724277741797625, "learning_rate": 7.857245948150183e-09, "loss": 0.1176, "step": 5507 }, { "epoch": 3.93, "grad_norm": 5.573181505757999, "learning_rate": 7.696117313713559e-09, "loss": 0.1044, "step": 5508 }, { "epoch": 3.93, "grad_norm": 4.598852408301717, "learning_rate": 7.536656700797284e-09, "loss": 0.0922, "step": 5509 }, { "epoch": 3.93, "grad_norm": 10.981912827262958, "learning_rate": 7.37886416268041e-09, "loss": 0.1188, "step": 5510 }, { "epoch": 3.93, "grad_norm": 10.405947394175044, "learning_rate": 7.222739752084096e-09, "loss": 0.1179, "step": 5511 }, { "epoch": 3.93, "grad_norm": 6.540864550860748, "learning_rate": 7.068283521172725e-09, "loss": 0.1253, "step": 5512 }, { "epoch": 3.94, "grad_norm": 7.525882733341714, "learning_rate": 6.915495521552795e-09, "loss": 0.1077, "step": 5513 }, { "epoch": 3.94, "grad_norm": 6.164782248488719, "learning_rate": 6.764375804274026e-09, "loss": 0.1436, "step": 5514 }, { "epoch": 3.94, "grad_norm": 7.444440508428989, "learning_rate": 6.61492441982714e-09, "loss": 0.1319, "step": 5515 }, { "epoch": 3.94, "grad_norm": 4.812978938104751, "learning_rate": 6.467141418147748e-09, "loss": 0.0881, "step": 5516 }, { "epoch": 3.94, "grad_norm": 5.184902484678889, "learning_rate": 6.321026848613021e-09, "loss": 0.1021, "step": 5517 }, { "epoch": 3.94, "grad_norm": 4.699439697594633, "learning_rate": 6.176580760041684e-09, "loss": 0.0901, "step": 5518 }, { "epoch": 3.94, "grad_norm": 7.963371822825025, "learning_rate": 6.033803200696242e-09, "loss": 0.1211, "step": 5519 }, { "epoch": 3.94, "grad_norm": 8.425984631270113, "learning_rate": 5.892694218281869e-09, "loss": 0.1633, "step": 5520 }, { "epoch": 3.94, "grad_norm": 7.949188213453105, "learning_rate": 5.753253859944741e-09, "loss": 0.1158, "step": 5521 }, { "epoch": 3.94, "grad_norm": 9.136952727452089, "learning_rate": 5.615482172275366e-09, "loss": 0.1742, "step": 5522 }, { "epoch": 3.94, "grad_norm": 7.699454602256628, "learning_rate": 5.479379201305257e-09, "loss": 0.1188, "step": 5523 }, { "epoch": 3.94, "grad_norm": 7.953008550854942, "learning_rate": 5.344944992509149e-09, "loss": 0.1521, "step": 5524 }, { "epoch": 3.94, "grad_norm": 6.618564531824671, "learning_rate": 5.212179590803335e-09, "loss": 0.1507, "step": 5525 }, { "epoch": 3.94, "grad_norm": 9.999104323435565, "learning_rate": 5.08108304054844e-09, "loss": 0.1151, "step": 5526 }, { "epoch": 3.95, "grad_norm": 15.707971260259198, "learning_rate": 4.9516553855455395e-09, "loss": 0.1814, "step": 5527 }, { "epoch": 3.95, "grad_norm": 8.642743845716193, "learning_rate": 4.82389666903893e-09, "loss": 0.116, "step": 5528 }, { "epoch": 3.95, "grad_norm": 7.118772631160384, "learning_rate": 4.697806933715021e-09, "loss": 0.125, "step": 5529 }, { "epoch": 3.95, "grad_norm": 9.23091508452254, "learning_rate": 4.573386221703446e-09, "loss": 0.1327, "step": 5530 }, { "epoch": 3.95, "grad_norm": 13.387861094338058, "learning_rate": 4.450634574574286e-09, "loss": 0.1758, "step": 5531 }, { "epoch": 3.95, "grad_norm": 5.407477037058821, "learning_rate": 4.329552033341955e-09, "loss": 0.0692, "step": 5532 }, { "epoch": 3.95, "grad_norm": 5.614578814740924, "learning_rate": 4.210138638462424e-09, "loss": 0.0916, "step": 5533 }, { "epoch": 3.95, "grad_norm": 10.253247016948384, "learning_rate": 4.0923944298337796e-09, "loss": 0.1483, "step": 5534 }, { "epoch": 3.95, "grad_norm": 6.932631685042542, "learning_rate": 3.976319446795662e-09, "loss": 0.1497, "step": 5535 }, { "epoch": 3.95, "grad_norm": 7.467510399283004, "learning_rate": 3.8619137281326044e-09, "loss": 0.1044, "step": 5536 }, { "epoch": 3.95, "grad_norm": 8.923637692156998, "learning_rate": 3.749177312068475e-09, "loss": 0.1453, "step": 5537 }, { "epoch": 3.95, "grad_norm": 13.938583660145987, "learning_rate": 3.63811023627092e-09, "loss": 0.183, "step": 5538 }, { "epoch": 3.95, "grad_norm": 10.314501087676051, "learning_rate": 3.528712537849144e-09, "loss": 0.1874, "step": 5539 }, { "epoch": 3.95, "grad_norm": 7.711784952194379, "learning_rate": 3.42098425335613e-09, "loss": 0.1631, "step": 5540 }, { "epoch": 3.96, "grad_norm": 7.6746819815164455, "learning_rate": 3.3149254187841985e-09, "loss": 0.1301, "step": 5541 }, { "epoch": 3.96, "grad_norm": 6.077385997300826, "learning_rate": 3.210536069571113e-09, "loss": 0.1223, "step": 5542 }, { "epoch": 3.96, "grad_norm": 6.60490536386744, "learning_rate": 3.1078162405939747e-09, "loss": 0.1371, "step": 5543 }, { "epoch": 3.96, "grad_norm": 6.546314650170883, "learning_rate": 3.006765966174774e-09, "loss": 0.1312, "step": 5544 }, { "epoch": 3.96, "grad_norm": 6.402040317933734, "learning_rate": 2.907385280075392e-09, "loss": 0.1078, "step": 5545 }, { "epoch": 3.96, "grad_norm": 7.1113178071259355, "learning_rate": 2.80967421550038e-09, "loss": 0.124, "step": 5546 }, { "epoch": 3.96, "grad_norm": 9.882733207585435, "learning_rate": 2.7136328050980654e-09, "loss": 0.2028, "step": 5547 }, { "epoch": 3.96, "grad_norm": 8.22923561778551, "learning_rate": 2.6192610809566697e-09, "loss": 0.1747, "step": 5548 }, { "epoch": 3.96, "grad_norm": 9.744168083139122, "learning_rate": 2.5265590746076373e-09, "loss": 0.0875, "step": 5549 }, { "epoch": 3.96, "grad_norm": 7.470206404886737, "learning_rate": 2.43552681702508e-09, "loss": 0.1339, "step": 5550 }, { "epoch": 3.96, "grad_norm": 5.8082392348378225, "learning_rate": 2.346164338624113e-09, "loss": 0.1084, "step": 5551 }, { "epoch": 3.96, "grad_norm": 6.477160650962435, "learning_rate": 2.2584716692619636e-09, "loss": 0.1305, "step": 5552 }, { "epoch": 3.96, "grad_norm": 6.159335395880286, "learning_rate": 2.172448838239083e-09, "loss": 0.1267, "step": 5553 }, { "epoch": 3.96, "grad_norm": 5.573462560585101, "learning_rate": 2.08809587429748e-09, "loss": 0.0966, "step": 5554 }, { "epoch": 3.97, "grad_norm": 4.343642412756653, "learning_rate": 2.0054128056201662e-09, "loss": 0.0584, "step": 5555 }, { "epoch": 3.97, "grad_norm": 8.254147958795857, "learning_rate": 1.924399659833376e-09, "loss": 0.139, "step": 5556 }, { "epoch": 3.97, "grad_norm": 10.092190845622381, "learning_rate": 1.8450564640054569e-09, "loss": 0.1674, "step": 5557 }, { "epoch": 3.97, "grad_norm": 8.035445013347708, "learning_rate": 1.7673832446463146e-09, "loss": 0.1506, "step": 5558 }, { "epoch": 3.97, "grad_norm": 8.832099606863865, "learning_rate": 1.6913800277085225e-09, "loss": 0.1419, "step": 5559 }, { "epoch": 3.97, "grad_norm": 9.404074142401095, "learning_rate": 1.6170468385845462e-09, "loss": 0.1946, "step": 5560 }, { "epoch": 3.97, "grad_norm": 4.577566022959948, "learning_rate": 1.5443837021122954e-09, "loss": 0.0952, "step": 5561 }, { "epoch": 3.97, "grad_norm": 7.343705255356882, "learning_rate": 1.473390642569017e-09, "loss": 0.1126, "step": 5562 }, { "epoch": 3.97, "grad_norm": 7.2585089112143795, "learning_rate": 1.4040676836746259e-09, "loss": 0.1133, "step": 5563 }, { "epoch": 3.97, "grad_norm": 6.784254763986543, "learning_rate": 1.336414848591705e-09, "loss": 0.1276, "step": 5564 }, { "epoch": 3.97, "grad_norm": 5.854702420201817, "learning_rate": 1.2704321599243951e-09, "loss": 0.0994, "step": 5565 }, { "epoch": 3.97, "grad_norm": 5.753710545202959, "learning_rate": 1.206119639718395e-09, "loss": 0.1279, "step": 5566 }, { "epoch": 3.97, "grad_norm": 7.307788810185479, "learning_rate": 1.1434773094615158e-09, "loss": 0.1118, "step": 5567 }, { "epoch": 3.97, "grad_norm": 16.079904243512647, "learning_rate": 1.0825051900842377e-09, "loss": 0.1475, "step": 5568 }, { "epoch": 3.98, "grad_norm": 6.869485387673937, "learning_rate": 1.0232033019580423e-09, "loss": 0.1205, "step": 5569 }, { "epoch": 3.98, "grad_norm": 9.561539939824815, "learning_rate": 9.655716648970804e-10, "loss": 0.1674, "step": 5570 }, { "epoch": 3.98, "grad_norm": 8.986453072022883, "learning_rate": 9.096102981570598e-10, "loss": 0.166, "step": 5571 }, { "epoch": 3.98, "grad_norm": 4.372199399777608, "learning_rate": 8.553192204358018e-10, "loss": 0.0487, "step": 5572 }, { "epoch": 3.98, "grad_norm": 8.885464213647072, "learning_rate": 8.026984498726853e-10, "loss": 0.1139, "step": 5573 }, { "epoch": 3.98, "grad_norm": 4.969673215125041, "learning_rate": 7.517480040497572e-10, "loss": 0.0748, "step": 5574 }, { "epoch": 3.98, "grad_norm": 8.019389192351877, "learning_rate": 7.024678999900669e-10, "loss": 0.1306, "step": 5575 }, { "epoch": 3.98, "grad_norm": 6.295109377091253, "learning_rate": 6.548581541593324e-10, "loss": 0.1327, "step": 5576 }, { "epoch": 3.98, "grad_norm": 8.69637612538181, "learning_rate": 6.08918782464829e-10, "loss": 0.1021, "step": 5577 }, { "epoch": 3.98, "grad_norm": 6.261474454096343, "learning_rate": 5.646498002553902e-10, "loss": 0.0916, "step": 5578 }, { "epoch": 3.98, "grad_norm": 16.482766831647567, "learning_rate": 5.220512223219621e-10, "loss": 0.2062, "step": 5579 }, { "epoch": 3.98, "grad_norm": 7.218560157195067, "learning_rate": 4.81123062898714e-10, "loss": 0.1332, "step": 5580 }, { "epoch": 3.98, "grad_norm": 4.056314731703903, "learning_rate": 4.4186533565915293e-10, "loss": 0.0832, "step": 5581 }, { "epoch": 3.98, "grad_norm": 6.337018112627061, "learning_rate": 4.042780537205637e-10, "loss": 0.1094, "step": 5582 }, { "epoch": 3.99, "grad_norm": 8.678745644886078, "learning_rate": 3.6836122964178934e-10, "loss": 0.1344, "step": 5583 }, { "epoch": 3.99, "grad_norm": 8.323983732961677, "learning_rate": 3.341148754232304e-10, "loss": 0.112, "step": 5584 }, { "epoch": 3.99, "grad_norm": 6.730233285070333, "learning_rate": 3.015390025068454e-10, "loss": 0.1141, "step": 5585 }, { "epoch": 3.99, "grad_norm": 5.2887206705015775, "learning_rate": 2.706336217767058e-10, "loss": 0.0966, "step": 5586 }, { "epoch": 3.99, "grad_norm": 10.523107370264057, "learning_rate": 2.4139874355955105e-10, "loss": 0.1647, "step": 5587 }, { "epoch": 3.99, "grad_norm": 7.034757136628954, "learning_rate": 2.138343776231233e-10, "loss": 0.1074, "step": 5588 }, { "epoch": 3.99, "grad_norm": 6.404901800476969, "learning_rate": 1.8794053317672255e-10, "loss": 0.1074, "step": 5589 }, { "epoch": 3.99, "grad_norm": 6.157714487346645, "learning_rate": 1.6371721887287196e-10, "loss": 0.1122, "step": 5590 }, { "epoch": 3.99, "grad_norm": 8.352229011291241, "learning_rate": 1.4116444280398711e-10, "loss": 0.1099, "step": 5591 }, { "epoch": 3.99, "grad_norm": 5.989421477649134, "learning_rate": 1.2028221250570683e-10, "loss": 0.0992, "step": 5592 }, { "epoch": 3.99, "grad_norm": 9.9800678266692, "learning_rate": 1.0107053495522767e-10, "loss": 0.1093, "step": 5593 }, { "epoch": 3.99, "grad_norm": 11.303595468417102, "learning_rate": 8.35294165718592e-11, "loss": 0.15, "step": 5594 }, { "epoch": 3.99, "grad_norm": 9.776566003654384, "learning_rate": 6.765886321646874e-11, "loss": 0.1285, "step": 5595 }, { "epoch": 3.99, "grad_norm": 9.512879527949313, "learning_rate": 5.345888019092638e-11, "loss": 0.1599, "step": 5596 }, { "epoch": 4.0, "grad_norm": 9.328733178787845, "learning_rate": 4.092947224032529e-11, "loss": 0.1466, "step": 5597 }, { "epoch": 4.0, "grad_norm": 12.1455101579189, "learning_rate": 3.007064355076139e-11, "loss": 0.1296, "step": 5598 }, { "epoch": 4.0, "grad_norm": 6.318880612200429, "learning_rate": 2.088239775044354e-11, "loss": 0.0999, "step": 5599 }, { "epoch": 4.0, "grad_norm": 4.899339012947326, "learning_rate": 1.3364737909138392e-11, "loss": 0.0953, "step": 5600 }, { "epoch": 4.0, "grad_norm": 7.1594901159283815, "learning_rate": 7.517666539280654e-12, "loss": 0.1243, "step": 5601 }, { "epoch": 4.0, "grad_norm": 9.295531250814186, "learning_rate": 3.3411855937526273e-12, "loss": 0.1388, "step": 5602 }, { "epoch": 4.0, "grad_norm": 9.718829953860425, "learning_rate": 8.352964681046516e-13, "loss": 0.1476, "step": 5603 }, { "epoch": 4.0, "grad_norm": 5.714918142396491, "learning_rate": 0.0, "loss": 0.1135, "step": 5604 }, { "epoch": 4.0, "step": 5604, "total_flos": 750239094767616.0, "train_loss": 0.36047464298572307, "train_runtime": 15643.8891, "train_samples_per_second": 22.919, "train_steps_per_second": 0.358 } ], "logging_steps": 1.0, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 24000, "total_flos": 750239094767616.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }