diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,39731 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 116.68827514412096, + "learning_rate": 5.91715976331361e-08, + "loss": 1.459, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 137.05559220326518, + "learning_rate": 1.183431952662722e-07, + "loss": 1.5293, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 96.59057528089231, + "learning_rate": 1.775147928994083e-07, + "loss": 1.4766, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 66.60254143032729, + "learning_rate": 2.366863905325444e-07, + "loss": 1.2988, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 103.34499410914529, + "learning_rate": 2.958579881656805e-07, + "loss": 1.3887, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 52.76380439002071, + "learning_rate": 3.550295857988166e-07, + "loss": 1.1865, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 48.37017508979888, + "learning_rate": 4.1420118343195276e-07, + "loss": 1.3906, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 94.36297438079606, + "learning_rate": 4.733727810650888e-07, + "loss": 1.3477, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 45.10300787040884, + "learning_rate": 5.32544378698225e-07, + "loss": 1.2451, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 103.24939665836075, + "learning_rate": 5.91715976331361e-07, + "loss": 1.3477, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 102.10130791881714, + "learning_rate": 6.50887573964497e-07, + "loss": 1.2109, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 29.836605877501068, + "learning_rate": 7.100591715976332e-07, + "loss": 1.1523, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 35.91418906814854, + "learning_rate": 7.692307692307694e-07, + "loss": 1.2031, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 55.18879990724547, + "learning_rate": 8.284023668639055e-07, + "loss": 1.1562, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 111.28485143610422, + "learning_rate": 8.875739644970415e-07, + "loss": 1.2539, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 64.53655153369745, + "learning_rate": 9.467455621301776e-07, + "loss": 1.1113, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 49.81258432944396, + "learning_rate": 1.0059171597633138e-06, + "loss": 1.0557, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 81.3821602984751, + "learning_rate": 1.06508875739645e-06, + "loss": 1.1436, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 77.89797760634401, + "learning_rate": 1.1242603550295859e-06, + "loss": 1.0498, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 103.51657421815158, + "learning_rate": 1.183431952662722e-06, + "loss": 1.1523, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 34.98384966020216, + "learning_rate": 1.242603550295858e-06, + "loss": 0.8809, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 17.909290919081947, + "learning_rate": 1.301775147928994e-06, + "loss": 0.8525, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 17.7361005439023, + "learning_rate": 1.3609467455621303e-06, + "loss": 0.8418, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 26.264367353380752, + "learning_rate": 1.4201183431952664e-06, + "loss": 0.876, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 35.89694409386303, + "learning_rate": 1.4792899408284026e-06, + "loss": 0.8018, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 52.88511927832449, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.8203, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 20.430991723939343, + "learning_rate": 1.5976331360946749e-06, + "loss": 0.7646, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 40.02223588407742, + "learning_rate": 1.656804733727811e-06, + "loss": 0.7539, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 23.523542799757628, + "learning_rate": 1.7159763313609468e-06, + "loss": 0.8047, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 58.78591457424737, + "learning_rate": 1.775147928994083e-06, + "loss": 0.8398, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 15.018173530627244, + "learning_rate": 1.834319526627219e-06, + "loss": 0.7764, + "step": 31 + }, + { + "epoch": 0.02, + "grad_norm": 15.224774251543503, + "learning_rate": 1.8934911242603552e-06, + "loss": 0.7041, + "step": 32 + }, + { + "epoch": 0.02, + "grad_norm": 16.267158783210224, + "learning_rate": 1.952662721893491e-06, + "loss": 0.6865, + "step": 33 + }, + { + "epoch": 0.02, + "grad_norm": 19.71449038710854, + "learning_rate": 2.0118343195266275e-06, + "loss": 0.7051, + "step": 34 + }, + { + "epoch": 0.02, + "grad_norm": 19.808645253515795, + "learning_rate": 2.0710059171597635e-06, + "loss": 0.7783, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 36.10291312881198, + "learning_rate": 2.1301775147929e-06, + "loss": 0.6826, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 25.047754674174154, + "learning_rate": 2.1893491124260358e-06, + "loss": 0.6885, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 14.948882559824707, + "learning_rate": 2.2485207100591717e-06, + "loss": 0.6152, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 26.159147372349974, + "learning_rate": 2.307692307692308e-06, + "loss": 0.7139, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 14.488442848627141, + "learning_rate": 2.366863905325444e-06, + "loss": 0.6602, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 12.239851284814467, + "learning_rate": 2.42603550295858e-06, + "loss": 0.6924, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 12.408441779673309, + "learning_rate": 2.485207100591716e-06, + "loss": 0.6289, + "step": 42 + }, + { + "epoch": 0.03, + "grad_norm": 21.76752777666436, + "learning_rate": 2.5443786982248527e-06, + "loss": 0.6436, + "step": 43 + }, + { + "epoch": 0.03, + "grad_norm": 11.405290702619705, + "learning_rate": 2.603550295857988e-06, + "loss": 0.6064, + "step": 44 + }, + { + "epoch": 0.03, + "grad_norm": 22.56239327853637, + "learning_rate": 2.6627218934911246e-06, + "loss": 0.6279, + "step": 45 + }, + { + "epoch": 0.03, + "grad_norm": 25.96378836265653, + "learning_rate": 2.7218934911242605e-06, + "loss": 0.6484, + "step": 46 + }, + { + "epoch": 0.03, + "grad_norm": 11.297707654533205, + "learning_rate": 2.7810650887573965e-06, + "loss": 0.5977, + "step": 47 + }, + { + "epoch": 0.03, + "grad_norm": 21.368931517016037, + "learning_rate": 2.840236686390533e-06, + "loss": 0.6211, + "step": 48 + }, + { + "epoch": 0.03, + "grad_norm": 17.055034780572917, + "learning_rate": 2.8994082840236688e-06, + "loss": 0.6504, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 14.170299121299626, + "learning_rate": 2.958579881656805e-06, + "loss": 0.5391, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 16.65980083079512, + "learning_rate": 3.017751479289941e-06, + "loss": 0.6562, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 9.141571770193977, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.5576, + "step": 52 + }, + { + "epoch": 0.04, + "grad_norm": 38.18938821430388, + "learning_rate": 3.1360946745562134e-06, + "loss": 0.6465, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 36.485584647459994, + "learning_rate": 3.1952662721893497e-06, + "loss": 0.6182, + "step": 54 + }, + { + "epoch": 0.04, + "grad_norm": 26.1172270983848, + "learning_rate": 3.2544378698224853e-06, + "loss": 0.6304, + "step": 55 + }, + { + "epoch": 0.04, + "grad_norm": 10.35522270180552, + "learning_rate": 3.313609467455622e-06, + "loss": 0.5791, + "step": 56 + }, + { + "epoch": 0.04, + "grad_norm": 8.919799862854324, + "learning_rate": 3.3727810650887576e-06, + "loss": 0.6064, + "step": 57 + }, + { + "epoch": 0.04, + "grad_norm": 37.86337212965342, + "learning_rate": 3.4319526627218935e-06, + "loss": 0.6719, + "step": 58 + }, + { + "epoch": 0.04, + "grad_norm": 32.4241795097629, + "learning_rate": 3.49112426035503e-06, + "loss": 0.6816, + "step": 59 + }, + { + "epoch": 0.04, + "grad_norm": 12.843367212342677, + "learning_rate": 3.550295857988166e-06, + "loss": 0.5415, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 11.033086617607612, + "learning_rate": 3.609467455621302e-06, + "loss": 0.5898, + "step": 61 + }, + { + "epoch": 0.04, + "grad_norm": 11.78012764506264, + "learning_rate": 3.668639053254438e-06, + "loss": 0.5967, + "step": 62 + }, + { + "epoch": 0.04, + "grad_norm": 11.240783033272136, + "learning_rate": 3.7278106508875745e-06, + "loss": 0.5645, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 9.123512981508057, + "learning_rate": 3.7869822485207104e-06, + "loss": 0.5703, + "step": 64 + }, + { + "epoch": 0.05, + "grad_norm": 27.660633641874856, + "learning_rate": 3.846153846153847e-06, + "loss": 0.6182, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 14.27816226916257, + "learning_rate": 3.905325443786982e-06, + "loss": 0.623, + "step": 66 + }, + { + "epoch": 0.05, + "grad_norm": 16.991620078648673, + "learning_rate": 3.964497041420119e-06, + "loss": 0.5547, + "step": 67 + }, + { + "epoch": 0.05, + "grad_norm": 29.73306262944105, + "learning_rate": 4.023668639053255e-06, + "loss": 0.624, + "step": 68 + }, + { + "epoch": 0.05, + "grad_norm": 19.983350773990825, + "learning_rate": 4.0828402366863906e-06, + "loss": 0.5938, + "step": 69 + }, + { + "epoch": 0.05, + "grad_norm": 11.203249085331949, + "learning_rate": 4.142011834319527e-06, + "loss": 0.6152, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 18.903248382781342, + "learning_rate": 4.201183431952663e-06, + "loss": 0.5425, + "step": 71 + }, + { + "epoch": 0.05, + "grad_norm": 16.743327563970325, + "learning_rate": 4.2603550295858e-06, + "loss": 0.5518, + "step": 72 + }, + { + "epoch": 0.05, + "grad_norm": 13.530873380507195, + "learning_rate": 4.319526627218935e-06, + "loss": 0.5859, + "step": 73 + }, + { + "epoch": 0.05, + "grad_norm": 10.306285659968713, + "learning_rate": 4.3786982248520715e-06, + "loss": 0.585, + "step": 74 + }, + { + "epoch": 0.05, + "grad_norm": 16.30837385866365, + "learning_rate": 4.437869822485207e-06, + "loss": 0.7109, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 22.972681142302918, + "learning_rate": 4.497041420118343e-06, + "loss": 0.5781, + "step": 76 + }, + { + "epoch": 0.05, + "grad_norm": 11.69935251821666, + "learning_rate": 4.55621301775148e-06, + "loss": 0.5889, + "step": 77 + }, + { + "epoch": 0.06, + "grad_norm": 30.74542232405432, + "learning_rate": 4.615384615384616e-06, + "loss": 0.5996, + "step": 78 + }, + { + "epoch": 0.06, + "grad_norm": 13.053844021932429, + "learning_rate": 4.674556213017752e-06, + "loss": 0.6523, + "step": 79 + }, + { + "epoch": 0.06, + "grad_norm": 18.660654525701556, + "learning_rate": 4.733727810650888e-06, + "loss": 0.5903, + "step": 80 + }, + { + "epoch": 0.06, + "grad_norm": 33.06855879264672, + "learning_rate": 4.792899408284024e-06, + "loss": 0.5947, + "step": 81 + }, + { + "epoch": 0.06, + "grad_norm": 18.027708084710582, + "learning_rate": 4.85207100591716e-06, + "loss": 0.5249, + "step": 82 + }, + { + "epoch": 0.06, + "grad_norm": 9.927455268454475, + "learning_rate": 4.911242603550296e-06, + "loss": 0.5024, + "step": 83 + }, + { + "epoch": 0.06, + "grad_norm": 13.12037453816752, + "learning_rate": 4.970414201183432e-06, + "loss": 0.5884, + "step": 84 + }, + { + "epoch": 0.06, + "grad_norm": 15.758818975299697, + "learning_rate": 5.029585798816569e-06, + "loss": 0.5601, + "step": 85 + }, + { + "epoch": 0.06, + "grad_norm": 10.52708954325852, + "learning_rate": 5.088757396449705e-06, + "loss": 0.4854, + "step": 86 + }, + { + "epoch": 0.06, + "grad_norm": 16.86444076854706, + "learning_rate": 5.14792899408284e-06, + "loss": 0.4858, + "step": 87 + }, + { + "epoch": 0.06, + "grad_norm": 9.60677062559464, + "learning_rate": 5.207100591715976e-06, + "loss": 0.5625, + "step": 88 + }, + { + "epoch": 0.06, + "grad_norm": 18.10508123587462, + "learning_rate": 5.266272189349113e-06, + "loss": 0.5068, + "step": 89 + }, + { + "epoch": 0.06, + "grad_norm": 15.276312006219614, + "learning_rate": 5.325443786982249e-06, + "loss": 0.6006, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 16.278189599056375, + "learning_rate": 5.384615384615385e-06, + "loss": 0.5049, + "step": 91 + }, + { + "epoch": 0.07, + "grad_norm": 24.04098331248457, + "learning_rate": 5.443786982248521e-06, + "loss": 0.4922, + "step": 92 + }, + { + "epoch": 0.07, + "grad_norm": 29.519688923776314, + "learning_rate": 5.502958579881657e-06, + "loss": 0.5532, + "step": 93 + }, + { + "epoch": 0.07, + "grad_norm": 11.073075096267969, + "learning_rate": 5.562130177514793e-06, + "loss": 0.5015, + "step": 94 + }, + { + "epoch": 0.07, + "grad_norm": 23.04049428861296, + "learning_rate": 5.621301775147929e-06, + "loss": 0.4731, + "step": 95 + }, + { + "epoch": 0.07, + "grad_norm": 15.43611431386209, + "learning_rate": 5.680473372781066e-06, + "loss": 0.5938, + "step": 96 + }, + { + "epoch": 0.07, + "grad_norm": 20.37496355488797, + "learning_rate": 5.739644970414202e-06, + "loss": 0.6074, + "step": 97 + }, + { + "epoch": 0.07, + "grad_norm": 23.4534463120769, + "learning_rate": 5.7988165680473375e-06, + "loss": 0.5552, + "step": 98 + }, + { + "epoch": 0.07, + "grad_norm": 15.224551348565953, + "learning_rate": 5.857988165680474e-06, + "loss": 0.5259, + "step": 99 + }, + { + "epoch": 0.07, + "grad_norm": 15.043175008582129, + "learning_rate": 5.91715976331361e-06, + "loss": 0.5615, + "step": 100 + }, + { + "epoch": 0.07, + "grad_norm": 13.886760864897116, + "learning_rate": 5.976331360946747e-06, + "loss": 0.564, + "step": 101 + }, + { + "epoch": 0.07, + "grad_norm": 10.62984639534972, + "learning_rate": 6.035502958579882e-06, + "loss": 0.5703, + "step": 102 + }, + { + "epoch": 0.07, + "grad_norm": 13.366791130831217, + "learning_rate": 6.0946745562130185e-06, + "loss": 0.4766, + "step": 103 + }, + { + "epoch": 0.07, + "grad_norm": 19.11640373494133, + "learning_rate": 6.153846153846155e-06, + "loss": 0.5801, + "step": 104 + }, + { + "epoch": 0.07, + "grad_norm": 13.344028697162539, + "learning_rate": 6.21301775147929e-06, + "loss": 0.6074, + "step": 105 + }, + { + "epoch": 0.08, + "grad_norm": 24.171423052067137, + "learning_rate": 6.272189349112427e-06, + "loss": 0.5859, + "step": 106 + }, + { + "epoch": 0.08, + "grad_norm": 10.52701822263934, + "learning_rate": 6.331360946745563e-06, + "loss": 0.5063, + "step": 107 + }, + { + "epoch": 0.08, + "grad_norm": 12.13390826805979, + "learning_rate": 6.3905325443786995e-06, + "loss": 0.5391, + "step": 108 + }, + { + "epoch": 0.08, + "grad_norm": 24.893616746652306, + "learning_rate": 6.449704142011834e-06, + "loss": 0.6562, + "step": 109 + }, + { + "epoch": 0.08, + "grad_norm": 10.267811806709746, + "learning_rate": 6.5088757396449705e-06, + "loss": 0.5327, + "step": 110 + }, + { + "epoch": 0.08, + "grad_norm": 9.573660726227825, + "learning_rate": 6.568047337278107e-06, + "loss": 0.501, + "step": 111 + }, + { + "epoch": 0.08, + "grad_norm": 9.424349082496683, + "learning_rate": 6.627218934911244e-06, + "loss": 0.479, + "step": 112 + }, + { + "epoch": 0.08, + "grad_norm": 19.516504502014815, + "learning_rate": 6.686390532544379e-06, + "loss": 0.7539, + "step": 113 + }, + { + "epoch": 0.08, + "grad_norm": 10.53470669412648, + "learning_rate": 6.745562130177515e-06, + "loss": 0.4805, + "step": 114 + }, + { + "epoch": 0.08, + "grad_norm": 13.690074282712688, + "learning_rate": 6.8047337278106515e-06, + "loss": 0.5947, + "step": 115 + }, + { + "epoch": 0.08, + "grad_norm": 11.491351335831942, + "learning_rate": 6.863905325443787e-06, + "loss": 0.5752, + "step": 116 + }, + { + "epoch": 0.08, + "grad_norm": 16.709408023616227, + "learning_rate": 6.923076923076923e-06, + "loss": 0.5063, + "step": 117 + }, + { + "epoch": 0.08, + "grad_norm": 17.782856613289166, + "learning_rate": 6.98224852071006e-06, + "loss": 0.5957, + "step": 118 + }, + { + "epoch": 0.08, + "grad_norm": 19.968430706562753, + "learning_rate": 7.041420118343196e-06, + "loss": 0.6045, + "step": 119 + }, + { + "epoch": 0.09, + "grad_norm": 23.252540842800897, + "learning_rate": 7.100591715976332e-06, + "loss": 0.5908, + "step": 120 + }, + { + "epoch": 0.09, + "grad_norm": 10.269334605120518, + "learning_rate": 7.159763313609468e-06, + "loss": 0.5405, + "step": 121 + }, + { + "epoch": 0.09, + "grad_norm": 14.810048379769524, + "learning_rate": 7.218934911242604e-06, + "loss": 0.5518, + "step": 122 + }, + { + "epoch": 0.09, + "grad_norm": 10.265067489943311, + "learning_rate": 7.278106508875741e-06, + "loss": 0.5259, + "step": 123 + }, + { + "epoch": 0.09, + "grad_norm": 13.5951034364997, + "learning_rate": 7.337278106508876e-06, + "loss": 0.5137, + "step": 124 + }, + { + "epoch": 0.09, + "grad_norm": 22.46844506700403, + "learning_rate": 7.396449704142013e-06, + "loss": 0.4946, + "step": 125 + }, + { + "epoch": 0.09, + "grad_norm": 16.353931220439996, + "learning_rate": 7.455621301775149e-06, + "loss": 0.5303, + "step": 126 + }, + { + "epoch": 0.09, + "grad_norm": 18.291482953488973, + "learning_rate": 7.5147928994082845e-06, + "loss": 0.5142, + "step": 127 + }, + { + "epoch": 0.09, + "grad_norm": 10.676486048527362, + "learning_rate": 7.573964497041421e-06, + "loss": 0.519, + "step": 128 + }, + { + "epoch": 0.09, + "grad_norm": 18.100948739888956, + "learning_rate": 7.633136094674556e-06, + "loss": 0.5303, + "step": 129 + }, + { + "epoch": 0.09, + "grad_norm": 21.616985829007, + "learning_rate": 7.692307692307694e-06, + "loss": 0.4478, + "step": 130 + }, + { + "epoch": 0.09, + "grad_norm": 17.999044255166673, + "learning_rate": 7.751479289940829e-06, + "loss": 0.6455, + "step": 131 + }, + { + "epoch": 0.09, + "grad_norm": 50.80837956748365, + "learning_rate": 7.810650887573965e-06, + "loss": 0.6377, + "step": 132 + }, + { + "epoch": 0.09, + "grad_norm": 14.987804243664623, + "learning_rate": 7.869822485207102e-06, + "loss": 0.6377, + "step": 133 + }, + { + "epoch": 0.1, + "grad_norm": 14.857441145744092, + "learning_rate": 7.928994082840237e-06, + "loss": 0.6055, + "step": 134 + }, + { + "epoch": 0.1, + "grad_norm": 31.853850755139643, + "learning_rate": 7.988165680473373e-06, + "loss": 0.5396, + "step": 135 + }, + { + "epoch": 0.1, + "grad_norm": 26.922060020459558, + "learning_rate": 8.04733727810651e-06, + "loss": 0.6792, + "step": 136 + }, + { + "epoch": 0.1, + "grad_norm": 8.884182601970089, + "learning_rate": 8.106508875739646e-06, + "loss": 0.5957, + "step": 137 + }, + { + "epoch": 0.1, + "grad_norm": 20.414653577595885, + "learning_rate": 8.165680473372781e-06, + "loss": 0.5635, + "step": 138 + }, + { + "epoch": 0.1, + "grad_norm": 17.83898607971713, + "learning_rate": 8.224852071005918e-06, + "loss": 0.6191, + "step": 139 + }, + { + "epoch": 0.1, + "grad_norm": 15.238831882725698, + "learning_rate": 8.284023668639054e-06, + "loss": 0.5503, + "step": 140 + }, + { + "epoch": 0.1, + "grad_norm": 13.273476773493702, + "learning_rate": 8.343195266272191e-06, + "loss": 0.5918, + "step": 141 + }, + { + "epoch": 0.1, + "grad_norm": 19.25326134871138, + "learning_rate": 8.402366863905327e-06, + "loss": 0.6064, + "step": 142 + }, + { + "epoch": 0.1, + "grad_norm": 26.52884271993335, + "learning_rate": 8.461538461538462e-06, + "loss": 0.6025, + "step": 143 + }, + { + "epoch": 0.1, + "grad_norm": 9.41160283896373, + "learning_rate": 8.5207100591716e-06, + "loss": 0.4663, + "step": 144 + }, + { + "epoch": 0.1, + "grad_norm": 35.434570167809206, + "learning_rate": 8.579881656804735e-06, + "loss": 0.6113, + "step": 145 + }, + { + "epoch": 0.1, + "grad_norm": 11.227787194473562, + "learning_rate": 8.63905325443787e-06, + "loss": 0.5303, + "step": 146 + }, + { + "epoch": 0.1, + "grad_norm": 19.767727771237986, + "learning_rate": 8.698224852071006e-06, + "loss": 0.6484, + "step": 147 + }, + { + "epoch": 0.11, + "grad_norm": 8.707533062635017, + "learning_rate": 8.757396449704143e-06, + "loss": 0.5186, + "step": 148 + }, + { + "epoch": 0.11, + "grad_norm": 13.412761457820203, + "learning_rate": 8.816568047337279e-06, + "loss": 0.5659, + "step": 149 + }, + { + "epoch": 0.11, + "grad_norm": 8.467634594644565, + "learning_rate": 8.875739644970414e-06, + "loss": 0.5151, + "step": 150 + }, + { + "epoch": 0.11, + "grad_norm": 13.925519538332352, + "learning_rate": 8.934911242603551e-06, + "loss": 0.5879, + "step": 151 + }, + { + "epoch": 0.11, + "grad_norm": 12.71700779047759, + "learning_rate": 8.994082840236687e-06, + "loss": 0.5835, + "step": 152 + }, + { + "epoch": 0.11, + "grad_norm": 17.84943528345089, + "learning_rate": 9.053254437869822e-06, + "loss": 0.5264, + "step": 153 + }, + { + "epoch": 0.11, + "grad_norm": 17.688901386150174, + "learning_rate": 9.11242603550296e-06, + "loss": 0.6367, + "step": 154 + }, + { + "epoch": 0.11, + "grad_norm": 13.867220435355714, + "learning_rate": 9.171597633136095e-06, + "loss": 0.6777, + "step": 155 + }, + { + "epoch": 0.11, + "grad_norm": 26.02685451565874, + "learning_rate": 9.230769230769232e-06, + "loss": 0.5967, + "step": 156 + }, + { + "epoch": 0.11, + "grad_norm": 15.482134608562367, + "learning_rate": 9.289940828402368e-06, + "loss": 0.5615, + "step": 157 + }, + { + "epoch": 0.11, + "grad_norm": 10.88805449508632, + "learning_rate": 9.349112426035503e-06, + "loss": 0.5684, + "step": 158 + }, + { + "epoch": 0.11, + "grad_norm": 18.661463282963666, + "learning_rate": 9.40828402366864e-06, + "loss": 0.6445, + "step": 159 + }, + { + "epoch": 0.11, + "grad_norm": 13.35562044235245, + "learning_rate": 9.467455621301776e-06, + "loss": 0.5557, + "step": 160 + }, + { + "epoch": 0.11, + "grad_norm": 19.980779661002774, + "learning_rate": 9.526627218934912e-06, + "loss": 0.5474, + "step": 161 + }, + { + "epoch": 0.12, + "grad_norm": 12.268903766575969, + "learning_rate": 9.585798816568049e-06, + "loss": 0.5225, + "step": 162 + }, + { + "epoch": 0.12, + "grad_norm": 21.12465799994179, + "learning_rate": 9.644970414201184e-06, + "loss": 0.5547, + "step": 163 + }, + { + "epoch": 0.12, + "grad_norm": 7.6737374134887215, + "learning_rate": 9.70414201183432e-06, + "loss": 0.5859, + "step": 164 + }, + { + "epoch": 0.12, + "grad_norm": 18.06983956812275, + "learning_rate": 9.763313609467457e-06, + "loss": 0.5649, + "step": 165 + }, + { + "epoch": 0.12, + "grad_norm": 12.848185642792462, + "learning_rate": 9.822485207100593e-06, + "loss": 0.5591, + "step": 166 + }, + { + "epoch": 0.12, + "grad_norm": 21.283247757964315, + "learning_rate": 9.88165680473373e-06, + "loss": 0.5698, + "step": 167 + }, + { + "epoch": 0.12, + "grad_norm": 11.36449736428342, + "learning_rate": 9.940828402366864e-06, + "loss": 0.5366, + "step": 168 + }, + { + "epoch": 0.12, + "grad_norm": 12.504342411667235, + "learning_rate": 1e-05, + "loss": 0.5571, + "step": 169 + }, + { + "epoch": 0.12, + "grad_norm": 28.526999718293393, + "learning_rate": 9.999999164703534e-06, + "loss": 0.6133, + "step": 170 + }, + { + "epoch": 0.12, + "grad_norm": 8.676287862832698, + "learning_rate": 9.999996658814406e-06, + "loss": 0.4307, + "step": 171 + }, + { + "epoch": 0.12, + "grad_norm": 12.27838137396804, + "learning_rate": 9.999992482333461e-06, + "loss": 0.4771, + "step": 172 + }, + { + "epoch": 0.12, + "grad_norm": 17.098596330705657, + "learning_rate": 9.99998663526209e-06, + "loss": 0.4785, + "step": 173 + }, + { + "epoch": 0.12, + "grad_norm": 10.364350114790247, + "learning_rate": 9.99997911760225e-06, + "loss": 0.5928, + "step": 174 + }, + { + "epoch": 0.12, + "grad_norm": 32.439748171009725, + "learning_rate": 9.99996992935645e-06, + "loss": 0.6206, + "step": 175 + }, + { + "epoch": 0.13, + "grad_norm": 10.869970364498535, + "learning_rate": 9.99995907052776e-06, + "loss": 0.5176, + "step": 176 + }, + { + "epoch": 0.13, + "grad_norm": 28.853561378084798, + "learning_rate": 9.99994654111981e-06, + "loss": 0.5718, + "step": 177 + }, + { + "epoch": 0.13, + "grad_norm": 30.030018985404727, + "learning_rate": 9.999932341136785e-06, + "loss": 0.6338, + "step": 178 + }, + { + "epoch": 0.13, + "grad_norm": 10.898208289891516, + "learning_rate": 9.999916470583429e-06, + "loss": 0.5049, + "step": 179 + }, + { + "epoch": 0.13, + "grad_norm": 29.836549153691703, + "learning_rate": 9.999898929465047e-06, + "loss": 0.5605, + "step": 180 + }, + { + "epoch": 0.13, + "grad_norm": 31.456036267942125, + "learning_rate": 9.999879717787495e-06, + "loss": 0.6118, + "step": 181 + }, + { + "epoch": 0.13, + "grad_norm": 19.965440018657493, + "learning_rate": 9.999858835557197e-06, + "loss": 0.5356, + "step": 182 + }, + { + "epoch": 0.13, + "grad_norm": 10.241499025434392, + "learning_rate": 9.999836282781128e-06, + "loss": 0.5215, + "step": 183 + }, + { + "epoch": 0.13, + "grad_norm": 10.343720644680257, + "learning_rate": 9.999812059466825e-06, + "loss": 0.6118, + "step": 184 + }, + { + "epoch": 0.13, + "grad_norm": 28.654780626684733, + "learning_rate": 9.999786165622379e-06, + "loss": 0.6016, + "step": 185 + }, + { + "epoch": 0.13, + "grad_norm": 24.17157492958299, + "learning_rate": 9.999758601256441e-06, + "loss": 0.624, + "step": 186 + }, + { + "epoch": 0.13, + "grad_norm": 17.176557440775614, + "learning_rate": 9.999729366378224e-06, + "loss": 0.5527, + "step": 187 + }, + { + "epoch": 0.13, + "grad_norm": 11.253723481448025, + "learning_rate": 9.999698460997493e-06, + "loss": 0.5601, + "step": 188 + }, + { + "epoch": 0.13, + "grad_norm": 13.43944022216274, + "learning_rate": 9.999665885124577e-06, + "loss": 0.5273, + "step": 189 + }, + { + "epoch": 0.14, + "grad_norm": 22.913171126499616, + "learning_rate": 9.99963163877036e-06, + "loss": 0.6587, + "step": 190 + }, + { + "epoch": 0.14, + "grad_norm": 22.224898845103755, + "learning_rate": 9.99959572194628e-06, + "loss": 0.666, + "step": 191 + }, + { + "epoch": 0.14, + "grad_norm": 8.83510331844381, + "learning_rate": 9.999558134664342e-06, + "loss": 0.6099, + "step": 192 + }, + { + "epoch": 0.14, + "grad_norm": 8.477618783981828, + "learning_rate": 9.999518876937102e-06, + "loss": 0.4771, + "step": 193 + }, + { + "epoch": 0.14, + "grad_norm": 23.778400243235488, + "learning_rate": 9.999477948777678e-06, + "loss": 0.5562, + "step": 194 + }, + { + "epoch": 0.14, + "grad_norm": 7.459065251806433, + "learning_rate": 9.999435350199745e-06, + "loss": 0.5342, + "step": 195 + }, + { + "epoch": 0.14, + "grad_norm": 21.538105722452126, + "learning_rate": 9.999391081217536e-06, + "loss": 0.627, + "step": 196 + }, + { + "epoch": 0.14, + "grad_norm": 18.7688376682692, + "learning_rate": 9.999345141845842e-06, + "loss": 0.5293, + "step": 197 + }, + { + "epoch": 0.14, + "grad_norm": 10.99862969600929, + "learning_rate": 9.99929753210001e-06, + "loss": 0.5005, + "step": 198 + }, + { + "epoch": 0.14, + "grad_norm": 13.055362447650138, + "learning_rate": 9.999248251995951e-06, + "loss": 0.5659, + "step": 199 + }, + { + "epoch": 0.14, + "grad_norm": 10.011136685984399, + "learning_rate": 9.999197301550127e-06, + "loss": 0.5586, + "step": 200 + }, + { + "epoch": 0.14, + "grad_norm": 13.565468854396421, + "learning_rate": 9.999144680779564e-06, + "loss": 0.5127, + "step": 201 + }, + { + "epoch": 0.14, + "grad_norm": 10.322706038393164, + "learning_rate": 9.999090389701844e-06, + "loss": 0.5396, + "step": 202 + }, + { + "epoch": 0.14, + "grad_norm": 11.689469240544318, + "learning_rate": 9.999034428335103e-06, + "loss": 0.5366, + "step": 203 + }, + { + "epoch": 0.15, + "grad_norm": 13.174468406049195, + "learning_rate": 9.998976796698043e-06, + "loss": 0.6064, + "step": 204 + }, + { + "epoch": 0.15, + "grad_norm": 13.26408461209251, + "learning_rate": 9.998917494809917e-06, + "loss": 0.5181, + "step": 205 + }, + { + "epoch": 0.15, + "grad_norm": 21.482605171650068, + "learning_rate": 9.998856522690538e-06, + "loss": 0.6626, + "step": 206 + }, + { + "epoch": 0.15, + "grad_norm": 10.98758204165801, + "learning_rate": 9.998793880360283e-06, + "loss": 0.48, + "step": 207 + }, + { + "epoch": 0.15, + "grad_norm": 22.66544425395466, + "learning_rate": 9.998729567840077e-06, + "loss": 0.6836, + "step": 208 + }, + { + "epoch": 0.15, + "grad_norm": 18.309350432593693, + "learning_rate": 9.998663585151409e-06, + "loss": 0.5674, + "step": 209 + }, + { + "epoch": 0.15, + "grad_norm": 27.286620280815722, + "learning_rate": 9.998595932316327e-06, + "loss": 0.6514, + "step": 210 + }, + { + "epoch": 0.15, + "grad_norm": 14.739890462945858, + "learning_rate": 9.998526609357432e-06, + "loss": 0.5947, + "step": 211 + }, + { + "epoch": 0.15, + "grad_norm": 6.551376294742977, + "learning_rate": 9.998455616297889e-06, + "loss": 0.5879, + "step": 212 + }, + { + "epoch": 0.15, + "grad_norm": 21.631161156381427, + "learning_rate": 9.998382953161417e-06, + "loss": 0.6865, + "step": 213 + }, + { + "epoch": 0.15, + "grad_norm": 18.33142486833869, + "learning_rate": 9.998308619972292e-06, + "loss": 0.6357, + "step": 214 + }, + { + "epoch": 0.15, + "grad_norm": 10.60392425953813, + "learning_rate": 9.998232616755354e-06, + "loss": 0.5732, + "step": 215 + }, + { + "epoch": 0.15, + "grad_norm": 5.976381604704594, + "learning_rate": 9.998154943535996e-06, + "loss": 0.5645, + "step": 216 + }, + { + "epoch": 0.15, + "grad_norm": 6.10651009987485, + "learning_rate": 9.998075600340166e-06, + "loss": 0.582, + "step": 217 + }, + { + "epoch": 0.16, + "grad_norm": 6.196857537217384, + "learning_rate": 9.997994587194381e-06, + "loss": 0.564, + "step": 218 + }, + { + "epoch": 0.16, + "grad_norm": 21.280398372848552, + "learning_rate": 9.997911904125704e-06, + "loss": 0.6353, + "step": 219 + }, + { + "epoch": 0.16, + "grad_norm": 9.816198078589931, + "learning_rate": 9.997827551161762e-06, + "loss": 0.5684, + "step": 220 + }, + { + "epoch": 0.16, + "grad_norm": 12.183296652188321, + "learning_rate": 9.997741528330739e-06, + "loss": 0.5449, + "step": 221 + }, + { + "epoch": 0.16, + "grad_norm": 11.185998935163878, + "learning_rate": 9.997653835661376e-06, + "loss": 0.5967, + "step": 222 + }, + { + "epoch": 0.16, + "grad_norm": 12.288601461686234, + "learning_rate": 9.997564473182976e-06, + "loss": 0.5698, + "step": 223 + }, + { + "epoch": 0.16, + "grad_norm": 9.321856011834956, + "learning_rate": 9.997473440925394e-06, + "loss": 0.5771, + "step": 224 + }, + { + "epoch": 0.16, + "grad_norm": 6.311288966424027, + "learning_rate": 9.997380738919045e-06, + "loss": 0.5259, + "step": 225 + }, + { + "epoch": 0.16, + "grad_norm": 21.481033073870574, + "learning_rate": 9.997286367194903e-06, + "loss": 0.6689, + "step": 226 + }, + { + "epoch": 0.16, + "grad_norm": 14.550467772007261, + "learning_rate": 9.9971903257845e-06, + "loss": 0.5928, + "step": 227 + }, + { + "epoch": 0.16, + "grad_norm": 13.702124607152298, + "learning_rate": 9.997092614719926e-06, + "loss": 0.5181, + "step": 228 + }, + { + "epoch": 0.16, + "grad_norm": 17.241256236619744, + "learning_rate": 9.996993234033826e-06, + "loss": 0.5918, + "step": 229 + }, + { + "epoch": 0.16, + "grad_norm": 10.044871821417644, + "learning_rate": 9.996892183759407e-06, + "loss": 0.5811, + "step": 230 + }, + { + "epoch": 0.16, + "grad_norm": 16.21185115842485, + "learning_rate": 9.99678946393043e-06, + "loss": 0.5791, + "step": 231 + }, + { + "epoch": 0.17, + "grad_norm": 17.301100574961, + "learning_rate": 9.996685074581216e-06, + "loss": 0.5332, + "step": 232 + }, + { + "epoch": 0.17, + "grad_norm": 11.335320291463777, + "learning_rate": 9.996579015746645e-06, + "loss": 0.5742, + "step": 233 + }, + { + "epoch": 0.17, + "grad_norm": 6.8646367315459935, + "learning_rate": 9.996471287462151e-06, + "loss": 0.5376, + "step": 234 + }, + { + "epoch": 0.17, + "grad_norm": 5.966657193641781, + "learning_rate": 9.99636188976373e-06, + "loss": 0.5762, + "step": 235 + }, + { + "epoch": 0.17, + "grad_norm": 17.07991289086175, + "learning_rate": 9.996250822687932e-06, + "loss": 0.5405, + "step": 236 + }, + { + "epoch": 0.17, + "grad_norm": 13.807407534670729, + "learning_rate": 9.996138086271869e-06, + "loss": 0.585, + "step": 237 + }, + { + "epoch": 0.17, + "grad_norm": 5.991884688831597, + "learning_rate": 9.996023680553204e-06, + "loss": 0.5181, + "step": 238 + }, + { + "epoch": 0.17, + "grad_norm": 7.4817825314537645, + "learning_rate": 9.995907605570167e-06, + "loss": 0.5957, + "step": 239 + }, + { + "epoch": 0.17, + "grad_norm": 24.175799986215807, + "learning_rate": 9.995789861361538e-06, + "loss": 0.6895, + "step": 240 + }, + { + "epoch": 0.17, + "grad_norm": 7.737883477841022, + "learning_rate": 9.995670447966658e-06, + "loss": 0.4727, + "step": 241 + }, + { + "epoch": 0.17, + "grad_norm": 13.747945457840865, + "learning_rate": 9.995549365425426e-06, + "loss": 0.5635, + "step": 242 + }, + { + "epoch": 0.17, + "grad_norm": 15.188385247219632, + "learning_rate": 9.995426613778297e-06, + "loss": 0.6445, + "step": 243 + }, + { + "epoch": 0.17, + "grad_norm": 5.735363655951899, + "learning_rate": 9.995302193066286e-06, + "loss": 0.5112, + "step": 244 + }, + { + "epoch": 0.17, + "grad_norm": 9.187157770296569, + "learning_rate": 9.995176103330962e-06, + "loss": 0.5776, + "step": 245 + }, + { + "epoch": 0.18, + "grad_norm": 7.366215466061506, + "learning_rate": 9.995048344614455e-06, + "loss": 0.6016, + "step": 246 + }, + { + "epoch": 0.18, + "grad_norm": 11.101901236927242, + "learning_rate": 9.994918916959453e-06, + "loss": 0.5952, + "step": 247 + }, + { + "epoch": 0.18, + "grad_norm": 5.494534788687338, + "learning_rate": 9.994787820409198e-06, + "loss": 0.5625, + "step": 248 + }, + { + "epoch": 0.18, + "grad_norm": 7.350350461703404, + "learning_rate": 9.994655055007491e-06, + "loss": 0.5278, + "step": 249 + }, + { + "epoch": 0.18, + "grad_norm": 5.815623310077835, + "learning_rate": 9.994520620798696e-06, + "loss": 0.5273, + "step": 250 + }, + { + "epoch": 0.18, + "grad_norm": 24.04445898352151, + "learning_rate": 9.994384517827726e-06, + "loss": 0.6157, + "step": 251 + }, + { + "epoch": 0.18, + "grad_norm": 8.217464632831893, + "learning_rate": 9.994246746140057e-06, + "loss": 0.5576, + "step": 252 + }, + { + "epoch": 0.18, + "grad_norm": 17.624589818254435, + "learning_rate": 9.99410730578172e-06, + "loss": 0.5503, + "step": 253 + }, + { + "epoch": 0.18, + "grad_norm": 8.459911000864924, + "learning_rate": 9.993966196799304e-06, + "loss": 0.5166, + "step": 254 + }, + { + "epoch": 0.18, + "grad_norm": 16.972917737075516, + "learning_rate": 9.993823419239959e-06, + "loss": 0.6016, + "step": 255 + }, + { + "epoch": 0.18, + "grad_norm": 11.26491066381988, + "learning_rate": 9.993678973151388e-06, + "loss": 0.4448, + "step": 256 + }, + { + "epoch": 0.18, + "grad_norm": 13.959824992641648, + "learning_rate": 9.993532858581853e-06, + "loss": 0.6025, + "step": 257 + }, + { + "epoch": 0.18, + "grad_norm": 8.17714303025146, + "learning_rate": 9.993385075580173e-06, + "loss": 0.605, + "step": 258 + }, + { + "epoch": 0.18, + "grad_norm": 15.583037935905157, + "learning_rate": 9.993235624195728e-06, + "loss": 0.5659, + "step": 259 + }, + { + "epoch": 0.19, + "grad_norm": 19.251932963533232, + "learning_rate": 9.993084504478448e-06, + "loss": 0.5811, + "step": 260 + }, + { + "epoch": 0.19, + "grad_norm": 16.8640641134105, + "learning_rate": 9.99293171647883e-06, + "loss": 0.4863, + "step": 261 + }, + { + "epoch": 0.19, + "grad_norm": 7.539968969637762, + "learning_rate": 9.992777260247916e-06, + "loss": 0.5469, + "step": 262 + }, + { + "epoch": 0.19, + "grad_norm": 21.333336259322376, + "learning_rate": 9.99262113583732e-06, + "loss": 0.5479, + "step": 263 + }, + { + "epoch": 0.19, + "grad_norm": 31.227206000118187, + "learning_rate": 9.992463343299203e-06, + "loss": 0.6367, + "step": 264 + }, + { + "epoch": 0.19, + "grad_norm": 16.373876295931044, + "learning_rate": 9.992303882686288e-06, + "loss": 0.5479, + "step": 265 + }, + { + "epoch": 0.19, + "grad_norm": 6.169578673800072, + "learning_rate": 9.99214275405185e-06, + "loss": 0.5327, + "step": 266 + }, + { + "epoch": 0.19, + "grad_norm": 19.948032240297028, + "learning_rate": 9.991979957449729e-06, + "loss": 0.6313, + "step": 267 + }, + { + "epoch": 0.19, + "grad_norm": 28.97323801726882, + "learning_rate": 9.991815492934318e-06, + "loss": 0.6411, + "step": 268 + }, + { + "epoch": 0.19, + "grad_norm": 19.141849461327332, + "learning_rate": 9.991649360560565e-06, + "loss": 0.6318, + "step": 269 + }, + { + "epoch": 0.19, + "grad_norm": 6.920257209617739, + "learning_rate": 9.99148156038398e-06, + "loss": 0.5425, + "step": 270 + }, + { + "epoch": 0.19, + "grad_norm": 10.115581788358424, + "learning_rate": 9.991312092460626e-06, + "loss": 0.4868, + "step": 271 + }, + { + "epoch": 0.19, + "grad_norm": 7.252432772054797, + "learning_rate": 9.991140956847128e-06, + "loss": 0.6235, + "step": 272 + }, + { + "epoch": 0.19, + "grad_norm": 22.131525985269647, + "learning_rate": 9.990968153600664e-06, + "loss": 0.6006, + "step": 273 + }, + { + "epoch": 0.2, + "grad_norm": 9.41445550986435, + "learning_rate": 9.990793682778973e-06, + "loss": 0.52, + "step": 274 + }, + { + "epoch": 0.2, + "grad_norm": 17.317553621436222, + "learning_rate": 9.990617544440346e-06, + "loss": 0.5083, + "step": 275 + }, + { + "epoch": 0.2, + "grad_norm": 6.54955596823522, + "learning_rate": 9.990439738643635e-06, + "loss": 0.5161, + "step": 276 + }, + { + "epoch": 0.2, + "grad_norm": 5.964194414552132, + "learning_rate": 9.99026026544825e-06, + "loss": 0.5083, + "step": 277 + }, + { + "epoch": 0.2, + "grad_norm": 16.720177088068684, + "learning_rate": 9.990079124914156e-06, + "loss": 0.522, + "step": 278 + }, + { + "epoch": 0.2, + "grad_norm": 9.459906145881508, + "learning_rate": 9.989896317101873e-06, + "loss": 0.4951, + "step": 279 + }, + { + "epoch": 0.2, + "grad_norm": 16.765981248006945, + "learning_rate": 9.989711842072482e-06, + "loss": 0.6133, + "step": 280 + }, + { + "epoch": 0.2, + "grad_norm": 11.835633957228747, + "learning_rate": 9.989525699887619e-06, + "loss": 0.5205, + "step": 281 + }, + { + "epoch": 0.2, + "grad_norm": 11.649027140963295, + "learning_rate": 9.989337890609478e-06, + "loss": 0.5625, + "step": 282 + }, + { + "epoch": 0.2, + "grad_norm": 10.40438475972807, + "learning_rate": 9.98914841430081e-06, + "loss": 0.4858, + "step": 283 + }, + { + "epoch": 0.2, + "grad_norm": 19.573834463528204, + "learning_rate": 9.988957271024922e-06, + "loss": 0.542, + "step": 284 + }, + { + "epoch": 0.2, + "grad_norm": 8.091426545143989, + "learning_rate": 9.988764460845676e-06, + "loss": 0.5542, + "step": 285 + }, + { + "epoch": 0.2, + "grad_norm": 6.36487501871403, + "learning_rate": 9.9885699838275e-06, + "loss": 0.4185, + "step": 286 + }, + { + "epoch": 0.2, + "grad_norm": 14.691682917775056, + "learning_rate": 9.988373840035366e-06, + "loss": 0.541, + "step": 287 + }, + { + "epoch": 0.21, + "grad_norm": 16.560245915679797, + "learning_rate": 9.988176029534814e-06, + "loss": 0.543, + "step": 288 + }, + { + "epoch": 0.21, + "grad_norm": 9.393486733090167, + "learning_rate": 9.987976552391933e-06, + "loss": 0.4878, + "step": 289 + }, + { + "epoch": 0.21, + "grad_norm": 10.696902281023876, + "learning_rate": 9.987775408673373e-06, + "loss": 0.603, + "step": 290 + }, + { + "epoch": 0.21, + "grad_norm": 7.542550692108352, + "learning_rate": 9.987572598446337e-06, + "loss": 0.5083, + "step": 291 + }, + { + "epoch": 0.21, + "grad_norm": 12.93517301011984, + "learning_rate": 9.987368121778594e-06, + "loss": 0.4785, + "step": 292 + }, + { + "epoch": 0.21, + "grad_norm": 9.117485021835481, + "learning_rate": 9.98716197873846e-06, + "loss": 0.4951, + "step": 293 + }, + { + "epoch": 0.21, + "grad_norm": 29.587346547838976, + "learning_rate": 9.98695416939481e-06, + "loss": 0.7002, + "step": 294 + }, + { + "epoch": 0.21, + "grad_norm": 14.306933072284718, + "learning_rate": 9.986744693817077e-06, + "loss": 0.561, + "step": 295 + }, + { + "epoch": 0.21, + "grad_norm": 10.606210464051424, + "learning_rate": 9.986533552075252e-06, + "loss": 0.5801, + "step": 296 + }, + { + "epoch": 0.21, + "grad_norm": 10.005787274091094, + "learning_rate": 9.986320744239883e-06, + "loss": 0.5742, + "step": 297 + }, + { + "epoch": 0.21, + "grad_norm": 14.512421082684124, + "learning_rate": 9.98610627038207e-06, + "loss": 0.5532, + "step": 298 + }, + { + "epoch": 0.21, + "grad_norm": 8.006107344505082, + "learning_rate": 9.985890130573474e-06, + "loss": 0.5298, + "step": 299 + }, + { + "epoch": 0.21, + "grad_norm": 7.111526173571763, + "learning_rate": 9.98567232488631e-06, + "loss": 0.5459, + "step": 300 + }, + { + "epoch": 0.21, + "grad_norm": 11.911631119108614, + "learning_rate": 9.985452853393353e-06, + "loss": 0.5425, + "step": 301 + }, + { + "epoch": 0.22, + "grad_norm": 8.375016686561212, + "learning_rate": 9.985231716167933e-06, + "loss": 0.5298, + "step": 302 + }, + { + "epoch": 0.22, + "grad_norm": 8.179532269489528, + "learning_rate": 9.985008913283933e-06, + "loss": 0.5459, + "step": 303 + }, + { + "epoch": 0.22, + "grad_norm": 9.104521872821598, + "learning_rate": 9.984784444815799e-06, + "loss": 0.6201, + "step": 304 + }, + { + "epoch": 0.22, + "grad_norm": 12.983836084666084, + "learning_rate": 9.984558310838528e-06, + "loss": 0.5645, + "step": 305 + }, + { + "epoch": 0.22, + "grad_norm": 9.16177018185428, + "learning_rate": 9.984330511427676e-06, + "loss": 0.5693, + "step": 306 + }, + { + "epoch": 0.22, + "grad_norm": 11.784575497017391, + "learning_rate": 9.984101046659353e-06, + "loss": 0.4595, + "step": 307 + }, + { + "epoch": 0.22, + "grad_norm": 8.33312354424948, + "learning_rate": 9.983869916610232e-06, + "loss": 0.4668, + "step": 308 + }, + { + "epoch": 0.22, + "grad_norm": 11.786060507815215, + "learning_rate": 9.983637121357534e-06, + "loss": 0.5557, + "step": 309 + }, + { + "epoch": 0.22, + "grad_norm": 14.611215764264122, + "learning_rate": 9.983402660979042e-06, + "loss": 0.6064, + "step": 310 + }, + { + "epoch": 0.22, + "grad_norm": 10.263169097847555, + "learning_rate": 9.983166535553093e-06, + "loss": 0.5977, + "step": 311 + }, + { + "epoch": 0.22, + "grad_norm": 11.022215974074236, + "learning_rate": 9.98292874515858e-06, + "loss": 0.5137, + "step": 312 + }, + { + "epoch": 0.22, + "grad_norm": 10.936834299449774, + "learning_rate": 9.982689289874956e-06, + "loss": 0.5898, + "step": 313 + }, + { + "epoch": 0.22, + "grad_norm": 6.511642609064424, + "learning_rate": 9.982448169782226e-06, + "loss": 0.5967, + "step": 314 + }, + { + "epoch": 0.22, + "grad_norm": 11.598567525889859, + "learning_rate": 9.98220538496095e-06, + "loss": 0.5234, + "step": 315 + }, + { + "epoch": 0.23, + "grad_norm": 7.979819387251359, + "learning_rate": 9.98196093549225e-06, + "loss": 0.5054, + "step": 316 + }, + { + "epoch": 0.23, + "grad_norm": 8.574667202148518, + "learning_rate": 9.9817148214578e-06, + "loss": 0.5801, + "step": 317 + }, + { + "epoch": 0.23, + "grad_norm": 9.56598373183688, + "learning_rate": 9.981467042939833e-06, + "loss": 0.5732, + "step": 318 + }, + { + "epoch": 0.23, + "grad_norm": 11.776851149154552, + "learning_rate": 9.981217600021133e-06, + "loss": 0.5469, + "step": 319 + }, + { + "epoch": 0.23, + "grad_norm": 8.737203552004415, + "learning_rate": 9.980966492785048e-06, + "loss": 0.5742, + "step": 320 + }, + { + "epoch": 0.23, + "grad_norm": 8.915557318934983, + "learning_rate": 9.980713721315473e-06, + "loss": 0.4888, + "step": 321 + }, + { + "epoch": 0.23, + "grad_norm": 9.65537704300492, + "learning_rate": 9.98045928569687e-06, + "loss": 0.5425, + "step": 322 + }, + { + "epoch": 0.23, + "grad_norm": 7.172596910986449, + "learning_rate": 9.98020318601424e-06, + "loss": 0.4824, + "step": 323 + }, + { + "epoch": 0.23, + "grad_norm": 10.440835376005527, + "learning_rate": 9.97994542235316e-06, + "loss": 0.5522, + "step": 324 + }, + { + "epoch": 0.23, + "grad_norm": 19.118269130078797, + "learning_rate": 9.979685994799753e-06, + "loss": 0.6069, + "step": 325 + }, + { + "epoch": 0.23, + "grad_norm": 23.200575202995402, + "learning_rate": 9.979424903440695e-06, + "loss": 0.5405, + "step": 326 + }, + { + "epoch": 0.23, + "grad_norm": 8.163978783652977, + "learning_rate": 9.979162148363222e-06, + "loss": 0.5332, + "step": 327 + }, + { + "epoch": 0.23, + "grad_norm": 10.535886529238546, + "learning_rate": 9.978897729655127e-06, + "loss": 0.5405, + "step": 328 + }, + { + "epoch": 0.23, + "grad_norm": 38.988384289584985, + "learning_rate": 9.978631647404755e-06, + "loss": 0.6826, + "step": 329 + }, + { + "epoch": 0.24, + "grad_norm": 18.249997402471777, + "learning_rate": 9.97836390170101e-06, + "loss": 0.5259, + "step": 330 + }, + { + "epoch": 0.24, + "grad_norm": 12.01736287591341, + "learning_rate": 9.978094492633353e-06, + "loss": 0.5601, + "step": 331 + }, + { + "epoch": 0.24, + "grad_norm": 12.939324146161377, + "learning_rate": 9.977823420291796e-06, + "loss": 0.5688, + "step": 332 + }, + { + "epoch": 0.24, + "grad_norm": 14.361987880781351, + "learning_rate": 9.97755068476691e-06, + "loss": 0.5605, + "step": 333 + }, + { + "epoch": 0.24, + "grad_norm": 22.600799019220247, + "learning_rate": 9.977276286149821e-06, + "loss": 0.6226, + "step": 334 + }, + { + "epoch": 0.24, + "grad_norm": 15.773235886574865, + "learning_rate": 9.977000224532211e-06, + "loss": 0.5332, + "step": 335 + }, + { + "epoch": 0.24, + "grad_norm": 10.06416768362281, + "learning_rate": 9.976722500006318e-06, + "loss": 0.6416, + "step": 336 + }, + { + "epoch": 0.24, + "grad_norm": 9.827713447093007, + "learning_rate": 9.976443112664932e-06, + "loss": 0.5957, + "step": 337 + }, + { + "epoch": 0.24, + "grad_norm": 10.629610673442508, + "learning_rate": 9.976162062601407e-06, + "loss": 0.5527, + "step": 338 + }, + { + "epoch": 0.24, + "grad_norm": 14.528967178159732, + "learning_rate": 9.97587934990964e-06, + "loss": 0.5762, + "step": 339 + }, + { + "epoch": 0.24, + "grad_norm": 12.608082935381697, + "learning_rate": 9.975594974684096e-06, + "loss": 0.5659, + "step": 340 + }, + { + "epoch": 0.24, + "grad_norm": 9.717599293321035, + "learning_rate": 9.975308937019787e-06, + "loss": 0.5278, + "step": 341 + }, + { + "epoch": 0.24, + "grad_norm": 7.812143947109021, + "learning_rate": 9.975021237012286e-06, + "loss": 0.5552, + "step": 342 + }, + { + "epoch": 0.24, + "grad_norm": 11.925371312954011, + "learning_rate": 9.974731874757717e-06, + "loss": 0.5596, + "step": 343 + }, + { + "epoch": 0.25, + "grad_norm": 8.339309116508288, + "learning_rate": 9.974440850352762e-06, + "loss": 0.582, + "step": 344 + }, + { + "epoch": 0.25, + "grad_norm": 16.614761768586927, + "learning_rate": 9.974148163894658e-06, + "loss": 0.5303, + "step": 345 + }, + { + "epoch": 0.25, + "grad_norm": 8.71369707758011, + "learning_rate": 9.973853815481196e-06, + "loss": 0.5601, + "step": 346 + }, + { + "epoch": 0.25, + "grad_norm": 12.093172429150728, + "learning_rate": 9.973557805210724e-06, + "loss": 0.5283, + "step": 347 + }, + { + "epoch": 0.25, + "grad_norm": 7.19951189726805, + "learning_rate": 9.973260133182145e-06, + "loss": 0.5615, + "step": 348 + }, + { + "epoch": 0.25, + "grad_norm": 11.218505188713452, + "learning_rate": 9.972960799494915e-06, + "loss": 0.5591, + "step": 349 + }, + { + "epoch": 0.25, + "grad_norm": 9.343689711468452, + "learning_rate": 9.972659804249047e-06, + "loss": 0.4824, + "step": 350 + }, + { + "epoch": 0.25, + "grad_norm": 16.095683927102183, + "learning_rate": 9.972357147545113e-06, + "loss": 0.5591, + "step": 351 + }, + { + "epoch": 0.25, + "grad_norm": 18.28619957306297, + "learning_rate": 9.972052829484231e-06, + "loss": 0.5586, + "step": 352 + }, + { + "epoch": 0.25, + "grad_norm": 13.927219563114457, + "learning_rate": 9.971746850168084e-06, + "loss": 0.543, + "step": 353 + }, + { + "epoch": 0.25, + "grad_norm": 19.070239504065867, + "learning_rate": 9.971439209698902e-06, + "loss": 0.6523, + "step": 354 + }, + { + "epoch": 0.25, + "grad_norm": 10.674351013034489, + "learning_rate": 9.971129908179474e-06, + "loss": 0.541, + "step": 355 + }, + { + "epoch": 0.25, + "grad_norm": 12.105592852551204, + "learning_rate": 9.970818945713145e-06, + "loss": 0.5659, + "step": 356 + }, + { + "epoch": 0.25, + "grad_norm": 7.0981327588150025, + "learning_rate": 9.970506322403813e-06, + "loss": 0.4458, + "step": 357 + }, + { + "epoch": 0.26, + "grad_norm": 31.65511002299402, + "learning_rate": 9.970192038355928e-06, + "loss": 0.6401, + "step": 358 + }, + { + "epoch": 0.26, + "grad_norm": 7.640887314827702, + "learning_rate": 9.969876093674502e-06, + "loss": 0.5005, + "step": 359 + }, + { + "epoch": 0.26, + "grad_norm": 9.615326153238845, + "learning_rate": 9.969558488465097e-06, + "loss": 0.4995, + "step": 360 + }, + { + "epoch": 0.26, + "grad_norm": 25.0121028825566, + "learning_rate": 9.969239222833829e-06, + "loss": 0.5254, + "step": 361 + }, + { + "epoch": 0.26, + "grad_norm": 8.942638756744063, + "learning_rate": 9.968918296887374e-06, + "loss": 0.48, + "step": 362 + }, + { + "epoch": 0.26, + "grad_norm": 7.8607942912242965, + "learning_rate": 9.968595710732955e-06, + "loss": 0.5239, + "step": 363 + }, + { + "epoch": 0.26, + "grad_norm": 9.584219880853102, + "learning_rate": 9.968271464478357e-06, + "loss": 0.6064, + "step": 364 + }, + { + "epoch": 0.26, + "grad_norm": 18.74816464727213, + "learning_rate": 9.967945558231917e-06, + "loss": 0.624, + "step": 365 + }, + { + "epoch": 0.26, + "grad_norm": 10.18424589657994, + "learning_rate": 9.967617992102526e-06, + "loss": 0.5688, + "step": 366 + }, + { + "epoch": 0.26, + "grad_norm": 14.963994900078255, + "learning_rate": 9.967288766199628e-06, + "loss": 0.5151, + "step": 367 + }, + { + "epoch": 0.26, + "grad_norm": 15.107368415735328, + "learning_rate": 9.966957880633225e-06, + "loss": 0.5117, + "step": 368 + }, + { + "epoch": 0.26, + "grad_norm": 9.726462628184319, + "learning_rate": 9.966625335513873e-06, + "loss": 0.5464, + "step": 369 + }, + { + "epoch": 0.26, + "grad_norm": 25.415412889353508, + "learning_rate": 9.96629113095268e-06, + "loss": 0.6455, + "step": 370 + }, + { + "epoch": 0.26, + "grad_norm": 13.29743204783179, + "learning_rate": 9.965955267061309e-06, + "loss": 0.564, + "step": 371 + }, + { + "epoch": 0.27, + "grad_norm": 7.177772883312987, + "learning_rate": 9.965617743951982e-06, + "loss": 0.4883, + "step": 372 + }, + { + "epoch": 0.27, + "grad_norm": 6.407931454753083, + "learning_rate": 9.965278561737466e-06, + "loss": 0.4746, + "step": 373 + }, + { + "epoch": 0.27, + "grad_norm": 9.137420154130437, + "learning_rate": 9.964937720531094e-06, + "loss": 0.5532, + "step": 374 + }, + { + "epoch": 0.27, + "grad_norm": 15.056581957504337, + "learning_rate": 9.964595220446744e-06, + "loss": 0.5771, + "step": 375 + }, + { + "epoch": 0.27, + "grad_norm": 8.993928512302421, + "learning_rate": 9.964251061598853e-06, + "loss": 0.583, + "step": 376 + }, + { + "epoch": 0.27, + "grad_norm": 8.30189244870148, + "learning_rate": 9.96390524410241e-06, + "loss": 0.5552, + "step": 377 + }, + { + "epoch": 0.27, + "grad_norm": 16.305556797808645, + "learning_rate": 9.96355776807296e-06, + "loss": 0.5098, + "step": 378 + }, + { + "epoch": 0.27, + "grad_norm": 9.335808377185364, + "learning_rate": 9.9632086336266e-06, + "loss": 0.5747, + "step": 379 + }, + { + "epoch": 0.27, + "grad_norm": 8.797481794532704, + "learning_rate": 9.962857840879983e-06, + "loss": 0.5664, + "step": 380 + }, + { + "epoch": 0.27, + "grad_norm": 10.172910648343924, + "learning_rate": 9.962505389950317e-06, + "loss": 0.6455, + "step": 381 + }, + { + "epoch": 0.27, + "grad_norm": 6.971801104641452, + "learning_rate": 9.962151280955359e-06, + "loss": 0.5317, + "step": 382 + }, + { + "epoch": 0.27, + "grad_norm": 14.019462783189976, + "learning_rate": 9.961795514013424e-06, + "loss": 0.6611, + "step": 383 + }, + { + "epoch": 0.27, + "grad_norm": 13.015095631990192, + "learning_rate": 9.961438089243384e-06, + "loss": 0.54, + "step": 384 + }, + { + "epoch": 0.27, + "grad_norm": 10.693443657141236, + "learning_rate": 9.961079006764659e-06, + "loss": 0.6846, + "step": 385 + }, + { + "epoch": 0.28, + "grad_norm": 8.388009134623115, + "learning_rate": 9.960718266697223e-06, + "loss": 0.4805, + "step": 386 + }, + { + "epoch": 0.28, + "grad_norm": 9.305245922345355, + "learning_rate": 9.960355869161609e-06, + "loss": 0.5625, + "step": 387 + }, + { + "epoch": 0.28, + "grad_norm": 12.98682854904853, + "learning_rate": 9.959991814278898e-06, + "loss": 0.5361, + "step": 388 + }, + { + "epoch": 0.28, + "grad_norm": 30.855897554320187, + "learning_rate": 9.95962610217073e-06, + "loss": 0.6348, + "step": 389 + }, + { + "epoch": 0.28, + "grad_norm": 18.186818656200053, + "learning_rate": 9.959258732959296e-06, + "loss": 0.6367, + "step": 390 + }, + { + "epoch": 0.28, + "grad_norm": 10.1692607171307, + "learning_rate": 9.958889706767341e-06, + "loss": 0.6035, + "step": 391 + }, + { + "epoch": 0.28, + "grad_norm": 19.89752794089645, + "learning_rate": 9.95851902371816e-06, + "loss": 0.6279, + "step": 392 + }, + { + "epoch": 0.28, + "grad_norm": 18.94232313274701, + "learning_rate": 9.95814668393561e-06, + "loss": 0.625, + "step": 393 + }, + { + "epoch": 0.28, + "grad_norm": 11.263112378321651, + "learning_rate": 9.957772687544094e-06, + "loss": 0.6211, + "step": 394 + }, + { + "epoch": 0.28, + "grad_norm": 18.694064854132726, + "learning_rate": 9.95739703466857e-06, + "loss": 0.6113, + "step": 395 + }, + { + "epoch": 0.28, + "grad_norm": 14.054888000787919, + "learning_rate": 9.957019725434554e-06, + "loss": 0.6055, + "step": 396 + }, + { + "epoch": 0.28, + "grad_norm": 6.852195125531498, + "learning_rate": 9.956640759968111e-06, + "loss": 0.4897, + "step": 397 + }, + { + "epoch": 0.28, + "grad_norm": 10.66681348604124, + "learning_rate": 9.956260138395857e-06, + "loss": 0.5479, + "step": 398 + }, + { + "epoch": 0.28, + "grad_norm": 20.557192888856104, + "learning_rate": 9.955877860844969e-06, + "loss": 0.6069, + "step": 399 + }, + { + "epoch": 0.29, + "grad_norm": 8.942620031874748, + "learning_rate": 9.955493927443171e-06, + "loss": 0.4883, + "step": 400 + }, + { + "epoch": 0.29, + "grad_norm": 16.005501795532552, + "learning_rate": 9.955108338318743e-06, + "loss": 0.6094, + "step": 401 + }, + { + "epoch": 0.29, + "grad_norm": 9.342699104317559, + "learning_rate": 9.954721093600517e-06, + "loss": 0.541, + "step": 402 + }, + { + "epoch": 0.29, + "grad_norm": 10.090322647328717, + "learning_rate": 9.95433219341788e-06, + "loss": 0.5225, + "step": 403 + }, + { + "epoch": 0.29, + "grad_norm": 16.150636904880326, + "learning_rate": 9.953941637900769e-06, + "loss": 0.666, + "step": 404 + }, + { + "epoch": 0.29, + "grad_norm": 15.523930157702077, + "learning_rate": 9.953549427179676e-06, + "loss": 0.5566, + "step": 405 + }, + { + "epoch": 0.29, + "grad_norm": 6.643994089560804, + "learning_rate": 9.953155561385646e-06, + "loss": 0.5015, + "step": 406 + }, + { + "epoch": 0.29, + "grad_norm": 9.03294877321269, + "learning_rate": 9.952760040650278e-06, + "loss": 0.562, + "step": 407 + }, + { + "epoch": 0.29, + "grad_norm": 10.801467808368454, + "learning_rate": 9.95236286510572e-06, + "loss": 0.519, + "step": 408 + }, + { + "epoch": 0.29, + "grad_norm": 9.80918459345022, + "learning_rate": 9.95196403488468e-06, + "loss": 0.5747, + "step": 409 + }, + { + "epoch": 0.29, + "grad_norm": 9.774285807444773, + "learning_rate": 9.951563550120412e-06, + "loss": 0.5752, + "step": 410 + }, + { + "epoch": 0.29, + "grad_norm": 17.463568068842072, + "learning_rate": 9.951161410946725e-06, + "loss": 0.5527, + "step": 411 + }, + { + "epoch": 0.29, + "grad_norm": 16.73368659445093, + "learning_rate": 9.950757617497983e-06, + "loss": 0.4585, + "step": 412 + }, + { + "epoch": 0.29, + "grad_norm": 14.942045154978281, + "learning_rate": 9.950352169909101e-06, + "loss": 0.4893, + "step": 413 + }, + { + "epoch": 0.3, + "grad_norm": 9.393249457904501, + "learning_rate": 9.949945068315544e-06, + "loss": 0.5684, + "step": 414 + }, + { + "epoch": 0.3, + "grad_norm": 13.911557122462217, + "learning_rate": 9.949536312853334e-06, + "loss": 0.5786, + "step": 415 + }, + { + "epoch": 0.3, + "grad_norm": 13.653566902215857, + "learning_rate": 9.949125903659042e-06, + "loss": 0.6289, + "step": 416 + }, + { + "epoch": 0.3, + "grad_norm": 8.471009617251026, + "learning_rate": 9.948713840869797e-06, + "loss": 0.5283, + "step": 417 + }, + { + "epoch": 0.3, + "grad_norm": 8.083527199424468, + "learning_rate": 9.948300124623274e-06, + "loss": 0.4492, + "step": 418 + }, + { + "epoch": 0.3, + "grad_norm": 8.197608537282099, + "learning_rate": 9.947884755057703e-06, + "loss": 0.5186, + "step": 419 + }, + { + "epoch": 0.3, + "grad_norm": 13.630148938077044, + "learning_rate": 9.947467732311868e-06, + "loss": 0.5669, + "step": 420 + }, + { + "epoch": 0.3, + "grad_norm": 11.24383405339626, + "learning_rate": 9.947049056525104e-06, + "loss": 0.5068, + "step": 421 + }, + { + "epoch": 0.3, + "grad_norm": 11.288937926215693, + "learning_rate": 9.9466287278373e-06, + "loss": 0.5103, + "step": 422 + }, + { + "epoch": 0.3, + "grad_norm": 8.534922729005983, + "learning_rate": 9.946206746388892e-06, + "loss": 0.5278, + "step": 423 + }, + { + "epoch": 0.3, + "grad_norm": 13.99533118513947, + "learning_rate": 9.94578311232087e-06, + "loss": 0.585, + "step": 424 + }, + { + "epoch": 0.3, + "grad_norm": 13.150688092265762, + "learning_rate": 9.945357825774786e-06, + "loss": 0.5933, + "step": 425 + }, + { + "epoch": 0.3, + "grad_norm": 10.850098763453946, + "learning_rate": 9.944930886892731e-06, + "loss": 0.5488, + "step": 426 + }, + { + "epoch": 0.3, + "grad_norm": 7.389796398632541, + "learning_rate": 9.944502295817353e-06, + "loss": 0.5278, + "step": 427 + }, + { + "epoch": 0.31, + "grad_norm": 14.254074622509817, + "learning_rate": 9.944072052691853e-06, + "loss": 0.5723, + "step": 428 + }, + { + "epoch": 0.31, + "grad_norm": 13.682353391544783, + "learning_rate": 9.943640157659984e-06, + "loss": 0.4854, + "step": 429 + }, + { + "epoch": 0.31, + "grad_norm": 8.532300495721698, + "learning_rate": 9.94320661086605e-06, + "loss": 0.4614, + "step": 430 + }, + { + "epoch": 0.31, + "grad_norm": 23.899338345320164, + "learning_rate": 9.942771412454906e-06, + "loss": 0.749, + "step": 431 + }, + { + "epoch": 0.31, + "grad_norm": 8.323403099794728, + "learning_rate": 9.942334562571961e-06, + "loss": 0.5317, + "step": 432 + }, + { + "epoch": 0.31, + "grad_norm": 15.057160250544284, + "learning_rate": 9.941896061363173e-06, + "loss": 0.5894, + "step": 433 + }, + { + "epoch": 0.31, + "grad_norm": 19.87921251443284, + "learning_rate": 9.941455908975054e-06, + "loss": 0.5293, + "step": 434 + }, + { + "epoch": 0.31, + "grad_norm": 10.374070788814263, + "learning_rate": 9.941014105554668e-06, + "loss": 0.6357, + "step": 435 + }, + { + "epoch": 0.31, + "grad_norm": 8.07075036921681, + "learning_rate": 9.94057065124963e-06, + "loss": 0.5859, + "step": 436 + }, + { + "epoch": 0.31, + "grad_norm": 12.935502918801745, + "learning_rate": 9.940125546208107e-06, + "loss": 0.4937, + "step": 437 + }, + { + "epoch": 0.31, + "grad_norm": 10.849912769259404, + "learning_rate": 9.939678790578813e-06, + "loss": 0.5679, + "step": 438 + }, + { + "epoch": 0.31, + "grad_norm": 12.909255099305636, + "learning_rate": 9.93923038451102e-06, + "loss": 0.5547, + "step": 439 + }, + { + "epoch": 0.31, + "grad_norm": 18.00963416001379, + "learning_rate": 9.938780328154549e-06, + "loss": 0.6104, + "step": 440 + }, + { + "epoch": 0.31, + "grad_norm": 7.4109051578140575, + "learning_rate": 9.938328621659775e-06, + "loss": 0.5923, + "step": 441 + }, + { + "epoch": 0.32, + "grad_norm": 12.359334459897687, + "learning_rate": 9.937875265177615e-06, + "loss": 0.5879, + "step": 442 + }, + { + "epoch": 0.32, + "grad_norm": 19.83227778183207, + "learning_rate": 9.937420258859547e-06, + "loss": 0.563, + "step": 443 + }, + { + "epoch": 0.32, + "grad_norm": 5.415148747386903, + "learning_rate": 9.9369636028576e-06, + "loss": 0.5352, + "step": 444 + }, + { + "epoch": 0.32, + "grad_norm": 7.775852096977613, + "learning_rate": 9.936505297324346e-06, + "loss": 0.5283, + "step": 445 + }, + { + "epoch": 0.32, + "grad_norm": 13.923034553787746, + "learning_rate": 9.936045342412917e-06, + "loss": 0.5732, + "step": 446 + }, + { + "epoch": 0.32, + "grad_norm": 6.630684830040368, + "learning_rate": 9.93558373827699e-06, + "loss": 0.5078, + "step": 447 + }, + { + "epoch": 0.32, + "grad_norm": 7.677385634372398, + "learning_rate": 9.935120485070799e-06, + "loss": 0.5557, + "step": 448 + }, + { + "epoch": 0.32, + "grad_norm": 8.267582350250366, + "learning_rate": 9.934655582949123e-06, + "loss": 0.4868, + "step": 449 + }, + { + "epoch": 0.32, + "grad_norm": 9.591276642156359, + "learning_rate": 9.934189032067296e-06, + "loss": 0.5791, + "step": 450 + }, + { + "epoch": 0.32, + "grad_norm": 13.200236779301449, + "learning_rate": 9.933720832581197e-06, + "loss": 0.5679, + "step": 451 + }, + { + "epoch": 0.32, + "grad_norm": 11.316615652129823, + "learning_rate": 9.933250984647266e-06, + "loss": 0.5435, + "step": 452 + }, + { + "epoch": 0.32, + "grad_norm": 18.91748370377269, + "learning_rate": 9.932779488422484e-06, + "loss": 0.5562, + "step": 453 + }, + { + "epoch": 0.32, + "grad_norm": 15.369736880224455, + "learning_rate": 9.93230634406439e-06, + "loss": 0.5498, + "step": 454 + }, + { + "epoch": 0.32, + "grad_norm": 11.79281460550446, + "learning_rate": 9.931831551731067e-06, + "loss": 0.585, + "step": 455 + }, + { + "epoch": 0.33, + "grad_norm": 17.320334157458905, + "learning_rate": 9.931355111581154e-06, + "loss": 0.6392, + "step": 456 + }, + { + "epoch": 0.33, + "grad_norm": 10.65106240403786, + "learning_rate": 9.930877023773837e-06, + "loss": 0.5015, + "step": 457 + }, + { + "epoch": 0.33, + "grad_norm": 12.87208988766515, + "learning_rate": 9.930397288468853e-06, + "loss": 0.5522, + "step": 458 + }, + { + "epoch": 0.33, + "grad_norm": 6.81270031499954, + "learning_rate": 9.929915905826494e-06, + "loss": 0.4966, + "step": 459 + }, + { + "epoch": 0.33, + "grad_norm": 15.623328763405382, + "learning_rate": 9.9294328760076e-06, + "loss": 0.5083, + "step": 460 + }, + { + "epoch": 0.33, + "grad_norm": 10.176704828042569, + "learning_rate": 9.928948199173552e-06, + "loss": 0.5142, + "step": 461 + }, + { + "epoch": 0.33, + "grad_norm": 18.452044947551833, + "learning_rate": 9.928461875486297e-06, + "loss": 0.4746, + "step": 462 + }, + { + "epoch": 0.33, + "grad_norm": 13.515116021434167, + "learning_rate": 9.927973905108323e-06, + "loss": 0.5566, + "step": 463 + }, + { + "epoch": 0.33, + "grad_norm": 8.279657241609938, + "learning_rate": 9.927484288202671e-06, + "loss": 0.5566, + "step": 464 + }, + { + "epoch": 0.33, + "grad_norm": 23.899242007190043, + "learning_rate": 9.926993024932929e-06, + "loss": 0.5767, + "step": 465 + }, + { + "epoch": 0.33, + "grad_norm": 13.739763048191918, + "learning_rate": 9.926500115463238e-06, + "loss": 0.5396, + "step": 466 + }, + { + "epoch": 0.33, + "grad_norm": 13.322105279702368, + "learning_rate": 9.926005559958287e-06, + "loss": 0.5317, + "step": 467 + }, + { + "epoch": 0.33, + "grad_norm": 13.685933746852832, + "learning_rate": 9.925509358583319e-06, + "loss": 0.5044, + "step": 468 + }, + { + "epoch": 0.33, + "grad_norm": 7.819644131934337, + "learning_rate": 9.92501151150412e-06, + "loss": 0.5361, + "step": 469 + }, + { + "epoch": 0.34, + "grad_norm": 10.096851486127841, + "learning_rate": 9.924512018887036e-06, + "loss": 0.54, + "step": 470 + }, + { + "epoch": 0.34, + "grad_norm": 18.4073975042839, + "learning_rate": 9.924010880898952e-06, + "loss": 0.5059, + "step": 471 + }, + { + "epoch": 0.34, + "grad_norm": 16.34808552011236, + "learning_rate": 9.923508097707306e-06, + "loss": 0.6025, + "step": 472 + }, + { + "epoch": 0.34, + "grad_norm": 9.169077335799752, + "learning_rate": 9.923003669480094e-06, + "loss": 0.562, + "step": 473 + }, + { + "epoch": 0.34, + "grad_norm": 12.814349393155531, + "learning_rate": 9.922497596385848e-06, + "loss": 0.5376, + "step": 474 + }, + { + "epoch": 0.34, + "grad_norm": 11.91476877486239, + "learning_rate": 9.92198987859366e-06, + "loss": 0.458, + "step": 475 + }, + { + "epoch": 0.34, + "grad_norm": 15.796586778125109, + "learning_rate": 9.921480516273168e-06, + "loss": 0.5645, + "step": 476 + }, + { + "epoch": 0.34, + "grad_norm": 24.08880736913912, + "learning_rate": 9.920969509594558e-06, + "loss": 0.5273, + "step": 477 + }, + { + "epoch": 0.34, + "grad_norm": 8.015029790773587, + "learning_rate": 9.920456858728567e-06, + "loss": 0.4678, + "step": 478 + }, + { + "epoch": 0.34, + "grad_norm": 8.165530523776276, + "learning_rate": 9.919942563846482e-06, + "loss": 0.4663, + "step": 479 + }, + { + "epoch": 0.34, + "grad_norm": 8.515499096006561, + "learning_rate": 9.919426625120137e-06, + "loss": 0.5649, + "step": 480 + }, + { + "epoch": 0.34, + "grad_norm": 7.393807256918857, + "learning_rate": 9.918909042721918e-06, + "loss": 0.5576, + "step": 481 + }, + { + "epoch": 0.34, + "grad_norm": 16.408074858410433, + "learning_rate": 9.918389816824759e-06, + "loss": 0.6514, + "step": 482 + }, + { + "epoch": 0.34, + "grad_norm": 18.273121385573273, + "learning_rate": 9.917868947602144e-06, + "loss": 0.6157, + "step": 483 + }, + { + "epoch": 0.35, + "grad_norm": 12.733233581807566, + "learning_rate": 9.917346435228102e-06, + "loss": 0.6221, + "step": 484 + }, + { + "epoch": 0.35, + "grad_norm": 12.1906994483687, + "learning_rate": 9.916822279877217e-06, + "loss": 0.4849, + "step": 485 + }, + { + "epoch": 0.35, + "grad_norm": 16.21592645313153, + "learning_rate": 9.91629648172462e-06, + "loss": 0.5352, + "step": 486 + }, + { + "epoch": 0.35, + "grad_norm": 9.57311751283234, + "learning_rate": 9.915769040945984e-06, + "loss": 0.54, + "step": 487 + }, + { + "epoch": 0.35, + "grad_norm": 6.283261788372716, + "learning_rate": 9.915239957717542e-06, + "loss": 0.5034, + "step": 488 + }, + { + "epoch": 0.35, + "grad_norm": 6.597381341995393, + "learning_rate": 9.91470923221607e-06, + "loss": 0.5898, + "step": 489 + }, + { + "epoch": 0.35, + "grad_norm": 8.86577970600292, + "learning_rate": 9.914176864618891e-06, + "loss": 0.5303, + "step": 490 + }, + { + "epoch": 0.35, + "grad_norm": 12.003835054098607, + "learning_rate": 9.913642855103881e-06, + "loss": 0.4561, + "step": 491 + }, + { + "epoch": 0.35, + "grad_norm": 20.025050964061947, + "learning_rate": 9.913107203849464e-06, + "loss": 0.5947, + "step": 492 + }, + { + "epoch": 0.35, + "grad_norm": 20.663144493285422, + "learning_rate": 9.912569911034607e-06, + "loss": 0.6509, + "step": 493 + }, + { + "epoch": 0.35, + "grad_norm": 15.22745250393881, + "learning_rate": 9.912030976838832e-06, + "loss": 0.5649, + "step": 494 + }, + { + "epoch": 0.35, + "grad_norm": 19.50368431179958, + "learning_rate": 9.911490401442205e-06, + "loss": 0.5723, + "step": 495 + }, + { + "epoch": 0.35, + "grad_norm": 18.650046429480334, + "learning_rate": 9.910948185025345e-06, + "loss": 0.582, + "step": 496 + }, + { + "epoch": 0.35, + "grad_norm": 13.16731458869406, + "learning_rate": 9.910404327769414e-06, + "loss": 0.5161, + "step": 497 + }, + { + "epoch": 0.36, + "grad_norm": 16.48953220742883, + "learning_rate": 9.909858829856127e-06, + "loss": 0.5527, + "step": 498 + }, + { + "epoch": 0.36, + "grad_norm": 9.6203128320521, + "learning_rate": 9.909311691467744e-06, + "loss": 0.479, + "step": 499 + }, + { + "epoch": 0.36, + "grad_norm": 8.61392255485728, + "learning_rate": 9.908762912787073e-06, + "loss": 0.5513, + "step": 500 + }, + { + "epoch": 0.36, + "eval_avg_AUC": 0.7617813721540047, + "eval_avg_Accuracy": 0.688204575596817, + "eval_avg_Accuracy-right": 0.8745924090256946, + "eval_avg_Accuracy-wrong": 0.3632021833068001, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6387441987762204, + "eval_last_AUC": 0.7801619916516469, + "eval_last_Accuracy": 0.7156415782493368, + "eval_last_Accuracy-right": 0.7742924220686057, + "eval_last_Accuracy-wrong": 0.6133727541505573, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6543256471238833, + "eval_max_AUC": 0.6908342239463399, + "eval_max_Accuracy": 0.6420755968169761, + "eval_max_Accuracy-right": 0.9715012390765619, + "eval_max_Accuracy-wrong": 0.06765976802365249, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.5464614776782707, + "eval_min_AUC": 0.7800757977941195, + "eval_min_Accuracy": 0.712823275862069, + "eval_min_Accuracy-right": 0.7380331289943916, + "eval_min_Accuracy-wrong": 0.6688651353195361, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6532406424802264, + "eval_prod_AUC": 0.7709256779799856, + "eval_prod_Accuracy": 0.6040285145888594, + "eval_prod_Accuracy-right": 0.41932959436546235, + "eval_prod_Accuracy-wrong": 0.9260859677052535, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6340698606601375, + "eval_runtime": 247.7022, + "eval_samples_per_second": 97.407, + "eval_steps_per_second": 3.044, + "eval_sum_AUC": 0.6279793611726269, + "eval_sum_Accuracy": 0.6357758620689655, + "eval_sum_Accuracy-right": 0.9995434981087779, + "eval_sum_Accuracy-wrong": 0.0014782806458949283, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6170558343639955, + "step": 500 + }, + { + "epoch": 0.36, + "grad_norm": 18.540498504399512, + "learning_rate": 9.908212493997473e-06, + "loss": 0.5596, + "step": 501 + }, + { + "epoch": 0.36, + "grad_norm": 9.128203532313956, + "learning_rate": 9.90766043528285e-06, + "loss": 0.5674, + "step": 502 + }, + { + "epoch": 0.36, + "grad_norm": 21.257665582917742, + "learning_rate": 9.907106736827654e-06, + "loss": 0.5996, + "step": 503 + }, + { + "epoch": 0.36, + "grad_norm": 6.247378361884045, + "learning_rate": 9.906551398816886e-06, + "loss": 0.4883, + "step": 504 + }, + { + "epoch": 0.36, + "grad_norm": 7.880394950980597, + "learning_rate": 9.9059944214361e-06, + "loss": 0.543, + "step": 505 + }, + { + "epoch": 0.36, + "grad_norm": 11.949181718064018, + "learning_rate": 9.905435804871387e-06, + "loss": 0.5933, + "step": 506 + }, + { + "epoch": 0.36, + "grad_norm": 20.010979155843657, + "learning_rate": 9.904875549309391e-06, + "loss": 0.6396, + "step": 507 + }, + { + "epoch": 0.36, + "grad_norm": 15.747061457011934, + "learning_rate": 9.904313654937308e-06, + "loss": 0.6533, + "step": 508 + }, + { + "epoch": 0.36, + "grad_norm": 16.93160977054565, + "learning_rate": 9.903750121942873e-06, + "loss": 0.5938, + "step": 509 + }, + { + "epoch": 0.36, + "grad_norm": 4.696205755954405, + "learning_rate": 9.903184950514378e-06, + "loss": 0.5122, + "step": 510 + }, + { + "epoch": 0.36, + "grad_norm": 7.53643132677427, + "learning_rate": 9.90261814084065e-06, + "loss": 0.5781, + "step": 511 + }, + { + "epoch": 0.37, + "grad_norm": 17.83447427750056, + "learning_rate": 9.902049693111077e-06, + "loss": 0.6719, + "step": 512 + }, + { + "epoch": 0.37, + "grad_norm": 10.7557972910997, + "learning_rate": 9.901479607515587e-06, + "loss": 0.5342, + "step": 513 + }, + { + "epoch": 0.37, + "grad_norm": 4.403858483082648, + "learning_rate": 9.900907884244654e-06, + "loss": 0.5684, + "step": 514 + }, + { + "epoch": 0.37, + "grad_norm": 10.25591218016626, + "learning_rate": 9.900334523489303e-06, + "loss": 0.5972, + "step": 515 + }, + { + "epoch": 0.37, + "grad_norm": 8.831414048462934, + "learning_rate": 9.899759525441101e-06, + "loss": 0.5625, + "step": 516 + }, + { + "epoch": 0.37, + "grad_norm": 9.09169003682686, + "learning_rate": 9.899182890292171e-06, + "loss": 0.5913, + "step": 517 + }, + { + "epoch": 0.37, + "grad_norm": 17.221486795019896, + "learning_rate": 9.898604618235175e-06, + "loss": 0.6055, + "step": 518 + }, + { + "epoch": 0.37, + "grad_norm": 19.080050148413697, + "learning_rate": 9.898024709463322e-06, + "loss": 0.6074, + "step": 519 + }, + { + "epoch": 0.37, + "grad_norm": 10.676674384714692, + "learning_rate": 9.897443164170375e-06, + "loss": 0.5405, + "step": 520 + }, + { + "epoch": 0.37, + "grad_norm": 16.304810818457433, + "learning_rate": 9.896859982550636e-06, + "loss": 0.4937, + "step": 521 + }, + { + "epoch": 0.37, + "grad_norm": 7.777058423733208, + "learning_rate": 9.89627516479896e-06, + "loss": 0.5742, + "step": 522 + }, + { + "epoch": 0.37, + "grad_norm": 10.391882166187722, + "learning_rate": 9.895688711110739e-06, + "loss": 0.5264, + "step": 523 + }, + { + "epoch": 0.37, + "grad_norm": 11.925774976971661, + "learning_rate": 9.895100621681923e-06, + "loss": 0.666, + "step": 524 + }, + { + "epoch": 0.37, + "grad_norm": 5.998754301379017, + "learning_rate": 9.894510896709003e-06, + "loss": 0.5176, + "step": 525 + }, + { + "epoch": 0.38, + "grad_norm": 5.840708276690297, + "learning_rate": 9.893919536389017e-06, + "loss": 0.5288, + "step": 526 + }, + { + "epoch": 0.38, + "grad_norm": 6.687147803937831, + "learning_rate": 9.89332654091955e-06, + "loss": 0.5239, + "step": 527 + }, + { + "epoch": 0.38, + "grad_norm": 6.768270858341022, + "learning_rate": 9.892731910498731e-06, + "loss": 0.5415, + "step": 528 + }, + { + "epoch": 0.38, + "grad_norm": 5.905157062520896, + "learning_rate": 9.892135645325238e-06, + "loss": 0.5664, + "step": 529 + }, + { + "epoch": 0.38, + "grad_norm": 14.120351808706744, + "learning_rate": 9.891537745598293e-06, + "loss": 0.5811, + "step": 530 + }, + { + "epoch": 0.38, + "grad_norm": 14.407988555074818, + "learning_rate": 9.89093821151767e-06, + "loss": 0.5762, + "step": 531 + }, + { + "epoch": 0.38, + "grad_norm": 5.3711923602355105, + "learning_rate": 9.89033704328368e-06, + "loss": 0.5137, + "step": 532 + }, + { + "epoch": 0.38, + "grad_norm": 7.0687402397265675, + "learning_rate": 9.889734241097186e-06, + "loss": 0.5273, + "step": 533 + }, + { + "epoch": 0.38, + "grad_norm": 11.00724274100732, + "learning_rate": 9.889129805159595e-06, + "loss": 0.5386, + "step": 534 + }, + { + "epoch": 0.38, + "grad_norm": 14.481688049622676, + "learning_rate": 9.888523735672861e-06, + "loss": 0.519, + "step": 535 + }, + { + "epoch": 0.38, + "grad_norm": 7.6367957136746245, + "learning_rate": 9.887916032839482e-06, + "loss": 0.498, + "step": 536 + }, + { + "epoch": 0.38, + "grad_norm": 5.466046619158893, + "learning_rate": 9.887306696862504e-06, + "loss": 0.4902, + "step": 537 + }, + { + "epoch": 0.38, + "grad_norm": 8.116707420357166, + "learning_rate": 9.886695727945515e-06, + "loss": 0.5771, + "step": 538 + }, + { + "epoch": 0.38, + "grad_norm": 8.725720682821406, + "learning_rate": 9.886083126292655e-06, + "loss": 0.5659, + "step": 539 + }, + { + "epoch": 0.39, + "grad_norm": 13.590126217196405, + "learning_rate": 9.885468892108603e-06, + "loss": 0.5752, + "step": 540 + }, + { + "epoch": 0.39, + "grad_norm": 12.37036153224896, + "learning_rate": 9.884853025598587e-06, + "loss": 0.5225, + "step": 541 + }, + { + "epoch": 0.39, + "grad_norm": 17.911884760434983, + "learning_rate": 9.884235526968377e-06, + "loss": 0.5542, + "step": 542 + }, + { + "epoch": 0.39, + "grad_norm": 13.973277734720734, + "learning_rate": 9.883616396424294e-06, + "loss": 0.5732, + "step": 543 + }, + { + "epoch": 0.39, + "grad_norm": 12.445859091078967, + "learning_rate": 9.8829956341732e-06, + "loss": 0.5679, + "step": 544 + }, + { + "epoch": 0.39, + "grad_norm": 14.059475523503908, + "learning_rate": 9.882373240422503e-06, + "loss": 0.5146, + "step": 545 + }, + { + "epoch": 0.39, + "grad_norm": 24.837290624700998, + "learning_rate": 9.881749215380156e-06, + "loss": 0.5596, + "step": 546 + }, + { + "epoch": 0.39, + "grad_norm": 9.213469746306272, + "learning_rate": 9.881123559254658e-06, + "loss": 0.5791, + "step": 547 + }, + { + "epoch": 0.39, + "grad_norm": 19.571221832522987, + "learning_rate": 9.880496272255053e-06, + "loss": 0.5654, + "step": 548 + }, + { + "epoch": 0.39, + "grad_norm": 6.884368751248441, + "learning_rate": 9.879867354590926e-06, + "loss": 0.5015, + "step": 549 + }, + { + "epoch": 0.39, + "grad_norm": 7.867690100404052, + "learning_rate": 9.879236806472414e-06, + "loss": 0.5454, + "step": 550 + }, + { + "epoch": 0.39, + "grad_norm": 12.247957723804134, + "learning_rate": 9.878604628110194e-06, + "loss": 0.5864, + "step": 551 + }, + { + "epoch": 0.39, + "grad_norm": 17.43931135081461, + "learning_rate": 9.877970819715485e-06, + "loss": 0.561, + "step": 552 + }, + { + "epoch": 0.39, + "grad_norm": 9.78533565974865, + "learning_rate": 9.87733538150006e-06, + "loss": 0.5269, + "step": 553 + }, + { + "epoch": 0.4, + "grad_norm": 9.031541245747464, + "learning_rate": 9.876698313676225e-06, + "loss": 0.5073, + "step": 554 + }, + { + "epoch": 0.4, + "grad_norm": 15.535835835682214, + "learning_rate": 9.876059616456842e-06, + "loss": 0.5723, + "step": 555 + }, + { + "epoch": 0.4, + "grad_norm": 8.31605031845537, + "learning_rate": 9.875419290055305e-06, + "loss": 0.4766, + "step": 556 + }, + { + "epoch": 0.4, + "grad_norm": 10.670021692503195, + "learning_rate": 9.874777334685565e-06, + "loss": 0.6074, + "step": 557 + }, + { + "epoch": 0.4, + "grad_norm": 7.219201299200287, + "learning_rate": 9.874133750562108e-06, + "loss": 0.5947, + "step": 558 + }, + { + "epoch": 0.4, + "grad_norm": 9.904120927324652, + "learning_rate": 9.873488537899967e-06, + "loss": 0.626, + "step": 559 + }, + { + "epoch": 0.4, + "grad_norm": 8.496631368652434, + "learning_rate": 9.872841696914721e-06, + "loss": 0.5391, + "step": 560 + }, + { + "epoch": 0.4, + "grad_norm": 7.536188576252165, + "learning_rate": 9.872193227822492e-06, + "loss": 0.5508, + "step": 561 + }, + { + "epoch": 0.4, + "grad_norm": 7.528143767413258, + "learning_rate": 9.871543130839944e-06, + "loss": 0.5732, + "step": 562 + }, + { + "epoch": 0.4, + "grad_norm": 9.58970570036274, + "learning_rate": 9.870891406184288e-06, + "loss": 0.5674, + "step": 563 + }, + { + "epoch": 0.4, + "grad_norm": 11.800186998521902, + "learning_rate": 9.870238054073275e-06, + "loss": 0.5566, + "step": 564 + }, + { + "epoch": 0.4, + "grad_norm": 13.911555208685854, + "learning_rate": 9.869583074725206e-06, + "loss": 0.6074, + "step": 565 + }, + { + "epoch": 0.4, + "grad_norm": 8.231300969398722, + "learning_rate": 9.868926468358919e-06, + "loss": 0.4829, + "step": 566 + }, + { + "epoch": 0.4, + "grad_norm": 7.709909484735207, + "learning_rate": 9.868268235193796e-06, + "loss": 0.502, + "step": 567 + }, + { + "epoch": 0.41, + "grad_norm": 9.178709235999069, + "learning_rate": 9.867608375449772e-06, + "loss": 0.5166, + "step": 568 + }, + { + "epoch": 0.41, + "grad_norm": 12.354410293206907, + "learning_rate": 9.866946889347311e-06, + "loss": 0.5703, + "step": 569 + }, + { + "epoch": 0.41, + "grad_norm": 10.915312310012048, + "learning_rate": 9.866283777107432e-06, + "loss": 0.5254, + "step": 570 + }, + { + "epoch": 0.41, + "grad_norm": 20.60559354889048, + "learning_rate": 9.865619038951692e-06, + "loss": 0.5996, + "step": 571 + }, + { + "epoch": 0.41, + "grad_norm": 11.26569614452492, + "learning_rate": 9.864952675102193e-06, + "loss": 0.5723, + "step": 572 + }, + { + "epoch": 0.41, + "grad_norm": 16.002089457228422, + "learning_rate": 9.864284685781578e-06, + "loss": 0.5864, + "step": 573 + }, + { + "epoch": 0.41, + "grad_norm": 23.044339027647595, + "learning_rate": 9.863615071213036e-06, + "loss": 0.5591, + "step": 574 + }, + { + "epoch": 0.41, + "grad_norm": 9.59838106899753, + "learning_rate": 9.862943831620298e-06, + "loss": 0.4917, + "step": 575 + }, + { + "epoch": 0.41, + "grad_norm": 8.247430329121268, + "learning_rate": 9.862270967227636e-06, + "loss": 0.5332, + "step": 576 + }, + { + "epoch": 0.41, + "grad_norm": 8.838445091054146, + "learning_rate": 9.861596478259869e-06, + "loss": 0.5576, + "step": 577 + }, + { + "epoch": 0.41, + "grad_norm": 10.4489106721379, + "learning_rate": 9.860920364942353e-06, + "loss": 0.5679, + "step": 578 + }, + { + "epoch": 0.41, + "grad_norm": 12.347633105018176, + "learning_rate": 9.860242627500994e-06, + "loss": 0.5747, + "step": 579 + }, + { + "epoch": 0.41, + "grad_norm": 12.918155502814725, + "learning_rate": 9.859563266162231e-06, + "loss": 0.5552, + "step": 580 + }, + { + "epoch": 0.41, + "grad_norm": 7.352511123265138, + "learning_rate": 9.858882281153058e-06, + "loss": 0.4307, + "step": 581 + }, + { + "epoch": 0.42, + "grad_norm": 7.102252433561581, + "learning_rate": 9.858199672701e-06, + "loss": 0.499, + "step": 582 + }, + { + "epoch": 0.42, + "grad_norm": 10.073596010036917, + "learning_rate": 9.85751544103413e-06, + "loss": 0.4551, + "step": 583 + }, + { + "epoch": 0.42, + "grad_norm": 12.478615776414374, + "learning_rate": 9.856829586381065e-06, + "loss": 0.542, + "step": 584 + }, + { + "epoch": 0.42, + "grad_norm": 31.02955657240596, + "learning_rate": 9.856142108970958e-06, + "loss": 0.6479, + "step": 585 + }, + { + "epoch": 0.42, + "grad_norm": 14.277538328288841, + "learning_rate": 9.855453009033512e-06, + "loss": 0.5327, + "step": 586 + }, + { + "epoch": 0.42, + "grad_norm": 8.683057931417592, + "learning_rate": 9.854762286798965e-06, + "loss": 0.5645, + "step": 587 + }, + { + "epoch": 0.42, + "grad_norm": 25.215793586089497, + "learning_rate": 9.854069942498102e-06, + "loss": 0.8555, + "step": 588 + }, + { + "epoch": 0.42, + "grad_norm": 9.562234064634753, + "learning_rate": 9.853375976362245e-06, + "loss": 0.5874, + "step": 589 + }, + { + "epoch": 0.42, + "grad_norm": 8.721200107181694, + "learning_rate": 9.852680388623266e-06, + "loss": 0.5298, + "step": 590 + }, + { + "epoch": 0.42, + "grad_norm": 22.093284898328076, + "learning_rate": 9.85198317951357e-06, + "loss": 0.6309, + "step": 591 + }, + { + "epoch": 0.42, + "grad_norm": 18.346628659885774, + "learning_rate": 9.851284349266107e-06, + "loss": 0.6416, + "step": 592 + }, + { + "epoch": 0.42, + "grad_norm": 13.619550827391471, + "learning_rate": 9.850583898114372e-06, + "loss": 0.6084, + "step": 593 + }, + { + "epoch": 0.42, + "grad_norm": 9.504830357240213, + "learning_rate": 9.849881826292399e-06, + "loss": 0.5742, + "step": 594 + }, + { + "epoch": 0.42, + "grad_norm": 21.655717476878046, + "learning_rate": 9.84917813403476e-06, + "loss": 0.6416, + "step": 595 + }, + { + "epoch": 0.43, + "grad_norm": 16.761291695955354, + "learning_rate": 9.848472821576572e-06, + "loss": 0.5938, + "step": 596 + }, + { + "epoch": 0.43, + "grad_norm": 5.531377595805906, + "learning_rate": 9.847765889153497e-06, + "loss": 0.5601, + "step": 597 + }, + { + "epoch": 0.43, + "grad_norm": 5.367401037379296, + "learning_rate": 9.847057337001731e-06, + "loss": 0.6104, + "step": 598 + }, + { + "epoch": 0.43, + "grad_norm": 13.851863355122642, + "learning_rate": 9.846347165358014e-06, + "loss": 0.5552, + "step": 599 + }, + { + "epoch": 0.43, + "grad_norm": 10.152768776233245, + "learning_rate": 9.84563537445963e-06, + "loss": 0.5654, + "step": 600 + }, + { + "epoch": 0.43, + "grad_norm": 5.288895704846823, + "learning_rate": 9.844921964544398e-06, + "loss": 0.542, + "step": 601 + }, + { + "epoch": 0.43, + "grad_norm": 10.57924848248388, + "learning_rate": 9.844206935850687e-06, + "loss": 0.585, + "step": 602 + }, + { + "epoch": 0.43, + "grad_norm": 6.139809939859182, + "learning_rate": 9.843490288617397e-06, + "loss": 0.5186, + "step": 603 + }, + { + "epoch": 0.43, + "grad_norm": 9.68694326416687, + "learning_rate": 9.842772023083972e-06, + "loss": 0.4717, + "step": 604 + }, + { + "epoch": 0.43, + "grad_norm": 16.74113624317417, + "learning_rate": 9.842052139490403e-06, + "loss": 0.5591, + "step": 605 + }, + { + "epoch": 0.43, + "grad_norm": 9.814314683170982, + "learning_rate": 9.841330638077213e-06, + "loss": 0.5303, + "step": 606 + }, + { + "epoch": 0.43, + "grad_norm": 8.263159508477482, + "learning_rate": 9.840607519085467e-06, + "loss": 0.5293, + "step": 607 + }, + { + "epoch": 0.43, + "grad_norm": 8.917568543252393, + "learning_rate": 9.839882782756778e-06, + "loss": 0.5425, + "step": 608 + }, + { + "epoch": 0.43, + "grad_norm": 13.506441410922555, + "learning_rate": 9.839156429333291e-06, + "loss": 0.5986, + "step": 609 + }, + { + "epoch": 0.44, + "grad_norm": 8.668039530691972, + "learning_rate": 9.838428459057694e-06, + "loss": 0.5171, + "step": 610 + }, + { + "epoch": 0.44, + "grad_norm": 7.4133141642419424, + "learning_rate": 9.837698872173214e-06, + "loss": 0.4961, + "step": 611 + }, + { + "epoch": 0.44, + "grad_norm": 13.506554715753216, + "learning_rate": 9.836967668923623e-06, + "loss": 0.564, + "step": 612 + }, + { + "epoch": 0.44, + "grad_norm": 9.580418265887094, + "learning_rate": 9.836234849553228e-06, + "loss": 0.4878, + "step": 613 + }, + { + "epoch": 0.44, + "grad_norm": 13.792067921572484, + "learning_rate": 9.835500414306875e-06, + "loss": 0.5903, + "step": 614 + }, + { + "epoch": 0.44, + "grad_norm": 7.889230874692596, + "learning_rate": 9.834764363429956e-06, + "loss": 0.501, + "step": 615 + }, + { + "epoch": 0.44, + "grad_norm": 13.876917354950514, + "learning_rate": 9.8340266971684e-06, + "loss": 0.5684, + "step": 616 + }, + { + "epoch": 0.44, + "grad_norm": 9.194977979042537, + "learning_rate": 9.83328741576867e-06, + "loss": 0.4902, + "step": 617 + }, + { + "epoch": 0.44, + "grad_norm": 16.105001380677454, + "learning_rate": 9.832546519477778e-06, + "loss": 0.5854, + "step": 618 + }, + { + "epoch": 0.44, + "grad_norm": 21.476338145150045, + "learning_rate": 9.831804008543271e-06, + "loss": 0.6064, + "step": 619 + }, + { + "epoch": 0.44, + "grad_norm": 9.759650991581609, + "learning_rate": 9.831059883213234e-06, + "loss": 0.6025, + "step": 620 + }, + { + "epoch": 0.44, + "grad_norm": 10.320154908129068, + "learning_rate": 9.830314143736292e-06, + "loss": 0.4985, + "step": 621 + }, + { + "epoch": 0.44, + "grad_norm": 18.063611986524947, + "learning_rate": 9.829566790361615e-06, + "loss": 0.5317, + "step": 622 + }, + { + "epoch": 0.44, + "grad_norm": 6.8381031832744865, + "learning_rate": 9.828817823338903e-06, + "loss": 0.5234, + "step": 623 + }, + { + "epoch": 0.45, + "grad_norm": 12.028721530123617, + "learning_rate": 9.828067242918402e-06, + "loss": 0.5938, + "step": 624 + }, + { + "epoch": 0.45, + "grad_norm": 13.738098966810448, + "learning_rate": 9.827315049350895e-06, + "loss": 0.5249, + "step": 625 + }, + { + "epoch": 0.45, + "grad_norm": 11.549683197853444, + "learning_rate": 9.826561242887704e-06, + "loss": 0.5049, + "step": 626 + }, + { + "epoch": 0.45, + "grad_norm": 6.847026094764958, + "learning_rate": 9.825805823780687e-06, + "loss": 0.562, + "step": 627 + }, + { + "epoch": 0.45, + "grad_norm": 7.1475623479236985, + "learning_rate": 9.825048792282247e-06, + "loss": 0.5259, + "step": 628 + }, + { + "epoch": 0.45, + "grad_norm": 18.57915719785624, + "learning_rate": 9.824290148645322e-06, + "loss": 0.5098, + "step": 629 + }, + { + "epoch": 0.45, + "grad_norm": 7.748425573675469, + "learning_rate": 9.823529893123384e-06, + "loss": 0.5869, + "step": 630 + }, + { + "epoch": 0.45, + "grad_norm": 7.059327133724575, + "learning_rate": 9.822768025970456e-06, + "loss": 0.4497, + "step": 631 + }, + { + "epoch": 0.45, + "grad_norm": 7.293876053511848, + "learning_rate": 9.822004547441088e-06, + "loss": 0.6064, + "step": 632 + }, + { + "epoch": 0.45, + "grad_norm": 18.38956537939008, + "learning_rate": 9.821239457790373e-06, + "loss": 0.624, + "step": 633 + }, + { + "epoch": 0.45, + "grad_norm": 13.857454287932656, + "learning_rate": 9.82047275727394e-06, + "loss": 0.5557, + "step": 634 + }, + { + "epoch": 0.45, + "grad_norm": 14.129814430870212, + "learning_rate": 9.81970444614796e-06, + "loss": 0.5376, + "step": 635 + }, + { + "epoch": 0.45, + "grad_norm": 7.239828802765227, + "learning_rate": 9.81893452466914e-06, + "loss": 0.5459, + "step": 636 + }, + { + "epoch": 0.45, + "grad_norm": 15.759535502268271, + "learning_rate": 9.818162993094724e-06, + "loss": 0.5229, + "step": 637 + }, + { + "epoch": 0.46, + "grad_norm": 8.269358730497334, + "learning_rate": 9.817389851682494e-06, + "loss": 0.5811, + "step": 638 + }, + { + "epoch": 0.46, + "grad_norm": 9.532561868563791, + "learning_rate": 9.816615100690773e-06, + "loss": 0.5068, + "step": 639 + }, + { + "epoch": 0.46, + "grad_norm": 10.542345953735255, + "learning_rate": 9.81583874037842e-06, + "loss": 0.5195, + "step": 640 + }, + { + "epoch": 0.46, + "grad_norm": 17.295525138598315, + "learning_rate": 9.815060771004831e-06, + "loss": 0.5811, + "step": 641 + }, + { + "epoch": 0.46, + "grad_norm": 10.216808140877509, + "learning_rate": 9.81428119282994e-06, + "loss": 0.5483, + "step": 642 + }, + { + "epoch": 0.46, + "grad_norm": 11.054796365751393, + "learning_rate": 9.813500006114216e-06, + "loss": 0.5225, + "step": 643 + }, + { + "epoch": 0.46, + "grad_norm": 11.532068190094899, + "learning_rate": 9.812717211118673e-06, + "loss": 0.5259, + "step": 644 + }, + { + "epoch": 0.46, + "grad_norm": 12.617232952388049, + "learning_rate": 9.811932808104852e-06, + "loss": 0.583, + "step": 645 + }, + { + "epoch": 0.46, + "grad_norm": 17.53476537868423, + "learning_rate": 9.811146797334838e-06, + "loss": 0.5986, + "step": 646 + }, + { + "epoch": 0.46, + "grad_norm": 9.459867876200462, + "learning_rate": 9.810359179071255e-06, + "loss": 0.4854, + "step": 647 + }, + { + "epoch": 0.46, + "grad_norm": 13.442128811771815, + "learning_rate": 9.809569953577258e-06, + "loss": 0.5283, + "step": 648 + }, + { + "epoch": 0.46, + "grad_norm": 10.05485902012583, + "learning_rate": 9.808779121116542e-06, + "loss": 0.5869, + "step": 649 + }, + { + "epoch": 0.46, + "grad_norm": 14.614481323202112, + "learning_rate": 9.807986681953341e-06, + "loss": 0.5005, + "step": 650 + }, + { + "epoch": 0.46, + "grad_norm": 9.302158068476116, + "learning_rate": 9.807192636352422e-06, + "loss": 0.5518, + "step": 651 + }, + { + "epoch": 0.47, + "grad_norm": 7.246031731872442, + "learning_rate": 9.80639698457909e-06, + "loss": 0.5, + "step": 652 + }, + { + "epoch": 0.47, + "grad_norm": 14.741614003843505, + "learning_rate": 9.805599726899188e-06, + "loss": 0.5195, + "step": 653 + }, + { + "epoch": 0.47, + "grad_norm": 9.914045939553276, + "learning_rate": 9.804800863579094e-06, + "loss": 0.5059, + "step": 654 + }, + { + "epoch": 0.47, + "grad_norm": 8.602058773881305, + "learning_rate": 9.804000394885723e-06, + "loss": 0.5132, + "step": 655 + }, + { + "epoch": 0.47, + "grad_norm": 17.395330548720693, + "learning_rate": 9.803198321086527e-06, + "loss": 0.6504, + "step": 656 + }, + { + "epoch": 0.47, + "grad_norm": 9.082779853562936, + "learning_rate": 9.802394642449494e-06, + "loss": 0.5503, + "step": 657 + }, + { + "epoch": 0.47, + "grad_norm": 16.05035071434153, + "learning_rate": 9.801589359243147e-06, + "loss": 0.5625, + "step": 658 + }, + { + "epoch": 0.47, + "grad_norm": 8.74752221065336, + "learning_rate": 9.800782471736547e-06, + "loss": 0.5024, + "step": 659 + }, + { + "epoch": 0.47, + "grad_norm": 11.93020384218078, + "learning_rate": 9.799973980199288e-06, + "loss": 0.5679, + "step": 660 + }, + { + "epoch": 0.47, + "grad_norm": 23.58689659447437, + "learning_rate": 9.799163884901506e-06, + "loss": 0.5435, + "step": 661 + }, + { + "epoch": 0.47, + "grad_norm": 9.583439466485594, + "learning_rate": 9.798352186113867e-06, + "loss": 0.5093, + "step": 662 + }, + { + "epoch": 0.47, + "grad_norm": 10.423178186172265, + "learning_rate": 9.797538884107574e-06, + "loss": 0.6123, + "step": 663 + }, + { + "epoch": 0.47, + "grad_norm": 12.533199007539238, + "learning_rate": 9.796723979154366e-06, + "loss": 0.5229, + "step": 664 + }, + { + "epoch": 0.47, + "grad_norm": 8.971226162816325, + "learning_rate": 9.795907471526518e-06, + "loss": 0.5547, + "step": 665 + }, + { + "epoch": 0.48, + "grad_norm": 13.783730829396234, + "learning_rate": 9.79508936149684e-06, + "loss": 0.4644, + "step": 666 + }, + { + "epoch": 0.48, + "grad_norm": 13.03312528136628, + "learning_rate": 9.79426964933868e-06, + "loss": 0.543, + "step": 667 + }, + { + "epoch": 0.48, + "grad_norm": 8.283132104994282, + "learning_rate": 9.793448335325919e-06, + "loss": 0.4917, + "step": 668 + }, + { + "epoch": 0.48, + "grad_norm": 13.20800855604922, + "learning_rate": 9.792625419732969e-06, + "loss": 0.5264, + "step": 669 + }, + { + "epoch": 0.48, + "grad_norm": 9.415376972801536, + "learning_rate": 9.791800902834787e-06, + "loss": 0.4824, + "step": 670 + }, + { + "epoch": 0.48, + "grad_norm": 10.273820442326011, + "learning_rate": 9.790974784906855e-06, + "loss": 0.501, + "step": 671 + }, + { + "epoch": 0.48, + "grad_norm": 7.671344611499754, + "learning_rate": 9.790147066225198e-06, + "loss": 0.605, + "step": 672 + }, + { + "epoch": 0.48, + "grad_norm": 14.088843769717563, + "learning_rate": 9.789317747066369e-06, + "loss": 0.5586, + "step": 673 + }, + { + "epoch": 0.48, + "grad_norm": 10.751038170132446, + "learning_rate": 9.788486827707462e-06, + "loss": 0.6519, + "step": 674 + }, + { + "epoch": 0.48, + "grad_norm": 9.207892997115179, + "learning_rate": 9.7876543084261e-06, + "loss": 0.5347, + "step": 675 + }, + { + "epoch": 0.48, + "grad_norm": 8.857264061196519, + "learning_rate": 9.786820189500443e-06, + "loss": 0.5464, + "step": 676 + }, + { + "epoch": 0.48, + "grad_norm": 13.830984057445567, + "learning_rate": 9.785984471209186e-06, + "loss": 0.5273, + "step": 677 + }, + { + "epoch": 0.48, + "grad_norm": 6.60468250187857, + "learning_rate": 9.785147153831562e-06, + "loss": 0.5459, + "step": 678 + }, + { + "epoch": 0.48, + "grad_norm": 7.593019502309981, + "learning_rate": 9.784308237647329e-06, + "loss": 0.5757, + "step": 679 + }, + { + "epoch": 0.49, + "grad_norm": 9.955889286248375, + "learning_rate": 9.783467722936786e-06, + "loss": 0.583, + "step": 680 + }, + { + "epoch": 0.49, + "grad_norm": 12.939295729656601, + "learning_rate": 9.782625609980767e-06, + "loss": 0.5991, + "step": 681 + }, + { + "epoch": 0.49, + "grad_norm": 22.514876614342388, + "learning_rate": 9.781781899060635e-06, + "loss": 0.5703, + "step": 682 + }, + { + "epoch": 0.49, + "grad_norm": 13.553266494196397, + "learning_rate": 9.78093659045829e-06, + "loss": 0.5435, + "step": 683 + }, + { + "epoch": 0.49, + "grad_norm": 5.263926473907152, + "learning_rate": 9.780089684456164e-06, + "loss": 0.5229, + "step": 684 + }, + { + "epoch": 0.49, + "grad_norm": 5.635255900891267, + "learning_rate": 9.779241181337228e-06, + "loss": 0.5176, + "step": 685 + }, + { + "epoch": 0.49, + "grad_norm": 10.608059375423723, + "learning_rate": 9.778391081384979e-06, + "loss": 0.5645, + "step": 686 + }, + { + "epoch": 0.49, + "grad_norm": 19.41849821192734, + "learning_rate": 9.777539384883453e-06, + "loss": 0.4922, + "step": 687 + }, + { + "epoch": 0.49, + "grad_norm": 17.159527005843714, + "learning_rate": 9.776686092117216e-06, + "loss": 0.5977, + "step": 688 + }, + { + "epoch": 0.49, + "grad_norm": 20.36894102930875, + "learning_rate": 9.775831203371371e-06, + "loss": 0.5693, + "step": 689 + }, + { + "epoch": 0.49, + "grad_norm": 16.091636163606054, + "learning_rate": 9.774974718931551e-06, + "loss": 0.6221, + "step": 690 + }, + { + "epoch": 0.49, + "grad_norm": 17.969604571791074, + "learning_rate": 9.774116639083923e-06, + "loss": 0.5854, + "step": 691 + }, + { + "epoch": 0.49, + "grad_norm": 19.88504893165957, + "learning_rate": 9.773256964115189e-06, + "loss": 0.5049, + "step": 692 + }, + { + "epoch": 0.49, + "grad_norm": 11.758445464297264, + "learning_rate": 9.772395694312583e-06, + "loss": 0.603, + "step": 693 + }, + { + "epoch": 0.5, + "grad_norm": 14.955201215215652, + "learning_rate": 9.771532829963865e-06, + "loss": 0.5571, + "step": 694 + }, + { + "epoch": 0.5, + "grad_norm": 5.013451757180628, + "learning_rate": 9.770668371357344e-06, + "loss": 0.4849, + "step": 695 + }, + { + "epoch": 0.5, + "grad_norm": 5.6991625858998365, + "learning_rate": 9.769802318781842e-06, + "loss": 0.5337, + "step": 696 + }, + { + "epoch": 0.5, + "grad_norm": 19.575556491861107, + "learning_rate": 9.76893467252673e-06, + "loss": 0.5771, + "step": 697 + }, + { + "epoch": 0.5, + "grad_norm": 23.83427689960832, + "learning_rate": 9.768065432881903e-06, + "loss": 0.5859, + "step": 698 + }, + { + "epoch": 0.5, + "grad_norm": 10.23151806170552, + "learning_rate": 9.767194600137789e-06, + "loss": 0.4951, + "step": 699 + }, + { + "epoch": 0.5, + "grad_norm": 9.185321738716956, + "learning_rate": 9.766322174585347e-06, + "loss": 0.5342, + "step": 700 + }, + { + "epoch": 0.5, + "grad_norm": 5.443299162915134, + "learning_rate": 9.765448156516077e-06, + "loss": 0.4678, + "step": 701 + }, + { + "epoch": 0.5, + "grad_norm": 9.19817487137709, + "learning_rate": 9.764572546222e-06, + "loss": 0.5684, + "step": 702 + }, + { + "epoch": 0.5, + "grad_norm": 6.21262946620503, + "learning_rate": 9.763695343995674e-06, + "loss": 0.5308, + "step": 703 + }, + { + "epoch": 0.5, + "grad_norm": 13.192980873552312, + "learning_rate": 9.762816550130192e-06, + "loss": 0.5112, + "step": 704 + }, + { + "epoch": 0.5, + "grad_norm": 9.960657984729142, + "learning_rate": 9.76193616491917e-06, + "loss": 0.6094, + "step": 705 + }, + { + "epoch": 0.5, + "grad_norm": 6.679940313177855, + "learning_rate": 9.761054188656766e-06, + "loss": 0.5415, + "step": 706 + }, + { + "epoch": 0.5, + "grad_norm": 8.09877092756067, + "learning_rate": 9.760170621637661e-06, + "loss": 0.5601, + "step": 707 + }, + { + "epoch": 0.51, + "grad_norm": 11.479043385831767, + "learning_rate": 9.759285464157073e-06, + "loss": 0.5474, + "step": 708 + }, + { + "epoch": 0.51, + "grad_norm": 9.234613065252855, + "learning_rate": 9.758398716510751e-06, + "loss": 0.501, + "step": 709 + }, + { + "epoch": 0.51, + "grad_norm": 13.64323110344669, + "learning_rate": 9.75751037899497e-06, + "loss": 0.5635, + "step": 710 + }, + { + "epoch": 0.51, + "grad_norm": 13.041571595123274, + "learning_rate": 9.756620451906543e-06, + "loss": 0.5952, + "step": 711 + }, + { + "epoch": 0.51, + "grad_norm": 12.629287618121229, + "learning_rate": 9.75572893554281e-06, + "loss": 0.5493, + "step": 712 + }, + { + "epoch": 0.51, + "grad_norm": 7.5855291941636755, + "learning_rate": 9.754835830201645e-06, + "loss": 0.5557, + "step": 713 + }, + { + "epoch": 0.51, + "grad_norm": 4.8782598567148705, + "learning_rate": 9.753941136181448e-06, + "loss": 0.5273, + "step": 714 + }, + { + "epoch": 0.51, + "grad_norm": 11.85114790600747, + "learning_rate": 9.753044853781155e-06, + "loss": 0.5078, + "step": 715 + }, + { + "epoch": 0.51, + "grad_norm": 9.657250772447208, + "learning_rate": 9.75214698330023e-06, + "loss": 0.6157, + "step": 716 + }, + { + "epoch": 0.51, + "grad_norm": 17.576065249418974, + "learning_rate": 9.751247525038669e-06, + "loss": 0.4863, + "step": 717 + }, + { + "epoch": 0.51, + "grad_norm": 33.781222697734435, + "learning_rate": 9.750346479296998e-06, + "loss": 0.6094, + "step": 718 + }, + { + "epoch": 0.51, + "grad_norm": 185.99349497355314, + "learning_rate": 9.74944384637627e-06, + "loss": 0.625, + "step": 719 + }, + { + "epoch": 0.51, + "grad_norm": 7.364273036024208, + "learning_rate": 9.748539626578076e-06, + "loss": 0.4727, + "step": 720 + }, + { + "epoch": 0.51, + "grad_norm": 7.47582915048441, + "learning_rate": 9.747633820204527e-06, + "loss": 0.4775, + "step": 721 + }, + { + "epoch": 0.52, + "grad_norm": 160.92451037044142, + "learning_rate": 9.746726427558276e-06, + "loss": 0.583, + "step": 722 + }, + { + "epoch": 0.52, + "grad_norm": 14.29556717471361, + "learning_rate": 9.745817448942496e-06, + "loss": 0.6426, + "step": 723 + }, + { + "epoch": 0.52, + "grad_norm": 7.845209139079835, + "learning_rate": 9.744906884660894e-06, + "loss": 0.584, + "step": 724 + }, + { + "epoch": 0.52, + "grad_norm": 11.487953481735268, + "learning_rate": 9.743994735017708e-06, + "loss": 0.4824, + "step": 725 + }, + { + "epoch": 0.52, + "grad_norm": 16.209235710267066, + "learning_rate": 9.743081000317703e-06, + "loss": 0.6045, + "step": 726 + }, + { + "epoch": 0.52, + "grad_norm": 8.112078657361787, + "learning_rate": 9.742165680866173e-06, + "loss": 0.5244, + "step": 727 + }, + { + "epoch": 0.52, + "grad_norm": 12.970186411419759, + "learning_rate": 9.741248776968947e-06, + "loss": 0.5825, + "step": 728 + }, + { + "epoch": 0.52, + "grad_norm": 10.315186088300615, + "learning_rate": 9.740330288932379e-06, + "loss": 0.5918, + "step": 729 + }, + { + "epoch": 0.52, + "grad_norm": 11.457437824827904, + "learning_rate": 9.73941021706335e-06, + "loss": 0.5781, + "step": 730 + }, + { + "epoch": 0.52, + "grad_norm": 10.793356765325614, + "learning_rate": 9.738488561669272e-06, + "loss": 0.5474, + "step": 731 + }, + { + "epoch": 0.52, + "grad_norm": 7.112475758078757, + "learning_rate": 9.737565323058094e-06, + "loss": 0.5337, + "step": 732 + }, + { + "epoch": 0.52, + "grad_norm": 7.954732042734927, + "learning_rate": 9.736640501538281e-06, + "loss": 0.5552, + "step": 733 + }, + { + "epoch": 0.52, + "grad_norm": 9.682917509984245, + "learning_rate": 9.735714097418835e-06, + "loss": 0.5811, + "step": 734 + }, + { + "epoch": 0.52, + "grad_norm": 8.954785296883502, + "learning_rate": 9.734786111009287e-06, + "loss": 0.5283, + "step": 735 + }, + { + "epoch": 0.53, + "grad_norm": 7.187461253271188, + "learning_rate": 9.73385654261969e-06, + "loss": 0.5161, + "step": 736 + }, + { + "epoch": 0.53, + "grad_norm": 11.105803753991632, + "learning_rate": 9.732925392560634e-06, + "loss": 0.5781, + "step": 737 + }, + { + "epoch": 0.53, + "grad_norm": 6.780060124597431, + "learning_rate": 9.731992661143233e-06, + "loss": 0.5137, + "step": 738 + }, + { + "epoch": 0.53, + "grad_norm": 7.149918598961957, + "learning_rate": 9.731058348679128e-06, + "loss": 0.5459, + "step": 739 + }, + { + "epoch": 0.53, + "grad_norm": 10.600033304087317, + "learning_rate": 9.73012245548049e-06, + "loss": 0.6167, + "step": 740 + }, + { + "epoch": 0.53, + "grad_norm": 8.959608925181033, + "learning_rate": 9.729184981860023e-06, + "loss": 0.5547, + "step": 741 + }, + { + "epoch": 0.53, + "grad_norm": 18.421233340277922, + "learning_rate": 9.728245928130949e-06, + "loss": 0.4907, + "step": 742 + }, + { + "epoch": 0.53, + "grad_norm": 13.131878096040454, + "learning_rate": 9.727305294607024e-06, + "loss": 0.5337, + "step": 743 + }, + { + "epoch": 0.53, + "grad_norm": 5.933583907781678, + "learning_rate": 9.726363081602532e-06, + "loss": 0.4868, + "step": 744 + }, + { + "epoch": 0.53, + "grad_norm": 8.773000311800002, + "learning_rate": 9.725419289432287e-06, + "loss": 0.5586, + "step": 745 + }, + { + "epoch": 0.53, + "grad_norm": 11.30603969977933, + "learning_rate": 9.724473918411624e-06, + "loss": 0.5532, + "step": 746 + }, + { + "epoch": 0.53, + "grad_norm": 7.45632748446621, + "learning_rate": 9.723526968856408e-06, + "loss": 0.4321, + "step": 747 + }, + { + "epoch": 0.53, + "grad_norm": 10.567004859337702, + "learning_rate": 9.722578441083035e-06, + "loss": 0.4902, + "step": 748 + }, + { + "epoch": 0.53, + "grad_norm": 14.086491004356668, + "learning_rate": 9.721628335408423e-06, + "loss": 0.5205, + "step": 749 + }, + { + "epoch": 0.54, + "grad_norm": 18.63422639061617, + "learning_rate": 9.720676652150025e-06, + "loss": 0.5288, + "step": 750 + }, + { + "epoch": 0.54, + "grad_norm": 11.428381354480067, + "learning_rate": 9.719723391625813e-06, + "loss": 0.6455, + "step": 751 + }, + { + "epoch": 0.54, + "grad_norm": 14.967731907649567, + "learning_rate": 9.718768554154287e-06, + "loss": 0.6621, + "step": 752 + }, + { + "epoch": 0.54, + "grad_norm": 10.977453743909852, + "learning_rate": 9.717812140054479e-06, + "loss": 0.5083, + "step": 753 + }, + { + "epoch": 0.54, + "grad_norm": 19.976246002857728, + "learning_rate": 9.716854149645945e-06, + "loss": 0.5532, + "step": 754 + }, + { + "epoch": 0.54, + "grad_norm": 10.694789061118618, + "learning_rate": 9.715894583248764e-06, + "loss": 0.5239, + "step": 755 + }, + { + "epoch": 0.54, + "grad_norm": 6.615602095338727, + "learning_rate": 9.714933441183549e-06, + "loss": 0.4473, + "step": 756 + }, + { + "epoch": 0.54, + "grad_norm": 8.216340319991541, + "learning_rate": 9.713970723771432e-06, + "loss": 0.479, + "step": 757 + }, + { + "epoch": 0.54, + "grad_norm": 10.167611058044052, + "learning_rate": 9.713006431334076e-06, + "loss": 0.5361, + "step": 758 + }, + { + "epoch": 0.54, + "grad_norm": 6.666468658089926, + "learning_rate": 9.71204056419367e-06, + "loss": 0.5303, + "step": 759 + }, + { + "epoch": 0.54, + "grad_norm": 10.66660693531791, + "learning_rate": 9.711073122672928e-06, + "loss": 0.5464, + "step": 760 + }, + { + "epoch": 0.54, + "grad_norm": 29.673553271376097, + "learning_rate": 9.71010410709509e-06, + "loss": 0.6133, + "step": 761 + }, + { + "epoch": 0.54, + "grad_norm": 14.889123197476716, + "learning_rate": 9.70913351778392e-06, + "loss": 0.5479, + "step": 762 + }, + { + "epoch": 0.54, + "grad_norm": 11.884074672070827, + "learning_rate": 9.708161355063714e-06, + "loss": 0.623, + "step": 763 + }, + { + "epoch": 0.55, + "grad_norm": 11.200359924204133, + "learning_rate": 9.707187619259286e-06, + "loss": 0.46, + "step": 764 + }, + { + "epoch": 0.55, + "grad_norm": 17.9354480427811, + "learning_rate": 9.706212310695981e-06, + "loss": 0.5781, + "step": 765 + }, + { + "epoch": 0.55, + "grad_norm": 29.381896018391878, + "learning_rate": 9.705235429699666e-06, + "loss": 0.6304, + "step": 766 + }, + { + "epoch": 0.55, + "grad_norm": 16.16877662873349, + "learning_rate": 9.704256976596737e-06, + "loss": 0.5361, + "step": 767 + }, + { + "epoch": 0.55, + "grad_norm": 6.227617088641859, + "learning_rate": 9.703276951714114e-06, + "loss": 0.4468, + "step": 768 + }, + { + "epoch": 0.55, + "grad_norm": 11.762967619147144, + "learning_rate": 9.70229535537924e-06, + "loss": 0.4521, + "step": 769 + }, + { + "epoch": 0.55, + "grad_norm": 25.62549106013863, + "learning_rate": 9.701312187920084e-06, + "loss": 0.7197, + "step": 770 + }, + { + "epoch": 0.55, + "grad_norm": 25.332401107525445, + "learning_rate": 9.700327449665143e-06, + "loss": 0.6289, + "step": 771 + }, + { + "epoch": 0.55, + "grad_norm": 19.306780662436637, + "learning_rate": 9.699341140943434e-06, + "loss": 0.5376, + "step": 772 + }, + { + "epoch": 0.55, + "grad_norm": 9.721571034550369, + "learning_rate": 9.698353262084501e-06, + "loss": 0.4839, + "step": 773 + }, + { + "epoch": 0.55, + "grad_norm": 8.264526568674992, + "learning_rate": 9.697363813418414e-06, + "loss": 0.5029, + "step": 774 + }, + { + "epoch": 0.55, + "grad_norm": 28.02052415350964, + "learning_rate": 9.696372795275766e-06, + "loss": 0.6436, + "step": 775 + }, + { + "epoch": 0.55, + "grad_norm": 22.973871491555215, + "learning_rate": 9.695380207987675e-06, + "loss": 0.5459, + "step": 776 + }, + { + "epoch": 0.55, + "grad_norm": 22.86541152325245, + "learning_rate": 9.69438605188578e-06, + "loss": 0.6797, + "step": 777 + }, + { + "epoch": 0.56, + "grad_norm": 13.430529034939772, + "learning_rate": 9.69339032730225e-06, + "loss": 0.5947, + "step": 778 + }, + { + "epoch": 0.56, + "grad_norm": 21.136839582171906, + "learning_rate": 9.692393034569776e-06, + "loss": 0.5117, + "step": 779 + }, + { + "epoch": 0.56, + "grad_norm": 11.567653375487124, + "learning_rate": 9.69139417402157e-06, + "loss": 0.4937, + "step": 780 + }, + { + "epoch": 0.56, + "grad_norm": 38.10236907533684, + "learning_rate": 9.690393745991368e-06, + "loss": 0.6172, + "step": 781 + }, + { + "epoch": 0.56, + "grad_norm": 40.793834029889545, + "learning_rate": 9.689391750813436e-06, + "loss": 0.8066, + "step": 782 + }, + { + "epoch": 0.56, + "grad_norm": 24.16046506654703, + "learning_rate": 9.688388188822556e-06, + "loss": 0.7109, + "step": 783 + }, + { + "epoch": 0.56, + "grad_norm": 16.209686684667055, + "learning_rate": 9.687383060354038e-06, + "loss": 0.623, + "step": 784 + }, + { + "epoch": 0.56, + "grad_norm": 7.029930794357556, + "learning_rate": 9.686376365743714e-06, + "loss": 0.5557, + "step": 785 + }, + { + "epoch": 0.56, + "grad_norm": 5.702750453811597, + "learning_rate": 9.685368105327938e-06, + "loss": 0.5537, + "step": 786 + }, + { + "epoch": 0.56, + "grad_norm": 23.2055662093521, + "learning_rate": 9.684358279443593e-06, + "loss": 0.5771, + "step": 787 + }, + { + "epoch": 0.56, + "grad_norm": 7.310900740988059, + "learning_rate": 9.683346888428074e-06, + "loss": 0.4946, + "step": 788 + }, + { + "epoch": 0.56, + "grad_norm": 10.032483835960253, + "learning_rate": 9.68233393261931e-06, + "loss": 0.5244, + "step": 789 + }, + { + "epoch": 0.56, + "grad_norm": 5.215648584728573, + "learning_rate": 9.681319412355748e-06, + "loss": 0.5078, + "step": 790 + }, + { + "epoch": 0.56, + "grad_norm": 8.062322777960844, + "learning_rate": 9.680303327976356e-06, + "loss": 0.5327, + "step": 791 + }, + { + "epoch": 0.57, + "grad_norm": 6.167802022767416, + "learning_rate": 9.679285679820628e-06, + "loss": 0.501, + "step": 792 + }, + { + "epoch": 0.57, + "grad_norm": 8.46849010922445, + "learning_rate": 9.67826646822858e-06, + "loss": 0.5239, + "step": 793 + }, + { + "epoch": 0.57, + "grad_norm": 16.653596919202503, + "learning_rate": 9.677245693540749e-06, + "loss": 0.5176, + "step": 794 + }, + { + "epoch": 0.57, + "grad_norm": 7.021405696985937, + "learning_rate": 9.676223356098194e-06, + "loss": 0.4546, + "step": 795 + }, + { + "epoch": 0.57, + "grad_norm": 13.864820200236489, + "learning_rate": 9.675199456242499e-06, + "loss": 0.5151, + "step": 796 + }, + { + "epoch": 0.57, + "grad_norm": 9.137355785760509, + "learning_rate": 9.674173994315764e-06, + "loss": 0.6768, + "step": 797 + }, + { + "epoch": 0.57, + "grad_norm": 8.117836656138127, + "learning_rate": 9.67314697066062e-06, + "loss": 0.542, + "step": 798 + }, + { + "epoch": 0.57, + "grad_norm": 7.298051983963956, + "learning_rate": 9.672118385620209e-06, + "loss": 0.4927, + "step": 799 + }, + { + "epoch": 0.57, + "grad_norm": 24.163165582397156, + "learning_rate": 9.671088239538204e-06, + "loss": 0.7588, + "step": 800 + }, + { + "epoch": 0.57, + "grad_norm": 9.215200611680789, + "learning_rate": 9.670056532758798e-06, + "loss": 0.5474, + "step": 801 + }, + { + "epoch": 0.57, + "grad_norm": 10.42334654362743, + "learning_rate": 9.669023265626698e-06, + "loss": 0.6289, + "step": 802 + }, + { + "epoch": 0.57, + "grad_norm": 14.141287911310176, + "learning_rate": 9.66798843848714e-06, + "loss": 0.4873, + "step": 803 + }, + { + "epoch": 0.57, + "grad_norm": 14.552342787875665, + "learning_rate": 9.666952051685882e-06, + "loss": 0.4731, + "step": 804 + }, + { + "epoch": 0.57, + "grad_norm": 9.755001186494166, + "learning_rate": 9.665914105569196e-06, + "loss": 0.5591, + "step": 805 + }, + { + "epoch": 0.58, + "grad_norm": 11.425217131212628, + "learning_rate": 9.664874600483883e-06, + "loss": 0.5239, + "step": 806 + }, + { + "epoch": 0.58, + "grad_norm": 13.638565826764706, + "learning_rate": 9.663833536777256e-06, + "loss": 0.5005, + "step": 807 + }, + { + "epoch": 0.58, + "grad_norm": 8.967345411360103, + "learning_rate": 9.662790914797158e-06, + "loss": 0.6133, + "step": 808 + }, + { + "epoch": 0.58, + "grad_norm": 16.774853818643642, + "learning_rate": 9.661746734891947e-06, + "loss": 0.5449, + "step": 809 + }, + { + "epoch": 0.58, + "grad_norm": 12.358167390535703, + "learning_rate": 9.6607009974105e-06, + "loss": 0.5386, + "step": 810 + }, + { + "epoch": 0.58, + "grad_norm": 7.443902186847719, + "learning_rate": 9.659653702702223e-06, + "loss": 0.519, + "step": 811 + }, + { + "epoch": 0.58, + "grad_norm": 20.581607014220506, + "learning_rate": 9.658604851117032e-06, + "loss": 0.6064, + "step": 812 + }, + { + "epoch": 0.58, + "grad_norm": 20.10227881796404, + "learning_rate": 9.65755444300537e-06, + "loss": 0.5649, + "step": 813 + }, + { + "epoch": 0.58, + "grad_norm": 13.936458119810688, + "learning_rate": 9.656502478718197e-06, + "loss": 0.5459, + "step": 814 + }, + { + "epoch": 0.58, + "grad_norm": 19.720426589157082, + "learning_rate": 9.655448958606994e-06, + "loss": 0.6309, + "step": 815 + }, + { + "epoch": 0.58, + "grad_norm": 9.062061943723108, + "learning_rate": 9.654393883023763e-06, + "loss": 0.5693, + "step": 816 + }, + { + "epoch": 0.58, + "grad_norm": 11.539927314427118, + "learning_rate": 9.653337252321023e-06, + "loss": 0.584, + "step": 817 + }, + { + "epoch": 0.58, + "grad_norm": 20.01962400655284, + "learning_rate": 9.652279066851811e-06, + "loss": 0.5146, + "step": 818 + }, + { + "epoch": 0.58, + "grad_norm": 6.278364471511755, + "learning_rate": 9.651219326969694e-06, + "loss": 0.4888, + "step": 819 + }, + { + "epoch": 0.59, + "grad_norm": 11.862188944826272, + "learning_rate": 9.650158033028743e-06, + "loss": 0.5386, + "step": 820 + }, + { + "epoch": 0.59, + "grad_norm": 9.55019661530497, + "learning_rate": 9.64909518538356e-06, + "loss": 0.4985, + "step": 821 + }, + { + "epoch": 0.59, + "grad_norm": 6.736899314710898, + "learning_rate": 9.648030784389264e-06, + "loss": 0.5303, + "step": 822 + }, + { + "epoch": 0.59, + "grad_norm": 5.366946707310364, + "learning_rate": 9.646964830401487e-06, + "loss": 0.6001, + "step": 823 + }, + { + "epoch": 0.59, + "grad_norm": 10.13088503009922, + "learning_rate": 9.645897323776386e-06, + "loss": 0.5889, + "step": 824 + }, + { + "epoch": 0.59, + "grad_norm": 14.30858943174471, + "learning_rate": 9.644828264870634e-06, + "loss": 0.5371, + "step": 825 + }, + { + "epoch": 0.59, + "grad_norm": 12.315173886504947, + "learning_rate": 9.643757654041423e-06, + "loss": 0.5508, + "step": 826 + }, + { + "epoch": 0.59, + "grad_norm": 6.679272046818679, + "learning_rate": 9.642685491646467e-06, + "loss": 0.481, + "step": 827 + }, + { + "epoch": 0.59, + "grad_norm": 7.93076938329635, + "learning_rate": 9.641611778043992e-06, + "loss": 0.5361, + "step": 828 + }, + { + "epoch": 0.59, + "grad_norm": 11.131118051844421, + "learning_rate": 9.64053651359275e-06, + "loss": 0.4561, + "step": 829 + }, + { + "epoch": 0.59, + "grad_norm": 16.845626382905177, + "learning_rate": 9.639459698652e-06, + "loss": 0.5811, + "step": 830 + }, + { + "epoch": 0.59, + "grad_norm": 7.452226118644797, + "learning_rate": 9.63838133358153e-06, + "loss": 0.5049, + "step": 831 + }, + { + "epoch": 0.59, + "grad_norm": 13.715195201767449, + "learning_rate": 9.637301418741643e-06, + "loss": 0.5732, + "step": 832 + }, + { + "epoch": 0.59, + "grad_norm": 5.948105431930284, + "learning_rate": 9.636219954493157e-06, + "loss": 0.5264, + "step": 833 + }, + { + "epoch": 0.6, + "grad_norm": 6.493455666198669, + "learning_rate": 9.635136941197409e-06, + "loss": 0.5498, + "step": 834 + }, + { + "epoch": 0.6, + "grad_norm": 6.154843832131087, + "learning_rate": 9.634052379216256e-06, + "loss": 0.5889, + "step": 835 + }, + { + "epoch": 0.6, + "grad_norm": 5.9733407722182035, + "learning_rate": 9.632966268912067e-06, + "loss": 0.5796, + "step": 836 + }, + { + "epoch": 0.6, + "grad_norm": 13.393897880638866, + "learning_rate": 9.631878610647734e-06, + "loss": 0.5762, + "step": 837 + }, + { + "epoch": 0.6, + "grad_norm": 9.346535492939477, + "learning_rate": 9.630789404786664e-06, + "loss": 0.5845, + "step": 838 + }, + { + "epoch": 0.6, + "grad_norm": 7.414814582454439, + "learning_rate": 9.629698651692779e-06, + "loss": 0.519, + "step": 839 + }, + { + "epoch": 0.6, + "grad_norm": 7.027989103111617, + "learning_rate": 9.62860635173052e-06, + "loss": 0.5229, + "step": 840 + }, + { + "epoch": 0.6, + "grad_norm": 9.691852570057074, + "learning_rate": 9.627512505264847e-06, + "loss": 0.54, + "step": 841 + }, + { + "epoch": 0.6, + "grad_norm": 13.672266244948228, + "learning_rate": 9.626417112661233e-06, + "loss": 0.501, + "step": 842 + }, + { + "epoch": 0.6, + "grad_norm": 14.046157418465752, + "learning_rate": 9.62532017428567e-06, + "loss": 0.5464, + "step": 843 + }, + { + "epoch": 0.6, + "grad_norm": 5.894528680541082, + "learning_rate": 9.624221690504663e-06, + "loss": 0.5454, + "step": 844 + }, + { + "epoch": 0.6, + "grad_norm": 19.359046304477772, + "learning_rate": 9.623121661685239e-06, + "loss": 0.5347, + "step": 845 + }, + { + "epoch": 0.6, + "grad_norm": 9.931630197988305, + "learning_rate": 9.622020088194934e-06, + "loss": 0.5518, + "step": 846 + }, + { + "epoch": 0.6, + "grad_norm": 7.126175546600982, + "learning_rate": 9.62091697040181e-06, + "loss": 0.5073, + "step": 847 + }, + { + "epoch": 0.61, + "grad_norm": 6.424497735342813, + "learning_rate": 9.619812308674434e-06, + "loss": 0.5981, + "step": 848 + }, + { + "epoch": 0.61, + "grad_norm": 13.429709124263976, + "learning_rate": 9.618706103381896e-06, + "loss": 0.5327, + "step": 849 + }, + { + "epoch": 0.61, + "grad_norm": 6.370766522473178, + "learning_rate": 9.6175983548938e-06, + "loss": 0.5356, + "step": 850 + }, + { + "epoch": 0.61, + "grad_norm": 7.936351360877747, + "learning_rate": 9.616489063580265e-06, + "loss": 0.6279, + "step": 851 + }, + { + "epoch": 0.61, + "grad_norm": 13.565406309781352, + "learning_rate": 9.615378229811927e-06, + "loss": 0.498, + "step": 852 + }, + { + "epoch": 0.61, + "grad_norm": 14.85975243185665, + "learning_rate": 9.614265853959935e-06, + "loss": 0.5449, + "step": 853 + }, + { + "epoch": 0.61, + "grad_norm": 9.376637510131717, + "learning_rate": 9.613151936395952e-06, + "loss": 0.6953, + "step": 854 + }, + { + "epoch": 0.61, + "grad_norm": 5.639153075645197, + "learning_rate": 9.612036477492163e-06, + "loss": 0.5469, + "step": 855 + }, + { + "epoch": 0.61, + "grad_norm": 23.55427631393824, + "learning_rate": 9.610919477621262e-06, + "loss": 0.5225, + "step": 856 + }, + { + "epoch": 0.61, + "grad_norm": 16.272759938860226, + "learning_rate": 9.609800937156459e-06, + "loss": 0.6147, + "step": 857 + }, + { + "epoch": 0.61, + "grad_norm": 5.7353396751586585, + "learning_rate": 9.60868085647148e-06, + "loss": 0.6284, + "step": 858 + }, + { + "epoch": 0.61, + "grad_norm": 8.677161643210795, + "learning_rate": 9.607559235940562e-06, + "loss": 0.4409, + "step": 859 + }, + { + "epoch": 0.61, + "grad_norm": 5.674513654587285, + "learning_rate": 9.60643607593846e-06, + "loss": 0.5229, + "step": 860 + }, + { + "epoch": 0.61, + "grad_norm": 5.208902038096937, + "learning_rate": 9.605311376840446e-06, + "loss": 0.4419, + "step": 861 + }, + { + "epoch": 0.62, + "grad_norm": 7.200913020665042, + "learning_rate": 9.604185139022302e-06, + "loss": 0.5039, + "step": 862 + }, + { + "epoch": 0.62, + "grad_norm": 13.88962045733014, + "learning_rate": 9.603057362860323e-06, + "loss": 0.6357, + "step": 863 + }, + { + "epoch": 0.62, + "grad_norm": 15.58000300790741, + "learning_rate": 9.60192804873132e-06, + "loss": 0.4673, + "step": 864 + }, + { + "epoch": 0.62, + "grad_norm": 5.975171350200098, + "learning_rate": 9.60079719701262e-06, + "loss": 0.5605, + "step": 865 + }, + { + "epoch": 0.62, + "grad_norm": 5.2559604366098025, + "learning_rate": 9.599664808082058e-06, + "loss": 0.5229, + "step": 866 + }, + { + "epoch": 0.62, + "grad_norm": 30.607460221946777, + "learning_rate": 9.598530882317992e-06, + "loss": 0.7324, + "step": 867 + }, + { + "epoch": 0.62, + "grad_norm": 16.12508471498375, + "learning_rate": 9.59739542009928e-06, + "loss": 0.6279, + "step": 868 + }, + { + "epoch": 0.62, + "grad_norm": 7.213276965176858, + "learning_rate": 9.596258421805306e-06, + "loss": 0.54, + "step": 869 + }, + { + "epoch": 0.62, + "grad_norm": 8.939321698210003, + "learning_rate": 9.595119887815962e-06, + "loss": 0.5654, + "step": 870 + }, + { + "epoch": 0.62, + "grad_norm": 7.54597463715274, + "learning_rate": 9.593979818511655e-06, + "loss": 0.5391, + "step": 871 + }, + { + "epoch": 0.62, + "grad_norm": 8.796494601908567, + "learning_rate": 9.592838214273298e-06, + "loss": 0.6221, + "step": 872 + }, + { + "epoch": 0.62, + "grad_norm": 16.350816902914385, + "learning_rate": 9.591695075482326e-06, + "loss": 0.5195, + "step": 873 + }, + { + "epoch": 0.62, + "grad_norm": 16.356102942266837, + "learning_rate": 9.590550402520683e-06, + "loss": 0.5361, + "step": 874 + }, + { + "epoch": 0.62, + "grad_norm": 4.949881570510586, + "learning_rate": 9.589404195770821e-06, + "loss": 0.5151, + "step": 875 + }, + { + "epoch": 0.63, + "grad_norm": 7.926146321546089, + "learning_rate": 9.588256455615716e-06, + "loss": 0.5801, + "step": 876 + }, + { + "epoch": 0.63, + "grad_norm": 6.859921527227033, + "learning_rate": 9.587107182438846e-06, + "loss": 0.5, + "step": 877 + }, + { + "epoch": 0.63, + "grad_norm": 11.050613799522687, + "learning_rate": 9.585956376624204e-06, + "loss": 0.5527, + "step": 878 + }, + { + "epoch": 0.63, + "grad_norm": 14.002907445909361, + "learning_rate": 9.584804038556297e-06, + "loss": 0.6289, + "step": 879 + }, + { + "epoch": 0.63, + "grad_norm": 6.0542998877576375, + "learning_rate": 9.58365016862014e-06, + "loss": 0.4893, + "step": 880 + }, + { + "epoch": 0.63, + "grad_norm": 16.56578174324824, + "learning_rate": 9.582494767201265e-06, + "loss": 0.5493, + "step": 881 + }, + { + "epoch": 0.63, + "grad_norm": 5.301104366990973, + "learning_rate": 9.581337834685713e-06, + "loss": 0.4365, + "step": 882 + }, + { + "epoch": 0.63, + "grad_norm": 21.230580902888516, + "learning_rate": 9.580179371460034e-06, + "loss": 0.5376, + "step": 883 + }, + { + "epoch": 0.63, + "grad_norm": 9.115363175931497, + "learning_rate": 9.579019377911296e-06, + "loss": 0.5518, + "step": 884 + }, + { + "epoch": 0.63, + "grad_norm": 8.063043768232241, + "learning_rate": 9.57785785442707e-06, + "loss": 0.5142, + "step": 885 + }, + { + "epoch": 0.63, + "grad_norm": 8.200092331472336, + "learning_rate": 9.576694801395447e-06, + "loss": 0.4912, + "step": 886 + }, + { + "epoch": 0.63, + "grad_norm": 16.048503793582384, + "learning_rate": 9.57553021920502e-06, + "loss": 0.708, + "step": 887 + }, + { + "epoch": 0.63, + "grad_norm": 7.183349614718361, + "learning_rate": 9.574364108244903e-06, + "loss": 0.459, + "step": 888 + }, + { + "epoch": 0.63, + "grad_norm": 7.786011428361001, + "learning_rate": 9.573196468904711e-06, + "loss": 0.5586, + "step": 889 + }, + { + "epoch": 0.64, + "grad_norm": 12.666551982517106, + "learning_rate": 9.572027301574576e-06, + "loss": 0.6201, + "step": 890 + }, + { + "epoch": 0.64, + "grad_norm": 9.989372858349427, + "learning_rate": 9.570856606645139e-06, + "loss": 0.543, + "step": 891 + }, + { + "epoch": 0.64, + "grad_norm": 6.978420031263653, + "learning_rate": 9.569684384507547e-06, + "loss": 0.4585, + "step": 892 + }, + { + "epoch": 0.64, + "grad_norm": 6.494441221850212, + "learning_rate": 9.568510635553466e-06, + "loss": 0.5176, + "step": 893 + }, + { + "epoch": 0.64, + "grad_norm": 7.496868914068109, + "learning_rate": 9.567335360175065e-06, + "loss": 0.5283, + "step": 894 + }, + { + "epoch": 0.64, + "grad_norm": 19.82256439758004, + "learning_rate": 9.566158558765026e-06, + "loss": 0.6777, + "step": 895 + }, + { + "epoch": 0.64, + "grad_norm": 8.172352317808183, + "learning_rate": 9.564980231716541e-06, + "loss": 0.5371, + "step": 896 + }, + { + "epoch": 0.64, + "grad_norm": 7.641915716101109, + "learning_rate": 9.56380037942331e-06, + "loss": 0.5103, + "step": 897 + }, + { + "epoch": 0.64, + "grad_norm": 10.279754257232414, + "learning_rate": 9.562619002279541e-06, + "loss": 0.5737, + "step": 898 + }, + { + "epoch": 0.64, + "grad_norm": 10.850853767539375, + "learning_rate": 9.561436100679959e-06, + "loss": 0.5244, + "step": 899 + }, + { + "epoch": 0.64, + "grad_norm": 7.349447369826582, + "learning_rate": 9.56025167501979e-06, + "loss": 0.4907, + "step": 900 + }, + { + "epoch": 0.64, + "grad_norm": 15.819552722595972, + "learning_rate": 9.559065725694775e-06, + "loss": 0.6021, + "step": 901 + }, + { + "epoch": 0.64, + "grad_norm": 18.09721976041659, + "learning_rate": 9.55787825310116e-06, + "loss": 0.5762, + "step": 902 + }, + { + "epoch": 0.64, + "grad_norm": 6.611541858548501, + "learning_rate": 9.5566892576357e-06, + "loss": 0.4741, + "step": 903 + }, + { + "epoch": 0.65, + "grad_norm": 11.17797250965005, + "learning_rate": 9.555498739695665e-06, + "loss": 0.6016, + "step": 904 + }, + { + "epoch": 0.65, + "grad_norm": 15.107430166339677, + "learning_rate": 9.554306699678827e-06, + "loss": 0.5166, + "step": 905 + }, + { + "epoch": 0.65, + "grad_norm": 7.571816321971257, + "learning_rate": 9.553113137983467e-06, + "loss": 0.5151, + "step": 906 + }, + { + "epoch": 0.65, + "grad_norm": 7.8343904045849095, + "learning_rate": 9.551918055008378e-06, + "loss": 0.5376, + "step": 907 + }, + { + "epoch": 0.65, + "grad_norm": 11.669597503208232, + "learning_rate": 9.55072145115286e-06, + "loss": 0.5171, + "step": 908 + }, + { + "epoch": 0.65, + "grad_norm": 9.787777933655681, + "learning_rate": 9.54952332681672e-06, + "loss": 0.5527, + "step": 909 + }, + { + "epoch": 0.65, + "grad_norm": 7.9497521719036, + "learning_rate": 9.54832368240027e-06, + "loss": 0.4326, + "step": 910 + }, + { + "epoch": 0.65, + "grad_norm": 10.51362496368161, + "learning_rate": 9.54712251830434e-06, + "loss": 0.4834, + "step": 911 + }, + { + "epoch": 0.65, + "grad_norm": 17.840346790600776, + "learning_rate": 9.545919834930257e-06, + "loss": 0.4854, + "step": 912 + }, + { + "epoch": 0.65, + "grad_norm": 13.48508364031533, + "learning_rate": 9.54471563267986e-06, + "loss": 0.6523, + "step": 913 + }, + { + "epoch": 0.65, + "grad_norm": 12.107481431790621, + "learning_rate": 9.543509911955497e-06, + "loss": 0.5186, + "step": 914 + }, + { + "epoch": 0.65, + "grad_norm": 9.554933322462405, + "learning_rate": 9.542302673160021e-06, + "loss": 0.5552, + "step": 915 + }, + { + "epoch": 0.65, + "grad_norm": 9.359795062769347, + "learning_rate": 9.541093916696793e-06, + "loss": 0.6514, + "step": 916 + }, + { + "epoch": 0.65, + "grad_norm": 7.933210596647848, + "learning_rate": 9.539883642969681e-06, + "loss": 0.4824, + "step": 917 + }, + { + "epoch": 0.66, + "grad_norm": 12.030646602797862, + "learning_rate": 9.53867185238306e-06, + "loss": 0.5791, + "step": 918 + }, + { + "epoch": 0.66, + "grad_norm": 10.531996572790305, + "learning_rate": 9.53745854534181e-06, + "loss": 0.542, + "step": 919 + }, + { + "epoch": 0.66, + "grad_norm": 18.737722994826893, + "learning_rate": 9.536243722251321e-06, + "loss": 0.6357, + "step": 920 + }, + { + "epoch": 0.66, + "grad_norm": 11.176392594106526, + "learning_rate": 9.53502738351749e-06, + "loss": 0.5479, + "step": 921 + }, + { + "epoch": 0.66, + "grad_norm": 9.791505085435352, + "learning_rate": 9.533809529546716e-06, + "loss": 0.5146, + "step": 922 + }, + { + "epoch": 0.66, + "grad_norm": 11.194713334314685, + "learning_rate": 9.532590160745906e-06, + "loss": 0.5542, + "step": 923 + }, + { + "epoch": 0.66, + "grad_norm": 21.304141717636043, + "learning_rate": 9.531369277522475e-06, + "loss": 0.4966, + "step": 924 + }, + { + "epoch": 0.66, + "grad_norm": 8.781951810922592, + "learning_rate": 9.530146880284343e-06, + "loss": 0.5664, + "step": 925 + }, + { + "epoch": 0.66, + "grad_norm": 11.045579974558065, + "learning_rate": 9.528922969439935e-06, + "loss": 0.5073, + "step": 926 + }, + { + "epoch": 0.66, + "grad_norm": 9.68357233374754, + "learning_rate": 9.527697545398183e-06, + "loss": 0.5542, + "step": 927 + }, + { + "epoch": 0.66, + "grad_norm": 15.122021239823766, + "learning_rate": 9.526470608568521e-06, + "loss": 0.5415, + "step": 928 + }, + { + "epoch": 0.66, + "grad_norm": 6.454918266617198, + "learning_rate": 9.525242159360897e-06, + "loss": 0.5361, + "step": 929 + }, + { + "epoch": 0.66, + "grad_norm": 8.493825899265097, + "learning_rate": 9.524012198185755e-06, + "loss": 0.7109, + "step": 930 + }, + { + "epoch": 0.66, + "grad_norm": 5.636191715860241, + "learning_rate": 9.522780725454048e-06, + "loss": 0.5635, + "step": 931 + }, + { + "epoch": 0.67, + "grad_norm": 8.967666658359168, + "learning_rate": 9.521547741577232e-06, + "loss": 0.5757, + "step": 932 + }, + { + "epoch": 0.67, + "grad_norm": 5.728302127565175, + "learning_rate": 9.520313246967277e-06, + "loss": 0.6177, + "step": 933 + }, + { + "epoch": 0.67, + "grad_norm": 9.518191584124642, + "learning_rate": 9.519077242036643e-06, + "loss": 0.6094, + "step": 934 + }, + { + "epoch": 0.67, + "grad_norm": 7.997331318486447, + "learning_rate": 9.517839727198306e-06, + "loss": 0.5352, + "step": 935 + }, + { + "epoch": 0.67, + "grad_norm": 9.08940793882954, + "learning_rate": 9.516600702865742e-06, + "loss": 0.5396, + "step": 936 + }, + { + "epoch": 0.67, + "grad_norm": 6.3323126968670005, + "learning_rate": 9.51536016945293e-06, + "loss": 0.5122, + "step": 937 + }, + { + "epoch": 0.67, + "grad_norm": 7.266058320003383, + "learning_rate": 9.514118127374358e-06, + "loss": 0.5044, + "step": 938 + }, + { + "epoch": 0.67, + "grad_norm": 7.08730151307726, + "learning_rate": 9.512874577045016e-06, + "loss": 0.5518, + "step": 939 + }, + { + "epoch": 0.67, + "grad_norm": 13.676561438718075, + "learning_rate": 9.511629518880394e-06, + "loss": 0.5298, + "step": 940 + }, + { + "epoch": 0.67, + "grad_norm": 17.411348317275475, + "learning_rate": 9.510382953296492e-06, + "loss": 0.4536, + "step": 941 + }, + { + "epoch": 0.67, + "grad_norm": 44.62122780743628, + "learning_rate": 9.50913488070981e-06, + "loss": 0.6172, + "step": 942 + }, + { + "epoch": 0.67, + "grad_norm": 228.46722688523354, + "learning_rate": 9.50788530153735e-06, + "loss": 0.8828, + "step": 943 + }, + { + "epoch": 0.67, + "grad_norm": 109.64473438367058, + "learning_rate": 9.506634216196621e-06, + "loss": 0.9453, + "step": 944 + }, + { + "epoch": 0.67, + "grad_norm": 78.65828191559729, + "learning_rate": 9.505381625105636e-06, + "loss": 0.9453, + "step": 945 + }, + { + "epoch": 0.68, + "grad_norm": 51.832204433461676, + "learning_rate": 9.504127528682907e-06, + "loss": 0.6997, + "step": 946 + }, + { + "epoch": 0.68, + "grad_norm": 31.13063318378493, + "learning_rate": 9.502871927347452e-06, + "loss": 0.6172, + "step": 947 + }, + { + "epoch": 0.68, + "grad_norm": 14.291137372507908, + "learning_rate": 9.501614821518789e-06, + "loss": 0.5537, + "step": 948 + }, + { + "epoch": 0.68, + "grad_norm": 23.46365835435606, + "learning_rate": 9.500356211616941e-06, + "loss": 0.6475, + "step": 949 + }, + { + "epoch": 0.68, + "grad_norm": 32.29740862656474, + "learning_rate": 9.499096098062435e-06, + "loss": 0.6709, + "step": 950 + }, + { + "epoch": 0.68, + "grad_norm": 63.097784255629676, + "learning_rate": 9.497834481276293e-06, + "loss": 0.6182, + "step": 951 + }, + { + "epoch": 0.68, + "grad_norm": 23.902799208094248, + "learning_rate": 9.496571361680052e-06, + "loss": 0.5908, + "step": 952 + }, + { + "epoch": 0.68, + "grad_norm": 16.627462349415925, + "learning_rate": 9.495306739695738e-06, + "loss": 0.6055, + "step": 953 + }, + { + "epoch": 0.68, + "grad_norm": 8.838239735724807, + "learning_rate": 9.494040615745887e-06, + "loss": 0.5063, + "step": 954 + }, + { + "epoch": 0.68, + "grad_norm": 37.579178855565374, + "learning_rate": 9.492772990253535e-06, + "loss": 0.6553, + "step": 955 + }, + { + "epoch": 0.68, + "grad_norm": 12.534070316127274, + "learning_rate": 9.49150386364222e-06, + "loss": 0.5283, + "step": 956 + }, + { + "epoch": 0.68, + "grad_norm": 18.81207673614865, + "learning_rate": 9.490233236335977e-06, + "loss": 0.5747, + "step": 957 + }, + { + "epoch": 0.68, + "grad_norm": 35.2501349954626, + "learning_rate": 9.488961108759349e-06, + "loss": 0.5806, + "step": 958 + }, + { + "epoch": 0.68, + "grad_norm": 24.779299324475687, + "learning_rate": 9.487687481337377e-06, + "loss": 0.6221, + "step": 959 + }, + { + "epoch": 0.69, + "grad_norm": 10.734399861098689, + "learning_rate": 9.486412354495605e-06, + "loss": 0.563, + "step": 960 + }, + { + "epoch": 0.69, + "grad_norm": 14.688681103139775, + "learning_rate": 9.485135728660073e-06, + "loss": 0.5747, + "step": 961 + }, + { + "epoch": 0.69, + "grad_norm": 15.39443389421036, + "learning_rate": 9.48385760425733e-06, + "loss": 0.5088, + "step": 962 + }, + { + "epoch": 0.69, + "grad_norm": 18.663199374468476, + "learning_rate": 9.482577981714417e-06, + "loss": 0.6211, + "step": 963 + }, + { + "epoch": 0.69, + "grad_norm": 12.818151185370693, + "learning_rate": 9.481296861458881e-06, + "loss": 0.5361, + "step": 964 + }, + { + "epoch": 0.69, + "grad_norm": 21.38553722751277, + "learning_rate": 9.480014243918769e-06, + "loss": 0.6396, + "step": 965 + }, + { + "epoch": 0.69, + "grad_norm": 11.402379274239996, + "learning_rate": 9.478730129522627e-06, + "loss": 0.5635, + "step": 966 + }, + { + "epoch": 0.69, + "grad_norm": 12.858863875511425, + "learning_rate": 9.477444518699501e-06, + "loss": 0.6465, + "step": 967 + }, + { + "epoch": 0.69, + "grad_norm": 15.572804809224767, + "learning_rate": 9.476157411878937e-06, + "loss": 0.5537, + "step": 968 + }, + { + "epoch": 0.69, + "grad_norm": 30.627062162450095, + "learning_rate": 9.474868809490984e-06, + "loss": 0.6309, + "step": 969 + }, + { + "epoch": 0.69, + "grad_norm": 19.284249577261523, + "learning_rate": 9.473578711966185e-06, + "loss": 0.6641, + "step": 970 + }, + { + "epoch": 0.69, + "grad_norm": 9.635613200387112, + "learning_rate": 9.472287119735588e-06, + "loss": 0.5928, + "step": 971 + }, + { + "epoch": 0.69, + "grad_norm": 16.198639315955063, + "learning_rate": 9.470994033230735e-06, + "loss": 0.521, + "step": 972 + }, + { + "epoch": 0.69, + "grad_norm": 17.11429585253634, + "learning_rate": 9.469699452883672e-06, + "loss": 0.623, + "step": 973 + }, + { + "epoch": 0.7, + "grad_norm": 58.34165469496508, + "learning_rate": 9.468403379126943e-06, + "loss": 0.6406, + "step": 974 + }, + { + "epoch": 0.7, + "grad_norm": 25.195532099505844, + "learning_rate": 9.46710581239359e-06, + "loss": 0.6553, + "step": 975 + }, + { + "epoch": 0.7, + "grad_norm": 23.180951301910945, + "learning_rate": 9.465806753117153e-06, + "loss": 0.6494, + "step": 976 + }, + { + "epoch": 0.7, + "grad_norm": 14.047197342311845, + "learning_rate": 9.464506201731674e-06, + "loss": 0.6182, + "step": 977 + }, + { + "epoch": 0.7, + "grad_norm": 19.207255751064135, + "learning_rate": 9.463204158671687e-06, + "loss": 0.561, + "step": 978 + }, + { + "epoch": 0.7, + "grad_norm": 14.684448206775564, + "learning_rate": 9.461900624372233e-06, + "loss": 0.6177, + "step": 979 + }, + { + "epoch": 0.7, + "grad_norm": 20.50000538321963, + "learning_rate": 9.460595599268848e-06, + "loss": 0.6416, + "step": 980 + }, + { + "epoch": 0.7, + "grad_norm": 14.660593164014767, + "learning_rate": 9.45928908379756e-06, + "loss": 0.5166, + "step": 981 + }, + { + "epoch": 0.7, + "grad_norm": 21.206357340067083, + "learning_rate": 9.457981078394905e-06, + "loss": 0.6357, + "step": 982 + }, + { + "epoch": 0.7, + "grad_norm": 11.40715091937378, + "learning_rate": 9.45667158349791e-06, + "loss": 0.5601, + "step": 983 + }, + { + "epoch": 0.7, + "grad_norm": 36.68503954323254, + "learning_rate": 9.4553605995441e-06, + "loss": 0.6162, + "step": 984 + }, + { + "epoch": 0.7, + "grad_norm": 15.636828128912919, + "learning_rate": 9.4540481269715e-06, + "loss": 0.6436, + "step": 985 + }, + { + "epoch": 0.7, + "grad_norm": 18.970530099676225, + "learning_rate": 9.452734166218635e-06, + "loss": 0.6279, + "step": 986 + }, + { + "epoch": 0.7, + "grad_norm": 11.10424986097931, + "learning_rate": 9.451418717724518e-06, + "loss": 0.5771, + "step": 987 + }, + { + "epoch": 0.71, + "grad_norm": 19.04133879032038, + "learning_rate": 9.45010178192867e-06, + "loss": 0.5747, + "step": 988 + }, + { + "epoch": 0.71, + "grad_norm": 11.337405957274404, + "learning_rate": 9.448783359271102e-06, + "loss": 0.6104, + "step": 989 + }, + { + "epoch": 0.71, + "grad_norm": 13.704100149151028, + "learning_rate": 9.44746345019232e-06, + "loss": 0.5547, + "step": 990 + }, + { + "epoch": 0.71, + "grad_norm": 23.243738749349486, + "learning_rate": 9.446142055133333e-06, + "loss": 0.7139, + "step": 991 + }, + { + "epoch": 0.71, + "grad_norm": 23.327598910169833, + "learning_rate": 9.444819174535647e-06, + "loss": 0.7104, + "step": 992 + }, + { + "epoch": 0.71, + "grad_norm": 10.328880835372676, + "learning_rate": 9.443494808841255e-06, + "loss": 0.5815, + "step": 993 + }, + { + "epoch": 0.71, + "grad_norm": 20.53709813505537, + "learning_rate": 9.442168958492657e-06, + "loss": 0.6484, + "step": 994 + }, + { + "epoch": 0.71, + "grad_norm": 9.46766757628474, + "learning_rate": 9.44084162393284e-06, + "loss": 0.5952, + "step": 995 + }, + { + "epoch": 0.71, + "grad_norm": 15.838152911509205, + "learning_rate": 9.439512805605294e-06, + "loss": 0.5791, + "step": 996 + }, + { + "epoch": 0.71, + "grad_norm": 10.976704798054344, + "learning_rate": 9.438182503954002e-06, + "loss": 0.6211, + "step": 997 + }, + { + "epoch": 0.71, + "grad_norm": 13.574758990726995, + "learning_rate": 9.43685071942344e-06, + "loss": 0.5728, + "step": 998 + }, + { + "epoch": 0.71, + "grad_norm": 10.175433896531516, + "learning_rate": 9.435517452458584e-06, + "loss": 0.5, + "step": 999 + }, + { + "epoch": 0.71, + "grad_norm": 23.327701848585804, + "learning_rate": 9.434182703504904e-06, + "loss": 0.6045, + "step": 1000 + }, + { + "epoch": 0.71, + "eval_avg_AUC": 0.6339471196716615, + "eval_avg_Accuracy": 0.6359830901856764, + "eval_avg_Accuracy-right": 0.9985000652145559, + "eval_avg_Accuracy-wrong": 0.003866272458494428, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.5146999788919214, + "eval_last_AUC": 0.6325870765603462, + "eval_last_Accuracy": 0.6397546419098143, + "eval_last_Accuracy-right": 0.9924351115168906, + "eval_last_Accuracy-wrong": 0.024789629292699566, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.5155307886197232, + "eval_max_AUC": 0.6025879470270927, + "eval_max_Accuracy": 0.6355271883289124, + "eval_max_Accuracy-right": 0.9996087126646668, + "eval_max_Accuracy-wrong": 0.0006822833750284285, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.5068662445848651, + "eval_min_AUC": 0.6212461473800316, + "eval_min_Accuracy": 0.640376326259947, + "eval_min_Accuracy-right": 0.9918481805138907, + "eval_min_Accuracy-wrong": 0.027518762792813282, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.5093913468479994, + "eval_prod_AUC": 0.5978007803736609, + "eval_prod_Accuracy": 0.38884283819628646, + "eval_prod_Accuracy-right": 0.04995434981087779, + "eval_prod_Accuracy-wrong": 0.9797589265408233, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.5021156954591185, + "eval_runtime": 249.2562, + "eval_samples_per_second": 96.8, + "eval_steps_per_second": 3.025, + "eval_sum_AUC": 0.4675143189480929, + "eval_sum_Accuracy": 0.635651525198939, + "eval_sum_Accuracy-right": 1.0, + "eval_sum_Accuracy-wrong": 0.0003411416875142142, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.5161108860078178, + "step": 1000 + }, + { + "epoch": 0.71, + "grad_norm": 23.82364706060358, + "learning_rate": 9.432846473008363e-06, + "loss": 0.5806, + "step": 1001 + }, + { + "epoch": 0.72, + "grad_norm": 27.37121267396446, + "learning_rate": 9.431508761415422e-06, + "loss": 0.6816, + "step": 1002 + }, + { + "epoch": 0.72, + "grad_norm": 13.853525294894828, + "learning_rate": 9.430169569173034e-06, + "loss": 0.583, + "step": 1003 + }, + { + "epoch": 0.72, + "grad_norm": 15.775963932541249, + "learning_rate": 9.428828896728645e-06, + "loss": 0.5537, + "step": 1004 + }, + { + "epoch": 0.72, + "grad_norm": 9.72952999413491, + "learning_rate": 9.427486744530205e-06, + "loss": 0.5576, + "step": 1005 + }, + { + "epoch": 0.72, + "grad_norm": 20.721816684636078, + "learning_rate": 9.426143113026147e-06, + "loss": 0.6729, + "step": 1006 + }, + { + "epoch": 0.72, + "grad_norm": 12.544564691174461, + "learning_rate": 9.424798002665405e-06, + "loss": 0.666, + "step": 1007 + }, + { + "epoch": 0.72, + "grad_norm": 41.719641221253966, + "learning_rate": 9.423451413897406e-06, + "loss": 0.6758, + "step": 1008 + }, + { + "epoch": 0.72, + "grad_norm": 17.0766374680692, + "learning_rate": 9.42210334717207e-06, + "loss": 0.6533, + "step": 1009 + }, + { + "epoch": 0.72, + "grad_norm": 19.964130971436024, + "learning_rate": 9.42075380293981e-06, + "loss": 0.6338, + "step": 1010 + }, + { + "epoch": 0.72, + "grad_norm": 16.093738265546637, + "learning_rate": 9.419402781651537e-06, + "loss": 0.5977, + "step": 1011 + }, + { + "epoch": 0.72, + "grad_norm": 16.29527691970946, + "learning_rate": 9.418050283758647e-06, + "loss": 0.6133, + "step": 1012 + }, + { + "epoch": 0.72, + "grad_norm": 14.094752942831983, + "learning_rate": 9.416696309713038e-06, + "loss": 0.6084, + "step": 1013 + }, + { + "epoch": 0.72, + "grad_norm": 19.37532956112354, + "learning_rate": 9.415340859967099e-06, + "loss": 0.6182, + "step": 1014 + }, + { + "epoch": 0.72, + "grad_norm": 13.71031567566946, + "learning_rate": 9.413983934973709e-06, + "loss": 0.5996, + "step": 1015 + }, + { + "epoch": 0.73, + "grad_norm": 15.269730904625384, + "learning_rate": 9.412625535186242e-06, + "loss": 0.5771, + "step": 1016 + }, + { + "epoch": 0.73, + "grad_norm": 28.656034482685637, + "learning_rate": 9.411265661058565e-06, + "loss": 0.6465, + "step": 1017 + }, + { + "epoch": 0.73, + "grad_norm": 42.68689422622165, + "learning_rate": 9.409904313045038e-06, + "loss": 0.6816, + "step": 1018 + }, + { + "epoch": 0.73, + "grad_norm": 9.650685008374277, + "learning_rate": 9.408541491600511e-06, + "loss": 0.5723, + "step": 1019 + }, + { + "epoch": 0.73, + "grad_norm": 18.730637303407523, + "learning_rate": 9.407177197180328e-06, + "loss": 0.5962, + "step": 1020 + }, + { + "epoch": 0.73, + "grad_norm": 26.648761689749307, + "learning_rate": 9.405811430240329e-06, + "loss": 0.6924, + "step": 1021 + }, + { + "epoch": 0.73, + "grad_norm": 14.798406394080896, + "learning_rate": 9.404444191236837e-06, + "loss": 0.5791, + "step": 1022 + }, + { + "epoch": 0.73, + "grad_norm": 14.329130838909766, + "learning_rate": 9.403075480626674e-06, + "loss": 0.5547, + "step": 1023 + }, + { + "epoch": 0.73, + "grad_norm": 14.655850010883231, + "learning_rate": 9.401705298867151e-06, + "loss": 0.5957, + "step": 1024 + }, + { + "epoch": 0.73, + "grad_norm": 17.47268243968311, + "learning_rate": 9.400333646416073e-06, + "loss": 0.6396, + "step": 1025 + }, + { + "epoch": 0.73, + "grad_norm": 26.170896114421865, + "learning_rate": 9.398960523731735e-06, + "loss": 0.6133, + "step": 1026 + }, + { + "epoch": 0.73, + "grad_norm": 23.953028885048408, + "learning_rate": 9.397585931272919e-06, + "loss": 0.5537, + "step": 1027 + }, + { + "epoch": 0.73, + "grad_norm": 10.88531583467331, + "learning_rate": 9.396209869498905e-06, + "loss": 0.5498, + "step": 1028 + }, + { + "epoch": 0.73, + "grad_norm": 9.586608224218812, + "learning_rate": 9.39483233886946e-06, + "loss": 0.4663, + "step": 1029 + }, + { + "epoch": 0.74, + "grad_norm": 11.459922262332075, + "learning_rate": 9.393453339844842e-06, + "loss": 0.5684, + "step": 1030 + }, + { + "epoch": 0.74, + "grad_norm": 18.9938727234055, + "learning_rate": 9.392072872885802e-06, + "loss": 0.5859, + "step": 1031 + }, + { + "epoch": 0.74, + "grad_norm": 17.95231020980077, + "learning_rate": 9.39069093845358e-06, + "loss": 0.5669, + "step": 1032 + }, + { + "epoch": 0.74, + "grad_norm": 13.451010224992702, + "learning_rate": 9.389307537009902e-06, + "loss": 0.6187, + "step": 1033 + }, + { + "epoch": 0.74, + "grad_norm": 15.835554267305948, + "learning_rate": 9.387922669016992e-06, + "loss": 0.6094, + "step": 1034 + }, + { + "epoch": 0.74, + "grad_norm": 13.225942002568258, + "learning_rate": 9.386536334937557e-06, + "loss": 0.5293, + "step": 1035 + }, + { + "epoch": 0.74, + "grad_norm": 19.28571736427067, + "learning_rate": 9.385148535234799e-06, + "loss": 0.5771, + "step": 1036 + }, + { + "epoch": 0.74, + "grad_norm": 21.11918969171961, + "learning_rate": 9.383759270372408e-06, + "loss": 0.6396, + "step": 1037 + }, + { + "epoch": 0.74, + "grad_norm": 10.987795594520582, + "learning_rate": 9.382368540814563e-06, + "loss": 0.5493, + "step": 1038 + }, + { + "epoch": 0.74, + "grad_norm": 13.332762133791691, + "learning_rate": 9.380976347025932e-06, + "loss": 0.5781, + "step": 1039 + }, + { + "epoch": 0.74, + "grad_norm": 13.833649187371895, + "learning_rate": 9.379582689471671e-06, + "loss": 0.5078, + "step": 1040 + }, + { + "epoch": 0.74, + "grad_norm": 29.85600810006891, + "learning_rate": 9.378187568617431e-06, + "loss": 0.668, + "step": 1041 + }, + { + "epoch": 0.74, + "grad_norm": 18.86118062820929, + "learning_rate": 9.376790984929348e-06, + "loss": 0.5869, + "step": 1042 + }, + { + "epoch": 0.74, + "grad_norm": 28.051207820211953, + "learning_rate": 9.37539293887404e-06, + "loss": 0.6113, + "step": 1043 + }, + { + "epoch": 0.75, + "grad_norm": 32.11996073913673, + "learning_rate": 9.373993430918626e-06, + "loss": 0.5068, + "step": 1044 + }, + { + "epoch": 0.75, + "grad_norm": 17.79452829191096, + "learning_rate": 9.372592461530708e-06, + "loss": 0.5225, + "step": 1045 + }, + { + "epoch": 0.75, + "grad_norm": 11.681808797361166, + "learning_rate": 9.371190031178372e-06, + "loss": 0.582, + "step": 1046 + }, + { + "epoch": 0.75, + "grad_norm": 25.230590380011453, + "learning_rate": 9.369786140330198e-06, + "loss": 0.5239, + "step": 1047 + }, + { + "epoch": 0.75, + "grad_norm": 42.558740499704996, + "learning_rate": 9.368380789455251e-06, + "loss": 0.519, + "step": 1048 + }, + { + "epoch": 0.75, + "grad_norm": 13.297934374384019, + "learning_rate": 9.36697397902309e-06, + "loss": 0.5977, + "step": 1049 + }, + { + "epoch": 0.75, + "grad_norm": 12.086643595178904, + "learning_rate": 9.365565709503748e-06, + "loss": 0.5469, + "step": 1050 + }, + { + "epoch": 0.75, + "grad_norm": 10.056732327995272, + "learning_rate": 9.364155981367761e-06, + "loss": 0.4849, + "step": 1051 + }, + { + "epoch": 0.75, + "grad_norm": 8.381781397718909, + "learning_rate": 9.36274479508614e-06, + "loss": 0.519, + "step": 1052 + }, + { + "epoch": 0.75, + "grad_norm": 16.39451781815997, + "learning_rate": 9.361332151130396e-06, + "loss": 0.5249, + "step": 1053 + }, + { + "epoch": 0.75, + "grad_norm": 15.967285956708139, + "learning_rate": 9.359918049972512e-06, + "loss": 0.5493, + "step": 1054 + }, + { + "epoch": 0.75, + "grad_norm": 24.743475345880128, + "learning_rate": 9.358502492084969e-06, + "loss": 0.5654, + "step": 1055 + }, + { + "epoch": 0.75, + "grad_norm": 13.646634416378777, + "learning_rate": 9.35708547794073e-06, + "loss": 0.5703, + "step": 1056 + }, + { + "epoch": 0.75, + "grad_norm": 12.719559112795636, + "learning_rate": 9.355667008013249e-06, + "loss": 0.5825, + "step": 1057 + }, + { + "epoch": 0.76, + "grad_norm": 21.823118073564967, + "learning_rate": 9.354247082776459e-06, + "loss": 0.5981, + "step": 1058 + }, + { + "epoch": 0.76, + "grad_norm": 19.487103900742174, + "learning_rate": 9.352825702704784e-06, + "loss": 0.5532, + "step": 1059 + }, + { + "epoch": 0.76, + "grad_norm": 8.077491438600466, + "learning_rate": 9.351402868273136e-06, + "loss": 0.5513, + "step": 1060 + }, + { + "epoch": 0.76, + "grad_norm": 10.897696554119129, + "learning_rate": 9.349978579956908e-06, + "loss": 0.5938, + "step": 1061 + }, + { + "epoch": 0.76, + "grad_norm": 7.206740424557369, + "learning_rate": 9.348552838231983e-06, + "loss": 0.5547, + "step": 1062 + }, + { + "epoch": 0.76, + "grad_norm": 11.745893827810162, + "learning_rate": 9.347125643574726e-06, + "loss": 0.5117, + "step": 1063 + }, + { + "epoch": 0.76, + "grad_norm": 6.609051578212978, + "learning_rate": 9.345696996461992e-06, + "loss": 0.5918, + "step": 1064 + }, + { + "epoch": 0.76, + "grad_norm": 10.368231522386932, + "learning_rate": 9.344266897371114e-06, + "loss": 0.519, + "step": 1065 + }, + { + "epoch": 0.76, + "grad_norm": 20.399140300952528, + "learning_rate": 9.34283534677992e-06, + "loss": 0.5483, + "step": 1066 + }, + { + "epoch": 0.76, + "grad_norm": 12.568335263277644, + "learning_rate": 9.341402345166714e-06, + "loss": 0.6382, + "step": 1067 + }, + { + "epoch": 0.76, + "grad_norm": 6.28361345060809, + "learning_rate": 9.33996789301029e-06, + "loss": 0.626, + "step": 1068 + }, + { + "epoch": 0.76, + "grad_norm": 6.034588447871568, + "learning_rate": 9.338531990789926e-06, + "loss": 0.4995, + "step": 1069 + }, + { + "epoch": 0.76, + "grad_norm": 13.260306318615566, + "learning_rate": 9.33709463898538e-06, + "loss": 0.5732, + "step": 1070 + }, + { + "epoch": 0.76, + "grad_norm": 12.271709992114081, + "learning_rate": 9.335655838076902e-06, + "loss": 0.4492, + "step": 1071 + }, + { + "epoch": 0.77, + "grad_norm": 7.56028285693857, + "learning_rate": 9.33421558854522e-06, + "loss": 0.5234, + "step": 1072 + }, + { + "epoch": 0.77, + "grad_norm": 12.782154263459084, + "learning_rate": 9.332773890871548e-06, + "loss": 0.582, + "step": 1073 + }, + { + "epoch": 0.77, + "grad_norm": 8.094570074636316, + "learning_rate": 9.331330745537586e-06, + "loss": 0.5386, + "step": 1074 + }, + { + "epoch": 0.77, + "grad_norm": 20.10166588549232, + "learning_rate": 9.329886153025513e-06, + "loss": 0.5449, + "step": 1075 + }, + { + "epoch": 0.77, + "grad_norm": 6.192408971954997, + "learning_rate": 9.328440113817995e-06, + "loss": 0.5259, + "step": 1076 + }, + { + "epoch": 0.77, + "grad_norm": 12.208683818696056, + "learning_rate": 9.326992628398182e-06, + "loss": 0.5278, + "step": 1077 + }, + { + "epoch": 0.77, + "grad_norm": 17.34674029850454, + "learning_rate": 9.325543697249706e-06, + "loss": 0.6328, + "step": 1078 + }, + { + "epoch": 0.77, + "grad_norm": 11.912080486024484, + "learning_rate": 9.324093320856679e-06, + "loss": 0.5576, + "step": 1079 + }, + { + "epoch": 0.77, + "grad_norm": 8.692872107879284, + "learning_rate": 9.3226414997037e-06, + "loss": 0.5107, + "step": 1080 + }, + { + "epoch": 0.77, + "grad_norm": 12.965283753527137, + "learning_rate": 9.32118823427585e-06, + "loss": 0.6133, + "step": 1081 + }, + { + "epoch": 0.77, + "grad_norm": 9.088812339526163, + "learning_rate": 9.319733525058694e-06, + "loss": 0.5312, + "step": 1082 + }, + { + "epoch": 0.77, + "grad_norm": 8.69717168681488, + "learning_rate": 9.318277372538274e-06, + "loss": 0.5225, + "step": 1083 + }, + { + "epoch": 0.77, + "grad_norm": 6.757166076166586, + "learning_rate": 9.316819777201119e-06, + "loss": 0.5303, + "step": 1084 + }, + { + "epoch": 0.77, + "grad_norm": 14.268054362533457, + "learning_rate": 9.315360739534235e-06, + "loss": 0.6626, + "step": 1085 + }, + { + "epoch": 0.78, + "grad_norm": 12.75302943618852, + "learning_rate": 9.313900260025121e-06, + "loss": 0.4951, + "step": 1086 + }, + { + "epoch": 0.78, + "grad_norm": 13.058324474577528, + "learning_rate": 9.312438339161746e-06, + "loss": 0.5337, + "step": 1087 + }, + { + "epoch": 0.78, + "grad_norm": 9.186156422831758, + "learning_rate": 9.310974977432565e-06, + "loss": 0.5122, + "step": 1088 + }, + { + "epoch": 0.78, + "grad_norm": 19.810759557849348, + "learning_rate": 9.309510175326515e-06, + "loss": 0.5986, + "step": 1089 + }, + { + "epoch": 0.78, + "grad_norm": 7.352639000106736, + "learning_rate": 9.308043933333012e-06, + "loss": 0.54, + "step": 1090 + }, + { + "epoch": 0.78, + "grad_norm": 6.18764295150179, + "learning_rate": 9.306576251941957e-06, + "loss": 0.5889, + "step": 1091 + }, + { + "epoch": 0.78, + "grad_norm": 23.066681900530504, + "learning_rate": 9.305107131643729e-06, + "loss": 0.6221, + "step": 1092 + }, + { + "epoch": 0.78, + "grad_norm": 5.405377306247158, + "learning_rate": 9.303636572929188e-06, + "loss": 0.5723, + "step": 1093 + }, + { + "epoch": 0.78, + "grad_norm": 18.5355497270562, + "learning_rate": 9.302164576289674e-06, + "loss": 0.5957, + "step": 1094 + }, + { + "epoch": 0.78, + "grad_norm": 11.64764367778218, + "learning_rate": 9.30069114221701e-06, + "loss": 0.6143, + "step": 1095 + }, + { + "epoch": 0.78, + "grad_norm": 8.446851167336094, + "learning_rate": 9.299216271203498e-06, + "loss": 0.6006, + "step": 1096 + }, + { + "epoch": 0.78, + "grad_norm": 7.594638602419203, + "learning_rate": 9.297739963741918e-06, + "loss": 0.5596, + "step": 1097 + }, + { + "epoch": 0.78, + "grad_norm": 12.822830235951766, + "learning_rate": 9.296262220325535e-06, + "loss": 0.5557, + "step": 1098 + }, + { + "epoch": 0.78, + "grad_norm": 11.680693513430095, + "learning_rate": 9.294783041448088e-06, + "loss": 0.5522, + "step": 1099 + }, + { + "epoch": 0.79, + "grad_norm": 10.487159518778437, + "learning_rate": 9.293302427603796e-06, + "loss": 0.5518, + "step": 1100 + }, + { + "epoch": 0.79, + "grad_norm": 6.5217580901104935, + "learning_rate": 9.291820379287364e-06, + "loss": 0.5269, + "step": 1101 + }, + { + "epoch": 0.79, + "grad_norm": 10.594198953311631, + "learning_rate": 9.29033689699397e-06, + "loss": 0.5562, + "step": 1102 + }, + { + "epoch": 0.79, + "grad_norm": 5.991686017035859, + "learning_rate": 9.288851981219273e-06, + "loss": 0.5503, + "step": 1103 + }, + { + "epoch": 0.79, + "grad_norm": 10.804601642168482, + "learning_rate": 9.28736563245941e-06, + "loss": 0.5806, + "step": 1104 + }, + { + "epoch": 0.79, + "grad_norm": 12.916446670933475, + "learning_rate": 9.285877851210999e-06, + "loss": 0.5259, + "step": 1105 + }, + { + "epoch": 0.79, + "grad_norm": 9.312528006718804, + "learning_rate": 9.284388637971136e-06, + "loss": 0.563, + "step": 1106 + }, + { + "epoch": 0.79, + "grad_norm": 17.16927949883247, + "learning_rate": 9.282897993237392e-06, + "loss": 0.6025, + "step": 1107 + }, + { + "epoch": 0.79, + "grad_norm": 15.748674092022494, + "learning_rate": 9.281405917507824e-06, + "loss": 0.5723, + "step": 1108 + }, + { + "epoch": 0.79, + "grad_norm": 20.416556061970823, + "learning_rate": 9.279912411280958e-06, + "loss": 0.5889, + "step": 1109 + }, + { + "epoch": 0.79, + "grad_norm": 9.2599868177948, + "learning_rate": 9.278417475055803e-06, + "loss": 0.5654, + "step": 1110 + }, + { + "epoch": 0.79, + "grad_norm": 8.138906667061427, + "learning_rate": 9.276921109331845e-06, + "loss": 0.5127, + "step": 1111 + }, + { + "epoch": 0.79, + "grad_norm": 9.142722431903195, + "learning_rate": 9.275423314609049e-06, + "loss": 0.5771, + "step": 1112 + }, + { + "epoch": 0.79, + "grad_norm": 14.367984402900843, + "learning_rate": 9.273924091387855e-06, + "loss": 0.5132, + "step": 1113 + }, + { + "epoch": 0.8, + "grad_norm": 8.460138534290378, + "learning_rate": 9.272423440169181e-06, + "loss": 0.5537, + "step": 1114 + }, + { + "epoch": 0.8, + "grad_norm": 11.885508556319811, + "learning_rate": 9.270921361454424e-06, + "loss": 0.5698, + "step": 1115 + }, + { + "epoch": 0.8, + "grad_norm": 14.642226342988765, + "learning_rate": 9.269417855745453e-06, + "loss": 0.5547, + "step": 1116 + }, + { + "epoch": 0.8, + "grad_norm": 16.93459268058794, + "learning_rate": 9.267912923544621e-06, + "loss": 0.6357, + "step": 1117 + }, + { + "epoch": 0.8, + "grad_norm": 19.61687681956424, + "learning_rate": 9.266406565354753e-06, + "loss": 0.5869, + "step": 1118 + }, + { + "epoch": 0.8, + "grad_norm": 12.948478273276576, + "learning_rate": 9.26489878167915e-06, + "loss": 0.5884, + "step": 1119 + }, + { + "epoch": 0.8, + "grad_norm": 9.224943637865685, + "learning_rate": 9.263389573021592e-06, + "loss": 0.5391, + "step": 1120 + }, + { + "epoch": 0.8, + "grad_norm": 16.53281858669421, + "learning_rate": 9.261878939886332e-06, + "loss": 0.5391, + "step": 1121 + }, + { + "epoch": 0.8, + "grad_norm": 6.571223439001684, + "learning_rate": 9.2603668827781e-06, + "loss": 0.5869, + "step": 1122 + }, + { + "epoch": 0.8, + "grad_norm": 10.996971954221829, + "learning_rate": 9.258853402202106e-06, + "loss": 0.6143, + "step": 1123 + }, + { + "epoch": 0.8, + "grad_norm": 5.562877307679958, + "learning_rate": 9.25733849866403e-06, + "loss": 0.54, + "step": 1124 + }, + { + "epoch": 0.8, + "grad_norm": 25.681379456692, + "learning_rate": 9.255822172670028e-06, + "loss": 0.5957, + "step": 1125 + }, + { + "epoch": 0.8, + "grad_norm": 11.240913002501355, + "learning_rate": 9.254304424726734e-06, + "loss": 0.48, + "step": 1126 + }, + { + "epoch": 0.8, + "grad_norm": 5.171552813788881, + "learning_rate": 9.252785255341256e-06, + "loss": 0.5161, + "step": 1127 + }, + { + "epoch": 0.81, + "grad_norm": 10.921769965960761, + "learning_rate": 9.251264665021178e-06, + "loss": 0.5864, + "step": 1128 + }, + { + "epoch": 0.81, + "grad_norm": 6.1627582262793785, + "learning_rate": 9.249742654274554e-06, + "loss": 0.5283, + "step": 1129 + }, + { + "epoch": 0.81, + "grad_norm": 11.70067037372327, + "learning_rate": 9.24821922360992e-06, + "loss": 0.5396, + "step": 1130 + }, + { + "epoch": 0.81, + "grad_norm": 25.93057220073389, + "learning_rate": 9.246694373536277e-06, + "loss": 0.6123, + "step": 1131 + }, + { + "epoch": 0.81, + "grad_norm": 19.705923665031527, + "learning_rate": 9.245168104563112e-06, + "loss": 0.6152, + "step": 1132 + }, + { + "epoch": 0.81, + "grad_norm": 10.636045888986658, + "learning_rate": 9.243640417200376e-06, + "loss": 0.5352, + "step": 1133 + }, + { + "epoch": 0.81, + "grad_norm": 11.505774334822952, + "learning_rate": 9.242111311958502e-06, + "loss": 0.521, + "step": 1134 + }, + { + "epoch": 0.81, + "grad_norm": 7.567202760806081, + "learning_rate": 9.240580789348385e-06, + "loss": 0.6143, + "step": 1135 + }, + { + "epoch": 0.81, + "grad_norm": 11.503439518116549, + "learning_rate": 9.23904884988141e-06, + "loss": 0.5908, + "step": 1136 + }, + { + "epoch": 0.81, + "grad_norm": 15.221857770793607, + "learning_rate": 9.237515494069417e-06, + "loss": 0.5098, + "step": 1137 + }, + { + "epoch": 0.81, + "grad_norm": 18.552085086378824, + "learning_rate": 9.235980722424737e-06, + "loss": 0.5034, + "step": 1138 + }, + { + "epoch": 0.81, + "grad_norm": 16.725639961202933, + "learning_rate": 9.234444535460161e-06, + "loss": 0.4692, + "step": 1139 + }, + { + "epoch": 0.81, + "grad_norm": 11.41674940985237, + "learning_rate": 9.232906933688959e-06, + "loss": 0.5, + "step": 1140 + }, + { + "epoch": 0.81, + "grad_norm": 5.7960411684762825, + "learning_rate": 9.231367917624872e-06, + "loss": 0.5225, + "step": 1141 + }, + { + "epoch": 0.82, + "grad_norm": 12.017568946472704, + "learning_rate": 9.229827487782115e-06, + "loss": 0.5518, + "step": 1142 + }, + { + "epoch": 0.82, + "grad_norm": 31.05573818747122, + "learning_rate": 9.228285644675372e-06, + "loss": 0.7061, + "step": 1143 + }, + { + "epoch": 0.82, + "grad_norm": 14.083495042416361, + "learning_rate": 9.226742388819804e-06, + "loss": 0.5938, + "step": 1144 + }, + { + "epoch": 0.82, + "grad_norm": 7.823638652711353, + "learning_rate": 9.225197720731039e-06, + "loss": 0.606, + "step": 1145 + }, + { + "epoch": 0.82, + "grad_norm": 7.519289143758317, + "learning_rate": 9.223651640925181e-06, + "loss": 0.5732, + "step": 1146 + }, + { + "epoch": 0.82, + "grad_norm": 8.367894727037841, + "learning_rate": 9.222104149918804e-06, + "loss": 0.585, + "step": 1147 + }, + { + "epoch": 0.82, + "grad_norm": 21.655844816080055, + "learning_rate": 9.220555248228954e-06, + "loss": 0.5459, + "step": 1148 + }, + { + "epoch": 0.82, + "grad_norm": 19.43974033304336, + "learning_rate": 9.219004936373146e-06, + "loss": 0.6084, + "step": 1149 + }, + { + "epoch": 0.82, + "grad_norm": 9.847797651330511, + "learning_rate": 9.217453214869368e-06, + "loss": 0.5312, + "step": 1150 + }, + { + "epoch": 0.82, + "grad_norm": 12.437081911431463, + "learning_rate": 9.21590008423608e-06, + "loss": 0.5049, + "step": 1151 + }, + { + "epoch": 0.82, + "grad_norm": 6.470652434232925, + "learning_rate": 9.214345544992214e-06, + "loss": 0.5342, + "step": 1152 + }, + { + "epoch": 0.82, + "grad_norm": 10.034516507112599, + "learning_rate": 9.212789597657167e-06, + "loss": 0.5249, + "step": 1153 + }, + { + "epoch": 0.82, + "grad_norm": 9.99367571691055, + "learning_rate": 9.21123224275081e-06, + "loss": 0.6074, + "step": 1154 + }, + { + "epoch": 0.82, + "grad_norm": 8.864377559517447, + "learning_rate": 9.209673480793486e-06, + "loss": 0.5376, + "step": 1155 + }, + { + "epoch": 0.83, + "grad_norm": 6.346477449728334, + "learning_rate": 9.208113312306006e-06, + "loss": 0.478, + "step": 1156 + }, + { + "epoch": 0.83, + "grad_norm": 8.006532830891896, + "learning_rate": 9.206551737809653e-06, + "loss": 0.6025, + "step": 1157 + }, + { + "epoch": 0.83, + "grad_norm": 12.611614451718726, + "learning_rate": 9.204988757826173e-06, + "loss": 0.5278, + "step": 1158 + }, + { + "epoch": 0.83, + "grad_norm": 7.714149559119864, + "learning_rate": 9.203424372877791e-06, + "loss": 0.519, + "step": 1159 + }, + { + "epoch": 0.83, + "grad_norm": 10.767330554783726, + "learning_rate": 9.201858583487195e-06, + "loss": 0.5977, + "step": 1160 + }, + { + "epoch": 0.83, + "grad_norm": 6.727023867341504, + "learning_rate": 9.200291390177546e-06, + "loss": 0.4941, + "step": 1161 + }, + { + "epoch": 0.83, + "grad_norm": 12.08139287351435, + "learning_rate": 9.198722793472471e-06, + "loss": 0.4712, + "step": 1162 + }, + { + "epoch": 0.83, + "grad_norm": 19.361690233383793, + "learning_rate": 9.197152793896068e-06, + "loss": 0.5293, + "step": 1163 + }, + { + "epoch": 0.83, + "grad_norm": 12.659319120735523, + "learning_rate": 9.195581391972903e-06, + "loss": 0.6079, + "step": 1164 + }, + { + "epoch": 0.83, + "grad_norm": 14.574233768699525, + "learning_rate": 9.194008588228011e-06, + "loss": 0.5083, + "step": 1165 + }, + { + "epoch": 0.83, + "grad_norm": 12.875647206303185, + "learning_rate": 9.192434383186894e-06, + "loss": 0.5625, + "step": 1166 + }, + { + "epoch": 0.83, + "grad_norm": 10.59800269890822, + "learning_rate": 9.190858777375523e-06, + "loss": 0.5361, + "step": 1167 + }, + { + "epoch": 0.83, + "grad_norm": 8.150351861232854, + "learning_rate": 9.18928177132034e-06, + "loss": 0.5273, + "step": 1168 + }, + { + "epoch": 0.83, + "grad_norm": 15.189230279097433, + "learning_rate": 9.187703365548248e-06, + "loss": 0.5054, + "step": 1169 + }, + { + "epoch": 0.84, + "grad_norm": 16.844111824576274, + "learning_rate": 9.186123560586623e-06, + "loss": 0.7607, + "step": 1170 + }, + { + "epoch": 0.84, + "grad_norm": 10.745982102578823, + "learning_rate": 9.18454235696331e-06, + "loss": 0.5322, + "step": 1171 + }, + { + "epoch": 0.84, + "grad_norm": 9.782839753489112, + "learning_rate": 9.182959755206613e-06, + "loss": 0.5449, + "step": 1172 + }, + { + "epoch": 0.84, + "grad_norm": 10.418717825018716, + "learning_rate": 9.181375755845314e-06, + "loss": 0.5337, + "step": 1173 + }, + { + "epoch": 0.84, + "grad_norm": 6.352862983177861, + "learning_rate": 9.179790359408655e-06, + "loss": 0.5342, + "step": 1174 + }, + { + "epoch": 0.84, + "grad_norm": 8.998049401916107, + "learning_rate": 9.178203566426344e-06, + "loss": 0.5479, + "step": 1175 + }, + { + "epoch": 0.84, + "grad_norm": 5.420625471547234, + "learning_rate": 9.176615377428563e-06, + "loss": 0.5527, + "step": 1176 + }, + { + "epoch": 0.84, + "grad_norm": 16.578151600737712, + "learning_rate": 9.175025792945951e-06, + "loss": 0.5308, + "step": 1177 + }, + { + "epoch": 0.84, + "grad_norm": 12.490365222019324, + "learning_rate": 9.173434813509618e-06, + "loss": 0.4888, + "step": 1178 + }, + { + "epoch": 0.84, + "grad_norm": 5.223972990178867, + "learning_rate": 9.171842439651143e-06, + "loss": 0.5366, + "step": 1179 + }, + { + "epoch": 0.84, + "grad_norm": 4.921385835164377, + "learning_rate": 9.170248671902565e-06, + "loss": 0.4946, + "step": 1180 + }, + { + "epoch": 0.84, + "grad_norm": 16.07766786230304, + "learning_rate": 9.168653510796392e-06, + "loss": 0.7295, + "step": 1181 + }, + { + "epoch": 0.84, + "grad_norm": 14.593698266134565, + "learning_rate": 9.167056956865596e-06, + "loss": 0.5977, + "step": 1182 + }, + { + "epoch": 0.84, + "grad_norm": 16.775199062594883, + "learning_rate": 9.165459010643618e-06, + "loss": 0.5488, + "step": 1183 + }, + { + "epoch": 0.85, + "grad_norm": 7.663762375167531, + "learning_rate": 9.16385967266436e-06, + "loss": 0.5054, + "step": 1184 + }, + { + "epoch": 0.85, + "grad_norm": 5.215806260743865, + "learning_rate": 9.16225894346219e-06, + "loss": 0.5186, + "step": 1185 + }, + { + "epoch": 0.85, + "grad_norm": 12.282697942390582, + "learning_rate": 9.160656823571942e-06, + "loss": 0.5898, + "step": 1186 + }, + { + "epoch": 0.85, + "grad_norm": 17.51121958562781, + "learning_rate": 9.159053313528913e-06, + "loss": 0.5605, + "step": 1187 + }, + { + "epoch": 0.85, + "grad_norm": 17.268837485274915, + "learning_rate": 9.15744841386887e-06, + "loss": 0.5498, + "step": 1188 + }, + { + "epoch": 0.85, + "grad_norm": 10.072700613697913, + "learning_rate": 9.155842125128033e-06, + "loss": 0.5215, + "step": 1189 + }, + { + "epoch": 0.85, + "grad_norm": 11.958687423245578, + "learning_rate": 9.154234447843098e-06, + "loss": 0.5063, + "step": 1190 + }, + { + "epoch": 0.85, + "grad_norm": 6.237034674269332, + "learning_rate": 9.152625382551217e-06, + "loss": 0.4717, + "step": 1191 + }, + { + "epoch": 0.85, + "grad_norm": 20.153622241744905, + "learning_rate": 9.15101492979001e-06, + "loss": 0.5029, + "step": 1192 + }, + { + "epoch": 0.85, + "grad_norm": 21.47771817765446, + "learning_rate": 9.149403090097557e-06, + "loss": 0.5664, + "step": 1193 + }, + { + "epoch": 0.85, + "grad_norm": 9.451169764840035, + "learning_rate": 9.147789864012408e-06, + "loss": 0.6226, + "step": 1194 + }, + { + "epoch": 0.85, + "grad_norm": 10.500389478462552, + "learning_rate": 9.146175252073568e-06, + "loss": 0.5518, + "step": 1195 + }, + { + "epoch": 0.85, + "grad_norm": 9.352519920650707, + "learning_rate": 9.144559254820511e-06, + "loss": 0.4795, + "step": 1196 + }, + { + "epoch": 0.85, + "grad_norm": 7.579403203990102, + "learning_rate": 9.14294187279317e-06, + "loss": 0.5386, + "step": 1197 + }, + { + "epoch": 0.86, + "grad_norm": 16.402304104093776, + "learning_rate": 9.141323106531943e-06, + "loss": 0.5537, + "step": 1198 + }, + { + "epoch": 0.86, + "grad_norm": 11.906202099251876, + "learning_rate": 9.139702956577693e-06, + "loss": 0.5342, + "step": 1199 + }, + { + "epoch": 0.86, + "grad_norm": 6.994868615336212, + "learning_rate": 9.138081423471736e-06, + "loss": 0.5054, + "step": 1200 + }, + { + "epoch": 0.86, + "grad_norm": 10.708925086407962, + "learning_rate": 9.136458507755862e-06, + "loss": 0.5317, + "step": 1201 + }, + { + "epoch": 0.86, + "grad_norm": 22.74881444619516, + "learning_rate": 9.134834209972314e-06, + "loss": 0.5034, + "step": 1202 + }, + { + "epoch": 0.86, + "grad_norm": 10.191911807224606, + "learning_rate": 9.133208530663801e-06, + "loss": 0.4849, + "step": 1203 + }, + { + "epoch": 0.86, + "grad_norm": 8.95443208118561, + "learning_rate": 9.131581470373495e-06, + "loss": 0.5337, + "step": 1204 + }, + { + "epoch": 0.86, + "grad_norm": 15.525395622875937, + "learning_rate": 9.129953029645022e-06, + "loss": 0.4604, + "step": 1205 + }, + { + "epoch": 0.86, + "grad_norm": 11.79694996087205, + "learning_rate": 9.128323209022478e-06, + "loss": 0.5122, + "step": 1206 + }, + { + "epoch": 0.86, + "grad_norm": 11.755997307705414, + "learning_rate": 9.126692009050415e-06, + "loss": 0.4463, + "step": 1207 + }, + { + "epoch": 0.86, + "grad_norm": 26.80591783623869, + "learning_rate": 9.125059430273848e-06, + "loss": 0.5796, + "step": 1208 + }, + { + "epoch": 0.86, + "grad_norm": 18.852441160925146, + "learning_rate": 9.123425473238253e-06, + "loss": 0.5015, + "step": 1209 + }, + { + "epoch": 0.86, + "grad_norm": 12.625338450562605, + "learning_rate": 9.121790138489564e-06, + "loss": 0.5117, + "step": 1210 + }, + { + "epoch": 0.86, + "grad_norm": 13.080326610727653, + "learning_rate": 9.120153426574177e-06, + "loss": 0.5244, + "step": 1211 + }, + { + "epoch": 0.87, + "grad_norm": 10.88918740584613, + "learning_rate": 9.118515338038947e-06, + "loss": 0.48, + "step": 1212 + }, + { + "epoch": 0.87, + "grad_norm": 9.68959162364965, + "learning_rate": 9.11687587343119e-06, + "loss": 0.5259, + "step": 1213 + }, + { + "epoch": 0.87, + "grad_norm": 19.726892232656102, + "learning_rate": 9.115235033298682e-06, + "loss": 0.5601, + "step": 1214 + }, + { + "epoch": 0.87, + "grad_norm": 8.708377102841364, + "learning_rate": 9.113592818189661e-06, + "loss": 0.5708, + "step": 1215 + }, + { + "epoch": 0.87, + "grad_norm": 20.856805732469365, + "learning_rate": 9.111949228652816e-06, + "loss": 0.5498, + "step": 1216 + }, + { + "epoch": 0.87, + "grad_norm": 12.19921968394708, + "learning_rate": 9.110304265237304e-06, + "loss": 0.5098, + "step": 1217 + }, + { + "epoch": 0.87, + "grad_norm": 5.873996190520849, + "learning_rate": 9.10865792849274e-06, + "loss": 0.4893, + "step": 1218 + }, + { + "epoch": 0.87, + "grad_norm": 13.856928395460525, + "learning_rate": 9.107010218969191e-06, + "loss": 0.6113, + "step": 1219 + }, + { + "epoch": 0.87, + "grad_norm": 7.008354271851358, + "learning_rate": 9.10536113721719e-06, + "loss": 0.5874, + "step": 1220 + }, + { + "epoch": 0.87, + "grad_norm": 21.27467515244621, + "learning_rate": 9.103710683787728e-06, + "loss": 0.4824, + "step": 1221 + }, + { + "epoch": 0.87, + "grad_norm": 6.838954506974407, + "learning_rate": 9.102058859232247e-06, + "loss": 0.5596, + "step": 1222 + }, + { + "epoch": 0.87, + "grad_norm": 5.3653312710855285, + "learning_rate": 9.100405664102656e-06, + "loss": 0.5063, + "step": 1223 + }, + { + "epoch": 0.87, + "grad_norm": 17.92988348312749, + "learning_rate": 9.098751098951317e-06, + "loss": 0.6328, + "step": 1224 + }, + { + "epoch": 0.87, + "grad_norm": 8.868600191790884, + "learning_rate": 9.09709516433105e-06, + "loss": 0.5679, + "step": 1225 + }, + { + "epoch": 0.88, + "grad_norm": 11.654596470733907, + "learning_rate": 9.095437860795138e-06, + "loss": 0.5215, + "step": 1226 + }, + { + "epoch": 0.88, + "grad_norm": 11.054939394702243, + "learning_rate": 9.09377918889731e-06, + "loss": 0.5317, + "step": 1227 + }, + { + "epoch": 0.88, + "grad_norm": 7.809295897500469, + "learning_rate": 9.092119149191765e-06, + "loss": 0.5142, + "step": 1228 + }, + { + "epoch": 0.88, + "grad_norm": 7.752863951527627, + "learning_rate": 9.090457742233152e-06, + "loss": 0.4692, + "step": 1229 + }, + { + "epoch": 0.88, + "grad_norm": 7.080450373075989, + "learning_rate": 9.088794968576575e-06, + "loss": 0.5771, + "step": 1230 + }, + { + "epoch": 0.88, + "grad_norm": 6.273931732814395, + "learning_rate": 9.087130828777598e-06, + "loss": 0.5811, + "step": 1231 + }, + { + "epoch": 0.88, + "grad_norm": 14.801947879251854, + "learning_rate": 9.085465323392243e-06, + "loss": 0.5923, + "step": 1232 + }, + { + "epoch": 0.88, + "grad_norm": 11.904080105524493, + "learning_rate": 9.083798452976988e-06, + "loss": 0.584, + "step": 1233 + }, + { + "epoch": 0.88, + "grad_norm": 12.329611523622987, + "learning_rate": 9.082130218088762e-06, + "loss": 0.5698, + "step": 1234 + }, + { + "epoch": 0.88, + "grad_norm": 7.2037432427127, + "learning_rate": 9.080460619284954e-06, + "loss": 0.5283, + "step": 1235 + }, + { + "epoch": 0.88, + "grad_norm": 12.783777108155, + "learning_rate": 9.07878965712341e-06, + "loss": 0.6377, + "step": 1236 + }, + { + "epoch": 0.88, + "grad_norm": 5.5497103145234625, + "learning_rate": 9.077117332162427e-06, + "loss": 0.5791, + "step": 1237 + }, + { + "epoch": 0.88, + "grad_norm": 7.356999836172582, + "learning_rate": 9.075443644960761e-06, + "loss": 0.5063, + "step": 1238 + }, + { + "epoch": 0.88, + "grad_norm": 9.158396146128032, + "learning_rate": 9.07376859607762e-06, + "loss": 0.5376, + "step": 1239 + }, + { + "epoch": 0.89, + "grad_norm": 9.510823350991721, + "learning_rate": 9.072092186072675e-06, + "loss": 0.6494, + "step": 1240 + }, + { + "epoch": 0.89, + "grad_norm": 15.757079666813102, + "learning_rate": 9.070414415506038e-06, + "loss": 0.6143, + "step": 1241 + }, + { + "epoch": 0.89, + "grad_norm": 8.23137401597286, + "learning_rate": 9.068735284938288e-06, + "loss": 0.4785, + "step": 1242 + }, + { + "epoch": 0.89, + "grad_norm": 5.909470075068411, + "learning_rate": 9.067054794930452e-06, + "loss": 0.4731, + "step": 1243 + }, + { + "epoch": 0.89, + "grad_norm": 6.924869803084112, + "learning_rate": 9.065372946044014e-06, + "loss": 0.606, + "step": 1244 + }, + { + "epoch": 0.89, + "grad_norm": 8.833436716234493, + "learning_rate": 9.063689738840911e-06, + "loss": 0.5996, + "step": 1245 + }, + { + "epoch": 0.89, + "grad_norm": 11.76560707751374, + "learning_rate": 9.06200517388353e-06, + "loss": 0.6333, + "step": 1246 + }, + { + "epoch": 0.89, + "grad_norm": 6.385881684068905, + "learning_rate": 9.060319251734723e-06, + "loss": 0.5283, + "step": 1247 + }, + { + "epoch": 0.89, + "grad_norm": 5.674391394289866, + "learning_rate": 9.058631972957783e-06, + "loss": 0.5977, + "step": 1248 + }, + { + "epoch": 0.89, + "grad_norm": 7.351792465763545, + "learning_rate": 9.056943338116461e-06, + "loss": 0.5024, + "step": 1249 + }, + { + "epoch": 0.89, + "grad_norm": 9.53319179062876, + "learning_rate": 9.055253347774961e-06, + "loss": 0.5386, + "step": 1250 + }, + { + "epoch": 0.89, + "grad_norm": 11.528600979838268, + "learning_rate": 9.053562002497943e-06, + "loss": 0.5737, + "step": 1251 + }, + { + "epoch": 0.89, + "grad_norm": 7.959305596996799, + "learning_rate": 9.051869302850515e-06, + "loss": 0.4683, + "step": 1252 + }, + { + "epoch": 0.89, + "grad_norm": 8.58161660458802, + "learning_rate": 9.05017524939824e-06, + "loss": 0.5527, + "step": 1253 + }, + { + "epoch": 0.9, + "grad_norm": 12.243428609193199, + "learning_rate": 9.048479842707132e-06, + "loss": 0.5894, + "step": 1254 + }, + { + "epoch": 0.9, + "grad_norm": 5.236285055544797, + "learning_rate": 9.046783083343657e-06, + "loss": 0.4614, + "step": 1255 + }, + { + "epoch": 0.9, + "grad_norm": 7.591178916038821, + "learning_rate": 9.045084971874738e-06, + "loss": 0.5181, + "step": 1256 + }, + { + "epoch": 0.9, + "grad_norm": 10.84688810132272, + "learning_rate": 9.043385508867741e-06, + "loss": 0.4771, + "step": 1257 + }, + { + "epoch": 0.9, + "grad_norm": 8.615688659064489, + "learning_rate": 9.041684694890492e-06, + "loss": 0.4736, + "step": 1258 + }, + { + "epoch": 0.9, + "grad_norm": 12.846075910748137, + "learning_rate": 9.03998253051126e-06, + "loss": 0.6909, + "step": 1259 + }, + { + "epoch": 0.9, + "grad_norm": 11.612951792166978, + "learning_rate": 9.038279016298773e-06, + "loss": 0.4756, + "step": 1260 + }, + { + "epoch": 0.9, + "grad_norm": 8.350725640206349, + "learning_rate": 9.036574152822206e-06, + "loss": 0.5273, + "step": 1261 + }, + { + "epoch": 0.9, + "grad_norm": 6.878946181687478, + "learning_rate": 9.034867940651186e-06, + "loss": 0.5635, + "step": 1262 + }, + { + "epoch": 0.9, + "grad_norm": 5.333503190009848, + "learning_rate": 9.033160380355789e-06, + "loss": 0.4512, + "step": 1263 + }, + { + "epoch": 0.9, + "grad_norm": 9.830536532606617, + "learning_rate": 9.031451472506544e-06, + "loss": 0.5039, + "step": 1264 + }, + { + "epoch": 0.9, + "grad_norm": 9.00597678487424, + "learning_rate": 9.029741217674428e-06, + "loss": 0.5776, + "step": 1265 + }, + { + "epoch": 0.9, + "grad_norm": 9.365161480301378, + "learning_rate": 9.02802961643087e-06, + "loss": 0.5635, + "step": 1266 + }, + { + "epoch": 0.9, + "grad_norm": 13.387833721542348, + "learning_rate": 9.026316669347747e-06, + "loss": 0.5137, + "step": 1267 + }, + { + "epoch": 0.91, + "grad_norm": 10.41425867798381, + "learning_rate": 9.024602376997387e-06, + "loss": 0.5454, + "step": 1268 + }, + { + "epoch": 0.91, + "grad_norm": 10.311986546252852, + "learning_rate": 9.022886739952565e-06, + "loss": 0.4316, + "step": 1269 + }, + { + "epoch": 0.91, + "grad_norm": 14.496984433352765, + "learning_rate": 9.02116975878651e-06, + "loss": 0.5542, + "step": 1270 + }, + { + "epoch": 0.91, + "grad_norm": 11.783629489842369, + "learning_rate": 9.019451434072894e-06, + "loss": 0.5654, + "step": 1271 + }, + { + "epoch": 0.91, + "grad_norm": 7.656169059630007, + "learning_rate": 9.017731766385844e-06, + "loss": 0.5093, + "step": 1272 + }, + { + "epoch": 0.91, + "grad_norm": 7.338346102975163, + "learning_rate": 9.016010756299934e-06, + "loss": 0.4619, + "step": 1273 + }, + { + "epoch": 0.91, + "grad_norm": 8.6546009929447, + "learning_rate": 9.014288404390182e-06, + "loss": 0.5293, + "step": 1274 + }, + { + "epoch": 0.91, + "grad_norm": 14.49900328718962, + "learning_rate": 9.012564711232059e-06, + "loss": 0.4985, + "step": 1275 + }, + { + "epoch": 0.91, + "grad_norm": 8.799568957162913, + "learning_rate": 9.010839677401484e-06, + "loss": 0.5776, + "step": 1276 + }, + { + "epoch": 0.91, + "grad_norm": 6.442440495388319, + "learning_rate": 9.009113303474822e-06, + "loss": 0.4966, + "step": 1277 + }, + { + "epoch": 0.91, + "grad_norm": 8.780144748137946, + "learning_rate": 9.007385590028887e-06, + "loss": 0.4404, + "step": 1278 + }, + { + "epoch": 0.91, + "grad_norm": 13.298026166100321, + "learning_rate": 9.005656537640942e-06, + "loss": 0.5498, + "step": 1279 + }, + { + "epoch": 0.91, + "grad_norm": 5.495108785802219, + "learning_rate": 9.003926146888691e-06, + "loss": 0.4897, + "step": 1280 + }, + { + "epoch": 0.91, + "grad_norm": 6.611746701232708, + "learning_rate": 9.002194418350291e-06, + "loss": 0.6392, + "step": 1281 + }, + { + "epoch": 0.92, + "grad_norm": 11.718656093085405, + "learning_rate": 9.000461352604349e-06, + "loss": 0.5972, + "step": 1282 + }, + { + "epoch": 0.92, + "grad_norm": 9.384919907374725, + "learning_rate": 8.99872695022991e-06, + "loss": 0.5142, + "step": 1283 + }, + { + "epoch": 0.92, + "grad_norm": 8.111422961254767, + "learning_rate": 8.996991211806471e-06, + "loss": 0.5176, + "step": 1284 + }, + { + "epoch": 0.92, + "grad_norm": 6.61897079404457, + "learning_rate": 8.995254137913977e-06, + "loss": 0.5859, + "step": 1285 + }, + { + "epoch": 0.92, + "grad_norm": 7.626370903659802, + "learning_rate": 8.99351572913281e-06, + "loss": 0.5083, + "step": 1286 + }, + { + "epoch": 0.92, + "grad_norm": 6.1278822351560756, + "learning_rate": 8.991775986043814e-06, + "loss": 0.4365, + "step": 1287 + }, + { + "epoch": 0.92, + "grad_norm": 5.807804903096628, + "learning_rate": 8.990034909228262e-06, + "loss": 0.5439, + "step": 1288 + }, + { + "epoch": 0.92, + "grad_norm": 5.912336119687683, + "learning_rate": 8.988292499267885e-06, + "loss": 0.4189, + "step": 1289 + }, + { + "epoch": 0.92, + "grad_norm": 16.556716686548828, + "learning_rate": 8.986548756744852e-06, + "loss": 0.4966, + "step": 1290 + }, + { + "epoch": 0.92, + "grad_norm": 7.882441819426605, + "learning_rate": 8.98480368224178e-06, + "loss": 0.5439, + "step": 1291 + }, + { + "epoch": 0.92, + "grad_norm": 10.43392907102242, + "learning_rate": 8.98305727634173e-06, + "loss": 0.4883, + "step": 1292 + }, + { + "epoch": 0.92, + "grad_norm": 7.656944181948202, + "learning_rate": 8.981309539628212e-06, + "loss": 0.5811, + "step": 1293 + }, + { + "epoch": 0.92, + "grad_norm": 6.359227028201466, + "learning_rate": 8.979560472685174e-06, + "loss": 0.4385, + "step": 1294 + }, + { + "epoch": 0.92, + "grad_norm": 8.356588723580822, + "learning_rate": 8.977810076097013e-06, + "loss": 0.4492, + "step": 1295 + }, + { + "epoch": 0.93, + "grad_norm": 9.82540714825025, + "learning_rate": 8.97605835044857e-06, + "loss": 0.647, + "step": 1296 + }, + { + "epoch": 0.93, + "grad_norm": 6.504595450793861, + "learning_rate": 8.974305296325125e-06, + "loss": 0.4238, + "step": 1297 + }, + { + "epoch": 0.93, + "grad_norm": 6.877558211686209, + "learning_rate": 8.97255091431241e-06, + "loss": 0.4917, + "step": 1298 + }, + { + "epoch": 0.93, + "grad_norm": 10.50916493277105, + "learning_rate": 8.970795204996597e-06, + "loss": 0.4795, + "step": 1299 + }, + { + "epoch": 0.93, + "grad_norm": 9.204769262934178, + "learning_rate": 8.969038168964298e-06, + "loss": 0.5645, + "step": 1300 + }, + { + "epoch": 0.93, + "grad_norm": 16.851121613107946, + "learning_rate": 8.967279806802576e-06, + "loss": 0.5483, + "step": 1301 + }, + { + "epoch": 0.93, + "grad_norm": 8.587881444493236, + "learning_rate": 8.965520119098926e-06, + "loss": 0.478, + "step": 1302 + }, + { + "epoch": 0.93, + "grad_norm": 22.027251162811073, + "learning_rate": 8.9637591064413e-06, + "loss": 0.624, + "step": 1303 + }, + { + "epoch": 0.93, + "grad_norm": 21.099260672728388, + "learning_rate": 8.961996769418077e-06, + "loss": 0.5215, + "step": 1304 + }, + { + "epoch": 0.93, + "grad_norm": 12.415671378119475, + "learning_rate": 8.960233108618092e-06, + "loss": 0.5791, + "step": 1305 + }, + { + "epoch": 0.93, + "grad_norm": 7.662223641152644, + "learning_rate": 8.958468124630617e-06, + "loss": 0.5718, + "step": 1306 + }, + { + "epoch": 0.93, + "grad_norm": 6.366844981796646, + "learning_rate": 8.956701818045363e-06, + "loss": 0.4946, + "step": 1307 + }, + { + "epoch": 0.93, + "grad_norm": 18.096606101285385, + "learning_rate": 8.954934189452489e-06, + "loss": 0.4512, + "step": 1308 + }, + { + "epoch": 0.93, + "grad_norm": 12.714222025011097, + "learning_rate": 8.953165239442589e-06, + "loss": 0.5986, + "step": 1309 + }, + { + "epoch": 0.94, + "grad_norm": 14.336782119481711, + "learning_rate": 8.951394968606704e-06, + "loss": 0.5625, + "step": 1310 + }, + { + "epoch": 0.94, + "grad_norm": 18.32913837235048, + "learning_rate": 8.949623377536314e-06, + "loss": 0.5757, + "step": 1311 + }, + { + "epoch": 0.94, + "grad_norm": 11.84859426270501, + "learning_rate": 8.947850466823343e-06, + "loss": 0.4834, + "step": 1312 + }, + { + "epoch": 0.94, + "grad_norm": 8.789293590180087, + "learning_rate": 8.946076237060148e-06, + "loss": 0.5361, + "step": 1313 + }, + { + "epoch": 0.94, + "grad_norm": 5.977729367014027, + "learning_rate": 8.944300688839538e-06, + "loss": 0.5249, + "step": 1314 + }, + { + "epoch": 0.94, + "grad_norm": 11.763893603900193, + "learning_rate": 8.942523822754751e-06, + "loss": 0.5415, + "step": 1315 + }, + { + "epoch": 0.94, + "grad_norm": 4.7524094218657575, + "learning_rate": 8.940745639399477e-06, + "loss": 0.5156, + "step": 1316 + }, + { + "epoch": 0.94, + "grad_norm": 5.7945367399217975, + "learning_rate": 8.938966139367837e-06, + "loss": 0.5059, + "step": 1317 + }, + { + "epoch": 0.94, + "grad_norm": 7.0240203752149535, + "learning_rate": 8.937185323254395e-06, + "loss": 0.5151, + "step": 1318 + }, + { + "epoch": 0.94, + "grad_norm": 7.481981866447117, + "learning_rate": 8.935403191654155e-06, + "loss": 0.4126, + "step": 1319 + }, + { + "epoch": 0.94, + "grad_norm": 9.449159435304624, + "learning_rate": 8.933619745162559e-06, + "loss": 0.5938, + "step": 1320 + }, + { + "epoch": 0.94, + "grad_norm": 8.242575178472086, + "learning_rate": 8.931834984375492e-06, + "loss": 0.4771, + "step": 1321 + }, + { + "epoch": 0.94, + "grad_norm": 8.949440636644377, + "learning_rate": 8.930048909889272e-06, + "loss": 0.5474, + "step": 1322 + }, + { + "epoch": 0.94, + "grad_norm": 7.20984477728289, + "learning_rate": 8.928261522300665e-06, + "loss": 0.5073, + "step": 1323 + }, + { + "epoch": 0.95, + "grad_norm": 5.653104358487502, + "learning_rate": 8.926472822206869e-06, + "loss": 0.4878, + "step": 1324 + }, + { + "epoch": 0.95, + "grad_norm": 15.750277767714117, + "learning_rate": 8.924682810205519e-06, + "loss": 0.5728, + "step": 1325 + }, + { + "epoch": 0.95, + "grad_norm": 6.545984784195176, + "learning_rate": 8.922891486894692e-06, + "loss": 0.4961, + "step": 1326 + }, + { + "epoch": 0.95, + "grad_norm": 6.327138896740525, + "learning_rate": 8.921098852872904e-06, + "loss": 0.4985, + "step": 1327 + }, + { + "epoch": 0.95, + "grad_norm": 7.5493735701358435, + "learning_rate": 8.919304908739106e-06, + "loss": 0.5244, + "step": 1328 + }, + { + "epoch": 0.95, + "grad_norm": 8.695313836532721, + "learning_rate": 8.917509655092691e-06, + "loss": 0.5732, + "step": 1329 + }, + { + "epoch": 0.95, + "grad_norm": 6.7706710902436695, + "learning_rate": 8.915713092533483e-06, + "loss": 0.4644, + "step": 1330 + }, + { + "epoch": 0.95, + "grad_norm": 10.650752650381953, + "learning_rate": 8.913915221661748e-06, + "loss": 0.5396, + "step": 1331 + }, + { + "epoch": 0.95, + "grad_norm": 13.714879719273181, + "learning_rate": 8.912116043078188e-06, + "loss": 0.5454, + "step": 1332 + }, + { + "epoch": 0.95, + "grad_norm": 8.739222984938651, + "learning_rate": 8.910315557383944e-06, + "loss": 0.6167, + "step": 1333 + }, + { + "epoch": 0.95, + "grad_norm": 14.128982922295334, + "learning_rate": 8.90851376518059e-06, + "loss": 0.5786, + "step": 1334 + }, + { + "epoch": 0.95, + "grad_norm": 8.459299957556542, + "learning_rate": 8.906710667070136e-06, + "loss": 0.624, + "step": 1335 + }, + { + "epoch": 0.95, + "grad_norm": 6.598499948785349, + "learning_rate": 8.904906263655036e-06, + "loss": 0.6475, + "step": 1336 + }, + { + "epoch": 0.95, + "grad_norm": 6.194095670682739, + "learning_rate": 8.903100555538169e-06, + "loss": 0.5264, + "step": 1337 + }, + { + "epoch": 0.96, + "grad_norm": 11.260818438840108, + "learning_rate": 8.90129354332286e-06, + "loss": 0.5459, + "step": 1338 + }, + { + "epoch": 0.96, + "grad_norm": 10.831921547703958, + "learning_rate": 8.899485227612865e-06, + "loss": 0.5386, + "step": 1339 + }, + { + "epoch": 0.96, + "grad_norm": 10.993655493393854, + "learning_rate": 8.897675609012372e-06, + "loss": 0.5488, + "step": 1340 + }, + { + "epoch": 0.96, + "grad_norm": 7.371509471270881, + "learning_rate": 8.895864688126013e-06, + "loss": 0.5415, + "step": 1341 + }, + { + "epoch": 0.96, + "grad_norm": 12.289776737421548, + "learning_rate": 8.894052465558846e-06, + "loss": 0.5205, + "step": 1342 + }, + { + "epoch": 0.96, + "grad_norm": 6.368680291839942, + "learning_rate": 8.892238941916372e-06, + "loss": 0.5693, + "step": 1343 + }, + { + "epoch": 0.96, + "grad_norm": 6.862113997301074, + "learning_rate": 8.890424117804522e-06, + "loss": 0.5518, + "step": 1344 + }, + { + "epoch": 0.96, + "grad_norm": 6.249598220460352, + "learning_rate": 8.88860799382966e-06, + "loss": 0.6641, + "step": 1345 + }, + { + "epoch": 0.96, + "grad_norm": 9.675919356373685, + "learning_rate": 8.88679057059859e-06, + "loss": 0.5454, + "step": 1346 + }, + { + "epoch": 0.96, + "grad_norm": 7.730236623835078, + "learning_rate": 8.884971848718544e-06, + "loss": 0.5562, + "step": 1347 + }, + { + "epoch": 0.96, + "grad_norm": 11.74713854214673, + "learning_rate": 8.883151828797194e-06, + "loss": 0.5557, + "step": 1348 + }, + { + "epoch": 0.96, + "grad_norm": 8.820048698580294, + "learning_rate": 8.88133051144264e-06, + "loss": 0.5674, + "step": 1349 + }, + { + "epoch": 0.96, + "grad_norm": 6.657133326905244, + "learning_rate": 8.87950789726342e-06, + "loss": 0.5859, + "step": 1350 + }, + { + "epoch": 0.96, + "grad_norm": 7.081312677292772, + "learning_rate": 8.8776839868685e-06, + "loss": 0.5039, + "step": 1351 + }, + { + "epoch": 0.97, + "grad_norm": 10.681121735697019, + "learning_rate": 8.875858780867286e-06, + "loss": 0.5093, + "step": 1352 + }, + { + "epoch": 0.97, + "grad_norm": 16.68941777386891, + "learning_rate": 8.87403227986961e-06, + "loss": 0.6191, + "step": 1353 + }, + { + "epoch": 0.97, + "grad_norm": 22.743290633026398, + "learning_rate": 8.872204484485743e-06, + "loss": 0.5903, + "step": 1354 + }, + { + "epoch": 0.97, + "grad_norm": 7.595883157216245, + "learning_rate": 8.870375395326384e-06, + "loss": 0.4712, + "step": 1355 + }, + { + "epoch": 0.97, + "grad_norm": 11.431960114700615, + "learning_rate": 8.868545013002665e-06, + "loss": 0.4814, + "step": 1356 + }, + { + "epoch": 0.97, + "grad_norm": 24.738428919658247, + "learning_rate": 8.866713338126152e-06, + "loss": 0.6064, + "step": 1357 + }, + { + "epoch": 0.97, + "grad_norm": 16.483334098623065, + "learning_rate": 8.86488037130884e-06, + "loss": 0.542, + "step": 1358 + }, + { + "epoch": 0.97, + "grad_norm": 28.654635831705043, + "learning_rate": 8.863046113163158e-06, + "loss": 0.5166, + "step": 1359 + }, + { + "epoch": 0.97, + "grad_norm": 8.446905888828201, + "learning_rate": 8.861210564301967e-06, + "loss": 0.5576, + "step": 1360 + }, + { + "epoch": 0.97, + "grad_norm": 6.786577684525464, + "learning_rate": 8.859373725338558e-06, + "loss": 0.5571, + "step": 1361 + }, + { + "epoch": 0.97, + "grad_norm": 19.77386586403384, + "learning_rate": 8.857535596886652e-06, + "loss": 0.5259, + "step": 1362 + }, + { + "epoch": 0.97, + "grad_norm": 27.49805900372077, + "learning_rate": 8.855696179560402e-06, + "loss": 0.6602, + "step": 1363 + }, + { + "epoch": 0.97, + "grad_norm": 7.273355043162183, + "learning_rate": 8.85385547397439e-06, + "loss": 0.4653, + "step": 1364 + }, + { + "epoch": 0.97, + "grad_norm": 20.217716781867352, + "learning_rate": 8.852013480743632e-06, + "loss": 0.6235, + "step": 1365 + }, + { + "epoch": 0.98, + "grad_norm": 9.843685680691951, + "learning_rate": 8.850170200483573e-06, + "loss": 0.4951, + "step": 1366 + }, + { + "epoch": 0.98, + "grad_norm": 13.271803428940668, + "learning_rate": 8.848325633810083e-06, + "loss": 0.498, + "step": 1367 + }, + { + "epoch": 0.98, + "grad_norm": 11.37198649024497, + "learning_rate": 8.84647978133947e-06, + "loss": 0.5361, + "step": 1368 + }, + { + "epoch": 0.98, + "grad_norm": 17.854819527941277, + "learning_rate": 8.844632643688467e-06, + "loss": 0.6172, + "step": 1369 + }, + { + "epoch": 0.98, + "grad_norm": 7.139410040675058, + "learning_rate": 8.842784221474237e-06, + "loss": 0.5098, + "step": 1370 + }, + { + "epoch": 0.98, + "grad_norm": 10.702123104062924, + "learning_rate": 8.840934515314372e-06, + "loss": 0.6406, + "step": 1371 + }, + { + "epoch": 0.98, + "grad_norm": 8.327060121802434, + "learning_rate": 8.839083525826893e-06, + "loss": 0.562, + "step": 1372 + }, + { + "epoch": 0.98, + "grad_norm": 12.967346146693572, + "learning_rate": 8.837231253630247e-06, + "loss": 0.5122, + "step": 1373 + }, + { + "epoch": 0.98, + "grad_norm": 9.222045501981935, + "learning_rate": 8.835377699343318e-06, + "loss": 0.5391, + "step": 1374 + }, + { + "epoch": 0.98, + "grad_norm": 5.7150367345669855, + "learning_rate": 8.83352286358541e-06, + "loss": 0.501, + "step": 1375 + }, + { + "epoch": 0.98, + "grad_norm": 10.34388020346308, + "learning_rate": 8.83166674697626e-06, + "loss": 0.5498, + "step": 1376 + }, + { + "epoch": 0.98, + "grad_norm": 6.994275061577608, + "learning_rate": 8.829809350136027e-06, + "loss": 0.5469, + "step": 1377 + }, + { + "epoch": 0.98, + "grad_norm": 6.286509681654963, + "learning_rate": 8.827950673685306e-06, + "loss": 0.5586, + "step": 1378 + }, + { + "epoch": 0.98, + "grad_norm": 7.615667447671237, + "learning_rate": 8.826090718245112e-06, + "loss": 0.5747, + "step": 1379 + }, + { + "epoch": 0.99, + "grad_norm": 8.045180625435941, + "learning_rate": 8.824229484436894e-06, + "loss": 0.5361, + "step": 1380 + }, + { + "epoch": 0.99, + "grad_norm": 16.914235609616878, + "learning_rate": 8.822366972882523e-06, + "loss": 0.6753, + "step": 1381 + }, + { + "epoch": 0.99, + "grad_norm": 6.65556839435706, + "learning_rate": 8.820503184204299e-06, + "loss": 0.5171, + "step": 1382 + }, + { + "epoch": 0.99, + "grad_norm": 17.818279504647876, + "learning_rate": 8.818638119024949e-06, + "loss": 0.5488, + "step": 1383 + }, + { + "epoch": 0.99, + "grad_norm": 29.01882884415183, + "learning_rate": 8.816771777967623e-06, + "loss": 0.6357, + "step": 1384 + }, + { + "epoch": 0.99, + "grad_norm": 5.102254652825774, + "learning_rate": 8.814904161655904e-06, + "loss": 0.5728, + "step": 1385 + }, + { + "epoch": 0.99, + "grad_norm": 9.028759055391328, + "learning_rate": 8.813035270713796e-06, + "loss": 0.4946, + "step": 1386 + }, + { + "epoch": 0.99, + "grad_norm": 5.481350770883132, + "learning_rate": 8.811165105765732e-06, + "loss": 0.5146, + "step": 1387 + }, + { + "epoch": 0.99, + "grad_norm": 4.584523783116981, + "learning_rate": 8.809293667436565e-06, + "loss": 0.5498, + "step": 1388 + }, + { + "epoch": 0.99, + "grad_norm": 7.670757323199173, + "learning_rate": 8.80742095635158e-06, + "loss": 0.5869, + "step": 1389 + }, + { + "epoch": 0.99, + "grad_norm": 13.789379257935748, + "learning_rate": 8.805546973136481e-06, + "loss": 0.5391, + "step": 1390 + }, + { + "epoch": 0.99, + "grad_norm": 9.92942354858489, + "learning_rate": 8.803671718417407e-06, + "loss": 0.5396, + "step": 1391 + }, + { + "epoch": 0.99, + "grad_norm": 10.025556704510787, + "learning_rate": 8.80179519282091e-06, + "loss": 0.4453, + "step": 1392 + }, + { + "epoch": 0.99, + "grad_norm": 9.863285419079764, + "learning_rate": 8.799917396973976e-06, + "loss": 0.5576, + "step": 1393 + }, + { + "epoch": 1.0, + "grad_norm": 7.510372608172871, + "learning_rate": 8.798038331504008e-06, + "loss": 0.499, + "step": 1394 + }, + { + "epoch": 1.0, + "grad_norm": 10.339190789992044, + "learning_rate": 8.79615799703884e-06, + "loss": 0.5347, + "step": 1395 + }, + { + "epoch": 1.0, + "grad_norm": 9.228994238966333, + "learning_rate": 8.794276394206722e-06, + "loss": 0.4858, + "step": 1396 + }, + { + "epoch": 1.0, + "grad_norm": 10.720067062308178, + "learning_rate": 8.792393523636337e-06, + "loss": 0.5122, + "step": 1397 + }, + { + "epoch": 1.0, + "grad_norm": 9.959990759829996, + "learning_rate": 8.790509385956784e-06, + "loss": 0.6104, + "step": 1398 + }, + { + "epoch": 1.0, + "grad_norm": 7.7350338881120315, + "learning_rate": 8.788623981797592e-06, + "loss": 0.5569, + "step": 1399 + }, + { + "epoch": 1.0, + "grad_norm": 6.122908029947811, + "learning_rate": 8.786737311788708e-06, + "loss": 0.5083, + "step": 1400 + }, + { + "epoch": 1.0, + "grad_norm": 7.249960081112658, + "learning_rate": 8.784849376560503e-06, + "loss": 0.4473, + "step": 1401 + }, + { + "epoch": 1.0, + "grad_norm": 5.9378761564742, + "learning_rate": 8.78296017674377e-06, + "loss": 0.3892, + "step": 1402 + }, + { + "epoch": 1.0, + "grad_norm": 7.947182755307541, + "learning_rate": 8.781069712969726e-06, + "loss": 0.4663, + "step": 1403 + }, + { + "epoch": 1.0, + "grad_norm": 7.735615644483297, + "learning_rate": 8.779177985870012e-06, + "loss": 0.4985, + "step": 1404 + }, + { + "epoch": 1.0, + "grad_norm": 6.221979827272879, + "learning_rate": 8.77728499607669e-06, + "loss": 0.3682, + "step": 1405 + }, + { + "epoch": 1.0, + "grad_norm": 7.51360598274658, + "learning_rate": 8.775390744222238e-06, + "loss": 0.4927, + "step": 1406 + }, + { + "epoch": 1.0, + "grad_norm": 9.824576130882553, + "learning_rate": 8.773495230939567e-06, + "loss": 0.439, + "step": 1407 + }, + { + "epoch": 1.0, + "grad_norm": 7.542083031305357, + "learning_rate": 8.771598456861998e-06, + "loss": 0.407, + "step": 1408 + }, + { + "epoch": 1.01, + "grad_norm": 6.74463891725652, + "learning_rate": 8.769700422623283e-06, + "loss": 0.3843, + "step": 1409 + }, + { + "epoch": 1.01, + "grad_norm": 7.052978396859257, + "learning_rate": 8.767801128857588e-06, + "loss": 0.4321, + "step": 1410 + }, + { + "epoch": 1.01, + "grad_norm": 11.845362886453344, + "learning_rate": 8.765900576199502e-06, + "loss": 0.4565, + "step": 1411 + }, + { + "epoch": 1.01, + "grad_norm": 11.17409389016001, + "learning_rate": 8.763998765284036e-06, + "loss": 0.4888, + "step": 1412 + }, + { + "epoch": 1.01, + "grad_norm": 17.04399663972258, + "learning_rate": 8.76209569674662e-06, + "loss": 0.4199, + "step": 1413 + }, + { + "epoch": 1.01, + "grad_norm": 10.796687567190697, + "learning_rate": 8.760191371223104e-06, + "loss": 0.4346, + "step": 1414 + }, + { + "epoch": 1.01, + "grad_norm": 13.742485700753212, + "learning_rate": 8.758285789349759e-06, + "loss": 0.479, + "step": 1415 + }, + { + "epoch": 1.01, + "grad_norm": 10.549981447404763, + "learning_rate": 8.756378951763277e-06, + "loss": 0.4429, + "step": 1416 + }, + { + "epoch": 1.01, + "grad_norm": 10.523418901631306, + "learning_rate": 8.754470859100765e-06, + "loss": 0.3989, + "step": 1417 + }, + { + "epoch": 1.01, + "grad_norm": 17.12913542529696, + "learning_rate": 8.752561511999754e-06, + "loss": 0.5083, + "step": 1418 + }, + { + "epoch": 1.01, + "grad_norm": 15.040083536035315, + "learning_rate": 8.750650911098193e-06, + "loss": 0.4619, + "step": 1419 + }, + { + "epoch": 1.01, + "grad_norm": 14.958198799461423, + "learning_rate": 8.748739057034447e-06, + "loss": 0.457, + "step": 1420 + }, + { + "epoch": 1.01, + "grad_norm": 34.28258733668816, + "learning_rate": 8.746825950447302e-06, + "loss": 0.3999, + "step": 1421 + }, + { + "epoch": 1.01, + "grad_norm": 18.46394179525724, + "learning_rate": 8.744911591975967e-06, + "loss": 0.4434, + "step": 1422 + }, + { + "epoch": 1.02, + "grad_norm": 17.570102832172687, + "learning_rate": 8.742995982260059e-06, + "loss": 0.4307, + "step": 1423 + }, + { + "epoch": 1.02, + "grad_norm": 20.157526748035593, + "learning_rate": 8.741079121939621e-06, + "loss": 0.4961, + "step": 1424 + }, + { + "epoch": 1.02, + "grad_norm": 10.562647415324578, + "learning_rate": 8.739161011655113e-06, + "loss": 0.458, + "step": 1425 + }, + { + "epoch": 1.02, + "grad_norm": 11.069738473837281, + "learning_rate": 8.737241652047408e-06, + "loss": 0.603, + "step": 1426 + }, + { + "epoch": 1.02, + "grad_norm": 11.918982204133165, + "learning_rate": 8.735321043757805e-06, + "loss": 0.4688, + "step": 1427 + }, + { + "epoch": 1.02, + "grad_norm": 10.46229012344293, + "learning_rate": 8.73339918742801e-06, + "loss": 0.4341, + "step": 1428 + }, + { + "epoch": 1.02, + "grad_norm": 12.84418719877701, + "learning_rate": 8.731476083700154e-06, + "loss": 0.4683, + "step": 1429 + }, + { + "epoch": 1.02, + "grad_norm": 11.59064803571736, + "learning_rate": 8.729551733216779e-06, + "loss": 0.4229, + "step": 1430 + }, + { + "epoch": 1.02, + "grad_norm": 10.516324309495134, + "learning_rate": 8.727626136620848e-06, + "loss": 0.4502, + "step": 1431 + }, + { + "epoch": 1.02, + "grad_norm": 26.410113834424802, + "learning_rate": 8.725699294555739e-06, + "loss": 0.5132, + "step": 1432 + }, + { + "epoch": 1.02, + "grad_norm": 20.34794795255431, + "learning_rate": 8.723771207665245e-06, + "loss": 0.5312, + "step": 1433 + }, + { + "epoch": 1.02, + "grad_norm": 13.382266025888864, + "learning_rate": 8.721841876593576e-06, + "loss": 0.4482, + "step": 1434 + }, + { + "epoch": 1.02, + "grad_norm": 12.996872293754613, + "learning_rate": 8.719911301985355e-06, + "loss": 0.4189, + "step": 1435 + }, + { + "epoch": 1.02, + "grad_norm": 13.616870720209013, + "learning_rate": 8.717979484485628e-06, + "loss": 0.3623, + "step": 1436 + }, + { + "epoch": 1.03, + "grad_norm": 18.78460022966816, + "learning_rate": 8.716046424739845e-06, + "loss": 0.5029, + "step": 1437 + }, + { + "epoch": 1.03, + "grad_norm": 27.875048810799026, + "learning_rate": 8.714112123393882e-06, + "loss": 0.5117, + "step": 1438 + }, + { + "epoch": 1.03, + "grad_norm": 11.085152704221128, + "learning_rate": 8.712176581094025e-06, + "loss": 0.438, + "step": 1439 + }, + { + "epoch": 1.03, + "grad_norm": 20.939014408876105, + "learning_rate": 8.710239798486972e-06, + "loss": 0.5273, + "step": 1440 + }, + { + "epoch": 1.03, + "grad_norm": 36.2593949826461, + "learning_rate": 8.708301776219838e-06, + "loss": 0.4185, + "step": 1441 + }, + { + "epoch": 1.03, + "grad_norm": 112.11982714132783, + "learning_rate": 8.706362514940153e-06, + "loss": 0.4478, + "step": 1442 + }, + { + "epoch": 1.03, + "grad_norm": 277.4382397294177, + "learning_rate": 8.704422015295861e-06, + "loss": 0.5786, + "step": 1443 + }, + { + "epoch": 1.03, + "grad_norm": 181.53470646624706, + "learning_rate": 8.702480277935319e-06, + "loss": 0.6011, + "step": 1444 + }, + { + "epoch": 1.03, + "grad_norm": 15.931282079030481, + "learning_rate": 8.700537303507298e-06, + "loss": 0.3828, + "step": 1445 + }, + { + "epoch": 1.03, + "grad_norm": 22.345280510200695, + "learning_rate": 8.69859309266098e-06, + "loss": 0.54, + "step": 1446 + }, + { + "epoch": 1.03, + "grad_norm": 16.92976281404644, + "learning_rate": 8.696647646045962e-06, + "loss": 0.4468, + "step": 1447 + }, + { + "epoch": 1.03, + "grad_norm": 15.584050288102251, + "learning_rate": 8.694700964312257e-06, + "loss": 0.499, + "step": 1448 + }, + { + "epoch": 1.03, + "grad_norm": 9.782706492066882, + "learning_rate": 8.692753048110283e-06, + "loss": 0.5635, + "step": 1449 + }, + { + "epoch": 1.03, + "grad_norm": 17.982712514408526, + "learning_rate": 8.690803898090878e-06, + "loss": 0.4897, + "step": 1450 + }, + { + "epoch": 1.04, + "grad_norm": 10.009874311456212, + "learning_rate": 8.68885351490529e-06, + "loss": 0.3774, + "step": 1451 + }, + { + "epoch": 1.04, + "grad_norm": 16.187289386392372, + "learning_rate": 8.686901899205177e-06, + "loss": 0.4077, + "step": 1452 + }, + { + "epoch": 1.04, + "grad_norm": 22.74330679007414, + "learning_rate": 8.684949051642609e-06, + "loss": 0.4907, + "step": 1453 + }, + { + "epoch": 1.04, + "grad_norm": 8.533483611225794, + "learning_rate": 8.68299497287007e-06, + "loss": 0.4238, + "step": 1454 + }, + { + "epoch": 1.04, + "grad_norm": 22.93205428371546, + "learning_rate": 8.681039663540454e-06, + "loss": 0.4351, + "step": 1455 + }, + { + "epoch": 1.04, + "grad_norm": 11.711598620907111, + "learning_rate": 8.679083124307064e-06, + "loss": 0.321, + "step": 1456 + }, + { + "epoch": 1.04, + "grad_norm": 20.3063051905801, + "learning_rate": 8.67712535582362e-06, + "loss": 0.4761, + "step": 1457 + }, + { + "epoch": 1.04, + "grad_norm": 15.81069964942567, + "learning_rate": 8.675166358744247e-06, + "loss": 0.4497, + "step": 1458 + }, + { + "epoch": 1.04, + "grad_norm": 25.172668334190924, + "learning_rate": 8.67320613372348e-06, + "loss": 0.6162, + "step": 1459 + }, + { + "epoch": 1.04, + "grad_norm": 13.868299776664779, + "learning_rate": 8.67124468141627e-06, + "loss": 0.4697, + "step": 1460 + }, + { + "epoch": 1.04, + "grad_norm": 17.65557462571263, + "learning_rate": 8.669282002477975e-06, + "loss": 0.5439, + "step": 1461 + }, + { + "epoch": 1.04, + "grad_norm": 19.986941627654915, + "learning_rate": 8.66731809756436e-06, + "loss": 0.4077, + "step": 1462 + }, + { + "epoch": 1.04, + "grad_norm": 13.481751253152993, + "learning_rate": 8.665352967331604e-06, + "loss": 0.4507, + "step": 1463 + }, + { + "epoch": 1.04, + "grad_norm": 10.69597798278669, + "learning_rate": 8.66338661243629e-06, + "loss": 0.4185, + "step": 1464 + }, + { + "epoch": 1.05, + "grad_norm": 21.12105825698134, + "learning_rate": 8.661419033535419e-06, + "loss": 0.4966, + "step": 1465 + }, + { + "epoch": 1.05, + "grad_norm": 15.364831925795514, + "learning_rate": 8.659450231286392e-06, + "loss": 0.4619, + "step": 1466 + }, + { + "epoch": 1.05, + "grad_norm": 23.36575882497529, + "learning_rate": 8.657480206347024e-06, + "loss": 0.478, + "step": 1467 + }, + { + "epoch": 1.05, + "grad_norm": 19.20617981129387, + "learning_rate": 8.655508959375536e-06, + "loss": 0.458, + "step": 1468 + }, + { + "epoch": 1.05, + "grad_norm": 12.332507150136136, + "learning_rate": 8.653536491030559e-06, + "loss": 0.4453, + "step": 1469 + }, + { + "epoch": 1.05, + "grad_norm": 8.375737589014147, + "learning_rate": 8.651562801971131e-06, + "loss": 0.4199, + "step": 1470 + }, + { + "epoch": 1.05, + "grad_norm": 11.622191769632364, + "learning_rate": 8.649587892856698e-06, + "loss": 0.4438, + "step": 1471 + }, + { + "epoch": 1.05, + "grad_norm": 15.028816389671348, + "learning_rate": 8.647611764347114e-06, + "loss": 0.4634, + "step": 1472 + }, + { + "epoch": 1.05, + "grad_norm": 51.19574362423425, + "learning_rate": 8.64563441710264e-06, + "loss": 0.7012, + "step": 1473 + }, + { + "epoch": 1.05, + "grad_norm": 8.972205154177201, + "learning_rate": 8.643655851783947e-06, + "loss": 0.3843, + "step": 1474 + }, + { + "epoch": 1.05, + "grad_norm": 8.72003218429071, + "learning_rate": 8.641676069052104e-06, + "loss": 0.4072, + "step": 1475 + }, + { + "epoch": 1.05, + "grad_norm": 14.009750309222879, + "learning_rate": 8.639695069568602e-06, + "loss": 0.4717, + "step": 1476 + }, + { + "epoch": 1.05, + "grad_norm": 16.704206269175184, + "learning_rate": 8.637712853995324e-06, + "loss": 0.4814, + "step": 1477 + }, + { + "epoch": 1.05, + "grad_norm": 9.870396068272045, + "learning_rate": 8.635729422994566e-06, + "loss": 0.4634, + "step": 1478 + }, + { + "epoch": 1.06, + "grad_norm": 11.55132356224423, + "learning_rate": 8.633744777229029e-06, + "loss": 0.4365, + "step": 1479 + }, + { + "epoch": 1.06, + "grad_norm": 12.263513038793375, + "learning_rate": 8.63175891736182e-06, + "loss": 0.3862, + "step": 1480 + }, + { + "epoch": 1.06, + "grad_norm": 10.735243962302292, + "learning_rate": 8.629771844056452e-06, + "loss": 0.3691, + "step": 1481 + }, + { + "epoch": 1.06, + "grad_norm": 11.39389573007216, + "learning_rate": 8.627783557976846e-06, + "loss": 0.4902, + "step": 1482 + }, + { + "epoch": 1.06, + "grad_norm": 9.633505897881955, + "learning_rate": 8.62579405978732e-06, + "loss": 0.4678, + "step": 1483 + }, + { + "epoch": 1.06, + "grad_norm": 10.95154363916375, + "learning_rate": 8.623803350152606e-06, + "loss": 0.4326, + "step": 1484 + }, + { + "epoch": 1.06, + "grad_norm": 12.629591334063445, + "learning_rate": 8.621811429737837e-06, + "loss": 0.4819, + "step": 1485 + }, + { + "epoch": 1.06, + "grad_norm": 13.321230859689711, + "learning_rate": 8.619818299208548e-06, + "loss": 0.3994, + "step": 1486 + }, + { + "epoch": 1.06, + "grad_norm": 12.680762867215169, + "learning_rate": 8.617823959230683e-06, + "loss": 0.5298, + "step": 1487 + }, + { + "epoch": 1.06, + "grad_norm": 14.312614949546656, + "learning_rate": 8.615828410470589e-06, + "loss": 0.5034, + "step": 1488 + }, + { + "epoch": 1.06, + "grad_norm": 15.386409700658552, + "learning_rate": 8.613831653595013e-06, + "loss": 0.5503, + "step": 1489 + }, + { + "epoch": 1.06, + "grad_norm": 16.553167562663948, + "learning_rate": 8.61183368927111e-06, + "loss": 0.3765, + "step": 1490 + }, + { + "epoch": 1.06, + "grad_norm": 11.97547090065287, + "learning_rate": 8.609834518166439e-06, + "loss": 0.4897, + "step": 1491 + }, + { + "epoch": 1.06, + "grad_norm": 12.51487310954413, + "learning_rate": 8.607834140948958e-06, + "loss": 0.4663, + "step": 1492 + }, + { + "epoch": 1.07, + "grad_norm": 15.020553039764543, + "learning_rate": 8.60583255828703e-06, + "loss": 0.416, + "step": 1493 + }, + { + "epoch": 1.07, + "grad_norm": 12.540414928748616, + "learning_rate": 8.603829770849421e-06, + "loss": 0.5366, + "step": 1494 + }, + { + "epoch": 1.07, + "grad_norm": 14.386118227859209, + "learning_rate": 8.601825779305302e-06, + "loss": 0.4199, + "step": 1495 + }, + { + "epoch": 1.07, + "grad_norm": 15.19089679469395, + "learning_rate": 8.59982058432424e-06, + "loss": 0.4824, + "step": 1496 + }, + { + "epoch": 1.07, + "grad_norm": 12.744882941104166, + "learning_rate": 8.597814186576212e-06, + "loss": 0.4531, + "step": 1497 + }, + { + "epoch": 1.07, + "grad_norm": 12.998468653111926, + "learning_rate": 8.595806586731589e-06, + "loss": 0.4771, + "step": 1498 + }, + { + "epoch": 1.07, + "grad_norm": 21.331660088580012, + "learning_rate": 8.59379778546115e-06, + "loss": 0.5522, + "step": 1499 + }, + { + "epoch": 1.07, + "grad_norm": 18.388963075279165, + "learning_rate": 8.591787783436073e-06, + "loss": 0.4834, + "step": 1500 + }, + { + "epoch": 1.07, + "eval_avg_AUC": 0.7334799976115187, + "eval_avg_Accuracy": 0.6713776525198939, + "eval_avg_Accuracy-right": 0.87544019825225, + "eval_avg_Accuracy-wrong": 0.31555606095064814, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6231060873343826, + "eval_last_AUC": 0.7557780548881416, + "eval_last_Accuracy": 0.6933438328912467, + "eval_last_Accuracy-right": 0.8278987870092605, + "eval_last_Accuracy-wrong": 0.45872185581078007, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6509237397272707, + "eval_max_AUC": 0.7050180895122209, + "eval_max_Accuracy": 0.6380968169761273, + "eval_max_Accuracy-right": 0.947045780618234, + "eval_max_Accuracy-wrong": 0.09938594496247441, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.5989895777078, + "eval_min_AUC": 0.7259579465041305, + "eval_min_Accuracy": 0.663834549071618, + "eval_min_Accuracy-right": 0.7148819616538411, + "eval_min_Accuracy-wrong": 0.574823743461451, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6227692066316574, + "eval_prod_AUC": 0.7346491876627528, + "eval_prod_Accuracy": 0.6056863395225465, + "eval_prod_Accuracy-right": 0.46087126646667537, + "eval_prod_Accuracy-wrong": 0.858198771889925, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6170978703493736, + "eval_runtime": 251.2444, + "eval_samples_per_second": 96.034, + "eval_steps_per_second": 3.001, + "eval_sum_AUC": 0.6155480756158993, + "eval_sum_Accuracy": 0.6353199602122016, + "eval_sum_Accuracy-right": 0.9979783487674448, + "eval_sum_Accuracy-wrong": 0.0029565612917898565, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6239868444598957, + "step": 1500 + }, + { + "epoch": 1.07, + "grad_norm": 18.466477606369253, + "learning_rate": 8.589776581327936e-06, + "loss": 0.5674, + "step": 1501 + }, + { + "epoch": 1.07, + "grad_norm": 13.612158852318972, + "learning_rate": 8.587764179808716e-06, + "loss": 0.4512, + "step": 1502 + }, + { + "epoch": 1.07, + "grad_norm": 15.617095094049752, + "learning_rate": 8.5857505795508e-06, + "loss": 0.4932, + "step": 1503 + }, + { + "epoch": 1.07, + "grad_norm": 20.816769693684805, + "learning_rate": 8.583735781226964e-06, + "loss": 0.4648, + "step": 1504 + }, + { + "epoch": 1.07, + "grad_norm": 12.235810035295055, + "learning_rate": 8.581719785510391e-06, + "loss": 0.4233, + "step": 1505 + }, + { + "epoch": 1.07, + "grad_norm": 14.94140620816876, + "learning_rate": 8.579702593074666e-06, + "loss": 0.5186, + "step": 1506 + }, + { + "epoch": 1.08, + "grad_norm": 14.10601483533534, + "learning_rate": 8.577684204593767e-06, + "loss": 0.5, + "step": 1507 + }, + { + "epoch": 1.08, + "grad_norm": 16.588955613794905, + "learning_rate": 8.575664620742073e-06, + "loss": 0.4282, + "step": 1508 + }, + { + "epoch": 1.08, + "grad_norm": 16.445563006577682, + "learning_rate": 8.57364384219437e-06, + "loss": 0.4819, + "step": 1509 + }, + { + "epoch": 1.08, + "grad_norm": 10.081966627737616, + "learning_rate": 8.571621869625835e-06, + "loss": 0.4707, + "step": 1510 + }, + { + "epoch": 1.08, + "grad_norm": 14.676101669447284, + "learning_rate": 8.569598703712045e-06, + "loss": 0.4351, + "step": 1511 + }, + { + "epoch": 1.08, + "grad_norm": 11.660950506422358, + "learning_rate": 8.56757434512898e-06, + "loss": 0.5244, + "step": 1512 + }, + { + "epoch": 1.08, + "grad_norm": 11.673297608958263, + "learning_rate": 8.565548794553016e-06, + "loss": 0.4507, + "step": 1513 + }, + { + "epoch": 1.08, + "grad_norm": 10.604090359301853, + "learning_rate": 8.563522052660925e-06, + "loss": 0.5532, + "step": 1514 + }, + { + "epoch": 1.08, + "grad_norm": 17.711609470198233, + "learning_rate": 8.561494120129878e-06, + "loss": 0.5186, + "step": 1515 + }, + { + "epoch": 1.08, + "grad_norm": 8.518970050965722, + "learning_rate": 8.55946499763745e-06, + "loss": 0.4624, + "step": 1516 + }, + { + "epoch": 1.08, + "grad_norm": 24.622164675000597, + "learning_rate": 8.557434685861604e-06, + "loss": 0.5537, + "step": 1517 + }, + { + "epoch": 1.08, + "grad_norm": 10.341729976026523, + "learning_rate": 8.555403185480706e-06, + "loss": 0.5444, + "step": 1518 + }, + { + "epoch": 1.08, + "grad_norm": 14.951861865208672, + "learning_rate": 8.553370497173518e-06, + "loss": 0.499, + "step": 1519 + }, + { + "epoch": 1.08, + "grad_norm": 6.798380638701046, + "learning_rate": 8.551336621619202e-06, + "loss": 0.4722, + "step": 1520 + }, + { + "epoch": 1.09, + "grad_norm": 10.1744411262223, + "learning_rate": 8.549301559497309e-06, + "loss": 0.4937, + "step": 1521 + }, + { + "epoch": 1.09, + "grad_norm": 14.356907776973655, + "learning_rate": 8.547265311487794e-06, + "loss": 0.4932, + "step": 1522 + }, + { + "epoch": 1.09, + "grad_norm": 11.570800954915105, + "learning_rate": 8.545227878271004e-06, + "loss": 0.5479, + "step": 1523 + }, + { + "epoch": 1.09, + "grad_norm": 8.945217800371761, + "learning_rate": 8.543189260527685e-06, + "loss": 0.4419, + "step": 1524 + }, + { + "epoch": 1.09, + "grad_norm": 15.810431743115078, + "learning_rate": 8.541149458938972e-06, + "loss": 0.5332, + "step": 1525 + }, + { + "epoch": 1.09, + "grad_norm": 15.488127989967904, + "learning_rate": 8.539108474186408e-06, + "loss": 0.4951, + "step": 1526 + }, + { + "epoch": 1.09, + "grad_norm": 7.5250746108148165, + "learning_rate": 8.53706630695192e-06, + "loss": 0.4365, + "step": 1527 + }, + { + "epoch": 1.09, + "grad_norm": 10.090581047201338, + "learning_rate": 8.535022957917833e-06, + "loss": 0.4536, + "step": 1528 + }, + { + "epoch": 1.09, + "grad_norm": 12.713255870212436, + "learning_rate": 8.53297842776687e-06, + "loss": 0.5649, + "step": 1529 + }, + { + "epoch": 1.09, + "grad_norm": 14.611298003245253, + "learning_rate": 8.530932717182148e-06, + "loss": 0.5117, + "step": 1530 + }, + { + "epoch": 1.09, + "grad_norm": 9.796700782725793, + "learning_rate": 8.528885826847173e-06, + "loss": 0.4463, + "step": 1531 + }, + { + "epoch": 1.09, + "grad_norm": 10.108308604467041, + "learning_rate": 8.52683775744585e-06, + "loss": 0.416, + "step": 1532 + }, + { + "epoch": 1.09, + "grad_norm": 13.550989790783058, + "learning_rate": 8.524788509662478e-06, + "loss": 0.4971, + "step": 1533 + }, + { + "epoch": 1.09, + "grad_norm": 13.558016423382215, + "learning_rate": 8.522738084181749e-06, + "loss": 0.5479, + "step": 1534 + }, + { + "epoch": 1.1, + "grad_norm": 9.741126259708553, + "learning_rate": 8.52068648168875e-06, + "loss": 0.4404, + "step": 1535 + }, + { + "epoch": 1.1, + "grad_norm": 13.33677197442007, + "learning_rate": 8.518633702868955e-06, + "loss": 0.4131, + "step": 1536 + }, + { + "epoch": 1.1, + "grad_norm": 12.431400081236161, + "learning_rate": 8.516579748408237e-06, + "loss": 0.4629, + "step": 1537 + }, + { + "epoch": 1.1, + "grad_norm": 11.038672006100304, + "learning_rate": 8.514524618992864e-06, + "loss": 0.4155, + "step": 1538 + }, + { + "epoch": 1.1, + "grad_norm": 14.732435386054672, + "learning_rate": 8.51246831530949e-06, + "loss": 0.4648, + "step": 1539 + }, + { + "epoch": 1.1, + "grad_norm": 18.788487706437273, + "learning_rate": 8.510410838045165e-06, + "loss": 0.4658, + "step": 1540 + }, + { + "epoch": 1.1, + "grad_norm": 13.614847193707238, + "learning_rate": 8.508352187887329e-06, + "loss": 0.4868, + "step": 1541 + }, + { + "epoch": 1.1, + "grad_norm": 9.621493961264449, + "learning_rate": 8.506292365523816e-06, + "loss": 0.4014, + "step": 1542 + }, + { + "epoch": 1.1, + "grad_norm": 13.908663513591653, + "learning_rate": 8.504231371642852e-06, + "loss": 0.5405, + "step": 1543 + }, + { + "epoch": 1.1, + "grad_norm": 13.103135984089345, + "learning_rate": 8.502169206933053e-06, + "loss": 0.4414, + "step": 1544 + }, + { + "epoch": 1.1, + "grad_norm": 19.697544626949995, + "learning_rate": 8.500105872083424e-06, + "loss": 0.4463, + "step": 1545 + }, + { + "epoch": 1.1, + "grad_norm": 19.462066133716373, + "learning_rate": 8.498041367783367e-06, + "loss": 0.4731, + "step": 1546 + }, + { + "epoch": 1.1, + "grad_norm": 11.778716367020898, + "learning_rate": 8.49597569472267e-06, + "loss": 0.5264, + "step": 1547 + }, + { + "epoch": 1.1, + "grad_norm": 9.104157085710243, + "learning_rate": 8.493908853591515e-06, + "loss": 0.4722, + "step": 1548 + }, + { + "epoch": 1.11, + "grad_norm": 16.793898810134827, + "learning_rate": 8.491840845080467e-06, + "loss": 0.6045, + "step": 1549 + }, + { + "epoch": 1.11, + "grad_norm": 13.705523280095832, + "learning_rate": 8.489771669880489e-06, + "loss": 0.5137, + "step": 1550 + }, + { + "epoch": 1.11, + "grad_norm": 11.332670535302874, + "learning_rate": 8.487701328682932e-06, + "loss": 0.4795, + "step": 1551 + }, + { + "epoch": 1.11, + "grad_norm": 8.589426733756543, + "learning_rate": 8.485629822179533e-06, + "loss": 0.4575, + "step": 1552 + }, + { + "epoch": 1.11, + "grad_norm": 13.353226183033948, + "learning_rate": 8.483557151062423e-06, + "loss": 0.4497, + "step": 1553 + }, + { + "epoch": 1.11, + "grad_norm": 9.18956399500516, + "learning_rate": 8.481483316024117e-06, + "loss": 0.4678, + "step": 1554 + }, + { + "epoch": 1.11, + "grad_norm": 16.368944548115262, + "learning_rate": 8.479408317757525e-06, + "loss": 0.5283, + "step": 1555 + }, + { + "epoch": 1.11, + "grad_norm": 10.054711315944372, + "learning_rate": 8.477332156955942e-06, + "loss": 0.5078, + "step": 1556 + }, + { + "epoch": 1.11, + "grad_norm": 10.511879702200282, + "learning_rate": 8.475254834313051e-06, + "loss": 0.5015, + "step": 1557 + }, + { + "epoch": 1.11, + "grad_norm": 8.3864182168499, + "learning_rate": 8.473176350522925e-06, + "loss": 0.4126, + "step": 1558 + }, + { + "epoch": 1.11, + "grad_norm": 10.192652705648785, + "learning_rate": 8.471096706280022e-06, + "loss": 0.5127, + "step": 1559 + }, + { + "epoch": 1.11, + "grad_norm": 11.215018804441314, + "learning_rate": 8.469015902279191e-06, + "loss": 0.4111, + "step": 1560 + }, + { + "epoch": 1.11, + "grad_norm": 8.060201902069851, + "learning_rate": 8.466933939215669e-06, + "loss": 0.4883, + "step": 1561 + }, + { + "epoch": 1.11, + "grad_norm": 11.696202616513581, + "learning_rate": 8.464850817785075e-06, + "loss": 0.4199, + "step": 1562 + }, + { + "epoch": 1.12, + "grad_norm": 10.514532333226486, + "learning_rate": 8.462766538683422e-06, + "loss": 0.4536, + "step": 1563 + }, + { + "epoch": 1.12, + "grad_norm": 9.329536103759395, + "learning_rate": 8.460681102607106e-06, + "loss": 0.4072, + "step": 1564 + }, + { + "epoch": 1.12, + "grad_norm": 18.684227028359807, + "learning_rate": 8.45859451025291e-06, + "loss": 0.6074, + "step": 1565 + }, + { + "epoch": 1.12, + "grad_norm": 25.38421632073626, + "learning_rate": 8.456506762317998e-06, + "loss": 0.7139, + "step": 1566 + }, + { + "epoch": 1.12, + "grad_norm": 14.286401468541102, + "learning_rate": 8.454417859499932e-06, + "loss": 0.5562, + "step": 1567 + }, + { + "epoch": 1.12, + "grad_norm": 10.597284316583334, + "learning_rate": 8.45232780249665e-06, + "loss": 0.4604, + "step": 1568 + }, + { + "epoch": 1.12, + "grad_norm": 16.314962157168296, + "learning_rate": 8.450236592006481e-06, + "loss": 0.4844, + "step": 1569 + }, + { + "epoch": 1.12, + "grad_norm": 10.11769377664577, + "learning_rate": 8.448144228728135e-06, + "loss": 0.4971, + "step": 1570 + }, + { + "epoch": 1.12, + "grad_norm": 15.673405541056715, + "learning_rate": 8.446050713360711e-06, + "loss": 0.4473, + "step": 1571 + }, + { + "epoch": 1.12, + "grad_norm": 12.223436085969732, + "learning_rate": 8.443956046603692e-06, + "loss": 0.54, + "step": 1572 + }, + { + "epoch": 1.12, + "grad_norm": 8.801140278815467, + "learning_rate": 8.441860229156944e-06, + "loss": 0.4429, + "step": 1573 + }, + { + "epoch": 1.12, + "grad_norm": 15.623164408153567, + "learning_rate": 8.439763261720716e-06, + "loss": 0.6367, + "step": 1574 + }, + { + "epoch": 1.12, + "grad_norm": 12.285922227520546, + "learning_rate": 8.43766514499565e-06, + "loss": 0.54, + "step": 1575 + }, + { + "epoch": 1.12, + "grad_norm": 10.746100363903965, + "learning_rate": 8.435565879682759e-06, + "loss": 0.5107, + "step": 1576 + }, + { + "epoch": 1.13, + "grad_norm": 9.979137821257687, + "learning_rate": 8.433465466483452e-06, + "loss": 0.5464, + "step": 1577 + }, + { + "epoch": 1.13, + "grad_norm": 7.598799151468013, + "learning_rate": 8.431363906099513e-06, + "loss": 0.4863, + "step": 1578 + }, + { + "epoch": 1.13, + "grad_norm": 8.105699816627178, + "learning_rate": 8.429261199233114e-06, + "loss": 0.4521, + "step": 1579 + }, + { + "epoch": 1.13, + "grad_norm": 16.9180976781052, + "learning_rate": 8.427157346586807e-06, + "loss": 0.4756, + "step": 1580 + }, + { + "epoch": 1.13, + "grad_norm": 15.86495838313817, + "learning_rate": 8.42505234886353e-06, + "loss": 0.4922, + "step": 1581 + }, + { + "epoch": 1.13, + "grad_norm": 8.545267152082388, + "learning_rate": 8.422946206766598e-06, + "loss": 0.4888, + "step": 1582 + }, + { + "epoch": 1.13, + "grad_norm": 7.713434930793355, + "learning_rate": 8.420838920999718e-06, + "loss": 0.416, + "step": 1583 + }, + { + "epoch": 1.13, + "grad_norm": 16.76577412131441, + "learning_rate": 8.418730492266968e-06, + "loss": 0.5044, + "step": 1584 + }, + { + "epoch": 1.13, + "grad_norm": 7.502799153950787, + "learning_rate": 8.416620921272818e-06, + "loss": 0.4326, + "step": 1585 + }, + { + "epoch": 1.13, + "grad_norm": 15.461253439409639, + "learning_rate": 8.414510208722111e-06, + "loss": 0.5684, + "step": 1586 + }, + { + "epoch": 1.13, + "grad_norm": 19.10348435047004, + "learning_rate": 8.412398355320078e-06, + "loss": 0.5229, + "step": 1587 + }, + { + "epoch": 1.13, + "grad_norm": 12.506878119981076, + "learning_rate": 8.410285361772328e-06, + "loss": 0.4419, + "step": 1588 + }, + { + "epoch": 1.13, + "grad_norm": 9.761222151604644, + "learning_rate": 8.408171228784847e-06, + "loss": 0.4199, + "step": 1589 + }, + { + "epoch": 1.13, + "grad_norm": 11.759892100205057, + "learning_rate": 8.406055957064014e-06, + "loss": 0.4717, + "step": 1590 + }, + { + "epoch": 1.14, + "grad_norm": 15.051203228423255, + "learning_rate": 8.403939547316576e-06, + "loss": 0.4541, + "step": 1591 + }, + { + "epoch": 1.14, + "grad_norm": 8.735328118613461, + "learning_rate": 8.401822000249661e-06, + "loss": 0.4087, + "step": 1592 + }, + { + "epoch": 1.14, + "grad_norm": 9.260653736059055, + "learning_rate": 8.399703316570788e-06, + "loss": 0.4463, + "step": 1593 + }, + { + "epoch": 1.14, + "grad_norm": 18.10976108191569, + "learning_rate": 8.397583496987846e-06, + "loss": 0.519, + "step": 1594 + }, + { + "epoch": 1.14, + "grad_norm": 12.768171024664671, + "learning_rate": 8.395462542209106e-06, + "loss": 0.4648, + "step": 1595 + }, + { + "epoch": 1.14, + "grad_norm": 9.026676798546331, + "learning_rate": 8.393340452943219e-06, + "loss": 0.501, + "step": 1596 + }, + { + "epoch": 1.14, + "grad_norm": 12.16743911627819, + "learning_rate": 8.391217229899211e-06, + "loss": 0.5093, + "step": 1597 + }, + { + "epoch": 1.14, + "grad_norm": 7.830636260029298, + "learning_rate": 8.389092873786495e-06, + "loss": 0.375, + "step": 1598 + }, + { + "epoch": 1.14, + "grad_norm": 12.20546214650608, + "learning_rate": 8.386967385314857e-06, + "loss": 0.4756, + "step": 1599 + }, + { + "epoch": 1.14, + "grad_norm": 8.439896508899231, + "learning_rate": 8.384840765194458e-06, + "loss": 0.4346, + "step": 1600 + }, + { + "epoch": 1.14, + "grad_norm": 19.327934158649416, + "learning_rate": 8.382713014135846e-06, + "loss": 0.6797, + "step": 1601 + }, + { + "epoch": 1.14, + "grad_norm": 9.885276182041746, + "learning_rate": 8.38058413284994e-06, + "loss": 0.5352, + "step": 1602 + }, + { + "epoch": 1.14, + "grad_norm": 10.596961361196003, + "learning_rate": 8.37845412204804e-06, + "loss": 0.4497, + "step": 1603 + }, + { + "epoch": 1.14, + "grad_norm": 11.019827177632346, + "learning_rate": 8.376322982441821e-06, + "loss": 0.4795, + "step": 1604 + }, + { + "epoch": 1.15, + "grad_norm": 10.122204287350154, + "learning_rate": 8.374190714743338e-06, + "loss": 0.3926, + "step": 1605 + }, + { + "epoch": 1.15, + "grad_norm": 13.991030964363688, + "learning_rate": 8.37205731966502e-06, + "loss": 0.4463, + "step": 1606 + }, + { + "epoch": 1.15, + "grad_norm": 8.075319370068696, + "learning_rate": 8.369922797919672e-06, + "loss": 0.395, + "step": 1607 + }, + { + "epoch": 1.15, + "grad_norm": 11.044817706502199, + "learning_rate": 8.367787150220481e-06, + "loss": 0.4814, + "step": 1608 + }, + { + "epoch": 1.15, + "grad_norm": 9.41670683342002, + "learning_rate": 8.365650377281004e-06, + "loss": 0.4272, + "step": 1609 + }, + { + "epoch": 1.15, + "grad_norm": 15.173314309019913, + "learning_rate": 8.36351247981518e-06, + "loss": 0.4678, + "step": 1610 + }, + { + "epoch": 1.15, + "grad_norm": 11.550970965767231, + "learning_rate": 8.361373458537316e-06, + "loss": 0.374, + "step": 1611 + }, + { + "epoch": 1.15, + "grad_norm": 11.034925091044531, + "learning_rate": 8.359233314162102e-06, + "loss": 0.439, + "step": 1612 + }, + { + "epoch": 1.15, + "grad_norm": 8.425295339263013, + "learning_rate": 8.357092047404598e-06, + "loss": 0.3662, + "step": 1613 + }, + { + "epoch": 1.15, + "grad_norm": 10.31408262861963, + "learning_rate": 8.354949658980243e-06, + "loss": 0.4409, + "step": 1614 + }, + { + "epoch": 1.15, + "grad_norm": 12.260734469839917, + "learning_rate": 8.352806149604847e-06, + "loss": 0.3794, + "step": 1615 + }, + { + "epoch": 1.15, + "grad_norm": 14.468850196741588, + "learning_rate": 8.350661519994596e-06, + "loss": 0.6748, + "step": 1616 + }, + { + "epoch": 1.15, + "grad_norm": 21.792508635270057, + "learning_rate": 8.348515770866051e-06, + "loss": 0.564, + "step": 1617 + }, + { + "epoch": 1.15, + "grad_norm": 10.705560827563696, + "learning_rate": 8.346368902936149e-06, + "loss": 0.5049, + "step": 1618 + }, + { + "epoch": 1.16, + "grad_norm": 10.253537791784684, + "learning_rate": 8.344220916922195e-06, + "loss": 0.4131, + "step": 1619 + }, + { + "epoch": 1.16, + "grad_norm": 9.961687115125363, + "learning_rate": 8.342071813541873e-06, + "loss": 0.3879, + "step": 1620 + }, + { + "epoch": 1.16, + "grad_norm": 21.258364878364446, + "learning_rate": 8.339921593513239e-06, + "loss": 0.5815, + "step": 1621 + }, + { + "epoch": 1.16, + "grad_norm": 8.105088701540607, + "learning_rate": 8.337770257554721e-06, + "loss": 0.4336, + "step": 1622 + }, + { + "epoch": 1.16, + "grad_norm": 10.40871548874701, + "learning_rate": 8.335617806385119e-06, + "loss": 0.3545, + "step": 1623 + }, + { + "epoch": 1.16, + "grad_norm": 8.767330324784009, + "learning_rate": 8.333464240723608e-06, + "loss": 0.4673, + "step": 1624 + }, + { + "epoch": 1.16, + "grad_norm": 11.648199165290603, + "learning_rate": 8.331309561289734e-06, + "loss": 0.4395, + "step": 1625 + }, + { + "epoch": 1.16, + "grad_norm": 11.32741559592417, + "learning_rate": 8.329153768803415e-06, + "loss": 0.4731, + "step": 1626 + }, + { + "epoch": 1.16, + "grad_norm": 12.778161659116726, + "learning_rate": 8.326996863984942e-06, + "loss": 0.5933, + "step": 1627 + }, + { + "epoch": 1.16, + "grad_norm": 12.348733791355947, + "learning_rate": 8.324838847554976e-06, + "loss": 0.4395, + "step": 1628 + }, + { + "epoch": 1.16, + "grad_norm": 12.740098579497312, + "learning_rate": 8.322679720234553e-06, + "loss": 0.3857, + "step": 1629 + }, + { + "epoch": 1.16, + "grad_norm": 23.715818168299126, + "learning_rate": 8.320519482745076e-06, + "loss": 0.4917, + "step": 1630 + }, + { + "epoch": 1.16, + "grad_norm": 11.55961555884494, + "learning_rate": 8.31835813580832e-06, + "loss": 0.416, + "step": 1631 + }, + { + "epoch": 1.16, + "grad_norm": 17.584012997946722, + "learning_rate": 8.316195680146431e-06, + "loss": 0.5322, + "step": 1632 + }, + { + "epoch": 1.17, + "grad_norm": 20.513206719088476, + "learning_rate": 8.314032116481927e-06, + "loss": 0.5117, + "step": 1633 + }, + { + "epoch": 1.17, + "grad_norm": 15.048893376656899, + "learning_rate": 8.311867445537694e-06, + "loss": 0.4272, + "step": 1634 + }, + { + "epoch": 1.17, + "grad_norm": 9.270871090848454, + "learning_rate": 8.30970166803699e-06, + "loss": 0.4766, + "step": 1635 + }, + { + "epoch": 1.17, + "grad_norm": 10.569630501193533, + "learning_rate": 8.307534784703438e-06, + "loss": 0.4082, + "step": 1636 + }, + { + "epoch": 1.17, + "grad_norm": 12.423256634220007, + "learning_rate": 8.305366796261036e-06, + "loss": 0.4019, + "step": 1637 + }, + { + "epoch": 1.17, + "grad_norm": 19.689591987413117, + "learning_rate": 8.303197703434151e-06, + "loss": 0.5371, + "step": 1638 + }, + { + "epoch": 1.17, + "grad_norm": 16.2646267025812, + "learning_rate": 8.301027506947516e-06, + "loss": 0.5225, + "step": 1639 + }, + { + "epoch": 1.17, + "grad_norm": 10.004304702541242, + "learning_rate": 8.298856207526234e-06, + "loss": 0.5005, + "step": 1640 + }, + { + "epoch": 1.17, + "grad_norm": 9.275278524089247, + "learning_rate": 8.296683805895777e-06, + "loss": 0.4683, + "step": 1641 + }, + { + "epoch": 1.17, + "grad_norm": 15.349760402537493, + "learning_rate": 8.294510302781984e-06, + "loss": 0.4644, + "step": 1642 + }, + { + "epoch": 1.17, + "grad_norm": 14.34559489538677, + "learning_rate": 8.29233569891106e-06, + "loss": 0.4131, + "step": 1643 + }, + { + "epoch": 1.17, + "grad_norm": 15.09744536399142, + "learning_rate": 8.290159995009586e-06, + "loss": 0.4858, + "step": 1644 + }, + { + "epoch": 1.17, + "grad_norm": 8.573866063978919, + "learning_rate": 8.2879831918045e-06, + "loss": 0.4844, + "step": 1645 + }, + { + "epoch": 1.17, + "grad_norm": 15.655351070066807, + "learning_rate": 8.285805290023119e-06, + "loss": 0.4937, + "step": 1646 + }, + { + "epoch": 1.18, + "grad_norm": 8.922336465989796, + "learning_rate": 8.283626290393112e-06, + "loss": 0.5044, + "step": 1647 + }, + { + "epoch": 1.18, + "grad_norm": 8.600737753837139, + "learning_rate": 8.28144619364253e-06, + "loss": 0.4478, + "step": 1648 + }, + { + "epoch": 1.18, + "grad_norm": 21.9710210367027, + "learning_rate": 8.279265000499783e-06, + "loss": 0.5781, + "step": 1649 + }, + { + "epoch": 1.18, + "grad_norm": 9.960701863701635, + "learning_rate": 8.277082711693645e-06, + "loss": 0.5278, + "step": 1650 + }, + { + "epoch": 1.18, + "grad_norm": 11.16288939323412, + "learning_rate": 8.274899327953261e-06, + "loss": 0.4927, + "step": 1651 + }, + { + "epoch": 1.18, + "grad_norm": 28.608511125234113, + "learning_rate": 8.272714850008142e-06, + "loss": 0.6494, + "step": 1652 + }, + { + "epoch": 1.18, + "grad_norm": 7.7112136057185765, + "learning_rate": 8.270529278588158e-06, + "loss": 0.4153, + "step": 1653 + }, + { + "epoch": 1.18, + "grad_norm": 8.677455868257633, + "learning_rate": 8.268342614423553e-06, + "loss": 0.4663, + "step": 1654 + }, + { + "epoch": 1.18, + "grad_norm": 17.614598532387653, + "learning_rate": 8.26615485824493e-06, + "loss": 0.4663, + "step": 1655 + }, + { + "epoch": 1.18, + "grad_norm": 8.086291528498144, + "learning_rate": 8.263966010783259e-06, + "loss": 0.4448, + "step": 1656 + }, + { + "epoch": 1.18, + "grad_norm": 7.52793141998646, + "learning_rate": 8.261776072769878e-06, + "loss": 0.4453, + "step": 1657 + }, + { + "epoch": 1.18, + "grad_norm": 10.71379532658703, + "learning_rate": 8.259585044936484e-06, + "loss": 0.4429, + "step": 1658 + }, + { + "epoch": 1.18, + "grad_norm": 11.139946386227027, + "learning_rate": 8.257392928015138e-06, + "loss": 0.4644, + "step": 1659 + }, + { + "epoch": 1.18, + "grad_norm": 12.658585729760876, + "learning_rate": 8.25519972273827e-06, + "loss": 0.4727, + "step": 1660 + }, + { + "epoch": 1.19, + "grad_norm": 10.794428336250439, + "learning_rate": 8.253005429838667e-06, + "loss": 0.4209, + "step": 1661 + }, + { + "epoch": 1.19, + "grad_norm": 10.775021748055948, + "learning_rate": 8.250810050049488e-06, + "loss": 0.4678, + "step": 1662 + }, + { + "epoch": 1.19, + "grad_norm": 14.921459024373188, + "learning_rate": 8.248613584104245e-06, + "loss": 0.4731, + "step": 1663 + }, + { + "epoch": 1.19, + "grad_norm": 10.454721792025683, + "learning_rate": 8.246416032736824e-06, + "loss": 0.4658, + "step": 1664 + }, + { + "epoch": 1.19, + "grad_norm": 8.732194232724195, + "learning_rate": 8.244217396681461e-06, + "loss": 0.3638, + "step": 1665 + }, + { + "epoch": 1.19, + "grad_norm": 12.820566534030801, + "learning_rate": 8.242017676672766e-06, + "loss": 0.4893, + "step": 1666 + }, + { + "epoch": 1.19, + "grad_norm": 20.00873769637777, + "learning_rate": 8.239816873445705e-06, + "loss": 0.4873, + "step": 1667 + }, + { + "epoch": 1.19, + "grad_norm": 10.979873575388726, + "learning_rate": 8.237614987735607e-06, + "loss": 0.3708, + "step": 1668 + }, + { + "epoch": 1.19, + "grad_norm": 13.872748317763673, + "learning_rate": 8.235412020278164e-06, + "loss": 0.397, + "step": 1669 + }, + { + "epoch": 1.19, + "grad_norm": 16.271402719404538, + "learning_rate": 8.233207971809427e-06, + "loss": 0.3921, + "step": 1670 + }, + { + "epoch": 1.19, + "grad_norm": 10.329697166946847, + "learning_rate": 8.23100284306581e-06, + "loss": 0.3176, + "step": 1671 + }, + { + "epoch": 1.19, + "grad_norm": 18.026001535157672, + "learning_rate": 8.228796634784086e-06, + "loss": 0.5127, + "step": 1672 + }, + { + "epoch": 1.19, + "grad_norm": 16.380805016305548, + "learning_rate": 8.226589347701396e-06, + "loss": 0.3657, + "step": 1673 + }, + { + "epoch": 1.19, + "grad_norm": 38.080808210850535, + "learning_rate": 8.224380982555226e-06, + "loss": 0.4443, + "step": 1674 + }, + { + "epoch": 1.2, + "grad_norm": 18.3361844592776, + "learning_rate": 8.222171540083442e-06, + "loss": 0.5322, + "step": 1675 + }, + { + "epoch": 1.2, + "grad_norm": 13.113199868710224, + "learning_rate": 8.219961021024251e-06, + "loss": 0.4336, + "step": 1676 + }, + { + "epoch": 1.2, + "grad_norm": 12.666400780810209, + "learning_rate": 8.217749426116238e-06, + "loss": 0.5059, + "step": 1677 + }, + { + "epoch": 1.2, + "grad_norm": 8.936998665809853, + "learning_rate": 8.215536756098327e-06, + "loss": 0.4058, + "step": 1678 + }, + { + "epoch": 1.2, + "grad_norm": 17.828959837607, + "learning_rate": 8.21332301170982e-06, + "loss": 0.4595, + "step": 1679 + }, + { + "epoch": 1.2, + "grad_norm": 10.325958282764173, + "learning_rate": 8.211108193690369e-06, + "loss": 0.4141, + "step": 1680 + }, + { + "epoch": 1.2, + "grad_norm": 12.793987371835103, + "learning_rate": 8.208892302779982e-06, + "loss": 0.5151, + "step": 1681 + }, + { + "epoch": 1.2, + "grad_norm": 13.843795834098652, + "learning_rate": 8.206675339719034e-06, + "loss": 0.4849, + "step": 1682 + }, + { + "epoch": 1.2, + "grad_norm": 15.74955973168439, + "learning_rate": 8.204457305248253e-06, + "loss": 0.499, + "step": 1683 + }, + { + "epoch": 1.2, + "grad_norm": 12.444677514125921, + "learning_rate": 8.202238200108721e-06, + "loss": 0.5122, + "step": 1684 + }, + { + "epoch": 1.2, + "grad_norm": 11.718027155024345, + "learning_rate": 8.200018025041887e-06, + "loss": 0.501, + "step": 1685 + }, + { + "epoch": 1.2, + "grad_norm": 13.98702758856593, + "learning_rate": 8.19779678078955e-06, + "loss": 0.4941, + "step": 1686 + }, + { + "epoch": 1.2, + "grad_norm": 20.583545079481684, + "learning_rate": 8.195574468093872e-06, + "loss": 0.4937, + "step": 1687 + }, + { + "epoch": 1.2, + "grad_norm": 10.339228146259074, + "learning_rate": 8.193351087697366e-06, + "loss": 0.4468, + "step": 1688 + }, + { + "epoch": 1.21, + "grad_norm": 14.124375880053021, + "learning_rate": 8.191126640342906e-06, + "loss": 0.4336, + "step": 1689 + }, + { + "epoch": 1.21, + "grad_norm": 11.822649187564727, + "learning_rate": 8.18890112677372e-06, + "loss": 0.4316, + "step": 1690 + }, + { + "epoch": 1.21, + "grad_norm": 12.84834184460974, + "learning_rate": 8.186674547733398e-06, + "loss": 0.5522, + "step": 1691 + }, + { + "epoch": 1.21, + "grad_norm": 20.928740644270853, + "learning_rate": 8.184446903965875e-06, + "loss": 0.4897, + "step": 1692 + }, + { + "epoch": 1.21, + "grad_norm": 9.767092152016358, + "learning_rate": 8.182218196215452e-06, + "loss": 0.5, + "step": 1693 + }, + { + "epoch": 1.21, + "grad_norm": 11.47958841486727, + "learning_rate": 8.17998842522678e-06, + "loss": 0.417, + "step": 1694 + }, + { + "epoch": 1.21, + "grad_norm": 11.010687951365263, + "learning_rate": 8.17775759174487e-06, + "loss": 0.5488, + "step": 1695 + }, + { + "epoch": 1.21, + "grad_norm": 18.742980213365875, + "learning_rate": 8.17552569651508e-06, + "loss": 0.4692, + "step": 1696 + }, + { + "epoch": 1.21, + "grad_norm": 13.073207754471264, + "learning_rate": 8.173292740283135e-06, + "loss": 0.48, + "step": 1697 + }, + { + "epoch": 1.21, + "grad_norm": 10.048458414659237, + "learning_rate": 8.171058723795097e-06, + "loss": 0.4868, + "step": 1698 + }, + { + "epoch": 1.21, + "grad_norm": 18.883107102405994, + "learning_rate": 8.168823647797401e-06, + "loss": 0.5278, + "step": 1699 + }, + { + "epoch": 1.21, + "grad_norm": 12.307579695529716, + "learning_rate": 8.166587513036826e-06, + "loss": 0.5342, + "step": 1700 + }, + { + "epoch": 1.21, + "grad_norm": 14.099245148537669, + "learning_rate": 8.164350320260502e-06, + "loss": 0.3953, + "step": 1701 + }, + { + "epoch": 1.21, + "grad_norm": 20.941909833565685, + "learning_rate": 8.16211207021592e-06, + "loss": 0.5112, + "step": 1702 + }, + { + "epoch": 1.22, + "grad_norm": 7.820604772453076, + "learning_rate": 8.15987276365092e-06, + "loss": 0.4077, + "step": 1703 + }, + { + "epoch": 1.22, + "grad_norm": 9.473817682607748, + "learning_rate": 8.157632401313696e-06, + "loss": 0.479, + "step": 1704 + }, + { + "epoch": 1.22, + "grad_norm": 10.74411226039115, + "learning_rate": 8.155390983952795e-06, + "loss": 0.5112, + "step": 1705 + }, + { + "epoch": 1.22, + "grad_norm": 7.227050478751384, + "learning_rate": 8.153148512317117e-06, + "loss": 0.3857, + "step": 1706 + }, + { + "epoch": 1.22, + "grad_norm": 15.200446672036492, + "learning_rate": 8.150904987155911e-06, + "loss": 0.5029, + "step": 1707 + }, + { + "epoch": 1.22, + "grad_norm": 17.4110528802431, + "learning_rate": 8.148660409218786e-06, + "loss": 0.5918, + "step": 1708 + }, + { + "epoch": 1.22, + "grad_norm": 8.448382094255253, + "learning_rate": 8.146414779255689e-06, + "loss": 0.4707, + "step": 1709 + }, + { + "epoch": 1.22, + "grad_norm": 7.909316780039679, + "learning_rate": 8.144168098016933e-06, + "loss": 0.4331, + "step": 1710 + }, + { + "epoch": 1.22, + "grad_norm": 12.433238461814103, + "learning_rate": 8.141920366253173e-06, + "loss": 0.5918, + "step": 1711 + }, + { + "epoch": 1.22, + "grad_norm": 10.771308440459347, + "learning_rate": 8.139671584715419e-06, + "loss": 0.5146, + "step": 1712 + }, + { + "epoch": 1.22, + "grad_norm": 9.892728317342609, + "learning_rate": 8.137421754155031e-06, + "loss": 0.5664, + "step": 1713 + }, + { + "epoch": 1.22, + "grad_norm": 12.318633500646037, + "learning_rate": 8.13517087532372e-06, + "loss": 0.4043, + "step": 1714 + }, + { + "epoch": 1.22, + "grad_norm": 9.17077151093477, + "learning_rate": 8.132918948973543e-06, + "loss": 0.4736, + "step": 1715 + }, + { + "epoch": 1.22, + "grad_norm": 20.996362868045267, + "learning_rate": 8.130665975856913e-06, + "loss": 0.5215, + "step": 1716 + }, + { + "epoch": 1.23, + "grad_norm": 11.616605183075755, + "learning_rate": 8.128411956726592e-06, + "loss": 0.5415, + "step": 1717 + }, + { + "epoch": 1.23, + "grad_norm": 9.643006691319318, + "learning_rate": 8.126156892335686e-06, + "loss": 0.4834, + "step": 1718 + }, + { + "epoch": 1.23, + "grad_norm": 11.195645679300869, + "learning_rate": 8.123900783437655e-06, + "loss": 0.5327, + "step": 1719 + }, + { + "epoch": 1.23, + "grad_norm": 13.905175950266107, + "learning_rate": 8.121643630786308e-06, + "loss": 0.5674, + "step": 1720 + }, + { + "epoch": 1.23, + "grad_norm": 11.886407787375669, + "learning_rate": 8.1193854351358e-06, + "loss": 0.4756, + "step": 1721 + }, + { + "epoch": 1.23, + "grad_norm": 13.66213829319637, + "learning_rate": 8.11712619724064e-06, + "loss": 0.4854, + "step": 1722 + }, + { + "epoch": 1.23, + "grad_norm": 11.919219445746402, + "learning_rate": 8.114865917855676e-06, + "loss": 0.4976, + "step": 1723 + }, + { + "epoch": 1.23, + "grad_norm": 11.518986982659731, + "learning_rate": 8.112604597736113e-06, + "loss": 0.4453, + "step": 1724 + }, + { + "epoch": 1.23, + "grad_norm": 8.744337753421034, + "learning_rate": 8.110342237637501e-06, + "loss": 0.4009, + "step": 1725 + }, + { + "epoch": 1.23, + "grad_norm": 14.907567080982792, + "learning_rate": 8.108078838315732e-06, + "loss": 0.4307, + "step": 1726 + }, + { + "epoch": 1.23, + "grad_norm": 14.419935864909833, + "learning_rate": 8.105814400527052e-06, + "loss": 0.415, + "step": 1727 + }, + { + "epoch": 1.23, + "grad_norm": 13.496398651006333, + "learning_rate": 8.103548925028054e-06, + "loss": 0.5171, + "step": 1728 + }, + { + "epoch": 1.23, + "grad_norm": 11.775365355328299, + "learning_rate": 8.101282412575673e-06, + "loss": 0.4819, + "step": 1729 + }, + { + "epoch": 1.23, + "grad_norm": 10.67720477465964, + "learning_rate": 8.099014863927192e-06, + "loss": 0.3726, + "step": 1730 + }, + { + "epoch": 1.24, + "grad_norm": 10.790063693984372, + "learning_rate": 8.096746279840245e-06, + "loss": 0.4429, + "step": 1731 + }, + { + "epoch": 1.24, + "grad_norm": 15.660047686678814, + "learning_rate": 8.094476661072806e-06, + "loss": 0.4814, + "step": 1732 + }, + { + "epoch": 1.24, + "grad_norm": 20.596433199232035, + "learning_rate": 8.092206008383195e-06, + "loss": 0.5518, + "step": 1733 + }, + { + "epoch": 1.24, + "grad_norm": 16.474511824070156, + "learning_rate": 8.089934322530082e-06, + "loss": 0.5264, + "step": 1734 + }, + { + "epoch": 1.24, + "grad_norm": 14.071167596382951, + "learning_rate": 8.087661604272477e-06, + "loss": 0.5366, + "step": 1735 + }, + { + "epoch": 1.24, + "grad_norm": 10.404014720227183, + "learning_rate": 8.08538785436974e-06, + "loss": 0.5059, + "step": 1736 + }, + { + "epoch": 1.24, + "grad_norm": 15.235368526664493, + "learning_rate": 8.08311307358157e-06, + "loss": 0.5933, + "step": 1737 + }, + { + "epoch": 1.24, + "grad_norm": 11.232707612320379, + "learning_rate": 8.080837262668017e-06, + "loss": 0.4341, + "step": 1738 + }, + { + "epoch": 1.24, + "grad_norm": 12.513187841921981, + "learning_rate": 8.078560422389472e-06, + "loss": 0.4668, + "step": 1739 + }, + { + "epoch": 1.24, + "grad_norm": 8.808341804249373, + "learning_rate": 8.076282553506664e-06, + "loss": 0.4595, + "step": 1740 + }, + { + "epoch": 1.24, + "grad_norm": 8.722292004822773, + "learning_rate": 8.074003656780678e-06, + "loss": 0.5083, + "step": 1741 + }, + { + "epoch": 1.24, + "grad_norm": 9.773723618455808, + "learning_rate": 8.071723732972933e-06, + "loss": 0.499, + "step": 1742 + }, + { + "epoch": 1.24, + "grad_norm": 18.009490592834382, + "learning_rate": 8.069442782845191e-06, + "loss": 0.6309, + "step": 1743 + }, + { + "epoch": 1.24, + "grad_norm": 9.228749331770816, + "learning_rate": 8.067160807159566e-06, + "loss": 0.5264, + "step": 1744 + }, + { + "epoch": 1.25, + "grad_norm": 7.760608376818505, + "learning_rate": 8.064877806678504e-06, + "loss": 0.4751, + "step": 1745 + }, + { + "epoch": 1.25, + "grad_norm": 9.86883350705853, + "learning_rate": 8.062593782164798e-06, + "loss": 0.5444, + "step": 1746 + }, + { + "epoch": 1.25, + "grad_norm": 12.526705575487213, + "learning_rate": 8.060308734381585e-06, + "loss": 0.52, + "step": 1747 + }, + { + "epoch": 1.25, + "grad_norm": 7.766194944671241, + "learning_rate": 8.05802266409234e-06, + "loss": 0.4355, + "step": 1748 + }, + { + "epoch": 1.25, + "grad_norm": 21.32642846265708, + "learning_rate": 8.055735572060883e-06, + "loss": 0.4399, + "step": 1749 + }, + { + "epoch": 1.25, + "grad_norm": 13.768520958766539, + "learning_rate": 8.053447459051374e-06, + "loss": 0.4658, + "step": 1750 + }, + { + "epoch": 1.25, + "grad_norm": 7.417625812427548, + "learning_rate": 8.051158325828315e-06, + "loss": 0.4585, + "step": 1751 + }, + { + "epoch": 1.25, + "grad_norm": 7.241606084022609, + "learning_rate": 8.048868173156546e-06, + "loss": 0.4858, + "step": 1752 + }, + { + "epoch": 1.25, + "grad_norm": 7.691430849772492, + "learning_rate": 8.046577001801248e-06, + "loss": 0.4819, + "step": 1753 + }, + { + "epoch": 1.25, + "grad_norm": 22.879141711500264, + "learning_rate": 8.044284812527949e-06, + "loss": 0.5547, + "step": 1754 + }, + { + "epoch": 1.25, + "grad_norm": 21.480942295074424, + "learning_rate": 8.041991606102507e-06, + "loss": 0.6348, + "step": 1755 + }, + { + "epoch": 1.25, + "grad_norm": 9.43827233604463, + "learning_rate": 8.039697383291127e-06, + "loss": 0.4663, + "step": 1756 + }, + { + "epoch": 1.25, + "grad_norm": 8.948768279992116, + "learning_rate": 8.037402144860353e-06, + "loss": 0.4976, + "step": 1757 + }, + { + "epoch": 1.25, + "grad_norm": 7.369467383801608, + "learning_rate": 8.035105891577064e-06, + "loss": 0.4478, + "step": 1758 + }, + { + "epoch": 1.26, + "grad_norm": 9.310600570034742, + "learning_rate": 8.032808624208485e-06, + "loss": 0.4702, + "step": 1759 + }, + { + "epoch": 1.26, + "grad_norm": 19.0022124954022, + "learning_rate": 8.030510343522172e-06, + "loss": 0.4883, + "step": 1760 + }, + { + "epoch": 1.26, + "grad_norm": 11.2828868419992, + "learning_rate": 8.02821105028602e-06, + "loss": 0.4268, + "step": 1761 + }, + { + "epoch": 1.26, + "grad_norm": 10.03094557514591, + "learning_rate": 8.025910745268276e-06, + "loss": 0.5122, + "step": 1762 + }, + { + "epoch": 1.26, + "grad_norm": 13.600050046217289, + "learning_rate": 8.023609429237504e-06, + "loss": 0.3862, + "step": 1763 + }, + { + "epoch": 1.26, + "grad_norm": 7.4892743731139815, + "learning_rate": 8.021307102962623e-06, + "loss": 0.4751, + "step": 1764 + }, + { + "epoch": 1.26, + "grad_norm": 17.988405551016548, + "learning_rate": 8.019003767212881e-06, + "loss": 0.4453, + "step": 1765 + }, + { + "epoch": 1.26, + "grad_norm": 25.122334044245886, + "learning_rate": 8.016699422757865e-06, + "loss": 0.5967, + "step": 1766 + }, + { + "epoch": 1.26, + "grad_norm": 19.211595252663454, + "learning_rate": 8.014394070367499e-06, + "loss": 0.5542, + "step": 1767 + }, + { + "epoch": 1.26, + "grad_norm": 8.735010608127107, + "learning_rate": 8.012087710812047e-06, + "loss": 0.4873, + "step": 1768 + }, + { + "epoch": 1.26, + "grad_norm": 11.903117170564997, + "learning_rate": 8.009780344862101e-06, + "loss": 0.5122, + "step": 1769 + }, + { + "epoch": 1.26, + "grad_norm": 12.873539053988267, + "learning_rate": 8.0074719732886e-06, + "loss": 0.4502, + "step": 1770 + }, + { + "epoch": 1.26, + "grad_norm": 16.28213540501501, + "learning_rate": 8.005162596862812e-06, + "loss": 0.4248, + "step": 1771 + }, + { + "epoch": 1.26, + "grad_norm": 22.486287226040226, + "learning_rate": 8.002852216356343e-06, + "loss": 0.5264, + "step": 1772 + }, + { + "epoch": 1.27, + "grad_norm": 12.996115895199278, + "learning_rate": 8.000540832541132e-06, + "loss": 0.4854, + "step": 1773 + }, + { + "epoch": 1.27, + "grad_norm": 11.038573416312534, + "learning_rate": 7.99822844618946e-06, + "loss": 0.4604, + "step": 1774 + }, + { + "epoch": 1.27, + "grad_norm": 10.159547013032094, + "learning_rate": 7.995915058073933e-06, + "loss": 0.5234, + "step": 1775 + }, + { + "epoch": 1.27, + "grad_norm": 20.659676859984405, + "learning_rate": 7.9936006689675e-06, + "loss": 0.4878, + "step": 1776 + }, + { + "epoch": 1.27, + "grad_norm": 11.634718738069445, + "learning_rate": 7.99128527964344e-06, + "loss": 0.5059, + "step": 1777 + }, + { + "epoch": 1.27, + "grad_norm": 18.586703189973363, + "learning_rate": 7.988968890875368e-06, + "loss": 0.479, + "step": 1778 + }, + { + "epoch": 1.27, + "grad_norm": 10.900105375633403, + "learning_rate": 7.986651503437233e-06, + "loss": 0.4824, + "step": 1779 + }, + { + "epoch": 1.27, + "grad_norm": 8.814400149182877, + "learning_rate": 7.984333118103318e-06, + "loss": 0.4585, + "step": 1780 + }, + { + "epoch": 1.27, + "grad_norm": 7.089228994356707, + "learning_rate": 7.982013735648235e-06, + "loss": 0.3984, + "step": 1781 + }, + { + "epoch": 1.27, + "grad_norm": 11.909003750271049, + "learning_rate": 7.979693356846937e-06, + "loss": 0.6475, + "step": 1782 + }, + { + "epoch": 1.27, + "grad_norm": 20.987506759661084, + "learning_rate": 7.977371982474705e-06, + "loss": 0.5854, + "step": 1783 + }, + { + "epoch": 1.27, + "grad_norm": 9.953186291279971, + "learning_rate": 7.975049613307151e-06, + "loss": 0.5356, + "step": 1784 + }, + { + "epoch": 1.27, + "grad_norm": 10.353919440549886, + "learning_rate": 7.972726250120225e-06, + "loss": 0.4849, + "step": 1785 + }, + { + "epoch": 1.27, + "grad_norm": 10.610513184195263, + "learning_rate": 7.970401893690202e-06, + "loss": 0.5479, + "step": 1786 + }, + { + "epoch": 1.28, + "grad_norm": 12.063317949232149, + "learning_rate": 7.968076544793696e-06, + "loss": 0.4551, + "step": 1787 + }, + { + "epoch": 1.28, + "grad_norm": 8.971498295810411, + "learning_rate": 7.965750204207647e-06, + "loss": 0.418, + "step": 1788 + }, + { + "epoch": 1.28, + "grad_norm": 8.315924369534915, + "learning_rate": 7.96342287270933e-06, + "loss": 0.4507, + "step": 1789 + }, + { + "epoch": 1.28, + "grad_norm": 7.953940518033675, + "learning_rate": 7.96109455107635e-06, + "loss": 0.3809, + "step": 1790 + }, + { + "epoch": 1.28, + "grad_norm": 22.05948822429463, + "learning_rate": 7.958765240086639e-06, + "loss": 0.5605, + "step": 1791 + }, + { + "epoch": 1.28, + "grad_norm": 12.700046157464467, + "learning_rate": 7.956434940518468e-06, + "loss": 0.4351, + "step": 1792 + }, + { + "epoch": 1.28, + "grad_norm": 10.454966939327795, + "learning_rate": 7.954103653150432e-06, + "loss": 0.5576, + "step": 1793 + }, + { + "epoch": 1.28, + "grad_norm": 9.619801280870984, + "learning_rate": 7.951771378761455e-06, + "loss": 0.418, + "step": 1794 + }, + { + "epoch": 1.28, + "grad_norm": 16.520532675801153, + "learning_rate": 7.949438118130797e-06, + "loss": 0.6309, + "step": 1795 + }, + { + "epoch": 1.28, + "grad_norm": 16.82962380647751, + "learning_rate": 7.94710387203804e-06, + "loss": 0.5132, + "step": 1796 + }, + { + "epoch": 1.28, + "grad_norm": 15.229460325358946, + "learning_rate": 7.944768641263101e-06, + "loss": 0.438, + "step": 1797 + }, + { + "epoch": 1.28, + "grad_norm": 18.18358063129885, + "learning_rate": 7.942432426586224e-06, + "loss": 0.479, + "step": 1798 + }, + { + "epoch": 1.28, + "grad_norm": 10.038226449592317, + "learning_rate": 7.94009522878798e-06, + "loss": 0.458, + "step": 1799 + }, + { + "epoch": 1.28, + "grad_norm": 12.23538850601863, + "learning_rate": 7.937757048649274e-06, + "loss": 0.5488, + "step": 1800 + }, + { + "epoch": 1.29, + "grad_norm": 19.148183869442587, + "learning_rate": 7.935417886951332e-06, + "loss": 0.5342, + "step": 1801 + }, + { + "epoch": 1.29, + "grad_norm": 9.068638115217762, + "learning_rate": 7.933077744475713e-06, + "loss": 0.4331, + "step": 1802 + }, + { + "epoch": 1.29, + "grad_norm": 15.673804499921573, + "learning_rate": 7.930736622004301e-06, + "loss": 0.4766, + "step": 1803 + }, + { + "epoch": 1.29, + "grad_norm": 16.854117836669452, + "learning_rate": 7.928394520319311e-06, + "loss": 0.5518, + "step": 1804 + }, + { + "epoch": 1.29, + "grad_norm": 9.307530919938724, + "learning_rate": 7.926051440203278e-06, + "loss": 0.4248, + "step": 1805 + }, + { + "epoch": 1.29, + "grad_norm": 9.980958338000624, + "learning_rate": 7.923707382439073e-06, + "loss": 0.436, + "step": 1806 + }, + { + "epoch": 1.29, + "grad_norm": 8.178450156206857, + "learning_rate": 7.921362347809888e-06, + "loss": 0.4692, + "step": 1807 + }, + { + "epoch": 1.29, + "grad_norm": 9.204168068608656, + "learning_rate": 7.919016337099242e-06, + "loss": 0.4541, + "step": 1808 + }, + { + "epoch": 1.29, + "grad_norm": 8.081522614205248, + "learning_rate": 7.916669351090981e-06, + "loss": 0.4487, + "step": 1809 + }, + { + "epoch": 1.29, + "grad_norm": 13.916600263727217, + "learning_rate": 7.914321390569278e-06, + "loss": 0.4282, + "step": 1810 + }, + { + "epoch": 1.29, + "grad_norm": 7.60473063045003, + "learning_rate": 7.911972456318629e-06, + "loss": 0.437, + "step": 1811 + }, + { + "epoch": 1.29, + "grad_norm": 9.650718167681877, + "learning_rate": 7.909622549123855e-06, + "loss": 0.4644, + "step": 1812 + }, + { + "epoch": 1.29, + "grad_norm": 11.491205118287587, + "learning_rate": 7.907271669770107e-06, + "loss": 0.4053, + "step": 1813 + }, + { + "epoch": 1.29, + "grad_norm": 9.07315325683419, + "learning_rate": 7.904919819042855e-06, + "loss": 0.4756, + "step": 1814 + }, + { + "epoch": 1.3, + "grad_norm": 13.101956566078766, + "learning_rate": 7.902566997727896e-06, + "loss": 0.4263, + "step": 1815 + }, + { + "epoch": 1.3, + "grad_norm": 12.528361439531825, + "learning_rate": 7.900213206611353e-06, + "loss": 0.4673, + "step": 1816 + }, + { + "epoch": 1.3, + "grad_norm": 11.82005333902504, + "learning_rate": 7.897858446479672e-06, + "loss": 0.4355, + "step": 1817 + }, + { + "epoch": 1.3, + "grad_norm": 24.150479129083248, + "learning_rate": 7.895502718119618e-06, + "loss": 0.5068, + "step": 1818 + }, + { + "epoch": 1.3, + "grad_norm": 8.074857103355468, + "learning_rate": 7.89314602231829e-06, + "loss": 0.4043, + "step": 1819 + }, + { + "epoch": 1.3, + "grad_norm": 10.774025110056453, + "learning_rate": 7.8907883598631e-06, + "loss": 0.4058, + "step": 1820 + }, + { + "epoch": 1.3, + "grad_norm": 16.169418137670785, + "learning_rate": 7.888429731541784e-06, + "loss": 0.5068, + "step": 1821 + }, + { + "epoch": 1.3, + "grad_norm": 12.638373472077571, + "learning_rate": 7.886070138142407e-06, + "loss": 0.4453, + "step": 1822 + }, + { + "epoch": 1.3, + "grad_norm": 14.724664699868582, + "learning_rate": 7.883709580453354e-06, + "loss": 0.5068, + "step": 1823 + }, + { + "epoch": 1.3, + "grad_norm": 14.422813863600243, + "learning_rate": 7.88134805926333e-06, + "loss": 0.4424, + "step": 1824 + }, + { + "epoch": 1.3, + "grad_norm": 11.591815386989563, + "learning_rate": 7.878985575361362e-06, + "loss": 0.4644, + "step": 1825 + }, + { + "epoch": 1.3, + "grad_norm": 12.579472910793811, + "learning_rate": 7.876622129536801e-06, + "loss": 0.4619, + "step": 1826 + }, + { + "epoch": 1.3, + "grad_norm": 12.59809088506851, + "learning_rate": 7.874257722579319e-06, + "loss": 0.5039, + "step": 1827 + }, + { + "epoch": 1.3, + "grad_norm": 18.124095864166325, + "learning_rate": 7.871892355278906e-06, + "loss": 0.4302, + "step": 1828 + }, + { + "epoch": 1.31, + "grad_norm": 8.067465357679238, + "learning_rate": 7.869526028425878e-06, + "loss": 0.5254, + "step": 1829 + }, + { + "epoch": 1.31, + "grad_norm": 10.94843976061978, + "learning_rate": 7.867158742810866e-06, + "loss": 0.4839, + "step": 1830 + }, + { + "epoch": 1.31, + "grad_norm": 8.151059746712043, + "learning_rate": 7.864790499224825e-06, + "loss": 0.437, + "step": 1831 + }, + { + "epoch": 1.31, + "grad_norm": 11.408199801575625, + "learning_rate": 7.86242129845903e-06, + "loss": 0.458, + "step": 1832 + }, + { + "epoch": 1.31, + "grad_norm": 17.645230899000442, + "learning_rate": 7.860051141305074e-06, + "loss": 0.5107, + "step": 1833 + }, + { + "epoch": 1.31, + "grad_norm": 8.561675138435755, + "learning_rate": 7.857680028554873e-06, + "loss": 0.4189, + "step": 1834 + }, + { + "epoch": 1.31, + "grad_norm": 8.925281089125331, + "learning_rate": 7.855307961000656e-06, + "loss": 0.4893, + "step": 1835 + }, + { + "epoch": 1.31, + "grad_norm": 11.26652117905419, + "learning_rate": 7.852934939434977e-06, + "loss": 0.4399, + "step": 1836 + }, + { + "epoch": 1.31, + "grad_norm": 12.505458999460473, + "learning_rate": 7.850560964650707e-06, + "loss": 0.6421, + "step": 1837 + }, + { + "epoch": 1.31, + "grad_norm": 11.372565753801249, + "learning_rate": 7.848186037441035e-06, + "loss": 0.6289, + "step": 1838 + }, + { + "epoch": 1.31, + "grad_norm": 6.881114480690239, + "learning_rate": 7.845810158599467e-06, + "loss": 0.4624, + "step": 1839 + }, + { + "epoch": 1.31, + "grad_norm": 13.626691743280656, + "learning_rate": 7.84343332891983e-06, + "loss": 0.4785, + "step": 1840 + }, + { + "epoch": 1.31, + "grad_norm": 7.687233987923879, + "learning_rate": 7.841055549196267e-06, + "loss": 0.521, + "step": 1841 + }, + { + "epoch": 1.31, + "grad_norm": 9.195815391563308, + "learning_rate": 7.838676820223234e-06, + "loss": 0.3867, + "step": 1842 + }, + { + "epoch": 1.32, + "grad_norm": 11.299770030769283, + "learning_rate": 7.836297142795515e-06, + "loss": 0.4546, + "step": 1843 + }, + { + "epoch": 1.32, + "grad_norm": 6.8699709812237515, + "learning_rate": 7.833916517708203e-06, + "loss": 0.3662, + "step": 1844 + }, + { + "epoch": 1.32, + "grad_norm": 17.559884344433186, + "learning_rate": 7.831534945756703e-06, + "loss": 0.54, + "step": 1845 + }, + { + "epoch": 1.32, + "grad_norm": 9.525388260025515, + "learning_rate": 7.82915242773675e-06, + "loss": 0.4473, + "step": 1846 + }, + { + "epoch": 1.32, + "grad_norm": 8.9216506801838, + "learning_rate": 7.826768964444384e-06, + "loss": 0.4189, + "step": 1847 + }, + { + "epoch": 1.32, + "grad_norm": 22.147593629165257, + "learning_rate": 7.824384556675966e-06, + "loss": 0.5132, + "step": 1848 + }, + { + "epoch": 1.32, + "grad_norm": 12.171992879631954, + "learning_rate": 7.821999205228168e-06, + "loss": 0.4893, + "step": 1849 + }, + { + "epoch": 1.32, + "grad_norm": 21.79619518512073, + "learning_rate": 7.819612910897985e-06, + "loss": 0.541, + "step": 1850 + }, + { + "epoch": 1.32, + "grad_norm": 17.026117401917226, + "learning_rate": 7.817225674482717e-06, + "loss": 0.5181, + "step": 1851 + }, + { + "epoch": 1.32, + "grad_norm": 7.809802504536831, + "learning_rate": 7.814837496779988e-06, + "loss": 0.3872, + "step": 1852 + }, + { + "epoch": 1.32, + "grad_norm": 9.661493775692145, + "learning_rate": 7.812448378587731e-06, + "loss": 0.4951, + "step": 1853 + }, + { + "epoch": 1.32, + "grad_norm": 14.532780424040105, + "learning_rate": 7.810058320704194e-06, + "loss": 0.5244, + "step": 1854 + }, + { + "epoch": 1.32, + "grad_norm": 9.622050906657064, + "learning_rate": 7.807667323927941e-06, + "loss": 0.3711, + "step": 1855 + }, + { + "epoch": 1.32, + "grad_norm": 12.137416584677046, + "learning_rate": 7.80527538905785e-06, + "loss": 0.4214, + "step": 1856 + }, + { + "epoch": 1.33, + "grad_norm": 9.531652456122275, + "learning_rate": 7.802882516893106e-06, + "loss": 0.3865, + "step": 1857 + }, + { + "epoch": 1.33, + "grad_norm": 14.761181122860886, + "learning_rate": 7.800488708233219e-06, + "loss": 0.458, + "step": 1858 + }, + { + "epoch": 1.33, + "grad_norm": 10.986448218756069, + "learning_rate": 7.798093963877998e-06, + "loss": 0.3965, + "step": 1859 + }, + { + "epoch": 1.33, + "grad_norm": 14.525624462916241, + "learning_rate": 7.795698284627575e-06, + "loss": 0.5312, + "step": 1860 + }, + { + "epoch": 1.33, + "grad_norm": 9.56296095061409, + "learning_rate": 7.793301671282391e-06, + "loss": 0.4253, + "step": 1861 + }, + { + "epoch": 1.33, + "grad_norm": 9.813705657660236, + "learning_rate": 7.7909041246432e-06, + "loss": 0.4346, + "step": 1862 + }, + { + "epoch": 1.33, + "grad_norm": 8.044009444296448, + "learning_rate": 7.788505645511065e-06, + "loss": 0.4697, + "step": 1863 + }, + { + "epoch": 1.33, + "grad_norm": 11.756948527676359, + "learning_rate": 7.786106234687362e-06, + "loss": 0.4351, + "step": 1864 + }, + { + "epoch": 1.33, + "grad_norm": 15.838558191606422, + "learning_rate": 7.783705892973782e-06, + "loss": 0.5356, + "step": 1865 + }, + { + "epoch": 1.33, + "grad_norm": 10.97549632585959, + "learning_rate": 7.78130462117232e-06, + "loss": 0.4238, + "step": 1866 + }, + { + "epoch": 1.33, + "grad_norm": 14.230138714195853, + "learning_rate": 7.778902420085289e-06, + "loss": 0.4741, + "step": 1867 + }, + { + "epoch": 1.33, + "grad_norm": 7.33800229572185, + "learning_rate": 7.776499290515304e-06, + "loss": 0.3765, + "step": 1868 + }, + { + "epoch": 1.33, + "grad_norm": 12.435836369297808, + "learning_rate": 7.7740952332653e-06, + "loss": 0.4629, + "step": 1869 + }, + { + "epoch": 1.33, + "grad_norm": 9.644876077277099, + "learning_rate": 7.771690249138517e-06, + "loss": 0.4565, + "step": 1870 + }, + { + "epoch": 1.34, + "grad_norm": 9.755377787104848, + "learning_rate": 7.769284338938502e-06, + "loss": 0.4541, + "step": 1871 + }, + { + "epoch": 1.34, + "grad_norm": 19.27972086793472, + "learning_rate": 7.766877503469117e-06, + "loss": 0.5132, + "step": 1872 + }, + { + "epoch": 1.34, + "grad_norm": 13.760972849630273, + "learning_rate": 7.764469743534529e-06, + "loss": 0.4404, + "step": 1873 + }, + { + "epoch": 1.34, + "grad_norm": 10.940065164124238, + "learning_rate": 7.762061059939214e-06, + "loss": 0.4536, + "step": 1874 + }, + { + "epoch": 1.34, + "grad_norm": 7.751090211455104, + "learning_rate": 7.759651453487963e-06, + "loss": 0.3506, + "step": 1875 + }, + { + "epoch": 1.34, + "grad_norm": 23.25806719313458, + "learning_rate": 7.757240924985866e-06, + "loss": 0.6182, + "step": 1876 + }, + { + "epoch": 1.34, + "grad_norm": 9.896852124606287, + "learning_rate": 7.754829475238323e-06, + "loss": 0.4888, + "step": 1877 + }, + { + "epoch": 1.34, + "grad_norm": 13.942142350442522, + "learning_rate": 7.752417105051051e-06, + "loss": 0.5947, + "step": 1878 + }, + { + "epoch": 1.34, + "grad_norm": 14.63242801319795, + "learning_rate": 7.750003815230062e-06, + "loss": 0.4194, + "step": 1879 + }, + { + "epoch": 1.34, + "grad_norm": 11.915779656542515, + "learning_rate": 7.747589606581686e-06, + "loss": 0.3735, + "step": 1880 + }, + { + "epoch": 1.34, + "grad_norm": 7.6414314110252075, + "learning_rate": 7.745174479912551e-06, + "loss": 0.4844, + "step": 1881 + }, + { + "epoch": 1.34, + "grad_norm": 9.279768255216817, + "learning_rate": 7.742758436029596e-06, + "loss": 0.4561, + "step": 1882 + }, + { + "epoch": 1.34, + "grad_norm": 8.118475626162798, + "learning_rate": 7.740341475740068e-06, + "loss": 0.4819, + "step": 1883 + }, + { + "epoch": 1.34, + "grad_norm": 11.986938122701325, + "learning_rate": 7.737923599851519e-06, + "loss": 0.4072, + "step": 1884 + }, + { + "epoch": 1.35, + "grad_norm": 7.3213451137330825, + "learning_rate": 7.735504809171801e-06, + "loss": 0.4595, + "step": 1885 + }, + { + "epoch": 1.35, + "grad_norm": 10.386651252542793, + "learning_rate": 7.733085104509084e-06, + "loss": 0.4014, + "step": 1886 + }, + { + "epoch": 1.35, + "grad_norm": 8.548276619740427, + "learning_rate": 7.730664486671831e-06, + "loss": 0.4585, + "step": 1887 + }, + { + "epoch": 1.35, + "grad_norm": 10.158689999749116, + "learning_rate": 7.72824295646882e-06, + "loss": 0.3936, + "step": 1888 + }, + { + "epoch": 1.35, + "grad_norm": 12.499012467824802, + "learning_rate": 7.725820514709124e-06, + "loss": 0.4233, + "step": 1889 + }, + { + "epoch": 1.35, + "grad_norm": 12.070340859207697, + "learning_rate": 7.723397162202128e-06, + "loss": 0.5679, + "step": 1890 + }, + { + "epoch": 1.35, + "grad_norm": 11.183380030500457, + "learning_rate": 7.720972899757522e-06, + "loss": 0.4019, + "step": 1891 + }, + { + "epoch": 1.35, + "grad_norm": 19.63336525431284, + "learning_rate": 7.718547728185293e-06, + "loss": 0.5845, + "step": 1892 + }, + { + "epoch": 1.35, + "grad_norm": 8.930649239400196, + "learning_rate": 7.716121648295738e-06, + "loss": 0.4297, + "step": 1893 + }, + { + "epoch": 1.35, + "grad_norm": 7.936977513469789, + "learning_rate": 7.713694660899455e-06, + "loss": 0.3965, + "step": 1894 + }, + { + "epoch": 1.35, + "grad_norm": 15.880197223026865, + "learning_rate": 7.711266766807345e-06, + "loss": 0.478, + "step": 1895 + }, + { + "epoch": 1.35, + "grad_norm": 25.38985775819577, + "learning_rate": 7.708837966830615e-06, + "loss": 0.5176, + "step": 1896 + }, + { + "epoch": 1.35, + "grad_norm": 11.073264625514588, + "learning_rate": 7.706408261780769e-06, + "loss": 0.4155, + "step": 1897 + }, + { + "epoch": 1.35, + "grad_norm": 13.734620428534315, + "learning_rate": 7.703977652469618e-06, + "loss": 0.4585, + "step": 1898 + }, + { + "epoch": 1.36, + "grad_norm": 10.71268865147328, + "learning_rate": 7.701546139709272e-06, + "loss": 0.4351, + "step": 1899 + }, + { + "epoch": 1.36, + "grad_norm": 8.690569659507187, + "learning_rate": 7.69911372431215e-06, + "loss": 0.4819, + "step": 1900 + }, + { + "epoch": 1.36, + "grad_norm": 11.038069885853988, + "learning_rate": 7.696680407090962e-06, + "loss": 0.4741, + "step": 1901 + }, + { + "epoch": 1.36, + "grad_norm": 19.166802662393746, + "learning_rate": 7.694246188858726e-06, + "loss": 0.4458, + "step": 1902 + }, + { + "epoch": 1.36, + "grad_norm": 12.652531412656241, + "learning_rate": 7.691811070428758e-06, + "loss": 0.4409, + "step": 1903 + }, + { + "epoch": 1.36, + "grad_norm": 13.775786518184024, + "learning_rate": 7.689375052614681e-06, + "loss": 0.5034, + "step": 1904 + }, + { + "epoch": 1.36, + "grad_norm": 10.730043609598852, + "learning_rate": 7.686938136230408e-06, + "loss": 0.4878, + "step": 1905 + }, + { + "epoch": 1.36, + "grad_norm": 11.070741977627472, + "learning_rate": 7.684500322090162e-06, + "loss": 0.4946, + "step": 1906 + }, + { + "epoch": 1.36, + "grad_norm": 21.66702912722839, + "learning_rate": 7.68206161100846e-06, + "loss": 0.4512, + "step": 1907 + }, + { + "epoch": 1.36, + "grad_norm": 16.18598427399989, + "learning_rate": 7.679622003800122e-06, + "loss": 0.5322, + "step": 1908 + }, + { + "epoch": 1.36, + "grad_norm": 18.592185583152887, + "learning_rate": 7.677181501280266e-06, + "loss": 0.384, + "step": 1909 + }, + { + "epoch": 1.36, + "grad_norm": 10.483647129362556, + "learning_rate": 7.674740104264308e-06, + "loss": 0.4438, + "step": 1910 + }, + { + "epoch": 1.36, + "grad_norm": 19.56330359346813, + "learning_rate": 7.672297813567968e-06, + "loss": 0.6631, + "step": 1911 + }, + { + "epoch": 1.36, + "grad_norm": 11.750687177326649, + "learning_rate": 7.669854630007257e-06, + "loss": 0.4619, + "step": 1912 + }, + { + "epoch": 1.37, + "grad_norm": 10.801460674280925, + "learning_rate": 7.667410554398486e-06, + "loss": 0.5146, + "step": 1913 + }, + { + "epoch": 1.37, + "grad_norm": 10.654003604266663, + "learning_rate": 7.664965587558271e-06, + "loss": 0.5215, + "step": 1914 + }, + { + "epoch": 1.37, + "grad_norm": 7.252336343356576, + "learning_rate": 7.662519730303517e-06, + "loss": 0.4146, + "step": 1915 + }, + { + "epoch": 1.37, + "grad_norm": 17.96405723828609, + "learning_rate": 7.660072983451433e-06, + "loss": 0.481, + "step": 1916 + }, + { + "epoch": 1.37, + "grad_norm": 10.053400748038461, + "learning_rate": 7.657625347819522e-06, + "loss": 0.4648, + "step": 1917 + }, + { + "epoch": 1.37, + "grad_norm": 10.665072797465664, + "learning_rate": 7.655176824225582e-06, + "loss": 0.4395, + "step": 1918 + }, + { + "epoch": 1.37, + "grad_norm": 14.43877602912696, + "learning_rate": 7.652727413487716e-06, + "loss": 0.5225, + "step": 1919 + }, + { + "epoch": 1.37, + "grad_norm": 11.53955384603415, + "learning_rate": 7.650277116424313e-06, + "loss": 0.4473, + "step": 1920 + }, + { + "epoch": 1.37, + "grad_norm": 17.42532980284874, + "learning_rate": 7.647825933854063e-06, + "loss": 0.5376, + "step": 1921 + }, + { + "epoch": 1.37, + "grad_norm": 10.36636988005007, + "learning_rate": 7.645373866595953e-06, + "loss": 0.4521, + "step": 1922 + }, + { + "epoch": 1.37, + "grad_norm": 9.525600543992109, + "learning_rate": 7.642920915469265e-06, + "loss": 0.4688, + "step": 1923 + }, + { + "epoch": 1.37, + "grad_norm": 17.106931536030515, + "learning_rate": 7.640467081293573e-06, + "loss": 0.5918, + "step": 1924 + }, + { + "epoch": 1.37, + "grad_norm": 11.348679176976812, + "learning_rate": 7.638012364888751e-06, + "loss": 0.4907, + "step": 1925 + }, + { + "epoch": 1.37, + "grad_norm": 11.55252362508795, + "learning_rate": 7.635556767074965e-06, + "loss": 0.4731, + "step": 1926 + }, + { + "epoch": 1.38, + "grad_norm": 12.419504634208726, + "learning_rate": 7.633100288672674e-06, + "loss": 0.481, + "step": 1927 + }, + { + "epoch": 1.38, + "grad_norm": 12.121962467884394, + "learning_rate": 7.630642930502634e-06, + "loss": 0.418, + "step": 1928 + }, + { + "epoch": 1.38, + "grad_norm": 10.289195654821805, + "learning_rate": 7.628184693385896e-06, + "loss": 0.4648, + "step": 1929 + }, + { + "epoch": 1.38, + "grad_norm": 7.9227069619546375, + "learning_rate": 7.625725578143801e-06, + "loss": 0.4712, + "step": 1930 + }, + { + "epoch": 1.38, + "grad_norm": 9.229125735331845, + "learning_rate": 7.6232655855979844e-06, + "loss": 0.5029, + "step": 1931 + }, + { + "epoch": 1.38, + "grad_norm": 13.049117828200913, + "learning_rate": 7.620804716570376e-06, + "loss": 0.4653, + "step": 1932 + }, + { + "epoch": 1.38, + "grad_norm": 15.576958024182636, + "learning_rate": 7.618342971883199e-06, + "loss": 0.5605, + "step": 1933 + }, + { + "epoch": 1.38, + "grad_norm": 12.779629505768929, + "learning_rate": 7.615880352358967e-06, + "loss": 0.4014, + "step": 1934 + }, + { + "epoch": 1.38, + "grad_norm": 18.50645646388632, + "learning_rate": 7.613416858820486e-06, + "loss": 0.4214, + "step": 1935 + }, + { + "epoch": 1.38, + "grad_norm": 25.52806005097817, + "learning_rate": 7.6109524920908575e-06, + "loss": 0.5288, + "step": 1936 + }, + { + "epoch": 1.38, + "grad_norm": 13.933316977057585, + "learning_rate": 7.608487252993471e-06, + "loss": 0.5151, + "step": 1937 + }, + { + "epoch": 1.38, + "grad_norm": 13.6592436046183, + "learning_rate": 7.6060211423520095e-06, + "loss": 0.4072, + "step": 1938 + }, + { + "epoch": 1.38, + "grad_norm": 21.814161825521396, + "learning_rate": 7.6035541609904425e-06, + "loss": 0.606, + "step": 1939 + }, + { + "epoch": 1.38, + "grad_norm": 15.66882526072226, + "learning_rate": 7.60108630973304e-06, + "loss": 0.458, + "step": 1940 + }, + { + "epoch": 1.39, + "grad_norm": 17.85672505413872, + "learning_rate": 7.598617589404354e-06, + "loss": 0.4565, + "step": 1941 + }, + { + "epoch": 1.39, + "grad_norm": 16.858479644628172, + "learning_rate": 7.596148000829229e-06, + "loss": 0.6187, + "step": 1942 + }, + { + "epoch": 1.39, + "grad_norm": 13.16162358033872, + "learning_rate": 7.593677544832802e-06, + "loss": 0.5293, + "step": 1943 + }, + { + "epoch": 1.39, + "grad_norm": 9.524117704758424, + "learning_rate": 7.5912062222404965e-06, + "loss": 0.3533, + "step": 1944 + }, + { + "epoch": 1.39, + "grad_norm": 20.818889018640792, + "learning_rate": 7.588734033878031e-06, + "loss": 0.5693, + "step": 1945 + }, + { + "epoch": 1.39, + "grad_norm": 16.522766326382442, + "learning_rate": 7.586260980571407e-06, + "loss": 0.5547, + "step": 1946 + }, + { + "epoch": 1.39, + "grad_norm": 11.564225336554951, + "learning_rate": 7.5837870631469165e-06, + "loss": 0.6562, + "step": 1947 + }, + { + "epoch": 1.39, + "grad_norm": 5.668952257315175, + "learning_rate": 7.581312282431143e-06, + "loss": 0.3936, + "step": 1948 + }, + { + "epoch": 1.39, + "grad_norm": 11.336521625375687, + "learning_rate": 7.578836639250958e-06, + "loss": 0.5151, + "step": 1949 + }, + { + "epoch": 1.39, + "grad_norm": 8.918797922477602, + "learning_rate": 7.576360134433517e-06, + "loss": 0.4668, + "step": 1950 + }, + { + "epoch": 1.39, + "grad_norm": 7.6676020443715975, + "learning_rate": 7.5738827688062676e-06, + "loss": 0.4961, + "step": 1951 + }, + { + "epoch": 1.39, + "grad_norm": 7.713872938147478, + "learning_rate": 7.571404543196943e-06, + "loss": 0.4824, + "step": 1952 + }, + { + "epoch": 1.39, + "grad_norm": 6.834704519149373, + "learning_rate": 7.568925458433567e-06, + "loss": 0.4819, + "step": 1953 + }, + { + "epoch": 1.39, + "grad_norm": 12.91835327063667, + "learning_rate": 7.566445515344445e-06, + "loss": 0.4233, + "step": 1954 + }, + { + "epoch": 1.4, + "grad_norm": 7.599974116531055, + "learning_rate": 7.563964714758172e-06, + "loss": 0.4458, + "step": 1955 + }, + { + "epoch": 1.4, + "grad_norm": 10.201329272152451, + "learning_rate": 7.561483057503632e-06, + "loss": 0.5386, + "step": 1956 + }, + { + "epoch": 1.4, + "grad_norm": 10.350442516482657, + "learning_rate": 7.559000544409991e-06, + "loss": 0.4849, + "step": 1957 + }, + { + "epoch": 1.4, + "grad_norm": 12.802291496242244, + "learning_rate": 7.556517176306704e-06, + "loss": 0.5127, + "step": 1958 + }, + { + "epoch": 1.4, + "grad_norm": 14.032857918894152, + "learning_rate": 7.554032954023508e-06, + "loss": 0.479, + "step": 1959 + }, + { + "epoch": 1.4, + "grad_norm": 11.170707738509645, + "learning_rate": 7.55154787839043e-06, + "loss": 0.5752, + "step": 1960 + }, + { + "epoch": 1.4, + "grad_norm": 14.33356640527272, + "learning_rate": 7.5490619502377805e-06, + "loss": 0.562, + "step": 1961 + }, + { + "epoch": 1.4, + "grad_norm": 10.3400026057796, + "learning_rate": 7.546575170396153e-06, + "loss": 0.4961, + "step": 1962 + }, + { + "epoch": 1.4, + "grad_norm": 11.41020505136803, + "learning_rate": 7.544087539696427e-06, + "loss": 0.5356, + "step": 1963 + }, + { + "epoch": 1.4, + "grad_norm": 15.384398968489197, + "learning_rate": 7.541599058969766e-06, + "loss": 0.5757, + "step": 1964 + }, + { + "epoch": 1.4, + "grad_norm": 26.85010786013174, + "learning_rate": 7.539109729047619e-06, + "loss": 0.5869, + "step": 1965 + }, + { + "epoch": 1.4, + "grad_norm": 13.678164293750017, + "learning_rate": 7.5366195507617155e-06, + "loss": 0.5024, + "step": 1966 + }, + { + "epoch": 1.4, + "grad_norm": 11.760310761103758, + "learning_rate": 7.534128524944071e-06, + "loss": 0.4077, + "step": 1967 + }, + { + "epoch": 1.4, + "grad_norm": 15.324608405481133, + "learning_rate": 7.531636652426985e-06, + "loss": 0.4927, + "step": 1968 + }, + { + "epoch": 1.41, + "grad_norm": 12.427863874423563, + "learning_rate": 7.529143934043036e-06, + "loss": 0.562, + "step": 1969 + }, + { + "epoch": 1.41, + "grad_norm": 9.721667941829319, + "learning_rate": 7.526650370625088e-06, + "loss": 0.6045, + "step": 1970 + }, + { + "epoch": 1.41, + "grad_norm": 11.373487140290482, + "learning_rate": 7.5241559630062896e-06, + "loss": 0.5576, + "step": 1971 + }, + { + "epoch": 1.41, + "grad_norm": 10.699156015321359, + "learning_rate": 7.5216607120200655e-06, + "loss": 0.4937, + "step": 1972 + }, + { + "epoch": 1.41, + "grad_norm": 9.49661605079921, + "learning_rate": 7.519164618500127e-06, + "loss": 0.4209, + "step": 1973 + }, + { + "epoch": 1.41, + "grad_norm": 14.772198775491923, + "learning_rate": 7.5166676832804655e-06, + "loss": 0.4731, + "step": 1974 + }, + { + "epoch": 1.41, + "grad_norm": 8.746747420450404, + "learning_rate": 7.514169907195352e-06, + "loss": 0.457, + "step": 1975 + }, + { + "epoch": 1.41, + "grad_norm": 11.152389759194886, + "learning_rate": 7.511671291079342e-06, + "loss": 0.3955, + "step": 1976 + }, + { + "epoch": 1.41, + "grad_norm": 12.057173209748903, + "learning_rate": 7.509171835767268e-06, + "loss": 0.4814, + "step": 1977 + }, + { + "epoch": 1.41, + "grad_norm": 15.473228863127407, + "learning_rate": 7.506671542094246e-06, + "loss": 0.4976, + "step": 1978 + }, + { + "epoch": 1.41, + "grad_norm": 10.727530037186542, + "learning_rate": 7.504170410895668e-06, + "loss": 0.4204, + "step": 1979 + }, + { + "epoch": 1.41, + "grad_norm": 7.985145065845938, + "learning_rate": 7.501668443007212e-06, + "loss": 0.4468, + "step": 1980 + }, + { + "epoch": 1.41, + "grad_norm": 14.866585730814796, + "learning_rate": 7.499165639264828e-06, + "loss": 0.5054, + "step": 1981 + }, + { + "epoch": 1.41, + "grad_norm": 13.54865900787404, + "learning_rate": 7.496662000504752e-06, + "loss": 0.5615, + "step": 1982 + }, + { + "epoch": 1.42, + "grad_norm": 10.596067173111546, + "learning_rate": 7.4941575275634945e-06, + "loss": 0.4961, + "step": 1983 + }, + { + "epoch": 1.42, + "grad_norm": 18.522213226674722, + "learning_rate": 7.49165222127785e-06, + "loss": 0.4829, + "step": 1984 + }, + { + "epoch": 1.42, + "grad_norm": 9.827635155968103, + "learning_rate": 7.489146082484882e-06, + "loss": 0.3564, + "step": 1985 + }, + { + "epoch": 1.42, + "grad_norm": 9.292648628242718, + "learning_rate": 7.486639112021944e-06, + "loss": 0.4268, + "step": 1986 + }, + { + "epoch": 1.42, + "grad_norm": 10.52207584728679, + "learning_rate": 7.484131310726658e-06, + "loss": 0.3796, + "step": 1987 + }, + { + "epoch": 1.42, + "grad_norm": 14.540889479395993, + "learning_rate": 7.481622679436929e-06, + "loss": 0.4463, + "step": 1988 + }, + { + "epoch": 1.42, + "grad_norm": 8.775521996498332, + "learning_rate": 7.479113218990934e-06, + "loss": 0.3867, + "step": 1989 + }, + { + "epoch": 1.42, + "grad_norm": 11.41497161064518, + "learning_rate": 7.4766029302271335e-06, + "loss": 0.5225, + "step": 1990 + }, + { + "epoch": 1.42, + "grad_norm": 7.663185183957201, + "learning_rate": 7.474091813984261e-06, + "loss": 0.3784, + "step": 1991 + }, + { + "epoch": 1.42, + "grad_norm": 10.438508094185057, + "learning_rate": 7.471579871101326e-06, + "loss": 0.4409, + "step": 1992 + }, + { + "epoch": 1.42, + "grad_norm": 12.101689467991465, + "learning_rate": 7.4690671024176165e-06, + "loss": 0.5542, + "step": 1993 + }, + { + "epoch": 1.42, + "grad_norm": 13.147337868681644, + "learning_rate": 7.466553508772695e-06, + "loss": 0.4072, + "step": 1994 + }, + { + "epoch": 1.42, + "grad_norm": 13.211891837722657, + "learning_rate": 7.4640390910064e-06, + "loss": 0.4326, + "step": 1995 + }, + { + "epoch": 1.42, + "grad_norm": 14.383061627960748, + "learning_rate": 7.461523849958845e-06, + "loss": 0.5654, + "step": 1996 + }, + { + "epoch": 1.43, + "grad_norm": 11.337379991199462, + "learning_rate": 7.459007786470418e-06, + "loss": 0.3994, + "step": 1997 + }, + { + "epoch": 1.43, + "grad_norm": 10.488871378983164, + "learning_rate": 7.4564909013817845e-06, + "loss": 0.501, + "step": 1998 + }, + { + "epoch": 1.43, + "grad_norm": 12.83370126958352, + "learning_rate": 7.45397319553388e-06, + "loss": 0.354, + "step": 1999 + }, + { + "epoch": 1.43, + "grad_norm": 12.89565539397935, + "learning_rate": 7.451454669767919e-06, + "loss": 0.4561, + "step": 2000 + }, + { + "epoch": 1.43, + "eval_avg_AUC": 0.7710454060906093, + "eval_avg_Accuracy": 0.6791694297082228, + "eval_avg_Accuracy-right": 0.9080474761966871, + "eval_avg_Accuracy-wrong": 0.2800773254491699, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.626217598019666, + "eval_last_AUC": 0.7805472536484926, + "eval_last_Accuracy": 0.705238726790451, + "eval_last_Accuracy-right": 0.851506456241033, + "eval_last_Accuracy-wrong": 0.4501933136229247, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6337350556153214, + "eval_max_AUC": 0.7253894400592504, + "eval_max_Accuracy": 0.6378481432360743, + "eval_max_Accuracy-right": 0.9711099517412286, + "eval_max_Accuracy-wrong": 0.056743234023197635, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6102256988583293, + "eval_min_AUC": 0.7744442531826913, + "eval_min_Accuracy": 0.707145225464191, + "eval_min_Accuracy-right": 0.7947697926177123, + "eval_min_Accuracy-wrong": 0.5543552422105982, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6233897492201612, + "eval_prod_AUC": 0.7756218453367024, + "eval_prod_Accuracy": 0.6703000663129973, + "eval_prod_Accuracy-right": 0.5567366636233207, + "eval_prod_Accuracy-wrong": 0.8683193086195133, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6229021173701285, + "eval_runtime": 246.9142, + "eval_samples_per_second": 97.718, + "eval_steps_per_second": 3.054, + "eval_sum_AUC": 0.6216167742931785, + "eval_sum_Accuracy": 0.6359001989389921, + "eval_sum_Accuracy-right": 0.9986304943263337, + "eval_sum_Accuracy-wrong": 0.0034114168751421424, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6193609469118248, + "step": 2000 + }, + { + "epoch": 1.43, + "grad_norm": 14.915881897626125, + "learning_rate": 7.448935324925386e-06, + "loss": 0.3965, + "step": 2001 + }, + { + "epoch": 1.43, + "grad_norm": 12.292176761992444, + "learning_rate": 7.446415161848043e-06, + "loss": 0.4697, + "step": 2002 + }, + { + "epoch": 1.43, + "grad_norm": 12.41157340253059, + "learning_rate": 7.443894181377921e-06, + "loss": 0.4683, + "step": 2003 + }, + { + "epoch": 1.43, + "grad_norm": 11.703353976942797, + "learning_rate": 7.441372384357328e-06, + "loss": 0.4531, + "step": 2004 + }, + { + "epoch": 1.43, + "grad_norm": 16.728696147279525, + "learning_rate": 7.438849771628844e-06, + "loss": 0.4814, + "step": 2005 + }, + { + "epoch": 1.43, + "grad_norm": 9.964762875434154, + "learning_rate": 7.43632634403532e-06, + "loss": 0.4409, + "step": 2006 + }, + { + "epoch": 1.43, + "grad_norm": 19.767032870416173, + "learning_rate": 7.433802102419878e-06, + "loss": 0.4531, + "step": 2007 + }, + { + "epoch": 1.43, + "grad_norm": 8.973473512937122, + "learning_rate": 7.431277047625918e-06, + "loss": 0.4146, + "step": 2008 + }, + { + "epoch": 1.43, + "grad_norm": 14.382184093999804, + "learning_rate": 7.428751180497104e-06, + "loss": 0.5166, + "step": 2009 + }, + { + "epoch": 1.43, + "grad_norm": 17.177950650233484, + "learning_rate": 7.426224501877376e-06, + "loss": 0.5278, + "step": 2010 + }, + { + "epoch": 1.44, + "grad_norm": 12.821415821919816, + "learning_rate": 7.423697012610947e-06, + "loss": 0.4058, + "step": 2011 + }, + { + "epoch": 1.44, + "grad_norm": 10.12631349817775, + "learning_rate": 7.421168713542294e-06, + "loss": 0.4873, + "step": 2012 + }, + { + "epoch": 1.44, + "grad_norm": 12.189855739113973, + "learning_rate": 7.418639605516172e-06, + "loss": 0.5205, + "step": 2013 + }, + { + "epoch": 1.44, + "grad_norm": 10.654050014113183, + "learning_rate": 7.416109689377603e-06, + "loss": 0.5635, + "step": 2014 + }, + { + "epoch": 1.44, + "grad_norm": 25.246721169792306, + "learning_rate": 7.413578965971876e-06, + "loss": 0.4805, + "step": 2015 + }, + { + "epoch": 1.44, + "grad_norm": 10.730269042581263, + "learning_rate": 7.411047436144556e-06, + "loss": 0.4165, + "step": 2016 + }, + { + "epoch": 1.44, + "grad_norm": 12.059699400483384, + "learning_rate": 7.408515100741471e-06, + "loss": 0.4656, + "step": 2017 + }, + { + "epoch": 1.44, + "grad_norm": 12.604554128594822, + "learning_rate": 7.405981960608725e-06, + "loss": 0.4453, + "step": 2018 + }, + { + "epoch": 1.44, + "grad_norm": 9.398182516011172, + "learning_rate": 7.403448016592685e-06, + "loss": 0.4951, + "step": 2019 + }, + { + "epoch": 1.44, + "grad_norm": 13.429871973957303, + "learning_rate": 7.400913269539988e-06, + "loss": 0.5908, + "step": 2020 + }, + { + "epoch": 1.44, + "grad_norm": 11.90665542864224, + "learning_rate": 7.398377720297541e-06, + "loss": 0.4917, + "step": 2021 + }, + { + "epoch": 1.44, + "grad_norm": 18.435262236737888, + "learning_rate": 7.39584136971252e-06, + "loss": 0.4473, + "step": 2022 + }, + { + "epoch": 1.44, + "grad_norm": 25.241789540294814, + "learning_rate": 7.393304218632364e-06, + "loss": 0.4507, + "step": 2023 + }, + { + "epoch": 1.44, + "grad_norm": 12.58272231581781, + "learning_rate": 7.390766267904783e-06, + "loss": 0.3755, + "step": 2024 + }, + { + "epoch": 1.45, + "grad_norm": 16.95465858537594, + "learning_rate": 7.3882275183777554e-06, + "loss": 0.5312, + "step": 2025 + }, + { + "epoch": 1.45, + "grad_norm": 18.639047246847788, + "learning_rate": 7.385687970899523e-06, + "loss": 0.5078, + "step": 2026 + }, + { + "epoch": 1.45, + "grad_norm": 14.953213267741265, + "learning_rate": 7.3831476263185965e-06, + "loss": 0.4751, + "step": 2027 + }, + { + "epoch": 1.45, + "grad_norm": 14.322106542062425, + "learning_rate": 7.380606485483751e-06, + "loss": 0.4009, + "step": 2028 + }, + { + "epoch": 1.45, + "grad_norm": 43.45347427250669, + "learning_rate": 7.378064549244031e-06, + "loss": 0.5298, + "step": 2029 + }, + { + "epoch": 1.45, + "grad_norm": 34.42523425392354, + "learning_rate": 7.375521818448741e-06, + "loss": 0.5371, + "step": 2030 + }, + { + "epoch": 1.45, + "grad_norm": 55.77740354141135, + "learning_rate": 7.372978293947459e-06, + "loss": 0.4761, + "step": 2031 + }, + { + "epoch": 1.45, + "grad_norm": 49.09132676927039, + "learning_rate": 7.3704339765900205e-06, + "loss": 0.5615, + "step": 2032 + }, + { + "epoch": 1.45, + "grad_norm": 34.9256636465853, + "learning_rate": 7.367888867226531e-06, + "loss": 0.4976, + "step": 2033 + }, + { + "epoch": 1.45, + "grad_norm": 15.312677064646966, + "learning_rate": 7.365342966707359e-06, + "loss": 0.4487, + "step": 2034 + }, + { + "epoch": 1.45, + "grad_norm": 12.813978743326054, + "learning_rate": 7.362796275883135e-06, + "loss": 0.4038, + "step": 2035 + }, + { + "epoch": 1.45, + "grad_norm": 12.643306134854747, + "learning_rate": 7.360248795604758e-06, + "loss": 0.4575, + "step": 2036 + }, + { + "epoch": 1.45, + "grad_norm": 12.868389092088139, + "learning_rate": 7.3577005267233885e-06, + "loss": 0.429, + "step": 2037 + }, + { + "epoch": 1.45, + "grad_norm": 9.982700342237198, + "learning_rate": 7.355151470090449e-06, + "loss": 0.4712, + "step": 2038 + }, + { + "epoch": 1.46, + "grad_norm": 9.034999940641313, + "learning_rate": 7.352601626557628e-06, + "loss": 0.4663, + "step": 2039 + }, + { + "epoch": 1.46, + "grad_norm": 20.88549792075512, + "learning_rate": 7.350050996976875e-06, + "loss": 0.5669, + "step": 2040 + }, + { + "epoch": 1.46, + "grad_norm": 9.208945274018559, + "learning_rate": 7.347499582200404e-06, + "loss": 0.4097, + "step": 2041 + }, + { + "epoch": 1.46, + "grad_norm": 11.404862265993508, + "learning_rate": 7.344947383080687e-06, + "loss": 0.4634, + "step": 2042 + }, + { + "epoch": 1.46, + "grad_norm": 8.366856522102745, + "learning_rate": 7.342394400470463e-06, + "loss": 0.3926, + "step": 2043 + }, + { + "epoch": 1.46, + "grad_norm": 9.362697922089307, + "learning_rate": 7.339840635222732e-06, + "loss": 0.4116, + "step": 2044 + }, + { + "epoch": 1.46, + "grad_norm": 10.008373884805586, + "learning_rate": 7.337286088190754e-06, + "loss": 0.4399, + "step": 2045 + }, + { + "epoch": 1.46, + "grad_norm": 18.955581395387284, + "learning_rate": 7.334730760228049e-06, + "loss": 0.5054, + "step": 2046 + }, + { + "epoch": 1.46, + "grad_norm": 18.736021901140624, + "learning_rate": 7.332174652188401e-06, + "loss": 0.5195, + "step": 2047 + }, + { + "epoch": 1.46, + "grad_norm": 12.418786675988692, + "learning_rate": 7.329617764925853e-06, + "loss": 0.5605, + "step": 2048 + }, + { + "epoch": 1.46, + "grad_norm": 10.156432856945289, + "learning_rate": 7.32706009929471e-06, + "loss": 0.5474, + "step": 2049 + }, + { + "epoch": 1.46, + "grad_norm": 15.360135291080843, + "learning_rate": 7.324501656149532e-06, + "loss": 0.4775, + "step": 2050 + }, + { + "epoch": 1.46, + "grad_norm": 6.446127423689205, + "learning_rate": 7.321942436345146e-06, + "loss": 0.324, + "step": 2051 + }, + { + "epoch": 1.46, + "grad_norm": 12.489691101046363, + "learning_rate": 7.319382440736632e-06, + "loss": 0.5054, + "step": 2052 + }, + { + "epoch": 1.47, + "grad_norm": 6.514088999847083, + "learning_rate": 7.316821670179335e-06, + "loss": 0.4668, + "step": 2053 + }, + { + "epoch": 1.47, + "grad_norm": 11.159539585109048, + "learning_rate": 7.314260125528854e-06, + "loss": 0.6167, + "step": 2054 + }, + { + "epoch": 1.47, + "grad_norm": 25.094579873479073, + "learning_rate": 7.311697807641048e-06, + "loss": 0.3867, + "step": 2055 + }, + { + "epoch": 1.47, + "grad_norm": 8.846673523885213, + "learning_rate": 7.3091347173720386e-06, + "loss": 0.4932, + "step": 2056 + }, + { + "epoch": 1.47, + "grad_norm": 6.2914105450018125, + "learning_rate": 7.3065708555781986e-06, + "loss": 0.4727, + "step": 2057 + }, + { + "epoch": 1.47, + "grad_norm": 19.595181627149866, + "learning_rate": 7.304006223116162e-06, + "loss": 0.5098, + "step": 2058 + }, + { + "epoch": 1.47, + "grad_norm": 19.44544649111758, + "learning_rate": 7.301440820842822e-06, + "loss": 0.4707, + "step": 2059 + }, + { + "epoch": 1.47, + "grad_norm": 13.018485991705333, + "learning_rate": 7.298874649615327e-06, + "loss": 0.6582, + "step": 2060 + }, + { + "epoch": 1.47, + "grad_norm": 5.661937937304063, + "learning_rate": 7.29630771029108e-06, + "loss": 0.3862, + "step": 2061 + }, + { + "epoch": 1.47, + "grad_norm": 7.548593437781802, + "learning_rate": 7.293740003727745e-06, + "loss": 0.4663, + "step": 2062 + }, + { + "epoch": 1.47, + "grad_norm": 17.2212974848326, + "learning_rate": 7.291171530783241e-06, + "loss": 0.4976, + "step": 2063 + }, + { + "epoch": 1.47, + "grad_norm": 11.235414814450403, + "learning_rate": 7.288602292315742e-06, + "loss": 0.5596, + "step": 2064 + }, + { + "epoch": 1.47, + "grad_norm": 15.313344290622826, + "learning_rate": 7.286032289183679e-06, + "loss": 0.5317, + "step": 2065 + }, + { + "epoch": 1.47, + "grad_norm": 14.94697386715504, + "learning_rate": 7.283461522245736e-06, + "loss": 0.5137, + "step": 2066 + }, + { + "epoch": 1.48, + "grad_norm": 10.081682104162239, + "learning_rate": 7.280889992360856e-06, + "loss": 0.5156, + "step": 2067 + }, + { + "epoch": 1.48, + "grad_norm": 7.742609851819826, + "learning_rate": 7.278317700388232e-06, + "loss": 0.4551, + "step": 2068 + }, + { + "epoch": 1.48, + "grad_norm": 16.514579824634136, + "learning_rate": 7.275744647187318e-06, + "loss": 0.5801, + "step": 2069 + }, + { + "epoch": 1.48, + "grad_norm": 8.69700237796884, + "learning_rate": 7.273170833617818e-06, + "loss": 0.4678, + "step": 2070 + }, + { + "epoch": 1.48, + "grad_norm": 11.058671415569465, + "learning_rate": 7.2705962605396895e-06, + "loss": 0.521, + "step": 2071 + }, + { + "epoch": 1.48, + "grad_norm": 18.321934672500923, + "learning_rate": 7.268020928813147e-06, + "loss": 0.4917, + "step": 2072 + }, + { + "epoch": 1.48, + "grad_norm": 16.94345204999045, + "learning_rate": 7.265444839298656e-06, + "loss": 0.4526, + "step": 2073 + }, + { + "epoch": 1.48, + "grad_norm": 12.541712902362422, + "learning_rate": 7.262867992856934e-06, + "loss": 0.4409, + "step": 2074 + }, + { + "epoch": 1.48, + "grad_norm": 19.3187862663539, + "learning_rate": 7.260290390348956e-06, + "loss": 0.5542, + "step": 2075 + }, + { + "epoch": 1.48, + "grad_norm": 9.647404924361794, + "learning_rate": 7.257712032635946e-06, + "loss": 0.5205, + "step": 2076 + }, + { + "epoch": 1.48, + "grad_norm": 9.207651531426404, + "learning_rate": 7.255132920579382e-06, + "loss": 0.4556, + "step": 2077 + }, + { + "epoch": 1.48, + "grad_norm": 11.087482473274372, + "learning_rate": 7.252553055040991e-06, + "loss": 0.4121, + "step": 2078 + }, + { + "epoch": 1.48, + "grad_norm": 25.25159930562991, + "learning_rate": 7.249972436882756e-06, + "loss": 0.5176, + "step": 2079 + }, + { + "epoch": 1.48, + "grad_norm": 17.702457354457117, + "learning_rate": 7.247391066966909e-06, + "loss": 0.4609, + "step": 2080 + }, + { + "epoch": 1.49, + "grad_norm": 7.603868415722688, + "learning_rate": 7.244808946155933e-06, + "loss": 0.438, + "step": 2081 + }, + { + "epoch": 1.49, + "grad_norm": 11.349525316850295, + "learning_rate": 7.242226075312564e-06, + "loss": 0.437, + "step": 2082 + }, + { + "epoch": 1.49, + "grad_norm": 10.177336362256062, + "learning_rate": 7.239642455299787e-06, + "loss": 0.4385, + "step": 2083 + }, + { + "epoch": 1.49, + "grad_norm": 23.495636394435223, + "learning_rate": 7.237058086980835e-06, + "loss": 0.6406, + "step": 2084 + }, + { + "epoch": 1.49, + "grad_norm": 12.827798850107106, + "learning_rate": 7.234472971219197e-06, + "loss": 0.4165, + "step": 2085 + }, + { + "epoch": 1.49, + "grad_norm": 17.754753037992685, + "learning_rate": 7.231887108878606e-06, + "loss": 0.5586, + "step": 2086 + }, + { + "epoch": 1.49, + "grad_norm": 11.180399436654513, + "learning_rate": 7.229300500823047e-06, + "loss": 0.4966, + "step": 2087 + }, + { + "epoch": 1.49, + "grad_norm": 21.98964058960827, + "learning_rate": 7.226713147916754e-06, + "loss": 0.4678, + "step": 2088 + }, + { + "epoch": 1.49, + "grad_norm": 12.21132602807958, + "learning_rate": 7.22412505102421e-06, + "loss": 0.4282, + "step": 2089 + }, + { + "epoch": 1.49, + "grad_norm": 8.175190157330336, + "learning_rate": 7.221536211010147e-06, + "loss": 0.3574, + "step": 2090 + }, + { + "epoch": 1.49, + "grad_norm": 10.902417216507908, + "learning_rate": 7.2189466287395425e-06, + "loss": 0.4727, + "step": 2091 + }, + { + "epoch": 1.49, + "grad_norm": 8.995076716425732, + "learning_rate": 7.216356305077625e-06, + "loss": 0.4937, + "step": 2092 + }, + { + "epoch": 1.49, + "grad_norm": 19.69763566486368, + "learning_rate": 7.21376524088987e-06, + "loss": 0.5181, + "step": 2093 + }, + { + "epoch": 1.49, + "grad_norm": 10.925797313775647, + "learning_rate": 7.211173437042001e-06, + "loss": 0.3823, + "step": 2094 + }, + { + "epoch": 1.5, + "grad_norm": 9.692483691370395, + "learning_rate": 7.208580894399986e-06, + "loss": 0.4917, + "step": 2095 + }, + { + "epoch": 1.5, + "grad_norm": 19.59837794732827, + "learning_rate": 7.205987613830043e-06, + "loss": 0.5283, + "step": 2096 + }, + { + "epoch": 1.5, + "grad_norm": 14.027224117019413, + "learning_rate": 7.203393596198635e-06, + "loss": 0.4348, + "step": 2097 + }, + { + "epoch": 1.5, + "grad_norm": 9.277539316575139, + "learning_rate": 7.200798842372472e-06, + "loss": 0.5938, + "step": 2098 + }, + { + "epoch": 1.5, + "grad_norm": 8.158219347531539, + "learning_rate": 7.198203353218508e-06, + "loss": 0.4829, + "step": 2099 + }, + { + "epoch": 1.5, + "grad_norm": 12.788815769871793, + "learning_rate": 7.195607129603946e-06, + "loss": 0.4785, + "step": 2100 + }, + { + "epoch": 1.5, + "grad_norm": 13.303724332755069, + "learning_rate": 7.19301017239623e-06, + "loss": 0.4536, + "step": 2101 + }, + { + "epoch": 1.5, + "grad_norm": 8.88556880734833, + "learning_rate": 7.190412482463054e-06, + "loss": 0.4185, + "step": 2102 + }, + { + "epoch": 1.5, + "grad_norm": 9.39510497345665, + "learning_rate": 7.187814060672354e-06, + "loss": 0.499, + "step": 2103 + }, + { + "epoch": 1.5, + "grad_norm": 7.517777067379724, + "learning_rate": 7.1852149078923105e-06, + "loss": 0.4316, + "step": 2104 + }, + { + "epoch": 1.5, + "grad_norm": 11.90115946127883, + "learning_rate": 7.1826150249913495e-06, + "loss": 0.5176, + "step": 2105 + }, + { + "epoch": 1.5, + "grad_norm": 9.267334886302498, + "learning_rate": 7.18001441283814e-06, + "loss": 0.3643, + "step": 2106 + }, + { + "epoch": 1.5, + "grad_norm": 22.066236031166735, + "learning_rate": 7.1774130723015955e-06, + "loss": 0.6748, + "step": 2107 + }, + { + "epoch": 1.5, + "grad_norm": 6.466559423365932, + "learning_rate": 7.17481100425087e-06, + "loss": 0.3584, + "step": 2108 + }, + { + "epoch": 1.51, + "grad_norm": 11.920295137461906, + "learning_rate": 7.172208209555365e-06, + "loss": 0.4668, + "step": 2109 + }, + { + "epoch": 1.51, + "grad_norm": 10.95986098611621, + "learning_rate": 7.1696046890847206e-06, + "loss": 0.4946, + "step": 2110 + }, + { + "epoch": 1.51, + "grad_norm": 16.786202212897535, + "learning_rate": 7.167000443708823e-06, + "loss": 0.5708, + "step": 2111 + }, + { + "epoch": 1.51, + "grad_norm": 6.838902041674624, + "learning_rate": 7.164395474297798e-06, + "loss": 0.3911, + "step": 2112 + }, + { + "epoch": 1.51, + "grad_norm": 9.176921095103253, + "learning_rate": 7.161789781722016e-06, + "loss": 0.4888, + "step": 2113 + }, + { + "epoch": 1.51, + "grad_norm": 15.90810755723991, + "learning_rate": 7.159183366852085e-06, + "loss": 0.522, + "step": 2114 + }, + { + "epoch": 1.51, + "grad_norm": 25.715900600535367, + "learning_rate": 7.156576230558859e-06, + "loss": 0.5347, + "step": 2115 + }, + { + "epoch": 1.51, + "grad_norm": 28.28368775228431, + "learning_rate": 7.153968373713429e-06, + "loss": 0.5601, + "step": 2116 + }, + { + "epoch": 1.51, + "grad_norm": 11.980625504053775, + "learning_rate": 7.1513597971871295e-06, + "loss": 0.4326, + "step": 2117 + }, + { + "epoch": 1.51, + "grad_norm": 12.501120633814393, + "learning_rate": 7.148750501851532e-06, + "loss": 0.4717, + "step": 2118 + }, + { + "epoch": 1.51, + "grad_norm": 7.712009993731347, + "learning_rate": 7.1461404885784545e-06, + "loss": 0.4873, + "step": 2119 + }, + { + "epoch": 1.51, + "grad_norm": 13.12024516319098, + "learning_rate": 7.1435297582399475e-06, + "loss": 0.4927, + "step": 2120 + }, + { + "epoch": 1.51, + "grad_norm": 7.745722717197969, + "learning_rate": 7.140918311708306e-06, + "loss": 0.459, + "step": 2121 + }, + { + "epoch": 1.51, + "grad_norm": 8.688287138210573, + "learning_rate": 7.138306149856062e-06, + "loss": 0.3975, + "step": 2122 + }, + { + "epoch": 1.52, + "grad_norm": 7.090721711186, + "learning_rate": 7.1356932735559905e-06, + "loss": 0.4312, + "step": 2123 + }, + { + "epoch": 1.52, + "grad_norm": 13.510222651737925, + "learning_rate": 7.133079683681099e-06, + "loss": 0.5864, + "step": 2124 + }, + { + "epoch": 1.52, + "grad_norm": 8.147602862513805, + "learning_rate": 7.130465381104635e-06, + "loss": 0.3774, + "step": 2125 + }, + { + "epoch": 1.52, + "grad_norm": 8.962162623356548, + "learning_rate": 7.1278503667000885e-06, + "loss": 0.4297, + "step": 2126 + }, + { + "epoch": 1.52, + "grad_norm": 7.176462569931615, + "learning_rate": 7.125234641341185e-06, + "loss": 0.4458, + "step": 2127 + }, + { + "epoch": 1.52, + "grad_norm": 7.786769472257886, + "learning_rate": 7.1226182059018835e-06, + "loss": 0.397, + "step": 2128 + }, + { + "epoch": 1.52, + "grad_norm": 10.022347014680557, + "learning_rate": 7.120001061256387e-06, + "loss": 0.3726, + "step": 2129 + }, + { + "epoch": 1.52, + "grad_norm": 16.58498208142081, + "learning_rate": 7.1173832082791294e-06, + "loss": 0.5288, + "step": 2130 + }, + { + "epoch": 1.52, + "grad_norm": 14.626054474790065, + "learning_rate": 7.114764647844788e-06, + "loss": 0.4883, + "step": 2131 + }, + { + "epoch": 1.52, + "grad_norm": 15.442528730947034, + "learning_rate": 7.112145380828267e-06, + "loss": 0.5249, + "step": 2132 + }, + { + "epoch": 1.52, + "grad_norm": 12.78478336790942, + "learning_rate": 7.109525408104717e-06, + "loss": 0.5713, + "step": 2133 + }, + { + "epoch": 1.52, + "grad_norm": 14.13171845697349, + "learning_rate": 7.106904730549517e-06, + "loss": 0.4658, + "step": 2134 + }, + { + "epoch": 1.52, + "grad_norm": 7.812889203251429, + "learning_rate": 7.104283349038285e-06, + "loss": 0.4365, + "step": 2135 + }, + { + "epoch": 1.52, + "grad_norm": 10.244931909540133, + "learning_rate": 7.101661264446875e-06, + "loss": 0.4551, + "step": 2136 + }, + { + "epoch": 1.53, + "grad_norm": 6.42786049799438, + "learning_rate": 7.099038477651371e-06, + "loss": 0.3369, + "step": 2137 + }, + { + "epoch": 1.53, + "grad_norm": 7.446564970154658, + "learning_rate": 7.096414989528095e-06, + "loss": 0.4336, + "step": 2138 + }, + { + "epoch": 1.53, + "grad_norm": 11.95393484819928, + "learning_rate": 7.093790800953606e-06, + "loss": 0.5459, + "step": 2139 + }, + { + "epoch": 1.53, + "grad_norm": 8.975724248095739, + "learning_rate": 7.091165912804693e-06, + "loss": 0.4619, + "step": 2140 + }, + { + "epoch": 1.53, + "grad_norm": 8.463205000165559, + "learning_rate": 7.088540325958379e-06, + "loss": 0.4702, + "step": 2141 + }, + { + "epoch": 1.53, + "grad_norm": 8.668064646053395, + "learning_rate": 7.085914041291921e-06, + "loss": 0.4897, + "step": 2142 + }, + { + "epoch": 1.53, + "grad_norm": 17.501799641654888, + "learning_rate": 7.08328705968281e-06, + "loss": 0.5454, + "step": 2143 + }, + { + "epoch": 1.53, + "grad_norm": 7.3244754880169625, + "learning_rate": 7.080659382008772e-06, + "loss": 0.4458, + "step": 2144 + }, + { + "epoch": 1.53, + "grad_norm": 11.920592534439697, + "learning_rate": 7.078031009147759e-06, + "loss": 0.6465, + "step": 2145 + }, + { + "epoch": 1.53, + "grad_norm": 7.120473562340443, + "learning_rate": 7.075401941977961e-06, + "loss": 0.4741, + "step": 2146 + }, + { + "epoch": 1.53, + "grad_norm": 8.506397076951163, + "learning_rate": 7.072772181377798e-06, + "loss": 0.4634, + "step": 2147 + }, + { + "epoch": 1.53, + "grad_norm": 16.898055505594296, + "learning_rate": 7.070141728225922e-06, + "loss": 0.5186, + "step": 2148 + }, + { + "epoch": 1.53, + "grad_norm": 14.770324812358789, + "learning_rate": 7.067510583401217e-06, + "loss": 0.4585, + "step": 2149 + }, + { + "epoch": 1.53, + "grad_norm": 13.618834750606382, + "learning_rate": 7.0648787477827965e-06, + "loss": 0.4229, + "step": 2150 + }, + { + "epoch": 1.54, + "grad_norm": 13.901946072429947, + "learning_rate": 7.062246222250005e-06, + "loss": 0.4673, + "step": 2151 + }, + { + "epoch": 1.54, + "grad_norm": 11.322546513477945, + "learning_rate": 7.05961300768242e-06, + "loss": 0.4419, + "step": 2152 + }, + { + "epoch": 1.54, + "grad_norm": 21.827793100279667, + "learning_rate": 7.056979104959847e-06, + "loss": 0.6724, + "step": 2153 + }, + { + "epoch": 1.54, + "grad_norm": 9.850403835999005, + "learning_rate": 7.054344514962319e-06, + "loss": 0.4351, + "step": 2154 + }, + { + "epoch": 1.54, + "grad_norm": 11.28905672300358, + "learning_rate": 7.051709238570106e-06, + "loss": 0.5864, + "step": 2155 + }, + { + "epoch": 1.54, + "grad_norm": 11.027979265939116, + "learning_rate": 7.0490732766637e-06, + "loss": 0.4814, + "step": 2156 + }, + { + "epoch": 1.54, + "grad_norm": 14.594956263785003, + "learning_rate": 7.046436630123826e-06, + "loss": 0.5908, + "step": 2157 + }, + { + "epoch": 1.54, + "grad_norm": 7.349980848273088, + "learning_rate": 7.043799299831438e-06, + "loss": 0.4062, + "step": 2158 + }, + { + "epoch": 1.54, + "grad_norm": 11.422633603478822, + "learning_rate": 7.041161286667713e-06, + "loss": 0.4761, + "step": 2159 + }, + { + "epoch": 1.54, + "grad_norm": 7.42059519880841, + "learning_rate": 7.038522591514061e-06, + "loss": 0.4302, + "step": 2160 + }, + { + "epoch": 1.54, + "grad_norm": 14.084142584061851, + "learning_rate": 7.035883215252123e-06, + "loss": 0.4736, + "step": 2161 + }, + { + "epoch": 1.54, + "grad_norm": 16.355044115107113, + "learning_rate": 7.03324315876376e-06, + "loss": 0.5054, + "step": 2162 + }, + { + "epoch": 1.54, + "grad_norm": 7.3052309454934035, + "learning_rate": 7.030602422931065e-06, + "loss": 0.4629, + "step": 2163 + }, + { + "epoch": 1.54, + "grad_norm": 10.739648409543541, + "learning_rate": 7.027961008636359e-06, + "loss": 0.4648, + "step": 2164 + }, + { + "epoch": 1.55, + "grad_norm": 8.804729154765832, + "learning_rate": 7.025318916762185e-06, + "loss": 0.3853, + "step": 2165 + }, + { + "epoch": 1.55, + "grad_norm": 16.13762113342716, + "learning_rate": 7.022676148191315e-06, + "loss": 0.604, + "step": 2166 + }, + { + "epoch": 1.55, + "grad_norm": 15.74981032671273, + "learning_rate": 7.020032703806748e-06, + "loss": 0.4409, + "step": 2167 + }, + { + "epoch": 1.55, + "grad_norm": 9.4004788730862, + "learning_rate": 7.017388584491709e-06, + "loss": 0.4077, + "step": 2168 + }, + { + "epoch": 1.55, + "grad_norm": 17.52904134228692, + "learning_rate": 7.014743791129644e-06, + "loss": 0.5288, + "step": 2169 + }, + { + "epoch": 1.55, + "grad_norm": 9.05836662525096, + "learning_rate": 7.012098324604231e-06, + "loss": 0.396, + "step": 2170 + }, + { + "epoch": 1.55, + "grad_norm": 11.211986374727667, + "learning_rate": 7.009452185799368e-06, + "loss": 0.5439, + "step": 2171 + }, + { + "epoch": 1.55, + "grad_norm": 13.01994379164931, + "learning_rate": 7.00680537559918e-06, + "loss": 0.5039, + "step": 2172 + }, + { + "epoch": 1.55, + "grad_norm": 13.969090071115868, + "learning_rate": 7.0041578948880155e-06, + "loss": 0.4111, + "step": 2173 + }, + { + "epoch": 1.55, + "grad_norm": 15.659257826403904, + "learning_rate": 7.001509744550446e-06, + "loss": 0.543, + "step": 2174 + }, + { + "epoch": 1.55, + "grad_norm": 21.720135778440106, + "learning_rate": 6.998860925471267e-06, + "loss": 0.5352, + "step": 2175 + }, + { + "epoch": 1.55, + "grad_norm": 10.39302176847214, + "learning_rate": 6.9962114385355e-06, + "loss": 0.4092, + "step": 2176 + }, + { + "epoch": 1.55, + "grad_norm": 13.659365134233374, + "learning_rate": 6.993561284628388e-06, + "loss": 0.5156, + "step": 2177 + }, + { + "epoch": 1.55, + "grad_norm": 7.458849658112828, + "learning_rate": 6.990910464635395e-06, + "loss": 0.4067, + "step": 2178 + }, + { + "epoch": 1.56, + "grad_norm": 11.086418567695002, + "learning_rate": 6.9882589794422105e-06, + "loss": 0.418, + "step": 2179 + }, + { + "epoch": 1.56, + "grad_norm": 12.32462739243237, + "learning_rate": 6.9856068299347455e-06, + "loss": 0.4932, + "step": 2180 + }, + { + "epoch": 1.56, + "grad_norm": 8.227065489328242, + "learning_rate": 6.98295401699913e-06, + "loss": 0.4438, + "step": 2181 + }, + { + "epoch": 1.56, + "grad_norm": 8.53687999777279, + "learning_rate": 6.980300541521721e-06, + "loss": 0.4766, + "step": 2182 + }, + { + "epoch": 1.56, + "grad_norm": 7.091176799000983, + "learning_rate": 6.977646404389092e-06, + "loss": 0.3911, + "step": 2183 + }, + { + "epoch": 1.56, + "grad_norm": 18.60903069175726, + "learning_rate": 6.9749916064880404e-06, + "loss": 0.6152, + "step": 2184 + }, + { + "epoch": 1.56, + "grad_norm": 14.532470312845014, + "learning_rate": 6.972336148705583e-06, + "loss": 0.4365, + "step": 2185 + }, + { + "epoch": 1.56, + "grad_norm": 13.726765751198604, + "learning_rate": 6.969680031928959e-06, + "loss": 0.4414, + "step": 2186 + }, + { + "epoch": 1.56, + "grad_norm": 10.729404027235125, + "learning_rate": 6.967023257045624e-06, + "loss": 0.4883, + "step": 2187 + }, + { + "epoch": 1.56, + "grad_norm": 11.006117166133711, + "learning_rate": 6.96436582494326e-06, + "loss": 0.375, + "step": 2188 + }, + { + "epoch": 1.56, + "grad_norm": 10.304932091242616, + "learning_rate": 6.961707736509759e-06, + "loss": 0.5664, + "step": 2189 + }, + { + "epoch": 1.56, + "grad_norm": 12.158180976223932, + "learning_rate": 6.959048992633241e-06, + "loss": 0.4287, + "step": 2190 + }, + { + "epoch": 1.56, + "grad_norm": 7.951351287243968, + "learning_rate": 6.956389594202041e-06, + "loss": 0.4077, + "step": 2191 + }, + { + "epoch": 1.56, + "grad_norm": 11.814399269265236, + "learning_rate": 6.953729542104713e-06, + "loss": 0.4473, + "step": 2192 + }, + { + "epoch": 1.57, + "grad_norm": 9.228592101191628, + "learning_rate": 6.951068837230032e-06, + "loss": 0.6001, + "step": 2193 + }, + { + "epoch": 1.57, + "grad_norm": 8.684761092197428, + "learning_rate": 6.9484074804669865e-06, + "loss": 0.4868, + "step": 2194 + }, + { + "epoch": 1.57, + "grad_norm": 13.022339608408158, + "learning_rate": 6.945745472704786e-06, + "loss": 0.4446, + "step": 2195 + }, + { + "epoch": 1.57, + "grad_norm": 15.806399945358521, + "learning_rate": 6.943082814832858e-06, + "loss": 0.439, + "step": 2196 + }, + { + "epoch": 1.57, + "grad_norm": 7.733376328962677, + "learning_rate": 6.940419507740843e-06, + "loss": 0.5063, + "step": 2197 + }, + { + "epoch": 1.57, + "grad_norm": 9.045131247868405, + "learning_rate": 6.937755552318606e-06, + "loss": 0.4028, + "step": 2198 + }, + { + "epoch": 1.57, + "grad_norm": 8.45991812912647, + "learning_rate": 6.935090949456219e-06, + "loss": 0.4683, + "step": 2199 + }, + { + "epoch": 1.57, + "grad_norm": 10.754774822400977, + "learning_rate": 6.93242570004398e-06, + "loss": 0.4248, + "step": 2200 + }, + { + "epoch": 1.57, + "grad_norm": 9.026980227662335, + "learning_rate": 6.929759804972394e-06, + "loss": 0.4004, + "step": 2201 + }, + { + "epoch": 1.57, + "grad_norm": 7.231964685107052, + "learning_rate": 6.92709326513219e-06, + "loss": 0.3926, + "step": 2202 + }, + { + "epoch": 1.57, + "grad_norm": 8.292860309744759, + "learning_rate": 6.924426081414305e-06, + "loss": 0.4873, + "step": 2203 + }, + { + "epoch": 1.57, + "grad_norm": 8.757286190878336, + "learning_rate": 6.921758254709897e-06, + "loss": 0.3643, + "step": 2204 + }, + { + "epoch": 1.57, + "grad_norm": 11.606419829333902, + "learning_rate": 6.919089785910336e-06, + "loss": 0.4116, + "step": 2205 + }, + { + "epoch": 1.57, + "grad_norm": 8.234839110113752, + "learning_rate": 6.916420675907207e-06, + "loss": 0.4482, + "step": 2206 + }, + { + "epoch": 1.58, + "grad_norm": 11.023740197910566, + "learning_rate": 6.9137509255923085e-06, + "loss": 0.4917, + "step": 2207 + }, + { + "epoch": 1.58, + "grad_norm": 9.280209803610372, + "learning_rate": 6.911080535857655e-06, + "loss": 0.4019, + "step": 2208 + }, + { + "epoch": 1.58, + "grad_norm": 10.746299427986333, + "learning_rate": 6.908409507595472e-06, + "loss": 0.5361, + "step": 2209 + }, + { + "epoch": 1.58, + "grad_norm": 10.906230626644021, + "learning_rate": 6.905737841698201e-06, + "loss": 0.4429, + "step": 2210 + }, + { + "epoch": 1.58, + "grad_norm": 10.155722381902946, + "learning_rate": 6.903065539058496e-06, + "loss": 0.4624, + "step": 2211 + }, + { + "epoch": 1.58, + "grad_norm": 9.196138695564693, + "learning_rate": 6.900392600569219e-06, + "loss": 0.3521, + "step": 2212 + }, + { + "epoch": 1.58, + "grad_norm": 10.280936832961778, + "learning_rate": 6.897719027123451e-06, + "loss": 0.4634, + "step": 2213 + }, + { + "epoch": 1.58, + "grad_norm": 9.371583647428105, + "learning_rate": 6.895044819614484e-06, + "loss": 0.3848, + "step": 2214 + }, + { + "epoch": 1.58, + "grad_norm": 8.266982874224437, + "learning_rate": 6.8923699789358185e-06, + "loss": 0.3877, + "step": 2215 + }, + { + "epoch": 1.58, + "grad_norm": 17.175344791502628, + "learning_rate": 6.88969450598117e-06, + "loss": 0.4609, + "step": 2216 + }, + { + "epoch": 1.58, + "grad_norm": 20.899917847127078, + "learning_rate": 6.887018401644463e-06, + "loss": 0.5835, + "step": 2217 + }, + { + "epoch": 1.58, + "grad_norm": 14.085073006810147, + "learning_rate": 6.884341666819832e-06, + "loss": 0.4443, + "step": 2218 + }, + { + "epoch": 1.58, + "grad_norm": 10.194755959365828, + "learning_rate": 6.881664302401626e-06, + "loss": 0.5088, + "step": 2219 + }, + { + "epoch": 1.58, + "grad_norm": 28.041590639227383, + "learning_rate": 6.878986309284401e-06, + "loss": 0.4272, + "step": 2220 + }, + { + "epoch": 1.59, + "grad_norm": 18.858404710323796, + "learning_rate": 6.876307688362925e-06, + "loss": 0.5181, + "step": 2221 + }, + { + "epoch": 1.59, + "grad_norm": 10.769923044069692, + "learning_rate": 6.873628440532175e-06, + "loss": 0.4094, + "step": 2222 + }, + { + "epoch": 1.59, + "grad_norm": 10.760643999959065, + "learning_rate": 6.8709485666873375e-06, + "loss": 0.4131, + "step": 2223 + }, + { + "epoch": 1.59, + "grad_norm": 10.059077802767067, + "learning_rate": 6.868268067723808e-06, + "loss": 0.498, + "step": 2224 + }, + { + "epoch": 1.59, + "grad_norm": 16.067818695297724, + "learning_rate": 6.86558694453719e-06, + "loss": 0.4736, + "step": 2225 + }, + { + "epoch": 1.59, + "grad_norm": 11.23251193640447, + "learning_rate": 6.8629051980233e-06, + "loss": 0.415, + "step": 2226 + }, + { + "epoch": 1.59, + "grad_norm": 7.114819340426512, + "learning_rate": 6.860222829078156e-06, + "loss": 0.3423, + "step": 2227 + }, + { + "epoch": 1.59, + "grad_norm": 12.762589126480844, + "learning_rate": 6.857539838597987e-06, + "loss": 0.3765, + "step": 2228 + }, + { + "epoch": 1.59, + "grad_norm": 9.926047987180356, + "learning_rate": 6.8548562274792325e-06, + "loss": 0.4263, + "step": 2229 + }, + { + "epoch": 1.59, + "grad_norm": 13.231555849492707, + "learning_rate": 6.8521719966185355e-06, + "loss": 0.498, + "step": 2230 + }, + { + "epoch": 1.59, + "grad_norm": 14.73486658686708, + "learning_rate": 6.8494871469127474e-06, + "loss": 0.5493, + "step": 2231 + }, + { + "epoch": 1.59, + "grad_norm": 12.05707560464292, + "learning_rate": 6.846801679258926e-06, + "loss": 0.5015, + "step": 2232 + }, + { + "epoch": 1.59, + "grad_norm": 10.28340838759031, + "learning_rate": 6.844115594554338e-06, + "loss": 0.3784, + "step": 2233 + }, + { + "epoch": 1.59, + "grad_norm": 21.26492682141842, + "learning_rate": 6.841428893696453e-06, + "loss": 0.521, + "step": 2234 + }, + { + "epoch": 1.6, + "grad_norm": 8.269713147364001, + "learning_rate": 6.838741577582946e-06, + "loss": 0.4575, + "step": 2235 + }, + { + "epoch": 1.6, + "grad_norm": 12.193115329733613, + "learning_rate": 6.836053647111701e-06, + "loss": 0.4976, + "step": 2236 + }, + { + "epoch": 1.6, + "grad_norm": 8.42556116246502, + "learning_rate": 6.833365103180806e-06, + "loss": 0.4404, + "step": 2237 + }, + { + "epoch": 1.6, + "grad_norm": 11.312625877201313, + "learning_rate": 6.830675946688552e-06, + "loss": 0.4473, + "step": 2238 + }, + { + "epoch": 1.6, + "grad_norm": 11.467050902572582, + "learning_rate": 6.827986178533437e-06, + "loss": 0.5322, + "step": 2239 + }, + { + "epoch": 1.6, + "grad_norm": 7.750353718427244, + "learning_rate": 6.825295799614163e-06, + "loss": 0.4478, + "step": 2240 + }, + { + "epoch": 1.6, + "grad_norm": 8.352234008790848, + "learning_rate": 6.822604810829634e-06, + "loss": 0.4609, + "step": 2241 + }, + { + "epoch": 1.6, + "grad_norm": 8.559244440826264, + "learning_rate": 6.819913213078961e-06, + "loss": 0.3923, + "step": 2242 + }, + { + "epoch": 1.6, + "grad_norm": 14.88840732071109, + "learning_rate": 6.817221007261456e-06, + "loss": 0.4204, + "step": 2243 + }, + { + "epoch": 1.6, + "grad_norm": 10.030745063332024, + "learning_rate": 6.814528194276636e-06, + "loss": 0.4292, + "step": 2244 + }, + { + "epoch": 1.6, + "grad_norm": 11.309812809426399, + "learning_rate": 6.811834775024219e-06, + "loss": 0.5493, + "step": 2245 + }, + { + "epoch": 1.6, + "grad_norm": 9.66642562016404, + "learning_rate": 6.809140750404127e-06, + "loss": 0.4292, + "step": 2246 + }, + { + "epoch": 1.6, + "grad_norm": 10.012313894198071, + "learning_rate": 6.8064461213164825e-06, + "loss": 0.4946, + "step": 2247 + }, + { + "epoch": 1.6, + "grad_norm": 9.857244202004253, + "learning_rate": 6.803750888661611e-06, + "loss": 0.4478, + "step": 2248 + }, + { + "epoch": 1.61, + "grad_norm": 13.033407840690604, + "learning_rate": 6.8010550533400425e-06, + "loss": 0.438, + "step": 2249 + }, + { + "epoch": 1.61, + "grad_norm": 10.267146635510022, + "learning_rate": 6.798358616252503e-06, + "loss": 0.4214, + "step": 2250 + }, + { + "epoch": 1.61, + "grad_norm": 18.114380153302204, + "learning_rate": 6.795661578299924e-06, + "loss": 0.4097, + "step": 2251 + }, + { + "epoch": 1.61, + "grad_norm": 15.6843245806793, + "learning_rate": 6.792963940383436e-06, + "loss": 0.5952, + "step": 2252 + }, + { + "epoch": 1.61, + "grad_norm": 16.510405242918026, + "learning_rate": 6.790265703404368e-06, + "loss": 0.4707, + "step": 2253 + }, + { + "epoch": 1.61, + "grad_norm": 8.494076326644194, + "learning_rate": 6.787566868264253e-06, + "loss": 0.4829, + "step": 2254 + }, + { + "epoch": 1.61, + "grad_norm": 17.25444248319594, + "learning_rate": 6.7848674358648195e-06, + "loss": 0.438, + "step": 2255 + }, + { + "epoch": 1.61, + "grad_norm": 13.54816942321224, + "learning_rate": 6.782167407108001e-06, + "loss": 0.5273, + "step": 2256 + }, + { + "epoch": 1.61, + "grad_norm": 15.11514410545727, + "learning_rate": 6.779466782895926e-06, + "loss": 0.4658, + "step": 2257 + }, + { + "epoch": 1.61, + "grad_norm": 16.18230982436298, + "learning_rate": 6.7767655641309234e-06, + "loss": 0.5889, + "step": 2258 + }, + { + "epoch": 1.61, + "grad_norm": 11.47295733121618, + "learning_rate": 6.7740637517155205e-06, + "loss": 0.5142, + "step": 2259 + }, + { + "epoch": 1.61, + "grad_norm": 11.585772419117465, + "learning_rate": 6.771361346552445e-06, + "loss": 0.4607, + "step": 2260 + }, + { + "epoch": 1.61, + "grad_norm": 24.554032105944135, + "learning_rate": 6.7686583495446164e-06, + "loss": 0.4375, + "step": 2261 + }, + { + "epoch": 1.61, + "grad_norm": 14.428829908613086, + "learning_rate": 6.765954761595161e-06, + "loss": 0.5117, + "step": 2262 + }, + { + "epoch": 1.62, + "grad_norm": 6.54475905891446, + "learning_rate": 6.763250583607392e-06, + "loss": 0.3823, + "step": 2263 + }, + { + "epoch": 1.62, + "grad_norm": 15.52379110881373, + "learning_rate": 6.7605458164848316e-06, + "loss": 0.4619, + "step": 2264 + }, + { + "epoch": 1.62, + "grad_norm": 13.02231162018557, + "learning_rate": 6.75784046113119e-06, + "loss": 0.5483, + "step": 2265 + }, + { + "epoch": 1.62, + "grad_norm": 10.998871840978227, + "learning_rate": 6.755134518450377e-06, + "loss": 0.502, + "step": 2266 + }, + { + "epoch": 1.62, + "grad_norm": 21.813653765158957, + "learning_rate": 6.752427989346497e-06, + "loss": 0.4629, + "step": 2267 + }, + { + "epoch": 1.62, + "grad_norm": 18.768658187986116, + "learning_rate": 6.749720874723854e-06, + "loss": 0.4678, + "step": 2268 + }, + { + "epoch": 1.62, + "grad_norm": 8.821712753766564, + "learning_rate": 6.747013175486944e-06, + "loss": 0.4683, + "step": 2269 + }, + { + "epoch": 1.62, + "grad_norm": 16.14702978647226, + "learning_rate": 6.74430489254046e-06, + "loss": 0.542, + "step": 2270 + }, + { + "epoch": 1.62, + "grad_norm": 7.591931087130397, + "learning_rate": 6.741596026789288e-06, + "loss": 0.5176, + "step": 2271 + }, + { + "epoch": 1.62, + "grad_norm": 9.99774676180277, + "learning_rate": 6.7388865791385124e-06, + "loss": 0.4536, + "step": 2272 + }, + { + "epoch": 1.62, + "grad_norm": 10.404000893575382, + "learning_rate": 6.736176550493411e-06, + "loss": 0.5005, + "step": 2273 + }, + { + "epoch": 1.62, + "grad_norm": 14.67200743832191, + "learning_rate": 6.7334659417594514e-06, + "loss": 0.5234, + "step": 2274 + }, + { + "epoch": 1.62, + "grad_norm": 13.823394969175009, + "learning_rate": 6.730754753842303e-06, + "loss": 0.4229, + "step": 2275 + }, + { + "epoch": 1.62, + "grad_norm": 9.011366696228007, + "learning_rate": 6.728042987647818e-06, + "loss": 0.3921, + "step": 2276 + }, + { + "epoch": 1.63, + "grad_norm": 16.774675579908692, + "learning_rate": 6.725330644082054e-06, + "loss": 0.5049, + "step": 2277 + }, + { + "epoch": 1.63, + "grad_norm": 8.935065734139199, + "learning_rate": 6.7226177240512516e-06, + "loss": 0.4927, + "step": 2278 + }, + { + "epoch": 1.63, + "grad_norm": 8.554489129584331, + "learning_rate": 6.7199042284618484e-06, + "loss": 0.4419, + "step": 2279 + }, + { + "epoch": 1.63, + "grad_norm": 10.926780112484742, + "learning_rate": 6.717190158220475e-06, + "loss": 0.5508, + "step": 2280 + }, + { + "epoch": 1.63, + "grad_norm": 9.840068980238367, + "learning_rate": 6.714475514233951e-06, + "loss": 0.4165, + "step": 2281 + }, + { + "epoch": 1.63, + "grad_norm": 20.501441251965698, + "learning_rate": 6.71176029740929e-06, + "loss": 0.4741, + "step": 2282 + }, + { + "epoch": 1.63, + "grad_norm": 14.55482426352725, + "learning_rate": 6.709044508653697e-06, + "loss": 0.4609, + "step": 2283 + }, + { + "epoch": 1.63, + "grad_norm": 11.114294673812632, + "learning_rate": 6.706328148874568e-06, + "loss": 0.4961, + "step": 2284 + }, + { + "epoch": 1.63, + "grad_norm": 16.349083200716997, + "learning_rate": 6.703611218979488e-06, + "loss": 0.5098, + "step": 2285 + }, + { + "epoch": 1.63, + "grad_norm": 11.011989880992953, + "learning_rate": 6.700893719876234e-06, + "loss": 0.4868, + "step": 2286 + }, + { + "epoch": 1.63, + "grad_norm": 10.689757524975285, + "learning_rate": 6.698175652472774e-06, + "loss": 0.4512, + "step": 2287 + }, + { + "epoch": 1.63, + "grad_norm": 12.898403684293179, + "learning_rate": 6.695457017677263e-06, + "loss": 0.4131, + "step": 2288 + }, + { + "epoch": 1.63, + "grad_norm": 10.240922912512271, + "learning_rate": 6.692737816398048e-06, + "loss": 0.5112, + "step": 2289 + }, + { + "epoch": 1.63, + "grad_norm": 16.147572604976922, + "learning_rate": 6.6900180495436664e-06, + "loss": 0.3979, + "step": 2290 + }, + { + "epoch": 1.64, + "grad_norm": 19.700085377690556, + "learning_rate": 6.68729771802284e-06, + "loss": 0.4995, + "step": 2291 + }, + { + "epoch": 1.64, + "grad_norm": 8.978659517104596, + "learning_rate": 6.6845768227444855e-06, + "loss": 0.3945, + "step": 2292 + }, + { + "epoch": 1.64, + "grad_norm": 11.810366816874936, + "learning_rate": 6.681855364617702e-06, + "loss": 0.4082, + "step": 2293 + }, + { + "epoch": 1.64, + "grad_norm": 11.532079335607683, + "learning_rate": 6.67913334455178e-06, + "loss": 0.4424, + "step": 2294 + }, + { + "epoch": 1.64, + "grad_norm": 15.749828735552693, + "learning_rate": 6.676410763456197e-06, + "loss": 0.4722, + "step": 2295 + }, + { + "epoch": 1.64, + "grad_norm": 9.918601367919049, + "learning_rate": 6.673687622240619e-06, + "loss": 0.4126, + "step": 2296 + }, + { + "epoch": 1.64, + "grad_norm": 14.349256190704107, + "learning_rate": 6.670963921814896e-06, + "loss": 0.5859, + "step": 2297 + }, + { + "epoch": 1.64, + "grad_norm": 26.856607295110678, + "learning_rate": 6.668239663089069e-06, + "loss": 0.542, + "step": 2298 + }, + { + "epoch": 1.64, + "grad_norm": 13.00766965784754, + "learning_rate": 6.665514846973361e-06, + "loss": 0.4756, + "step": 2299 + }, + { + "epoch": 1.64, + "grad_norm": 14.099268884344017, + "learning_rate": 6.662789474378186e-06, + "loss": 0.5103, + "step": 2300 + }, + { + "epoch": 1.64, + "grad_norm": 13.330890563879269, + "learning_rate": 6.6600635462141415e-06, + "loss": 0.355, + "step": 2301 + }, + { + "epoch": 1.64, + "grad_norm": 11.992856781553082, + "learning_rate": 6.657337063392011e-06, + "loss": 0.4316, + "step": 2302 + }, + { + "epoch": 1.64, + "grad_norm": 6.711043536387954, + "learning_rate": 6.654610026822761e-06, + "loss": 0.3696, + "step": 2303 + }, + { + "epoch": 1.64, + "grad_norm": 11.942728892170798, + "learning_rate": 6.651882437417546e-06, + "loss": 0.4727, + "step": 2304 + }, + { + "epoch": 1.65, + "grad_norm": 11.25898618932495, + "learning_rate": 6.649154296087705e-06, + "loss": 0.5059, + "step": 2305 + }, + { + "epoch": 1.65, + "grad_norm": 10.960042275216363, + "learning_rate": 6.646425603744759e-06, + "loss": 0.4067, + "step": 2306 + }, + { + "epoch": 1.65, + "grad_norm": 11.117852544611928, + "learning_rate": 6.643696361300418e-06, + "loss": 0.5503, + "step": 2307 + }, + { + "epoch": 1.65, + "grad_norm": 19.347126680847555, + "learning_rate": 6.6409665696665715e-06, + "loss": 0.4541, + "step": 2308 + }, + { + "epoch": 1.65, + "grad_norm": 14.520385793005548, + "learning_rate": 6.638236229755292e-06, + "loss": 0.5381, + "step": 2309 + }, + { + "epoch": 1.65, + "grad_norm": 7.949529629176509, + "learning_rate": 6.635505342478838e-06, + "loss": 0.4204, + "step": 2310 + }, + { + "epoch": 1.65, + "grad_norm": 10.147732369032411, + "learning_rate": 6.632773908749649e-06, + "loss": 0.4448, + "step": 2311 + }, + { + "epoch": 1.65, + "grad_norm": 9.58683877473919, + "learning_rate": 6.630041929480349e-06, + "loss": 0.436, + "step": 2312 + }, + { + "epoch": 1.65, + "grad_norm": 14.967781744974912, + "learning_rate": 6.627309405583741e-06, + "loss": 0.4839, + "step": 2313 + }, + { + "epoch": 1.65, + "grad_norm": 9.185548630716545, + "learning_rate": 6.624576337972815e-06, + "loss": 0.4331, + "step": 2314 + }, + { + "epoch": 1.65, + "grad_norm": 9.782057188604156, + "learning_rate": 6.621842727560737e-06, + "loss": 0.415, + "step": 2315 + }, + { + "epoch": 1.65, + "grad_norm": 11.673091390131038, + "learning_rate": 6.6191085752608575e-06, + "loss": 0.4946, + "step": 2316 + }, + { + "epoch": 1.65, + "grad_norm": 10.697402257685303, + "learning_rate": 6.616373881986708e-06, + "loss": 0.5723, + "step": 2317 + }, + { + "epoch": 1.65, + "grad_norm": 7.647812348945917, + "learning_rate": 6.613638648652002e-06, + "loss": 0.4097, + "step": 2318 + }, + { + "epoch": 1.66, + "grad_norm": 6.549329240009999, + "learning_rate": 6.610902876170631e-06, + "loss": 0.4482, + "step": 2319 + }, + { + "epoch": 1.66, + "grad_norm": 10.337967711639493, + "learning_rate": 6.608166565456666e-06, + "loss": 0.4434, + "step": 2320 + }, + { + "epoch": 1.66, + "grad_norm": 17.036210608178823, + "learning_rate": 6.605429717424359e-06, + "loss": 0.4116, + "step": 2321 + }, + { + "epoch": 1.66, + "grad_norm": 10.4673537904469, + "learning_rate": 6.602692332988143e-06, + "loss": 0.4302, + "step": 2322 + }, + { + "epoch": 1.66, + "grad_norm": 9.325826026833447, + "learning_rate": 6.5999544130626305e-06, + "loss": 0.4438, + "step": 2323 + }, + { + "epoch": 1.66, + "grad_norm": 12.040149249921235, + "learning_rate": 6.597215958562608e-06, + "loss": 0.46, + "step": 2324 + }, + { + "epoch": 1.66, + "grad_norm": 8.656646325351767, + "learning_rate": 6.5944769704030465e-06, + "loss": 0.4453, + "step": 2325 + }, + { + "epoch": 1.66, + "grad_norm": 11.170279267084911, + "learning_rate": 6.591737449499092e-06, + "loss": 0.4639, + "step": 2326 + }, + { + "epoch": 1.66, + "grad_norm": 9.82839224846332, + "learning_rate": 6.58899739676607e-06, + "loss": 0.4546, + "step": 2327 + }, + { + "epoch": 1.66, + "grad_norm": 9.496186973092405, + "learning_rate": 6.586256813119482e-06, + "loss": 0.4648, + "step": 2328 + }, + { + "epoch": 1.66, + "grad_norm": 8.873342997647923, + "learning_rate": 6.583515699475009e-06, + "loss": 0.4561, + "step": 2329 + }, + { + "epoch": 1.66, + "grad_norm": 10.344690040103952, + "learning_rate": 6.580774056748508e-06, + "loss": 0.4336, + "step": 2330 + }, + { + "epoch": 1.66, + "grad_norm": 11.872733176778468, + "learning_rate": 6.578031885856011e-06, + "loss": 0.4167, + "step": 2331 + }, + { + "epoch": 1.66, + "grad_norm": 8.83642693495639, + "learning_rate": 6.575289187713731e-06, + "loss": 0.3911, + "step": 2332 + }, + { + "epoch": 1.67, + "grad_norm": 17.856781054644795, + "learning_rate": 6.572545963238053e-06, + "loss": 0.4536, + "step": 2333 + }, + { + "epoch": 1.67, + "grad_norm": 11.347855910699995, + "learning_rate": 6.569802213345537e-06, + "loss": 0.5303, + "step": 2334 + }, + { + "epoch": 1.67, + "grad_norm": 11.646592360652438, + "learning_rate": 6.5670579389529255e-06, + "loss": 0.4077, + "step": 2335 + }, + { + "epoch": 1.67, + "grad_norm": 7.438467277076052, + "learning_rate": 6.56431314097713e-06, + "loss": 0.3088, + "step": 2336 + }, + { + "epoch": 1.67, + "grad_norm": 11.023040745678406, + "learning_rate": 6.561567820335236e-06, + "loss": 0.3955, + "step": 2337 + }, + { + "epoch": 1.67, + "grad_norm": 10.033813975390757, + "learning_rate": 6.558821977944508e-06, + "loss": 0.4468, + "step": 2338 + }, + { + "epoch": 1.67, + "grad_norm": 16.110205229735936, + "learning_rate": 6.556075614722383e-06, + "loss": 0.5103, + "step": 2339 + }, + { + "epoch": 1.67, + "grad_norm": 11.807452959562594, + "learning_rate": 6.553328731586473e-06, + "loss": 0.5112, + "step": 2340 + }, + { + "epoch": 1.67, + "grad_norm": 11.161963178049959, + "learning_rate": 6.550581329454561e-06, + "loss": 0.437, + "step": 2341 + }, + { + "epoch": 1.67, + "grad_norm": 8.447457606510927, + "learning_rate": 6.547833409244606e-06, + "loss": 0.3843, + "step": 2342 + }, + { + "epoch": 1.67, + "grad_norm": 9.377025255192994, + "learning_rate": 6.545084971874738e-06, + "loss": 0.4229, + "step": 2343 + }, + { + "epoch": 1.67, + "grad_norm": 7.855103606469367, + "learning_rate": 6.542336018263262e-06, + "loss": 0.3804, + "step": 2344 + }, + { + "epoch": 1.67, + "grad_norm": 27.46634631545949, + "learning_rate": 6.539586549328656e-06, + "loss": 0.7871, + "step": 2345 + }, + { + "epoch": 1.67, + "grad_norm": 11.359851062684688, + "learning_rate": 6.536836565989565e-06, + "loss": 0.3911, + "step": 2346 + }, + { + "epoch": 1.68, + "grad_norm": 11.053055914900025, + "learning_rate": 6.534086069164813e-06, + "loss": 0.4321, + "step": 2347 + }, + { + "epoch": 1.68, + "grad_norm": 14.018450662610983, + "learning_rate": 6.531335059773392e-06, + "loss": 0.4824, + "step": 2348 + }, + { + "epoch": 1.68, + "grad_norm": 11.550512997935256, + "learning_rate": 6.528583538734463e-06, + "loss": 0.5078, + "step": 2349 + }, + { + "epoch": 1.68, + "grad_norm": 13.133602547357903, + "learning_rate": 6.525831506967361e-06, + "loss": 0.4575, + "step": 2350 + }, + { + "epoch": 1.68, + "grad_norm": 8.318668444691664, + "learning_rate": 6.523078965391592e-06, + "loss": 0.4258, + "step": 2351 + }, + { + "epoch": 1.68, + "grad_norm": 9.446059276582055, + "learning_rate": 6.520325914926831e-06, + "loss": 0.4482, + "step": 2352 + }, + { + "epoch": 1.68, + "grad_norm": 16.682194650341806, + "learning_rate": 6.517572356492922e-06, + "loss": 0.4624, + "step": 2353 + }, + { + "epoch": 1.68, + "grad_norm": 20.23938157311803, + "learning_rate": 6.514818291009881e-06, + "loss": 0.498, + "step": 2354 + }, + { + "epoch": 1.68, + "grad_norm": 17.820982474486428, + "learning_rate": 6.512063719397894e-06, + "loss": 0.5381, + "step": 2355 + }, + { + "epoch": 1.68, + "grad_norm": 13.993613065974447, + "learning_rate": 6.5093086425773126e-06, + "loss": 0.5732, + "step": 2356 + }, + { + "epoch": 1.68, + "grad_norm": 8.37289512006142, + "learning_rate": 6.506553061468659e-06, + "loss": 0.4253, + "step": 2357 + }, + { + "epoch": 1.68, + "grad_norm": 11.600077704313506, + "learning_rate": 6.5037969769926256e-06, + "loss": 0.4316, + "step": 2358 + }, + { + "epoch": 1.68, + "grad_norm": 14.425829865085978, + "learning_rate": 6.501040390070071e-06, + "loss": 0.4639, + "step": 2359 + }, + { + "epoch": 1.68, + "grad_norm": 6.9917751899752485, + "learning_rate": 6.498283301622022e-06, + "loss": 0.3745, + "step": 2360 + }, + { + "epoch": 1.69, + "grad_norm": 11.202380861543828, + "learning_rate": 6.495525712569673e-06, + "loss": 0.4907, + "step": 2361 + }, + { + "epoch": 1.69, + "grad_norm": 10.609299484513734, + "learning_rate": 6.492767623834385e-06, + "loss": 0.4478, + "step": 2362 + }, + { + "epoch": 1.69, + "grad_norm": 10.853753914008019, + "learning_rate": 6.490009036337687e-06, + "loss": 0.4463, + "step": 2363 + }, + { + "epoch": 1.69, + "grad_norm": 12.183051932609295, + "learning_rate": 6.487249951001276e-06, + "loss": 0.501, + "step": 2364 + }, + { + "epoch": 1.69, + "grad_norm": 11.744349364358902, + "learning_rate": 6.484490368747012e-06, + "loss": 0.4519, + "step": 2365 + }, + { + "epoch": 1.69, + "grad_norm": 14.03256193597822, + "learning_rate": 6.4817302904969226e-06, + "loss": 0.5122, + "step": 2366 + }, + { + "epoch": 1.69, + "grad_norm": 10.807691344764418, + "learning_rate": 6.4789697171732024e-06, + "loss": 0.5269, + "step": 2367 + }, + { + "epoch": 1.69, + "grad_norm": 7.611847017024495, + "learning_rate": 6.476208649698209e-06, + "loss": 0.4209, + "step": 2368 + }, + { + "epoch": 1.69, + "grad_norm": 13.295897556272315, + "learning_rate": 6.473447088994467e-06, + "loss": 0.3936, + "step": 2369 + }, + { + "epoch": 1.69, + "grad_norm": 14.443824455010702, + "learning_rate": 6.470685035984667e-06, + "loss": 0.4585, + "step": 2370 + }, + { + "epoch": 1.69, + "grad_norm": 9.843529569578008, + "learning_rate": 6.467922491591658e-06, + "loss": 0.3989, + "step": 2371 + }, + { + "epoch": 1.69, + "grad_norm": 7.301042404680986, + "learning_rate": 6.465159456738461e-06, + "loss": 0.4258, + "step": 2372 + }, + { + "epoch": 1.69, + "grad_norm": 9.457468991739518, + "learning_rate": 6.462395932348257e-06, + "loss": 0.437, + "step": 2373 + }, + { + "epoch": 1.69, + "grad_norm": 9.706783737244205, + "learning_rate": 6.459631919344389e-06, + "loss": 0.4785, + "step": 2374 + }, + { + "epoch": 1.7, + "grad_norm": 11.469484126644588, + "learning_rate": 6.456867418650366e-06, + "loss": 0.4321, + "step": 2375 + }, + { + "epoch": 1.7, + "grad_norm": 14.561227461724823, + "learning_rate": 6.454102431189859e-06, + "loss": 0.4399, + "step": 2376 + }, + { + "epoch": 1.7, + "grad_norm": 16.39246937193766, + "learning_rate": 6.4513369578867026e-06, + "loss": 0.521, + "step": 2377 + }, + { + "epoch": 1.7, + "grad_norm": 12.29432435609855, + "learning_rate": 6.448570999664894e-06, + "loss": 0.4775, + "step": 2378 + }, + { + "epoch": 1.7, + "grad_norm": 17.171427132716822, + "learning_rate": 6.4458045574485875e-06, + "loss": 0.5679, + "step": 2379 + }, + { + "epoch": 1.7, + "grad_norm": 23.83379330955623, + "learning_rate": 6.443037632162104e-06, + "loss": 0.5278, + "step": 2380 + }, + { + "epoch": 1.7, + "grad_norm": 14.84513216881372, + "learning_rate": 6.440270224729927e-06, + "loss": 0.5034, + "step": 2381 + }, + { + "epoch": 1.7, + "grad_norm": 11.928504744206203, + "learning_rate": 6.437502336076695e-06, + "loss": 0.5376, + "step": 2382 + }, + { + "epoch": 1.7, + "grad_norm": 8.857986910329018, + "learning_rate": 6.4347339671272155e-06, + "loss": 0.3999, + "step": 2383 + }, + { + "epoch": 1.7, + "grad_norm": 16.81608671529195, + "learning_rate": 6.431965118806449e-06, + "loss": 0.4619, + "step": 2384 + }, + { + "epoch": 1.7, + "grad_norm": 27.17987199179481, + "learning_rate": 6.42919579203952e-06, + "loss": 0.5947, + "step": 2385 + }, + { + "epoch": 1.7, + "grad_norm": 11.90191014571275, + "learning_rate": 6.4264259877517124e-06, + "loss": 0.5737, + "step": 2386 + }, + { + "epoch": 1.7, + "grad_norm": 8.486777872297138, + "learning_rate": 6.423655706868468e-06, + "loss": 0.4072, + "step": 2387 + }, + { + "epoch": 1.7, + "grad_norm": 7.62100655773416, + "learning_rate": 6.4208849503153915e-06, + "loss": 0.4175, + "step": 2388 + }, + { + "epoch": 1.71, + "grad_norm": 12.205713284193536, + "learning_rate": 6.418113719018242e-06, + "loss": 0.541, + "step": 2389 + }, + { + "epoch": 1.71, + "grad_norm": 8.27010808597464, + "learning_rate": 6.415342013902939e-06, + "loss": 0.4458, + "step": 2390 + }, + { + "epoch": 1.71, + "grad_norm": 21.96766320470673, + "learning_rate": 6.412569835895562e-06, + "loss": 0.4741, + "step": 2391 + }, + { + "epoch": 1.71, + "grad_norm": 7.6282420344549635, + "learning_rate": 6.409797185922349e-06, + "loss": 0.4624, + "step": 2392 + }, + { + "epoch": 1.71, + "grad_norm": 16.407715638452274, + "learning_rate": 6.40702406490969e-06, + "loss": 0.438, + "step": 2393 + }, + { + "epoch": 1.71, + "grad_norm": 6.853759038300608, + "learning_rate": 6.404250473784138e-06, + "loss": 0.4116, + "step": 2394 + }, + { + "epoch": 1.71, + "grad_norm": 10.122502455763344, + "learning_rate": 6.401476413472404e-06, + "loss": 0.4565, + "step": 2395 + }, + { + "epoch": 1.71, + "grad_norm": 7.738275465556727, + "learning_rate": 6.398701884901348e-06, + "loss": 0.4673, + "step": 2396 + }, + { + "epoch": 1.71, + "grad_norm": 7.403912357064693, + "learning_rate": 6.3959268889979956e-06, + "loss": 0.4712, + "step": 2397 + }, + { + "epoch": 1.71, + "grad_norm": 7.246073860215684, + "learning_rate": 6.393151426689522e-06, + "loss": 0.4727, + "step": 2398 + }, + { + "epoch": 1.71, + "grad_norm": 12.726839635078996, + "learning_rate": 6.390375498903263e-06, + "loss": 0.5, + "step": 2399 + }, + { + "epoch": 1.71, + "grad_norm": 9.077518387969189, + "learning_rate": 6.387599106566705e-06, + "loss": 0.3665, + "step": 2400 + }, + { + "epoch": 1.71, + "grad_norm": 13.115234070495553, + "learning_rate": 6.384822250607495e-06, + "loss": 0.5576, + "step": 2401 + }, + { + "epoch": 1.71, + "grad_norm": 10.158381190711447, + "learning_rate": 6.382044931953431e-06, + "loss": 0.4087, + "step": 2402 + }, + { + "epoch": 1.72, + "grad_norm": 9.67290912672018, + "learning_rate": 6.379267151532467e-06, + "loss": 0.543, + "step": 2403 + }, + { + "epoch": 1.72, + "grad_norm": 7.817381759789311, + "learning_rate": 6.376488910272709e-06, + "loss": 0.4165, + "step": 2404 + }, + { + "epoch": 1.72, + "grad_norm": 10.705521476960307, + "learning_rate": 6.373710209102423e-06, + "loss": 0.4487, + "step": 2405 + }, + { + "epoch": 1.72, + "grad_norm": 19.270499479688624, + "learning_rate": 6.370931048950022e-06, + "loss": 0.4756, + "step": 2406 + }, + { + "epoch": 1.72, + "grad_norm": 15.738719945653308, + "learning_rate": 6.368151430744075e-06, + "loss": 0.4893, + "step": 2407 + }, + { + "epoch": 1.72, + "grad_norm": 10.66405350297458, + "learning_rate": 6.365371355413306e-06, + "loss": 0.4688, + "step": 2408 + }, + { + "epoch": 1.72, + "grad_norm": 9.542128682731759, + "learning_rate": 6.362590823886588e-06, + "loss": 0.4131, + "step": 2409 + }, + { + "epoch": 1.72, + "grad_norm": 12.231576688229438, + "learning_rate": 6.359809837092947e-06, + "loss": 0.4248, + "step": 2410 + }, + { + "epoch": 1.72, + "grad_norm": 13.471150104035912, + "learning_rate": 6.357028395961566e-06, + "loss": 0.4961, + "step": 2411 + }, + { + "epoch": 1.72, + "grad_norm": 12.518938039693966, + "learning_rate": 6.354246501421777e-06, + "loss": 0.5054, + "step": 2412 + }, + { + "epoch": 1.72, + "grad_norm": 17.325117897931076, + "learning_rate": 6.3514641544030575e-06, + "loss": 0.5117, + "step": 2413 + }, + { + "epoch": 1.72, + "grad_norm": 11.695654010917847, + "learning_rate": 6.348681355835043e-06, + "loss": 0.4731, + "step": 2414 + }, + { + "epoch": 1.72, + "grad_norm": 10.353958500414972, + "learning_rate": 6.345898106647521e-06, + "loss": 0.4497, + "step": 2415 + }, + { + "epoch": 1.72, + "grad_norm": 9.125134539120891, + "learning_rate": 6.3431144077704245e-06, + "loss": 0.5361, + "step": 2416 + }, + { + "epoch": 1.73, + "grad_norm": 12.29982044993289, + "learning_rate": 6.340330260133839e-06, + "loss": 0.5303, + "step": 2417 + }, + { + "epoch": 1.73, + "grad_norm": 11.635927861507614, + "learning_rate": 6.337545664668001e-06, + "loss": 0.4492, + "step": 2418 + }, + { + "epoch": 1.73, + "grad_norm": 8.450720986263002, + "learning_rate": 6.334760622303294e-06, + "loss": 0.4526, + "step": 2419 + }, + { + "epoch": 1.73, + "grad_norm": 9.96898407515457, + "learning_rate": 6.331975133970255e-06, + "loss": 0.374, + "step": 2420 + }, + { + "epoch": 1.73, + "grad_norm": 22.99713545008352, + "learning_rate": 6.329189200599566e-06, + "loss": 0.5244, + "step": 2421 + }, + { + "epoch": 1.73, + "grad_norm": 7.164058078222877, + "learning_rate": 6.326402823122059e-06, + "loss": 0.3335, + "step": 2422 + }, + { + "epoch": 1.73, + "grad_norm": 8.821666072907615, + "learning_rate": 6.3236160024687134e-06, + "loss": 0.4614, + "step": 2423 + }, + { + "epoch": 1.73, + "grad_norm": 8.86789061992421, + "learning_rate": 6.3208287395706595e-06, + "loss": 0.4541, + "step": 2424 + }, + { + "epoch": 1.73, + "grad_norm": 14.767069143692144, + "learning_rate": 6.3180410353591735e-06, + "loss": 0.4414, + "step": 2425 + }, + { + "epoch": 1.73, + "grad_norm": 12.27317813903384, + "learning_rate": 6.315252890765678e-06, + "loss": 0.502, + "step": 2426 + }, + { + "epoch": 1.73, + "grad_norm": 7.0533891840444864, + "learning_rate": 6.312464306721745e-06, + "loss": 0.4478, + "step": 2427 + }, + { + "epoch": 1.73, + "grad_norm": 8.284405823285676, + "learning_rate": 6.309675284159093e-06, + "loss": 0.4644, + "step": 2428 + }, + { + "epoch": 1.73, + "grad_norm": 10.273614279657444, + "learning_rate": 6.306885824009585e-06, + "loss": 0.3833, + "step": 2429 + }, + { + "epoch": 1.73, + "grad_norm": 8.094247190767463, + "learning_rate": 6.3040959272052315e-06, + "loss": 0.3955, + "step": 2430 + }, + { + "epoch": 1.74, + "grad_norm": 8.05210332824741, + "learning_rate": 6.301305594678189e-06, + "loss": 0.4409, + "step": 2431 + }, + { + "epoch": 1.74, + "grad_norm": 7.070593419557918, + "learning_rate": 6.2985148273607586e-06, + "loss": 0.394, + "step": 2432 + }, + { + "epoch": 1.74, + "grad_norm": 7.162689408282903, + "learning_rate": 6.29572362618539e-06, + "loss": 0.3916, + "step": 2433 + }, + { + "epoch": 1.74, + "grad_norm": 18.341208257731296, + "learning_rate": 6.292931992084672e-06, + "loss": 0.6147, + "step": 2434 + }, + { + "epoch": 1.74, + "grad_norm": 12.475007541185304, + "learning_rate": 6.290139925991345e-06, + "loss": 0.5015, + "step": 2435 + }, + { + "epoch": 1.74, + "grad_norm": 11.472537023920362, + "learning_rate": 6.287347428838289e-06, + "loss": 0.3416, + "step": 2436 + }, + { + "epoch": 1.74, + "grad_norm": 12.81968592719629, + "learning_rate": 6.2845545015585275e-06, + "loss": 0.5249, + "step": 2437 + }, + { + "epoch": 1.74, + "grad_norm": 11.127563834698211, + "learning_rate": 6.281761145085232e-06, + "loss": 0.4546, + "step": 2438 + }, + { + "epoch": 1.74, + "grad_norm": 6.2131478626402, + "learning_rate": 6.278967360351712e-06, + "loss": 0.291, + "step": 2439 + }, + { + "epoch": 1.74, + "grad_norm": 13.014830861323171, + "learning_rate": 6.276173148291425e-06, + "loss": 0.4507, + "step": 2440 + }, + { + "epoch": 1.74, + "grad_norm": 8.370157635005995, + "learning_rate": 6.273378509837969e-06, + "loss": 0.3735, + "step": 2441 + }, + { + "epoch": 1.74, + "grad_norm": 19.4882237053066, + "learning_rate": 6.2705834459250825e-06, + "loss": 0.4019, + "step": 2442 + }, + { + "epoch": 1.74, + "grad_norm": 10.073520612528323, + "learning_rate": 6.2677879574866515e-06, + "loss": 0.4048, + "step": 2443 + }, + { + "epoch": 1.74, + "grad_norm": 10.132342613326061, + "learning_rate": 6.264992045456699e-06, + "loss": 0.4619, + "step": 2444 + }, + { + "epoch": 1.75, + "grad_norm": 11.160719007717073, + "learning_rate": 6.262195710769391e-06, + "loss": 0.3857, + "step": 2445 + }, + { + "epoch": 1.75, + "grad_norm": 19.472439966811617, + "learning_rate": 6.259398954359037e-06, + "loss": 0.4429, + "step": 2446 + }, + { + "epoch": 1.75, + "grad_norm": 15.427769712154046, + "learning_rate": 6.256601777160082e-06, + "loss": 0.6123, + "step": 2447 + }, + { + "epoch": 1.75, + "grad_norm": 13.639713264184406, + "learning_rate": 6.253804180107116e-06, + "loss": 0.4785, + "step": 2448 + }, + { + "epoch": 1.75, + "grad_norm": 19.158135673228788, + "learning_rate": 6.2510061641348695e-06, + "loss": 0.5244, + "step": 2449 + }, + { + "epoch": 1.75, + "grad_norm": 10.730810111574574, + "learning_rate": 6.248207730178211e-06, + "loss": 0.4771, + "step": 2450 + }, + { + "epoch": 1.75, + "grad_norm": 8.479475483349935, + "learning_rate": 6.245408879172148e-06, + "loss": 0.4209, + "step": 2451 + }, + { + "epoch": 1.75, + "grad_norm": 12.545307163598485, + "learning_rate": 6.24260961205183e-06, + "loss": 0.4536, + "step": 2452 + }, + { + "epoch": 1.75, + "grad_norm": 7.433203777453386, + "learning_rate": 6.239809929752544e-06, + "loss": 0.3677, + "step": 2453 + }, + { + "epoch": 1.75, + "grad_norm": 10.724851412359518, + "learning_rate": 6.237009833209715e-06, + "loss": 0.4668, + "step": 2454 + }, + { + "epoch": 1.75, + "grad_norm": 24.535444127877504, + "learning_rate": 6.2342093233589095e-06, + "loss": 0.6094, + "step": 2455 + }, + { + "epoch": 1.75, + "grad_norm": 10.131949690353993, + "learning_rate": 6.231408401135828e-06, + "loss": 0.4727, + "step": 2456 + }, + { + "epoch": 1.75, + "grad_norm": 12.95385057418741, + "learning_rate": 6.228607067476311e-06, + "loss": 0.425, + "step": 2457 + }, + { + "epoch": 1.75, + "grad_norm": 11.652779199575274, + "learning_rate": 6.225805323316336e-06, + "loss": 0.5317, + "step": 2458 + }, + { + "epoch": 1.76, + "grad_norm": 13.756602418041059, + "learning_rate": 6.223003169592018e-06, + "loss": 0.4741, + "step": 2459 + }, + { + "epoch": 1.76, + "grad_norm": 13.349749974546135, + "learning_rate": 6.220200607239609e-06, + "loss": 0.4736, + "step": 2460 + }, + { + "epoch": 1.76, + "grad_norm": 9.203291816830776, + "learning_rate": 6.217397637195497e-06, + "loss": 0.4951, + "step": 2461 + }, + { + "epoch": 1.76, + "grad_norm": 10.149381480669541, + "learning_rate": 6.214594260396206e-06, + "loss": 0.4688, + "step": 2462 + }, + { + "epoch": 1.76, + "grad_norm": 14.733718301343428, + "learning_rate": 6.211790477778399e-06, + "loss": 0.4492, + "step": 2463 + }, + { + "epoch": 1.76, + "grad_norm": 15.776778166448288, + "learning_rate": 6.208986290278866e-06, + "loss": 0.3716, + "step": 2464 + }, + { + "epoch": 1.76, + "grad_norm": 9.981056473195446, + "learning_rate": 6.206181698834544e-06, + "loss": 0.5264, + "step": 2465 + }, + { + "epoch": 1.76, + "grad_norm": 12.177267724597133, + "learning_rate": 6.2033767043824955e-06, + "loss": 0.4902, + "step": 2466 + }, + { + "epoch": 1.76, + "grad_norm": 9.497702259870394, + "learning_rate": 6.200571307859923e-06, + "loss": 0.4185, + "step": 2467 + }, + { + "epoch": 1.76, + "grad_norm": 19.49048950885061, + "learning_rate": 6.197765510204161e-06, + "loss": 0.5205, + "step": 2468 + }, + { + "epoch": 1.76, + "grad_norm": 8.508146338639264, + "learning_rate": 6.19495931235268e-06, + "loss": 0.4531, + "step": 2469 + }, + { + "epoch": 1.76, + "grad_norm": 14.260708359404394, + "learning_rate": 6.19215271524308e-06, + "loss": 0.4541, + "step": 2470 + }, + { + "epoch": 1.76, + "grad_norm": 13.1175023636757, + "learning_rate": 6.189345719813099e-06, + "loss": 0.4873, + "step": 2471 + }, + { + "epoch": 1.76, + "grad_norm": 11.651326204773177, + "learning_rate": 6.186538327000609e-06, + "loss": 0.4849, + "step": 2472 + }, + { + "epoch": 1.77, + "grad_norm": 9.573726463990967, + "learning_rate": 6.183730537743607e-06, + "loss": 0.4097, + "step": 2473 + }, + { + "epoch": 1.77, + "grad_norm": 16.017710357353614, + "learning_rate": 6.18092235298023e-06, + "loss": 0.4702, + "step": 2474 + }, + { + "epoch": 1.77, + "grad_norm": 9.84022010023255, + "learning_rate": 6.178113773648745e-06, + "loss": 0.3994, + "step": 2475 + }, + { + "epoch": 1.77, + "grad_norm": 9.022407957306385, + "learning_rate": 6.175304800687551e-06, + "loss": 0.4067, + "step": 2476 + }, + { + "epoch": 1.77, + "grad_norm": 11.250357096874497, + "learning_rate": 6.172495435035176e-06, + "loss": 0.4756, + "step": 2477 + }, + { + "epoch": 1.77, + "grad_norm": 8.921552289632256, + "learning_rate": 6.169685677630284e-06, + "loss": 0.4336, + "step": 2478 + }, + { + "epoch": 1.77, + "grad_norm": 7.198045788857288, + "learning_rate": 6.1668755294116655e-06, + "loss": 0.3325, + "step": 2479 + }, + { + "epoch": 1.77, + "grad_norm": 14.228855338016636, + "learning_rate": 6.1640649913182436e-06, + "loss": 0.4209, + "step": 2480 + }, + { + "epoch": 1.77, + "grad_norm": 8.828383315352083, + "learning_rate": 6.161254064289072e-06, + "loss": 0.4023, + "step": 2481 + }, + { + "epoch": 1.77, + "grad_norm": 11.423877993770597, + "learning_rate": 6.158442749263332e-06, + "loss": 0.4683, + "step": 2482 + }, + { + "epoch": 1.77, + "grad_norm": 14.780261625052676, + "learning_rate": 6.155631047180337e-06, + "loss": 0.4595, + "step": 2483 + }, + { + "epoch": 1.77, + "grad_norm": 11.04469853406095, + "learning_rate": 6.152818958979529e-06, + "loss": 0.5542, + "step": 2484 + }, + { + "epoch": 1.77, + "grad_norm": 15.285094723692712, + "learning_rate": 6.1500064856004796e-06, + "loss": 0.4995, + "step": 2485 + }, + { + "epoch": 1.77, + "grad_norm": 8.325731810799802, + "learning_rate": 6.147193627982887e-06, + "loss": 0.3689, + "step": 2486 + }, + { + "epoch": 1.78, + "grad_norm": 12.60771444277801, + "learning_rate": 6.144380387066581e-06, + "loss": 0.4771, + "step": 2487 + }, + { + "epoch": 1.78, + "grad_norm": 8.951542508375255, + "learning_rate": 6.141566763791518e-06, + "loss": 0.4243, + "step": 2488 + }, + { + "epoch": 1.78, + "grad_norm": 10.53628405098184, + "learning_rate": 6.138752759097778e-06, + "loss": 0.4272, + "step": 2489 + }, + { + "epoch": 1.78, + "grad_norm": 10.97888607653731, + "learning_rate": 6.135938373925576e-06, + "loss": 0.4653, + "step": 2490 + }, + { + "epoch": 1.78, + "grad_norm": 10.48936310841184, + "learning_rate": 6.133123609215249e-06, + "loss": 0.4019, + "step": 2491 + }, + { + "epoch": 1.78, + "grad_norm": 10.096587824152824, + "learning_rate": 6.130308465907263e-06, + "loss": 0.396, + "step": 2492 + }, + { + "epoch": 1.78, + "grad_norm": 9.227096202950863, + "learning_rate": 6.127492944942209e-06, + "loss": 0.4258, + "step": 2493 + }, + { + "epoch": 1.78, + "grad_norm": 12.942411840921341, + "learning_rate": 6.124677047260805e-06, + "loss": 0.3965, + "step": 2494 + }, + { + "epoch": 1.78, + "grad_norm": 9.00521904473558, + "learning_rate": 6.121860773803895e-06, + "loss": 0.4102, + "step": 2495 + }, + { + "epoch": 1.78, + "grad_norm": 9.245096943439378, + "learning_rate": 6.119044125512447e-06, + "loss": 0.3967, + "step": 2496 + }, + { + "epoch": 1.78, + "grad_norm": 9.70014640452883, + "learning_rate": 6.116227103327559e-06, + "loss": 0.4951, + "step": 2497 + }, + { + "epoch": 1.78, + "grad_norm": 15.83092193293532, + "learning_rate": 6.113409708190447e-06, + "loss": 0.5278, + "step": 2498 + }, + { + "epoch": 1.78, + "grad_norm": 18.447966368255546, + "learning_rate": 6.1105919410424566e-06, + "loss": 0.5225, + "step": 2499 + }, + { + "epoch": 1.78, + "grad_norm": 16.135937978938692, + "learning_rate": 6.107773802825055e-06, + "loss": 0.52, + "step": 2500 + }, + { + "epoch": 1.78, + "eval_avg_AUC": 0.7652931989776042, + "eval_avg_Accuracy": 0.6897380636604774, + "eval_avg_Accuracy-right": 0.8790922133820269, + "eval_avg_Accuracy-wrong": 0.3595633386399818, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6470409856535183, + "eval_last_AUC": 0.7821256327791941, + "eval_last_Accuracy": 0.7235162466843501, + "eval_last_Accuracy-right": 0.8033128994391548, + "eval_last_Accuracy-wrong": 0.584375710711849, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6545993525396961, + "eval_max_AUC": 0.7132634593306635, + "eval_max_Accuracy": 0.6386770557029178, + "eval_max_Accuracy-right": 0.9663492891613408, + "eval_max_Accuracy-wrong": 0.06731862633613828, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.5960403684660525, + "eval_min_AUC": 0.7740940679255949, + "eval_min_Accuracy": 0.7144396551724138, + "eval_min_Accuracy-right": 0.7332724664145037, + "eval_min_Accuracy-wrong": 0.6816010916534, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6455547810166508, + "eval_prod_AUC": 0.774109644816048, + "eval_prod_Accuracy": 0.6302635941644562, + "eval_prod_Accuracy-right": 0.4789356984478936, + "eval_prod_Accuracy-wrong": 0.8941323629747555, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6440620351725057, + "eval_runtime": 247.2806, + "eval_samples_per_second": 97.573, + "eval_steps_per_second": 3.049, + "eval_sum_AUC": 0.6276356610336933, + "eval_sum_Accuracy": 0.6374336870026526, + "eval_sum_Accuracy-right": 0.9956306247554454, + "eval_sum_Accuracy-wrong": 0.01284967022970207, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6313597803483167, + "step": 2500 + }, + { + "epoch": 1.79, + "grad_norm": 10.414355809239556, + "learning_rate": 6.1049552944798355e-06, + "loss": 0.4023, + "step": 2501 + }, + { + "epoch": 1.79, + "grad_norm": 9.539305998342051, + "learning_rate": 6.102136416948513e-06, + "loss": 0.4678, + "step": 2502 + }, + { + "epoch": 1.79, + "grad_norm": 8.768527333824066, + "learning_rate": 6.099317171172929e-06, + "loss": 0.4272, + "step": 2503 + }, + { + "epoch": 1.79, + "grad_norm": 11.846773419479307, + "learning_rate": 6.0964975580950445e-06, + "loss": 0.4639, + "step": 2504 + }, + { + "epoch": 1.79, + "grad_norm": 11.851344453902673, + "learning_rate": 6.093677578656946e-06, + "loss": 0.5596, + "step": 2505 + }, + { + "epoch": 1.79, + "grad_norm": 17.964802832185107, + "learning_rate": 6.090857233800839e-06, + "loss": 0.4653, + "step": 2506 + }, + { + "epoch": 1.79, + "grad_norm": 10.580109191519524, + "learning_rate": 6.0880365244690546e-06, + "loss": 0.4565, + "step": 2507 + }, + { + "epoch": 1.79, + "grad_norm": 11.766966924313547, + "learning_rate": 6.085215451604044e-06, + "loss": 0.5005, + "step": 2508 + }, + { + "epoch": 1.79, + "grad_norm": 8.585939267007543, + "learning_rate": 6.082394016148379e-06, + "loss": 0.4629, + "step": 2509 + }, + { + "epoch": 1.79, + "grad_norm": 10.256079889521589, + "learning_rate": 6.079572219044755e-06, + "loss": 0.4443, + "step": 2510 + }, + { + "epoch": 1.79, + "grad_norm": 8.962206455993298, + "learning_rate": 6.076750061235985e-06, + "loss": 0.4058, + "step": 2511 + }, + { + "epoch": 1.79, + "grad_norm": 11.047853996411353, + "learning_rate": 6.073927543665008e-06, + "loss": 0.519, + "step": 2512 + }, + { + "epoch": 1.79, + "grad_norm": 12.508769620004047, + "learning_rate": 6.071104667274875e-06, + "loss": 0.5142, + "step": 2513 + }, + { + "epoch": 1.79, + "grad_norm": 18.47411144644649, + "learning_rate": 6.068281433008765e-06, + "loss": 0.5996, + "step": 2514 + }, + { + "epoch": 1.8, + "grad_norm": 9.586741408151706, + "learning_rate": 6.0654578418099715e-06, + "loss": 0.5146, + "step": 2515 + }, + { + "epoch": 1.8, + "grad_norm": 7.649138506555311, + "learning_rate": 6.062633894621909e-06, + "loss": 0.4038, + "step": 2516 + }, + { + "epoch": 1.8, + "grad_norm": 17.188289282459092, + "learning_rate": 6.0598095923881105e-06, + "loss": 0.5435, + "step": 2517 + }, + { + "epoch": 1.8, + "grad_norm": 11.818692335136738, + "learning_rate": 6.056984936052229e-06, + "loss": 0.4629, + "step": 2518 + }, + { + "epoch": 1.8, + "grad_norm": 13.234001156083075, + "learning_rate": 6.054159926558033e-06, + "loss": 0.5342, + "step": 2519 + }, + { + "epoch": 1.8, + "grad_norm": 10.075130769946417, + "learning_rate": 6.051334564849413e-06, + "loss": 0.4712, + "step": 2520 + }, + { + "epoch": 1.8, + "grad_norm": 8.956699388387689, + "learning_rate": 6.048508851870372e-06, + "loss": 0.4111, + "step": 2521 + }, + { + "epoch": 1.8, + "grad_norm": 12.649995024246259, + "learning_rate": 6.045682788565036e-06, + "loss": 0.3521, + "step": 2522 + }, + { + "epoch": 1.8, + "grad_norm": 14.193089965908905, + "learning_rate": 6.042856375877644e-06, + "loss": 0.5518, + "step": 2523 + }, + { + "epoch": 1.8, + "grad_norm": 9.405344544727377, + "learning_rate": 6.040029614752551e-06, + "loss": 0.4873, + "step": 2524 + }, + { + "epoch": 1.8, + "grad_norm": 17.872507537818283, + "learning_rate": 6.037202506134234e-06, + "loss": 0.502, + "step": 2525 + }, + { + "epoch": 1.8, + "grad_norm": 20.056217550868038, + "learning_rate": 6.03437505096728e-06, + "loss": 0.457, + "step": 2526 + }, + { + "epoch": 1.8, + "grad_norm": 8.745479176249967, + "learning_rate": 6.0315472501963955e-06, + "loss": 0.4609, + "step": 2527 + }, + { + "epoch": 1.8, + "grad_norm": 9.223610098312774, + "learning_rate": 6.028719104766402e-06, + "loss": 0.4082, + "step": 2528 + }, + { + "epoch": 1.81, + "grad_norm": 8.50042655333063, + "learning_rate": 6.025890615622233e-06, + "loss": 0.5039, + "step": 2529 + }, + { + "epoch": 1.81, + "grad_norm": 7.476170718540832, + "learning_rate": 6.023061783708941e-06, + "loss": 0.4048, + "step": 2530 + }, + { + "epoch": 1.81, + "grad_norm": 11.520249627385631, + "learning_rate": 6.020232609971694e-06, + "loss": 0.439, + "step": 2531 + }, + { + "epoch": 1.81, + "grad_norm": 18.589519698746084, + "learning_rate": 6.017403095355766e-06, + "loss": 0.5166, + "step": 2532 + }, + { + "epoch": 1.81, + "grad_norm": 10.950886743696067, + "learning_rate": 6.014573240806553e-06, + "loss": 0.4604, + "step": 2533 + }, + { + "epoch": 1.81, + "grad_norm": 9.551260453439275, + "learning_rate": 6.011743047269563e-06, + "loss": 0.4204, + "step": 2534 + }, + { + "epoch": 1.81, + "grad_norm": 19.017128679094064, + "learning_rate": 6.008912515690415e-06, + "loss": 0.4873, + "step": 2535 + }, + { + "epoch": 1.81, + "grad_norm": 8.121011247353195, + "learning_rate": 6.006081647014842e-06, + "loss": 0.4297, + "step": 2536 + }, + { + "epoch": 1.81, + "grad_norm": 28.47281369820584, + "learning_rate": 6.00325044218869e-06, + "loss": 0.5586, + "step": 2537 + }, + { + "epoch": 1.81, + "grad_norm": 9.714417153244161, + "learning_rate": 6.000418902157919e-06, + "loss": 0.5317, + "step": 2538 + }, + { + "epoch": 1.81, + "grad_norm": 7.720034449703754, + "learning_rate": 5.997587027868598e-06, + "loss": 0.4829, + "step": 2539 + }, + { + "epoch": 1.81, + "grad_norm": 6.914782644486152, + "learning_rate": 5.994754820266908e-06, + "loss": 0.3906, + "step": 2540 + }, + { + "epoch": 1.81, + "grad_norm": 10.814365270699001, + "learning_rate": 5.991922280299143e-06, + "loss": 0.3979, + "step": 2541 + }, + { + "epoch": 1.81, + "grad_norm": 16.20707607734891, + "learning_rate": 5.989089408911706e-06, + "loss": 0.4653, + "step": 2542 + }, + { + "epoch": 1.82, + "grad_norm": 10.049185460575597, + "learning_rate": 5.986256207051113e-06, + "loss": 0.48, + "step": 2543 + }, + { + "epoch": 1.82, + "grad_norm": 13.672521854401475, + "learning_rate": 5.98342267566399e-06, + "loss": 0.5356, + "step": 2544 + }, + { + "epoch": 1.82, + "grad_norm": 12.541254903832398, + "learning_rate": 5.9805888156970714e-06, + "loss": 0.4609, + "step": 2545 + }, + { + "epoch": 1.82, + "grad_norm": 10.284454916773962, + "learning_rate": 5.977754628097203e-06, + "loss": 0.4688, + "step": 2546 + }, + { + "epoch": 1.82, + "grad_norm": 8.23610205729337, + "learning_rate": 5.97492011381134e-06, + "loss": 0.3687, + "step": 2547 + }, + { + "epoch": 1.82, + "grad_norm": 7.11706014179341, + "learning_rate": 5.972085273786547e-06, + "loss": 0.4453, + "step": 2548 + }, + { + "epoch": 1.82, + "grad_norm": 8.756513019937023, + "learning_rate": 5.969250108969995e-06, + "loss": 0.4448, + "step": 2549 + }, + { + "epoch": 1.82, + "grad_norm": 7.098290335839529, + "learning_rate": 5.966414620308965e-06, + "loss": 0.4639, + "step": 2550 + }, + { + "epoch": 1.82, + "grad_norm": 9.29559974589973, + "learning_rate": 5.9635788087508474e-06, + "loss": 0.438, + "step": 2551 + }, + { + "epoch": 1.82, + "grad_norm": 8.829092992423524, + "learning_rate": 5.960742675243139e-06, + "loss": 0.3999, + "step": 2552 + }, + { + "epoch": 1.82, + "grad_norm": 8.468196516461774, + "learning_rate": 5.957906220733447e-06, + "loss": 0.416, + "step": 2553 + }, + { + "epoch": 1.82, + "grad_norm": 6.859111236122798, + "learning_rate": 5.9550694461694806e-06, + "loss": 0.4062, + "step": 2554 + }, + { + "epoch": 1.82, + "grad_norm": 8.371396705917446, + "learning_rate": 5.95223235249906e-06, + "loss": 0.4697, + "step": 2555 + }, + { + "epoch": 1.82, + "grad_norm": 9.74581839772436, + "learning_rate": 5.949394940670112e-06, + "loss": 0.4634, + "step": 2556 + }, + { + "epoch": 1.83, + "grad_norm": 10.11191421574571, + "learning_rate": 5.946557211630667e-06, + "loss": 0.5122, + "step": 2557 + }, + { + "epoch": 1.83, + "grad_norm": 8.41215503415198, + "learning_rate": 5.943719166328864e-06, + "loss": 0.4316, + "step": 2558 + }, + { + "epoch": 1.83, + "grad_norm": 8.922238946122983, + "learning_rate": 5.940880805712945e-06, + "loss": 0.3711, + "step": 2559 + }, + { + "epoch": 1.83, + "grad_norm": 11.556285820468634, + "learning_rate": 5.938042130731262e-06, + "loss": 0.4712, + "step": 2560 + }, + { + "epoch": 1.83, + "grad_norm": 21.602935307175088, + "learning_rate": 5.935203142332267e-06, + "loss": 0.5796, + "step": 2561 + }, + { + "epoch": 1.83, + "grad_norm": 10.172490980473096, + "learning_rate": 5.932363841464519e-06, + "loss": 0.3892, + "step": 2562 + }, + { + "epoch": 1.83, + "grad_norm": 12.979141458728968, + "learning_rate": 5.9295242290766805e-06, + "loss": 0.4556, + "step": 2563 + }, + { + "epoch": 1.83, + "grad_norm": 10.830863705403445, + "learning_rate": 5.9266843061175216e-06, + "loss": 0.4551, + "step": 2564 + }, + { + "epoch": 1.83, + "grad_norm": 23.815492103215934, + "learning_rate": 5.92384407353591e-06, + "loss": 0.5635, + "step": 2565 + }, + { + "epoch": 1.83, + "grad_norm": 13.015347774380794, + "learning_rate": 5.921003532280822e-06, + "loss": 0.4316, + "step": 2566 + }, + { + "epoch": 1.83, + "grad_norm": 18.625830475732585, + "learning_rate": 5.918162683301336e-06, + "loss": 0.5039, + "step": 2567 + }, + { + "epoch": 1.83, + "grad_norm": 11.919505793902855, + "learning_rate": 5.91532152754663e-06, + "loss": 0.5859, + "step": 2568 + }, + { + "epoch": 1.83, + "grad_norm": 7.891844235140119, + "learning_rate": 5.91248006596599e-06, + "loss": 0.4126, + "step": 2569 + }, + { + "epoch": 1.83, + "grad_norm": 15.943605729070335, + "learning_rate": 5.909638299508798e-06, + "loss": 0.4131, + "step": 2570 + }, + { + "epoch": 1.84, + "grad_norm": 14.256434625761717, + "learning_rate": 5.906796229124543e-06, + "loss": 0.4639, + "step": 2571 + }, + { + "epoch": 1.84, + "grad_norm": 20.457753839967175, + "learning_rate": 5.903953855762812e-06, + "loss": 0.4829, + "step": 2572 + }, + { + "epoch": 1.84, + "grad_norm": 20.1694335296129, + "learning_rate": 5.901111180373298e-06, + "loss": 0.5698, + "step": 2573 + }, + { + "epoch": 1.84, + "grad_norm": 10.654685001614778, + "learning_rate": 5.898268203905788e-06, + "loss": 0.4927, + "step": 2574 + }, + { + "epoch": 1.84, + "grad_norm": 12.94449663637914, + "learning_rate": 5.895424927310174e-06, + "loss": 0.478, + "step": 2575 + }, + { + "epoch": 1.84, + "grad_norm": 7.653556967960311, + "learning_rate": 5.89258135153645e-06, + "loss": 0.4351, + "step": 2576 + }, + { + "epoch": 1.84, + "grad_norm": 8.724275472225052, + "learning_rate": 5.889737477534704e-06, + "loss": 0.3901, + "step": 2577 + }, + { + "epoch": 1.84, + "grad_norm": 10.683542432644515, + "learning_rate": 5.886893306255129e-06, + "loss": 0.4609, + "step": 2578 + }, + { + "epoch": 1.84, + "grad_norm": 8.145435964386623, + "learning_rate": 5.884048838648017e-06, + "loss": 0.5005, + "step": 2579 + }, + { + "epoch": 1.84, + "grad_norm": 8.871255725801072, + "learning_rate": 5.881204075663755e-06, + "loss": 0.4761, + "step": 2580 + }, + { + "epoch": 1.84, + "grad_norm": 24.214634872213015, + "learning_rate": 5.878359018252831e-06, + "loss": 0.6128, + "step": 2581 + }, + { + "epoch": 1.84, + "grad_norm": 9.406317683172762, + "learning_rate": 5.8755136673658365e-06, + "loss": 0.4609, + "step": 2582 + }, + { + "epoch": 1.84, + "grad_norm": 15.554115156101938, + "learning_rate": 5.872668023953449e-06, + "loss": 0.5054, + "step": 2583 + }, + { + "epoch": 1.84, + "grad_norm": 11.952532962810603, + "learning_rate": 5.869822088966455e-06, + "loss": 0.4531, + "step": 2584 + }, + { + "epoch": 1.85, + "grad_norm": 8.834444713741934, + "learning_rate": 5.866975863355734e-06, + "loss": 0.4854, + "step": 2585 + }, + { + "epoch": 1.85, + "grad_norm": 8.54691819321447, + "learning_rate": 5.864129348072261e-06, + "loss": 0.5293, + "step": 2586 + }, + { + "epoch": 1.85, + "grad_norm": 12.582713115393668, + "learning_rate": 5.861282544067112e-06, + "loss": 0.4829, + "step": 2587 + }, + { + "epoch": 1.85, + "grad_norm": 14.239317398524113, + "learning_rate": 5.8584354522914555e-06, + "loss": 0.5029, + "step": 2588 + }, + { + "epoch": 1.85, + "grad_norm": 7.723468978865414, + "learning_rate": 5.855588073696559e-06, + "loss": 0.4668, + "step": 2589 + }, + { + "epoch": 1.85, + "grad_norm": 7.930051826297984, + "learning_rate": 5.852740409233785e-06, + "loss": 0.4092, + "step": 2590 + }, + { + "epoch": 1.85, + "grad_norm": 9.019117275735518, + "learning_rate": 5.849892459854588e-06, + "loss": 0.3613, + "step": 2591 + }, + { + "epoch": 1.85, + "grad_norm": 12.473807935463805, + "learning_rate": 5.847044226510524e-06, + "loss": 0.4814, + "step": 2592 + }, + { + "epoch": 1.85, + "grad_norm": 10.961602975113415, + "learning_rate": 5.84419571015324e-06, + "loss": 0.4736, + "step": 2593 + }, + { + "epoch": 1.85, + "grad_norm": 8.145353957021218, + "learning_rate": 5.8413469117344766e-06, + "loss": 0.4971, + "step": 2594 + }, + { + "epoch": 1.85, + "grad_norm": 9.251775681976962, + "learning_rate": 5.838497832206074e-06, + "loss": 0.4351, + "step": 2595 + }, + { + "epoch": 1.85, + "grad_norm": 10.772584386631225, + "learning_rate": 5.835648472519958e-06, + "loss": 0.4829, + "step": 2596 + }, + { + "epoch": 1.85, + "grad_norm": 19.526504254371492, + "learning_rate": 5.832798833628156e-06, + "loss": 0.4814, + "step": 2597 + }, + { + "epoch": 1.85, + "grad_norm": 7.52349073531977, + "learning_rate": 5.829948916482784e-06, + "loss": 0.4419, + "step": 2598 + }, + { + "epoch": 1.86, + "grad_norm": 10.240904448794703, + "learning_rate": 5.827098722036053e-06, + "loss": 0.4404, + "step": 2599 + }, + { + "epoch": 1.86, + "grad_norm": 11.162623628027008, + "learning_rate": 5.824248251240265e-06, + "loss": 0.437, + "step": 2600 + }, + { + "epoch": 1.86, + "grad_norm": 13.063102295836982, + "learning_rate": 5.8213975050478155e-06, + "loss": 0.4668, + "step": 2601 + }, + { + "epoch": 1.86, + "grad_norm": 9.197499697597129, + "learning_rate": 5.818546484411191e-06, + "loss": 0.4873, + "step": 2602 + }, + { + "epoch": 1.86, + "grad_norm": 10.957695255120187, + "learning_rate": 5.815695190282974e-06, + "loss": 0.5273, + "step": 2603 + }, + { + "epoch": 1.86, + "grad_norm": 11.149765327635293, + "learning_rate": 5.81284362361583e-06, + "loss": 0.5435, + "step": 2604 + }, + { + "epoch": 1.86, + "grad_norm": 11.897344437580472, + "learning_rate": 5.809991785362525e-06, + "loss": 0.4624, + "step": 2605 + }, + { + "epoch": 1.86, + "grad_norm": 14.41339345815525, + "learning_rate": 5.8071396764759065e-06, + "loss": 0.4155, + "step": 2606 + }, + { + "epoch": 1.86, + "grad_norm": 12.588859240549306, + "learning_rate": 5.804287297908923e-06, + "loss": 0.4224, + "step": 2607 + }, + { + "epoch": 1.86, + "grad_norm": 11.051689248011026, + "learning_rate": 5.801434650614601e-06, + "loss": 0.4731, + "step": 2608 + }, + { + "epoch": 1.86, + "grad_norm": 12.164254844968275, + "learning_rate": 5.798581735546066e-06, + "loss": 0.4878, + "step": 2609 + }, + { + "epoch": 1.86, + "grad_norm": 11.618691567900477, + "learning_rate": 5.79572855365653e-06, + "loss": 0.3784, + "step": 2610 + }, + { + "epoch": 1.86, + "grad_norm": 12.41055691629168, + "learning_rate": 5.792875105899294e-06, + "loss": 0.5732, + "step": 2611 + }, + { + "epoch": 1.86, + "grad_norm": 9.670181143911773, + "learning_rate": 5.790021393227747e-06, + "loss": 0.4133, + "step": 2612 + }, + { + "epoch": 1.87, + "grad_norm": 11.575788308226384, + "learning_rate": 5.787167416595369e-06, + "loss": 0.4673, + "step": 2613 + }, + { + "epoch": 1.87, + "grad_norm": 10.142364245518445, + "learning_rate": 5.784313176955726e-06, + "loss": 0.4351, + "step": 2614 + }, + { + "epoch": 1.87, + "grad_norm": 9.236307058725137, + "learning_rate": 5.781458675262472e-06, + "loss": 0.3555, + "step": 2615 + }, + { + "epoch": 1.87, + "grad_norm": 8.169804367113855, + "learning_rate": 5.778603912469349e-06, + "loss": 0.4067, + "step": 2616 + }, + { + "epoch": 1.87, + "grad_norm": 13.194556650327089, + "learning_rate": 5.775748889530187e-06, + "loss": 0.5103, + "step": 2617 + }, + { + "epoch": 1.87, + "grad_norm": 6.8031354287890995, + "learning_rate": 5.772893607398901e-06, + "loss": 0.3564, + "step": 2618 + }, + { + "epoch": 1.87, + "grad_norm": 10.611694361969873, + "learning_rate": 5.770038067029496e-06, + "loss": 0.4175, + "step": 2619 + }, + { + "epoch": 1.87, + "grad_norm": 11.88527859419395, + "learning_rate": 5.76718226937606e-06, + "loss": 0.3125, + "step": 2620 + }, + { + "epoch": 1.87, + "grad_norm": 9.739686500701513, + "learning_rate": 5.764326215392768e-06, + "loss": 0.418, + "step": 2621 + }, + { + "epoch": 1.87, + "grad_norm": 11.698752773439546, + "learning_rate": 5.761469906033879e-06, + "loss": 0.3662, + "step": 2622 + }, + { + "epoch": 1.87, + "grad_norm": 10.707736166224288, + "learning_rate": 5.758613342253743e-06, + "loss": 0.374, + "step": 2623 + }, + { + "epoch": 1.87, + "grad_norm": 20.383000470021422, + "learning_rate": 5.7557565250067896e-06, + "loss": 0.4565, + "step": 2624 + }, + { + "epoch": 1.87, + "grad_norm": 10.55652728383383, + "learning_rate": 5.752899455247532e-06, + "loss": 0.3955, + "step": 2625 + }, + { + "epoch": 1.87, + "grad_norm": 14.66660044282398, + "learning_rate": 5.750042133930571e-06, + "loss": 0.4761, + "step": 2626 + }, + { + "epoch": 1.88, + "grad_norm": 13.605013174142563, + "learning_rate": 5.7471845620105925e-06, + "loss": 0.4524, + "step": 2627 + }, + { + "epoch": 1.88, + "grad_norm": 13.512664823617058, + "learning_rate": 5.744326740442364e-06, + "loss": 0.4385, + "step": 2628 + }, + { + "epoch": 1.88, + "grad_norm": 21.55939086501065, + "learning_rate": 5.741468670180737e-06, + "loss": 0.5186, + "step": 2629 + }, + { + "epoch": 1.88, + "grad_norm": 20.560690542811283, + "learning_rate": 5.738610352180645e-06, + "loss": 0.5356, + "step": 2630 + }, + { + "epoch": 1.88, + "grad_norm": 12.948898188809688, + "learning_rate": 5.735751787397106e-06, + "loss": 0.3574, + "step": 2631 + }, + { + "epoch": 1.88, + "grad_norm": 24.160391432425694, + "learning_rate": 5.732892976785218e-06, + "loss": 0.4609, + "step": 2632 + }, + { + "epoch": 1.88, + "grad_norm": 10.426468726837316, + "learning_rate": 5.730033921300166e-06, + "loss": 0.3936, + "step": 2633 + }, + { + "epoch": 1.88, + "grad_norm": 9.517440332362233, + "learning_rate": 5.7271746218972105e-06, + "loss": 0.4478, + "step": 2634 + }, + { + "epoch": 1.88, + "grad_norm": 7.545964981138855, + "learning_rate": 5.724315079531697e-06, + "loss": 0.4224, + "step": 2635 + }, + { + "epoch": 1.88, + "grad_norm": 11.167790368619306, + "learning_rate": 5.721455295159053e-06, + "loss": 0.4131, + "step": 2636 + }, + { + "epoch": 1.88, + "grad_norm": 15.35330099624201, + "learning_rate": 5.7185952697347844e-06, + "loss": 0.5435, + "step": 2637 + }, + { + "epoch": 1.88, + "grad_norm": 17.478710612288964, + "learning_rate": 5.71573500421448e-06, + "loss": 0.4561, + "step": 2638 + }, + { + "epoch": 1.88, + "grad_norm": 24.345386420729913, + "learning_rate": 5.712874499553807e-06, + "loss": 0.6011, + "step": 2639 + }, + { + "epoch": 1.88, + "grad_norm": 14.103831958980157, + "learning_rate": 5.710013756708513e-06, + "loss": 0.5371, + "step": 2640 + }, + { + "epoch": 1.89, + "grad_norm": 8.723792228428197, + "learning_rate": 5.707152776634427e-06, + "loss": 0.4746, + "step": 2641 + }, + { + "epoch": 1.89, + "grad_norm": 11.135521772087765, + "learning_rate": 5.704291560287454e-06, + "loss": 0.5806, + "step": 2642 + }, + { + "epoch": 1.89, + "grad_norm": 9.61923834918151, + "learning_rate": 5.701430108623578e-06, + "loss": 0.5034, + "step": 2643 + }, + { + "epoch": 1.89, + "grad_norm": 11.595901720364187, + "learning_rate": 5.698568422598867e-06, + "loss": 0.4658, + "step": 2644 + }, + { + "epoch": 1.89, + "grad_norm": 8.033670237094597, + "learning_rate": 5.69570650316946e-06, + "loss": 0.3877, + "step": 2645 + }, + { + "epoch": 1.89, + "grad_norm": 10.311323300972322, + "learning_rate": 5.69284435129158e-06, + "loss": 0.5474, + "step": 2646 + }, + { + "epoch": 1.89, + "grad_norm": 17.16920875171198, + "learning_rate": 5.689981967921523e-06, + "loss": 0.4785, + "step": 2647 + }, + { + "epoch": 1.89, + "grad_norm": 11.505309016080973, + "learning_rate": 5.6871193540156666e-06, + "loss": 0.5347, + "step": 2648 + }, + { + "epoch": 1.89, + "grad_norm": 9.375726119567252, + "learning_rate": 5.684256510530461e-06, + "loss": 0.5317, + "step": 2649 + }, + { + "epoch": 1.89, + "grad_norm": 7.956873091796398, + "learning_rate": 5.68139343842244e-06, + "loss": 0.4897, + "step": 2650 + }, + { + "epoch": 1.89, + "grad_norm": 6.598497045655397, + "learning_rate": 5.678530138648204e-06, + "loss": 0.3809, + "step": 2651 + }, + { + "epoch": 1.89, + "grad_norm": 8.743508614000168, + "learning_rate": 5.675666612164436e-06, + "loss": 0.4536, + "step": 2652 + }, + { + "epoch": 1.89, + "grad_norm": 16.871449050564767, + "learning_rate": 5.672802859927895e-06, + "loss": 0.4248, + "step": 2653 + }, + { + "epoch": 1.89, + "grad_norm": 14.838442238490044, + "learning_rate": 5.669938882895412e-06, + "loss": 0.4878, + "step": 2654 + }, + { + "epoch": 1.9, + "grad_norm": 12.731771693748918, + "learning_rate": 5.667074682023896e-06, + "loss": 0.4346, + "step": 2655 + }, + { + "epoch": 1.9, + "grad_norm": 11.31273234375937, + "learning_rate": 5.664210258270331e-06, + "loss": 0.5474, + "step": 2656 + }, + { + "epoch": 1.9, + "grad_norm": 6.388124322189932, + "learning_rate": 5.661345612591771e-06, + "loss": 0.3623, + "step": 2657 + }, + { + "epoch": 1.9, + "grad_norm": 11.206485885563511, + "learning_rate": 5.6584807459453515e-06, + "loss": 0.4312, + "step": 2658 + }, + { + "epoch": 1.9, + "grad_norm": 8.641694644438422, + "learning_rate": 5.655615659288274e-06, + "loss": 0.4653, + "step": 2659 + }, + { + "epoch": 1.9, + "grad_norm": 9.129600359571274, + "learning_rate": 5.652750353577818e-06, + "loss": 0.4902, + "step": 2660 + }, + { + "epoch": 1.9, + "grad_norm": 25.692674590295074, + "learning_rate": 5.649884829771337e-06, + "loss": 0.5063, + "step": 2661 + }, + { + "epoch": 1.9, + "grad_norm": 13.54868839498731, + "learning_rate": 5.6470190888262545e-06, + "loss": 0.457, + "step": 2662 + }, + { + "epoch": 1.9, + "grad_norm": 14.289852844856199, + "learning_rate": 5.644153131700067e-06, + "loss": 0.4634, + "step": 2663 + }, + { + "epoch": 1.9, + "grad_norm": 15.249877303141151, + "learning_rate": 5.6412869593503476e-06, + "loss": 0.4956, + "step": 2664 + }, + { + "epoch": 1.9, + "grad_norm": 6.960042402355725, + "learning_rate": 5.638420572734733e-06, + "loss": 0.457, + "step": 2665 + }, + { + "epoch": 1.9, + "grad_norm": 8.138114527336878, + "learning_rate": 5.63555397281094e-06, + "loss": 0.4009, + "step": 2666 + }, + { + "epoch": 1.9, + "grad_norm": 8.009079366365885, + "learning_rate": 5.632687160536751e-06, + "loss": 0.4043, + "step": 2667 + }, + { + "epoch": 1.9, + "grad_norm": 11.217920799149297, + "learning_rate": 5.629820136870022e-06, + "loss": 0.4946, + "step": 2668 + }, + { + "epoch": 1.91, + "grad_norm": 18.941849965542726, + "learning_rate": 5.626952902768678e-06, + "loss": 0.5039, + "step": 2669 + }, + { + "epoch": 1.91, + "grad_norm": 6.464934606221962, + "learning_rate": 5.624085459190717e-06, + "loss": 0.3403, + "step": 2670 + }, + { + "epoch": 1.91, + "grad_norm": 6.790559734697411, + "learning_rate": 5.621217807094202e-06, + "loss": 0.353, + "step": 2671 + }, + { + "epoch": 1.91, + "grad_norm": 15.647088939509905, + "learning_rate": 5.618349947437272e-06, + "loss": 0.4565, + "step": 2672 + }, + { + "epoch": 1.91, + "grad_norm": 11.789802694379299, + "learning_rate": 5.615481881178132e-06, + "loss": 0.4419, + "step": 2673 + }, + { + "epoch": 1.91, + "grad_norm": 8.442548561932774, + "learning_rate": 5.612613609275054e-06, + "loss": 0.4175, + "step": 2674 + }, + { + "epoch": 1.91, + "grad_norm": 15.641515924971305, + "learning_rate": 5.609745132686383e-06, + "loss": 0.5254, + "step": 2675 + }, + { + "epoch": 1.91, + "grad_norm": 15.432218412288657, + "learning_rate": 5.60687645237053e-06, + "loss": 0.4419, + "step": 2676 + }, + { + "epoch": 1.91, + "grad_norm": 12.15077877053977, + "learning_rate": 5.604007569285973e-06, + "loss": 0.5625, + "step": 2677 + }, + { + "epoch": 1.91, + "grad_norm": 11.468525979946765, + "learning_rate": 5.6011384843912605e-06, + "loss": 0.4912, + "step": 2678 + }, + { + "epoch": 1.91, + "grad_norm": 22.363832033779826, + "learning_rate": 5.598269198645008e-06, + "loss": 0.4634, + "step": 2679 + }, + { + "epoch": 1.91, + "grad_norm": 11.383705824219753, + "learning_rate": 5.5953997130058945e-06, + "loss": 0.5581, + "step": 2680 + }, + { + "epoch": 1.91, + "grad_norm": 11.042518750173171, + "learning_rate": 5.5925300284326715e-06, + "loss": 0.5088, + "step": 2681 + }, + { + "epoch": 1.91, + "grad_norm": 8.76598841043737, + "learning_rate": 5.5896601458841505e-06, + "loss": 0.4141, + "step": 2682 + }, + { + "epoch": 1.92, + "grad_norm": 13.862820427563568, + "learning_rate": 5.586790066319217e-06, + "loss": 0.4126, + "step": 2683 + }, + { + "epoch": 1.92, + "grad_norm": 12.59617718975847, + "learning_rate": 5.583919790696814e-06, + "loss": 0.4648, + "step": 2684 + }, + { + "epoch": 1.92, + "grad_norm": 9.54969966751114, + "learning_rate": 5.581049319975957e-06, + "loss": 0.437, + "step": 2685 + }, + { + "epoch": 1.92, + "grad_norm": 20.884177236890935, + "learning_rate": 5.57817865511572e-06, + "loss": 0.5962, + "step": 2686 + }, + { + "epoch": 1.92, + "grad_norm": 9.066486791899976, + "learning_rate": 5.575307797075249e-06, + "loss": 0.3447, + "step": 2687 + }, + { + "epoch": 1.92, + "grad_norm": 10.359577468848478, + "learning_rate": 5.572436746813748e-06, + "loss": 0.48, + "step": 2688 + }, + { + "epoch": 1.92, + "grad_norm": 8.3029594415371, + "learning_rate": 5.5695655052904905e-06, + "loss": 0.4507, + "step": 2689 + }, + { + "epoch": 1.92, + "grad_norm": 15.98486194284849, + "learning_rate": 5.566694073464812e-06, + "loss": 0.4419, + "step": 2690 + }, + { + "epoch": 1.92, + "grad_norm": 12.087720324088506, + "learning_rate": 5.56382245229611e-06, + "loss": 0.4771, + "step": 2691 + }, + { + "epoch": 1.92, + "grad_norm": 9.174292843227011, + "learning_rate": 5.560950642743847e-06, + "loss": 0.4883, + "step": 2692 + }, + { + "epoch": 1.92, + "grad_norm": 16.926555916096607, + "learning_rate": 5.558078645767547e-06, + "loss": 0.4019, + "step": 2693 + }, + { + "epoch": 1.92, + "grad_norm": 13.536627620327389, + "learning_rate": 5.5552064623267986e-06, + "loss": 0.5322, + "step": 2694 + }, + { + "epoch": 1.92, + "grad_norm": 8.76012508956878, + "learning_rate": 5.5523340933812505e-06, + "loss": 0.4253, + "step": 2695 + }, + { + "epoch": 1.92, + "grad_norm": 8.90679754193909, + "learning_rate": 5.549461539890616e-06, + "loss": 0.4507, + "step": 2696 + }, + { + "epoch": 1.93, + "grad_norm": 8.420670976992822, + "learning_rate": 5.546588802814669e-06, + "loss": 0.4375, + "step": 2697 + }, + { + "epoch": 1.93, + "grad_norm": 8.727787659346541, + "learning_rate": 5.543715883113241e-06, + "loss": 0.3931, + "step": 2698 + }, + { + "epoch": 1.93, + "grad_norm": 11.086024809133871, + "learning_rate": 5.540842781746231e-06, + "loss": 0.3979, + "step": 2699 + }, + { + "epoch": 1.93, + "grad_norm": 13.011343075606366, + "learning_rate": 5.537969499673598e-06, + "loss": 0.522, + "step": 2700 + }, + { + "epoch": 1.93, + "grad_norm": 9.15708782937107, + "learning_rate": 5.535096037855353e-06, + "loss": 0.459, + "step": 2701 + }, + { + "epoch": 1.93, + "grad_norm": 11.044008252312308, + "learning_rate": 5.532222397251576e-06, + "loss": 0.4487, + "step": 2702 + }, + { + "epoch": 1.93, + "grad_norm": 9.232084819714416, + "learning_rate": 5.529348578822403e-06, + "loss": 0.5, + "step": 2703 + }, + { + "epoch": 1.93, + "grad_norm": 12.659043205460982, + "learning_rate": 5.526474583528032e-06, + "loss": 0.5312, + "step": 2704 + }, + { + "epoch": 1.93, + "grad_norm": 21.00752567623559, + "learning_rate": 5.523600412328716e-06, + "loss": 0.5352, + "step": 2705 + }, + { + "epoch": 1.93, + "grad_norm": 11.033115280436297, + "learning_rate": 5.520726066184769e-06, + "loss": 0.5396, + "step": 2706 + }, + { + "epoch": 1.93, + "grad_norm": 8.80140343593761, + "learning_rate": 5.517851546056566e-06, + "loss": 0.3618, + "step": 2707 + }, + { + "epoch": 1.93, + "grad_norm": 8.135541690096266, + "learning_rate": 5.5149768529045355e-06, + "loss": 0.3765, + "step": 2708 + }, + { + "epoch": 1.93, + "grad_norm": 9.033113241944184, + "learning_rate": 5.512101987689168e-06, + "loss": 0.3994, + "step": 2709 + }, + { + "epoch": 1.93, + "grad_norm": 8.091847973750784, + "learning_rate": 5.509226951371006e-06, + "loss": 0.4431, + "step": 2710 + }, + { + "epoch": 1.94, + "grad_norm": 10.550320506061626, + "learning_rate": 5.506351744910654e-06, + "loss": 0.4248, + "step": 2711 + }, + { + "epoch": 1.94, + "grad_norm": 7.907476430192875, + "learning_rate": 5.503476369268773e-06, + "loss": 0.4434, + "step": 2712 + }, + { + "epoch": 1.94, + "grad_norm": 11.662632331178548, + "learning_rate": 5.50060082540608e-06, + "loss": 0.3901, + "step": 2713 + }, + { + "epoch": 1.94, + "grad_norm": 11.946005732308972, + "learning_rate": 5.4977251142833445e-06, + "loss": 0.5063, + "step": 2714 + }, + { + "epoch": 1.94, + "grad_norm": 7.764860363419759, + "learning_rate": 5.494849236861397e-06, + "loss": 0.3701, + "step": 2715 + }, + { + "epoch": 1.94, + "grad_norm": 10.57151587816436, + "learning_rate": 5.491973194101122e-06, + "loss": 0.4678, + "step": 2716 + }, + { + "epoch": 1.94, + "grad_norm": 10.496999220354681, + "learning_rate": 5.4890969869634606e-06, + "loss": 0.4072, + "step": 2717 + }, + { + "epoch": 1.94, + "grad_norm": 21.955739348741126, + "learning_rate": 5.486220616409403e-06, + "loss": 0.417, + "step": 2718 + }, + { + "epoch": 1.94, + "grad_norm": 9.480253591668859, + "learning_rate": 5.4833440834e-06, + "loss": 0.4419, + "step": 2719 + }, + { + "epoch": 1.94, + "grad_norm": 13.417238840494207, + "learning_rate": 5.480467388896353e-06, + "loss": 0.4951, + "step": 2720 + }, + { + "epoch": 1.94, + "grad_norm": 9.778607214982058, + "learning_rate": 5.477590533859623e-06, + "loss": 0.4058, + "step": 2721 + }, + { + "epoch": 1.94, + "grad_norm": 15.095377405354045, + "learning_rate": 5.474713519251018e-06, + "loss": 0.501, + "step": 2722 + }, + { + "epoch": 1.94, + "grad_norm": 9.109288662149696, + "learning_rate": 5.471836346031802e-06, + "loss": 0.4067, + "step": 2723 + }, + { + "epoch": 1.94, + "grad_norm": 10.606747867678797, + "learning_rate": 5.468959015163293e-06, + "loss": 0.4321, + "step": 2724 + }, + { + "epoch": 1.95, + "grad_norm": 10.521320145257938, + "learning_rate": 5.46608152760686e-06, + "loss": 0.3298, + "step": 2725 + }, + { + "epoch": 1.95, + "grad_norm": 13.610539194603652, + "learning_rate": 5.463203884323926e-06, + "loss": 0.541, + "step": 2726 + }, + { + "epoch": 1.95, + "grad_norm": 13.273515699596409, + "learning_rate": 5.460326086275964e-06, + "loss": 0.5078, + "step": 2727 + }, + { + "epoch": 1.95, + "grad_norm": 9.111661385131107, + "learning_rate": 5.4574481344245015e-06, + "loss": 0.4756, + "step": 2728 + }, + { + "epoch": 1.95, + "grad_norm": 8.977805736536428, + "learning_rate": 5.454570029731115e-06, + "loss": 0.4663, + "step": 2729 + }, + { + "epoch": 1.95, + "grad_norm": 13.975752561343054, + "learning_rate": 5.451691773157431e-06, + "loss": 0.4971, + "step": 2730 + }, + { + "epoch": 1.95, + "grad_norm": 17.32339975667838, + "learning_rate": 5.448813365665129e-06, + "loss": 0.5049, + "step": 2731 + }, + { + "epoch": 1.95, + "grad_norm": 11.315123889810822, + "learning_rate": 5.44593480821594e-06, + "loss": 0.5132, + "step": 2732 + }, + { + "epoch": 1.95, + "grad_norm": 9.29119410355911, + "learning_rate": 5.443056101771643e-06, + "loss": 0.4316, + "step": 2733 + }, + { + "epoch": 1.95, + "grad_norm": 11.289099728879002, + "learning_rate": 5.44017724729407e-06, + "loss": 0.3882, + "step": 2734 + }, + { + "epoch": 1.95, + "grad_norm": 8.888644836261584, + "learning_rate": 5.437298245745093e-06, + "loss": 0.4331, + "step": 2735 + }, + { + "epoch": 1.95, + "grad_norm": 48.21181573416906, + "learning_rate": 5.434419098086645e-06, + "loss": 0.5977, + "step": 2736 + }, + { + "epoch": 1.95, + "grad_norm": 9.641134447756468, + "learning_rate": 5.431539805280702e-06, + "loss": 0.3945, + "step": 2737 + }, + { + "epoch": 1.95, + "grad_norm": 10.965847623472408, + "learning_rate": 5.428660368289289e-06, + "loss": 0.4741, + "step": 2738 + }, + { + "epoch": 1.96, + "grad_norm": 11.10664209523203, + "learning_rate": 5.42578078807448e-06, + "loss": 0.4512, + "step": 2739 + }, + { + "epoch": 1.96, + "grad_norm": 10.681304900537418, + "learning_rate": 5.422901065598395e-06, + "loss": 0.4297, + "step": 2740 + }, + { + "epoch": 1.96, + "grad_norm": 8.739932626389907, + "learning_rate": 5.4200212018232024e-06, + "loss": 0.3513, + "step": 2741 + }, + { + "epoch": 1.96, + "grad_norm": 8.846041535480271, + "learning_rate": 5.41714119771112e-06, + "loss": 0.4302, + "step": 2742 + }, + { + "epoch": 1.96, + "grad_norm": 11.984598238079338, + "learning_rate": 5.414261054224412e-06, + "loss": 0.4033, + "step": 2743 + }, + { + "epoch": 1.96, + "grad_norm": 18.933328673227695, + "learning_rate": 5.411380772325383e-06, + "loss": 0.5703, + "step": 2744 + }, + { + "epoch": 1.96, + "grad_norm": 18.854711059795555, + "learning_rate": 5.408500352976392e-06, + "loss": 0.5151, + "step": 2745 + }, + { + "epoch": 1.96, + "grad_norm": 10.111285712834931, + "learning_rate": 5.40561979713984e-06, + "loss": 0.4084, + "step": 2746 + }, + { + "epoch": 1.96, + "grad_norm": 11.242923858855155, + "learning_rate": 5.402739105778175e-06, + "loss": 0.4956, + "step": 2747 + }, + { + "epoch": 1.96, + "grad_norm": 9.954715434651323, + "learning_rate": 5.399858279853889e-06, + "loss": 0.4229, + "step": 2748 + }, + { + "epoch": 1.96, + "grad_norm": 7.3157730974535315, + "learning_rate": 5.39697732032952e-06, + "loss": 0.3398, + "step": 2749 + }, + { + "epoch": 1.96, + "grad_norm": 10.265906166274753, + "learning_rate": 5.394096228167648e-06, + "loss": 0.4565, + "step": 2750 + }, + { + "epoch": 1.96, + "grad_norm": 12.198284629775742, + "learning_rate": 5.391215004330903e-06, + "loss": 0.3813, + "step": 2751 + }, + { + "epoch": 1.96, + "grad_norm": 12.31796639425881, + "learning_rate": 5.388333649781951e-06, + "loss": 0.4683, + "step": 2752 + }, + { + "epoch": 1.97, + "grad_norm": 10.01789404392901, + "learning_rate": 5.3854521654835105e-06, + "loss": 0.4502, + "step": 2753 + }, + { + "epoch": 1.97, + "grad_norm": 11.306465904258715, + "learning_rate": 5.3825705523983366e-06, + "loss": 0.4351, + "step": 2754 + }, + { + "epoch": 1.97, + "grad_norm": 11.519624094511274, + "learning_rate": 5.37968881148923e-06, + "loss": 0.4707, + "step": 2755 + }, + { + "epoch": 1.97, + "grad_norm": 13.226114092265847, + "learning_rate": 5.376806943719033e-06, + "loss": 0.4814, + "step": 2756 + }, + { + "epoch": 1.97, + "grad_norm": 9.840072523952024, + "learning_rate": 5.373924950050633e-06, + "loss": 0.4194, + "step": 2757 + }, + { + "epoch": 1.97, + "grad_norm": 19.039602192178293, + "learning_rate": 5.371042831446957e-06, + "loss": 0.5571, + "step": 2758 + }, + { + "epoch": 1.97, + "grad_norm": 10.416861116474312, + "learning_rate": 5.3681605888709755e-06, + "loss": 0.4858, + "step": 2759 + }, + { + "epoch": 1.97, + "grad_norm": 15.51902810805772, + "learning_rate": 5.365278223285698e-06, + "loss": 0.7158, + "step": 2760 + }, + { + "epoch": 1.97, + "grad_norm": 8.508514981825446, + "learning_rate": 5.362395735654175e-06, + "loss": 0.5264, + "step": 2761 + }, + { + "epoch": 1.97, + "grad_norm": 11.292618872818796, + "learning_rate": 5.3595131269395015e-06, + "loss": 0.4668, + "step": 2762 + }, + { + "epoch": 1.97, + "grad_norm": 8.600159324345311, + "learning_rate": 5.356630398104811e-06, + "loss": 0.4302, + "step": 2763 + }, + { + "epoch": 1.97, + "grad_norm": 8.912551709797079, + "learning_rate": 5.353747550113274e-06, + "loss": 0.3501, + "step": 2764 + }, + { + "epoch": 1.97, + "grad_norm": 10.525523170385298, + "learning_rate": 5.350864583928106e-06, + "loss": 0.4648, + "step": 2765 + }, + { + "epoch": 1.97, + "grad_norm": 8.191292892938462, + "learning_rate": 5.347981500512558e-06, + "loss": 0.3613, + "step": 2766 + }, + { + "epoch": 1.98, + "grad_norm": 19.57847648591052, + "learning_rate": 5.345098300829924e-06, + "loss": 0.5903, + "step": 2767 + }, + { + "epoch": 1.98, + "grad_norm": 13.335634989474457, + "learning_rate": 5.342214985843534e-06, + "loss": 0.4829, + "step": 2768 + }, + { + "epoch": 1.98, + "grad_norm": 21.001861539425636, + "learning_rate": 5.339331556516755e-06, + "loss": 0.4902, + "step": 2769 + }, + { + "epoch": 1.98, + "grad_norm": 10.013265462647908, + "learning_rate": 5.336448013812996e-06, + "loss": 0.4912, + "step": 2770 + }, + { + "epoch": 1.98, + "grad_norm": 6.809090114106525, + "learning_rate": 5.333564358695701e-06, + "loss": 0.4336, + "step": 2771 + }, + { + "epoch": 1.98, + "grad_norm": 16.281886038205364, + "learning_rate": 5.330680592128355e-06, + "loss": 0.4937, + "step": 2772 + }, + { + "epoch": 1.98, + "grad_norm": 8.150545229018581, + "learning_rate": 5.3277967150744755e-06, + "loss": 0.4189, + "step": 2773 + }, + { + "epoch": 1.98, + "grad_norm": 7.345814588398108, + "learning_rate": 5.324912728497621e-06, + "loss": 0.4009, + "step": 2774 + }, + { + "epoch": 1.98, + "grad_norm": 20.091474666204597, + "learning_rate": 5.322028633361386e-06, + "loss": 0.562, + "step": 2775 + }, + { + "epoch": 1.98, + "grad_norm": 11.754059247137123, + "learning_rate": 5.319144430629397e-06, + "loss": 0.4702, + "step": 2776 + }, + { + "epoch": 1.98, + "grad_norm": 7.586444942071105, + "learning_rate": 5.316260121265323e-06, + "loss": 0.4595, + "step": 2777 + }, + { + "epoch": 1.98, + "grad_norm": 7.6680666422755275, + "learning_rate": 5.313375706232864e-06, + "loss": 0.4072, + "step": 2778 + }, + { + "epoch": 1.98, + "grad_norm": 12.091604704175747, + "learning_rate": 5.310491186495757e-06, + "loss": 0.4297, + "step": 2779 + }, + { + "epoch": 1.98, + "grad_norm": 6.434600968392432, + "learning_rate": 5.307606563017772e-06, + "loss": 0.3491, + "step": 2780 + }, + { + "epoch": 1.99, + "grad_norm": 7.651832210814672, + "learning_rate": 5.304721836762717e-06, + "loss": 0.4087, + "step": 2781 + }, + { + "epoch": 1.99, + "grad_norm": 7.123238252935426, + "learning_rate": 5.301837008694433e-06, + "loss": 0.418, + "step": 2782 + }, + { + "epoch": 1.99, + "grad_norm": 10.623468647704522, + "learning_rate": 5.298952079776794e-06, + "loss": 0.4336, + "step": 2783 + }, + { + "epoch": 1.99, + "grad_norm": 9.961097932950663, + "learning_rate": 5.296067050973709e-06, + "loss": 0.4009, + "step": 2784 + }, + { + "epoch": 1.99, + "grad_norm": 10.004467440225236, + "learning_rate": 5.29318192324912e-06, + "loss": 0.4385, + "step": 2785 + }, + { + "epoch": 1.99, + "grad_norm": 12.937849862725015, + "learning_rate": 5.290296697566999e-06, + "loss": 0.4731, + "step": 2786 + }, + { + "epoch": 1.99, + "grad_norm": 10.851718115774863, + "learning_rate": 5.287411374891356e-06, + "loss": 0.4741, + "step": 2787 + }, + { + "epoch": 1.99, + "grad_norm": 10.247981728977928, + "learning_rate": 5.284525956186231e-06, + "loss": 0.4355, + "step": 2788 + }, + { + "epoch": 1.99, + "grad_norm": 10.992715571855904, + "learning_rate": 5.281640442415695e-06, + "loss": 0.5229, + "step": 2789 + }, + { + "epoch": 1.99, + "grad_norm": 12.344871517460058, + "learning_rate": 5.278754834543852e-06, + "loss": 0.4722, + "step": 2790 + }, + { + "epoch": 1.99, + "grad_norm": 13.839020938381688, + "learning_rate": 5.275869133534838e-06, + "loss": 0.4785, + "step": 2791 + }, + { + "epoch": 1.99, + "grad_norm": 18.33203373375532, + "learning_rate": 5.272983340352818e-06, + "loss": 0.5005, + "step": 2792 + }, + { + "epoch": 1.99, + "grad_norm": 9.957846186020241, + "learning_rate": 5.270097455961991e-06, + "loss": 0.4048, + "step": 2793 + }, + { + "epoch": 1.99, + "grad_norm": 8.793496740544484, + "learning_rate": 5.267211481326584e-06, + "loss": 0.3716, + "step": 2794 + }, + { + "epoch": 2.0, + "grad_norm": 10.547713987824872, + "learning_rate": 5.264325417410854e-06, + "loss": 0.3901, + "step": 2795 + }, + { + "epoch": 2.0, + "grad_norm": 10.683395761109471, + "learning_rate": 5.261439265179089e-06, + "loss": 0.4375, + "step": 2796 + }, + { + "epoch": 2.0, + "grad_norm": 15.88713456122952, + "learning_rate": 5.258553025595605e-06, + "loss": 0.4854, + "step": 2797 + }, + { + "epoch": 2.0, + "grad_norm": 13.15203401175118, + "learning_rate": 5.255666699624749e-06, + "loss": 0.4604, + "step": 2798 + }, + { + "epoch": 2.0, + "grad_norm": 8.439586255360073, + "learning_rate": 5.252780288230899e-06, + "loss": 0.4316, + "step": 2799 + }, + { + "epoch": 2.0, + "grad_norm": 11.690633931887922, + "learning_rate": 5.249893792378454e-06, + "loss": 0.4546, + "step": 2800 + }, + { + "epoch": 2.0, + "grad_norm": 7.5972252881605, + "learning_rate": 5.24700721303185e-06, + "loss": 0.3784, + "step": 2801 + }, + { + "epoch": 2.0, + "grad_norm": 8.09788081335953, + "learning_rate": 5.244120551155544e-06, + "loss": 0.4463, + "step": 2802 + }, + { + "epoch": 2.0, + "grad_norm": 6.183682722164911, + "learning_rate": 5.241233807714024e-06, + "loss": 0.2798, + "step": 2803 + }, + { + "epoch": 2.0, + "grad_norm": 7.492711477273929, + "learning_rate": 5.238346983671805e-06, + "loss": 0.321, + "step": 2804 + }, + { + "epoch": 2.0, + "grad_norm": 8.78545087675637, + "learning_rate": 5.235460079993429e-06, + "loss": 0.3223, + "step": 2805 + }, + { + "epoch": 2.0, + "grad_norm": 6.184465239974276, + "learning_rate": 5.232573097643462e-06, + "loss": 0.2476, + "step": 2806 + }, + { + "epoch": 2.0, + "grad_norm": 7.38970510024269, + "learning_rate": 5.229686037586502e-06, + "loss": 0.3081, + "step": 2807 + }, + { + "epoch": 2.0, + "grad_norm": 6.337615749238382, + "learning_rate": 5.226798900787167e-06, + "loss": 0.3176, + "step": 2808 + }, + { + "epoch": 2.0, + "grad_norm": 8.065520523779682, + "learning_rate": 5.223911688210104e-06, + "loss": 0.2876, + "step": 2809 + }, + { + "epoch": 2.01, + "grad_norm": 13.050626203183986, + "learning_rate": 5.221024400819983e-06, + "loss": 0.3157, + "step": 2810 + }, + { + "epoch": 2.01, + "grad_norm": 10.158021790081161, + "learning_rate": 5.218137039581504e-06, + "loss": 0.2568, + "step": 2811 + }, + { + "epoch": 2.01, + "grad_norm": 11.581251837695142, + "learning_rate": 5.215249605459382e-06, + "loss": 0.3867, + "step": 2812 + }, + { + "epoch": 2.01, + "grad_norm": 8.194547291558267, + "learning_rate": 5.212362099418369e-06, + "loss": 0.26, + "step": 2813 + }, + { + "epoch": 2.01, + "grad_norm": 10.028079795632454, + "learning_rate": 5.2094745224232306e-06, + "loss": 0.2729, + "step": 2814 + }, + { + "epoch": 2.01, + "grad_norm": 16.016565575074754, + "learning_rate": 5.206586875438759e-06, + "loss": 0.4033, + "step": 2815 + }, + { + "epoch": 2.01, + "grad_norm": 9.122323867559514, + "learning_rate": 5.203699159429773e-06, + "loss": 0.2603, + "step": 2816 + }, + { + "epoch": 2.01, + "grad_norm": 11.258210540115515, + "learning_rate": 5.200811375361112e-06, + "loss": 0.3042, + "step": 2817 + }, + { + "epoch": 2.01, + "grad_norm": 7.530978379942863, + "learning_rate": 5.197923524197639e-06, + "loss": 0.2639, + "step": 2818 + }, + { + "epoch": 2.01, + "grad_norm": 11.244056210394618, + "learning_rate": 5.195035606904237e-06, + "loss": 0.3047, + "step": 2819 + }, + { + "epoch": 2.01, + "grad_norm": 12.976152241105012, + "learning_rate": 5.1921476244458135e-06, + "loss": 0.2971, + "step": 2820 + }, + { + "epoch": 2.01, + "grad_norm": 12.62736935340582, + "learning_rate": 5.189259577787297e-06, + "loss": 0.2849, + "step": 2821 + }, + { + "epoch": 2.01, + "grad_norm": 8.841839115792192, + "learning_rate": 5.186371467893638e-06, + "loss": 0.2444, + "step": 2822 + }, + { + "epoch": 2.01, + "grad_norm": 11.733355341593247, + "learning_rate": 5.1834832957298075e-06, + "loss": 0.2522, + "step": 2823 + }, + { + "epoch": 2.02, + "grad_norm": 10.20366363366365, + "learning_rate": 5.180595062260797e-06, + "loss": 0.1924, + "step": 2824 + }, + { + "epoch": 2.02, + "grad_norm": 12.665148545436182, + "learning_rate": 5.177706768451619e-06, + "loss": 0.2285, + "step": 2825 + }, + { + "epoch": 2.02, + "grad_norm": 8.249918360609486, + "learning_rate": 5.174818415267308e-06, + "loss": 0.1785, + "step": 2826 + }, + { + "epoch": 2.02, + "grad_norm": 12.068358421491178, + "learning_rate": 5.1719300036729135e-06, + "loss": 0.2644, + "step": 2827 + }, + { + "epoch": 2.02, + "grad_norm": 11.250338443568086, + "learning_rate": 5.169041534633511e-06, + "loss": 0.2855, + "step": 2828 + }, + { + "epoch": 2.02, + "grad_norm": 11.981055766027776, + "learning_rate": 5.166153009114188e-06, + "loss": 0.2559, + "step": 2829 + }, + { + "epoch": 2.02, + "grad_norm": 16.58373133194685, + "learning_rate": 5.163264428080057e-06, + "loss": 0.229, + "step": 2830 + }, + { + "epoch": 2.02, + "grad_norm": 11.426613456253506, + "learning_rate": 5.160375792496246e-06, + "loss": 0.2656, + "step": 2831 + }, + { + "epoch": 2.02, + "grad_norm": 28.624808178448106, + "learning_rate": 5.157487103327901e-06, + "loss": 0.4053, + "step": 2832 + }, + { + "epoch": 2.02, + "grad_norm": 9.440233476330917, + "learning_rate": 5.1545983615401885e-06, + "loss": 0.2058, + "step": 2833 + }, + { + "epoch": 2.02, + "grad_norm": 14.486881898609434, + "learning_rate": 5.151709568098289e-06, + "loss": 0.3364, + "step": 2834 + }, + { + "epoch": 2.02, + "grad_norm": 8.482819056787882, + "learning_rate": 5.1488207239674036e-06, + "loss": 0.2424, + "step": 2835 + }, + { + "epoch": 2.02, + "grad_norm": 9.817322942492323, + "learning_rate": 5.145931830112748e-06, + "loss": 0.2766, + "step": 2836 + }, + { + "epoch": 2.02, + "grad_norm": 9.581341500866515, + "learning_rate": 5.1430428874995554e-06, + "loss": 0.2036, + "step": 2837 + }, + { + "epoch": 2.03, + "grad_norm": 9.851916462924994, + "learning_rate": 5.140153897093076e-06, + "loss": 0.2317, + "step": 2838 + }, + { + "epoch": 2.03, + "grad_norm": 13.85859461783392, + "learning_rate": 5.1372648598585725e-06, + "loss": 0.3672, + "step": 2839 + }, + { + "epoch": 2.03, + "grad_norm": 7.51481537393519, + "learning_rate": 5.134375776761329e-06, + "loss": 0.2026, + "step": 2840 + }, + { + "epoch": 2.03, + "grad_norm": 12.011970306520274, + "learning_rate": 5.131486648766642e-06, + "loss": 0.2827, + "step": 2841 + }, + { + "epoch": 2.03, + "grad_norm": 8.623869309435559, + "learning_rate": 5.1285974768398205e-06, + "loss": 0.2432, + "step": 2842 + }, + { + "epoch": 2.03, + "grad_norm": 13.971151752756175, + "learning_rate": 5.125708261946192e-06, + "loss": 0.2737, + "step": 2843 + }, + { + "epoch": 2.03, + "grad_norm": 12.16313895403444, + "learning_rate": 5.122819005051096e-06, + "loss": 0.2595, + "step": 2844 + }, + { + "epoch": 2.03, + "grad_norm": 10.834344804916485, + "learning_rate": 5.119929707119889e-06, + "loss": 0.3093, + "step": 2845 + }, + { + "epoch": 2.03, + "grad_norm": 10.59230099518939, + "learning_rate": 5.117040369117937e-06, + "loss": 0.269, + "step": 2846 + }, + { + "epoch": 2.03, + "grad_norm": 9.261755241964481, + "learning_rate": 5.114150992010621e-06, + "loss": 0.2363, + "step": 2847 + }, + { + "epoch": 2.03, + "grad_norm": 17.845223713454207, + "learning_rate": 5.1112615767633385e-06, + "loss": 0.3608, + "step": 2848 + }, + { + "epoch": 2.03, + "grad_norm": 11.952115168213247, + "learning_rate": 5.108372124341494e-06, + "loss": 0.2449, + "step": 2849 + }, + { + "epoch": 2.03, + "grad_norm": 10.962593059947745, + "learning_rate": 5.105482635710509e-06, + "loss": 0.281, + "step": 2850 + }, + { + "epoch": 2.03, + "grad_norm": 11.47356055701605, + "learning_rate": 5.102593111835815e-06, + "loss": 0.303, + "step": 2851 + }, + { + "epoch": 2.04, + "grad_norm": 13.090099328504023, + "learning_rate": 5.099703553682854e-06, + "loss": 0.2979, + "step": 2852 + }, + { + "epoch": 2.04, + "grad_norm": 11.384435155829522, + "learning_rate": 5.096813962217086e-06, + "loss": 0.2117, + "step": 2853 + }, + { + "epoch": 2.04, + "grad_norm": 12.707780782866086, + "learning_rate": 5.093924338403971e-06, + "loss": 0.3604, + "step": 2854 + }, + { + "epoch": 2.04, + "grad_norm": 11.41635335383991, + "learning_rate": 5.091034683208988e-06, + "loss": 0.3076, + "step": 2855 + }, + { + "epoch": 2.04, + "grad_norm": 10.542784265367086, + "learning_rate": 5.088144997597627e-06, + "loss": 0.2373, + "step": 2856 + }, + { + "epoch": 2.04, + "grad_norm": 16.300403445154583, + "learning_rate": 5.085255282535383e-06, + "loss": 0.3362, + "step": 2857 + }, + { + "epoch": 2.04, + "grad_norm": 18.820500247221656, + "learning_rate": 5.082365538987765e-06, + "loss": 0.3486, + "step": 2858 + }, + { + "epoch": 2.04, + "grad_norm": 10.963174706002892, + "learning_rate": 5.079475767920289e-06, + "loss": 0.2498, + "step": 2859 + }, + { + "epoch": 2.04, + "grad_norm": 15.924691223131479, + "learning_rate": 5.076585970298481e-06, + "loss": 0.3442, + "step": 2860 + }, + { + "epoch": 2.04, + "grad_norm": 10.618086517328626, + "learning_rate": 5.073696147087878e-06, + "loss": 0.2406, + "step": 2861 + }, + { + "epoch": 2.04, + "grad_norm": 11.971342596280728, + "learning_rate": 5.070806299254023e-06, + "loss": 0.207, + "step": 2862 + }, + { + "epoch": 2.04, + "grad_norm": 13.74822806936663, + "learning_rate": 5.067916427762466e-06, + "loss": 0.2971, + "step": 2863 + }, + { + "epoch": 2.04, + "grad_norm": 14.841852154556609, + "learning_rate": 5.0650265335787685e-06, + "loss": 0.3123, + "step": 2864 + }, + { + "epoch": 2.04, + "grad_norm": 12.028441837620969, + "learning_rate": 5.062136617668497e-06, + "loss": 0.2405, + "step": 2865 + }, + { + "epoch": 2.05, + "grad_norm": 10.77779210531338, + "learning_rate": 5.059246680997228e-06, + "loss": 0.2537, + "step": 2866 + }, + { + "epoch": 2.05, + "grad_norm": 16.536986749042004, + "learning_rate": 5.05635672453054e-06, + "loss": 0.3887, + "step": 2867 + }, + { + "epoch": 2.05, + "grad_norm": 12.088559869194436, + "learning_rate": 5.053466749234023e-06, + "loss": 0.2421, + "step": 2868 + }, + { + "epoch": 2.05, + "grad_norm": 17.184815205436934, + "learning_rate": 5.050576756073272e-06, + "loss": 0.3806, + "step": 2869 + }, + { + "epoch": 2.05, + "grad_norm": 9.23293466912918, + "learning_rate": 5.047686746013888e-06, + "loss": 0.2493, + "step": 2870 + }, + { + "epoch": 2.05, + "grad_norm": 11.369014628236714, + "learning_rate": 5.044796720021474e-06, + "loss": 0.2585, + "step": 2871 + }, + { + "epoch": 2.05, + "grad_norm": 14.817280948904253, + "learning_rate": 5.041906679061643e-06, + "loss": 0.2686, + "step": 2872 + }, + { + "epoch": 2.05, + "grad_norm": 14.400824784637889, + "learning_rate": 5.039016624100013e-06, + "loss": 0.3796, + "step": 2873 + }, + { + "epoch": 2.05, + "grad_norm": 14.411506271031973, + "learning_rate": 5.036126556102202e-06, + "loss": 0.2939, + "step": 2874 + }, + { + "epoch": 2.05, + "grad_norm": 8.73804711630787, + "learning_rate": 5.033236476033838e-06, + "loss": 0.2456, + "step": 2875 + }, + { + "epoch": 2.05, + "grad_norm": 8.075734103259556, + "learning_rate": 5.0303463848605495e-06, + "loss": 0.2654, + "step": 2876 + }, + { + "epoch": 2.05, + "grad_norm": 21.79768975539953, + "learning_rate": 5.027456283547969e-06, + "loss": 0.2686, + "step": 2877 + }, + { + "epoch": 2.05, + "grad_norm": 10.816952929731688, + "learning_rate": 5.0245661730617344e-06, + "loss": 0.2668, + "step": 2878 + }, + { + "epoch": 2.05, + "grad_norm": 15.601781932049883, + "learning_rate": 5.0216760543674855e-06, + "loss": 0.28, + "step": 2879 + }, + { + "epoch": 2.06, + "grad_norm": 15.279934779358646, + "learning_rate": 5.0187859284308635e-06, + "loss": 0.3567, + "step": 2880 + }, + { + "epoch": 2.06, + "grad_norm": 11.09264143614305, + "learning_rate": 5.015895796217514e-06, + "loss": 0.2632, + "step": 2881 + }, + { + "epoch": 2.06, + "grad_norm": 6.72906261801957, + "learning_rate": 5.013005658693083e-06, + "loss": 0.2432, + "step": 2882 + }, + { + "epoch": 2.06, + "grad_norm": 14.904243447527767, + "learning_rate": 5.01011551682322e-06, + "loss": 0.3123, + "step": 2883 + }, + { + "epoch": 2.06, + "grad_norm": 9.56158421894508, + "learning_rate": 5.007225371573573e-06, + "loss": 0.2267, + "step": 2884 + }, + { + "epoch": 2.06, + "grad_norm": 13.556632021145733, + "learning_rate": 5.004335223909797e-06, + "loss": 0.3115, + "step": 2885 + }, + { + "epoch": 2.06, + "grad_norm": 9.697069853449166, + "learning_rate": 5.0014450747975416e-06, + "loss": 0.2358, + "step": 2886 + }, + { + "epoch": 2.06, + "grad_norm": 10.361640909885745, + "learning_rate": 4.998554925202459e-06, + "loss": 0.2517, + "step": 2887 + }, + { + "epoch": 2.06, + "grad_norm": 18.22972936773402, + "learning_rate": 4.995664776090204e-06, + "loss": 0.4097, + "step": 2888 + }, + { + "epoch": 2.06, + "grad_norm": 9.143973237259363, + "learning_rate": 4.9927746284264275e-06, + "loss": 0.2427, + "step": 2889 + }, + { + "epoch": 2.06, + "grad_norm": 7.457619103227681, + "learning_rate": 4.9898844831767826e-06, + "loss": 0.2324, + "step": 2890 + }, + { + "epoch": 2.06, + "grad_norm": 11.882434995590293, + "learning_rate": 4.98699434130692e-06, + "loss": 0.2408, + "step": 2891 + }, + { + "epoch": 2.06, + "grad_norm": 12.593705192329265, + "learning_rate": 4.984104203782488e-06, + "loss": 0.3152, + "step": 2892 + }, + { + "epoch": 2.06, + "grad_norm": 7.4272420996640856, + "learning_rate": 4.981214071569139e-06, + "loss": 0.2415, + "step": 2893 + }, + { + "epoch": 2.07, + "grad_norm": 8.71270569735412, + "learning_rate": 4.978323945632515e-06, + "loss": 0.2395, + "step": 2894 + }, + { + "epoch": 2.07, + "grad_norm": 10.869809390534899, + "learning_rate": 4.975433826938267e-06, + "loss": 0.2932, + "step": 2895 + }, + { + "epoch": 2.07, + "grad_norm": 12.64672924292059, + "learning_rate": 4.972543716452031e-06, + "loss": 0.2837, + "step": 2896 + }, + { + "epoch": 2.07, + "grad_norm": 11.846800699280617, + "learning_rate": 4.969653615139452e-06, + "loss": 0.2664, + "step": 2897 + }, + { + "epoch": 2.07, + "grad_norm": 20.340053708726927, + "learning_rate": 4.966763523966163e-06, + "loss": 0.4248, + "step": 2898 + }, + { + "epoch": 2.07, + "grad_norm": 8.986583690207297, + "learning_rate": 4.963873443897799e-06, + "loss": 0.2932, + "step": 2899 + }, + { + "epoch": 2.07, + "grad_norm": 8.877813445906238, + "learning_rate": 4.96098337589999e-06, + "loss": 0.2527, + "step": 2900 + }, + { + "epoch": 2.07, + "grad_norm": 8.751716315394958, + "learning_rate": 4.958093320938358e-06, + "loss": 0.2856, + "step": 2901 + }, + { + "epoch": 2.07, + "grad_norm": 10.259213554000896, + "learning_rate": 4.955203279978529e-06, + "loss": 0.312, + "step": 2902 + }, + { + "epoch": 2.07, + "grad_norm": 10.869350826007125, + "learning_rate": 4.952313253986114e-06, + "loss": 0.302, + "step": 2903 + }, + { + "epoch": 2.07, + "grad_norm": 10.818949638241829, + "learning_rate": 4.9494232439267296e-06, + "loss": 0.2224, + "step": 2904 + }, + { + "epoch": 2.07, + "grad_norm": 9.567323569857239, + "learning_rate": 4.946533250765977e-06, + "loss": 0.2588, + "step": 2905 + }, + { + "epoch": 2.07, + "grad_norm": 17.916886883895785, + "learning_rate": 4.943643275469461e-06, + "loss": 0.2678, + "step": 2906 + }, + { + "epoch": 2.07, + "grad_norm": 8.444466859746376, + "learning_rate": 4.940753319002773e-06, + "loss": 0.2598, + "step": 2907 + }, + { + "epoch": 2.08, + "grad_norm": 12.782074101523976, + "learning_rate": 4.937863382331504e-06, + "loss": 0.3218, + "step": 2908 + }, + { + "epoch": 2.08, + "grad_norm": 11.596863218172778, + "learning_rate": 4.934973466421234e-06, + "loss": 0.3345, + "step": 2909 + }, + { + "epoch": 2.08, + "grad_norm": 8.650320442975373, + "learning_rate": 4.932083572237535e-06, + "loss": 0.2859, + "step": 2910 + }, + { + "epoch": 2.08, + "grad_norm": 10.743783621991485, + "learning_rate": 4.92919370074598e-06, + "loss": 0.2761, + "step": 2911 + }, + { + "epoch": 2.08, + "grad_norm": 14.118954150902352, + "learning_rate": 4.926303852912123e-06, + "loss": 0.3367, + "step": 2912 + }, + { + "epoch": 2.08, + "grad_norm": 11.571991535980812, + "learning_rate": 4.9234140297015204e-06, + "loss": 0.2288, + "step": 2913 + }, + { + "epoch": 2.08, + "grad_norm": 14.628440666155255, + "learning_rate": 4.920524232079712e-06, + "loss": 0.3997, + "step": 2914 + }, + { + "epoch": 2.08, + "grad_norm": 17.12851494373273, + "learning_rate": 4.917634461012238e-06, + "loss": 0.3601, + "step": 2915 + }, + { + "epoch": 2.08, + "grad_norm": 12.147043201032695, + "learning_rate": 4.914744717464617e-06, + "loss": 0.3708, + "step": 2916 + }, + { + "epoch": 2.08, + "grad_norm": 16.657303874351186, + "learning_rate": 4.911855002402375e-06, + "loss": 0.3149, + "step": 2917 + }, + { + "epoch": 2.08, + "grad_norm": 6.918834616338623, + "learning_rate": 4.908965316791014e-06, + "loss": 0.1868, + "step": 2918 + }, + { + "epoch": 2.08, + "grad_norm": 7.338228828504006, + "learning_rate": 4.906075661596031e-06, + "loss": 0.2456, + "step": 2919 + }, + { + "epoch": 2.08, + "grad_norm": 9.717111899061246, + "learning_rate": 4.903186037782917e-06, + "loss": 0.2471, + "step": 2920 + }, + { + "epoch": 2.08, + "grad_norm": 12.048131071947918, + "learning_rate": 4.900296446317146e-06, + "loss": 0.2698, + "step": 2921 + }, + { + "epoch": 2.09, + "grad_norm": 16.548570881430447, + "learning_rate": 4.897406888164187e-06, + "loss": 0.3094, + "step": 2922 + }, + { + "epoch": 2.09, + "grad_norm": 14.795066171226647, + "learning_rate": 4.8945173642894915e-06, + "loss": 0.2437, + "step": 2923 + }, + { + "epoch": 2.09, + "grad_norm": 11.874111871990417, + "learning_rate": 4.8916278756585074e-06, + "loss": 0.2549, + "step": 2924 + }, + { + "epoch": 2.09, + "grad_norm": 10.369183327295028, + "learning_rate": 4.888738423236664e-06, + "loss": 0.2812, + "step": 2925 + }, + { + "epoch": 2.09, + "grad_norm": 15.705046693449054, + "learning_rate": 4.88584900798938e-06, + "loss": 0.2434, + "step": 2926 + }, + { + "epoch": 2.09, + "grad_norm": 17.69346857544719, + "learning_rate": 4.882959630882066e-06, + "loss": 0.2947, + "step": 2927 + }, + { + "epoch": 2.09, + "grad_norm": 8.467959400291866, + "learning_rate": 4.8800702928801124e-06, + "loss": 0.2712, + "step": 2928 + }, + { + "epoch": 2.09, + "grad_norm": 21.203239862503867, + "learning_rate": 4.8771809949489056e-06, + "loss": 0.3223, + "step": 2929 + }, + { + "epoch": 2.09, + "grad_norm": 21.718444322534985, + "learning_rate": 4.874291738053809e-06, + "loss": 0.479, + "step": 2930 + }, + { + "epoch": 2.09, + "grad_norm": 9.328069951779979, + "learning_rate": 4.871402523160181e-06, + "loss": 0.2515, + "step": 2931 + }, + { + "epoch": 2.09, + "grad_norm": 12.698570030098262, + "learning_rate": 4.868513351233359e-06, + "loss": 0.3232, + "step": 2932 + }, + { + "epoch": 2.09, + "grad_norm": 12.399731299784895, + "learning_rate": 4.865624223238672e-06, + "loss": 0.3057, + "step": 2933 + }, + { + "epoch": 2.09, + "grad_norm": 14.15982457143095, + "learning_rate": 4.862735140141428e-06, + "loss": 0.3362, + "step": 2934 + }, + { + "epoch": 2.09, + "grad_norm": 12.120995256901526, + "learning_rate": 4.859846102906927e-06, + "loss": 0.2119, + "step": 2935 + }, + { + "epoch": 2.1, + "grad_norm": 11.790485443872855, + "learning_rate": 4.856957112500446e-06, + "loss": 0.26, + "step": 2936 + }, + { + "epoch": 2.1, + "grad_norm": 9.804955003772951, + "learning_rate": 4.854068169887254e-06, + "loss": 0.2522, + "step": 2937 + }, + { + "epoch": 2.1, + "grad_norm": 9.67276784524615, + "learning_rate": 4.851179276032598e-06, + "loss": 0.2461, + "step": 2938 + }, + { + "epoch": 2.1, + "grad_norm": 13.029967703322102, + "learning_rate": 4.848290431901712e-06, + "loss": 0.2825, + "step": 2939 + }, + { + "epoch": 2.1, + "grad_norm": 13.701298043559392, + "learning_rate": 4.845401638459813e-06, + "loss": 0.3257, + "step": 2940 + }, + { + "epoch": 2.1, + "grad_norm": 24.527517306096676, + "learning_rate": 4.8425128966721e-06, + "loss": 0.3032, + "step": 2941 + }, + { + "epoch": 2.1, + "grad_norm": 10.060243434857231, + "learning_rate": 4.8396242075037555e-06, + "loss": 0.2424, + "step": 2942 + }, + { + "epoch": 2.1, + "grad_norm": 11.110048308538667, + "learning_rate": 4.836735571919946e-06, + "loss": 0.2317, + "step": 2943 + }, + { + "epoch": 2.1, + "grad_norm": 13.270855038860887, + "learning_rate": 4.833846990885813e-06, + "loss": 0.2925, + "step": 2944 + }, + { + "epoch": 2.1, + "grad_norm": 13.546059086949068, + "learning_rate": 4.830958465366492e-06, + "loss": 0.3115, + "step": 2945 + }, + { + "epoch": 2.1, + "grad_norm": 15.147060376074828, + "learning_rate": 4.828069996327088e-06, + "loss": 0.3071, + "step": 2946 + }, + { + "epoch": 2.1, + "grad_norm": 21.89733878398577, + "learning_rate": 4.825181584732695e-06, + "loss": 0.354, + "step": 2947 + }, + { + "epoch": 2.1, + "grad_norm": 12.098924759848195, + "learning_rate": 4.822293231548382e-06, + "loss": 0.3223, + "step": 2948 + }, + { + "epoch": 2.1, + "grad_norm": 14.59459222498052, + "learning_rate": 4.819404937739205e-06, + "loss": 0.2988, + "step": 2949 + }, + { + "epoch": 2.11, + "grad_norm": 7.9228442130931, + "learning_rate": 4.816516704270194e-06, + "loss": 0.1917, + "step": 2950 + }, + { + "epoch": 2.11, + "grad_norm": 11.304262104161308, + "learning_rate": 4.813628532106363e-06, + "loss": 0.2844, + "step": 2951 + }, + { + "epoch": 2.11, + "grad_norm": 10.91478464087612, + "learning_rate": 4.810740422212705e-06, + "loss": 0.2534, + "step": 2952 + }, + { + "epoch": 2.11, + "grad_norm": 13.369008662112533, + "learning_rate": 4.807852375554188e-06, + "loss": 0.248, + "step": 2953 + }, + { + "epoch": 2.11, + "grad_norm": 9.194937078896238, + "learning_rate": 4.804964393095765e-06, + "loss": 0.2935, + "step": 2954 + }, + { + "epoch": 2.11, + "grad_norm": 11.543922616402899, + "learning_rate": 4.802076475802362e-06, + "loss": 0.2605, + "step": 2955 + }, + { + "epoch": 2.11, + "grad_norm": 10.955492140429202, + "learning_rate": 4.799188624638889e-06, + "loss": 0.207, + "step": 2956 + }, + { + "epoch": 2.11, + "grad_norm": 11.777530819834624, + "learning_rate": 4.796300840570227e-06, + "loss": 0.2734, + "step": 2957 + }, + { + "epoch": 2.11, + "grad_norm": 11.600327023654208, + "learning_rate": 4.793413124561243e-06, + "loss": 0.2236, + "step": 2958 + }, + { + "epoch": 2.11, + "grad_norm": 9.515292072213855, + "learning_rate": 4.790525477576773e-06, + "loss": 0.2729, + "step": 2959 + }, + { + "epoch": 2.11, + "grad_norm": 9.024728170748826, + "learning_rate": 4.7876379005816325e-06, + "loss": 0.192, + "step": 2960 + }, + { + "epoch": 2.11, + "grad_norm": 8.964190783912358, + "learning_rate": 4.784750394540619e-06, + "loss": 0.2013, + "step": 2961 + }, + { + "epoch": 2.11, + "grad_norm": 10.784790275659192, + "learning_rate": 4.781862960418498e-06, + "loss": 0.2595, + "step": 2962 + }, + { + "epoch": 2.11, + "grad_norm": 21.133720980277968, + "learning_rate": 4.778975599180019e-06, + "loss": 0.3374, + "step": 2963 + }, + { + "epoch": 2.12, + "grad_norm": 14.406518179067787, + "learning_rate": 4.776088311789897e-06, + "loss": 0.2964, + "step": 2964 + }, + { + "epoch": 2.12, + "grad_norm": 8.188477238179528, + "learning_rate": 4.773201099212835e-06, + "loss": 0.1699, + "step": 2965 + }, + { + "epoch": 2.12, + "grad_norm": 9.483178808062954, + "learning_rate": 4.770313962413499e-06, + "loss": 0.2429, + "step": 2966 + }, + { + "epoch": 2.12, + "grad_norm": 11.54657457090472, + "learning_rate": 4.767426902356539e-06, + "loss": 0.2224, + "step": 2967 + }, + { + "epoch": 2.12, + "grad_norm": 14.491438595730546, + "learning_rate": 4.7645399200065745e-06, + "loss": 0.3599, + "step": 2968 + }, + { + "epoch": 2.12, + "grad_norm": 10.84370673100882, + "learning_rate": 4.761653016328197e-06, + "loss": 0.2827, + "step": 2969 + }, + { + "epoch": 2.12, + "grad_norm": 9.737199358136072, + "learning_rate": 4.758766192285979e-06, + "loss": 0.1892, + "step": 2970 + }, + { + "epoch": 2.12, + "grad_norm": 15.368842569814777, + "learning_rate": 4.755879448844458e-06, + "loss": 0.3108, + "step": 2971 + }, + { + "epoch": 2.12, + "grad_norm": 11.722321883118072, + "learning_rate": 4.752992786968153e-06, + "loss": 0.2773, + "step": 2972 + }, + { + "epoch": 2.12, + "grad_norm": 14.416553515826045, + "learning_rate": 4.750106207621546e-06, + "loss": 0.2751, + "step": 2973 + }, + { + "epoch": 2.12, + "grad_norm": 13.678335643158764, + "learning_rate": 4.747219711769103e-06, + "loss": 0.2622, + "step": 2974 + }, + { + "epoch": 2.12, + "grad_norm": 14.178387254968198, + "learning_rate": 4.74433330037525e-06, + "loss": 0.2866, + "step": 2975 + }, + { + "epoch": 2.12, + "grad_norm": 8.722021862525153, + "learning_rate": 4.741446974404396e-06, + "loss": 0.2549, + "step": 2976 + }, + { + "epoch": 2.12, + "grad_norm": 8.939812320748437, + "learning_rate": 4.738560734820914e-06, + "loss": 0.2354, + "step": 2977 + }, + { + "epoch": 2.13, + "grad_norm": 10.457539144367189, + "learning_rate": 4.735674582589147e-06, + "loss": 0.2371, + "step": 2978 + }, + { + "epoch": 2.13, + "grad_norm": 11.031971470356236, + "learning_rate": 4.732788518673418e-06, + "loss": 0.283, + "step": 2979 + }, + { + "epoch": 2.13, + "grad_norm": 13.290069870349917, + "learning_rate": 4.729902544038009e-06, + "loss": 0.262, + "step": 2980 + }, + { + "epoch": 2.13, + "grad_norm": 9.646079051872375, + "learning_rate": 4.7270166596471825e-06, + "loss": 0.2522, + "step": 2981 + }, + { + "epoch": 2.13, + "grad_norm": 9.643876500840022, + "learning_rate": 4.724130866465163e-06, + "loss": 0.26, + "step": 2982 + }, + { + "epoch": 2.13, + "grad_norm": 10.46389033776257, + "learning_rate": 4.721245165456149e-06, + "loss": 0.2505, + "step": 2983 + }, + { + "epoch": 2.13, + "grad_norm": 10.041824511182282, + "learning_rate": 4.7183595575843055e-06, + "loss": 0.2754, + "step": 2984 + }, + { + "epoch": 2.13, + "grad_norm": 9.441670864552433, + "learning_rate": 4.715474043813771e-06, + "loss": 0.2422, + "step": 2985 + }, + { + "epoch": 2.13, + "grad_norm": 13.272432898291353, + "learning_rate": 4.712588625108645e-06, + "loss": 0.2637, + "step": 2986 + }, + { + "epoch": 2.13, + "grad_norm": 13.77655861565552, + "learning_rate": 4.709703302433003e-06, + "loss": 0.2734, + "step": 2987 + }, + { + "epoch": 2.13, + "grad_norm": 12.616444934585852, + "learning_rate": 4.706818076750883e-06, + "loss": 0.3889, + "step": 2988 + }, + { + "epoch": 2.13, + "grad_norm": 13.01033899307128, + "learning_rate": 4.703932949026291e-06, + "loss": 0.2466, + "step": 2989 + }, + { + "epoch": 2.13, + "grad_norm": 11.858289061563696, + "learning_rate": 4.701047920223207e-06, + "loss": 0.293, + "step": 2990 + }, + { + "epoch": 2.13, + "grad_norm": 16.866643961827666, + "learning_rate": 4.6981629913055674e-06, + "loss": 0.3655, + "step": 2991 + }, + { + "epoch": 2.14, + "grad_norm": 12.132085368005846, + "learning_rate": 4.695278163237284e-06, + "loss": 0.1934, + "step": 2992 + }, + { + "epoch": 2.14, + "grad_norm": 15.039198152601585, + "learning_rate": 4.692393436982229e-06, + "loss": 0.23, + "step": 2993 + }, + { + "epoch": 2.14, + "grad_norm": 11.3107040441794, + "learning_rate": 4.689508813504246e-06, + "loss": 0.2344, + "step": 2994 + }, + { + "epoch": 2.14, + "grad_norm": 8.629823411245509, + "learning_rate": 4.686624293767138e-06, + "loss": 0.2749, + "step": 2995 + }, + { + "epoch": 2.14, + "grad_norm": 15.154888487363252, + "learning_rate": 4.683739878734678e-06, + "loss": 0.2816, + "step": 2996 + }, + { + "epoch": 2.14, + "grad_norm": 11.03652875316943, + "learning_rate": 4.6808555693706045e-06, + "loss": 0.2156, + "step": 2997 + }, + { + "epoch": 2.14, + "grad_norm": 11.487451590380616, + "learning_rate": 4.677971366638616e-06, + "loss": 0.2493, + "step": 2998 + }, + { + "epoch": 2.14, + "grad_norm": 9.49534880905934, + "learning_rate": 4.67508727150238e-06, + "loss": 0.25, + "step": 2999 + }, + { + "epoch": 2.14, + "grad_norm": 7.968859660999014, + "learning_rate": 4.672203284925525e-06, + "loss": 0.2004, + "step": 3000 + }, + { + "epoch": 2.14, + "eval_avg_AUC": 0.7804411891516083, + "eval_avg_Accuracy": 0.6809101458885941, + "eval_avg_Accuracy-right": 0.8848963088561367, + "eval_avg_Accuracy-wrong": 0.32522174209688426, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6521728732768947, + "eval_last_AUC": 0.798815075101747, + "eval_last_Accuracy": 0.7263759946949602, + "eval_last_Accuracy-right": 0.8329855223685927, + "eval_last_Accuracy-wrong": 0.5404821469183534, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6680679021503793, + "eval_max_AUC": 0.7486281620568513, + "eval_max_Accuracy": 0.6368534482758621, + "eval_max_Accuracy-right": 0.9711751662971175, + "eval_max_Accuracy-wrong": 0.05390038662724585, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.60539630409024, + "eval_min_AUC": 0.7853322845511744, + "eval_min_Accuracy": 0.7209051724137931, + "eval_min_Accuracy-right": 0.7549889135254989, + "eval_min_Accuracy-wrong": 0.6614737320900614, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6588892732875934, + "eval_prod_AUC": 0.7847266249422049, + "eval_prod_Accuracy": 0.706440649867374, + "eval_prod_Accuracy-right": 0.6447763140733012, + "eval_prod_Accuracy-wrong": 0.8139640664089152, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6510440752440944, + "eval_runtime": 252.474, + "eval_samples_per_second": 95.566, + "eval_steps_per_second": 2.986, + "eval_sum_AUC": 0.6452340173243202, + "eval_sum_Accuracy": 0.6382211538461539, + "eval_sum_Accuracy-right": 0.9940654754141124, + "eval_sum_Accuracy-wrong": 0.01773936775073914, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6368181458868507, + "step": 3000 + }, + { + "epoch": 2.14, + "grad_norm": 9.719355683834616, + "learning_rate": 4.669319407871647e-06, + "loss": 0.2515, + "step": 3001 + }, + { + "epoch": 2.14, + "grad_norm": 14.170964090776609, + "learning_rate": 4.666435641304301e-06, + "loss": 0.2522, + "step": 3002 + }, + { + "epoch": 2.14, + "grad_norm": 9.923468867083184, + "learning_rate": 4.663551986187006e-06, + "loss": 0.2493, + "step": 3003 + }, + { + "epoch": 2.14, + "grad_norm": 11.30112917716831, + "learning_rate": 4.660668443483248e-06, + "loss": 0.3486, + "step": 3004 + }, + { + "epoch": 2.14, + "grad_norm": 10.14288841228628, + "learning_rate": 4.657785014156468e-06, + "loss": 0.2489, + "step": 3005 + }, + { + "epoch": 2.15, + "grad_norm": 11.713009972335938, + "learning_rate": 4.654901699170077e-06, + "loss": 0.3506, + "step": 3006 + }, + { + "epoch": 2.15, + "grad_norm": 14.161879396772502, + "learning_rate": 4.652018499487442e-06, + "loss": 0.2977, + "step": 3007 + }, + { + "epoch": 2.15, + "grad_norm": 12.191379054117588, + "learning_rate": 4.649135416071896e-06, + "loss": 0.2581, + "step": 3008 + }, + { + "epoch": 2.15, + "grad_norm": 20.471445742521716, + "learning_rate": 4.646252449886727e-06, + "loss": 0.3132, + "step": 3009 + }, + { + "epoch": 2.15, + "grad_norm": 14.975686497998877, + "learning_rate": 4.6433696018951915e-06, + "loss": 0.3794, + "step": 3010 + }, + { + "epoch": 2.15, + "grad_norm": 16.4897021990898, + "learning_rate": 4.640486873060501e-06, + "loss": 0.3147, + "step": 3011 + }, + { + "epoch": 2.15, + "grad_norm": 9.363266401118915, + "learning_rate": 4.6376042643458254e-06, + "loss": 0.2346, + "step": 3012 + }, + { + "epoch": 2.15, + "grad_norm": 9.753203788882743, + "learning_rate": 4.634721776714305e-06, + "loss": 0.2937, + "step": 3013 + }, + { + "epoch": 2.15, + "grad_norm": 13.363526616262392, + "learning_rate": 4.631839411129025e-06, + "loss": 0.3018, + "step": 3014 + }, + { + "epoch": 2.15, + "grad_norm": 11.110221054974863, + "learning_rate": 4.628957168553044e-06, + "loss": 0.2727, + "step": 3015 + }, + { + "epoch": 2.15, + "grad_norm": 16.632277373728602, + "learning_rate": 4.6260750499493665e-06, + "loss": 0.4019, + "step": 3016 + }, + { + "epoch": 2.15, + "grad_norm": 14.087285690858906, + "learning_rate": 4.623193056280968e-06, + "loss": 0.3335, + "step": 3017 + }, + { + "epoch": 2.15, + "grad_norm": 8.025810609472378, + "learning_rate": 4.6203111885107735e-06, + "loss": 0.2375, + "step": 3018 + }, + { + "epoch": 2.15, + "grad_norm": 11.067682340287535, + "learning_rate": 4.617429447601665e-06, + "loss": 0.2476, + "step": 3019 + }, + { + "epoch": 2.16, + "grad_norm": 8.772321631454632, + "learning_rate": 4.614547834516492e-06, + "loss": 0.2551, + "step": 3020 + }, + { + "epoch": 2.16, + "grad_norm": 16.09704435421464, + "learning_rate": 4.6116663502180495e-06, + "loss": 0.3059, + "step": 3021 + }, + { + "epoch": 2.16, + "grad_norm": 8.204814449759008, + "learning_rate": 4.6087849956691e-06, + "loss": 0.2561, + "step": 3022 + }, + { + "epoch": 2.16, + "grad_norm": 11.234181879339191, + "learning_rate": 4.605903771832353e-06, + "loss": 0.2791, + "step": 3023 + }, + { + "epoch": 2.16, + "grad_norm": 12.496146964582882, + "learning_rate": 4.603022679670482e-06, + "loss": 0.2939, + "step": 3024 + }, + { + "epoch": 2.16, + "grad_norm": 20.489599009018622, + "learning_rate": 4.6001417201461114e-06, + "loss": 0.2979, + "step": 3025 + }, + { + "epoch": 2.16, + "grad_norm": 12.091464787515235, + "learning_rate": 4.597260894221826e-06, + "loss": 0.3162, + "step": 3026 + }, + { + "epoch": 2.16, + "grad_norm": 16.839579410632815, + "learning_rate": 4.594380202860162e-06, + "loss": 0.3357, + "step": 3027 + }, + { + "epoch": 2.16, + "grad_norm": 10.013418973388266, + "learning_rate": 4.5914996470236094e-06, + "loss": 0.3015, + "step": 3028 + }, + { + "epoch": 2.16, + "grad_norm": 12.220647672670827, + "learning_rate": 4.588619227674619e-06, + "loss": 0.2327, + "step": 3029 + }, + { + "epoch": 2.16, + "grad_norm": 11.062502514019542, + "learning_rate": 4.58573894577559e-06, + "loss": 0.322, + "step": 3030 + }, + { + "epoch": 2.16, + "grad_norm": 10.599524075966587, + "learning_rate": 4.5828588022888815e-06, + "loss": 0.2974, + "step": 3031 + }, + { + "epoch": 2.16, + "grad_norm": 11.933286676593458, + "learning_rate": 4.5799787981767975e-06, + "loss": 0.2861, + "step": 3032 + }, + { + "epoch": 2.16, + "grad_norm": 10.79407778155579, + "learning_rate": 4.577098934401607e-06, + "loss": 0.2773, + "step": 3033 + }, + { + "epoch": 2.17, + "grad_norm": 12.25668831567693, + "learning_rate": 4.57421921192552e-06, + "loss": 0.2288, + "step": 3034 + }, + { + "epoch": 2.17, + "grad_norm": 8.332016300630848, + "learning_rate": 4.5713396317107115e-06, + "loss": 0.2136, + "step": 3035 + }, + { + "epoch": 2.17, + "grad_norm": 8.56505719334519, + "learning_rate": 4.568460194719299e-06, + "loss": 0.1902, + "step": 3036 + }, + { + "epoch": 2.17, + "grad_norm": 9.736750372255026, + "learning_rate": 4.565580901913356e-06, + "loss": 0.2759, + "step": 3037 + }, + { + "epoch": 2.17, + "grad_norm": 13.30006734127021, + "learning_rate": 4.562701754254909e-06, + "loss": 0.29, + "step": 3038 + }, + { + "epoch": 2.17, + "grad_norm": 12.077457240517598, + "learning_rate": 4.559822752705933e-06, + "loss": 0.3496, + "step": 3039 + }, + { + "epoch": 2.17, + "grad_norm": 8.698125787141345, + "learning_rate": 4.556943898228358e-06, + "loss": 0.2375, + "step": 3040 + }, + { + "epoch": 2.17, + "grad_norm": 20.038157371173256, + "learning_rate": 4.55406519178406e-06, + "loss": 0.2932, + "step": 3041 + }, + { + "epoch": 2.17, + "grad_norm": 10.500307715120192, + "learning_rate": 4.551186634334873e-06, + "loss": 0.2507, + "step": 3042 + }, + { + "epoch": 2.17, + "grad_norm": 17.239595721895437, + "learning_rate": 4.54830822684257e-06, + "loss": 0.2397, + "step": 3043 + }, + { + "epoch": 2.17, + "grad_norm": 26.518571980335143, + "learning_rate": 4.545429970268888e-06, + "loss": 0.5195, + "step": 3044 + }, + { + "epoch": 2.17, + "grad_norm": 21.63226226043342, + "learning_rate": 4.542551865575499e-06, + "loss": 0.314, + "step": 3045 + }, + { + "epoch": 2.17, + "grad_norm": 10.077102386147187, + "learning_rate": 4.539673913724037e-06, + "loss": 0.2119, + "step": 3046 + }, + { + "epoch": 2.17, + "grad_norm": 10.394200369375191, + "learning_rate": 4.5367961156760745e-06, + "loss": 0.3079, + "step": 3047 + }, + { + "epoch": 2.18, + "grad_norm": 29.417594716851383, + "learning_rate": 4.533918472393141e-06, + "loss": 0.5259, + "step": 3048 + }, + { + "epoch": 2.18, + "grad_norm": 11.457438246959063, + "learning_rate": 4.531040984836708e-06, + "loss": 0.2346, + "step": 3049 + }, + { + "epoch": 2.18, + "grad_norm": 10.204113553765206, + "learning_rate": 4.5281636539682e-06, + "loss": 0.3069, + "step": 3050 + }, + { + "epoch": 2.18, + "grad_norm": 8.049228198201712, + "learning_rate": 4.5252864807489836e-06, + "loss": 0.2656, + "step": 3051 + }, + { + "epoch": 2.18, + "grad_norm": 10.606648655895928, + "learning_rate": 4.522409466140379e-06, + "loss": 0.2593, + "step": 3052 + }, + { + "epoch": 2.18, + "grad_norm": 15.556356018045541, + "learning_rate": 4.5195326111036475e-06, + "loss": 0.3882, + "step": 3053 + }, + { + "epoch": 2.18, + "grad_norm": 11.99828379110033, + "learning_rate": 4.5166559166000035e-06, + "loss": 0.3203, + "step": 3054 + }, + { + "epoch": 2.18, + "grad_norm": 22.263085046077336, + "learning_rate": 4.513779383590599e-06, + "loss": 0.3269, + "step": 3055 + }, + { + "epoch": 2.18, + "grad_norm": 9.933203567646347, + "learning_rate": 4.510903013036542e-06, + "loss": 0.3042, + "step": 3056 + }, + { + "epoch": 2.18, + "grad_norm": 8.491586252287485, + "learning_rate": 4.508026805898878e-06, + "loss": 0.2708, + "step": 3057 + }, + { + "epoch": 2.18, + "grad_norm": 9.372992871527948, + "learning_rate": 4.505150763138604e-06, + "loss": 0.2822, + "step": 3058 + }, + { + "epoch": 2.18, + "grad_norm": 9.457990662995527, + "learning_rate": 4.502274885716656e-06, + "loss": 0.2791, + "step": 3059 + }, + { + "epoch": 2.18, + "grad_norm": 9.701114569175404, + "learning_rate": 4.499399174593923e-06, + "loss": 0.239, + "step": 3060 + }, + { + "epoch": 2.18, + "grad_norm": 11.167254709611056, + "learning_rate": 4.496523630731229e-06, + "loss": 0.3047, + "step": 3061 + }, + { + "epoch": 2.19, + "grad_norm": 9.111502148347535, + "learning_rate": 4.493648255089347e-06, + "loss": 0.249, + "step": 3062 + }, + { + "epoch": 2.19, + "grad_norm": 9.384302535844311, + "learning_rate": 4.490773048628997e-06, + "loss": 0.2898, + "step": 3063 + }, + { + "epoch": 2.19, + "grad_norm": 9.800690735712097, + "learning_rate": 4.487898012310834e-06, + "loss": 0.2688, + "step": 3064 + }, + { + "epoch": 2.19, + "grad_norm": 17.264879768298673, + "learning_rate": 4.485023147095466e-06, + "loss": 0.5088, + "step": 3065 + }, + { + "epoch": 2.19, + "grad_norm": 10.264233050955779, + "learning_rate": 4.482148453943434e-06, + "loss": 0.3213, + "step": 3066 + }, + { + "epoch": 2.19, + "grad_norm": 12.516757746284293, + "learning_rate": 4.479273933815232e-06, + "loss": 0.3726, + "step": 3067 + }, + { + "epoch": 2.19, + "grad_norm": 20.358627634880012, + "learning_rate": 4.476399587671285e-06, + "loss": 0.4409, + "step": 3068 + }, + { + "epoch": 2.19, + "grad_norm": 14.196044087714956, + "learning_rate": 4.47352541647197e-06, + "loss": 0.2632, + "step": 3069 + }, + { + "epoch": 2.19, + "grad_norm": 10.996167525812037, + "learning_rate": 4.470651421177599e-06, + "loss": 0.2305, + "step": 3070 + }, + { + "epoch": 2.19, + "grad_norm": 8.349338044623135, + "learning_rate": 4.467777602748425e-06, + "loss": 0.1885, + "step": 3071 + }, + { + "epoch": 2.19, + "grad_norm": 13.05700812773356, + "learning_rate": 4.4649039621446495e-06, + "loss": 0.3647, + "step": 3072 + }, + { + "epoch": 2.19, + "grad_norm": 14.042404398136627, + "learning_rate": 4.462030500326403e-06, + "loss": 0.2471, + "step": 3073 + }, + { + "epoch": 2.19, + "grad_norm": 11.157726115560735, + "learning_rate": 4.459157218253769e-06, + "loss": 0.3259, + "step": 3074 + }, + { + "epoch": 2.19, + "grad_norm": 12.746593922157988, + "learning_rate": 4.456284116886758e-06, + "loss": 0.2834, + "step": 3075 + }, + { + "epoch": 2.2, + "grad_norm": 10.324680514897194, + "learning_rate": 4.453411197185334e-06, + "loss": 0.2542, + "step": 3076 + }, + { + "epoch": 2.2, + "grad_norm": 14.104263762334465, + "learning_rate": 4.450538460109384e-06, + "loss": 0.314, + "step": 3077 + }, + { + "epoch": 2.2, + "grad_norm": 10.666298882817523, + "learning_rate": 4.447665906618751e-06, + "loss": 0.2043, + "step": 3078 + }, + { + "epoch": 2.2, + "grad_norm": 10.855428365451152, + "learning_rate": 4.444793537673204e-06, + "loss": 0.2446, + "step": 3079 + }, + { + "epoch": 2.2, + "grad_norm": 12.617290032096342, + "learning_rate": 4.441921354232455e-06, + "loss": 0.3394, + "step": 3080 + }, + { + "epoch": 2.2, + "grad_norm": 15.767718908396866, + "learning_rate": 4.439049357256156e-06, + "loss": 0.2822, + "step": 3081 + }, + { + "epoch": 2.2, + "grad_norm": 16.04967813853941, + "learning_rate": 4.436177547703891e-06, + "loss": 0.2839, + "step": 3082 + }, + { + "epoch": 2.2, + "grad_norm": 16.124779669352307, + "learning_rate": 4.433305926535189e-06, + "loss": 0.3984, + "step": 3083 + }, + { + "epoch": 2.2, + "grad_norm": 12.063769370375915, + "learning_rate": 4.430434494709509e-06, + "loss": 0.2358, + "step": 3084 + }, + { + "epoch": 2.2, + "grad_norm": 7.918550212828764, + "learning_rate": 4.427563253186253e-06, + "loss": 0.1887, + "step": 3085 + }, + { + "epoch": 2.2, + "grad_norm": 5.958914847278019, + "learning_rate": 4.424692202924754e-06, + "loss": 0.1517, + "step": 3086 + }, + { + "epoch": 2.2, + "grad_norm": 7.898577580549401, + "learning_rate": 4.421821344884281e-06, + "loss": 0.2023, + "step": 3087 + }, + { + "epoch": 2.2, + "grad_norm": 9.715953044475873, + "learning_rate": 4.418950680024046e-06, + "loss": 0.2214, + "step": 3088 + }, + { + "epoch": 2.2, + "grad_norm": 9.016811447032085, + "learning_rate": 4.416080209303187e-06, + "loss": 0.2229, + "step": 3089 + }, + { + "epoch": 2.21, + "grad_norm": 11.39587616677386, + "learning_rate": 4.413209933680786e-06, + "loss": 0.2915, + "step": 3090 + }, + { + "epoch": 2.21, + "grad_norm": 13.239323594842487, + "learning_rate": 4.410339854115849e-06, + "loss": 0.3066, + "step": 3091 + }, + { + "epoch": 2.21, + "grad_norm": 11.047328873557003, + "learning_rate": 4.407469971567331e-06, + "loss": 0.3145, + "step": 3092 + }, + { + "epoch": 2.21, + "grad_norm": 12.89869317299782, + "learning_rate": 4.4046002869941055e-06, + "loss": 0.2461, + "step": 3093 + }, + { + "epoch": 2.21, + "grad_norm": 18.751841757753965, + "learning_rate": 4.401730801354994e-06, + "loss": 0.3765, + "step": 3094 + }, + { + "epoch": 2.21, + "grad_norm": 13.149602274951638, + "learning_rate": 4.39886151560874e-06, + "loss": 0.2488, + "step": 3095 + }, + { + "epoch": 2.21, + "grad_norm": 13.324113331094987, + "learning_rate": 4.395992430714028e-06, + "loss": 0.2659, + "step": 3096 + }, + { + "epoch": 2.21, + "grad_norm": 11.07732938634878, + "learning_rate": 4.393123547629472e-06, + "loss": 0.2417, + "step": 3097 + }, + { + "epoch": 2.21, + "grad_norm": 13.730825721640787, + "learning_rate": 4.390254867313619e-06, + "loss": 0.2175, + "step": 3098 + }, + { + "epoch": 2.21, + "grad_norm": 11.498964036214774, + "learning_rate": 4.387386390724947e-06, + "loss": 0.2483, + "step": 3099 + }, + { + "epoch": 2.21, + "grad_norm": 9.819080003077305, + "learning_rate": 4.38451811882187e-06, + "loss": 0.2152, + "step": 3100 + }, + { + "epoch": 2.21, + "grad_norm": 17.016314180753497, + "learning_rate": 4.3816500525627284e-06, + "loss": 0.2668, + "step": 3101 + }, + { + "epoch": 2.21, + "grad_norm": 9.737209247832773, + "learning_rate": 4.3787821929057985e-06, + "loss": 0.2485, + "step": 3102 + }, + { + "epoch": 2.21, + "grad_norm": 11.21172272431426, + "learning_rate": 4.3759145408092855e-06, + "loss": 0.2683, + "step": 3103 + }, + { + "epoch": 2.22, + "grad_norm": 13.720873434696868, + "learning_rate": 4.373047097231324e-06, + "loss": 0.281, + "step": 3104 + }, + { + "epoch": 2.22, + "grad_norm": 14.82444071744457, + "learning_rate": 4.370179863129979e-06, + "loss": 0.3105, + "step": 3105 + }, + { + "epoch": 2.22, + "grad_norm": 16.67477919882557, + "learning_rate": 4.367312839463251e-06, + "loss": 0.3049, + "step": 3106 + }, + { + "epoch": 2.22, + "grad_norm": 11.979444272208877, + "learning_rate": 4.3644460271890614e-06, + "loss": 0.2878, + "step": 3107 + }, + { + "epoch": 2.22, + "grad_norm": 11.93302818612876, + "learning_rate": 4.361579427265268e-06, + "loss": 0.2407, + "step": 3108 + }, + { + "epoch": 2.22, + "grad_norm": 12.835216618647086, + "learning_rate": 4.358713040649654e-06, + "loss": 0.2849, + "step": 3109 + }, + { + "epoch": 2.22, + "grad_norm": 8.05126265344838, + "learning_rate": 4.3558468682999336e-06, + "loss": 0.2156, + "step": 3110 + }, + { + "epoch": 2.22, + "grad_norm": 12.93526267989827, + "learning_rate": 4.352980911173747e-06, + "loss": 0.3201, + "step": 3111 + }, + { + "epoch": 2.22, + "grad_norm": 10.299964275506714, + "learning_rate": 4.350115170228664e-06, + "loss": 0.2556, + "step": 3112 + }, + { + "epoch": 2.22, + "grad_norm": 12.497945099673855, + "learning_rate": 4.3472496464221845e-06, + "loss": 0.3054, + "step": 3113 + }, + { + "epoch": 2.22, + "grad_norm": 6.59759493393717, + "learning_rate": 4.344384340711728e-06, + "loss": 0.1531, + "step": 3114 + }, + { + "epoch": 2.22, + "grad_norm": 8.031330972624689, + "learning_rate": 4.341519254054651e-06, + "loss": 0.1885, + "step": 3115 + }, + { + "epoch": 2.22, + "grad_norm": 10.543383964941674, + "learning_rate": 4.338654387408229e-06, + "loss": 0.229, + "step": 3116 + }, + { + "epoch": 2.22, + "grad_norm": 10.767553783865127, + "learning_rate": 4.335789741729671e-06, + "loss": 0.2268, + "step": 3117 + }, + { + "epoch": 2.23, + "grad_norm": 10.926314350733149, + "learning_rate": 4.332925317976104e-06, + "loss": 0.2434, + "step": 3118 + }, + { + "epoch": 2.23, + "grad_norm": 10.66200156018986, + "learning_rate": 4.330061117104589e-06, + "loss": 0.2202, + "step": 3119 + }, + { + "epoch": 2.23, + "grad_norm": 12.672090923776402, + "learning_rate": 4.327197140072108e-06, + "loss": 0.2769, + "step": 3120 + }, + { + "epoch": 2.23, + "grad_norm": 18.31880580033002, + "learning_rate": 4.324333387835565e-06, + "loss": 0.2791, + "step": 3121 + }, + { + "epoch": 2.23, + "grad_norm": 12.507999522819803, + "learning_rate": 4.321469861351799e-06, + "loss": 0.2307, + "step": 3122 + }, + { + "epoch": 2.23, + "grad_norm": 18.389475444013744, + "learning_rate": 4.318606561577562e-06, + "loss": 0.3716, + "step": 3123 + }, + { + "epoch": 2.23, + "grad_norm": 11.701663568377832, + "learning_rate": 4.31574348946954e-06, + "loss": 0.2278, + "step": 3124 + }, + { + "epoch": 2.23, + "grad_norm": 10.68598335546021, + "learning_rate": 4.312880645984334e-06, + "loss": 0.228, + "step": 3125 + }, + { + "epoch": 2.23, + "grad_norm": 9.815726426115686, + "learning_rate": 4.310018032078479e-06, + "loss": 0.2471, + "step": 3126 + }, + { + "epoch": 2.23, + "grad_norm": 19.09950822431688, + "learning_rate": 4.307155648708421e-06, + "loss": 0.3633, + "step": 3127 + }, + { + "epoch": 2.23, + "grad_norm": 18.393836294451905, + "learning_rate": 4.304293496830542e-06, + "loss": 0.4065, + "step": 3128 + }, + { + "epoch": 2.23, + "grad_norm": 14.434362345596266, + "learning_rate": 4.301431577401136e-06, + "loss": 0.377, + "step": 3129 + }, + { + "epoch": 2.23, + "grad_norm": 9.846364671226091, + "learning_rate": 4.298569891376423e-06, + "loss": 0.2998, + "step": 3130 + }, + { + "epoch": 2.23, + "grad_norm": 11.059306524932216, + "learning_rate": 4.2957084397125496e-06, + "loss": 0.3047, + "step": 3131 + }, + { + "epoch": 2.24, + "grad_norm": 10.741521017522524, + "learning_rate": 4.292847223365574e-06, + "loss": 0.2766, + "step": 3132 + }, + { + "epoch": 2.24, + "grad_norm": 13.152323226909704, + "learning_rate": 4.289986243291488e-06, + "loss": 0.2642, + "step": 3133 + }, + { + "epoch": 2.24, + "grad_norm": 12.623268724924818, + "learning_rate": 4.287125500446193e-06, + "loss": 0.248, + "step": 3134 + }, + { + "epoch": 2.24, + "grad_norm": 9.09056413640535, + "learning_rate": 4.284264995785521e-06, + "loss": 0.2935, + "step": 3135 + }, + { + "epoch": 2.24, + "grad_norm": 9.216703895780089, + "learning_rate": 4.2814047302652155e-06, + "loss": 0.24, + "step": 3136 + }, + { + "epoch": 2.24, + "grad_norm": 22.248993121674086, + "learning_rate": 4.278544704840948e-06, + "loss": 0.373, + "step": 3137 + }, + { + "epoch": 2.24, + "grad_norm": 15.845301439748043, + "learning_rate": 4.275684920468306e-06, + "loss": 0.272, + "step": 3138 + }, + { + "epoch": 2.24, + "grad_norm": 9.246046989550374, + "learning_rate": 4.272825378102791e-06, + "loss": 0.2622, + "step": 3139 + }, + { + "epoch": 2.24, + "grad_norm": 10.234508413168031, + "learning_rate": 4.269966078699836e-06, + "loss": 0.2419, + "step": 3140 + }, + { + "epoch": 2.24, + "grad_norm": 9.587106775142328, + "learning_rate": 4.267107023214782e-06, + "loss": 0.2439, + "step": 3141 + }, + { + "epoch": 2.24, + "grad_norm": 9.882611698235124, + "learning_rate": 4.264248212602896e-06, + "loss": 0.2478, + "step": 3142 + }, + { + "epoch": 2.24, + "grad_norm": 24.969669318687767, + "learning_rate": 4.261389647819355e-06, + "loss": 0.3423, + "step": 3143 + }, + { + "epoch": 2.24, + "grad_norm": 13.55812135577993, + "learning_rate": 4.258531329819264e-06, + "loss": 0.3176, + "step": 3144 + }, + { + "epoch": 2.24, + "grad_norm": 10.263892512075122, + "learning_rate": 4.255673259557636e-06, + "loss": 0.239, + "step": 3145 + }, + { + "epoch": 2.25, + "grad_norm": 13.911486938832663, + "learning_rate": 4.252815437989408e-06, + "loss": 0.3022, + "step": 3146 + }, + { + "epoch": 2.25, + "grad_norm": 14.005026801764096, + "learning_rate": 4.24995786606943e-06, + "loss": 0.377, + "step": 3147 + }, + { + "epoch": 2.25, + "grad_norm": 11.225242396822619, + "learning_rate": 4.24710054475247e-06, + "loss": 0.2725, + "step": 3148 + }, + { + "epoch": 2.25, + "grad_norm": 18.15552040543596, + "learning_rate": 4.244243474993214e-06, + "loss": 0.2903, + "step": 3149 + }, + { + "epoch": 2.25, + "grad_norm": 10.337215646854586, + "learning_rate": 4.241386657746257e-06, + "loss": 0.2544, + "step": 3150 + }, + { + "epoch": 2.25, + "grad_norm": 13.69049114869754, + "learning_rate": 4.2385300939661215e-06, + "loss": 0.3018, + "step": 3151 + }, + { + "epoch": 2.25, + "grad_norm": 12.674673511882043, + "learning_rate": 4.2356737846072326e-06, + "loss": 0.2776, + "step": 3152 + }, + { + "epoch": 2.25, + "grad_norm": 9.409534549944462, + "learning_rate": 4.232817730623941e-06, + "loss": 0.272, + "step": 3153 + }, + { + "epoch": 2.25, + "grad_norm": 7.195418655205769, + "learning_rate": 4.229961932970505e-06, + "loss": 0.1794, + "step": 3154 + }, + { + "epoch": 2.25, + "grad_norm": 14.817548675323104, + "learning_rate": 4.2271063926010995e-06, + "loss": 0.2317, + "step": 3155 + }, + { + "epoch": 2.25, + "grad_norm": 10.93728757969069, + "learning_rate": 4.224251110469814e-06, + "loss": 0.313, + "step": 3156 + }, + { + "epoch": 2.25, + "grad_norm": 9.095598064837857, + "learning_rate": 4.221396087530652e-06, + "loss": 0.251, + "step": 3157 + }, + { + "epoch": 2.25, + "grad_norm": 25.89434606145267, + "learning_rate": 4.218541324737529e-06, + "loss": 0.3334, + "step": 3158 + }, + { + "epoch": 2.25, + "grad_norm": 9.844389906885638, + "learning_rate": 4.2156868230442756e-06, + "loss": 0.2351, + "step": 3159 + }, + { + "epoch": 2.26, + "grad_norm": 18.609452101643022, + "learning_rate": 4.212832583404632e-06, + "loss": 0.3489, + "step": 3160 + }, + { + "epoch": 2.26, + "grad_norm": 20.13110023399353, + "learning_rate": 4.2099786067722535e-06, + "loss": 0.3137, + "step": 3161 + }, + { + "epoch": 2.26, + "grad_norm": 13.109251771071031, + "learning_rate": 4.207124894100707e-06, + "loss": 0.2671, + "step": 3162 + }, + { + "epoch": 2.26, + "grad_norm": 13.889889573613662, + "learning_rate": 4.2042714463434715e-06, + "loss": 0.311, + "step": 3163 + }, + { + "epoch": 2.26, + "grad_norm": 9.05359030193196, + "learning_rate": 4.201418264453935e-06, + "loss": 0.2292, + "step": 3164 + }, + { + "epoch": 2.26, + "grad_norm": 29.070179041968114, + "learning_rate": 4.198565349385402e-06, + "loss": 0.3848, + "step": 3165 + }, + { + "epoch": 2.26, + "grad_norm": 11.001684670124861, + "learning_rate": 4.195712702091079e-06, + "loss": 0.3359, + "step": 3166 + }, + { + "epoch": 2.26, + "grad_norm": 17.578718859825187, + "learning_rate": 4.192860323524094e-06, + "loss": 0.28, + "step": 3167 + }, + { + "epoch": 2.26, + "grad_norm": 16.171497366139107, + "learning_rate": 4.190008214637476e-06, + "loss": 0.2961, + "step": 3168 + }, + { + "epoch": 2.26, + "grad_norm": 15.876458793899186, + "learning_rate": 4.187156376384171e-06, + "loss": 0.2766, + "step": 3169 + }, + { + "epoch": 2.26, + "grad_norm": 11.134458157859976, + "learning_rate": 4.184304809717027e-06, + "loss": 0.311, + "step": 3170 + }, + { + "epoch": 2.26, + "grad_norm": 25.636667120923505, + "learning_rate": 4.18145351558881e-06, + "loss": 0.3188, + "step": 3171 + }, + { + "epoch": 2.26, + "grad_norm": 14.453629348290125, + "learning_rate": 4.178602494952187e-06, + "loss": 0.3232, + "step": 3172 + }, + { + "epoch": 2.26, + "grad_norm": 9.292622999034865, + "learning_rate": 4.175751748759737e-06, + "loss": 0.2307, + "step": 3173 + }, + { + "epoch": 2.27, + "grad_norm": 7.461810881057556, + "learning_rate": 4.1729012779639495e-06, + "loss": 0.1851, + "step": 3174 + }, + { + "epoch": 2.27, + "grad_norm": 10.995403715034513, + "learning_rate": 4.170051083517217e-06, + "loss": 0.3142, + "step": 3175 + }, + { + "epoch": 2.27, + "grad_norm": 28.442476871673804, + "learning_rate": 4.167201166371846e-06, + "loss": 0.3682, + "step": 3176 + }, + { + "epoch": 2.27, + "grad_norm": 18.11822897463826, + "learning_rate": 4.164351527480042e-06, + "loss": 0.2732, + "step": 3177 + }, + { + "epoch": 2.27, + "grad_norm": 16.21883279375824, + "learning_rate": 4.161502167793928e-06, + "loss": 0.3286, + "step": 3178 + }, + { + "epoch": 2.27, + "grad_norm": 27.22564232638533, + "learning_rate": 4.1586530882655226e-06, + "loss": 0.3633, + "step": 3179 + }, + { + "epoch": 2.27, + "grad_norm": 9.812336180461106, + "learning_rate": 4.155804289846762e-06, + "loss": 0.2236, + "step": 3180 + }, + { + "epoch": 2.27, + "grad_norm": 11.885754005735777, + "learning_rate": 4.152955773489479e-06, + "loss": 0.3079, + "step": 3181 + }, + { + "epoch": 2.27, + "grad_norm": 12.559628066175136, + "learning_rate": 4.150107540145413e-06, + "loss": 0.3069, + "step": 3182 + }, + { + "epoch": 2.27, + "grad_norm": 13.868335548365271, + "learning_rate": 4.147259590766219e-06, + "loss": 0.3408, + "step": 3183 + }, + { + "epoch": 2.27, + "grad_norm": 10.395868967470347, + "learning_rate": 4.144411926303442e-06, + "loss": 0.241, + "step": 3184 + }, + { + "epoch": 2.27, + "grad_norm": 8.321970943416959, + "learning_rate": 4.141564547708546e-06, + "loss": 0.2585, + "step": 3185 + }, + { + "epoch": 2.27, + "grad_norm": 16.89739295436778, + "learning_rate": 4.138717455932888e-06, + "loss": 0.3022, + "step": 3186 + }, + { + "epoch": 2.27, + "grad_norm": 9.404981314367001, + "learning_rate": 4.13587065192774e-06, + "loss": 0.2917, + "step": 3187 + }, + { + "epoch": 2.28, + "grad_norm": 17.078981032390395, + "learning_rate": 4.133024136644269e-06, + "loss": 0.2913, + "step": 3188 + }, + { + "epoch": 2.28, + "grad_norm": 8.308577953291362, + "learning_rate": 4.130177911033546e-06, + "loss": 0.2468, + "step": 3189 + }, + { + "epoch": 2.28, + "grad_norm": 9.135277981344027, + "learning_rate": 4.127331976046553e-06, + "loss": 0.2318, + "step": 3190 + }, + { + "epoch": 2.28, + "grad_norm": 8.987586352845991, + "learning_rate": 4.124486332634165e-06, + "loss": 0.3101, + "step": 3191 + }, + { + "epoch": 2.28, + "grad_norm": 12.693395458674322, + "learning_rate": 4.121640981747169e-06, + "loss": 0.2869, + "step": 3192 + }, + { + "epoch": 2.28, + "grad_norm": 10.221225064372197, + "learning_rate": 4.118795924336245e-06, + "loss": 0.2749, + "step": 3193 + }, + { + "epoch": 2.28, + "grad_norm": 9.464738717879746, + "learning_rate": 4.115951161351985e-06, + "loss": 0.2207, + "step": 3194 + }, + { + "epoch": 2.28, + "grad_norm": 21.485375562985087, + "learning_rate": 4.113106693744871e-06, + "loss": 0.3633, + "step": 3195 + }, + { + "epoch": 2.28, + "grad_norm": 13.928871090786055, + "learning_rate": 4.110262522465298e-06, + "loss": 0.3142, + "step": 3196 + }, + { + "epoch": 2.28, + "grad_norm": 12.569062534229731, + "learning_rate": 4.107418648463553e-06, + "loss": 0.2415, + "step": 3197 + }, + { + "epoch": 2.28, + "grad_norm": 19.556704268124058, + "learning_rate": 4.104575072689827e-06, + "loss": 0.4214, + "step": 3198 + }, + { + "epoch": 2.28, + "grad_norm": 11.309652598968563, + "learning_rate": 4.101731796094215e-06, + "loss": 0.2314, + "step": 3199 + }, + { + "epoch": 2.28, + "grad_norm": 11.249740156900172, + "learning_rate": 4.098888819626704e-06, + "loss": 0.3022, + "step": 3200 + }, + { + "epoch": 2.28, + "grad_norm": 11.021724075151338, + "learning_rate": 4.096046144237189e-06, + "loss": 0.2642, + "step": 3201 + }, + { + "epoch": 2.29, + "grad_norm": 9.964776567576948, + "learning_rate": 4.093203770875458e-06, + "loss": 0.2451, + "step": 3202 + }, + { + "epoch": 2.29, + "grad_norm": 7.948673464764161, + "learning_rate": 4.090361700491203e-06, + "loss": 0.2285, + "step": 3203 + }, + { + "epoch": 2.29, + "grad_norm": 16.28404893480785, + "learning_rate": 4.087519934034011e-06, + "loss": 0.3101, + "step": 3204 + }, + { + "epoch": 2.29, + "grad_norm": 8.473785693501132, + "learning_rate": 4.084678472453371e-06, + "loss": 0.2549, + "step": 3205 + }, + { + "epoch": 2.29, + "grad_norm": 24.12057528554086, + "learning_rate": 4.081837316698665e-06, + "loss": 0.3501, + "step": 3206 + }, + { + "epoch": 2.29, + "grad_norm": 19.997967895220985, + "learning_rate": 4.078996467719179e-06, + "loss": 0.3188, + "step": 3207 + }, + { + "epoch": 2.29, + "grad_norm": 6.614239673097449, + "learning_rate": 4.076155926464091e-06, + "loss": 0.2056, + "step": 3208 + }, + { + "epoch": 2.29, + "grad_norm": 17.47178593134632, + "learning_rate": 4.07331569388248e-06, + "loss": 0.2585, + "step": 3209 + }, + { + "epoch": 2.29, + "grad_norm": 10.19903305701167, + "learning_rate": 4.07047577092332e-06, + "loss": 0.3623, + "step": 3210 + }, + { + "epoch": 2.29, + "grad_norm": 10.950945054310028, + "learning_rate": 4.067636158535483e-06, + "loss": 0.2402, + "step": 3211 + }, + { + "epoch": 2.29, + "grad_norm": 10.688010166084021, + "learning_rate": 4.064796857667734e-06, + "loss": 0.3113, + "step": 3212 + }, + { + "epoch": 2.29, + "grad_norm": 12.84894387925838, + "learning_rate": 4.0619578692687405e-06, + "loss": 0.3286, + "step": 3213 + }, + { + "epoch": 2.29, + "grad_norm": 22.02913425595009, + "learning_rate": 4.059119194287056e-06, + "loss": 0.3047, + "step": 3214 + }, + { + "epoch": 2.29, + "grad_norm": 10.256721550325487, + "learning_rate": 4.056280833671139e-06, + "loss": 0.2534, + "step": 3215 + }, + { + "epoch": 2.3, + "grad_norm": 9.369809286741337, + "learning_rate": 4.053442788369334e-06, + "loss": 0.2544, + "step": 3216 + }, + { + "epoch": 2.3, + "grad_norm": 12.298685517077775, + "learning_rate": 4.05060505932989e-06, + "loss": 0.3677, + "step": 3217 + }, + { + "epoch": 2.3, + "grad_norm": 8.898769900347371, + "learning_rate": 4.04776764750094e-06, + "loss": 0.2266, + "step": 3218 + }, + { + "epoch": 2.3, + "grad_norm": 10.947460381359758, + "learning_rate": 4.04493055383052e-06, + "loss": 0.301, + "step": 3219 + }, + { + "epoch": 2.3, + "grad_norm": 11.74632555217383, + "learning_rate": 4.042093779266553e-06, + "loss": 0.2559, + "step": 3220 + }, + { + "epoch": 2.3, + "grad_norm": 9.599283467869368, + "learning_rate": 4.0392573247568614e-06, + "loss": 0.27, + "step": 3221 + }, + { + "epoch": 2.3, + "grad_norm": 12.943002421727876, + "learning_rate": 4.036421191249155e-06, + "loss": 0.2822, + "step": 3222 + }, + { + "epoch": 2.3, + "grad_norm": 29.579454388849882, + "learning_rate": 4.033585379691036e-06, + "loss": 0.4604, + "step": 3223 + }, + { + "epoch": 2.3, + "grad_norm": 11.899580216055918, + "learning_rate": 4.030749891030008e-06, + "loss": 0.28, + "step": 3224 + }, + { + "epoch": 2.3, + "grad_norm": 10.093984979965922, + "learning_rate": 4.0279147262134534e-06, + "loss": 0.281, + "step": 3225 + }, + { + "epoch": 2.3, + "grad_norm": 18.406878672576628, + "learning_rate": 4.025079886188661e-06, + "loss": 0.3867, + "step": 3226 + }, + { + "epoch": 2.3, + "grad_norm": 14.447300544742673, + "learning_rate": 4.022245371902796e-06, + "loss": 0.3687, + "step": 3227 + }, + { + "epoch": 2.3, + "grad_norm": 10.587222605286408, + "learning_rate": 4.01941118430293e-06, + "loss": 0.3228, + "step": 3228 + }, + { + "epoch": 2.3, + "grad_norm": 8.088881097873506, + "learning_rate": 4.0165773243360105e-06, + "loss": 0.1971, + "step": 3229 + }, + { + "epoch": 2.31, + "grad_norm": 10.273170092461974, + "learning_rate": 4.0137437929488885e-06, + "loss": 0.2725, + "step": 3230 + }, + { + "epoch": 2.31, + "grad_norm": 13.35424018521676, + "learning_rate": 4.010910591088296e-06, + "loss": 0.2815, + "step": 3231 + }, + { + "epoch": 2.31, + "grad_norm": 14.00753162673241, + "learning_rate": 4.008077719700859e-06, + "loss": 0.3716, + "step": 3232 + }, + { + "epoch": 2.31, + "grad_norm": 12.541628475729205, + "learning_rate": 4.005245179733095e-06, + "loss": 0.2659, + "step": 3233 + }, + { + "epoch": 2.31, + "grad_norm": 8.232635981574267, + "learning_rate": 4.002412972131403e-06, + "loss": 0.208, + "step": 3234 + }, + { + "epoch": 2.31, + "grad_norm": 6.224460677900014, + "learning_rate": 3.999581097842082e-06, + "loss": 0.1646, + "step": 3235 + }, + { + "epoch": 2.31, + "grad_norm": 8.8840908089983, + "learning_rate": 3.99674955781131e-06, + "loss": 0.2446, + "step": 3236 + }, + { + "epoch": 2.31, + "grad_norm": 9.304529312568652, + "learning_rate": 3.99391835298516e-06, + "loss": 0.2317, + "step": 3237 + }, + { + "epoch": 2.31, + "grad_norm": 10.502911845939426, + "learning_rate": 3.991087484309586e-06, + "loss": 0.231, + "step": 3238 + }, + { + "epoch": 2.31, + "grad_norm": 15.99359587043134, + "learning_rate": 3.988256952730439e-06, + "loss": 0.3862, + "step": 3239 + }, + { + "epoch": 2.31, + "grad_norm": 10.178986410442256, + "learning_rate": 3.985426759193449e-06, + "loss": 0.2451, + "step": 3240 + }, + { + "epoch": 2.31, + "grad_norm": 8.303906956030472, + "learning_rate": 3.982596904644236e-06, + "loss": 0.2224, + "step": 3241 + }, + { + "epoch": 2.31, + "grad_norm": 23.532535397233683, + "learning_rate": 3.979767390028309e-06, + "loss": 0.2461, + "step": 3242 + }, + { + "epoch": 2.31, + "grad_norm": 14.192913755233537, + "learning_rate": 3.976938216291059e-06, + "loss": 0.2808, + "step": 3243 + }, + { + "epoch": 2.32, + "grad_norm": 11.248098877642407, + "learning_rate": 3.974109384377768e-06, + "loss": 0.2585, + "step": 3244 + }, + { + "epoch": 2.32, + "grad_norm": 10.235266597149444, + "learning_rate": 3.971280895233599e-06, + "loss": 0.2322, + "step": 3245 + }, + { + "epoch": 2.32, + "grad_norm": 16.562031261403643, + "learning_rate": 3.968452749803605e-06, + "loss": 0.3599, + "step": 3246 + }, + { + "epoch": 2.32, + "grad_norm": 19.3566097178524, + "learning_rate": 3.965624949032723e-06, + "loss": 0.3271, + "step": 3247 + }, + { + "epoch": 2.32, + "grad_norm": 14.20409787900473, + "learning_rate": 3.962797493865767e-06, + "loss": 0.2771, + "step": 3248 + }, + { + "epoch": 2.32, + "grad_norm": 14.070021402214895, + "learning_rate": 3.959970385247451e-06, + "loss": 0.3074, + "step": 3249 + }, + { + "epoch": 2.32, + "grad_norm": 25.348359088033686, + "learning_rate": 3.957143624122359e-06, + "loss": 0.3887, + "step": 3250 + }, + { + "epoch": 2.32, + "grad_norm": 10.65763701778697, + "learning_rate": 3.954317211434966e-06, + "loss": 0.2871, + "step": 3251 + }, + { + "epoch": 2.32, + "grad_norm": 16.067469739234614, + "learning_rate": 3.951491148129628e-06, + "loss": 0.3311, + "step": 3252 + }, + { + "epoch": 2.32, + "grad_norm": 13.506402978506092, + "learning_rate": 3.948665435150589e-06, + "loss": 0.2527, + "step": 3253 + }, + { + "epoch": 2.32, + "grad_norm": 12.367653284704408, + "learning_rate": 3.945840073441967e-06, + "loss": 0.2432, + "step": 3254 + }, + { + "epoch": 2.32, + "grad_norm": 12.529703746228293, + "learning_rate": 3.943015063947773e-06, + "loss": 0.2793, + "step": 3255 + }, + { + "epoch": 2.32, + "grad_norm": 28.26985063062385, + "learning_rate": 3.940190407611891e-06, + "loss": 0.333, + "step": 3256 + }, + { + "epoch": 2.32, + "grad_norm": 6.659362639715853, + "learning_rate": 3.937366105378093e-06, + "loss": 0.1985, + "step": 3257 + }, + { + "epoch": 2.33, + "grad_norm": 9.796592659095309, + "learning_rate": 3.93454215819003e-06, + "loss": 0.2639, + "step": 3258 + }, + { + "epoch": 2.33, + "grad_norm": 17.950373980503407, + "learning_rate": 3.931718566991236e-06, + "loss": 0.4453, + "step": 3259 + }, + { + "epoch": 2.33, + "grad_norm": 12.59777182080348, + "learning_rate": 3.9288953327251265e-06, + "loss": 0.3032, + "step": 3260 + }, + { + "epoch": 2.33, + "grad_norm": 12.217410108136798, + "learning_rate": 3.9260724563349935e-06, + "loss": 0.2666, + "step": 3261 + }, + { + "epoch": 2.33, + "grad_norm": 11.026004296899076, + "learning_rate": 3.923249938764016e-06, + "loss": 0.312, + "step": 3262 + }, + { + "epoch": 2.33, + "grad_norm": 16.644089760277204, + "learning_rate": 3.920427780955247e-06, + "loss": 0.3784, + "step": 3263 + }, + { + "epoch": 2.33, + "grad_norm": 10.92215295788311, + "learning_rate": 3.917605983851622e-06, + "loss": 0.2981, + "step": 3264 + }, + { + "epoch": 2.33, + "grad_norm": 14.252312169191091, + "learning_rate": 3.914784548395959e-06, + "loss": 0.2703, + "step": 3265 + }, + { + "epoch": 2.33, + "grad_norm": 18.325498594913125, + "learning_rate": 3.911963475530948e-06, + "loss": 0.3665, + "step": 3266 + }, + { + "epoch": 2.33, + "grad_norm": 16.00125047262502, + "learning_rate": 3.909142766199163e-06, + "loss": 0.3264, + "step": 3267 + }, + { + "epoch": 2.33, + "grad_norm": 8.43569373211537, + "learning_rate": 3.906322421343055e-06, + "loss": 0.2512, + "step": 3268 + }, + { + "epoch": 2.33, + "grad_norm": 9.382819352662063, + "learning_rate": 3.903502441904956e-06, + "loss": 0.2378, + "step": 3269 + }, + { + "epoch": 2.33, + "grad_norm": 12.338844125815825, + "learning_rate": 3.900682828827072e-06, + "loss": 0.2664, + "step": 3270 + }, + { + "epoch": 2.33, + "grad_norm": 14.983592958009579, + "learning_rate": 3.897863583051488e-06, + "loss": 0.2817, + "step": 3271 + }, + { + "epoch": 2.34, + "grad_norm": 9.878791410236738, + "learning_rate": 3.895044705520167e-06, + "loss": 0.2729, + "step": 3272 + }, + { + "epoch": 2.34, + "grad_norm": 16.09056810345942, + "learning_rate": 3.892226197174947e-06, + "loss": 0.3022, + "step": 3273 + }, + { + "epoch": 2.34, + "grad_norm": 17.416882596827744, + "learning_rate": 3.889408058957547e-06, + "loss": 0.251, + "step": 3274 + }, + { + "epoch": 2.34, + "grad_norm": 10.418938219681527, + "learning_rate": 3.886590291809554e-06, + "loss": 0.3281, + "step": 3275 + }, + { + "epoch": 2.34, + "grad_norm": 11.54374780153606, + "learning_rate": 3.883772896672443e-06, + "loss": 0.25, + "step": 3276 + }, + { + "epoch": 2.34, + "grad_norm": 12.475161782689312, + "learning_rate": 3.8809558744875534e-06, + "loss": 0.3037, + "step": 3277 + }, + { + "epoch": 2.34, + "grad_norm": 12.217374801851173, + "learning_rate": 3.878139226196107e-06, + "loss": 0.2986, + "step": 3278 + }, + { + "epoch": 2.34, + "grad_norm": 14.47972978144969, + "learning_rate": 3.875322952739196e-06, + "loss": 0.3706, + "step": 3279 + }, + { + "epoch": 2.34, + "grad_norm": 11.281463354122508, + "learning_rate": 3.872507055057793e-06, + "loss": 0.2288, + "step": 3280 + }, + { + "epoch": 2.34, + "grad_norm": 12.610178959550831, + "learning_rate": 3.8696915340927395e-06, + "loss": 0.2668, + "step": 3281 + }, + { + "epoch": 2.34, + "grad_norm": 10.2691544838789, + "learning_rate": 3.866876390784752e-06, + "loss": 0.231, + "step": 3282 + }, + { + "epoch": 2.34, + "grad_norm": 36.402448271805106, + "learning_rate": 3.8640616260744266e-06, + "loss": 0.3735, + "step": 3283 + }, + { + "epoch": 2.34, + "grad_norm": 10.171805634334017, + "learning_rate": 3.861247240902223e-06, + "loss": 0.2512, + "step": 3284 + }, + { + "epoch": 2.34, + "grad_norm": 51.24345152155576, + "learning_rate": 3.858433236208485e-06, + "loss": 0.2886, + "step": 3285 + }, + { + "epoch": 2.35, + "grad_norm": 8.317004480140044, + "learning_rate": 3.85561961293342e-06, + "loss": 0.2485, + "step": 3286 + }, + { + "epoch": 2.35, + "grad_norm": 10.386888418538078, + "learning_rate": 3.852806372017115e-06, + "loss": 0.2256, + "step": 3287 + }, + { + "epoch": 2.35, + "grad_norm": 8.24971292042215, + "learning_rate": 3.849993514399521e-06, + "loss": 0.2556, + "step": 3288 + }, + { + "epoch": 2.35, + "grad_norm": 15.008978089204213, + "learning_rate": 3.847181041020472e-06, + "loss": 0.2876, + "step": 3289 + }, + { + "epoch": 2.35, + "grad_norm": 9.822270914959086, + "learning_rate": 3.844368952819666e-06, + "loss": 0.2314, + "step": 3290 + }, + { + "epoch": 2.35, + "grad_norm": 12.175452659496289, + "learning_rate": 3.84155725073667e-06, + "loss": 0.3372, + "step": 3291 + }, + { + "epoch": 2.35, + "grad_norm": 8.094646202827287, + "learning_rate": 3.838745935710931e-06, + "loss": 0.2441, + "step": 3292 + }, + { + "epoch": 2.35, + "grad_norm": 10.954423265082738, + "learning_rate": 3.835935008681757e-06, + "loss": 0.3174, + "step": 3293 + }, + { + "epoch": 2.35, + "grad_norm": 13.370092110407349, + "learning_rate": 3.833124470588336e-06, + "loss": 0.3279, + "step": 3294 + }, + { + "epoch": 2.35, + "grad_norm": 10.808483717392717, + "learning_rate": 3.830314322369717e-06, + "loss": 0.2334, + "step": 3295 + }, + { + "epoch": 2.35, + "grad_norm": 16.576910490466176, + "learning_rate": 3.827504564964825e-06, + "loss": 0.2522, + "step": 3296 + }, + { + "epoch": 2.35, + "grad_norm": 16.48583322025054, + "learning_rate": 3.82469519931245e-06, + "loss": 0.323, + "step": 3297 + }, + { + "epoch": 2.35, + "grad_norm": 11.518800925347575, + "learning_rate": 3.8218862263512565e-06, + "loss": 0.2456, + "step": 3298 + }, + { + "epoch": 2.35, + "grad_norm": 18.559272871616013, + "learning_rate": 3.819077647019772e-06, + "loss": 0.3108, + "step": 3299 + }, + { + "epoch": 2.36, + "grad_norm": 16.96699943610106, + "learning_rate": 3.816269462256394e-06, + "loss": 0.3784, + "step": 3300 + }, + { + "epoch": 2.36, + "grad_norm": 12.209507947737528, + "learning_rate": 3.813461672999394e-06, + "loss": 0.2932, + "step": 3301 + }, + { + "epoch": 2.36, + "grad_norm": 10.387762465843936, + "learning_rate": 3.8106542801869007e-06, + "loss": 0.2808, + "step": 3302 + }, + { + "epoch": 2.36, + "grad_norm": 15.201757791001164, + "learning_rate": 3.8078472847569215e-06, + "loss": 0.3765, + "step": 3303 + }, + { + "epoch": 2.36, + "grad_norm": 14.253111845630949, + "learning_rate": 3.805040687647321e-06, + "loss": 0.3374, + "step": 3304 + }, + { + "epoch": 2.36, + "grad_norm": 19.82147693696702, + "learning_rate": 3.8022344897958402e-06, + "loss": 0.332, + "step": 3305 + }, + { + "epoch": 2.36, + "grad_norm": 11.343136940610577, + "learning_rate": 3.799428692140077e-06, + "loss": 0.2681, + "step": 3306 + }, + { + "epoch": 2.36, + "grad_norm": 11.444088175662092, + "learning_rate": 3.7966232956175053e-06, + "loss": 0.2773, + "step": 3307 + }, + { + "epoch": 2.36, + "grad_norm": 16.004902990239433, + "learning_rate": 3.793818301165457e-06, + "loss": 0.3726, + "step": 3308 + }, + { + "epoch": 2.36, + "grad_norm": 12.09082633279314, + "learning_rate": 3.7910137097211345e-06, + "loss": 0.3049, + "step": 3309 + }, + { + "epoch": 2.36, + "grad_norm": 13.587555974439523, + "learning_rate": 3.788209522221604e-06, + "loss": 0.2961, + "step": 3310 + }, + { + "epoch": 2.36, + "grad_norm": 9.99106911580416, + "learning_rate": 3.7854057396037934e-06, + "loss": 0.2881, + "step": 3311 + }, + { + "epoch": 2.36, + "grad_norm": 8.139850321676342, + "learning_rate": 3.7826023628045037e-06, + "loss": 0.2412, + "step": 3312 + }, + { + "epoch": 2.36, + "grad_norm": 10.388827648592175, + "learning_rate": 3.779799392760391e-06, + "loss": 0.3181, + "step": 3313 + }, + { + "epoch": 2.37, + "grad_norm": 10.958626380275174, + "learning_rate": 3.7769968304079833e-06, + "loss": 0.2668, + "step": 3314 + }, + { + "epoch": 2.37, + "grad_norm": 11.3999162309862, + "learning_rate": 3.7741946766836657e-06, + "loss": 0.283, + "step": 3315 + }, + { + "epoch": 2.37, + "grad_norm": 18.78486008620017, + "learning_rate": 3.771392932523691e-06, + "loss": 0.2568, + "step": 3316 + }, + { + "epoch": 2.37, + "grad_norm": 10.359568318591949, + "learning_rate": 3.768591598864174e-06, + "loss": 0.2939, + "step": 3317 + }, + { + "epoch": 2.37, + "grad_norm": 31.670155201809273, + "learning_rate": 3.765790676641092e-06, + "loss": 0.3071, + "step": 3318 + }, + { + "epoch": 2.37, + "grad_norm": 11.970236514238282, + "learning_rate": 3.762990166790286e-06, + "loss": 0.2551, + "step": 3319 + }, + { + "epoch": 2.37, + "grad_norm": 18.047255891257112, + "learning_rate": 3.760190070247458e-06, + "loss": 0.3247, + "step": 3320 + }, + { + "epoch": 2.37, + "grad_norm": 10.64156184651028, + "learning_rate": 3.7573903879481714e-06, + "loss": 0.2834, + "step": 3321 + }, + { + "epoch": 2.37, + "grad_norm": 6.664506406435355, + "learning_rate": 3.754591120827854e-06, + "loss": 0.2263, + "step": 3322 + }, + { + "epoch": 2.37, + "grad_norm": 8.453693666344321, + "learning_rate": 3.7517922698217914e-06, + "loss": 0.2427, + "step": 3323 + }, + { + "epoch": 2.37, + "grad_norm": 11.250753271812135, + "learning_rate": 3.7489938358651334e-06, + "loss": 0.2156, + "step": 3324 + }, + { + "epoch": 2.37, + "grad_norm": 23.8413429198328, + "learning_rate": 3.746195819892885e-06, + "loss": 0.3213, + "step": 3325 + }, + { + "epoch": 2.37, + "grad_norm": 8.338062073228624, + "learning_rate": 3.7433982228399205e-06, + "loss": 0.1901, + "step": 3326 + }, + { + "epoch": 2.37, + "grad_norm": 11.241923335507073, + "learning_rate": 3.7406010456409648e-06, + "loss": 0.3037, + "step": 3327 + }, + { + "epoch": 2.38, + "grad_norm": 9.63418710168968, + "learning_rate": 3.73780428923061e-06, + "loss": 0.2549, + "step": 3328 + }, + { + "epoch": 2.38, + "grad_norm": 10.930254467732022, + "learning_rate": 3.7350079545433014e-06, + "loss": 0.2166, + "step": 3329 + }, + { + "epoch": 2.38, + "grad_norm": 6.752872103252343, + "learning_rate": 3.7322120425133497e-06, + "loss": 0.1606, + "step": 3330 + }, + { + "epoch": 2.38, + "grad_norm": 10.592309135473164, + "learning_rate": 3.729416554074917e-06, + "loss": 0.2258, + "step": 3331 + }, + { + "epoch": 2.38, + "grad_norm": 7.525689690264553, + "learning_rate": 3.726621490162033e-06, + "loss": 0.1725, + "step": 3332 + }, + { + "epoch": 2.38, + "grad_norm": 12.915876448357972, + "learning_rate": 3.7238268517085773e-06, + "loss": 0.353, + "step": 3333 + }, + { + "epoch": 2.38, + "grad_norm": 14.281531039394093, + "learning_rate": 3.7210326396482893e-06, + "loss": 0.2805, + "step": 3334 + }, + { + "epoch": 2.38, + "grad_norm": 12.802973255554388, + "learning_rate": 3.718238854914771e-06, + "loss": 0.3052, + "step": 3335 + }, + { + "epoch": 2.38, + "grad_norm": 18.4298425578648, + "learning_rate": 3.7154454984414733e-06, + "loss": 0.3263, + "step": 3336 + }, + { + "epoch": 2.38, + "grad_norm": 19.220340865499665, + "learning_rate": 3.7126525711617135e-06, + "loss": 0.3015, + "step": 3337 + }, + { + "epoch": 2.38, + "grad_norm": 9.281971521389087, + "learning_rate": 3.7098600740086555e-06, + "loss": 0.2118, + "step": 3338 + }, + { + "epoch": 2.38, + "grad_norm": 10.39960104281075, + "learning_rate": 3.707068007915329e-06, + "loss": 0.2546, + "step": 3339 + }, + { + "epoch": 2.38, + "grad_norm": 13.371386947285565, + "learning_rate": 3.704276373814611e-06, + "loss": 0.2737, + "step": 3340 + }, + { + "epoch": 2.38, + "grad_norm": 11.294716255095457, + "learning_rate": 3.7014851726392427e-06, + "loss": 0.2411, + "step": 3341 + }, + { + "epoch": 2.39, + "grad_norm": 10.309078229290204, + "learning_rate": 3.6986944053218143e-06, + "loss": 0.2798, + "step": 3342 + }, + { + "epoch": 2.39, + "grad_norm": 15.365685779974543, + "learning_rate": 3.69590407279477e-06, + "loss": 0.3003, + "step": 3343 + }, + { + "epoch": 2.39, + "grad_norm": 8.837019788680069, + "learning_rate": 3.6931141759904175e-06, + "loss": 0.293, + "step": 3344 + }, + { + "epoch": 2.39, + "grad_norm": 13.925370194635887, + "learning_rate": 3.6903247158409077e-06, + "loss": 0.2639, + "step": 3345 + }, + { + "epoch": 2.39, + "grad_norm": 16.855847089664, + "learning_rate": 3.687535693278256e-06, + "loss": 0.3687, + "step": 3346 + }, + { + "epoch": 2.39, + "grad_norm": 13.239321422376957, + "learning_rate": 3.6847471092343225e-06, + "loss": 0.2676, + "step": 3347 + }, + { + "epoch": 2.39, + "grad_norm": 9.481727189698633, + "learning_rate": 3.681958964640828e-06, + "loss": 0.2578, + "step": 3348 + }, + { + "epoch": 2.39, + "grad_norm": 9.534587315413534, + "learning_rate": 3.679171260429343e-06, + "loss": 0.2925, + "step": 3349 + }, + { + "epoch": 2.39, + "grad_norm": 18.42058638916999, + "learning_rate": 3.676383997531288e-06, + "loss": 0.3088, + "step": 3350 + }, + { + "epoch": 2.39, + "grad_norm": 7.855041948096473, + "learning_rate": 3.673597176877944e-06, + "loss": 0.2554, + "step": 3351 + }, + { + "epoch": 2.39, + "grad_norm": 7.898301518405677, + "learning_rate": 3.670810799400435e-06, + "loss": 0.2297, + "step": 3352 + }, + { + "epoch": 2.39, + "grad_norm": 11.44162249590785, + "learning_rate": 3.668024866029747e-06, + "loss": 0.2598, + "step": 3353 + }, + { + "epoch": 2.39, + "grad_norm": 13.992432735538818, + "learning_rate": 3.665239377696706e-06, + "loss": 0.2859, + "step": 3354 + }, + { + "epoch": 2.39, + "grad_norm": 9.802186033426322, + "learning_rate": 3.6624543353320006e-06, + "loss": 0.254, + "step": 3355 + }, + { + "epoch": 2.4, + "grad_norm": 14.7282741034783, + "learning_rate": 3.659669739866162e-06, + "loss": 0.2305, + "step": 3356 + }, + { + "epoch": 2.4, + "grad_norm": 10.747783422915768, + "learning_rate": 3.6568855922295776e-06, + "loss": 0.3083, + "step": 3357 + }, + { + "epoch": 2.4, + "grad_norm": 11.203973860796053, + "learning_rate": 3.654101893352482e-06, + "loss": 0.2449, + "step": 3358 + }, + { + "epoch": 2.4, + "grad_norm": 14.381045935118614, + "learning_rate": 3.651318644164958e-06, + "loss": 0.2786, + "step": 3359 + }, + { + "epoch": 2.4, + "grad_norm": 10.200099711534078, + "learning_rate": 3.6485358455969454e-06, + "loss": 0.2385, + "step": 3360 + }, + { + "epoch": 2.4, + "grad_norm": 8.050190434196447, + "learning_rate": 3.645753498578225e-06, + "loss": 0.1902, + "step": 3361 + }, + { + "epoch": 2.4, + "grad_norm": 15.029174110851411, + "learning_rate": 3.6429716040384346e-06, + "loss": 0.2703, + "step": 3362 + }, + { + "epoch": 2.4, + "grad_norm": 14.962055596198097, + "learning_rate": 3.6401901629070524e-06, + "loss": 0.3083, + "step": 3363 + }, + { + "epoch": 2.4, + "grad_norm": 13.313655103966168, + "learning_rate": 3.6374091761134147e-06, + "loss": 0.301, + "step": 3364 + }, + { + "epoch": 2.4, + "grad_norm": 10.61602666368444, + "learning_rate": 3.6346286445866953e-06, + "loss": 0.1937, + "step": 3365 + }, + { + "epoch": 2.4, + "grad_norm": 8.310994068495816, + "learning_rate": 3.6318485692559263e-06, + "loss": 0.2715, + "step": 3366 + }, + { + "epoch": 2.4, + "grad_norm": 20.739251234096173, + "learning_rate": 3.62906895104998e-06, + "loss": 0.3269, + "step": 3367 + }, + { + "epoch": 2.4, + "grad_norm": 20.251420055737725, + "learning_rate": 3.6262897908975787e-06, + "loss": 0.3164, + "step": 3368 + }, + { + "epoch": 2.4, + "grad_norm": 12.647741659961826, + "learning_rate": 3.6235110897272917e-06, + "loss": 0.2031, + "step": 3369 + }, + { + "epoch": 2.41, + "grad_norm": 10.353959481918734, + "learning_rate": 3.620732848467535e-06, + "loss": 0.2383, + "step": 3370 + }, + { + "epoch": 2.41, + "grad_norm": 9.086259437715091, + "learning_rate": 3.6179550680465703e-06, + "loss": 0.2429, + "step": 3371 + }, + { + "epoch": 2.41, + "grad_norm": 12.140836475782853, + "learning_rate": 3.615177749392506e-06, + "loss": 0.25, + "step": 3372 + }, + { + "epoch": 2.41, + "grad_norm": 12.462306422712425, + "learning_rate": 3.6124008934332956e-06, + "loss": 0.2981, + "step": 3373 + }, + { + "epoch": 2.41, + "grad_norm": 15.595663064497298, + "learning_rate": 3.609624501096739e-06, + "loss": 0.2786, + "step": 3374 + }, + { + "epoch": 2.41, + "grad_norm": 9.6865300704338, + "learning_rate": 3.606848573310479e-06, + "loss": 0.2834, + "step": 3375 + }, + { + "epoch": 2.41, + "grad_norm": 10.459999547360587, + "learning_rate": 3.6040731110020065e-06, + "loss": 0.252, + "step": 3376 + }, + { + "epoch": 2.41, + "grad_norm": 17.050438349312042, + "learning_rate": 3.6012981150986524e-06, + "loss": 0.3784, + "step": 3377 + }, + { + "epoch": 2.41, + "grad_norm": 7.520037630380769, + "learning_rate": 3.598523586527599e-06, + "loss": 0.207, + "step": 3378 + }, + { + "epoch": 2.41, + "grad_norm": 11.356693880055909, + "learning_rate": 3.595749526215862e-06, + "loss": 0.2615, + "step": 3379 + }, + { + "epoch": 2.41, + "grad_norm": 9.210506673880975, + "learning_rate": 3.5929759350903117e-06, + "loss": 0.243, + "step": 3380 + }, + { + "epoch": 2.41, + "grad_norm": 16.890635842665827, + "learning_rate": 3.5902028140776524e-06, + "loss": 0.3169, + "step": 3381 + }, + { + "epoch": 2.41, + "grad_norm": 8.508691081951469, + "learning_rate": 3.5874301641044386e-06, + "loss": 0.2642, + "step": 3382 + }, + { + "epoch": 2.41, + "grad_norm": 9.55890525274352, + "learning_rate": 3.5846579860970632e-06, + "loss": 0.2678, + "step": 3383 + }, + { + "epoch": 2.42, + "grad_norm": 15.731659464137019, + "learning_rate": 3.58188628098176e-06, + "loss": 0.3152, + "step": 3384 + }, + { + "epoch": 2.42, + "grad_norm": 8.571055770883325, + "learning_rate": 3.579115049684612e-06, + "loss": 0.2434, + "step": 3385 + }, + { + "epoch": 2.42, + "grad_norm": 11.26780059174239, + "learning_rate": 3.576344293131533e-06, + "loss": 0.2771, + "step": 3386 + }, + { + "epoch": 2.42, + "grad_norm": 10.9806207910959, + "learning_rate": 3.5735740122482896e-06, + "loss": 0.2788, + "step": 3387 + }, + { + "epoch": 2.42, + "grad_norm": 11.608528508998647, + "learning_rate": 3.570804207960481e-06, + "loss": 0.3105, + "step": 3388 + }, + { + "epoch": 2.42, + "grad_norm": 8.976436128368848, + "learning_rate": 3.5680348811935527e-06, + "loss": 0.2446, + "step": 3389 + }, + { + "epoch": 2.42, + "grad_norm": 11.725885115817704, + "learning_rate": 3.565266032872785e-06, + "loss": 0.2861, + "step": 3390 + }, + { + "epoch": 2.42, + "grad_norm": 13.014407863564019, + "learning_rate": 3.5624976639233056e-06, + "loss": 0.2568, + "step": 3391 + }, + { + "epoch": 2.42, + "grad_norm": 9.169543696415937, + "learning_rate": 3.559729775270076e-06, + "loss": 0.2629, + "step": 3392 + }, + { + "epoch": 2.42, + "grad_norm": 11.368641787713843, + "learning_rate": 3.5569623678378972e-06, + "loss": 0.3442, + "step": 3393 + }, + { + "epoch": 2.42, + "grad_norm": 9.144314587647392, + "learning_rate": 3.554195442551416e-06, + "loss": 0.2119, + "step": 3394 + }, + { + "epoch": 2.42, + "grad_norm": 9.282894560665648, + "learning_rate": 3.551429000335108e-06, + "loss": 0.3357, + "step": 3395 + }, + { + "epoch": 2.42, + "grad_norm": 7.94605752362725, + "learning_rate": 3.5486630421132983e-06, + "loss": 0.2141, + "step": 3396 + }, + { + "epoch": 2.42, + "grad_norm": 10.588223525170012, + "learning_rate": 3.5458975688101403e-06, + "loss": 0.2935, + "step": 3397 + }, + { + "epoch": 2.43, + "grad_norm": 10.249548737786958, + "learning_rate": 3.5431325813496352e-06, + "loss": 0.2644, + "step": 3398 + }, + { + "epoch": 2.43, + "grad_norm": 16.03872538160447, + "learning_rate": 3.540368080655612e-06, + "loss": 0.3416, + "step": 3399 + }, + { + "epoch": 2.43, + "grad_norm": 12.263906287807048, + "learning_rate": 3.5376040676517443e-06, + "loss": 0.3013, + "step": 3400 + }, + { + "epoch": 2.43, + "grad_norm": 11.90507660616009, + "learning_rate": 3.5348405432615407e-06, + "loss": 0.2251, + "step": 3401 + }, + { + "epoch": 2.43, + "grad_norm": 8.652509111149236, + "learning_rate": 3.5320775084083425e-06, + "loss": 0.1938, + "step": 3402 + }, + { + "epoch": 2.43, + "grad_norm": 7.948684175375187, + "learning_rate": 3.529314964015336e-06, + "loss": 0.2017, + "step": 3403 + }, + { + "epoch": 2.43, + "grad_norm": 9.81378214693452, + "learning_rate": 3.526552911005533e-06, + "loss": 0.2417, + "step": 3404 + }, + { + "epoch": 2.43, + "grad_norm": 16.127745326816424, + "learning_rate": 3.523791350301793e-06, + "loss": 0.2727, + "step": 3405 + }, + { + "epoch": 2.43, + "grad_norm": 14.96529610492523, + "learning_rate": 3.5210302828267984e-06, + "loss": 0.2617, + "step": 3406 + }, + { + "epoch": 2.43, + "grad_norm": 17.61248036670611, + "learning_rate": 3.5182697095030795e-06, + "loss": 0.3103, + "step": 3407 + }, + { + "epoch": 2.43, + "grad_norm": 16.296153280775098, + "learning_rate": 3.5155096312529913e-06, + "loss": 0.3633, + "step": 3408 + }, + { + "epoch": 2.43, + "grad_norm": 9.295250617043212, + "learning_rate": 3.5127500489987252e-06, + "loss": 0.2856, + "step": 3409 + }, + { + "epoch": 2.43, + "grad_norm": 12.24988189484059, + "learning_rate": 3.5099909636623148e-06, + "loss": 0.3184, + "step": 3410 + }, + { + "epoch": 2.43, + "grad_norm": 14.376778291796514, + "learning_rate": 3.5072323761656163e-06, + "loss": 0.3359, + "step": 3411 + }, + { + "epoch": 2.44, + "grad_norm": 11.142529765067701, + "learning_rate": 3.5044742874303297e-06, + "loss": 0.3108, + "step": 3412 + }, + { + "epoch": 2.44, + "grad_norm": 9.349635726849158, + "learning_rate": 3.501716698377979e-06, + "loss": 0.2485, + "step": 3413 + }, + { + "epoch": 2.44, + "grad_norm": 7.768764085615702, + "learning_rate": 3.4989596099299306e-06, + "loss": 0.2454, + "step": 3414 + }, + { + "epoch": 2.44, + "grad_norm": 17.424964292649417, + "learning_rate": 3.496203023007374e-06, + "loss": 0.3284, + "step": 3415 + }, + { + "epoch": 2.44, + "grad_norm": 15.246943837578039, + "learning_rate": 3.4934469385313418e-06, + "loss": 0.3223, + "step": 3416 + }, + { + "epoch": 2.44, + "grad_norm": 21.516172225486876, + "learning_rate": 3.490691357422689e-06, + "loss": 0.25, + "step": 3417 + }, + { + "epoch": 2.44, + "grad_norm": 8.56752172697326, + "learning_rate": 3.487936280602108e-06, + "loss": 0.2329, + "step": 3418 + }, + { + "epoch": 2.44, + "grad_norm": 9.265980094835774, + "learning_rate": 3.4851817089901203e-06, + "loss": 0.2244, + "step": 3419 + }, + { + "epoch": 2.44, + "grad_norm": 8.204349860262266, + "learning_rate": 3.4824276435070804e-06, + "loss": 0.2239, + "step": 3420 + }, + { + "epoch": 2.44, + "grad_norm": 9.692570947593712, + "learning_rate": 3.4796740850731716e-06, + "loss": 0.2324, + "step": 3421 + }, + { + "epoch": 2.44, + "grad_norm": 12.387294099444116, + "learning_rate": 3.47692103460841e-06, + "loss": 0.2925, + "step": 3422 + }, + { + "epoch": 2.44, + "grad_norm": 13.421806378326073, + "learning_rate": 3.474168493032641e-06, + "loss": 0.3445, + "step": 3423 + }, + { + "epoch": 2.44, + "grad_norm": 16.231543949083306, + "learning_rate": 3.4714164612655387e-06, + "loss": 0.3259, + "step": 3424 + }, + { + "epoch": 2.44, + "grad_norm": 11.791636744953154, + "learning_rate": 3.468664940226609e-06, + "loss": 0.3198, + "step": 3425 + }, + { + "epoch": 2.45, + "grad_norm": 10.274282803160158, + "learning_rate": 3.4659139308351885e-06, + "loss": 0.2417, + "step": 3426 + }, + { + "epoch": 2.45, + "grad_norm": 9.905001077666203, + "learning_rate": 3.4631634340104357e-06, + "loss": 0.2465, + "step": 3427 + }, + { + "epoch": 2.45, + "grad_norm": 12.736134980260223, + "learning_rate": 3.460413450671346e-06, + "loss": 0.2791, + "step": 3428 + }, + { + "epoch": 2.45, + "grad_norm": 14.233692284060737, + "learning_rate": 3.457663981736739e-06, + "loss": 0.4175, + "step": 3429 + }, + { + "epoch": 2.45, + "grad_norm": 8.855372305832327, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.2551, + "step": 3430 + }, + { + "epoch": 2.45, + "grad_norm": 10.398405654424193, + "learning_rate": 3.4521665907553957e-06, + "loss": 0.1948, + "step": 3431 + }, + { + "epoch": 2.45, + "grad_norm": 11.49001839122159, + "learning_rate": 3.4494186705454402e-06, + "loss": 0.2893, + "step": 3432 + }, + { + "epoch": 2.45, + "grad_norm": 13.738087130527903, + "learning_rate": 3.446671268413528e-06, + "loss": 0.2937, + "step": 3433 + }, + { + "epoch": 2.45, + "grad_norm": 11.80364763645785, + "learning_rate": 3.443924385277617e-06, + "loss": 0.2493, + "step": 3434 + }, + { + "epoch": 2.45, + "grad_norm": 8.451947488762436, + "learning_rate": 3.4411780220554937e-06, + "loss": 0.2285, + "step": 3435 + }, + { + "epoch": 2.45, + "grad_norm": 10.469520240369008, + "learning_rate": 3.4384321796647645e-06, + "loss": 0.3096, + "step": 3436 + }, + { + "epoch": 2.45, + "grad_norm": 10.22828595409656, + "learning_rate": 3.4356868590228727e-06, + "loss": 0.2534, + "step": 3437 + }, + { + "epoch": 2.45, + "grad_norm": 13.115450507117941, + "learning_rate": 3.4329420610470745e-06, + "loss": 0.2698, + "step": 3438 + }, + { + "epoch": 2.45, + "grad_norm": 12.542372576236675, + "learning_rate": 3.4301977866544634e-06, + "loss": 0.2993, + "step": 3439 + }, + { + "epoch": 2.46, + "grad_norm": 7.09705606272018, + "learning_rate": 3.427454036761948e-06, + "loss": 0.2233, + "step": 3440 + }, + { + "epoch": 2.46, + "grad_norm": 13.02634813937635, + "learning_rate": 3.4247108122862703e-06, + "loss": 0.2429, + "step": 3441 + }, + { + "epoch": 2.46, + "grad_norm": 9.935522500990654, + "learning_rate": 3.4219681141439907e-06, + "loss": 0.2246, + "step": 3442 + }, + { + "epoch": 2.46, + "grad_norm": 17.667359751130242, + "learning_rate": 3.4192259432514934e-06, + "loss": 0.2803, + "step": 3443 + }, + { + "epoch": 2.46, + "grad_norm": 10.039200499360646, + "learning_rate": 3.4164843005249928e-06, + "loss": 0.2092, + "step": 3444 + }, + { + "epoch": 2.46, + "grad_norm": 10.756691213045567, + "learning_rate": 3.413743186880519e-06, + "loss": 0.2317, + "step": 3445 + }, + { + "epoch": 2.46, + "grad_norm": 14.776664463230171, + "learning_rate": 3.4110026032339317e-06, + "loss": 0.2922, + "step": 3446 + }, + { + "epoch": 2.46, + "grad_norm": 15.417900806390653, + "learning_rate": 3.408262550500908e-06, + "loss": 0.3977, + "step": 3447 + }, + { + "epoch": 2.46, + "grad_norm": 13.178425128085712, + "learning_rate": 3.4055230295969556e-06, + "loss": 0.2422, + "step": 3448 + }, + { + "epoch": 2.46, + "grad_norm": 12.742971833222871, + "learning_rate": 3.4027840414373924e-06, + "loss": 0.344, + "step": 3449 + }, + { + "epoch": 2.46, + "grad_norm": 9.435224415388793, + "learning_rate": 3.4000455869373716e-06, + "loss": 0.2715, + "step": 3450 + }, + { + "epoch": 2.46, + "grad_norm": 10.634275587173237, + "learning_rate": 3.397307667011859e-06, + "loss": 0.3154, + "step": 3451 + }, + { + "epoch": 2.46, + "grad_norm": 15.921317196391337, + "learning_rate": 3.394570282575642e-06, + "loss": 0.2876, + "step": 3452 + }, + { + "epoch": 2.46, + "grad_norm": 19.62653176049099, + "learning_rate": 3.3918334345433367e-06, + "loss": 0.3252, + "step": 3453 + }, + { + "epoch": 2.47, + "grad_norm": 10.047215418020556, + "learning_rate": 3.3890971238293703e-06, + "loss": 0.3218, + "step": 3454 + }, + { + "epoch": 2.47, + "grad_norm": 14.006570126629176, + "learning_rate": 3.386361351347999e-06, + "loss": 0.2898, + "step": 3455 + }, + { + "epoch": 2.47, + "grad_norm": 10.7530458486607, + "learning_rate": 3.3836261180132914e-06, + "loss": 0.2742, + "step": 3456 + }, + { + "epoch": 2.47, + "grad_norm": 9.313128539846193, + "learning_rate": 3.3808914247391437e-06, + "loss": 0.2656, + "step": 3457 + }, + { + "epoch": 2.47, + "grad_norm": 10.2334413075271, + "learning_rate": 3.3781572724392642e-06, + "loss": 0.2427, + "step": 3458 + }, + { + "epoch": 2.47, + "grad_norm": 9.268830421432604, + "learning_rate": 3.3754236620271876e-06, + "loss": 0.2834, + "step": 3459 + }, + { + "epoch": 2.47, + "grad_norm": 7.645365717660876, + "learning_rate": 3.3726905944162615e-06, + "loss": 0.2603, + "step": 3460 + }, + { + "epoch": 2.47, + "grad_norm": 13.475867556089085, + "learning_rate": 3.3699580705196527e-06, + "loss": 0.271, + "step": 3461 + }, + { + "epoch": 2.47, + "grad_norm": 9.891164807765543, + "learning_rate": 3.367226091250353e-06, + "loss": 0.2837, + "step": 3462 + }, + { + "epoch": 2.47, + "grad_norm": 10.147315000935372, + "learning_rate": 3.3644946575211634e-06, + "loss": 0.2432, + "step": 3463 + }, + { + "epoch": 2.47, + "grad_norm": 14.722101869142618, + "learning_rate": 3.36176377024471e-06, + "loss": 0.29, + "step": 3464 + }, + { + "epoch": 2.47, + "grad_norm": 11.016997359020882, + "learning_rate": 3.3590334303334293e-06, + "loss": 0.3162, + "step": 3465 + }, + { + "epoch": 2.47, + "grad_norm": 13.714073886046096, + "learning_rate": 3.356303638699583e-06, + "loss": 0.302, + "step": 3466 + }, + { + "epoch": 2.47, + "grad_norm": 19.31122867211997, + "learning_rate": 3.35357439625524e-06, + "loss": 0.2225, + "step": 3467 + }, + { + "epoch": 2.48, + "grad_norm": 12.443164765186587, + "learning_rate": 3.3508457039122965e-06, + "loss": 0.3494, + "step": 3468 + }, + { + "epoch": 2.48, + "grad_norm": 16.264537881834457, + "learning_rate": 3.348117562582457e-06, + "loss": 0.3677, + "step": 3469 + }, + { + "epoch": 2.48, + "grad_norm": 11.05702734122788, + "learning_rate": 3.345389973177241e-06, + "loss": 0.2539, + "step": 3470 + }, + { + "epoch": 2.48, + "grad_norm": 12.800434645157752, + "learning_rate": 3.342662936607992e-06, + "loss": 0.261, + "step": 3471 + }, + { + "epoch": 2.48, + "grad_norm": 14.088140859929942, + "learning_rate": 3.3399364537858594e-06, + "loss": 0.2424, + "step": 3472 + }, + { + "epoch": 2.48, + "grad_norm": 10.233296365055466, + "learning_rate": 3.3372105256218153e-06, + "loss": 0.3066, + "step": 3473 + }, + { + "epoch": 2.48, + "grad_norm": 7.936594581867434, + "learning_rate": 3.334485153026639e-06, + "loss": 0.2, + "step": 3474 + }, + { + "epoch": 2.48, + "grad_norm": 19.685217490083236, + "learning_rate": 3.3317603369109332e-06, + "loss": 0.2756, + "step": 3475 + }, + { + "epoch": 2.48, + "grad_norm": 12.901083967072752, + "learning_rate": 3.3290360781851055e-06, + "loss": 0.2666, + "step": 3476 + }, + { + "epoch": 2.48, + "grad_norm": 9.973876232975487, + "learning_rate": 3.326312377759383e-06, + "loss": 0.2457, + "step": 3477 + }, + { + "epoch": 2.48, + "grad_norm": 15.582000889219513, + "learning_rate": 3.3235892365438038e-06, + "loss": 0.2554, + "step": 3478 + }, + { + "epoch": 2.48, + "grad_norm": 8.7223189804276, + "learning_rate": 3.3208666554482216e-06, + "loss": 0.2821, + "step": 3479 + }, + { + "epoch": 2.48, + "grad_norm": 8.830264112686557, + "learning_rate": 3.3181446353822997e-06, + "loss": 0.2622, + "step": 3480 + }, + { + "epoch": 2.48, + "grad_norm": 16.7891982910683, + "learning_rate": 3.315423177255516e-06, + "loss": 0.3813, + "step": 3481 + }, + { + "epoch": 2.49, + "grad_norm": 11.65036520970705, + "learning_rate": 3.312702281977161e-06, + "loss": 0.2422, + "step": 3482 + }, + { + "epoch": 2.49, + "grad_norm": 18.934632301351446, + "learning_rate": 3.3099819504563356e-06, + "loss": 0.2981, + "step": 3483 + }, + { + "epoch": 2.49, + "grad_norm": 12.371036590772636, + "learning_rate": 3.3072621836019535e-06, + "loss": 0.2908, + "step": 3484 + }, + { + "epoch": 2.49, + "grad_norm": 6.64869466776249, + "learning_rate": 3.3045429823227405e-06, + "loss": 0.201, + "step": 3485 + }, + { + "epoch": 2.49, + "grad_norm": 9.730727432132026, + "learning_rate": 3.3018243475272282e-06, + "loss": 0.2419, + "step": 3486 + }, + { + "epoch": 2.49, + "grad_norm": 11.465970571860945, + "learning_rate": 3.2991062801237683e-06, + "loss": 0.2417, + "step": 3487 + }, + { + "epoch": 2.49, + "grad_norm": 9.95581933840102, + "learning_rate": 3.296388781020513e-06, + "loss": 0.2732, + "step": 3488 + }, + { + "epoch": 2.49, + "grad_norm": 17.72636391101251, + "learning_rate": 3.293671851125434e-06, + "loss": 0.3291, + "step": 3489 + }, + { + "epoch": 2.49, + "grad_norm": 11.831655869700418, + "learning_rate": 3.2909554913463034e-06, + "loss": 0.2332, + "step": 3490 + }, + { + "epoch": 2.49, + "grad_norm": 11.681447470044946, + "learning_rate": 3.2882397025907114e-06, + "loss": 0.3584, + "step": 3491 + }, + { + "epoch": 2.49, + "grad_norm": 8.474346090221069, + "learning_rate": 3.2855244857660497e-06, + "loss": 0.2732, + "step": 3492 + }, + { + "epoch": 2.49, + "grad_norm": 6.735075386972751, + "learning_rate": 3.2828098417795267e-06, + "loss": 0.2156, + "step": 3493 + }, + { + "epoch": 2.49, + "grad_norm": 14.010717824928776, + "learning_rate": 3.2800957715381537e-06, + "loss": 0.3191, + "step": 3494 + }, + { + "epoch": 2.49, + "grad_norm": 12.239704900786217, + "learning_rate": 3.2773822759487497e-06, + "loss": 0.2515, + "step": 3495 + }, + { + "epoch": 2.5, + "grad_norm": 8.594755050704686, + "learning_rate": 3.2746693559179483e-06, + "loss": 0.2563, + "step": 3496 + }, + { + "epoch": 2.5, + "grad_norm": 13.933295020461518, + "learning_rate": 3.2719570123521816e-06, + "loss": 0.3687, + "step": 3497 + }, + { + "epoch": 2.5, + "grad_norm": 10.270240218891797, + "learning_rate": 3.2692452461576997e-06, + "loss": 0.2876, + "step": 3498 + }, + { + "epoch": 2.5, + "grad_norm": 15.502834749242435, + "learning_rate": 3.266534058240548e-06, + "loss": 0.2979, + "step": 3499 + }, + { + "epoch": 2.5, + "grad_norm": 9.296849614190464, + "learning_rate": 3.2638234495065903e-06, + "loss": 0.2004, + "step": 3500 + }, + { + "epoch": 2.5, + "eval_avg_AUC": 0.7877963736079466, + "eval_avg_Accuracy": 0.6891992705570292, + "eval_avg_Accuracy-right": 0.9025694535020217, + "eval_avg_Accuracy-wrong": 0.3171480554923812, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6606969481112277, + "eval_last_AUC": 0.8078098190488144, + "eval_last_Accuracy": 0.7378149867374005, + "eval_last_Accuracy-right": 0.852028172688144, + "eval_last_Accuracy-wrong": 0.5386627245849442, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6786984038893772, + "eval_max_AUC": 0.7524562987575951, + "eval_max_Accuracy": 0.6421999336870027, + "eval_max_Accuracy-right": 0.981805138907004, + "eval_max_Accuracy-wrong": 0.05003411416875142, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6149865113803138, + "eval_min_AUC": 0.7926767012641479, + "eval_min_Accuracy": 0.7277022546419099, + "eval_min_Accuracy-right": 0.7812051649928264, + "eval_min_Accuracy-wrong": 0.6344098248806004, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6649279504885879, + "eval_prod_AUC": 0.794738394115366, + "eval_prod_Accuracy": 0.7094247347480106, + "eval_prod_Accuracy-right": 0.6642754662840746, + "eval_prod_Accuracy-wrong": 0.788151012053673, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.660230571971425, + "eval_runtime": 246.6392, + "eval_samples_per_second": 97.827, + "eval_steps_per_second": 3.057, + "eval_sum_AUC": 0.6402790236735809, + "eval_sum_Accuracy": 0.6413710212201591, + "eval_sum_Accuracy-right": 0.996869701317334, + "eval_sum_Accuracy-wrong": 0.0214919263133955, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6389305505739054, + "step": 3500 + }, + { + "epoch": 2.5, + "grad_norm": 13.068769827644681, + "learning_rate": 3.261113420861487e-06, + "loss": 0.3, + "step": 3501 + }, + { + "epoch": 2.5, + "grad_norm": 9.556338925829841, + "learning_rate": 3.258403973210713e-06, + "loss": 0.2725, + "step": 3502 + }, + { + "epoch": 2.5, + "grad_norm": 8.173921559638918, + "learning_rate": 3.2556951074595435e-06, + "loss": 0.2256, + "step": 3503 + }, + { + "epoch": 2.5, + "grad_norm": 13.308116193516451, + "learning_rate": 3.2529868245130577e-06, + "loss": 0.3523, + "step": 3504 + }, + { + "epoch": 2.5, + "grad_norm": 8.483362795187777, + "learning_rate": 3.250279125276148e-06, + "loss": 0.1975, + "step": 3505 + }, + { + "epoch": 2.5, + "grad_norm": 10.818400765853749, + "learning_rate": 3.2475720106535036e-06, + "loss": 0.2515, + "step": 3506 + }, + { + "epoch": 2.5, + "grad_norm": 9.579502198721391, + "learning_rate": 3.244865481549625e-06, + "loss": 0.2383, + "step": 3507 + }, + { + "epoch": 2.5, + "grad_norm": 14.595647574214624, + "learning_rate": 3.24215953886881e-06, + "loss": 0.3413, + "step": 3508 + }, + { + "epoch": 2.5, + "grad_norm": 8.566942539380886, + "learning_rate": 3.2394541835151692e-06, + "loss": 0.2266, + "step": 3509 + }, + { + "epoch": 2.51, + "grad_norm": 11.679457066390924, + "learning_rate": 3.2367494163926095e-06, + "loss": 0.3003, + "step": 3510 + }, + { + "epoch": 2.51, + "grad_norm": 12.640679927317136, + "learning_rate": 3.234045238404841e-06, + "loss": 0.3547, + "step": 3511 + }, + { + "epoch": 2.51, + "grad_norm": 12.18961731562165, + "learning_rate": 3.2313416504553852e-06, + "loss": 0.3152, + "step": 3512 + }, + { + "epoch": 2.51, + "grad_norm": 10.22642226732791, + "learning_rate": 3.2286386534475568e-06, + "loss": 0.2302, + "step": 3513 + }, + { + "epoch": 2.51, + "grad_norm": 8.438253560959739, + "learning_rate": 3.2259362482844803e-06, + "loss": 0.2563, + "step": 3514 + }, + { + "epoch": 2.51, + "grad_norm": 14.27856156365522, + "learning_rate": 3.2232344358690765e-06, + "loss": 0.2539, + "step": 3515 + }, + { + "epoch": 2.51, + "grad_norm": 11.93630832767274, + "learning_rate": 3.220533217104075e-06, + "loss": 0.301, + "step": 3516 + }, + { + "epoch": 2.51, + "grad_norm": 11.027334258786563, + "learning_rate": 3.217832592891999e-06, + "loss": 0.2263, + "step": 3517 + }, + { + "epoch": 2.51, + "grad_norm": 10.754441023455815, + "learning_rate": 3.2151325641351817e-06, + "loss": 0.2654, + "step": 3518 + }, + { + "epoch": 2.51, + "grad_norm": 10.74013272575728, + "learning_rate": 3.2124331317357506e-06, + "loss": 0.2236, + "step": 3519 + }, + { + "epoch": 2.51, + "grad_norm": 13.383946788535154, + "learning_rate": 3.2097342965956334e-06, + "loss": 0.2791, + "step": 3520 + }, + { + "epoch": 2.51, + "grad_norm": 11.32091809219386, + "learning_rate": 3.2070360596165667e-06, + "loss": 0.2312, + "step": 3521 + }, + { + "epoch": 2.51, + "grad_norm": 21.730483344555825, + "learning_rate": 3.204338421700076e-06, + "loss": 0.3027, + "step": 3522 + }, + { + "epoch": 2.51, + "grad_norm": 16.68177293630166, + "learning_rate": 3.201641383747498e-06, + "loss": 0.3059, + "step": 3523 + }, + { + "epoch": 2.52, + "grad_norm": 10.746578325895099, + "learning_rate": 3.1989449466599574e-06, + "loss": 0.3159, + "step": 3524 + }, + { + "epoch": 2.52, + "grad_norm": 14.046874777971698, + "learning_rate": 3.1962491113383896e-06, + "loss": 0.3032, + "step": 3525 + }, + { + "epoch": 2.52, + "grad_norm": 9.701317007452005, + "learning_rate": 3.1935538786835183e-06, + "loss": 0.2761, + "step": 3526 + }, + { + "epoch": 2.52, + "grad_norm": 12.573694027775247, + "learning_rate": 3.1908592495958747e-06, + "loss": 0.2598, + "step": 3527 + }, + { + "epoch": 2.52, + "grad_norm": 9.454753496917233, + "learning_rate": 3.1881652249757823e-06, + "loss": 0.2227, + "step": 3528 + }, + { + "epoch": 2.52, + "grad_norm": 8.312287431795568, + "learning_rate": 3.185471805723365e-06, + "loss": 0.2664, + "step": 3529 + }, + { + "epoch": 2.52, + "grad_norm": 10.584289964239222, + "learning_rate": 3.1827789927385444e-06, + "loss": 0.2786, + "step": 3530 + }, + { + "epoch": 2.52, + "grad_norm": 9.770661082389356, + "learning_rate": 3.18008678692104e-06, + "loss": 0.2793, + "step": 3531 + }, + { + "epoch": 2.52, + "grad_norm": 11.506383377691748, + "learning_rate": 3.1773951891703668e-06, + "loss": 0.2068, + "step": 3532 + }, + { + "epoch": 2.52, + "grad_norm": 10.113353081336689, + "learning_rate": 3.1747042003858386e-06, + "loss": 0.2349, + "step": 3533 + }, + { + "epoch": 2.52, + "grad_norm": 8.615187115741376, + "learning_rate": 3.1720138214665643e-06, + "loss": 0.2405, + "step": 3534 + }, + { + "epoch": 2.52, + "grad_norm": 20.977315544926444, + "learning_rate": 3.1693240533114496e-06, + "loss": 0.4512, + "step": 3535 + }, + { + "epoch": 2.52, + "grad_norm": 14.063179683264558, + "learning_rate": 3.1666348968191955e-06, + "loss": 0.2495, + "step": 3536 + }, + { + "epoch": 2.52, + "grad_norm": 8.892504016191015, + "learning_rate": 3.1639463528883007e-06, + "loss": 0.262, + "step": 3537 + }, + { + "epoch": 2.53, + "grad_norm": 9.55789315186977, + "learning_rate": 3.161258422417055e-06, + "loss": 0.2847, + "step": 3538 + }, + { + "epoch": 2.53, + "grad_norm": 11.550961610557891, + "learning_rate": 3.1585711063035496e-06, + "loss": 0.2666, + "step": 3539 + }, + { + "epoch": 2.53, + "grad_norm": 11.899767453488833, + "learning_rate": 3.155884405445663e-06, + "loss": 0.2334, + "step": 3540 + }, + { + "epoch": 2.53, + "grad_norm": 8.23350483466534, + "learning_rate": 3.153198320741074e-06, + "loss": 0.2043, + "step": 3541 + }, + { + "epoch": 2.53, + "grad_norm": 13.410011852557782, + "learning_rate": 3.150512853087253e-06, + "loss": 0.325, + "step": 3542 + }, + { + "epoch": 2.53, + "grad_norm": 8.11365917752667, + "learning_rate": 3.1478280033814657e-06, + "loss": 0.2322, + "step": 3543 + }, + { + "epoch": 2.53, + "grad_norm": 9.164175984205736, + "learning_rate": 3.14514377252077e-06, + "loss": 0.23, + "step": 3544 + }, + { + "epoch": 2.53, + "grad_norm": 12.426890476211831, + "learning_rate": 3.142460161402014e-06, + "loss": 0.2512, + "step": 3545 + }, + { + "epoch": 2.53, + "grad_norm": 12.904224322008433, + "learning_rate": 3.139777170921847e-06, + "loss": 0.3203, + "step": 3546 + }, + { + "epoch": 2.53, + "grad_norm": 9.782016167639494, + "learning_rate": 3.137094801976701e-06, + "loss": 0.2834, + "step": 3547 + }, + { + "epoch": 2.53, + "grad_norm": 11.154378436965436, + "learning_rate": 3.1344130554628104e-06, + "loss": 0.2375, + "step": 3548 + }, + { + "epoch": 2.53, + "grad_norm": 13.41557779364289, + "learning_rate": 3.131731932276193e-06, + "loss": 0.2896, + "step": 3549 + }, + { + "epoch": 2.53, + "grad_norm": 17.215143879601154, + "learning_rate": 3.129051433312664e-06, + "loss": 0.3809, + "step": 3550 + }, + { + "epoch": 2.53, + "grad_norm": 13.634755228372835, + "learning_rate": 3.1263715594678257e-06, + "loss": 0.3027, + "step": 3551 + }, + { + "epoch": 2.54, + "grad_norm": 13.49266302111991, + "learning_rate": 3.1236923116370764e-06, + "loss": 0.2471, + "step": 3552 + }, + { + "epoch": 2.54, + "grad_norm": 9.359540843720524, + "learning_rate": 3.121013690715601e-06, + "loss": 0.1829, + "step": 3553 + }, + { + "epoch": 2.54, + "grad_norm": 8.828670124176265, + "learning_rate": 3.118335697598376e-06, + "loss": 0.2185, + "step": 3554 + }, + { + "epoch": 2.54, + "grad_norm": 12.78806333330499, + "learning_rate": 3.1156583331801703e-06, + "loss": 0.2986, + "step": 3555 + }, + { + "epoch": 2.54, + "grad_norm": 8.261812795713473, + "learning_rate": 3.1129815983555387e-06, + "loss": 0.2212, + "step": 3556 + }, + { + "epoch": 2.54, + "grad_norm": 27.50178705776172, + "learning_rate": 3.1103054940188316e-06, + "loss": 0.3633, + "step": 3557 + }, + { + "epoch": 2.54, + "grad_norm": 13.276709743000696, + "learning_rate": 3.1076300210641814e-06, + "loss": 0.2769, + "step": 3558 + }, + { + "epoch": 2.54, + "grad_norm": 9.091825282135263, + "learning_rate": 3.1049551803855173e-06, + "loss": 0.261, + "step": 3559 + }, + { + "epoch": 2.54, + "grad_norm": 9.497532460353776, + "learning_rate": 3.1022809728765486e-06, + "loss": 0.2812, + "step": 3560 + }, + { + "epoch": 2.54, + "grad_norm": 14.352342983807455, + "learning_rate": 3.0996073994307825e-06, + "loss": 0.2544, + "step": 3561 + }, + { + "epoch": 2.54, + "grad_norm": 10.47067050690066, + "learning_rate": 3.0969344609415076e-06, + "loss": 0.2544, + "step": 3562 + }, + { + "epoch": 2.54, + "grad_norm": 13.14407357274232, + "learning_rate": 3.0942621583017994e-06, + "loss": 0.2639, + "step": 3563 + }, + { + "epoch": 2.54, + "grad_norm": 12.376995841990846, + "learning_rate": 3.0915904924045294e-06, + "loss": 0.2681, + "step": 3564 + }, + { + "epoch": 2.54, + "grad_norm": 8.615971289854569, + "learning_rate": 3.088919464142346e-06, + "loss": 0.2559, + "step": 3565 + }, + { + "epoch": 2.55, + "grad_norm": 13.374764637160622, + "learning_rate": 3.0862490744076928e-06, + "loss": 0.3003, + "step": 3566 + }, + { + "epoch": 2.55, + "grad_norm": 12.754057552940706, + "learning_rate": 3.0835793240927937e-06, + "loss": 0.3311, + "step": 3567 + }, + { + "epoch": 2.55, + "grad_norm": 8.835687713457723, + "learning_rate": 3.0809102140896652e-06, + "loss": 0.2524, + "step": 3568 + }, + { + "epoch": 2.55, + "grad_norm": 17.100356007256956, + "learning_rate": 3.078241745290103e-06, + "loss": 0.3794, + "step": 3569 + }, + { + "epoch": 2.55, + "grad_norm": 13.290591029206796, + "learning_rate": 3.075573918585696e-06, + "loss": 0.2791, + "step": 3570 + }, + { + "epoch": 2.55, + "grad_norm": 7.028395777757924, + "learning_rate": 3.0729067348678127e-06, + "loss": 0.1995, + "step": 3571 + }, + { + "epoch": 2.55, + "grad_norm": 7.641816702608815, + "learning_rate": 3.0702401950276066e-06, + "loss": 0.1987, + "step": 3572 + }, + { + "epoch": 2.55, + "grad_norm": 10.246342226496322, + "learning_rate": 3.067574299956022e-06, + "loss": 0.2441, + "step": 3573 + }, + { + "epoch": 2.55, + "grad_norm": 11.844832056141563, + "learning_rate": 3.0649090505437804e-06, + "loss": 0.2976, + "step": 3574 + }, + { + "epoch": 2.55, + "grad_norm": 12.680252229465067, + "learning_rate": 3.062244447681396e-06, + "loss": 0.3022, + "step": 3575 + }, + { + "epoch": 2.55, + "grad_norm": 8.665615703923262, + "learning_rate": 3.0595804922591564e-06, + "loss": 0.2463, + "step": 3576 + }, + { + "epoch": 2.55, + "grad_norm": 10.440754388319283, + "learning_rate": 3.0569171851671436e-06, + "loss": 0.2668, + "step": 3577 + }, + { + "epoch": 2.55, + "grad_norm": 11.905524709518772, + "learning_rate": 3.054254527295215e-06, + "loss": 0.3701, + "step": 3578 + }, + { + "epoch": 2.55, + "grad_norm": 9.807086250686762, + "learning_rate": 3.0515925195330148e-06, + "loss": 0.2522, + "step": 3579 + }, + { + "epoch": 2.56, + "grad_norm": 10.476465301590263, + "learning_rate": 3.048931162769969e-06, + "loss": 0.2666, + "step": 3580 + }, + { + "epoch": 2.56, + "grad_norm": 11.107929057524471, + "learning_rate": 3.0462704578952874e-06, + "loss": 0.2861, + "step": 3581 + }, + { + "epoch": 2.56, + "grad_norm": 21.986391423429573, + "learning_rate": 3.0436104057979604e-06, + "loss": 0.2964, + "step": 3582 + }, + { + "epoch": 2.56, + "grad_norm": 12.90626976949449, + "learning_rate": 3.0409510073667602e-06, + "loss": 0.2129, + "step": 3583 + }, + { + "epoch": 2.56, + "grad_norm": 13.164803117431044, + "learning_rate": 3.038292263490242e-06, + "loss": 0.3071, + "step": 3584 + }, + { + "epoch": 2.56, + "grad_norm": 15.092854006583256, + "learning_rate": 3.035634175056742e-06, + "loss": 0.3521, + "step": 3585 + }, + { + "epoch": 2.56, + "grad_norm": 14.609423145742273, + "learning_rate": 3.0329767429543767e-06, + "loss": 0.2844, + "step": 3586 + }, + { + "epoch": 2.56, + "grad_norm": 9.691645964790691, + "learning_rate": 3.030319968071043e-06, + "loss": 0.2329, + "step": 3587 + }, + { + "epoch": 2.56, + "grad_norm": 7.81284676046446, + "learning_rate": 3.0276638512944177e-06, + "loss": 0.2225, + "step": 3588 + }, + { + "epoch": 2.56, + "grad_norm": 12.769037847281707, + "learning_rate": 3.025008393511961e-06, + "loss": 0.3105, + "step": 3589 + }, + { + "epoch": 2.56, + "grad_norm": 15.295061680310575, + "learning_rate": 3.022353595610909e-06, + "loss": 0.3152, + "step": 3590 + }, + { + "epoch": 2.56, + "grad_norm": 11.698927211936732, + "learning_rate": 3.01969945847828e-06, + "loss": 0.293, + "step": 3591 + }, + { + "epoch": 2.56, + "grad_norm": 11.609293397061075, + "learning_rate": 3.017045983000871e-06, + "loss": 0.2698, + "step": 3592 + }, + { + "epoch": 2.56, + "grad_norm": 10.157130344506225, + "learning_rate": 3.014393170065256e-06, + "loss": 0.312, + "step": 3593 + }, + { + "epoch": 2.57, + "grad_norm": 13.898830020128793, + "learning_rate": 3.0117410205577903e-06, + "loss": 0.2737, + "step": 3594 + }, + { + "epoch": 2.57, + "grad_norm": 7.558372901547542, + "learning_rate": 3.0090895353646053e-06, + "loss": 0.2512, + "step": 3595 + }, + { + "epoch": 2.57, + "grad_norm": 8.583363620015973, + "learning_rate": 3.006438715371614e-06, + "loss": 0.2542, + "step": 3596 + }, + { + "epoch": 2.57, + "grad_norm": 8.806692519344198, + "learning_rate": 3.0037885614645e-06, + "loss": 0.2227, + "step": 3597 + }, + { + "epoch": 2.57, + "grad_norm": 10.779965721273376, + "learning_rate": 3.001139074528735e-06, + "loss": 0.205, + "step": 3598 + }, + { + "epoch": 2.57, + "grad_norm": 9.531815564612236, + "learning_rate": 2.9984902554495556e-06, + "loss": 0.2676, + "step": 3599 + }, + { + "epoch": 2.57, + "grad_norm": 18.506453906728485, + "learning_rate": 2.995842105111987e-06, + "loss": 0.2793, + "step": 3600 + }, + { + "epoch": 2.57, + "grad_norm": 11.15394731501427, + "learning_rate": 2.99319462440082e-06, + "loss": 0.2083, + "step": 3601 + }, + { + "epoch": 2.57, + "grad_norm": 10.932997861337693, + "learning_rate": 2.990547814200633e-06, + "loss": 0.2432, + "step": 3602 + }, + { + "epoch": 2.57, + "grad_norm": 14.849576244908592, + "learning_rate": 2.987901675395771e-06, + "loss": 0.2454, + "step": 3603 + }, + { + "epoch": 2.57, + "grad_norm": 14.12584654833872, + "learning_rate": 2.985256208870357e-06, + "loss": 0.3494, + "step": 3604 + }, + { + "epoch": 2.57, + "grad_norm": 10.767528797117537, + "learning_rate": 2.982611415508294e-06, + "loss": 0.2358, + "step": 3605 + }, + { + "epoch": 2.57, + "grad_norm": 11.280128754931008, + "learning_rate": 2.9799672961932525e-06, + "loss": 0.2463, + "step": 3606 + }, + { + "epoch": 2.57, + "grad_norm": 8.9328456698097, + "learning_rate": 2.9773238518086866e-06, + "loss": 0.176, + "step": 3607 + }, + { + "epoch": 2.58, + "grad_norm": 9.21994488542098, + "learning_rate": 2.974681083237816e-06, + "loss": 0.262, + "step": 3608 + }, + { + "epoch": 2.58, + "grad_norm": 10.967970336760006, + "learning_rate": 2.972038991363643e-06, + "loss": 0.2529, + "step": 3609 + }, + { + "epoch": 2.58, + "grad_norm": 12.528992859617613, + "learning_rate": 2.9693975770689344e-06, + "loss": 0.2549, + "step": 3610 + }, + { + "epoch": 2.58, + "grad_norm": 14.244301431577744, + "learning_rate": 2.9667568412362415e-06, + "loss": 0.2222, + "step": 3611 + }, + { + "epoch": 2.58, + "grad_norm": 16.112330292266513, + "learning_rate": 2.9641167847478797e-06, + "loss": 0.2761, + "step": 3612 + }, + { + "epoch": 2.58, + "grad_norm": 14.774287510401793, + "learning_rate": 2.96147740848594e-06, + "loss": 0.3308, + "step": 3613 + }, + { + "epoch": 2.58, + "grad_norm": 14.048586990552954, + "learning_rate": 2.9588387133322903e-06, + "loss": 0.2192, + "step": 3614 + }, + { + "epoch": 2.58, + "grad_norm": 13.648117792562253, + "learning_rate": 2.9562007001685644e-06, + "loss": 0.2556, + "step": 3615 + }, + { + "epoch": 2.58, + "grad_norm": 12.586600872006583, + "learning_rate": 2.9535633698761755e-06, + "loss": 0.2891, + "step": 3616 + }, + { + "epoch": 2.58, + "grad_norm": 14.723776113704208, + "learning_rate": 2.9509267233363005e-06, + "loss": 0.3936, + "step": 3617 + }, + { + "epoch": 2.58, + "grad_norm": 8.347490053483744, + "learning_rate": 2.948290761429895e-06, + "loss": 0.2351, + "step": 3618 + }, + { + "epoch": 2.58, + "grad_norm": 10.14243876444085, + "learning_rate": 2.9456554850376805e-06, + "loss": 0.2601, + "step": 3619 + }, + { + "epoch": 2.58, + "grad_norm": 11.440692510978845, + "learning_rate": 2.943020895040155e-06, + "loss": 0.25, + "step": 3620 + }, + { + "epoch": 2.58, + "grad_norm": 29.257448998416535, + "learning_rate": 2.940386992317582e-06, + "loss": 0.4346, + "step": 3621 + }, + { + "epoch": 2.59, + "grad_norm": 10.111642278946286, + "learning_rate": 2.937753777749996e-06, + "loss": 0.2034, + "step": 3622 + }, + { + "epoch": 2.59, + "grad_norm": 14.479860863761866, + "learning_rate": 2.9351212522172056e-06, + "loss": 0.3098, + "step": 3623 + }, + { + "epoch": 2.59, + "grad_norm": 8.382932164941954, + "learning_rate": 2.9324894165987837e-06, + "loss": 0.2429, + "step": 3624 + }, + { + "epoch": 2.59, + "grad_norm": 9.11158565572809, + "learning_rate": 2.9298582717740797e-06, + "loss": 0.2952, + "step": 3625 + }, + { + "epoch": 2.59, + "grad_norm": 8.281584890559799, + "learning_rate": 2.9272278186222025e-06, + "loss": 0.2167, + "step": 3626 + }, + { + "epoch": 2.59, + "grad_norm": 13.801083924218155, + "learning_rate": 2.9245980580220405e-06, + "loss": 0.2754, + "step": 3627 + }, + { + "epoch": 2.59, + "grad_norm": 11.920589259305089, + "learning_rate": 2.921968990852242e-06, + "loss": 0.2957, + "step": 3628 + }, + { + "epoch": 2.59, + "grad_norm": 8.695498611228306, + "learning_rate": 2.9193406179912297e-06, + "loss": 0.252, + "step": 3629 + }, + { + "epoch": 2.59, + "grad_norm": 11.833770314366229, + "learning_rate": 2.91671294031719e-06, + "loss": 0.2573, + "step": 3630 + }, + { + "epoch": 2.59, + "grad_norm": 10.9105743148522, + "learning_rate": 2.91408595870808e-06, + "loss": 0.2749, + "step": 3631 + }, + { + "epoch": 2.59, + "grad_norm": 9.022981742578796, + "learning_rate": 2.9114596740416224e-06, + "loss": 0.2517, + "step": 3632 + }, + { + "epoch": 2.59, + "grad_norm": 10.157774956816326, + "learning_rate": 2.908834087195308e-06, + "loss": 0.3579, + "step": 3633 + }, + { + "epoch": 2.59, + "grad_norm": 14.441924468999797, + "learning_rate": 2.9062091990463935e-06, + "loss": 0.3257, + "step": 3634 + }, + { + "epoch": 2.59, + "grad_norm": 14.911170610783156, + "learning_rate": 2.903585010471904e-06, + "loss": 0.3979, + "step": 3635 + }, + { + "epoch": 2.6, + "grad_norm": 11.658127870248308, + "learning_rate": 2.9009615223486297e-06, + "loss": 0.3418, + "step": 3636 + }, + { + "epoch": 2.6, + "grad_norm": 10.556064566768066, + "learning_rate": 2.898338735553128e-06, + "loss": 0.2759, + "step": 3637 + }, + { + "epoch": 2.6, + "grad_norm": 10.750443370130997, + "learning_rate": 2.895716650961714e-06, + "loss": 0.3328, + "step": 3638 + }, + { + "epoch": 2.6, + "grad_norm": 13.404219192176061, + "learning_rate": 2.8930952694504843e-06, + "loss": 0.3159, + "step": 3639 + }, + { + "epoch": 2.6, + "grad_norm": 8.801417766512998, + "learning_rate": 2.8904745918952833e-06, + "loss": 0.24, + "step": 3640 + }, + { + "epoch": 2.6, + "grad_norm": 10.893767277763509, + "learning_rate": 2.887854619171735e-06, + "loss": 0.2925, + "step": 3641 + }, + { + "epoch": 2.6, + "grad_norm": 8.238072093150642, + "learning_rate": 2.8852353521552135e-06, + "loss": 0.2283, + "step": 3642 + }, + { + "epoch": 2.6, + "grad_norm": 12.888970588309743, + "learning_rate": 2.8826167917208727e-06, + "loss": 0.2354, + "step": 3643 + }, + { + "epoch": 2.6, + "grad_norm": 11.876703842177745, + "learning_rate": 2.8799989387436137e-06, + "loss": 0.2683, + "step": 3644 + }, + { + "epoch": 2.6, + "grad_norm": 8.14852428141835, + "learning_rate": 2.8773817940981186e-06, + "loss": 0.2678, + "step": 3645 + }, + { + "epoch": 2.6, + "grad_norm": 10.672569274403406, + "learning_rate": 2.8747653586588183e-06, + "loss": 0.3386, + "step": 3646 + }, + { + "epoch": 2.6, + "grad_norm": 11.871670166521756, + "learning_rate": 2.872149633299913e-06, + "loss": 0.2263, + "step": 3647 + }, + { + "epoch": 2.6, + "grad_norm": 12.647933339436156, + "learning_rate": 2.8695346188953666e-06, + "loss": 0.2524, + "step": 3648 + }, + { + "epoch": 2.6, + "grad_norm": 11.424868547184808, + "learning_rate": 2.866920316318904e-06, + "loss": 0.3276, + "step": 3649 + }, + { + "epoch": 2.61, + "grad_norm": 12.588799181559985, + "learning_rate": 2.8643067264440116e-06, + "loss": 0.3127, + "step": 3650 + }, + { + "epoch": 2.61, + "grad_norm": 12.677503962337894, + "learning_rate": 2.8616938501439384e-06, + "loss": 0.2363, + "step": 3651 + }, + { + "epoch": 2.61, + "grad_norm": 14.00332101941201, + "learning_rate": 2.8590816882916948e-06, + "loss": 0.2627, + "step": 3652 + }, + { + "epoch": 2.61, + "grad_norm": 13.14346918330707, + "learning_rate": 2.856470241760054e-06, + "loss": 0.3857, + "step": 3653 + }, + { + "epoch": 2.61, + "grad_norm": 13.979054378804332, + "learning_rate": 2.8538595114215472e-06, + "loss": 0.2969, + "step": 3654 + }, + { + "epoch": 2.61, + "grad_norm": 10.350457278452481, + "learning_rate": 2.8512494981484706e-06, + "loss": 0.2629, + "step": 3655 + }, + { + "epoch": 2.61, + "grad_norm": 13.518490136692607, + "learning_rate": 2.848640202812872e-06, + "loss": 0.2688, + "step": 3656 + }, + { + "epoch": 2.61, + "grad_norm": 11.77830597016325, + "learning_rate": 2.846031626286574e-06, + "loss": 0.2463, + "step": 3657 + }, + { + "epoch": 2.61, + "grad_norm": 13.232483324704306, + "learning_rate": 2.8434237694411414e-06, + "loss": 0.2715, + "step": 3658 + }, + { + "epoch": 2.61, + "grad_norm": 17.486679322201393, + "learning_rate": 2.840816633147917e-06, + "loss": 0.335, + "step": 3659 + }, + { + "epoch": 2.61, + "grad_norm": 7.147692850153896, + "learning_rate": 2.8382102182779846e-06, + "loss": 0.1785, + "step": 3660 + }, + { + "epoch": 2.61, + "grad_norm": 9.423807909050868, + "learning_rate": 2.8356045257022037e-06, + "loss": 0.2021, + "step": 3661 + }, + { + "epoch": 2.61, + "grad_norm": 9.72309990205576, + "learning_rate": 2.832999556291177e-06, + "loss": 0.2351, + "step": 3662 + }, + { + "epoch": 2.61, + "grad_norm": 14.42597066218875, + "learning_rate": 2.8303953109152815e-06, + "loss": 0.3379, + "step": 3663 + }, + { + "epoch": 2.62, + "grad_norm": 10.89552475555432, + "learning_rate": 2.827791790444638e-06, + "loss": 0.2471, + "step": 3664 + }, + { + "epoch": 2.62, + "grad_norm": 21.378000080961417, + "learning_rate": 2.8251889957491317e-06, + "loss": 0.4006, + "step": 3665 + }, + { + "epoch": 2.62, + "grad_norm": 10.967603756260177, + "learning_rate": 2.822586927698407e-06, + "loss": 0.2324, + "step": 3666 + }, + { + "epoch": 2.62, + "grad_norm": 12.808368358224756, + "learning_rate": 2.819985587161861e-06, + "loss": 0.2229, + "step": 3667 + }, + { + "epoch": 2.62, + "grad_norm": 20.91800691344456, + "learning_rate": 2.8173849750086513e-06, + "loss": 0.3875, + "step": 3668 + }, + { + "epoch": 2.62, + "grad_norm": 19.77994963136096, + "learning_rate": 2.8147850921076903e-06, + "loss": 0.2908, + "step": 3669 + }, + { + "epoch": 2.62, + "grad_norm": 9.899888131646092, + "learning_rate": 2.8121859393276475e-06, + "loss": 0.2932, + "step": 3670 + }, + { + "epoch": 2.62, + "grad_norm": 13.500226797134557, + "learning_rate": 2.809587517536947e-06, + "loss": 0.2898, + "step": 3671 + }, + { + "epoch": 2.62, + "grad_norm": 13.24302333969292, + "learning_rate": 2.806989827603771e-06, + "loss": 0.2646, + "step": 3672 + }, + { + "epoch": 2.62, + "grad_norm": 9.78450213872708, + "learning_rate": 2.8043928703960565e-06, + "loss": 0.2385, + "step": 3673 + }, + { + "epoch": 2.62, + "grad_norm": 9.502438761869197, + "learning_rate": 2.8017966467814933e-06, + "loss": 0.22, + "step": 3674 + }, + { + "epoch": 2.62, + "grad_norm": 8.757377161825325, + "learning_rate": 2.7992011576275295e-06, + "loss": 0.2163, + "step": 3675 + }, + { + "epoch": 2.62, + "grad_norm": 7.72219922258268, + "learning_rate": 2.7966064038013657e-06, + "loss": 0.1946, + "step": 3676 + }, + { + "epoch": 2.62, + "grad_norm": 10.007379297576533, + "learning_rate": 2.7940123861699577e-06, + "loss": 0.2786, + "step": 3677 + }, + { + "epoch": 2.63, + "grad_norm": 8.593026440283587, + "learning_rate": 2.7914191056000147e-06, + "loss": 0.2473, + "step": 3678 + }, + { + "epoch": 2.63, + "grad_norm": 18.03444982249345, + "learning_rate": 2.788826562958e-06, + "loss": 0.2756, + "step": 3679 + }, + { + "epoch": 2.63, + "grad_norm": 18.91137019627999, + "learning_rate": 2.7862347591101326e-06, + "loss": 0.2871, + "step": 3680 + }, + { + "epoch": 2.63, + "grad_norm": 12.723301260051828, + "learning_rate": 2.7836436949223755e-06, + "loss": 0.2795, + "step": 3681 + }, + { + "epoch": 2.63, + "grad_norm": 9.113635889283595, + "learning_rate": 2.78105337126046e-06, + "loss": 0.2056, + "step": 3682 + }, + { + "epoch": 2.63, + "grad_norm": 16.904639312214186, + "learning_rate": 2.7784637889898534e-06, + "loss": 0.3232, + "step": 3683 + }, + { + "epoch": 2.63, + "grad_norm": 14.888350346451826, + "learning_rate": 2.7758749489757914e-06, + "loss": 0.3789, + "step": 3684 + }, + { + "epoch": 2.63, + "grad_norm": 10.763047444352006, + "learning_rate": 2.7732868520832455e-06, + "loss": 0.2673, + "step": 3685 + }, + { + "epoch": 2.63, + "grad_norm": 10.351563872031413, + "learning_rate": 2.770699499176954e-06, + "loss": 0.2411, + "step": 3686 + }, + { + "epoch": 2.63, + "grad_norm": 12.930545068368955, + "learning_rate": 2.768112891121394e-06, + "loss": 0.2139, + "step": 3687 + }, + { + "epoch": 2.63, + "grad_norm": 11.380131399436635, + "learning_rate": 2.7655270287808045e-06, + "loss": 0.2854, + "step": 3688 + }, + { + "epoch": 2.63, + "grad_norm": 12.686797655460788, + "learning_rate": 2.762941913019166e-06, + "loss": 0.2605, + "step": 3689 + }, + { + "epoch": 2.63, + "grad_norm": 17.096895929111895, + "learning_rate": 2.760357544700215e-06, + "loss": 0.3394, + "step": 3690 + }, + { + "epoch": 2.63, + "grad_norm": 10.428775038502973, + "learning_rate": 2.757773924687437e-06, + "loss": 0.3103, + "step": 3691 + }, + { + "epoch": 2.64, + "grad_norm": 14.336479750202862, + "learning_rate": 2.755191053844068e-06, + "loss": 0.3137, + "step": 3692 + }, + { + "epoch": 2.64, + "grad_norm": 14.401781691538487, + "learning_rate": 2.7526089330330925e-06, + "loss": 0.301, + "step": 3693 + }, + { + "epoch": 2.64, + "grad_norm": 12.653098093782502, + "learning_rate": 2.7500275631172455e-06, + "loss": 0.3079, + "step": 3694 + }, + { + "epoch": 2.64, + "grad_norm": 7.841260346341729, + "learning_rate": 2.74744694495901e-06, + "loss": 0.2393, + "step": 3695 + }, + { + "epoch": 2.64, + "grad_norm": 9.365598543542117, + "learning_rate": 2.74486707942062e-06, + "loss": 0.2288, + "step": 3696 + }, + { + "epoch": 2.64, + "grad_norm": 12.161027149177121, + "learning_rate": 2.7422879673640552e-06, + "loss": 0.2568, + "step": 3697 + }, + { + "epoch": 2.64, + "grad_norm": 12.575772535414341, + "learning_rate": 2.7397096096510467e-06, + "loss": 0.3198, + "step": 3698 + }, + { + "epoch": 2.64, + "grad_norm": 8.191135053850635, + "learning_rate": 2.7371320071430674e-06, + "loss": 0.183, + "step": 3699 + }, + { + "epoch": 2.64, + "grad_norm": 11.824506901967847, + "learning_rate": 2.7345551607013475e-06, + "loss": 0.2175, + "step": 3700 + }, + { + "epoch": 2.64, + "grad_norm": 11.666492088642991, + "learning_rate": 2.7319790711868545e-06, + "loss": 0.2837, + "step": 3701 + }, + { + "epoch": 2.64, + "grad_norm": 14.142683081101087, + "learning_rate": 2.7294037394603135e-06, + "loss": 0.3069, + "step": 3702 + }, + { + "epoch": 2.64, + "grad_norm": 9.323056173012265, + "learning_rate": 2.7268291663821825e-06, + "loss": 0.2463, + "step": 3703 + }, + { + "epoch": 2.64, + "grad_norm": 8.448469652485555, + "learning_rate": 2.7242553528126842e-06, + "loss": 0.261, + "step": 3704 + }, + { + "epoch": 2.64, + "grad_norm": 12.413334200894413, + "learning_rate": 2.72168229961177e-06, + "loss": 0.2285, + "step": 3705 + }, + { + "epoch": 2.65, + "grad_norm": 8.372411343210498, + "learning_rate": 2.7191100076391473e-06, + "loss": 0.28, + "step": 3706 + }, + { + "epoch": 2.65, + "grad_norm": 13.386689789949346, + "learning_rate": 2.716538477754266e-06, + "loss": 0.2786, + "step": 3707 + }, + { + "epoch": 2.65, + "grad_norm": 9.380636299671224, + "learning_rate": 2.713967710816323e-06, + "loss": 0.2209, + "step": 3708 + }, + { + "epoch": 2.65, + "grad_norm": 9.743026725229255, + "learning_rate": 2.7113977076842597e-06, + "loss": 0.262, + "step": 3709 + }, + { + "epoch": 2.65, + "grad_norm": 8.806331974095787, + "learning_rate": 2.7088284692167604e-06, + "loss": 0.2461, + "step": 3710 + }, + { + "epoch": 2.65, + "grad_norm": 10.668378133256628, + "learning_rate": 2.7062599962722563e-06, + "loss": 0.2358, + "step": 3711 + }, + { + "epoch": 2.65, + "grad_norm": 8.267929385459727, + "learning_rate": 2.703692289708922e-06, + "loss": 0.1868, + "step": 3712 + }, + { + "epoch": 2.65, + "grad_norm": 9.835870231063108, + "learning_rate": 2.701125350384676e-06, + "loss": 0.2524, + "step": 3713 + }, + { + "epoch": 2.65, + "grad_norm": 9.563910610196777, + "learning_rate": 2.69855917915718e-06, + "loss": 0.2437, + "step": 3714 + }, + { + "epoch": 2.65, + "grad_norm": 12.04255826697796, + "learning_rate": 2.695993776883839e-06, + "loss": 0.2261, + "step": 3715 + }, + { + "epoch": 2.65, + "grad_norm": 8.574915744949621, + "learning_rate": 2.693429144421803e-06, + "loss": 0.2065, + "step": 3716 + }, + { + "epoch": 2.65, + "grad_norm": 13.575572137934644, + "learning_rate": 2.6908652826279623e-06, + "loss": 0.3191, + "step": 3717 + }, + { + "epoch": 2.65, + "grad_norm": 11.921722163471122, + "learning_rate": 2.688302192358952e-06, + "loss": 0.2988, + "step": 3718 + }, + { + "epoch": 2.65, + "grad_norm": 14.103930678311931, + "learning_rate": 2.6857398744711472e-06, + "loss": 0.2549, + "step": 3719 + }, + { + "epoch": 2.66, + "grad_norm": 12.661523899831044, + "learning_rate": 2.683178329820666e-06, + "loss": 0.272, + "step": 3720 + }, + { + "epoch": 2.66, + "grad_norm": 20.55045890604639, + "learning_rate": 2.680617559263368e-06, + "loss": 0.4014, + "step": 3721 + }, + { + "epoch": 2.66, + "grad_norm": 10.731468401982053, + "learning_rate": 2.6780575636548544e-06, + "loss": 0.2571, + "step": 3722 + }, + { + "epoch": 2.66, + "grad_norm": 10.002086520928234, + "learning_rate": 2.67549834385047e-06, + "loss": 0.23, + "step": 3723 + }, + { + "epoch": 2.66, + "grad_norm": 17.74586382467539, + "learning_rate": 2.67293990070529e-06, + "loss": 0.2725, + "step": 3724 + }, + { + "epoch": 2.66, + "grad_norm": 15.370350233340924, + "learning_rate": 2.6703822350741483e-06, + "loss": 0.2493, + "step": 3725 + }, + { + "epoch": 2.66, + "grad_norm": 13.138445754547114, + "learning_rate": 2.6678253478116e-06, + "loss": 0.2695, + "step": 3726 + }, + { + "epoch": 2.66, + "grad_norm": 17.832773166121346, + "learning_rate": 2.665269239771953e-06, + "loss": 0.3164, + "step": 3727 + }, + { + "epoch": 2.66, + "grad_norm": 11.550195192135488, + "learning_rate": 2.662713911809248e-06, + "loss": 0.2651, + "step": 3728 + }, + { + "epoch": 2.66, + "grad_norm": 9.218101826282929, + "learning_rate": 2.6601593647772696e-06, + "loss": 0.2422, + "step": 3729 + }, + { + "epoch": 2.66, + "grad_norm": 7.894184013419036, + "learning_rate": 2.657605599529538e-06, + "loss": 0.2026, + "step": 3730 + }, + { + "epoch": 2.66, + "grad_norm": 19.05854846221324, + "learning_rate": 2.6550526169193148e-06, + "loss": 0.2878, + "step": 3731 + }, + { + "epoch": 2.66, + "grad_norm": 11.9197967302707, + "learning_rate": 2.6525004177995984e-06, + "loss": 0.2617, + "step": 3732 + }, + { + "epoch": 2.66, + "grad_norm": 10.975852184158727, + "learning_rate": 2.6499490030231255e-06, + "loss": 0.2622, + "step": 3733 + }, + { + "epoch": 2.67, + "grad_norm": 15.155629799506274, + "learning_rate": 2.6473983734423725e-06, + "loss": 0.3186, + "step": 3734 + }, + { + "epoch": 2.67, + "grad_norm": 11.758339565258634, + "learning_rate": 2.644848529909552e-06, + "loss": 0.2964, + "step": 3735 + }, + { + "epoch": 2.67, + "grad_norm": 7.913581401033196, + "learning_rate": 2.6422994732766124e-06, + "loss": 0.2395, + "step": 3736 + }, + { + "epoch": 2.67, + "grad_norm": 11.148566465671331, + "learning_rate": 2.6397512043952422e-06, + "loss": 0.2524, + "step": 3737 + }, + { + "epoch": 2.67, + "grad_norm": 11.657857821556732, + "learning_rate": 2.637203724116865e-06, + "loss": 0.3242, + "step": 3738 + }, + { + "epoch": 2.67, + "grad_norm": 10.957623757375671, + "learning_rate": 2.634657033292644e-06, + "loss": 0.2217, + "step": 3739 + }, + { + "epoch": 2.67, + "grad_norm": 13.508785111154575, + "learning_rate": 2.6321111327734693e-06, + "loss": 0.2539, + "step": 3740 + }, + { + "epoch": 2.67, + "grad_norm": 11.622247025051292, + "learning_rate": 2.6295660234099816e-06, + "loss": 0.291, + "step": 3741 + }, + { + "epoch": 2.67, + "grad_norm": 13.422136750776735, + "learning_rate": 2.6270217060525416e-06, + "loss": 0.2888, + "step": 3742 + }, + { + "epoch": 2.67, + "grad_norm": 7.53914884344272, + "learning_rate": 2.624478181551261e-06, + "loss": 0.2026, + "step": 3743 + }, + { + "epoch": 2.67, + "grad_norm": 10.498572267274076, + "learning_rate": 2.62193545075597e-06, + "loss": 0.2798, + "step": 3744 + }, + { + "epoch": 2.67, + "grad_norm": 15.79889320226917, + "learning_rate": 2.6193935145162507e-06, + "loss": 0.2163, + "step": 3745 + }, + { + "epoch": 2.67, + "grad_norm": 15.306685142228023, + "learning_rate": 2.6168523736814035e-06, + "loss": 0.2278, + "step": 3746 + }, + { + "epoch": 2.67, + "grad_norm": 12.577440943022701, + "learning_rate": 2.6143120291004785e-06, + "loss": 0.2603, + "step": 3747 + }, + { + "epoch": 2.68, + "grad_norm": 9.96685761518886, + "learning_rate": 2.611772481622246e-06, + "loss": 0.2502, + "step": 3748 + }, + { + "epoch": 2.68, + "grad_norm": 9.149397489215964, + "learning_rate": 2.609233732095218e-06, + "loss": 0.2128, + "step": 3749 + }, + { + "epoch": 2.68, + "grad_norm": 14.844697752164103, + "learning_rate": 2.6066957813676375e-06, + "loss": 0.2615, + "step": 3750 + }, + { + "epoch": 2.68, + "grad_norm": 19.15670169536182, + "learning_rate": 2.604158630287482e-06, + "loss": 0.3196, + "step": 3751 + }, + { + "epoch": 2.68, + "grad_norm": 8.279178479626223, + "learning_rate": 2.60162227970246e-06, + "loss": 0.2178, + "step": 3752 + }, + { + "epoch": 2.68, + "grad_norm": 11.751400189966994, + "learning_rate": 2.5990867304600136e-06, + "loss": 0.2583, + "step": 3753 + }, + { + "epoch": 2.68, + "grad_norm": 15.652516885894475, + "learning_rate": 2.5965519834073172e-06, + "loss": 0.3057, + "step": 3754 + }, + { + "epoch": 2.68, + "grad_norm": 19.546997677748575, + "learning_rate": 2.5940180393912767e-06, + "loss": 0.2573, + "step": 3755 + }, + { + "epoch": 2.68, + "grad_norm": 11.260684953345004, + "learning_rate": 2.5914848992585293e-06, + "loss": 0.2771, + "step": 3756 + }, + { + "epoch": 2.68, + "grad_norm": 13.383273870807141, + "learning_rate": 2.588952563855448e-06, + "loss": 0.2675, + "step": 3757 + }, + { + "epoch": 2.68, + "grad_norm": 25.01046644893153, + "learning_rate": 2.5864210340281247e-06, + "loss": 0.2979, + "step": 3758 + }, + { + "epoch": 2.68, + "grad_norm": 9.939204656676262, + "learning_rate": 2.5838903106224004e-06, + "loss": 0.2478, + "step": 3759 + }, + { + "epoch": 2.68, + "grad_norm": 12.320199686371675, + "learning_rate": 2.5813603944838283e-06, + "loss": 0.3015, + "step": 3760 + }, + { + "epoch": 2.68, + "grad_norm": 14.595923087132736, + "learning_rate": 2.578831286457708e-06, + "loss": 0.3175, + "step": 3761 + }, + { + "epoch": 2.69, + "grad_norm": 11.246086611925866, + "learning_rate": 2.5763029873890542e-06, + "loss": 0.2749, + "step": 3762 + }, + { + "epoch": 2.69, + "grad_norm": 12.986807659021308, + "learning_rate": 2.573775498122626e-06, + "loss": 0.2788, + "step": 3763 + }, + { + "epoch": 2.69, + "grad_norm": 15.310292571942231, + "learning_rate": 2.5712488195028972e-06, + "loss": 0.3462, + "step": 3764 + }, + { + "epoch": 2.69, + "grad_norm": 13.136729199970594, + "learning_rate": 2.5687229523740852e-06, + "loss": 0.282, + "step": 3765 + }, + { + "epoch": 2.69, + "grad_norm": 9.782342303010028, + "learning_rate": 2.566197897580124e-06, + "loss": 0.2458, + "step": 3766 + }, + { + "epoch": 2.69, + "grad_norm": 13.133256291376362, + "learning_rate": 2.5636736559646824e-06, + "loss": 0.2234, + "step": 3767 + }, + { + "epoch": 2.69, + "grad_norm": 12.590584644856651, + "learning_rate": 2.5611502283711576e-06, + "loss": 0.3142, + "step": 3768 + }, + { + "epoch": 2.69, + "grad_norm": 6.440635231131513, + "learning_rate": 2.5586276156426726e-06, + "loss": 0.2224, + "step": 3769 + }, + { + "epoch": 2.69, + "grad_norm": 16.245808625760535, + "learning_rate": 2.55610581862208e-06, + "loss": 0.4028, + "step": 3770 + }, + { + "epoch": 2.69, + "grad_norm": 10.723496311621378, + "learning_rate": 2.553584838151959e-06, + "loss": 0.2771, + "step": 3771 + }, + { + "epoch": 2.69, + "grad_norm": 10.5810352576066, + "learning_rate": 2.5510646750746154e-06, + "loss": 0.2427, + "step": 3772 + }, + { + "epoch": 2.69, + "grad_norm": 10.01714154859388, + "learning_rate": 2.548545330232083e-06, + "loss": 0.2751, + "step": 3773 + }, + { + "epoch": 2.69, + "grad_norm": 9.33044917355502, + "learning_rate": 2.5460268044661215e-06, + "loss": 0.2717, + "step": 3774 + }, + { + "epoch": 2.69, + "grad_norm": 8.953209080406706, + "learning_rate": 2.5435090986182176e-06, + "loss": 0.2373, + "step": 3775 + }, + { + "epoch": 2.7, + "grad_norm": 8.763927527803714, + "learning_rate": 2.5409922135295827e-06, + "loss": 0.2861, + "step": 3776 + }, + { + "epoch": 2.7, + "grad_norm": 11.902909129104998, + "learning_rate": 2.538476150041156e-06, + "loss": 0.2371, + "step": 3777 + }, + { + "epoch": 2.7, + "grad_norm": 11.972557490659907, + "learning_rate": 2.5359609089936006e-06, + "loss": 0.3052, + "step": 3778 + }, + { + "epoch": 2.7, + "grad_norm": 9.45909372522568, + "learning_rate": 2.533446491227305e-06, + "loss": 0.2371, + "step": 3779 + }, + { + "epoch": 2.7, + "grad_norm": 15.827756309288517, + "learning_rate": 2.5309328975823834e-06, + "loss": 0.2866, + "step": 3780 + }, + { + "epoch": 2.7, + "grad_norm": 11.749836878444942, + "learning_rate": 2.5284201288986744e-06, + "loss": 0.2866, + "step": 3781 + }, + { + "epoch": 2.7, + "grad_norm": 8.965182716060708, + "learning_rate": 2.5259081860157418e-06, + "loss": 0.2061, + "step": 3782 + }, + { + "epoch": 2.7, + "grad_norm": 24.07848339998907, + "learning_rate": 2.5233970697728673e-06, + "loss": 0.3752, + "step": 3783 + }, + { + "epoch": 2.7, + "grad_norm": 9.527044251899435, + "learning_rate": 2.520886781009068e-06, + "loss": 0.2346, + "step": 3784 + }, + { + "epoch": 2.7, + "grad_norm": 8.012392512661368, + "learning_rate": 2.5183773205630726e-06, + "loss": 0.1793, + "step": 3785 + }, + { + "epoch": 2.7, + "grad_norm": 25.177862422351314, + "learning_rate": 2.515868689273344e-06, + "loss": 0.3994, + "step": 3786 + }, + { + "epoch": 2.7, + "grad_norm": 12.33196589779338, + "learning_rate": 2.513360887978056e-06, + "loss": 0.3093, + "step": 3787 + }, + { + "epoch": 2.7, + "grad_norm": 11.253538907290611, + "learning_rate": 2.510853917515119e-06, + "loss": 0.2842, + "step": 3788 + }, + { + "epoch": 2.7, + "grad_norm": 8.249265295320773, + "learning_rate": 2.50834777872215e-06, + "loss": 0.2053, + "step": 3789 + }, + { + "epoch": 2.71, + "grad_norm": 10.582172247591428, + "learning_rate": 2.505842472436506e-06, + "loss": 0.2583, + "step": 3790 + }, + { + "epoch": 2.71, + "grad_norm": 10.66633784718915, + "learning_rate": 2.5033379994952493e-06, + "loss": 0.2407, + "step": 3791 + }, + { + "epoch": 2.71, + "grad_norm": 12.944642252674152, + "learning_rate": 2.5008343607351733e-06, + "loss": 0.2534, + "step": 3792 + }, + { + "epoch": 2.71, + "grad_norm": 12.936903818038365, + "learning_rate": 2.4983315569927895e-06, + "loss": 0.2915, + "step": 3793 + }, + { + "epoch": 2.71, + "grad_norm": 14.489379129954239, + "learning_rate": 2.495829589104333e-06, + "loss": 0.3008, + "step": 3794 + }, + { + "epoch": 2.71, + "grad_norm": 10.125082832585976, + "learning_rate": 2.493328457905755e-06, + "loss": 0.2649, + "step": 3795 + }, + { + "epoch": 2.71, + "grad_norm": 8.20842735541152, + "learning_rate": 2.490828164232732e-06, + "loss": 0.3149, + "step": 3796 + }, + { + "epoch": 2.71, + "grad_norm": 7.675386676649888, + "learning_rate": 2.4883287089206582e-06, + "loss": 0.1863, + "step": 3797 + }, + { + "epoch": 2.71, + "grad_norm": 20.742830149961485, + "learning_rate": 2.48583009280465e-06, + "loss": 0.3511, + "step": 3798 + }, + { + "epoch": 2.71, + "grad_norm": 14.230891927189097, + "learning_rate": 2.483332316719535e-06, + "loss": 0.2849, + "step": 3799 + }, + { + "epoch": 2.71, + "grad_norm": 10.848726525249605, + "learning_rate": 2.4808353814998747e-06, + "loss": 0.2275, + "step": 3800 + }, + { + "epoch": 2.71, + "grad_norm": 12.124014387257533, + "learning_rate": 2.4783392879799345e-06, + "loss": 0.2949, + "step": 3801 + }, + { + "epoch": 2.71, + "grad_norm": 9.7237259687238, + "learning_rate": 2.4758440369937125e-06, + "loss": 0.2478, + "step": 3802 + }, + { + "epoch": 2.71, + "grad_norm": 13.256139492796564, + "learning_rate": 2.4733496293749116e-06, + "loss": 0.2549, + "step": 3803 + }, + { + "epoch": 2.72, + "grad_norm": 11.982938440215118, + "learning_rate": 2.4708560659569665e-06, + "loss": 0.2588, + "step": 3804 + }, + { + "epoch": 2.72, + "grad_norm": 10.720889972293365, + "learning_rate": 2.4683633475730158e-06, + "loss": 0.2373, + "step": 3805 + }, + { + "epoch": 2.72, + "grad_norm": 12.082888203055472, + "learning_rate": 2.465871475055931e-06, + "loss": 0.2601, + "step": 3806 + }, + { + "epoch": 2.72, + "grad_norm": 11.744412266081433, + "learning_rate": 2.4633804492382866e-06, + "loss": 0.2532, + "step": 3807 + }, + { + "epoch": 2.72, + "grad_norm": 8.257941981624063, + "learning_rate": 2.460890270952383e-06, + "loss": 0.2229, + "step": 3808 + }, + { + "epoch": 2.72, + "grad_norm": 8.784405215041396, + "learning_rate": 2.4584009410302357e-06, + "loss": 0.2222, + "step": 3809 + }, + { + "epoch": 2.72, + "grad_norm": 11.388475216063645, + "learning_rate": 2.4559124603035744e-06, + "loss": 0.2717, + "step": 3810 + }, + { + "epoch": 2.72, + "grad_norm": 14.219406174751503, + "learning_rate": 2.4534248296038488e-06, + "loss": 0.2698, + "step": 3811 + }, + { + "epoch": 2.72, + "grad_norm": 6.6815938869164615, + "learning_rate": 2.4509380497622208e-06, + "loss": 0.22, + "step": 3812 + }, + { + "epoch": 2.72, + "grad_norm": 9.677189639592727, + "learning_rate": 2.448452121609571e-06, + "loss": 0.2183, + "step": 3813 + }, + { + "epoch": 2.72, + "grad_norm": 12.373482728475846, + "learning_rate": 2.445967045976493e-06, + "loss": 0.3013, + "step": 3814 + }, + { + "epoch": 2.72, + "grad_norm": 11.139596133312253, + "learning_rate": 2.443482823693298e-06, + "loss": 0.2468, + "step": 3815 + }, + { + "epoch": 2.72, + "grad_norm": 10.189247114208078, + "learning_rate": 2.4409994555900125e-06, + "loss": 0.2351, + "step": 3816 + }, + { + "epoch": 2.72, + "grad_norm": 12.972346774800624, + "learning_rate": 2.4385169424963696e-06, + "loss": 0.3157, + "step": 3817 + }, + { + "epoch": 2.73, + "grad_norm": 9.086435159510867, + "learning_rate": 2.4360352852418305e-06, + "loss": 0.2595, + "step": 3818 + }, + { + "epoch": 2.73, + "grad_norm": 9.164889104160295, + "learning_rate": 2.4335544846555564e-06, + "loss": 0.2026, + "step": 3819 + }, + { + "epoch": 2.73, + "grad_norm": 8.857166435097096, + "learning_rate": 2.431074541566436e-06, + "loss": 0.2142, + "step": 3820 + }, + { + "epoch": 2.73, + "grad_norm": 12.925741012441494, + "learning_rate": 2.4285954568030566e-06, + "loss": 0.3203, + "step": 3821 + }, + { + "epoch": 2.73, + "grad_norm": 16.356894972943223, + "learning_rate": 2.426117231193735e-06, + "loss": 0.2651, + "step": 3822 + }, + { + "epoch": 2.73, + "grad_norm": 10.0580568696073, + "learning_rate": 2.4236398655664834e-06, + "loss": 0.2673, + "step": 3823 + }, + { + "epoch": 2.73, + "grad_norm": 12.718078840336174, + "learning_rate": 2.4211633607490442e-06, + "loss": 0.3003, + "step": 3824 + }, + { + "epoch": 2.73, + "grad_norm": 10.376049611246835, + "learning_rate": 2.4186877175688576e-06, + "loss": 0.2839, + "step": 3825 + }, + { + "epoch": 2.73, + "grad_norm": 10.126606193237182, + "learning_rate": 2.4162129368530848e-06, + "loss": 0.2422, + "step": 3826 + }, + { + "epoch": 2.73, + "grad_norm": 15.206258398092292, + "learning_rate": 2.413739019428595e-06, + "loss": 0.2449, + "step": 3827 + }, + { + "epoch": 2.73, + "grad_norm": 9.679245970025766, + "learning_rate": 2.41126596612197e-06, + "loss": 0.2478, + "step": 3828 + }, + { + "epoch": 2.73, + "grad_norm": 7.765818992891482, + "learning_rate": 2.408793777759504e-06, + "loss": 0.2175, + "step": 3829 + }, + { + "epoch": 2.73, + "grad_norm": 7.619739347290504, + "learning_rate": 2.4063224551672e-06, + "loss": 0.1636, + "step": 3830 + }, + { + "epoch": 2.73, + "grad_norm": 8.46872304315626, + "learning_rate": 2.4038519991707725e-06, + "loss": 0.217, + "step": 3831 + }, + { + "epoch": 2.74, + "grad_norm": 12.19458341714486, + "learning_rate": 2.4013824105956483e-06, + "loss": 0.2549, + "step": 3832 + }, + { + "epoch": 2.74, + "grad_norm": 10.625624020708948, + "learning_rate": 2.3989136902669614e-06, + "loss": 0.2244, + "step": 3833 + }, + { + "epoch": 2.74, + "grad_norm": 12.101152390439431, + "learning_rate": 2.396445839009558e-06, + "loss": 0.2581, + "step": 3834 + }, + { + "epoch": 2.74, + "grad_norm": 10.164102827836336, + "learning_rate": 2.3939788576479926e-06, + "loss": 0.2493, + "step": 3835 + }, + { + "epoch": 2.74, + "grad_norm": 14.073936277817232, + "learning_rate": 2.39151274700653e-06, + "loss": 0.2141, + "step": 3836 + }, + { + "epoch": 2.74, + "grad_norm": 11.663235322211742, + "learning_rate": 2.389047507909143e-06, + "loss": 0.2766, + "step": 3837 + }, + { + "epoch": 2.74, + "grad_norm": 12.427826685566103, + "learning_rate": 2.3865831411795137e-06, + "loss": 0.2869, + "step": 3838 + }, + { + "epoch": 2.74, + "grad_norm": 9.972113366082914, + "learning_rate": 2.3841196476410337e-06, + "loss": 0.2744, + "step": 3839 + }, + { + "epoch": 2.74, + "grad_norm": 17.7061464713193, + "learning_rate": 2.3816570281168016e-06, + "loss": 0.2939, + "step": 3840 + }, + { + "epoch": 2.74, + "grad_norm": 12.571672824051713, + "learning_rate": 2.379195283429626e-06, + "loss": 0.2527, + "step": 3841 + }, + { + "epoch": 2.74, + "grad_norm": 12.099134977591973, + "learning_rate": 2.3767344144020164e-06, + "loss": 0.2732, + "step": 3842 + }, + { + "epoch": 2.74, + "grad_norm": 11.620004046032994, + "learning_rate": 2.374274421856202e-06, + "loss": 0.2632, + "step": 3843 + }, + { + "epoch": 2.74, + "grad_norm": 8.39076395500453, + "learning_rate": 2.371815306614104e-06, + "loss": 0.2305, + "step": 3844 + }, + { + "epoch": 2.74, + "grad_norm": 10.1166915100912, + "learning_rate": 2.3693570694973673e-06, + "loss": 0.2808, + "step": 3845 + }, + { + "epoch": 2.75, + "grad_norm": 9.41468021588428, + "learning_rate": 2.366899711327326e-06, + "loss": 0.1987, + "step": 3846 + }, + { + "epoch": 2.75, + "grad_norm": 11.880151286249157, + "learning_rate": 2.3644432329250374e-06, + "loss": 0.3016, + "step": 3847 + }, + { + "epoch": 2.75, + "grad_norm": 18.01327928861911, + "learning_rate": 2.3619876351112486e-06, + "loss": 0.2637, + "step": 3848 + }, + { + "epoch": 2.75, + "grad_norm": 12.736270873846344, + "learning_rate": 2.3595329187064282e-06, + "loss": 0.25, + "step": 3849 + }, + { + "epoch": 2.75, + "grad_norm": 9.835553229983626, + "learning_rate": 2.3570790845307367e-06, + "loss": 0.2292, + "step": 3850 + }, + { + "epoch": 2.75, + "grad_norm": 14.202387368493234, + "learning_rate": 2.3546261334040475e-06, + "loss": 0.2852, + "step": 3851 + }, + { + "epoch": 2.75, + "grad_norm": 16.411782936488706, + "learning_rate": 2.352174066145938e-06, + "loss": 0.3911, + "step": 3852 + }, + { + "epoch": 2.75, + "grad_norm": 11.67620624008918, + "learning_rate": 2.3497228835756887e-06, + "loss": 0.2145, + "step": 3853 + }, + { + "epoch": 2.75, + "grad_norm": 11.462214082378056, + "learning_rate": 2.3472725865122854e-06, + "loss": 0.2725, + "step": 3854 + }, + { + "epoch": 2.75, + "grad_norm": 13.87215540966945, + "learning_rate": 2.344823175774418e-06, + "loss": 0.2715, + "step": 3855 + }, + { + "epoch": 2.75, + "grad_norm": 9.18779316525026, + "learning_rate": 2.3423746521804796e-06, + "loss": 0.2561, + "step": 3856 + }, + { + "epoch": 2.75, + "grad_norm": 12.833742282866638, + "learning_rate": 2.339927016548568e-06, + "loss": 0.3223, + "step": 3857 + }, + { + "epoch": 2.75, + "grad_norm": 12.93936635239193, + "learning_rate": 2.3374802696964842e-06, + "loss": 0.3257, + "step": 3858 + }, + { + "epoch": 2.75, + "grad_norm": 6.703243223218935, + "learning_rate": 2.3350344124417336e-06, + "loss": 0.2383, + "step": 3859 + }, + { + "epoch": 2.76, + "grad_norm": 12.33796827383027, + "learning_rate": 2.3325894456015154e-06, + "loss": 0.2477, + "step": 3860 + }, + { + "epoch": 2.76, + "grad_norm": 10.236913282982604, + "learning_rate": 2.3301453699927477e-06, + "loss": 0.2988, + "step": 3861 + }, + { + "epoch": 2.76, + "grad_norm": 11.224723971064577, + "learning_rate": 2.3277021864320332e-06, + "loss": 0.2644, + "step": 3862 + }, + { + "epoch": 2.76, + "grad_norm": 12.688501246139568, + "learning_rate": 2.325259895735693e-06, + "loss": 0.335, + "step": 3863 + }, + { + "epoch": 2.76, + "grad_norm": 8.090819890854913, + "learning_rate": 2.322818498719734e-06, + "loss": 0.239, + "step": 3864 + }, + { + "epoch": 2.76, + "grad_norm": 9.057125126226383, + "learning_rate": 2.3203779961998795e-06, + "loss": 0.2053, + "step": 3865 + }, + { + "epoch": 2.76, + "grad_norm": 8.958169951613703, + "learning_rate": 2.317938388991541e-06, + "loss": 0.1948, + "step": 3866 + }, + { + "epoch": 2.76, + "grad_norm": 14.03811936060471, + "learning_rate": 2.3154996779098405e-06, + "loss": 0.3455, + "step": 3867 + }, + { + "epoch": 2.76, + "grad_norm": 10.677363035725163, + "learning_rate": 2.313061863769594e-06, + "loss": 0.271, + "step": 3868 + }, + { + "epoch": 2.76, + "grad_norm": 7.65190708066888, + "learning_rate": 2.310624947385322e-06, + "loss": 0.2233, + "step": 3869 + }, + { + "epoch": 2.76, + "grad_norm": 10.957316079617797, + "learning_rate": 2.3081889295712434e-06, + "loss": 0.2454, + "step": 3870 + }, + { + "epoch": 2.76, + "grad_norm": 14.538888305745177, + "learning_rate": 2.3057538111412765e-06, + "loss": 0.2864, + "step": 3871 + }, + { + "epoch": 2.76, + "grad_norm": 8.502555486177886, + "learning_rate": 2.3033195929090404e-06, + "loss": 0.2903, + "step": 3872 + }, + { + "epoch": 2.76, + "grad_norm": 10.599925868669361, + "learning_rate": 2.300886275687852e-06, + "loss": 0.2695, + "step": 3873 + }, + { + "epoch": 2.77, + "grad_norm": 8.740186546474655, + "learning_rate": 2.298453860290728e-06, + "loss": 0.2144, + "step": 3874 + }, + { + "epoch": 2.77, + "grad_norm": 11.863302566675813, + "learning_rate": 2.296022347530384e-06, + "loss": 0.229, + "step": 3875 + }, + { + "epoch": 2.77, + "grad_norm": 11.423946125684537, + "learning_rate": 2.293591738219233e-06, + "loss": 0.2622, + "step": 3876 + }, + { + "epoch": 2.77, + "grad_norm": 9.70179065557097, + "learning_rate": 2.2911620331693867e-06, + "loss": 0.2466, + "step": 3877 + }, + { + "epoch": 2.77, + "grad_norm": 10.731816493491, + "learning_rate": 2.2887332331926555e-06, + "loss": 0.2634, + "step": 3878 + }, + { + "epoch": 2.77, + "grad_norm": 9.239045588894083, + "learning_rate": 2.2863053391005462e-06, + "loss": 0.2148, + "step": 3879 + }, + { + "epoch": 2.77, + "grad_norm": 9.005638078267589, + "learning_rate": 2.2838783517042628e-06, + "loss": 0.2544, + "step": 3880 + }, + { + "epoch": 2.77, + "grad_norm": 13.100939806282982, + "learning_rate": 2.281452271814708e-06, + "loss": 0.3584, + "step": 3881 + }, + { + "epoch": 2.77, + "grad_norm": 12.093234947012963, + "learning_rate": 2.2790271002424794e-06, + "loss": 0.293, + "step": 3882 + }, + { + "epoch": 2.77, + "grad_norm": 8.795211245967684, + "learning_rate": 2.276602837797872e-06, + "loss": 0.2092, + "step": 3883 + }, + { + "epoch": 2.77, + "grad_norm": 13.552937714867474, + "learning_rate": 2.274179485290879e-06, + "loss": 0.3291, + "step": 3884 + }, + { + "epoch": 2.77, + "grad_norm": 8.991506238164844, + "learning_rate": 2.271757043531184e-06, + "loss": 0.2747, + "step": 3885 + }, + { + "epoch": 2.77, + "grad_norm": 12.652415067342714, + "learning_rate": 2.2693355133281706e-06, + "loss": 0.2778, + "step": 3886 + }, + { + "epoch": 2.77, + "grad_norm": 8.559659679329272, + "learning_rate": 2.266914895490918e-06, + "loss": 0.249, + "step": 3887 + }, + { + "epoch": 2.78, + "grad_norm": 11.410463329166966, + "learning_rate": 2.2644951908282e-06, + "loss": 0.2316, + "step": 3888 + }, + { + "epoch": 2.78, + "grad_norm": 25.89091889726885, + "learning_rate": 2.262076400148484e-06, + "loss": 0.4292, + "step": 3889 + }, + { + "epoch": 2.78, + "grad_norm": 15.884730752793354, + "learning_rate": 2.2596585242599333e-06, + "loss": 0.3232, + "step": 3890 + }, + { + "epoch": 2.78, + "grad_norm": 10.28933616549401, + "learning_rate": 2.257241563970405e-06, + "loss": 0.2344, + "step": 3891 + }, + { + "epoch": 2.78, + "grad_norm": 11.385456564823834, + "learning_rate": 2.254825520087451e-06, + "loss": 0.2449, + "step": 3892 + }, + { + "epoch": 2.78, + "grad_norm": 9.305077098961187, + "learning_rate": 2.2524103934183154e-06, + "loss": 0.3003, + "step": 3893 + }, + { + "epoch": 2.78, + "grad_norm": 15.773934697174711, + "learning_rate": 2.249996184769938e-06, + "loss": 0.2681, + "step": 3894 + }, + { + "epoch": 2.78, + "grad_norm": 20.217611301648915, + "learning_rate": 2.2475828949489504e-06, + "loss": 0.353, + "step": 3895 + }, + { + "epoch": 2.78, + "grad_norm": 11.849058612686495, + "learning_rate": 2.2451705247616774e-06, + "loss": 0.2488, + "step": 3896 + }, + { + "epoch": 2.78, + "grad_norm": 13.548086863537335, + "learning_rate": 2.2427590750141364e-06, + "loss": 0.3018, + "step": 3897 + }, + { + "epoch": 2.78, + "grad_norm": 9.09548543829281, + "learning_rate": 2.240348546512039e-06, + "loss": 0.2832, + "step": 3898 + }, + { + "epoch": 2.78, + "grad_norm": 12.482779249189054, + "learning_rate": 2.237938940060786e-06, + "loss": 0.2422, + "step": 3899 + }, + { + "epoch": 2.78, + "grad_norm": 10.515407607343274, + "learning_rate": 2.235530256465474e-06, + "loss": 0.2533, + "step": 3900 + }, + { + "epoch": 2.78, + "grad_norm": 9.136175867369973, + "learning_rate": 2.233122496530884e-06, + "loss": 0.2546, + "step": 3901 + }, + { + "epoch": 2.79, + "grad_norm": 11.726217970795506, + "learning_rate": 2.2307156610615e-06, + "loss": 0.2715, + "step": 3902 + }, + { + "epoch": 2.79, + "grad_norm": 8.768133881503765, + "learning_rate": 2.2283097508614837e-06, + "loss": 0.2378, + "step": 3903 + }, + { + "epoch": 2.79, + "grad_norm": 8.681822646089104, + "learning_rate": 2.225904766734702e-06, + "loss": 0.2346, + "step": 3904 + }, + { + "epoch": 2.79, + "grad_norm": 11.275082284621488, + "learning_rate": 2.2235007094846963e-06, + "loss": 0.2439, + "step": 3905 + }, + { + "epoch": 2.79, + "grad_norm": 14.454333835161174, + "learning_rate": 2.2210975799147143e-06, + "loss": 0.2776, + "step": 3906 + }, + { + "epoch": 2.79, + "grad_norm": 9.102927596290863, + "learning_rate": 2.21869537882768e-06, + "loss": 0.2295, + "step": 3907 + }, + { + "epoch": 2.79, + "grad_norm": 9.406469054773485, + "learning_rate": 2.21629410702622e-06, + "loss": 0.2034, + "step": 3908 + }, + { + "epoch": 2.79, + "grad_norm": 6.923690320426484, + "learning_rate": 2.2138937653126393e-06, + "loss": 0.2527, + "step": 3909 + }, + { + "epoch": 2.79, + "grad_norm": 8.679522304935842, + "learning_rate": 2.2114943544889366e-06, + "loss": 0.2437, + "step": 3910 + }, + { + "epoch": 2.79, + "grad_norm": 7.2130763130063285, + "learning_rate": 2.2090958753568013e-06, + "loss": 0.2241, + "step": 3911 + }, + { + "epoch": 2.79, + "grad_norm": 21.126425833535844, + "learning_rate": 2.206698328717609e-06, + "loss": 0.321, + "step": 3912 + }, + { + "epoch": 2.79, + "grad_norm": 10.088147801041128, + "learning_rate": 2.2043017153724253e-06, + "loss": 0.228, + "step": 3913 + }, + { + "epoch": 2.79, + "grad_norm": 9.08748121795016, + "learning_rate": 2.2019060361220036e-06, + "loss": 0.249, + "step": 3914 + }, + { + "epoch": 2.79, + "grad_norm": 24.755148260939105, + "learning_rate": 2.199511291766783e-06, + "loss": 0.2686, + "step": 3915 + }, + { + "epoch": 2.8, + "grad_norm": 12.238612933720066, + "learning_rate": 2.1971174831068944e-06, + "loss": 0.2566, + "step": 3916 + }, + { + "epoch": 2.8, + "grad_norm": 16.062718282153504, + "learning_rate": 2.1947246109421514e-06, + "loss": 0.322, + "step": 3917 + }, + { + "epoch": 2.8, + "grad_norm": 13.06127849954014, + "learning_rate": 2.192332676072061e-06, + "loss": 0.3306, + "step": 3918 + }, + { + "epoch": 2.8, + "grad_norm": 10.283758870531178, + "learning_rate": 2.189941679295807e-06, + "loss": 0.2327, + "step": 3919 + }, + { + "epoch": 2.8, + "grad_norm": 14.062313929604555, + "learning_rate": 2.1875516214122723e-06, + "loss": 0.2507, + "step": 3920 + }, + { + "epoch": 2.8, + "grad_norm": 11.090141149972927, + "learning_rate": 2.185162503220013e-06, + "loss": 0.2285, + "step": 3921 + }, + { + "epoch": 2.8, + "grad_norm": 10.824756292791596, + "learning_rate": 2.182774325517285e-06, + "loss": 0.2534, + "step": 3922 + }, + { + "epoch": 2.8, + "grad_norm": 10.320492406815534, + "learning_rate": 2.180387089102016e-06, + "loss": 0.2246, + "step": 3923 + }, + { + "epoch": 2.8, + "grad_norm": 7.596778425586805, + "learning_rate": 2.1780007947718336e-06, + "loss": 0.1755, + "step": 3924 + }, + { + "epoch": 2.8, + "grad_norm": 19.455925151353494, + "learning_rate": 2.175615443324035e-06, + "loss": 0.3096, + "step": 3925 + }, + { + "epoch": 2.8, + "grad_norm": 13.420610683613315, + "learning_rate": 2.173231035555618e-06, + "loss": 0.2738, + "step": 3926 + }, + { + "epoch": 2.8, + "grad_norm": 12.592246153137603, + "learning_rate": 2.170847572263252e-06, + "loss": 0.2576, + "step": 3927 + }, + { + "epoch": 2.8, + "grad_norm": 15.101309193662187, + "learning_rate": 2.1684650542432985e-06, + "loss": 0.2688, + "step": 3928 + }, + { + "epoch": 2.8, + "grad_norm": 8.79780471508127, + "learning_rate": 2.166083482291801e-06, + "loss": 0.2893, + "step": 3929 + }, + { + "epoch": 2.81, + "grad_norm": 11.65657764794894, + "learning_rate": 2.1637028572044867e-06, + "loss": 0.2734, + "step": 3930 + }, + { + "epoch": 2.81, + "grad_norm": 14.391962577247151, + "learning_rate": 2.1613231797767668e-06, + "loss": 0.2554, + "step": 3931 + }, + { + "epoch": 2.81, + "grad_norm": 16.977331007596224, + "learning_rate": 2.158944450803736e-06, + "loss": 0.2844, + "step": 3932 + }, + { + "epoch": 2.81, + "grad_norm": 10.01757401617662, + "learning_rate": 2.1565666710801714e-06, + "loss": 0.271, + "step": 3933 + }, + { + "epoch": 2.81, + "grad_norm": 13.615886862288278, + "learning_rate": 2.1541898414005343e-06, + "loss": 0.2778, + "step": 3934 + }, + { + "epoch": 2.81, + "grad_norm": 13.59283742513515, + "learning_rate": 2.1518139625589663e-06, + "loss": 0.2664, + "step": 3935 + }, + { + "epoch": 2.81, + "grad_norm": 11.502287160025258, + "learning_rate": 2.1494390353492935e-06, + "loss": 0.3418, + "step": 3936 + }, + { + "epoch": 2.81, + "grad_norm": 7.93143738046724, + "learning_rate": 2.1470650605650235e-06, + "loss": 0.2114, + "step": 3937 + }, + { + "epoch": 2.81, + "grad_norm": 16.164621540582115, + "learning_rate": 2.144692038999345e-06, + "loss": 0.2739, + "step": 3938 + }, + { + "epoch": 2.81, + "grad_norm": 12.845701204985016, + "learning_rate": 2.142319971445129e-06, + "loss": 0.3232, + "step": 3939 + }, + { + "epoch": 2.81, + "grad_norm": 10.959776190394157, + "learning_rate": 2.139948858694926e-06, + "loss": 0.2891, + "step": 3940 + }, + { + "epoch": 2.81, + "grad_norm": 7.79971040797762, + "learning_rate": 2.137578701540971e-06, + "loss": 0.2053, + "step": 3941 + }, + { + "epoch": 2.81, + "grad_norm": 14.638963180765387, + "learning_rate": 2.1352095007751754e-06, + "loss": 0.3005, + "step": 3942 + }, + { + "epoch": 2.81, + "grad_norm": 43.00522394014673, + "learning_rate": 2.132841257189137e-06, + "loss": 0.4937, + "step": 3943 + }, + { + "epoch": 2.82, + "grad_norm": 10.985232967151337, + "learning_rate": 2.1304739715741235e-06, + "loss": 0.3257, + "step": 3944 + }, + { + "epoch": 2.82, + "grad_norm": 8.283080793744645, + "learning_rate": 2.128107644721096e-06, + "loss": 0.1897, + "step": 3945 + }, + { + "epoch": 2.82, + "grad_norm": 11.552268713806683, + "learning_rate": 2.1257422774206816e-06, + "loss": 0.2751, + "step": 3946 + }, + { + "epoch": 2.82, + "grad_norm": 14.25244065240587, + "learning_rate": 2.1233778704632002e-06, + "loss": 0.2583, + "step": 3947 + }, + { + "epoch": 2.82, + "grad_norm": 28.46181807821369, + "learning_rate": 2.1210144246386378e-06, + "loss": 0.4048, + "step": 3948 + }, + { + "epoch": 2.82, + "grad_norm": 13.599125276057082, + "learning_rate": 2.1186519407366725e-06, + "loss": 0.2998, + "step": 3949 + }, + { + "epoch": 2.82, + "grad_norm": 18.69833834877681, + "learning_rate": 2.1162904195466455e-06, + "loss": 0.2974, + "step": 3950 + }, + { + "epoch": 2.82, + "grad_norm": 9.782064797336112, + "learning_rate": 2.113929861857594e-06, + "loss": 0.2415, + "step": 3951 + }, + { + "epoch": 2.82, + "grad_norm": 9.05525476633752, + "learning_rate": 2.1115702684582177e-06, + "loss": 0.2354, + "step": 3952 + }, + { + "epoch": 2.82, + "grad_norm": 8.488998620156735, + "learning_rate": 2.1092116401369033e-06, + "loss": 0.2205, + "step": 3953 + }, + { + "epoch": 2.82, + "grad_norm": 7.6604425215527465, + "learning_rate": 2.1068539776817115e-06, + "loss": 0.1997, + "step": 3954 + }, + { + "epoch": 2.82, + "grad_norm": 10.936366716585368, + "learning_rate": 2.1044972818803816e-06, + "loss": 0.2666, + "step": 3955 + }, + { + "epoch": 2.82, + "grad_norm": 13.919791218319379, + "learning_rate": 2.1021415535203294e-06, + "loss": 0.2935, + "step": 3956 + }, + { + "epoch": 2.82, + "grad_norm": 12.83542796098615, + "learning_rate": 2.0997867933886467e-06, + "loss": 0.2839, + "step": 3957 + }, + { + "epoch": 2.83, + "grad_norm": 11.346017326459489, + "learning_rate": 2.0974330022721044e-06, + "loss": 0.3179, + "step": 3958 + }, + { + "epoch": 2.83, + "grad_norm": 11.944822762505321, + "learning_rate": 2.0950801809571466e-06, + "loss": 0.2749, + "step": 3959 + }, + { + "epoch": 2.83, + "grad_norm": 11.391359360322923, + "learning_rate": 2.0927283302298944e-06, + "loss": 0.303, + "step": 3960 + }, + { + "epoch": 2.83, + "grad_norm": 7.032353529389161, + "learning_rate": 2.0903774508761477e-06, + "loss": 0.2107, + "step": 3961 + }, + { + "epoch": 2.83, + "grad_norm": 6.485677725607823, + "learning_rate": 2.0880275436813726e-06, + "loss": 0.1442, + "step": 3962 + }, + { + "epoch": 2.83, + "grad_norm": 13.077591332146431, + "learning_rate": 2.0856786094307247e-06, + "loss": 0.2629, + "step": 3963 + }, + { + "epoch": 2.83, + "grad_norm": 11.819852695103354, + "learning_rate": 2.0833306489090186e-06, + "loss": 0.356, + "step": 3964 + }, + { + "epoch": 2.83, + "grad_norm": 9.563988233532719, + "learning_rate": 2.08098366290076e-06, + "loss": 0.2456, + "step": 3965 + }, + { + "epoch": 2.83, + "grad_norm": 8.809406976811916, + "learning_rate": 2.078637652190112e-06, + "loss": 0.2141, + "step": 3966 + }, + { + "epoch": 2.83, + "grad_norm": 10.610564298049212, + "learning_rate": 2.0762926175609287e-06, + "loss": 0.2444, + "step": 3967 + }, + { + "epoch": 2.83, + "grad_norm": 13.367480219472963, + "learning_rate": 2.0739485597967237e-06, + "loss": 0.3579, + "step": 3968 + }, + { + "epoch": 2.83, + "grad_norm": 9.40280542559146, + "learning_rate": 2.0716054796806916e-06, + "loss": 0.24, + "step": 3969 + }, + { + "epoch": 2.83, + "grad_norm": 18.096437879516504, + "learning_rate": 2.0692633779956998e-06, + "loss": 0.2996, + "step": 3970 + }, + { + "epoch": 2.83, + "grad_norm": 11.01349355918468, + "learning_rate": 2.0669222555242884e-06, + "loss": 0.3135, + "step": 3971 + }, + { + "epoch": 2.84, + "grad_norm": 11.179708500485882, + "learning_rate": 2.064582113048669e-06, + "loss": 0.2285, + "step": 3972 + }, + { + "epoch": 2.84, + "grad_norm": 9.404710632894906, + "learning_rate": 2.0622429513507275e-06, + "loss": 0.2913, + "step": 3973 + }, + { + "epoch": 2.84, + "grad_norm": 11.974891410309862, + "learning_rate": 2.05990477121202e-06, + "loss": 0.3313, + "step": 3974 + }, + { + "epoch": 2.84, + "grad_norm": 10.45063399330934, + "learning_rate": 2.0575675734137773e-06, + "loss": 0.2144, + "step": 3975 + }, + { + "epoch": 2.84, + "grad_norm": 11.698967473996044, + "learning_rate": 2.0552313587369003e-06, + "loss": 0.2664, + "step": 3976 + }, + { + "epoch": 2.84, + "grad_norm": 12.174398856640751, + "learning_rate": 2.052896127961963e-06, + "loss": 0.2932, + "step": 3977 + }, + { + "epoch": 2.84, + "grad_norm": 15.769798231540381, + "learning_rate": 2.050561881869205e-06, + "loss": 0.2498, + "step": 3978 + }, + { + "epoch": 2.84, + "grad_norm": 10.068055612015428, + "learning_rate": 2.048228621238547e-06, + "loss": 0.2527, + "step": 3979 + }, + { + "epoch": 2.84, + "grad_norm": 9.886413081007158, + "learning_rate": 2.0458963468495692e-06, + "loss": 0.2693, + "step": 3980 + }, + { + "epoch": 2.84, + "grad_norm": 8.972122538335825, + "learning_rate": 2.0435650594815338e-06, + "loss": 0.179, + "step": 3981 + }, + { + "epoch": 2.84, + "grad_norm": 16.441213776975122, + "learning_rate": 2.0412347599133607e-06, + "loss": 0.2732, + "step": 3982 + }, + { + "epoch": 2.84, + "grad_norm": 12.836617592039206, + "learning_rate": 2.0389054489236534e-06, + "loss": 0.2695, + "step": 3983 + }, + { + "epoch": 2.84, + "grad_norm": 11.202241722151822, + "learning_rate": 2.03657712729067e-06, + "loss": 0.2607, + "step": 3984 + }, + { + "epoch": 2.84, + "grad_norm": 8.254407875075486, + "learning_rate": 2.034249795792355e-06, + "loss": 0.177, + "step": 3985 + }, + { + "epoch": 2.85, + "grad_norm": 10.31910673688725, + "learning_rate": 2.031923455206306e-06, + "loss": 0.2949, + "step": 3986 + }, + { + "epoch": 2.85, + "grad_norm": 17.84033377167254, + "learning_rate": 2.0295981063098e-06, + "loss": 0.3264, + "step": 3987 + }, + { + "epoch": 2.85, + "grad_norm": 10.599633875942123, + "learning_rate": 2.027273749879777e-06, + "loss": 0.2413, + "step": 3988 + }, + { + "epoch": 2.85, + "grad_norm": 10.28616607550584, + "learning_rate": 2.02495038669285e-06, + "loss": 0.2134, + "step": 3989 + }, + { + "epoch": 2.85, + "grad_norm": 11.91463101296468, + "learning_rate": 2.0226280175252966e-06, + "loss": 0.2627, + "step": 3990 + }, + { + "epoch": 2.85, + "grad_norm": 11.012332249144231, + "learning_rate": 2.020306643153063e-06, + "loss": 0.2666, + "step": 3991 + }, + { + "epoch": 2.85, + "grad_norm": 16.681187563001785, + "learning_rate": 2.0179862643517657e-06, + "loss": 0.3252, + "step": 3992 + }, + { + "epoch": 2.85, + "grad_norm": 9.817386411445645, + "learning_rate": 2.015666881896684e-06, + "loss": 0.2441, + "step": 3993 + }, + { + "epoch": 2.85, + "grad_norm": 11.064995665760662, + "learning_rate": 2.0133484965627683e-06, + "loss": 0.2886, + "step": 3994 + }, + { + "epoch": 2.85, + "grad_norm": 14.865028740128867, + "learning_rate": 2.0110311091246333e-06, + "loss": 0.3228, + "step": 3995 + }, + { + "epoch": 2.85, + "grad_norm": 10.764784698091306, + "learning_rate": 2.0087147203565614e-06, + "loss": 0.2949, + "step": 3996 + }, + { + "epoch": 2.85, + "grad_norm": 10.094254980125088, + "learning_rate": 2.0063993310325013e-06, + "loss": 0.217, + "step": 3997 + }, + { + "epoch": 2.85, + "grad_norm": 9.135626121609093, + "learning_rate": 2.0040849419260682e-06, + "loss": 0.251, + "step": 3998 + }, + { + "epoch": 2.85, + "grad_norm": 18.33339179679279, + "learning_rate": 2.0017715538105416e-06, + "loss": 0.2485, + "step": 3999 + }, + { + "epoch": 2.86, + "grad_norm": 16.79996387979217, + "learning_rate": 1.9994591674588677e-06, + "loss": 0.2537, + "step": 4000 + }, + { + "epoch": 2.86, + "eval_avg_AUC": 0.7879958586606428, + "eval_avg_Accuracy": 0.6990218832891246, + "eval_avg_Accuracy-right": 0.8709403938959176, + "eval_avg_Accuracy-wrong": 0.3992494882874687, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6623938351746894, + "eval_last_AUC": 0.8070917433092194, + "eval_last_Accuracy": 0.7334631962864722, + "eval_last_Accuracy-right": 0.8252902047737055, + "eval_last_Accuracy-wrong": 0.5733454628155561, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6848945312092272, + "eval_max_AUC": 0.7616910032003927, + "eval_max_Accuracy": 0.6416611405835544, + "eval_max_Accuracy-right": 0.9620451284726751, + "eval_max_Accuracy-wrong": 0.08301114396179213, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6150202437158859, + "eval_min_AUC": 0.7931598852081432, + "eval_min_Accuracy": 0.720407824933687, + "eval_min_Accuracy-right": 0.7332072518586148, + "eval_min_Accuracy-wrong": 0.6980896065499204, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6685353032201301, + "eval_prod_AUC": 0.7945893958373447, + "eval_prod_Accuracy": 0.702420424403183, + "eval_prod_Accuracy-right": 0.6255380200860832, + "eval_prod_Accuracy-wrong": 0.8364794177848534, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6701653120639736, + "eval_runtime": 247.3512, + "eval_samples_per_second": 97.545, + "eval_steps_per_second": 3.048, + "eval_sum_AUC": 0.6614690230562896, + "eval_sum_Accuracy": 0.6374336870026526, + "eval_sum_Accuracy-right": 0.9853919394808921, + "eval_sum_Accuracy-wrong": 0.030702751876279282, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6497230981517113, + "step": 4000 + }, + { + "epoch": 2.86, + "grad_norm": 20.294863167005364, + "learning_rate": 1.9971477836436575e-06, + "loss": 0.3467, + "step": 4001 + }, + { + "epoch": 2.86, + "grad_norm": 10.327232865174512, + "learning_rate": 1.99483740313719e-06, + "loss": 0.2378, + "step": 4002 + }, + { + "epoch": 2.86, + "grad_norm": 11.22041031697138, + "learning_rate": 1.9925280267114e-06, + "loss": 0.248, + "step": 4003 + }, + { + "epoch": 2.86, + "grad_norm": 11.21213166626409, + "learning_rate": 1.9902196551379006e-06, + "loss": 0.2129, + "step": 4004 + }, + { + "epoch": 2.86, + "grad_norm": 13.610468088544783, + "learning_rate": 1.987912289187954e-06, + "loss": 0.2856, + "step": 4005 + }, + { + "epoch": 2.86, + "grad_norm": 12.763270070414201, + "learning_rate": 1.9856059296325027e-06, + "loss": 0.3086, + "step": 4006 + }, + { + "epoch": 2.86, + "grad_norm": 9.34977464133706, + "learning_rate": 1.9833005772421354e-06, + "loss": 0.1912, + "step": 4007 + }, + { + "epoch": 2.86, + "grad_norm": 9.99525600692564, + "learning_rate": 1.980996232787121e-06, + "loss": 0.2854, + "step": 4008 + }, + { + "epoch": 2.86, + "grad_norm": 8.482697059629118, + "learning_rate": 1.978692897037377e-06, + "loss": 0.2537, + "step": 4009 + }, + { + "epoch": 2.86, + "grad_norm": 11.88855064417953, + "learning_rate": 1.9763905707624975e-06, + "loss": 0.2056, + "step": 4010 + }, + { + "epoch": 2.86, + "grad_norm": 12.515119687094433, + "learning_rate": 1.974089254731727e-06, + "loss": 0.3047, + "step": 4011 + }, + { + "epoch": 2.86, + "grad_norm": 10.460042983380315, + "learning_rate": 1.97178894971398e-06, + "loss": 0.1714, + "step": 4012 + }, + { + "epoch": 2.86, + "grad_norm": 11.000891418019652, + "learning_rate": 1.9694896564778317e-06, + "loss": 0.2881, + "step": 4013 + }, + { + "epoch": 2.87, + "grad_norm": 10.457581961485896, + "learning_rate": 1.9671913757915173e-06, + "loss": 0.1953, + "step": 4014 + }, + { + "epoch": 2.87, + "grad_norm": 13.329358437695408, + "learning_rate": 1.964894108422936e-06, + "loss": 0.2489, + "step": 4015 + }, + { + "epoch": 2.87, + "grad_norm": 10.507027263953846, + "learning_rate": 1.962597855139648e-06, + "loss": 0.2153, + "step": 4016 + }, + { + "epoch": 2.87, + "grad_norm": 13.10099282908348, + "learning_rate": 1.960302616708873e-06, + "loss": 0.2883, + "step": 4017 + }, + { + "epoch": 2.87, + "grad_norm": 8.480233105548061, + "learning_rate": 1.9580083938974937e-06, + "loss": 0.223, + "step": 4018 + }, + { + "epoch": 2.87, + "grad_norm": 12.559695185275773, + "learning_rate": 1.9557151874720526e-06, + "loss": 0.2325, + "step": 4019 + }, + { + "epoch": 2.87, + "grad_norm": 13.141810003892173, + "learning_rate": 1.953422998198754e-06, + "loss": 0.1979, + "step": 4020 + }, + { + "epoch": 2.87, + "grad_norm": 14.982944575808363, + "learning_rate": 1.9511318268434554e-06, + "loss": 0.3599, + "step": 4021 + }, + { + "epoch": 2.87, + "grad_norm": 11.211224106691908, + "learning_rate": 1.9488416741716877e-06, + "loss": 0.2395, + "step": 4022 + }, + { + "epoch": 2.87, + "grad_norm": 8.569449118407048, + "learning_rate": 1.946552540948625e-06, + "loss": 0.2422, + "step": 4023 + }, + { + "epoch": 2.87, + "grad_norm": 13.650414366436912, + "learning_rate": 1.944264427939118e-06, + "loss": 0.3, + "step": 4024 + }, + { + "epoch": 2.87, + "grad_norm": 7.739566513501393, + "learning_rate": 1.941977335907659e-06, + "loss": 0.1785, + "step": 4025 + }, + { + "epoch": 2.87, + "grad_norm": 18.6977439093024, + "learning_rate": 1.939691265618417e-06, + "loss": 0.3027, + "step": 4026 + }, + { + "epoch": 2.87, + "grad_norm": 9.915637048581093, + "learning_rate": 1.9374062178352036e-06, + "loss": 0.2341, + "step": 4027 + }, + { + "epoch": 2.88, + "grad_norm": 9.897353050548803, + "learning_rate": 1.935122193321499e-06, + "loss": 0.2625, + "step": 4028 + }, + { + "epoch": 2.88, + "grad_norm": 16.08317279527024, + "learning_rate": 1.932839192840436e-06, + "loss": 0.2683, + "step": 4029 + }, + { + "epoch": 2.88, + "grad_norm": 12.704675205411846, + "learning_rate": 1.930557217154809e-06, + "loss": 0.2295, + "step": 4030 + }, + { + "epoch": 2.88, + "grad_norm": 8.351026891672202, + "learning_rate": 1.9282762670270693e-06, + "loss": 0.2058, + "step": 4031 + }, + { + "epoch": 2.88, + "grad_norm": 11.34895226658225, + "learning_rate": 1.925996343219323e-06, + "loss": 0.3765, + "step": 4032 + }, + { + "epoch": 2.88, + "grad_norm": 11.597504826581677, + "learning_rate": 1.923717446493336e-06, + "loss": 0.3281, + "step": 4033 + }, + { + "epoch": 2.88, + "grad_norm": 11.653477339408518, + "learning_rate": 1.9214395776105297e-06, + "loss": 0.2778, + "step": 4034 + }, + { + "epoch": 2.88, + "grad_norm": 8.944620053801449, + "learning_rate": 1.919162737331983e-06, + "loss": 0.2278, + "step": 4035 + }, + { + "epoch": 2.88, + "grad_norm": 17.550617411583413, + "learning_rate": 1.9168869264184296e-06, + "loss": 0.2656, + "step": 4036 + }, + { + "epoch": 2.88, + "grad_norm": 11.945812481491696, + "learning_rate": 1.9146121456302613e-06, + "loss": 0.2358, + "step": 4037 + }, + { + "epoch": 2.88, + "grad_norm": 11.422504829133128, + "learning_rate": 1.9123383957275237e-06, + "loss": 0.2812, + "step": 4038 + }, + { + "epoch": 2.88, + "grad_norm": 17.311605530516562, + "learning_rate": 1.91006567746992e-06, + "loss": 0.2559, + "step": 4039 + }, + { + "epoch": 2.88, + "grad_norm": 11.786074439406534, + "learning_rate": 1.907793991616806e-06, + "loss": 0.2878, + "step": 4040 + }, + { + "epoch": 2.88, + "grad_norm": 9.6922862971148, + "learning_rate": 1.9055233389271955e-06, + "loss": 0.2671, + "step": 4041 + }, + { + "epoch": 2.89, + "grad_norm": 10.628962310466495, + "learning_rate": 1.9032537201597556e-06, + "loss": 0.218, + "step": 4042 + }, + { + "epoch": 2.89, + "grad_norm": 10.911645743068291, + "learning_rate": 1.9009851360728077e-06, + "loss": 0.2557, + "step": 4043 + }, + { + "epoch": 2.89, + "grad_norm": 22.595771054903054, + "learning_rate": 1.898717587424328e-06, + "loss": 0.3633, + "step": 4044 + }, + { + "epoch": 2.89, + "grad_norm": 8.518102120271452, + "learning_rate": 1.8964510749719484e-06, + "loss": 0.2349, + "step": 4045 + }, + { + "epoch": 2.89, + "grad_norm": 16.22905494698704, + "learning_rate": 1.8941855994729497e-06, + "loss": 0.29, + "step": 4046 + }, + { + "epoch": 2.89, + "grad_norm": 9.207905032025637, + "learning_rate": 1.8919211616842703e-06, + "loss": 0.209, + "step": 4047 + }, + { + "epoch": 2.89, + "grad_norm": 12.410396427093156, + "learning_rate": 1.8896577623625017e-06, + "loss": 0.2229, + "step": 4048 + }, + { + "epoch": 2.89, + "grad_norm": 15.527011086500547, + "learning_rate": 1.887395402263888e-06, + "loss": 0.3206, + "step": 4049 + }, + { + "epoch": 2.89, + "grad_norm": 13.672016334197911, + "learning_rate": 1.8851340821443248e-06, + "loss": 0.2427, + "step": 4050 + }, + { + "epoch": 2.89, + "grad_norm": 11.125832400500446, + "learning_rate": 1.882873802759362e-06, + "loss": 0.312, + "step": 4051 + }, + { + "epoch": 2.89, + "grad_norm": 14.046251858097985, + "learning_rate": 1.8806145648642005e-06, + "loss": 0.2979, + "step": 4052 + }, + { + "epoch": 2.89, + "grad_norm": 9.838718306043596, + "learning_rate": 1.8783563692136936e-06, + "loss": 0.2334, + "step": 4053 + }, + { + "epoch": 2.89, + "grad_norm": 10.943639533290575, + "learning_rate": 1.8760992165623465e-06, + "loss": 0.3025, + "step": 4054 + }, + { + "epoch": 2.89, + "grad_norm": 10.666070064383465, + "learning_rate": 1.873843107664316e-06, + "loss": 0.2688, + "step": 4055 + }, + { + "epoch": 2.9, + "grad_norm": 11.583556219560066, + "learning_rate": 1.87158804327341e-06, + "loss": 0.2915, + "step": 4056 + }, + { + "epoch": 2.9, + "grad_norm": 7.0617200379471425, + "learning_rate": 1.8693340241430874e-06, + "loss": 0.1807, + "step": 4057 + }, + { + "epoch": 2.9, + "grad_norm": 12.72852127215646, + "learning_rate": 1.867081051026458e-06, + "loss": 0.3193, + "step": 4058 + }, + { + "epoch": 2.9, + "grad_norm": 10.749413424323619, + "learning_rate": 1.8648291246762818e-06, + "loss": 0.2511, + "step": 4059 + }, + { + "epoch": 2.9, + "grad_norm": 12.654719307217244, + "learning_rate": 1.8625782458449693e-06, + "loss": 0.2336, + "step": 4060 + }, + { + "epoch": 2.9, + "grad_norm": 10.407631474167662, + "learning_rate": 1.860328415284583e-06, + "loss": 0.2976, + "step": 4061 + }, + { + "epoch": 2.9, + "grad_norm": 10.48686271000459, + "learning_rate": 1.8580796337468276e-06, + "loss": 0.1995, + "step": 4062 + }, + { + "epoch": 2.9, + "grad_norm": 9.274457610353167, + "learning_rate": 1.8558319019830695e-06, + "loss": 0.2031, + "step": 4063 + }, + { + "epoch": 2.9, + "grad_norm": 11.802148591200808, + "learning_rate": 1.853585220744311e-06, + "loss": 0.2812, + "step": 4064 + }, + { + "epoch": 2.9, + "grad_norm": 8.092229309343045, + "learning_rate": 1.851339590781217e-06, + "loss": 0.2485, + "step": 4065 + }, + { + "epoch": 2.9, + "grad_norm": 21.463031361010216, + "learning_rate": 1.8490950128440877e-06, + "loss": 0.355, + "step": 4066 + }, + { + "epoch": 2.9, + "grad_norm": 10.998522968766485, + "learning_rate": 1.8468514876828847e-06, + "loss": 0.2788, + "step": 4067 + }, + { + "epoch": 2.9, + "grad_norm": 14.552298244913189, + "learning_rate": 1.844609016047204e-06, + "loss": 0.3298, + "step": 4068 + }, + { + "epoch": 2.9, + "grad_norm": 16.40982638450866, + "learning_rate": 1.8423675986863054e-06, + "loss": 0.3369, + "step": 4069 + }, + { + "epoch": 2.91, + "grad_norm": 10.645489950141965, + "learning_rate": 1.8401272363490818e-06, + "loss": 0.21, + "step": 4070 + }, + { + "epoch": 2.91, + "grad_norm": 13.723345576862725, + "learning_rate": 1.8378879297840818e-06, + "loss": 0.3018, + "step": 4071 + }, + { + "epoch": 2.91, + "grad_norm": 10.662657815513155, + "learning_rate": 1.8356496797395002e-06, + "loss": 0.2375, + "step": 4072 + }, + { + "epoch": 2.91, + "grad_norm": 13.360949595986845, + "learning_rate": 1.8334124869631765e-06, + "loss": 0.2791, + "step": 4073 + }, + { + "epoch": 2.91, + "grad_norm": 9.685132876470785, + "learning_rate": 1.8311763522025994e-06, + "loss": 0.2039, + "step": 4074 + }, + { + "epoch": 2.91, + "grad_norm": 9.895276388339477, + "learning_rate": 1.828941276204903e-06, + "loss": 0.2727, + "step": 4075 + }, + { + "epoch": 2.91, + "grad_norm": 8.432764560809574, + "learning_rate": 1.8267072597168673e-06, + "loss": 0.2058, + "step": 4076 + }, + { + "epoch": 2.91, + "grad_norm": 8.308441456073734, + "learning_rate": 1.8244743034849193e-06, + "loss": 0.2285, + "step": 4077 + }, + { + "epoch": 2.91, + "grad_norm": 12.095081699709256, + "learning_rate": 1.8222424082551303e-06, + "loss": 0.2605, + "step": 4078 + }, + { + "epoch": 2.91, + "grad_norm": 11.298604259683547, + "learning_rate": 1.820011574773221e-06, + "loss": 0.2684, + "step": 4079 + }, + { + "epoch": 2.91, + "grad_norm": 8.019721690601129, + "learning_rate": 1.8177818037845485e-06, + "loss": 0.2024, + "step": 4080 + }, + { + "epoch": 2.91, + "grad_norm": 16.13667342258012, + "learning_rate": 1.8155530960341273e-06, + "loss": 0.2921, + "step": 4081 + }, + { + "epoch": 2.91, + "grad_norm": 13.865382533021625, + "learning_rate": 1.8133254522666033e-06, + "loss": 0.293, + "step": 4082 + }, + { + "epoch": 2.91, + "grad_norm": 11.783714143176804, + "learning_rate": 1.8110988732262808e-06, + "loss": 0.2706, + "step": 4083 + }, + { + "epoch": 2.92, + "grad_norm": 9.22742606505741, + "learning_rate": 1.8088733596570945e-06, + "loss": 0.2429, + "step": 4084 + }, + { + "epoch": 2.92, + "grad_norm": 11.119724412060055, + "learning_rate": 1.806648912302636e-06, + "loss": 0.2507, + "step": 4085 + }, + { + "epoch": 2.92, + "grad_norm": 8.320477068822154, + "learning_rate": 1.8044255319061287e-06, + "loss": 0.28, + "step": 4086 + }, + { + "epoch": 2.92, + "grad_norm": 8.759806331612182, + "learning_rate": 1.8022032192104517e-06, + "loss": 0.2698, + "step": 4087 + }, + { + "epoch": 2.92, + "grad_norm": 14.35985526948073, + "learning_rate": 1.7999819749581154e-06, + "loss": 0.3721, + "step": 4088 + }, + { + "epoch": 2.92, + "grad_norm": 9.828623960795465, + "learning_rate": 1.797761799891281e-06, + "loss": 0.3364, + "step": 4089 + }, + { + "epoch": 2.92, + "grad_norm": 11.744365228222355, + "learning_rate": 1.7955426947517507e-06, + "loss": 0.24, + "step": 4090 + }, + { + "epoch": 2.92, + "grad_norm": 9.279599538506409, + "learning_rate": 1.793324660280968e-06, + "loss": 0.2603, + "step": 4091 + }, + { + "epoch": 2.92, + "grad_norm": 12.284245558123292, + "learning_rate": 1.7911076972200193e-06, + "loss": 0.2681, + "step": 4092 + }, + { + "epoch": 2.92, + "grad_norm": 10.759102641777156, + "learning_rate": 1.7888918063096334e-06, + "loss": 0.2205, + "step": 4093 + }, + { + "epoch": 2.92, + "grad_norm": 10.414253236529335, + "learning_rate": 1.7866769882901814e-06, + "loss": 0.2959, + "step": 4094 + }, + { + "epoch": 2.92, + "grad_norm": 12.100896779951352, + "learning_rate": 1.784463243901674e-06, + "loss": 0.25, + "step": 4095 + }, + { + "epoch": 2.92, + "grad_norm": 15.39298134834118, + "learning_rate": 1.7822505738837648e-06, + "loss": 0.397, + "step": 4096 + }, + { + "epoch": 2.92, + "grad_norm": 7.320159627540454, + "learning_rate": 1.7800389789757483e-06, + "loss": 0.2217, + "step": 4097 + }, + { + "epoch": 2.93, + "grad_norm": 8.705040302239233, + "learning_rate": 1.7778284599165597e-06, + "loss": 0.2642, + "step": 4098 + }, + { + "epoch": 2.93, + "grad_norm": 12.866603928279114, + "learning_rate": 1.7756190174447734e-06, + "loss": 0.2966, + "step": 4099 + }, + { + "epoch": 2.93, + "grad_norm": 12.300852071369984, + "learning_rate": 1.7734106522986061e-06, + "loss": 0.2854, + "step": 4100 + }, + { + "epoch": 2.93, + "grad_norm": 7.0406718245773545, + "learning_rate": 1.7712033652159133e-06, + "loss": 0.2153, + "step": 4101 + }, + { + "epoch": 2.93, + "grad_norm": 16.366009476816807, + "learning_rate": 1.7689971569341907e-06, + "loss": 0.3169, + "step": 4102 + }, + { + "epoch": 2.93, + "grad_norm": 8.099939081723868, + "learning_rate": 1.7667920281905738e-06, + "loss": 0.1508, + "step": 4103 + }, + { + "epoch": 2.93, + "grad_norm": 17.54395094001839, + "learning_rate": 1.764587979721838e-06, + "loss": 0.3247, + "step": 4104 + }, + { + "epoch": 2.93, + "grad_norm": 9.355552433889878, + "learning_rate": 1.7623850122643926e-06, + "loss": 0.1946, + "step": 4105 + }, + { + "epoch": 2.93, + "grad_norm": 11.451247973760308, + "learning_rate": 1.7601831265542968e-06, + "loss": 0.2495, + "step": 4106 + }, + { + "epoch": 2.93, + "grad_norm": 8.38373080535724, + "learning_rate": 1.7579823233272337e-06, + "loss": 0.2188, + "step": 4107 + }, + { + "epoch": 2.93, + "grad_norm": 32.26992819582037, + "learning_rate": 1.7557826033185404e-06, + "loss": 0.3677, + "step": 4108 + }, + { + "epoch": 2.93, + "grad_norm": 9.31935554317174, + "learning_rate": 1.7535839672631772e-06, + "loss": 0.3042, + "step": 4109 + }, + { + "epoch": 2.93, + "grad_norm": 7.333138141169738, + "learning_rate": 1.7513864158957556e-06, + "loss": 0.1741, + "step": 4110 + }, + { + "epoch": 2.93, + "grad_norm": 12.064201456722603, + "learning_rate": 1.7491899499505122e-06, + "loss": 0.2629, + "step": 4111 + }, + { + "epoch": 2.94, + "grad_norm": 12.246914808812818, + "learning_rate": 1.746994570161334e-06, + "loss": 0.2664, + "step": 4112 + }, + { + "epoch": 2.94, + "grad_norm": 16.025286460767063, + "learning_rate": 1.7448002772617324e-06, + "loss": 0.2764, + "step": 4113 + }, + { + "epoch": 2.94, + "grad_norm": 9.595517375370573, + "learning_rate": 1.7426070719848632e-06, + "loss": 0.1829, + "step": 4114 + }, + { + "epoch": 2.94, + "grad_norm": 9.033101510979936, + "learning_rate": 1.7404149550635173e-06, + "loss": 0.2468, + "step": 4115 + }, + { + "epoch": 2.94, + "grad_norm": 15.395760488217663, + "learning_rate": 1.7382239272301221e-06, + "loss": 0.345, + "step": 4116 + }, + { + "epoch": 2.94, + "grad_norm": 8.718081017809373, + "learning_rate": 1.7360339892167404e-06, + "loss": 0.2185, + "step": 4117 + }, + { + "epoch": 2.94, + "grad_norm": 11.570151019962179, + "learning_rate": 1.7338451417550712e-06, + "loss": 0.2932, + "step": 4118 + }, + { + "epoch": 2.94, + "grad_norm": 15.489633738751614, + "learning_rate": 1.7316573855764485e-06, + "loss": 0.3535, + "step": 4119 + }, + { + "epoch": 2.94, + "grad_norm": 11.204623438557102, + "learning_rate": 1.7294707214118434e-06, + "loss": 0.2524, + "step": 4120 + }, + { + "epoch": 2.94, + "grad_norm": 8.024037924894355, + "learning_rate": 1.7272851499918603e-06, + "loss": 0.2061, + "step": 4121 + }, + { + "epoch": 2.94, + "grad_norm": 10.025616633736863, + "learning_rate": 1.725100672046741e-06, + "loss": 0.2207, + "step": 4122 + }, + { + "epoch": 2.94, + "grad_norm": 9.237786503227326, + "learning_rate": 1.7229172883063556e-06, + "loss": 0.2344, + "step": 4123 + }, + { + "epoch": 2.94, + "grad_norm": 11.400008128488517, + "learning_rate": 1.7207349995002192e-06, + "loss": 0.2041, + "step": 4124 + }, + { + "epoch": 2.94, + "grad_norm": 13.04513731466515, + "learning_rate": 1.7185538063574692e-06, + "loss": 0.3313, + "step": 4125 + }, + { + "epoch": 2.95, + "grad_norm": 9.065512097775526, + "learning_rate": 1.7163737096068883e-06, + "loss": 0.198, + "step": 4126 + }, + { + "epoch": 2.95, + "grad_norm": 11.819709518012287, + "learning_rate": 1.7141947099768818e-06, + "loss": 0.2305, + "step": 4127 + }, + { + "epoch": 2.95, + "grad_norm": 6.6082456220174, + "learning_rate": 1.7120168081955001e-06, + "loss": 0.168, + "step": 4128 + }, + { + "epoch": 2.95, + "grad_norm": 9.349276591227465, + "learning_rate": 1.7098400049904163e-06, + "loss": 0.2913, + "step": 4129 + }, + { + "epoch": 2.95, + "grad_norm": 8.586751031921361, + "learning_rate": 1.707664301088941e-06, + "loss": 0.2065, + "step": 4130 + }, + { + "epoch": 2.95, + "grad_norm": 14.717204391015239, + "learning_rate": 1.705489697218019e-06, + "loss": 0.3105, + "step": 4131 + }, + { + "epoch": 2.95, + "grad_norm": 9.460015525511492, + "learning_rate": 1.7033161941042248e-06, + "loss": 0.208, + "step": 4132 + }, + { + "epoch": 2.95, + "grad_norm": 11.400780736034415, + "learning_rate": 1.7011437924737666e-06, + "loss": 0.3025, + "step": 4133 + }, + { + "epoch": 2.95, + "grad_norm": 10.28553264691427, + "learning_rate": 1.6989724930524843e-06, + "loss": 0.2966, + "step": 4134 + }, + { + "epoch": 2.95, + "grad_norm": 10.477581081372428, + "learning_rate": 1.6968022965658492e-06, + "loss": 0.183, + "step": 4135 + }, + { + "epoch": 2.95, + "grad_norm": 15.331783836914225, + "learning_rate": 1.694633203738964e-06, + "loss": 0.281, + "step": 4136 + }, + { + "epoch": 2.95, + "grad_norm": 10.89865181755367, + "learning_rate": 1.6924652152965632e-06, + "loss": 0.2869, + "step": 4137 + }, + { + "epoch": 2.95, + "grad_norm": 10.774512612157912, + "learning_rate": 1.690298331963014e-06, + "loss": 0.2534, + "step": 4138 + }, + { + "epoch": 2.95, + "grad_norm": 9.468974723185733, + "learning_rate": 1.6881325544623067e-06, + "loss": 0.2205, + "step": 4139 + }, + { + "epoch": 2.96, + "grad_norm": 11.15613380148459, + "learning_rate": 1.6859678835180749e-06, + "loss": 0.2581, + "step": 4140 + }, + { + "epoch": 2.96, + "grad_norm": 9.938617854657009, + "learning_rate": 1.6838043198535693e-06, + "loss": 0.2761, + "step": 4141 + }, + { + "epoch": 2.96, + "grad_norm": 22.526184909674672, + "learning_rate": 1.681641864191682e-06, + "loss": 0.4287, + "step": 4142 + }, + { + "epoch": 2.96, + "grad_norm": 10.610954712676994, + "learning_rate": 1.6794805172549244e-06, + "loss": 0.2856, + "step": 4143 + }, + { + "epoch": 2.96, + "grad_norm": 11.130277611997542, + "learning_rate": 1.6773202797654486e-06, + "loss": 0.2515, + "step": 4144 + }, + { + "epoch": 2.96, + "grad_norm": 11.42493725544239, + "learning_rate": 1.6751611524450235e-06, + "loss": 0.2637, + "step": 4145 + }, + { + "epoch": 2.96, + "grad_norm": 10.082805920327193, + "learning_rate": 1.6730031360150605e-06, + "loss": 0.2311, + "step": 4146 + }, + { + "epoch": 2.96, + "grad_norm": 8.655752640833025, + "learning_rate": 1.670846231196588e-06, + "loss": 0.2336, + "step": 4147 + }, + { + "epoch": 2.96, + "grad_norm": 9.13420076254242, + "learning_rate": 1.6686904387102692e-06, + "loss": 0.2661, + "step": 4148 + }, + { + "epoch": 2.96, + "grad_norm": 10.637960833953823, + "learning_rate": 1.6665357592763948e-06, + "loss": 0.2262, + "step": 4149 + }, + { + "epoch": 2.96, + "grad_norm": 11.514907798772807, + "learning_rate": 1.6643821936148834e-06, + "loss": 0.2236, + "step": 4150 + }, + { + "epoch": 2.96, + "grad_norm": 9.881896754186977, + "learning_rate": 1.6622297424452817e-06, + "loss": 0.2382, + "step": 4151 + }, + { + "epoch": 2.96, + "grad_norm": 6.96329529621935, + "learning_rate": 1.6600784064867625e-06, + "loss": 0.2207, + "step": 4152 + }, + { + "epoch": 2.96, + "grad_norm": 9.235052105560225, + "learning_rate": 1.6579281864581275e-06, + "loss": 0.2438, + "step": 4153 + }, + { + "epoch": 2.97, + "grad_norm": 8.127333202419477, + "learning_rate": 1.6557790830778058e-06, + "loss": 0.2133, + "step": 4154 + }, + { + "epoch": 2.97, + "grad_norm": 11.008855273874723, + "learning_rate": 1.6536310970638525e-06, + "loss": 0.2527, + "step": 4155 + }, + { + "epoch": 2.97, + "grad_norm": 10.218850473628057, + "learning_rate": 1.6514842291339494e-06, + "loss": 0.2563, + "step": 4156 + }, + { + "epoch": 2.97, + "grad_norm": 9.405684231978427, + "learning_rate": 1.6493384800054052e-06, + "loss": 0.2542, + "step": 4157 + }, + { + "epoch": 2.97, + "grad_norm": 12.309209754540694, + "learning_rate": 1.6471938503951546e-06, + "loss": 0.2742, + "step": 4158 + }, + { + "epoch": 2.97, + "grad_norm": 11.300836718904058, + "learning_rate": 1.6450503410197582e-06, + "loss": 0.2483, + "step": 4159 + }, + { + "epoch": 2.97, + "grad_norm": 13.129412391262694, + "learning_rate": 1.6429079525954023e-06, + "loss": 0.4229, + "step": 4160 + }, + { + "epoch": 2.97, + "grad_norm": 9.519718325999454, + "learning_rate": 1.6407666858378985e-06, + "loss": 0.2643, + "step": 4161 + }, + { + "epoch": 2.97, + "grad_norm": 12.957849553694674, + "learning_rate": 1.6386265414626834e-06, + "loss": 0.2549, + "step": 4162 + }, + { + "epoch": 2.97, + "grad_norm": 8.915315627608456, + "learning_rate": 1.636487520184822e-06, + "loss": 0.2188, + "step": 4163 + }, + { + "epoch": 2.97, + "grad_norm": 10.583275524212816, + "learning_rate": 1.6343496227189948e-06, + "loss": 0.2233, + "step": 4164 + }, + { + "epoch": 2.97, + "grad_norm": 11.15496354912215, + "learning_rate": 1.632212849779521e-06, + "loss": 0.2312, + "step": 4165 + }, + { + "epoch": 2.97, + "grad_norm": 9.269917208785433, + "learning_rate": 1.630077202080328e-06, + "loss": 0.1956, + "step": 4166 + }, + { + "epoch": 2.97, + "grad_norm": 12.657935965779476, + "learning_rate": 1.6279426803349828e-06, + "loss": 0.2642, + "step": 4167 + }, + { + "epoch": 2.98, + "grad_norm": 6.290607785408874, + "learning_rate": 1.6258092852566625e-06, + "loss": 0.1294, + "step": 4168 + }, + { + "epoch": 2.98, + "grad_norm": 10.711605990480061, + "learning_rate": 1.6236770175581807e-06, + "loss": 0.2771, + "step": 4169 + }, + { + "epoch": 2.98, + "grad_norm": 9.135466780344332, + "learning_rate": 1.62154587795196e-06, + "loss": 0.2688, + "step": 4170 + }, + { + "epoch": 2.98, + "grad_norm": 16.83078442721911, + "learning_rate": 1.6194158671500616e-06, + "loss": 0.214, + "step": 4171 + }, + { + "epoch": 2.98, + "grad_norm": 10.353277203263982, + "learning_rate": 1.6172869858641554e-06, + "loss": 0.2915, + "step": 4172 + }, + { + "epoch": 2.98, + "grad_norm": 9.629289861141057, + "learning_rate": 1.6151592348055433e-06, + "loss": 0.2556, + "step": 4173 + }, + { + "epoch": 2.98, + "grad_norm": 8.912948810612065, + "learning_rate": 1.6130326146851455e-06, + "loss": 0.2493, + "step": 4174 + }, + { + "epoch": 2.98, + "grad_norm": 12.026915751127602, + "learning_rate": 1.6109071262135056e-06, + "loss": 0.252, + "step": 4175 + }, + { + "epoch": 2.98, + "grad_norm": 13.580442426663522, + "learning_rate": 1.608782770100789e-06, + "loss": 0.2695, + "step": 4176 + }, + { + "epoch": 2.98, + "grad_norm": 11.492422800213438, + "learning_rate": 1.6066595470567825e-06, + "loss": 0.2375, + "step": 4177 + }, + { + "epoch": 2.98, + "grad_norm": 14.85339863796703, + "learning_rate": 1.6045374577908944e-06, + "loss": 0.3145, + "step": 4178 + }, + { + "epoch": 2.98, + "grad_norm": 8.415153390378142, + "learning_rate": 1.6024165030121542e-06, + "loss": 0.1987, + "step": 4179 + }, + { + "epoch": 2.98, + "grad_norm": 16.294077072163837, + "learning_rate": 1.6002966834292116e-06, + "loss": 0.3169, + "step": 4180 + }, + { + "epoch": 2.98, + "grad_norm": 9.594024560968199, + "learning_rate": 1.5981779997503405e-06, + "loss": 0.2385, + "step": 4181 + }, + { + "epoch": 2.99, + "grad_norm": 11.105734484271524, + "learning_rate": 1.5960604526834266e-06, + "loss": 0.2812, + "step": 4182 + }, + { + "epoch": 2.99, + "grad_norm": 11.49417910592248, + "learning_rate": 1.5939440429359888e-06, + "loss": 0.2888, + "step": 4183 + }, + { + "epoch": 2.99, + "grad_norm": 9.633425145280981, + "learning_rate": 1.591828771215152e-06, + "loss": 0.2197, + "step": 4184 + }, + { + "epoch": 2.99, + "grad_norm": 11.785768156691029, + "learning_rate": 1.5897146382276752e-06, + "loss": 0.2354, + "step": 4185 + }, + { + "epoch": 2.99, + "grad_norm": 15.318466928652242, + "learning_rate": 1.587601644679922e-06, + "loss": 0.27, + "step": 4186 + }, + { + "epoch": 2.99, + "grad_norm": 10.41034009289898, + "learning_rate": 1.58548979127789e-06, + "loss": 0.2205, + "step": 4187 + }, + { + "epoch": 2.99, + "grad_norm": 9.937568460758499, + "learning_rate": 1.5833790787271819e-06, + "loss": 0.1968, + "step": 4188 + }, + { + "epoch": 2.99, + "grad_norm": 9.46241310438527, + "learning_rate": 1.5812695077330325e-06, + "loss": 0.2495, + "step": 4189 + }, + { + "epoch": 2.99, + "grad_norm": 16.29389775463895, + "learning_rate": 1.5791610790002838e-06, + "loss": 0.3013, + "step": 4190 + }, + { + "epoch": 2.99, + "grad_norm": 13.717352648476366, + "learning_rate": 1.577053793233403e-06, + "loss": 0.2676, + "step": 4191 + }, + { + "epoch": 2.99, + "grad_norm": 11.711414459758286, + "learning_rate": 1.5749476511364726e-06, + "loss": 0.332, + "step": 4192 + }, + { + "epoch": 2.99, + "grad_norm": 7.705499968975047, + "learning_rate": 1.5728426534131946e-06, + "loss": 0.1589, + "step": 4193 + }, + { + "epoch": 2.99, + "grad_norm": 12.787386008766704, + "learning_rate": 1.5707388007668877e-06, + "loss": 0.3003, + "step": 4194 + }, + { + "epoch": 2.99, + "grad_norm": 7.667900077969025, + "learning_rate": 1.568636093900488e-06, + "loss": 0.2053, + "step": 4195 + }, + { + "epoch": 3.0, + "grad_norm": 15.07311298463385, + "learning_rate": 1.5665345335165488e-06, + "loss": 0.2358, + "step": 4196 + }, + { + "epoch": 3.0, + "grad_norm": 11.966148032918056, + "learning_rate": 1.5644341203172415e-06, + "loss": 0.3, + "step": 4197 + }, + { + "epoch": 3.0, + "grad_norm": 6.900175323732201, + "learning_rate": 1.5623348550043516e-06, + "loss": 0.1863, + "step": 4198 + }, + { + "epoch": 3.0, + "grad_norm": 13.34318877693945, + "learning_rate": 1.5602367382792839e-06, + "loss": 0.2773, + "step": 4199 + }, + { + "epoch": 3.0, + "grad_norm": 14.531517790747417, + "learning_rate": 1.5581397708430578e-06, + "loss": 0.3015, + "step": 4200 + }, + { + "epoch": 3.0, + "grad_norm": 12.05529215822636, + "learning_rate": 1.556043953396309e-06, + "loss": 0.23, + "step": 4201 + }, + { + "epoch": 3.0, + "grad_norm": 10.926738561041796, + "learning_rate": 1.5539492866392891e-06, + "loss": 0.201, + "step": 4202 + }, + { + "epoch": 3.0, + "grad_norm": 11.655803933246485, + "learning_rate": 1.551855771271865e-06, + "loss": 0.2241, + "step": 4203 + }, + { + "epoch": 3.0, + "grad_norm": 9.49786951614942, + "learning_rate": 1.5497634079935198e-06, + "loss": 0.2363, + "step": 4204 + }, + { + "epoch": 3.0, + "grad_norm": 5.562319258107449, + "learning_rate": 1.5476721975033498e-06, + "loss": 0.1233, + "step": 4205 + }, + { + "epoch": 3.0, + "grad_norm": 5.707661563901169, + "learning_rate": 1.5455821405000703e-06, + "loss": 0.1448, + "step": 4206 + }, + { + "epoch": 3.0, + "grad_norm": 3.98664541474231, + "learning_rate": 1.5434932376820039e-06, + "loss": 0.1188, + "step": 4207 + }, + { + "epoch": 3.0, + "grad_norm": 7.560483987455881, + "learning_rate": 1.5414054897470942e-06, + "loss": 0.1791, + "step": 4208 + }, + { + "epoch": 3.0, + "grad_norm": 4.315253135542903, + "learning_rate": 1.5393188973928957e-06, + "loss": 0.1033, + "step": 4209 + }, + { + "epoch": 3.0, + "grad_norm": 4.611585149366166, + "learning_rate": 1.5372334613165784e-06, + "loss": 0.1366, + "step": 4210 + }, + { + "epoch": 3.01, + "grad_norm": 4.021499687105682, + "learning_rate": 1.5351491822149255e-06, + "loss": 0.1254, + "step": 4211 + }, + { + "epoch": 3.01, + "grad_norm": 7.3862962459769514, + "learning_rate": 1.533066060784333e-06, + "loss": 0.1672, + "step": 4212 + }, + { + "epoch": 3.01, + "grad_norm": 4.011539859613148, + "learning_rate": 1.5309840977208096e-06, + "loss": 0.1176, + "step": 4213 + }, + { + "epoch": 3.01, + "grad_norm": 6.460881953756751, + "learning_rate": 1.5289032937199793e-06, + "loss": 0.151, + "step": 4214 + }, + { + "epoch": 3.01, + "grad_norm": 5.886640344653612, + "learning_rate": 1.5268236494770772e-06, + "loss": 0.1545, + "step": 4215 + }, + { + "epoch": 3.01, + "grad_norm": 8.44915060491121, + "learning_rate": 1.5247451656869499e-06, + "loss": 0.163, + "step": 4216 + }, + { + "epoch": 3.01, + "grad_norm": 4.682171952885785, + "learning_rate": 1.5226678430440588e-06, + "loss": 0.1411, + "step": 4217 + }, + { + "epoch": 3.01, + "grad_norm": 5.312109950691412, + "learning_rate": 1.5205916822424755e-06, + "loss": 0.1438, + "step": 4218 + }, + { + "epoch": 3.01, + "grad_norm": 5.1623724941925175, + "learning_rate": 1.5185166839758836e-06, + "loss": 0.1252, + "step": 4219 + }, + { + "epoch": 3.01, + "grad_norm": 6.471874299848998, + "learning_rate": 1.5164428489375789e-06, + "loss": 0.167, + "step": 4220 + }, + { + "epoch": 3.01, + "grad_norm": 4.334160602466491, + "learning_rate": 1.5143701778204683e-06, + "loss": 0.1064, + "step": 4221 + }, + { + "epoch": 3.01, + "grad_norm": 4.790958055528099, + "learning_rate": 1.5122986713170712e-06, + "loss": 0.1418, + "step": 4222 + }, + { + "epoch": 3.01, + "grad_norm": 4.360409064903208, + "learning_rate": 1.510228330119512e-06, + "loss": 0.1225, + "step": 4223 + }, + { + "epoch": 3.01, + "grad_norm": 4.105021948656687, + "learning_rate": 1.5081591549195357e-06, + "loss": 0.1021, + "step": 4224 + }, + { + "epoch": 3.02, + "grad_norm": 7.345073889890468, + "learning_rate": 1.5060911464084864e-06, + "loss": 0.1582, + "step": 4225 + }, + { + "epoch": 3.02, + "grad_norm": 8.936176216847798, + "learning_rate": 1.5040243052773312e-06, + "loss": 0.1592, + "step": 4226 + }, + { + "epoch": 3.02, + "grad_norm": 5.858093703881658, + "learning_rate": 1.5019586322166323e-06, + "loss": 0.1035, + "step": 4227 + }, + { + "epoch": 3.02, + "grad_norm": 7.983548488354241, + "learning_rate": 1.4998941279165773e-06, + "loss": 0.1723, + "step": 4228 + }, + { + "epoch": 3.02, + "grad_norm": 10.277666504790933, + "learning_rate": 1.4978307930669483e-06, + "loss": 0.1005, + "step": 4229 + }, + { + "epoch": 3.02, + "grad_norm": 8.547800791086695, + "learning_rate": 1.4957686283571498e-06, + "loss": 0.1653, + "step": 4230 + }, + { + "epoch": 3.02, + "grad_norm": 7.992527436033497, + "learning_rate": 1.4937076344761858e-06, + "loss": 0.1716, + "step": 4231 + }, + { + "epoch": 3.02, + "grad_norm": 6.936012071215718, + "learning_rate": 1.4916478121126732e-06, + "loss": 0.1029, + "step": 4232 + }, + { + "epoch": 3.02, + "grad_norm": 4.3255729830641885, + "learning_rate": 1.4895891619548374e-06, + "loss": 0.1046, + "step": 4233 + }, + { + "epoch": 3.02, + "grad_norm": 9.902272124427949, + "learning_rate": 1.4875316846905113e-06, + "loss": 0.1895, + "step": 4234 + }, + { + "epoch": 3.02, + "grad_norm": 5.363267661890996, + "learning_rate": 1.4854753810071364e-06, + "loss": 0.1187, + "step": 4235 + }, + { + "epoch": 3.02, + "grad_norm": 10.19363846323527, + "learning_rate": 1.4834202515917628e-06, + "loss": 0.2157, + "step": 4236 + }, + { + "epoch": 3.02, + "grad_norm": 8.752546304397477, + "learning_rate": 1.4813662971310465e-06, + "loss": 0.1414, + "step": 4237 + }, + { + "epoch": 3.02, + "grad_norm": 5.430637602687925, + "learning_rate": 1.4793135183112523e-06, + "loss": 0.1292, + "step": 4238 + }, + { + "epoch": 3.03, + "grad_norm": 8.280105835812476, + "learning_rate": 1.477261915818251e-06, + "loss": 0.2173, + "step": 4239 + }, + { + "epoch": 3.03, + "grad_norm": 5.619705907816034, + "learning_rate": 1.4752114903375243e-06, + "loss": 0.0962, + "step": 4240 + }, + { + "epoch": 3.03, + "grad_norm": 5.991828112437791, + "learning_rate": 1.473162242554151e-06, + "loss": 0.1031, + "step": 4241 + }, + { + "epoch": 3.03, + "grad_norm": 5.770685175159459, + "learning_rate": 1.47111417315283e-06, + "loss": 0.1162, + "step": 4242 + }, + { + "epoch": 3.03, + "grad_norm": 17.414337641647094, + "learning_rate": 1.4690672828178532e-06, + "loss": 0.1656, + "step": 4243 + }, + { + "epoch": 3.03, + "grad_norm": 8.439371331134558, + "learning_rate": 1.467021572233131e-06, + "loss": 0.1172, + "step": 4244 + }, + { + "epoch": 3.03, + "grad_norm": 6.740136894772873, + "learning_rate": 1.4649770420821663e-06, + "loss": 0.1099, + "step": 4245 + }, + { + "epoch": 3.03, + "grad_norm": 6.614518214652328, + "learning_rate": 1.4629336930480813e-06, + "loss": 0.1323, + "step": 4246 + }, + { + "epoch": 3.03, + "grad_norm": 7.219648721235388, + "learning_rate": 1.4608915258135914e-06, + "loss": 0.1555, + "step": 4247 + }, + { + "epoch": 3.03, + "grad_norm": 6.825768758767164, + "learning_rate": 1.4588505410610283e-06, + "loss": 0.141, + "step": 4248 + }, + { + "epoch": 3.03, + "grad_norm": 11.787094208035665, + "learning_rate": 1.4568107394723175e-06, + "loss": 0.1892, + "step": 4249 + }, + { + "epoch": 3.03, + "grad_norm": 7.676246808791587, + "learning_rate": 1.4547721217289972e-06, + "loss": 0.1515, + "step": 4250 + }, + { + "epoch": 3.03, + "grad_norm": 8.044534219415103, + "learning_rate": 1.4527346885122073e-06, + "loss": 0.1312, + "step": 4251 + }, + { + "epoch": 3.03, + "grad_norm": 9.284106118194046, + "learning_rate": 1.450698440502692e-06, + "loss": 0.1499, + "step": 4252 + }, + { + "epoch": 3.04, + "grad_norm": 3.7252324984477916, + "learning_rate": 1.4486633783807997e-06, + "loss": 0.0759, + "step": 4253 + }, + { + "epoch": 3.04, + "grad_norm": 7.906025467327889, + "learning_rate": 1.4466295028264822e-06, + "loss": 0.1543, + "step": 4254 + }, + { + "epoch": 3.04, + "grad_norm": 6.324242408244137, + "learning_rate": 1.4445968145192951e-06, + "loss": 0.1219, + "step": 4255 + }, + { + "epoch": 3.04, + "grad_norm": 5.877864296276188, + "learning_rate": 1.4425653141383977e-06, + "loss": 0.14, + "step": 4256 + }, + { + "epoch": 3.04, + "grad_norm": 5.312508541147513, + "learning_rate": 1.4405350023625514e-06, + "loss": 0.0874, + "step": 4257 + }, + { + "epoch": 3.04, + "grad_norm": 5.865148996474216, + "learning_rate": 1.4385058798701223e-06, + "loss": 0.1302, + "step": 4258 + }, + { + "epoch": 3.04, + "grad_norm": 9.287278930298898, + "learning_rate": 1.4364779473390767e-06, + "loss": 0.1621, + "step": 4259 + }, + { + "epoch": 3.04, + "grad_norm": 8.177031611561803, + "learning_rate": 1.4344512054469855e-06, + "loss": 0.1541, + "step": 4260 + }, + { + "epoch": 3.04, + "grad_norm": 4.308894066948919, + "learning_rate": 1.4324256548710202e-06, + "loss": 0.1062, + "step": 4261 + }, + { + "epoch": 3.04, + "grad_norm": 8.610067458675067, + "learning_rate": 1.430401296287955e-06, + "loss": 0.1289, + "step": 4262 + }, + { + "epoch": 3.04, + "grad_norm": 6.510987310154899, + "learning_rate": 1.4283781303741662e-06, + "loss": 0.1726, + "step": 4263 + }, + { + "epoch": 3.04, + "grad_norm": 5.778662504339967, + "learning_rate": 1.4263561578056307e-06, + "loss": 0.1449, + "step": 4264 + }, + { + "epoch": 3.04, + "grad_norm": 6.4303052868247565, + "learning_rate": 1.4243353792579285e-06, + "loss": 0.1311, + "step": 4265 + }, + { + "epoch": 3.04, + "grad_norm": 4.0518291313422985, + "learning_rate": 1.4223157954062344e-06, + "loss": 0.1038, + "step": 4266 + }, + { + "epoch": 3.05, + "grad_norm": 8.014311988987807, + "learning_rate": 1.4202974069253362e-06, + "loss": 0.1282, + "step": 4267 + }, + { + "epoch": 3.05, + "grad_norm": 6.347104216461915, + "learning_rate": 1.418280214489608e-06, + "loss": 0.1176, + "step": 4268 + }, + { + "epoch": 3.05, + "grad_norm": 6.026484655477969, + "learning_rate": 1.416264218773038e-06, + "loss": 0.1364, + "step": 4269 + }, + { + "epoch": 3.05, + "grad_norm": 6.206163622677682, + "learning_rate": 1.4142494204492007e-06, + "loss": 0.1118, + "step": 4270 + }, + { + "epoch": 3.05, + "grad_norm": 4.449486750709002, + "learning_rate": 1.412235820191285e-06, + "loss": 0.1207, + "step": 4271 + }, + { + "epoch": 3.05, + "grad_norm": 7.169597104216024, + "learning_rate": 1.4102234186720653e-06, + "loss": 0.1562, + "step": 4272 + }, + { + "epoch": 3.05, + "grad_norm": 7.3213306438478165, + "learning_rate": 1.4082122165639285e-06, + "loss": 0.1102, + "step": 4273 + }, + { + "epoch": 3.05, + "grad_norm": 4.644150306624669, + "learning_rate": 1.4062022145388503e-06, + "loss": 0.0866, + "step": 4274 + }, + { + "epoch": 3.05, + "grad_norm": 5.509551286371005, + "learning_rate": 1.4041934132684116e-06, + "loss": 0.1194, + "step": 4275 + }, + { + "epoch": 3.05, + "grad_norm": 7.0171083888829395, + "learning_rate": 1.4021858134237892e-06, + "loss": 0.1792, + "step": 4276 + }, + { + "epoch": 3.05, + "grad_norm": 6.766478418536765, + "learning_rate": 1.4001794156757598e-06, + "loss": 0.1609, + "step": 4277 + }, + { + "epoch": 3.05, + "grad_norm": 5.7061716612934505, + "learning_rate": 1.398174220694699e-06, + "loss": 0.1375, + "step": 4278 + }, + { + "epoch": 3.05, + "grad_norm": 6.22806612837643, + "learning_rate": 1.3961702291505791e-06, + "loss": 0.1154, + "step": 4279 + }, + { + "epoch": 3.05, + "grad_norm": 5.054628039524962, + "learning_rate": 1.3941674417129714e-06, + "loss": 0.1053, + "step": 4280 + }, + { + "epoch": 3.06, + "grad_norm": 9.438939415322599, + "learning_rate": 1.3921658590510434e-06, + "loss": 0.1853, + "step": 4281 + }, + { + "epoch": 3.06, + "grad_norm": 5.909877879321542, + "learning_rate": 1.3901654818335618e-06, + "loss": 0.1347, + "step": 4282 + }, + { + "epoch": 3.06, + "grad_norm": 9.17926909272783, + "learning_rate": 1.3881663107288918e-06, + "loss": 0.1378, + "step": 4283 + }, + { + "epoch": 3.06, + "grad_norm": 4.0576033881925015, + "learning_rate": 1.386168346404988e-06, + "loss": 0.1311, + "step": 4284 + }, + { + "epoch": 3.06, + "grad_norm": 6.618556358799638, + "learning_rate": 1.3841715895294138e-06, + "loss": 0.0911, + "step": 4285 + }, + { + "epoch": 3.06, + "grad_norm": 5.8253753457194595, + "learning_rate": 1.3821760407693175e-06, + "loss": 0.0886, + "step": 4286 + }, + { + "epoch": 3.06, + "grad_norm": 6.792261943559342, + "learning_rate": 1.3801817007914543e-06, + "loss": 0.1329, + "step": 4287 + }, + { + "epoch": 3.06, + "grad_norm": 10.076624429072998, + "learning_rate": 1.3781885702621644e-06, + "loss": 0.2051, + "step": 4288 + }, + { + "epoch": 3.06, + "grad_norm": 5.086198267318938, + "learning_rate": 1.3761966498473956e-06, + "loss": 0.0788, + "step": 4289 + }, + { + "epoch": 3.06, + "grad_norm": 8.435317943548803, + "learning_rate": 1.3742059402126818e-06, + "loss": 0.1439, + "step": 4290 + }, + { + "epoch": 3.06, + "grad_norm": 5.6714105862417075, + "learning_rate": 1.3722164420231565e-06, + "loss": 0.1266, + "step": 4291 + }, + { + "epoch": 3.06, + "grad_norm": 6.760288551086754, + "learning_rate": 1.370228155943548e-06, + "loss": 0.132, + "step": 4292 + }, + { + "epoch": 3.06, + "grad_norm": 5.607704087759127, + "learning_rate": 1.3682410826381816e-06, + "loss": 0.0842, + "step": 4293 + }, + { + "epoch": 3.06, + "grad_norm": 5.449194178955735, + "learning_rate": 1.366255222770973e-06, + "loss": 0.1436, + "step": 4294 + }, + { + "epoch": 3.07, + "grad_norm": 10.9077308370694, + "learning_rate": 1.364270577005436e-06, + "loss": 0.132, + "step": 4295 + }, + { + "epoch": 3.07, + "grad_norm": 5.9434830126409635, + "learning_rate": 1.3622871460046778e-06, + "loss": 0.1117, + "step": 4296 + }, + { + "epoch": 3.07, + "grad_norm": 5.335529396281524, + "learning_rate": 1.3603049304313992e-06, + "loss": 0.1206, + "step": 4297 + }, + { + "epoch": 3.07, + "grad_norm": 6.400241514226316, + "learning_rate": 1.3583239309478953e-06, + "loss": 0.1349, + "step": 4298 + }, + { + "epoch": 3.07, + "grad_norm": 7.442955154627488, + "learning_rate": 1.3563441482160562e-06, + "loss": 0.1261, + "step": 4299 + }, + { + "epoch": 3.07, + "grad_norm": 11.286798997787256, + "learning_rate": 1.35436558289736e-06, + "loss": 0.1791, + "step": 4300 + }, + { + "epoch": 3.07, + "grad_norm": 4.602769205972933, + "learning_rate": 1.3523882356528883e-06, + "loss": 0.1063, + "step": 4301 + }, + { + "epoch": 3.07, + "grad_norm": 8.182228061518014, + "learning_rate": 1.350412107143303e-06, + "loss": 0.1431, + "step": 4302 + }, + { + "epoch": 3.07, + "grad_norm": 5.4663437903465395, + "learning_rate": 1.3484371980288712e-06, + "loss": 0.1296, + "step": 4303 + }, + { + "epoch": 3.07, + "grad_norm": 4.621578796410083, + "learning_rate": 1.3464635089694416e-06, + "loss": 0.0828, + "step": 4304 + }, + { + "epoch": 3.07, + "grad_norm": 5.558582314176478, + "learning_rate": 1.344491040624466e-06, + "loss": 0.1587, + "step": 4305 + }, + { + "epoch": 3.07, + "grad_norm": 6.181336673297239, + "learning_rate": 1.3425197936529766e-06, + "loss": 0.1385, + "step": 4306 + }, + { + "epoch": 3.07, + "grad_norm": 10.412134988285041, + "learning_rate": 1.3405497687136098e-06, + "loss": 0.1656, + "step": 4307 + }, + { + "epoch": 3.07, + "grad_norm": 7.582811581572259, + "learning_rate": 1.3385809664645827e-06, + "loss": 0.1295, + "step": 4308 + }, + { + "epoch": 3.08, + "grad_norm": 8.150651347785129, + "learning_rate": 1.336613387563711e-06, + "loss": 0.1396, + "step": 4309 + }, + { + "epoch": 3.08, + "grad_norm": 5.360245382647963, + "learning_rate": 1.3346470326683986e-06, + "loss": 0.1394, + "step": 4310 + }, + { + "epoch": 3.08, + "grad_norm": 5.92768096906367, + "learning_rate": 1.3326819024356413e-06, + "loss": 0.1395, + "step": 4311 + }, + { + "epoch": 3.08, + "grad_norm": 8.126264782030967, + "learning_rate": 1.3307179975220264e-06, + "loss": 0.1683, + "step": 4312 + }, + { + "epoch": 3.08, + "grad_norm": 7.097276549928228, + "learning_rate": 1.3287553185837298e-06, + "loss": 0.1547, + "step": 4313 + }, + { + "epoch": 3.08, + "grad_norm": 6.968139851957762, + "learning_rate": 1.3267938662765206e-06, + "loss": 0.1147, + "step": 4314 + }, + { + "epoch": 3.08, + "grad_norm": 4.5357337366094725, + "learning_rate": 1.324833641255755e-06, + "loss": 0.1165, + "step": 4315 + }, + { + "epoch": 3.08, + "grad_norm": 6.212179238446433, + "learning_rate": 1.3228746441763813e-06, + "loss": 0.0844, + "step": 4316 + }, + { + "epoch": 3.08, + "grad_norm": 5.800820871659498, + "learning_rate": 1.3209168756929363e-06, + "loss": 0.1323, + "step": 4317 + }, + { + "epoch": 3.08, + "grad_norm": 8.876523252846415, + "learning_rate": 1.3189603364595483e-06, + "loss": 0.1721, + "step": 4318 + }, + { + "epoch": 3.08, + "grad_norm": 6.1759123731714825, + "learning_rate": 1.3170050271299316e-06, + "loss": 0.1298, + "step": 4319 + }, + { + "epoch": 3.08, + "grad_norm": 8.861593791331382, + "learning_rate": 1.315050948357392e-06, + "loss": 0.1289, + "step": 4320 + }, + { + "epoch": 3.08, + "grad_norm": 8.188259150222102, + "learning_rate": 1.3130981007948247e-06, + "loss": 0.1427, + "step": 4321 + }, + { + "epoch": 3.08, + "grad_norm": 6.725476274791131, + "learning_rate": 1.3111464850947103e-06, + "loss": 0.1479, + "step": 4322 + }, + { + "epoch": 3.09, + "grad_norm": 7.735183629373757, + "learning_rate": 1.3091961019091216e-06, + "loss": 0.1288, + "step": 4323 + }, + { + "epoch": 3.09, + "grad_norm": 5.745017577097906, + "learning_rate": 1.3072469518897184e-06, + "loss": 0.1412, + "step": 4324 + }, + { + "epoch": 3.09, + "grad_norm": 8.405808376306096, + "learning_rate": 1.3052990356877444e-06, + "loss": 0.1046, + "step": 4325 + }, + { + "epoch": 3.09, + "grad_norm": 8.517500797990728, + "learning_rate": 1.3033523539540394e-06, + "loss": 0.1934, + "step": 4326 + }, + { + "epoch": 3.09, + "grad_norm": 5.815632664792931, + "learning_rate": 1.3014069073390206e-06, + "loss": 0.1445, + "step": 4327 + }, + { + "epoch": 3.09, + "grad_norm": 6.250647236307092, + "learning_rate": 1.2994626964927042e-06, + "loss": 0.1066, + "step": 4328 + }, + { + "epoch": 3.09, + "grad_norm": 5.161414714395807, + "learning_rate": 1.2975197220646807e-06, + "loss": 0.1146, + "step": 4329 + }, + { + "epoch": 3.09, + "grad_norm": 8.547923821989846, + "learning_rate": 1.29557798470414e-06, + "loss": 0.1351, + "step": 4330 + }, + { + "epoch": 3.09, + "grad_norm": 5.445406438833862, + "learning_rate": 1.293637485059847e-06, + "loss": 0.1071, + "step": 4331 + }, + { + "epoch": 3.09, + "grad_norm": 12.910301333668382, + "learning_rate": 1.291698223780164e-06, + "loss": 0.1795, + "step": 4332 + }, + { + "epoch": 3.09, + "grad_norm": 5.415868085569591, + "learning_rate": 1.2897602015130306e-06, + "loss": 0.1145, + "step": 4333 + }, + { + "epoch": 3.09, + "grad_norm": 7.580899357603486, + "learning_rate": 1.287823418905977e-06, + "loss": 0.1422, + "step": 4334 + }, + { + "epoch": 3.09, + "grad_norm": 7.710105285949303, + "learning_rate": 1.2858878766061178e-06, + "loss": 0.1465, + "step": 4335 + }, + { + "epoch": 3.09, + "grad_norm": 7.719436922598486, + "learning_rate": 1.2839535752601551e-06, + "loss": 0.1892, + "step": 4336 + }, + { + "epoch": 3.1, + "grad_norm": 8.01755051568949, + "learning_rate": 1.2820205155143738e-06, + "loss": 0.1098, + "step": 4337 + }, + { + "epoch": 3.1, + "grad_norm": 5.586621697986031, + "learning_rate": 1.2800886980146453e-06, + "loss": 0.1371, + "step": 4338 + }, + { + "epoch": 3.1, + "grad_norm": 10.331064240612887, + "learning_rate": 1.2781581234064256e-06, + "loss": 0.1504, + "step": 4339 + }, + { + "epoch": 3.1, + "grad_norm": 6.770807504782781, + "learning_rate": 1.276228792334756e-06, + "loss": 0.1121, + "step": 4340 + }, + { + "epoch": 3.1, + "grad_norm": 6.086051023648102, + "learning_rate": 1.274300705444262e-06, + "loss": 0.1239, + "step": 4341 + }, + { + "epoch": 3.1, + "grad_norm": 8.608147481251128, + "learning_rate": 1.2723738633791538e-06, + "loss": 0.2008, + "step": 4342 + }, + { + "epoch": 3.1, + "grad_norm": 4.898067540734754, + "learning_rate": 1.2704482667832218e-06, + "loss": 0.1062, + "step": 4343 + }, + { + "epoch": 3.1, + "grad_norm": 7.497203187118496, + "learning_rate": 1.2685239162998485e-06, + "loss": 0.1232, + "step": 4344 + }, + { + "epoch": 3.1, + "grad_norm": 19.126104586121958, + "learning_rate": 1.2666008125719904e-06, + "loss": 0.1772, + "step": 4345 + }, + { + "epoch": 3.1, + "grad_norm": 8.12284642374614, + "learning_rate": 1.2646789562421975e-06, + "loss": 0.1339, + "step": 4346 + }, + { + "epoch": 3.1, + "grad_norm": 8.079010398172034, + "learning_rate": 1.2627583479525913e-06, + "loss": 0.1396, + "step": 4347 + }, + { + "epoch": 3.1, + "grad_norm": 4.1620164187548765, + "learning_rate": 1.2608389883448896e-06, + "loss": 0.1212, + "step": 4348 + }, + { + "epoch": 3.1, + "grad_norm": 7.859990849012603, + "learning_rate": 1.2589208780603795e-06, + "loss": 0.1313, + "step": 4349 + }, + { + "epoch": 3.1, + "grad_norm": 7.106346065393317, + "learning_rate": 1.2570040177399435e-06, + "loss": 0.1424, + "step": 4350 + }, + { + "epoch": 3.11, + "grad_norm": 5.06095769329329, + "learning_rate": 1.255088408024036e-06, + "loss": 0.1101, + "step": 4351 + }, + { + "epoch": 3.11, + "grad_norm": 5.6470216837778135, + "learning_rate": 1.2531740495526989e-06, + "loss": 0.1184, + "step": 4352 + }, + { + "epoch": 3.11, + "grad_norm": 10.158476286310746, + "learning_rate": 1.2512609429655553e-06, + "loss": 0.1348, + "step": 4353 + }, + { + "epoch": 3.11, + "grad_norm": 7.516757987633382, + "learning_rate": 1.249349088901809e-06, + "loss": 0.1202, + "step": 4354 + }, + { + "epoch": 3.11, + "grad_norm": 5.6383151746155065, + "learning_rate": 1.247438488000247e-06, + "loss": 0.0861, + "step": 4355 + }, + { + "epoch": 3.11, + "grad_norm": 6.12153217876157, + "learning_rate": 1.245529140899236e-06, + "loss": 0.163, + "step": 4356 + }, + { + "epoch": 3.11, + "grad_norm": 4.4984484754078276, + "learning_rate": 1.2436210482367245e-06, + "loss": 0.0786, + "step": 4357 + }, + { + "epoch": 3.11, + "grad_norm": 7.595141893853033, + "learning_rate": 1.2417142106502418e-06, + "loss": 0.0952, + "step": 4358 + }, + { + "epoch": 3.11, + "grad_norm": 7.398814994664079, + "learning_rate": 1.2398086287768969e-06, + "loss": 0.1138, + "step": 4359 + }, + { + "epoch": 3.11, + "grad_norm": 7.727230258964825, + "learning_rate": 1.237904303253381e-06, + "loss": 0.1183, + "step": 4360 + }, + { + "epoch": 3.11, + "grad_norm": 5.279257418876605, + "learning_rate": 1.236001234715965e-06, + "loss": 0.1007, + "step": 4361 + }, + { + "epoch": 3.11, + "grad_norm": 6.402366707274822, + "learning_rate": 1.2340994238004987e-06, + "loss": 0.1294, + "step": 4362 + }, + { + "epoch": 3.11, + "grad_norm": 5.776850831934081, + "learning_rate": 1.2321988711424132e-06, + "loss": 0.1176, + "step": 4363 + }, + { + "epoch": 3.11, + "grad_norm": 6.872878484333298, + "learning_rate": 1.2302995773767174e-06, + "loss": 0.1392, + "step": 4364 + }, + { + "epoch": 3.12, + "grad_norm": 10.12314710750376, + "learning_rate": 1.2284015431380015e-06, + "loss": 0.1311, + "step": 4365 + }, + { + "epoch": 3.12, + "grad_norm": 8.469770250969509, + "learning_rate": 1.2265047690604354e-06, + "loss": 0.1562, + "step": 4366 + }, + { + "epoch": 3.12, + "grad_norm": 7.966300776911517, + "learning_rate": 1.2246092557777633e-06, + "loss": 0.1357, + "step": 4367 + }, + { + "epoch": 3.12, + "grad_norm": 7.398989033867051, + "learning_rate": 1.2227150039233132e-06, + "loss": 0.157, + "step": 4368 + }, + { + "epoch": 3.12, + "grad_norm": 12.612083517699997, + "learning_rate": 1.2208220141299893e-06, + "loss": 0.1385, + "step": 4369 + }, + { + "epoch": 3.12, + "grad_norm": 5.966421949471924, + "learning_rate": 1.2189302870302755e-06, + "loss": 0.1266, + "step": 4370 + }, + { + "epoch": 3.12, + "grad_norm": 5.641265236099781, + "learning_rate": 1.2170398232562324e-06, + "loss": 0.1183, + "step": 4371 + }, + { + "epoch": 3.12, + "grad_norm": 20.375633559108344, + "learning_rate": 1.2151506234395e-06, + "loss": 0.2162, + "step": 4372 + }, + { + "epoch": 3.12, + "grad_norm": 10.58226033610902, + "learning_rate": 1.2132626882112935e-06, + "loss": 0.1302, + "step": 4373 + }, + { + "epoch": 3.12, + "grad_norm": 9.180226888360846, + "learning_rate": 1.211376018202408e-06, + "loss": 0.1749, + "step": 4374 + }, + { + "epoch": 3.12, + "grad_norm": 6.08472849567162, + "learning_rate": 1.2094906140432155e-06, + "loss": 0.1486, + "step": 4375 + }, + { + "epoch": 3.12, + "grad_norm": 5.74121178158958, + "learning_rate": 1.2076064763636641e-06, + "loss": 0.0946, + "step": 4376 + }, + { + "epoch": 3.12, + "grad_norm": 8.286183532915876, + "learning_rate": 1.205723605793279e-06, + "loss": 0.1323, + "step": 4377 + }, + { + "epoch": 3.12, + "grad_norm": 6.530368951438912, + "learning_rate": 1.2038420029611625e-06, + "loss": 0.1406, + "step": 4378 + }, + { + "epoch": 3.13, + "grad_norm": 9.283823696986877, + "learning_rate": 1.2019616684959934e-06, + "loss": 0.1354, + "step": 4379 + }, + { + "epoch": 3.13, + "grad_norm": 5.768580237770132, + "learning_rate": 1.2000826030260254e-06, + "loss": 0.1118, + "step": 4380 + }, + { + "epoch": 3.13, + "grad_norm": 7.393517881119877, + "learning_rate": 1.1982048071790903e-06, + "loss": 0.15, + "step": 4381 + }, + { + "epoch": 3.13, + "grad_norm": 7.074275685681868, + "learning_rate": 1.1963282815825938e-06, + "loss": 0.135, + "step": 4382 + }, + { + "epoch": 3.13, + "grad_norm": 5.8588043109415, + "learning_rate": 1.194453026863519e-06, + "loss": 0.0863, + "step": 4383 + }, + { + "epoch": 3.13, + "grad_norm": 8.252158205024786, + "learning_rate": 1.1925790436484219e-06, + "loss": 0.1028, + "step": 4384 + }, + { + "epoch": 3.13, + "grad_norm": 4.4442688489776465, + "learning_rate": 1.1907063325634376e-06, + "loss": 0.0718, + "step": 4385 + }, + { + "epoch": 3.13, + "grad_norm": 6.31765034734141, + "learning_rate": 1.1888348942342697e-06, + "loss": 0.1121, + "step": 4386 + }, + { + "epoch": 3.13, + "grad_norm": 8.438393240156737, + "learning_rate": 1.1869647292862051e-06, + "loss": 0.1555, + "step": 4387 + }, + { + "epoch": 3.13, + "grad_norm": 11.366731513055292, + "learning_rate": 1.1850958383440957e-06, + "loss": 0.1699, + "step": 4388 + }, + { + "epoch": 3.13, + "grad_norm": 5.652295308843194, + "learning_rate": 1.183228222032378e-06, + "loss": 0.1521, + "step": 4389 + }, + { + "epoch": 3.13, + "grad_norm": 6.177913643706088, + "learning_rate": 1.181361880975052e-06, + "loss": 0.1331, + "step": 4390 + }, + { + "epoch": 3.13, + "grad_norm": 5.520601566917642, + "learning_rate": 1.1794968157957026e-06, + "loss": 0.0956, + "step": 4391 + }, + { + "epoch": 3.13, + "grad_norm": 5.836669023955266, + "learning_rate": 1.1776330271174786e-06, + "loss": 0.1168, + "step": 4392 + }, + { + "epoch": 3.14, + "grad_norm": 5.603939921185731, + "learning_rate": 1.1757705155631072e-06, + "loss": 0.1146, + "step": 4393 + }, + { + "epoch": 3.14, + "grad_norm": 6.98450834534631, + "learning_rate": 1.1739092817548887e-06, + "loss": 0.1356, + "step": 4394 + }, + { + "epoch": 3.14, + "grad_norm": 5.048629847874484, + "learning_rate": 1.172049326314696e-06, + "loss": 0.1296, + "step": 4395 + }, + { + "epoch": 3.14, + "grad_norm": 6.414733074091697, + "learning_rate": 1.1701906498639741e-06, + "loss": 0.0904, + "step": 4396 + }, + { + "epoch": 3.14, + "grad_norm": 6.206320184441604, + "learning_rate": 1.1683332530237423e-06, + "loss": 0.0958, + "step": 4397 + }, + { + "epoch": 3.14, + "grad_norm": 8.848301273909941, + "learning_rate": 1.1664771364145905e-06, + "loss": 0.182, + "step": 4398 + }, + { + "epoch": 3.14, + "grad_norm": 18.28608439063244, + "learning_rate": 1.1646223006566827e-06, + "loss": 0.3428, + "step": 4399 + }, + { + "epoch": 3.14, + "grad_norm": 5.770307322322705, + "learning_rate": 1.162768746369753e-06, + "loss": 0.1066, + "step": 4400 + }, + { + "epoch": 3.14, + "grad_norm": 12.372434701004785, + "learning_rate": 1.1609164741731105e-06, + "loss": 0.2958, + "step": 4401 + }, + { + "epoch": 3.14, + "grad_norm": 5.22763139963171, + "learning_rate": 1.1590654846856291e-06, + "loss": 0.1393, + "step": 4402 + }, + { + "epoch": 3.14, + "grad_norm": 4.248723078530217, + "learning_rate": 1.1572157785257643e-06, + "loss": 0.0935, + "step": 4403 + }, + { + "epoch": 3.14, + "grad_norm": 6.694964132694257, + "learning_rate": 1.1553673563115325e-06, + "loss": 0.1555, + "step": 4404 + }, + { + "epoch": 3.14, + "grad_norm": 8.188557172066655, + "learning_rate": 1.153520218660531e-06, + "loss": 0.1984, + "step": 4405 + }, + { + "epoch": 3.14, + "grad_norm": 6.541895242637127, + "learning_rate": 1.1516743661899172e-06, + "loss": 0.1124, + "step": 4406 + }, + { + "epoch": 3.15, + "grad_norm": 10.548845679168716, + "learning_rate": 1.1498297995164305e-06, + "loss": 0.186, + "step": 4407 + }, + { + "epoch": 3.15, + "grad_norm": 8.565033160611465, + "learning_rate": 1.1479865192563683e-06, + "loss": 0.1838, + "step": 4408 + }, + { + "epoch": 3.15, + "grad_norm": 4.752373240136069, + "learning_rate": 1.146144526025612e-06, + "loss": 0.0936, + "step": 4409 + }, + { + "epoch": 3.15, + "grad_norm": 7.942694309672477, + "learning_rate": 1.1443038204396007e-06, + "loss": 0.1306, + "step": 4410 + }, + { + "epoch": 3.15, + "grad_norm": 6.89945116790705, + "learning_rate": 1.1424644031133502e-06, + "loss": 0.1313, + "step": 4411 + }, + { + "epoch": 3.15, + "grad_norm": 9.26264775999036, + "learning_rate": 1.1406262746614433e-06, + "loss": 0.1857, + "step": 4412 + }, + { + "epoch": 3.15, + "grad_norm": 5.4397623406990405, + "learning_rate": 1.1387894356980334e-06, + "loss": 0.1163, + "step": 4413 + }, + { + "epoch": 3.15, + "grad_norm": 8.491939243996956, + "learning_rate": 1.1369538868368424e-06, + "loss": 0.1293, + "step": 4414 + }, + { + "epoch": 3.15, + "grad_norm": 5.750098964300776, + "learning_rate": 1.1351196286911615e-06, + "loss": 0.1233, + "step": 4415 + }, + { + "epoch": 3.15, + "grad_norm": 6.21545036806548, + "learning_rate": 1.1332866618738498e-06, + "loss": 0.1461, + "step": 4416 + }, + { + "epoch": 3.15, + "grad_norm": 4.539810649852684, + "learning_rate": 1.1314549869973363e-06, + "loss": 0.1044, + "step": 4417 + }, + { + "epoch": 3.15, + "grad_norm": 4.573431615715713, + "learning_rate": 1.1296246046736176e-06, + "loss": 0.0757, + "step": 4418 + }, + { + "epoch": 3.15, + "grad_norm": 11.731198672608688, + "learning_rate": 1.1277955155142578e-06, + "loss": 0.1499, + "step": 4419 + }, + { + "epoch": 3.15, + "grad_norm": 5.893278961301827, + "learning_rate": 1.1259677201303905e-06, + "loss": 0.1198, + "step": 4420 + }, + { + "epoch": 3.16, + "grad_norm": 5.0550091247501125, + "learning_rate": 1.1241412191327155e-06, + "loss": 0.1017, + "step": 4421 + }, + { + "epoch": 3.16, + "grad_norm": 6.282672874051473, + "learning_rate": 1.1223160131315008e-06, + "loss": 0.1163, + "step": 4422 + }, + { + "epoch": 3.16, + "grad_norm": 12.271735935923063, + "learning_rate": 1.1204921027365818e-06, + "loss": 0.1686, + "step": 4423 + }, + { + "epoch": 3.16, + "grad_norm": 6.60070124272832, + "learning_rate": 1.1186694885573602e-06, + "loss": 0.1394, + "step": 4424 + }, + { + "epoch": 3.16, + "grad_norm": 6.119314793118176, + "learning_rate": 1.1168481712028061e-06, + "loss": 0.1188, + "step": 4425 + }, + { + "epoch": 3.16, + "grad_norm": 5.845012516399945, + "learning_rate": 1.115028151281457e-06, + "loss": 0.0951, + "step": 4426 + }, + { + "epoch": 3.16, + "grad_norm": 4.51100338272369, + "learning_rate": 1.1132094294014106e-06, + "loss": 0.0892, + "step": 4427 + }, + { + "epoch": 3.16, + "grad_norm": 8.698616071989141, + "learning_rate": 1.1113920061703416e-06, + "loss": 0.1289, + "step": 4428 + }, + { + "epoch": 3.16, + "grad_norm": 6.597902292498433, + "learning_rate": 1.1095758821954788e-06, + "loss": 0.1199, + "step": 4429 + }, + { + "epoch": 3.16, + "grad_norm": 7.13322096070308, + "learning_rate": 1.107761058083629e-06, + "loss": 0.1552, + "step": 4430 + }, + { + "epoch": 3.16, + "grad_norm": 5.3313785556010505, + "learning_rate": 1.1059475344411535e-06, + "loss": 0.0945, + "step": 4431 + }, + { + "epoch": 3.16, + "grad_norm": 8.065704209387357, + "learning_rate": 1.104135311873989e-06, + "loss": 0.1471, + "step": 4432 + }, + { + "epoch": 3.16, + "grad_norm": 7.85943646738916, + "learning_rate": 1.1023243909876275e-06, + "loss": 0.1545, + "step": 4433 + }, + { + "epoch": 3.16, + "grad_norm": 5.066634102847706, + "learning_rate": 1.1005147723871374e-06, + "loss": 0.0896, + "step": 4434 + }, + { + "epoch": 3.17, + "grad_norm": 7.8424079666970155, + "learning_rate": 1.0987064566771405e-06, + "loss": 0.1359, + "step": 4435 + }, + { + "epoch": 3.17, + "grad_norm": 7.6459659701311695, + "learning_rate": 1.0968994444618313e-06, + "loss": 0.167, + "step": 4436 + }, + { + "epoch": 3.17, + "grad_norm": 5.500227738741658, + "learning_rate": 1.0950937363449659e-06, + "loss": 0.1115, + "step": 4437 + }, + { + "epoch": 3.17, + "grad_norm": 7.2102276882225445, + "learning_rate": 1.0932893329298643e-06, + "loss": 0.1222, + "step": 4438 + }, + { + "epoch": 3.17, + "grad_norm": 7.847799303012751, + "learning_rate": 1.0914862348194121e-06, + "loss": 0.1068, + "step": 4439 + }, + { + "epoch": 3.17, + "grad_norm": 6.851543694579042, + "learning_rate": 1.0896844426160575e-06, + "loss": 0.1298, + "step": 4440 + }, + { + "epoch": 3.17, + "grad_norm": 6.889551647224606, + "learning_rate": 1.0878839569218124e-06, + "loss": 0.116, + "step": 4441 + }, + { + "epoch": 3.17, + "grad_norm": 7.152322142099854, + "learning_rate": 1.0860847783382534e-06, + "loss": 0.1624, + "step": 4442 + }, + { + "epoch": 3.17, + "grad_norm": 7.011318397121267, + "learning_rate": 1.0842869074665186e-06, + "loss": 0.1464, + "step": 4443 + }, + { + "epoch": 3.17, + "grad_norm": 8.68536297533047, + "learning_rate": 1.0824903449073115e-06, + "loss": 0.2058, + "step": 4444 + }, + { + "epoch": 3.17, + "grad_norm": 8.256267124703266, + "learning_rate": 1.0806950912608937e-06, + "loss": 0.1498, + "step": 4445 + }, + { + "epoch": 3.17, + "grad_norm": 9.177033572690247, + "learning_rate": 1.0789011471270983e-06, + "loss": 0.1533, + "step": 4446 + }, + { + "epoch": 3.17, + "grad_norm": 9.42342228155218, + "learning_rate": 1.0771085131053087e-06, + "loss": 0.1993, + "step": 4447 + }, + { + "epoch": 3.17, + "grad_norm": 5.096230095880606, + "learning_rate": 1.0753171897944835e-06, + "loss": 0.1036, + "step": 4448 + }, + { + "epoch": 3.18, + "grad_norm": 8.154504511502616, + "learning_rate": 1.0735271777931322e-06, + "loss": 0.1755, + "step": 4449 + }, + { + "epoch": 3.18, + "grad_norm": 7.04583975692834, + "learning_rate": 1.0717384776993356e-06, + "loss": 0.2039, + "step": 4450 + }, + { + "epoch": 3.18, + "grad_norm": 6.7824108907953224, + "learning_rate": 1.069951090110728e-06, + "loss": 0.1451, + "step": 4451 + }, + { + "epoch": 3.18, + "grad_norm": 5.187754014764868, + "learning_rate": 1.06816501562451e-06, + "loss": 0.0834, + "step": 4452 + }, + { + "epoch": 3.18, + "grad_norm": 7.278261677229582, + "learning_rate": 1.0663802548374424e-06, + "loss": 0.1149, + "step": 4453 + }, + { + "epoch": 3.18, + "grad_norm": 4.340393440241648, + "learning_rate": 1.064596808345847e-06, + "loss": 0.1205, + "step": 4454 + }, + { + "epoch": 3.18, + "grad_norm": 7.746103373611341, + "learning_rate": 1.0628146767456066e-06, + "loss": 0.166, + "step": 4455 + }, + { + "epoch": 3.18, + "grad_norm": 6.3971573426174295, + "learning_rate": 1.061033860632164e-06, + "loss": 0.1436, + "step": 4456 + }, + { + "epoch": 3.18, + "grad_norm": 9.242403232576306, + "learning_rate": 1.0592543606005235e-06, + "loss": 0.181, + "step": 4457 + }, + { + "epoch": 3.18, + "grad_norm": 10.503336807115314, + "learning_rate": 1.0574761772452486e-06, + "loss": 0.2029, + "step": 4458 + }, + { + "epoch": 3.18, + "grad_norm": 7.744097047628762, + "learning_rate": 1.0556993111604635e-06, + "loss": 0.1935, + "step": 4459 + }, + { + "epoch": 3.18, + "grad_norm": 9.014041823353091, + "learning_rate": 1.0539237629398536e-06, + "loss": 0.1501, + "step": 4460 + }, + { + "epoch": 3.18, + "grad_norm": 8.4496955488824, + "learning_rate": 1.052149533176659e-06, + "loss": 0.1421, + "step": 4461 + }, + { + "epoch": 3.18, + "grad_norm": 6.407043789529695, + "learning_rate": 1.050376622463688e-06, + "loss": 0.1115, + "step": 4462 + }, + { + "epoch": 3.19, + "grad_norm": 5.934429129123148, + "learning_rate": 1.0486050313932972e-06, + "loss": 0.1068, + "step": 4463 + }, + { + "epoch": 3.19, + "grad_norm": 3.89642147430061, + "learning_rate": 1.0468347605574137e-06, + "loss": 0.0684, + "step": 4464 + }, + { + "epoch": 3.19, + "grad_norm": 6.884801342876653, + "learning_rate": 1.0450658105475126e-06, + "loss": 0.1141, + "step": 4465 + }, + { + "epoch": 3.19, + "grad_norm": 5.913543630708609, + "learning_rate": 1.0432981819546384e-06, + "loss": 0.1215, + "step": 4466 + }, + { + "epoch": 3.19, + "grad_norm": 7.435283990174136, + "learning_rate": 1.0415318753693837e-06, + "loss": 0.1531, + "step": 4467 + }, + { + "epoch": 3.19, + "grad_norm": 9.051636352276448, + "learning_rate": 1.0397668913819086e-06, + "loss": 0.1371, + "step": 4468 + }, + { + "epoch": 3.19, + "grad_norm": 6.3128900776674755, + "learning_rate": 1.0380032305819243e-06, + "loss": 0.1064, + "step": 4469 + }, + { + "epoch": 3.19, + "grad_norm": 8.142204214879126, + "learning_rate": 1.0362408935587026e-06, + "loss": 0.1516, + "step": 4470 + }, + { + "epoch": 3.19, + "grad_norm": 7.661871841618407, + "learning_rate": 1.0344798809010748e-06, + "loss": 0.1476, + "step": 4471 + }, + { + "epoch": 3.19, + "grad_norm": 11.682682233942165, + "learning_rate": 1.0327201931974262e-06, + "loss": 0.188, + "step": 4472 + }, + { + "epoch": 3.19, + "grad_norm": 4.381237196793194, + "learning_rate": 1.0309618310357023e-06, + "loss": 0.1077, + "step": 4473 + }, + { + "epoch": 3.19, + "grad_norm": 7.058241319504342, + "learning_rate": 1.0292047950034046e-06, + "loss": 0.1934, + "step": 4474 + }, + { + "epoch": 3.19, + "grad_norm": 4.707890326892382, + "learning_rate": 1.0274490856875908e-06, + "loss": 0.0759, + "step": 4475 + }, + { + "epoch": 3.19, + "grad_norm": 6.573092415383323, + "learning_rate": 1.0256947036748766e-06, + "loss": 0.1527, + "step": 4476 + }, + { + "epoch": 3.2, + "grad_norm": 5.479401416008466, + "learning_rate": 1.0239416495514331e-06, + "loss": 0.124, + "step": 4477 + }, + { + "epoch": 3.2, + "grad_norm": 4.404838785894077, + "learning_rate": 1.0221899239029887e-06, + "loss": 0.0833, + "step": 4478 + }, + { + "epoch": 3.2, + "grad_norm": 5.3591410165394775, + "learning_rate": 1.0204395273148277e-06, + "loss": 0.0904, + "step": 4479 + }, + { + "epoch": 3.2, + "grad_norm": 8.06517124521747, + "learning_rate": 1.0186904603717894e-06, + "loss": 0.1272, + "step": 4480 + }, + { + "epoch": 3.2, + "grad_norm": 8.295034878534254, + "learning_rate": 1.0169427236582702e-06, + "loss": 0.1395, + "step": 4481 + }, + { + "epoch": 3.2, + "grad_norm": 5.6017148138313075, + "learning_rate": 1.0151963177582208e-06, + "loss": 0.0892, + "step": 4482 + }, + { + "epoch": 3.2, + "grad_norm": 5.168505372280764, + "learning_rate": 1.0134512432551492e-06, + "loss": 0.0732, + "step": 4483 + }, + { + "epoch": 3.2, + "grad_norm": 4.844662307023592, + "learning_rate": 1.0117075007321152e-06, + "loss": 0.0717, + "step": 4484 + }, + { + "epoch": 3.2, + "grad_norm": 5.47237204481135, + "learning_rate": 1.009965090771739e-06, + "loss": 0.1017, + "step": 4485 + }, + { + "epoch": 3.2, + "grad_norm": 8.845533470002579, + "learning_rate": 1.0082240139561866e-06, + "loss": 0.1592, + "step": 4486 + }, + { + "epoch": 3.2, + "grad_norm": 8.253239344807614, + "learning_rate": 1.0064842708671908e-06, + "loss": 0.141, + "step": 4487 + }, + { + "epoch": 3.2, + "grad_norm": 6.3375223471928095, + "learning_rate": 1.0047458620860251e-06, + "loss": 0.1008, + "step": 4488 + }, + { + "epoch": 3.2, + "grad_norm": 5.7321531585346435, + "learning_rate": 1.0030087881935308e-06, + "loss": 0.0897, + "step": 4489 + }, + { + "epoch": 3.2, + "grad_norm": 7.816882454233154, + "learning_rate": 1.0012730497700912e-06, + "loss": 0.1459, + "step": 4490 + }, + { + "epoch": 3.21, + "grad_norm": 11.259126205309581, + "learning_rate": 9.995386473956531e-07, + "loss": 0.1963, + "step": 4491 + }, + { + "epoch": 3.21, + "grad_norm": 4.533563851322039, + "learning_rate": 9.978055816497084e-07, + "loss": 0.0829, + "step": 4492 + }, + { + "epoch": 3.21, + "grad_norm": 8.880117960445354, + "learning_rate": 9.960738531113118e-07, + "loss": 0.1495, + "step": 4493 + }, + { + "epoch": 3.21, + "grad_norm": 4.388909654201851, + "learning_rate": 9.94343462359061e-07, + "loss": 0.1064, + "step": 4494 + }, + { + "epoch": 3.21, + "grad_norm": 5.589157030071463, + "learning_rate": 9.926144099711138e-07, + "loss": 0.1085, + "step": 4495 + }, + { + "epoch": 3.21, + "grad_norm": 6.486877158713429, + "learning_rate": 9.90886696525179e-07, + "loss": 0.1301, + "step": 4496 + }, + { + "epoch": 3.21, + "grad_norm": 9.916504210295873, + "learning_rate": 9.89160322598517e-07, + "loss": 0.1409, + "step": 4497 + }, + { + "epoch": 3.21, + "grad_norm": 7.150060234978749, + "learning_rate": 9.874352887679416e-07, + "loss": 0.1315, + "step": 4498 + }, + { + "epoch": 3.21, + "grad_norm": 7.447202722521027, + "learning_rate": 9.857115956098196e-07, + "loss": 0.1461, + "step": 4499 + }, + { + "epoch": 3.21, + "grad_norm": 7.592805113175272, + "learning_rate": 9.839892437000675e-07, + "loss": 0.0893, + "step": 4500 + }, + { + "epoch": 3.21, + "eval_avg_AUC": 0.7918247035704028, + "eval_avg_Accuracy": 0.711414124668435, + "eval_avg_Accuracy-right": 0.8560062605973653, + "eval_avg_Accuracy-wrong": 0.45929042528997044, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6577722838853147, + "eval_last_AUC": 0.8121226339774985, + "eval_last_Accuracy": 0.7365716180371353, + "eval_last_Accuracy-right": 0.7942480761706012, + "eval_last_Accuracy-wrong": 0.6360018194223334, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6811064518145272, + "eval_max_AUC": 0.766090631206006, + "eval_max_Accuracy": 0.6466760610079576, + "eval_max_Accuracy-right": 0.9701317334028955, + "eval_max_Accuracy-wrong": 0.08267000227427791, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6136546806309608, + "eval_min_AUC": 0.7982965177859538, + "eval_min_Accuracy": 0.7182940981432361, + "eval_min_Accuracy-right": 0.6972088170079562, + "eval_min_Accuracy-wrong": 0.7550602683647942, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6616380620169989, + "eval_prod_AUC": 0.7992793561684441, + "eval_prod_Accuracy": 0.7075596816976127, + "eval_prod_Accuracy-right": 0.6415155862788574, + "eval_prod_Accuracy-wrong": 0.8227200363884467, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6610387644238402, + "eval_runtime": 246.5009, + "eval_samples_per_second": 97.882, + "eval_steps_per_second": 3.059, + "eval_sum_AUC": 0.6820948876089348, + "eval_sum_Accuracy": 0.6409565649867374, + "eval_sum_Accuracy-right": 0.9834355028042259, + "eval_sum_Accuracy-wrong": 0.04377984989765749, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6492510686816373, + "step": 4500 + }, + { + "epoch": 3.21, + "grad_norm": 6.017970370822423, + "learning_rate": 9.822682336141558e-07, + "loss": 0.1088, + "step": 4501 + }, + { + "epoch": 3.21, + "grad_norm": 8.347395956475857, + "learning_rate": 9.805485659271064e-07, + "loss": 0.1423, + "step": 4502 + }, + { + "epoch": 3.21, + "grad_norm": 6.147141262858037, + "learning_rate": 9.788302412134931e-07, + "loss": 0.1143, + "step": 4503 + }, + { + "epoch": 3.21, + "grad_norm": 5.992507716204142, + "learning_rate": 9.77113260047436e-07, + "loss": 0.1105, + "step": 4504 + }, + { + "epoch": 3.22, + "grad_norm": 5.896232384462128, + "learning_rate": 9.753976230026158e-07, + "loss": 0.1024, + "step": 4505 + }, + { + "epoch": 3.22, + "grad_norm": 7.730396170031993, + "learning_rate": 9.736833306522537e-07, + "loss": 0.1816, + "step": 4506 + }, + { + "epoch": 3.22, + "grad_norm": 9.87791454318401, + "learning_rate": 9.719703835691314e-07, + "loss": 0.1908, + "step": 4507 + }, + { + "epoch": 3.22, + "grad_norm": 9.305401921388526, + "learning_rate": 9.702587823255715e-07, + "loss": 0.1743, + "step": 4508 + }, + { + "epoch": 3.22, + "grad_norm": 12.725995171162081, + "learning_rate": 9.685485274934576e-07, + "loss": 0.2882, + "step": 4509 + }, + { + "epoch": 3.22, + "grad_norm": 9.06907116647378, + "learning_rate": 9.66839619644211e-07, + "loss": 0.1427, + "step": 4510 + }, + { + "epoch": 3.22, + "grad_norm": 5.154694219463353, + "learning_rate": 9.651320593488162e-07, + "loss": 0.1312, + "step": 4511 + }, + { + "epoch": 3.22, + "grad_norm": 5.18461317267077, + "learning_rate": 9.634258471777958e-07, + "loss": 0.1302, + "step": 4512 + }, + { + "epoch": 3.22, + "grad_norm": 8.304539402084123, + "learning_rate": 9.617209837012287e-07, + "loss": 0.129, + "step": 4513 + }, + { + "epoch": 3.22, + "grad_norm": 7.210423547955145, + "learning_rate": 9.600174694887421e-07, + "loss": 0.1196, + "step": 4514 + }, + { + "epoch": 3.22, + "grad_norm": 6.6024670811500705, + "learning_rate": 9.583153051095107e-07, + "loss": 0.113, + "step": 4515 + }, + { + "epoch": 3.22, + "grad_norm": 7.713142607529561, + "learning_rate": 9.5661449113226e-07, + "loss": 0.1359, + "step": 4516 + }, + { + "epoch": 3.22, + "grad_norm": 6.483198577305126, + "learning_rate": 9.549150281252633e-07, + "loss": 0.1104, + "step": 4517 + }, + { + "epoch": 3.22, + "grad_norm": 7.5373377619248, + "learning_rate": 9.532169166563426e-07, + "loss": 0.1328, + "step": 4518 + }, + { + "epoch": 3.23, + "grad_norm": 5.18316197627452, + "learning_rate": 9.515201572928689e-07, + "loss": 0.0995, + "step": 4519 + }, + { + "epoch": 3.23, + "grad_norm": 8.12338317679702, + "learning_rate": 9.49824750601761e-07, + "loss": 0.1083, + "step": 4520 + }, + { + "epoch": 3.23, + "grad_norm": 7.602592282544346, + "learning_rate": 9.481306971494858e-07, + "loss": 0.1244, + "step": 4521 + }, + { + "epoch": 3.23, + "grad_norm": 6.777997231167417, + "learning_rate": 9.464379975020576e-07, + "loss": 0.1606, + "step": 4522 + }, + { + "epoch": 3.23, + "grad_norm": 6.124881227801997, + "learning_rate": 9.447466522250393e-07, + "loss": 0.1326, + "step": 4523 + }, + { + "epoch": 3.23, + "grad_norm": 5.658499339226998, + "learning_rate": 9.430566618835407e-07, + "loss": 0.1031, + "step": 4524 + }, + { + "epoch": 3.23, + "grad_norm": 8.542957014977226, + "learning_rate": 9.413680270422187e-07, + "loss": 0.1439, + "step": 4525 + }, + { + "epoch": 3.23, + "grad_norm": 8.15924583194008, + "learning_rate": 9.396807482652775e-07, + "loss": 0.1235, + "step": 4526 + }, + { + "epoch": 3.23, + "grad_norm": 5.153176455458262, + "learning_rate": 9.3799482611647e-07, + "loss": 0.14, + "step": 4527 + }, + { + "epoch": 3.23, + "grad_norm": 6.857510589303771, + "learning_rate": 9.363102611590918e-07, + "loss": 0.1151, + "step": 4528 + }, + { + "epoch": 3.23, + "grad_norm": 6.841608683622035, + "learning_rate": 9.346270539559882e-07, + "loss": 0.1255, + "step": 4529 + }, + { + "epoch": 3.23, + "grad_norm": 6.349487702065171, + "learning_rate": 9.329452050695497e-07, + "loss": 0.1427, + "step": 4530 + }, + { + "epoch": 3.23, + "grad_norm": 5.185775397201693, + "learning_rate": 9.312647150617144e-07, + "loss": 0.1322, + "step": 4531 + }, + { + "epoch": 3.23, + "grad_norm": 5.993422226179697, + "learning_rate": 9.295855844939639e-07, + "loss": 0.1133, + "step": 4532 + }, + { + "epoch": 3.24, + "grad_norm": 6.877262751498541, + "learning_rate": 9.279078139273279e-07, + "loss": 0.1621, + "step": 4533 + }, + { + "epoch": 3.24, + "grad_norm": 10.9101913835771, + "learning_rate": 9.262314039223802e-07, + "loss": 0.1537, + "step": 4534 + }, + { + "epoch": 3.24, + "grad_norm": 8.089060660483195, + "learning_rate": 9.245563550392406e-07, + "loss": 0.1165, + "step": 4535 + }, + { + "epoch": 3.24, + "grad_norm": 5.9168601004208625, + "learning_rate": 9.22882667837574e-07, + "loss": 0.1515, + "step": 4536 + }, + { + "epoch": 3.24, + "grad_norm": 3.366227246368016, + "learning_rate": 9.212103428765912e-07, + "loss": 0.0607, + "step": 4537 + }, + { + "epoch": 3.24, + "grad_norm": 6.537651822483652, + "learning_rate": 9.19539380715046e-07, + "loss": 0.1354, + "step": 4538 + }, + { + "epoch": 3.24, + "grad_norm": 5.770155298405159, + "learning_rate": 9.178697819112381e-07, + "loss": 0.1273, + "step": 4539 + }, + { + "epoch": 3.24, + "grad_norm": 9.281594784034297, + "learning_rate": 9.162015470230123e-07, + "loss": 0.1356, + "step": 4540 + }, + { + "epoch": 3.24, + "grad_norm": 8.028631915325574, + "learning_rate": 9.145346766077562e-07, + "loss": 0.1834, + "step": 4541 + }, + { + "epoch": 3.24, + "grad_norm": 4.045279042279176, + "learning_rate": 9.128691712224025e-07, + "loss": 0.1058, + "step": 4542 + }, + { + "epoch": 3.24, + "grad_norm": 6.339980352393081, + "learning_rate": 9.112050314234272e-07, + "loss": 0.1326, + "step": 4543 + }, + { + "epoch": 3.24, + "grad_norm": 3.1982368923211744, + "learning_rate": 9.0954225776685e-07, + "loss": 0.0649, + "step": 4544 + }, + { + "epoch": 3.24, + "grad_norm": 5.253249493987398, + "learning_rate": 9.078808508082354e-07, + "loss": 0.0817, + "step": 4545 + }, + { + "epoch": 3.24, + "grad_norm": 7.390707096657996, + "learning_rate": 9.06220811102691e-07, + "loss": 0.1368, + "step": 4546 + }, + { + "epoch": 3.25, + "grad_norm": 15.032324120582194, + "learning_rate": 9.045621392048637e-07, + "loss": 0.2621, + "step": 4547 + }, + { + "epoch": 3.25, + "grad_norm": 8.212400030385552, + "learning_rate": 9.029048356689507e-07, + "loss": 0.1085, + "step": 4548 + }, + { + "epoch": 3.25, + "grad_norm": 7.560047001509493, + "learning_rate": 9.012489010486835e-07, + "loss": 0.1552, + "step": 4549 + }, + { + "epoch": 3.25, + "grad_norm": 9.24670417792784, + "learning_rate": 8.995943358973463e-07, + "loss": 0.134, + "step": 4550 + }, + { + "epoch": 3.25, + "grad_norm": 8.061594801279545, + "learning_rate": 8.979411407677535e-07, + "loss": 0.1624, + "step": 4551 + }, + { + "epoch": 3.25, + "grad_norm": 7.841536214499788, + "learning_rate": 8.962893162122749e-07, + "loss": 0.1432, + "step": 4552 + }, + { + "epoch": 3.25, + "grad_norm": 7.80332838186666, + "learning_rate": 8.946388627828106e-07, + "loss": 0.1351, + "step": 4553 + }, + { + "epoch": 3.25, + "grad_norm": 5.577103709697863, + "learning_rate": 8.929897810308102e-07, + "loss": 0.1639, + "step": 4554 + }, + { + "epoch": 3.25, + "grad_norm": 6.890676018724506, + "learning_rate": 8.913420715072619e-07, + "loss": 0.1366, + "step": 4555 + }, + { + "epoch": 3.25, + "grad_norm": 7.615943828764504, + "learning_rate": 8.896957347626966e-07, + "loss": 0.1583, + "step": 4556 + }, + { + "epoch": 3.25, + "grad_norm": 5.849886192759192, + "learning_rate": 8.880507713471853e-07, + "loss": 0.1052, + "step": 4557 + }, + { + "epoch": 3.25, + "grad_norm": 6.021066135784146, + "learning_rate": 8.864071818103415e-07, + "loss": 0.1222, + "step": 4558 + }, + { + "epoch": 3.25, + "grad_norm": 7.485990965887972, + "learning_rate": 8.847649667013187e-07, + "loss": 0.12, + "step": 4559 + }, + { + "epoch": 3.25, + "grad_norm": 5.798179875468671, + "learning_rate": 8.831241265688112e-07, + "loss": 0.145, + "step": 4560 + }, + { + "epoch": 3.26, + "grad_norm": 7.884685397021309, + "learning_rate": 8.814846619610545e-07, + "loss": 0.1251, + "step": 4561 + }, + { + "epoch": 3.26, + "grad_norm": 9.08712853808379, + "learning_rate": 8.79846573425826e-07, + "loss": 0.1326, + "step": 4562 + }, + { + "epoch": 3.26, + "grad_norm": 6.151088191408366, + "learning_rate": 8.782098615104373e-07, + "loss": 0.1555, + "step": 4563 + }, + { + "epoch": 3.26, + "grad_norm": 7.585973402310389, + "learning_rate": 8.765745267617487e-07, + "loss": 0.1342, + "step": 4564 + }, + { + "epoch": 3.26, + "grad_norm": 6.157889175348612, + "learning_rate": 8.749405697261515e-07, + "loss": 0.1348, + "step": 4565 + }, + { + "epoch": 3.26, + "grad_norm": 6.384538264495767, + "learning_rate": 8.733079909495868e-07, + "loss": 0.1028, + "step": 4566 + }, + { + "epoch": 3.26, + "grad_norm": 6.53422431942535, + "learning_rate": 8.716767909775231e-07, + "loss": 0.1035, + "step": 4567 + }, + { + "epoch": 3.26, + "grad_norm": 5.424476627599804, + "learning_rate": 8.700469703549802e-07, + "loss": 0.0785, + "step": 4568 + }, + { + "epoch": 3.26, + "grad_norm": 8.037680257428233, + "learning_rate": 8.684185296265074e-07, + "loss": 0.1572, + "step": 4569 + }, + { + "epoch": 3.26, + "grad_norm": 6.581140538771417, + "learning_rate": 8.667914693362006e-07, + "loss": 0.1245, + "step": 4570 + }, + { + "epoch": 3.26, + "grad_norm": 7.29417094424049, + "learning_rate": 8.651657900276878e-07, + "loss": 0.1288, + "step": 4571 + }, + { + "epoch": 3.26, + "grad_norm": 5.612495200486463, + "learning_rate": 8.635414922441398e-07, + "loss": 0.1091, + "step": 4572 + }, + { + "epoch": 3.26, + "grad_norm": 5.7330667780604205, + "learning_rate": 8.61918576528265e-07, + "loss": 0.147, + "step": 4573 + }, + { + "epoch": 3.26, + "grad_norm": 7.977158518523487, + "learning_rate": 8.60297043422309e-07, + "loss": 0.1433, + "step": 4574 + }, + { + "epoch": 3.27, + "grad_norm": 9.135644940015077, + "learning_rate": 8.586768934680572e-07, + "loss": 0.1904, + "step": 4575 + }, + { + "epoch": 3.27, + "grad_norm": 6.314051493515507, + "learning_rate": 8.570581272068307e-07, + "loss": 0.0915, + "step": 4576 + }, + { + "epoch": 3.27, + "grad_norm": 5.336134974937514, + "learning_rate": 8.554407451794905e-07, + "loss": 0.0985, + "step": 4577 + }, + { + "epoch": 3.27, + "grad_norm": 7.694521064289034, + "learning_rate": 8.538247479264327e-07, + "loss": 0.1609, + "step": 4578 + }, + { + "epoch": 3.27, + "grad_norm": 7.528554460208515, + "learning_rate": 8.522101359875934e-07, + "loss": 0.1183, + "step": 4579 + }, + { + "epoch": 3.27, + "grad_norm": 9.875424450214489, + "learning_rate": 8.505969099024436e-07, + "loss": 0.1346, + "step": 4580 + }, + { + "epoch": 3.27, + "grad_norm": 6.967351372296031, + "learning_rate": 8.489850702099922e-07, + "loss": 0.1478, + "step": 4581 + }, + { + "epoch": 3.27, + "grad_norm": 5.9745850342812785, + "learning_rate": 8.473746174487846e-07, + "loss": 0.1324, + "step": 4582 + }, + { + "epoch": 3.27, + "grad_norm": 9.59845328096796, + "learning_rate": 8.457655521569036e-07, + "loss": 0.1688, + "step": 4583 + }, + { + "epoch": 3.27, + "grad_norm": 6.193901791217303, + "learning_rate": 8.441578748719676e-07, + "loss": 0.1298, + "step": 4584 + }, + { + "epoch": 3.27, + "grad_norm": 5.411916620325072, + "learning_rate": 8.425515861311312e-07, + "loss": 0.1118, + "step": 4585 + }, + { + "epoch": 3.27, + "grad_norm": 11.622734989607526, + "learning_rate": 8.409466864710858e-07, + "loss": 0.1835, + "step": 4586 + }, + { + "epoch": 3.27, + "grad_norm": 7.586682013277674, + "learning_rate": 8.393431764280591e-07, + "loss": 0.123, + "step": 4587 + }, + { + "epoch": 3.27, + "grad_norm": 4.8412719082832485, + "learning_rate": 8.377410565378097e-07, + "loss": 0.1078, + "step": 4588 + }, + { + "epoch": 3.28, + "grad_norm": 8.79076733178493, + "learning_rate": 8.361403273356411e-07, + "loss": 0.1606, + "step": 4589 + }, + { + "epoch": 3.28, + "grad_norm": 6.946585563110569, + "learning_rate": 8.345409893563816e-07, + "loss": 0.1257, + "step": 4590 + }, + { + "epoch": 3.28, + "grad_norm": 11.064964839267628, + "learning_rate": 8.329430431344043e-07, + "loss": 0.174, + "step": 4591 + }, + { + "epoch": 3.28, + "grad_norm": 6.590378379957648, + "learning_rate": 8.313464892036083e-07, + "loss": 0.0907, + "step": 4592 + }, + { + "epoch": 3.28, + "grad_norm": 7.76238767588108, + "learning_rate": 8.297513280974362e-07, + "loss": 0.1501, + "step": 4593 + }, + { + "epoch": 3.28, + "grad_norm": 6.591967597672873, + "learning_rate": 8.281575603488573e-07, + "loss": 0.1106, + "step": 4594 + }, + { + "epoch": 3.28, + "grad_norm": 7.372299842456041, + "learning_rate": 8.265651864903823e-07, + "loss": 0.1545, + "step": 4595 + }, + { + "epoch": 3.28, + "grad_norm": 7.960406328039799, + "learning_rate": 8.249742070540506e-07, + "loss": 0.1243, + "step": 4596 + }, + { + "epoch": 3.28, + "grad_norm": 10.592265053976755, + "learning_rate": 8.233846225714386e-07, + "loss": 0.1826, + "step": 4597 + }, + { + "epoch": 3.28, + "grad_norm": 8.888448431893492, + "learning_rate": 8.217964335736556e-07, + "loss": 0.1232, + "step": 4598 + }, + { + "epoch": 3.28, + "grad_norm": 5.383633290511777, + "learning_rate": 8.202096405913462e-07, + "loss": 0.0842, + "step": 4599 + }, + { + "epoch": 3.28, + "grad_norm": 7.216804849791828, + "learning_rate": 8.186242441546866e-07, + "loss": 0.0851, + "step": 4600 + }, + { + "epoch": 3.28, + "grad_norm": 10.806207876507727, + "learning_rate": 8.170402447933873e-07, + "loss": 0.1648, + "step": 4601 + }, + { + "epoch": 3.28, + "grad_norm": 6.586285027658275, + "learning_rate": 8.154576430366922e-07, + "loss": 0.0989, + "step": 4602 + }, + { + "epoch": 3.29, + "grad_norm": 7.464315850177548, + "learning_rate": 8.13876439413378e-07, + "loss": 0.1607, + "step": 4603 + }, + { + "epoch": 3.29, + "grad_norm": 11.060642250176604, + "learning_rate": 8.122966344517536e-07, + "loss": 0.1982, + "step": 4604 + }, + { + "epoch": 3.29, + "grad_norm": 5.741654866635015, + "learning_rate": 8.107182286796633e-07, + "loss": 0.0873, + "step": 4605 + }, + { + "epoch": 3.29, + "grad_norm": 7.245045678499837, + "learning_rate": 8.091412226244771e-07, + "loss": 0.1475, + "step": 4606 + }, + { + "epoch": 3.29, + "grad_norm": 6.950428201411654, + "learning_rate": 8.07565616813108e-07, + "loss": 0.145, + "step": 4607 + }, + { + "epoch": 3.29, + "grad_norm": 10.063829105184517, + "learning_rate": 8.059914117719897e-07, + "loss": 0.1516, + "step": 4608 + }, + { + "epoch": 3.29, + "grad_norm": 7.472916859631381, + "learning_rate": 8.044186080270983e-07, + "loss": 0.1149, + "step": 4609 + }, + { + "epoch": 3.29, + "grad_norm": 6.777826055951287, + "learning_rate": 8.028472061039322e-07, + "loss": 0.1243, + "step": 4610 + }, + { + "epoch": 3.29, + "grad_norm": 10.824941387370806, + "learning_rate": 8.012772065275304e-07, + "loss": 0.1042, + "step": 4611 + }, + { + "epoch": 3.29, + "grad_norm": 9.813588704435642, + "learning_rate": 7.997086098224555e-07, + "loss": 0.1234, + "step": 4612 + }, + { + "epoch": 3.29, + "grad_norm": 9.105923319287843, + "learning_rate": 7.981414165128065e-07, + "loss": 0.1295, + "step": 4613 + }, + { + "epoch": 3.29, + "grad_norm": 6.18452983325367, + "learning_rate": 7.965756271222108e-07, + "loss": 0.1241, + "step": 4614 + }, + { + "epoch": 3.29, + "grad_norm": 5.555813455964336, + "learning_rate": 7.950112421738282e-07, + "loss": 0.101, + "step": 4615 + }, + { + "epoch": 3.29, + "grad_norm": 7.138099022811065, + "learning_rate": 7.934482621903494e-07, + "loss": 0.153, + "step": 4616 + }, + { + "epoch": 3.3, + "grad_norm": 5.029921247555546, + "learning_rate": 7.91886687693994e-07, + "loss": 0.103, + "step": 4617 + }, + { + "epoch": 3.3, + "grad_norm": 5.58503608433331, + "learning_rate": 7.903265192065141e-07, + "loss": 0.0841, + "step": 4618 + }, + { + "epoch": 3.3, + "grad_norm": 7.5716326952600115, + "learning_rate": 7.887677572491903e-07, + "loss": 0.1713, + "step": 4619 + }, + { + "epoch": 3.3, + "grad_norm": 6.81055005735616, + "learning_rate": 7.872104023428339e-07, + "loss": 0.1244, + "step": 4620 + }, + { + "epoch": 3.3, + "grad_norm": 4.568296952845966, + "learning_rate": 7.856544550077883e-07, + "loss": 0.088, + "step": 4621 + }, + { + "epoch": 3.3, + "grad_norm": 6.472127459738777, + "learning_rate": 7.840999157639195e-07, + "loss": 0.1244, + "step": 4622 + }, + { + "epoch": 3.3, + "grad_norm": 20.818327230629205, + "learning_rate": 7.825467851306335e-07, + "loss": 0.1603, + "step": 4623 + }, + { + "epoch": 3.3, + "grad_norm": 7.986882546336097, + "learning_rate": 7.809950636268554e-07, + "loss": 0.2106, + "step": 4624 + }, + { + "epoch": 3.3, + "grad_norm": 7.765351049579674, + "learning_rate": 7.794447517710485e-07, + "loss": 0.1481, + "step": 4625 + }, + { + "epoch": 3.3, + "grad_norm": 13.522443356939034, + "learning_rate": 7.778958500811961e-07, + "loss": 0.1749, + "step": 4626 + }, + { + "epoch": 3.3, + "grad_norm": 5.920109695380363, + "learning_rate": 7.7634835907482e-07, + "loss": 0.1223, + "step": 4627 + }, + { + "epoch": 3.3, + "grad_norm": 7.115723186086778, + "learning_rate": 7.748022792689613e-07, + "loss": 0.0972, + "step": 4628 + }, + { + "epoch": 3.3, + "grad_norm": 4.771496666945104, + "learning_rate": 7.732576111801982e-07, + "loss": 0.0928, + "step": 4629 + }, + { + "epoch": 3.3, + "grad_norm": 11.926008165260582, + "learning_rate": 7.717143553246298e-07, + "loss": 0.1371, + "step": 4630 + }, + { + "epoch": 3.31, + "grad_norm": 7.99102315789066, + "learning_rate": 7.701725122178871e-07, + "loss": 0.1481, + "step": 4631 + }, + { + "epoch": 3.31, + "grad_norm": 12.122142585290227, + "learning_rate": 7.686320823751298e-07, + "loss": 0.146, + "step": 4632 + }, + { + "epoch": 3.31, + "grad_norm": 8.408071273100916, + "learning_rate": 7.670930663110426e-07, + "loss": 0.1442, + "step": 4633 + }, + { + "epoch": 3.31, + "grad_norm": 6.679326638442814, + "learning_rate": 7.655554645398405e-07, + "loss": 0.1382, + "step": 4634 + }, + { + "epoch": 3.31, + "grad_norm": 25.188966590030155, + "learning_rate": 7.640192775752647e-07, + "loss": 0.129, + "step": 4635 + }, + { + "epoch": 3.31, + "grad_norm": 7.961221286961985, + "learning_rate": 7.624845059305836e-07, + "loss": 0.1493, + "step": 4636 + }, + { + "epoch": 3.31, + "grad_norm": 9.194206304233527, + "learning_rate": 7.609511501185929e-07, + "loss": 0.139, + "step": 4637 + }, + { + "epoch": 3.31, + "grad_norm": 6.073454340195965, + "learning_rate": 7.594192106516151e-07, + "loss": 0.1255, + "step": 4638 + }, + { + "epoch": 3.31, + "grad_norm": 6.415599395063826, + "learning_rate": 7.578886880414999e-07, + "loss": 0.1212, + "step": 4639 + }, + { + "epoch": 3.31, + "grad_norm": 5.17906997606143, + "learning_rate": 7.563595827996235e-07, + "loss": 0.1142, + "step": 4640 + }, + { + "epoch": 3.31, + "grad_norm": 9.769899102967289, + "learning_rate": 7.548318954368883e-07, + "loss": 0.1178, + "step": 4641 + }, + { + "epoch": 3.31, + "grad_norm": 6.54723718176854, + "learning_rate": 7.533056264637228e-07, + "loss": 0.1173, + "step": 4642 + }, + { + "epoch": 3.31, + "grad_norm": 5.078416054162937, + "learning_rate": 7.51780776390082e-07, + "loss": 0.1028, + "step": 4643 + }, + { + "epoch": 3.31, + "grad_norm": 10.329231158209321, + "learning_rate": 7.50257345725447e-07, + "loss": 0.1575, + "step": 4644 + }, + { + "epoch": 3.32, + "grad_norm": 5.934461236117057, + "learning_rate": 7.487353349788234e-07, + "loss": 0.1179, + "step": 4645 + }, + { + "epoch": 3.32, + "grad_norm": 11.00570511198538, + "learning_rate": 7.472147446587452e-07, + "loss": 0.1691, + "step": 4646 + }, + { + "epoch": 3.32, + "grad_norm": 6.461797158871515, + "learning_rate": 7.456955752732659e-07, + "loss": 0.1371, + "step": 4647 + }, + { + "epoch": 3.32, + "grad_norm": 5.756284807242618, + "learning_rate": 7.441778273299738e-07, + "loss": 0.0889, + "step": 4648 + }, + { + "epoch": 3.32, + "grad_norm": 7.33097177974351, + "learning_rate": 7.426615013359706e-07, + "loss": 0.1659, + "step": 4649 + }, + { + "epoch": 3.32, + "grad_norm": 5.974735983581375, + "learning_rate": 7.411465977978949e-07, + "loss": 0.1, + "step": 4650 + }, + { + "epoch": 3.32, + "grad_norm": 11.534506781082133, + "learning_rate": 7.396331172218996e-07, + "loss": 0.1552, + "step": 4651 + }, + { + "epoch": 3.32, + "grad_norm": 5.406433437663972, + "learning_rate": 7.381210601136702e-07, + "loss": 0.1144, + "step": 4652 + }, + { + "epoch": 3.32, + "grad_norm": 5.408309239317868, + "learning_rate": 7.366104269784086e-07, + "loss": 0.1577, + "step": 4653 + }, + { + "epoch": 3.32, + "grad_norm": 11.29949891157189, + "learning_rate": 7.351012183208511e-07, + "loss": 0.2051, + "step": 4654 + }, + { + "epoch": 3.32, + "grad_norm": 7.593810512608232, + "learning_rate": 7.335934346452484e-07, + "loss": 0.1118, + "step": 4655 + }, + { + "epoch": 3.32, + "grad_norm": 7.302955662252433, + "learning_rate": 7.320870764553795e-07, + "loss": 0.1782, + "step": 4656 + }, + { + "epoch": 3.32, + "grad_norm": 8.661085471989326, + "learning_rate": 7.305821442545474e-07, + "loss": 0.1105, + "step": 4657 + }, + { + "epoch": 3.32, + "grad_norm": 6.860609598218582, + "learning_rate": 7.290786385455778e-07, + "loss": 0.1268, + "step": 4658 + }, + { + "epoch": 3.33, + "grad_norm": 9.255949888842219, + "learning_rate": 7.275765598308199e-07, + "loss": 0.1941, + "step": 4659 + }, + { + "epoch": 3.33, + "grad_norm": 8.14943801715935, + "learning_rate": 7.26075908612146e-07, + "loss": 0.1467, + "step": 4660 + }, + { + "epoch": 3.33, + "grad_norm": 8.30112212336164, + "learning_rate": 7.245766853909519e-07, + "loss": 0.1393, + "step": 4661 + }, + { + "epoch": 3.33, + "grad_norm": 6.7267328951619785, + "learning_rate": 7.230788906681558e-07, + "loss": 0.1006, + "step": 4662 + }, + { + "epoch": 3.33, + "grad_norm": 9.21055385244117, + "learning_rate": 7.215825249441982e-07, + "loss": 0.1885, + "step": 4663 + }, + { + "epoch": 3.33, + "grad_norm": 12.814543386192565, + "learning_rate": 7.200875887190445e-07, + "loss": 0.1476, + "step": 4664 + }, + { + "epoch": 3.33, + "grad_norm": 9.004947303321043, + "learning_rate": 7.185940824921772e-07, + "loss": 0.1779, + "step": 4665 + }, + { + "epoch": 3.33, + "grad_norm": 7.554539232706164, + "learning_rate": 7.171020067626089e-07, + "loss": 0.1388, + "step": 4666 + }, + { + "epoch": 3.33, + "grad_norm": 8.517269227462895, + "learning_rate": 7.156113620288646e-07, + "loss": 0.2025, + "step": 4667 + }, + { + "epoch": 3.33, + "grad_norm": 6.16280886352939, + "learning_rate": 7.141221487890027e-07, + "loss": 0.1512, + "step": 4668 + }, + { + "epoch": 3.33, + "grad_norm": 6.039791042262847, + "learning_rate": 7.126343675405905e-07, + "loss": 0.084, + "step": 4669 + }, + { + "epoch": 3.33, + "grad_norm": 7.185117749237396, + "learning_rate": 7.111480187807296e-07, + "loss": 0.1752, + "step": 4670 + }, + { + "epoch": 3.33, + "grad_norm": 4.745690408307122, + "learning_rate": 7.096631030060308e-07, + "loss": 0.0933, + "step": 4671 + }, + { + "epoch": 3.33, + "grad_norm": 3.9267132250498196, + "learning_rate": 7.081796207126373e-07, + "loss": 0.0792, + "step": 4672 + }, + { + "epoch": 3.34, + "grad_norm": 3.914207317636215, + "learning_rate": 7.06697572396205e-07, + "loss": 0.0667, + "step": 4673 + }, + { + "epoch": 3.34, + "grad_norm": 5.048328757763309, + "learning_rate": 7.052169585519142e-07, + "loss": 0.0873, + "step": 4674 + }, + { + "epoch": 3.34, + "grad_norm": 6.948918663582835, + "learning_rate": 7.037377796744666e-07, + "loss": 0.1484, + "step": 4675 + }, + { + "epoch": 3.34, + "grad_norm": 5.111718013258758, + "learning_rate": 7.022600362580817e-07, + "loss": 0.1198, + "step": 4676 + }, + { + "epoch": 3.34, + "grad_norm": 4.507791776880594, + "learning_rate": 7.007837287965024e-07, + "loss": 0.1039, + "step": 4677 + }, + { + "epoch": 3.34, + "grad_norm": 7.7886653073378005, + "learning_rate": 6.993088577829904e-07, + "loss": 0.1141, + "step": 4678 + }, + { + "epoch": 3.34, + "grad_norm": 11.19723147804638, + "learning_rate": 6.978354237103264e-07, + "loss": 0.1788, + "step": 4679 + }, + { + "epoch": 3.34, + "grad_norm": 5.665500446115932, + "learning_rate": 6.963634270708137e-07, + "loss": 0.1144, + "step": 4680 + }, + { + "epoch": 3.34, + "grad_norm": 4.1211146195001245, + "learning_rate": 6.948928683562722e-07, + "loss": 0.0544, + "step": 4681 + }, + { + "epoch": 3.34, + "grad_norm": 5.297345133083735, + "learning_rate": 6.934237480580435e-07, + "loss": 0.0996, + "step": 4682 + }, + { + "epoch": 3.34, + "grad_norm": 6.441164589598402, + "learning_rate": 6.919560666669889e-07, + "loss": 0.1211, + "step": 4683 + }, + { + "epoch": 3.34, + "grad_norm": 7.607568415682107, + "learning_rate": 6.904898246734864e-07, + "loss": 0.1151, + "step": 4684 + }, + { + "epoch": 3.34, + "grad_norm": 7.063782633192273, + "learning_rate": 6.890250225674361e-07, + "loss": 0.134, + "step": 4685 + }, + { + "epoch": 3.34, + "grad_norm": 7.207873640293048, + "learning_rate": 6.875616608382562e-07, + "loss": 0.153, + "step": 4686 + }, + { + "epoch": 3.35, + "grad_norm": 6.1416823543894346, + "learning_rate": 6.860997399748792e-07, + "loss": 0.1389, + "step": 4687 + }, + { + "epoch": 3.35, + "grad_norm": 4.1354245919128685, + "learning_rate": 6.846392604657653e-07, + "loss": 0.0589, + "step": 4688 + }, + { + "epoch": 3.35, + "grad_norm": 5.418657085737841, + "learning_rate": 6.831802227988843e-07, + "loss": 0.0791, + "step": 4689 + }, + { + "epoch": 3.35, + "grad_norm": 5.212588473079516, + "learning_rate": 6.817226274617283e-07, + "loss": 0.0958, + "step": 4690 + }, + { + "epoch": 3.35, + "grad_norm": 5.918837142437118, + "learning_rate": 6.802664749413079e-07, + "loss": 0.1281, + "step": 4691 + }, + { + "epoch": 3.35, + "grad_norm": 8.434507021495676, + "learning_rate": 6.788117657241506e-07, + "loss": 0.1213, + "step": 4692 + }, + { + "epoch": 3.35, + "grad_norm": 7.487469942524017, + "learning_rate": 6.773585002963007e-07, + "loss": 0.1587, + "step": 4693 + }, + { + "epoch": 3.35, + "grad_norm": 7.914773574255319, + "learning_rate": 6.759066791433228e-07, + "loss": 0.1387, + "step": 4694 + }, + { + "epoch": 3.35, + "grad_norm": 9.230097938274037, + "learning_rate": 6.744563027502959e-07, + "loss": 0.139, + "step": 4695 + }, + { + "epoch": 3.35, + "grad_norm": 7.406132054228652, + "learning_rate": 6.730073716018187e-07, + "loss": 0.1562, + "step": 4696 + }, + { + "epoch": 3.35, + "grad_norm": 6.54202900315914, + "learning_rate": 6.715598861820055e-07, + "loss": 0.1226, + "step": 4697 + }, + { + "epoch": 3.35, + "grad_norm": 5.922145967208217, + "learning_rate": 6.701138469744883e-07, + "loss": 0.111, + "step": 4698 + }, + { + "epoch": 3.35, + "grad_norm": 6.397305802789072, + "learning_rate": 6.686692544624157e-07, + "loss": 0.1105, + "step": 4699 + }, + { + "epoch": 3.35, + "grad_norm": 6.660593338022077, + "learning_rate": 6.672261091284526e-07, + "loss": 0.1178, + "step": 4700 + }, + { + "epoch": 3.36, + "grad_norm": 8.2651428920971, + "learning_rate": 6.657844114547812e-07, + "loss": 0.1681, + "step": 4701 + }, + { + "epoch": 3.36, + "grad_norm": 5.319009621262608, + "learning_rate": 6.643441619230989e-07, + "loss": 0.094, + "step": 4702 + }, + { + "epoch": 3.36, + "grad_norm": 6.571037009481898, + "learning_rate": 6.629053610146202e-07, + "loss": 0.1138, + "step": 4703 + }, + { + "epoch": 3.36, + "grad_norm": 7.379231512952576, + "learning_rate": 6.61468009210075e-07, + "loss": 0.1219, + "step": 4704 + }, + { + "epoch": 3.36, + "grad_norm": 6.991505241069526, + "learning_rate": 6.600321069897097e-07, + "loss": 0.1466, + "step": 4705 + }, + { + "epoch": 3.36, + "grad_norm": 11.1500655678584, + "learning_rate": 6.585976548332856e-07, + "loss": 0.2023, + "step": 4706 + }, + { + "epoch": 3.36, + "grad_norm": 7.836974094823008, + "learning_rate": 6.571646532200815e-07, + "loss": 0.1, + "step": 4707 + }, + { + "epoch": 3.36, + "grad_norm": 3.2192067894980907, + "learning_rate": 6.557331026288855e-07, + "loss": 0.0592, + "step": 4708 + }, + { + "epoch": 3.36, + "grad_norm": 4.990968911316154, + "learning_rate": 6.543030035380099e-07, + "loss": 0.1028, + "step": 4709 + }, + { + "epoch": 3.36, + "grad_norm": 5.338984666872942, + "learning_rate": 6.528743564252737e-07, + "loss": 0.1162, + "step": 4710 + }, + { + "epoch": 3.36, + "grad_norm": 5.645866648570372, + "learning_rate": 6.514471617680184e-07, + "loss": 0.1099, + "step": 4711 + }, + { + "epoch": 3.36, + "grad_norm": 4.5825772199128085, + "learning_rate": 6.500214200430921e-07, + "loss": 0.1029, + "step": 4712 + }, + { + "epoch": 3.36, + "grad_norm": 5.144375515006759, + "learning_rate": 6.485971317268658e-07, + "loss": 0.1123, + "step": 4713 + }, + { + "epoch": 3.36, + "grad_norm": 5.7099658296260705, + "learning_rate": 6.471742972952172e-07, + "loss": 0.1021, + "step": 4714 + }, + { + "epoch": 3.37, + "grad_norm": 6.454486821044744, + "learning_rate": 6.457529172235427e-07, + "loss": 0.1027, + "step": 4715 + }, + { + "epoch": 3.37, + "grad_norm": 5.6424283925645256, + "learning_rate": 6.44332991986753e-07, + "loss": 0.1061, + "step": 4716 + }, + { + "epoch": 3.37, + "grad_norm": 5.0353165182692345, + "learning_rate": 6.429145220592703e-07, + "loss": 0.1191, + "step": 4717 + }, + { + "epoch": 3.37, + "grad_norm": 7.423744958784793, + "learning_rate": 6.414975079150321e-07, + "loss": 0.1349, + "step": 4718 + }, + { + "epoch": 3.37, + "grad_norm": 7.998700076551767, + "learning_rate": 6.400819500274891e-07, + "loss": 0.1064, + "step": 4719 + }, + { + "epoch": 3.37, + "grad_norm": 6.338283445411675, + "learning_rate": 6.386678488696057e-07, + "loss": 0.1465, + "step": 4720 + }, + { + "epoch": 3.37, + "grad_norm": 7.025518446998439, + "learning_rate": 6.372552049138591e-07, + "loss": 0.1173, + "step": 4721 + }, + { + "epoch": 3.37, + "grad_norm": 6.876253982615516, + "learning_rate": 6.358440186322401e-07, + "loss": 0.092, + "step": 4722 + }, + { + "epoch": 3.37, + "grad_norm": 8.410474264470912, + "learning_rate": 6.344342904962536e-07, + "loss": 0.0841, + "step": 4723 + }, + { + "epoch": 3.37, + "grad_norm": 5.331845239953185, + "learning_rate": 6.330260209769124e-07, + "loss": 0.1038, + "step": 4724 + }, + { + "epoch": 3.37, + "grad_norm": 7.044163364551721, + "learning_rate": 6.316192105447499e-07, + "loss": 0.1189, + "step": 4725 + }, + { + "epoch": 3.37, + "grad_norm": 4.123656229134195, + "learning_rate": 6.302138596698032e-07, + "loss": 0.0798, + "step": 4726 + }, + { + "epoch": 3.37, + "grad_norm": 7.533338908312375, + "learning_rate": 6.288099688216309e-07, + "loss": 0.1392, + "step": 4727 + }, + { + "epoch": 3.37, + "grad_norm": 7.886494343160055, + "learning_rate": 6.27407538469294e-07, + "loss": 0.1273, + "step": 4728 + }, + { + "epoch": 3.38, + "grad_norm": 7.451374710261631, + "learning_rate": 6.260065690813754e-07, + "loss": 0.163, + "step": 4729 + }, + { + "epoch": 3.38, + "grad_norm": 5.280301091354748, + "learning_rate": 6.246070611259603e-07, + "loss": 0.0771, + "step": 4730 + }, + { + "epoch": 3.38, + "grad_norm": 6.950197928743349, + "learning_rate": 6.232090150706555e-07, + "loss": 0.1183, + "step": 4731 + }, + { + "epoch": 3.38, + "grad_norm": 7.311848775780406, + "learning_rate": 6.218124313825696e-07, + "loss": 0.1131, + "step": 4732 + }, + { + "epoch": 3.38, + "grad_norm": 6.961362394740585, + "learning_rate": 6.204173105283295e-07, + "loss": 0.1559, + "step": 4733 + }, + { + "epoch": 3.38, + "grad_norm": 6.77422676116153, + "learning_rate": 6.190236529740701e-07, + "loss": 0.1699, + "step": 4734 + }, + { + "epoch": 3.38, + "grad_norm": 8.929780803606354, + "learning_rate": 6.176314591854388e-07, + "loss": 0.168, + "step": 4735 + }, + { + "epoch": 3.38, + "grad_norm": 6.220542108817106, + "learning_rate": 6.162407296275936e-07, + "loss": 0.1192, + "step": 4736 + }, + { + "epoch": 3.38, + "grad_norm": 20.56705499979564, + "learning_rate": 6.148514647652026e-07, + "loss": 0.149, + "step": 4737 + }, + { + "epoch": 3.38, + "grad_norm": 9.716608379449566, + "learning_rate": 6.134636650624448e-07, + "loss": 0.1365, + "step": 4738 + }, + { + "epoch": 3.38, + "grad_norm": 5.082791614875223, + "learning_rate": 6.120773309830108e-07, + "loss": 0.1092, + "step": 4739 + }, + { + "epoch": 3.38, + "grad_norm": 4.58955029935693, + "learning_rate": 6.106924629900996e-07, + "loss": 0.1178, + "step": 4740 + }, + { + "epoch": 3.38, + "grad_norm": 7.309633988207233, + "learning_rate": 6.09309061546422e-07, + "loss": 0.0879, + "step": 4741 + }, + { + "epoch": 3.38, + "grad_norm": 8.559534602189101, + "learning_rate": 6.079271271141979e-07, + "loss": 0.1665, + "step": 4742 + }, + { + "epoch": 3.39, + "grad_norm": 6.906477266849566, + "learning_rate": 6.065466601551578e-07, + "loss": 0.1103, + "step": 4743 + }, + { + "epoch": 3.39, + "grad_norm": 9.231629348906024, + "learning_rate": 6.051676611305401e-07, + "loss": 0.1409, + "step": 4744 + }, + { + "epoch": 3.39, + "grad_norm": 5.120806072520015, + "learning_rate": 6.037901305010951e-07, + "loss": 0.1036, + "step": 4745 + }, + { + "epoch": 3.39, + "grad_norm": 7.357155083012127, + "learning_rate": 6.024140687270813e-07, + "loss": 0.1104, + "step": 4746 + }, + { + "epoch": 3.39, + "grad_norm": 5.399708029837064, + "learning_rate": 6.010394762682659e-07, + "loss": 0.0915, + "step": 4747 + }, + { + "epoch": 3.39, + "grad_norm": 5.7543160484824085, + "learning_rate": 5.996663535839275e-07, + "loss": 0.0948, + "step": 4748 + }, + { + "epoch": 3.39, + "grad_norm": 10.87274696278645, + "learning_rate": 5.982947011328489e-07, + "loss": 0.1655, + "step": 4749 + }, + { + "epoch": 3.39, + "grad_norm": 6.79729828235239, + "learning_rate": 5.969245193733275e-07, + "loss": 0.0967, + "step": 4750 + }, + { + "epoch": 3.39, + "grad_norm": 7.446921297838255, + "learning_rate": 5.955558087631641e-07, + "loss": 0.1556, + "step": 4751 + }, + { + "epoch": 3.39, + "grad_norm": 6.479713414080842, + "learning_rate": 5.941885697596734e-07, + "loss": 0.1226, + "step": 4752 + }, + { + "epoch": 3.39, + "grad_norm": 8.37872487494799, + "learning_rate": 5.928228028196714e-07, + "loss": 0.1355, + "step": 4753 + }, + { + "epoch": 3.39, + "grad_norm": 7.936869826658227, + "learning_rate": 5.914585083994906e-07, + "loss": 0.1339, + "step": 4754 + }, + { + "epoch": 3.39, + "grad_norm": 6.964322624359045, + "learning_rate": 5.900956869549629e-07, + "loss": 0.1367, + "step": 4755 + }, + { + "epoch": 3.39, + "grad_norm": 7.650251277890639, + "learning_rate": 5.887343389414363e-07, + "loss": 0.1934, + "step": 4756 + }, + { + "epoch": 3.4, + "grad_norm": 9.021005113566153, + "learning_rate": 5.873744648137592e-07, + "loss": 0.1357, + "step": 4757 + }, + { + "epoch": 3.4, + "grad_norm": 11.186719406066896, + "learning_rate": 5.860160650262925e-07, + "loss": 0.1373, + "step": 4758 + }, + { + "epoch": 3.4, + "grad_norm": 3.9630385462474855, + "learning_rate": 5.846591400329021e-07, + "loss": 0.0551, + "step": 4759 + }, + { + "epoch": 3.4, + "grad_norm": 7.011343925108888, + "learning_rate": 5.833036902869626e-07, + "loss": 0.0994, + "step": 4760 + }, + { + "epoch": 3.4, + "grad_norm": 8.396161662969257, + "learning_rate": 5.81949716241354e-07, + "loss": 0.1549, + "step": 4761 + }, + { + "epoch": 3.4, + "grad_norm": 7.423724973182227, + "learning_rate": 5.805972183484654e-07, + "loss": 0.1567, + "step": 4762 + }, + { + "epoch": 3.4, + "grad_norm": 6.139312833169541, + "learning_rate": 5.792461970601903e-07, + "loss": 0.1575, + "step": 4763 + }, + { + "epoch": 3.4, + "grad_norm": 5.275973947423691, + "learning_rate": 5.778966528279306e-07, + "loss": 0.0974, + "step": 4764 + }, + { + "epoch": 3.4, + "grad_norm": 6.929297521573184, + "learning_rate": 5.765485861025944e-07, + "loss": 0.1428, + "step": 4765 + }, + { + "epoch": 3.4, + "grad_norm": 9.094874672404059, + "learning_rate": 5.752019973345963e-07, + "loss": 0.1667, + "step": 4766 + }, + { + "epoch": 3.4, + "grad_norm": 5.088141421563402, + "learning_rate": 5.738568869738537e-07, + "loss": 0.1113, + "step": 4767 + }, + { + "epoch": 3.4, + "grad_norm": 6.680419138288556, + "learning_rate": 5.725132554697971e-07, + "loss": 0.1432, + "step": 4768 + }, + { + "epoch": 3.4, + "grad_norm": 5.9210508941282285, + "learning_rate": 5.711711032713547e-07, + "loss": 0.1071, + "step": 4769 + }, + { + "epoch": 3.4, + "grad_norm": 7.310725061193263, + "learning_rate": 5.698304308269686e-07, + "loss": 0.1504, + "step": 4770 + }, + { + "epoch": 3.41, + "grad_norm": 6.741656309899917, + "learning_rate": 5.684912385845786e-07, + "loss": 0.1337, + "step": 4771 + }, + { + "epoch": 3.41, + "grad_norm": 8.428815406923771, + "learning_rate": 5.671535269916373e-07, + "loss": 0.1229, + "step": 4772 + }, + { + "epoch": 3.41, + "grad_norm": 6.190650489682488, + "learning_rate": 5.658172964950953e-07, + "loss": 0.1238, + "step": 4773 + }, + { + "epoch": 3.41, + "grad_norm": 4.782034891009411, + "learning_rate": 5.644825475414162e-07, + "loss": 0.0731, + "step": 4774 + }, + { + "epoch": 3.41, + "grad_norm": 6.444261645843906, + "learning_rate": 5.631492805765609e-07, + "loss": 0.1149, + "step": 4775 + }, + { + "epoch": 3.41, + "grad_norm": 6.550284656212252, + "learning_rate": 5.618174960459999e-07, + "loss": 0.1343, + "step": 4776 + }, + { + "epoch": 3.41, + "grad_norm": 6.641197272560726, + "learning_rate": 5.604871943947071e-07, + "loss": 0.1244, + "step": 4777 + }, + { + "epoch": 3.41, + "grad_norm": 16.192125850606384, + "learning_rate": 5.591583760671609e-07, + "loss": 0.175, + "step": 4778 + }, + { + "epoch": 3.41, + "grad_norm": 4.770239360221412, + "learning_rate": 5.578310415073451e-07, + "loss": 0.0812, + "step": 4779 + }, + { + "epoch": 3.41, + "grad_norm": 9.519992706791493, + "learning_rate": 5.565051911587455e-07, + "loss": 0.156, + "step": 4780 + }, + { + "epoch": 3.41, + "grad_norm": 6.899766338731636, + "learning_rate": 5.551808254643543e-07, + "loss": 0.1325, + "step": 4781 + }, + { + "epoch": 3.41, + "grad_norm": 5.377638893489806, + "learning_rate": 5.538579448666675e-07, + "loss": 0.1172, + "step": 4782 + }, + { + "epoch": 3.41, + "grad_norm": 5.657629714932012, + "learning_rate": 5.525365498076807e-07, + "loss": 0.0878, + "step": 4783 + }, + { + "epoch": 3.41, + "grad_norm": 6.999669463350998, + "learning_rate": 5.51216640728901e-07, + "loss": 0.1361, + "step": 4784 + }, + { + "epoch": 3.42, + "grad_norm": 7.528022836641468, + "learning_rate": 5.498982180713308e-07, + "loss": 0.161, + "step": 4785 + }, + { + "epoch": 3.42, + "grad_norm": 13.981093221771234, + "learning_rate": 5.485812822754826e-07, + "loss": 0.2415, + "step": 4786 + }, + { + "epoch": 3.42, + "grad_norm": 6.204609953884755, + "learning_rate": 5.472658337813664e-07, + "loss": 0.0935, + "step": 4787 + }, + { + "epoch": 3.42, + "grad_norm": 9.851833696225833, + "learning_rate": 5.459518730285007e-07, + "loss": 0.1015, + "step": 4788 + }, + { + "epoch": 3.42, + "grad_norm": 8.172761495859053, + "learning_rate": 5.446394004559008e-07, + "loss": 0.1106, + "step": 4789 + }, + { + "epoch": 3.42, + "grad_norm": 4.069756351444801, + "learning_rate": 5.43328416502093e-07, + "loss": 0.0833, + "step": 4790 + }, + { + "epoch": 3.42, + "grad_norm": 7.754921719414479, + "learning_rate": 5.420189216050969e-07, + "loss": 0.1276, + "step": 4791 + }, + { + "epoch": 3.42, + "grad_norm": 6.097853307500944, + "learning_rate": 5.407109162024409e-07, + "loss": 0.1288, + "step": 4792 + }, + { + "epoch": 3.42, + "grad_norm": 7.6596367143990065, + "learning_rate": 5.394044007311544e-07, + "loss": 0.1259, + "step": 4793 + }, + { + "epoch": 3.42, + "grad_norm": 9.109859513949639, + "learning_rate": 5.380993756277675e-07, + "loss": 0.1785, + "step": 4794 + }, + { + "epoch": 3.42, + "grad_norm": 5.758617786067658, + "learning_rate": 5.367958413283141e-07, + "loss": 0.1019, + "step": 4795 + }, + { + "epoch": 3.42, + "grad_norm": 10.073887412643687, + "learning_rate": 5.354937982683283e-07, + "loss": 0.1434, + "step": 4796 + }, + { + "epoch": 3.42, + "grad_norm": 20.106138254105133, + "learning_rate": 5.341932468828481e-07, + "loss": 0.0942, + "step": 4797 + }, + { + "epoch": 3.42, + "grad_norm": 7.547332850402219, + "learning_rate": 5.328941876064114e-07, + "loss": 0.1196, + "step": 4798 + }, + { + "epoch": 3.43, + "grad_norm": 9.800610244619884, + "learning_rate": 5.315966208730578e-07, + "loss": 0.1415, + "step": 4799 + }, + { + "epoch": 3.43, + "grad_norm": 5.735215749717641, + "learning_rate": 5.30300547116328e-07, + "loss": 0.1469, + "step": 4800 + }, + { + "epoch": 3.43, + "grad_norm": 6.803423115653081, + "learning_rate": 5.290059667692655e-07, + "loss": 0.1031, + "step": 4801 + }, + { + "epoch": 3.43, + "grad_norm": 7.114085098454165, + "learning_rate": 5.277128802644133e-07, + "loss": 0.1191, + "step": 4802 + }, + { + "epoch": 3.43, + "grad_norm": 10.514855529211154, + "learning_rate": 5.264212880338154e-07, + "loss": 0.1995, + "step": 4803 + }, + { + "epoch": 3.43, + "grad_norm": 7.725958167463443, + "learning_rate": 5.251311905090167e-07, + "loss": 0.0878, + "step": 4804 + }, + { + "epoch": 3.43, + "grad_norm": 6.131421273398189, + "learning_rate": 5.238425881210624e-07, + "loss": 0.0955, + "step": 4805 + }, + { + "epoch": 3.43, + "grad_norm": 9.949154525113872, + "learning_rate": 5.225554813004996e-07, + "loss": 0.1548, + "step": 4806 + }, + { + "epoch": 3.43, + "grad_norm": 8.331347956438186, + "learning_rate": 5.21269870477375e-07, + "loss": 0.1467, + "step": 4807 + }, + { + "epoch": 3.43, + "grad_norm": 6.448868593975817, + "learning_rate": 5.199857560812316e-07, + "loss": 0.1307, + "step": 4808 + }, + { + "epoch": 3.43, + "grad_norm": 9.401074138435595, + "learning_rate": 5.187031385411206e-07, + "loss": 0.1598, + "step": 4809 + }, + { + "epoch": 3.43, + "grad_norm": 8.997603436753522, + "learning_rate": 5.174220182855844e-07, + "loss": 0.1896, + "step": 4810 + }, + { + "epoch": 3.43, + "grad_norm": 5.9623142347970015, + "learning_rate": 5.161423957426725e-07, + "loss": 0.1287, + "step": 4811 + }, + { + "epoch": 3.43, + "grad_norm": 7.338573874367029, + "learning_rate": 5.148642713399272e-07, + "loss": 0.112, + "step": 4812 + }, + { + "epoch": 3.44, + "grad_norm": 7.146812781041527, + "learning_rate": 5.13587645504397e-07, + "loss": 0.1658, + "step": 4813 + }, + { + "epoch": 3.44, + "grad_norm": 9.006764590662835, + "learning_rate": 5.123125186626227e-07, + "loss": 0.2671, + "step": 4814 + }, + { + "epoch": 3.44, + "grad_norm": 6.677054214930086, + "learning_rate": 5.110388912406517e-07, + "loss": 0.1119, + "step": 4815 + }, + { + "epoch": 3.44, + "grad_norm": 5.786888005642126, + "learning_rate": 5.097667636640241e-07, + "loss": 0.117, + "step": 4816 + }, + { + "epoch": 3.44, + "grad_norm": 8.566924141847709, + "learning_rate": 5.084961363577817e-07, + "loss": 0.1681, + "step": 4817 + }, + { + "epoch": 3.44, + "grad_norm": 6.283648205807579, + "learning_rate": 5.072270097464649e-07, + "loss": 0.1382, + "step": 4818 + }, + { + "epoch": 3.44, + "grad_norm": 6.70248104170661, + "learning_rate": 5.059593842541127e-07, + "loss": 0.1403, + "step": 4819 + }, + { + "epoch": 3.44, + "grad_norm": 4.145804986104878, + "learning_rate": 5.04693260304262e-07, + "loss": 0.0723, + "step": 4820 + }, + { + "epoch": 3.44, + "grad_norm": 7.58725175311385, + "learning_rate": 5.034286383199488e-07, + "loss": 0.1171, + "step": 4821 + }, + { + "epoch": 3.44, + "grad_norm": 5.057300916102727, + "learning_rate": 5.021655187237067e-07, + "loss": 0.1077, + "step": 4822 + }, + { + "epoch": 3.44, + "grad_norm": 4.823300742743953, + "learning_rate": 5.009039019375672e-07, + "loss": 0.1105, + "step": 4823 + }, + { + "epoch": 3.44, + "grad_norm": 6.406379850384206, + "learning_rate": 4.996437883830596e-07, + "loss": 0.1053, + "step": 4824 + }, + { + "epoch": 3.44, + "grad_norm": 7.476297609589221, + "learning_rate": 4.983851784812127e-07, + "loss": 0.1573, + "step": 4825 + }, + { + "epoch": 3.44, + "grad_norm": 10.028708462150261, + "learning_rate": 4.97128072652549e-07, + "loss": 0.0988, + "step": 4826 + }, + { + "epoch": 3.45, + "grad_norm": 6.70771060659306, + "learning_rate": 4.958724713170943e-07, + "loss": 0.121, + "step": 4827 + }, + { + "epoch": 3.45, + "grad_norm": 7.550234303625725, + "learning_rate": 4.946183748943639e-07, + "loss": 0.1516, + "step": 4828 + }, + { + "epoch": 3.45, + "grad_norm": 4.250072350581676, + "learning_rate": 4.933657838033795e-07, + "loss": 0.0782, + "step": 4829 + }, + { + "epoch": 3.45, + "grad_norm": 14.901528479240605, + "learning_rate": 4.921146984626507e-07, + "loss": 0.2083, + "step": 4830 + }, + { + "epoch": 3.45, + "grad_norm": 8.861570669785552, + "learning_rate": 4.908651192901926e-07, + "loss": 0.1427, + "step": 4831 + }, + { + "epoch": 3.45, + "grad_norm": 7.774296537989752, + "learning_rate": 4.896170467035089e-07, + "loss": 0.1311, + "step": 4832 + }, + { + "epoch": 3.45, + "grad_norm": 6.097843488270496, + "learning_rate": 4.883704811196072e-07, + "loss": 0.1015, + "step": 4833 + }, + { + "epoch": 3.45, + "grad_norm": 5.944954993942699, + "learning_rate": 4.871254229549855e-07, + "loss": 0.1368, + "step": 4834 + }, + { + "epoch": 3.45, + "grad_norm": 6.610591286434344, + "learning_rate": 4.858818726256425e-07, + "loss": 0.1675, + "step": 4835 + }, + { + "epoch": 3.45, + "grad_norm": 7.955225527923404, + "learning_rate": 4.846398305470712e-07, + "loss": 0.1721, + "step": 4836 + }, + { + "epoch": 3.45, + "grad_norm": 10.896382771601361, + "learning_rate": 4.833992971342604e-07, + "loss": 0.1346, + "step": 4837 + }, + { + "epoch": 3.45, + "grad_norm": 4.712058505532531, + "learning_rate": 4.821602728016955e-07, + "loss": 0.0718, + "step": 4838 + }, + { + "epoch": 3.45, + "grad_norm": 4.889206680905457, + "learning_rate": 4.809227579633585e-07, + "loss": 0.0939, + "step": 4839 + }, + { + "epoch": 3.45, + "grad_norm": 6.531400313220989, + "learning_rate": 4.796867530327249e-07, + "loss": 0.1354, + "step": 4840 + }, + { + "epoch": 3.46, + "grad_norm": 6.121051370381773, + "learning_rate": 4.784522584227675e-07, + "loss": 0.1246, + "step": 4841 + }, + { + "epoch": 3.46, + "grad_norm": 7.447583991391642, + "learning_rate": 4.772192745459536e-07, + "loss": 0.1349, + "step": 4842 + }, + { + "epoch": 3.46, + "grad_norm": 7.9717320548571236, + "learning_rate": 4.7598780181424666e-07, + "loss": 0.1399, + "step": 4843 + }, + { + "epoch": 3.46, + "grad_norm": 7.189759067456543, + "learning_rate": 4.7475784063910404e-07, + "loss": 0.1154, + "step": 4844 + }, + { + "epoch": 3.46, + "grad_norm": 4.200052811630355, + "learning_rate": 4.7352939143147927e-07, + "loss": 0.1024, + "step": 4845 + }, + { + "epoch": 3.46, + "grad_norm": 7.327313296684159, + "learning_rate": 4.72302454601819e-07, + "loss": 0.1127, + "step": 4846 + }, + { + "epoch": 3.46, + "grad_norm": 4.821690226987369, + "learning_rate": 4.7107703056006706e-07, + "loss": 0.1036, + "step": 4847 + }, + { + "epoch": 3.46, + "grad_norm": 4.37306344284546, + "learning_rate": 4.6985311971565806e-07, + "loss": 0.0723, + "step": 4848 + }, + { + "epoch": 3.46, + "grad_norm": 5.177107325600691, + "learning_rate": 4.6863072247752664e-07, + "loss": 0.0664, + "step": 4849 + }, + { + "epoch": 3.46, + "grad_norm": 6.044564249735391, + "learning_rate": 4.67409839254096e-07, + "loss": 0.1136, + "step": 4850 + }, + { + "epoch": 3.46, + "grad_norm": 5.84423169120056, + "learning_rate": 4.66190470453286e-07, + "loss": 0.1257, + "step": 4851 + }, + { + "epoch": 3.46, + "grad_norm": 6.516029681667653, + "learning_rate": 4.6497261648251134e-07, + "loss": 0.095, + "step": 4852 + }, + { + "epoch": 3.46, + "grad_norm": 5.602034097877229, + "learning_rate": 4.6375627774867925e-07, + "loss": 0.1377, + "step": 4853 + }, + { + "epoch": 3.46, + "grad_norm": 8.561475016663382, + "learning_rate": 4.6254145465819134e-07, + "loss": 0.1226, + "step": 4854 + }, + { + "epoch": 3.47, + "grad_norm": 6.043790120678902, + "learning_rate": 4.6132814761694234e-07, + "loss": 0.0833, + "step": 4855 + }, + { + "epoch": 3.47, + "grad_norm": 5.934699069054085, + "learning_rate": 4.6011635703032075e-07, + "loss": 0.0867, + "step": 4856 + }, + { + "epoch": 3.47, + "grad_norm": 8.280932206636068, + "learning_rate": 4.589060833032083e-07, + "loss": 0.1135, + "step": 4857 + }, + { + "epoch": 3.47, + "grad_norm": 10.11043341804737, + "learning_rate": 4.5769732683997983e-07, + "loss": 0.1636, + "step": 4858 + }, + { + "epoch": 3.47, + "grad_norm": 4.578783922415493, + "learning_rate": 4.564900880445039e-07, + "loss": 0.1052, + "step": 4859 + }, + { + "epoch": 3.47, + "grad_norm": 6.231779936962509, + "learning_rate": 4.552843673201407e-07, + "loss": 0.105, + "step": 4860 + }, + { + "epoch": 3.47, + "grad_norm": 6.714870031308601, + "learning_rate": 4.540801650697446e-07, + "loss": 0.176, + "step": 4861 + }, + { + "epoch": 3.47, + "grad_norm": 5.82521739237652, + "learning_rate": 4.528774816956616e-07, + "loss": 0.0887, + "step": 4862 + }, + { + "epoch": 3.47, + "grad_norm": 7.575021134589715, + "learning_rate": 4.516763175997302e-07, + "loss": 0.1743, + "step": 4863 + }, + { + "epoch": 3.47, + "grad_norm": 7.637041020100914, + "learning_rate": 4.5047667318328215e-07, + "loss": 0.0961, + "step": 4864 + }, + { + "epoch": 3.47, + "grad_norm": 5.555943353432021, + "learning_rate": 4.492785488471413e-07, + "loss": 0.1068, + "step": 4865 + }, + { + "epoch": 3.47, + "grad_norm": 6.394841605960666, + "learning_rate": 4.480819449916224e-07, + "loss": 0.1062, + "step": 4866 + }, + { + "epoch": 3.47, + "grad_norm": 6.625874447124118, + "learning_rate": 4.468868620165334e-07, + "loss": 0.1735, + "step": 4867 + }, + { + "epoch": 3.47, + "grad_norm": 9.268464740985305, + "learning_rate": 4.4569330032117496e-07, + "loss": 0.1744, + "step": 4868 + }, + { + "epoch": 3.48, + "grad_norm": 7.161236160543748, + "learning_rate": 4.445012603043347e-07, + "loss": 0.158, + "step": 4869 + }, + { + "epoch": 3.48, + "grad_norm": 6.150201890120088, + "learning_rate": 4.4331074236430014e-07, + "loss": 0.1084, + "step": 4870 + }, + { + "epoch": 3.48, + "grad_norm": 5.604703088860564, + "learning_rate": 4.421217468988409e-07, + "loss": 0.1189, + "step": 4871 + }, + { + "epoch": 3.48, + "grad_norm": 6.470583792177313, + "learning_rate": 4.409342743052264e-07, + "loss": 0.1427, + "step": 4872 + }, + { + "epoch": 3.48, + "grad_norm": 6.157097944965909, + "learning_rate": 4.3974832498020983e-07, + "loss": 0.1149, + "step": 4873 + }, + { + "epoch": 3.48, + "grad_norm": 7.227392034283128, + "learning_rate": 4.385638993200425e-07, + "loss": 0.1059, + "step": 4874 + }, + { + "epoch": 3.48, + "grad_norm": 5.359135702963531, + "learning_rate": 4.3738099772045963e-07, + "loss": 0.0933, + "step": 4875 + }, + { + "epoch": 3.48, + "grad_norm": 5.540963344549303, + "learning_rate": 4.3619962057669216e-07, + "loss": 0.1465, + "step": 4876 + }, + { + "epoch": 3.48, + "grad_norm": 8.921949783265053, + "learning_rate": 4.350197682834606e-07, + "loss": 0.1624, + "step": 4877 + }, + { + "epoch": 3.48, + "grad_norm": 6.8268412987785485, + "learning_rate": 4.338414412349745e-07, + "loss": 0.1013, + "step": 4878 + }, + { + "epoch": 3.48, + "grad_norm": 7.046004451519703, + "learning_rate": 4.3266463982493566e-07, + "loss": 0.1239, + "step": 4879 + }, + { + "epoch": 3.48, + "grad_norm": 6.6020244320462504, + "learning_rate": 4.314893644465351e-07, + "loss": 0.1201, + "step": 4880 + }, + { + "epoch": 3.48, + "grad_norm": 5.97689046629764, + "learning_rate": 4.303156154924537e-07, + "loss": 0.1025, + "step": 4881 + }, + { + "epoch": 3.48, + "grad_norm": 16.85592779467006, + "learning_rate": 4.291433933548633e-07, + "loss": 0.1746, + "step": 4882 + }, + { + "epoch": 3.49, + "grad_norm": 20.986355936055386, + "learning_rate": 4.279726984254251e-07, + "loss": 0.2146, + "step": 4883 + }, + { + "epoch": 3.49, + "grad_norm": 8.537459210196602, + "learning_rate": 4.268035310952906e-07, + "loss": 0.1295, + "step": 4884 + }, + { + "epoch": 3.49, + "grad_norm": 3.993488754974167, + "learning_rate": 4.256358917550979e-07, + "loss": 0.0913, + "step": 4885 + }, + { + "epoch": 3.49, + "grad_norm": 5.377518655838321, + "learning_rate": 4.244697807949805e-07, + "loss": 0.0779, + "step": 4886 + }, + { + "epoch": 3.49, + "grad_norm": 6.679235808102704, + "learning_rate": 4.2330519860455446e-07, + "loss": 0.1, + "step": 4887 + }, + { + "epoch": 3.49, + "grad_norm": 6.926326752680834, + "learning_rate": 4.2214214557293133e-07, + "loss": 0.1694, + "step": 4888 + }, + { + "epoch": 3.49, + "grad_norm": 10.173919781648312, + "learning_rate": 4.209806220887053e-07, + "loss": 0.1553, + "step": 4889 + }, + { + "epoch": 3.49, + "grad_norm": 8.716790283479154, + "learning_rate": 4.1982062853996695e-07, + "loss": 0.1282, + "step": 4890 + }, + { + "epoch": 3.49, + "grad_norm": 5.741222580663796, + "learning_rate": 4.1866216531428806e-07, + "loss": 0.0996, + "step": 4891 + }, + { + "epoch": 3.49, + "grad_norm": 8.604221773949565, + "learning_rate": 4.1750523279873613e-07, + "loss": 0.119, + "step": 4892 + }, + { + "epoch": 3.49, + "grad_norm": 6.081160173256382, + "learning_rate": 4.1634983137986083e-07, + "loss": 0.1091, + "step": 4893 + }, + { + "epoch": 3.49, + "grad_norm": 7.791749150013185, + "learning_rate": 4.151959614437046e-07, + "loss": 0.1576, + "step": 4894 + }, + { + "epoch": 3.49, + "grad_norm": 7.986445225209414, + "learning_rate": 4.1404362337579716e-07, + "loss": 0.1707, + "step": 4895 + }, + { + "epoch": 3.49, + "grad_norm": 8.400952391001237, + "learning_rate": 4.128928175611546e-07, + "loss": 0.1184, + "step": 4896 + }, + { + "epoch": 3.5, + "grad_norm": 5.741392906608111, + "learning_rate": 4.1174354438428434e-07, + "loss": 0.1042, + "step": 4897 + }, + { + "epoch": 3.5, + "grad_norm": 7.079323754898526, + "learning_rate": 4.105958042291791e-07, + "loss": 0.1346, + "step": 4898 + }, + { + "epoch": 3.5, + "grad_norm": 8.395224511829195, + "learning_rate": 4.0944959747931945e-07, + "loss": 0.1548, + "step": 4899 + }, + { + "epoch": 3.5, + "grad_norm": 8.677897205722589, + "learning_rate": 4.0830492451767566e-07, + "loss": 0.1658, + "step": 4900 + }, + { + "epoch": 3.5, + "grad_norm": 5.826376865813371, + "learning_rate": 4.0716178572670405e-07, + "loss": 0.1022, + "step": 4901 + }, + { + "epoch": 3.5, + "grad_norm": 7.570457325496883, + "learning_rate": 4.060201814883474e-07, + "loss": 0.1015, + "step": 4902 + }, + { + "epoch": 3.5, + "grad_norm": 6.666909786473492, + "learning_rate": 4.0488011218403844e-07, + "loss": 0.1423, + "step": 4903 + }, + { + "epoch": 3.5, + "grad_norm": 10.844528241311131, + "learning_rate": 4.0374157819469406e-07, + "loss": 0.1428, + "step": 4904 + }, + { + "epoch": 3.5, + "grad_norm": 5.958351896197906, + "learning_rate": 4.0260457990072113e-07, + "loss": 0.106, + "step": 4905 + }, + { + "epoch": 3.5, + "grad_norm": 6.199859560189758, + "learning_rate": 4.014691176820107e-07, + "loss": 0.1133, + "step": 4906 + }, + { + "epoch": 3.5, + "grad_norm": 6.634470604062944, + "learning_rate": 4.003351919179421e-07, + "loss": 0.1403, + "step": 4907 + }, + { + "epoch": 3.5, + "grad_norm": 7.633691260784369, + "learning_rate": 3.9920280298738125e-07, + "loss": 0.2009, + "step": 4908 + }, + { + "epoch": 3.5, + "grad_norm": 5.181243123537604, + "learning_rate": 3.980719512686809e-07, + "loss": 0.1056, + "step": 4909 + }, + { + "epoch": 3.5, + "grad_norm": 5.055526650996727, + "learning_rate": 3.969426371396773e-07, + "loss": 0.0876, + "step": 4910 + }, + { + "epoch": 3.51, + "grad_norm": 9.38989387103326, + "learning_rate": 3.9581486097769905e-07, + "loss": 0.1478, + "step": 4911 + }, + { + "epoch": 3.51, + "grad_norm": 6.137927650461356, + "learning_rate": 3.946886231595526e-07, + "loss": 0.1445, + "step": 4912 + }, + { + "epoch": 3.51, + "grad_norm": 4.845052725437297, + "learning_rate": 3.935639240615396e-07, + "loss": 0.1101, + "step": 4913 + }, + { + "epoch": 3.51, + "grad_norm": 9.620807970425544, + "learning_rate": 3.924407640594391e-07, + "loss": 0.1301, + "step": 4914 + }, + { + "epoch": 3.51, + "grad_norm": 5.003315815491063, + "learning_rate": 3.913191435285224e-07, + "loss": 0.0854, + "step": 4915 + }, + { + "epoch": 3.51, + "grad_norm": 10.51854542922581, + "learning_rate": 3.9019906284354145e-07, + "loss": 0.1211, + "step": 4916 + }, + { + "epoch": 3.51, + "grad_norm": 11.27692744405923, + "learning_rate": 3.8908052237873863e-07, + "loss": 0.1339, + "step": 4917 + }, + { + "epoch": 3.51, + "grad_norm": 6.778719813958779, + "learning_rate": 3.879635225078371e-07, + "loss": 0.1556, + "step": 4918 + }, + { + "epoch": 3.51, + "grad_norm": 6.5589312805748445, + "learning_rate": 3.868480636040484e-07, + "loss": 0.094, + "step": 4919 + }, + { + "epoch": 3.51, + "grad_norm": 11.230659542738735, + "learning_rate": 3.857341460400665e-07, + "loss": 0.1584, + "step": 4920 + }, + { + "epoch": 3.51, + "grad_norm": 6.838176925618872, + "learning_rate": 3.846217701880739e-07, + "loss": 0.1112, + "step": 4921 + }, + { + "epoch": 3.51, + "grad_norm": 4.807557680870521, + "learning_rate": 3.835109364197348e-07, + "loss": 0.0952, + "step": 4922 + }, + { + "epoch": 3.51, + "grad_norm": 5.99047386603123, + "learning_rate": 3.8240164510620017e-07, + "loss": 0.0955, + "step": 4923 + }, + { + "epoch": 3.51, + "grad_norm": 5.921585109056007, + "learning_rate": 3.81293896618104e-07, + "loss": 0.1046, + "step": 4924 + }, + { + "epoch": 3.52, + "grad_norm": 4.71667528673715, + "learning_rate": 3.8018769132556644e-07, + "loss": 0.0726, + "step": 4925 + }, + { + "epoch": 3.52, + "grad_norm": 6.07125569493634, + "learning_rate": 3.790830295981912e-07, + "loss": 0.1105, + "step": 4926 + }, + { + "epoch": 3.52, + "grad_norm": 4.207268748077647, + "learning_rate": 3.7797991180506643e-07, + "loss": 0.0854, + "step": 4927 + }, + { + "epoch": 3.52, + "grad_norm": 6.1493317988743845, + "learning_rate": 3.768783383147623e-07, + "loss": 0.0932, + "step": 4928 + }, + { + "epoch": 3.52, + "grad_norm": 7.303017501676457, + "learning_rate": 3.757783094953382e-07, + "loss": 0.1523, + "step": 4929 + }, + { + "epoch": 3.52, + "grad_norm": 10.291370181214797, + "learning_rate": 3.746798257143314e-07, + "loss": 0.1628, + "step": 4930 + }, + { + "epoch": 3.52, + "grad_norm": 4.898999805060367, + "learning_rate": 3.735828873387681e-07, + "loss": 0.0839, + "step": 4931 + }, + { + "epoch": 3.52, + "grad_norm": 4.965168117910225, + "learning_rate": 3.724874947351531e-07, + "loss": 0.1173, + "step": 4932 + }, + { + "epoch": 3.52, + "grad_norm": 5.063017750866719, + "learning_rate": 3.7139364826948077e-07, + "loss": 0.0971, + "step": 4933 + }, + { + "epoch": 3.52, + "grad_norm": 5.753008355329342, + "learning_rate": 3.7030134830722207e-07, + "loss": 0.0903, + "step": 4934 + }, + { + "epoch": 3.52, + "grad_norm": 4.717973246272266, + "learning_rate": 3.692105952133379e-07, + "loss": 0.0975, + "step": 4935 + }, + { + "epoch": 3.52, + "grad_norm": 8.333430271233516, + "learning_rate": 3.681213893522667e-07, + "loss": 0.1337, + "step": 4936 + }, + { + "epoch": 3.52, + "grad_norm": 10.107222632655112, + "learning_rate": 3.670337310879335e-07, + "loss": 0.1675, + "step": 4937 + }, + { + "epoch": 3.52, + "grad_norm": 9.419377853290698, + "learning_rate": 3.6594762078374536e-07, + "loss": 0.1014, + "step": 4938 + }, + { + "epoch": 3.53, + "grad_norm": 12.43896448622122, + "learning_rate": 3.6486305880259085e-07, + "loss": 0.2437, + "step": 4939 + }, + { + "epoch": 3.53, + "grad_norm": 5.616562037480525, + "learning_rate": 3.6378004550684355e-07, + "loss": 0.0884, + "step": 4940 + }, + { + "epoch": 3.53, + "grad_norm": 6.338766725755504, + "learning_rate": 3.626985812583572e-07, + "loss": 0.1049, + "step": 4941 + }, + { + "epoch": 3.53, + "grad_norm": 5.202111188217427, + "learning_rate": 3.6161866641847007e-07, + "loss": 0.1033, + "step": 4942 + }, + { + "epoch": 3.53, + "grad_norm": 7.316758540310851, + "learning_rate": 3.6054030134800243e-07, + "loss": 0.1517, + "step": 4943 + }, + { + "epoch": 3.53, + "grad_norm": 7.275565880480988, + "learning_rate": 3.594634864072527e-07, + "loss": 0.1464, + "step": 4944 + }, + { + "epoch": 3.53, + "grad_norm": 10.317802181069961, + "learning_rate": 3.583882219560092e-07, + "loss": 0.2065, + "step": 4945 + }, + { + "epoch": 3.53, + "grad_norm": 8.125843837992509, + "learning_rate": 3.57314508353534e-07, + "loss": 0.1593, + "step": 4946 + }, + { + "epoch": 3.53, + "grad_norm": 5.59942806326855, + "learning_rate": 3.5624234595857787e-07, + "loss": 0.1151, + "step": 4947 + }, + { + "epoch": 3.53, + "grad_norm": 7.182502216677451, + "learning_rate": 3.551717351293676e-07, + "loss": 0.1285, + "step": 4948 + }, + { + "epoch": 3.53, + "grad_norm": 8.035563717929566, + "learning_rate": 3.541026762236166e-07, + "loss": 0.1669, + "step": 4949 + }, + { + "epoch": 3.53, + "grad_norm": 5.790990792314284, + "learning_rate": 3.5303516959851405e-07, + "loss": 0.1127, + "step": 4950 + }, + { + "epoch": 3.53, + "grad_norm": 4.881110923975365, + "learning_rate": 3.519692156107379e-07, + "loss": 0.1031, + "step": 4951 + }, + { + "epoch": 3.53, + "grad_norm": 9.586597575490307, + "learning_rate": 3.509048146164401e-07, + "loss": 0.1537, + "step": 4952 + }, + { + "epoch": 3.54, + "grad_norm": 5.858555298946315, + "learning_rate": 3.4984196697125827e-07, + "loss": 0.0942, + "step": 4953 + }, + { + "epoch": 3.54, + "grad_norm": 3.355412324174652, + "learning_rate": 3.4878067303030836e-07, + "loss": 0.0513, + "step": 4954 + }, + { + "epoch": 3.54, + "grad_norm": 9.169699928002089, + "learning_rate": 3.4772093314818957e-07, + "loss": 0.1544, + "step": 4955 + }, + { + "epoch": 3.54, + "grad_norm": 7.753478969791648, + "learning_rate": 3.4666274767897967e-07, + "loss": 0.1509, + "step": 4956 + }, + { + "epoch": 3.54, + "grad_norm": 7.529002063527844, + "learning_rate": 3.456061169762392e-07, + "loss": 0.1528, + "step": 4957 + }, + { + "epoch": 3.54, + "grad_norm": 6.793947603721516, + "learning_rate": 3.44551041393007e-07, + "loss": 0.1267, + "step": 4958 + }, + { + "epoch": 3.54, + "grad_norm": 7.569499505899291, + "learning_rate": 3.434975212818048e-07, + "loss": 0.1304, + "step": 4959 + }, + { + "epoch": 3.54, + "grad_norm": 6.899698816129731, + "learning_rate": 3.424455569946317e-07, + "loss": 0.1163, + "step": 4960 + }, + { + "epoch": 3.54, + "grad_norm": 7.341545137728287, + "learning_rate": 3.4139514888296975e-07, + "loss": 0.1301, + "step": 4961 + }, + { + "epoch": 3.54, + "grad_norm": 10.0706321470383, + "learning_rate": 3.403462972977789e-07, + "loss": 0.1243, + "step": 4962 + }, + { + "epoch": 3.54, + "grad_norm": 6.231489025292407, + "learning_rate": 3.392990025895004e-07, + "loss": 0.1277, + "step": 4963 + }, + { + "epoch": 3.54, + "grad_norm": 10.311986109419417, + "learning_rate": 3.3825326510805556e-07, + "loss": 0.1787, + "step": 4964 + }, + { + "epoch": 3.54, + "grad_norm": 7.1396184042409105, + "learning_rate": 3.372090852028437e-07, + "loss": 0.1366, + "step": 4965 + }, + { + "epoch": 3.54, + "grad_norm": 5.781959199435736, + "learning_rate": 3.361664632227446e-07, + "loss": 0.0825, + "step": 4966 + }, + { + "epoch": 3.55, + "grad_norm": 6.2875082133414395, + "learning_rate": 3.3512539951611856e-07, + "loss": 0.147, + "step": 4967 + }, + { + "epoch": 3.55, + "grad_norm": 5.914439645723257, + "learning_rate": 3.3408589443080395e-07, + "loss": 0.1083, + "step": 4968 + }, + { + "epoch": 3.55, + "grad_norm": 7.325977047729606, + "learning_rate": 3.3304794831411804e-07, + "loss": 0.1431, + "step": 4969 + }, + { + "epoch": 3.55, + "grad_norm": 8.27154445640713, + "learning_rate": 3.3201156151285994e-07, + "loss": 0.1476, + "step": 4970 + }, + { + "epoch": 3.55, + "grad_norm": 6.2873062920787355, + "learning_rate": 3.309767343733028e-07, + "loss": 0.0926, + "step": 4971 + }, + { + "epoch": 3.55, + "grad_norm": 3.7912487931444017, + "learning_rate": 3.299434672412044e-07, + "loss": 0.0831, + "step": 4972 + }, + { + "epoch": 3.55, + "grad_norm": 6.907191593542313, + "learning_rate": 3.2891176046179583e-07, + "loss": 0.1079, + "step": 4973 + }, + { + "epoch": 3.55, + "grad_norm": 7.634784204009729, + "learning_rate": 3.278816143797919e-07, + "loss": 0.1184, + "step": 4974 + }, + { + "epoch": 3.55, + "grad_norm": 8.738231719908597, + "learning_rate": 3.2685302933938177e-07, + "loss": 0.1627, + "step": 4975 + }, + { + "epoch": 3.55, + "grad_norm": 7.051208068222329, + "learning_rate": 3.2582600568423715e-07, + "loss": 0.1112, + "step": 4976 + }, + { + "epoch": 3.55, + "grad_norm": 5.633258071747942, + "learning_rate": 3.2480054375750305e-07, + "loss": 0.0843, + "step": 4977 + }, + { + "epoch": 3.55, + "grad_norm": 6.457146068070933, + "learning_rate": 3.237766439018064e-07, + "loss": 0.1165, + "step": 4978 + }, + { + "epoch": 3.55, + "grad_norm": 4.686262096507274, + "learning_rate": 3.227543064592514e-07, + "loss": 0.0764, + "step": 4979 + }, + { + "epoch": 3.55, + "grad_norm": 4.730731801506817, + "learning_rate": 3.2173353177142044e-07, + "loss": 0.0969, + "step": 4980 + }, + { + "epoch": 3.56, + "grad_norm": 4.936171738164148, + "learning_rate": 3.207143201793722e-07, + "loss": 0.0854, + "step": 4981 + }, + { + "epoch": 3.56, + "grad_norm": 5.8137612074069, + "learning_rate": 3.1969667202364496e-07, + "loss": 0.1029, + "step": 4982 + }, + { + "epoch": 3.56, + "grad_norm": 9.229282590022793, + "learning_rate": 3.1868058764425337e-07, + "loss": 0.1616, + "step": 4983 + }, + { + "epoch": 3.56, + "grad_norm": 7.105331418972408, + "learning_rate": 3.1766606738069084e-07, + "loss": 0.1034, + "step": 4984 + }, + { + "epoch": 3.56, + "grad_norm": 7.807312628779206, + "learning_rate": 3.166531115719268e-07, + "loss": 0.144, + "step": 4985 + }, + { + "epoch": 3.56, + "grad_norm": 7.869038868344821, + "learning_rate": 3.1564172055640994e-07, + "loss": 0.0964, + "step": 4986 + }, + { + "epoch": 3.56, + "grad_norm": 8.875331548761368, + "learning_rate": 3.1463189467206166e-07, + "loss": 0.1221, + "step": 4987 + }, + { + "epoch": 3.56, + "grad_norm": 4.938849961784851, + "learning_rate": 3.1362363425628763e-07, + "loss": 0.0954, + "step": 4988 + }, + { + "epoch": 3.56, + "grad_norm": 10.180385680278965, + "learning_rate": 3.1261693964596275e-07, + "loss": 0.1552, + "step": 4989 + }, + { + "epoch": 3.56, + "grad_norm": 6.039383569321269, + "learning_rate": 3.116118111774452e-07, + "loss": 0.0889, + "step": 4990 + }, + { + "epoch": 3.56, + "grad_norm": 5.068556225143177, + "learning_rate": 3.106082491865647e-07, + "loss": 0.093, + "step": 4991 + }, + { + "epoch": 3.56, + "grad_norm": 7.284668219658361, + "learning_rate": 3.0960625400863253e-07, + "loss": 0.1415, + "step": 4992 + }, + { + "epoch": 3.56, + "grad_norm": 14.454110851656004, + "learning_rate": 3.0860582597843137e-07, + "loss": 0.2096, + "step": 4993 + }, + { + "epoch": 3.56, + "grad_norm": 6.243067078626986, + "learning_rate": 3.0760696543022496e-07, + "loss": 0.0966, + "step": 4994 + }, + { + "epoch": 3.57, + "grad_norm": 5.398929947949957, + "learning_rate": 3.066096726977502e-07, + "loss": 0.0952, + "step": 4995 + }, + { + "epoch": 3.57, + "grad_norm": 6.239239492157707, + "learning_rate": 3.056139481142206e-07, + "loss": 0.101, + "step": 4996 + }, + { + "epoch": 3.57, + "grad_norm": 9.023044388173963, + "learning_rate": 3.0461979201232674e-07, + "loss": 0.1794, + "step": 4997 + }, + { + "epoch": 3.57, + "grad_norm": 7.570504247937259, + "learning_rate": 3.0362720472423503e-07, + "loss": 0.1161, + "step": 4998 + }, + { + "epoch": 3.57, + "grad_norm": 10.355018381146655, + "learning_rate": 3.026361865815869e-07, + "loss": 0.1808, + "step": 4999 + }, + { + "epoch": 3.57, + "grad_norm": 6.25080861497107, + "learning_rate": 3.016467379154997e-07, + "loss": 0.1207, + "step": 5000 + }, + { + "epoch": 3.57, + "eval_avg_AUC": 0.7939005198150356, + "eval_avg_Accuracy": 0.7027105437665783, + "eval_avg_Accuracy-right": 0.8833963740706926, + "eval_avg_Accuracy-wrong": 0.38765067091198546, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6581912384379904, + "eval_last_AUC": 0.8149474201895899, + "eval_last_Accuracy": 0.738395225464191, + "eval_last_Accuracy-right": 0.8196817529672623, + "eval_last_Accuracy-wrong": 0.5966568114623607, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6804592176400829, + "eval_max_AUC": 0.7778448578196614, + "eval_max_Accuracy": 0.6441893236074271, + "eval_max_Accuracy-right": 0.9792617712273379, + "eval_max_Accuracy-wrong": 0.05992722310666363, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6216123178884508, + "eval_min_AUC": 0.7975842894056173, + "eval_min_Accuracy": 0.7238478116710876, + "eval_min_Accuracy-right": 0.7295552367288379, + "eval_min_Accuracy-wrong": 0.7138958380714123, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6586734622814718, + "eval_prod_AUC": 0.8002513229863527, + "eval_prod_Accuracy": 0.7148955570291777, + "eval_prod_Accuracy-right": 0.6743185078909613, + "eval_prod_Accuracy-wrong": 0.7856493063452354, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.661249401851051, + "eval_runtime": 251.6637, + "eval_samples_per_second": 95.874, + "eval_steps_per_second": 2.996, + "eval_sum_AUC": 0.6658840128941205, + "eval_sum_Accuracy": 0.638967175066313, + "eval_sum_Accuracy-right": 0.9868918742663363, + "eval_sum_Accuracy-wrong": 0.032294746418012284, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6474363870292106, + "step": 5000 + }, + { + "epoch": 3.57, + "grad_norm": 8.039699710822118, + "learning_rate": 3.0065885905656733e-07, + "loss": 0.139, + "step": 5001 + }, + { + "epoch": 3.57, + "grad_norm": 3.561175020518558, + "learning_rate": 2.99672550334858e-07, + "loss": 0.0732, + "step": 5002 + }, + { + "epoch": 3.57, + "grad_norm": 5.489308809958234, + "learning_rate": 2.986878120799158e-07, + "loss": 0.1164, + "step": 5003 + }, + { + "epoch": 3.57, + "grad_norm": 7.287913363035142, + "learning_rate": 2.977046446207604e-07, + "loss": 0.0874, + "step": 5004 + }, + { + "epoch": 3.57, + "grad_norm": 7.221934125161167, + "learning_rate": 2.967230482858863e-07, + "loss": 0.1486, + "step": 5005 + }, + { + "epoch": 3.57, + "grad_norm": 8.007176575502351, + "learning_rate": 2.957430234032627e-07, + "loss": 0.1381, + "step": 5006 + }, + { + "epoch": 3.57, + "grad_norm": 9.070326449400396, + "learning_rate": 2.947645703003338e-07, + "loss": 0.1473, + "step": 5007 + }, + { + "epoch": 3.57, + "grad_norm": 10.242941718824545, + "learning_rate": 2.937876893040209e-07, + "loss": 0.1548, + "step": 5008 + }, + { + "epoch": 3.58, + "grad_norm": 7.051438057132722, + "learning_rate": 2.9281238074071463e-07, + "loss": 0.1425, + "step": 5009 + }, + { + "epoch": 3.58, + "grad_norm": 8.348048515530829, + "learning_rate": 2.9183864493628756e-07, + "loss": 0.1549, + "step": 5010 + }, + { + "epoch": 3.58, + "grad_norm": 9.348540506356116, + "learning_rate": 2.908664822160806e-07, + "loss": 0.1504, + "step": 5011 + }, + { + "epoch": 3.58, + "grad_norm": 6.608695626483558, + "learning_rate": 2.898958929049117e-07, + "loss": 0.0918, + "step": 5012 + }, + { + "epoch": 3.58, + "grad_norm": 7.522212380654753, + "learning_rate": 2.889268773270731e-07, + "loss": 0.166, + "step": 5013 + }, + { + "epoch": 3.58, + "grad_norm": 6.086064311100961, + "learning_rate": 2.879594358063303e-07, + "loss": 0.117, + "step": 5014 + }, + { + "epoch": 3.58, + "grad_norm": 6.2796926492958995, + "learning_rate": 2.869935686659248e-07, + "loss": 0.0981, + "step": 5015 + }, + { + "epoch": 3.58, + "grad_norm": 6.291532974607174, + "learning_rate": 2.8602927622856935e-07, + "loss": 0.1378, + "step": 5016 + }, + { + "epoch": 3.58, + "grad_norm": 5.3972265354997235, + "learning_rate": 2.8506655881645305e-07, + "loss": 0.0914, + "step": 5017 + }, + { + "epoch": 3.58, + "grad_norm": 7.768331165784399, + "learning_rate": 2.841054167512369e-07, + "loss": 0.1865, + "step": 5018 + }, + { + "epoch": 3.58, + "grad_norm": 7.9944003757927975, + "learning_rate": 2.8314585035405683e-07, + "loss": 0.1632, + "step": 5019 + }, + { + "epoch": 3.58, + "grad_norm": 6.256066146658917, + "learning_rate": 2.8218785994552136e-07, + "loss": 0.106, + "step": 5020 + }, + { + "epoch": 3.58, + "grad_norm": 14.01419943071183, + "learning_rate": 2.8123144584571326e-07, + "loss": 0.1823, + "step": 5021 + }, + { + "epoch": 3.58, + "grad_norm": 7.421782138203785, + "learning_rate": 2.8027660837418813e-07, + "loss": 0.1256, + "step": 5022 + }, + { + "epoch": 3.59, + "grad_norm": 6.145061257805026, + "learning_rate": 2.793233478499752e-07, + "loss": 0.1068, + "step": 5023 + }, + { + "epoch": 3.59, + "grad_norm": 7.773542257086725, + "learning_rate": 2.7837166459157625e-07, + "loss": 0.1125, + "step": 5024 + }, + { + "epoch": 3.59, + "grad_norm": 4.764175341048938, + "learning_rate": 2.77421558916966e-07, + "loss": 0.1198, + "step": 5025 + }, + { + "epoch": 3.59, + "grad_norm": 4.684493454442424, + "learning_rate": 2.764730311435931e-07, + "loss": 0.0885, + "step": 5026 + }, + { + "epoch": 3.59, + "grad_norm": 7.440949476639598, + "learning_rate": 2.755260815883781e-07, + "loss": 0.1259, + "step": 5027 + }, + { + "epoch": 3.59, + "grad_norm": 9.466050890106064, + "learning_rate": 2.745807105677145e-07, + "loss": 0.1307, + "step": 5028 + }, + { + "epoch": 3.59, + "grad_norm": 5.8345255587512765, + "learning_rate": 2.736369183974685e-07, + "loss": 0.0985, + "step": 5029 + }, + { + "epoch": 3.59, + "grad_norm": 7.466645169078422, + "learning_rate": 2.726947053929768e-07, + "loss": 0.1625, + "step": 5030 + }, + { + "epoch": 3.59, + "grad_norm": 13.413102283651124, + "learning_rate": 2.7175407186905367e-07, + "loss": 0.1526, + "step": 5031 + }, + { + "epoch": 3.59, + "grad_norm": 9.913384538686545, + "learning_rate": 2.708150181399788e-07, + "loss": 0.1227, + "step": 5032 + }, + { + "epoch": 3.59, + "grad_norm": 29.713012887802034, + "learning_rate": 2.698775445195101e-07, + "loss": 0.1667, + "step": 5033 + }, + { + "epoch": 3.59, + "grad_norm": 5.012468307125459, + "learning_rate": 2.689416513208726e-07, + "loss": 0.0981, + "step": 5034 + }, + { + "epoch": 3.59, + "grad_norm": 9.259889616367115, + "learning_rate": 2.6800733885676833e-07, + "loss": 0.1725, + "step": 5035 + }, + { + "epoch": 3.59, + "grad_norm": 8.444744690254245, + "learning_rate": 2.6707460743936653e-07, + "loss": 0.1523, + "step": 5036 + }, + { + "epoch": 3.6, + "grad_norm": 10.368928285325568, + "learning_rate": 2.6614345738031014e-07, + "loss": 0.1543, + "step": 5037 + }, + { + "epoch": 3.6, + "grad_norm": 7.8762410419001005, + "learning_rate": 2.6521388899071467e-07, + "loss": 0.1006, + "step": 5038 + }, + { + "epoch": 3.6, + "grad_norm": 7.032417525664705, + "learning_rate": 2.642859025811656e-07, + "loss": 0.1088, + "step": 5039 + }, + { + "epoch": 3.6, + "grad_norm": 8.871769638082888, + "learning_rate": 2.633594984617199e-07, + "loss": 0.1471, + "step": 5040 + }, + { + "epoch": 3.6, + "grad_norm": 6.930037290213703, + "learning_rate": 2.624346769419078e-07, + "loss": 0.1376, + "step": 5041 + }, + { + "epoch": 3.6, + "grad_norm": 7.1968598150909955, + "learning_rate": 2.6151143833072824e-07, + "loss": 0.1517, + "step": 5042 + }, + { + "epoch": 3.6, + "grad_norm": 5.261682987100089, + "learning_rate": 2.605897829366527e-07, + "loss": 0.0737, + "step": 5043 + }, + { + "epoch": 3.6, + "grad_norm": 6.097460825799445, + "learning_rate": 2.596697110676233e-07, + "loss": 0.0956, + "step": 5044 + }, + { + "epoch": 3.6, + "grad_norm": 8.016509501397028, + "learning_rate": 2.5875122303105403e-07, + "loss": 0.1566, + "step": 5045 + }, + { + "epoch": 3.6, + "grad_norm": 6.711365280336349, + "learning_rate": 2.5783431913382673e-07, + "loss": 0.1152, + "step": 5046 + }, + { + "epoch": 3.6, + "grad_norm": 5.2815052281868216, + "learning_rate": 2.5691899968229904e-07, + "loss": 0.1055, + "step": 5047 + }, + { + "epoch": 3.6, + "grad_norm": 4.197556171742845, + "learning_rate": 2.560052649822925e-07, + "loss": 0.0975, + "step": 5048 + }, + { + "epoch": 3.6, + "grad_norm": 12.246007633872518, + "learning_rate": 2.5509311533910674e-07, + "loss": 0.1521, + "step": 5049 + }, + { + "epoch": 3.6, + "grad_norm": 7.104104832857858, + "learning_rate": 2.5418255105750465e-07, + "loss": 0.1245, + "step": 5050 + }, + { + "epoch": 3.61, + "grad_norm": 7.889736914067923, + "learning_rate": 2.532735724417251e-07, + "loss": 0.1339, + "step": 5051 + }, + { + "epoch": 3.61, + "grad_norm": 9.149482943543847, + "learning_rate": 2.52366179795473e-07, + "loss": 0.1666, + "step": 5052 + }, + { + "epoch": 3.61, + "grad_norm": 11.841217531696419, + "learning_rate": 2.5146037342192673e-07, + "loss": 0.2165, + "step": 5053 + }, + { + "epoch": 3.61, + "grad_norm": 8.367402162775445, + "learning_rate": 2.505561536237311e-07, + "loss": 0.158, + "step": 5054 + }, + { + "epoch": 3.61, + "grad_norm": 4.577698639400431, + "learning_rate": 2.496535207030043e-07, + "loss": 0.0587, + "step": 5055 + }, + { + "epoch": 3.61, + "grad_norm": 9.757830561055071, + "learning_rate": 2.4875247496133234e-07, + "loss": 0.1003, + "step": 5056 + }, + { + "epoch": 3.61, + "grad_norm": 10.30190449595356, + "learning_rate": 2.4785301669977116e-07, + "loss": 0.1698, + "step": 5057 + }, + { + "epoch": 3.61, + "grad_norm": 7.289239425103564, + "learning_rate": 2.469551462188463e-07, + "loss": 0.1116, + "step": 5058 + }, + { + "epoch": 3.61, + "grad_norm": 6.916398851164515, + "learning_rate": 2.460588638185535e-07, + "loss": 0.1403, + "step": 5059 + }, + { + "epoch": 3.61, + "grad_norm": 8.156390119211379, + "learning_rate": 2.45164169798357e-07, + "loss": 0.1232, + "step": 5060 + }, + { + "epoch": 3.61, + "grad_norm": 7.444029203628988, + "learning_rate": 2.4427106445719053e-07, + "loss": 0.1094, + "step": 5061 + }, + { + "epoch": 3.61, + "grad_norm": 7.602904335530537, + "learning_rate": 2.4337954809345807e-07, + "loss": 0.1433, + "step": 5062 + }, + { + "epoch": 3.61, + "grad_norm": 5.715241288214859, + "learning_rate": 2.4248962100503095e-07, + "loss": 0.0869, + "step": 5063 + }, + { + "epoch": 3.61, + "grad_norm": 4.962640828966288, + "learning_rate": 2.416012834892506e-07, + "loss": 0.1055, + "step": 5064 + }, + { + "epoch": 3.62, + "grad_norm": 6.660676204308661, + "learning_rate": 2.4071453584292693e-07, + "loss": 0.1177, + "step": 5065 + }, + { + "epoch": 3.62, + "grad_norm": 7.962611371474207, + "learning_rate": 2.3982937836233954e-07, + "loss": 0.1488, + "step": 5066 + }, + { + "epoch": 3.62, + "grad_norm": 7.174907994976234, + "learning_rate": 2.389458113432347e-07, + "loss": 0.1106, + "step": 5067 + }, + { + "epoch": 3.62, + "grad_norm": 5.848634135676804, + "learning_rate": 2.380638350808301e-07, + "loss": 0.1166, + "step": 5068 + }, + { + "epoch": 3.62, + "grad_norm": 4.030302418548135, + "learning_rate": 2.371834498698089e-07, + "loss": 0.0978, + "step": 5069 + }, + { + "epoch": 3.62, + "grad_norm": 6.902607563471164, + "learning_rate": 2.363046560043264e-07, + "loss": 0.1177, + "step": 5070 + }, + { + "epoch": 3.62, + "grad_norm": 4.432084198181119, + "learning_rate": 2.3542745377800046e-07, + "loss": 0.0757, + "step": 5071 + }, + { + "epoch": 3.62, + "grad_norm": 4.034887652754706, + "learning_rate": 2.3455184348392446e-07, + "loss": 0.0648, + "step": 5072 + }, + { + "epoch": 3.62, + "grad_norm": 6.045491895151377, + "learning_rate": 2.3367782541465268e-07, + "loss": 0.1531, + "step": 5073 + }, + { + "epoch": 3.62, + "grad_norm": 10.244042229911464, + "learning_rate": 2.3280539986221317e-07, + "loss": 0.1196, + "step": 5074 + }, + { + "epoch": 3.62, + "grad_norm": 4.67069532901348, + "learning_rate": 2.3193456711809837e-07, + "loss": 0.1213, + "step": 5075 + }, + { + "epoch": 3.62, + "grad_norm": 5.294449192222114, + "learning_rate": 2.3106532747327104e-07, + "loss": 0.0883, + "step": 5076 + }, + { + "epoch": 3.62, + "grad_norm": 9.311678585887437, + "learning_rate": 2.3019768121815777e-07, + "loss": 0.1554, + "step": 5077 + }, + { + "epoch": 3.62, + "grad_norm": 20.638914783602576, + "learning_rate": 2.2933162864265836e-07, + "loss": 0.1319, + "step": 5078 + }, + { + "epoch": 3.63, + "grad_norm": 9.5037044607945, + "learning_rate": 2.2846717003613462e-07, + "loss": 0.1365, + "step": 5079 + }, + { + "epoch": 3.63, + "grad_norm": 5.275015466909017, + "learning_rate": 2.2760430568741943e-07, + "loss": 0.0837, + "step": 5080 + }, + { + "epoch": 3.63, + "grad_norm": 8.731705867976217, + "learning_rate": 2.2674303588481162e-07, + "loss": 0.1273, + "step": 5081 + }, + { + "epoch": 3.63, + "grad_norm": 9.145995314417187, + "learning_rate": 2.258833609160771e-07, + "loss": 0.1223, + "step": 5082 + }, + { + "epoch": 3.63, + "grad_norm": 6.35548689524532, + "learning_rate": 2.2502528106845e-07, + "loss": 0.1062, + "step": 5083 + }, + { + "epoch": 3.63, + "grad_norm": 8.025136625142972, + "learning_rate": 2.241687966286299e-07, + "loss": 0.1373, + "step": 5084 + }, + { + "epoch": 3.63, + "grad_norm": 4.627138376379124, + "learning_rate": 2.233139078827845e-07, + "loss": 0.0949, + "step": 5085 + }, + { + "epoch": 3.63, + "grad_norm": 5.550861440722993, + "learning_rate": 2.2246061511654816e-07, + "loss": 0.0856, + "step": 5086 + }, + { + "epoch": 3.63, + "grad_norm": 7.0832967651621646, + "learning_rate": 2.2160891861502165e-07, + "loss": 0.1456, + "step": 5087 + }, + { + "epoch": 3.63, + "grad_norm": 5.494887892207766, + "learning_rate": 2.2075881866277348e-07, + "loss": 0.1138, + "step": 5088 + }, + { + "epoch": 3.63, + "grad_norm": 5.653845807621733, + "learning_rate": 2.199103155438359e-07, + "loss": 0.149, + "step": 5089 + }, + { + "epoch": 3.63, + "grad_norm": 7.533091223076104, + "learning_rate": 2.1906340954171212e-07, + "loss": 0.1851, + "step": 5090 + }, + { + "epoch": 3.63, + "grad_norm": 6.574364381103182, + "learning_rate": 2.1821810093936636e-07, + "loss": 0.1305, + "step": 5091 + }, + { + "epoch": 3.63, + "grad_norm": 4.576468541271489, + "learning_rate": 2.1737439001923488e-07, + "loss": 0.0978, + "step": 5092 + }, + { + "epoch": 3.64, + "grad_norm": 10.54344185381927, + "learning_rate": 2.1653227706321388e-07, + "loss": 0.2559, + "step": 5093 + }, + { + "epoch": 3.64, + "grad_norm": 5.76013310329922, + "learning_rate": 2.156917623526722e-07, + "loss": 0.1378, + "step": 5094 + }, + { + "epoch": 3.64, + "grad_norm": 6.096648775601011, + "learning_rate": 2.1485284616843904e-07, + "loss": 0.0791, + "step": 5095 + }, + { + "epoch": 3.64, + "grad_norm": 7.449715752312049, + "learning_rate": 2.140155287908141e-07, + "loss": 0.1338, + "step": 5096 + }, + { + "epoch": 3.64, + "grad_norm": 6.19787476290784, + "learning_rate": 2.131798104995586e-07, + "loss": 0.1239, + "step": 5097 + }, + { + "epoch": 3.64, + "grad_norm": 5.365081910281841, + "learning_rate": 2.123456915739025e-07, + "loss": 0.0817, + "step": 5098 + }, + { + "epoch": 3.64, + "grad_norm": 5.7839621749109735, + "learning_rate": 2.115131722925401e-07, + "loss": 0.1301, + "step": 5099 + }, + { + "epoch": 3.64, + "grad_norm": 7.328264787341496, + "learning_rate": 2.1068225293363166e-07, + "loss": 0.1644, + "step": 5100 + }, + { + "epoch": 3.64, + "grad_norm": 5.248602303859766, + "learning_rate": 2.0985293377480342e-07, + "loss": 0.0967, + "step": 5101 + }, + { + "epoch": 3.64, + "grad_norm": 6.184726044484055, + "learning_rate": 2.0902521509314543e-07, + "loss": 0.1346, + "step": 5102 + }, + { + "epoch": 3.64, + "grad_norm": 6.470435895771347, + "learning_rate": 2.0819909716521426e-07, + "loss": 0.1361, + "step": 5103 + }, + { + "epoch": 3.64, + "grad_norm": 6.351606542505066, + "learning_rate": 2.0737458026703182e-07, + "loss": 0.113, + "step": 5104 + }, + { + "epoch": 3.64, + "grad_norm": 6.701711181172379, + "learning_rate": 2.0655166467408283e-07, + "loss": 0.1284, + "step": 5105 + }, + { + "epoch": 3.64, + "grad_norm": 6.303422896520405, + "learning_rate": 2.057303506613212e-07, + "loss": 0.1362, + "step": 5106 + }, + { + "epoch": 3.65, + "grad_norm": 6.235708485944981, + "learning_rate": 2.049106385031602e-07, + "loss": 0.131, + "step": 5107 + }, + { + "epoch": 3.65, + "grad_norm": 8.200959008051413, + "learning_rate": 2.0409252847348404e-07, + "loss": 0.1003, + "step": 5108 + }, + { + "epoch": 3.65, + "grad_norm": 6.76348332083974, + "learning_rate": 2.032760208456358e-07, + "loss": 0.0938, + "step": 5109 + }, + { + "epoch": 3.65, + "grad_norm": 10.725542818280745, + "learning_rate": 2.0246111589242835e-07, + "loss": 0.1349, + "step": 5110 + }, + { + "epoch": 3.65, + "grad_norm": 6.858265680340651, + "learning_rate": 2.0164781388613386e-07, + "loss": 0.1703, + "step": 5111 + }, + { + "epoch": 3.65, + "grad_norm": 7.221302184416432, + "learning_rate": 2.0083611509849443e-07, + "loss": 0.0912, + "step": 5112 + }, + { + "epoch": 3.65, + "grad_norm": 6.110379248791678, + "learning_rate": 2.0002601980071145e-07, + "loss": 0.1471, + "step": 5113 + }, + { + "epoch": 3.65, + "grad_norm": 9.264511652438255, + "learning_rate": 1.9921752826345397e-07, + "loss": 0.1936, + "step": 5114 + }, + { + "epoch": 3.65, + "grad_norm": 9.3500365337071, + "learning_rate": 1.9841064075685367e-07, + "loss": 0.1804, + "step": 5115 + }, + { + "epoch": 3.65, + "grad_norm": 7.0264249501411795, + "learning_rate": 1.9760535755050715e-07, + "loss": 0.0991, + "step": 5116 + }, + { + "epoch": 3.65, + "grad_norm": 4.367028898943652, + "learning_rate": 1.9680167891347356e-07, + "loss": 0.0659, + "step": 5117 + }, + { + "epoch": 3.65, + "grad_norm": 6.694350207237627, + "learning_rate": 1.9599960511427761e-07, + "loss": 0.1215, + "step": 5118 + }, + { + "epoch": 3.65, + "grad_norm": 5.01614665495544, + "learning_rate": 1.9519913642090715e-07, + "loss": 0.0903, + "step": 5119 + }, + { + "epoch": 3.65, + "grad_norm": 7.6950343847045115, + "learning_rate": 1.9440027310081323e-07, + "loss": 0.1492, + "step": 5120 + }, + { + "epoch": 3.66, + "grad_norm": 13.035091459384203, + "learning_rate": 1.9360301542091065e-07, + "loss": 0.1222, + "step": 5121 + }, + { + "epoch": 3.66, + "grad_norm": 8.006794034185171, + "learning_rate": 1.9280736364757912e-07, + "loss": 0.1417, + "step": 5122 + }, + { + "epoch": 3.66, + "grad_norm": 3.606990746506003, + "learning_rate": 1.9201331804665934e-07, + "loss": 0.0756, + "step": 5123 + }, + { + "epoch": 3.66, + "grad_norm": 6.174082855551437, + "learning_rate": 1.9122087888345798e-07, + "loss": 0.1083, + "step": 5124 + }, + { + "epoch": 3.66, + "grad_norm": 6.662992780244101, + "learning_rate": 1.9043004642274266e-07, + "loss": 0.1226, + "step": 5125 + }, + { + "epoch": 3.66, + "grad_norm": 7.587789426133568, + "learning_rate": 1.896408209287459e-07, + "loss": 0.14, + "step": 5126 + }, + { + "epoch": 3.66, + "grad_norm": 11.678374790231842, + "learning_rate": 1.888532026651624e-07, + "loss": 0.1396, + "step": 5127 + }, + { + "epoch": 3.66, + "grad_norm": 3.977244116302907, + "learning_rate": 1.880671918951499e-07, + "loss": 0.0615, + "step": 5128 + }, + { + "epoch": 3.66, + "grad_norm": 6.1018897296285175, + "learning_rate": 1.8728278888132944e-07, + "loss": 0.1375, + "step": 5129 + }, + { + "epoch": 3.66, + "grad_norm": 7.569430217114574, + "learning_rate": 1.864999938857842e-07, + "loss": 0.1296, + "step": 5130 + }, + { + "epoch": 3.66, + "grad_norm": 6.084231927095735, + "learning_rate": 1.8571880717006218e-07, + "loss": 0.0961, + "step": 5131 + }, + { + "epoch": 3.66, + "grad_norm": 7.006697709775796, + "learning_rate": 1.8493922899516902e-07, + "loss": 0.1226, + "step": 5132 + }, + { + "epoch": 3.66, + "grad_norm": 7.603508544129782, + "learning_rate": 1.8416125962157971e-07, + "loss": 0.1912, + "step": 5133 + }, + { + "epoch": 3.66, + "grad_norm": 6.0061495778600555, + "learning_rate": 1.8338489930922632e-07, + "loss": 0.1002, + "step": 5134 + }, + { + "epoch": 3.67, + "grad_norm": 6.034224352944826, + "learning_rate": 1.8261014831750633e-07, + "loss": 0.105, + "step": 5135 + }, + { + "epoch": 3.67, + "grad_norm": 6.209079940714725, + "learning_rate": 1.8183700690527717e-07, + "loss": 0.0973, + "step": 5136 + }, + { + "epoch": 3.67, + "grad_norm": 7.222408389143445, + "learning_rate": 1.810654753308616e-07, + "loss": 0.1681, + "step": 5137 + }, + { + "epoch": 3.67, + "grad_norm": 8.624917976457603, + "learning_rate": 1.8029555385204067e-07, + "loss": 0.1587, + "step": 5138 + }, + { + "epoch": 3.67, + "grad_norm": 6.111127472026909, + "learning_rate": 1.795272427260608e-07, + "loss": 0.0766, + "step": 5139 + }, + { + "epoch": 3.67, + "grad_norm": 5.316408256721683, + "learning_rate": 1.7876054220962835e-07, + "loss": 0.0811, + "step": 5140 + }, + { + "epoch": 3.67, + "grad_norm": 6.678820447902406, + "learning_rate": 1.779954525589128e-07, + "loss": 0.165, + "step": 5141 + }, + { + "epoch": 3.67, + "grad_norm": 3.925332943931374, + "learning_rate": 1.7723197402954419e-07, + "loss": 0.0615, + "step": 5142 + }, + { + "epoch": 3.67, + "grad_norm": 8.13593332874638, + "learning_rate": 1.7647010687661558e-07, + "loss": 0.101, + "step": 5143 + }, + { + "epoch": 3.67, + "grad_norm": 5.418752250867253, + "learning_rate": 1.757098513546801e-07, + "loss": 0.0764, + "step": 5144 + }, + { + "epoch": 3.67, + "grad_norm": 13.33740238830133, + "learning_rate": 1.74951207717754e-07, + "loss": 0.1919, + "step": 5145 + }, + { + "epoch": 3.67, + "grad_norm": 5.192594742264245, + "learning_rate": 1.7419417621931388e-07, + "loss": 0.0905, + "step": 5146 + }, + { + "epoch": 3.67, + "grad_norm": 6.938927349266944, + "learning_rate": 1.7343875711229864e-07, + "loss": 0.1296, + "step": 5147 + }, + { + "epoch": 3.67, + "grad_norm": 5.150904395129445, + "learning_rate": 1.7268495064910574e-07, + "loss": 0.0952, + "step": 5148 + }, + { + "epoch": 3.68, + "grad_norm": 6.645643287522623, + "learning_rate": 1.719327570815993e-07, + "loss": 0.123, + "step": 5149 + }, + { + "epoch": 3.68, + "grad_norm": 8.033024279828727, + "learning_rate": 1.711821766610977e-07, + "loss": 0.1221, + "step": 5150 + }, + { + "epoch": 3.68, + "grad_norm": 7.151014646433906, + "learning_rate": 1.704332096383865e-07, + "loss": 0.144, + "step": 5151 + }, + { + "epoch": 3.68, + "grad_norm": 6.751515538072794, + "learning_rate": 1.696858562637077e-07, + "loss": 0.1128, + "step": 5152 + }, + { + "epoch": 3.68, + "grad_norm": 7.3260407768072815, + "learning_rate": 1.689401167867677e-07, + "loss": 0.137, + "step": 5153 + }, + { + "epoch": 3.68, + "grad_norm": 6.967392348291816, + "learning_rate": 1.6819599145672993e-07, + "loss": 0.1198, + "step": 5154 + }, + { + "epoch": 3.68, + "grad_norm": 9.119755770264783, + "learning_rate": 1.674534805222222e-07, + "loss": 0.1946, + "step": 5155 + }, + { + "epoch": 3.68, + "grad_norm": 5.9494333508876265, + "learning_rate": 1.667125842313305e-07, + "loss": 0.0823, + "step": 5156 + }, + { + "epoch": 3.68, + "grad_norm": 3.699974499969946, + "learning_rate": 1.6597330283160184e-07, + "loss": 0.0596, + "step": 5157 + }, + { + "epoch": 3.68, + "grad_norm": 9.275525343827553, + "learning_rate": 1.6523563657004416e-07, + "loss": 0.1776, + "step": 5158 + }, + { + "epoch": 3.68, + "grad_norm": 7.8398354915929325, + "learning_rate": 1.644995856931253e-07, + "loss": 0.1151, + "step": 5159 + }, + { + "epoch": 3.68, + "grad_norm": 7.010095295632242, + "learning_rate": 1.6376515044677354e-07, + "loss": 0.1351, + "step": 5160 + }, + { + "epoch": 3.68, + "grad_norm": 10.561996804977277, + "learning_rate": 1.630323310763776e-07, + "loss": 0.14, + "step": 5161 + }, + { + "epoch": 3.68, + "grad_norm": 6.3885597894295945, + "learning_rate": 1.6230112782678608e-07, + "loss": 0.1088, + "step": 5162 + }, + { + "epoch": 3.69, + "grad_norm": 7.464820972006126, + "learning_rate": 1.6157154094230744e-07, + "loss": 0.1079, + "step": 5163 + }, + { + "epoch": 3.69, + "grad_norm": 8.489218484166322, + "learning_rate": 1.6084357066670997e-07, + "loss": 0.1548, + "step": 5164 + }, + { + "epoch": 3.69, + "grad_norm": 7.166544166806824, + "learning_rate": 1.601172172432225e-07, + "loss": 0.0991, + "step": 5165 + }, + { + "epoch": 3.69, + "grad_norm": 6.044756449618914, + "learning_rate": 1.5939248091453252e-07, + "loss": 0.1147, + "step": 5166 + }, + { + "epoch": 3.69, + "grad_norm": 7.407020648139834, + "learning_rate": 1.5866936192278915e-07, + "loss": 0.1702, + "step": 5167 + }, + { + "epoch": 3.69, + "grad_norm": 6.543503830650654, + "learning_rate": 1.5794786050959797e-07, + "loss": 0.1196, + "step": 5168 + }, + { + "epoch": 3.69, + "grad_norm": 8.94521426336034, + "learning_rate": 1.5722797691602842e-07, + "loss": 0.1217, + "step": 5169 + }, + { + "epoch": 3.69, + "grad_norm": 5.678678949600126, + "learning_rate": 1.5650971138260473e-07, + "loss": 0.1138, + "step": 5170 + }, + { + "epoch": 3.69, + "grad_norm": 6.541004043762944, + "learning_rate": 1.5579306414931493e-07, + "loss": 0.1263, + "step": 5171 + }, + { + "epoch": 3.69, + "grad_norm": 8.703199754397636, + "learning_rate": 1.5507803545560195e-07, + "loss": 0.1287, + "step": 5172 + }, + { + "epoch": 3.69, + "grad_norm": 6.079487966585593, + "learning_rate": 1.543646255403719e-07, + "loss": 0.1176, + "step": 5173 + }, + { + "epoch": 3.69, + "grad_norm": 8.247351544243093, + "learning_rate": 1.5365283464198743e-07, + "loss": 0.1897, + "step": 5174 + }, + { + "epoch": 3.69, + "grad_norm": 9.843355071661746, + "learning_rate": 1.529426629982711e-07, + "loss": 0.1487, + "step": 5175 + }, + { + "epoch": 3.69, + "grad_norm": 7.88124033472976, + "learning_rate": 1.5223411084650476e-07, + "loss": 0.1071, + "step": 5176 + }, + { + "epoch": 3.7, + "grad_norm": 8.117452681694125, + "learning_rate": 1.5152717842342845e-07, + "loss": 0.1221, + "step": 5177 + }, + { + "epoch": 3.7, + "grad_norm": 6.399799739093315, + "learning_rate": 1.5082186596524218e-07, + "loss": 0.124, + "step": 5178 + }, + { + "epoch": 3.7, + "grad_norm": 8.196187923853193, + "learning_rate": 1.501181737076035e-07, + "loss": 0.1299, + "step": 5179 + }, + { + "epoch": 3.7, + "grad_norm": 8.07108956974622, + "learning_rate": 1.4941610188562884e-07, + "loss": 0.1139, + "step": 5180 + }, + { + "epoch": 3.7, + "grad_norm": 9.192620314889446, + "learning_rate": 1.4871565073389382e-07, + "loss": 0.1345, + "step": 5181 + }, + { + "epoch": 3.7, + "grad_norm": 7.048174028646196, + "learning_rate": 1.4801682048643183e-07, + "loss": 0.1466, + "step": 5182 + }, + { + "epoch": 3.7, + "grad_norm": 6.816161354114099, + "learning_rate": 1.4731961137673555e-07, + "loss": 0.1024, + "step": 5183 + }, + { + "epoch": 3.7, + "grad_norm": 16.351201920865254, + "learning_rate": 1.466240236377553e-07, + "loss": 0.1097, + "step": 5184 + }, + { + "epoch": 3.7, + "grad_norm": 7.299762715915274, + "learning_rate": 1.4593005750189958e-07, + "loss": 0.1072, + "step": 5185 + }, + { + "epoch": 3.7, + "grad_norm": 5.167377084530258, + "learning_rate": 1.4523771320103574e-07, + "loss": 0.0734, + "step": 5186 + }, + { + "epoch": 3.7, + "grad_norm": 6.275327813941812, + "learning_rate": 1.4454699096648873e-07, + "loss": 0.1506, + "step": 5187 + }, + { + "epoch": 3.7, + "grad_norm": 5.996799451568316, + "learning_rate": 1.4385789102904168e-07, + "loss": 0.111, + "step": 5188 + }, + { + "epoch": 3.7, + "grad_norm": 7.7803406635382215, + "learning_rate": 1.4317041361893546e-07, + "loss": 0.1682, + "step": 5189 + }, + { + "epoch": 3.7, + "grad_norm": 5.665189120267458, + "learning_rate": 1.4248455896587022e-07, + "loss": 0.0935, + "step": 5190 + }, + { + "epoch": 3.71, + "grad_norm": 8.005633628165455, + "learning_rate": 1.418003272990004e-07, + "loss": 0.1013, + "step": 5191 + }, + { + "epoch": 3.71, + "grad_norm": 6.9506037573191515, + "learning_rate": 1.4111771884694315e-07, + "loss": 0.1016, + "step": 5192 + }, + { + "epoch": 3.71, + "grad_norm": 5.8278679045020185, + "learning_rate": 1.4043673383776825e-07, + "loss": 0.0898, + "step": 5193 + }, + { + "epoch": 3.71, + "grad_norm": 7.641464266293792, + "learning_rate": 1.3975737249900812e-07, + "loss": 0.1395, + "step": 5194 + }, + { + "epoch": 3.71, + "grad_norm": 10.172133880539453, + "learning_rate": 1.3907963505764731e-07, + "loss": 0.1418, + "step": 5195 + }, + { + "epoch": 3.71, + "grad_norm": 8.166463290616882, + "learning_rate": 1.384035217401325e-07, + "loss": 0.1249, + "step": 5196 + }, + { + "epoch": 3.71, + "grad_norm": 7.385868049756229, + "learning_rate": 1.3772903277236404e-07, + "loss": 0.1636, + "step": 5197 + }, + { + "epoch": 3.71, + "grad_norm": 7.952994717184201, + "learning_rate": 1.370561683797028e-07, + "loss": 0.1356, + "step": 5198 + }, + { + "epoch": 3.71, + "grad_norm": 10.124774749021338, + "learning_rate": 1.363849287869645e-07, + "loss": 0.1481, + "step": 5199 + }, + { + "epoch": 3.71, + "grad_norm": 6.320488348773809, + "learning_rate": 1.3571531421842256e-07, + "loss": 0.106, + "step": 5200 + }, + { + "epoch": 3.71, + "grad_norm": 6.804058988616448, + "learning_rate": 1.3504732489780849e-07, + "loss": 0.114, + "step": 5201 + }, + { + "epoch": 3.71, + "grad_norm": 7.514633914232519, + "learning_rate": 1.3438096104830879e-07, + "loss": 0.1368, + "step": 5202 + }, + { + "epoch": 3.71, + "grad_norm": 5.702232298090348, + "learning_rate": 1.3371622289256869e-07, + "loss": 0.1162, + "step": 5203 + }, + { + "epoch": 3.71, + "grad_norm": 4.859181618551624, + "learning_rate": 1.3305311065269e-07, + "loss": 0.0895, + "step": 5204 + }, + { + "epoch": 3.72, + "grad_norm": 10.143736353282302, + "learning_rate": 1.323916245502299e-07, + "loss": 0.1407, + "step": 5205 + }, + { + "epoch": 3.72, + "grad_norm": 6.899522459445874, + "learning_rate": 1.3173176480620442e-07, + "loss": 0.1295, + "step": 5206 + }, + { + "epoch": 3.72, + "grad_norm": 5.406089544140982, + "learning_rate": 1.3107353164108273e-07, + "loss": 0.094, + "step": 5207 + }, + { + "epoch": 3.72, + "grad_norm": 6.286668556628162, + "learning_rate": 1.3041692527479556e-07, + "loss": 0.125, + "step": 5208 + }, + { + "epoch": 3.72, + "grad_norm": 6.2593888521125, + "learning_rate": 1.2976194592672465e-07, + "loss": 0.1161, + "step": 5209 + }, + { + "epoch": 3.72, + "grad_norm": 10.413500581443682, + "learning_rate": 1.2910859381571327e-07, + "loss": 0.1522, + "step": 5210 + }, + { + "epoch": 3.72, + "grad_norm": 7.838444174981346, + "learning_rate": 1.284568691600563e-07, + "loss": 0.1145, + "step": 5211 + }, + { + "epoch": 3.72, + "grad_norm": 5.995371120181053, + "learning_rate": 1.2780677217750949e-07, + "loss": 0.1064, + "step": 5212 + }, + { + "epoch": 3.72, + "grad_norm": 7.572479545934153, + "learning_rate": 1.271583030852791e-07, + "loss": 0.1183, + "step": 5213 + }, + { + "epoch": 3.72, + "grad_norm": 6.113754497506733, + "learning_rate": 1.2651146210003406e-07, + "loss": 0.1029, + "step": 5214 + }, + { + "epoch": 3.72, + "grad_norm": 5.198270804458642, + "learning_rate": 1.2586624943789372e-07, + "loss": 0.1324, + "step": 5215 + }, + { + "epoch": 3.72, + "grad_norm": 7.473074709137638, + "learning_rate": 1.2522266531443616e-07, + "loss": 0.1241, + "step": 5216 + }, + { + "epoch": 3.72, + "grad_norm": 13.207193018450559, + "learning_rate": 1.245807099446955e-07, + "loss": 0.1882, + "step": 5217 + }, + { + "epoch": 3.72, + "grad_norm": 6.335816731636773, + "learning_rate": 1.239403835431602e-07, + "loss": 0.0775, + "step": 5218 + }, + { + "epoch": 3.73, + "grad_norm": 10.620746803482923, + "learning_rate": 1.2330168632377514e-07, + "loss": 0.1576, + "step": 5219 + }, + { + "epoch": 3.73, + "grad_norm": 7.84388438837485, + "learning_rate": 1.2266461849994138e-07, + "loss": 0.1257, + "step": 5220 + }, + { + "epoch": 3.73, + "grad_norm": 7.3623646823821565, + "learning_rate": 1.2202918028451527e-07, + "loss": 0.1428, + "step": 5221 + }, + { + "epoch": 3.73, + "grad_norm": 7.362173666685328, + "learning_rate": 1.2139537188980753e-07, + "loss": 0.1882, + "step": 5222 + }, + { + "epoch": 3.73, + "grad_norm": 5.891116663875299, + "learning_rate": 1.207631935275866e-07, + "loss": 0.1367, + "step": 5223 + }, + { + "epoch": 3.73, + "grad_norm": 5.184386123808496, + "learning_rate": 1.2013264540907455e-07, + "loss": 0.1078, + "step": 5224 + }, + { + "epoch": 3.73, + "grad_norm": 6.019083925984621, + "learning_rate": 1.1950372774494846e-07, + "loss": 0.0882, + "step": 5225 + }, + { + "epoch": 3.73, + "grad_norm": 6.246318195836614, + "learning_rate": 1.1887644074534244e-07, + "loss": 0.1174, + "step": 5226 + }, + { + "epoch": 3.73, + "grad_norm": 7.847553205535199, + "learning_rate": 1.182507846198444e-07, + "loss": 0.1487, + "step": 5227 + }, + { + "epoch": 3.73, + "grad_norm": 11.611646202895807, + "learning_rate": 1.1762675957749769e-07, + "loss": 0.2279, + "step": 5228 + }, + { + "epoch": 3.73, + "grad_norm": 5.737661801161129, + "learning_rate": 1.1700436582680108e-07, + "loss": 0.1067, + "step": 5229 + }, + { + "epoch": 3.73, + "grad_norm": 8.299874718836872, + "learning_rate": 1.1638360357570654e-07, + "loss": 0.1802, + "step": 5230 + }, + { + "epoch": 3.73, + "grad_norm": 7.031326583486857, + "learning_rate": 1.157644730316243e-07, + "loss": 0.115, + "step": 5231 + }, + { + "epoch": 3.73, + "grad_norm": 6.615993043157198, + "learning_rate": 1.1514697440141498e-07, + "loss": 0.116, + "step": 5232 + }, + { + "epoch": 3.74, + "grad_norm": 6.184865547249952, + "learning_rate": 1.1453110789139855e-07, + "loss": 0.1007, + "step": 5233 + }, + { + "epoch": 3.74, + "grad_norm": 6.274144565026902, + "learning_rate": 1.1391687370734594e-07, + "loss": 0.1247, + "step": 5234 + }, + { + "epoch": 3.74, + "grad_norm": 8.242541318513114, + "learning_rate": 1.1330427205448579e-07, + "loss": 0.1362, + "step": 5235 + }, + { + "epoch": 3.74, + "grad_norm": 12.150162498817286, + "learning_rate": 1.1269330313749715e-07, + "loss": 0.1766, + "step": 5236 + }, + { + "epoch": 3.74, + "grad_norm": 6.308670112443291, + "learning_rate": 1.1208396716051895e-07, + "loss": 0.1266, + "step": 5237 + }, + { + "epoch": 3.74, + "grad_norm": 6.6392974652541605, + "learning_rate": 1.1147626432713943e-07, + "loss": 0.0908, + "step": 5238 + }, + { + "epoch": 3.74, + "grad_norm": 7.564703684509811, + "learning_rate": 1.1087019484040562e-07, + "loss": 0.1458, + "step": 5239 + }, + { + "epoch": 3.74, + "grad_norm": 7.470040083902322, + "learning_rate": 1.1026575890281443e-07, + "loss": 0.1246, + "step": 5240 + }, + { + "epoch": 3.74, + "grad_norm": 6.086766086218585, + "learning_rate": 1.0966295671632043e-07, + "loss": 0.1151, + "step": 5241 + }, + { + "epoch": 3.74, + "grad_norm": 6.4424206168832585, + "learning_rate": 1.0906178848233029e-07, + "loss": 0.0956, + "step": 5242 + }, + { + "epoch": 3.74, + "grad_norm": 7.511100671159325, + "learning_rate": 1.0846225440170611e-07, + "loss": 0.1285, + "step": 5243 + }, + { + "epoch": 3.74, + "grad_norm": 5.8044703706351815, + "learning_rate": 1.0786435467476264e-07, + "loss": 0.1116, + "step": 5244 + }, + { + "epoch": 3.74, + "grad_norm": 5.525857288049917, + "learning_rate": 1.072680895012701e-07, + "loss": 0.1162, + "step": 5245 + }, + { + "epoch": 3.74, + "grad_norm": 6.722086445426205, + "learning_rate": 1.0667345908045135e-07, + "loss": 0.1187, + "step": 5246 + }, + { + "epoch": 3.75, + "grad_norm": 5.848587661746584, + "learning_rate": 1.0608046361098356e-07, + "loss": 0.1238, + "step": 5247 + }, + { + "epoch": 3.75, + "grad_norm": 8.613934378361632, + "learning_rate": 1.0548910329099771e-07, + "loss": 0.1035, + "step": 5248 + }, + { + "epoch": 3.75, + "grad_norm": 5.758696950894796, + "learning_rate": 1.048993783180785e-07, + "loss": 0.1001, + "step": 5249 + }, + { + "epoch": 3.75, + "grad_norm": 11.99570441487423, + "learning_rate": 1.0431128888926222e-07, + "loss": 0.1798, + "step": 5250 + }, + { + "epoch": 3.75, + "grad_norm": 9.68337979709574, + "learning_rate": 1.0372483520104337e-07, + "loss": 0.2222, + "step": 5251 + }, + { + "epoch": 3.75, + "grad_norm": 9.159716336335311, + "learning_rate": 1.0314001744936409e-07, + "loss": 0.1133, + "step": 5252 + }, + { + "epoch": 3.75, + "grad_norm": 12.181529893720981, + "learning_rate": 1.0255683582962583e-07, + "loss": 0.1626, + "step": 5253 + }, + { + "epoch": 3.75, + "grad_norm": 5.799677606632576, + "learning_rate": 1.0197529053667721e-07, + "loss": 0.0961, + "step": 5254 + }, + { + "epoch": 3.75, + "grad_norm": 6.57621765664244, + "learning_rate": 1.013953817648261e-07, + "loss": 0.1383, + "step": 5255 + }, + { + "epoch": 3.75, + "grad_norm": 6.189186863181362, + "learning_rate": 1.008171097078292e-07, + "loss": 0.1518, + "step": 5256 + }, + { + "epoch": 3.75, + "grad_norm": 12.319055223640666, + "learning_rate": 1.0024047455889918e-07, + "loss": 0.1925, + "step": 5257 + }, + { + "epoch": 3.75, + "grad_norm": 4.369429730589778, + "learning_rate": 9.966547651069913e-08, + "loss": 0.0916, + "step": 5258 + }, + { + "epoch": 3.75, + "grad_norm": 7.524207259807521, + "learning_rate": 9.909211575534705e-08, + "loss": 0.132, + "step": 5259 + }, + { + "epoch": 3.75, + "grad_norm": 7.236946977446954, + "learning_rate": 9.852039248441414e-08, + "loss": 0.1023, + "step": 5260 + }, + { + "epoch": 3.76, + "grad_norm": 9.47859230035742, + "learning_rate": 9.79503068889226e-08, + "loss": 0.1814, + "step": 5261 + }, + { + "epoch": 3.76, + "grad_norm": 8.758267109966468, + "learning_rate": 9.738185915935005e-08, + "loss": 0.175, + "step": 5262 + }, + { + "epoch": 3.76, + "grad_norm": 9.27801847977357, + "learning_rate": 9.681504948562403e-08, + "loss": 0.0741, + "step": 5263 + }, + { + "epoch": 3.76, + "grad_norm": 5.229611351433055, + "learning_rate": 9.624987805712749e-08, + "loss": 0.1229, + "step": 5264 + }, + { + "epoch": 3.76, + "grad_norm": 7.05122561410525, + "learning_rate": 9.568634506269381e-08, + "loss": 0.1503, + "step": 5265 + }, + { + "epoch": 3.76, + "grad_norm": 6.437579163294752, + "learning_rate": 9.51244506906096e-08, + "loss": 0.0978, + "step": 5266 + }, + { + "epoch": 3.76, + "grad_norm": 4.629315173299733, + "learning_rate": 9.45641951286158e-08, + "loss": 0.076, + "step": 5267 + }, + { + "epoch": 3.76, + "grad_norm": 12.414008878645971, + "learning_rate": 9.400557856390158e-08, + "loss": 0.1239, + "step": 5268 + }, + { + "epoch": 3.76, + "grad_norm": 7.86659051015309, + "learning_rate": 9.344860118311427e-08, + "loss": 0.1056, + "step": 5269 + }, + { + "epoch": 3.76, + "grad_norm": 4.829098697103711, + "learning_rate": 9.289326317234726e-08, + "loss": 0.114, + "step": 5270 + }, + { + "epoch": 3.76, + "grad_norm": 5.748563587949899, + "learning_rate": 9.23395647171521e-08, + "loss": 0.1136, + "step": 5271 + }, + { + "epoch": 3.76, + "grad_norm": 9.201838202910588, + "learning_rate": 9.178750600252695e-08, + "loss": 0.1781, + "step": 5272 + }, + { + "epoch": 3.76, + "grad_norm": 10.329283760051627, + "learning_rate": 9.123708721292756e-08, + "loss": 0.1503, + "step": 5273 + }, + { + "epoch": 3.76, + "grad_norm": 5.90976125143801, + "learning_rate": 9.06883085322574e-08, + "loss": 0.0987, + "step": 5274 + }, + { + "epoch": 3.77, + "grad_norm": 9.31051888219555, + "learning_rate": 9.014117014387424e-08, + "loss": 0.1438, + "step": 5275 + }, + { + "epoch": 3.77, + "grad_norm": 5.3011211156186935, + "learning_rate": 8.95956722305874e-08, + "loss": 0.1042, + "step": 5276 + }, + { + "epoch": 3.77, + "grad_norm": 5.25084946851794, + "learning_rate": 8.905181497465664e-08, + "loss": 0.1144, + "step": 5277 + }, + { + "epoch": 3.77, + "grad_norm": 12.014268472267695, + "learning_rate": 8.850959855779662e-08, + "loss": 0.1437, + "step": 5278 + }, + { + "epoch": 3.77, + "grad_norm": 6.3680562353578, + "learning_rate": 8.796902316117018e-08, + "loss": 0.0974, + "step": 5279 + }, + { + "epoch": 3.77, + "grad_norm": 8.104205605705294, + "learning_rate": 8.743008896539451e-08, + "loss": 0.1185, + "step": 5280 + }, + { + "epoch": 3.77, + "grad_norm": 5.822397728872039, + "learning_rate": 8.68927961505378e-08, + "loss": 0.0966, + "step": 5281 + }, + { + "epoch": 3.77, + "grad_norm": 6.962525356143755, + "learning_rate": 8.635714489611868e-08, + "loss": 0.1772, + "step": 5282 + }, + { + "epoch": 3.77, + "grad_norm": 3.4875000040030812, + "learning_rate": 8.582313538110898e-08, + "loss": 0.0679, + "step": 5283 + }, + { + "epoch": 3.77, + "grad_norm": 5.751258070965259, + "learning_rate": 8.529076778393097e-08, + "loss": 0.1108, + "step": 5284 + }, + { + "epoch": 3.77, + "grad_norm": 22.636631698978753, + "learning_rate": 8.476004228245848e-08, + "loss": 0.2013, + "step": 5285 + }, + { + "epoch": 3.77, + "grad_norm": 6.386527686790105, + "learning_rate": 8.42309590540169e-08, + "loss": 0.1111, + "step": 5286 + }, + { + "epoch": 3.77, + "grad_norm": 7.245454822317008, + "learning_rate": 8.370351827538259e-08, + "loss": 0.1567, + "step": 5287 + }, + { + "epoch": 3.77, + "grad_norm": 7.696534926561681, + "learning_rate": 8.317772012278347e-08, + "loss": 0.0975, + "step": 5288 + }, + { + "epoch": 3.78, + "grad_norm": 7.159676601821471, + "learning_rate": 8.26535647718979e-08, + "loss": 0.1014, + "step": 5289 + }, + { + "epoch": 3.78, + "grad_norm": 7.323349288176798, + "learning_rate": 8.213105239785691e-08, + "loss": 0.121, + "step": 5290 + }, + { + "epoch": 3.78, + "grad_norm": 6.626972303043425, + "learning_rate": 8.161018317524139e-08, + "loss": 0.1101, + "step": 5291 + }, + { + "epoch": 3.78, + "grad_norm": 9.572853627923612, + "learning_rate": 8.109095727808269e-08, + "loss": 0.1646, + "step": 5292 + }, + { + "epoch": 3.78, + "grad_norm": 6.1048890627998045, + "learning_rate": 8.057337487986427e-08, + "loss": 0.1073, + "step": 5293 + }, + { + "epoch": 3.78, + "grad_norm": 8.486123430984657, + "learning_rate": 8.005743615352057e-08, + "loss": 0.1229, + "step": 5294 + }, + { + "epoch": 3.78, + "grad_norm": 10.624680701422056, + "learning_rate": 7.954314127143481e-08, + "loss": 0.1277, + "step": 5295 + }, + { + "epoch": 3.78, + "grad_norm": 9.68142575432856, + "learning_rate": 7.903049040544453e-08, + "loss": 0.1707, + "step": 5296 + }, + { + "epoch": 3.78, + "grad_norm": 5.035336991397328, + "learning_rate": 7.851948372683382e-08, + "loss": 0.1169, + "step": 5297 + }, + { + "epoch": 3.78, + "grad_norm": 6.927256106603059, + "learning_rate": 7.801012140634167e-08, + "loss": 0.1127, + "step": 5298 + }, + { + "epoch": 3.78, + "grad_norm": 8.320357114365319, + "learning_rate": 7.750240361415362e-08, + "loss": 0.1432, + "step": 5299 + }, + { + "epoch": 3.78, + "grad_norm": 8.895191827361518, + "learning_rate": 7.69963305199084e-08, + "loss": 0.1588, + "step": 5300 + }, + { + "epoch": 3.78, + "grad_norm": 8.444645977840397, + "learning_rate": 7.64919022926941e-08, + "loss": 0.1382, + "step": 5301 + }, + { + "epoch": 3.78, + "grad_norm": 8.356571405819423, + "learning_rate": 7.598911910105033e-08, + "loss": 0.1312, + "step": 5302 + }, + { + "epoch": 3.79, + "grad_norm": 9.29974473920337, + "learning_rate": 7.548798111296552e-08, + "loss": 0.1755, + "step": 5303 + }, + { + "epoch": 3.79, + "grad_norm": 6.36759012920459, + "learning_rate": 7.498848849588015e-08, + "loss": 0.1384, + "step": 5304 + }, + { + "epoch": 3.79, + "grad_norm": 6.686410092064511, + "learning_rate": 7.449064141668238e-08, + "loss": 0.0949, + "step": 5305 + }, + { + "epoch": 3.79, + "grad_norm": 6.1045580573583695, + "learning_rate": 7.399444004171364e-08, + "loss": 0.1318, + "step": 5306 + }, + { + "epoch": 3.79, + "grad_norm": 6.181571610180925, + "learning_rate": 7.349988453676349e-08, + "loss": 0.1101, + "step": 5307 + }, + { + "epoch": 3.79, + "grad_norm": 5.948354119093815, + "learning_rate": 7.300697506707254e-08, + "loss": 0.1398, + "step": 5308 + }, + { + "epoch": 3.79, + "grad_norm": 6.1083301561778685, + "learning_rate": 7.251571179732963e-08, + "loss": 0.1018, + "step": 5309 + }, + { + "epoch": 3.79, + "grad_norm": 6.193753633726565, + "learning_rate": 7.202609489167734e-08, + "loss": 0.1401, + "step": 5310 + }, + { + "epoch": 3.79, + "grad_norm": 6.971897988354667, + "learning_rate": 7.153812451370312e-08, + "loss": 0.1178, + "step": 5311 + }, + { + "epoch": 3.79, + "grad_norm": 8.547983835503912, + "learning_rate": 7.10518008264488e-08, + "loss": 0.1567, + "step": 5312 + }, + { + "epoch": 3.79, + "grad_norm": 5.901556010148924, + "learning_rate": 7.056712399240274e-08, + "loss": 0.0923, + "step": 5313 + }, + { + "epoch": 3.79, + "grad_norm": 5.1229350172421615, + "learning_rate": 7.008409417350648e-08, + "loss": 0.1046, + "step": 5314 + }, + { + "epoch": 3.79, + "grad_norm": 21.260270101970896, + "learning_rate": 6.960271153114706e-08, + "loss": 0.279, + "step": 5315 + }, + { + "epoch": 3.79, + "grad_norm": 4.805978363368682, + "learning_rate": 6.912297622616526e-08, + "loss": 0.0815, + "step": 5316 + }, + { + "epoch": 3.8, + "grad_norm": 11.926823320423209, + "learning_rate": 6.864488841884786e-08, + "loss": 0.1357, + "step": 5317 + }, + { + "epoch": 3.8, + "grad_norm": 5.122540845339858, + "learning_rate": 6.816844826893431e-08, + "loss": 0.1118, + "step": 5318 + }, + { + "epoch": 3.8, + "grad_norm": 8.90771512298503, + "learning_rate": 6.769365593561117e-08, + "loss": 0.1603, + "step": 5319 + }, + { + "epoch": 3.8, + "grad_norm": 5.151048217047981, + "learning_rate": 6.722051157751597e-08, + "loss": 0.0963, + "step": 5320 + }, + { + "epoch": 3.8, + "grad_norm": 7.657265995185122, + "learning_rate": 6.674901535273448e-08, + "loss": 0.101, + "step": 5321 + }, + { + "epoch": 3.8, + "grad_norm": 6.07689362939208, + "learning_rate": 6.627916741880291e-08, + "loss": 0.1242, + "step": 5322 + }, + { + "epoch": 3.8, + "grad_norm": 6.1248540676091565, + "learning_rate": 6.581096793270625e-08, + "loss": 0.073, + "step": 5323 + }, + { + "epoch": 3.8, + "grad_norm": 9.295501650534323, + "learning_rate": 6.534441705087768e-08, + "loss": 0.1558, + "step": 5324 + }, + { + "epoch": 3.8, + "grad_norm": 14.16116865988068, + "learning_rate": 6.487951492920141e-08, + "loss": 0.1665, + "step": 5325 + }, + { + "epoch": 3.8, + "grad_norm": 6.979231240584932, + "learning_rate": 6.441626172300986e-08, + "loss": 0.149, + "step": 5326 + }, + { + "epoch": 3.8, + "grad_norm": 8.526966234599504, + "learning_rate": 6.395465758708419e-08, + "loss": 0.1081, + "step": 5327 + }, + { + "epoch": 3.8, + "grad_norm": 13.410360679446114, + "learning_rate": 6.349470267565549e-08, + "loss": 0.1249, + "step": 5328 + }, + { + "epoch": 3.8, + "grad_norm": 6.87362720959621, + "learning_rate": 6.303639714240196e-08, + "loss": 0.1234, + "step": 5329 + }, + { + "epoch": 3.8, + "grad_norm": 5.4425196121684865, + "learning_rate": 6.257974114045385e-08, + "loss": 0.0964, + "step": 5330 + }, + { + "epoch": 3.81, + "grad_norm": 5.646662279495886, + "learning_rate": 6.212473482238635e-08, + "loss": 0.1497, + "step": 5331 + }, + { + "epoch": 3.81, + "grad_norm": 6.057643625550772, + "learning_rate": 6.167137834022785e-08, + "loss": 0.0977, + "step": 5332 + }, + { + "epoch": 3.81, + "grad_norm": 6.539793036897762, + "learning_rate": 6.121967184545107e-08, + "loss": 0.1141, + "step": 5333 + }, + { + "epoch": 3.81, + "grad_norm": 5.761339793280051, + "learning_rate": 6.076961548898086e-08, + "loss": 0.0969, + "step": 5334 + }, + { + "epoch": 3.81, + "grad_norm": 6.800019097577064, + "learning_rate": 6.032120942118858e-08, + "loss": 0.1007, + "step": 5335 + }, + { + "epoch": 3.81, + "grad_norm": 5.289423884200621, + "learning_rate": 5.98744537918955e-08, + "loss": 0.0994, + "step": 5336 + }, + { + "epoch": 3.81, + "grad_norm": 6.500424630787519, + "learning_rate": 5.9429348750371097e-08, + "loss": 0.1061, + "step": 5337 + }, + { + "epoch": 3.81, + "grad_norm": 7.843475544162, + "learning_rate": 5.898589444533254e-08, + "loss": 0.146, + "step": 5338 + }, + { + "epoch": 3.81, + "grad_norm": 11.118806698310935, + "learning_rate": 5.85440910249474e-08, + "loss": 0.1675, + "step": 5339 + }, + { + "epoch": 3.81, + "grad_norm": 7.320211335379985, + "learning_rate": 5.810393863682873e-08, + "loss": 0.1436, + "step": 5340 + }, + { + "epoch": 3.81, + "grad_norm": 6.079462595694946, + "learning_rate": 5.7665437428041096e-08, + "loss": 0.1572, + "step": 5341 + }, + { + "epoch": 3.81, + "grad_norm": 7.477284639355565, + "learning_rate": 5.722858754509564e-08, + "loss": 0.1337, + "step": 5342 + }, + { + "epoch": 3.81, + "grad_norm": 5.941412410817457, + "learning_rate": 5.679338913395116e-08, + "loss": 0.0836, + "step": 5343 + }, + { + "epoch": 3.81, + "grad_norm": 15.49462238075577, + "learning_rate": 5.6359842340016904e-08, + "loss": 0.168, + "step": 5344 + }, + { + "epoch": 3.82, + "grad_norm": 7.814014540138655, + "learning_rate": 5.5927947308147545e-08, + "loss": 0.0989, + "step": 5345 + }, + { + "epoch": 3.82, + "grad_norm": 6.40539018080863, + "learning_rate": 5.549770418264766e-08, + "loss": 0.0836, + "step": 5346 + }, + { + "epoch": 3.82, + "grad_norm": 8.444026783490271, + "learning_rate": 5.5069113107270034e-08, + "loss": 0.1371, + "step": 5347 + }, + { + "epoch": 3.82, + "grad_norm": 6.902013975947065, + "learning_rate": 5.464217422521456e-08, + "loss": 0.0989, + "step": 5348 + }, + { + "epoch": 3.82, + "grad_norm": 4.274615043574269, + "learning_rate": 5.421688767912936e-08, + "loss": 0.0985, + "step": 5349 + }, + { + "epoch": 3.82, + "grad_norm": 5.836113346579141, + "learning_rate": 5.3793253611110206e-08, + "loss": 0.0904, + "step": 5350 + }, + { + "epoch": 3.82, + "grad_norm": 10.277038370643657, + "learning_rate": 5.3371272162702214e-08, + "loss": 0.1272, + "step": 5351 + }, + { + "epoch": 3.82, + "grad_norm": 7.427580721132972, + "learning_rate": 5.295094347489593e-08, + "loss": 0.1472, + "step": 5352 + }, + { + "epoch": 3.82, + "grad_norm": 5.647665876407714, + "learning_rate": 5.253226768813235e-08, + "loss": 0.0901, + "step": 5353 + }, + { + "epoch": 3.82, + "grad_norm": 9.858056079098924, + "learning_rate": 5.211524494229736e-08, + "loss": 0.1442, + "step": 5354 + }, + { + "epoch": 3.82, + "grad_norm": 7.289792068781541, + "learning_rate": 5.169987537672727e-08, + "loss": 0.135, + "step": 5355 + }, + { + "epoch": 3.82, + "grad_norm": 8.232332580344814, + "learning_rate": 5.128615913020385e-08, + "loss": 0.1414, + "step": 5356 + }, + { + "epoch": 3.82, + "grad_norm": 6.759511421442111, + "learning_rate": 5.087409634095819e-08, + "loss": 0.1025, + "step": 5357 + }, + { + "epoch": 3.82, + "grad_norm": 9.901571938672475, + "learning_rate": 5.046368714666683e-08, + "loss": 0.1069, + "step": 5358 + }, + { + "epoch": 3.83, + "grad_norm": 6.677195791425341, + "learning_rate": 5.0054931684457296e-08, + "loss": 0.1721, + "step": 5359 + }, + { + "epoch": 3.83, + "grad_norm": 7.5234884387987, + "learning_rate": 4.964783009090035e-08, + "loss": 0.1272, + "step": 5360 + }, + { + "epoch": 3.83, + "grad_norm": 7.1565601746836665, + "learning_rate": 4.9242382502017185e-08, + "loss": 0.1442, + "step": 5361 + }, + { + "epoch": 3.83, + "grad_norm": 6.612248876782069, + "learning_rate": 4.883858905327499e-08, + "loss": 0.1156, + "step": 5362 + }, + { + "epoch": 3.83, + "grad_norm": 6.653265623648825, + "learning_rate": 4.843644987958862e-08, + "loss": 0.109, + "step": 5363 + }, + { + "epoch": 3.83, + "grad_norm": 7.539454475937336, + "learning_rate": 4.8035965115320604e-08, + "loss": 0.1241, + "step": 5364 + }, + { + "epoch": 3.83, + "grad_norm": 9.420747332016814, + "learning_rate": 4.763713489428001e-08, + "loss": 0.121, + "step": 5365 + }, + { + "epoch": 3.83, + "grad_norm": 5.7408741621782, + "learning_rate": 4.723995934972414e-08, + "loss": 0.1077, + "step": 5366 + }, + { + "epoch": 3.83, + "grad_norm": 7.081059176994714, + "learning_rate": 4.684443861435572e-08, + "loss": 0.1456, + "step": 5367 + }, + { + "epoch": 3.83, + "grad_norm": 12.657636446362256, + "learning_rate": 4.6450572820325727e-08, + "loss": 0.1986, + "step": 5368 + }, + { + "epoch": 3.83, + "grad_norm": 5.913660788091783, + "learning_rate": 4.605836209923331e-08, + "loss": 0.1097, + "step": 5369 + }, + { + "epoch": 3.83, + "grad_norm": 7.210402941111909, + "learning_rate": 4.566780658212144e-08, + "loss": 0.1443, + "step": 5370 + }, + { + "epoch": 3.83, + "grad_norm": 5.194586966451776, + "learning_rate": 4.5278906399483516e-08, + "loss": 0.0763, + "step": 5371 + }, + { + "epoch": 3.83, + "grad_norm": 7.14924387954306, + "learning_rate": 4.489166168125725e-08, + "loss": 0.1792, + "step": 5372 + }, + { + "epoch": 3.84, + "grad_norm": 9.147775379876485, + "learning_rate": 4.4506072556829704e-08, + "loss": 0.1053, + "step": 5373 + }, + { + "epoch": 3.84, + "grad_norm": 10.730163857110522, + "learning_rate": 4.4122139155031717e-08, + "loss": 0.0945, + "step": 5374 + }, + { + "epoch": 3.84, + "grad_norm": 9.80794922773675, + "learning_rate": 4.373986160414345e-08, + "loss": 0.117, + "step": 5375 + }, + { + "epoch": 3.84, + "grad_norm": 5.52024439305837, + "learning_rate": 4.335924003189107e-08, + "loss": 0.0768, + "step": 5376 + }, + { + "epoch": 3.84, + "grad_norm": 6.642666235134706, + "learning_rate": 4.298027456544674e-08, + "loss": 0.0894, + "step": 5377 + }, + { + "epoch": 3.84, + "grad_norm": 8.033102864873747, + "learning_rate": 4.260296533143027e-08, + "loss": 0.1226, + "step": 5378 + }, + { + "epoch": 3.84, + "grad_norm": 7.958363936548108, + "learning_rate": 4.22273124559075e-08, + "loss": 0.1164, + "step": 5379 + }, + { + "epoch": 3.84, + "grad_norm": 6.064455639298206, + "learning_rate": 4.185331606439136e-08, + "loss": 0.116, + "step": 5380 + }, + { + "epoch": 3.84, + "grad_norm": 7.2571709246078715, + "learning_rate": 4.148097628184078e-08, + "loss": 0.1587, + "step": 5381 + }, + { + "epoch": 3.84, + "grad_norm": 5.531175996724435, + "learning_rate": 4.111029323266125e-08, + "loss": 0.1157, + "step": 5382 + }, + { + "epoch": 3.84, + "grad_norm": 3.8346388590651186, + "learning_rate": 4.07412670407048e-08, + "loss": 0.0787, + "step": 5383 + }, + { + "epoch": 3.84, + "grad_norm": 15.590850711055705, + "learning_rate": 4.037389782927059e-08, + "loss": 0.222, + "step": 5384 + }, + { + "epoch": 3.84, + "grad_norm": 9.156488520749987, + "learning_rate": 4.000818572110265e-08, + "loss": 0.1475, + "step": 5385 + }, + { + "epoch": 3.84, + "grad_norm": 6.095122261021763, + "learning_rate": 3.964413083839269e-08, + "loss": 0.0922, + "step": 5386 + }, + { + "epoch": 3.85, + "grad_norm": 7.495658799999846, + "learning_rate": 3.9281733302778404e-08, + "loss": 0.099, + "step": 5387 + }, + { + "epoch": 3.85, + "grad_norm": 6.902837764596447, + "learning_rate": 3.892099323534293e-08, + "loss": 0.1339, + "step": 5388 + }, + { + "epoch": 3.85, + "grad_norm": 5.998154833344351, + "learning_rate": 3.856191075661708e-08, + "loss": 0.0916, + "step": 5389 + }, + { + "epoch": 3.85, + "grad_norm": 8.054604203914527, + "learning_rate": 3.8204485986576e-08, + "loss": 0.1375, + "step": 5390 + }, + { + "epoch": 3.85, + "grad_norm": 5.718326581624709, + "learning_rate": 3.784871904464249e-08, + "loss": 0.1193, + "step": 5391 + }, + { + "epoch": 3.85, + "grad_norm": 6.0495502960815655, + "learning_rate": 3.7494610049684796e-08, + "loss": 0.1162, + "step": 5392 + }, + { + "epoch": 3.85, + "grad_norm": 10.193965675630396, + "learning_rate": 3.714215912001773e-08, + "loss": 0.1565, + "step": 5393 + }, + { + "epoch": 3.85, + "grad_norm": 5.735370222498481, + "learning_rate": 3.6791366373400974e-08, + "loss": 0.0824, + "step": 5394 + }, + { + "epoch": 3.85, + "grad_norm": 8.796648636559523, + "learning_rate": 3.6442231927041324e-08, + "loss": 0.118, + "step": 5395 + }, + { + "epoch": 3.85, + "grad_norm": 8.101682970617409, + "learning_rate": 3.609475589759104e-08, + "loss": 0.1078, + "step": 5396 + }, + { + "epoch": 3.85, + "grad_norm": 6.642030857014743, + "learning_rate": 3.574893840114835e-08, + "loss": 0.1002, + "step": 5397 + }, + { + "epoch": 3.85, + "grad_norm": 6.008978983404813, + "learning_rate": 3.5404779553257494e-08, + "loss": 0.1146, + "step": 5398 + }, + { + "epoch": 3.85, + "grad_norm": 8.787617973801435, + "learning_rate": 3.506227946890761e-08, + "loss": 0.209, + "step": 5399 + }, + { + "epoch": 3.85, + "grad_norm": 5.136877676664234, + "learning_rate": 3.4721438262534935e-08, + "loss": 0.0845, + "step": 5400 + }, + { + "epoch": 3.86, + "grad_norm": 6.842953334692523, + "learning_rate": 3.438225604802115e-08, + "loss": 0.1489, + "step": 5401 + }, + { + "epoch": 3.86, + "grad_norm": 5.907887079400435, + "learning_rate": 3.404473293869226e-08, + "loss": 0.1051, + "step": 5402 + }, + { + "epoch": 3.86, + "grad_norm": 8.675482010996637, + "learning_rate": 3.370886904732196e-08, + "loss": 0.1149, + "step": 5403 + }, + { + "epoch": 3.86, + "grad_norm": 5.556595931410459, + "learning_rate": 3.33746644861288e-08, + "loss": 0.0774, + "step": 5404 + }, + { + "epoch": 3.86, + "grad_norm": 7.278531821366081, + "learning_rate": 3.30421193667757e-08, + "loss": 0.1306, + "step": 5405 + }, + { + "epoch": 3.86, + "grad_norm": 5.349133272872112, + "learning_rate": 3.271123380037322e-08, + "loss": 0.0956, + "step": 5406 + }, + { + "epoch": 3.86, + "grad_norm": 6.334550501760739, + "learning_rate": 3.2382007897475695e-08, + "loss": 0.0842, + "step": 5407 + }, + { + "epoch": 3.86, + "grad_norm": 6.826027471570106, + "learning_rate": 3.2054441768083477e-08, + "loss": 0.1401, + "step": 5408 + }, + { + "epoch": 3.86, + "grad_norm": 6.591080668686281, + "learning_rate": 3.1728535521643454e-08, + "loss": 0.1153, + "step": 5409 + }, + { + "epoch": 3.86, + "grad_norm": 4.910777250160643, + "learning_rate": 3.1404289267046305e-08, + "loss": 0.0955, + "step": 5410 + }, + { + "epoch": 3.86, + "grad_norm": 6.193743146559833, + "learning_rate": 3.1081703112628146e-08, + "loss": 0.1156, + "step": 5411 + }, + { + "epoch": 3.86, + "grad_norm": 4.131743099975989, + "learning_rate": 3.0760777166172206e-08, + "loss": 0.0914, + "step": 5412 + }, + { + "epoch": 3.86, + "grad_norm": 9.147186309625447, + "learning_rate": 3.0441511534904934e-08, + "loss": 0.186, + "step": 5413 + }, + { + "epoch": 3.86, + "grad_norm": 16.89801450783667, + "learning_rate": 3.012390632549933e-08, + "loss": 0.2162, + "step": 5414 + }, + { + "epoch": 3.87, + "grad_norm": 7.065640199939804, + "learning_rate": 2.9807961644073294e-08, + "loss": 0.1641, + "step": 5415 + }, + { + "epoch": 3.87, + "grad_norm": 5.507514754630227, + "learning_rate": 2.9493677596189595e-08, + "loss": 0.1292, + "step": 5416 + }, + { + "epoch": 3.87, + "grad_norm": 6.253875594598688, + "learning_rate": 2.9181054286855916e-08, + "loss": 0.1044, + "step": 5417 + }, + { + "epoch": 3.87, + "grad_norm": 9.876290985174975, + "learning_rate": 2.887009182052647e-08, + "loss": 0.2092, + "step": 5418 + }, + { + "epoch": 3.87, + "grad_norm": 8.262022252186037, + "learning_rate": 2.8560790301098705e-08, + "loss": 0.1256, + "step": 5419 + }, + { + "epoch": 3.87, + "grad_norm": 4.444790652855763, + "learning_rate": 2.825314983191718e-08, + "loss": 0.0795, + "step": 5420 + }, + { + "epoch": 3.87, + "grad_norm": 5.034772326384623, + "learning_rate": 2.7947170515768562e-08, + "loss": 0.1035, + "step": 5421 + }, + { + "epoch": 3.87, + "grad_norm": 8.398317634313441, + "learning_rate": 2.7642852454887736e-08, + "loss": 0.1326, + "step": 5422 + }, + { + "epoch": 3.87, + "grad_norm": 4.212256085984667, + "learning_rate": 2.7340195750952813e-08, + "loss": 0.0872, + "step": 5423 + }, + { + "epoch": 3.87, + "grad_norm": 6.414797855257343, + "learning_rate": 2.703920050508624e-08, + "loss": 0.1196, + "step": 5424 + }, + { + "epoch": 3.87, + "grad_norm": 6.604779209237305, + "learning_rate": 2.673986681785645e-08, + "loss": 0.1197, + "step": 5425 + }, + { + "epoch": 3.87, + "grad_norm": 4.618675757820333, + "learning_rate": 2.6442194789277342e-08, + "loss": 0.0822, + "step": 5426 + }, + { + "epoch": 3.87, + "grad_norm": 10.808901861160757, + "learning_rate": 2.6146184518804908e-08, + "loss": 0.1245, + "step": 5427 + }, + { + "epoch": 3.87, + "grad_norm": 4.610150315103935, + "learning_rate": 2.5851836105343363e-08, + "loss": 0.098, + "step": 5428 + }, + { + "epoch": 3.88, + "grad_norm": 9.495465719612765, + "learning_rate": 2.555914964723849e-08, + "loss": 0.1179, + "step": 5429 + }, + { + "epoch": 3.88, + "grad_norm": 6.207059094738852, + "learning_rate": 2.5268125242283724e-08, + "loss": 0.0848, + "step": 5430 + }, + { + "epoch": 3.88, + "grad_norm": 6.450983131869945, + "learning_rate": 2.4978762987714067e-08, + "loss": 0.123, + "step": 5431 + }, + { + "epoch": 3.88, + "grad_norm": 10.375350232266669, + "learning_rate": 2.469106298021273e-08, + "loss": 0.165, + "step": 5432 + }, + { + "epoch": 3.88, + "grad_norm": 6.503010938323237, + "learning_rate": 2.4405025315904495e-08, + "loss": 0.1229, + "step": 5433 + }, + { + "epoch": 3.88, + "grad_norm": 8.816882888336334, + "learning_rate": 2.412065009036013e-08, + "loss": 0.1213, + "step": 5434 + }, + { + "epoch": 3.88, + "grad_norm": 5.889173798463627, + "learning_rate": 2.3837937398594747e-08, + "loss": 0.1117, + "step": 5435 + }, + { + "epoch": 3.88, + "grad_norm": 8.450886362130994, + "learning_rate": 2.3556887335067223e-08, + "loss": 0.1573, + "step": 5436 + }, + { + "epoch": 3.88, + "grad_norm": 7.633750300186745, + "learning_rate": 2.3277499993682452e-08, + "loss": 0.1022, + "step": 5437 + }, + { + "epoch": 3.88, + "grad_norm": 5.677983573724589, + "learning_rate": 2.2999775467788532e-08, + "loss": 0.0836, + "step": 5438 + }, + { + "epoch": 3.88, + "grad_norm": 6.121189112026573, + "learning_rate": 2.272371385017902e-08, + "loss": 0.1059, + "step": 5439 + }, + { + "epoch": 3.88, + "grad_norm": 6.797540986482973, + "learning_rate": 2.244931523309013e-08, + "loss": 0.1263, + "step": 5440 + }, + { + "epoch": 3.88, + "grad_norm": 6.379117284864385, + "learning_rate": 2.2176579708204636e-08, + "loss": 0.1324, + "step": 5441 + }, + { + "epoch": 3.88, + "grad_norm": 10.852701175072118, + "learning_rate": 2.190550736664798e-08, + "loss": 0.2468, + "step": 5442 + }, + { + "epoch": 3.89, + "grad_norm": 7.513796502846481, + "learning_rate": 2.163609829898994e-08, + "loss": 0.1409, + "step": 5443 + }, + { + "epoch": 3.89, + "grad_norm": 9.610713700051518, + "learning_rate": 2.136835259524628e-08, + "loss": 0.1302, + "step": 5444 + }, + { + "epoch": 3.89, + "grad_norm": 7.7294639743377545, + "learning_rate": 2.1102270344874887e-08, + "loss": 0.1514, + "step": 5445 + }, + { + "epoch": 3.89, + "grad_norm": 5.143827292567964, + "learning_rate": 2.083785163677965e-08, + "loss": 0.0825, + "step": 5446 + }, + { + "epoch": 3.89, + "grad_norm": 5.874115522509679, + "learning_rate": 2.0575096559306564e-08, + "loss": 0.1707, + "step": 5447 + }, + { + "epoch": 3.89, + "grad_norm": 5.5660499922054685, + "learning_rate": 2.0314005200248178e-08, + "loss": 0.0833, + "step": 5448 + }, + { + "epoch": 3.89, + "grad_norm": 6.291346107187886, + "learning_rate": 2.0054577646839156e-08, + "loss": 0.104, + "step": 5449 + }, + { + "epoch": 3.89, + "grad_norm": 7.101207817053639, + "learning_rate": 1.979681398575961e-08, + "loss": 0.1224, + "step": 5450 + }, + { + "epoch": 3.89, + "grad_norm": 8.288766121361911, + "learning_rate": 1.954071430313287e-08, + "loss": 0.1025, + "step": 5451 + }, + { + "epoch": 3.89, + "grad_norm": 7.137866886658638, + "learning_rate": 1.9286278684526593e-08, + "loss": 0.1288, + "step": 5452 + }, + { + "epoch": 3.89, + "grad_norm": 8.577387327531774, + "learning_rate": 1.9033507214952784e-08, + "loss": 0.0977, + "step": 5453 + }, + { + "epoch": 3.89, + "grad_norm": 7.789300210080421, + "learning_rate": 1.878239997886666e-08, + "loss": 0.1188, + "step": 5454 + }, + { + "epoch": 3.89, + "grad_norm": 8.261173115162729, + "learning_rate": 1.853295706016778e-08, + "loss": 0.145, + "step": 5455 + }, + { + "epoch": 3.89, + "grad_norm": 8.57419604447386, + "learning_rate": 1.8285178542200022e-08, + "loss": 0.1451, + "step": 5456 + }, + { + "epoch": 3.9, + "grad_norm": 8.624820178787198, + "learning_rate": 1.8039064507750503e-08, + "loss": 0.0851, + "step": 5457 + }, + { + "epoch": 3.9, + "grad_norm": 5.3281446819414855, + "learning_rate": 1.7794615039050665e-08, + "loss": 0.1077, + "step": 5458 + }, + { + "epoch": 3.9, + "grad_norm": 6.719079268451326, + "learning_rate": 1.7551830217775734e-08, + "loss": 0.1064, + "step": 5459 + }, + { + "epoch": 3.9, + "grad_norm": 6.361150677660455, + "learning_rate": 1.7310710125044707e-08, + "loss": 0.1342, + "step": 5460 + }, + { + "epoch": 3.9, + "grad_norm": 9.934178727478375, + "learning_rate": 1.7071254841419805e-08, + "loss": 0.188, + "step": 5461 + }, + { + "epoch": 3.9, + "grad_norm": 5.961703433504403, + "learning_rate": 1.6833464446907588e-08, + "loss": 0.126, + "step": 5462 + }, + { + "epoch": 3.9, + "grad_norm": 4.709967478366248, + "learning_rate": 1.6597339020958393e-08, + "loss": 0.0972, + "step": 5463 + }, + { + "epoch": 3.9, + "grad_norm": 7.621860983040733, + "learning_rate": 1.6362878642466328e-08, + "loss": 0.1342, + "step": 5464 + }, + { + "epoch": 3.9, + "grad_norm": 10.209473650410427, + "learning_rate": 1.6130083389768735e-08, + "loss": 0.1364, + "step": 5465 + }, + { + "epoch": 3.9, + "grad_norm": 6.139084520540914, + "learning_rate": 1.5898953340646728e-08, + "loss": 0.1218, + "step": 5466 + }, + { + "epoch": 3.9, + "grad_norm": 9.71994860554082, + "learning_rate": 1.5669488572325197e-08, + "loss": 0.1554, + "step": 5467 + }, + { + "epoch": 3.9, + "grad_norm": 12.086414770955265, + "learning_rate": 1.5441689161472816e-08, + "loss": 0.1787, + "step": 5468 + }, + { + "epoch": 3.9, + "grad_norm": 24.407807669084864, + "learning_rate": 1.521555518420148e-08, + "loss": 0.1927, + "step": 5469 + }, + { + "epoch": 3.9, + "grad_norm": 6.650159393488484, + "learning_rate": 1.499108671606686e-08, + "loss": 0.1473, + "step": 5470 + }, + { + "epoch": 3.91, + "grad_norm": 6.438063211825578, + "learning_rate": 1.4768283832067853e-08, + "loss": 0.1339, + "step": 5471 + }, + { + "epoch": 3.91, + "grad_norm": 15.259887152827618, + "learning_rate": 1.4547146606646578e-08, + "loss": 0.2236, + "step": 5472 + }, + { + "epoch": 3.91, + "grad_norm": 5.182419293842352, + "learning_rate": 1.4327675113690598e-08, + "loss": 0.0764, + "step": 5473 + }, + { + "epoch": 3.91, + "grad_norm": 9.2094503251359, + "learning_rate": 1.4109869426527368e-08, + "loss": 0.1317, + "step": 5474 + }, + { + "epoch": 3.91, + "grad_norm": 6.283356876259326, + "learning_rate": 1.3893729617931451e-08, + "loss": 0.1119, + "step": 5475 + }, + { + "epoch": 3.91, + "grad_norm": 6.905426440583543, + "learning_rate": 1.3679255760118415e-08, + "loss": 0.1092, + "step": 5476 + }, + { + "epoch": 3.91, + "grad_norm": 5.641634779840197, + "learning_rate": 1.3466447924748716e-08, + "loss": 0.099, + "step": 5477 + }, + { + "epoch": 3.91, + "grad_norm": 4.588743851538649, + "learning_rate": 1.3255306182924365e-08, + "loss": 0.0688, + "step": 5478 + }, + { + "epoch": 3.91, + "grad_norm": 8.406396921378816, + "learning_rate": 1.3045830605192266e-08, + "loss": 0.1873, + "step": 5479 + }, + { + "epoch": 3.91, + "grad_norm": 5.491021032175865, + "learning_rate": 1.2838021261541988e-08, + "loss": 0.134, + "step": 5480 + }, + { + "epoch": 3.91, + "grad_norm": 7.735288437189416, + "learning_rate": 1.263187822140688e-08, + "loss": 0.1151, + "step": 5481 + }, + { + "epoch": 3.91, + "grad_norm": 9.538077504415023, + "learning_rate": 1.2427401553662955e-08, + "loss": 0.1193, + "step": 5482 + }, + { + "epoch": 3.91, + "grad_norm": 13.617139287693995, + "learning_rate": 1.2224591326628898e-08, + "loss": 0.2377, + "step": 5483 + }, + { + "epoch": 3.91, + "grad_norm": 7.037545116460254, + "learning_rate": 1.2023447608068283e-08, + "loss": 0.1289, + "step": 5484 + }, + { + "epoch": 3.92, + "grad_norm": 7.051654601622574, + "learning_rate": 1.182397046518735e-08, + "loss": 0.0864, + "step": 5485 + }, + { + "epoch": 3.92, + "grad_norm": 7.660167436446467, + "learning_rate": 1.1626159964633899e-08, + "loss": 0.1224, + "step": 5486 + }, + { + "epoch": 3.92, + "grad_norm": 5.448552963768357, + "learning_rate": 1.1430016172501169e-08, + "loss": 0.0796, + "step": 5487 + }, + { + "epoch": 3.92, + "grad_norm": 5.531596088000746, + "learning_rate": 1.1235539154323405e-08, + "loss": 0.1172, + "step": 5488 + }, + { + "epoch": 3.92, + "grad_norm": 7.503280936817476, + "learning_rate": 1.1042728975079741e-08, + "loss": 0.1206, + "step": 5489 + }, + { + "epoch": 3.92, + "grad_norm": 13.364423091479154, + "learning_rate": 1.0851585699191425e-08, + "loss": 0.1788, + "step": 5490 + }, + { + "epoch": 3.92, + "grad_norm": 5.933243904552928, + "learning_rate": 1.0662109390522924e-08, + "loss": 0.1199, + "step": 5491 + }, + { + "epoch": 3.92, + "grad_norm": 7.257081865184139, + "learning_rate": 1.047430011238193e-08, + "loss": 0.1149, + "step": 5492 + }, + { + "epoch": 3.92, + "grad_norm": 7.065807785440134, + "learning_rate": 1.028815792751936e-08, + "loss": 0.11, + "step": 5493 + }, + { + "epoch": 3.92, + "grad_norm": 4.983139579728731, + "learning_rate": 1.0103682898128241e-08, + "loss": 0.075, + "step": 5494 + }, + { + "epoch": 3.92, + "grad_norm": 10.618784716584416, + "learning_rate": 9.920875085845383e-09, + "loss": 0.1307, + "step": 5495 + }, + { + "epoch": 3.92, + "grad_norm": 6.388645197421817, + "learning_rate": 9.739734551749703e-09, + "loss": 0.1406, + "step": 5496 + }, + { + "epoch": 3.92, + "grad_norm": 10.393540666994594, + "learning_rate": 9.560261356364452e-09, + "loss": 0.1676, + "step": 5497 + }, + { + "epoch": 3.92, + "grad_norm": 6.166647278650001, + "learning_rate": 9.382455559654446e-09, + "loss": 0.1536, + "step": 5498 + }, + { + "epoch": 3.93, + "grad_norm": 4.81742399868161, + "learning_rate": 9.206317221027717e-09, + "loss": 0.0848, + "step": 5499 + }, + { + "epoch": 3.93, + "grad_norm": 7.5698480294633415, + "learning_rate": 9.031846399336075e-09, + "loss": 0.1356, + "step": 5500 + }, + { + "epoch": 3.93, + "eval_avg_AUC": 0.7950252002273744, + "eval_avg_Accuracy": 0.7066893236074271, + "eval_avg_Accuracy-right": 0.8786357114908048, + "eval_avg_Accuracy-wrong": 0.4068683193086195, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6590350719883701, + "eval_last_AUC": 0.8161622767447089, + "eval_last_Accuracy": 0.741959549071618, + "eval_last_Accuracy-right": 0.8153123777227077, + "eval_last_Accuracy-wrong": 0.6140550375255857, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6823020710938568, + "eval_max_AUC": 0.77593042794898, + "eval_max_Accuracy": 0.6453083554376657, + "eval_max_Accuracy-right": 0.9788704838920047, + "eval_max_Accuracy-wrong": 0.06367978166932, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6221345468988383, + "eval_min_AUC": 0.8001189509346562, + "eval_min_Accuracy": 0.7241379310344828, + "eval_min_Accuracy-right": 0.7241424285900613, + "eval_min_Accuracy-wrong": 0.7241300886968387, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6609511576435902, + "eval_prod_AUC": 0.802284305563165, + "eval_prod_Accuracy": 0.7134864058355438, + "eval_prod_Accuracy-right": 0.6670796921872962, + "eval_prod_Accuracy-wrong": 0.7944052763247669, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6623692613743422, + "eval_runtime": 248.6606, + "eval_samples_per_second": 97.032, + "eval_steps_per_second": 3.032, + "eval_sum_AUC": 0.669465043974663, + "eval_sum_Accuracy": 0.6391329575596817, + "eval_sum_Accuracy-right": 0.9867614451545585, + "eval_sum_Accuracy-wrong": 0.03297702979304071, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6489398581558867, + "step": 5500 + }, + { + "epoch": 3.93, + "grad_norm": 8.429106410555258, + "learning_rate": 8.859043152872892e-09, + "loss": 0.1237, + "step": 5501 + }, + { + "epoch": 3.93, + "grad_norm": 8.009167896999422, + "learning_rate": 8.687907539375318e-09, + "loss": 0.1164, + "step": 5502 + }, + { + "epoch": 3.93, + "grad_norm": 5.780427122348584, + "learning_rate": 8.518439616022057e-09, + "loss": 0.1174, + "step": 5503 + }, + { + "epoch": 3.93, + "grad_norm": 7.547785765991373, + "learning_rate": 8.350639439436703e-09, + "loss": 0.1143, + "step": 5504 + }, + { + "epoch": 3.93, + "grad_norm": 6.131647170129058, + "learning_rate": 8.184507065683855e-09, + "loss": 0.0824, + "step": 5505 + }, + { + "epoch": 3.93, + "grad_norm": 11.483331590260356, + "learning_rate": 8.020042550271889e-09, + "loss": 0.1688, + "step": 5506 + }, + { + "epoch": 3.93, + "grad_norm": 4.2724277741797625, + "learning_rate": 7.857245948150183e-09, + "loss": 0.1176, + "step": 5507 + }, + { + "epoch": 3.93, + "grad_norm": 5.573181505757999, + "learning_rate": 7.696117313713559e-09, + "loss": 0.1044, + "step": 5508 + }, + { + "epoch": 3.93, + "grad_norm": 4.598852408301717, + "learning_rate": 7.536656700797284e-09, + "loss": 0.0922, + "step": 5509 + }, + { + "epoch": 3.93, + "grad_norm": 10.981912827262958, + "learning_rate": 7.37886416268041e-09, + "loss": 0.1188, + "step": 5510 + }, + { + "epoch": 3.93, + "grad_norm": 10.405947394175044, + "learning_rate": 7.222739752084096e-09, + "loss": 0.1179, + "step": 5511 + }, + { + "epoch": 3.93, + "grad_norm": 6.540864550860748, + "learning_rate": 7.068283521172725e-09, + "loss": 0.1253, + "step": 5512 + }, + { + "epoch": 3.94, + "grad_norm": 7.525882733341714, + "learning_rate": 6.915495521552795e-09, + "loss": 0.1077, + "step": 5513 + }, + { + "epoch": 3.94, + "grad_norm": 6.164782248488719, + "learning_rate": 6.764375804274026e-09, + "loss": 0.1436, + "step": 5514 + }, + { + "epoch": 3.94, + "grad_norm": 7.444440508428989, + "learning_rate": 6.61492441982714e-09, + "loss": 0.1319, + "step": 5515 + }, + { + "epoch": 3.94, + "grad_norm": 4.812978938104751, + "learning_rate": 6.467141418147748e-09, + "loss": 0.0881, + "step": 5516 + }, + { + "epoch": 3.94, + "grad_norm": 5.184902484678889, + "learning_rate": 6.321026848613021e-09, + "loss": 0.1021, + "step": 5517 + }, + { + "epoch": 3.94, + "grad_norm": 4.699439697594633, + "learning_rate": 6.176580760041684e-09, + "loss": 0.0901, + "step": 5518 + }, + { + "epoch": 3.94, + "grad_norm": 7.963371822825025, + "learning_rate": 6.033803200696242e-09, + "loss": 0.1211, + "step": 5519 + }, + { + "epoch": 3.94, + "grad_norm": 8.425984631270113, + "learning_rate": 5.892694218281869e-09, + "loss": 0.1633, + "step": 5520 + }, + { + "epoch": 3.94, + "grad_norm": 7.949188213453105, + "learning_rate": 5.753253859944741e-09, + "loss": 0.1158, + "step": 5521 + }, + { + "epoch": 3.94, + "grad_norm": 9.136952727452089, + "learning_rate": 5.615482172275366e-09, + "loss": 0.1742, + "step": 5522 + }, + { + "epoch": 3.94, + "grad_norm": 7.699454602256628, + "learning_rate": 5.479379201305257e-09, + "loss": 0.1188, + "step": 5523 + }, + { + "epoch": 3.94, + "grad_norm": 7.953008550854942, + "learning_rate": 5.344944992509149e-09, + "loss": 0.1521, + "step": 5524 + }, + { + "epoch": 3.94, + "grad_norm": 6.618564531824671, + "learning_rate": 5.212179590803335e-09, + "loss": 0.1507, + "step": 5525 + }, + { + "epoch": 3.94, + "grad_norm": 9.999104323435565, + "learning_rate": 5.08108304054844e-09, + "loss": 0.1151, + "step": 5526 + }, + { + "epoch": 3.95, + "grad_norm": 15.707971260259198, + "learning_rate": 4.9516553855455395e-09, + "loss": 0.1814, + "step": 5527 + }, + { + "epoch": 3.95, + "grad_norm": 8.642743845716193, + "learning_rate": 4.82389666903893e-09, + "loss": 0.116, + "step": 5528 + }, + { + "epoch": 3.95, + "grad_norm": 7.118772631160384, + "learning_rate": 4.697806933715021e-09, + "loss": 0.125, + "step": 5529 + }, + { + "epoch": 3.95, + "grad_norm": 9.23091508452254, + "learning_rate": 4.573386221703446e-09, + "loss": 0.1327, + "step": 5530 + }, + { + "epoch": 3.95, + "grad_norm": 13.387861094338058, + "learning_rate": 4.450634574574286e-09, + "loss": 0.1758, + "step": 5531 + }, + { + "epoch": 3.95, + "grad_norm": 5.407477037058821, + "learning_rate": 4.329552033341955e-09, + "loss": 0.0692, + "step": 5532 + }, + { + "epoch": 3.95, + "grad_norm": 5.614578814740924, + "learning_rate": 4.210138638462424e-09, + "loss": 0.0916, + "step": 5533 + }, + { + "epoch": 3.95, + "grad_norm": 10.253247016948384, + "learning_rate": 4.0923944298337796e-09, + "loss": 0.1483, + "step": 5534 + }, + { + "epoch": 3.95, + "grad_norm": 6.932631685042542, + "learning_rate": 3.976319446795662e-09, + "loss": 0.1497, + "step": 5535 + }, + { + "epoch": 3.95, + "grad_norm": 7.467510399283004, + "learning_rate": 3.8619137281326044e-09, + "loss": 0.1044, + "step": 5536 + }, + { + "epoch": 3.95, + "grad_norm": 8.923637692156998, + "learning_rate": 3.749177312068475e-09, + "loss": 0.1453, + "step": 5537 + }, + { + "epoch": 3.95, + "grad_norm": 13.938583660145987, + "learning_rate": 3.63811023627092e-09, + "loss": 0.183, + "step": 5538 + }, + { + "epoch": 3.95, + "grad_norm": 10.314501087676051, + "learning_rate": 3.528712537849144e-09, + "loss": 0.1874, + "step": 5539 + }, + { + "epoch": 3.95, + "grad_norm": 7.711784952194379, + "learning_rate": 3.42098425335613e-09, + "loss": 0.1631, + "step": 5540 + }, + { + "epoch": 3.96, + "grad_norm": 7.6746819815164455, + "learning_rate": 3.3149254187841985e-09, + "loss": 0.1301, + "step": 5541 + }, + { + "epoch": 3.96, + "grad_norm": 6.077385997300826, + "learning_rate": 3.210536069571113e-09, + "loss": 0.1223, + "step": 5542 + }, + { + "epoch": 3.96, + "grad_norm": 6.60490536386744, + "learning_rate": 3.1078162405939747e-09, + "loss": 0.1371, + "step": 5543 + }, + { + "epoch": 3.96, + "grad_norm": 6.546314650170883, + "learning_rate": 3.006765966174774e-09, + "loss": 0.1312, + "step": 5544 + }, + { + "epoch": 3.96, + "grad_norm": 6.402040317933734, + "learning_rate": 2.907385280075392e-09, + "loss": 0.1078, + "step": 5545 + }, + { + "epoch": 3.96, + "grad_norm": 7.1113178071259355, + "learning_rate": 2.80967421550038e-09, + "loss": 0.124, + "step": 5546 + }, + { + "epoch": 3.96, + "grad_norm": 9.882733207585435, + "learning_rate": 2.7136328050980654e-09, + "loss": 0.2028, + "step": 5547 + }, + { + "epoch": 3.96, + "grad_norm": 8.22923561778551, + "learning_rate": 2.6192610809566697e-09, + "loss": 0.1747, + "step": 5548 + }, + { + "epoch": 3.96, + "grad_norm": 9.744168083139122, + "learning_rate": 2.5265590746076373e-09, + "loss": 0.0875, + "step": 5549 + }, + { + "epoch": 3.96, + "grad_norm": 7.470206404886737, + "learning_rate": 2.43552681702508e-09, + "loss": 0.1339, + "step": 5550 + }, + { + "epoch": 3.96, + "grad_norm": 5.8082392348378225, + "learning_rate": 2.346164338624113e-09, + "loss": 0.1084, + "step": 5551 + }, + { + "epoch": 3.96, + "grad_norm": 6.477160650962435, + "learning_rate": 2.2584716692619636e-09, + "loss": 0.1305, + "step": 5552 + }, + { + "epoch": 3.96, + "grad_norm": 6.159335395880286, + "learning_rate": 2.172448838239083e-09, + "loss": 0.1267, + "step": 5553 + }, + { + "epoch": 3.96, + "grad_norm": 5.573462560585101, + "learning_rate": 2.08809587429748e-09, + "loss": 0.0966, + "step": 5554 + }, + { + "epoch": 3.97, + "grad_norm": 4.343642412756653, + "learning_rate": 2.0054128056201662e-09, + "loss": 0.0584, + "step": 5555 + }, + { + "epoch": 3.97, + "grad_norm": 8.254147958795857, + "learning_rate": 1.924399659833376e-09, + "loss": 0.139, + "step": 5556 + }, + { + "epoch": 3.97, + "grad_norm": 10.092190845622381, + "learning_rate": 1.8450564640054569e-09, + "loss": 0.1674, + "step": 5557 + }, + { + "epoch": 3.97, + "grad_norm": 8.035445013347708, + "learning_rate": 1.7673832446463146e-09, + "loss": 0.1506, + "step": 5558 + }, + { + "epoch": 3.97, + "grad_norm": 8.832099606863865, + "learning_rate": 1.6913800277085225e-09, + "loss": 0.1419, + "step": 5559 + }, + { + "epoch": 3.97, + "grad_norm": 9.404074142401095, + "learning_rate": 1.6170468385845462e-09, + "loss": 0.1946, + "step": 5560 + }, + { + "epoch": 3.97, + "grad_norm": 4.577566022959948, + "learning_rate": 1.5443837021122954e-09, + "loss": 0.0952, + "step": 5561 + }, + { + "epoch": 3.97, + "grad_norm": 7.343705255356882, + "learning_rate": 1.473390642569017e-09, + "loss": 0.1126, + "step": 5562 + }, + { + "epoch": 3.97, + "grad_norm": 7.2585089112143795, + "learning_rate": 1.4040676836746259e-09, + "loss": 0.1133, + "step": 5563 + }, + { + "epoch": 3.97, + "grad_norm": 6.784254763986543, + "learning_rate": 1.336414848591705e-09, + "loss": 0.1276, + "step": 5564 + }, + { + "epoch": 3.97, + "grad_norm": 5.854702420201817, + "learning_rate": 1.2704321599243951e-09, + "loss": 0.0994, + "step": 5565 + }, + { + "epoch": 3.97, + "grad_norm": 5.753710545202959, + "learning_rate": 1.206119639718395e-09, + "loss": 0.1279, + "step": 5566 + }, + { + "epoch": 3.97, + "grad_norm": 7.307788810185479, + "learning_rate": 1.1434773094615158e-09, + "loss": 0.1118, + "step": 5567 + }, + { + "epoch": 3.97, + "grad_norm": 16.079904243512647, + "learning_rate": 1.0825051900842377e-09, + "loss": 0.1475, + "step": 5568 + }, + { + "epoch": 3.98, + "grad_norm": 6.869485387673937, + "learning_rate": 1.0232033019580423e-09, + "loss": 0.1205, + "step": 5569 + }, + { + "epoch": 3.98, + "grad_norm": 9.561539939824815, + "learning_rate": 9.655716648970804e-10, + "loss": 0.1674, + "step": 5570 + }, + { + "epoch": 3.98, + "grad_norm": 8.986453072022883, + "learning_rate": 9.096102981570598e-10, + "loss": 0.166, + "step": 5571 + }, + { + "epoch": 3.98, + "grad_norm": 4.372199399777608, + "learning_rate": 8.553192204358018e-10, + "loss": 0.0487, + "step": 5572 + }, + { + "epoch": 3.98, + "grad_norm": 8.885464213647072, + "learning_rate": 8.026984498726853e-10, + "loss": 0.1139, + "step": 5573 + }, + { + "epoch": 3.98, + "grad_norm": 4.969673215125041, + "learning_rate": 7.517480040497572e-10, + "loss": 0.0748, + "step": 5574 + }, + { + "epoch": 3.98, + "grad_norm": 8.019389192351877, + "learning_rate": 7.024678999900669e-10, + "loss": 0.1306, + "step": 5575 + }, + { + "epoch": 3.98, + "grad_norm": 6.295109377091253, + "learning_rate": 6.548581541593324e-10, + "loss": 0.1327, + "step": 5576 + }, + { + "epoch": 3.98, + "grad_norm": 8.69637612538181, + "learning_rate": 6.08918782464829e-10, + "loss": 0.1021, + "step": 5577 + }, + { + "epoch": 3.98, + "grad_norm": 6.261474454096343, + "learning_rate": 5.646498002553902e-10, + "loss": 0.0916, + "step": 5578 + }, + { + "epoch": 3.98, + "grad_norm": 16.482766831647567, + "learning_rate": 5.220512223219621e-10, + "loss": 0.2062, + "step": 5579 + }, + { + "epoch": 3.98, + "grad_norm": 7.218560157195067, + "learning_rate": 4.81123062898714e-10, + "loss": 0.1332, + "step": 5580 + }, + { + "epoch": 3.98, + "grad_norm": 4.056314731703903, + "learning_rate": 4.4186533565915293e-10, + "loss": 0.0832, + "step": 5581 + }, + { + "epoch": 3.98, + "grad_norm": 6.337018112627061, + "learning_rate": 4.042780537205637e-10, + "loss": 0.1094, + "step": 5582 + }, + { + "epoch": 3.99, + "grad_norm": 8.678745644886078, + "learning_rate": 3.6836122964178934e-10, + "loss": 0.1344, + "step": 5583 + }, + { + "epoch": 3.99, + "grad_norm": 8.323983732961677, + "learning_rate": 3.341148754232304e-10, + "loss": 0.112, + "step": 5584 + }, + { + "epoch": 3.99, + "grad_norm": 6.730233285070333, + "learning_rate": 3.015390025068454e-10, + "loss": 0.1141, + "step": 5585 + }, + { + "epoch": 3.99, + "grad_norm": 5.2887206705015775, + "learning_rate": 2.706336217767058e-10, + "loss": 0.0966, + "step": 5586 + }, + { + "epoch": 3.99, + "grad_norm": 10.523107370264057, + "learning_rate": 2.4139874355955105e-10, + "loss": 0.1647, + "step": 5587 + }, + { + "epoch": 3.99, + "grad_norm": 7.034757136628954, + "learning_rate": 2.138343776231233e-10, + "loss": 0.1074, + "step": 5588 + }, + { + "epoch": 3.99, + "grad_norm": 6.404901800476969, + "learning_rate": 1.8794053317672255e-10, + "loss": 0.1074, + "step": 5589 + }, + { + "epoch": 3.99, + "grad_norm": 6.157714487346645, + "learning_rate": 1.6371721887287196e-10, + "loss": 0.1122, + "step": 5590 + }, + { + "epoch": 3.99, + "grad_norm": 8.352229011291241, + "learning_rate": 1.4116444280398711e-10, + "loss": 0.1099, + "step": 5591 + }, + { + "epoch": 3.99, + "grad_norm": 5.989421477649134, + "learning_rate": 1.2028221250570683e-10, + "loss": 0.0992, + "step": 5592 + }, + { + "epoch": 3.99, + "grad_norm": 9.9800678266692, + "learning_rate": 1.0107053495522767e-10, + "loss": 0.1093, + "step": 5593 + }, + { + "epoch": 3.99, + "grad_norm": 11.303595468417102, + "learning_rate": 8.35294165718592e-11, + "loss": 0.15, + "step": 5594 + }, + { + "epoch": 3.99, + "grad_norm": 9.776566003654384, + "learning_rate": 6.765886321646874e-11, + "loss": 0.1285, + "step": 5595 + }, + { + "epoch": 3.99, + "grad_norm": 9.512879527949313, + "learning_rate": 5.345888019092638e-11, + "loss": 0.1599, + "step": 5596 + }, + { + "epoch": 4.0, + "grad_norm": 9.328733178787845, + "learning_rate": 4.092947224032529e-11, + "loss": 0.1466, + "step": 5597 + }, + { + "epoch": 4.0, + "grad_norm": 12.1455101579189, + "learning_rate": 3.007064355076139e-11, + "loss": 0.1296, + "step": 5598 + }, + { + "epoch": 4.0, + "grad_norm": 6.318880612200429, + "learning_rate": 2.088239775044354e-11, + "loss": 0.0999, + "step": 5599 + }, + { + "epoch": 4.0, + "grad_norm": 4.899339012947326, + "learning_rate": 1.3364737909138392e-11, + "loss": 0.0953, + "step": 5600 + }, + { + "epoch": 4.0, + "grad_norm": 7.1594901159283815, + "learning_rate": 7.517666539280654e-12, + "loss": 0.1243, + "step": 5601 + }, + { + "epoch": 4.0, + "grad_norm": 9.295531250814186, + "learning_rate": 3.3411855937526273e-12, + "loss": 0.1388, + "step": 5602 + }, + { + "epoch": 4.0, + "grad_norm": 9.718829953860425, + "learning_rate": 8.352964681046516e-13, + "loss": 0.1476, + "step": 5603 + }, + { + "epoch": 4.0, + "grad_norm": 5.714918142396491, + "learning_rate": 0.0, + "loss": 0.1135, + "step": 5604 + }, + { + "epoch": 4.0, + "step": 5604, + "total_flos": 750239094767616.0, + "train_loss": 0.36047464298572307, + "train_runtime": 15643.8891, + "train_samples_per_second": 22.919, + "train_steps_per_second": 0.358 + } + ], + "logging_steps": 1.0, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 24000, + "total_flos": 750239094767616.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}