diff --git "a/checkpoint-60204/trainer_state.json" "b/checkpoint-60204/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-60204/trainer_state.json" @@ -0,0 +1,16931 @@ +{ + "best_metric": 0.41398245096206665, + "best_model_checkpoint": "autotrain-y4yyk-zwyl5/checkpoint-60204", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 60204, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 52.348899841308594, + "learning_rate": 2.0760670984886232e-07, + "loss": 2.2769, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 28.035192489624023, + "learning_rate": 4.1521341969772463e-07, + "loss": 1.9582, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 32.72040557861328, + "learning_rate": 6.22820129546587e-07, + "loss": 1.5803, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 26.133230209350586, + "learning_rate": 8.304268393954493e-07, + "loss": 1.1819, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 22.792537689208984, + "learning_rate": 1.0380335492443115e-06, + "loss": 0.927, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 32.68214797973633, + "learning_rate": 1.245640259093174e-06, + "loss": 0.8114, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 22.632478713989258, + "learning_rate": 1.4532469689420362e-06, + "loss": 0.8253, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 19.682205200195312, + "learning_rate": 1.6608536787908985e-06, + "loss": 0.8464, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 12.069725036621094, + "learning_rate": 1.868460388639761e-06, + "loss": 0.7647, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 24.726076126098633, + "learning_rate": 2.076067098488623e-06, + "loss": 0.8112, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 33.87165451049805, + "learning_rate": 2.2836738083374856e-06, + "loss": 0.7807, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 18.317752838134766, + "learning_rate": 2.491280518186348e-06, + "loss": 0.7254, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 15.429110527038574, + "learning_rate": 2.6988872280352103e-06, + "loss": 0.8093, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 24.479049682617188, + "learning_rate": 2.9064939378840724e-06, + "loss": 0.8076, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 38.9715461730957, + "learning_rate": 3.114100647732935e-06, + "loss": 0.819, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 39.28196716308594, + "learning_rate": 3.321707357581797e-06, + "loss": 0.7896, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 20.000776290893555, + "learning_rate": 3.529314067430659e-06, + "loss": 0.8073, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 38.01788330078125, + "learning_rate": 3.736920777279522e-06, + "loss": 0.763, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 23.642269134521484, + "learning_rate": 3.944527487128384e-06, + "loss": 0.7345, + "step": 475 + }, + { + "epoch": 0.02, + "grad_norm": 11.121870994567871, + "learning_rate": 4.152134196977246e-06, + "loss": 0.8294, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 13.160523414611816, + "learning_rate": 4.359740906826109e-06, + "loss": 0.7715, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 26.109317779541016, + "learning_rate": 4.567347616674971e-06, + "loss": 0.6893, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 13.156538963317871, + "learning_rate": 4.774954326523833e-06, + "loss": 0.7625, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 16.808387756347656, + "learning_rate": 4.982561036372696e-06, + "loss": 0.743, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 14.919551849365234, + "learning_rate": 5.190167746221558e-06, + "loss": 0.8217, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 19.161149978637695, + "learning_rate": 5.3977744560704205e-06, + "loss": 0.7677, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 15.927726745605469, + "learning_rate": 5.605381165919283e-06, + "loss": 0.7144, + "step": 675 + }, + { + "epoch": 0.03, + "grad_norm": 13.279458045959473, + "learning_rate": 5.812987875768145e-06, + "loss": 0.7376, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 38.41741943359375, + "learning_rate": 6.020594585617007e-06, + "loss": 0.7485, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 14.47808837890625, + "learning_rate": 6.22820129546587e-06, + "loss": 0.7733, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 15.392128944396973, + "learning_rate": 6.4358080053147324e-06, + "loss": 0.6558, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 42.021183013916016, + "learning_rate": 6.643414715163594e-06, + "loss": 0.7423, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 21.30915641784668, + "learning_rate": 6.851021425012457e-06, + "loss": 0.7232, + "step": 825 + }, + { + "epoch": 0.04, + "grad_norm": 38.241329193115234, + "learning_rate": 7.058628134861318e-06, + "loss": 0.6753, + "step": 850 + }, + { + "epoch": 0.04, + "grad_norm": 23.84617042541504, + "learning_rate": 7.266234844710182e-06, + "loss": 0.6784, + "step": 875 + }, + { + "epoch": 0.04, + "grad_norm": 18.533485412597656, + "learning_rate": 7.473841554559044e-06, + "loss": 0.7696, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 54.84447479248047, + "learning_rate": 7.681448264407906e-06, + "loss": 0.6925, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 17.614309310913086, + "learning_rate": 7.889054974256769e-06, + "loss": 0.6852, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 19.179882049560547, + "learning_rate": 8.09666168410563e-06, + "loss": 0.6878, + "step": 975 + }, + { + "epoch": 0.05, + "grad_norm": 22.098026275634766, + "learning_rate": 8.304268393954492e-06, + "loss": 0.7911, + "step": 1000 + }, + { + "epoch": 0.05, + "grad_norm": 17.752079010009766, + "learning_rate": 8.511875103803356e-06, + "loss": 0.6564, + "step": 1025 + }, + { + "epoch": 0.05, + "grad_norm": 8.17482852935791, + "learning_rate": 8.719481813652217e-06, + "loss": 0.7486, + "step": 1050 + }, + { + "epoch": 0.05, + "grad_norm": 22.92099952697754, + "learning_rate": 8.92708852350108e-06, + "loss": 0.6488, + "step": 1075 + }, + { + "epoch": 0.05, + "grad_norm": 16.468435287475586, + "learning_rate": 9.134695233349942e-06, + "loss": 0.7229, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 17.73703384399414, + "learning_rate": 9.342301943198805e-06, + "loss": 0.6508, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 15.647977828979492, + "learning_rate": 9.549908653047666e-06, + "loss": 0.6739, + "step": 1150 + }, + { + "epoch": 0.06, + "grad_norm": 13.75230884552002, + "learning_rate": 9.75751536289653e-06, + "loss": 0.5561, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 43.96486282348633, + "learning_rate": 9.965122072745393e-06, + "loss": 0.7703, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 14.04170036315918, + "learning_rate": 1.0172728782594253e-05, + "loss": 0.7327, + "step": 1225 + }, + { + "epoch": 0.06, + "grad_norm": 7.018296718597412, + "learning_rate": 1.0380335492443116e-05, + "loss": 0.6681, + "step": 1250 + }, + { + "epoch": 0.06, + "grad_norm": 31.058786392211914, + "learning_rate": 1.0587942202291978e-05, + "loss": 0.71, + "step": 1275 + }, + { + "epoch": 0.06, + "grad_norm": 28.84793472290039, + "learning_rate": 1.0795548912140841e-05, + "loss": 0.5982, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 34.36916732788086, + "learning_rate": 1.1003155621989704e-05, + "loss": 0.7612, + "step": 1325 + }, + { + "epoch": 0.07, + "grad_norm": 22.128067016601562, + "learning_rate": 1.1210762331838566e-05, + "loss": 0.6835, + "step": 1350 + }, + { + "epoch": 0.07, + "grad_norm": 73.33786010742188, + "learning_rate": 1.1418369041687429e-05, + "loss": 0.5649, + "step": 1375 + }, + { + "epoch": 0.07, + "grad_norm": 12.824825286865234, + "learning_rate": 1.162597575153629e-05, + "loss": 0.638, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 11.171917915344238, + "learning_rate": 1.1833582461385152e-05, + "loss": 0.6838, + "step": 1425 + }, + { + "epoch": 0.07, + "grad_norm": 19.83733558654785, + "learning_rate": 1.2041189171234015e-05, + "loss": 0.6683, + "step": 1450 + }, + { + "epoch": 0.07, + "grad_norm": 19.352357864379883, + "learning_rate": 1.2248795881082877e-05, + "loss": 0.7044, + "step": 1475 + }, + { + "epoch": 0.07, + "grad_norm": 33.03659439086914, + "learning_rate": 1.245640259093174e-05, + "loss": 0.6281, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 160.98451232910156, + "learning_rate": 1.2664009300780602e-05, + "loss": 0.7068, + "step": 1525 + }, + { + "epoch": 0.08, + "grad_norm": 16.802547454833984, + "learning_rate": 1.2871616010629465e-05, + "loss": 0.676, + "step": 1550 + }, + { + "epoch": 0.08, + "grad_norm": 19.06223487854004, + "learning_rate": 1.3079222720478326e-05, + "loss": 0.7226, + "step": 1575 + }, + { + "epoch": 0.08, + "grad_norm": 18.991416931152344, + "learning_rate": 1.3286829430327188e-05, + "loss": 0.6897, + "step": 1600 + }, + { + "epoch": 0.08, + "grad_norm": 3.39277982711792, + "learning_rate": 1.3494436140176051e-05, + "loss": 0.4943, + "step": 1625 + }, + { + "epoch": 0.08, + "grad_norm": 14.731120109558105, + "learning_rate": 1.3702042850024913e-05, + "loss": 0.7963, + "step": 1650 + }, + { + "epoch": 0.08, + "grad_norm": 17.09242057800293, + "learning_rate": 1.3909649559873774e-05, + "loss": 0.6485, + "step": 1675 + }, + { + "epoch": 0.08, + "grad_norm": 49.163719177246094, + "learning_rate": 1.4117256269722637e-05, + "loss": 0.5697, + "step": 1700 + }, + { + "epoch": 0.09, + "grad_norm": 6.796900749206543, + "learning_rate": 1.4324862979571501e-05, + "loss": 0.6677, + "step": 1725 + }, + { + "epoch": 0.09, + "grad_norm": 13.346964836120605, + "learning_rate": 1.4532469689420364e-05, + "loss": 0.632, + "step": 1750 + }, + { + "epoch": 0.09, + "grad_norm": 19.15069007873535, + "learning_rate": 1.4740076399269226e-05, + "loss": 0.6362, + "step": 1775 + }, + { + "epoch": 0.09, + "grad_norm": 14.083284378051758, + "learning_rate": 1.4947683109118089e-05, + "loss": 0.6034, + "step": 1800 + }, + { + "epoch": 0.09, + "grad_norm": 23.270427703857422, + "learning_rate": 1.515528981896695e-05, + "loss": 0.6496, + "step": 1825 + }, + { + "epoch": 0.09, + "grad_norm": 27.949331283569336, + "learning_rate": 1.5362896528815812e-05, + "loss": 0.6456, + "step": 1850 + }, + { + "epoch": 0.09, + "grad_norm": 9.217957496643066, + "learning_rate": 1.5570503238664673e-05, + "loss": 0.5383, + "step": 1875 + }, + { + "epoch": 0.09, + "grad_norm": 26.01340675354004, + "learning_rate": 1.5778109948513537e-05, + "loss": 0.5334, + "step": 1900 + }, + { + "epoch": 0.1, + "grad_norm": 4.734766483306885, + "learning_rate": 1.5985716658362398e-05, + "loss": 0.6377, + "step": 1925 + }, + { + "epoch": 0.1, + "grad_norm": 9.05659008026123, + "learning_rate": 1.619332336821126e-05, + "loss": 0.6443, + "step": 1950 + }, + { + "epoch": 0.1, + "grad_norm": 12.056586265563965, + "learning_rate": 1.6400930078060123e-05, + "loss": 0.6515, + "step": 1975 + }, + { + "epoch": 0.1, + "grad_norm": 9.164318084716797, + "learning_rate": 1.6608536787908984e-05, + "loss": 0.6515, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 7.113138675689697, + "learning_rate": 1.681614349775785e-05, + "loss": 0.5369, + "step": 2025 + }, + { + "epoch": 0.1, + "grad_norm": 3.411200523376465, + "learning_rate": 1.7023750207606713e-05, + "loss": 0.5783, + "step": 2050 + }, + { + "epoch": 0.1, + "grad_norm": 11.924202919006348, + "learning_rate": 1.7231356917455573e-05, + "loss": 0.5549, + "step": 2075 + }, + { + "epoch": 0.1, + "grad_norm": 5.846712112426758, + "learning_rate": 1.7438963627304434e-05, + "loss": 0.5598, + "step": 2100 + }, + { + "epoch": 0.11, + "grad_norm": 16.387100219726562, + "learning_rate": 1.76465703371533e-05, + "loss": 0.5761, + "step": 2125 + }, + { + "epoch": 0.11, + "grad_norm": 5.820246696472168, + "learning_rate": 1.785417704700216e-05, + "loss": 0.5706, + "step": 2150 + }, + { + "epoch": 0.11, + "grad_norm": 5.995615482330322, + "learning_rate": 1.8061783756851024e-05, + "loss": 0.5396, + "step": 2175 + }, + { + "epoch": 0.11, + "grad_norm": 11.19775104522705, + "learning_rate": 1.8269390466699885e-05, + "loss": 0.5248, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 12.269454956054688, + "learning_rate": 1.8476997176548745e-05, + "loss": 0.5094, + "step": 2225 + }, + { + "epoch": 0.11, + "grad_norm": 13.744296073913574, + "learning_rate": 1.868460388639761e-05, + "loss": 0.4868, + "step": 2250 + }, + { + "epoch": 0.11, + "grad_norm": 8.654863357543945, + "learning_rate": 1.889221059624647e-05, + "loss": 0.6201, + "step": 2275 + }, + { + "epoch": 0.11, + "grad_norm": 26.114696502685547, + "learning_rate": 1.909981730609533e-05, + "loss": 0.5775, + "step": 2300 + }, + { + "epoch": 0.12, + "grad_norm": 5.729313373565674, + "learning_rate": 1.9307424015944196e-05, + "loss": 0.8127, + "step": 2325 + }, + { + "epoch": 0.12, + "grad_norm": 10.743517875671387, + "learning_rate": 1.951503072579306e-05, + "loss": 0.7149, + "step": 2350 + }, + { + "epoch": 0.12, + "grad_norm": 24.40814781188965, + "learning_rate": 1.972263743564192e-05, + "loss": 0.633, + "step": 2375 + }, + { + "epoch": 0.12, + "grad_norm": 50.178707122802734, + "learning_rate": 1.9930244145490785e-05, + "loss": 0.6349, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 29.33345603942871, + "learning_rate": 2.0137850855339646e-05, + "loss": 0.5914, + "step": 2425 + }, + { + "epoch": 0.12, + "grad_norm": 21.70145606994629, + "learning_rate": 2.0345457565188507e-05, + "loss": 0.6143, + "step": 2450 + }, + { + "epoch": 0.12, + "grad_norm": 40.49439239501953, + "learning_rate": 2.055306427503737e-05, + "loss": 0.7122, + "step": 2475 + }, + { + "epoch": 0.12, + "grad_norm": 14.44583797454834, + "learning_rate": 2.0760670984886232e-05, + "loss": 0.5066, + "step": 2500 + }, + { + "epoch": 0.13, + "grad_norm": 8.811293601989746, + "learning_rate": 2.0968277694735096e-05, + "loss": 0.6458, + "step": 2525 + }, + { + "epoch": 0.13, + "grad_norm": 10.258934020996094, + "learning_rate": 2.1175884404583957e-05, + "loss": 0.596, + "step": 2550 + }, + { + "epoch": 0.13, + "grad_norm": 9.4044828414917, + "learning_rate": 2.1383491114432818e-05, + "loss": 0.5651, + "step": 2575 + }, + { + "epoch": 0.13, + "grad_norm": 11.9262056350708, + "learning_rate": 2.1591097824281682e-05, + "loss": 0.5821, + "step": 2600 + }, + { + "epoch": 0.13, + "grad_norm": 4.892654895782471, + "learning_rate": 2.1798704534130543e-05, + "loss": 0.5606, + "step": 2625 + }, + { + "epoch": 0.13, + "grad_norm": 58.10175323486328, + "learning_rate": 2.2006311243979407e-05, + "loss": 0.5521, + "step": 2650 + }, + { + "epoch": 0.13, + "grad_norm": 22.51751708984375, + "learning_rate": 2.221391795382827e-05, + "loss": 0.5206, + "step": 2675 + }, + { + "epoch": 0.13, + "grad_norm": 12.897263526916504, + "learning_rate": 2.2421524663677132e-05, + "loss": 0.6067, + "step": 2700 + }, + { + "epoch": 0.14, + "grad_norm": 16.60735511779785, + "learning_rate": 2.2629131373525993e-05, + "loss": 0.5365, + "step": 2725 + }, + { + "epoch": 0.14, + "grad_norm": 7.0244622230529785, + "learning_rate": 2.2836738083374857e-05, + "loss": 0.5689, + "step": 2750 + }, + { + "epoch": 0.14, + "grad_norm": 13.883830070495605, + "learning_rate": 2.3044344793223718e-05, + "loss": 0.5949, + "step": 2775 + }, + { + "epoch": 0.14, + "grad_norm": 6.281734466552734, + "learning_rate": 2.325195150307258e-05, + "loss": 0.5586, + "step": 2800 + }, + { + "epoch": 0.14, + "grad_norm": 15.306391716003418, + "learning_rate": 2.3459558212921443e-05, + "loss": 0.5077, + "step": 2825 + }, + { + "epoch": 0.14, + "grad_norm": 7.228984355926514, + "learning_rate": 2.3667164922770304e-05, + "loss": 0.5468, + "step": 2850 + }, + { + "epoch": 0.14, + "grad_norm": 21.7454891204834, + "learning_rate": 2.3874771632619165e-05, + "loss": 0.6936, + "step": 2875 + }, + { + "epoch": 0.14, + "grad_norm": 7.734707355499268, + "learning_rate": 2.408237834246803e-05, + "loss": 0.4939, + "step": 2900 + }, + { + "epoch": 0.15, + "grad_norm": 3.2207603454589844, + "learning_rate": 2.428998505231689e-05, + "loss": 0.5253, + "step": 2925 + }, + { + "epoch": 0.15, + "grad_norm": 22.73624038696289, + "learning_rate": 2.4497591762165754e-05, + "loss": 0.5403, + "step": 2950 + }, + { + "epoch": 0.15, + "grad_norm": 82.33477020263672, + "learning_rate": 2.470519847201462e-05, + "loss": 0.5683, + "step": 2975 + }, + { + "epoch": 0.15, + "grad_norm": 19.013668060302734, + "learning_rate": 2.491280518186348e-05, + "loss": 0.56, + "step": 3000 + }, + { + "epoch": 0.15, + "grad_norm": 11.089810371398926, + "learning_rate": 2.512041189171234e-05, + "loss": 0.5, + "step": 3025 + }, + { + "epoch": 0.15, + "grad_norm": 10.986812591552734, + "learning_rate": 2.5328018601561205e-05, + "loss": 0.5554, + "step": 3050 + }, + { + "epoch": 0.15, + "grad_norm": 6.21177339553833, + "learning_rate": 2.5535625311410066e-05, + "loss": 0.4463, + "step": 3075 + }, + { + "epoch": 0.15, + "grad_norm": 8.5263090133667, + "learning_rate": 2.574323202125893e-05, + "loss": 0.5294, + "step": 3100 + }, + { + "epoch": 0.16, + "grad_norm": 11.42646598815918, + "learning_rate": 2.595083873110779e-05, + "loss": 0.5584, + "step": 3125 + }, + { + "epoch": 0.16, + "grad_norm": 46.24335479736328, + "learning_rate": 2.615844544095665e-05, + "loss": 0.5024, + "step": 3150 + }, + { + "epoch": 0.16, + "grad_norm": 7.996013164520264, + "learning_rate": 2.6366052150805516e-05, + "loss": 0.474, + "step": 3175 + }, + { + "epoch": 0.16, + "grad_norm": 53.60097122192383, + "learning_rate": 2.6573658860654377e-05, + "loss": 0.5053, + "step": 3200 + }, + { + "epoch": 0.16, + "grad_norm": 19.06157112121582, + "learning_rate": 2.6781265570503237e-05, + "loss": 0.5921, + "step": 3225 + }, + { + "epoch": 0.16, + "grad_norm": 37.257633209228516, + "learning_rate": 2.6988872280352102e-05, + "loss": 0.5614, + "step": 3250 + }, + { + "epoch": 0.16, + "grad_norm": 16.04386329650879, + "learning_rate": 2.7196478990200963e-05, + "loss": 0.6043, + "step": 3275 + }, + { + "epoch": 0.16, + "grad_norm": 3.808250665664673, + "learning_rate": 2.7404085700049827e-05, + "loss": 0.5175, + "step": 3300 + }, + { + "epoch": 0.17, + "grad_norm": 10.740408897399902, + "learning_rate": 2.7611692409898688e-05, + "loss": 0.4591, + "step": 3325 + }, + { + "epoch": 0.17, + "grad_norm": 10.769336700439453, + "learning_rate": 2.781929911974755e-05, + "loss": 0.6154, + "step": 3350 + }, + { + "epoch": 0.17, + "grad_norm": 11.40727424621582, + "learning_rate": 2.8026905829596413e-05, + "loss": 0.4908, + "step": 3375 + }, + { + "epoch": 0.17, + "grad_norm": 3.83750319480896, + "learning_rate": 2.8234512539445274e-05, + "loss": 0.5967, + "step": 3400 + }, + { + "epoch": 0.17, + "grad_norm": 27.651927947998047, + "learning_rate": 2.844211924929414e-05, + "loss": 0.5095, + "step": 3425 + }, + { + "epoch": 0.17, + "grad_norm": 4.459410190582275, + "learning_rate": 2.8649725959143002e-05, + "loss": 0.5288, + "step": 3450 + }, + { + "epoch": 0.17, + "grad_norm": 5.757278919219971, + "learning_rate": 2.8857332668991866e-05, + "loss": 0.5613, + "step": 3475 + }, + { + "epoch": 0.17, + "grad_norm": 20.3660888671875, + "learning_rate": 2.9064939378840727e-05, + "loss": 0.5968, + "step": 3500 + }, + { + "epoch": 0.18, + "grad_norm": 4.496192932128906, + "learning_rate": 2.9272546088689588e-05, + "loss": 0.4816, + "step": 3525 + }, + { + "epoch": 0.18, + "grad_norm": 10.203181266784668, + "learning_rate": 2.9480152798538452e-05, + "loss": 0.5615, + "step": 3550 + }, + { + "epoch": 0.18, + "grad_norm": 18.76901626586914, + "learning_rate": 2.9687759508387313e-05, + "loss": 0.5196, + "step": 3575 + }, + { + "epoch": 0.18, + "grad_norm": 51.70024490356445, + "learning_rate": 2.9895366218236178e-05, + "loss": 0.6804, + "step": 3600 + }, + { + "epoch": 0.18, + "grad_norm": 2.5735995769500732, + "learning_rate": 3.010297292808504e-05, + "loss": 0.5755, + "step": 3625 + }, + { + "epoch": 0.18, + "grad_norm": 11.979140281677246, + "learning_rate": 3.03105796379339e-05, + "loss": 0.5352, + "step": 3650 + }, + { + "epoch": 0.18, + "grad_norm": 2.2723140716552734, + "learning_rate": 3.051818634778276e-05, + "loss": 0.5037, + "step": 3675 + }, + { + "epoch": 0.18, + "grad_norm": 1.0300755500793457, + "learning_rate": 3.0725793057631624e-05, + "loss": 0.4458, + "step": 3700 + }, + { + "epoch": 0.19, + "grad_norm": 7.549310207366943, + "learning_rate": 3.093339976748049e-05, + "loss": 0.5912, + "step": 3725 + }, + { + "epoch": 0.19, + "grad_norm": 1.4594346284866333, + "learning_rate": 3.1141006477329346e-05, + "loss": 0.6347, + "step": 3750 + }, + { + "epoch": 0.19, + "grad_norm": 8.113776206970215, + "learning_rate": 3.134861318717821e-05, + "loss": 0.5614, + "step": 3775 + }, + { + "epoch": 0.19, + "grad_norm": 1.6310756206512451, + "learning_rate": 3.1556219897027075e-05, + "loss": 0.5177, + "step": 3800 + }, + { + "epoch": 0.19, + "grad_norm": 5.420190811157227, + "learning_rate": 3.176382660687593e-05, + "loss": 0.5223, + "step": 3825 + }, + { + "epoch": 0.19, + "grad_norm": 7.095544338226318, + "learning_rate": 3.1971433316724796e-05, + "loss": 0.4932, + "step": 3850 + }, + { + "epoch": 0.19, + "grad_norm": 17.664386749267578, + "learning_rate": 3.217904002657366e-05, + "loss": 0.5635, + "step": 3875 + }, + { + "epoch": 0.19, + "grad_norm": 25.511335372924805, + "learning_rate": 3.238664673642252e-05, + "loss": 0.4911, + "step": 3900 + }, + { + "epoch": 0.2, + "grad_norm": 20.20481300354004, + "learning_rate": 3.259425344627138e-05, + "loss": 0.481, + "step": 3925 + }, + { + "epoch": 0.2, + "grad_norm": 1.4173545837402344, + "learning_rate": 3.2801860156120247e-05, + "loss": 0.4748, + "step": 3950 + }, + { + "epoch": 0.2, + "grad_norm": 15.927542686462402, + "learning_rate": 3.300946686596911e-05, + "loss": 0.3865, + "step": 3975 + }, + { + "epoch": 0.2, + "grad_norm": 7.046652317047119, + "learning_rate": 3.321707357581797e-05, + "loss": 0.6209, + "step": 4000 + }, + { + "epoch": 0.2, + "grad_norm": 5.920745849609375, + "learning_rate": 3.342468028566683e-05, + "loss": 0.5593, + "step": 4025 + }, + { + "epoch": 0.2, + "grad_norm": 13.87470817565918, + "learning_rate": 3.36322869955157e-05, + "loss": 0.5426, + "step": 4050 + }, + { + "epoch": 0.2, + "grad_norm": 1.830471396446228, + "learning_rate": 3.383989370536456e-05, + "loss": 0.3738, + "step": 4075 + }, + { + "epoch": 0.2, + "grad_norm": 57.95819091796875, + "learning_rate": 3.4047500415213425e-05, + "loss": 0.7062, + "step": 4100 + }, + { + "epoch": 0.21, + "grad_norm": 6.367457389831543, + "learning_rate": 3.425510712506228e-05, + "loss": 0.5627, + "step": 4125 + }, + { + "epoch": 0.21, + "grad_norm": 26.165958404541016, + "learning_rate": 3.446271383491115e-05, + "loss": 0.5509, + "step": 4150 + }, + { + "epoch": 0.21, + "grad_norm": 32.05531692504883, + "learning_rate": 3.467032054476001e-05, + "loss": 0.5006, + "step": 4175 + }, + { + "epoch": 0.21, + "grad_norm": 6.160696029663086, + "learning_rate": 3.487792725460887e-05, + "loss": 0.5143, + "step": 4200 + }, + { + "epoch": 0.21, + "grad_norm": 5.989635944366455, + "learning_rate": 3.508553396445773e-05, + "loss": 0.5025, + "step": 4225 + }, + { + "epoch": 0.21, + "grad_norm": 10.148011207580566, + "learning_rate": 3.52931406743066e-05, + "loss": 0.626, + "step": 4250 + }, + { + "epoch": 0.21, + "grad_norm": 28.23525619506836, + "learning_rate": 3.550074738415546e-05, + "loss": 0.4305, + "step": 4275 + }, + { + "epoch": 0.21, + "grad_norm": 3.1064085960388184, + "learning_rate": 3.570835409400432e-05, + "loss": 0.5236, + "step": 4300 + }, + { + "epoch": 0.22, + "grad_norm": 3.7058463096618652, + "learning_rate": 3.591596080385318e-05, + "loss": 0.5068, + "step": 4325 + }, + { + "epoch": 0.22, + "grad_norm": 8.256972312927246, + "learning_rate": 3.612356751370205e-05, + "loss": 0.4297, + "step": 4350 + }, + { + "epoch": 0.22, + "grad_norm": 6.961075782775879, + "learning_rate": 3.6331174223550905e-05, + "loss": 0.4994, + "step": 4375 + }, + { + "epoch": 0.22, + "grad_norm": 1.5909925699234009, + "learning_rate": 3.653878093339977e-05, + "loss": 0.764, + "step": 4400 + }, + { + "epoch": 0.22, + "grad_norm": 15.037834167480469, + "learning_rate": 3.674638764324863e-05, + "loss": 0.5137, + "step": 4425 + }, + { + "epoch": 0.22, + "grad_norm": 3.972102642059326, + "learning_rate": 3.695399435309749e-05, + "loss": 0.4061, + "step": 4450 + }, + { + "epoch": 0.22, + "grad_norm": 26.55154800415039, + "learning_rate": 3.7161601062946355e-05, + "loss": 0.59, + "step": 4475 + }, + { + "epoch": 0.22, + "grad_norm": 8.56869888305664, + "learning_rate": 3.736920777279522e-05, + "loss": 0.544, + "step": 4500 + }, + { + "epoch": 0.23, + "grad_norm": 20.557531356811523, + "learning_rate": 3.757681448264408e-05, + "loss": 0.5488, + "step": 4525 + }, + { + "epoch": 0.23, + "grad_norm": 5.790639877319336, + "learning_rate": 3.778442119249294e-05, + "loss": 0.4406, + "step": 4550 + }, + { + "epoch": 0.23, + "grad_norm": 23.40848731994629, + "learning_rate": 3.7992027902341805e-05, + "loss": 0.6572, + "step": 4575 + }, + { + "epoch": 0.23, + "grad_norm": 1.9709395170211792, + "learning_rate": 3.819963461219066e-05, + "loss": 0.4951, + "step": 4600 + }, + { + "epoch": 0.23, + "grad_norm": 4.339324951171875, + "learning_rate": 3.840724132203953e-05, + "loss": 0.5135, + "step": 4625 + }, + { + "epoch": 0.23, + "grad_norm": 80.32009887695312, + "learning_rate": 3.861484803188839e-05, + "loss": 0.4385, + "step": 4650 + }, + { + "epoch": 0.23, + "grad_norm": 22.651634216308594, + "learning_rate": 3.8822454741737256e-05, + "loss": 0.533, + "step": 4675 + }, + { + "epoch": 0.23, + "grad_norm": 2.43072247505188, + "learning_rate": 3.903006145158612e-05, + "loss": 0.523, + "step": 4700 + }, + { + "epoch": 0.24, + "grad_norm": 19.212390899658203, + "learning_rate": 3.9237668161434984e-05, + "loss": 0.6725, + "step": 4725 + }, + { + "epoch": 0.24, + "grad_norm": 2.255681276321411, + "learning_rate": 3.944527487128384e-05, + "loss": 0.4607, + "step": 4750 + }, + { + "epoch": 0.24, + "grad_norm": 11.483686447143555, + "learning_rate": 3.9652881581132706e-05, + "loss": 0.4615, + "step": 4775 + }, + { + "epoch": 0.24, + "grad_norm": 4.3608527183532715, + "learning_rate": 3.986048829098157e-05, + "loss": 0.5218, + "step": 4800 + }, + { + "epoch": 0.24, + "grad_norm": 36.954200744628906, + "learning_rate": 4.006809500083043e-05, + "loss": 0.4754, + "step": 4825 + }, + { + "epoch": 0.24, + "grad_norm": 1.0579265356063843, + "learning_rate": 4.027570171067929e-05, + "loss": 0.4862, + "step": 4850 + }, + { + "epoch": 0.24, + "grad_norm": 3.5092215538024902, + "learning_rate": 4.0483308420528156e-05, + "loss": 0.6089, + "step": 4875 + }, + { + "epoch": 0.24, + "grad_norm": 7.657075881958008, + "learning_rate": 4.0690915130377013e-05, + "loss": 0.4453, + "step": 4900 + }, + { + "epoch": 0.25, + "grad_norm": 121.91183471679688, + "learning_rate": 4.089852184022588e-05, + "loss": 0.4193, + "step": 4925 + }, + { + "epoch": 0.25, + "grad_norm": 23.41520118713379, + "learning_rate": 4.110612855007474e-05, + "loss": 0.5111, + "step": 4950 + }, + { + "epoch": 0.25, + "grad_norm": 8.844143867492676, + "learning_rate": 4.13137352599236e-05, + "loss": 0.5808, + "step": 4975 + }, + { + "epoch": 0.25, + "grad_norm": 2.992230176925659, + "learning_rate": 4.1521341969772464e-05, + "loss": 0.4588, + "step": 5000 + }, + { + "epoch": 0.25, + "grad_norm": 3.5053603649139404, + "learning_rate": 4.172894867962133e-05, + "loss": 0.3961, + "step": 5025 + }, + { + "epoch": 0.25, + "grad_norm": 6.6925740242004395, + "learning_rate": 4.193655538947019e-05, + "loss": 0.5423, + "step": 5050 + }, + { + "epoch": 0.25, + "grad_norm": 15.226505279541016, + "learning_rate": 4.214416209931905e-05, + "loss": 0.5825, + "step": 5075 + }, + { + "epoch": 0.25, + "grad_norm": 3.675689220428467, + "learning_rate": 4.2351768809167914e-05, + "loss": 0.53, + "step": 5100 + }, + { + "epoch": 0.26, + "grad_norm": 17.315486907958984, + "learning_rate": 4.255937551901678e-05, + "loss": 0.4808, + "step": 5125 + }, + { + "epoch": 0.26, + "grad_norm": 51.50734329223633, + "learning_rate": 4.2766982228865636e-05, + "loss": 0.5167, + "step": 5150 + }, + { + "epoch": 0.26, + "grad_norm": 0.9922437071800232, + "learning_rate": 4.29745889387145e-05, + "loss": 0.6093, + "step": 5175 + }, + { + "epoch": 0.26, + "grad_norm": 4.319682598114014, + "learning_rate": 4.3182195648563364e-05, + "loss": 0.4104, + "step": 5200 + }, + { + "epoch": 0.26, + "grad_norm": 38.934505462646484, + "learning_rate": 4.338980235841222e-05, + "loss": 0.524, + "step": 5225 + }, + { + "epoch": 0.26, + "grad_norm": 19.051517486572266, + "learning_rate": 4.3597409068261086e-05, + "loss": 0.5192, + "step": 5250 + }, + { + "epoch": 0.26, + "grad_norm": 25.00204849243164, + "learning_rate": 4.380501577810995e-05, + "loss": 0.5483, + "step": 5275 + }, + { + "epoch": 0.26, + "grad_norm": 46.21389389038086, + "learning_rate": 4.4012622487958814e-05, + "loss": 0.4743, + "step": 5300 + }, + { + "epoch": 0.27, + "grad_norm": 6.3099565505981445, + "learning_rate": 4.422022919780768e-05, + "loss": 0.5155, + "step": 5325 + }, + { + "epoch": 0.27, + "grad_norm": 17.59408950805664, + "learning_rate": 4.442783590765654e-05, + "loss": 0.5393, + "step": 5350 + }, + { + "epoch": 0.27, + "grad_norm": 2.886939287185669, + "learning_rate": 4.46354426175054e-05, + "loss": 0.6733, + "step": 5375 + }, + { + "epoch": 0.27, + "grad_norm": 3.5486369132995605, + "learning_rate": 4.4843049327354265e-05, + "loss": 0.6322, + "step": 5400 + }, + { + "epoch": 0.27, + "grad_norm": 26.038715362548828, + "learning_rate": 4.505065603720313e-05, + "loss": 0.5641, + "step": 5425 + }, + { + "epoch": 0.27, + "grad_norm": 5.1668901443481445, + "learning_rate": 4.5258262747051986e-05, + "loss": 0.5498, + "step": 5450 + }, + { + "epoch": 0.27, + "grad_norm": 4.700604438781738, + "learning_rate": 4.546586945690085e-05, + "loss": 0.4702, + "step": 5475 + }, + { + "epoch": 0.27, + "grad_norm": 16.96105194091797, + "learning_rate": 4.5673476166749715e-05, + "loss": 0.465, + "step": 5500 + }, + { + "epoch": 0.28, + "grad_norm": 4.128838062286377, + "learning_rate": 4.588108287659857e-05, + "loss": 0.519, + "step": 5525 + }, + { + "epoch": 0.28, + "grad_norm": 18.705781936645508, + "learning_rate": 4.6088689586447437e-05, + "loss": 0.4802, + "step": 5550 + }, + { + "epoch": 0.28, + "grad_norm": 1.0547155141830444, + "learning_rate": 4.62962962962963e-05, + "loss": 0.556, + "step": 5575 + }, + { + "epoch": 0.28, + "grad_norm": 7.441427230834961, + "learning_rate": 4.650390300614516e-05, + "loss": 0.6299, + "step": 5600 + }, + { + "epoch": 0.28, + "grad_norm": 2.871375799179077, + "learning_rate": 4.671150971599402e-05, + "loss": 0.6662, + "step": 5625 + }, + { + "epoch": 0.28, + "grad_norm": 5.272790431976318, + "learning_rate": 4.691911642584289e-05, + "loss": 0.5033, + "step": 5650 + }, + { + "epoch": 0.28, + "grad_norm": 4.835639476776123, + "learning_rate": 4.7126723135691744e-05, + "loss": 0.3825, + "step": 5675 + }, + { + "epoch": 0.28, + "grad_norm": 8.706759452819824, + "learning_rate": 4.733432984554061e-05, + "loss": 0.5942, + "step": 5700 + }, + { + "epoch": 0.29, + "grad_norm": 7.657235145568848, + "learning_rate": 4.754193655538947e-05, + "loss": 0.5206, + "step": 5725 + }, + { + "epoch": 0.29, + "grad_norm": 7.8590874671936035, + "learning_rate": 4.774954326523833e-05, + "loss": 0.4882, + "step": 5750 + }, + { + "epoch": 0.29, + "grad_norm": 8.485298156738281, + "learning_rate": 4.7957149975087194e-05, + "loss": 0.4087, + "step": 5775 + }, + { + "epoch": 0.29, + "grad_norm": 2.479032039642334, + "learning_rate": 4.816475668493606e-05, + "loss": 0.5665, + "step": 5800 + }, + { + "epoch": 0.29, + "grad_norm": 50.31918716430664, + "learning_rate": 4.837236339478492e-05, + "loss": 0.5743, + "step": 5825 + }, + { + "epoch": 0.29, + "grad_norm": 22.77115821838379, + "learning_rate": 4.857997010463378e-05, + "loss": 0.5766, + "step": 5850 + }, + { + "epoch": 0.29, + "grad_norm": 13.277716636657715, + "learning_rate": 4.8787576814482645e-05, + "loss": 0.6116, + "step": 5875 + }, + { + "epoch": 0.29, + "grad_norm": 6.051513671875, + "learning_rate": 4.899518352433151e-05, + "loss": 0.4923, + "step": 5900 + }, + { + "epoch": 0.3, + "grad_norm": 0.7650220394134521, + "learning_rate": 4.920279023418037e-05, + "loss": 0.4387, + "step": 5925 + }, + { + "epoch": 0.3, + "grad_norm": 3.7846858501434326, + "learning_rate": 4.941039694402924e-05, + "loss": 0.5334, + "step": 5950 + }, + { + "epoch": 0.3, + "grad_norm": 6.026222229003906, + "learning_rate": 4.9618003653878095e-05, + "loss": 0.4932, + "step": 5975 + }, + { + "epoch": 0.3, + "grad_norm": 16.16804313659668, + "learning_rate": 4.982561036372696e-05, + "loss": 0.4351, + "step": 6000 + }, + { + "epoch": 0.3, + "grad_norm": 20.083784103393555, + "learning_rate": 4.9996308805344855e-05, + "loss": 0.3674, + "step": 6025 + }, + { + "epoch": 0.3, + "grad_norm": 7.546694755554199, + "learning_rate": 4.9973238838750166e-05, + "loss": 0.5934, + "step": 6050 + }, + { + "epoch": 0.3, + "grad_norm": 33.574642181396484, + "learning_rate": 4.995016887215547e-05, + "loss": 0.4848, + "step": 6075 + }, + { + "epoch": 0.3, + "grad_norm": 1.2845981121063232, + "learning_rate": 4.992709890556079e-05, + "loss": 0.4931, + "step": 6100 + }, + { + "epoch": 0.31, + "grad_norm": 1.067752718925476, + "learning_rate": 4.9904028938966094e-05, + "loss": 0.5156, + "step": 6125 + }, + { + "epoch": 0.31, + "grad_norm": 8.510303497314453, + "learning_rate": 4.988095897237141e-05, + "loss": 0.4853, + "step": 6150 + }, + { + "epoch": 0.31, + "grad_norm": 24.643705368041992, + "learning_rate": 4.985788900577672e-05, + "loss": 0.5828, + "step": 6175 + }, + { + "epoch": 0.31, + "grad_norm": 3.2508111000061035, + "learning_rate": 4.9834819039182034e-05, + "loss": 0.528, + "step": 6200 + }, + { + "epoch": 0.31, + "grad_norm": 3.186040163040161, + "learning_rate": 4.9811749072587346e-05, + "loss": 0.4924, + "step": 6225 + }, + { + "epoch": 0.31, + "grad_norm": 5.922456741333008, + "learning_rate": 4.978867910599266e-05, + "loss": 0.4725, + "step": 6250 + }, + { + "epoch": 0.31, + "grad_norm": 18.49318504333496, + "learning_rate": 4.976560913939797e-05, + "loss": 0.5518, + "step": 6275 + }, + { + "epoch": 0.31, + "grad_norm": 16.115798950195312, + "learning_rate": 4.974253917280328e-05, + "loss": 0.5084, + "step": 6300 + }, + { + "epoch": 0.32, + "grad_norm": 27.040874481201172, + "learning_rate": 4.971946920620859e-05, + "loss": 0.6729, + "step": 6325 + }, + { + "epoch": 0.32, + "grad_norm": 3.820512056350708, + "learning_rate": 4.96963992396139e-05, + "loss": 0.5121, + "step": 6350 + }, + { + "epoch": 0.32, + "grad_norm": 46.661659240722656, + "learning_rate": 4.9673329273019214e-05, + "loss": 0.4332, + "step": 6375 + }, + { + "epoch": 0.32, + "grad_norm": 5.819820404052734, + "learning_rate": 4.965025930642453e-05, + "loss": 0.4221, + "step": 6400 + }, + { + "epoch": 0.32, + "grad_norm": 2.76592755317688, + "learning_rate": 4.962718933982984e-05, + "loss": 0.4391, + "step": 6425 + }, + { + "epoch": 0.32, + "grad_norm": 3.112344980239868, + "learning_rate": 4.960411937323515e-05, + "loss": 0.3965, + "step": 6450 + }, + { + "epoch": 0.32, + "grad_norm": 0.7750204205513, + "learning_rate": 4.958104940664046e-05, + "loss": 0.4986, + "step": 6475 + }, + { + "epoch": 0.32, + "grad_norm": 108.8819808959961, + "learning_rate": 4.955797944004577e-05, + "loss": 0.511, + "step": 6500 + }, + { + "epoch": 0.33, + "grad_norm": 13.656049728393555, + "learning_rate": 4.953490947345108e-05, + "loss": 0.4491, + "step": 6525 + }, + { + "epoch": 0.33, + "grad_norm": 33.91952896118164, + "learning_rate": 4.9511839506856393e-05, + "loss": 0.4709, + "step": 6550 + }, + { + "epoch": 0.33, + "grad_norm": 142.18125915527344, + "learning_rate": 4.948876954026171e-05, + "loss": 0.5995, + "step": 6575 + }, + { + "epoch": 0.33, + "grad_norm": 5.344429016113281, + "learning_rate": 4.9465699573667016e-05, + "loss": 0.602, + "step": 6600 + }, + { + "epoch": 0.33, + "grad_norm": 0.9350224137306213, + "learning_rate": 4.9442629607072334e-05, + "loss": 0.4517, + "step": 6625 + }, + { + "epoch": 0.33, + "grad_norm": 10.79479694366455, + "learning_rate": 4.941955964047764e-05, + "loss": 0.482, + "step": 6650 + }, + { + "epoch": 0.33, + "grad_norm": 22.03511619567871, + "learning_rate": 4.939648967388296e-05, + "loss": 0.4952, + "step": 6675 + }, + { + "epoch": 0.33, + "grad_norm": 3.9321937561035156, + "learning_rate": 4.937341970728827e-05, + "loss": 0.4992, + "step": 6700 + }, + { + "epoch": 0.34, + "grad_norm": 2.330125331878662, + "learning_rate": 4.935034974069358e-05, + "loss": 0.5021, + "step": 6725 + }, + { + "epoch": 0.34, + "grad_norm": 230.00167846679688, + "learning_rate": 4.932727977409889e-05, + "loss": 0.4186, + "step": 6750 + }, + { + "epoch": 0.34, + "grad_norm": 2.341181516647339, + "learning_rate": 4.93042098075042e-05, + "loss": 0.519, + "step": 6775 + }, + { + "epoch": 0.34, + "grad_norm": 0.8861842155456543, + "learning_rate": 4.9281139840909514e-05, + "loss": 0.4363, + "step": 6800 + }, + { + "epoch": 0.34, + "grad_norm": 41.04738998413086, + "learning_rate": 4.925806987431482e-05, + "loss": 0.5219, + "step": 6825 + }, + { + "epoch": 0.34, + "grad_norm": 6.22245454788208, + "learning_rate": 4.9234999907720137e-05, + "loss": 0.5745, + "step": 6850 + }, + { + "epoch": 0.34, + "grad_norm": 1.7552106380462646, + "learning_rate": 4.921192994112545e-05, + "loss": 0.5304, + "step": 6875 + }, + { + "epoch": 0.34, + "grad_norm": 1.7670912742614746, + "learning_rate": 4.918885997453076e-05, + "loss": 0.5369, + "step": 6900 + }, + { + "epoch": 0.35, + "grad_norm": 2.4497487545013428, + "learning_rate": 4.916579000793607e-05, + "loss": 0.4918, + "step": 6925 + }, + { + "epoch": 0.35, + "grad_norm": 1.5100780725479126, + "learning_rate": 4.914272004134138e-05, + "loss": 0.4696, + "step": 6950 + }, + { + "epoch": 0.35, + "grad_norm": 8.250646591186523, + "learning_rate": 4.911965007474669e-05, + "loss": 0.4365, + "step": 6975 + }, + { + "epoch": 0.35, + "grad_norm": 6.092109203338623, + "learning_rate": 4.9096580108152005e-05, + "loss": 0.5336, + "step": 7000 + }, + { + "epoch": 0.35, + "grad_norm": 57.527923583984375, + "learning_rate": 4.9073510141557316e-05, + "loss": 0.5304, + "step": 7025 + }, + { + "epoch": 0.35, + "grad_norm": 5.689362049102783, + "learning_rate": 4.905044017496263e-05, + "loss": 0.471, + "step": 7050 + }, + { + "epoch": 0.35, + "grad_norm": 5.691891670227051, + "learning_rate": 4.902737020836794e-05, + "loss": 0.5668, + "step": 7075 + }, + { + "epoch": 0.35, + "grad_norm": 8.563892364501953, + "learning_rate": 4.900430024177326e-05, + "loss": 0.492, + "step": 7100 + }, + { + "epoch": 0.36, + "grad_norm": 18.62669563293457, + "learning_rate": 4.898123027517856e-05, + "loss": 0.5833, + "step": 7125 + }, + { + "epoch": 0.36, + "grad_norm": 60.172019958496094, + "learning_rate": 4.895816030858388e-05, + "loss": 0.4275, + "step": 7150 + }, + { + "epoch": 0.36, + "grad_norm": 10.911247253417969, + "learning_rate": 4.8935090341989184e-05, + "loss": 0.6028, + "step": 7175 + }, + { + "epoch": 0.36, + "grad_norm": 105.57086181640625, + "learning_rate": 4.8912020375394496e-05, + "loss": 0.5596, + "step": 7200 + }, + { + "epoch": 0.36, + "grad_norm": 19.55571746826172, + "learning_rate": 4.8888950408799814e-05, + "loss": 0.6494, + "step": 7225 + }, + { + "epoch": 0.36, + "grad_norm": 2.126455307006836, + "learning_rate": 4.886588044220512e-05, + "loss": 0.4685, + "step": 7250 + }, + { + "epoch": 0.36, + "grad_norm": 6.115291118621826, + "learning_rate": 4.8842810475610436e-05, + "loss": 0.5611, + "step": 7275 + }, + { + "epoch": 0.36, + "grad_norm": 3.771648645401001, + "learning_rate": 4.881974050901574e-05, + "loss": 0.5599, + "step": 7300 + }, + { + "epoch": 0.37, + "grad_norm": 6.9629225730896, + "learning_rate": 4.879667054242106e-05, + "loss": 0.5673, + "step": 7325 + }, + { + "epoch": 0.37, + "grad_norm": 18.98270034790039, + "learning_rate": 4.8773600575826364e-05, + "loss": 0.3507, + "step": 7350 + }, + { + "epoch": 0.37, + "grad_norm": 14.278607368469238, + "learning_rate": 4.875053060923168e-05, + "loss": 0.4875, + "step": 7375 + }, + { + "epoch": 0.37, + "grad_norm": 15.664734840393066, + "learning_rate": 4.872746064263699e-05, + "loss": 0.6233, + "step": 7400 + }, + { + "epoch": 0.37, + "grad_norm": 17.01627540588379, + "learning_rate": 4.8704390676042305e-05, + "loss": 0.4525, + "step": 7425 + }, + { + "epoch": 0.37, + "grad_norm": 3.3598997592926025, + "learning_rate": 4.8681320709447616e-05, + "loss": 0.517, + "step": 7450 + }, + { + "epoch": 0.37, + "grad_norm": 11.31663703918457, + "learning_rate": 4.865825074285293e-05, + "loss": 0.482, + "step": 7475 + }, + { + "epoch": 0.37, + "grad_norm": 3.1184511184692383, + "learning_rate": 4.863518077625824e-05, + "loss": 0.5363, + "step": 7500 + }, + { + "epoch": 0.37, + "grad_norm": 60.25141906738281, + "learning_rate": 4.861211080966355e-05, + "loss": 0.5031, + "step": 7525 + }, + { + "epoch": 0.38, + "grad_norm": 36.034324645996094, + "learning_rate": 4.858904084306886e-05, + "loss": 0.5599, + "step": 7550 + }, + { + "epoch": 0.38, + "grad_norm": 11.539170265197754, + "learning_rate": 4.856597087647417e-05, + "loss": 0.5183, + "step": 7575 + }, + { + "epoch": 0.38, + "grad_norm": 28.977872848510742, + "learning_rate": 4.8542900909879484e-05, + "loss": 0.5144, + "step": 7600 + }, + { + "epoch": 0.38, + "grad_norm": 1.6298623085021973, + "learning_rate": 4.8519830943284796e-05, + "loss": 0.4624, + "step": 7625 + }, + { + "epoch": 0.38, + "grad_norm": 12.326157569885254, + "learning_rate": 4.849676097669011e-05, + "loss": 0.4219, + "step": 7650 + }, + { + "epoch": 0.38, + "grad_norm": 11.35020923614502, + "learning_rate": 4.847369101009542e-05, + "loss": 0.5089, + "step": 7675 + }, + { + "epoch": 0.38, + "grad_norm": 0.975980281829834, + "learning_rate": 4.845062104350073e-05, + "loss": 0.4878, + "step": 7700 + }, + { + "epoch": 0.38, + "grad_norm": 21.873689651489258, + "learning_rate": 4.842755107690604e-05, + "loss": 0.491, + "step": 7725 + }, + { + "epoch": 0.39, + "grad_norm": 2.1827073097229004, + "learning_rate": 4.840448111031136e-05, + "loss": 0.4746, + "step": 7750 + }, + { + "epoch": 0.39, + "grad_norm": 19.413707733154297, + "learning_rate": 4.8381411143716664e-05, + "loss": 0.686, + "step": 7775 + }, + { + "epoch": 0.39, + "grad_norm": 68.73217010498047, + "learning_rate": 4.835834117712198e-05, + "loss": 0.4889, + "step": 7800 + }, + { + "epoch": 0.39, + "grad_norm": 3.8209142684936523, + "learning_rate": 4.8335271210527286e-05, + "loss": 0.4441, + "step": 7825 + }, + { + "epoch": 0.39, + "grad_norm": 8.087347984313965, + "learning_rate": 4.8312201243932605e-05, + "loss": 0.396, + "step": 7850 + }, + { + "epoch": 0.39, + "grad_norm": 24.778383255004883, + "learning_rate": 4.828913127733791e-05, + "loss": 0.5427, + "step": 7875 + }, + { + "epoch": 0.39, + "grad_norm": 2.647791624069214, + "learning_rate": 4.826606131074323e-05, + "loss": 0.504, + "step": 7900 + }, + { + "epoch": 0.39, + "grad_norm": 15.89461612701416, + "learning_rate": 4.824299134414854e-05, + "loss": 0.4808, + "step": 7925 + }, + { + "epoch": 0.4, + "grad_norm": 6.659229755401611, + "learning_rate": 4.821992137755384e-05, + "loss": 0.598, + "step": 7950 + }, + { + "epoch": 0.4, + "grad_norm": 8.807088851928711, + "learning_rate": 4.819685141095916e-05, + "loss": 0.3952, + "step": 7975 + }, + { + "epoch": 0.4, + "grad_norm": 4.321648120880127, + "learning_rate": 4.8173781444364466e-05, + "loss": 0.5091, + "step": 8000 + }, + { + "epoch": 0.4, + "grad_norm": 2.7659878730773926, + "learning_rate": 4.8150711477769784e-05, + "loss": 0.4897, + "step": 8025 + }, + { + "epoch": 0.4, + "grad_norm": 2.758430004119873, + "learning_rate": 4.8127641511175095e-05, + "loss": 0.5376, + "step": 8050 + }, + { + "epoch": 0.4, + "grad_norm": 8.904570579528809, + "learning_rate": 4.810457154458041e-05, + "loss": 0.5006, + "step": 8075 + }, + { + "epoch": 0.4, + "grad_norm": 45.822593688964844, + "learning_rate": 4.808150157798572e-05, + "loss": 0.4573, + "step": 8100 + }, + { + "epoch": 0.4, + "grad_norm": 3.2372379302978516, + "learning_rate": 4.805843161139103e-05, + "loss": 0.3971, + "step": 8125 + }, + { + "epoch": 0.41, + "grad_norm": 84.43883514404297, + "learning_rate": 4.803536164479634e-05, + "loss": 0.3685, + "step": 8150 + }, + { + "epoch": 0.41, + "grad_norm": 5.58286714553833, + "learning_rate": 4.801229167820165e-05, + "loss": 0.5961, + "step": 8175 + }, + { + "epoch": 0.41, + "grad_norm": 1.6324478387832642, + "learning_rate": 4.7989221711606964e-05, + "loss": 0.6016, + "step": 8200 + }, + { + "epoch": 0.41, + "grad_norm": 4.677681922912598, + "learning_rate": 4.7966151745012275e-05, + "loss": 0.4963, + "step": 8225 + }, + { + "epoch": 0.41, + "grad_norm": 9.602376937866211, + "learning_rate": 4.7943081778417586e-05, + "loss": 0.5354, + "step": 8250 + }, + { + "epoch": 0.41, + "grad_norm": 2.9427170753479004, + "learning_rate": 4.7920011811822904e-05, + "loss": 0.4566, + "step": 8275 + }, + { + "epoch": 0.41, + "grad_norm": 11.23338794708252, + "learning_rate": 4.789694184522821e-05, + "loss": 0.5302, + "step": 8300 + }, + { + "epoch": 0.41, + "grad_norm": 15.995217323303223, + "learning_rate": 4.787387187863352e-05, + "loss": 0.5199, + "step": 8325 + }, + { + "epoch": 0.42, + "grad_norm": 24.4979190826416, + "learning_rate": 4.785080191203883e-05, + "loss": 0.3849, + "step": 8350 + }, + { + "epoch": 0.42, + "grad_norm": 1.894063949584961, + "learning_rate": 4.782773194544414e-05, + "loss": 0.5529, + "step": 8375 + }, + { + "epoch": 0.42, + "grad_norm": 18.658781051635742, + "learning_rate": 4.7804661978849454e-05, + "loss": 0.5758, + "step": 8400 + }, + { + "epoch": 0.42, + "grad_norm": 4.197040557861328, + "learning_rate": 4.7781592012254766e-05, + "loss": 0.4704, + "step": 8425 + }, + { + "epoch": 0.42, + "grad_norm": 1.2483537197113037, + "learning_rate": 4.7758522045660084e-05, + "loss": 0.4748, + "step": 8450 + }, + { + "epoch": 0.42, + "grad_norm": 23.75387954711914, + "learning_rate": 4.773545207906539e-05, + "loss": 0.5097, + "step": 8475 + }, + { + "epoch": 0.42, + "grad_norm": 3.9099292755126953, + "learning_rate": 4.771238211247071e-05, + "loss": 0.4713, + "step": 8500 + }, + { + "epoch": 0.42, + "grad_norm": 0.816892683506012, + "learning_rate": 4.768931214587601e-05, + "loss": 0.4907, + "step": 8525 + }, + { + "epoch": 0.43, + "grad_norm": 3.892925977706909, + "learning_rate": 4.766624217928133e-05, + "loss": 0.6005, + "step": 8550 + }, + { + "epoch": 0.43, + "grad_norm": 3.5819435119628906, + "learning_rate": 4.764317221268664e-05, + "loss": 0.4862, + "step": 8575 + }, + { + "epoch": 0.43, + "grad_norm": 4.449002265930176, + "learning_rate": 4.762010224609195e-05, + "loss": 0.557, + "step": 8600 + }, + { + "epoch": 0.43, + "grad_norm": 28.792911529541016, + "learning_rate": 4.7597032279497263e-05, + "loss": 0.4196, + "step": 8625 + }, + { + "epoch": 0.43, + "grad_norm": 66.64269256591797, + "learning_rate": 4.7573962312902575e-05, + "loss": 0.4574, + "step": 8650 + }, + { + "epoch": 0.43, + "grad_norm": 37.438262939453125, + "learning_rate": 4.7550892346307886e-05, + "loss": 0.4338, + "step": 8675 + }, + { + "epoch": 0.43, + "grad_norm": 6.17923641204834, + "learning_rate": 4.752782237971319e-05, + "loss": 0.461, + "step": 8700 + }, + { + "epoch": 0.43, + "grad_norm": 6.727214336395264, + "learning_rate": 4.750475241311851e-05, + "loss": 0.5826, + "step": 8725 + }, + { + "epoch": 0.44, + "grad_norm": 14.901581764221191, + "learning_rate": 4.748168244652382e-05, + "loss": 0.5953, + "step": 8750 + }, + { + "epoch": 0.44, + "grad_norm": 11.645687103271484, + "learning_rate": 4.745861247992913e-05, + "loss": 0.5776, + "step": 8775 + }, + { + "epoch": 0.44, + "grad_norm": 2.653366804122925, + "learning_rate": 4.743554251333444e-05, + "loss": 0.5414, + "step": 8800 + }, + { + "epoch": 0.44, + "grad_norm": 95.31356811523438, + "learning_rate": 4.7412472546739754e-05, + "loss": 0.527, + "step": 8825 + }, + { + "epoch": 0.44, + "grad_norm": 6.815218925476074, + "learning_rate": 4.7389402580145066e-05, + "loss": 0.4701, + "step": 8850 + }, + { + "epoch": 0.44, + "grad_norm": 2.4729785919189453, + "learning_rate": 4.736633261355038e-05, + "loss": 0.4491, + "step": 8875 + }, + { + "epoch": 0.44, + "grad_norm": 13.656978607177734, + "learning_rate": 4.734326264695569e-05, + "loss": 0.4799, + "step": 8900 + }, + { + "epoch": 0.44, + "grad_norm": 2.3728275299072266, + "learning_rate": 4.7320192680361e-05, + "loss": 0.4938, + "step": 8925 + }, + { + "epoch": 0.45, + "grad_norm": 20.797361373901367, + "learning_rate": 4.729712271376631e-05, + "loss": 0.5013, + "step": 8950 + }, + { + "epoch": 0.45, + "grad_norm": 0.904093325138092, + "learning_rate": 4.727405274717163e-05, + "loss": 0.3779, + "step": 8975 + }, + { + "epoch": 0.45, + "grad_norm": 9.007451057434082, + "learning_rate": 4.7250982780576934e-05, + "loss": 0.7228, + "step": 9000 + }, + { + "epoch": 0.45, + "grad_norm": 2.9835093021392822, + "learning_rate": 4.722791281398225e-05, + "loss": 0.3662, + "step": 9025 + }, + { + "epoch": 0.45, + "grad_norm": 13.8798189163208, + "learning_rate": 4.7204842847387557e-05, + "loss": 0.4275, + "step": 9050 + }, + { + "epoch": 0.45, + "grad_norm": 6.156960487365723, + "learning_rate": 4.718177288079287e-05, + "loss": 0.4141, + "step": 9075 + }, + { + "epoch": 0.45, + "grad_norm": 27.34996795654297, + "learning_rate": 4.7158702914198186e-05, + "loss": 0.423, + "step": 9100 + }, + { + "epoch": 0.45, + "grad_norm": 1.5032697916030884, + "learning_rate": 4.713563294760349e-05, + "loss": 0.465, + "step": 9125 + }, + { + "epoch": 0.46, + "grad_norm": 3.1437747478485107, + "learning_rate": 4.711256298100881e-05, + "loss": 0.4046, + "step": 9150 + }, + { + "epoch": 0.46, + "grad_norm": 3.2048637866973877, + "learning_rate": 4.708949301441411e-05, + "loss": 0.4301, + "step": 9175 + }, + { + "epoch": 0.46, + "grad_norm": 4.686911582946777, + "learning_rate": 4.706642304781943e-05, + "loss": 0.5915, + "step": 9200 + }, + { + "epoch": 0.46, + "grad_norm": 4.331536769866943, + "learning_rate": 4.7043353081224736e-05, + "loss": 0.5044, + "step": 9225 + }, + { + "epoch": 0.46, + "grad_norm": 5.540689468383789, + "learning_rate": 4.7020283114630054e-05, + "loss": 0.5204, + "step": 9250 + }, + { + "epoch": 0.46, + "grad_norm": 15.265934944152832, + "learning_rate": 4.6997213148035366e-05, + "loss": 0.4567, + "step": 9275 + }, + { + "epoch": 0.46, + "grad_norm": 4.857421398162842, + "learning_rate": 4.697414318144068e-05, + "loss": 0.5886, + "step": 9300 + }, + { + "epoch": 0.46, + "grad_norm": 1.7066471576690674, + "learning_rate": 4.695107321484599e-05, + "loss": 0.4756, + "step": 9325 + }, + { + "epoch": 0.47, + "grad_norm": 20.88972282409668, + "learning_rate": 4.69280032482513e-05, + "loss": 0.4356, + "step": 9350 + }, + { + "epoch": 0.47, + "grad_norm": 11.743831634521484, + "learning_rate": 4.690493328165661e-05, + "loss": 0.3707, + "step": 9375 + }, + { + "epoch": 0.47, + "grad_norm": 19.77294158935547, + "learning_rate": 4.688186331506192e-05, + "loss": 0.519, + "step": 9400 + }, + { + "epoch": 0.47, + "grad_norm": 0.8997837901115417, + "learning_rate": 4.6858793348467234e-05, + "loss": 0.4333, + "step": 9425 + }, + { + "epoch": 0.47, + "grad_norm": 3.1104369163513184, + "learning_rate": 4.6835723381872545e-05, + "loss": 0.4259, + "step": 9450 + }, + { + "epoch": 0.47, + "grad_norm": 9.724773406982422, + "learning_rate": 4.6812653415277856e-05, + "loss": 0.5336, + "step": 9475 + }, + { + "epoch": 0.47, + "grad_norm": 6.291820049285889, + "learning_rate": 4.678958344868317e-05, + "loss": 0.4473, + "step": 9500 + }, + { + "epoch": 0.47, + "grad_norm": 37.74559783935547, + "learning_rate": 4.676651348208848e-05, + "loss": 0.5415, + "step": 9525 + }, + { + "epoch": 0.48, + "grad_norm": 2.051961898803711, + "learning_rate": 4.674344351549379e-05, + "loss": 0.4485, + "step": 9550 + }, + { + "epoch": 0.48, + "grad_norm": 6.569771766662598, + "learning_rate": 4.67203735488991e-05, + "loss": 0.5229, + "step": 9575 + }, + { + "epoch": 0.48, + "grad_norm": 2.9228270053863525, + "learning_rate": 4.669730358230441e-05, + "loss": 0.3779, + "step": 9600 + }, + { + "epoch": 0.48, + "grad_norm": 3.5740840435028076, + "learning_rate": 4.667423361570973e-05, + "loss": 0.4852, + "step": 9625 + }, + { + "epoch": 0.48, + "grad_norm": 5.161360740661621, + "learning_rate": 4.6651163649115036e-05, + "loss": 0.483, + "step": 9650 + }, + { + "epoch": 0.48, + "grad_norm": 1.6016749143600464, + "learning_rate": 4.6628093682520354e-05, + "loss": 0.3926, + "step": 9675 + }, + { + "epoch": 0.48, + "grad_norm": 28.20662498474121, + "learning_rate": 4.660502371592566e-05, + "loss": 0.4485, + "step": 9700 + }, + { + "epoch": 0.48, + "grad_norm": 3.143033742904663, + "learning_rate": 4.658195374933098e-05, + "loss": 0.523, + "step": 9725 + }, + { + "epoch": 0.49, + "grad_norm": 4.893307685852051, + "learning_rate": 4.655888378273628e-05, + "loss": 0.586, + "step": 9750 + }, + { + "epoch": 0.49, + "grad_norm": 2.808121681213379, + "learning_rate": 4.65358138161416e-05, + "loss": 0.4798, + "step": 9775 + }, + { + "epoch": 0.49, + "grad_norm": 1.5118587017059326, + "learning_rate": 4.651274384954691e-05, + "loss": 0.5001, + "step": 9800 + }, + { + "epoch": 0.49, + "grad_norm": 1.5315314531326294, + "learning_rate": 4.6489673882952215e-05, + "loss": 0.4757, + "step": 9825 + }, + { + "epoch": 0.49, + "grad_norm": 26.32784080505371, + "learning_rate": 4.6466603916357534e-05, + "loss": 0.5262, + "step": 9850 + }, + { + "epoch": 0.49, + "grad_norm": 11.232955932617188, + "learning_rate": 4.644353394976284e-05, + "loss": 0.4995, + "step": 9875 + }, + { + "epoch": 0.49, + "grad_norm": 2.1047329902648926, + "learning_rate": 4.6420463983168156e-05, + "loss": 0.4882, + "step": 9900 + }, + { + "epoch": 0.49, + "grad_norm": 1.3566862344741821, + "learning_rate": 4.639739401657347e-05, + "loss": 0.5503, + "step": 9925 + }, + { + "epoch": 0.5, + "grad_norm": 2.307016134262085, + "learning_rate": 4.637432404997878e-05, + "loss": 0.359, + "step": 9950 + }, + { + "epoch": 0.5, + "grad_norm": 6.331679821014404, + "learning_rate": 4.635125408338409e-05, + "loss": 0.4108, + "step": 9975 + }, + { + "epoch": 0.5, + "grad_norm": 5.821734428405762, + "learning_rate": 4.63281841167894e-05, + "loss": 0.438, + "step": 10000 + }, + { + "epoch": 0.5, + "grad_norm": 8.801206588745117, + "learning_rate": 4.630511415019471e-05, + "loss": 0.5269, + "step": 10025 + }, + { + "epoch": 0.5, + "grad_norm": 1.4149796962738037, + "learning_rate": 4.6282044183600025e-05, + "loss": 0.5253, + "step": 10050 + }, + { + "epoch": 0.5, + "grad_norm": 1.5596532821655273, + "learning_rate": 4.6258974217005336e-05, + "loss": 0.4622, + "step": 10075 + }, + { + "epoch": 0.5, + "grad_norm": 0.8093569874763489, + "learning_rate": 4.623590425041065e-05, + "loss": 0.3444, + "step": 10100 + }, + { + "epoch": 0.5, + "grad_norm": 4.408332824707031, + "learning_rate": 4.621283428381596e-05, + "loss": 0.5996, + "step": 10125 + }, + { + "epoch": 0.51, + "grad_norm": 1.2513713836669922, + "learning_rate": 4.618976431722128e-05, + "loss": 0.3927, + "step": 10150 + }, + { + "epoch": 0.51, + "grad_norm": 118.37767791748047, + "learning_rate": 4.616669435062658e-05, + "loss": 0.4931, + "step": 10175 + }, + { + "epoch": 0.51, + "grad_norm": 3.265443801879883, + "learning_rate": 4.614362438403189e-05, + "loss": 0.5738, + "step": 10200 + }, + { + "epoch": 0.51, + "grad_norm": 3.070544719696045, + "learning_rate": 4.6120554417437204e-05, + "loss": 0.4596, + "step": 10225 + }, + { + "epoch": 0.51, + "grad_norm": 6.112723350524902, + "learning_rate": 4.6097484450842515e-05, + "loss": 0.4394, + "step": 10250 + }, + { + "epoch": 0.51, + "grad_norm": 6.3622307777404785, + "learning_rate": 4.607441448424783e-05, + "loss": 0.3585, + "step": 10275 + }, + { + "epoch": 0.51, + "grad_norm": 4.152879238128662, + "learning_rate": 4.605134451765314e-05, + "loss": 0.5883, + "step": 10300 + }, + { + "epoch": 0.51, + "grad_norm": 3.7687737941741943, + "learning_rate": 4.6028274551058456e-05, + "loss": 0.3813, + "step": 10325 + }, + { + "epoch": 0.52, + "grad_norm": 3.0186991691589355, + "learning_rate": 4.600520458446376e-05, + "loss": 0.5107, + "step": 10350 + }, + { + "epoch": 0.52, + "grad_norm": 0.6705989837646484, + "learning_rate": 4.598213461786908e-05, + "loss": 0.3563, + "step": 10375 + }, + { + "epoch": 0.52, + "grad_norm": 3.8952481746673584, + "learning_rate": 4.5959064651274384e-05, + "loss": 0.4218, + "step": 10400 + }, + { + "epoch": 0.52, + "grad_norm": 1.3632725477218628, + "learning_rate": 4.59359946846797e-05, + "loss": 0.4832, + "step": 10425 + }, + { + "epoch": 0.52, + "grad_norm": 1.1923586130142212, + "learning_rate": 4.591292471808501e-05, + "loss": 0.4351, + "step": 10450 + }, + { + "epoch": 0.52, + "grad_norm": 3.5886104106903076, + "learning_rate": 4.5889854751490324e-05, + "loss": 0.5084, + "step": 10475 + }, + { + "epoch": 0.52, + "grad_norm": 0.7589617967605591, + "learning_rate": 4.5866784784895636e-05, + "loss": 0.4147, + "step": 10500 + }, + { + "epoch": 0.52, + "grad_norm": 16.9898681640625, + "learning_rate": 4.584371481830095e-05, + "loss": 0.5198, + "step": 10525 + }, + { + "epoch": 0.53, + "grad_norm": 3.4914662837982178, + "learning_rate": 4.582064485170626e-05, + "loss": 0.4613, + "step": 10550 + }, + { + "epoch": 0.53, + "grad_norm": 8.139269828796387, + "learning_rate": 4.579757488511156e-05, + "loss": 0.4438, + "step": 10575 + }, + { + "epoch": 0.53, + "grad_norm": 5.789552688598633, + "learning_rate": 4.577450491851688e-05, + "loss": 0.4678, + "step": 10600 + }, + { + "epoch": 0.53, + "grad_norm": 3.183175802230835, + "learning_rate": 4.575143495192219e-05, + "loss": 0.3469, + "step": 10625 + }, + { + "epoch": 0.53, + "grad_norm": 2.14797043800354, + "learning_rate": 4.5728364985327504e-05, + "loss": 0.5492, + "step": 10650 + }, + { + "epoch": 0.53, + "grad_norm": 6.795634746551514, + "learning_rate": 4.5705295018732815e-05, + "loss": 0.4079, + "step": 10675 + }, + { + "epoch": 0.53, + "grad_norm": 4.230640888214111, + "learning_rate": 4.568222505213813e-05, + "loss": 0.5704, + "step": 10700 + }, + { + "epoch": 0.53, + "grad_norm": 3.867367744445801, + "learning_rate": 4.565915508554344e-05, + "loss": 0.4377, + "step": 10725 + }, + { + "epoch": 0.54, + "grad_norm": 3.2728750705718994, + "learning_rate": 4.563608511894875e-05, + "loss": 0.55, + "step": 10750 + }, + { + "epoch": 0.54, + "grad_norm": 4.1966552734375, + "learning_rate": 4.561301515235406e-05, + "loss": 0.4194, + "step": 10775 + }, + { + "epoch": 0.54, + "grad_norm": 0.6582638025283813, + "learning_rate": 4.558994518575937e-05, + "loss": 0.4398, + "step": 10800 + }, + { + "epoch": 0.54, + "grad_norm": 0.6245179176330566, + "learning_rate": 4.5566875219164683e-05, + "loss": 0.3498, + "step": 10825 + }, + { + "epoch": 0.54, + "grad_norm": 18.104259490966797, + "learning_rate": 4.554380525257e-05, + "loss": 0.6087, + "step": 10850 + }, + { + "epoch": 0.54, + "grad_norm": 0.6365619897842407, + "learning_rate": 4.5520735285975306e-05, + "loss": 0.4309, + "step": 10875 + }, + { + "epoch": 0.54, + "grad_norm": 15.83073616027832, + "learning_rate": 4.5497665319380624e-05, + "loss": 0.566, + "step": 10900 + }, + { + "epoch": 0.54, + "grad_norm": 32.08317947387695, + "learning_rate": 4.547459535278593e-05, + "loss": 0.4114, + "step": 10925 + }, + { + "epoch": 0.55, + "grad_norm": 74.06217956542969, + "learning_rate": 4.545152538619124e-05, + "loss": 0.3804, + "step": 10950 + }, + { + "epoch": 0.55, + "grad_norm": 6.568449974060059, + "learning_rate": 4.542845541959656e-05, + "loss": 0.4735, + "step": 10975 + }, + { + "epoch": 0.55, + "grad_norm": 3.6441125869750977, + "learning_rate": 4.540538545300186e-05, + "loss": 0.5924, + "step": 11000 + }, + { + "epoch": 0.55, + "grad_norm": 5.191139221191406, + "learning_rate": 4.538231548640718e-05, + "loss": 0.4658, + "step": 11025 + }, + { + "epoch": 0.55, + "grad_norm": 185.90740966796875, + "learning_rate": 4.5359245519812486e-05, + "loss": 0.4915, + "step": 11050 + }, + { + "epoch": 0.55, + "grad_norm": 6.593203067779541, + "learning_rate": 4.5336175553217804e-05, + "loss": 0.4706, + "step": 11075 + }, + { + "epoch": 0.55, + "grad_norm": 9.717700004577637, + "learning_rate": 4.531310558662311e-05, + "loss": 0.4965, + "step": 11100 + }, + { + "epoch": 0.55, + "grad_norm": 74.32408905029297, + "learning_rate": 4.5290035620028427e-05, + "loss": 0.5012, + "step": 11125 + }, + { + "epoch": 0.56, + "grad_norm": 3.009906530380249, + "learning_rate": 4.526696565343374e-05, + "loss": 0.5012, + "step": 11150 + }, + { + "epoch": 0.56, + "grad_norm": 7.148441314697266, + "learning_rate": 4.524389568683905e-05, + "loss": 0.4644, + "step": 11175 + }, + { + "epoch": 0.56, + "grad_norm": 3.9309608936309814, + "learning_rate": 4.522082572024436e-05, + "loss": 0.5467, + "step": 11200 + }, + { + "epoch": 0.56, + "grad_norm": 2.191103219985962, + "learning_rate": 4.519775575364967e-05, + "loss": 0.4983, + "step": 11225 + }, + { + "epoch": 0.56, + "grad_norm": 29.83028793334961, + "learning_rate": 4.517468578705498e-05, + "loss": 0.4761, + "step": 11250 + }, + { + "epoch": 0.56, + "grad_norm": 3.0909762382507324, + "learning_rate": 4.5151615820460295e-05, + "loss": 0.5817, + "step": 11275 + }, + { + "epoch": 0.56, + "grad_norm": 7.159379482269287, + "learning_rate": 4.5128545853865606e-05, + "loss": 0.6388, + "step": 11300 + }, + { + "epoch": 0.56, + "grad_norm": 1.8289567232131958, + "learning_rate": 4.510547588727092e-05, + "loss": 0.5193, + "step": 11325 + }, + { + "epoch": 0.57, + "grad_norm": 1.2371580600738525, + "learning_rate": 4.508240592067623e-05, + "loss": 0.4833, + "step": 11350 + }, + { + "epoch": 0.57, + "grad_norm": 2.108119010925293, + "learning_rate": 4.505933595408154e-05, + "loss": 0.4918, + "step": 11375 + }, + { + "epoch": 0.57, + "grad_norm": 1.026004433631897, + "learning_rate": 4.503626598748685e-05, + "loss": 0.4207, + "step": 11400 + }, + { + "epoch": 0.57, + "grad_norm": 1.4752520322799683, + "learning_rate": 4.501319602089216e-05, + "loss": 0.4227, + "step": 11425 + }, + { + "epoch": 0.57, + "grad_norm": 3.9238433837890625, + "learning_rate": 4.4990126054297474e-05, + "loss": 0.3833, + "step": 11450 + }, + { + "epoch": 0.57, + "grad_norm": 1.4511189460754395, + "learning_rate": 4.4967056087702786e-05, + "loss": 0.6285, + "step": 11475 + }, + { + "epoch": 0.57, + "grad_norm": 4.272202968597412, + "learning_rate": 4.4943986121108104e-05, + "loss": 0.4541, + "step": 11500 + }, + { + "epoch": 0.57, + "grad_norm": 5.563125133514404, + "learning_rate": 4.492091615451341e-05, + "loss": 0.4257, + "step": 11525 + }, + { + "epoch": 0.58, + "grad_norm": 7.777960300445557, + "learning_rate": 4.4897846187918726e-05, + "loss": 0.5858, + "step": 11550 + }, + { + "epoch": 0.58, + "grad_norm": 24.097171783447266, + "learning_rate": 4.487477622132403e-05, + "loss": 0.4463, + "step": 11575 + }, + { + "epoch": 0.58, + "grad_norm": 0.9840202927589417, + "learning_rate": 4.485170625472935e-05, + "loss": 0.5353, + "step": 11600 + }, + { + "epoch": 0.58, + "grad_norm": 37.77027130126953, + "learning_rate": 4.4828636288134654e-05, + "loss": 0.5239, + "step": 11625 + }, + { + "epoch": 0.58, + "grad_norm": 0.9202004075050354, + "learning_rate": 4.480556632153997e-05, + "loss": 0.4119, + "step": 11650 + }, + { + "epoch": 0.58, + "grad_norm": 5.5572428703308105, + "learning_rate": 4.478249635494528e-05, + "loss": 0.553, + "step": 11675 + }, + { + "epoch": 0.58, + "grad_norm": 6.949309825897217, + "learning_rate": 4.475942638835059e-05, + "loss": 0.4679, + "step": 11700 + }, + { + "epoch": 0.58, + "grad_norm": 2.6731739044189453, + "learning_rate": 4.4736356421755906e-05, + "loss": 0.5218, + "step": 11725 + }, + { + "epoch": 0.59, + "grad_norm": 1.7549457550048828, + "learning_rate": 4.471328645516121e-05, + "loss": 0.4876, + "step": 11750 + }, + { + "epoch": 0.59, + "grad_norm": 3.102992534637451, + "learning_rate": 4.469021648856653e-05, + "loss": 0.5375, + "step": 11775 + }, + { + "epoch": 0.59, + "grad_norm": 6.229030132293701, + "learning_rate": 4.466714652197183e-05, + "loss": 0.4951, + "step": 11800 + }, + { + "epoch": 0.59, + "grad_norm": 9.176125526428223, + "learning_rate": 4.464407655537715e-05, + "loss": 0.5659, + "step": 11825 + }, + { + "epoch": 0.59, + "grad_norm": 19.079057693481445, + "learning_rate": 4.462100658878246e-05, + "loss": 0.3716, + "step": 11850 + }, + { + "epoch": 0.59, + "grad_norm": 0.6645026206970215, + "learning_rate": 4.4597936622187774e-05, + "loss": 0.5027, + "step": 11875 + }, + { + "epoch": 0.59, + "grad_norm": 2.9463725090026855, + "learning_rate": 4.4574866655593085e-05, + "loss": 0.5442, + "step": 11900 + }, + { + "epoch": 0.59, + "grad_norm": 3.1484124660491943, + "learning_rate": 4.45517966889984e-05, + "loss": 0.4664, + "step": 11925 + }, + { + "epoch": 0.6, + "grad_norm": 24.31441307067871, + "learning_rate": 4.452872672240371e-05, + "loss": 0.5003, + "step": 11950 + }, + { + "epoch": 0.6, + "grad_norm": 3.507324695587158, + "learning_rate": 4.450565675580902e-05, + "loss": 0.4564, + "step": 11975 + }, + { + "epoch": 0.6, + "grad_norm": 18.16114044189453, + "learning_rate": 4.448258678921433e-05, + "loss": 0.4471, + "step": 12000 + }, + { + "epoch": 0.6, + "grad_norm": 4.681582927703857, + "learning_rate": 4.445951682261965e-05, + "loss": 0.5089, + "step": 12025 + }, + { + "epoch": 0.6, + "grad_norm": 27.28392791748047, + "learning_rate": 4.4436446856024954e-05, + "loss": 0.49, + "step": 12050 + }, + { + "epoch": 0.6, + "grad_norm": 4.916137218475342, + "learning_rate": 4.4413376889430265e-05, + "loss": 0.4045, + "step": 12075 + }, + { + "epoch": 0.6, + "grad_norm": 7.058281898498535, + "learning_rate": 4.4390306922835576e-05, + "loss": 0.3812, + "step": 12100 + }, + { + "epoch": 0.6, + "grad_norm": 4.769257545471191, + "learning_rate": 4.436723695624089e-05, + "loss": 0.4888, + "step": 12125 + }, + { + "epoch": 0.61, + "grad_norm": 7.5978288650512695, + "learning_rate": 4.43441669896462e-05, + "loss": 0.6009, + "step": 12150 + }, + { + "epoch": 0.61, + "grad_norm": 1.2456187009811401, + "learning_rate": 4.432109702305151e-05, + "loss": 0.4086, + "step": 12175 + }, + { + "epoch": 0.61, + "grad_norm": 5.981049537658691, + "learning_rate": 4.429802705645683e-05, + "loss": 0.4829, + "step": 12200 + }, + { + "epoch": 0.61, + "grad_norm": 0.44680818915367126, + "learning_rate": 4.427495708986213e-05, + "loss": 0.3603, + "step": 12225 + }, + { + "epoch": 0.61, + "grad_norm": 33.1859016418457, + "learning_rate": 4.425188712326745e-05, + "loss": 0.4617, + "step": 12250 + }, + { + "epoch": 0.61, + "grad_norm": 0.866825520992279, + "learning_rate": 4.4228817156672756e-05, + "loss": 0.509, + "step": 12275 + }, + { + "epoch": 0.61, + "grad_norm": 20.547454833984375, + "learning_rate": 4.4205747190078074e-05, + "loss": 0.5846, + "step": 12300 + }, + { + "epoch": 0.61, + "grad_norm": 2.952171564102173, + "learning_rate": 4.418267722348338e-05, + "loss": 0.4625, + "step": 12325 + }, + { + "epoch": 0.62, + "grad_norm": 3.1319684982299805, + "learning_rate": 4.41596072568887e-05, + "loss": 0.4154, + "step": 12350 + }, + { + "epoch": 0.62, + "grad_norm": 6.610553741455078, + "learning_rate": 4.413653729029401e-05, + "loss": 0.3864, + "step": 12375 + }, + { + "epoch": 0.62, + "grad_norm": 8.751646041870117, + "learning_rate": 4.411346732369932e-05, + "loss": 0.3609, + "step": 12400 + }, + { + "epoch": 0.62, + "grad_norm": 3.6355435848236084, + "learning_rate": 4.409039735710463e-05, + "loss": 0.5158, + "step": 12425 + }, + { + "epoch": 0.62, + "grad_norm": 5.307819366455078, + "learning_rate": 4.4067327390509935e-05, + "loss": 0.4856, + "step": 12450 + }, + { + "epoch": 0.62, + "grad_norm": 2.587369918823242, + "learning_rate": 4.4044257423915254e-05, + "loss": 0.4296, + "step": 12475 + }, + { + "epoch": 0.62, + "grad_norm": 3.888181686401367, + "learning_rate": 4.4021187457320565e-05, + "loss": 0.5317, + "step": 12500 + }, + { + "epoch": 0.62, + "grad_norm": 15.251875877380371, + "learning_rate": 4.3998117490725876e-05, + "loss": 0.4947, + "step": 12525 + }, + { + "epoch": 0.63, + "grad_norm": 2.4063093662261963, + "learning_rate": 4.397504752413119e-05, + "loss": 0.4243, + "step": 12550 + }, + { + "epoch": 0.63, + "grad_norm": 13.604540824890137, + "learning_rate": 4.39519775575365e-05, + "loss": 0.5706, + "step": 12575 + }, + { + "epoch": 0.63, + "grad_norm": 4.274982929229736, + "learning_rate": 4.392890759094181e-05, + "loss": 0.3754, + "step": 12600 + }, + { + "epoch": 0.63, + "grad_norm": 2.1278367042541504, + "learning_rate": 4.390583762434712e-05, + "loss": 0.45, + "step": 12625 + }, + { + "epoch": 0.63, + "grad_norm": 25.345394134521484, + "learning_rate": 4.388276765775243e-05, + "loss": 0.5732, + "step": 12650 + }, + { + "epoch": 0.63, + "grad_norm": 2.4565417766571045, + "learning_rate": 4.3859697691157744e-05, + "loss": 0.554, + "step": 12675 + }, + { + "epoch": 0.63, + "grad_norm": 16.725379943847656, + "learning_rate": 4.3836627724563056e-05, + "loss": 0.3648, + "step": 12700 + }, + { + "epoch": 0.63, + "grad_norm": 14.322278022766113, + "learning_rate": 4.3813557757968374e-05, + "loss": 0.4051, + "step": 12725 + }, + { + "epoch": 0.64, + "grad_norm": 3.2702584266662598, + "learning_rate": 4.379048779137368e-05, + "loss": 0.4747, + "step": 12750 + }, + { + "epoch": 0.64, + "grad_norm": 25.298032760620117, + "learning_rate": 4.3767417824778997e-05, + "loss": 0.495, + "step": 12775 + }, + { + "epoch": 0.64, + "grad_norm": 5.421559810638428, + "learning_rate": 4.37443478581843e-05, + "loss": 0.5211, + "step": 12800 + }, + { + "epoch": 0.64, + "grad_norm": 11.022579193115234, + "learning_rate": 4.372127789158961e-05, + "loss": 0.4934, + "step": 12825 + }, + { + "epoch": 0.64, + "grad_norm": 2.5336997509002686, + "learning_rate": 4.3698207924994924e-05, + "loss": 0.5397, + "step": 12850 + }, + { + "epoch": 0.64, + "grad_norm": 23.128211975097656, + "learning_rate": 4.3675137958400235e-05, + "loss": 0.4359, + "step": 12875 + }, + { + "epoch": 0.64, + "grad_norm": 25.914918899536133, + "learning_rate": 4.3652067991805553e-05, + "loss": 0.3482, + "step": 12900 + }, + { + "epoch": 0.64, + "grad_norm": 2.854637861251831, + "learning_rate": 4.362899802521086e-05, + "loss": 0.4871, + "step": 12925 + }, + { + "epoch": 0.65, + "grad_norm": 1.9324437379837036, + "learning_rate": 4.3605928058616176e-05, + "loss": 0.5043, + "step": 12950 + }, + { + "epoch": 0.65, + "grad_norm": 12.712705612182617, + "learning_rate": 4.358285809202148e-05, + "loss": 0.4016, + "step": 12975 + }, + { + "epoch": 0.65, + "grad_norm": 3.4530861377716064, + "learning_rate": 4.35597881254268e-05, + "loss": 0.5581, + "step": 13000 + }, + { + "epoch": 0.65, + "grad_norm": 3.0090839862823486, + "learning_rate": 4.353671815883211e-05, + "loss": 0.4368, + "step": 13025 + }, + { + "epoch": 0.65, + "grad_norm": 1.4557719230651855, + "learning_rate": 4.351364819223742e-05, + "loss": 0.4501, + "step": 13050 + }, + { + "epoch": 0.65, + "grad_norm": 4.6525750160217285, + "learning_rate": 4.349057822564273e-05, + "loss": 0.4408, + "step": 13075 + }, + { + "epoch": 0.65, + "grad_norm": 4.434586524963379, + "learning_rate": 4.3467508259048044e-05, + "loss": 0.415, + "step": 13100 + }, + { + "epoch": 0.65, + "grad_norm": 9.57250690460205, + "learning_rate": 4.3444438292453356e-05, + "loss": 0.3814, + "step": 13125 + }, + { + "epoch": 0.66, + "grad_norm": 1.1053205728530884, + "learning_rate": 4.342136832585867e-05, + "loss": 0.6334, + "step": 13150 + }, + { + "epoch": 0.66, + "grad_norm": 2.1338109970092773, + "learning_rate": 4.339829835926398e-05, + "loss": 0.5457, + "step": 13175 + }, + { + "epoch": 0.66, + "grad_norm": 3.43638277053833, + "learning_rate": 4.337522839266929e-05, + "loss": 0.6277, + "step": 13200 + }, + { + "epoch": 0.66, + "grad_norm": 5.341610908508301, + "learning_rate": 4.33521584260746e-05, + "loss": 0.533, + "step": 13225 + }, + { + "epoch": 0.66, + "grad_norm": 12.302016258239746, + "learning_rate": 4.332908845947991e-05, + "loss": 0.4382, + "step": 13250 + }, + { + "epoch": 0.66, + "grad_norm": 1.702040195465088, + "learning_rate": 4.3306018492885224e-05, + "loss": 0.4355, + "step": 13275 + }, + { + "epoch": 0.66, + "grad_norm": 0.8742398023605347, + "learning_rate": 4.3282948526290535e-05, + "loss": 0.4374, + "step": 13300 + }, + { + "epoch": 0.66, + "grad_norm": 1.9180805683135986, + "learning_rate": 4.3259878559695847e-05, + "loss": 0.4913, + "step": 13325 + }, + { + "epoch": 0.67, + "grad_norm": 1.864307165145874, + "learning_rate": 4.323680859310116e-05, + "loss": 0.7072, + "step": 13350 + }, + { + "epoch": 0.67, + "grad_norm": 5.213928699493408, + "learning_rate": 4.321373862650647e-05, + "loss": 0.428, + "step": 13375 + }, + { + "epoch": 0.67, + "grad_norm": 3.17887020111084, + "learning_rate": 4.319066865991178e-05, + "loss": 0.4272, + "step": 13400 + }, + { + "epoch": 0.67, + "grad_norm": 2.970327377319336, + "learning_rate": 4.31675986933171e-05, + "loss": 0.472, + "step": 13425 + }, + { + "epoch": 0.67, + "grad_norm": 1.9041247367858887, + "learning_rate": 4.31445287267224e-05, + "loss": 0.4437, + "step": 13450 + }, + { + "epoch": 0.67, + "grad_norm": 4.694064617156982, + "learning_rate": 4.312145876012772e-05, + "loss": 0.4872, + "step": 13475 + }, + { + "epoch": 0.67, + "grad_norm": 21.480302810668945, + "learning_rate": 4.3098388793533026e-05, + "loss": 0.4416, + "step": 13500 + }, + { + "epoch": 0.67, + "grad_norm": 19.836280822753906, + "learning_rate": 4.3075318826938344e-05, + "loss": 0.5722, + "step": 13525 + }, + { + "epoch": 0.68, + "grad_norm": 2.7376246452331543, + "learning_rate": 4.3052248860343656e-05, + "loss": 0.5197, + "step": 13550 + }, + { + "epoch": 0.68, + "grad_norm": 7.109920024871826, + "learning_rate": 4.302917889374896e-05, + "loss": 0.5437, + "step": 13575 + }, + { + "epoch": 0.68, + "grad_norm": 50.2864875793457, + "learning_rate": 4.300610892715428e-05, + "loss": 0.5431, + "step": 13600 + }, + { + "epoch": 0.68, + "grad_norm": 19.68246078491211, + "learning_rate": 4.298303896055958e-05, + "loss": 0.4416, + "step": 13625 + }, + { + "epoch": 0.68, + "grad_norm": 17.506366729736328, + "learning_rate": 4.29599689939649e-05, + "loss": 0.5243, + "step": 13650 + }, + { + "epoch": 0.68, + "grad_norm": 2.211810827255249, + "learning_rate": 4.2936899027370206e-05, + "loss": 0.4889, + "step": 13675 + }, + { + "epoch": 0.68, + "grad_norm": 4.403297424316406, + "learning_rate": 4.2913829060775524e-05, + "loss": 0.4649, + "step": 13700 + }, + { + "epoch": 0.68, + "grad_norm": 1.4525673389434814, + "learning_rate": 4.2890759094180835e-05, + "loss": 0.4872, + "step": 13725 + }, + { + "epoch": 0.69, + "grad_norm": 24.48954200744629, + "learning_rate": 4.2867689127586146e-05, + "loss": 0.5875, + "step": 13750 + }, + { + "epoch": 0.69, + "grad_norm": 9.83705997467041, + "learning_rate": 4.284461916099146e-05, + "loss": 0.4716, + "step": 13775 + }, + { + "epoch": 0.69, + "grad_norm": 2.3815503120422363, + "learning_rate": 4.282154919439677e-05, + "loss": 0.5094, + "step": 13800 + }, + { + "epoch": 0.69, + "grad_norm": 7.4601149559021, + "learning_rate": 4.279847922780208e-05, + "loss": 0.4932, + "step": 13825 + }, + { + "epoch": 0.69, + "grad_norm": 2.108978033065796, + "learning_rate": 4.277540926120739e-05, + "loss": 0.5116, + "step": 13850 + }, + { + "epoch": 0.69, + "grad_norm": 2.290255546569824, + "learning_rate": 4.27523392946127e-05, + "loss": 0.4903, + "step": 13875 + }, + { + "epoch": 0.69, + "grad_norm": 3.3242883682250977, + "learning_rate": 4.2729269328018015e-05, + "loss": 0.5187, + "step": 13900 + }, + { + "epoch": 0.69, + "grad_norm": 1.0054996013641357, + "learning_rate": 4.2706199361423326e-05, + "loss": 0.4068, + "step": 13925 + }, + { + "epoch": 0.7, + "grad_norm": 5.556132793426514, + "learning_rate": 4.268312939482864e-05, + "loss": 0.6164, + "step": 13950 + }, + { + "epoch": 0.7, + "grad_norm": 4.102598190307617, + "learning_rate": 4.266005942823395e-05, + "loss": 0.4948, + "step": 13975 + }, + { + "epoch": 0.7, + "grad_norm": 1.5323078632354736, + "learning_rate": 4.263698946163926e-05, + "loss": 0.5195, + "step": 14000 + }, + { + "epoch": 0.7, + "grad_norm": 4.196296215057373, + "learning_rate": 4.261391949504457e-05, + "loss": 0.5766, + "step": 14025 + }, + { + "epoch": 0.7, + "grad_norm": 12.170926094055176, + "learning_rate": 4.259084952844988e-05, + "loss": 0.4864, + "step": 14050 + }, + { + "epoch": 0.7, + "grad_norm": 174.35714721679688, + "learning_rate": 4.25677795618552e-05, + "loss": 0.4185, + "step": 14075 + }, + { + "epoch": 0.7, + "grad_norm": 0.7264081835746765, + "learning_rate": 4.2544709595260505e-05, + "loss": 0.364, + "step": 14100 + }, + { + "epoch": 0.7, + "grad_norm": 3.4145443439483643, + "learning_rate": 4.2521639628665824e-05, + "loss": 0.4507, + "step": 14125 + }, + { + "epoch": 0.71, + "grad_norm": 14.267853736877441, + "learning_rate": 4.249856966207113e-05, + "loss": 0.563, + "step": 14150 + }, + { + "epoch": 0.71, + "grad_norm": 2.8452975749969482, + "learning_rate": 4.2475499695476446e-05, + "loss": 0.5582, + "step": 14175 + }, + { + "epoch": 0.71, + "grad_norm": 1.986310362815857, + "learning_rate": 4.245242972888175e-05, + "loss": 0.539, + "step": 14200 + }, + { + "epoch": 0.71, + "grad_norm": 279.5508117675781, + "learning_rate": 4.242935976228707e-05, + "loss": 0.4617, + "step": 14225 + }, + { + "epoch": 0.71, + "grad_norm": 0.829439640045166, + "learning_rate": 4.240628979569238e-05, + "loss": 0.4024, + "step": 14250 + }, + { + "epoch": 0.71, + "grad_norm": 5.14355993270874, + "learning_rate": 4.2383219829097685e-05, + "loss": 0.4953, + "step": 14275 + }, + { + "epoch": 0.71, + "grad_norm": 69.40645599365234, + "learning_rate": 4.2360149862503e-05, + "loss": 0.6197, + "step": 14300 + }, + { + "epoch": 0.71, + "grad_norm": 53.14515686035156, + "learning_rate": 4.233707989590831e-05, + "loss": 0.5586, + "step": 14325 + }, + { + "epoch": 0.72, + "grad_norm": 60.66070556640625, + "learning_rate": 4.2314009929313626e-05, + "loss": 0.4631, + "step": 14350 + }, + { + "epoch": 0.72, + "grad_norm": 1.0162568092346191, + "learning_rate": 4.229093996271894e-05, + "loss": 0.5302, + "step": 14375 + }, + { + "epoch": 0.72, + "grad_norm": 0.6008943319320679, + "learning_rate": 4.226786999612425e-05, + "loss": 0.3437, + "step": 14400 + }, + { + "epoch": 0.72, + "grad_norm": 3.1900744438171387, + "learning_rate": 4.224480002952956e-05, + "loss": 0.3703, + "step": 14425 + }, + { + "epoch": 0.72, + "grad_norm": 11.4000825881958, + "learning_rate": 4.222173006293487e-05, + "loss": 0.4234, + "step": 14450 + }, + { + "epoch": 0.72, + "grad_norm": 5.070300102233887, + "learning_rate": 4.219866009634018e-05, + "loss": 0.6395, + "step": 14475 + }, + { + "epoch": 0.72, + "grad_norm": 1.2910149097442627, + "learning_rate": 4.2175590129745494e-05, + "loss": 0.4914, + "step": 14500 + }, + { + "epoch": 0.72, + "grad_norm": 7.105856895446777, + "learning_rate": 4.2152520163150805e-05, + "loss": 0.549, + "step": 14525 + }, + { + "epoch": 0.73, + "grad_norm": 3.345780611038208, + "learning_rate": 4.212945019655612e-05, + "loss": 0.4593, + "step": 14550 + }, + { + "epoch": 0.73, + "grad_norm": 8.449474334716797, + "learning_rate": 4.210638022996143e-05, + "loss": 0.4493, + "step": 14575 + }, + { + "epoch": 0.73, + "grad_norm": 40.08137130737305, + "learning_rate": 4.2083310263366746e-05, + "loss": 0.4946, + "step": 14600 + }, + { + "epoch": 0.73, + "grad_norm": 2.8794825077056885, + "learning_rate": 4.206024029677205e-05, + "loss": 0.4632, + "step": 14625 + }, + { + "epoch": 0.73, + "grad_norm": 1.4181504249572754, + "learning_rate": 4.203717033017736e-05, + "loss": 0.554, + "step": 14650 + }, + { + "epoch": 0.73, + "grad_norm": 34.15278244018555, + "learning_rate": 4.2014100363582673e-05, + "loss": 0.4662, + "step": 14675 + }, + { + "epoch": 0.73, + "grad_norm": 1.2469291687011719, + "learning_rate": 4.1991030396987985e-05, + "loss": 0.4408, + "step": 14700 + }, + { + "epoch": 0.73, + "grad_norm": 2.87349009513855, + "learning_rate": 4.1967960430393296e-05, + "loss": 0.3956, + "step": 14725 + }, + { + "epoch": 0.74, + "grad_norm": 21.91674041748047, + "learning_rate": 4.194489046379861e-05, + "loss": 0.5344, + "step": 14750 + }, + { + "epoch": 0.74, + "grad_norm": 38.84528350830078, + "learning_rate": 4.1921820497203926e-05, + "loss": 0.5154, + "step": 14775 + }, + { + "epoch": 0.74, + "grad_norm": 122.549072265625, + "learning_rate": 4.189875053060923e-05, + "loss": 0.5487, + "step": 14800 + }, + { + "epoch": 0.74, + "grad_norm": 198.7172088623047, + "learning_rate": 4.187568056401455e-05, + "loss": 0.4841, + "step": 14825 + }, + { + "epoch": 0.74, + "grad_norm": 1.6492382287979126, + "learning_rate": 4.185261059741985e-05, + "loss": 0.4998, + "step": 14850 + }, + { + "epoch": 0.74, + "grad_norm": 1.0393104553222656, + "learning_rate": 4.182954063082517e-05, + "loss": 0.4709, + "step": 14875 + }, + { + "epoch": 0.74, + "grad_norm": 10.945531845092773, + "learning_rate": 4.180647066423048e-05, + "loss": 0.429, + "step": 14900 + }, + { + "epoch": 0.74, + "grad_norm": 3.315153121948242, + "learning_rate": 4.1783400697635794e-05, + "loss": 0.5257, + "step": 14925 + }, + { + "epoch": 0.74, + "grad_norm": 3.1690900325775146, + "learning_rate": 4.1760330731041105e-05, + "loss": 0.423, + "step": 14950 + }, + { + "epoch": 0.75, + "grad_norm": 30.004865646362305, + "learning_rate": 4.1737260764446417e-05, + "loss": 0.4509, + "step": 14975 + }, + { + "epoch": 0.75, + "grad_norm": 2.8555517196655273, + "learning_rate": 4.171419079785173e-05, + "loss": 0.4077, + "step": 15000 + }, + { + "epoch": 0.75, + "grad_norm": 0.5392211079597473, + "learning_rate": 4.169112083125703e-05, + "loss": 0.3312, + "step": 15025 + }, + { + "epoch": 0.75, + "grad_norm": 1.0116759538650513, + "learning_rate": 4.166805086466235e-05, + "loss": 0.5331, + "step": 15050 + }, + { + "epoch": 0.75, + "grad_norm": 0.6311452388763428, + "learning_rate": 4.164498089806766e-05, + "loss": 0.4219, + "step": 15075 + }, + { + "epoch": 0.75, + "grad_norm": 2.8859007358551025, + "learning_rate": 4.162191093147297e-05, + "loss": 0.5315, + "step": 15100 + }, + { + "epoch": 0.75, + "grad_norm": 12.960403442382812, + "learning_rate": 4.1598840964878285e-05, + "loss": 0.5065, + "step": 15125 + }, + { + "epoch": 0.75, + "grad_norm": 65.0334701538086, + "learning_rate": 4.1575770998283596e-05, + "loss": 0.3939, + "step": 15150 + }, + { + "epoch": 0.76, + "grad_norm": 0.8992050886154175, + "learning_rate": 4.155270103168891e-05, + "loss": 0.5015, + "step": 15175 + }, + { + "epoch": 0.76, + "grad_norm": 6.5797834396362305, + "learning_rate": 4.152963106509422e-05, + "loss": 0.4902, + "step": 15200 + }, + { + "epoch": 0.76, + "grad_norm": 90.81952667236328, + "learning_rate": 4.150656109849953e-05, + "loss": 0.6276, + "step": 15225 + }, + { + "epoch": 0.76, + "grad_norm": 12.694025993347168, + "learning_rate": 4.148349113190484e-05, + "loss": 0.521, + "step": 15250 + }, + { + "epoch": 0.76, + "grad_norm": 3.315805673599243, + "learning_rate": 4.146042116531015e-05, + "loss": 0.3754, + "step": 15275 + }, + { + "epoch": 0.76, + "grad_norm": 6.213558197021484, + "learning_rate": 4.143735119871547e-05, + "loss": 0.4641, + "step": 15300 + }, + { + "epoch": 0.76, + "grad_norm": 5.649552345275879, + "learning_rate": 4.1414281232120776e-05, + "loss": 0.5123, + "step": 15325 + }, + { + "epoch": 0.76, + "grad_norm": 1.9310487508773804, + "learning_rate": 4.1391211265526094e-05, + "loss": 0.4957, + "step": 15350 + }, + { + "epoch": 0.77, + "grad_norm": 7.260229110717773, + "learning_rate": 4.13681412989314e-05, + "loss": 0.4961, + "step": 15375 + }, + { + "epoch": 0.77, + "grad_norm": 2.5693869590759277, + "learning_rate": 4.134507133233671e-05, + "loss": 0.4218, + "step": 15400 + }, + { + "epoch": 0.77, + "grad_norm": 0.8206908106803894, + "learning_rate": 4.132200136574203e-05, + "loss": 0.5148, + "step": 15425 + }, + { + "epoch": 0.77, + "grad_norm": 0.8234782814979553, + "learning_rate": 4.129893139914733e-05, + "loss": 0.5661, + "step": 15450 + }, + { + "epoch": 0.77, + "grad_norm": 3.116828680038452, + "learning_rate": 4.127586143255265e-05, + "loss": 0.4217, + "step": 15475 + }, + { + "epoch": 0.77, + "grad_norm": 94.01576232910156, + "learning_rate": 4.1252791465957955e-05, + "loss": 0.5409, + "step": 15500 + }, + { + "epoch": 0.77, + "grad_norm": 1.1207560300827026, + "learning_rate": 4.122972149936327e-05, + "loss": 0.5052, + "step": 15525 + }, + { + "epoch": 0.77, + "grad_norm": 0.757140576839447, + "learning_rate": 4.120665153276858e-05, + "loss": 0.4548, + "step": 15550 + }, + { + "epoch": 0.78, + "grad_norm": 6.338544845581055, + "learning_rate": 4.1183581566173896e-05, + "loss": 0.6972, + "step": 15575 + }, + { + "epoch": 0.78, + "grad_norm": 10.121586799621582, + "learning_rate": 4.116051159957921e-05, + "loss": 0.51, + "step": 15600 + }, + { + "epoch": 0.78, + "grad_norm": 2.8469491004943848, + "learning_rate": 4.113744163298452e-05, + "loss": 0.4761, + "step": 15625 + }, + { + "epoch": 0.78, + "grad_norm": 3.0769906044006348, + "learning_rate": 4.111437166638983e-05, + "loss": 0.4206, + "step": 15650 + }, + { + "epoch": 0.78, + "grad_norm": 4.13899040222168, + "learning_rate": 4.109130169979514e-05, + "loss": 0.6207, + "step": 15675 + }, + { + "epoch": 0.78, + "grad_norm": 1.6825227737426758, + "learning_rate": 4.106823173320045e-05, + "loss": 0.4976, + "step": 15700 + }, + { + "epoch": 0.78, + "grad_norm": 3.750791311264038, + "learning_rate": 4.1045161766605764e-05, + "loss": 0.6228, + "step": 15725 + }, + { + "epoch": 0.78, + "grad_norm": 1.2766634225845337, + "learning_rate": 4.1022091800011076e-05, + "loss": 0.4583, + "step": 15750 + }, + { + "epoch": 0.79, + "grad_norm": 23.536924362182617, + "learning_rate": 4.099902183341639e-05, + "loss": 0.5756, + "step": 15775 + }, + { + "epoch": 0.79, + "grad_norm": 262.17529296875, + "learning_rate": 4.09759518668217e-05, + "loss": 0.4269, + "step": 15800 + }, + { + "epoch": 0.79, + "grad_norm": 5.392236709594727, + "learning_rate": 4.095288190022701e-05, + "loss": 0.4978, + "step": 15825 + }, + { + "epoch": 0.79, + "grad_norm": 36.02292251586914, + "learning_rate": 4.092981193363232e-05, + "loss": 0.4801, + "step": 15850 + }, + { + "epoch": 0.79, + "grad_norm": 2.4638888835906982, + "learning_rate": 4.090674196703763e-05, + "loss": 0.4653, + "step": 15875 + }, + { + "epoch": 0.79, + "grad_norm": 1.074580430984497, + "learning_rate": 4.0883672000442944e-05, + "loss": 0.4546, + "step": 15900 + }, + { + "epoch": 0.79, + "grad_norm": 4.202010154724121, + "learning_rate": 4.0860602033848255e-05, + "loss": 0.5812, + "step": 15925 + }, + { + "epoch": 0.79, + "grad_norm": 25.171972274780273, + "learning_rate": 4.083753206725357e-05, + "loss": 0.511, + "step": 15950 + }, + { + "epoch": 0.8, + "grad_norm": 39.12727355957031, + "learning_rate": 4.081446210065888e-05, + "loss": 0.481, + "step": 15975 + }, + { + "epoch": 0.8, + "grad_norm": 1.353677749633789, + "learning_rate": 4.0791392134064196e-05, + "loss": 0.462, + "step": 16000 + }, + { + "epoch": 0.8, + "grad_norm": 39.51011657714844, + "learning_rate": 4.07683221674695e-05, + "loss": 0.4089, + "step": 16025 + }, + { + "epoch": 0.8, + "grad_norm": 11.377568244934082, + "learning_rate": 4.074525220087482e-05, + "loss": 0.4299, + "step": 16050 + }, + { + "epoch": 0.8, + "grad_norm": 4.243354797363281, + "learning_rate": 4.072218223428012e-05, + "loss": 0.4012, + "step": 16075 + }, + { + "epoch": 0.8, + "grad_norm": 1.647400975227356, + "learning_rate": 4.069911226768544e-05, + "loss": 0.4251, + "step": 16100 + }, + { + "epoch": 0.8, + "grad_norm": 1.1065517663955688, + "learning_rate": 4.067604230109075e-05, + "loss": 0.4315, + "step": 16125 + }, + { + "epoch": 0.8, + "grad_norm": 3.7527124881744385, + "learning_rate": 4.065297233449606e-05, + "loss": 0.466, + "step": 16150 + }, + { + "epoch": 0.81, + "grad_norm": 7.145634174346924, + "learning_rate": 4.0629902367901375e-05, + "loss": 0.4597, + "step": 16175 + }, + { + "epoch": 0.81, + "grad_norm": 3.0760304927825928, + "learning_rate": 4.060683240130668e-05, + "loss": 0.5086, + "step": 16200 + }, + { + "epoch": 0.81, + "grad_norm": 2.9849870204925537, + "learning_rate": 4.0583762434712e-05, + "loss": 0.5209, + "step": 16225 + }, + { + "epoch": 0.81, + "grad_norm": 24.513986587524414, + "learning_rate": 4.056069246811731e-05, + "loss": 0.4571, + "step": 16250 + }, + { + "epoch": 0.81, + "grad_norm": 2.497976541519165, + "learning_rate": 4.053762250152262e-05, + "loss": 0.4176, + "step": 16275 + }, + { + "epoch": 0.81, + "grad_norm": 2.574549674987793, + "learning_rate": 4.051455253492793e-05, + "loss": 0.3929, + "step": 16300 + }, + { + "epoch": 0.81, + "grad_norm": 0.813434898853302, + "learning_rate": 4.0491482568333244e-05, + "loss": 0.4107, + "step": 16325 + }, + { + "epoch": 0.81, + "grad_norm": 3.409647226333618, + "learning_rate": 4.0468412601738555e-05, + "loss": 0.5186, + "step": 16350 + }, + { + "epoch": 0.82, + "grad_norm": 0.8358980417251587, + "learning_rate": 4.0445342635143866e-05, + "loss": 0.4876, + "step": 16375 + }, + { + "epoch": 0.82, + "grad_norm": 1.4873727560043335, + "learning_rate": 4.042227266854918e-05, + "loss": 0.5257, + "step": 16400 + }, + { + "epoch": 0.82, + "grad_norm": 13.529160499572754, + "learning_rate": 4.039920270195449e-05, + "loss": 0.4294, + "step": 16425 + }, + { + "epoch": 0.82, + "grad_norm": 1.2024579048156738, + "learning_rate": 4.03761327353598e-05, + "loss": 0.5386, + "step": 16450 + }, + { + "epoch": 0.82, + "grad_norm": 2.024953842163086, + "learning_rate": 4.035306276876512e-05, + "loss": 0.5225, + "step": 16475 + }, + { + "epoch": 0.82, + "grad_norm": 9.599167823791504, + "learning_rate": 4.032999280217042e-05, + "loss": 0.4629, + "step": 16500 + }, + { + "epoch": 0.82, + "grad_norm": 2.511319160461426, + "learning_rate": 4.0306922835575734e-05, + "loss": 0.6207, + "step": 16525 + }, + { + "epoch": 0.82, + "grad_norm": 2.126314163208008, + "learning_rate": 4.0283852868981046e-05, + "loss": 0.4982, + "step": 16550 + }, + { + "epoch": 0.83, + "grad_norm": 2.2506449222564697, + "learning_rate": 4.026078290238636e-05, + "loss": 0.3821, + "step": 16575 + }, + { + "epoch": 0.83, + "grad_norm": 7.21665620803833, + "learning_rate": 4.023771293579167e-05, + "loss": 0.5274, + "step": 16600 + }, + { + "epoch": 0.83, + "grad_norm": 1.9467825889587402, + "learning_rate": 4.021464296919698e-05, + "loss": 0.6044, + "step": 16625 + }, + { + "epoch": 0.83, + "grad_norm": 1.0800065994262695, + "learning_rate": 4.01915730026023e-05, + "loss": 0.3538, + "step": 16650 + }, + { + "epoch": 0.83, + "grad_norm": 18.513242721557617, + "learning_rate": 4.01685030360076e-05, + "loss": 0.5133, + "step": 16675 + }, + { + "epoch": 0.83, + "grad_norm": 1.1052954196929932, + "learning_rate": 4.014543306941292e-05, + "loss": 0.5397, + "step": 16700 + }, + { + "epoch": 0.83, + "grad_norm": 5.234750270843506, + "learning_rate": 4.0122363102818225e-05, + "loss": 0.5056, + "step": 16725 + }, + { + "epoch": 0.83, + "grad_norm": 3.360717296600342, + "learning_rate": 4.0099293136223543e-05, + "loss": 0.4306, + "step": 16750 + }, + { + "epoch": 0.84, + "grad_norm": 10.082225799560547, + "learning_rate": 4.0076223169628855e-05, + "loss": 0.4634, + "step": 16775 + }, + { + "epoch": 0.84, + "grad_norm": 5.244103908538818, + "learning_rate": 4.0053153203034166e-05, + "loss": 0.4331, + "step": 16800 + }, + { + "epoch": 0.84, + "grad_norm": 9.171998977661133, + "learning_rate": 4.003008323643948e-05, + "loss": 0.5922, + "step": 16825 + }, + { + "epoch": 0.84, + "grad_norm": 1.5789293050765991, + "learning_rate": 4.000701326984479e-05, + "loss": 0.4436, + "step": 16850 + }, + { + "epoch": 0.84, + "grad_norm": 13.523552894592285, + "learning_rate": 3.99839433032501e-05, + "loss": 0.5009, + "step": 16875 + }, + { + "epoch": 0.84, + "grad_norm": 31.91691780090332, + "learning_rate": 3.9960873336655405e-05, + "loss": 0.477, + "step": 16900 + }, + { + "epoch": 0.84, + "grad_norm": 3.873971939086914, + "learning_rate": 3.993780337006072e-05, + "loss": 0.4808, + "step": 16925 + }, + { + "epoch": 0.84, + "grad_norm": 1.8899236917495728, + "learning_rate": 3.9914733403466034e-05, + "loss": 0.4754, + "step": 16950 + }, + { + "epoch": 0.85, + "grad_norm": 5.686938285827637, + "learning_rate": 3.9891663436871346e-05, + "loss": 0.4095, + "step": 16975 + }, + { + "epoch": 0.85, + "grad_norm": 3.4337737560272217, + "learning_rate": 3.986859347027666e-05, + "loss": 0.534, + "step": 17000 + }, + { + "epoch": 0.85, + "grad_norm": 1.897147536277771, + "learning_rate": 3.984552350368197e-05, + "loss": 0.4931, + "step": 17025 + }, + { + "epoch": 0.85, + "grad_norm": 5.539985179901123, + "learning_rate": 3.982245353708728e-05, + "loss": 0.4568, + "step": 17050 + }, + { + "epoch": 0.85, + "grad_norm": 1.8197060823440552, + "learning_rate": 3.979938357049259e-05, + "loss": 0.4894, + "step": 17075 + }, + { + "epoch": 0.85, + "grad_norm": 2.8475852012634277, + "learning_rate": 3.97763136038979e-05, + "loss": 0.6445, + "step": 17100 + }, + { + "epoch": 0.85, + "grad_norm": 18.02142906188965, + "learning_rate": 3.9753243637303214e-05, + "loss": 0.4647, + "step": 17125 + }, + { + "epoch": 0.85, + "grad_norm": 2.6763222217559814, + "learning_rate": 3.9730173670708525e-05, + "loss": 0.4201, + "step": 17150 + }, + { + "epoch": 0.86, + "grad_norm": 2.0824146270751953, + "learning_rate": 3.970710370411384e-05, + "loss": 0.4473, + "step": 17175 + }, + { + "epoch": 0.86, + "grad_norm": 2.7089695930480957, + "learning_rate": 3.968403373751915e-05, + "loss": 0.4026, + "step": 17200 + }, + { + "epoch": 0.86, + "grad_norm": 8.540483474731445, + "learning_rate": 3.9660963770924466e-05, + "loss": 0.5615, + "step": 17225 + }, + { + "epoch": 0.86, + "grad_norm": 0.8385511040687561, + "learning_rate": 3.963789380432977e-05, + "loss": 0.5148, + "step": 17250 + }, + { + "epoch": 0.86, + "grad_norm": 7.858611583709717, + "learning_rate": 3.961482383773508e-05, + "loss": 0.6374, + "step": 17275 + }, + { + "epoch": 0.86, + "grad_norm": 1.7679189443588257, + "learning_rate": 3.95917538711404e-05, + "loss": 0.5359, + "step": 17300 + }, + { + "epoch": 0.86, + "grad_norm": 14.490900039672852, + "learning_rate": 3.9568683904545705e-05, + "loss": 1.2369, + "step": 17325 + }, + { + "epoch": 0.86, + "grad_norm": 14.354183197021484, + "learning_rate": 3.954561393795102e-05, + "loss": 0.823, + "step": 17350 + }, + { + "epoch": 0.87, + "grad_norm": 4.188384532928467, + "learning_rate": 3.952254397135633e-05, + "loss": 0.5664, + "step": 17375 + }, + { + "epoch": 0.87, + "grad_norm": 5.154536247253418, + "learning_rate": 3.9499474004761646e-05, + "loss": 0.5828, + "step": 17400 + }, + { + "epoch": 0.87, + "grad_norm": 4.994510173797607, + "learning_rate": 3.947640403816695e-05, + "loss": 0.4817, + "step": 17425 + }, + { + "epoch": 0.87, + "grad_norm": 138.496337890625, + "learning_rate": 3.945333407157227e-05, + "loss": 0.5828, + "step": 17450 + }, + { + "epoch": 0.87, + "grad_norm": 4.370205879211426, + "learning_rate": 3.943026410497758e-05, + "loss": 0.5262, + "step": 17475 + }, + { + "epoch": 0.87, + "grad_norm": 16.109251022338867, + "learning_rate": 3.940719413838289e-05, + "loss": 0.4149, + "step": 17500 + }, + { + "epoch": 0.87, + "grad_norm": 10.229596138000488, + "learning_rate": 3.93841241717882e-05, + "loss": 0.5403, + "step": 17525 + }, + { + "epoch": 0.87, + "grad_norm": 3.327038288116455, + "learning_rate": 3.9361054205193514e-05, + "loss": 0.6732, + "step": 17550 + }, + { + "epoch": 0.88, + "grad_norm": 4.285974502563477, + "learning_rate": 3.9337984238598825e-05, + "loss": 0.5621, + "step": 17575 + }, + { + "epoch": 0.88, + "grad_norm": 3.3494739532470703, + "learning_rate": 3.9314914272004136e-05, + "loss": 0.5229, + "step": 17600 + }, + { + "epoch": 0.88, + "grad_norm": 1.3700764179229736, + "learning_rate": 3.929184430540945e-05, + "loss": 0.5192, + "step": 17625 + }, + { + "epoch": 0.88, + "grad_norm": 2.759855031967163, + "learning_rate": 3.926877433881476e-05, + "loss": 0.4136, + "step": 17650 + }, + { + "epoch": 0.88, + "grad_norm": 3.0871315002441406, + "learning_rate": 3.924570437222007e-05, + "loss": 0.553, + "step": 17675 + }, + { + "epoch": 0.88, + "grad_norm": 93.34503936767578, + "learning_rate": 3.922263440562538e-05, + "loss": 0.4719, + "step": 17700 + }, + { + "epoch": 0.88, + "grad_norm": 3.604959726333618, + "learning_rate": 3.919956443903069e-05, + "loss": 0.5601, + "step": 17725 + }, + { + "epoch": 0.88, + "grad_norm": 38.621551513671875, + "learning_rate": 3.9176494472436005e-05, + "loss": 0.6033, + "step": 17750 + }, + { + "epoch": 0.89, + "grad_norm": 1.2768781185150146, + "learning_rate": 3.9153424505841316e-05, + "loss": 0.4388, + "step": 17775 + }, + { + "epoch": 0.89, + "grad_norm": 3.988166332244873, + "learning_rate": 3.913035453924663e-05, + "loss": 0.4533, + "step": 17800 + }, + { + "epoch": 0.89, + "grad_norm": 1.5384948253631592, + "learning_rate": 3.9107284572651945e-05, + "loss": 0.6735, + "step": 17825 + }, + { + "epoch": 0.89, + "grad_norm": 7.3451032638549805, + "learning_rate": 3.908421460605725e-05, + "loss": 0.5567, + "step": 17850 + }, + { + "epoch": 0.89, + "grad_norm": 2.0618393421173096, + "learning_rate": 3.906114463946257e-05, + "loss": 0.5047, + "step": 17875 + }, + { + "epoch": 0.89, + "grad_norm": 2.9151611328125, + "learning_rate": 3.903807467286787e-05, + "loss": 0.4682, + "step": 17900 + }, + { + "epoch": 0.89, + "grad_norm": 3.500939130783081, + "learning_rate": 3.901500470627319e-05, + "loss": 0.5246, + "step": 17925 + }, + { + "epoch": 0.89, + "grad_norm": 8.571981430053711, + "learning_rate": 3.8991934739678495e-05, + "loss": 0.5462, + "step": 17950 + }, + { + "epoch": 0.9, + "grad_norm": 39.09160614013672, + "learning_rate": 3.8968864773083814e-05, + "loss": 0.4873, + "step": 17975 + }, + { + "epoch": 0.9, + "grad_norm": 16.766748428344727, + "learning_rate": 3.8945794806489125e-05, + "loss": 0.4964, + "step": 18000 + }, + { + "epoch": 0.9, + "grad_norm": 18.48149299621582, + "learning_rate": 3.892272483989443e-05, + "loss": 0.4481, + "step": 18025 + }, + { + "epoch": 0.9, + "grad_norm": 1.355155110359192, + "learning_rate": 3.889965487329975e-05, + "loss": 0.4815, + "step": 18050 + }, + { + "epoch": 0.9, + "grad_norm": 35.24787139892578, + "learning_rate": 3.887658490670505e-05, + "loss": 0.4782, + "step": 18075 + }, + { + "epoch": 0.9, + "grad_norm": 3.5112314224243164, + "learning_rate": 3.885351494011037e-05, + "loss": 0.4727, + "step": 18100 + }, + { + "epoch": 0.9, + "grad_norm": 5.651079177856445, + "learning_rate": 3.8830444973515675e-05, + "loss": 0.5468, + "step": 18125 + }, + { + "epoch": 0.9, + "grad_norm": 3.5044541358947754, + "learning_rate": 3.880737500692099e-05, + "loss": 0.5182, + "step": 18150 + }, + { + "epoch": 0.91, + "grad_norm": 1.015687346458435, + "learning_rate": 3.8784305040326305e-05, + "loss": 0.3501, + "step": 18175 + }, + { + "epoch": 0.91, + "grad_norm": 2.154855728149414, + "learning_rate": 3.8761235073731616e-05, + "loss": 0.4771, + "step": 18200 + }, + { + "epoch": 0.91, + "grad_norm": 41.30935287475586, + "learning_rate": 3.873816510713693e-05, + "loss": 0.3895, + "step": 18225 + }, + { + "epoch": 0.91, + "grad_norm": 3.7652475833892822, + "learning_rate": 3.871509514054224e-05, + "loss": 0.522, + "step": 18250 + }, + { + "epoch": 0.91, + "grad_norm": 6.1274943351745605, + "learning_rate": 3.869202517394755e-05, + "loss": 0.5085, + "step": 18275 + }, + { + "epoch": 0.91, + "grad_norm": 5.170544624328613, + "learning_rate": 3.866895520735286e-05, + "loss": 0.4827, + "step": 18300 + }, + { + "epoch": 0.91, + "grad_norm": 1.431235671043396, + "learning_rate": 3.864588524075817e-05, + "loss": 0.4614, + "step": 18325 + }, + { + "epoch": 0.91, + "grad_norm": 0.8098027110099792, + "learning_rate": 3.862281527416349e-05, + "loss": 0.4926, + "step": 18350 + }, + { + "epoch": 0.92, + "grad_norm": 3.5741729736328125, + "learning_rate": 3.8599745307568795e-05, + "loss": 0.3676, + "step": 18375 + }, + { + "epoch": 0.92, + "grad_norm": 3.1784658432006836, + "learning_rate": 3.857667534097411e-05, + "loss": 0.6016, + "step": 18400 + }, + { + "epoch": 0.92, + "grad_norm": 4.085948944091797, + "learning_rate": 3.855360537437942e-05, + "loss": 0.5894, + "step": 18425 + }, + { + "epoch": 0.92, + "grad_norm": 4.510144233703613, + "learning_rate": 3.853053540778473e-05, + "loss": 0.6006, + "step": 18450 + }, + { + "epoch": 0.92, + "grad_norm": 2.849971055984497, + "learning_rate": 3.850746544119004e-05, + "loss": 0.5905, + "step": 18475 + }, + { + "epoch": 0.92, + "grad_norm": 53.399227142333984, + "learning_rate": 3.848439547459535e-05, + "loss": 0.511, + "step": 18500 + }, + { + "epoch": 0.92, + "grad_norm": 1.8678909540176392, + "learning_rate": 3.846132550800067e-05, + "loss": 0.5009, + "step": 18525 + }, + { + "epoch": 0.92, + "grad_norm": 4.071389198303223, + "learning_rate": 3.8438255541405975e-05, + "loss": 0.5701, + "step": 18550 + }, + { + "epoch": 0.93, + "grad_norm": 9.301192283630371, + "learning_rate": 3.841518557481129e-05, + "loss": 0.5051, + "step": 18575 + }, + { + "epoch": 0.93, + "grad_norm": 6.390158653259277, + "learning_rate": 3.83921156082166e-05, + "loss": 0.6236, + "step": 18600 + }, + { + "epoch": 0.93, + "grad_norm": 1.5711665153503418, + "learning_rate": 3.8369045641621916e-05, + "loss": 0.5415, + "step": 18625 + }, + { + "epoch": 0.93, + "grad_norm": 100.99102783203125, + "learning_rate": 3.834597567502722e-05, + "loss": 0.5929, + "step": 18650 + }, + { + "epoch": 0.93, + "grad_norm": 2.2588348388671875, + "learning_rate": 3.832290570843254e-05, + "loss": 0.4933, + "step": 18675 + }, + { + "epoch": 0.93, + "grad_norm": 12.144977569580078, + "learning_rate": 3.829983574183785e-05, + "loss": 0.5072, + "step": 18700 + }, + { + "epoch": 0.93, + "grad_norm": 16.152042388916016, + "learning_rate": 3.827676577524316e-05, + "loss": 0.5485, + "step": 18725 + }, + { + "epoch": 0.93, + "grad_norm": 1.2403534650802612, + "learning_rate": 3.825369580864847e-05, + "loss": 0.4722, + "step": 18750 + }, + { + "epoch": 0.94, + "grad_norm": 1.4384055137634277, + "learning_rate": 3.823062584205378e-05, + "loss": 0.5137, + "step": 18775 + }, + { + "epoch": 0.94, + "grad_norm": 40.7564697265625, + "learning_rate": 3.8207555875459095e-05, + "loss": 0.513, + "step": 18800 + }, + { + "epoch": 0.94, + "grad_norm": 71.0677490234375, + "learning_rate": 3.818448590886441e-05, + "loss": 0.4625, + "step": 18825 + }, + { + "epoch": 0.94, + "grad_norm": 1.8693602085113525, + "learning_rate": 3.816141594226972e-05, + "loss": 0.3826, + "step": 18850 + }, + { + "epoch": 0.94, + "grad_norm": 8.29848575592041, + "learning_rate": 3.813834597567503e-05, + "loss": 0.5599, + "step": 18875 + }, + { + "epoch": 0.94, + "grad_norm": 2.729771375656128, + "learning_rate": 3.811527600908034e-05, + "loss": 0.4568, + "step": 18900 + }, + { + "epoch": 0.94, + "grad_norm": 76.36131286621094, + "learning_rate": 3.809220604248565e-05, + "loss": 0.6015, + "step": 18925 + }, + { + "epoch": 0.94, + "grad_norm": 0.9493357539176941, + "learning_rate": 3.8069136075890963e-05, + "loss": 0.4385, + "step": 18950 + }, + { + "epoch": 0.95, + "grad_norm": 3.290364980697632, + "learning_rate": 3.8046066109296275e-05, + "loss": 0.465, + "step": 18975 + }, + { + "epoch": 0.95, + "grad_norm": 9.84317684173584, + "learning_rate": 3.8022996142701586e-05, + "loss": 0.5778, + "step": 19000 + }, + { + "epoch": 0.95, + "grad_norm": 1.622100830078125, + "learning_rate": 3.79999261761069e-05, + "loss": 0.4833, + "step": 19025 + }, + { + "epoch": 0.95, + "grad_norm": 4.288991928100586, + "learning_rate": 3.7976856209512216e-05, + "loss": 0.5373, + "step": 19050 + }, + { + "epoch": 0.95, + "grad_norm": 3.3950583934783936, + "learning_rate": 3.795378624291752e-05, + "loss": 0.3924, + "step": 19075 + }, + { + "epoch": 0.95, + "grad_norm": 28.451183319091797, + "learning_rate": 3.793071627632284e-05, + "loss": 0.4863, + "step": 19100 + }, + { + "epoch": 0.95, + "grad_norm": 6.996535301208496, + "learning_rate": 3.790764630972814e-05, + "loss": 0.6353, + "step": 19125 + }, + { + "epoch": 0.95, + "grad_norm": 3.060659885406494, + "learning_rate": 3.7884576343133454e-05, + "loss": 0.5, + "step": 19150 + }, + { + "epoch": 0.96, + "grad_norm": 14.792171478271484, + "learning_rate": 3.7861506376538766e-05, + "loss": 0.5163, + "step": 19175 + }, + { + "epoch": 0.96, + "grad_norm": 39.17354202270508, + "learning_rate": 3.783843640994408e-05, + "loss": 0.6595, + "step": 19200 + }, + { + "epoch": 0.96, + "grad_norm": 51.1176872253418, + "learning_rate": 3.7815366443349395e-05, + "loss": 0.5817, + "step": 19225 + }, + { + "epoch": 0.96, + "grad_norm": 1.3194643259048462, + "learning_rate": 3.77922964767547e-05, + "loss": 0.4769, + "step": 19250 + }, + { + "epoch": 0.96, + "grad_norm": 7.984396457672119, + "learning_rate": 3.776922651016002e-05, + "loss": 0.4827, + "step": 19275 + }, + { + "epoch": 0.96, + "grad_norm": 2.9708173274993896, + "learning_rate": 3.774615654356532e-05, + "loss": 0.4073, + "step": 19300 + }, + { + "epoch": 0.96, + "grad_norm": 5.974608421325684, + "learning_rate": 3.772308657697064e-05, + "loss": 0.5785, + "step": 19325 + }, + { + "epoch": 0.96, + "grad_norm": 26.67017364501953, + "learning_rate": 3.770001661037595e-05, + "loss": 0.5199, + "step": 19350 + }, + { + "epoch": 0.97, + "grad_norm": 69.32394409179688, + "learning_rate": 3.767694664378126e-05, + "loss": 0.5085, + "step": 19375 + }, + { + "epoch": 0.97, + "grad_norm": 16.79060935974121, + "learning_rate": 3.7653876677186575e-05, + "loss": 0.5602, + "step": 19400 + }, + { + "epoch": 0.97, + "grad_norm": 4.172871112823486, + "learning_rate": 3.7630806710591886e-05, + "loss": 0.3412, + "step": 19425 + }, + { + "epoch": 0.97, + "grad_norm": 4.1554975509643555, + "learning_rate": 3.76077367439972e-05, + "loss": 0.5426, + "step": 19450 + }, + { + "epoch": 0.97, + "grad_norm": 20.442163467407227, + "learning_rate": 3.758466677740251e-05, + "loss": 0.543, + "step": 19475 + }, + { + "epoch": 0.97, + "grad_norm": 0.9975307583808899, + "learning_rate": 3.756159681080782e-05, + "loss": 0.511, + "step": 19500 + }, + { + "epoch": 0.97, + "grad_norm": 6.242922782897949, + "learning_rate": 3.753852684421313e-05, + "loss": 0.5415, + "step": 19525 + }, + { + "epoch": 0.97, + "grad_norm": 5.49556827545166, + "learning_rate": 3.751545687761844e-05, + "loss": 0.3906, + "step": 19550 + }, + { + "epoch": 0.98, + "grad_norm": 1.480525255203247, + "learning_rate": 3.7492386911023754e-05, + "loss": 0.3721, + "step": 19575 + }, + { + "epoch": 0.98, + "grad_norm": 7.201631546020508, + "learning_rate": 3.7469316944429066e-05, + "loss": 0.6057, + "step": 19600 + }, + { + "epoch": 0.98, + "grad_norm": 3.685736894607544, + "learning_rate": 3.744624697783438e-05, + "loss": 0.5243, + "step": 19625 + }, + { + "epoch": 0.98, + "grad_norm": 8.694090843200684, + "learning_rate": 3.742317701123969e-05, + "loss": 0.4655, + "step": 19650 + }, + { + "epoch": 0.98, + "grad_norm": 3.63700795173645, + "learning_rate": 3.7400107044645e-05, + "loss": 0.7424, + "step": 19675 + }, + { + "epoch": 0.98, + "grad_norm": 3.734907627105713, + "learning_rate": 3.737703707805031e-05, + "loss": 0.516, + "step": 19700 + }, + { + "epoch": 0.98, + "grad_norm": 14.383971214294434, + "learning_rate": 3.735396711145562e-05, + "loss": 0.4659, + "step": 19725 + }, + { + "epoch": 0.98, + "grad_norm": 4.4923930168151855, + "learning_rate": 3.733089714486094e-05, + "loss": 0.4505, + "step": 19750 + }, + { + "epoch": 0.99, + "grad_norm": 2.624192714691162, + "learning_rate": 3.7307827178266245e-05, + "loss": 0.5293, + "step": 19775 + }, + { + "epoch": 0.99, + "grad_norm": 3.719897985458374, + "learning_rate": 3.728475721167156e-05, + "loss": 0.5854, + "step": 19800 + }, + { + "epoch": 0.99, + "grad_norm": 1.4075324535369873, + "learning_rate": 3.726168724507687e-05, + "loss": 0.5475, + "step": 19825 + }, + { + "epoch": 0.99, + "grad_norm": 87.31964874267578, + "learning_rate": 3.7238617278482186e-05, + "loss": 0.6098, + "step": 19850 + }, + { + "epoch": 0.99, + "grad_norm": 66.36661529541016, + "learning_rate": 3.72155473118875e-05, + "loss": 0.4597, + "step": 19875 + }, + { + "epoch": 0.99, + "grad_norm": 3.5049967765808105, + "learning_rate": 3.71924773452928e-05, + "loss": 0.5159, + "step": 19900 + }, + { + "epoch": 0.99, + "grad_norm": 1.5183825492858887, + "learning_rate": 3.716940737869812e-05, + "loss": 0.4156, + "step": 19925 + }, + { + "epoch": 0.99, + "grad_norm": 5.584221363067627, + "learning_rate": 3.7146337412103425e-05, + "loss": 0.4939, + "step": 19950 + }, + { + "epoch": 1.0, + "grad_norm": 32.5567512512207, + "learning_rate": 3.712326744550874e-05, + "loss": 0.6205, + "step": 19975 + }, + { + "epoch": 1.0, + "grad_norm": 4.678618907928467, + "learning_rate": 3.710019747891405e-05, + "loss": 0.554, + "step": 20000 + }, + { + "epoch": 1.0, + "grad_norm": 2.869126796722412, + "learning_rate": 3.7077127512319365e-05, + "loss": 0.5592, + "step": 20025 + }, + { + "epoch": 1.0, + "grad_norm": 4.049469470977783, + "learning_rate": 3.705405754572468e-05, + "loss": 0.4635, + "step": 20050 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.852423072131556, + "eval_f1_macro": 0.740011717394454, + "eval_f1_micro": 0.852423072131556, + "eval_f1_weighted": 0.8490416400281636, + "eval_loss": 0.46900421380996704, + "eval_precision_macro": 0.8007344689241704, + "eval_precision_micro": 0.852423072131556, + "eval_precision_weighted": 0.8510035524325744, + "eval_recall_macro": 0.7068611044442278, + "eval_recall_micro": 0.852423072131556, + "eval_recall_weighted": 0.852423072131556, + "eval_runtime": 8175.3699, + "eval_samples_per_second": 4.909, + "eval_steps_per_second": 0.307, + "step": 20068 + }, + { + "epoch": 1.0, + "grad_norm": 2.8573710918426514, + "learning_rate": 3.703098757912999e-05, + "loss": 0.6037, + "step": 20075 + }, + { + "epoch": 1.0, + "grad_norm": 1.9019801616668701, + "learning_rate": 3.70079176125353e-05, + "loss": 0.524, + "step": 20100 + }, + { + "epoch": 1.0, + "grad_norm": 43.846824645996094, + "learning_rate": 3.698484764594061e-05, + "loss": 0.4374, + "step": 20125 + }, + { + "epoch": 1.0, + "grad_norm": 5.801967144012451, + "learning_rate": 3.696177767934592e-05, + "loss": 0.5659, + "step": 20150 + }, + { + "epoch": 1.01, + "grad_norm": 41.790992736816406, + "learning_rate": 3.6938707712751234e-05, + "loss": 0.5, + "step": 20175 + }, + { + "epoch": 1.01, + "grad_norm": 1.0711537599563599, + "learning_rate": 3.6915637746156545e-05, + "loss": 0.4129, + "step": 20200 + }, + { + "epoch": 1.01, + "grad_norm": 1.601976990699768, + "learning_rate": 3.6892567779561856e-05, + "loss": 0.4549, + "step": 20225 + }, + { + "epoch": 1.01, + "grad_norm": 5.723679065704346, + "learning_rate": 3.686949781296717e-05, + "loss": 0.4861, + "step": 20250 + }, + { + "epoch": 1.01, + "grad_norm": 0.9833848476409912, + "learning_rate": 3.684642784637248e-05, + "loss": 0.4366, + "step": 20275 + }, + { + "epoch": 1.01, + "grad_norm": 2.463611125946045, + "learning_rate": 3.682335787977779e-05, + "loss": 0.4352, + "step": 20300 + }, + { + "epoch": 1.01, + "grad_norm": 3.361196994781494, + "learning_rate": 3.68002879131831e-05, + "loss": 0.4889, + "step": 20325 + }, + { + "epoch": 1.01, + "grad_norm": 3.9204843044281006, + "learning_rate": 3.677721794658841e-05, + "loss": 0.4408, + "step": 20350 + }, + { + "epoch": 1.02, + "grad_norm": 5.163851737976074, + "learning_rate": 3.6754147979993724e-05, + "loss": 0.4778, + "step": 20375 + }, + { + "epoch": 1.02, + "grad_norm": 1.4100154638290405, + "learning_rate": 3.673107801339904e-05, + "loss": 0.4129, + "step": 20400 + }, + { + "epoch": 1.02, + "grad_norm": 1.0082415342330933, + "learning_rate": 3.670800804680435e-05, + "loss": 0.3991, + "step": 20425 + }, + { + "epoch": 1.02, + "grad_norm": 3.754681348800659, + "learning_rate": 3.6684938080209665e-05, + "loss": 0.4297, + "step": 20450 + }, + { + "epoch": 1.02, + "grad_norm": 3.615309000015259, + "learning_rate": 3.666186811361497e-05, + "loss": 0.5264, + "step": 20475 + }, + { + "epoch": 1.02, + "grad_norm": 5.391434192657471, + "learning_rate": 3.663879814702029e-05, + "loss": 0.5161, + "step": 20500 + }, + { + "epoch": 1.02, + "grad_norm": 0.9345567226409912, + "learning_rate": 3.661572818042559e-05, + "loss": 0.4751, + "step": 20525 + }, + { + "epoch": 1.02, + "grad_norm": 83.7790298461914, + "learning_rate": 3.659265821383091e-05, + "loss": 0.5471, + "step": 20550 + }, + { + "epoch": 1.03, + "grad_norm": 5.505739212036133, + "learning_rate": 3.656958824723622e-05, + "loss": 0.442, + "step": 20575 + }, + { + "epoch": 1.03, + "grad_norm": 2.3913259506225586, + "learning_rate": 3.6546518280641534e-05, + "loss": 0.3406, + "step": 20600 + }, + { + "epoch": 1.03, + "grad_norm": 3.856517791748047, + "learning_rate": 3.6523448314046845e-05, + "loss": 0.4651, + "step": 20625 + }, + { + "epoch": 1.03, + "grad_norm": 36.363807678222656, + "learning_rate": 3.650037834745215e-05, + "loss": 0.3299, + "step": 20650 + }, + { + "epoch": 1.03, + "grad_norm": 4.59905481338501, + "learning_rate": 3.647730838085747e-05, + "loss": 0.4673, + "step": 20675 + }, + { + "epoch": 1.03, + "grad_norm": 2.024817705154419, + "learning_rate": 3.645423841426278e-05, + "loss": 0.6025, + "step": 20700 + }, + { + "epoch": 1.03, + "grad_norm": 0.8249455094337463, + "learning_rate": 3.643116844766809e-05, + "loss": 0.4823, + "step": 20725 + }, + { + "epoch": 1.03, + "grad_norm": 3.736368179321289, + "learning_rate": 3.64080984810734e-05, + "loss": 0.4693, + "step": 20750 + }, + { + "epoch": 1.04, + "grad_norm": 1.0846278667449951, + "learning_rate": 3.638502851447871e-05, + "loss": 0.5291, + "step": 20775 + }, + { + "epoch": 1.04, + "grad_norm": 3.0749387741088867, + "learning_rate": 3.6361958547884024e-05, + "loss": 0.4587, + "step": 20800 + }, + { + "epoch": 1.04, + "grad_norm": 1.6911934614181519, + "learning_rate": 3.6338888581289336e-05, + "loss": 0.4702, + "step": 20825 + }, + { + "epoch": 1.04, + "grad_norm": 1.1063579320907593, + "learning_rate": 3.631581861469465e-05, + "loss": 0.5319, + "step": 20850 + }, + { + "epoch": 1.04, + "grad_norm": 2.875474691390991, + "learning_rate": 3.629274864809996e-05, + "loss": 0.657, + "step": 20875 + }, + { + "epoch": 1.04, + "grad_norm": 10.830842018127441, + "learning_rate": 3.626967868150527e-05, + "loss": 0.4379, + "step": 20900 + }, + { + "epoch": 1.04, + "grad_norm": 5.642008304595947, + "learning_rate": 3.624660871491059e-05, + "loss": 0.6474, + "step": 20925 + }, + { + "epoch": 1.04, + "grad_norm": 24.012779235839844, + "learning_rate": 3.622353874831589e-05, + "loss": 0.5086, + "step": 20950 + }, + { + "epoch": 1.05, + "grad_norm": 1.3956573009490967, + "learning_rate": 3.620046878172121e-05, + "loss": 0.5026, + "step": 20975 + }, + { + "epoch": 1.05, + "grad_norm": 0.8949878215789795, + "learning_rate": 3.6177398815126515e-05, + "loss": 0.4177, + "step": 21000 + }, + { + "epoch": 1.05, + "grad_norm": 8.444169044494629, + "learning_rate": 3.615432884853183e-05, + "loss": 0.5521, + "step": 21025 + }, + { + "epoch": 1.05, + "grad_norm": 1.7907155752182007, + "learning_rate": 3.613125888193714e-05, + "loss": 0.4044, + "step": 21050 + }, + { + "epoch": 1.05, + "grad_norm": 3.08500599861145, + "learning_rate": 3.610818891534245e-05, + "loss": 0.4453, + "step": 21075 + }, + { + "epoch": 1.05, + "grad_norm": 4.027458667755127, + "learning_rate": 3.608511894874777e-05, + "loss": 0.4518, + "step": 21100 + }, + { + "epoch": 1.05, + "grad_norm": 3.554227590560913, + "learning_rate": 3.606204898215307e-05, + "loss": 0.3573, + "step": 21125 + }, + { + "epoch": 1.05, + "grad_norm": 1.3447580337524414, + "learning_rate": 3.603897901555839e-05, + "loss": 0.415, + "step": 21150 + }, + { + "epoch": 1.06, + "grad_norm": 4.002223014831543, + "learning_rate": 3.6015909048963695e-05, + "loss": 0.4981, + "step": 21175 + }, + { + "epoch": 1.06, + "grad_norm": 13.900261878967285, + "learning_rate": 3.599283908236901e-05, + "loss": 0.5881, + "step": 21200 + }, + { + "epoch": 1.06, + "grad_norm": 6.735679626464844, + "learning_rate": 3.5969769115774324e-05, + "loss": 0.4258, + "step": 21225 + }, + { + "epoch": 1.06, + "grad_norm": 3.4372642040252686, + "learning_rate": 3.5946699149179636e-05, + "loss": 0.4882, + "step": 21250 + }, + { + "epoch": 1.06, + "grad_norm": 7.4543890953063965, + "learning_rate": 3.592362918258495e-05, + "loss": 0.5204, + "step": 21275 + }, + { + "epoch": 1.06, + "grad_norm": 27.05744743347168, + "learning_rate": 3.590055921599026e-05, + "loss": 0.3268, + "step": 21300 + }, + { + "epoch": 1.06, + "grad_norm": 3.965268850326538, + "learning_rate": 3.587748924939557e-05, + "loss": 0.5526, + "step": 21325 + }, + { + "epoch": 1.06, + "grad_norm": 2.135265827178955, + "learning_rate": 3.585441928280088e-05, + "loss": 0.3802, + "step": 21350 + }, + { + "epoch": 1.07, + "grad_norm": 65.5523452758789, + "learning_rate": 3.583134931620619e-05, + "loss": 0.4903, + "step": 21375 + }, + { + "epoch": 1.07, + "grad_norm": 3.186577558517456, + "learning_rate": 3.5808279349611504e-05, + "loss": 0.5125, + "step": 21400 + }, + { + "epoch": 1.07, + "grad_norm": 54.349143981933594, + "learning_rate": 3.5785209383016815e-05, + "loss": 0.3316, + "step": 21425 + }, + { + "epoch": 1.07, + "grad_norm": 3.5075273513793945, + "learning_rate": 3.5762139416422127e-05, + "loss": 0.3988, + "step": 21450 + }, + { + "epoch": 1.07, + "grad_norm": 5.802154064178467, + "learning_rate": 3.573906944982744e-05, + "loss": 0.565, + "step": 21475 + }, + { + "epoch": 1.07, + "grad_norm": 5.111006259918213, + "learning_rate": 3.571599948323275e-05, + "loss": 0.5855, + "step": 21500 + }, + { + "epoch": 1.07, + "grad_norm": 6.996493339538574, + "learning_rate": 3.569292951663806e-05, + "loss": 0.4446, + "step": 21525 + }, + { + "epoch": 1.07, + "grad_norm": 16.910449981689453, + "learning_rate": 3.566985955004337e-05, + "loss": 0.4844, + "step": 21550 + }, + { + "epoch": 1.08, + "grad_norm": 6.590620994567871, + "learning_rate": 3.564678958344868e-05, + "loss": 0.5894, + "step": 21575 + }, + { + "epoch": 1.08, + "grad_norm": 3.3420209884643555, + "learning_rate": 3.5623719616853995e-05, + "loss": 0.398, + "step": 21600 + }, + { + "epoch": 1.08, + "grad_norm": 2.7199301719665527, + "learning_rate": 3.560064965025931e-05, + "loss": 0.5236, + "step": 21625 + }, + { + "epoch": 1.08, + "grad_norm": 4.216751575469971, + "learning_rate": 3.557757968366462e-05, + "loss": 0.3373, + "step": 21650 + }, + { + "epoch": 1.08, + "grad_norm": 28.876190185546875, + "learning_rate": 3.5554509717069936e-05, + "loss": 0.4816, + "step": 21675 + }, + { + "epoch": 1.08, + "grad_norm": 6.362488269805908, + "learning_rate": 3.553143975047524e-05, + "loss": 0.4663, + "step": 21700 + }, + { + "epoch": 1.08, + "grad_norm": 2.0933680534362793, + "learning_rate": 3.550836978388056e-05, + "loss": 0.5506, + "step": 21725 + }, + { + "epoch": 1.08, + "grad_norm": 6.475625514984131, + "learning_rate": 3.548529981728587e-05, + "loss": 0.4748, + "step": 21750 + }, + { + "epoch": 1.09, + "grad_norm": 6.892195701599121, + "learning_rate": 3.5462229850691174e-05, + "loss": 0.5035, + "step": 21775 + }, + { + "epoch": 1.09, + "grad_norm": 2.857177495956421, + "learning_rate": 3.543915988409649e-05, + "loss": 0.5092, + "step": 21800 + }, + { + "epoch": 1.09, + "grad_norm": 6.091822624206543, + "learning_rate": 3.54160899175018e-05, + "loss": 0.4765, + "step": 21825 + }, + { + "epoch": 1.09, + "grad_norm": 2.356370687484741, + "learning_rate": 3.5393019950907115e-05, + "loss": 0.484, + "step": 21850 + }, + { + "epoch": 1.09, + "grad_norm": 8.744856834411621, + "learning_rate": 3.536994998431242e-05, + "loss": 0.4418, + "step": 21875 + }, + { + "epoch": 1.09, + "grad_norm": 2.4514973163604736, + "learning_rate": 3.534688001771774e-05, + "loss": 0.4823, + "step": 21900 + }, + { + "epoch": 1.09, + "grad_norm": 1.6482782363891602, + "learning_rate": 3.532381005112305e-05, + "loss": 0.409, + "step": 21925 + }, + { + "epoch": 1.09, + "grad_norm": 6.030589580535889, + "learning_rate": 3.530074008452836e-05, + "loss": 0.4702, + "step": 21950 + }, + { + "epoch": 1.1, + "grad_norm": 3.8618252277374268, + "learning_rate": 3.527767011793367e-05, + "loss": 0.4308, + "step": 21975 + }, + { + "epoch": 1.1, + "grad_norm": 0.9432287812232971, + "learning_rate": 3.525460015133898e-05, + "loss": 0.4436, + "step": 22000 + }, + { + "epoch": 1.1, + "grad_norm": 47.525272369384766, + "learning_rate": 3.5231530184744295e-05, + "loss": 0.5559, + "step": 22025 + }, + { + "epoch": 1.1, + "grad_norm": 3.386293411254883, + "learning_rate": 3.5208460218149606e-05, + "loss": 0.4641, + "step": 22050 + }, + { + "epoch": 1.1, + "grad_norm": 6.038970947265625, + "learning_rate": 3.518539025155492e-05, + "loss": 0.4247, + "step": 22075 + }, + { + "epoch": 1.1, + "grad_norm": 8.06563663482666, + "learning_rate": 3.516232028496023e-05, + "loss": 0.4515, + "step": 22100 + }, + { + "epoch": 1.1, + "grad_norm": 5.882358551025391, + "learning_rate": 3.513925031836554e-05, + "loss": 0.5434, + "step": 22125 + }, + { + "epoch": 1.1, + "grad_norm": 18.23955726623535, + "learning_rate": 3.511618035177085e-05, + "loss": 0.4401, + "step": 22150 + }, + { + "epoch": 1.1, + "grad_norm": 16.53597068786621, + "learning_rate": 3.509311038517616e-05, + "loss": 0.5405, + "step": 22175 + }, + { + "epoch": 1.11, + "grad_norm": 6.848310947418213, + "learning_rate": 3.5070040418581474e-05, + "loss": 0.4074, + "step": 22200 + }, + { + "epoch": 1.11, + "grad_norm": 2.1513144969940186, + "learning_rate": 3.5046970451986785e-05, + "loss": 0.4812, + "step": 22225 + }, + { + "epoch": 1.11, + "grad_norm": 15.589600563049316, + "learning_rate": 3.50239004853921e-05, + "loss": 0.4576, + "step": 22250 + }, + { + "epoch": 1.11, + "grad_norm": 5.110909938812256, + "learning_rate": 3.5000830518797415e-05, + "loss": 0.5207, + "step": 22275 + }, + { + "epoch": 1.11, + "grad_norm": 1.8498058319091797, + "learning_rate": 3.497776055220272e-05, + "loss": 0.5286, + "step": 22300 + }, + { + "epoch": 1.11, + "grad_norm": 3.860257387161255, + "learning_rate": 3.495469058560804e-05, + "loss": 0.5053, + "step": 22325 + }, + { + "epoch": 1.11, + "grad_norm": 3.398019313812256, + "learning_rate": 3.493162061901334e-05, + "loss": 0.4769, + "step": 22350 + }, + { + "epoch": 1.11, + "grad_norm": 2.5294039249420166, + "learning_rate": 3.490855065241866e-05, + "loss": 0.4796, + "step": 22375 + }, + { + "epoch": 1.12, + "grad_norm": 3.215113639831543, + "learning_rate": 3.4885480685823965e-05, + "loss": 0.6287, + "step": 22400 + }, + { + "epoch": 1.12, + "grad_norm": 8.600980758666992, + "learning_rate": 3.486241071922928e-05, + "loss": 0.5, + "step": 22425 + }, + { + "epoch": 1.12, + "grad_norm": 3.411648988723755, + "learning_rate": 3.4839340752634594e-05, + "loss": 0.4635, + "step": 22450 + }, + { + "epoch": 1.12, + "grad_norm": 1.5979714393615723, + "learning_rate": 3.4816270786039906e-05, + "loss": 0.5175, + "step": 22475 + }, + { + "epoch": 1.12, + "grad_norm": 15.22208023071289, + "learning_rate": 3.479320081944522e-05, + "loss": 0.5597, + "step": 22500 + }, + { + "epoch": 1.12, + "grad_norm": 6.04987096786499, + "learning_rate": 3.477013085285052e-05, + "loss": 0.5439, + "step": 22525 + }, + { + "epoch": 1.12, + "grad_norm": 6.620021343231201, + "learning_rate": 3.474706088625584e-05, + "loss": 0.4821, + "step": 22550 + }, + { + "epoch": 1.12, + "grad_norm": 1.7440756559371948, + "learning_rate": 3.472399091966115e-05, + "loss": 0.4697, + "step": 22575 + }, + { + "epoch": 1.13, + "grad_norm": 122.33167266845703, + "learning_rate": 3.470092095306646e-05, + "loss": 0.5446, + "step": 22600 + }, + { + "epoch": 1.13, + "grad_norm": 1.3191596269607544, + "learning_rate": 3.4677850986471774e-05, + "loss": 0.4383, + "step": 22625 + }, + { + "epoch": 1.13, + "grad_norm": 2.9673454761505127, + "learning_rate": 3.4654781019877085e-05, + "loss": 0.6085, + "step": 22650 + }, + { + "epoch": 1.13, + "grad_norm": 2.2541136741638184, + "learning_rate": 3.46317110532824e-05, + "loss": 0.5107, + "step": 22675 + }, + { + "epoch": 1.13, + "grad_norm": 3.6315643787384033, + "learning_rate": 3.460864108668771e-05, + "loss": 0.4303, + "step": 22700 + }, + { + "epoch": 1.13, + "grad_norm": 9.80652141571045, + "learning_rate": 3.458557112009302e-05, + "loss": 0.5278, + "step": 22725 + }, + { + "epoch": 1.13, + "grad_norm": 306.9204406738281, + "learning_rate": 3.456250115349833e-05, + "loss": 0.5635, + "step": 22750 + }, + { + "epoch": 1.13, + "grad_norm": 7.718824863433838, + "learning_rate": 3.453943118690364e-05, + "loss": 0.5989, + "step": 22775 + }, + { + "epoch": 1.14, + "grad_norm": 6.900171279907227, + "learning_rate": 3.451636122030896e-05, + "loss": 0.5765, + "step": 22800 + }, + { + "epoch": 1.14, + "grad_norm": 2.9811198711395264, + "learning_rate": 3.4493291253714265e-05, + "loss": 0.6049, + "step": 22825 + }, + { + "epoch": 1.14, + "grad_norm": 3.2900378704071045, + "learning_rate": 3.447022128711958e-05, + "loss": 0.7042, + "step": 22850 + }, + { + "epoch": 1.14, + "grad_norm": 1.831827163696289, + "learning_rate": 3.444715132052489e-05, + "loss": 0.5314, + "step": 22875 + }, + { + "epoch": 1.14, + "grad_norm": 251.12879943847656, + "learning_rate": 3.44240813539302e-05, + "loss": 0.5267, + "step": 22900 + }, + { + "epoch": 1.14, + "grad_norm": 4.73264217376709, + "learning_rate": 3.440101138733551e-05, + "loss": 0.5608, + "step": 22925 + }, + { + "epoch": 1.14, + "grad_norm": 130.6006622314453, + "learning_rate": 3.437794142074082e-05, + "loss": 0.532, + "step": 22950 + }, + { + "epoch": 1.14, + "grad_norm": 1.9377081394195557, + "learning_rate": 3.435487145414614e-05, + "loss": 0.4259, + "step": 22975 + }, + { + "epoch": 1.15, + "grad_norm": 2.885848045349121, + "learning_rate": 3.4331801487551444e-05, + "loss": 0.5345, + "step": 23000 + }, + { + "epoch": 1.15, + "grad_norm": 2.7751710414886475, + "learning_rate": 3.430873152095676e-05, + "loss": 0.4862, + "step": 23025 + }, + { + "epoch": 1.15, + "grad_norm": 2.8456525802612305, + "learning_rate": 3.428566155436207e-05, + "loss": 0.5253, + "step": 23050 + }, + { + "epoch": 1.15, + "grad_norm": 3.7487411499023438, + "learning_rate": 3.4262591587767385e-05, + "loss": 0.5467, + "step": 23075 + }, + { + "epoch": 1.15, + "grad_norm": 1.3666393756866455, + "learning_rate": 3.4239521621172697e-05, + "loss": 0.4968, + "step": 23100 + }, + { + "epoch": 1.15, + "grad_norm": 1.7452878952026367, + "learning_rate": 3.421645165457801e-05, + "loss": 0.5518, + "step": 23125 + }, + { + "epoch": 1.15, + "grad_norm": 2.3537375926971436, + "learning_rate": 3.419338168798332e-05, + "loss": 0.4887, + "step": 23150 + }, + { + "epoch": 1.15, + "grad_norm": 3.2377073764801025, + "learning_rate": 3.417031172138863e-05, + "loss": 0.6111, + "step": 23175 + }, + { + "epoch": 1.16, + "grad_norm": 16.65485954284668, + "learning_rate": 3.414724175479394e-05, + "loss": 0.4488, + "step": 23200 + }, + { + "epoch": 1.16, + "grad_norm": 5.436980247497559, + "learning_rate": 3.412417178819925e-05, + "loss": 0.5589, + "step": 23225 + }, + { + "epoch": 1.16, + "grad_norm": 4.784524440765381, + "learning_rate": 3.4101101821604565e-05, + "loss": 0.5464, + "step": 23250 + }, + { + "epoch": 1.16, + "grad_norm": 1.517666220664978, + "learning_rate": 3.4078031855009876e-05, + "loss": 0.4781, + "step": 23275 + }, + { + "epoch": 1.16, + "grad_norm": 6.419158935546875, + "learning_rate": 3.405496188841519e-05, + "loss": 0.3523, + "step": 23300 + }, + { + "epoch": 1.16, + "grad_norm": 14.737177848815918, + "learning_rate": 3.40318919218205e-05, + "loss": 0.574, + "step": 23325 + }, + { + "epoch": 1.16, + "grad_norm": 1.729177713394165, + "learning_rate": 3.400882195522581e-05, + "loss": 0.5458, + "step": 23350 + }, + { + "epoch": 1.16, + "grad_norm": 1.4529409408569336, + "learning_rate": 3.398575198863112e-05, + "loss": 0.4995, + "step": 23375 + }, + { + "epoch": 1.17, + "grad_norm": 1.5424673557281494, + "learning_rate": 3.396268202203643e-05, + "loss": 0.4908, + "step": 23400 + }, + { + "epoch": 1.17, + "grad_norm": 3.3175251483917236, + "learning_rate": 3.3939612055441744e-05, + "loss": 0.5051, + "step": 23425 + }, + { + "epoch": 1.17, + "grad_norm": 6.083924770355225, + "learning_rate": 3.3916542088847056e-05, + "loss": 0.4548, + "step": 23450 + }, + { + "epoch": 1.17, + "grad_norm": 4.571666717529297, + "learning_rate": 3.389347212225237e-05, + "loss": 0.4662, + "step": 23475 + }, + { + "epoch": 1.17, + "grad_norm": 2.936976671218872, + "learning_rate": 3.3870402155657685e-05, + "loss": 0.4087, + "step": 23500 + }, + { + "epoch": 1.17, + "grad_norm": 6.462584495544434, + "learning_rate": 3.384733218906299e-05, + "loss": 0.4337, + "step": 23525 + }, + { + "epoch": 1.17, + "grad_norm": 0.8171650171279907, + "learning_rate": 3.382426222246831e-05, + "loss": 0.4279, + "step": 23550 + }, + { + "epoch": 1.17, + "grad_norm": 27.758996963500977, + "learning_rate": 3.380119225587361e-05, + "loss": 0.4673, + "step": 23575 + }, + { + "epoch": 1.18, + "grad_norm": 1.5759187936782837, + "learning_rate": 3.3778122289278924e-05, + "loss": 0.4649, + "step": 23600 + }, + { + "epoch": 1.18, + "grad_norm": 4.116755962371826, + "learning_rate": 3.375505232268424e-05, + "loss": 0.5568, + "step": 23625 + }, + { + "epoch": 1.18, + "grad_norm": 12.596151351928711, + "learning_rate": 3.3731982356089547e-05, + "loss": 0.6374, + "step": 23650 + }, + { + "epoch": 1.18, + "grad_norm": 2.9202468395233154, + "learning_rate": 3.3708912389494865e-05, + "loss": 0.4618, + "step": 23675 + }, + { + "epoch": 1.18, + "grad_norm": 2.892897367477417, + "learning_rate": 3.368584242290017e-05, + "loss": 0.385, + "step": 23700 + }, + { + "epoch": 1.18, + "grad_norm": 4.933587551116943, + "learning_rate": 3.366277245630549e-05, + "loss": 0.5328, + "step": 23725 + }, + { + "epoch": 1.18, + "grad_norm": 1.407938003540039, + "learning_rate": 3.363970248971079e-05, + "loss": 0.4736, + "step": 23750 + }, + { + "epoch": 1.18, + "grad_norm": 1.8879458904266357, + "learning_rate": 3.361663252311611e-05, + "loss": 0.582, + "step": 23775 + }, + { + "epoch": 1.19, + "grad_norm": 1.3088123798370361, + "learning_rate": 3.359356255652142e-05, + "loss": 0.4891, + "step": 23800 + }, + { + "epoch": 1.19, + "grad_norm": 2.8642852306365967, + "learning_rate": 3.357049258992673e-05, + "loss": 0.4502, + "step": 23825 + }, + { + "epoch": 1.19, + "grad_norm": 2.1138124465942383, + "learning_rate": 3.3547422623332044e-05, + "loss": 0.5602, + "step": 23850 + }, + { + "epoch": 1.19, + "grad_norm": 22.736108779907227, + "learning_rate": 3.3524352656737356e-05, + "loss": 0.4927, + "step": 23875 + }, + { + "epoch": 1.19, + "grad_norm": 19.614036560058594, + "learning_rate": 3.350128269014267e-05, + "loss": 0.4496, + "step": 23900 + }, + { + "epoch": 1.19, + "grad_norm": 6.80272912979126, + "learning_rate": 3.347821272354798e-05, + "loss": 0.5073, + "step": 23925 + }, + { + "epoch": 1.19, + "grad_norm": 1.0270265340805054, + "learning_rate": 3.345514275695329e-05, + "loss": 0.3707, + "step": 23950 + }, + { + "epoch": 1.19, + "grad_norm": 44.71577072143555, + "learning_rate": 3.34320727903586e-05, + "loss": 0.4172, + "step": 23975 + }, + { + "epoch": 1.2, + "grad_norm": 3.711918354034424, + "learning_rate": 3.340900282376391e-05, + "loss": 0.5641, + "step": 24000 + }, + { + "epoch": 1.2, + "grad_norm": 0.8723378777503967, + "learning_rate": 3.3385932857169224e-05, + "loss": 0.4355, + "step": 24025 + }, + { + "epoch": 1.2, + "grad_norm": 0.9061195850372314, + "learning_rate": 3.3362862890574535e-05, + "loss": 0.4956, + "step": 24050 + }, + { + "epoch": 1.2, + "grad_norm": 12.934722900390625, + "learning_rate": 3.3339792923979846e-05, + "loss": 0.3641, + "step": 24075 + }, + { + "epoch": 1.2, + "grad_norm": 3.357600450515747, + "learning_rate": 3.331672295738516e-05, + "loss": 0.4216, + "step": 24100 + }, + { + "epoch": 1.2, + "grad_norm": 2.2739009857177734, + "learning_rate": 3.329365299079047e-05, + "loss": 0.6057, + "step": 24125 + }, + { + "epoch": 1.2, + "grad_norm": 12.44058895111084, + "learning_rate": 3.327058302419579e-05, + "loss": 0.567, + "step": 24150 + }, + { + "epoch": 1.2, + "grad_norm": 6.135155200958252, + "learning_rate": 3.324751305760109e-05, + "loss": 0.4215, + "step": 24175 + }, + { + "epoch": 1.21, + "grad_norm": 18.950393676757812, + "learning_rate": 3.322444309100641e-05, + "loss": 0.5002, + "step": 24200 + }, + { + "epoch": 1.21, + "grad_norm": 18.473573684692383, + "learning_rate": 3.3201373124411715e-05, + "loss": 0.6292, + "step": 24225 + }, + { + "epoch": 1.21, + "grad_norm": 2.5561163425445557, + "learning_rate": 3.317830315781703e-05, + "loss": 0.4706, + "step": 24250 + }, + { + "epoch": 1.21, + "grad_norm": 1.382630467414856, + "learning_rate": 3.315523319122234e-05, + "loss": 0.295, + "step": 24275 + }, + { + "epoch": 1.21, + "grad_norm": 0.9985918998718262, + "learning_rate": 3.3132163224627655e-05, + "loss": 0.5048, + "step": 24300 + }, + { + "epoch": 1.21, + "grad_norm": 22.975221633911133, + "learning_rate": 3.310909325803297e-05, + "loss": 0.4621, + "step": 24325 + }, + { + "epoch": 1.21, + "grad_norm": 3.7666635513305664, + "learning_rate": 3.308602329143827e-05, + "loss": 0.4485, + "step": 24350 + }, + { + "epoch": 1.21, + "grad_norm": 3.244364023208618, + "learning_rate": 3.306295332484359e-05, + "loss": 0.4557, + "step": 24375 + }, + { + "epoch": 1.22, + "grad_norm": 1.085253119468689, + "learning_rate": 3.3039883358248894e-05, + "loss": 0.4611, + "step": 24400 + }, + { + "epoch": 1.22, + "grad_norm": 3.019808292388916, + "learning_rate": 3.301681339165421e-05, + "loss": 0.534, + "step": 24425 + }, + { + "epoch": 1.22, + "grad_norm": 3.1418190002441406, + "learning_rate": 3.299374342505952e-05, + "loss": 0.5937, + "step": 24450 + }, + { + "epoch": 1.22, + "grad_norm": 1.0198239088058472, + "learning_rate": 3.2970673458464835e-05, + "loss": 0.513, + "step": 24475 + }, + { + "epoch": 1.22, + "grad_norm": 3.4835383892059326, + "learning_rate": 3.2947603491870146e-05, + "loss": 0.3619, + "step": 24500 + }, + { + "epoch": 1.22, + "grad_norm": 2.089794635772705, + "learning_rate": 3.292453352527546e-05, + "loss": 0.5536, + "step": 24525 + }, + { + "epoch": 1.22, + "grad_norm": 1.106353998184204, + "learning_rate": 3.290146355868077e-05, + "loss": 0.4232, + "step": 24550 + }, + { + "epoch": 1.22, + "grad_norm": 3.080213785171509, + "learning_rate": 3.287839359208608e-05, + "loss": 0.5168, + "step": 24575 + }, + { + "epoch": 1.23, + "grad_norm": 5.004645824432373, + "learning_rate": 3.285532362549139e-05, + "loss": 0.6027, + "step": 24600 + }, + { + "epoch": 1.23, + "grad_norm": 4.234391689300537, + "learning_rate": 3.28322536588967e-05, + "loss": 0.3551, + "step": 24625 + }, + { + "epoch": 1.23, + "grad_norm": 66.61769104003906, + "learning_rate": 3.2809183692302014e-05, + "loss": 0.513, + "step": 24650 + }, + { + "epoch": 1.23, + "grad_norm": 1.416574239730835, + "learning_rate": 3.278611372570733e-05, + "loss": 0.3775, + "step": 24675 + }, + { + "epoch": 1.23, + "grad_norm": 6.356552600860596, + "learning_rate": 3.276304375911264e-05, + "loss": 0.4529, + "step": 24700 + }, + { + "epoch": 1.23, + "grad_norm": 1.3848146200180054, + "learning_rate": 3.273997379251795e-05, + "loss": 0.4677, + "step": 24725 + }, + { + "epoch": 1.23, + "grad_norm": 2.0824124813079834, + "learning_rate": 3.271690382592326e-05, + "loss": 0.5075, + "step": 24750 + }, + { + "epoch": 1.23, + "grad_norm": 1.3122801780700684, + "learning_rate": 3.269383385932857e-05, + "loss": 0.4526, + "step": 24775 + }, + { + "epoch": 1.24, + "grad_norm": 1.8558235168457031, + "learning_rate": 3.267076389273388e-05, + "loss": 0.5952, + "step": 24800 + }, + { + "epoch": 1.24, + "grad_norm": 1.6498969793319702, + "learning_rate": 3.2647693926139194e-05, + "loss": 0.5372, + "step": 24825 + }, + { + "epoch": 1.24, + "grad_norm": 1.6553959846496582, + "learning_rate": 3.262462395954451e-05, + "loss": 0.4644, + "step": 24850 + }, + { + "epoch": 1.24, + "grad_norm": 9.277535438537598, + "learning_rate": 3.260155399294982e-05, + "loss": 0.568, + "step": 24875 + }, + { + "epoch": 1.24, + "grad_norm": 1.7730066776275635, + "learning_rate": 3.2578484026355135e-05, + "loss": 0.3625, + "step": 24900 + }, + { + "epoch": 1.24, + "grad_norm": 1.2242295742034912, + "learning_rate": 3.255541405976044e-05, + "loss": 0.411, + "step": 24925 + }, + { + "epoch": 1.24, + "grad_norm": 3.2038660049438477, + "learning_rate": 3.253234409316576e-05, + "loss": 0.316, + "step": 24950 + }, + { + "epoch": 1.24, + "grad_norm": 6.638166427612305, + "learning_rate": 3.250927412657106e-05, + "loss": 0.5497, + "step": 24975 + }, + { + "epoch": 1.25, + "grad_norm": 0.8951555490493774, + "learning_rate": 3.248620415997638e-05, + "loss": 0.432, + "step": 25000 + }, + { + "epoch": 1.25, + "grad_norm": 3.3353822231292725, + "learning_rate": 3.246313419338169e-05, + "loss": 0.4795, + "step": 25025 + }, + { + "epoch": 1.25, + "grad_norm": 2.7513251304626465, + "learning_rate": 3.2440064226787e-05, + "loss": 0.5432, + "step": 25050 + }, + { + "epoch": 1.25, + "grad_norm": 0.9779685735702515, + "learning_rate": 3.2416994260192314e-05, + "loss": 0.5657, + "step": 25075 + }, + { + "epoch": 1.25, + "grad_norm": 4.009624004364014, + "learning_rate": 3.239392429359762e-05, + "loss": 0.4931, + "step": 25100 + }, + { + "epoch": 1.25, + "grad_norm": 1.116674542427063, + "learning_rate": 3.237085432700294e-05, + "loss": 0.5125, + "step": 25125 + }, + { + "epoch": 1.25, + "grad_norm": 5.3419976234436035, + "learning_rate": 3.234778436040825e-05, + "loss": 0.5456, + "step": 25150 + }, + { + "epoch": 1.25, + "grad_norm": 4.242710590362549, + "learning_rate": 3.232471439381356e-05, + "loss": 0.4269, + "step": 25175 + }, + { + "epoch": 1.26, + "grad_norm": 1.4048486948013306, + "learning_rate": 3.230164442721887e-05, + "loss": 0.348, + "step": 25200 + }, + { + "epoch": 1.26, + "grad_norm": 8.610922813415527, + "learning_rate": 3.227857446062418e-05, + "loss": 0.5558, + "step": 25225 + }, + { + "epoch": 1.26, + "grad_norm": 0.9171805381774902, + "learning_rate": 3.2255504494029494e-05, + "loss": 0.547, + "step": 25250 + }, + { + "epoch": 1.26, + "grad_norm": 5.653646945953369, + "learning_rate": 3.2232434527434805e-05, + "loss": 0.529, + "step": 25275 + }, + { + "epoch": 1.26, + "grad_norm": 4.338497638702393, + "learning_rate": 3.2209364560840117e-05, + "loss": 0.5316, + "step": 25300 + }, + { + "epoch": 1.26, + "grad_norm": 1.1895941495895386, + "learning_rate": 3.218629459424543e-05, + "loss": 0.3412, + "step": 25325 + }, + { + "epoch": 1.26, + "grad_norm": 3.513719081878662, + "learning_rate": 3.216322462765074e-05, + "loss": 0.5027, + "step": 25350 + }, + { + "epoch": 1.26, + "grad_norm": 3.2415273189544678, + "learning_rate": 3.214015466105606e-05, + "loss": 0.416, + "step": 25375 + }, + { + "epoch": 1.27, + "grad_norm": 3.46956205368042, + "learning_rate": 3.211708469446136e-05, + "loss": 0.5001, + "step": 25400 + }, + { + "epoch": 1.27, + "grad_norm": 5523.3828125, + "learning_rate": 3.209401472786668e-05, + "loss": 0.5623, + "step": 25425 + }, + { + "epoch": 1.27, + "grad_norm": 3.629899024963379, + "learning_rate": 3.2070944761271985e-05, + "loss": 0.4153, + "step": 25450 + }, + { + "epoch": 1.27, + "grad_norm": 11.945967674255371, + "learning_rate": 3.2047874794677296e-05, + "loss": 0.5597, + "step": 25475 + }, + { + "epoch": 1.27, + "grad_norm": 1.553520917892456, + "learning_rate": 3.202480482808261e-05, + "loss": 0.4993, + "step": 25500 + }, + { + "epoch": 1.27, + "grad_norm": 1.838877558708191, + "learning_rate": 3.200173486148792e-05, + "loss": 0.3805, + "step": 25525 + }, + { + "epoch": 1.27, + "grad_norm": 1.2891180515289307, + "learning_rate": 3.197866489489324e-05, + "loss": 0.6021, + "step": 25550 + }, + { + "epoch": 1.27, + "grad_norm": 1.362278699874878, + "learning_rate": 3.195559492829854e-05, + "loss": 0.4973, + "step": 25575 + }, + { + "epoch": 1.28, + "grad_norm": 50.55386734008789, + "learning_rate": 3.193252496170386e-05, + "loss": 0.403, + "step": 25600 + }, + { + "epoch": 1.28, + "grad_norm": 3.0399601459503174, + "learning_rate": 3.1909454995109164e-05, + "loss": 0.6989, + "step": 25625 + }, + { + "epoch": 1.28, + "grad_norm": 2.681670665740967, + "learning_rate": 3.188638502851448e-05, + "loss": 0.5274, + "step": 25650 + }, + { + "epoch": 1.28, + "grad_norm": 6.848381042480469, + "learning_rate": 3.1863315061919794e-05, + "loss": 0.3641, + "step": 25675 + }, + { + "epoch": 1.28, + "grad_norm": 1.4316284656524658, + "learning_rate": 3.1840245095325105e-05, + "loss": 0.4657, + "step": 25700 + }, + { + "epoch": 1.28, + "grad_norm": 1.1102467775344849, + "learning_rate": 3.1817175128730416e-05, + "loss": 0.4894, + "step": 25725 + }, + { + "epoch": 1.28, + "grad_norm": 8.56735610961914, + "learning_rate": 3.179410516213573e-05, + "loss": 0.5228, + "step": 25750 + }, + { + "epoch": 1.28, + "grad_norm": 2.549102544784546, + "learning_rate": 3.177103519554104e-05, + "loss": 0.4387, + "step": 25775 + }, + { + "epoch": 1.29, + "grad_norm": 8.965352058410645, + "learning_rate": 3.174796522894635e-05, + "loss": 0.6523, + "step": 25800 + }, + { + "epoch": 1.29, + "grad_norm": 3.4664347171783447, + "learning_rate": 3.172489526235166e-05, + "loss": 0.6127, + "step": 25825 + }, + { + "epoch": 1.29, + "grad_norm": 2.6698806285858154, + "learning_rate": 3.170182529575697e-05, + "loss": 0.4529, + "step": 25850 + }, + { + "epoch": 1.29, + "grad_norm": 6.293718338012695, + "learning_rate": 3.1678755329162285e-05, + "loss": 0.5818, + "step": 25875 + }, + { + "epoch": 1.29, + "grad_norm": 1.8514574766159058, + "learning_rate": 3.1655685362567596e-05, + "loss": 0.4983, + "step": 25900 + }, + { + "epoch": 1.29, + "grad_norm": 1.4830529689788818, + "learning_rate": 3.163261539597291e-05, + "loss": 0.5502, + "step": 25925 + }, + { + "epoch": 1.29, + "grad_norm": 1.423803687095642, + "learning_rate": 3.160954542937822e-05, + "loss": 0.442, + "step": 25950 + }, + { + "epoch": 1.29, + "grad_norm": 1.1572527885437012, + "learning_rate": 3.158647546278353e-05, + "loss": 0.5089, + "step": 25975 + }, + { + "epoch": 1.3, + "grad_norm": 2.6536920070648193, + "learning_rate": 3.156340549618884e-05, + "loss": 0.4971, + "step": 26000 + }, + { + "epoch": 1.3, + "grad_norm": 3.075524091720581, + "learning_rate": 3.154033552959415e-05, + "loss": 0.5367, + "step": 26025 + }, + { + "epoch": 1.3, + "grad_norm": 1.6290290355682373, + "learning_rate": 3.1517265562999464e-05, + "loss": 0.529, + "step": 26050 + }, + { + "epoch": 1.3, + "grad_norm": 2.615154266357422, + "learning_rate": 3.149419559640478e-05, + "loss": 0.5062, + "step": 26075 + }, + { + "epoch": 1.3, + "grad_norm": 4.603691101074219, + "learning_rate": 3.147112562981009e-05, + "loss": 0.443, + "step": 26100 + }, + { + "epoch": 1.3, + "grad_norm": 1.1716935634613037, + "learning_rate": 3.1448055663215405e-05, + "loss": 0.5058, + "step": 26125 + }, + { + "epoch": 1.3, + "grad_norm": 1.8815289735794067, + "learning_rate": 3.142498569662071e-05, + "loss": 0.3954, + "step": 26150 + }, + { + "epoch": 1.3, + "grad_norm": 3.429013252258301, + "learning_rate": 3.140191573002603e-05, + "loss": 0.538, + "step": 26175 + }, + { + "epoch": 1.31, + "grad_norm": 4.653341293334961, + "learning_rate": 3.137884576343134e-05, + "loss": 0.5478, + "step": 26200 + }, + { + "epoch": 1.31, + "grad_norm": 51.642330169677734, + "learning_rate": 3.1355775796836644e-05, + "loss": 0.5739, + "step": 26225 + }, + { + "epoch": 1.31, + "grad_norm": 3.7197000980377197, + "learning_rate": 3.133270583024196e-05, + "loss": 0.5179, + "step": 26250 + }, + { + "epoch": 1.31, + "grad_norm": 5.795961380004883, + "learning_rate": 3.1309635863647266e-05, + "loss": 0.4714, + "step": 26275 + }, + { + "epoch": 1.31, + "grad_norm": 4.7175703048706055, + "learning_rate": 3.1286565897052585e-05, + "loss": 0.4698, + "step": 26300 + }, + { + "epoch": 1.31, + "grad_norm": 2.4297192096710205, + "learning_rate": 3.126349593045789e-05, + "loss": 0.4828, + "step": 26325 + }, + { + "epoch": 1.31, + "grad_norm": 3.88303279876709, + "learning_rate": 3.124042596386321e-05, + "loss": 0.5602, + "step": 26350 + }, + { + "epoch": 1.31, + "grad_norm": 6.631831169128418, + "learning_rate": 3.121735599726852e-05, + "loss": 0.4625, + "step": 26375 + }, + { + "epoch": 1.32, + "grad_norm": 5.661457061767578, + "learning_rate": 3.119428603067383e-05, + "loss": 0.5065, + "step": 26400 + }, + { + "epoch": 1.32, + "grad_norm": 11.636333465576172, + "learning_rate": 3.117121606407914e-05, + "loss": 0.3525, + "step": 26425 + }, + { + "epoch": 1.32, + "grad_norm": 1.0581868886947632, + "learning_rate": 3.114814609748445e-05, + "loss": 0.4466, + "step": 26450 + }, + { + "epoch": 1.32, + "grad_norm": 3.096194267272949, + "learning_rate": 3.1125076130889764e-05, + "loss": 0.4712, + "step": 26475 + }, + { + "epoch": 1.32, + "grad_norm": 6.5298075675964355, + "learning_rate": 3.1102006164295075e-05, + "loss": 0.5473, + "step": 26500 + }, + { + "epoch": 1.32, + "grad_norm": 3.341557025909424, + "learning_rate": 3.107893619770039e-05, + "loss": 0.5103, + "step": 26525 + }, + { + "epoch": 1.32, + "grad_norm": 7.310173988342285, + "learning_rate": 3.10558662311057e-05, + "loss": 0.4153, + "step": 26550 + }, + { + "epoch": 1.32, + "grad_norm": 1.1792340278625488, + "learning_rate": 3.103279626451101e-05, + "loss": 0.5409, + "step": 26575 + }, + { + "epoch": 1.33, + "grad_norm": 14.870302200317383, + "learning_rate": 3.100972629791632e-05, + "loss": 0.4116, + "step": 26600 + }, + { + "epoch": 1.33, + "grad_norm": 1.2999331951141357, + "learning_rate": 3.098665633132163e-05, + "loss": 0.4566, + "step": 26625 + }, + { + "epoch": 1.33, + "grad_norm": 6.613809585571289, + "learning_rate": 3.0963586364726944e-05, + "loss": 0.534, + "step": 26650 + }, + { + "epoch": 1.33, + "grad_norm": 2.9065253734588623, + "learning_rate": 3.0940516398132255e-05, + "loss": 0.5131, + "step": 26675 + }, + { + "epoch": 1.33, + "grad_norm": 2.785900115966797, + "learning_rate": 3.0917446431537566e-05, + "loss": 0.442, + "step": 26700 + }, + { + "epoch": 1.33, + "grad_norm": 14.88037109375, + "learning_rate": 3.0894376464942884e-05, + "loss": 0.4574, + "step": 26725 + }, + { + "epoch": 1.33, + "grad_norm": 4.1157636642456055, + "learning_rate": 3.087130649834819e-05, + "loss": 0.5106, + "step": 26750 + }, + { + "epoch": 1.33, + "grad_norm": 2.2302122116088867, + "learning_rate": 3.084823653175351e-05, + "loss": 0.4049, + "step": 26775 + }, + { + "epoch": 1.34, + "grad_norm": 4.412403106689453, + "learning_rate": 3.082516656515881e-05, + "loss": 0.4754, + "step": 26800 + }, + { + "epoch": 1.34, + "grad_norm": 5.906436443328857, + "learning_rate": 3.080209659856413e-05, + "loss": 0.537, + "step": 26825 + }, + { + "epoch": 1.34, + "grad_norm": 5.8190836906433105, + "learning_rate": 3.0779026631969434e-05, + "loss": 0.5667, + "step": 26850 + }, + { + "epoch": 1.34, + "grad_norm": 2.78393292427063, + "learning_rate": 3.075595666537475e-05, + "loss": 0.4906, + "step": 26875 + }, + { + "epoch": 1.34, + "grad_norm": 3.552785873413086, + "learning_rate": 3.0732886698780064e-05, + "loss": 0.4337, + "step": 26900 + }, + { + "epoch": 1.34, + "grad_norm": 4.454800128936768, + "learning_rate": 3.0709816732185375e-05, + "loss": 0.4113, + "step": 26925 + }, + { + "epoch": 1.34, + "grad_norm": 4.132376670837402, + "learning_rate": 3.068674676559069e-05, + "loss": 0.4546, + "step": 26950 + }, + { + "epoch": 1.34, + "grad_norm": 5.598781108856201, + "learning_rate": 3.066367679899599e-05, + "loss": 0.4533, + "step": 26975 + }, + { + "epoch": 1.35, + "grad_norm": 19.695556640625, + "learning_rate": 3.064060683240131e-05, + "loss": 0.5621, + "step": 27000 + }, + { + "epoch": 1.35, + "grad_norm": 1.143239140510559, + "learning_rate": 3.061753686580662e-05, + "loss": 0.5251, + "step": 27025 + }, + { + "epoch": 1.35, + "grad_norm": 1.0635647773742676, + "learning_rate": 3.059446689921193e-05, + "loss": 0.4429, + "step": 27050 + }, + { + "epoch": 1.35, + "grad_norm": 43.61861801147461, + "learning_rate": 3.0571396932617243e-05, + "loss": 0.4055, + "step": 27075 + }, + { + "epoch": 1.35, + "grad_norm": 6.694369316101074, + "learning_rate": 3.0548326966022555e-05, + "loss": 0.4343, + "step": 27100 + }, + { + "epoch": 1.35, + "grad_norm": 3.3620946407318115, + "learning_rate": 3.0525256999427866e-05, + "loss": 0.5503, + "step": 27125 + }, + { + "epoch": 1.35, + "grad_norm": 1.006640911102295, + "learning_rate": 3.050218703283318e-05, + "loss": 0.5391, + "step": 27150 + }, + { + "epoch": 1.35, + "grad_norm": 3.3535425662994385, + "learning_rate": 3.047911706623849e-05, + "loss": 0.6485, + "step": 27175 + }, + { + "epoch": 1.36, + "grad_norm": 1.531585931777954, + "learning_rate": 3.0456047099643804e-05, + "loss": 0.4886, + "step": 27200 + }, + { + "epoch": 1.36, + "grad_norm": 5.89702844619751, + "learning_rate": 3.043297713304911e-05, + "loss": 0.6445, + "step": 27225 + }, + { + "epoch": 1.36, + "grad_norm": 2.3690192699432373, + "learning_rate": 3.0409907166454426e-05, + "loss": 0.4703, + "step": 27250 + }, + { + "epoch": 1.36, + "grad_norm": 2.31122088432312, + "learning_rate": 3.0386837199859734e-05, + "loss": 0.5204, + "step": 27275 + }, + { + "epoch": 1.36, + "grad_norm": 1.4227750301361084, + "learning_rate": 3.036376723326505e-05, + "loss": 0.464, + "step": 27300 + }, + { + "epoch": 1.36, + "grad_norm": 1.3377498388290405, + "learning_rate": 3.034069726667036e-05, + "loss": 0.4762, + "step": 27325 + }, + { + "epoch": 1.36, + "grad_norm": 4.721393585205078, + "learning_rate": 3.031762730007567e-05, + "loss": 0.4241, + "step": 27350 + }, + { + "epoch": 1.36, + "grad_norm": 1.1271649599075317, + "learning_rate": 3.0294557333480983e-05, + "loss": 0.3842, + "step": 27375 + }, + { + "epoch": 1.37, + "grad_norm": 49.33847427368164, + "learning_rate": 3.027148736688629e-05, + "loss": 0.4582, + "step": 27400 + }, + { + "epoch": 1.37, + "grad_norm": 6.304983615875244, + "learning_rate": 3.0248417400291606e-05, + "loss": 0.447, + "step": 27425 + }, + { + "epoch": 1.37, + "grad_norm": 20.872663497924805, + "learning_rate": 3.0225347433696917e-05, + "loss": 0.461, + "step": 27450 + }, + { + "epoch": 1.37, + "grad_norm": 0.8352044224739075, + "learning_rate": 3.020227746710223e-05, + "loss": 0.4513, + "step": 27475 + }, + { + "epoch": 1.37, + "grad_norm": 3.6428568363189697, + "learning_rate": 3.017920750050754e-05, + "loss": 0.4375, + "step": 27500 + }, + { + "epoch": 1.37, + "grad_norm": 0.8028072714805603, + "learning_rate": 3.0156137533912855e-05, + "loss": 0.3919, + "step": 27525 + }, + { + "epoch": 1.37, + "grad_norm": 3.726179838180542, + "learning_rate": 3.0133067567318163e-05, + "loss": 0.5349, + "step": 27550 + }, + { + "epoch": 1.37, + "grad_norm": 2.6823785305023193, + "learning_rate": 3.0109997600723477e-05, + "loss": 0.6074, + "step": 27575 + }, + { + "epoch": 1.38, + "grad_norm": 6.498818397521973, + "learning_rate": 3.0086927634128785e-05, + "loss": 0.3749, + "step": 27600 + }, + { + "epoch": 1.38, + "grad_norm": 4.5009026527404785, + "learning_rate": 3.00638576675341e-05, + "loss": 0.4709, + "step": 27625 + }, + { + "epoch": 1.38, + "grad_norm": 64.75154113769531, + "learning_rate": 3.004078770093941e-05, + "loss": 0.3149, + "step": 27650 + }, + { + "epoch": 1.38, + "grad_norm": 3.188356637954712, + "learning_rate": 3.0017717734344726e-05, + "loss": 0.4817, + "step": 27675 + }, + { + "epoch": 1.38, + "grad_norm": 1.605098009109497, + "learning_rate": 2.9994647767750034e-05, + "loss": 0.532, + "step": 27700 + }, + { + "epoch": 1.38, + "grad_norm": 3.195401191711426, + "learning_rate": 2.9971577801155342e-05, + "loss": 0.3489, + "step": 27725 + }, + { + "epoch": 1.38, + "grad_norm": 1.9191489219665527, + "learning_rate": 2.9948507834560657e-05, + "loss": 0.4145, + "step": 27750 + }, + { + "epoch": 1.38, + "grad_norm": 0.909359335899353, + "learning_rate": 2.9925437867965965e-05, + "loss": 0.4814, + "step": 27775 + }, + { + "epoch": 1.39, + "grad_norm": 3497.900634765625, + "learning_rate": 2.990236790137128e-05, + "loss": 0.6498, + "step": 27800 + }, + { + "epoch": 1.39, + "grad_norm": 3.320150375366211, + "learning_rate": 2.987929793477659e-05, + "loss": 0.3288, + "step": 27825 + }, + { + "epoch": 1.39, + "grad_norm": 20.31009292602539, + "learning_rate": 2.9856227968181906e-05, + "loss": 0.5846, + "step": 27850 + }, + { + "epoch": 1.39, + "grad_norm": 11.665755271911621, + "learning_rate": 2.9833158001587214e-05, + "loss": 0.4474, + "step": 27875 + }, + { + "epoch": 1.39, + "grad_norm": 3.6207375526428223, + "learning_rate": 2.981008803499253e-05, + "loss": 0.6815, + "step": 27900 + }, + { + "epoch": 1.39, + "grad_norm": 5.2110419273376465, + "learning_rate": 2.9787018068397836e-05, + "loss": 0.5543, + "step": 27925 + }, + { + "epoch": 1.39, + "grad_norm": 1.0339158773422241, + "learning_rate": 2.976394810180315e-05, + "loss": 0.4515, + "step": 27950 + }, + { + "epoch": 1.39, + "grad_norm": 3.7517521381378174, + "learning_rate": 2.9740878135208463e-05, + "loss": 0.5052, + "step": 27975 + }, + { + "epoch": 1.4, + "grad_norm": 4.305375576019287, + "learning_rate": 2.9717808168613774e-05, + "loss": 0.5438, + "step": 28000 + }, + { + "epoch": 1.4, + "grad_norm": 2.0723867416381836, + "learning_rate": 2.9694738202019085e-05, + "loss": 0.6635, + "step": 28025 + }, + { + "epoch": 1.4, + "grad_norm": 1.6509737968444824, + "learning_rate": 2.96716682354244e-05, + "loss": 0.4156, + "step": 28050 + }, + { + "epoch": 1.4, + "grad_norm": 3.1618893146514893, + "learning_rate": 2.9648598268829708e-05, + "loss": 0.4622, + "step": 28075 + }, + { + "epoch": 1.4, + "grad_norm": 6.950445175170898, + "learning_rate": 2.9625528302235016e-05, + "loss": 0.4638, + "step": 28100 + }, + { + "epoch": 1.4, + "grad_norm": 2.7668297290802, + "learning_rate": 2.960245833564033e-05, + "loss": 0.3667, + "step": 28125 + }, + { + "epoch": 1.4, + "grad_norm": 1.7094498872756958, + "learning_rate": 2.9579388369045642e-05, + "loss": 0.5978, + "step": 28150 + }, + { + "epoch": 1.4, + "grad_norm": 5.577686786651611, + "learning_rate": 2.9556318402450957e-05, + "loss": 0.5579, + "step": 28175 + }, + { + "epoch": 1.41, + "grad_norm": 2.0594027042388916, + "learning_rate": 2.9533248435856265e-05, + "loss": 0.4377, + "step": 28200 + }, + { + "epoch": 1.41, + "grad_norm": 3.223324775695801, + "learning_rate": 2.951017846926158e-05, + "loss": 0.4961, + "step": 28225 + }, + { + "epoch": 1.41, + "grad_norm": 3.0184011459350586, + "learning_rate": 2.9487108502666888e-05, + "loss": 0.4569, + "step": 28250 + }, + { + "epoch": 1.41, + "grad_norm": 4.878507614135742, + "learning_rate": 2.9464038536072202e-05, + "loss": 0.6598, + "step": 28275 + }, + { + "epoch": 1.41, + "grad_norm": 2.9261884689331055, + "learning_rate": 2.944096856947751e-05, + "loss": 0.5696, + "step": 28300 + }, + { + "epoch": 1.41, + "grad_norm": 1.7874714136123657, + "learning_rate": 2.9417898602882825e-05, + "loss": 0.6261, + "step": 28325 + }, + { + "epoch": 1.41, + "grad_norm": 2.966109275817871, + "learning_rate": 2.9394828636288136e-05, + "loss": 0.4914, + "step": 28350 + }, + { + "epoch": 1.41, + "grad_norm": 2.941281795501709, + "learning_rate": 2.937175866969345e-05, + "loss": 0.451, + "step": 28375 + }, + { + "epoch": 1.42, + "grad_norm": 3.6125543117523193, + "learning_rate": 2.934868870309876e-05, + "loss": 0.5107, + "step": 28400 + }, + { + "epoch": 1.42, + "grad_norm": 4.665797710418701, + "learning_rate": 2.9325618736504074e-05, + "loss": 0.6126, + "step": 28425 + }, + { + "epoch": 1.42, + "grad_norm": 1.2434524297714233, + "learning_rate": 2.9302548769909382e-05, + "loss": 0.4911, + "step": 28450 + }, + { + "epoch": 1.42, + "grad_norm": 93.26859283447266, + "learning_rate": 2.9279478803314693e-05, + "loss": 0.4562, + "step": 28475 + }, + { + "epoch": 1.42, + "grad_norm": 5.395887851715088, + "learning_rate": 2.9256408836720008e-05, + "loss": 0.5041, + "step": 28500 + }, + { + "epoch": 1.42, + "grad_norm": 6.143950939178467, + "learning_rate": 2.9233338870125316e-05, + "loss": 0.5255, + "step": 28525 + }, + { + "epoch": 1.42, + "grad_norm": 1.3129888772964478, + "learning_rate": 2.921026890353063e-05, + "loss": 0.5034, + "step": 28550 + }, + { + "epoch": 1.42, + "grad_norm": 3.8713059425354004, + "learning_rate": 2.918719893693594e-05, + "loss": 0.3438, + "step": 28575 + }, + { + "epoch": 1.43, + "grad_norm": 0.9864943027496338, + "learning_rate": 2.9164128970341253e-05, + "loss": 0.5203, + "step": 28600 + }, + { + "epoch": 1.43, + "grad_norm": 3.4831087589263916, + "learning_rate": 2.914105900374656e-05, + "loss": 0.5231, + "step": 28625 + }, + { + "epoch": 1.43, + "grad_norm": 4.226259708404541, + "learning_rate": 2.9117989037151876e-05, + "loss": 0.5852, + "step": 28650 + }, + { + "epoch": 1.43, + "grad_norm": 1.6076862812042236, + "learning_rate": 2.9094919070557187e-05, + "loss": 0.5976, + "step": 28675 + }, + { + "epoch": 1.43, + "grad_norm": 2.1014654636383057, + "learning_rate": 2.9071849103962502e-05, + "loss": 0.4824, + "step": 28700 + }, + { + "epoch": 1.43, + "grad_norm": 1.2649108171463013, + "learning_rate": 2.904877913736781e-05, + "loss": 0.5148, + "step": 28725 + }, + { + "epoch": 1.43, + "grad_norm": 1.9344068765640259, + "learning_rate": 2.9025709170773125e-05, + "loss": 0.5006, + "step": 28750 + }, + { + "epoch": 1.43, + "grad_norm": 1.1153630018234253, + "learning_rate": 2.9002639204178433e-05, + "loss": 0.3174, + "step": 28775 + }, + { + "epoch": 1.44, + "grad_norm": 3.08996319770813, + "learning_rate": 2.8979569237583748e-05, + "loss": 0.4734, + "step": 28800 + }, + { + "epoch": 1.44, + "grad_norm": 0.8343068361282349, + "learning_rate": 2.8956499270989056e-05, + "loss": 0.4373, + "step": 28825 + }, + { + "epoch": 1.44, + "grad_norm": 2.541003465652466, + "learning_rate": 2.8933429304394367e-05, + "loss": 0.5624, + "step": 28850 + }, + { + "epoch": 1.44, + "grad_norm": 1.278167486190796, + "learning_rate": 2.891035933779968e-05, + "loss": 0.3731, + "step": 28875 + }, + { + "epoch": 1.44, + "grad_norm": 3.0557451248168945, + "learning_rate": 2.888728937120499e-05, + "loss": 0.3506, + "step": 28900 + }, + { + "epoch": 1.44, + "grad_norm": 4.243703365325928, + "learning_rate": 2.8864219404610304e-05, + "loss": 0.4886, + "step": 28925 + }, + { + "epoch": 1.44, + "grad_norm": 3.341449737548828, + "learning_rate": 2.8841149438015612e-05, + "loss": 0.4455, + "step": 28950 + }, + { + "epoch": 1.44, + "grad_norm": 2.9293224811553955, + "learning_rate": 2.8818079471420927e-05, + "loss": 0.4343, + "step": 28975 + }, + { + "epoch": 1.45, + "grad_norm": 5.4256205558776855, + "learning_rate": 2.879500950482624e-05, + "loss": 0.5741, + "step": 29000 + }, + { + "epoch": 1.45, + "grad_norm": 0.9868775606155396, + "learning_rate": 2.877193953823155e-05, + "loss": 0.4025, + "step": 29025 + }, + { + "epoch": 1.45, + "grad_norm": 0.9488250017166138, + "learning_rate": 2.874886957163686e-05, + "loss": 0.4695, + "step": 29050 + }, + { + "epoch": 1.45, + "grad_norm": 1.0704574584960938, + "learning_rate": 2.8725799605042176e-05, + "loss": 0.4965, + "step": 29075 + }, + { + "epoch": 1.45, + "grad_norm": 1.0031787157058716, + "learning_rate": 2.8702729638447484e-05, + "loss": 0.4797, + "step": 29100 + }, + { + "epoch": 1.45, + "grad_norm": 2.660698413848877, + "learning_rate": 2.86796596718528e-05, + "loss": 0.6116, + "step": 29125 + }, + { + "epoch": 1.45, + "grad_norm": 2.4585986137390137, + "learning_rate": 2.8656589705258107e-05, + "loss": 0.4994, + "step": 29150 + }, + { + "epoch": 1.45, + "grad_norm": 1.0483205318450928, + "learning_rate": 2.863351973866342e-05, + "loss": 0.5072, + "step": 29175 + }, + { + "epoch": 1.46, + "grad_norm": 5.655982494354248, + "learning_rate": 2.8610449772068733e-05, + "loss": 0.5917, + "step": 29200 + }, + { + "epoch": 1.46, + "grad_norm": 3.0197324752807617, + "learning_rate": 2.858737980547404e-05, + "loss": 0.3059, + "step": 29225 + }, + { + "epoch": 1.46, + "grad_norm": 3.459602117538452, + "learning_rate": 2.8564309838879355e-05, + "loss": 0.5433, + "step": 29250 + }, + { + "epoch": 1.46, + "grad_norm": 1.178184986114502, + "learning_rate": 2.8541239872284663e-05, + "loss": 0.4661, + "step": 29275 + }, + { + "epoch": 1.46, + "grad_norm": 16.78719711303711, + "learning_rate": 2.8518169905689978e-05, + "loss": 0.4727, + "step": 29300 + }, + { + "epoch": 1.46, + "grad_norm": 9.509754180908203, + "learning_rate": 2.8495099939095286e-05, + "loss": 0.488, + "step": 29325 + }, + { + "epoch": 1.46, + "grad_norm": 9.90145492553711, + "learning_rate": 2.84720299725006e-05, + "loss": 0.5822, + "step": 29350 + }, + { + "epoch": 1.46, + "grad_norm": 12.901089668273926, + "learning_rate": 2.8448960005905912e-05, + "loss": 0.4885, + "step": 29375 + }, + { + "epoch": 1.47, + "grad_norm": 3.088007926940918, + "learning_rate": 2.8425890039311227e-05, + "loss": 0.6254, + "step": 29400 + }, + { + "epoch": 1.47, + "grad_norm": 3.0785930156707764, + "learning_rate": 2.8402820072716535e-05, + "loss": 0.4276, + "step": 29425 + }, + { + "epoch": 1.47, + "grad_norm": 6.4305572509765625, + "learning_rate": 2.837975010612185e-05, + "loss": 0.53, + "step": 29450 + }, + { + "epoch": 1.47, + "grad_norm": 2.6937947273254395, + "learning_rate": 2.8356680139527158e-05, + "loss": 0.4267, + "step": 29475 + }, + { + "epoch": 1.47, + "grad_norm": 30.603422164916992, + "learning_rate": 2.8333610172932472e-05, + "loss": 0.3964, + "step": 29500 + }, + { + "epoch": 1.47, + "grad_norm": 3.560316562652588, + "learning_rate": 2.8310540206337784e-05, + "loss": 0.4128, + "step": 29525 + }, + { + "epoch": 1.47, + "grad_norm": 16.946557998657227, + "learning_rate": 2.8287470239743095e-05, + "loss": 0.5844, + "step": 29550 + }, + { + "epoch": 1.47, + "grad_norm": 3.1420838832855225, + "learning_rate": 2.8264400273148407e-05, + "loss": 0.5387, + "step": 29575 + }, + { + "epoch": 1.47, + "grad_norm": 8.17186450958252, + "learning_rate": 2.8241330306553714e-05, + "loss": 0.4568, + "step": 29600 + }, + { + "epoch": 1.48, + "grad_norm": 4.252970218658447, + "learning_rate": 2.821826033995903e-05, + "loss": 0.4813, + "step": 29625 + }, + { + "epoch": 1.48, + "grad_norm": 17.932771682739258, + "learning_rate": 2.8195190373364337e-05, + "loss": 0.5344, + "step": 29650 + }, + { + "epoch": 1.48, + "grad_norm": 1.9828652143478394, + "learning_rate": 2.8172120406769652e-05, + "loss": 0.5218, + "step": 29675 + }, + { + "epoch": 1.48, + "grad_norm": 7.818408489227295, + "learning_rate": 2.8149050440174963e-05, + "loss": 0.4881, + "step": 29700 + }, + { + "epoch": 1.48, + "grad_norm": 1.1083383560180664, + "learning_rate": 2.8125980473580278e-05, + "loss": 0.5118, + "step": 29725 + }, + { + "epoch": 1.48, + "grad_norm": 1.450125813484192, + "learning_rate": 2.8102910506985586e-05, + "loss": 0.6522, + "step": 29750 + }, + { + "epoch": 1.48, + "grad_norm": 6.853457927703857, + "learning_rate": 2.80798405403909e-05, + "loss": 0.5363, + "step": 29775 + }, + { + "epoch": 1.48, + "grad_norm": 6.707515716552734, + "learning_rate": 2.805677057379621e-05, + "loss": 0.493, + "step": 29800 + }, + { + "epoch": 1.49, + "grad_norm": 1.163601279258728, + "learning_rate": 2.8033700607201524e-05, + "loss": 0.3708, + "step": 29825 + }, + { + "epoch": 1.49, + "grad_norm": 8.384958267211914, + "learning_rate": 2.801063064060683e-05, + "loss": 0.4337, + "step": 29850 + }, + { + "epoch": 1.49, + "grad_norm": 3.240912914276123, + "learning_rate": 2.7987560674012146e-05, + "loss": 0.4612, + "step": 29875 + }, + { + "epoch": 1.49, + "grad_norm": 1.8176313638687134, + "learning_rate": 2.7964490707417458e-05, + "loss": 0.5031, + "step": 29900 + }, + { + "epoch": 1.49, + "grad_norm": 1.002866506576538, + "learning_rate": 2.7941420740822772e-05, + "loss": 0.4217, + "step": 29925 + }, + { + "epoch": 1.49, + "grad_norm": 2.510270357131958, + "learning_rate": 2.791835077422808e-05, + "loss": 0.5367, + "step": 29950 + }, + { + "epoch": 1.49, + "grad_norm": 16.336475372314453, + "learning_rate": 2.7895280807633388e-05, + "loss": 0.5, + "step": 29975 + }, + { + "epoch": 1.49, + "grad_norm": 1.0017398595809937, + "learning_rate": 2.7872210841038703e-05, + "loss": 0.2951, + "step": 30000 + }, + { + "epoch": 1.5, + "grad_norm": 3.2033021450042725, + "learning_rate": 2.7849140874444014e-05, + "loss": 0.5003, + "step": 30025 + }, + { + "epoch": 1.5, + "grad_norm": 1.3507925271987915, + "learning_rate": 2.782607090784933e-05, + "loss": 0.4622, + "step": 30050 + }, + { + "epoch": 1.5, + "grad_norm": 1.161468505859375, + "learning_rate": 2.7803000941254637e-05, + "loss": 0.4648, + "step": 30075 + }, + { + "epoch": 1.5, + "grad_norm": 3.3827836513519287, + "learning_rate": 2.7779930974659952e-05, + "loss": 0.414, + "step": 30100 + }, + { + "epoch": 1.5, + "grad_norm": 3.53877329826355, + "learning_rate": 2.775686100806526e-05, + "loss": 0.4735, + "step": 30125 + }, + { + "epoch": 1.5, + "grad_norm": 14.931818962097168, + "learning_rate": 2.7733791041470575e-05, + "loss": 0.4685, + "step": 30150 + }, + { + "epoch": 1.5, + "grad_norm": 1.213295340538025, + "learning_rate": 2.7710721074875883e-05, + "loss": 0.4797, + "step": 30175 + }, + { + "epoch": 1.5, + "grad_norm": 16.991479873657227, + "learning_rate": 2.7687651108281197e-05, + "loss": 0.5182, + "step": 30200 + }, + { + "epoch": 1.51, + "grad_norm": 4.055248260498047, + "learning_rate": 2.766458114168651e-05, + "loss": 0.5566, + "step": 30225 + }, + { + "epoch": 1.51, + "grad_norm": 3.4977238178253174, + "learning_rate": 2.7641511175091823e-05, + "loss": 0.5738, + "step": 30250 + }, + { + "epoch": 1.51, + "grad_norm": 111.34226989746094, + "learning_rate": 2.761844120849713e-05, + "loss": 0.4432, + "step": 30275 + }, + { + "epoch": 1.51, + "grad_norm": 7.632834434509277, + "learning_rate": 2.7595371241902446e-05, + "loss": 0.5878, + "step": 30300 + }, + { + "epoch": 1.51, + "grad_norm": 13.939888954162598, + "learning_rate": 2.7572301275307754e-05, + "loss": 0.6465, + "step": 30325 + }, + { + "epoch": 1.51, + "grad_norm": 6.650832653045654, + "learning_rate": 2.7549231308713065e-05, + "loss": 0.5544, + "step": 30350 + }, + { + "epoch": 1.51, + "grad_norm": 3.3435723781585693, + "learning_rate": 2.7526161342118377e-05, + "loss": 0.5943, + "step": 30375 + }, + { + "epoch": 1.51, + "grad_norm": 1.8094165325164795, + "learning_rate": 2.7503091375523688e-05, + "loss": 0.4553, + "step": 30400 + }, + { + "epoch": 1.52, + "grad_norm": 13.593605041503906, + "learning_rate": 2.7480021408929003e-05, + "loss": 0.5378, + "step": 30425 + }, + { + "epoch": 1.52, + "grad_norm": 2.0035171508789062, + "learning_rate": 2.745695144233431e-05, + "loss": 0.3657, + "step": 30450 + }, + { + "epoch": 1.52, + "grad_norm": 5.202415943145752, + "learning_rate": 2.7433881475739626e-05, + "loss": 0.4987, + "step": 30475 + }, + { + "epoch": 1.52, + "grad_norm": 2.9139647483825684, + "learning_rate": 2.7410811509144934e-05, + "loss": 0.4868, + "step": 30500 + }, + { + "epoch": 1.52, + "grad_norm": 56.17787551879883, + "learning_rate": 2.738774154255025e-05, + "loss": 0.5008, + "step": 30525 + }, + { + "epoch": 1.52, + "grad_norm": 3.269890308380127, + "learning_rate": 2.736467157595556e-05, + "loss": 0.505, + "step": 30550 + }, + { + "epoch": 1.52, + "grad_norm": 0.9548706412315369, + "learning_rate": 2.7341601609360874e-05, + "loss": 0.39, + "step": 30575 + }, + { + "epoch": 1.52, + "grad_norm": 1.014391541481018, + "learning_rate": 2.7318531642766182e-05, + "loss": 0.5306, + "step": 30600 + }, + { + "epoch": 1.53, + "grad_norm": 337.91876220703125, + "learning_rate": 2.7295461676171497e-05, + "loss": 0.5413, + "step": 30625 + }, + { + "epoch": 1.53, + "grad_norm": 4.666881084442139, + "learning_rate": 2.7272391709576805e-05, + "loss": 0.5424, + "step": 30650 + }, + { + "epoch": 1.53, + "grad_norm": 0.9372049570083618, + "learning_rate": 2.724932174298212e-05, + "loss": 0.4929, + "step": 30675 + }, + { + "epoch": 1.53, + "grad_norm": 1.1089849472045898, + "learning_rate": 2.7226251776387428e-05, + "loss": 0.3852, + "step": 30700 + }, + { + "epoch": 1.53, + "grad_norm": 4.649927616119385, + "learning_rate": 2.720318180979274e-05, + "loss": 0.5104, + "step": 30725 + }, + { + "epoch": 1.53, + "grad_norm": 2.7719218730926514, + "learning_rate": 2.7180111843198054e-05, + "loss": 0.5801, + "step": 30750 + }, + { + "epoch": 1.53, + "grad_norm": 3.2304704189300537, + "learning_rate": 2.7157041876603362e-05, + "loss": 0.559, + "step": 30775 + }, + { + "epoch": 1.53, + "grad_norm": 1.006312370300293, + "learning_rate": 2.7133971910008677e-05, + "loss": 0.3751, + "step": 30800 + }, + { + "epoch": 1.54, + "grad_norm": 6.04379940032959, + "learning_rate": 2.7110901943413985e-05, + "loss": 0.4593, + "step": 30825 + }, + { + "epoch": 1.54, + "grad_norm": 1.1266270875930786, + "learning_rate": 2.70878319768193e-05, + "loss": 0.4689, + "step": 30850 + }, + { + "epoch": 1.54, + "grad_norm": 1.9402873516082764, + "learning_rate": 2.706476201022461e-05, + "loss": 0.4564, + "step": 30875 + }, + { + "epoch": 1.54, + "grad_norm": 3.6099956035614014, + "learning_rate": 2.7041692043629922e-05, + "loss": 0.4079, + "step": 30900 + }, + { + "epoch": 1.54, + "grad_norm": 1.433536171913147, + "learning_rate": 2.7018622077035234e-05, + "loss": 0.4365, + "step": 30925 + }, + { + "epoch": 1.54, + "grad_norm": 3.8893444538116455, + "learning_rate": 2.6995552110440548e-05, + "loss": 0.5526, + "step": 30950 + }, + { + "epoch": 1.54, + "grad_norm": 1.160085916519165, + "learning_rate": 2.6972482143845856e-05, + "loss": 0.4243, + "step": 30975 + }, + { + "epoch": 1.54, + "grad_norm": 6.14224100112915, + "learning_rate": 2.694941217725117e-05, + "loss": 0.4999, + "step": 31000 + }, + { + "epoch": 1.55, + "grad_norm": 6.147780895233154, + "learning_rate": 2.692634221065648e-05, + "loss": 0.4961, + "step": 31025 + }, + { + "epoch": 1.55, + "grad_norm": 2.3592782020568848, + "learning_rate": 2.6903272244061794e-05, + "loss": 0.5433, + "step": 31050 + }, + { + "epoch": 1.55, + "grad_norm": 1.0809153318405151, + "learning_rate": 2.6880202277467105e-05, + "loss": 0.5113, + "step": 31075 + }, + { + "epoch": 1.55, + "grad_norm": 3.5836567878723145, + "learning_rate": 2.6857132310872413e-05, + "loss": 0.4979, + "step": 31100 + }, + { + "epoch": 1.55, + "grad_norm": 0.9541612267494202, + "learning_rate": 2.6834062344277728e-05, + "loss": 0.3614, + "step": 31125 + }, + { + "epoch": 1.55, + "grad_norm": 3.576939821243286, + "learning_rate": 2.6810992377683036e-05, + "loss": 0.4558, + "step": 31150 + }, + { + "epoch": 1.55, + "grad_norm": 7.013927459716797, + "learning_rate": 2.678792241108835e-05, + "loss": 0.4799, + "step": 31175 + }, + { + "epoch": 1.55, + "grad_norm": 5.320522308349609, + "learning_rate": 2.676485244449366e-05, + "loss": 0.518, + "step": 31200 + }, + { + "epoch": 1.56, + "grad_norm": 1.2402065992355347, + "learning_rate": 2.6741782477898973e-05, + "loss": 0.409, + "step": 31225 + }, + { + "epoch": 1.56, + "grad_norm": 1.8643181324005127, + "learning_rate": 2.6718712511304285e-05, + "loss": 0.5644, + "step": 31250 + }, + { + "epoch": 1.56, + "grad_norm": 3.7191162109375, + "learning_rate": 2.66956425447096e-05, + "loss": 0.5179, + "step": 31275 + }, + { + "epoch": 1.56, + "grad_norm": 3.634350061416626, + "learning_rate": 2.6672572578114907e-05, + "loss": 0.5734, + "step": 31300 + }, + { + "epoch": 1.56, + "grad_norm": 1.830993890762329, + "learning_rate": 2.6649502611520222e-05, + "loss": 0.6335, + "step": 31325 + }, + { + "epoch": 1.56, + "grad_norm": 2.8177618980407715, + "learning_rate": 2.662643264492553e-05, + "loss": 0.5966, + "step": 31350 + }, + { + "epoch": 1.56, + "grad_norm": 4.917094707489014, + "learning_rate": 2.6603362678330845e-05, + "loss": 0.6458, + "step": 31375 + }, + { + "epoch": 1.56, + "grad_norm": 3.4671552181243896, + "learning_rate": 2.6580292711736156e-05, + "loss": 0.6744, + "step": 31400 + }, + { + "epoch": 1.57, + "grad_norm": 3.410064458847046, + "learning_rate": 2.6557222745141467e-05, + "loss": 0.6604, + "step": 31425 + }, + { + "epoch": 1.57, + "grad_norm": 5.093179225921631, + "learning_rate": 2.653415277854678e-05, + "loss": 0.6048, + "step": 31450 + }, + { + "epoch": 1.57, + "grad_norm": 1.9258631467819214, + "learning_rate": 2.6511082811952087e-05, + "loss": 0.6169, + "step": 31475 + }, + { + "epoch": 1.57, + "grad_norm": 2.269477605819702, + "learning_rate": 2.64880128453574e-05, + "loss": 0.6868, + "step": 31500 + }, + { + "epoch": 1.57, + "grad_norm": 1.7670726776123047, + "learning_rate": 2.646494287876271e-05, + "loss": 0.6663, + "step": 31525 + }, + { + "epoch": 1.57, + "grad_norm": 1.7811284065246582, + "learning_rate": 2.6441872912168024e-05, + "loss": 0.7178, + "step": 31550 + }, + { + "epoch": 1.57, + "grad_norm": 6.065280437469482, + "learning_rate": 2.6418802945573336e-05, + "loss": 0.6134, + "step": 31575 + }, + { + "epoch": 1.57, + "grad_norm": 2.023129463195801, + "learning_rate": 2.639573297897865e-05, + "loss": 0.637, + "step": 31600 + }, + { + "epoch": 1.58, + "grad_norm": 4.560355186462402, + "learning_rate": 2.637266301238396e-05, + "loss": 0.7594, + "step": 31625 + }, + { + "epoch": 1.58, + "grad_norm": 3.2106659412384033, + "learning_rate": 2.6349593045789273e-05, + "loss": 0.7643, + "step": 31650 + }, + { + "epoch": 1.58, + "grad_norm": 2.52014422416687, + "learning_rate": 2.632652307919458e-05, + "loss": 0.749, + "step": 31675 + }, + { + "epoch": 1.58, + "grad_norm": 4.3201751708984375, + "learning_rate": 2.6303453112599896e-05, + "loss": 0.8099, + "step": 31700 + }, + { + "epoch": 1.58, + "grad_norm": 3.502185344696045, + "learning_rate": 2.6280383146005204e-05, + "loss": 0.8316, + "step": 31725 + }, + { + "epoch": 1.58, + "grad_norm": 5.395108699798584, + "learning_rate": 2.625731317941052e-05, + "loss": 0.8306, + "step": 31750 + }, + { + "epoch": 1.58, + "grad_norm": 2.213697910308838, + "learning_rate": 2.623424321281583e-05, + "loss": 0.7331, + "step": 31775 + }, + { + "epoch": 1.58, + "grad_norm": 3.1291141510009766, + "learning_rate": 2.6211173246221145e-05, + "loss": 0.7876, + "step": 31800 + }, + { + "epoch": 1.59, + "grad_norm": 1.7159450054168701, + "learning_rate": 2.6188103279626453e-05, + "loss": 0.7596, + "step": 31825 + }, + { + "epoch": 1.59, + "grad_norm": 2.6130173206329346, + "learning_rate": 2.616503331303176e-05, + "loss": 0.8343, + "step": 31850 + }, + { + "epoch": 1.59, + "grad_norm": 3.8219878673553467, + "learning_rate": 2.6141963346437075e-05, + "loss": 0.7811, + "step": 31875 + }, + { + "epoch": 1.59, + "grad_norm": 2.0162181854248047, + "learning_rate": 2.6118893379842387e-05, + "loss": 0.7997, + "step": 31900 + }, + { + "epoch": 1.59, + "grad_norm": 3.441479444503784, + "learning_rate": 2.6095823413247698e-05, + "loss": 0.8247, + "step": 31925 + }, + { + "epoch": 1.59, + "grad_norm": 6.634566307067871, + "learning_rate": 2.607275344665301e-05, + "loss": 0.751, + "step": 31950 + }, + { + "epoch": 1.59, + "grad_norm": 2.1950550079345703, + "learning_rate": 2.6049683480058324e-05, + "loss": 0.7754, + "step": 31975 + }, + { + "epoch": 1.59, + "grad_norm": 2.526723623275757, + "learning_rate": 2.6026613513463632e-05, + "loss": 0.9298, + "step": 32000 + }, + { + "epoch": 1.6, + "grad_norm": 4.879652976989746, + "learning_rate": 2.6003543546868947e-05, + "loss": 0.751, + "step": 32025 + }, + { + "epoch": 1.6, + "grad_norm": 6.0123186111450195, + "learning_rate": 2.5980473580274255e-05, + "loss": 0.7762, + "step": 32050 + }, + { + "epoch": 1.6, + "grad_norm": 4.540564060211182, + "learning_rate": 2.595740361367957e-05, + "loss": 0.8824, + "step": 32075 + }, + { + "epoch": 1.6, + "grad_norm": 2.412745952606201, + "learning_rate": 2.593433364708488e-05, + "loss": 0.8124, + "step": 32100 + }, + { + "epoch": 1.6, + "grad_norm": 5.162590503692627, + "learning_rate": 2.5911263680490196e-05, + "loss": 0.7411, + "step": 32125 + }, + { + "epoch": 1.6, + "grad_norm": 3.4849741458892822, + "learning_rate": 2.5888193713895504e-05, + "loss": 0.7982, + "step": 32150 + }, + { + "epoch": 1.6, + "grad_norm": 2.908165693283081, + "learning_rate": 2.586512374730082e-05, + "loss": 0.826, + "step": 32175 + }, + { + "epoch": 1.6, + "grad_norm": 2.439852237701416, + "learning_rate": 2.5842053780706126e-05, + "loss": 0.8829, + "step": 32200 + }, + { + "epoch": 1.61, + "grad_norm": 3.7252159118652344, + "learning_rate": 2.5818983814111434e-05, + "loss": 0.8104, + "step": 32225 + }, + { + "epoch": 1.61, + "grad_norm": 1.851833701133728, + "learning_rate": 2.579591384751675e-05, + "loss": 0.8495, + "step": 32250 + }, + { + "epoch": 1.61, + "grad_norm": 3.105205774307251, + "learning_rate": 2.577284388092206e-05, + "loss": 0.7484, + "step": 32275 + }, + { + "epoch": 1.61, + "grad_norm": 5.3158650398254395, + "learning_rate": 2.5749773914327375e-05, + "loss": 0.8756, + "step": 32300 + }, + { + "epoch": 1.61, + "grad_norm": 6.524014949798584, + "learning_rate": 2.5726703947732683e-05, + "loss": 0.8681, + "step": 32325 + }, + { + "epoch": 1.61, + "grad_norm": 2.6202030181884766, + "learning_rate": 2.5703633981137998e-05, + "loss": 0.8961, + "step": 32350 + }, + { + "epoch": 1.61, + "grad_norm": 3.272714853286743, + "learning_rate": 2.5680564014543306e-05, + "loss": 0.8115, + "step": 32375 + }, + { + "epoch": 1.61, + "grad_norm": 5.991594314575195, + "learning_rate": 2.565749404794862e-05, + "loss": 0.8041, + "step": 32400 + }, + { + "epoch": 1.62, + "grad_norm": 4.424630641937256, + "learning_rate": 2.5634424081353932e-05, + "loss": 0.8302, + "step": 32425 + }, + { + "epoch": 1.62, + "grad_norm": 4.359757900238037, + "learning_rate": 2.5611354114759243e-05, + "loss": 0.824, + "step": 32450 + }, + { + "epoch": 1.62, + "grad_norm": 3.0658962726593018, + "learning_rate": 2.5588284148164555e-05, + "loss": 0.8463, + "step": 32475 + }, + { + "epoch": 1.62, + "grad_norm": 9.23780632019043, + "learning_rate": 2.556521418156987e-05, + "loss": 0.7813, + "step": 32500 + }, + { + "epoch": 1.62, + "grad_norm": 3.0023868083953857, + "learning_rate": 2.5542144214975177e-05, + "loss": 0.8218, + "step": 32525 + }, + { + "epoch": 1.62, + "grad_norm": 2.9863386154174805, + "learning_rate": 2.5519074248380492e-05, + "loss": 0.8183, + "step": 32550 + }, + { + "epoch": 1.62, + "grad_norm": 5.385082244873047, + "learning_rate": 2.54960042817858e-05, + "loss": 0.7862, + "step": 32575 + }, + { + "epoch": 1.62, + "grad_norm": 2.6118578910827637, + "learning_rate": 2.547293431519111e-05, + "loss": 0.8312, + "step": 32600 + }, + { + "epoch": 1.63, + "grad_norm": 3.4267027378082275, + "learning_rate": 2.5449864348596426e-05, + "loss": 0.8242, + "step": 32625 + }, + { + "epoch": 1.63, + "grad_norm": 5.174554347991943, + "learning_rate": 2.5426794382001734e-05, + "loss": 0.7855, + "step": 32650 + }, + { + "epoch": 1.63, + "grad_norm": 4.1581807136535645, + "learning_rate": 2.540372441540705e-05, + "loss": 0.8032, + "step": 32675 + }, + { + "epoch": 1.63, + "grad_norm": 4.021723747253418, + "learning_rate": 2.5380654448812357e-05, + "loss": 0.7469, + "step": 32700 + }, + { + "epoch": 1.63, + "grad_norm": 3.3047544956207275, + "learning_rate": 2.5357584482217672e-05, + "loss": 0.8432, + "step": 32725 + }, + { + "epoch": 1.63, + "grad_norm": 1.8279719352722168, + "learning_rate": 2.533451451562298e-05, + "loss": 0.8011, + "step": 32750 + }, + { + "epoch": 1.63, + "grad_norm": 3.484745979309082, + "learning_rate": 2.5311444549028294e-05, + "loss": 0.77, + "step": 32775 + }, + { + "epoch": 1.63, + "grad_norm": 2.0162298679351807, + "learning_rate": 2.5288374582433606e-05, + "loss": 0.7745, + "step": 32800 + }, + { + "epoch": 1.64, + "grad_norm": 2.171647071838379, + "learning_rate": 2.526530461583892e-05, + "loss": 0.8607, + "step": 32825 + }, + { + "epoch": 1.64, + "grad_norm": 3.357196807861328, + "learning_rate": 2.524223464924423e-05, + "loss": 0.7857, + "step": 32850 + }, + { + "epoch": 1.64, + "grad_norm": 6.561802864074707, + "learning_rate": 2.5219164682649543e-05, + "loss": 0.838, + "step": 32875 + }, + { + "epoch": 1.64, + "grad_norm": 4.378182888031006, + "learning_rate": 2.519609471605485e-05, + "loss": 0.8374, + "step": 32900 + }, + { + "epoch": 1.64, + "grad_norm": 1.8582767248153687, + "learning_rate": 2.5173024749460166e-05, + "loss": 0.7927, + "step": 32925 + }, + { + "epoch": 1.64, + "grad_norm": 6.331790447235107, + "learning_rate": 2.5149954782865477e-05, + "loss": 0.7299, + "step": 32950 + }, + { + "epoch": 1.64, + "grad_norm": 4.185370922088623, + "learning_rate": 2.5126884816270785e-05, + "loss": 0.8594, + "step": 32975 + }, + { + "epoch": 1.64, + "grad_norm": 6.100175380706787, + "learning_rate": 2.51038148496761e-05, + "loss": 0.7837, + "step": 33000 + }, + { + "epoch": 1.65, + "grad_norm": 6.206463813781738, + "learning_rate": 2.5080744883081408e-05, + "loss": 0.7516, + "step": 33025 + }, + { + "epoch": 1.65, + "grad_norm": 5.179276943206787, + "learning_rate": 2.5057674916486723e-05, + "loss": 0.7947, + "step": 33050 + }, + { + "epoch": 1.65, + "grad_norm": 5.463741302490234, + "learning_rate": 2.503460494989203e-05, + "loss": 0.8988, + "step": 33075 + }, + { + "epoch": 1.65, + "grad_norm": 2.222621202468872, + "learning_rate": 2.5011534983297346e-05, + "loss": 0.835, + "step": 33100 + }, + { + "epoch": 1.65, + "grad_norm": 2.5178072452545166, + "learning_rate": 2.4988465016702657e-05, + "loss": 0.809, + "step": 33125 + }, + { + "epoch": 1.65, + "grad_norm": 3.1115121841430664, + "learning_rate": 2.4965395050107968e-05, + "loss": 0.7798, + "step": 33150 + }, + { + "epoch": 1.65, + "grad_norm": 3.3300487995147705, + "learning_rate": 2.494232508351328e-05, + "loss": 0.8173, + "step": 33175 + }, + { + "epoch": 1.65, + "grad_norm": 2.000523090362549, + "learning_rate": 2.491925511691859e-05, + "loss": 0.8308, + "step": 33200 + }, + { + "epoch": 1.66, + "grad_norm": 10.636764526367188, + "learning_rate": 2.4896185150323902e-05, + "loss": 0.8941, + "step": 33225 + }, + { + "epoch": 1.66, + "grad_norm": 1.9837878942489624, + "learning_rate": 2.4873115183729214e-05, + "loss": 0.8287, + "step": 33250 + }, + { + "epoch": 1.66, + "grad_norm": 3.8704280853271484, + "learning_rate": 2.4850045217134525e-05, + "loss": 0.79, + "step": 33275 + }, + { + "epoch": 1.66, + "grad_norm": 5.006420612335205, + "learning_rate": 2.482697525053984e-05, + "loss": 0.838, + "step": 33300 + }, + { + "epoch": 1.66, + "grad_norm": 5.825026035308838, + "learning_rate": 2.480390528394515e-05, + "loss": 0.8216, + "step": 33325 + }, + { + "epoch": 1.66, + "grad_norm": 2.980112075805664, + "learning_rate": 2.4780835317350463e-05, + "loss": 0.8636, + "step": 33350 + }, + { + "epoch": 1.66, + "grad_norm": 3.990055799484253, + "learning_rate": 2.4757765350755774e-05, + "loss": 0.8725, + "step": 33375 + }, + { + "epoch": 1.66, + "grad_norm": 2.0065150260925293, + "learning_rate": 2.4734695384161085e-05, + "loss": 0.8054, + "step": 33400 + }, + { + "epoch": 1.67, + "grad_norm": 8.030442237854004, + "learning_rate": 2.4711625417566397e-05, + "loss": 0.8314, + "step": 33425 + }, + { + "epoch": 1.67, + "grad_norm": 2.8273167610168457, + "learning_rate": 2.4688555450971708e-05, + "loss": 0.8094, + "step": 33450 + }, + { + "epoch": 1.67, + "grad_norm": 2.2414333820343018, + "learning_rate": 2.4665485484377023e-05, + "loss": 0.889, + "step": 33475 + }, + { + "epoch": 1.67, + "grad_norm": 3.5151126384735107, + "learning_rate": 2.464241551778233e-05, + "loss": 0.8246, + "step": 33500 + }, + { + "epoch": 1.67, + "grad_norm": 9.038124084472656, + "learning_rate": 2.4619345551187642e-05, + "loss": 0.811, + "step": 33525 + }, + { + "epoch": 1.67, + "grad_norm": 2.8050408363342285, + "learning_rate": 2.4596275584592953e-05, + "loss": 0.822, + "step": 33550 + }, + { + "epoch": 1.67, + "grad_norm": 4.081467151641846, + "learning_rate": 2.4573205617998265e-05, + "loss": 0.7503, + "step": 33575 + }, + { + "epoch": 1.67, + "grad_norm": 2.958592414855957, + "learning_rate": 2.4550135651403576e-05, + "loss": 0.8151, + "step": 33600 + }, + { + "epoch": 1.68, + "grad_norm": 5.844561576843262, + "learning_rate": 2.452706568480889e-05, + "loss": 0.7702, + "step": 33625 + }, + { + "epoch": 1.68, + "grad_norm": 2.1601386070251465, + "learning_rate": 2.4503995718214202e-05, + "loss": 0.7862, + "step": 33650 + }, + { + "epoch": 1.68, + "grad_norm": 2.165982246398926, + "learning_rate": 2.4480925751619514e-05, + "loss": 0.8787, + "step": 33675 + }, + { + "epoch": 1.68, + "grad_norm": 5.302867889404297, + "learning_rate": 2.4457855785024825e-05, + "loss": 0.8076, + "step": 33700 + }, + { + "epoch": 1.68, + "grad_norm": 2.251314163208008, + "learning_rate": 2.4434785818430136e-05, + "loss": 0.8352, + "step": 33725 + }, + { + "epoch": 1.68, + "grad_norm": 2.4113807678222656, + "learning_rate": 2.4411715851835448e-05, + "loss": 0.8318, + "step": 33750 + }, + { + "epoch": 1.68, + "grad_norm": 3.341135263442993, + "learning_rate": 2.438864588524076e-05, + "loss": 0.7658, + "step": 33775 + }, + { + "epoch": 1.68, + "grad_norm": 5.227036952972412, + "learning_rate": 2.436557591864607e-05, + "loss": 0.8928, + "step": 33800 + }, + { + "epoch": 1.69, + "grad_norm": 7.890278339385986, + "learning_rate": 2.4342505952051385e-05, + "loss": 0.8021, + "step": 33825 + }, + { + "epoch": 1.69, + "grad_norm": 2.11730694770813, + "learning_rate": 2.4319435985456696e-05, + "loss": 0.8418, + "step": 33850 + }, + { + "epoch": 1.69, + "grad_norm": 4.105464935302734, + "learning_rate": 2.4296366018862004e-05, + "loss": 0.7946, + "step": 33875 + }, + { + "epoch": 1.69, + "grad_norm": 4.777904033660889, + "learning_rate": 2.4273296052267316e-05, + "loss": 0.7699, + "step": 33900 + }, + { + "epoch": 1.69, + "grad_norm": 8.123181343078613, + "learning_rate": 2.4250226085672627e-05, + "loss": 0.8535, + "step": 33925 + }, + { + "epoch": 1.69, + "grad_norm": 5.432234764099121, + "learning_rate": 2.422715611907794e-05, + "loss": 0.84, + "step": 33950 + }, + { + "epoch": 1.69, + "grad_norm": 5.834948539733887, + "learning_rate": 2.4204086152483253e-05, + "loss": 0.8552, + "step": 33975 + }, + { + "epoch": 1.69, + "grad_norm": 3.680751323699951, + "learning_rate": 2.4181016185888565e-05, + "loss": 0.8243, + "step": 34000 + }, + { + "epoch": 1.7, + "grad_norm": 9.424110412597656, + "learning_rate": 2.4157946219293876e-05, + "loss": 0.7783, + "step": 34025 + }, + { + "epoch": 1.7, + "grad_norm": 4.688609600067139, + "learning_rate": 2.4134876252699187e-05, + "loss": 0.8046, + "step": 34050 + }, + { + "epoch": 1.7, + "grad_norm": 4.4300432205200195, + "learning_rate": 2.41118062861045e-05, + "loss": 0.854, + "step": 34075 + }, + { + "epoch": 1.7, + "grad_norm": 5.35884428024292, + "learning_rate": 2.408873631950981e-05, + "loss": 0.7425, + "step": 34100 + }, + { + "epoch": 1.7, + "grad_norm": 2.651151657104492, + "learning_rate": 2.406566635291512e-05, + "loss": 0.847, + "step": 34125 + }, + { + "epoch": 1.7, + "grad_norm": 9.372386932373047, + "learning_rate": 2.4042596386320436e-05, + "loss": 0.8982, + "step": 34150 + }, + { + "epoch": 1.7, + "grad_norm": 2.5536534786224365, + "learning_rate": 2.4019526419725748e-05, + "loss": 0.8317, + "step": 34175 + }, + { + "epoch": 1.7, + "grad_norm": 4.036647796630859, + "learning_rate": 2.399645645313106e-05, + "loss": 0.8515, + "step": 34200 + }, + { + "epoch": 1.71, + "grad_norm": 3.1766724586486816, + "learning_rate": 2.397338648653637e-05, + "loss": 0.8698, + "step": 34225 + }, + { + "epoch": 1.71, + "grad_norm": 9.911266326904297, + "learning_rate": 2.3950316519941678e-05, + "loss": 0.8694, + "step": 34250 + }, + { + "epoch": 1.71, + "grad_norm": 8.645268440246582, + "learning_rate": 2.392724655334699e-05, + "loss": 0.8196, + "step": 34275 + }, + { + "epoch": 1.71, + "grad_norm": 5.725925445556641, + "learning_rate": 2.3904176586752304e-05, + "loss": 0.839, + "step": 34300 + }, + { + "epoch": 1.71, + "grad_norm": 1.862838625907898, + "learning_rate": 2.3881106620157616e-05, + "loss": 0.8578, + "step": 34325 + }, + { + "epoch": 1.71, + "grad_norm": 2.1054186820983887, + "learning_rate": 2.3858036653562927e-05, + "loss": 0.8456, + "step": 34350 + }, + { + "epoch": 1.71, + "grad_norm": 1.862760066986084, + "learning_rate": 2.383496668696824e-05, + "loss": 0.7978, + "step": 34375 + }, + { + "epoch": 1.71, + "grad_norm": 5.457796096801758, + "learning_rate": 2.381189672037355e-05, + "loss": 0.8132, + "step": 34400 + }, + { + "epoch": 1.72, + "grad_norm": 4.211350917816162, + "learning_rate": 2.378882675377886e-05, + "loss": 0.86, + "step": 34425 + }, + { + "epoch": 1.72, + "grad_norm": 4.961258411407471, + "learning_rate": 2.3765756787184172e-05, + "loss": 0.8715, + "step": 34450 + }, + { + "epoch": 1.72, + "grad_norm": 1.8565328121185303, + "learning_rate": 2.3742686820589484e-05, + "loss": 0.8418, + "step": 34475 + }, + { + "epoch": 1.72, + "grad_norm": 2.928229570388794, + "learning_rate": 2.37196168539948e-05, + "loss": 0.8232, + "step": 34500 + }, + { + "epoch": 1.72, + "grad_norm": 7.252219200134277, + "learning_rate": 2.369654688740011e-05, + "loss": 0.8422, + "step": 34525 + }, + { + "epoch": 1.72, + "grad_norm": 7.312279224395752, + "learning_rate": 2.367347692080542e-05, + "loss": 0.8442, + "step": 34550 + }, + { + "epoch": 1.72, + "grad_norm": 3.1411311626434326, + "learning_rate": 2.3650406954210733e-05, + "loss": 0.7978, + "step": 34575 + }, + { + "epoch": 1.72, + "grad_norm": 2.388169050216675, + "learning_rate": 2.3627336987616044e-05, + "loss": 0.8161, + "step": 34600 + }, + { + "epoch": 1.73, + "grad_norm": 2.1744918823242188, + "learning_rate": 2.3604267021021352e-05, + "loss": 0.8981, + "step": 34625 + }, + { + "epoch": 1.73, + "grad_norm": 4.800972938537598, + "learning_rate": 2.3581197054426667e-05, + "loss": 0.831, + "step": 34650 + }, + { + "epoch": 1.73, + "grad_norm": 2.453704357147217, + "learning_rate": 2.3558127087831978e-05, + "loss": 0.7941, + "step": 34675 + }, + { + "epoch": 1.73, + "grad_norm": 2.5256292819976807, + "learning_rate": 2.353505712123729e-05, + "loss": 0.8007, + "step": 34700 + }, + { + "epoch": 1.73, + "grad_norm": 4.391716003417969, + "learning_rate": 2.35119871546426e-05, + "loss": 0.8477, + "step": 34725 + }, + { + "epoch": 1.73, + "grad_norm": 2.577244997024536, + "learning_rate": 2.3488917188047912e-05, + "loss": 0.8366, + "step": 34750 + }, + { + "epoch": 1.73, + "grad_norm": 3.9222066402435303, + "learning_rate": 2.3465847221453224e-05, + "loss": 0.8394, + "step": 34775 + }, + { + "epoch": 1.73, + "grad_norm": 4.5864715576171875, + "learning_rate": 2.3442777254858535e-05, + "loss": 0.827, + "step": 34800 + }, + { + "epoch": 1.74, + "grad_norm": 2.346217155456543, + "learning_rate": 2.3419707288263846e-05, + "loss": 0.8577, + "step": 34825 + }, + { + "epoch": 1.74, + "grad_norm": 1.9931409358978271, + "learning_rate": 2.339663732166916e-05, + "loss": 0.8782, + "step": 34850 + }, + { + "epoch": 1.74, + "grad_norm": 8.162564277648926, + "learning_rate": 2.3373567355074472e-05, + "loss": 0.781, + "step": 34875 + }, + { + "epoch": 1.74, + "grad_norm": 7.050436973571777, + "learning_rate": 2.3350497388479784e-05, + "loss": 0.797, + "step": 34900 + }, + { + "epoch": 1.74, + "grad_norm": 5.392847537994385, + "learning_rate": 2.3327427421885095e-05, + "loss": 0.7826, + "step": 34925 + }, + { + "epoch": 1.74, + "grad_norm": 3.3648784160614014, + "learning_rate": 2.3304357455290406e-05, + "loss": 0.9437, + "step": 34950 + }, + { + "epoch": 1.74, + "grad_norm": 7.639132022857666, + "learning_rate": 2.3281287488695718e-05, + "loss": 0.8861, + "step": 34975 + }, + { + "epoch": 1.74, + "grad_norm": 5.233222007751465, + "learning_rate": 2.325821752210103e-05, + "loss": 0.8305, + "step": 35000 + }, + { + "epoch": 1.75, + "grad_norm": 2.399179220199585, + "learning_rate": 2.323514755550634e-05, + "loss": 0.8388, + "step": 35025 + }, + { + "epoch": 1.75, + "grad_norm": 2.207289218902588, + "learning_rate": 2.3212077588911652e-05, + "loss": 0.7974, + "step": 35050 + }, + { + "epoch": 1.75, + "grad_norm": 3.358884572982788, + "learning_rate": 2.3189007622316963e-05, + "loss": 0.8339, + "step": 35075 + }, + { + "epoch": 1.75, + "grad_norm": 6.761455059051514, + "learning_rate": 2.3165937655722275e-05, + "loss": 0.8042, + "step": 35100 + }, + { + "epoch": 1.75, + "grad_norm": 1.8530350923538208, + "learning_rate": 2.3142867689127586e-05, + "loss": 0.8953, + "step": 35125 + }, + { + "epoch": 1.75, + "grad_norm": 2.0336318016052246, + "learning_rate": 2.3119797722532897e-05, + "loss": 0.8082, + "step": 35150 + }, + { + "epoch": 1.75, + "grad_norm": 1.8673982620239258, + "learning_rate": 2.3096727755938212e-05, + "loss": 0.7853, + "step": 35175 + }, + { + "epoch": 1.75, + "grad_norm": 5.2938761711120605, + "learning_rate": 2.3073657789343523e-05, + "loss": 0.7803, + "step": 35200 + }, + { + "epoch": 1.76, + "grad_norm": 2.7945752143859863, + "learning_rate": 2.3050587822748835e-05, + "loss": 0.8236, + "step": 35225 + }, + { + "epoch": 1.76, + "grad_norm": 8.166348457336426, + "learning_rate": 2.3027517856154146e-05, + "loss": 0.8583, + "step": 35250 + }, + { + "epoch": 1.76, + "grad_norm": 7.2717790603637695, + "learning_rate": 2.3004447889559458e-05, + "loss": 0.8103, + "step": 35275 + }, + { + "epoch": 1.76, + "grad_norm": 4.064415454864502, + "learning_rate": 2.298137792296477e-05, + "loss": 0.8061, + "step": 35300 + }, + { + "epoch": 1.76, + "grad_norm": 6.309088706970215, + "learning_rate": 2.295830795637008e-05, + "loss": 0.8702, + "step": 35325 + }, + { + "epoch": 1.76, + "grad_norm": 4.678400993347168, + "learning_rate": 2.293523798977539e-05, + "loss": 0.7731, + "step": 35350 + }, + { + "epoch": 1.76, + "grad_norm": 1.9187320470809937, + "learning_rate": 2.2912168023180703e-05, + "loss": 0.8482, + "step": 35375 + }, + { + "epoch": 1.76, + "grad_norm": 4.612795352935791, + "learning_rate": 2.2889098056586014e-05, + "loss": 0.8311, + "step": 35400 + }, + { + "epoch": 1.77, + "grad_norm": 5.448324203491211, + "learning_rate": 2.2866028089991326e-05, + "loss": 0.854, + "step": 35425 + }, + { + "epoch": 1.77, + "grad_norm": 7.444492816925049, + "learning_rate": 2.2842958123396637e-05, + "loss": 0.8343, + "step": 35450 + }, + { + "epoch": 1.77, + "grad_norm": 3.258119821548462, + "learning_rate": 2.281988815680195e-05, + "loss": 0.8172, + "step": 35475 + }, + { + "epoch": 1.77, + "grad_norm": 2.3989317417144775, + "learning_rate": 2.279681819020726e-05, + "loss": 0.8792, + "step": 35500 + }, + { + "epoch": 1.77, + "grad_norm": 6.7266693115234375, + "learning_rate": 2.2773748223612575e-05, + "loss": 0.8326, + "step": 35525 + }, + { + "epoch": 1.77, + "grad_norm": 8.093158721923828, + "learning_rate": 2.2750678257017886e-05, + "loss": 0.7305, + "step": 35550 + }, + { + "epoch": 1.77, + "grad_norm": 6.749348163604736, + "learning_rate": 2.2727608290423197e-05, + "loss": 0.9024, + "step": 35575 + }, + { + "epoch": 1.77, + "grad_norm": 1.8120150566101074, + "learning_rate": 2.270453832382851e-05, + "loss": 0.8003, + "step": 35600 + }, + { + "epoch": 1.78, + "grad_norm": 5.8295793533325195, + "learning_rate": 2.268146835723382e-05, + "loss": 0.8163, + "step": 35625 + }, + { + "epoch": 1.78, + "grad_norm": 5.463387489318848, + "learning_rate": 2.265839839063913e-05, + "loss": 0.8287, + "step": 35650 + }, + { + "epoch": 1.78, + "grad_norm": 2.2227492332458496, + "learning_rate": 2.2635328424044443e-05, + "loss": 0.808, + "step": 35675 + }, + { + "epoch": 1.78, + "grad_norm": 7.72734260559082, + "learning_rate": 2.2612258457449757e-05, + "loss": 0.7966, + "step": 35700 + }, + { + "epoch": 1.78, + "grad_norm": 3.9670233726501465, + "learning_rate": 2.2589188490855065e-05, + "loss": 0.7873, + "step": 35725 + }, + { + "epoch": 1.78, + "grad_norm": 4.4433207511901855, + "learning_rate": 2.2566118524260377e-05, + "loss": 0.9153, + "step": 35750 + }, + { + "epoch": 1.78, + "grad_norm": 2.567361831665039, + "learning_rate": 2.2543048557665688e-05, + "loss": 0.816, + "step": 35775 + }, + { + "epoch": 1.78, + "grad_norm": 2.7919533252716064, + "learning_rate": 2.2519978591071e-05, + "loss": 0.7996, + "step": 35800 + }, + { + "epoch": 1.79, + "grad_norm": 1.8410005569458008, + "learning_rate": 2.249690862447631e-05, + "loss": 0.8555, + "step": 35825 + }, + { + "epoch": 1.79, + "grad_norm": 3.8902974128723145, + "learning_rate": 2.2473838657881626e-05, + "loss": 0.8363, + "step": 35850 + }, + { + "epoch": 1.79, + "grad_norm": 2.6485462188720703, + "learning_rate": 2.2450768691286937e-05, + "loss": 0.8928, + "step": 35875 + }, + { + "epoch": 1.79, + "grad_norm": 3.155224323272705, + "learning_rate": 2.2427698724692248e-05, + "loss": 0.8476, + "step": 35900 + }, + { + "epoch": 1.79, + "grad_norm": 2.0526411533355713, + "learning_rate": 2.240462875809756e-05, + "loss": 0.8015, + "step": 35925 + }, + { + "epoch": 1.79, + "grad_norm": 2.686580181121826, + "learning_rate": 2.238155879150287e-05, + "loss": 0.7542, + "step": 35950 + }, + { + "epoch": 1.79, + "grad_norm": 4.2547688484191895, + "learning_rate": 2.2358488824908182e-05, + "loss": 0.8228, + "step": 35975 + }, + { + "epoch": 1.79, + "grad_norm": 5.983592987060547, + "learning_rate": 2.2335418858313494e-05, + "loss": 0.8093, + "step": 36000 + }, + { + "epoch": 1.8, + "grad_norm": 7.171198844909668, + "learning_rate": 2.2312348891718805e-05, + "loss": 0.7551, + "step": 36025 + }, + { + "epoch": 1.8, + "grad_norm": 5.765264511108398, + "learning_rate": 2.228927892512412e-05, + "loss": 0.9014, + "step": 36050 + }, + { + "epoch": 1.8, + "grad_norm": 2.6858668327331543, + "learning_rate": 2.226620895852943e-05, + "loss": 0.8089, + "step": 36075 + }, + { + "epoch": 1.8, + "grad_norm": 7.885138034820557, + "learning_rate": 2.224313899193474e-05, + "loss": 0.8527, + "step": 36100 + }, + { + "epoch": 1.8, + "grad_norm": 2.0656027793884277, + "learning_rate": 2.222006902534005e-05, + "loss": 0.7826, + "step": 36125 + }, + { + "epoch": 1.8, + "grad_norm": 5.330872535705566, + "learning_rate": 2.2196999058745362e-05, + "loss": 0.7946, + "step": 36150 + }, + { + "epoch": 1.8, + "grad_norm": 6.922634124755859, + "learning_rate": 2.2173929092150673e-05, + "loss": 0.8434, + "step": 36175 + }, + { + "epoch": 1.8, + "grad_norm": 2.980771064758301, + "learning_rate": 2.2150859125555988e-05, + "loss": 0.7595, + "step": 36200 + }, + { + "epoch": 1.81, + "grad_norm": 2.326483726501465, + "learning_rate": 2.21277891589613e-05, + "loss": 0.8722, + "step": 36225 + }, + { + "epoch": 1.81, + "grad_norm": 4.783517360687256, + "learning_rate": 2.210471919236661e-05, + "loss": 0.7853, + "step": 36250 + }, + { + "epoch": 1.81, + "grad_norm": 5.524138450622559, + "learning_rate": 2.2081649225771922e-05, + "loss": 0.75, + "step": 36275 + }, + { + "epoch": 1.81, + "grad_norm": 2.14884352684021, + "learning_rate": 2.2058579259177233e-05, + "loss": 0.9266, + "step": 36300 + }, + { + "epoch": 1.81, + "grad_norm": 4.895349502563477, + "learning_rate": 2.2035509292582545e-05, + "loss": 0.8576, + "step": 36325 + }, + { + "epoch": 1.81, + "grad_norm": 5.256768226623535, + "learning_rate": 2.2012439325987856e-05, + "loss": 0.8131, + "step": 36350 + }, + { + "epoch": 1.81, + "grad_norm": 2.264431953430176, + "learning_rate": 2.198936935939317e-05, + "loss": 0.8111, + "step": 36375 + }, + { + "epoch": 1.81, + "grad_norm": 2.4203009605407715, + "learning_rate": 2.1966299392798482e-05, + "loss": 0.8239, + "step": 36400 + }, + { + "epoch": 1.82, + "grad_norm": 2.2608046531677246, + "learning_rate": 2.1943229426203794e-05, + "loss": 0.845, + "step": 36425 + }, + { + "epoch": 1.82, + "grad_norm": 2.1424031257629395, + "learning_rate": 2.1920159459609105e-05, + "loss": 0.7855, + "step": 36450 + }, + { + "epoch": 1.82, + "grad_norm": 4.636570453643799, + "learning_rate": 2.1897089493014413e-05, + "loss": 0.8291, + "step": 36475 + }, + { + "epoch": 1.82, + "grad_norm": 4.953878402709961, + "learning_rate": 2.1874019526419724e-05, + "loss": 0.8101, + "step": 36500 + }, + { + "epoch": 1.82, + "grad_norm": 6.59492826461792, + "learning_rate": 2.185094955982504e-05, + "loss": 0.798, + "step": 36525 + }, + { + "epoch": 1.82, + "grad_norm": 3.5249340534210205, + "learning_rate": 2.182787959323035e-05, + "loss": 0.7401, + "step": 36550 + }, + { + "epoch": 1.82, + "grad_norm": 4.184223651885986, + "learning_rate": 2.1804809626635662e-05, + "loss": 0.8175, + "step": 36575 + }, + { + "epoch": 1.82, + "grad_norm": 2.470923900604248, + "learning_rate": 2.1781739660040973e-05, + "loss": 0.8137, + "step": 36600 + }, + { + "epoch": 1.83, + "grad_norm": 2.4369516372680664, + "learning_rate": 2.1758669693446285e-05, + "loss": 0.7938, + "step": 36625 + }, + { + "epoch": 1.83, + "grad_norm": 3.2020232677459717, + "learning_rate": 2.1735599726851596e-05, + "loss": 0.7854, + "step": 36650 + }, + { + "epoch": 1.83, + "grad_norm": 4.032663822174072, + "learning_rate": 2.1712529760256907e-05, + "loss": 0.8469, + "step": 36675 + }, + { + "epoch": 1.83, + "grad_norm": 12.03586196899414, + "learning_rate": 2.168945979366222e-05, + "loss": 0.8206, + "step": 36700 + }, + { + "epoch": 1.83, + "grad_norm": 4.0775556564331055, + "learning_rate": 2.1666389827067533e-05, + "loss": 0.7563, + "step": 36725 + }, + { + "epoch": 1.83, + "grad_norm": 2.1432387828826904, + "learning_rate": 2.1643319860472845e-05, + "loss": 0.869, + "step": 36750 + }, + { + "epoch": 1.83, + "grad_norm": 11.407499313354492, + "learning_rate": 2.1620249893878156e-05, + "loss": 0.7877, + "step": 36775 + }, + { + "epoch": 1.83, + "grad_norm": 5.67279052734375, + "learning_rate": 2.1597179927283467e-05, + "loss": 0.828, + "step": 36800 + }, + { + "epoch": 1.84, + "grad_norm": 2.022274971008301, + "learning_rate": 2.157410996068878e-05, + "loss": 0.82, + "step": 36825 + }, + { + "epoch": 1.84, + "grad_norm": 2.832973003387451, + "learning_rate": 2.1551039994094087e-05, + "loss": 0.813, + "step": 36850 + }, + { + "epoch": 1.84, + "grad_norm": 2.9639811515808105, + "learning_rate": 2.15279700274994e-05, + "loss": 0.8116, + "step": 36875 + }, + { + "epoch": 1.84, + "grad_norm": 4.492002010345459, + "learning_rate": 2.1504900060904713e-05, + "loss": 0.8496, + "step": 36900 + }, + { + "epoch": 1.84, + "grad_norm": 3.4977283477783203, + "learning_rate": 2.1481830094310024e-05, + "loss": 0.8079, + "step": 36925 + }, + { + "epoch": 1.84, + "grad_norm": 6.909940719604492, + "learning_rate": 2.1458760127715336e-05, + "loss": 0.824, + "step": 36950 + }, + { + "epoch": 1.84, + "grad_norm": 4.175833702087402, + "learning_rate": 2.1435690161120647e-05, + "loss": 0.888, + "step": 36975 + }, + { + "epoch": 1.84, + "grad_norm": 4.707504749298096, + "learning_rate": 2.1412620194525958e-05, + "loss": 0.775, + "step": 37000 + }, + { + "epoch": 1.84, + "grad_norm": 3.5683321952819824, + "learning_rate": 2.138955022793127e-05, + "loss": 0.8675, + "step": 37025 + }, + { + "epoch": 1.85, + "grad_norm": 4.9137043952941895, + "learning_rate": 2.1366480261336584e-05, + "loss": 0.7563, + "step": 37050 + }, + { + "epoch": 1.85, + "grad_norm": 5.058765888214111, + "learning_rate": 2.1343410294741896e-05, + "loss": 0.862, + "step": 37075 + }, + { + "epoch": 1.85, + "grad_norm": 2.8901517391204834, + "learning_rate": 2.1320340328147207e-05, + "loss": 0.8354, + "step": 37100 + }, + { + "epoch": 1.85, + "grad_norm": 2.0617527961730957, + "learning_rate": 2.129727036155252e-05, + "loss": 0.8047, + "step": 37125 + }, + { + "epoch": 1.85, + "grad_norm": 7.36972713470459, + "learning_rate": 2.127420039495783e-05, + "loss": 0.7663, + "step": 37150 + }, + { + "epoch": 1.85, + "grad_norm": 3.1017777919769287, + "learning_rate": 2.125113042836314e-05, + "loss": 0.8682, + "step": 37175 + }, + { + "epoch": 1.85, + "grad_norm": 2.616933584213257, + "learning_rate": 2.1228060461768453e-05, + "loss": 0.8047, + "step": 37200 + }, + { + "epoch": 1.85, + "grad_norm": 4.542732238769531, + "learning_rate": 2.1204990495173764e-05, + "loss": 0.8326, + "step": 37225 + }, + { + "epoch": 1.86, + "grad_norm": 2.4547290802001953, + "learning_rate": 2.1181920528579075e-05, + "loss": 0.8496, + "step": 37250 + }, + { + "epoch": 1.86, + "grad_norm": 6.471099376678467, + "learning_rate": 2.1158850561984387e-05, + "loss": 0.8131, + "step": 37275 + }, + { + "epoch": 1.86, + "grad_norm": 2.56229567527771, + "learning_rate": 2.1135780595389698e-05, + "loss": 0.8289, + "step": 37300 + }, + { + "epoch": 1.86, + "grad_norm": 4.7177300453186035, + "learning_rate": 2.111271062879501e-05, + "loss": 0.8691, + "step": 37325 + }, + { + "epoch": 1.86, + "grad_norm": 3.8973240852355957, + "learning_rate": 2.108964066220032e-05, + "loss": 0.7911, + "step": 37350 + }, + { + "epoch": 1.86, + "grad_norm": 5.638448715209961, + "learning_rate": 2.1066570695605632e-05, + "loss": 0.774, + "step": 37375 + }, + { + "epoch": 1.86, + "grad_norm": 2.739816427230835, + "learning_rate": 2.1043500729010947e-05, + "loss": 0.7736, + "step": 37400 + }, + { + "epoch": 1.86, + "grad_norm": 2.312668561935425, + "learning_rate": 2.1020430762416258e-05, + "loss": 0.8059, + "step": 37425 + }, + { + "epoch": 1.87, + "grad_norm": 8.428336143493652, + "learning_rate": 2.099736079582157e-05, + "loss": 0.7917, + "step": 37450 + }, + { + "epoch": 1.87, + "grad_norm": 2.120487928390503, + "learning_rate": 2.097429082922688e-05, + "loss": 0.8561, + "step": 37475 + }, + { + "epoch": 1.87, + "grad_norm": 2.118880271911621, + "learning_rate": 2.0951220862632192e-05, + "loss": 0.8347, + "step": 37500 + }, + { + "epoch": 1.87, + "grad_norm": 2.0460243225097656, + "learning_rate": 2.0928150896037504e-05, + "loss": 0.7991, + "step": 37525 + }, + { + "epoch": 1.87, + "grad_norm": 1.8884197473526, + "learning_rate": 2.0905080929442815e-05, + "loss": 0.7781, + "step": 37550 + }, + { + "epoch": 1.87, + "grad_norm": 1.735609769821167, + "learning_rate": 2.088201096284813e-05, + "loss": 0.7974, + "step": 37575 + }, + { + "epoch": 1.87, + "grad_norm": 7.059275150299072, + "learning_rate": 2.0858940996253438e-05, + "loss": 0.7835, + "step": 37600 + }, + { + "epoch": 1.87, + "grad_norm": 4.521622657775879, + "learning_rate": 2.083587102965875e-05, + "loss": 0.8168, + "step": 37625 + }, + { + "epoch": 1.88, + "grad_norm": 1.8211438655853271, + "learning_rate": 2.081280106306406e-05, + "loss": 0.772, + "step": 37650 + }, + { + "epoch": 1.88, + "grad_norm": 6.678451061248779, + "learning_rate": 2.0789731096469372e-05, + "loss": 0.8403, + "step": 37675 + }, + { + "epoch": 1.88, + "grad_norm": 7.757253170013428, + "learning_rate": 2.0766661129874683e-05, + "loss": 0.8458, + "step": 37700 + }, + { + "epoch": 1.88, + "grad_norm": 2.1668477058410645, + "learning_rate": 2.0743591163279995e-05, + "loss": 0.8513, + "step": 37725 + }, + { + "epoch": 1.88, + "grad_norm": 3.1951613426208496, + "learning_rate": 2.072052119668531e-05, + "loss": 0.8782, + "step": 37750 + }, + { + "epoch": 1.88, + "grad_norm": 3.6583151817321777, + "learning_rate": 2.069745123009062e-05, + "loss": 0.8233, + "step": 37775 + }, + { + "epoch": 1.88, + "grad_norm": 4.582195281982422, + "learning_rate": 2.0674381263495932e-05, + "loss": 0.8616, + "step": 37800 + }, + { + "epoch": 1.88, + "grad_norm": 4.346683025360107, + "learning_rate": 2.0651311296901243e-05, + "loss": 0.7653, + "step": 37825 + }, + { + "epoch": 1.89, + "grad_norm": 6.112051486968994, + "learning_rate": 2.0628241330306555e-05, + "loss": 0.8561, + "step": 37850 + }, + { + "epoch": 1.89, + "grad_norm": 3.2946393489837646, + "learning_rate": 2.0605171363711866e-05, + "loss": 0.775, + "step": 37875 + }, + { + "epoch": 1.89, + "grad_norm": 1.5814847946166992, + "learning_rate": 2.0582101397117177e-05, + "loss": 0.506, + "step": 37900 + }, + { + "epoch": 1.89, + "grad_norm": 3.392300844192505, + "learning_rate": 2.0559031430522492e-05, + "loss": 0.5086, + "step": 37925 + }, + { + "epoch": 1.89, + "grad_norm": 5.850569725036621, + "learning_rate": 2.0535961463927804e-05, + "loss": 0.55, + "step": 37950 + }, + { + "epoch": 1.89, + "grad_norm": 2.7387137413024902, + "learning_rate": 2.051289149733311e-05, + "loss": 0.4841, + "step": 37975 + }, + { + "epoch": 1.89, + "grad_norm": 2.8632755279541016, + "learning_rate": 2.0489821530738423e-05, + "loss": 0.4663, + "step": 38000 + }, + { + "epoch": 1.89, + "grad_norm": 4.828920841217041, + "learning_rate": 2.0466751564143734e-05, + "loss": 0.8887, + "step": 38025 + }, + { + "epoch": 1.9, + "grad_norm": 3.6731338500976562, + "learning_rate": 2.0443681597549046e-05, + "loss": 0.5948, + "step": 38050 + }, + { + "epoch": 1.9, + "grad_norm": 5.887766361236572, + "learning_rate": 2.042061163095436e-05, + "loss": 0.4842, + "step": 38075 + }, + { + "epoch": 1.9, + "grad_norm": 3.1257717609405518, + "learning_rate": 2.039754166435967e-05, + "loss": 0.5482, + "step": 38100 + }, + { + "epoch": 1.9, + "grad_norm": 1.9796582460403442, + "learning_rate": 2.0374471697764983e-05, + "loss": 0.4274, + "step": 38125 + }, + { + "epoch": 1.9, + "grad_norm": 2.342222213745117, + "learning_rate": 2.0351401731170294e-05, + "loss": 0.4282, + "step": 38150 + }, + { + "epoch": 1.9, + "grad_norm": 3.084747791290283, + "learning_rate": 2.0328331764575606e-05, + "loss": 0.517, + "step": 38175 + }, + { + "epoch": 1.9, + "grad_norm": 4.009477615356445, + "learning_rate": 2.0305261797980917e-05, + "loss": 0.3951, + "step": 38200 + }, + { + "epoch": 1.9, + "grad_norm": 5.501810073852539, + "learning_rate": 2.028219183138623e-05, + "loss": 0.4186, + "step": 38225 + }, + { + "epoch": 1.91, + "grad_norm": 1.4934760332107544, + "learning_rate": 2.025912186479154e-05, + "loss": 0.4899, + "step": 38250 + }, + { + "epoch": 1.91, + "grad_norm": 4.006719589233398, + "learning_rate": 2.0236051898196855e-05, + "loss": 0.6189, + "step": 38275 + }, + { + "epoch": 1.91, + "grad_norm": 2.458674907684326, + "learning_rate": 2.0212981931602166e-05, + "loss": 0.4577, + "step": 38300 + }, + { + "epoch": 1.91, + "grad_norm": 2.8802576065063477, + "learning_rate": 2.0189911965007477e-05, + "loss": 0.5431, + "step": 38325 + }, + { + "epoch": 1.91, + "grad_norm": 6.646295070648193, + "learning_rate": 2.0166841998412785e-05, + "loss": 0.4843, + "step": 38350 + }, + { + "epoch": 1.91, + "grad_norm": 3.5825748443603516, + "learning_rate": 2.0143772031818097e-05, + "loss": 0.5397, + "step": 38375 + }, + { + "epoch": 1.91, + "grad_norm": 3.379711389541626, + "learning_rate": 2.0120702065223408e-05, + "loss": 0.6215, + "step": 38400 + }, + { + "epoch": 1.91, + "grad_norm": 4.383761882781982, + "learning_rate": 2.0097632098628723e-05, + "loss": 0.6691, + "step": 38425 + }, + { + "epoch": 1.92, + "grad_norm": 2.3637311458587646, + "learning_rate": 2.0074562132034034e-05, + "loss": 0.509, + "step": 38450 + }, + { + "epoch": 1.92, + "grad_norm": 3.028691291809082, + "learning_rate": 2.0051492165439345e-05, + "loss": 0.4861, + "step": 38475 + }, + { + "epoch": 1.92, + "grad_norm": 2.57072377204895, + "learning_rate": 2.0028422198844657e-05, + "loss": 0.4268, + "step": 38500 + }, + { + "epoch": 1.92, + "grad_norm": 1.385330319404602, + "learning_rate": 2.0005352232249968e-05, + "loss": 0.541, + "step": 38525 + }, + { + "epoch": 1.92, + "grad_norm": 4.9758758544921875, + "learning_rate": 1.998228226565528e-05, + "loss": 0.4891, + "step": 38550 + }, + { + "epoch": 1.92, + "grad_norm": 1.3216552734375, + "learning_rate": 1.995921229906059e-05, + "loss": 0.7594, + "step": 38575 + }, + { + "epoch": 1.92, + "grad_norm": 1.4653106927871704, + "learning_rate": 1.9936142332465906e-05, + "loss": 0.6146, + "step": 38600 + }, + { + "epoch": 1.92, + "grad_norm": 2.2742831707000732, + "learning_rate": 1.9913072365871217e-05, + "loss": 0.6672, + "step": 38625 + }, + { + "epoch": 1.93, + "grad_norm": 3.1644742488861084, + "learning_rate": 1.989000239927653e-05, + "loss": 0.5287, + "step": 38650 + }, + { + "epoch": 1.93, + "grad_norm": 2.7336010932922363, + "learning_rate": 1.986693243268184e-05, + "loss": 0.5898, + "step": 38675 + }, + { + "epoch": 1.93, + "grad_norm": 3.7895514965057373, + "learning_rate": 1.984386246608715e-05, + "loss": 0.45, + "step": 38700 + }, + { + "epoch": 1.93, + "grad_norm": 2.277320384979248, + "learning_rate": 1.982079249949246e-05, + "loss": 0.4729, + "step": 38725 + }, + { + "epoch": 1.93, + "grad_norm": 2.145059108734131, + "learning_rate": 1.9797722532897774e-05, + "loss": 0.5978, + "step": 38750 + }, + { + "epoch": 1.93, + "grad_norm": 5.892246246337891, + "learning_rate": 1.9774652566303085e-05, + "loss": 0.4955, + "step": 38775 + }, + { + "epoch": 1.93, + "grad_norm": 1.9532073736190796, + "learning_rate": 1.9751582599708397e-05, + "loss": 0.4587, + "step": 38800 + }, + { + "epoch": 1.93, + "grad_norm": 3.237112283706665, + "learning_rate": 1.9728512633113708e-05, + "loss": 0.5907, + "step": 38825 + }, + { + "epoch": 1.94, + "grad_norm": 2.5157840251922607, + "learning_rate": 1.970544266651902e-05, + "loss": 0.6024, + "step": 38850 + }, + { + "epoch": 1.94, + "grad_norm": 2.959118127822876, + "learning_rate": 1.968237269992433e-05, + "loss": 0.4482, + "step": 38875 + }, + { + "epoch": 1.94, + "grad_norm": 1.199279546737671, + "learning_rate": 1.9659302733329642e-05, + "loss": 0.6254, + "step": 38900 + }, + { + "epoch": 1.94, + "grad_norm": 2.68166184425354, + "learning_rate": 1.9636232766734953e-05, + "loss": 0.5216, + "step": 38925 + }, + { + "epoch": 1.94, + "grad_norm": 3.7433533668518066, + "learning_rate": 1.9613162800140268e-05, + "loss": 0.5939, + "step": 38950 + }, + { + "epoch": 1.94, + "grad_norm": 2.2293860912323, + "learning_rate": 1.959009283354558e-05, + "loss": 0.4903, + "step": 38975 + }, + { + "epoch": 1.94, + "grad_norm": 4.912618637084961, + "learning_rate": 1.956702286695089e-05, + "loss": 0.4401, + "step": 39000 + }, + { + "epoch": 1.94, + "grad_norm": 10.129170417785645, + "learning_rate": 1.9543952900356202e-05, + "loss": 0.5514, + "step": 39025 + }, + { + "epoch": 1.95, + "grad_norm": 2.102691173553467, + "learning_rate": 1.9520882933761514e-05, + "loss": 0.4956, + "step": 39050 + }, + { + "epoch": 1.95, + "grad_norm": 2.0237252712249756, + "learning_rate": 1.9497812967166825e-05, + "loss": 0.52, + "step": 39075 + }, + { + "epoch": 1.95, + "grad_norm": 2.302222490310669, + "learning_rate": 1.9474743000572136e-05, + "loss": 0.3286, + "step": 39100 + }, + { + "epoch": 1.95, + "grad_norm": 1.6583091020584106, + "learning_rate": 1.9451673033977448e-05, + "loss": 0.4163, + "step": 39125 + }, + { + "epoch": 1.95, + "grad_norm": 2.9822421073913574, + "learning_rate": 1.942860306738276e-05, + "loss": 0.4914, + "step": 39150 + }, + { + "epoch": 1.95, + "grad_norm": 3.1285948753356934, + "learning_rate": 1.940553310078807e-05, + "loss": 0.5983, + "step": 39175 + }, + { + "epoch": 1.95, + "grad_norm": 3.0795788764953613, + "learning_rate": 1.938246313419338e-05, + "loss": 0.4094, + "step": 39200 + }, + { + "epoch": 1.95, + "grad_norm": 20.059349060058594, + "learning_rate": 1.9359393167598693e-05, + "loss": 0.4936, + "step": 39225 + }, + { + "epoch": 1.96, + "grad_norm": 2.8071374893188477, + "learning_rate": 1.9336323201004004e-05, + "loss": 0.5021, + "step": 39250 + }, + { + "epoch": 1.96, + "grad_norm": 27.87400245666504, + "learning_rate": 1.931325323440932e-05, + "loss": 0.4801, + "step": 39275 + }, + { + "epoch": 1.96, + "grad_norm": 3.6269407272338867, + "learning_rate": 1.929018326781463e-05, + "loss": 0.505, + "step": 39300 + }, + { + "epoch": 1.96, + "grad_norm": 3.781467914581299, + "learning_rate": 1.9267113301219942e-05, + "loss": 0.4676, + "step": 39325 + }, + { + "epoch": 1.96, + "grad_norm": 2.3654417991638184, + "learning_rate": 1.9244043334625253e-05, + "loss": 0.5953, + "step": 39350 + }, + { + "epoch": 1.96, + "grad_norm": 3.031672954559326, + "learning_rate": 1.9220973368030565e-05, + "loss": 0.5119, + "step": 39375 + }, + { + "epoch": 1.96, + "grad_norm": 3.2161550521850586, + "learning_rate": 1.9197903401435876e-05, + "loss": 0.4759, + "step": 39400 + }, + { + "epoch": 1.96, + "grad_norm": 1.5116912126541138, + "learning_rate": 1.9174833434841187e-05, + "loss": 0.4879, + "step": 39425 + }, + { + "epoch": 1.97, + "grad_norm": 4.8071770668029785, + "learning_rate": 1.91517634682465e-05, + "loss": 0.5768, + "step": 39450 + }, + { + "epoch": 1.97, + "grad_norm": 2.3467979431152344, + "learning_rate": 1.912869350165181e-05, + "loss": 0.4743, + "step": 39475 + }, + { + "epoch": 1.97, + "grad_norm": 3.0653600692749023, + "learning_rate": 1.910562353505712e-05, + "loss": 0.5656, + "step": 39500 + }, + { + "epoch": 1.97, + "grad_norm": 3.336280584335327, + "learning_rate": 1.9082553568462433e-05, + "loss": 0.4947, + "step": 39525 + }, + { + "epoch": 1.97, + "grad_norm": 3.2678709030151367, + "learning_rate": 1.9059483601867744e-05, + "loss": 0.4589, + "step": 39550 + }, + { + "epoch": 1.97, + "grad_norm": 2.3263626098632812, + "learning_rate": 1.9036413635273055e-05, + "loss": 0.446, + "step": 39575 + }, + { + "epoch": 1.97, + "grad_norm": 2.6241798400878906, + "learning_rate": 1.9013343668678367e-05, + "loss": 0.5622, + "step": 39600 + }, + { + "epoch": 1.97, + "grad_norm": 41.76066970825195, + "learning_rate": 1.899027370208368e-05, + "loss": 0.4721, + "step": 39625 + }, + { + "epoch": 1.98, + "grad_norm": 5.291028022766113, + "learning_rate": 1.8967203735488993e-05, + "loss": 0.4655, + "step": 39650 + }, + { + "epoch": 1.98, + "grad_norm": 1.498937964439392, + "learning_rate": 1.8944133768894304e-05, + "loss": 0.4562, + "step": 39675 + }, + { + "epoch": 1.98, + "grad_norm": 5.547112464904785, + "learning_rate": 1.8921063802299616e-05, + "loss": 0.4972, + "step": 39700 + }, + { + "epoch": 1.98, + "grad_norm": 3.455609083175659, + "learning_rate": 1.8897993835704927e-05, + "loss": 0.5478, + "step": 39725 + }, + { + "epoch": 1.98, + "grad_norm": 1.4441814422607422, + "learning_rate": 1.887492386911024e-05, + "loss": 0.5511, + "step": 39750 + }, + { + "epoch": 1.98, + "grad_norm": 3.192058563232422, + "learning_rate": 1.885185390251555e-05, + "loss": 0.4308, + "step": 39775 + }, + { + "epoch": 1.98, + "grad_norm": 1.615493655204773, + "learning_rate": 1.8828783935920864e-05, + "loss": 0.3588, + "step": 39800 + }, + { + "epoch": 1.98, + "grad_norm": 1.5515351295471191, + "learning_rate": 1.8805713969326176e-05, + "loss": 0.4596, + "step": 39825 + }, + { + "epoch": 1.99, + "grad_norm": 13.214730262756348, + "learning_rate": 1.8782644002731484e-05, + "loss": 0.4913, + "step": 39850 + }, + { + "epoch": 1.99, + "grad_norm": 1.2539523839950562, + "learning_rate": 1.8759574036136795e-05, + "loss": 0.3608, + "step": 39875 + }, + { + "epoch": 1.99, + "grad_norm": 1.4073877334594727, + "learning_rate": 1.8736504069542107e-05, + "loss": 0.4483, + "step": 39900 + }, + { + "epoch": 1.99, + "grad_norm": 1.7948969602584839, + "learning_rate": 1.8713434102947418e-05, + "loss": 0.4403, + "step": 39925 + }, + { + "epoch": 1.99, + "grad_norm": 3.484506130218506, + "learning_rate": 1.8690364136352733e-05, + "loss": 0.4113, + "step": 39950 + }, + { + "epoch": 1.99, + "grad_norm": 2.7683985233306885, + "learning_rate": 1.8667294169758044e-05, + "loss": 0.499, + "step": 39975 + }, + { + "epoch": 1.99, + "grad_norm": 6.0013885498046875, + "learning_rate": 1.8644224203163355e-05, + "loss": 0.5183, + "step": 40000 + }, + { + "epoch": 1.99, + "grad_norm": 2.2845098972320557, + "learning_rate": 1.8621154236568667e-05, + "loss": 0.4011, + "step": 40025 + }, + { + "epoch": 2.0, + "grad_norm": 3.0258421897888184, + "learning_rate": 1.8598084269973978e-05, + "loss": 0.5238, + "step": 40050 + }, + { + "epoch": 2.0, + "grad_norm": 5.795552730560303, + "learning_rate": 1.857501430337929e-05, + "loss": 0.5491, + "step": 40075 + }, + { + "epoch": 2.0, + "grad_norm": 3.7295963764190674, + "learning_rate": 1.85519443367846e-05, + "loss": 0.417, + "step": 40100 + }, + { + "epoch": 2.0, + "grad_norm": 9.04057788848877, + "learning_rate": 1.8528874370189912e-05, + "loss": 0.5175, + "step": 40125 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.8620156970225489, + "eval_f1_macro": 0.752511699878362, + "eval_f1_micro": 0.8620156970225489, + "eval_f1_weighted": 0.8588178797835208, + "eval_loss": 0.4766501784324646, + "eval_precision_macro": 0.819008993064034, + "eval_precision_micro": 0.8620156970225489, + "eval_precision_weighted": 0.8602145721151697, + "eval_recall_macro": 0.7172989796317722, + "eval_recall_micro": 0.8620156970225489, + "eval_recall_weighted": 0.8620156970225489, + "eval_runtime": 7058.5264, + "eval_samples_per_second": 5.686, + "eval_steps_per_second": 0.355, + "step": 40136 + }, + { + "epoch": 2.0, + "grad_norm": 1.2662018537521362, + "learning_rate": 1.8505804403595227e-05, + "loss": 0.5362, + "step": 40150 + }, + { + "epoch": 2.0, + "grad_norm": 6.242440700531006, + "learning_rate": 1.8482734437000538e-05, + "loss": 0.4781, + "step": 40175 + }, + { + "epoch": 2.0, + "grad_norm": 2.6231377124786377, + "learning_rate": 1.845966447040585e-05, + "loss": 0.4311, + "step": 40200 + }, + { + "epoch": 2.0, + "grad_norm": 2.520632028579712, + "learning_rate": 1.8436594503811158e-05, + "loss": 0.4914, + "step": 40225 + }, + { + "epoch": 2.01, + "grad_norm": 3.3773555755615234, + "learning_rate": 1.841352453721647e-05, + "loss": 0.4345, + "step": 40250 + }, + { + "epoch": 2.01, + "grad_norm": 5.317331790924072, + "learning_rate": 1.839045457062178e-05, + "loss": 0.4221, + "step": 40275 + }, + { + "epoch": 2.01, + "grad_norm": 2.9365406036376953, + "learning_rate": 1.8367384604027095e-05, + "loss": 0.5238, + "step": 40300 + }, + { + "epoch": 2.01, + "grad_norm": 6.145638465881348, + "learning_rate": 1.8344314637432406e-05, + "loss": 0.5701, + "step": 40325 + }, + { + "epoch": 2.01, + "grad_norm": 4.878846168518066, + "learning_rate": 1.8321244670837718e-05, + "loss": 0.5091, + "step": 40350 + }, + { + "epoch": 2.01, + "grad_norm": 5.4882493019104, + "learning_rate": 1.829817470424303e-05, + "loss": 0.5533, + "step": 40375 + }, + { + "epoch": 2.01, + "grad_norm": 1.296852469444275, + "learning_rate": 1.827510473764834e-05, + "loss": 0.5923, + "step": 40400 + }, + { + "epoch": 2.01, + "grad_norm": 9.215230941772461, + "learning_rate": 1.8252034771053652e-05, + "loss": 0.4332, + "step": 40425 + }, + { + "epoch": 2.02, + "grad_norm": 3.852431058883667, + "learning_rate": 1.8228964804458963e-05, + "loss": 0.5316, + "step": 40450 + }, + { + "epoch": 2.02, + "grad_norm": 1.412146806716919, + "learning_rate": 1.8205894837864278e-05, + "loss": 0.4826, + "step": 40475 + }, + { + "epoch": 2.02, + "grad_norm": 1.4537949562072754, + "learning_rate": 1.818282487126959e-05, + "loss": 0.5232, + "step": 40500 + }, + { + "epoch": 2.02, + "grad_norm": 4.489310264587402, + "learning_rate": 1.81597549046749e-05, + "loss": 0.5984, + "step": 40525 + }, + { + "epoch": 2.02, + "grad_norm": 51.95314025878906, + "learning_rate": 1.8136684938080212e-05, + "loss": 0.5185, + "step": 40550 + }, + { + "epoch": 2.02, + "grad_norm": 3.217198371887207, + "learning_rate": 1.811361497148552e-05, + "loss": 0.4918, + "step": 40575 + }, + { + "epoch": 2.02, + "grad_norm": 4.193581581115723, + "learning_rate": 1.809054500489083e-05, + "loss": 0.5057, + "step": 40600 + }, + { + "epoch": 2.02, + "grad_norm": 11.709174156188965, + "learning_rate": 1.8067475038296143e-05, + "loss": 0.4267, + "step": 40625 + }, + { + "epoch": 2.03, + "grad_norm": 1.6062325239181519, + "learning_rate": 1.8044405071701457e-05, + "loss": 0.4338, + "step": 40650 + }, + { + "epoch": 2.03, + "grad_norm": 3.2066144943237305, + "learning_rate": 1.802133510510677e-05, + "loss": 0.4786, + "step": 40675 + }, + { + "epoch": 2.03, + "grad_norm": 72.72689819335938, + "learning_rate": 1.799826513851208e-05, + "loss": 0.4422, + "step": 40700 + }, + { + "epoch": 2.03, + "grad_norm": 8.56615161895752, + "learning_rate": 1.797519517191739e-05, + "loss": 0.5187, + "step": 40725 + }, + { + "epoch": 2.03, + "grad_norm": 2.4742374420166016, + "learning_rate": 1.7952125205322703e-05, + "loss": 0.4785, + "step": 40750 + }, + { + "epoch": 2.03, + "grad_norm": 0.9956190586090088, + "learning_rate": 1.7929055238728014e-05, + "loss": 0.3973, + "step": 40775 + }, + { + "epoch": 2.03, + "grad_norm": 1.506685495376587, + "learning_rate": 1.7905985272133326e-05, + "loss": 0.4557, + "step": 40800 + }, + { + "epoch": 2.03, + "grad_norm": 2.4764609336853027, + "learning_rate": 1.788291530553864e-05, + "loss": 0.5184, + "step": 40825 + }, + { + "epoch": 2.04, + "grad_norm": 6.196167469024658, + "learning_rate": 1.7859845338943952e-05, + "loss": 0.3534, + "step": 40850 + }, + { + "epoch": 2.04, + "grad_norm": 3.4392249584198, + "learning_rate": 1.7836775372349263e-05, + "loss": 0.4914, + "step": 40875 + }, + { + "epoch": 2.04, + "grad_norm": 3.8605923652648926, + "learning_rate": 1.7813705405754574e-05, + "loss": 0.4917, + "step": 40900 + }, + { + "epoch": 2.04, + "grad_norm": 2.812488317489624, + "learning_rate": 1.7790635439159886e-05, + "loss": 0.4589, + "step": 40925 + }, + { + "epoch": 2.04, + "grad_norm": 1.0951435565948486, + "learning_rate": 1.7767565472565194e-05, + "loss": 0.5361, + "step": 40950 + }, + { + "epoch": 2.04, + "grad_norm": 3.0446274280548096, + "learning_rate": 1.774449550597051e-05, + "loss": 0.4389, + "step": 40975 + }, + { + "epoch": 2.04, + "grad_norm": 1.0899313688278198, + "learning_rate": 1.772142553937582e-05, + "loss": 0.3562, + "step": 41000 + }, + { + "epoch": 2.04, + "grad_norm": 3.682464122772217, + "learning_rate": 1.769835557278113e-05, + "loss": 0.4362, + "step": 41025 + }, + { + "epoch": 2.05, + "grad_norm": 5.872410774230957, + "learning_rate": 1.7675285606186443e-05, + "loss": 0.3763, + "step": 41050 + }, + { + "epoch": 2.05, + "grad_norm": 2.9771783351898193, + "learning_rate": 1.7652215639591754e-05, + "loss": 0.4498, + "step": 41075 + }, + { + "epoch": 2.05, + "grad_norm": 1.2145291566848755, + "learning_rate": 1.7629145672997065e-05, + "loss": 0.5256, + "step": 41100 + }, + { + "epoch": 2.05, + "grad_norm": 23.237903594970703, + "learning_rate": 1.7606075706402377e-05, + "loss": 0.4201, + "step": 41125 + }, + { + "epoch": 2.05, + "grad_norm": 35.532711029052734, + "learning_rate": 1.7583005739807688e-05, + "loss": 0.4249, + "step": 41150 + }, + { + "epoch": 2.05, + "grad_norm": 1.2682620286941528, + "learning_rate": 1.7559935773213003e-05, + "loss": 0.4446, + "step": 41175 + }, + { + "epoch": 2.05, + "grad_norm": 3.1225569248199463, + "learning_rate": 1.7536865806618314e-05, + "loss": 0.4662, + "step": 41200 + }, + { + "epoch": 2.05, + "grad_norm": 33.2777099609375, + "learning_rate": 1.7513795840023626e-05, + "loss": 0.4484, + "step": 41225 + }, + { + "epoch": 2.06, + "grad_norm": 2.847723960876465, + "learning_rate": 1.7490725873428937e-05, + "loss": 0.52, + "step": 41250 + }, + { + "epoch": 2.06, + "grad_norm": 56.711578369140625, + "learning_rate": 1.7467655906834248e-05, + "loss": 0.5024, + "step": 41275 + }, + { + "epoch": 2.06, + "grad_norm": 6.478834629058838, + "learning_rate": 1.744458594023956e-05, + "loss": 0.4259, + "step": 41300 + }, + { + "epoch": 2.06, + "grad_norm": 44.3673095703125, + "learning_rate": 1.742151597364487e-05, + "loss": 0.5075, + "step": 41325 + }, + { + "epoch": 2.06, + "grad_norm": 8.986730575561523, + "learning_rate": 1.7398446007050182e-05, + "loss": 0.5224, + "step": 41350 + }, + { + "epoch": 2.06, + "grad_norm": 1.1015548706054688, + "learning_rate": 1.7375376040455494e-05, + "loss": 0.519, + "step": 41375 + }, + { + "epoch": 2.06, + "grad_norm": 4.729287147521973, + "learning_rate": 1.7352306073860805e-05, + "loss": 0.3976, + "step": 41400 + }, + { + "epoch": 2.06, + "grad_norm": 12.000980377197266, + "learning_rate": 1.7329236107266116e-05, + "loss": 0.4831, + "step": 41425 + }, + { + "epoch": 2.07, + "grad_norm": 3.545671224594116, + "learning_rate": 1.7306166140671428e-05, + "loss": 0.5621, + "step": 41450 + }, + { + "epoch": 2.07, + "grad_norm": 2.6213879585266113, + "learning_rate": 1.728309617407674e-05, + "loss": 0.3901, + "step": 41475 + }, + { + "epoch": 2.07, + "grad_norm": 3.5966148376464844, + "learning_rate": 1.7260026207482054e-05, + "loss": 0.4114, + "step": 41500 + }, + { + "epoch": 2.07, + "grad_norm": 1.0668089389801025, + "learning_rate": 1.7236956240887365e-05, + "loss": 0.4352, + "step": 41525 + }, + { + "epoch": 2.07, + "grad_norm": 2.247910976409912, + "learning_rate": 1.7213886274292677e-05, + "loss": 0.352, + "step": 41550 + }, + { + "epoch": 2.07, + "grad_norm": 40.24725341796875, + "learning_rate": 1.7190816307697988e-05, + "loss": 0.4396, + "step": 41575 + }, + { + "epoch": 2.07, + "grad_norm": 3.1879642009735107, + "learning_rate": 1.71677463411033e-05, + "loss": 0.4884, + "step": 41600 + }, + { + "epoch": 2.07, + "grad_norm": 0.8823301196098328, + "learning_rate": 1.714467637450861e-05, + "loss": 0.5067, + "step": 41625 + }, + { + "epoch": 2.08, + "grad_norm": 0.8447086811065674, + "learning_rate": 1.7121606407913922e-05, + "loss": 0.2941, + "step": 41650 + }, + { + "epoch": 2.08, + "grad_norm": 0.8696421980857849, + "learning_rate": 1.7098536441319233e-05, + "loss": 0.4713, + "step": 41675 + }, + { + "epoch": 2.08, + "grad_norm": 3.203528642654419, + "learning_rate": 1.7075466474724545e-05, + "loss": 0.5117, + "step": 41700 + }, + { + "epoch": 2.08, + "grad_norm": 2.9464070796966553, + "learning_rate": 1.7052396508129856e-05, + "loss": 0.5354, + "step": 41725 + }, + { + "epoch": 2.08, + "grad_norm": 3.2064943313598633, + "learning_rate": 1.7029326541535167e-05, + "loss": 0.5292, + "step": 41750 + }, + { + "epoch": 2.08, + "grad_norm": 2.7801599502563477, + "learning_rate": 1.700625657494048e-05, + "loss": 0.4284, + "step": 41775 + }, + { + "epoch": 2.08, + "grad_norm": 1.081207513809204, + "learning_rate": 1.698318660834579e-05, + "loss": 0.3817, + "step": 41800 + }, + { + "epoch": 2.08, + "grad_norm": 3.171081066131592, + "learning_rate": 1.69601166417511e-05, + "loss": 0.6747, + "step": 41825 + }, + { + "epoch": 2.09, + "grad_norm": 2.7076244354248047, + "learning_rate": 1.6937046675156416e-05, + "loss": 0.4907, + "step": 41850 + }, + { + "epoch": 2.09, + "grad_norm": 5.660066604614258, + "learning_rate": 1.6913976708561728e-05, + "loss": 0.6036, + "step": 41875 + }, + { + "epoch": 2.09, + "grad_norm": 5.710346698760986, + "learning_rate": 1.689090674196704e-05, + "loss": 0.5117, + "step": 41900 + }, + { + "epoch": 2.09, + "grad_norm": 98.65436553955078, + "learning_rate": 1.686783677537235e-05, + "loss": 0.4043, + "step": 41925 + }, + { + "epoch": 2.09, + "grad_norm": 20.60503387451172, + "learning_rate": 1.6844766808777662e-05, + "loss": 0.5613, + "step": 41950 + }, + { + "epoch": 2.09, + "grad_norm": 1.6520097255706787, + "learning_rate": 1.6821696842182973e-05, + "loss": 0.3941, + "step": 41975 + }, + { + "epoch": 2.09, + "grad_norm": 3.609546184539795, + "learning_rate": 1.6798626875588284e-05, + "loss": 0.5384, + "step": 42000 + }, + { + "epoch": 2.09, + "grad_norm": 6.308892250061035, + "learning_rate": 1.67755569089936e-05, + "loss": 0.4076, + "step": 42025 + }, + { + "epoch": 2.1, + "grad_norm": 5.283653736114502, + "learning_rate": 1.675248694239891e-05, + "loss": 0.605, + "step": 42050 + }, + { + "epoch": 2.1, + "grad_norm": 1.1643218994140625, + "learning_rate": 1.672941697580422e-05, + "loss": 0.4295, + "step": 42075 + }, + { + "epoch": 2.1, + "grad_norm": 6.132506847381592, + "learning_rate": 1.670634700920953e-05, + "loss": 0.4605, + "step": 42100 + }, + { + "epoch": 2.1, + "grad_norm": 2.9161691665649414, + "learning_rate": 1.668327704261484e-05, + "loss": 0.3778, + "step": 42125 + }, + { + "epoch": 2.1, + "grad_norm": 0.9792761206626892, + "learning_rate": 1.6660207076020153e-05, + "loss": 0.4585, + "step": 42150 + }, + { + "epoch": 2.1, + "grad_norm": 1.2763252258300781, + "learning_rate": 1.6637137109425467e-05, + "loss": 0.5073, + "step": 42175 + }, + { + "epoch": 2.1, + "grad_norm": 1.0612924098968506, + "learning_rate": 1.661406714283078e-05, + "loss": 0.5103, + "step": 42200 + }, + { + "epoch": 2.1, + "grad_norm": 14.235264778137207, + "learning_rate": 1.659099717623609e-05, + "loss": 0.4173, + "step": 42225 + }, + { + "epoch": 2.11, + "grad_norm": 3.783046007156372, + "learning_rate": 1.65679272096414e-05, + "loss": 0.4379, + "step": 42250 + }, + { + "epoch": 2.11, + "grad_norm": 6.655981063842773, + "learning_rate": 1.6544857243046713e-05, + "loss": 0.624, + "step": 42275 + }, + { + "epoch": 2.11, + "grad_norm": 3.5701019763946533, + "learning_rate": 1.6521787276452024e-05, + "loss": 0.4916, + "step": 42300 + }, + { + "epoch": 2.11, + "grad_norm": 6.2817769050598145, + "learning_rate": 1.6498717309857336e-05, + "loss": 0.4803, + "step": 42325 + }, + { + "epoch": 2.11, + "grad_norm": 3.3220369815826416, + "learning_rate": 1.6475647343262647e-05, + "loss": 0.4698, + "step": 42350 + }, + { + "epoch": 2.11, + "grad_norm": 2.752890110015869, + "learning_rate": 1.645257737666796e-05, + "loss": 0.4165, + "step": 42375 + }, + { + "epoch": 2.11, + "grad_norm": 1.393794298171997, + "learning_rate": 1.6429507410073273e-05, + "loss": 0.4023, + "step": 42400 + }, + { + "epoch": 2.11, + "grad_norm": 5.880159854888916, + "learning_rate": 1.6406437443478584e-05, + "loss": 0.5125, + "step": 42425 + }, + { + "epoch": 2.12, + "grad_norm": 3.1372454166412354, + "learning_rate": 1.6383367476883892e-05, + "loss": 0.4362, + "step": 42450 + }, + { + "epoch": 2.12, + "grad_norm": 2.6483569145202637, + "learning_rate": 1.6360297510289204e-05, + "loss": 0.42, + "step": 42475 + }, + { + "epoch": 2.12, + "grad_norm": 0.8303941488265991, + "learning_rate": 1.6337227543694515e-05, + "loss": 0.5208, + "step": 42500 + }, + { + "epoch": 2.12, + "grad_norm": 2.7142579555511475, + "learning_rate": 1.631415757709983e-05, + "loss": 0.5796, + "step": 42525 + }, + { + "epoch": 2.12, + "grad_norm": 3.3541243076324463, + "learning_rate": 1.629108761050514e-05, + "loss": 0.4091, + "step": 42550 + }, + { + "epoch": 2.12, + "grad_norm": 42.22705078125, + "learning_rate": 1.6268017643910453e-05, + "loss": 0.4733, + "step": 42575 + }, + { + "epoch": 2.12, + "grad_norm": 1.1842784881591797, + "learning_rate": 1.6244947677315764e-05, + "loss": 0.3802, + "step": 42600 + }, + { + "epoch": 2.12, + "grad_norm": 4.394431114196777, + "learning_rate": 1.6221877710721075e-05, + "loss": 0.5847, + "step": 42625 + }, + { + "epoch": 2.13, + "grad_norm": 0.9206206798553467, + "learning_rate": 1.6198807744126387e-05, + "loss": 0.5002, + "step": 42650 + }, + { + "epoch": 2.13, + "grad_norm": 6.831622123718262, + "learning_rate": 1.6175737777531698e-05, + "loss": 0.6661, + "step": 42675 + }, + { + "epoch": 2.13, + "grad_norm": 1.1463747024536133, + "learning_rate": 1.6152667810937013e-05, + "loss": 0.5326, + "step": 42700 + }, + { + "epoch": 2.13, + "grad_norm": 0.8590452671051025, + "learning_rate": 1.6129597844342324e-05, + "loss": 0.2823, + "step": 42725 + }, + { + "epoch": 2.13, + "grad_norm": 4.416738510131836, + "learning_rate": 1.6106527877747635e-05, + "loss": 0.3593, + "step": 42750 + }, + { + "epoch": 2.13, + "grad_norm": 53.58966064453125, + "learning_rate": 1.6083457911152947e-05, + "loss": 0.4593, + "step": 42775 + }, + { + "epoch": 2.13, + "grad_norm": 3.2743759155273438, + "learning_rate": 1.6060387944558258e-05, + "loss": 0.4814, + "step": 42800 + }, + { + "epoch": 2.13, + "grad_norm": 37.638797760009766, + "learning_rate": 1.6037317977963566e-05, + "loss": 0.5561, + "step": 42825 + }, + { + "epoch": 2.14, + "grad_norm": 3.340258836746216, + "learning_rate": 1.601424801136888e-05, + "loss": 0.4613, + "step": 42850 + }, + { + "epoch": 2.14, + "grad_norm": 3.070276975631714, + "learning_rate": 1.5991178044774192e-05, + "loss": 0.3615, + "step": 42875 + }, + { + "epoch": 2.14, + "grad_norm": 2.999905824661255, + "learning_rate": 1.5968108078179504e-05, + "loss": 0.4379, + "step": 42900 + }, + { + "epoch": 2.14, + "grad_norm": 2.243607759475708, + "learning_rate": 1.5945038111584815e-05, + "loss": 0.5919, + "step": 42925 + }, + { + "epoch": 2.14, + "grad_norm": 0.8025704026222229, + "learning_rate": 1.5921968144990126e-05, + "loss": 0.3943, + "step": 42950 + }, + { + "epoch": 2.14, + "grad_norm": 3.583141803741455, + "learning_rate": 1.5898898178395438e-05, + "loss": 0.4952, + "step": 42975 + }, + { + "epoch": 2.14, + "grad_norm": 5.429011821746826, + "learning_rate": 1.587582821180075e-05, + "loss": 0.5703, + "step": 43000 + }, + { + "epoch": 2.14, + "grad_norm": 4.520952224731445, + "learning_rate": 1.585275824520606e-05, + "loss": 0.6043, + "step": 43025 + }, + { + "epoch": 2.15, + "grad_norm": 1.0744482278823853, + "learning_rate": 1.5829688278611375e-05, + "loss": 0.4658, + "step": 43050 + }, + { + "epoch": 2.15, + "grad_norm": 3.3810923099517822, + "learning_rate": 1.5806618312016686e-05, + "loss": 0.3332, + "step": 43075 + }, + { + "epoch": 2.15, + "grad_norm": 3.215128183364868, + "learning_rate": 1.5783548345421998e-05, + "loss": 0.4496, + "step": 43100 + }, + { + "epoch": 2.15, + "grad_norm": 4.319766998291016, + "learning_rate": 1.576047837882731e-05, + "loss": 0.4954, + "step": 43125 + }, + { + "epoch": 2.15, + "grad_norm": 0.8192872405052185, + "learning_rate": 1.573740841223262e-05, + "loss": 0.4935, + "step": 43150 + }, + { + "epoch": 2.15, + "grad_norm": 8.027621269226074, + "learning_rate": 1.5714338445637932e-05, + "loss": 0.566, + "step": 43175 + }, + { + "epoch": 2.15, + "grad_norm": 3.3970248699188232, + "learning_rate": 1.5691268479043243e-05, + "loss": 0.3207, + "step": 43200 + }, + { + "epoch": 2.15, + "grad_norm": 0.9151510000228882, + "learning_rate": 1.5668198512448555e-05, + "loss": 0.4512, + "step": 43225 + }, + { + "epoch": 2.16, + "grad_norm": 0.9420716762542725, + "learning_rate": 1.5645128545853866e-05, + "loss": 0.4624, + "step": 43250 + }, + { + "epoch": 2.16, + "grad_norm": 3.626981258392334, + "learning_rate": 1.5622058579259177e-05, + "loss": 0.4893, + "step": 43275 + }, + { + "epoch": 2.16, + "grad_norm": 6.8483686447143555, + "learning_rate": 1.559898861266449e-05, + "loss": 0.425, + "step": 43300 + }, + { + "epoch": 2.16, + "grad_norm": 1.0762109756469727, + "learning_rate": 1.55759186460698e-05, + "loss": 0.4628, + "step": 43325 + }, + { + "epoch": 2.16, + "grad_norm": 5.211805820465088, + "learning_rate": 1.555284867947511e-05, + "loss": 0.4319, + "step": 43350 + }, + { + "epoch": 2.16, + "grad_norm": 1.066054344177246, + "learning_rate": 1.5529778712880426e-05, + "loss": 0.4175, + "step": 43375 + }, + { + "epoch": 2.16, + "grad_norm": 2.201251983642578, + "learning_rate": 1.5506708746285738e-05, + "loss": 0.4262, + "step": 43400 + }, + { + "epoch": 2.16, + "grad_norm": 4.746454238891602, + "learning_rate": 1.548363877969105e-05, + "loss": 0.5822, + "step": 43425 + }, + { + "epoch": 2.17, + "grad_norm": 0.9321187734603882, + "learning_rate": 1.546056881309636e-05, + "loss": 0.4756, + "step": 43450 + }, + { + "epoch": 2.17, + "grad_norm": 2.94303035736084, + "learning_rate": 1.543749884650167e-05, + "loss": 0.3573, + "step": 43475 + }, + { + "epoch": 2.17, + "grad_norm": 6.546296119689941, + "learning_rate": 1.5414428879906983e-05, + "loss": 0.4012, + "step": 43500 + }, + { + "epoch": 2.17, + "grad_norm": 3.930738687515259, + "learning_rate": 1.5391358913312294e-05, + "loss": 0.4391, + "step": 43525 + }, + { + "epoch": 2.17, + "grad_norm": 3.519620895385742, + "learning_rate": 1.5368288946717606e-05, + "loss": 0.5147, + "step": 43550 + }, + { + "epoch": 2.17, + "grad_norm": 23.809885025024414, + "learning_rate": 1.5345218980122917e-05, + "loss": 0.387, + "step": 43575 + }, + { + "epoch": 2.17, + "grad_norm": 2.7929399013519287, + "learning_rate": 1.532214901352823e-05, + "loss": 0.4934, + "step": 43600 + }, + { + "epoch": 2.17, + "grad_norm": 0.9034756422042847, + "learning_rate": 1.529907904693354e-05, + "loss": 0.3842, + "step": 43625 + }, + { + "epoch": 2.18, + "grad_norm": 7.339956760406494, + "learning_rate": 1.527600908033885e-05, + "loss": 0.4575, + "step": 43650 + }, + { + "epoch": 2.18, + "grad_norm": 3.778557777404785, + "learning_rate": 1.5252939113744164e-05, + "loss": 0.4229, + "step": 43675 + }, + { + "epoch": 2.18, + "grad_norm": 1.220953345298767, + "learning_rate": 1.5229869147149476e-05, + "loss": 0.3305, + "step": 43700 + }, + { + "epoch": 2.18, + "grad_norm": 3.4920401573181152, + "learning_rate": 1.5206799180554787e-05, + "loss": 0.536, + "step": 43725 + }, + { + "epoch": 2.18, + "grad_norm": 1.343904972076416, + "learning_rate": 1.5183729213960098e-05, + "loss": 0.3731, + "step": 43750 + }, + { + "epoch": 2.18, + "grad_norm": 18.415124893188477, + "learning_rate": 1.5160659247365411e-05, + "loss": 0.4315, + "step": 43775 + }, + { + "epoch": 2.18, + "grad_norm": 128.90646362304688, + "learning_rate": 1.5137589280770723e-05, + "loss": 0.4811, + "step": 43800 + }, + { + "epoch": 2.18, + "grad_norm": 2.1207547187805176, + "learning_rate": 1.5114519314176034e-05, + "loss": 0.4449, + "step": 43825 + }, + { + "epoch": 2.19, + "grad_norm": 28.949186325073242, + "learning_rate": 1.5091449347581345e-05, + "loss": 0.398, + "step": 43850 + }, + { + "epoch": 2.19, + "grad_norm": 3.406864881515503, + "learning_rate": 1.5068379380986658e-05, + "loss": 0.3874, + "step": 43875 + }, + { + "epoch": 2.19, + "grad_norm": 4.393004417419434, + "learning_rate": 1.504530941439197e-05, + "loss": 0.4774, + "step": 43900 + }, + { + "epoch": 2.19, + "grad_norm": 1.2055267095565796, + "learning_rate": 1.5022239447797281e-05, + "loss": 0.418, + "step": 43925 + }, + { + "epoch": 2.19, + "grad_norm": 8.036547660827637, + "learning_rate": 1.499916948120259e-05, + "loss": 0.4491, + "step": 43950 + }, + { + "epoch": 2.19, + "grad_norm": 3.114225387573242, + "learning_rate": 1.4976099514607902e-05, + "loss": 0.338, + "step": 43975 + }, + { + "epoch": 2.19, + "grad_norm": 3.2484042644500732, + "learning_rate": 1.4953029548013214e-05, + "loss": 0.3471, + "step": 44000 + }, + { + "epoch": 2.19, + "grad_norm": 3.2204830646514893, + "learning_rate": 1.4929959581418527e-05, + "loss": 0.4569, + "step": 44025 + }, + { + "epoch": 2.2, + "grad_norm": 25.30776023864746, + "learning_rate": 1.4906889614823838e-05, + "loss": 0.4698, + "step": 44050 + }, + { + "epoch": 2.2, + "grad_norm": 2.0941309928894043, + "learning_rate": 1.488381964822915e-05, + "loss": 0.4937, + "step": 44075 + }, + { + "epoch": 2.2, + "grad_norm": 0.8043497204780579, + "learning_rate": 1.4860749681634462e-05, + "loss": 0.4964, + "step": 44100 + }, + { + "epoch": 2.2, + "grad_norm": 1.6714893579483032, + "learning_rate": 1.4837679715039774e-05, + "loss": 0.3856, + "step": 44125 + }, + { + "epoch": 2.2, + "grad_norm": 4.106298446655273, + "learning_rate": 1.4814609748445085e-05, + "loss": 0.6539, + "step": 44150 + }, + { + "epoch": 2.2, + "grad_norm": 5.138443946838379, + "learning_rate": 1.4791539781850396e-05, + "loss": 0.4228, + "step": 44175 + }, + { + "epoch": 2.2, + "grad_norm": 0.8958789110183716, + "learning_rate": 1.476846981525571e-05, + "loss": 0.4342, + "step": 44200 + }, + { + "epoch": 2.2, + "grad_norm": 4.751018524169922, + "learning_rate": 1.4745399848661021e-05, + "loss": 0.4827, + "step": 44225 + }, + { + "epoch": 2.21, + "grad_norm": 76.34806060791016, + "learning_rate": 1.4722329882066332e-05, + "loss": 0.3532, + "step": 44250 + }, + { + "epoch": 2.21, + "grad_norm": 40.389320373535156, + "learning_rate": 1.4699259915471644e-05, + "loss": 0.5208, + "step": 44275 + }, + { + "epoch": 2.21, + "grad_norm": 12.967884063720703, + "learning_rate": 1.4676189948876957e-05, + "loss": 0.4516, + "step": 44300 + }, + { + "epoch": 2.21, + "grad_norm": 1.6726328134536743, + "learning_rate": 1.4653119982282265e-05, + "loss": 0.4281, + "step": 44325 + }, + { + "epoch": 2.21, + "grad_norm": 5.279252052307129, + "learning_rate": 1.4630050015687578e-05, + "loss": 0.6983, + "step": 44350 + }, + { + "epoch": 2.21, + "grad_norm": 1.5335670709609985, + "learning_rate": 1.4606980049092889e-05, + "loss": 0.4025, + "step": 44375 + }, + { + "epoch": 2.21, + "grad_norm": 1.3843210935592651, + "learning_rate": 1.45839100824982e-05, + "loss": 0.4038, + "step": 44400 + }, + { + "epoch": 2.21, + "grad_norm": 3.566404104232788, + "learning_rate": 1.4560840115903512e-05, + "loss": 0.5917, + "step": 44425 + }, + { + "epoch": 2.21, + "grad_norm": 0.7694686055183411, + "learning_rate": 1.4537770149308825e-05, + "loss": 0.3287, + "step": 44450 + }, + { + "epoch": 2.22, + "grad_norm": 11.228218078613281, + "learning_rate": 1.4514700182714136e-05, + "loss": 0.3775, + "step": 44475 + }, + { + "epoch": 2.22, + "grad_norm": 1.0687758922576904, + "learning_rate": 1.4491630216119448e-05, + "loss": 0.5109, + "step": 44500 + }, + { + "epoch": 2.22, + "grad_norm": 3.5314488410949707, + "learning_rate": 1.4468560249524759e-05, + "loss": 0.5578, + "step": 44525 + }, + { + "epoch": 2.22, + "grad_norm": 1.0430461168289185, + "learning_rate": 1.4445490282930072e-05, + "loss": 0.383, + "step": 44550 + }, + { + "epoch": 2.22, + "grad_norm": 0.908136785030365, + "learning_rate": 1.4422420316335383e-05, + "loss": 0.4595, + "step": 44575 + }, + { + "epoch": 2.22, + "grad_norm": 5.896038055419922, + "learning_rate": 1.4399350349740695e-05, + "loss": 0.5647, + "step": 44600 + }, + { + "epoch": 2.22, + "grad_norm": 1.4894803762435913, + "learning_rate": 1.4376280383146008e-05, + "loss": 0.3499, + "step": 44625 + }, + { + "epoch": 2.22, + "grad_norm": 3.1445472240448, + "learning_rate": 1.4353210416551319e-05, + "loss": 0.4634, + "step": 44650 + }, + { + "epoch": 2.23, + "grad_norm": 1.3199021816253662, + "learning_rate": 1.433014044995663e-05, + "loss": 0.4512, + "step": 44675 + }, + { + "epoch": 2.23, + "grad_norm": 3.4122154712677, + "learning_rate": 1.430707048336194e-05, + "loss": 0.43, + "step": 44700 + }, + { + "epoch": 2.23, + "grad_norm": 2.9166758060455322, + "learning_rate": 1.4284000516767251e-05, + "loss": 0.3729, + "step": 44725 + }, + { + "epoch": 2.23, + "grad_norm": 5.693099021911621, + "learning_rate": 1.4260930550172563e-05, + "loss": 0.5361, + "step": 44750 + }, + { + "epoch": 2.23, + "grad_norm": 3.788499116897583, + "learning_rate": 1.4237860583577874e-05, + "loss": 0.4689, + "step": 44775 + }, + { + "epoch": 2.23, + "grad_norm": 5.346036434173584, + "learning_rate": 1.4214790616983187e-05, + "loss": 0.4982, + "step": 44800 + }, + { + "epoch": 2.23, + "grad_norm": 12.89140510559082, + "learning_rate": 1.4191720650388499e-05, + "loss": 0.3943, + "step": 44825 + }, + { + "epoch": 2.23, + "grad_norm": 7.905126094818115, + "learning_rate": 1.416865068379381e-05, + "loss": 0.4794, + "step": 44850 + }, + { + "epoch": 2.24, + "grad_norm": 6.943532466888428, + "learning_rate": 1.4145580717199123e-05, + "loss": 0.4801, + "step": 44875 + }, + { + "epoch": 2.24, + "grad_norm": 75.09638977050781, + "learning_rate": 1.4122510750604434e-05, + "loss": 0.6133, + "step": 44900 + }, + { + "epoch": 2.24, + "grad_norm": 6.250883102416992, + "learning_rate": 1.4099440784009746e-05, + "loss": 0.425, + "step": 44925 + }, + { + "epoch": 2.24, + "grad_norm": 5.3356709480285645, + "learning_rate": 1.4076370817415057e-05, + "loss": 0.5111, + "step": 44950 + }, + { + "epoch": 2.24, + "grad_norm": 5.9915242195129395, + "learning_rate": 1.405330085082037e-05, + "loss": 0.6025, + "step": 44975 + }, + { + "epoch": 2.24, + "grad_norm": 1.2483477592468262, + "learning_rate": 1.4030230884225682e-05, + "loss": 0.4851, + "step": 45000 + }, + { + "epoch": 2.24, + "grad_norm": 4.203946590423584, + "learning_rate": 1.4007160917630993e-05, + "loss": 0.5118, + "step": 45025 + }, + { + "epoch": 2.24, + "grad_norm": 2042.261962890625, + "learning_rate": 1.3984090951036304e-05, + "loss": 0.3955, + "step": 45050 + }, + { + "epoch": 2.25, + "grad_norm": 1.6488062143325806, + "learning_rate": 1.3961020984441614e-05, + "loss": 0.4482, + "step": 45075 + }, + { + "epoch": 2.25, + "grad_norm": 2.0874555110931396, + "learning_rate": 1.3937951017846925e-05, + "loss": 0.4248, + "step": 45100 + }, + { + "epoch": 2.25, + "grad_norm": 4.379459857940674, + "learning_rate": 1.3914881051252238e-05, + "loss": 0.4551, + "step": 45125 + }, + { + "epoch": 2.25, + "grad_norm": 1.5643028020858765, + "learning_rate": 1.389181108465755e-05, + "loss": 0.4521, + "step": 45150 + }, + { + "epoch": 2.25, + "grad_norm": 2.6934609413146973, + "learning_rate": 1.3868741118062861e-05, + "loss": 0.4903, + "step": 45175 + }, + { + "epoch": 2.25, + "grad_norm": 1.576201319694519, + "learning_rate": 1.3845671151468172e-05, + "loss": 0.5084, + "step": 45200 + }, + { + "epoch": 2.25, + "grad_norm": 3.358137845993042, + "learning_rate": 1.3822601184873485e-05, + "loss": 0.4601, + "step": 45225 + }, + { + "epoch": 2.25, + "grad_norm": 4.92896842956543, + "learning_rate": 1.3799531218278797e-05, + "loss": 0.4735, + "step": 45250 + }, + { + "epoch": 2.26, + "grad_norm": 1.0355300903320312, + "learning_rate": 1.3776461251684108e-05, + "loss": 0.3459, + "step": 45275 + }, + { + "epoch": 2.26, + "grad_norm": 1.0891985893249512, + "learning_rate": 1.375339128508942e-05, + "loss": 0.4327, + "step": 45300 + }, + { + "epoch": 2.26, + "grad_norm": 4.514690399169922, + "learning_rate": 1.3730321318494733e-05, + "loss": 0.3761, + "step": 45325 + }, + { + "epoch": 2.26, + "grad_norm": 3.550197124481201, + "learning_rate": 1.3707251351900044e-05, + "loss": 0.498, + "step": 45350 + }, + { + "epoch": 2.26, + "grad_norm": 4.564043045043945, + "learning_rate": 1.3684181385305355e-05, + "loss": 0.5021, + "step": 45375 + }, + { + "epoch": 2.26, + "grad_norm": 0.9587386846542358, + "learning_rate": 1.3661111418710668e-05, + "loss": 0.3791, + "step": 45400 + }, + { + "epoch": 2.26, + "grad_norm": 3.4362361431121826, + "learning_rate": 1.3638041452115976e-05, + "loss": 0.4855, + "step": 45425 + }, + { + "epoch": 2.26, + "grad_norm": 1.1819294691085815, + "learning_rate": 1.3614971485521288e-05, + "loss": 0.3571, + "step": 45450 + }, + { + "epoch": 2.27, + "grad_norm": 3.3091654777526855, + "learning_rate": 1.35919015189266e-05, + "loss": 0.5136, + "step": 45475 + }, + { + "epoch": 2.27, + "grad_norm": 0.8691762089729309, + "learning_rate": 1.3568831552331912e-05, + "loss": 0.5167, + "step": 45500 + }, + { + "epoch": 2.27, + "grad_norm": 3.3368494510650635, + "learning_rate": 1.3545761585737223e-05, + "loss": 0.5208, + "step": 45525 + }, + { + "epoch": 2.27, + "grad_norm": 11.81055736541748, + "learning_rate": 1.3522691619142537e-05, + "loss": 0.4869, + "step": 45550 + }, + { + "epoch": 2.27, + "grad_norm": 1.5365840196609497, + "learning_rate": 1.3499621652547848e-05, + "loss": 0.4494, + "step": 45575 + }, + { + "epoch": 2.27, + "grad_norm": 1.9530702829360962, + "learning_rate": 1.347655168595316e-05, + "loss": 0.5195, + "step": 45600 + }, + { + "epoch": 2.27, + "grad_norm": 11.941510200500488, + "learning_rate": 1.345348171935847e-05, + "loss": 0.3484, + "step": 45625 + }, + { + "epoch": 2.27, + "grad_norm": 1.299529790878296, + "learning_rate": 1.3430411752763784e-05, + "loss": 0.4795, + "step": 45650 + }, + { + "epoch": 2.28, + "grad_norm": 3.4457457065582275, + "learning_rate": 1.3407341786169095e-05, + "loss": 0.4536, + "step": 45675 + }, + { + "epoch": 2.28, + "grad_norm": 3.7312796115875244, + "learning_rate": 1.3384271819574406e-05, + "loss": 0.346, + "step": 45700 + }, + { + "epoch": 2.28, + "grad_norm": 1.1145527362823486, + "learning_rate": 1.3361201852979718e-05, + "loss": 0.4283, + "step": 45725 + }, + { + "epoch": 2.28, + "grad_norm": 3.5182554721832275, + "learning_rate": 1.333813188638503e-05, + "loss": 0.4792, + "step": 45750 + }, + { + "epoch": 2.28, + "grad_norm": 19.439828872680664, + "learning_rate": 1.3315061919790342e-05, + "loss": 0.434, + "step": 45775 + }, + { + "epoch": 2.28, + "grad_norm": 8.909052848815918, + "learning_rate": 1.3291991953195652e-05, + "loss": 0.4421, + "step": 45800 + }, + { + "epoch": 2.28, + "grad_norm": 3.3405191898345947, + "learning_rate": 1.3268921986600963e-05, + "loss": 0.4626, + "step": 45825 + }, + { + "epoch": 2.28, + "grad_norm": 1.1706880331039429, + "learning_rate": 1.3245852020006275e-05, + "loss": 0.445, + "step": 45850 + }, + { + "epoch": 2.29, + "grad_norm": 7.387587547302246, + "learning_rate": 1.3222782053411586e-05, + "loss": 0.361, + "step": 45875 + }, + { + "epoch": 2.29, + "grad_norm": 3.0818896293640137, + "learning_rate": 1.3199712086816899e-05, + "loss": 0.5711, + "step": 45900 + }, + { + "epoch": 2.29, + "grad_norm": 2.2934417724609375, + "learning_rate": 1.317664212022221e-05, + "loss": 0.5208, + "step": 45925 + }, + { + "epoch": 2.29, + "grad_norm": 2.78090238571167, + "learning_rate": 1.3153572153627522e-05, + "loss": 0.4421, + "step": 45950 + }, + { + "epoch": 2.29, + "grad_norm": 5.722506046295166, + "learning_rate": 1.3130502187032833e-05, + "loss": 0.4551, + "step": 45975 + }, + { + "epoch": 2.29, + "grad_norm": 7.568353176116943, + "learning_rate": 1.3107432220438146e-05, + "loss": 0.3366, + "step": 46000 + }, + { + "epoch": 2.29, + "grad_norm": 5.417125225067139, + "learning_rate": 1.3084362253843457e-05, + "loss": 0.4595, + "step": 46025 + }, + { + "epoch": 2.29, + "grad_norm": 12.71193790435791, + "learning_rate": 1.3061292287248769e-05, + "loss": 0.4978, + "step": 46050 + }, + { + "epoch": 2.3, + "grad_norm": 6.872551441192627, + "learning_rate": 1.3038222320654082e-05, + "loss": 0.3987, + "step": 46075 + }, + { + "epoch": 2.3, + "grad_norm": 14.637410163879395, + "learning_rate": 1.3015152354059393e-05, + "loss": 0.5225, + "step": 46100 + }, + { + "epoch": 2.3, + "grad_norm": 3.4603843688964844, + "learning_rate": 1.2992082387464705e-05, + "loss": 0.4371, + "step": 46125 + }, + { + "epoch": 2.3, + "grad_norm": 3.496575355529785, + "learning_rate": 1.2969012420870016e-05, + "loss": 0.3872, + "step": 46150 + }, + { + "epoch": 2.3, + "grad_norm": 0.7497350573539734, + "learning_rate": 1.2945942454275326e-05, + "loss": 0.3906, + "step": 46175 + }, + { + "epoch": 2.3, + "grad_norm": 2.979525566101074, + "learning_rate": 1.2922872487680637e-05, + "loss": 0.4707, + "step": 46200 + }, + { + "epoch": 2.3, + "grad_norm": 0.837188184261322, + "learning_rate": 1.2899802521085948e-05, + "loss": 0.3688, + "step": 46225 + }, + { + "epoch": 2.3, + "grad_norm": 0.7865057587623596, + "learning_rate": 1.2876732554491261e-05, + "loss": 0.425, + "step": 46250 + }, + { + "epoch": 2.31, + "grad_norm": 1.184089183807373, + "learning_rate": 1.2853662587896573e-05, + "loss": 0.4349, + "step": 46275 + }, + { + "epoch": 2.31, + "grad_norm": 6.87714958190918, + "learning_rate": 1.2830592621301884e-05, + "loss": 0.4748, + "step": 46300 + }, + { + "epoch": 2.31, + "grad_norm": 0.8082959055900574, + "learning_rate": 1.2807522654707197e-05, + "loss": 0.4739, + "step": 46325 + }, + { + "epoch": 2.31, + "grad_norm": 2.2679901123046875, + "learning_rate": 1.2784452688112508e-05, + "loss": 0.4529, + "step": 46350 + }, + { + "epoch": 2.31, + "grad_norm": 0.8802863955497742, + "learning_rate": 1.276138272151782e-05, + "loss": 0.4996, + "step": 46375 + }, + { + "epoch": 2.31, + "grad_norm": 34.556427001953125, + "learning_rate": 1.2738312754923131e-05, + "loss": 0.4807, + "step": 46400 + }, + { + "epoch": 2.31, + "grad_norm": 3.192739486694336, + "learning_rate": 1.2715242788328444e-05, + "loss": 0.6016, + "step": 46425 + }, + { + "epoch": 2.31, + "grad_norm": 1.1725696325302124, + "learning_rate": 1.2692172821733756e-05, + "loss": 0.3707, + "step": 46450 + }, + { + "epoch": 2.32, + "grad_norm": 4.39725399017334, + "learning_rate": 1.2669102855139067e-05, + "loss": 0.4125, + "step": 46475 + }, + { + "epoch": 2.32, + "grad_norm": 1.110650658607483, + "learning_rate": 1.2646032888544378e-05, + "loss": 0.4464, + "step": 46500 + }, + { + "epoch": 2.32, + "grad_norm": 3.1061861515045166, + "learning_rate": 1.2622962921949691e-05, + "loss": 0.64, + "step": 46525 + }, + { + "epoch": 2.32, + "grad_norm": 17.628704071044922, + "learning_rate": 1.2599892955355e-05, + "loss": 0.4659, + "step": 46550 + }, + { + "epoch": 2.32, + "grad_norm": 0.8352084755897522, + "learning_rate": 1.2576822988760312e-05, + "loss": 0.4472, + "step": 46575 + }, + { + "epoch": 2.32, + "grad_norm": 0.9835976958274841, + "learning_rate": 1.2553753022165624e-05, + "loss": 0.4686, + "step": 46600 + }, + { + "epoch": 2.32, + "grad_norm": 31.921051025390625, + "learning_rate": 1.2530683055570935e-05, + "loss": 0.6688, + "step": 46625 + }, + { + "epoch": 2.32, + "grad_norm": 3.089951992034912, + "learning_rate": 1.2507613088976246e-05, + "loss": 0.4112, + "step": 46650 + }, + { + "epoch": 2.33, + "grad_norm": 6.3877410888671875, + "learning_rate": 1.248454312238156e-05, + "loss": 0.5201, + "step": 46675 + }, + { + "epoch": 2.33, + "grad_norm": 0.9439681172370911, + "learning_rate": 1.2461473155786871e-05, + "loss": 0.3882, + "step": 46700 + }, + { + "epoch": 2.33, + "grad_norm": 0.9483340382575989, + "learning_rate": 1.2438403189192182e-05, + "loss": 0.5469, + "step": 46725 + }, + { + "epoch": 2.33, + "grad_norm": 4.738260269165039, + "learning_rate": 1.2415333222597494e-05, + "loss": 0.4783, + "step": 46750 + }, + { + "epoch": 2.33, + "grad_norm": 3.267001152038574, + "learning_rate": 1.2392263256002807e-05, + "loss": 0.435, + "step": 46775 + }, + { + "epoch": 2.33, + "grad_norm": 43.56253433227539, + "learning_rate": 1.2369193289408118e-05, + "loss": 0.4588, + "step": 46800 + }, + { + "epoch": 2.33, + "grad_norm": 2.040438652038574, + "learning_rate": 1.2346123322813428e-05, + "loss": 0.3251, + "step": 46825 + }, + { + "epoch": 2.33, + "grad_norm": 2.8884494304656982, + "learning_rate": 1.232305335621874e-05, + "loss": 0.3172, + "step": 46850 + }, + { + "epoch": 2.34, + "grad_norm": 0.7062200903892517, + "learning_rate": 1.2299983389624052e-05, + "loss": 0.4494, + "step": 46875 + }, + { + "epoch": 2.34, + "grad_norm": 15.709783554077148, + "learning_rate": 1.2276913423029363e-05, + "loss": 0.4507, + "step": 46900 + }, + { + "epoch": 2.34, + "grad_norm": 3.04421067237854, + "learning_rate": 1.2253843456434677e-05, + "loss": 0.6037, + "step": 46925 + }, + { + "epoch": 2.34, + "grad_norm": 2.8988988399505615, + "learning_rate": 1.2230773489839988e-05, + "loss": 0.5066, + "step": 46950 + }, + { + "epoch": 2.34, + "grad_norm": 4.624483585357666, + "learning_rate": 1.22077035232453e-05, + "loss": 0.6149, + "step": 46975 + }, + { + "epoch": 2.34, + "grad_norm": 0.8602836728096008, + "learning_rate": 1.218463355665061e-05, + "loss": 0.4331, + "step": 47000 + }, + { + "epoch": 2.34, + "grad_norm": 1.0193194150924683, + "learning_rate": 1.2161563590055922e-05, + "loss": 0.3927, + "step": 47025 + }, + { + "epoch": 2.34, + "grad_norm": 3.1821088790893555, + "learning_rate": 1.2138493623461233e-05, + "loss": 0.4709, + "step": 47050 + }, + { + "epoch": 2.35, + "grad_norm": 0.8067628741264343, + "learning_rate": 1.2115423656866545e-05, + "loss": 0.4642, + "step": 47075 + }, + { + "epoch": 2.35, + "grad_norm": 6.950100898742676, + "learning_rate": 1.2092353690271858e-05, + "loss": 0.4884, + "step": 47100 + }, + { + "epoch": 2.35, + "grad_norm": 15.024262428283691, + "learning_rate": 1.2069283723677169e-05, + "loss": 0.2618, + "step": 47125 + }, + { + "epoch": 2.35, + "grad_norm": 6.122317790985107, + "learning_rate": 1.204621375708248e-05, + "loss": 0.6606, + "step": 47150 + }, + { + "epoch": 2.35, + "grad_norm": 4.26874303817749, + "learning_rate": 1.2023143790487792e-05, + "loss": 0.4987, + "step": 47175 + }, + { + "epoch": 2.35, + "grad_norm": 3.464191198348999, + "learning_rate": 1.2000073823893103e-05, + "loss": 0.4721, + "step": 47200 + }, + { + "epoch": 2.35, + "grad_norm": 3.5283408164978027, + "learning_rate": 1.1977003857298415e-05, + "loss": 0.3801, + "step": 47225 + }, + { + "epoch": 2.35, + "grad_norm": 5.1887311935424805, + "learning_rate": 1.1953933890703726e-05, + "loss": 0.4973, + "step": 47250 + }, + { + "epoch": 2.36, + "grad_norm": 6.538787364959717, + "learning_rate": 1.1930863924109039e-05, + "loss": 0.4962, + "step": 47275 + }, + { + "epoch": 2.36, + "grad_norm": 4.12895393371582, + "learning_rate": 1.190779395751435e-05, + "loss": 0.4363, + "step": 47300 + }, + { + "epoch": 2.36, + "grad_norm": 3.3740382194519043, + "learning_rate": 1.1884723990919662e-05, + "loss": 0.5633, + "step": 47325 + }, + { + "epoch": 2.36, + "grad_norm": 58.57019805908203, + "learning_rate": 1.1861654024324973e-05, + "loss": 0.362, + "step": 47350 + }, + { + "epoch": 2.36, + "grad_norm": 3.191279888153076, + "learning_rate": 1.1838584057730286e-05, + "loss": 0.3877, + "step": 47375 + }, + { + "epoch": 2.36, + "grad_norm": 3.0479769706726074, + "learning_rate": 1.1815514091135596e-05, + "loss": 0.6029, + "step": 47400 + }, + { + "epoch": 2.36, + "grad_norm": 2.293454170227051, + "learning_rate": 1.1792444124540907e-05, + "loss": 0.4918, + "step": 47425 + }, + { + "epoch": 2.36, + "grad_norm": 3.393441677093506, + "learning_rate": 1.176937415794622e-05, + "loss": 0.4612, + "step": 47450 + }, + { + "epoch": 2.37, + "grad_norm": 0.7810852527618408, + "learning_rate": 1.1746304191351532e-05, + "loss": 0.4292, + "step": 47475 + }, + { + "epoch": 2.37, + "grad_norm": 3.1317076683044434, + "learning_rate": 1.1723234224756843e-05, + "loss": 0.5239, + "step": 47500 + }, + { + "epoch": 2.37, + "grad_norm": 55.293575286865234, + "learning_rate": 1.1700164258162156e-05, + "loss": 0.5202, + "step": 47525 + }, + { + "epoch": 2.37, + "grad_norm": 32.76906967163086, + "learning_rate": 1.1677094291567467e-05, + "loss": 0.4267, + "step": 47550 + }, + { + "epoch": 2.37, + "grad_norm": 3.1955676078796387, + "learning_rate": 1.1654024324972777e-05, + "loss": 0.3918, + "step": 47575 + }, + { + "epoch": 2.37, + "grad_norm": 0.926774799823761, + "learning_rate": 1.1630954358378088e-05, + "loss": 0.6901, + "step": 47600 + }, + { + "epoch": 2.37, + "grad_norm": 6.414040565490723, + "learning_rate": 1.1607884391783401e-05, + "loss": 0.5149, + "step": 47625 + }, + { + "epoch": 2.37, + "grad_norm": 1.0682789087295532, + "learning_rate": 1.1584814425188713e-05, + "loss": 0.4258, + "step": 47650 + }, + { + "epoch": 2.38, + "grad_norm": 1.077889323234558, + "learning_rate": 1.1561744458594024e-05, + "loss": 0.3138, + "step": 47675 + }, + { + "epoch": 2.38, + "grad_norm": 9.041423797607422, + "learning_rate": 1.1538674491999337e-05, + "loss": 0.346, + "step": 47700 + }, + { + "epoch": 2.38, + "grad_norm": 4.021137237548828, + "learning_rate": 1.1515604525404649e-05, + "loss": 0.4743, + "step": 47725 + }, + { + "epoch": 2.38, + "grad_norm": 6.2052836418151855, + "learning_rate": 1.149253455880996e-05, + "loss": 0.4666, + "step": 47750 + }, + { + "epoch": 2.38, + "grad_norm": 17.59967041015625, + "learning_rate": 1.1469464592215271e-05, + "loss": 0.4114, + "step": 47775 + }, + { + "epoch": 2.38, + "grad_norm": 3.0670289993286133, + "learning_rate": 1.1446394625620583e-05, + "loss": 0.4274, + "step": 47800 + }, + { + "epoch": 2.38, + "grad_norm": 3.6514852046966553, + "learning_rate": 1.1423324659025894e-05, + "loss": 0.524, + "step": 47825 + }, + { + "epoch": 2.38, + "grad_norm": 2.6334304809570312, + "learning_rate": 1.1400254692431205e-05, + "loss": 0.3774, + "step": 47850 + }, + { + "epoch": 2.39, + "grad_norm": 4.937973976135254, + "learning_rate": 1.1377184725836518e-05, + "loss": 0.3975, + "step": 47875 + }, + { + "epoch": 2.39, + "grad_norm": 4.668248653411865, + "learning_rate": 1.135411475924183e-05, + "loss": 0.4265, + "step": 47900 + }, + { + "epoch": 2.39, + "grad_norm": 4.245274543762207, + "learning_rate": 1.1331044792647141e-05, + "loss": 0.3939, + "step": 47925 + }, + { + "epoch": 2.39, + "grad_norm": 1.0817639827728271, + "learning_rate": 1.1307974826052452e-05, + "loss": 0.4652, + "step": 47950 + }, + { + "epoch": 2.39, + "grad_norm": 1.1002476215362549, + "learning_rate": 1.1284904859457764e-05, + "loss": 0.498, + "step": 47975 + }, + { + "epoch": 2.39, + "grad_norm": 0.6932094097137451, + "learning_rate": 1.1261834892863075e-05, + "loss": 0.3954, + "step": 48000 + }, + { + "epoch": 2.39, + "grad_norm": 0.7378042936325073, + "learning_rate": 1.1238764926268387e-05, + "loss": 0.451, + "step": 48025 + }, + { + "epoch": 2.39, + "grad_norm": 4.199863433837891, + "learning_rate": 1.12156949596737e-05, + "loss": 0.4264, + "step": 48050 + }, + { + "epoch": 2.4, + "grad_norm": 1.039255142211914, + "learning_rate": 1.1192624993079011e-05, + "loss": 0.434, + "step": 48075 + }, + { + "epoch": 2.4, + "grad_norm": 53.35725784301758, + "learning_rate": 1.1169555026484322e-05, + "loss": 0.5869, + "step": 48100 + }, + { + "epoch": 2.4, + "grad_norm": 0.8894838690757751, + "learning_rate": 1.1146485059889634e-05, + "loss": 0.4002, + "step": 48125 + }, + { + "epoch": 2.4, + "grad_norm": 3.4555165767669678, + "learning_rate": 1.1123415093294945e-05, + "loss": 0.3874, + "step": 48150 + }, + { + "epoch": 2.4, + "grad_norm": 35.925559997558594, + "learning_rate": 1.1100345126700256e-05, + "loss": 0.3686, + "step": 48175 + }, + { + "epoch": 2.4, + "grad_norm": 4.092686176300049, + "learning_rate": 1.1077275160105568e-05, + "loss": 0.3766, + "step": 48200 + }, + { + "epoch": 2.4, + "grad_norm": 0.7535989880561829, + "learning_rate": 1.105420519351088e-05, + "loss": 0.3985, + "step": 48225 + }, + { + "epoch": 2.4, + "grad_norm": 1.280040979385376, + "learning_rate": 1.1031135226916192e-05, + "loss": 0.5734, + "step": 48250 + }, + { + "epoch": 2.41, + "grad_norm": 3.505943775177002, + "learning_rate": 1.1008065260321504e-05, + "loss": 0.3752, + "step": 48275 + }, + { + "epoch": 2.41, + "grad_norm": 0.8014325499534607, + "learning_rate": 1.0984995293726817e-05, + "loss": 0.5073, + "step": 48300 + }, + { + "epoch": 2.41, + "grad_norm": 3.3095195293426514, + "learning_rate": 1.0961925327132126e-05, + "loss": 0.5718, + "step": 48325 + }, + { + "epoch": 2.41, + "grad_norm": 14.79623031616211, + "learning_rate": 1.0938855360537438e-05, + "loss": 0.4989, + "step": 48350 + }, + { + "epoch": 2.41, + "grad_norm": 15.96692180633545, + "learning_rate": 1.091578539394275e-05, + "loss": 0.5859, + "step": 48375 + }, + { + "epoch": 2.41, + "grad_norm": 3.386098861694336, + "learning_rate": 1.0892715427348062e-05, + "loss": 0.377, + "step": 48400 + }, + { + "epoch": 2.41, + "grad_norm": 4.3916754722595215, + "learning_rate": 1.0869645460753373e-05, + "loss": 0.3962, + "step": 48425 + }, + { + "epoch": 2.41, + "grad_norm": 21.46055793762207, + "learning_rate": 1.0846575494158685e-05, + "loss": 0.4741, + "step": 48450 + }, + { + "epoch": 2.42, + "grad_norm": 0.8601337671279907, + "learning_rate": 1.0823505527563998e-05, + "loss": 0.4751, + "step": 48475 + }, + { + "epoch": 2.42, + "grad_norm": 8.975151062011719, + "learning_rate": 1.0800435560969309e-05, + "loss": 0.4602, + "step": 48500 + }, + { + "epoch": 2.42, + "grad_norm": 0.9004390835762024, + "learning_rate": 1.0777365594374619e-05, + "loss": 0.4327, + "step": 48525 + }, + { + "epoch": 2.42, + "grad_norm": 0.8999045491218567, + "learning_rate": 1.0754295627779932e-05, + "loss": 0.4216, + "step": 48550 + }, + { + "epoch": 2.42, + "grad_norm": 5.836853981018066, + "learning_rate": 1.0731225661185243e-05, + "loss": 0.402, + "step": 48575 + }, + { + "epoch": 2.42, + "grad_norm": 1169.8812255859375, + "learning_rate": 1.0708155694590555e-05, + "loss": 0.2952, + "step": 48600 + }, + { + "epoch": 2.42, + "grad_norm": 6.735498905181885, + "learning_rate": 1.0685085727995866e-05, + "loss": 0.3534, + "step": 48625 + }, + { + "epoch": 2.42, + "grad_norm": 4.032488822937012, + "learning_rate": 1.0662015761401179e-05, + "loss": 0.4946, + "step": 48650 + }, + { + "epoch": 2.43, + "grad_norm": 6.46728515625, + "learning_rate": 1.063894579480649e-05, + "loss": 0.4081, + "step": 48675 + }, + { + "epoch": 2.43, + "grad_norm": 0.7925447821617126, + "learning_rate": 1.06158758282118e-05, + "loss": 0.424, + "step": 48700 + }, + { + "epoch": 2.43, + "grad_norm": 0.7554740905761719, + "learning_rate": 1.0592805861617113e-05, + "loss": 0.3596, + "step": 48725 + }, + { + "epoch": 2.43, + "grad_norm": 88.7288818359375, + "learning_rate": 1.0569735895022424e-05, + "loss": 0.4129, + "step": 48750 + }, + { + "epoch": 2.43, + "grad_norm": 0.6437362432479858, + "learning_rate": 1.0546665928427736e-05, + "loss": 0.4925, + "step": 48775 + }, + { + "epoch": 2.43, + "grad_norm": 1.6111449003219604, + "learning_rate": 1.0523595961833047e-05, + "loss": 0.5669, + "step": 48800 + }, + { + "epoch": 2.43, + "grad_norm": 4.0401740074157715, + "learning_rate": 1.050052599523836e-05, + "loss": 0.4289, + "step": 48825 + }, + { + "epoch": 2.43, + "grad_norm": 5.602783679962158, + "learning_rate": 1.0477456028643672e-05, + "loss": 0.4176, + "step": 48850 + }, + { + "epoch": 2.44, + "grad_norm": 3.130173683166504, + "learning_rate": 1.0454386062048981e-05, + "loss": 0.3674, + "step": 48875 + }, + { + "epoch": 2.44, + "grad_norm": 3.0914132595062256, + "learning_rate": 1.0431316095454294e-05, + "loss": 0.4028, + "step": 48900 + }, + { + "epoch": 2.44, + "grad_norm": 0.9653416275978088, + "learning_rate": 1.0408246128859606e-05, + "loss": 0.4655, + "step": 48925 + }, + { + "epoch": 2.44, + "grad_norm": 1.5674058198928833, + "learning_rate": 1.0385176162264917e-05, + "loss": 0.3246, + "step": 48950 + }, + { + "epoch": 2.44, + "grad_norm": 2.738037109375, + "learning_rate": 1.036210619567023e-05, + "loss": 0.3521, + "step": 48975 + }, + { + "epoch": 2.44, + "grad_norm": 3.1687607765197754, + "learning_rate": 1.0339036229075541e-05, + "loss": 0.5494, + "step": 49000 + }, + { + "epoch": 2.44, + "grad_norm": 6.417214870452881, + "learning_rate": 1.0315966262480853e-05, + "loss": 0.3474, + "step": 49025 + }, + { + "epoch": 2.44, + "grad_norm": 0.7319241762161255, + "learning_rate": 1.0292896295886164e-05, + "loss": 0.4822, + "step": 49050 + }, + { + "epoch": 2.45, + "grad_norm": 3.495887517929077, + "learning_rate": 1.0269826329291475e-05, + "loss": 0.3905, + "step": 49075 + }, + { + "epoch": 2.45, + "grad_norm": 3.4438161849975586, + "learning_rate": 1.0246756362696787e-05, + "loss": 0.5075, + "step": 49100 + }, + { + "epoch": 2.45, + "grad_norm": 6.572171688079834, + "learning_rate": 1.0223686396102098e-05, + "loss": 0.4268, + "step": 49125 + }, + { + "epoch": 2.45, + "grad_norm": 52.90046691894531, + "learning_rate": 1.0200616429507411e-05, + "loss": 0.6094, + "step": 49150 + }, + { + "epoch": 2.45, + "grad_norm": 5.936100006103516, + "learning_rate": 1.0177546462912723e-05, + "loss": 0.3771, + "step": 49175 + }, + { + "epoch": 2.45, + "grad_norm": 0.8982616662979126, + "learning_rate": 1.0154476496318034e-05, + "loss": 0.3057, + "step": 49200 + }, + { + "epoch": 2.45, + "grad_norm": 2.5868983268737793, + "learning_rate": 1.0131406529723345e-05, + "loss": 0.4864, + "step": 49225 + }, + { + "epoch": 2.45, + "grad_norm": 11.693443298339844, + "learning_rate": 1.0108336563128657e-05, + "loss": 0.5302, + "step": 49250 + }, + { + "epoch": 2.46, + "grad_norm": 1.0244178771972656, + "learning_rate": 1.0085266596533968e-05, + "loss": 0.4169, + "step": 49275 + }, + { + "epoch": 2.46, + "grad_norm": 4.7818603515625, + "learning_rate": 1.006219662993928e-05, + "loss": 0.409, + "step": 49300 + }, + { + "epoch": 2.46, + "grad_norm": 0.8569893836975098, + "learning_rate": 1.0039126663344592e-05, + "loss": 0.3737, + "step": 49325 + }, + { + "epoch": 2.46, + "grad_norm": 0.7575650811195374, + "learning_rate": 1.0016056696749904e-05, + "loss": 0.4904, + "step": 49350 + }, + { + "epoch": 2.46, + "grad_norm": 0.7171018123626709, + "learning_rate": 9.992986730155215e-06, + "loss": 0.3005, + "step": 49375 + }, + { + "epoch": 2.46, + "grad_norm": 86.97425842285156, + "learning_rate": 9.969916763560527e-06, + "loss": 0.4328, + "step": 49400 + }, + { + "epoch": 2.46, + "grad_norm": 5.976800441741943, + "learning_rate": 9.94684679696584e-06, + "loss": 0.4754, + "step": 49425 + }, + { + "epoch": 2.46, + "grad_norm": 6.821709156036377, + "learning_rate": 9.92377683037115e-06, + "loss": 0.3567, + "step": 49450 + }, + { + "epoch": 2.47, + "grad_norm": 21.48541831970215, + "learning_rate": 9.90070686377646e-06, + "loss": 0.5048, + "step": 49475 + }, + { + "epoch": 2.47, + "grad_norm": 0.8857642412185669, + "learning_rate": 9.877636897181774e-06, + "loss": 0.3556, + "step": 49500 + }, + { + "epoch": 2.47, + "grad_norm": 2.9118235111236572, + "learning_rate": 9.854566930587085e-06, + "loss": 0.4667, + "step": 49525 + }, + { + "epoch": 2.47, + "grad_norm": 30.48115348815918, + "learning_rate": 9.831496963992396e-06, + "loss": 0.3564, + "step": 49550 + }, + { + "epoch": 2.47, + "grad_norm": 3.914069652557373, + "learning_rate": 9.808426997397708e-06, + "loss": 0.3922, + "step": 49575 + }, + { + "epoch": 2.47, + "grad_norm": 3.106795072555542, + "learning_rate": 9.78535703080302e-06, + "loss": 0.46, + "step": 49600 + }, + { + "epoch": 2.47, + "grad_norm": 1.6387038230895996, + "learning_rate": 9.76228706420833e-06, + "loss": 0.4054, + "step": 49625 + }, + { + "epoch": 2.47, + "grad_norm": 65.76454162597656, + "learning_rate": 9.739217097613642e-06, + "loss": 0.4163, + "step": 49650 + }, + { + "epoch": 2.48, + "grad_norm": 6.016766548156738, + "learning_rate": 9.716147131018955e-06, + "loss": 0.3834, + "step": 49675 + }, + { + "epoch": 2.48, + "grad_norm": 4.530858993530273, + "learning_rate": 9.693077164424266e-06, + "loss": 0.3758, + "step": 49700 + }, + { + "epoch": 2.48, + "grad_norm": 14.341697692871094, + "learning_rate": 9.670007197829578e-06, + "loss": 0.3814, + "step": 49725 + }, + { + "epoch": 2.48, + "grad_norm": 3.512382745742798, + "learning_rate": 9.64693723123489e-06, + "loss": 0.4102, + "step": 49750 + }, + { + "epoch": 2.48, + "grad_norm": 67.73419189453125, + "learning_rate": 9.623867264640202e-06, + "loss": 0.5958, + "step": 49775 + }, + { + "epoch": 2.48, + "grad_norm": 0.7057674527168274, + "learning_rate": 9.600797298045513e-06, + "loss": 0.3469, + "step": 49800 + }, + { + "epoch": 2.48, + "grad_norm": 1.1521495580673218, + "learning_rate": 9.577727331450825e-06, + "loss": 0.4476, + "step": 49825 + }, + { + "epoch": 2.48, + "grad_norm": 0.952574610710144, + "learning_rate": 9.554657364856136e-06, + "loss": 0.5608, + "step": 49850 + }, + { + "epoch": 2.49, + "grad_norm": 4.289364814758301, + "learning_rate": 9.531587398261447e-06, + "loss": 0.4397, + "step": 49875 + }, + { + "epoch": 2.49, + "grad_norm": 4.029369831085205, + "learning_rate": 9.508517431666759e-06, + "loss": 0.5829, + "step": 49900 + }, + { + "epoch": 2.49, + "grad_norm": 67.86260986328125, + "learning_rate": 9.485447465072072e-06, + "loss": 0.4823, + "step": 49925 + }, + { + "epoch": 2.49, + "grad_norm": 0.8601451516151428, + "learning_rate": 9.462377498477383e-06, + "loss": 0.3746, + "step": 49950 + }, + { + "epoch": 2.49, + "grad_norm": 0.7222571969032288, + "learning_rate": 9.439307531882695e-06, + "loss": 0.5386, + "step": 49975 + }, + { + "epoch": 2.49, + "grad_norm": 3.494142770767212, + "learning_rate": 9.416237565288006e-06, + "loss": 0.5036, + "step": 50000 + }, + { + "epoch": 2.49, + "grad_norm": 0.7426605820655823, + "learning_rate": 9.393167598693317e-06, + "loss": 0.3962, + "step": 50025 + }, + { + "epoch": 2.49, + "grad_norm": 18.78670883178711, + "learning_rate": 9.370097632098629e-06, + "loss": 0.4831, + "step": 50050 + }, + { + "epoch": 2.5, + "grad_norm": 3.5823943614959717, + "learning_rate": 9.34702766550394e-06, + "loss": 0.4929, + "step": 50075 + }, + { + "epoch": 2.5, + "grad_norm": 3.229736566543579, + "learning_rate": 9.323957698909253e-06, + "loss": 0.4353, + "step": 50100 + }, + { + "epoch": 2.5, + "grad_norm": 3.537930965423584, + "learning_rate": 9.300887732314564e-06, + "loss": 0.4397, + "step": 50125 + }, + { + "epoch": 2.5, + "grad_norm": 2.34350323677063, + "learning_rate": 9.277817765719876e-06, + "loss": 0.4432, + "step": 50150 + }, + { + "epoch": 2.5, + "grad_norm": 5.109536170959473, + "learning_rate": 9.254747799125187e-06, + "loss": 0.4513, + "step": 50175 + }, + { + "epoch": 2.5, + "grad_norm": 0.7203817367553711, + "learning_rate": 9.231677832530499e-06, + "loss": 0.3611, + "step": 50200 + }, + { + "epoch": 2.5, + "grad_norm": 1.0782755613327026, + "learning_rate": 9.20860786593581e-06, + "loss": 0.4285, + "step": 50225 + }, + { + "epoch": 2.5, + "grad_norm": 0.7718564867973328, + "learning_rate": 9.185537899341121e-06, + "loss": 0.3597, + "step": 50250 + }, + { + "epoch": 2.51, + "grad_norm": 4.9814133644104, + "learning_rate": 9.162467932746434e-06, + "loss": 0.4823, + "step": 50275 + }, + { + "epoch": 2.51, + "grad_norm": 3.0942840576171875, + "learning_rate": 9.139397966151746e-06, + "loss": 0.3619, + "step": 50300 + }, + { + "epoch": 2.51, + "grad_norm": 3.4611849784851074, + "learning_rate": 9.116327999557057e-06, + "loss": 0.4649, + "step": 50325 + }, + { + "epoch": 2.51, + "grad_norm": 0.7275696992874146, + "learning_rate": 9.09325803296237e-06, + "loss": 0.3838, + "step": 50350 + }, + { + "epoch": 2.51, + "grad_norm": 5.559508800506592, + "learning_rate": 9.07018806636768e-06, + "loss": 0.4082, + "step": 50375 + }, + { + "epoch": 2.51, + "grad_norm": 2.714812994003296, + "learning_rate": 9.047118099772991e-06, + "loss": 0.4573, + "step": 50400 + }, + { + "epoch": 2.51, + "grad_norm": 1.6366543769836426, + "learning_rate": 9.024048133178304e-06, + "loss": 0.6329, + "step": 50425 + }, + { + "epoch": 2.51, + "grad_norm": 0.8482502698898315, + "learning_rate": 9.000978166583616e-06, + "loss": 0.3166, + "step": 50450 + }, + { + "epoch": 2.52, + "grad_norm": 3.0310323238372803, + "learning_rate": 8.977908199988927e-06, + "loss": 0.4704, + "step": 50475 + }, + { + "epoch": 2.52, + "grad_norm": 37.44198989868164, + "learning_rate": 8.954838233394238e-06, + "loss": 0.592, + "step": 50500 + }, + { + "epoch": 2.52, + "grad_norm": 8.434538841247559, + "learning_rate": 8.931768266799551e-06, + "loss": 0.4933, + "step": 50525 + }, + { + "epoch": 2.52, + "grad_norm": 18.220142364501953, + "learning_rate": 8.908698300204863e-06, + "loss": 0.5272, + "step": 50550 + }, + { + "epoch": 2.52, + "grad_norm": 10.412059783935547, + "learning_rate": 8.885628333610172e-06, + "loss": 0.3622, + "step": 50575 + }, + { + "epoch": 2.52, + "grad_norm": 2.166194438934326, + "learning_rate": 8.862558367015485e-06, + "loss": 0.4643, + "step": 50600 + }, + { + "epoch": 2.52, + "grad_norm": 6.727598190307617, + "learning_rate": 8.839488400420797e-06, + "loss": 0.5054, + "step": 50625 + }, + { + "epoch": 2.52, + "grad_norm": 5.515819549560547, + "learning_rate": 8.816418433826108e-06, + "loss": 0.4443, + "step": 50650 + }, + { + "epoch": 2.53, + "grad_norm": 0.8282546401023865, + "learning_rate": 8.79334846723142e-06, + "loss": 0.4671, + "step": 50675 + }, + { + "epoch": 2.53, + "grad_norm": 1.9409846067428589, + "learning_rate": 8.770278500636733e-06, + "loss": 0.4, + "step": 50700 + }, + { + "epoch": 2.53, + "grad_norm": 6.412890911102295, + "learning_rate": 8.747208534042044e-06, + "loss": 0.4176, + "step": 50725 + }, + { + "epoch": 2.53, + "grad_norm": 7.00205659866333, + "learning_rate": 8.724138567447354e-06, + "loss": 0.3337, + "step": 50750 + }, + { + "epoch": 2.53, + "grad_norm": 14.442793846130371, + "learning_rate": 8.701068600852667e-06, + "loss": 0.4154, + "step": 50775 + }, + { + "epoch": 2.53, + "grad_norm": 1.0041389465332031, + "learning_rate": 8.677998634257978e-06, + "loss": 0.4439, + "step": 50800 + }, + { + "epoch": 2.53, + "grad_norm": 3.049905776977539, + "learning_rate": 8.65492866766329e-06, + "loss": 0.4632, + "step": 50825 + }, + { + "epoch": 2.53, + "grad_norm": 2.8461859226226807, + "learning_rate": 8.6318587010686e-06, + "loss": 0.5343, + "step": 50850 + }, + { + "epoch": 2.54, + "grad_norm": 4.451079368591309, + "learning_rate": 8.608788734473914e-06, + "loss": 0.5663, + "step": 50875 + }, + { + "epoch": 2.54, + "grad_norm": 42.84324645996094, + "learning_rate": 8.585718767879225e-06, + "loss": 0.4916, + "step": 50900 + }, + { + "epoch": 2.54, + "grad_norm": 5.204446315765381, + "learning_rate": 8.562648801284536e-06, + "loss": 0.3823, + "step": 50925 + }, + { + "epoch": 2.54, + "grad_norm": 87.77778625488281, + "learning_rate": 8.539578834689848e-06, + "loss": 0.2921, + "step": 50950 + }, + { + "epoch": 2.54, + "grad_norm": 0.7765432000160217, + "learning_rate": 8.51650886809516e-06, + "loss": 0.4655, + "step": 50975 + }, + { + "epoch": 2.54, + "grad_norm": 8.35226058959961, + "learning_rate": 8.49343890150047e-06, + "loss": 0.4879, + "step": 51000 + }, + { + "epoch": 2.54, + "grad_norm": 6.179222106933594, + "learning_rate": 8.470368934905782e-06, + "loss": 0.3122, + "step": 51025 + }, + { + "epoch": 2.54, + "grad_norm": 6.96940803527832, + "learning_rate": 8.447298968311095e-06, + "loss": 0.5331, + "step": 51050 + }, + { + "epoch": 2.55, + "grad_norm": 3.286719799041748, + "learning_rate": 8.424229001716406e-06, + "loss": 0.5512, + "step": 51075 + }, + { + "epoch": 2.55, + "grad_norm": 1.991182565689087, + "learning_rate": 8.401159035121718e-06, + "loss": 0.582, + "step": 51100 + }, + { + "epoch": 2.55, + "grad_norm": 2.020848035812378, + "learning_rate": 8.378089068527029e-06, + "loss": 0.6097, + "step": 51125 + }, + { + "epoch": 2.55, + "grad_norm": 3.684962511062622, + "learning_rate": 8.35501910193234e-06, + "loss": 0.5262, + "step": 51150 + }, + { + "epoch": 2.55, + "grad_norm": 2.4446282386779785, + "learning_rate": 8.331949135337652e-06, + "loss": 0.3397, + "step": 51175 + }, + { + "epoch": 2.55, + "grad_norm": 6.0752482414245605, + "learning_rate": 8.308879168742965e-06, + "loss": 0.4074, + "step": 51200 + }, + { + "epoch": 2.55, + "grad_norm": 0.973638117313385, + "learning_rate": 8.285809202148276e-06, + "loss": 0.4706, + "step": 51225 + }, + { + "epoch": 2.55, + "grad_norm": 0.7517675757408142, + "learning_rate": 8.262739235553588e-06, + "loss": 0.4092, + "step": 51250 + }, + { + "epoch": 2.56, + "grad_norm": 0.8337798714637756, + "learning_rate": 8.239669268958899e-06, + "loss": 0.5036, + "step": 51275 + }, + { + "epoch": 2.56, + "grad_norm": 5.119658946990967, + "learning_rate": 8.21659930236421e-06, + "loss": 0.3791, + "step": 51300 + }, + { + "epoch": 2.56, + "grad_norm": 0.7112604379653931, + "learning_rate": 8.193529335769522e-06, + "loss": 0.3078, + "step": 51325 + }, + { + "epoch": 2.56, + "grad_norm": 2.3811254501342773, + "learning_rate": 8.170459369174833e-06, + "loss": 0.3596, + "step": 51350 + }, + { + "epoch": 2.56, + "grad_norm": 6.41787052154541, + "learning_rate": 8.147389402580146e-06, + "loss": 0.4551, + "step": 51375 + }, + { + "epoch": 2.56, + "grad_norm": 7.111554145812988, + "learning_rate": 8.124319435985457e-06, + "loss": 0.3701, + "step": 51400 + }, + { + "epoch": 2.56, + "grad_norm": 0.8633186221122742, + "learning_rate": 8.101249469390769e-06, + "loss": 0.3227, + "step": 51425 + }, + { + "epoch": 2.56, + "grad_norm": 3.249757766723633, + "learning_rate": 8.07817950279608e-06, + "loss": 0.369, + "step": 51450 + }, + { + "epoch": 2.57, + "grad_norm": 0.7638012170791626, + "learning_rate": 8.055109536201393e-06, + "loss": 0.5205, + "step": 51475 + }, + { + "epoch": 2.57, + "grad_norm": 2.608543634414673, + "learning_rate": 8.032039569606703e-06, + "loss": 0.3419, + "step": 51500 + }, + { + "epoch": 2.57, + "grad_norm": 3.6765499114990234, + "learning_rate": 8.008969603012014e-06, + "loss": 0.3323, + "step": 51525 + }, + { + "epoch": 2.57, + "grad_norm": 3.2431704998016357, + "learning_rate": 7.985899636417327e-06, + "loss": 0.4416, + "step": 51550 + }, + { + "epoch": 2.57, + "grad_norm": 4.2475104331970215, + "learning_rate": 7.962829669822639e-06, + "loss": 0.467, + "step": 51575 + }, + { + "epoch": 2.57, + "grad_norm": 0.6722660660743713, + "learning_rate": 7.93975970322795e-06, + "loss": 0.4526, + "step": 51600 + }, + { + "epoch": 2.57, + "grad_norm": 4.108065128326416, + "learning_rate": 7.916689736633261e-06, + "loss": 0.4785, + "step": 51625 + }, + { + "epoch": 2.57, + "grad_norm": 27.554222106933594, + "learning_rate": 7.893619770038574e-06, + "loss": 0.3775, + "step": 51650 + }, + { + "epoch": 2.57, + "grad_norm": 5.31900691986084, + "learning_rate": 7.870549803443884e-06, + "loss": 0.4917, + "step": 51675 + }, + { + "epoch": 2.58, + "grad_norm": 6.323947429656982, + "learning_rate": 7.847479836849195e-06, + "loss": 0.4157, + "step": 51700 + }, + { + "epoch": 2.58, + "grad_norm": 0.7022064328193665, + "learning_rate": 7.824409870254508e-06, + "loss": 0.371, + "step": 51725 + }, + { + "epoch": 2.58, + "grad_norm": 6.228670120239258, + "learning_rate": 7.80133990365982e-06, + "loss": 0.4627, + "step": 51750 + }, + { + "epoch": 2.58, + "grad_norm": 0.704119861125946, + "learning_rate": 7.778269937065131e-06, + "loss": 0.4452, + "step": 51775 + }, + { + "epoch": 2.58, + "grad_norm": 2.9643666744232178, + "learning_rate": 7.755199970470444e-06, + "loss": 0.3993, + "step": 51800 + }, + { + "epoch": 2.58, + "grad_norm": 0.7044605016708374, + "learning_rate": 7.732130003875756e-06, + "loss": 0.481, + "step": 51825 + }, + { + "epoch": 2.58, + "grad_norm": 22.700096130371094, + "learning_rate": 7.709060037281067e-06, + "loss": 0.3396, + "step": 51850 + }, + { + "epoch": 2.58, + "grad_norm": 0.7432065010070801, + "learning_rate": 7.685990070686378e-06, + "loss": 0.3764, + "step": 51875 + }, + { + "epoch": 2.59, + "grad_norm": 1.9928373098373413, + "learning_rate": 7.66292010409169e-06, + "loss": 0.3288, + "step": 51900 + }, + { + "epoch": 2.59, + "grad_norm": 3.4592535495758057, + "learning_rate": 7.639850137497001e-06, + "loss": 0.3718, + "step": 51925 + }, + { + "epoch": 2.59, + "grad_norm": 0.9576864242553711, + "learning_rate": 7.616780170902313e-06, + "loss": 0.445, + "step": 51950 + }, + { + "epoch": 2.59, + "grad_norm": 20.564001083374023, + "learning_rate": 7.5937102043076246e-06, + "loss": 0.3948, + "step": 51975 + }, + { + "epoch": 2.59, + "grad_norm": 0.7595285177230835, + "learning_rate": 7.570640237712937e-06, + "loss": 0.345, + "step": 52000 + }, + { + "epoch": 2.59, + "grad_norm": 13.459945678710938, + "learning_rate": 7.547570271118248e-06, + "loss": 0.5181, + "step": 52025 + }, + { + "epoch": 2.59, + "grad_norm": 0.6884210705757141, + "learning_rate": 7.524500304523559e-06, + "loss": 0.4252, + "step": 52050 + }, + { + "epoch": 2.59, + "grad_norm": 2.0429818630218506, + "learning_rate": 7.501430337928871e-06, + "loss": 0.3594, + "step": 52075 + }, + { + "epoch": 2.6, + "grad_norm": 5.49559211730957, + "learning_rate": 7.478360371334182e-06, + "loss": 0.401, + "step": 52100 + }, + { + "epoch": 2.6, + "grad_norm": 8.92778205871582, + "learning_rate": 7.455290404739494e-06, + "loss": 0.5722, + "step": 52125 + }, + { + "epoch": 2.6, + "grad_norm": 3.305980920791626, + "learning_rate": 7.432220438144806e-06, + "loss": 0.3295, + "step": 52150 + }, + { + "epoch": 2.6, + "grad_norm": 0.6217209696769714, + "learning_rate": 7.409150471550118e-06, + "loss": 0.4052, + "step": 52175 + }, + { + "epoch": 2.6, + "grad_norm": 0.6542679667472839, + "learning_rate": 7.386080504955429e-06, + "loss": 0.2494, + "step": 52200 + }, + { + "epoch": 2.6, + "grad_norm": 0.6969010233879089, + "learning_rate": 7.3630105383607416e-06, + "loss": 0.5526, + "step": 52225 + }, + { + "epoch": 2.6, + "grad_norm": 2.3818559646606445, + "learning_rate": 7.339940571766052e-06, + "loss": 0.4938, + "step": 52250 + }, + { + "epoch": 2.6, + "grad_norm": 15.166160583496094, + "learning_rate": 7.3168706051713634e-06, + "loss": 0.3946, + "step": 52275 + }, + { + "epoch": 2.61, + "grad_norm": 16.992691040039062, + "learning_rate": 7.293800638576676e-06, + "loss": 0.4292, + "step": 52300 + }, + { + "epoch": 2.61, + "grad_norm": 1.524154782295227, + "learning_rate": 7.270730671981987e-06, + "loss": 0.5369, + "step": 52325 + }, + { + "epoch": 2.61, + "grad_norm": 3.3624045848846436, + "learning_rate": 7.247660705387299e-06, + "loss": 0.3681, + "step": 52350 + }, + { + "epoch": 2.61, + "grad_norm": 15.962396621704102, + "learning_rate": 7.2245907387926106e-06, + "loss": 0.366, + "step": 52375 + }, + { + "epoch": 2.61, + "grad_norm": 1.18215811252594, + "learning_rate": 7.201520772197923e-06, + "loss": 0.3614, + "step": 52400 + }, + { + "epoch": 2.61, + "grad_norm": 2.024662494659424, + "learning_rate": 7.178450805603233e-06, + "loss": 0.3949, + "step": 52425 + }, + { + "epoch": 2.61, + "grad_norm": 4.9505228996276855, + "learning_rate": 7.155380839008545e-06, + "loss": 0.4286, + "step": 52450 + }, + { + "epoch": 2.61, + "grad_norm": 0.6292359232902527, + "learning_rate": 7.132310872413857e-06, + "loss": 0.3199, + "step": 52475 + }, + { + "epoch": 2.62, + "grad_norm": 0.6973364949226379, + "learning_rate": 7.109240905819169e-06, + "loss": 0.4549, + "step": 52500 + }, + { + "epoch": 2.62, + "grad_norm": 22.525678634643555, + "learning_rate": 7.08617093922448e-06, + "loss": 0.4789, + "step": 52525 + }, + { + "epoch": 2.62, + "grad_norm": 0.7233843207359314, + "learning_rate": 7.063100972629793e-06, + "loss": 0.4184, + "step": 52550 + }, + { + "epoch": 2.62, + "grad_norm": 3.2459542751312256, + "learning_rate": 7.040031006035104e-06, + "loss": 0.3048, + "step": 52575 + }, + { + "epoch": 2.62, + "grad_norm": 5.569832801818848, + "learning_rate": 7.016961039440416e-06, + "loss": 0.3551, + "step": 52600 + }, + { + "epoch": 2.62, + "grad_norm": 0.7612528204917908, + "learning_rate": 6.993891072845727e-06, + "loss": 0.4557, + "step": 52625 + }, + { + "epoch": 2.62, + "grad_norm": 11.590838432312012, + "learning_rate": 6.970821106251038e-06, + "loss": 0.4279, + "step": 52650 + }, + { + "epoch": 2.62, + "grad_norm": 0.667598307132721, + "learning_rate": 6.94775113965635e-06, + "loss": 0.3403, + "step": 52675 + }, + { + "epoch": 2.63, + "grad_norm": 3.3274714946746826, + "learning_rate": 6.924681173061662e-06, + "loss": 0.4698, + "step": 52700 + }, + { + "epoch": 2.63, + "grad_norm": 3.191425323486328, + "learning_rate": 6.901611206466974e-06, + "loss": 0.48, + "step": 52725 + }, + { + "epoch": 2.63, + "grad_norm": 0.7707619071006775, + "learning_rate": 6.878541239872285e-06, + "loss": 0.3145, + "step": 52750 + }, + { + "epoch": 2.63, + "grad_norm": 0.6428590416908264, + "learning_rate": 6.855471273277597e-06, + "loss": 0.4764, + "step": 52775 + }, + { + "epoch": 2.63, + "grad_norm": 3.274758815765381, + "learning_rate": 6.832401306682908e-06, + "loss": 0.4921, + "step": 52800 + }, + { + "epoch": 2.63, + "grad_norm": 0.732591450214386, + "learning_rate": 6.809331340088219e-06, + "loss": 0.3035, + "step": 52825 + }, + { + "epoch": 2.63, + "grad_norm": 3.3230175971984863, + "learning_rate": 6.7862613734935315e-06, + "loss": 0.5051, + "step": 52850 + }, + { + "epoch": 2.63, + "grad_norm": 4.391756534576416, + "learning_rate": 6.763191406898843e-06, + "loss": 0.3918, + "step": 52875 + }, + { + "epoch": 2.64, + "grad_norm": 9.392721176147461, + "learning_rate": 6.740121440304155e-06, + "loss": 0.4598, + "step": 52900 + }, + { + "epoch": 2.64, + "grad_norm": 0.6300041675567627, + "learning_rate": 6.717051473709466e-06, + "loss": 0.3568, + "step": 52925 + }, + { + "epoch": 2.64, + "grad_norm": 3.346933126449585, + "learning_rate": 6.693981507114779e-06, + "loss": 0.4281, + "step": 52950 + }, + { + "epoch": 2.64, + "grad_norm": 15.497859954833984, + "learning_rate": 6.67091154052009e-06, + "loss": 0.4197, + "step": 52975 + }, + { + "epoch": 2.64, + "grad_norm": 6.255995750427246, + "learning_rate": 6.6478415739254005e-06, + "loss": 0.4091, + "step": 53000 + }, + { + "epoch": 2.64, + "grad_norm": 1.5552594661712646, + "learning_rate": 6.624771607330713e-06, + "loss": 0.4507, + "step": 53025 + }, + { + "epoch": 2.64, + "grad_norm": 35.074859619140625, + "learning_rate": 6.601701640736024e-06, + "loss": 0.3937, + "step": 53050 + }, + { + "epoch": 2.64, + "grad_norm": 3.185941696166992, + "learning_rate": 6.578631674141336e-06, + "loss": 0.4116, + "step": 53075 + }, + { + "epoch": 2.65, + "grad_norm": 0.6813209056854248, + "learning_rate": 6.555561707546648e-06, + "loss": 0.3293, + "step": 53100 + }, + { + "epoch": 2.65, + "grad_norm": 25.33660888671875, + "learning_rate": 6.53249174095196e-06, + "loss": 0.4416, + "step": 53125 + }, + { + "epoch": 2.65, + "grad_norm": 1.5817681550979614, + "learning_rate": 6.509421774357272e-06, + "loss": 0.5166, + "step": 53150 + }, + { + "epoch": 2.65, + "grad_norm": 3.9538614749908447, + "learning_rate": 6.486351807762582e-06, + "loss": 0.5216, + "step": 53175 + }, + { + "epoch": 2.65, + "grad_norm": 6.135497570037842, + "learning_rate": 6.463281841167894e-06, + "loss": 0.4999, + "step": 53200 + }, + { + "epoch": 2.65, + "grad_norm": 1.1646127700805664, + "learning_rate": 6.440211874573206e-06, + "loss": 0.4203, + "step": 53225 + }, + { + "epoch": 2.65, + "grad_norm": 7.302920341491699, + "learning_rate": 6.4171419079785175e-06, + "loss": 0.3305, + "step": 53250 + }, + { + "epoch": 2.65, + "grad_norm": 2.83675479888916, + "learning_rate": 6.39407194138383e-06, + "loss": 0.3606, + "step": 53275 + }, + { + "epoch": 2.66, + "grad_norm": 1.0463924407958984, + "learning_rate": 6.371001974789141e-06, + "loss": 0.417, + "step": 53300 + }, + { + "epoch": 2.66, + "grad_norm": 0.7047767043113708, + "learning_rate": 6.347932008194453e-06, + "loss": 0.3989, + "step": 53325 + }, + { + "epoch": 2.66, + "grad_norm": 20.336246490478516, + "learning_rate": 6.324862041599765e-06, + "loss": 0.4479, + "step": 53350 + }, + { + "epoch": 2.66, + "grad_norm": 3.4728808403015137, + "learning_rate": 6.301792075005075e-06, + "loss": 0.3529, + "step": 53375 + }, + { + "epoch": 2.66, + "grad_norm": 22.320112228393555, + "learning_rate": 6.278722108410387e-06, + "loss": 0.3883, + "step": 53400 + }, + { + "epoch": 2.66, + "grad_norm": 6.709505558013916, + "learning_rate": 6.255652141815699e-06, + "loss": 0.4964, + "step": 53425 + }, + { + "epoch": 2.66, + "grad_norm": 7.791197776794434, + "learning_rate": 6.232582175221011e-06, + "loss": 0.3674, + "step": 53450 + }, + { + "epoch": 2.66, + "grad_norm": 0.6814228296279907, + "learning_rate": 6.209512208626322e-06, + "loss": 0.3593, + "step": 53475 + }, + { + "epoch": 2.67, + "grad_norm": 4.702853679656982, + "learning_rate": 6.186442242031634e-06, + "loss": 0.4375, + "step": 53500 + }, + { + "epoch": 2.67, + "grad_norm": 3.231199264526367, + "learning_rate": 6.163372275436945e-06, + "loss": 0.3586, + "step": 53525 + }, + { + "epoch": 2.67, + "grad_norm": 3.124828815460205, + "learning_rate": 6.140302308842257e-06, + "loss": 0.3698, + "step": 53550 + }, + { + "epoch": 2.67, + "grad_norm": 0.6851596832275391, + "learning_rate": 6.117232342247569e-06, + "loss": 0.4728, + "step": 53575 + }, + { + "epoch": 2.67, + "grad_norm": 32.70745849609375, + "learning_rate": 6.09416237565288e-06, + "loss": 0.4506, + "step": 53600 + }, + { + "epoch": 2.67, + "grad_norm": 5.827382564544678, + "learning_rate": 6.071092409058192e-06, + "loss": 0.5484, + "step": 53625 + }, + { + "epoch": 2.67, + "grad_norm": 1.0193488597869873, + "learning_rate": 6.0480224424635035e-06, + "loss": 0.4257, + "step": 53650 + }, + { + "epoch": 2.67, + "grad_norm": 6.486295223236084, + "learning_rate": 6.024952475868816e-06, + "loss": 0.5501, + "step": 53675 + }, + { + "epoch": 2.68, + "grad_norm": 0.740279495716095, + "learning_rate": 6.001882509274127e-06, + "loss": 0.4992, + "step": 53700 + }, + { + "epoch": 2.68, + "grad_norm": 3.4536116123199463, + "learning_rate": 5.978812542679438e-06, + "loss": 0.4032, + "step": 53725 + }, + { + "epoch": 2.68, + "grad_norm": 2.0655429363250732, + "learning_rate": 5.955742576084751e-06, + "loss": 0.5314, + "step": 53750 + }, + { + "epoch": 2.68, + "grad_norm": 3.477311134338379, + "learning_rate": 5.932672609490061e-06, + "loss": 0.4435, + "step": 53775 + }, + { + "epoch": 2.68, + "grad_norm": 0.6453719735145569, + "learning_rate": 5.909602642895373e-06, + "loss": 0.311, + "step": 53800 + }, + { + "epoch": 2.68, + "grad_norm": 6.679521083831787, + "learning_rate": 5.886532676300685e-06, + "loss": 0.3425, + "step": 53825 + }, + { + "epoch": 2.68, + "grad_norm": 0.7992690801620483, + "learning_rate": 5.863462709705997e-06, + "loss": 0.4486, + "step": 53850 + }, + { + "epoch": 2.68, + "grad_norm": 3.493523120880127, + "learning_rate": 5.840392743111308e-06, + "loss": 0.3975, + "step": 53875 + }, + { + "epoch": 2.69, + "grad_norm": 1.6282230615615845, + "learning_rate": 5.81732277651662e-06, + "loss": 0.3937, + "step": 53900 + }, + { + "epoch": 2.69, + "grad_norm": 3.9338877201080322, + "learning_rate": 5.794252809921932e-06, + "loss": 0.5131, + "step": 53925 + }, + { + "epoch": 2.69, + "grad_norm": 2.9956166744232178, + "learning_rate": 5.771182843327243e-06, + "loss": 0.5215, + "step": 53950 + }, + { + "epoch": 2.69, + "grad_norm": 1.5365872383117676, + "learning_rate": 5.7481128767325545e-06, + "loss": 0.4076, + "step": 53975 + }, + { + "epoch": 2.69, + "grad_norm": 6.710165023803711, + "learning_rate": 5.725042910137867e-06, + "loss": 0.4186, + "step": 54000 + }, + { + "epoch": 2.69, + "grad_norm": 3.2934117317199707, + "learning_rate": 5.701972943543178e-06, + "loss": 0.3445, + "step": 54025 + }, + { + "epoch": 2.69, + "grad_norm": 1.4656898975372314, + "learning_rate": 5.67890297694849e-06, + "loss": 0.3595, + "step": 54050 + }, + { + "epoch": 2.69, + "grad_norm": 6.39401388168335, + "learning_rate": 5.655833010353801e-06, + "loss": 0.2611, + "step": 54075 + }, + { + "epoch": 2.7, + "grad_norm": 0.6337581276893616, + "learning_rate": 5.632763043759113e-06, + "loss": 0.2903, + "step": 54100 + }, + { + "epoch": 2.7, + "grad_norm": 27.258468627929688, + "learning_rate": 5.609693077164424e-06, + "loss": 0.3028, + "step": 54125 + }, + { + "epoch": 2.7, + "grad_norm": 3.1868808269500732, + "learning_rate": 5.586623110569736e-06, + "loss": 0.4316, + "step": 54150 + }, + { + "epoch": 2.7, + "grad_norm": 3.674356698989868, + "learning_rate": 5.563553143975048e-06, + "loss": 0.4515, + "step": 54175 + }, + { + "epoch": 2.7, + "grad_norm": 3.406693696975708, + "learning_rate": 5.540483177380359e-06, + "loss": 0.4075, + "step": 54200 + }, + { + "epoch": 2.7, + "grad_norm": 3.3194386959075928, + "learning_rate": 5.5174132107856715e-06, + "loss": 0.3291, + "step": 54225 + }, + { + "epoch": 2.7, + "grad_norm": 19.359365463256836, + "learning_rate": 5.494343244190982e-06, + "loss": 0.4357, + "step": 54250 + }, + { + "epoch": 2.7, + "grad_norm": 0.826265275478363, + "learning_rate": 5.471273277596294e-06, + "loss": 0.4412, + "step": 54275 + }, + { + "epoch": 2.71, + "grad_norm": 7.969943523406982, + "learning_rate": 5.4482033110016064e-06, + "loss": 0.4301, + "step": 54300 + }, + { + "epoch": 2.71, + "grad_norm": 73.21637725830078, + "learning_rate": 5.425133344406918e-06, + "loss": 0.4003, + "step": 54325 + }, + { + "epoch": 2.71, + "grad_norm": 1.427546739578247, + "learning_rate": 5.402063377812229e-06, + "loss": 0.4922, + "step": 54350 + }, + { + "epoch": 2.71, + "grad_norm": 3.304819345474243, + "learning_rate": 5.3789934112175405e-06, + "loss": 0.3643, + "step": 54375 + }, + { + "epoch": 2.71, + "grad_norm": 4.45997428894043, + "learning_rate": 5.355923444622853e-06, + "loss": 0.5532, + "step": 54400 + }, + { + "epoch": 2.71, + "grad_norm": 3.3385884761810303, + "learning_rate": 5.332853478028164e-06, + "loss": 0.4333, + "step": 54425 + }, + { + "epoch": 2.71, + "grad_norm": 0.6692785620689392, + "learning_rate": 5.3097835114334754e-06, + "loss": 0.568, + "step": 54450 + }, + { + "epoch": 2.71, + "grad_norm": 2.820305347442627, + "learning_rate": 5.286713544838788e-06, + "loss": 0.3056, + "step": 54475 + }, + { + "epoch": 2.72, + "grad_norm": 1.4250597953796387, + "learning_rate": 5.263643578244099e-06, + "loss": 0.4482, + "step": 54500 + }, + { + "epoch": 2.72, + "grad_norm": 3.537055730819702, + "learning_rate": 5.24057361164941e-06, + "loss": 0.4922, + "step": 54525 + }, + { + "epoch": 2.72, + "grad_norm": 0.8892665505409241, + "learning_rate": 5.217503645054722e-06, + "loss": 0.4592, + "step": 54550 + }, + { + "epoch": 2.72, + "grad_norm": 0.7274606227874756, + "learning_rate": 5.194433678460034e-06, + "loss": 0.3322, + "step": 54575 + }, + { + "epoch": 2.72, + "grad_norm": 2.687718391418457, + "learning_rate": 5.171363711865346e-06, + "loss": 0.4036, + "step": 54600 + }, + { + "epoch": 2.72, + "grad_norm": 1.226678490638733, + "learning_rate": 5.148293745270657e-06, + "loss": 0.4232, + "step": 54625 + }, + { + "epoch": 2.72, + "grad_norm": 4.129399299621582, + "learning_rate": 5.125223778675969e-06, + "loss": 0.4251, + "step": 54650 + }, + { + "epoch": 2.72, + "grad_norm": 3.4940590858459473, + "learning_rate": 5.10215381208128e-06, + "loss": 0.525, + "step": 54675 + }, + { + "epoch": 2.73, + "grad_norm": 3.4788758754730225, + "learning_rate": 5.0790838454865924e-06, + "loss": 0.4932, + "step": 54700 + }, + { + "epoch": 2.73, + "grad_norm": 0.8910707235336304, + "learning_rate": 5.056013878891904e-06, + "loss": 0.3805, + "step": 54725 + }, + { + "epoch": 2.73, + "grad_norm": 8.056558609008789, + "learning_rate": 5.032943912297215e-06, + "loss": 0.4761, + "step": 54750 + }, + { + "epoch": 2.73, + "grad_norm": 0.702038586139679, + "learning_rate": 5.009873945702527e-06, + "loss": 0.3327, + "step": 54775 + }, + { + "epoch": 2.73, + "grad_norm": 0.9609004259109497, + "learning_rate": 4.986803979107838e-06, + "loss": 0.3144, + "step": 54800 + }, + { + "epoch": 2.73, + "grad_norm": 8.136700630187988, + "learning_rate": 4.96373401251315e-06, + "loss": 0.2805, + "step": 54825 + }, + { + "epoch": 2.73, + "grad_norm": 1.3470325469970703, + "learning_rate": 4.9406640459184614e-06, + "loss": 0.4093, + "step": 54850 + }, + { + "epoch": 2.73, + "grad_norm": 4.504135608673096, + "learning_rate": 4.917594079323774e-06, + "loss": 0.612, + "step": 54875 + }, + { + "epoch": 2.74, + "grad_norm": 0.6652967929840088, + "learning_rate": 4.894524112729085e-06, + "loss": 0.3323, + "step": 54900 + }, + { + "epoch": 2.74, + "grad_norm": 0.6020931005477905, + "learning_rate": 4.871454146134396e-06, + "loss": 0.3657, + "step": 54925 + }, + { + "epoch": 2.74, + "grad_norm": 7.894162178039551, + "learning_rate": 4.8483841795397086e-06, + "loss": 0.2989, + "step": 54950 + }, + { + "epoch": 2.74, + "grad_norm": 2.817714214324951, + "learning_rate": 4.82531421294502e-06, + "loss": 0.53, + "step": 54975 + }, + { + "epoch": 2.74, + "grad_norm": 5.955691337585449, + "learning_rate": 4.802244246350331e-06, + "loss": 0.4958, + "step": 55000 + }, + { + "epoch": 2.74, + "grad_norm": 0.6819628477096558, + "learning_rate": 4.7791742797556435e-06, + "loss": 0.3742, + "step": 55025 + }, + { + "epoch": 2.74, + "grad_norm": 3.3516225814819336, + "learning_rate": 4.756104313160955e-06, + "loss": 0.4367, + "step": 55050 + }, + { + "epoch": 2.74, + "grad_norm": 0.8088192939758301, + "learning_rate": 4.733034346566267e-06, + "loss": 0.4869, + "step": 55075 + }, + { + "epoch": 2.75, + "grad_norm": 11.700592994689941, + "learning_rate": 4.7099643799715776e-06, + "loss": 0.4097, + "step": 55100 + }, + { + "epoch": 2.75, + "grad_norm": 0.9019712805747986, + "learning_rate": 4.68689441337689e-06, + "loss": 0.2985, + "step": 55125 + }, + { + "epoch": 2.75, + "grad_norm": 0.7735490202903748, + "learning_rate": 4.663824446782201e-06, + "loss": 0.4336, + "step": 55150 + }, + { + "epoch": 2.75, + "grad_norm": 8.80534839630127, + "learning_rate": 4.6407544801875125e-06, + "loss": 0.4035, + "step": 55175 + }, + { + "epoch": 2.75, + "grad_norm": 2.1064610481262207, + "learning_rate": 4.617684513592825e-06, + "loss": 0.2458, + "step": 55200 + }, + { + "epoch": 2.75, + "grad_norm": 2.9238836765289307, + "learning_rate": 4.594614546998136e-06, + "loss": 0.4663, + "step": 55225 + }, + { + "epoch": 2.75, + "grad_norm": 10.220343589782715, + "learning_rate": 4.571544580403448e-06, + "loss": 0.5069, + "step": 55250 + }, + { + "epoch": 2.75, + "grad_norm": 0.9700506925582886, + "learning_rate": 4.548474613808759e-06, + "loss": 0.4545, + "step": 55275 + }, + { + "epoch": 2.76, + "grad_norm": 0.8662756085395813, + "learning_rate": 4.525404647214071e-06, + "loss": 0.5237, + "step": 55300 + }, + { + "epoch": 2.76, + "grad_norm": 3.5456159114837646, + "learning_rate": 4.502334680619383e-06, + "loss": 0.2305, + "step": 55325 + }, + { + "epoch": 2.76, + "grad_norm": 50.77360916137695, + "learning_rate": 4.4792647140246946e-06, + "loss": 0.4274, + "step": 55350 + }, + { + "epoch": 2.76, + "grad_norm": 0.8012394905090332, + "learning_rate": 4.456194747430006e-06, + "loss": 0.3622, + "step": 55375 + }, + { + "epoch": 2.76, + "grad_norm": 14.076640129089355, + "learning_rate": 4.433124780835317e-06, + "loss": 0.352, + "step": 55400 + }, + { + "epoch": 2.76, + "grad_norm": 3.324470281600952, + "learning_rate": 4.4100548142406295e-06, + "loss": 0.4269, + "step": 55425 + }, + { + "epoch": 2.76, + "grad_norm": 0.8007457256317139, + "learning_rate": 4.386984847645941e-06, + "loss": 0.4752, + "step": 55450 + }, + { + "epoch": 2.76, + "grad_norm": 92.04176330566406, + "learning_rate": 4.363914881051252e-06, + "loss": 0.4391, + "step": 55475 + }, + { + "epoch": 2.77, + "grad_norm": 0.700799822807312, + "learning_rate": 4.340844914456564e-06, + "loss": 0.6196, + "step": 55500 + }, + { + "epoch": 2.77, + "grad_norm": 1.1383848190307617, + "learning_rate": 4.317774947861876e-06, + "loss": 0.5262, + "step": 55525 + }, + { + "epoch": 2.77, + "grad_norm": 41.94340133666992, + "learning_rate": 4.294704981267187e-06, + "loss": 0.4941, + "step": 55550 + }, + { + "epoch": 2.77, + "grad_norm": 4.038702487945557, + "learning_rate": 4.2716350146724985e-06, + "loss": 0.4238, + "step": 55575 + }, + { + "epoch": 2.77, + "grad_norm": 3.343867301940918, + "learning_rate": 4.248565048077811e-06, + "loss": 0.3817, + "step": 55600 + }, + { + "epoch": 2.77, + "grad_norm": 15.859959602355957, + "learning_rate": 4.225495081483123e-06, + "loss": 0.4272, + "step": 55625 + }, + { + "epoch": 2.77, + "grad_norm": 3.166660785675049, + "learning_rate": 4.202425114888433e-06, + "loss": 0.3375, + "step": 55650 + }, + { + "epoch": 2.77, + "grad_norm": 0.8695279359817505, + "learning_rate": 4.179355148293746e-06, + "loss": 0.3752, + "step": 55675 + }, + { + "epoch": 2.78, + "grad_norm": 3.280230760574341, + "learning_rate": 4.156285181699057e-06, + "loss": 0.4374, + "step": 55700 + }, + { + "epoch": 2.78, + "grad_norm": 6.026671886444092, + "learning_rate": 4.133215215104369e-06, + "loss": 0.3545, + "step": 55725 + }, + { + "epoch": 2.78, + "grad_norm": 3.5654361248016357, + "learning_rate": 4.1101452485096805e-06, + "loss": 0.465, + "step": 55750 + }, + { + "epoch": 2.78, + "grad_norm": 0.7267552614212036, + "learning_rate": 4.087075281914992e-06, + "loss": 0.3821, + "step": 55775 + }, + { + "epoch": 2.78, + "grad_norm": 4.42562198638916, + "learning_rate": 4.064005315320304e-06, + "loss": 0.4465, + "step": 55800 + }, + { + "epoch": 2.78, + "grad_norm": 3.448962926864624, + "learning_rate": 4.040935348725615e-06, + "loss": 0.4483, + "step": 55825 + }, + { + "epoch": 2.78, + "grad_norm": 12.654199600219727, + "learning_rate": 4.017865382130927e-06, + "loss": 0.4897, + "step": 55850 + }, + { + "epoch": 2.78, + "grad_norm": 6.2945027351379395, + "learning_rate": 3.994795415536238e-06, + "loss": 0.3699, + "step": 55875 + }, + { + "epoch": 2.79, + "grad_norm": 4.625159740447998, + "learning_rate": 3.97172544894155e-06, + "loss": 0.3586, + "step": 55900 + }, + { + "epoch": 2.79, + "grad_norm": 6.696944236755371, + "learning_rate": 3.948655482346862e-06, + "loss": 0.4257, + "step": 55925 + }, + { + "epoch": 2.79, + "grad_norm": 3.5296878814697266, + "learning_rate": 3.925585515752173e-06, + "loss": 0.3581, + "step": 55950 + }, + { + "epoch": 2.79, + "grad_norm": 9.328879356384277, + "learning_rate": 3.902515549157485e-06, + "loss": 0.3942, + "step": 55975 + }, + { + "epoch": 2.79, + "grad_norm": 3.365654230117798, + "learning_rate": 3.879445582562797e-06, + "loss": 0.3671, + "step": 56000 + }, + { + "epoch": 2.79, + "grad_norm": 9.659181594848633, + "learning_rate": 3.856375615968108e-06, + "loss": 0.4164, + "step": 56025 + }, + { + "epoch": 2.79, + "grad_norm": 0.6306387782096863, + "learning_rate": 3.83330564937342e-06, + "loss": 0.2838, + "step": 56050 + }, + { + "epoch": 2.79, + "grad_norm": 175.56747436523438, + "learning_rate": 3.8102356827787316e-06, + "loss": 0.5275, + "step": 56075 + }, + { + "epoch": 2.8, + "grad_norm": 16.832124710083008, + "learning_rate": 3.7871657161840434e-06, + "loss": 0.5355, + "step": 56100 + }, + { + "epoch": 2.8, + "grad_norm": 2.7186450958251953, + "learning_rate": 3.7640957495893548e-06, + "loss": 0.3458, + "step": 56125 + }, + { + "epoch": 2.8, + "grad_norm": 3.481710433959961, + "learning_rate": 3.7410257829946665e-06, + "loss": 0.4307, + "step": 56150 + }, + { + "epoch": 2.8, + "grad_norm": 0.9718573093414307, + "learning_rate": 3.7179558163999783e-06, + "loss": 0.4509, + "step": 56175 + }, + { + "epoch": 2.8, + "grad_norm": 7.378538131713867, + "learning_rate": 3.6948858498052893e-06, + "loss": 0.3863, + "step": 56200 + }, + { + "epoch": 2.8, + "grad_norm": 0.7621744275093079, + "learning_rate": 3.671815883210601e-06, + "loss": 0.3617, + "step": 56225 + }, + { + "epoch": 2.8, + "grad_norm": 0.687117338180542, + "learning_rate": 3.648745916615913e-06, + "loss": 0.2841, + "step": 56250 + }, + { + "epoch": 2.8, + "grad_norm": 3.461027145385742, + "learning_rate": 3.6256759500212246e-06, + "loss": 0.3593, + "step": 56275 + }, + { + "epoch": 2.81, + "grad_norm": 6.926644325256348, + "learning_rate": 3.602605983426536e-06, + "loss": 0.4395, + "step": 56300 + }, + { + "epoch": 2.81, + "grad_norm": 4.752397060394287, + "learning_rate": 3.5795360168318477e-06, + "loss": 0.4244, + "step": 56325 + }, + { + "epoch": 2.81, + "grad_norm": 0.7236278057098389, + "learning_rate": 3.5564660502371595e-06, + "loss": 0.368, + "step": 56350 + }, + { + "epoch": 2.81, + "grad_norm": 0.6548141837120056, + "learning_rate": 3.5333960836424713e-06, + "loss": 0.3861, + "step": 56375 + }, + { + "epoch": 2.81, + "grad_norm": 7.0432209968566895, + "learning_rate": 3.5103261170477827e-06, + "loss": 0.4131, + "step": 56400 + }, + { + "epoch": 2.81, + "grad_norm": 14.461806297302246, + "learning_rate": 3.4872561504530945e-06, + "loss": 0.5109, + "step": 56425 + }, + { + "epoch": 2.81, + "grad_norm": 2.674800395965576, + "learning_rate": 3.4641861838584062e-06, + "loss": 0.4644, + "step": 56450 + }, + { + "epoch": 2.81, + "grad_norm": 6.315777778625488, + "learning_rate": 3.441116217263718e-06, + "loss": 0.3623, + "step": 56475 + }, + { + "epoch": 2.82, + "grad_norm": 0.7652971148490906, + "learning_rate": 3.418046250669029e-06, + "loss": 0.4106, + "step": 56500 + }, + { + "epoch": 2.82, + "grad_norm": 3.053166627883911, + "learning_rate": 3.3949762840743407e-06, + "loss": 0.4133, + "step": 56525 + }, + { + "epoch": 2.82, + "grad_norm": 1.1593868732452393, + "learning_rate": 3.3719063174796525e-06, + "loss": 0.4392, + "step": 56550 + }, + { + "epoch": 2.82, + "grad_norm": 0.6778175234794617, + "learning_rate": 3.348836350884964e-06, + "loss": 0.3329, + "step": 56575 + }, + { + "epoch": 2.82, + "grad_norm": 3.0964550971984863, + "learning_rate": 3.3257663842902757e-06, + "loss": 0.3984, + "step": 56600 + }, + { + "epoch": 2.82, + "grad_norm": 1.697141408920288, + "learning_rate": 3.3026964176955875e-06, + "loss": 0.308, + "step": 56625 + }, + { + "epoch": 2.82, + "grad_norm": 8.337875366210938, + "learning_rate": 3.2796264511008992e-06, + "loss": 0.3159, + "step": 56650 + }, + { + "epoch": 2.82, + "grad_norm": 34.76048278808594, + "learning_rate": 3.25655648450621e-06, + "loss": 0.3352, + "step": 56675 + }, + { + "epoch": 2.83, + "grad_norm": 7.32834529876709, + "learning_rate": 3.233486517911522e-06, + "loss": 0.4327, + "step": 56700 + }, + { + "epoch": 2.83, + "grad_norm": 3.3736586570739746, + "learning_rate": 3.210416551316834e-06, + "loss": 0.4657, + "step": 56725 + }, + { + "epoch": 2.83, + "grad_norm": 20.95182228088379, + "learning_rate": 3.187346584722146e-06, + "loss": 0.5894, + "step": 56750 + }, + { + "epoch": 2.83, + "grad_norm": 0.7072112560272217, + "learning_rate": 3.164276618127457e-06, + "loss": 0.4317, + "step": 56775 + }, + { + "epoch": 2.83, + "grad_norm": 43.19917297363281, + "learning_rate": 3.1412066515327687e-06, + "loss": 0.3036, + "step": 56800 + }, + { + "epoch": 2.83, + "grad_norm": 0.7587588429450989, + "learning_rate": 3.1181366849380804e-06, + "loss": 0.3638, + "step": 56825 + }, + { + "epoch": 2.83, + "grad_norm": 2.467799186706543, + "learning_rate": 3.095066718343392e-06, + "loss": 0.4133, + "step": 56850 + }, + { + "epoch": 2.83, + "grad_norm": 9.070191383361816, + "learning_rate": 3.0719967517487036e-06, + "loss": 0.432, + "step": 56875 + }, + { + "epoch": 2.84, + "grad_norm": 3.5591251850128174, + "learning_rate": 3.0489267851540154e-06, + "loss": 0.4507, + "step": 56900 + }, + { + "epoch": 2.84, + "grad_norm": 0.7420728802680969, + "learning_rate": 3.0258568185593267e-06, + "loss": 0.356, + "step": 56925 + }, + { + "epoch": 2.84, + "grad_norm": 60.67326354980469, + "learning_rate": 3.0027868519646385e-06, + "loss": 0.4632, + "step": 56950 + }, + { + "epoch": 2.84, + "grad_norm": 0.8600183129310608, + "learning_rate": 2.97971688536995e-06, + "loss": 0.4973, + "step": 56975 + }, + { + "epoch": 2.84, + "grad_norm": 0.7386710047721863, + "learning_rate": 2.9566469187752617e-06, + "loss": 0.308, + "step": 57000 + }, + { + "epoch": 2.84, + "grad_norm": 3.9108200073242188, + "learning_rate": 2.9335769521805734e-06, + "loss": 0.3888, + "step": 57025 + }, + { + "epoch": 2.84, + "grad_norm": 8.574189186096191, + "learning_rate": 2.9105069855858852e-06, + "loss": 0.3844, + "step": 57050 + }, + { + "epoch": 2.84, + "grad_norm": 0.6366267800331116, + "learning_rate": 2.8874370189911966e-06, + "loss": 0.3923, + "step": 57075 + }, + { + "epoch": 2.85, + "grad_norm": 19.721044540405273, + "learning_rate": 2.8643670523965084e-06, + "loss": 0.5066, + "step": 57100 + }, + { + "epoch": 2.85, + "grad_norm": 3.4444007873535156, + "learning_rate": 2.8412970858018197e-06, + "loss": 0.4136, + "step": 57125 + }, + { + "epoch": 2.85, + "grad_norm": 8.910185813903809, + "learning_rate": 2.8182271192071315e-06, + "loss": 0.5175, + "step": 57150 + }, + { + "epoch": 2.85, + "grad_norm": 0.9261099100112915, + "learning_rate": 2.7951571526124433e-06, + "loss": 0.462, + "step": 57175 + }, + { + "epoch": 2.85, + "grad_norm": 8.69874095916748, + "learning_rate": 2.7720871860177547e-06, + "loss": 0.3286, + "step": 57200 + }, + { + "epoch": 2.85, + "grad_norm": 0.8175128698348999, + "learning_rate": 2.7490172194230664e-06, + "loss": 0.5005, + "step": 57225 + }, + { + "epoch": 2.85, + "grad_norm": 6.2586750984191895, + "learning_rate": 2.725947252828378e-06, + "loss": 0.4317, + "step": 57250 + }, + { + "epoch": 2.85, + "grad_norm": 0.8921399116516113, + "learning_rate": 2.7028772862336896e-06, + "loss": 0.2917, + "step": 57275 + }, + { + "epoch": 2.86, + "grad_norm": 3.185152053833008, + "learning_rate": 2.6798073196390014e-06, + "loss": 0.366, + "step": 57300 + }, + { + "epoch": 2.86, + "grad_norm": 1.9014008045196533, + "learning_rate": 2.656737353044313e-06, + "loss": 0.3909, + "step": 57325 + }, + { + "epoch": 2.86, + "grad_norm": 1.7264939546585083, + "learning_rate": 2.6336673864496245e-06, + "loss": 0.3432, + "step": 57350 + }, + { + "epoch": 2.86, + "grad_norm": 22.747976303100586, + "learning_rate": 2.6105974198549363e-06, + "loss": 0.4633, + "step": 57375 + }, + { + "epoch": 2.86, + "grad_norm": 0.8758083581924438, + "learning_rate": 2.5875274532602477e-06, + "loss": 0.4581, + "step": 57400 + }, + { + "epoch": 2.86, + "grad_norm": 3.4689791202545166, + "learning_rate": 2.5644574866655594e-06, + "loss": 0.3463, + "step": 57425 + }, + { + "epoch": 2.86, + "grad_norm": 3.972005844116211, + "learning_rate": 2.5413875200708712e-06, + "loss": 0.4681, + "step": 57450 + }, + { + "epoch": 2.86, + "grad_norm": 50.45981979370117, + "learning_rate": 2.518317553476183e-06, + "loss": 0.5288, + "step": 57475 + }, + { + "epoch": 2.87, + "grad_norm": 1.1776808500289917, + "learning_rate": 2.4952475868814944e-06, + "loss": 0.5307, + "step": 57500 + }, + { + "epoch": 2.87, + "grad_norm": 3.626554250717163, + "learning_rate": 2.4721776202868057e-06, + "loss": 0.4914, + "step": 57525 + }, + { + "epoch": 2.87, + "grad_norm": 10.887569427490234, + "learning_rate": 2.4491076536921175e-06, + "loss": 0.3871, + "step": 57550 + }, + { + "epoch": 2.87, + "grad_norm": 0.7736265063285828, + "learning_rate": 2.426037687097429e-06, + "loss": 0.3467, + "step": 57575 + }, + { + "epoch": 2.87, + "grad_norm": 11.513165473937988, + "learning_rate": 2.402967720502741e-06, + "loss": 0.3595, + "step": 57600 + }, + { + "epoch": 2.87, + "grad_norm": 0.938291072845459, + "learning_rate": 2.3798977539080524e-06, + "loss": 0.3298, + "step": 57625 + }, + { + "epoch": 2.87, + "grad_norm": 1.0008108615875244, + "learning_rate": 2.3568277873133642e-06, + "loss": 0.3453, + "step": 57650 + }, + { + "epoch": 2.87, + "grad_norm": 2.2188680171966553, + "learning_rate": 2.3337578207186756e-06, + "loss": 0.374, + "step": 57675 + }, + { + "epoch": 2.88, + "grad_norm": 3.403674602508545, + "learning_rate": 2.3106878541239874e-06, + "loss": 0.4336, + "step": 57700 + }, + { + "epoch": 2.88, + "grad_norm": 0.7060783505439758, + "learning_rate": 2.2876178875292987e-06, + "loss": 0.3556, + "step": 57725 + }, + { + "epoch": 2.88, + "grad_norm": 0.691160261631012, + "learning_rate": 2.2645479209346105e-06, + "loss": 0.4475, + "step": 57750 + }, + { + "epoch": 2.88, + "grad_norm": 0.6420150399208069, + "learning_rate": 2.2414779543399223e-06, + "loss": 0.246, + "step": 57775 + }, + { + "epoch": 2.88, + "grad_norm": 3.1971852779388428, + "learning_rate": 2.218407987745234e-06, + "loss": 0.2914, + "step": 57800 + }, + { + "epoch": 2.88, + "grad_norm": 0.6994123458862305, + "learning_rate": 2.1953380211505454e-06, + "loss": 0.4257, + "step": 57825 + }, + { + "epoch": 2.88, + "grad_norm": 0.673072338104248, + "learning_rate": 2.172268054555857e-06, + "loss": 0.4714, + "step": 57850 + }, + { + "epoch": 2.88, + "grad_norm": 7.408416271209717, + "learning_rate": 2.1491980879611686e-06, + "loss": 0.5478, + "step": 57875 + }, + { + "epoch": 2.89, + "grad_norm": 4.219334125518799, + "learning_rate": 2.1261281213664804e-06, + "loss": 0.3488, + "step": 57900 + }, + { + "epoch": 2.89, + "grad_norm": 7.087649345397949, + "learning_rate": 2.103058154771792e-06, + "loss": 0.4356, + "step": 57925 + }, + { + "epoch": 2.89, + "grad_norm": 22.720373153686523, + "learning_rate": 2.0799881881771035e-06, + "loss": 0.3209, + "step": 57950 + }, + { + "epoch": 2.89, + "grad_norm": 21.636537551879883, + "learning_rate": 2.0569182215824153e-06, + "loss": 0.4794, + "step": 57975 + }, + { + "epoch": 2.89, + "grad_norm": 4.3353447914123535, + "learning_rate": 2.0338482549877266e-06, + "loss": 0.3213, + "step": 58000 + }, + { + "epoch": 2.89, + "grad_norm": 3.337759017944336, + "learning_rate": 2.0107782883930384e-06, + "loss": 0.4506, + "step": 58025 + }, + { + "epoch": 2.89, + "grad_norm": 0.6747495532035828, + "learning_rate": 1.98770832179835e-06, + "loss": 0.5877, + "step": 58050 + }, + { + "epoch": 2.89, + "grad_norm": 0.7362604737281799, + "learning_rate": 1.964638355203662e-06, + "loss": 0.3278, + "step": 58075 + }, + { + "epoch": 2.9, + "grad_norm": 0.698776125907898, + "learning_rate": 1.9415683886089733e-06, + "loss": 0.4506, + "step": 58100 + }, + { + "epoch": 2.9, + "grad_norm": 7.648797988891602, + "learning_rate": 1.918498422014285e-06, + "loss": 0.4466, + "step": 58125 + }, + { + "epoch": 2.9, + "grad_norm": 3.523207664489746, + "learning_rate": 1.8954284554195967e-06, + "loss": 0.3555, + "step": 58150 + }, + { + "epoch": 2.9, + "grad_norm": 2.721613883972168, + "learning_rate": 1.8723584888249085e-06, + "loss": 0.3494, + "step": 58175 + }, + { + "epoch": 2.9, + "grad_norm": 0.7379086017608643, + "learning_rate": 1.8492885222302198e-06, + "loss": 0.3489, + "step": 58200 + }, + { + "epoch": 2.9, + "grad_norm": 7.868014335632324, + "learning_rate": 1.8262185556355314e-06, + "loss": 0.5111, + "step": 58225 + }, + { + "epoch": 2.9, + "grad_norm": 3.1979591846466064, + "learning_rate": 1.8031485890408432e-06, + "loss": 0.3398, + "step": 58250 + }, + { + "epoch": 2.9, + "grad_norm": 3.2477834224700928, + "learning_rate": 1.7800786224461546e-06, + "loss": 0.4812, + "step": 58275 + }, + { + "epoch": 2.91, + "grad_norm": 0.7919636368751526, + "learning_rate": 1.7570086558514666e-06, + "loss": 0.3957, + "step": 58300 + }, + { + "epoch": 2.91, + "grad_norm": 9.772295951843262, + "learning_rate": 1.733938689256778e-06, + "loss": 0.3896, + "step": 58325 + }, + { + "epoch": 2.91, + "grad_norm": 52.001590728759766, + "learning_rate": 1.7108687226620897e-06, + "loss": 0.4273, + "step": 58350 + }, + { + "epoch": 2.91, + "grad_norm": 4.567387580871582, + "learning_rate": 1.6877987560674013e-06, + "loss": 0.5927, + "step": 58375 + }, + { + "epoch": 2.91, + "grad_norm": 0.7052479982376099, + "learning_rate": 1.664728789472713e-06, + "loss": 0.4157, + "step": 58400 + }, + { + "epoch": 2.91, + "grad_norm": 12.553930282592773, + "learning_rate": 1.6416588228780244e-06, + "loss": 0.3005, + "step": 58425 + }, + { + "epoch": 2.91, + "grad_norm": 1.8058936595916748, + "learning_rate": 1.6185888562833362e-06, + "loss": 0.4036, + "step": 58450 + }, + { + "epoch": 2.91, + "grad_norm": 0.6824276447296143, + "learning_rate": 1.5955188896886478e-06, + "loss": 0.4308, + "step": 58475 + }, + { + "epoch": 2.92, + "grad_norm": 4.887246131896973, + "learning_rate": 1.5724489230939596e-06, + "loss": 0.5227, + "step": 58500 + }, + { + "epoch": 2.92, + "grad_norm": 3.1629316806793213, + "learning_rate": 1.5493789564992711e-06, + "loss": 0.4616, + "step": 58525 + }, + { + "epoch": 2.92, + "grad_norm": 0.7038543224334717, + "learning_rate": 1.5263089899045827e-06, + "loss": 0.4053, + "step": 58550 + }, + { + "epoch": 2.92, + "grad_norm": 0.8581077456474304, + "learning_rate": 1.5032390233098943e-06, + "loss": 0.3863, + "step": 58575 + }, + { + "epoch": 2.92, + "grad_norm": 3.157999038696289, + "learning_rate": 1.480169056715206e-06, + "loss": 0.3802, + "step": 58600 + }, + { + "epoch": 2.92, + "grad_norm": 8.047822952270508, + "learning_rate": 1.4570990901205176e-06, + "loss": 0.4406, + "step": 58625 + }, + { + "epoch": 2.92, + "grad_norm": 3.3583245277404785, + "learning_rate": 1.4340291235258292e-06, + "loss": 0.4823, + "step": 58650 + }, + { + "epoch": 2.92, + "grad_norm": 0.6569061875343323, + "learning_rate": 1.410959156931141e-06, + "loss": 0.426, + "step": 58675 + }, + { + "epoch": 2.93, + "grad_norm": 9.12439250946045, + "learning_rate": 1.3878891903364525e-06, + "loss": 0.3746, + "step": 58700 + }, + { + "epoch": 2.93, + "grad_norm": 435.5043640136719, + "learning_rate": 1.3648192237417641e-06, + "loss": 0.2853, + "step": 58725 + }, + { + "epoch": 2.93, + "grad_norm": 6.289416313171387, + "learning_rate": 1.3417492571470757e-06, + "loss": 0.296, + "step": 58750 + }, + { + "epoch": 2.93, + "grad_norm": 3.6272895336151123, + "learning_rate": 1.3186792905523873e-06, + "loss": 0.4109, + "step": 58775 + }, + { + "epoch": 2.93, + "grad_norm": 0.7682371139526367, + "learning_rate": 1.2956093239576988e-06, + "loss": 0.4651, + "step": 58800 + }, + { + "epoch": 2.93, + "grad_norm": 3.5857977867126465, + "learning_rate": 1.2725393573630106e-06, + "loss": 0.4428, + "step": 58825 + }, + { + "epoch": 2.93, + "grad_norm": 3.7168776988983154, + "learning_rate": 1.2494693907683222e-06, + "loss": 0.4103, + "step": 58850 + }, + { + "epoch": 2.93, + "grad_norm": 3.0777294635772705, + "learning_rate": 1.2263994241736338e-06, + "loss": 0.3149, + "step": 58875 + }, + { + "epoch": 2.94, + "grad_norm": 3.4248714447021484, + "learning_rate": 1.2033294575789455e-06, + "loss": 0.3501, + "step": 58900 + }, + { + "epoch": 2.94, + "grad_norm": 3.277421236038208, + "learning_rate": 1.1802594909842571e-06, + "loss": 0.4754, + "step": 58925 + }, + { + "epoch": 2.94, + "grad_norm": 0.7078830003738403, + "learning_rate": 1.1571895243895687e-06, + "loss": 0.416, + "step": 58950 + }, + { + "epoch": 2.94, + "grad_norm": 4.206014633178711, + "learning_rate": 1.1341195577948805e-06, + "loss": 0.3185, + "step": 58975 + }, + { + "epoch": 2.94, + "grad_norm": 14.83720588684082, + "learning_rate": 1.111049591200192e-06, + "loss": 0.3692, + "step": 59000 + }, + { + "epoch": 2.94, + "grad_norm": 0.7804746627807617, + "learning_rate": 1.0879796246055036e-06, + "loss": 0.304, + "step": 59025 + }, + { + "epoch": 2.94, + "grad_norm": 3.3699700832366943, + "learning_rate": 1.0649096580108154e-06, + "loss": 0.3233, + "step": 59050 + }, + { + "epoch": 2.94, + "grad_norm": 3.395963668823242, + "learning_rate": 1.041839691416127e-06, + "loss": 0.4438, + "step": 59075 + }, + { + "epoch": 2.94, + "grad_norm": 3.5054526329040527, + "learning_rate": 1.0187697248214385e-06, + "loss": 0.4384, + "step": 59100 + }, + { + "epoch": 2.95, + "grad_norm": 3.579428195953369, + "learning_rate": 9.956997582267501e-07, + "loss": 0.436, + "step": 59125 + }, + { + "epoch": 2.95, + "grad_norm": 3.2751588821411133, + "learning_rate": 9.726297916320617e-07, + "loss": 0.4432, + "step": 59150 + }, + { + "epoch": 2.95, + "grad_norm": 3.472015857696533, + "learning_rate": 9.495598250373734e-07, + "loss": 0.5857, + "step": 59175 + }, + { + "epoch": 2.95, + "grad_norm": 10.637346267700195, + "learning_rate": 9.26489858442685e-07, + "loss": 0.3446, + "step": 59200 + }, + { + "epoch": 2.95, + "grad_norm": 63.770668029785156, + "learning_rate": 9.034198918479966e-07, + "loss": 0.3659, + "step": 59225 + }, + { + "epoch": 2.95, + "grad_norm": 5.983547687530518, + "learning_rate": 8.803499252533083e-07, + "loss": 0.447, + "step": 59250 + }, + { + "epoch": 2.95, + "grad_norm": 5.243884086608887, + "learning_rate": 8.5727995865862e-07, + "loss": 0.4767, + "step": 59275 + }, + { + "epoch": 2.95, + "grad_norm": 3.3908684253692627, + "learning_rate": 8.342099920639315e-07, + "loss": 0.4583, + "step": 59300 + }, + { + "epoch": 2.96, + "grad_norm": 3.267824172973633, + "learning_rate": 8.111400254692432e-07, + "loss": 0.3493, + "step": 59325 + }, + { + "epoch": 2.96, + "grad_norm": 8.377107620239258, + "learning_rate": 7.880700588745549e-07, + "loss": 0.4306, + "step": 59350 + }, + { + "epoch": 2.96, + "grad_norm": 0.8545198440551758, + "learning_rate": 7.650000922798664e-07, + "loss": 0.3806, + "step": 59375 + }, + { + "epoch": 2.96, + "grad_norm": 5.678221225738525, + "learning_rate": 7.41930125685178e-07, + "loss": 0.3354, + "step": 59400 + }, + { + "epoch": 2.96, + "grad_norm": 0.791506826877594, + "learning_rate": 7.188601590904897e-07, + "loss": 0.3562, + "step": 59425 + }, + { + "epoch": 2.96, + "grad_norm": 3.0934579372406006, + "learning_rate": 6.957901924958013e-07, + "loss": 0.396, + "step": 59450 + }, + { + "epoch": 2.96, + "grad_norm": 6.885002136230469, + "learning_rate": 6.72720225901113e-07, + "loss": 0.4913, + "step": 59475 + }, + { + "epoch": 2.96, + "grad_norm": 6.017404556274414, + "learning_rate": 6.496502593064246e-07, + "loss": 0.3693, + "step": 59500 + }, + { + "epoch": 2.97, + "grad_norm": 7.71619987487793, + "learning_rate": 6.265802927117361e-07, + "loss": 0.3407, + "step": 59525 + }, + { + "epoch": 2.97, + "grad_norm": 0.6652907133102417, + "learning_rate": 6.035103261170478e-07, + "loss": 0.2536, + "step": 59550 + }, + { + "epoch": 2.97, + "grad_norm": 0.734829843044281, + "learning_rate": 5.804403595223595e-07, + "loss": 0.4627, + "step": 59575 + }, + { + "epoch": 2.97, + "grad_norm": 0.6811460852622986, + "learning_rate": 5.57370392927671e-07, + "loss": 0.4573, + "step": 59600 + }, + { + "epoch": 2.97, + "grad_norm": 2.0909130573272705, + "learning_rate": 5.343004263329827e-07, + "loss": 0.4136, + "step": 59625 + }, + { + "epoch": 2.97, + "grad_norm": 7.259815216064453, + "learning_rate": 5.112304597382944e-07, + "loss": 0.3893, + "step": 59650 + }, + { + "epoch": 2.97, + "grad_norm": 0.6684561371803284, + "learning_rate": 4.88160493143606e-07, + "loss": 0.4502, + "step": 59675 + }, + { + "epoch": 2.97, + "grad_norm": 3.2914390563964844, + "learning_rate": 4.650905265489175e-07, + "loss": 0.272, + "step": 59700 + }, + { + "epoch": 2.98, + "grad_norm": 4.816464900970459, + "learning_rate": 4.420205599542292e-07, + "loss": 0.3046, + "step": 59725 + }, + { + "epoch": 2.98, + "grad_norm": 1.9208866357803345, + "learning_rate": 4.189505933595408e-07, + "loss": 0.4547, + "step": 59750 + }, + { + "epoch": 2.98, + "grad_norm": 18.676660537719727, + "learning_rate": 3.9588062676485245e-07, + "loss": 0.4184, + "step": 59775 + }, + { + "epoch": 2.98, + "grad_norm": 0.7250289916992188, + "learning_rate": 3.7281066017016413e-07, + "loss": 0.471, + "step": 59800 + }, + { + "epoch": 2.98, + "grad_norm": 7.761043548583984, + "learning_rate": 3.497406935754757e-07, + "loss": 0.3157, + "step": 59825 + }, + { + "epoch": 2.98, + "grad_norm": 1.860449194908142, + "learning_rate": 3.266707269807873e-07, + "loss": 0.3256, + "step": 59850 + }, + { + "epoch": 2.98, + "grad_norm": 23.85589027404785, + "learning_rate": 3.03600760386099e-07, + "loss": 0.3953, + "step": 59875 + }, + { + "epoch": 2.98, + "grad_norm": 0.6777181029319763, + "learning_rate": 2.8053079379141057e-07, + "loss": 0.2922, + "step": 59900 + }, + { + "epoch": 2.99, + "grad_norm": 8.837489128112793, + "learning_rate": 2.574608271967222e-07, + "loss": 0.4033, + "step": 59925 + }, + { + "epoch": 2.99, + "grad_norm": 3.261415719985962, + "learning_rate": 2.3439086060203387e-07, + "loss": 0.3313, + "step": 59950 + }, + { + "epoch": 2.99, + "grad_norm": 21.563766479492188, + "learning_rate": 2.113208940073455e-07, + "loss": 0.4912, + "step": 59975 + }, + { + "epoch": 2.99, + "grad_norm": 7.959074974060059, + "learning_rate": 1.8825092741265712e-07, + "loss": 0.4481, + "step": 60000 + }, + { + "epoch": 2.99, + "grad_norm": 33.16374969482422, + "learning_rate": 1.6518096081796875e-07, + "loss": 0.4365, + "step": 60025 + }, + { + "epoch": 2.99, + "grad_norm": 1.0097713470458984, + "learning_rate": 1.4211099422328037e-07, + "loss": 0.4598, + "step": 60050 + }, + { + "epoch": 2.99, + "grad_norm": 13.992691993713379, + "learning_rate": 1.19041027628592e-07, + "loss": 0.4552, + "step": 60075 + }, + { + "epoch": 2.99, + "grad_norm": 3.1975209712982178, + "learning_rate": 9.597106103390363e-08, + "loss": 0.5176, + "step": 60100 + }, + { + "epoch": 3.0, + "grad_norm": 0.6553680896759033, + "learning_rate": 7.290109443921525e-08, + "loss": 0.4653, + "step": 60125 + }, + { + "epoch": 3.0, + "grad_norm": 31.342302322387695, + "learning_rate": 4.9831127844526877e-08, + "loss": 0.5265, + "step": 60150 + }, + { + "epoch": 3.0, + "grad_norm": 8.892498016357422, + "learning_rate": 2.676116124983851e-08, + "loss": 0.5413, + "step": 60175 + }, + { + "epoch": 3.0, + "grad_norm": 6.391012191772461, + "learning_rate": 3.691194655150139e-09, + "loss": 0.4561, + "step": 60200 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.8848137535816619, + "eval_f1_macro": 0.7960248281288549, + "eval_f1_micro": 0.8848137535816619, + "eval_f1_weighted": 0.8831232981645943, + "eval_loss": 0.41398245096206665, + "eval_precision_macro": 0.8400378915863742, + "eval_precision_micro": 0.8848137535816619, + "eval_precision_weighted": 0.8831401394863536, + "eval_recall_macro": 0.7672428418298921, + "eval_recall_micro": 0.8848137535816619, + "eval_recall_weighted": 0.8848137535816619, + "eval_runtime": 6833.944, + "eval_samples_per_second": 5.873, + "eval_steps_per_second": 0.367, + "step": 60204 + } + ], + "logging_steps": 25, + "max_steps": 60204, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 3.167977639141325e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}