{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6755852842809364, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013377926421404682, "grad_norm": 8.909825325012207, "learning_rate": 5e-06, "loss": 2.9362, "step": 1 }, { "epoch": 0.0026755852842809363, "grad_norm": 11.420860290527344, "learning_rate": 1e-05, "loss": 3.1217, "step": 2 }, { "epoch": 0.004013377926421404, "grad_norm": 9.437948226928711, "learning_rate": 1.5e-05, "loss": 2.9222, "step": 3 }, { "epoch": 0.005351170568561873, "grad_norm": 6.445765972137451, "learning_rate": 2e-05, "loss": 3.0696, "step": 4 }, { "epoch": 0.006688963210702341, "grad_norm": 4.019280433654785, "learning_rate": 2.5e-05, "loss": 2.8349, "step": 5 }, { "epoch": 0.008026755852842809, "grad_norm": 3.1477231979370117, "learning_rate": 3e-05, "loss": 2.5674, "step": 6 }, { "epoch": 0.009364548494983277, "grad_norm": 6.760244846343994, "learning_rate": 3.5e-05, "loss": 2.4331, "step": 7 }, { "epoch": 0.010702341137123745, "grad_norm": 2.688317060470581, "learning_rate": 4e-05, "loss": 2.5435, "step": 8 }, { "epoch": 0.012040133779264214, "grad_norm": 2.7984418869018555, "learning_rate": 4.5e-05, "loss": 2.2427, "step": 9 }, { "epoch": 0.013377926421404682, "grad_norm": 8.174599647521973, "learning_rate": 5e-05, "loss": 2.7228, "step": 10 }, { "epoch": 0.01471571906354515, "grad_norm": 1.9308923482894897, "learning_rate": 5.500000000000001e-05, "loss": 2.6318, "step": 11 }, { "epoch": 0.016053511705685617, "grad_norm": 2.391382932662964, "learning_rate": 6e-05, "loss": 2.1344, "step": 12 }, { "epoch": 0.017391304347826087, "grad_norm": 3.2888343334198, "learning_rate": 6.500000000000001e-05, "loss": 2.3624, "step": 13 }, { "epoch": 0.018729096989966554, "grad_norm": 2.822356939315796, "learning_rate": 7e-05, "loss": 2.1097, "step": 14 }, { "epoch": 0.020066889632107024, "grad_norm": 2.7134530544281006, "learning_rate": 7.500000000000001e-05, "loss": 2.2967, "step": 15 }, { "epoch": 0.02140468227424749, "grad_norm": 1.22122323513031, "learning_rate": 8e-05, "loss": 2.2179, "step": 16 }, { "epoch": 0.02274247491638796, "grad_norm": 1.7385879755020142, "learning_rate": 8.5e-05, "loss": 2.1567, "step": 17 }, { "epoch": 0.024080267558528427, "grad_norm": 1.5407123565673828, "learning_rate": 9e-05, "loss": 2.1197, "step": 18 }, { "epoch": 0.025418060200668897, "grad_norm": 1.191502571105957, "learning_rate": 9.5e-05, "loss": 2.2177, "step": 19 }, { "epoch": 0.026755852842809364, "grad_norm": 1.3412432670593262, "learning_rate": 0.0001, "loss": 2.0945, "step": 20 }, { "epoch": 0.028093645484949834, "grad_norm": 1.3845086097717285, "learning_rate": 9.995497523638001e-05, "loss": 2.03, "step": 21 }, { "epoch": 0.0294314381270903, "grad_norm": 1.0299506187438965, "learning_rate": 9.990995047276002e-05, "loss": 2.0461, "step": 22 }, { "epoch": 0.03076923076923077, "grad_norm": 1.3790980577468872, "learning_rate": 9.986492570914003e-05, "loss": 2.4299, "step": 23 }, { "epoch": 0.032107023411371234, "grad_norm": 1.0298329591751099, "learning_rate": 9.981990094552004e-05, "loss": 2.3317, "step": 24 }, { "epoch": 0.033444816053511704, "grad_norm": 1.129258155822754, "learning_rate": 9.977487618190005e-05, "loss": 2.0976, "step": 25 }, { "epoch": 0.034782608695652174, "grad_norm": 2.688994884490967, "learning_rate": 9.972985141828006e-05, "loss": 1.8448, "step": 26 }, { "epoch": 0.036120401337792644, "grad_norm": 0.8690654039382935, "learning_rate": 9.968482665466006e-05, "loss": 1.9989, "step": 27 }, { "epoch": 0.03745819397993311, "grad_norm": 1.0267047882080078, "learning_rate": 9.963980189104007e-05, "loss": 2.2423, "step": 28 }, { "epoch": 0.03879598662207358, "grad_norm": 0.9660020470619202, "learning_rate": 9.95947771274201e-05, "loss": 2.3757, "step": 29 }, { "epoch": 0.04013377926421405, "grad_norm": 0.8873792886734009, "learning_rate": 9.954975236380009e-05, "loss": 2.1465, "step": 30 }, { "epoch": 0.04147157190635452, "grad_norm": 4.266314506530762, "learning_rate": 9.95047276001801e-05, "loss": 2.1787, "step": 31 }, { "epoch": 0.04280936454849498, "grad_norm": 0.9623901844024658, "learning_rate": 9.945970283656011e-05, "loss": 2.1484, "step": 32 }, { "epoch": 0.04414715719063545, "grad_norm": 1.0520215034484863, "learning_rate": 9.941467807294013e-05, "loss": 2.2169, "step": 33 }, { "epoch": 0.04548494983277592, "grad_norm": 1.2446262836456299, "learning_rate": 9.936965330932014e-05, "loss": 2.4512, "step": 34 }, { "epoch": 0.046822742474916385, "grad_norm": 8.27096176147461, "learning_rate": 9.932462854570013e-05, "loss": 2.3712, "step": 35 }, { "epoch": 0.048160535117056855, "grad_norm": 1.682218074798584, "learning_rate": 9.927960378208014e-05, "loss": 2.1458, "step": 36 }, { "epoch": 0.049498327759197325, "grad_norm": 2.1129493713378906, "learning_rate": 9.923457901846016e-05, "loss": 2.0079, "step": 37 }, { "epoch": 0.050836120401337795, "grad_norm": 2.0923821926116943, "learning_rate": 9.918955425484017e-05, "loss": 1.9663, "step": 38 }, { "epoch": 0.05217391304347826, "grad_norm": 1.2081389427185059, "learning_rate": 9.914452949122017e-05, "loss": 1.9386, "step": 39 }, { "epoch": 0.05351170568561873, "grad_norm": 1.0379135608673096, "learning_rate": 9.909950472760019e-05, "loss": 2.0566, "step": 40 }, { "epoch": 0.0548494983277592, "grad_norm": 0.940050482749939, "learning_rate": 9.90544799639802e-05, "loss": 2.1417, "step": 41 }, { "epoch": 0.05618729096989967, "grad_norm": 0.8876029849052429, "learning_rate": 9.900945520036021e-05, "loss": 1.8754, "step": 42 }, { "epoch": 0.05752508361204013, "grad_norm": 0.8591219186782837, "learning_rate": 9.89644304367402e-05, "loss": 2.4408, "step": 43 }, { "epoch": 0.0588628762541806, "grad_norm": 1.1327685117721558, "learning_rate": 9.891940567312022e-05, "loss": 1.9898, "step": 44 }, { "epoch": 0.06020066889632107, "grad_norm": 0.9538917541503906, "learning_rate": 9.887438090950023e-05, "loss": 2.2119, "step": 45 }, { "epoch": 0.06153846153846154, "grad_norm": 1.2696901559829712, "learning_rate": 9.882935614588024e-05, "loss": 2.3051, "step": 46 }, { "epoch": 0.06287625418060201, "grad_norm": 0.92132169008255, "learning_rate": 9.878433138226025e-05, "loss": 2.4412, "step": 47 }, { "epoch": 0.06421404682274247, "grad_norm": 0.8168394565582275, "learning_rate": 9.873930661864026e-05, "loss": 2.0003, "step": 48 }, { "epoch": 0.06555183946488294, "grad_norm": 0.9975659251213074, "learning_rate": 9.869428185502027e-05, "loss": 2.3379, "step": 49 }, { "epoch": 0.06688963210702341, "grad_norm": 0.8086885213851929, "learning_rate": 9.864925709140028e-05, "loss": 2.0088, "step": 50 }, { "epoch": 0.06822742474916388, "grad_norm": 3.218966007232666, "learning_rate": 9.860423232778028e-05, "loss": 2.2297, "step": 51 }, { "epoch": 0.06956521739130435, "grad_norm": 0.8951109647750854, "learning_rate": 9.855920756416029e-05, "loss": 2.2432, "step": 52 }, { "epoch": 0.07090301003344482, "grad_norm": 0.8055227994918823, "learning_rate": 9.85141828005403e-05, "loss": 2.2097, "step": 53 }, { "epoch": 0.07224080267558529, "grad_norm": 0.7354897856712341, "learning_rate": 9.846915803692031e-05, "loss": 2.1813, "step": 54 }, { "epoch": 0.07357859531772576, "grad_norm": 1.6674742698669434, "learning_rate": 9.842413327330032e-05, "loss": 2.1639, "step": 55 }, { "epoch": 0.07491638795986622, "grad_norm": 30.052200317382812, "learning_rate": 9.837910850968033e-05, "loss": 2.9772, "step": 56 }, { "epoch": 0.07625418060200669, "grad_norm": 1.2769989967346191, "learning_rate": 9.833408374606034e-05, "loss": 2.1162, "step": 57 }, { "epoch": 0.07759197324414716, "grad_norm": 0.7706617116928101, "learning_rate": 9.828905898244036e-05, "loss": 2.2085, "step": 58 }, { "epoch": 0.07892976588628763, "grad_norm": 1.0603933334350586, "learning_rate": 9.824403421882035e-05, "loss": 1.5831, "step": 59 }, { "epoch": 0.0802675585284281, "grad_norm": 3.992206573486328, "learning_rate": 9.819900945520036e-05, "loss": 2.5495, "step": 60 }, { "epoch": 0.08160535117056857, "grad_norm": 0.6373528242111206, "learning_rate": 9.815398469158037e-05, "loss": 1.9227, "step": 61 }, { "epoch": 0.08294314381270904, "grad_norm": 0.8140257000923157, "learning_rate": 9.810895992796039e-05, "loss": 2.1962, "step": 62 }, { "epoch": 0.08428093645484949, "grad_norm": 1.3295791149139404, "learning_rate": 9.806393516434039e-05, "loss": 2.1865, "step": 63 }, { "epoch": 0.08561872909698996, "grad_norm": 1.176526427268982, "learning_rate": 9.80189104007204e-05, "loss": 2.0073, "step": 64 }, { "epoch": 0.08695652173913043, "grad_norm": 0.8747398257255554, "learning_rate": 9.797388563710042e-05, "loss": 2.332, "step": 65 }, { "epoch": 0.0882943143812709, "grad_norm": 0.7917225956916809, "learning_rate": 9.792886087348043e-05, "loss": 2.1264, "step": 66 }, { "epoch": 0.08963210702341137, "grad_norm": 0.6571652889251709, "learning_rate": 9.788383610986042e-05, "loss": 2.1755, "step": 67 }, { "epoch": 0.09096989966555184, "grad_norm": 0.7766551971435547, "learning_rate": 9.783881134624043e-05, "loss": 2.0205, "step": 68 }, { "epoch": 0.09230769230769231, "grad_norm": 0.8635064363479614, "learning_rate": 9.779378658262045e-05, "loss": 1.6129, "step": 69 }, { "epoch": 0.09364548494983277, "grad_norm": 1.7755194902420044, "learning_rate": 9.774876181900046e-05, "loss": 2.2523, "step": 70 }, { "epoch": 0.09498327759197324, "grad_norm": 0.8652092814445496, "learning_rate": 9.770373705538046e-05, "loss": 2.1025, "step": 71 }, { "epoch": 0.09632107023411371, "grad_norm": 0.7990190982818604, "learning_rate": 9.765871229176046e-05, "loss": 2.1477, "step": 72 }, { "epoch": 0.09765886287625418, "grad_norm": 3.000373125076294, "learning_rate": 9.761368752814049e-05, "loss": 1.8605, "step": 73 }, { "epoch": 0.09899665551839465, "grad_norm": 0.7712721228599548, "learning_rate": 9.75686627645205e-05, "loss": 2.2457, "step": 74 }, { "epoch": 0.10033444816053512, "grad_norm": 0.6429623365402222, "learning_rate": 9.752363800090049e-05, "loss": 2.1461, "step": 75 }, { "epoch": 0.10167224080267559, "grad_norm": 0.8219184875488281, "learning_rate": 9.747861323728051e-05, "loss": 2.1473, "step": 76 }, { "epoch": 0.10301003344481606, "grad_norm": 0.8963313102722168, "learning_rate": 9.743358847366052e-05, "loss": 1.9817, "step": 77 }, { "epoch": 0.10434782608695652, "grad_norm": 1.6411892175674438, "learning_rate": 9.738856371004053e-05, "loss": 1.8849, "step": 78 }, { "epoch": 0.10568561872909699, "grad_norm": 0.8379344344139099, "learning_rate": 9.734353894642053e-05, "loss": 2.226, "step": 79 }, { "epoch": 0.10702341137123746, "grad_norm": 1.2867939472198486, "learning_rate": 9.729851418280055e-05, "loss": 2.3098, "step": 80 }, { "epoch": 0.10836120401337793, "grad_norm": 0.8235902190208435, "learning_rate": 9.725348941918056e-05, "loss": 2.0342, "step": 81 }, { "epoch": 0.1096989966555184, "grad_norm": 2.49385142326355, "learning_rate": 9.720846465556056e-05, "loss": 1.7611, "step": 82 }, { "epoch": 0.11103678929765887, "grad_norm": 0.8160635232925415, "learning_rate": 9.716343989194057e-05, "loss": 2.1493, "step": 83 }, { "epoch": 0.11237458193979934, "grad_norm": 0.739782452583313, "learning_rate": 9.711841512832058e-05, "loss": 2.1616, "step": 84 }, { "epoch": 0.11371237458193979, "grad_norm": 2.4206647872924805, "learning_rate": 9.707339036470059e-05, "loss": 2.0272, "step": 85 }, { "epoch": 0.11505016722408026, "grad_norm": 1.1969271898269653, "learning_rate": 9.70283656010806e-05, "loss": 2.2563, "step": 86 }, { "epoch": 0.11638795986622073, "grad_norm": 1.1001033782958984, "learning_rate": 9.698334083746061e-05, "loss": 2.2338, "step": 87 }, { "epoch": 0.1177257525083612, "grad_norm": 0.660322368144989, "learning_rate": 9.693831607384062e-05, "loss": 2.1892, "step": 88 }, { "epoch": 0.11906354515050167, "grad_norm": 0.7543382048606873, "learning_rate": 9.689329131022062e-05, "loss": 2.1808, "step": 89 }, { "epoch": 0.12040133779264214, "grad_norm": 0.6813179850578308, "learning_rate": 9.684826654660063e-05, "loss": 2.4539, "step": 90 }, { "epoch": 0.12173913043478261, "grad_norm": 0.7714611887931824, "learning_rate": 9.680324178298064e-05, "loss": 2.1391, "step": 91 }, { "epoch": 0.12307692307692308, "grad_norm": 0.8586478233337402, "learning_rate": 9.675821701936065e-05, "loss": 2.2578, "step": 92 }, { "epoch": 0.12441471571906354, "grad_norm": 0.6438114643096924, "learning_rate": 9.671319225574066e-05, "loss": 1.9993, "step": 93 }, { "epoch": 0.12575250836120402, "grad_norm": 0.7894206643104553, "learning_rate": 9.666816749212068e-05, "loss": 1.7321, "step": 94 }, { "epoch": 0.12709030100334448, "grad_norm": 1.143427848815918, "learning_rate": 9.662314272850068e-05, "loss": 1.93, "step": 95 }, { "epoch": 0.12842809364548494, "grad_norm": 0.6587496399879456, "learning_rate": 9.657811796488068e-05, "loss": 1.6182, "step": 96 }, { "epoch": 0.12976588628762542, "grad_norm": 0.8149839639663696, "learning_rate": 9.653309320126069e-05, "loss": 1.957, "step": 97 }, { "epoch": 0.13110367892976588, "grad_norm": 0.6830907464027405, "learning_rate": 9.648806843764072e-05, "loss": 2.6174, "step": 98 }, { "epoch": 0.13244147157190636, "grad_norm": 0.8395177721977234, "learning_rate": 9.644304367402071e-05, "loss": 2.2876, "step": 99 }, { "epoch": 0.13377926421404682, "grad_norm": 0.723379373550415, "learning_rate": 9.639801891040072e-05, "loss": 1.9577, "step": 100 }, { "epoch": 0.1351170568561873, "grad_norm": 1.305256962776184, "learning_rate": 9.635299414678074e-05, "loss": 1.787, "step": 101 }, { "epoch": 0.13645484949832776, "grad_norm": 1.9760712385177612, "learning_rate": 9.630796938316075e-05, "loss": 1.877, "step": 102 }, { "epoch": 0.13779264214046824, "grad_norm": 1.0222994089126587, "learning_rate": 9.626294461954074e-05, "loss": 2.1143, "step": 103 }, { "epoch": 0.1391304347826087, "grad_norm": 1.173228144645691, "learning_rate": 9.621791985592075e-05, "loss": 2.4299, "step": 104 }, { "epoch": 0.14046822742474915, "grad_norm": 0.8381727337837219, "learning_rate": 9.617289509230078e-05, "loss": 2.0116, "step": 105 }, { "epoch": 0.14180602006688964, "grad_norm": 0.8412324786186218, "learning_rate": 9.612787032868078e-05, "loss": 2.1727, "step": 106 }, { "epoch": 0.1431438127090301, "grad_norm": 1.3966891765594482, "learning_rate": 9.608284556506079e-05, "loss": 1.7429, "step": 107 }, { "epoch": 0.14448160535117058, "grad_norm": 0.860106885433197, "learning_rate": 9.603782080144079e-05, "loss": 2.2786, "step": 108 }, { "epoch": 0.14581939799331103, "grad_norm": 2.258957862854004, "learning_rate": 9.599279603782081e-05, "loss": 1.8781, "step": 109 }, { "epoch": 0.14715719063545152, "grad_norm": 0.8375409245491028, "learning_rate": 9.594777127420082e-05, "loss": 2.0776, "step": 110 }, { "epoch": 0.14849498327759197, "grad_norm": 0.6946620345115662, "learning_rate": 9.590274651058083e-05, "loss": 1.9919, "step": 111 }, { "epoch": 0.14983277591973243, "grad_norm": 0.7830926775932312, "learning_rate": 9.585772174696084e-05, "loss": 2.1972, "step": 112 }, { "epoch": 0.15117056856187291, "grad_norm": 0.7598808407783508, "learning_rate": 9.581269698334084e-05, "loss": 2.2484, "step": 113 }, { "epoch": 0.15250836120401337, "grad_norm": 1.3570846319198608, "learning_rate": 9.576767221972085e-05, "loss": 2.4063, "step": 114 }, { "epoch": 0.15384615384615385, "grad_norm": 0.9569585919380188, "learning_rate": 9.572264745610086e-05, "loss": 2.0047, "step": 115 }, { "epoch": 0.1551839464882943, "grad_norm": 0.6475653648376465, "learning_rate": 9.567762269248087e-05, "loss": 2.1048, "step": 116 }, { "epoch": 0.1565217391304348, "grad_norm": 1.0348844528198242, "learning_rate": 9.563259792886088e-05, "loss": 2.1492, "step": 117 }, { "epoch": 0.15785953177257525, "grad_norm": 1.5975306034088135, "learning_rate": 9.558757316524089e-05, "loss": 2.2867, "step": 118 }, { "epoch": 0.1591973244147157, "grad_norm": 0.7629224061965942, "learning_rate": 9.55425484016209e-05, "loss": 2.5327, "step": 119 }, { "epoch": 0.1605351170568562, "grad_norm": 7.820376396179199, "learning_rate": 9.54975236380009e-05, "loss": 1.9083, "step": 120 }, { "epoch": 0.16187290969899665, "grad_norm": 0.7340837121009827, "learning_rate": 9.545249887438091e-05, "loss": 2.1577, "step": 121 }, { "epoch": 0.16321070234113713, "grad_norm": 0.772908627986908, "learning_rate": 9.540747411076092e-05, "loss": 2.0281, "step": 122 }, { "epoch": 0.1645484949832776, "grad_norm": 0.6994909048080444, "learning_rate": 9.536244934714093e-05, "loss": 1.7673, "step": 123 }, { "epoch": 0.16588628762541807, "grad_norm": 4.215782642364502, "learning_rate": 9.531742458352094e-05, "loss": 2.0693, "step": 124 }, { "epoch": 0.16722408026755853, "grad_norm": 0.8319055438041687, "learning_rate": 9.527239981990095e-05, "loss": 2.1013, "step": 125 }, { "epoch": 0.16856187290969898, "grad_norm": 0.720665454864502, "learning_rate": 9.522737505628096e-05, "loss": 2.4987, "step": 126 }, { "epoch": 0.16989966555183947, "grad_norm": 0.7007707953453064, "learning_rate": 9.518235029266098e-05, "loss": 1.9992, "step": 127 }, { "epoch": 0.17123745819397992, "grad_norm": 1.1250567436218262, "learning_rate": 9.513732552904097e-05, "loss": 2.3214, "step": 128 }, { "epoch": 0.1725752508361204, "grad_norm": 1.054378628730774, "learning_rate": 9.509230076542098e-05, "loss": 2.3111, "step": 129 }, { "epoch": 0.17391304347826086, "grad_norm": 0.8713865876197815, "learning_rate": 9.5047276001801e-05, "loss": 1.4661, "step": 130 }, { "epoch": 0.17525083612040135, "grad_norm": 1.2222834825515747, "learning_rate": 9.500225123818101e-05, "loss": 2.0313, "step": 131 }, { "epoch": 0.1765886287625418, "grad_norm": 0.7791270017623901, "learning_rate": 9.495722647456101e-05, "loss": 1.8948, "step": 132 }, { "epoch": 0.17792642140468226, "grad_norm": 0.7660050392150879, "learning_rate": 9.491220171094102e-05, "loss": 1.5712, "step": 133 }, { "epoch": 0.17926421404682275, "grad_norm": 2.549402952194214, "learning_rate": 9.486717694732104e-05, "loss": 2.3461, "step": 134 }, { "epoch": 0.1806020066889632, "grad_norm": 0.8216731548309326, "learning_rate": 9.482215218370105e-05, "loss": 2.1653, "step": 135 }, { "epoch": 0.18193979933110369, "grad_norm": 1.1273239850997925, "learning_rate": 9.477712742008104e-05, "loss": 2.0003, "step": 136 }, { "epoch": 0.18327759197324414, "grad_norm": 1.4746992588043213, "learning_rate": 9.473210265646106e-05, "loss": 1.8176, "step": 137 }, { "epoch": 0.18461538461538463, "grad_norm": 0.6299614906311035, "learning_rate": 9.468707789284107e-05, "loss": 1.9021, "step": 138 }, { "epoch": 0.18595317725752508, "grad_norm": 0.6752821207046509, "learning_rate": 9.464205312922108e-05, "loss": 1.5829, "step": 139 }, { "epoch": 0.18729096989966554, "grad_norm": 0.8192165493965149, "learning_rate": 9.459702836560108e-05, "loss": 2.093, "step": 140 }, { "epoch": 0.18862876254180602, "grad_norm": 0.7584500312805176, "learning_rate": 9.45520036019811e-05, "loss": 2.2507, "step": 141 }, { "epoch": 0.18996655518394648, "grad_norm": 0.6749038100242615, "learning_rate": 9.45069788383611e-05, "loss": 2.1704, "step": 142 }, { "epoch": 0.19130434782608696, "grad_norm": 8.133386611938477, "learning_rate": 9.446195407474112e-05, "loss": 2.121, "step": 143 }, { "epoch": 0.19264214046822742, "grad_norm": 0.7231702208518982, "learning_rate": 9.441692931112111e-05, "loss": 2.1596, "step": 144 }, { "epoch": 0.1939799331103679, "grad_norm": 1.4290648698806763, "learning_rate": 9.437190454750113e-05, "loss": 2.0232, "step": 145 }, { "epoch": 0.19531772575250836, "grad_norm": 0.7707876563072205, "learning_rate": 9.432687978388114e-05, "loss": 2.408, "step": 146 }, { "epoch": 0.19665551839464884, "grad_norm": 0.6868302822113037, "learning_rate": 9.428185502026115e-05, "loss": 2.1884, "step": 147 }, { "epoch": 0.1979933110367893, "grad_norm": 1.0088449716567993, "learning_rate": 9.423683025664116e-05, "loss": 2.3152, "step": 148 }, { "epoch": 0.19933110367892976, "grad_norm": 0.7136611938476562, "learning_rate": 9.419180549302117e-05, "loss": 2.0639, "step": 149 }, { "epoch": 0.20066889632107024, "grad_norm": 0.7562902569770813, "learning_rate": 9.414678072940118e-05, "loss": 2.1516, "step": 150 }, { "epoch": 0.2020066889632107, "grad_norm": 0.7779235243797302, "learning_rate": 9.410175596578118e-05, "loss": 2.0172, "step": 151 }, { "epoch": 0.20334448160535118, "grad_norm": 0.7270732522010803, "learning_rate": 9.405673120216119e-05, "loss": 2.5632, "step": 152 }, { "epoch": 0.20468227424749164, "grad_norm": 6.782646179199219, "learning_rate": 9.40117064385412e-05, "loss": 1.9557, "step": 153 }, { "epoch": 0.20602006688963212, "grad_norm": 0.8244476318359375, "learning_rate": 9.396668167492121e-05, "loss": 2.2257, "step": 154 }, { "epoch": 0.20735785953177258, "grad_norm": 0.7395358681678772, "learning_rate": 9.392165691130123e-05, "loss": 1.8494, "step": 155 }, { "epoch": 0.20869565217391303, "grad_norm": 1.2709623575210571, "learning_rate": 9.387663214768123e-05, "loss": 2.579, "step": 156 }, { "epoch": 0.21003344481605352, "grad_norm": 0.8247620463371277, "learning_rate": 9.383160738406124e-05, "loss": 2.2475, "step": 157 }, { "epoch": 0.21137123745819397, "grad_norm": 0.7007076144218445, "learning_rate": 9.378658262044124e-05, "loss": 2.1498, "step": 158 }, { "epoch": 0.21270903010033446, "grad_norm": 0.6729087829589844, "learning_rate": 9.374155785682127e-05, "loss": 1.8659, "step": 159 }, { "epoch": 0.2140468227424749, "grad_norm": 0.8817248344421387, "learning_rate": 9.369653309320126e-05, "loss": 2.2277, "step": 160 }, { "epoch": 0.2153846153846154, "grad_norm": 0.660122275352478, "learning_rate": 9.365150832958127e-05, "loss": 2.3238, "step": 161 }, { "epoch": 0.21672240802675585, "grad_norm": 0.7546793818473816, "learning_rate": 9.360648356596128e-05, "loss": 2.3036, "step": 162 }, { "epoch": 0.2180602006688963, "grad_norm": 1.4873573780059814, "learning_rate": 9.35614588023413e-05, "loss": 2.0092, "step": 163 }, { "epoch": 0.2193979933110368, "grad_norm": 0.8722738027572632, "learning_rate": 9.35164340387213e-05, "loss": 1.6106, "step": 164 }, { "epoch": 0.22073578595317725, "grad_norm": 0.7043030858039856, "learning_rate": 9.34714092751013e-05, "loss": 1.7849, "step": 165 }, { "epoch": 0.22207357859531773, "grad_norm": 0.8679320216178894, "learning_rate": 9.342638451148133e-05, "loss": 1.816, "step": 166 }, { "epoch": 0.2234113712374582, "grad_norm": 0.6752539873123169, "learning_rate": 9.338135974786133e-05, "loss": 1.9456, "step": 167 }, { "epoch": 0.22474916387959867, "grad_norm": 0.8796198964118958, "learning_rate": 9.333633498424133e-05, "loss": 2.0501, "step": 168 }, { "epoch": 0.22608695652173913, "grad_norm": 1.6531866788864136, "learning_rate": 9.329131022062134e-05, "loss": 1.9587, "step": 169 }, { "epoch": 0.22742474916387959, "grad_norm": 0.6475427746772766, "learning_rate": 9.324628545700136e-05, "loss": 2.0387, "step": 170 }, { "epoch": 0.22876254180602007, "grad_norm": 0.79718416929245, "learning_rate": 9.320126069338137e-05, "loss": 1.9722, "step": 171 }, { "epoch": 0.23010033444816053, "grad_norm": 0.6928245425224304, "learning_rate": 9.315623592976136e-05, "loss": 2.2319, "step": 172 }, { "epoch": 0.231438127090301, "grad_norm": 0.8756166100502014, "learning_rate": 9.311121116614139e-05, "loss": 2.2756, "step": 173 }, { "epoch": 0.23277591973244147, "grad_norm": 0.7787219285964966, "learning_rate": 9.30661864025214e-05, "loss": 1.9229, "step": 174 }, { "epoch": 0.23411371237458195, "grad_norm": 1.0068119764328003, "learning_rate": 9.30211616389014e-05, "loss": 1.5486, "step": 175 }, { "epoch": 0.2354515050167224, "grad_norm": 0.6787288188934326, "learning_rate": 9.29761368752814e-05, "loss": 2.2945, "step": 176 }, { "epoch": 0.23678929765886286, "grad_norm": 0.6843809485435486, "learning_rate": 9.293111211166142e-05, "loss": 2.148, "step": 177 }, { "epoch": 0.23812709030100335, "grad_norm": 0.6215639114379883, "learning_rate": 9.288608734804143e-05, "loss": 2.2645, "step": 178 }, { "epoch": 0.2394648829431438, "grad_norm": 0.6082992553710938, "learning_rate": 9.284106258442144e-05, "loss": 2.0593, "step": 179 }, { "epoch": 0.2408026755852843, "grad_norm": 1.1877750158309937, "learning_rate": 9.279603782080145e-05, "loss": 2.0483, "step": 180 }, { "epoch": 0.24214046822742474, "grad_norm": 1.0917643308639526, "learning_rate": 9.275101305718146e-05, "loss": 2.2408, "step": 181 }, { "epoch": 0.24347826086956523, "grad_norm": 1.3530385494232178, "learning_rate": 9.270598829356146e-05, "loss": 2.0472, "step": 182 }, { "epoch": 0.24481605351170568, "grad_norm": 0.7971617579460144, "learning_rate": 9.266096352994147e-05, "loss": 2.2679, "step": 183 }, { "epoch": 0.24615384615384617, "grad_norm": 0.9468250870704651, "learning_rate": 9.261593876632148e-05, "loss": 2.2528, "step": 184 }, { "epoch": 0.24749163879598662, "grad_norm": 0.9091652035713196, "learning_rate": 9.257091400270149e-05, "loss": 2.2659, "step": 185 }, { "epoch": 0.24882943143812708, "grad_norm": 0.708520770072937, "learning_rate": 9.25258892390815e-05, "loss": 1.8455, "step": 186 }, { "epoch": 0.25016722408026754, "grad_norm": 1.3594329357147217, "learning_rate": 9.24808644754615e-05, "loss": 1.8394, "step": 187 }, { "epoch": 0.25150501672240805, "grad_norm": 0.7608739733695984, "learning_rate": 9.243583971184152e-05, "loss": 2.3127, "step": 188 }, { "epoch": 0.2528428093645485, "grad_norm": 0.8208618760108948, "learning_rate": 9.239081494822152e-05, "loss": 2.0739, "step": 189 }, { "epoch": 0.25418060200668896, "grad_norm": 4.823574542999268, "learning_rate": 9.234579018460153e-05, "loss": 2.3519, "step": 190 }, { "epoch": 0.2555183946488294, "grad_norm": 1.364988923072815, "learning_rate": 9.230076542098155e-05, "loss": 2.1007, "step": 191 }, { "epoch": 0.2568561872909699, "grad_norm": 0.6934626698493958, "learning_rate": 9.225574065736155e-05, "loss": 2.1424, "step": 192 }, { "epoch": 0.2581939799331104, "grad_norm": 0.6626648306846619, "learning_rate": 9.221071589374156e-05, "loss": 1.6056, "step": 193 }, { "epoch": 0.25953177257525084, "grad_norm": 0.6442323923110962, "learning_rate": 9.216569113012157e-05, "loss": 2.1698, "step": 194 }, { "epoch": 0.2608695652173913, "grad_norm": 1.5775187015533447, "learning_rate": 9.212066636650159e-05, "loss": 2.2164, "step": 195 }, { "epoch": 0.26220735785953175, "grad_norm": 0.5490379929542542, "learning_rate": 9.207564160288158e-05, "loss": 2.5528, "step": 196 }, { "epoch": 0.26354515050167227, "grad_norm": 0.6842154860496521, "learning_rate": 9.203061683926159e-05, "loss": 2.1731, "step": 197 }, { "epoch": 0.2648829431438127, "grad_norm": 0.5994325280189514, "learning_rate": 9.19855920756416e-05, "loss": 2.2696, "step": 198 }, { "epoch": 0.2662207357859532, "grad_norm": 0.7480664253234863, "learning_rate": 9.194056731202162e-05, "loss": 2.1391, "step": 199 }, { "epoch": 0.26755852842809363, "grad_norm": 0.7724633812904358, "learning_rate": 9.189554254840163e-05, "loss": 2.4159, "step": 200 }, { "epoch": 0.2688963210702341, "grad_norm": 0.7571853995323181, "learning_rate": 9.185051778478163e-05, "loss": 1.6681, "step": 201 }, { "epoch": 0.2702341137123746, "grad_norm": 0.7273426651954651, "learning_rate": 9.180549302116165e-05, "loss": 2.529, "step": 202 }, { "epoch": 0.27157190635451506, "grad_norm": 0.8330695629119873, "learning_rate": 9.176046825754166e-05, "loss": 2.2551, "step": 203 }, { "epoch": 0.2729096989966555, "grad_norm": 0.6210842132568359, "learning_rate": 9.171544349392167e-05, "loss": 2.0951, "step": 204 }, { "epoch": 0.27424749163879597, "grad_norm": 0.6045663356781006, "learning_rate": 9.167041873030166e-05, "loss": 1.6797, "step": 205 }, { "epoch": 0.2755852842809365, "grad_norm": 0.7505138516426086, "learning_rate": 9.162539396668168e-05, "loss": 1.862, "step": 206 }, { "epoch": 0.27692307692307694, "grad_norm": 0.77651047706604, "learning_rate": 9.158036920306169e-05, "loss": 2.2327, "step": 207 }, { "epoch": 0.2782608695652174, "grad_norm": 0.7325896620750427, "learning_rate": 9.15353444394417e-05, "loss": 1.8543, "step": 208 }, { "epoch": 0.27959866220735785, "grad_norm": 0.7572547793388367, "learning_rate": 9.149031967582171e-05, "loss": 2.2101, "step": 209 }, { "epoch": 0.2809364548494983, "grad_norm": 0.6754357814788818, "learning_rate": 9.144529491220172e-05, "loss": 2.2774, "step": 210 }, { "epoch": 0.2822742474916388, "grad_norm": 0.7105769515037537, "learning_rate": 9.140027014858173e-05, "loss": 2.0553, "step": 211 }, { "epoch": 0.2836120401337793, "grad_norm": 0.5948776602745056, "learning_rate": 9.135524538496173e-05, "loss": 2.1455, "step": 212 }, { "epoch": 0.28494983277591973, "grad_norm": 0.6252536177635193, "learning_rate": 9.131022062134174e-05, "loss": 1.7525, "step": 213 }, { "epoch": 0.2862876254180602, "grad_norm": 0.5089226961135864, "learning_rate": 9.126519585772175e-05, "loss": 1.8441, "step": 214 }, { "epoch": 0.28762541806020064, "grad_norm": 0.7862734198570251, "learning_rate": 9.122017109410176e-05, "loss": 2.1025, "step": 215 }, { "epoch": 0.28896321070234116, "grad_norm": 1.1257396936416626, "learning_rate": 9.117514633048177e-05, "loss": 1.8203, "step": 216 }, { "epoch": 0.2903010033444816, "grad_norm": 0.7163311243057251, "learning_rate": 9.113012156686178e-05, "loss": 2.513, "step": 217 }, { "epoch": 0.29163879598662207, "grad_norm": 0.5419270396232605, "learning_rate": 9.108509680324179e-05, "loss": 2.233, "step": 218 }, { "epoch": 0.2929765886287625, "grad_norm": 0.6821555495262146, "learning_rate": 9.10400720396218e-05, "loss": 2.0814, "step": 219 }, { "epoch": 0.29431438127090304, "grad_norm": 0.7533456087112427, "learning_rate": 9.09950472760018e-05, "loss": 1.9754, "step": 220 }, { "epoch": 0.2956521739130435, "grad_norm": 1.0426667928695679, "learning_rate": 9.095002251238181e-05, "loss": 1.8963, "step": 221 }, { "epoch": 0.29698996655518395, "grad_norm": 0.725765585899353, "learning_rate": 9.090499774876182e-05, "loss": 2.0239, "step": 222 }, { "epoch": 0.2983277591973244, "grad_norm": 0.5403796434402466, "learning_rate": 9.085997298514183e-05, "loss": 1.8004, "step": 223 }, { "epoch": 0.29966555183946486, "grad_norm": 0.7222073078155518, "learning_rate": 9.081494822152185e-05, "loss": 1.9841, "step": 224 }, { "epoch": 0.3010033444816054, "grad_norm": 1.1086597442626953, "learning_rate": 9.076992345790185e-05, "loss": 1.9085, "step": 225 }, { "epoch": 0.30234113712374583, "grad_norm": 1.6131938695907593, "learning_rate": 9.072489869428186e-05, "loss": 2.2787, "step": 226 }, { "epoch": 0.3036789297658863, "grad_norm": 0.686077356338501, "learning_rate": 9.067987393066188e-05, "loss": 1.8902, "step": 227 }, { "epoch": 0.30501672240802674, "grad_norm": 0.6241892576217651, "learning_rate": 9.063484916704189e-05, "loss": 2.2297, "step": 228 }, { "epoch": 0.3063545150501672, "grad_norm": 0.522357165813446, "learning_rate": 9.058982440342188e-05, "loss": 2.291, "step": 229 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5207152962684631, "learning_rate": 9.054479963980189e-05, "loss": 2.0928, "step": 230 }, { "epoch": 0.30903010033444817, "grad_norm": 0.6844892501831055, "learning_rate": 9.049977487618191e-05, "loss": 2.187, "step": 231 }, { "epoch": 0.3103678929765886, "grad_norm": 0.7186736464500427, "learning_rate": 9.045475011256192e-05, "loss": 2.0874, "step": 232 }, { "epoch": 0.3117056856187291, "grad_norm": 0.7187355160713196, "learning_rate": 9.040972534894192e-05, "loss": 1.9839, "step": 233 }, { "epoch": 0.3130434782608696, "grad_norm": 0.5916269421577454, "learning_rate": 9.036470058532192e-05, "loss": 2.1944, "step": 234 }, { "epoch": 0.31438127090301005, "grad_norm": 0.6687160134315491, "learning_rate": 9.031967582170195e-05, "loss": 1.9731, "step": 235 }, { "epoch": 0.3157190635451505, "grad_norm": 0.7072650790214539, "learning_rate": 9.027465105808195e-05, "loss": 1.8072, "step": 236 }, { "epoch": 0.31705685618729096, "grad_norm": 1.014660120010376, "learning_rate": 9.022962629446195e-05, "loss": 1.8828, "step": 237 }, { "epoch": 0.3183946488294314, "grad_norm": 1.4133497476577759, "learning_rate": 9.018460153084197e-05, "loss": 2.312, "step": 238 }, { "epoch": 0.3197324414715719, "grad_norm": 0.8173983693122864, "learning_rate": 9.013957676722198e-05, "loss": 2.0622, "step": 239 }, { "epoch": 0.3210702341137124, "grad_norm": 0.5996513962745667, "learning_rate": 9.009455200360199e-05, "loss": 2.2583, "step": 240 }, { "epoch": 0.32240802675585284, "grad_norm": 0.9969286918640137, "learning_rate": 9.004952723998198e-05, "loss": 2.1541, "step": 241 }, { "epoch": 0.3237458193979933, "grad_norm": 1.5697412490844727, "learning_rate": 9.0004502476362e-05, "loss": 1.8842, "step": 242 }, { "epoch": 0.3250836120401338, "grad_norm": 0.844559907913208, "learning_rate": 8.995947771274201e-05, "loss": 2.2494, "step": 243 }, { "epoch": 0.32642140468227426, "grad_norm": 0.8279430866241455, "learning_rate": 8.991445294912202e-05, "loss": 2.0218, "step": 244 }, { "epoch": 0.3277591973244147, "grad_norm": 0.5577782392501831, "learning_rate": 8.986942818550203e-05, "loss": 2.1826, "step": 245 }, { "epoch": 0.3290969899665552, "grad_norm": 0.6767871975898743, "learning_rate": 8.982440342188204e-05, "loss": 1.6723, "step": 246 }, { "epoch": 0.33043478260869563, "grad_norm": 0.7155217528343201, "learning_rate": 8.977937865826205e-05, "loss": 1.9627, "step": 247 }, { "epoch": 0.33177257525083614, "grad_norm": 0.7340490818023682, "learning_rate": 8.973435389464206e-05, "loss": 2.1307, "step": 248 }, { "epoch": 0.3331103678929766, "grad_norm": 1.0434596538543701, "learning_rate": 8.968932913102207e-05, "loss": 1.7668, "step": 249 }, { "epoch": 0.33444816053511706, "grad_norm": 0.7316417694091797, "learning_rate": 8.964430436740207e-05, "loss": 2.0947, "step": 250 }, { "epoch": 0.3357859531772575, "grad_norm": 0.6342126131057739, "learning_rate": 8.959927960378208e-05, "loss": 2.1559, "step": 251 }, { "epoch": 0.33712374581939797, "grad_norm": 0.6641095280647278, "learning_rate": 8.955425484016209e-05, "loss": 2.1736, "step": 252 }, { "epoch": 0.3384615384615385, "grad_norm": 0.6803715229034424, "learning_rate": 8.95092300765421e-05, "loss": 2.3727, "step": 253 }, { "epoch": 0.33979933110367894, "grad_norm": 0.6433061957359314, "learning_rate": 8.946420531292211e-05, "loss": 2.0082, "step": 254 }, { "epoch": 0.3411371237458194, "grad_norm": 0.7468423247337341, "learning_rate": 8.941918054930212e-05, "loss": 2.2016, "step": 255 }, { "epoch": 0.34247491638795985, "grad_norm": 1.1716619729995728, "learning_rate": 8.937415578568214e-05, "loss": 2.1824, "step": 256 }, { "epoch": 0.34381270903010036, "grad_norm": 3.4333786964416504, "learning_rate": 8.932913102206213e-05, "loss": 1.9569, "step": 257 }, { "epoch": 0.3451505016722408, "grad_norm": 0.7647579312324524, "learning_rate": 8.928410625844214e-05, "loss": 2.1416, "step": 258 }, { "epoch": 0.3464882943143813, "grad_norm": 0.7500723600387573, "learning_rate": 8.923908149482215e-05, "loss": 2.1495, "step": 259 }, { "epoch": 0.34782608695652173, "grad_norm": 0.7312958240509033, "learning_rate": 8.919405673120217e-05, "loss": 2.136, "step": 260 }, { "epoch": 0.3491638795986622, "grad_norm": 0.7841158509254456, "learning_rate": 8.914903196758217e-05, "loss": 1.837, "step": 261 }, { "epoch": 0.3505016722408027, "grad_norm": 1.176447868347168, "learning_rate": 8.910400720396218e-05, "loss": 2.1528, "step": 262 }, { "epoch": 0.35183946488294315, "grad_norm": 0.694179117679596, "learning_rate": 8.90589824403422e-05, "loss": 2.1408, "step": 263 }, { "epoch": 0.3531772575250836, "grad_norm": 0.6742585301399231, "learning_rate": 8.901395767672221e-05, "loss": 2.3309, "step": 264 }, { "epoch": 0.35451505016722407, "grad_norm": 10.892035484313965, "learning_rate": 8.89689329131022e-05, "loss": 2.6335, "step": 265 }, { "epoch": 0.3558528428093645, "grad_norm": 0.718368411064148, "learning_rate": 8.892390814948221e-05, "loss": 2.2006, "step": 266 }, { "epoch": 0.35719063545150503, "grad_norm": 0.9842844605445862, "learning_rate": 8.887888338586223e-05, "loss": 1.8219, "step": 267 }, { "epoch": 0.3585284280936455, "grad_norm": 0.6319013237953186, "learning_rate": 8.883385862224224e-05, "loss": 2.0513, "step": 268 }, { "epoch": 0.35986622073578595, "grad_norm": 0.7445176243782043, "learning_rate": 8.878883385862224e-05, "loss": 1.9484, "step": 269 }, { "epoch": 0.3612040133779264, "grad_norm": 0.7307112216949463, "learning_rate": 8.874380909500225e-05, "loss": 2.2374, "step": 270 }, { "epoch": 0.3625418060200669, "grad_norm": 0.6374862790107727, "learning_rate": 8.869878433138227e-05, "loss": 2.0784, "step": 271 }, { "epoch": 0.36387959866220737, "grad_norm": 0.6017893552780151, "learning_rate": 8.865375956776228e-05, "loss": 2.0, "step": 272 }, { "epoch": 0.3652173913043478, "grad_norm": 1.2405426502227783, "learning_rate": 8.860873480414229e-05, "loss": 2.3654, "step": 273 }, { "epoch": 0.3665551839464883, "grad_norm": 0.8560160398483276, "learning_rate": 8.85637100405223e-05, "loss": 2.181, "step": 274 }, { "epoch": 0.36789297658862874, "grad_norm": 1.1745796203613281, "learning_rate": 8.85186852769023e-05, "loss": 2.2952, "step": 275 }, { "epoch": 0.36923076923076925, "grad_norm": 0.6238117814064026, "learning_rate": 8.847366051328231e-05, "loss": 2.2203, "step": 276 }, { "epoch": 0.3705685618729097, "grad_norm": 1.2902181148529053, "learning_rate": 8.842863574966232e-05, "loss": 2.07, "step": 277 }, { "epoch": 0.37190635451505016, "grad_norm": 2.4692065715789795, "learning_rate": 8.838361098604233e-05, "loss": 1.4593, "step": 278 }, { "epoch": 0.3732441471571906, "grad_norm": 0.7107476592063904, "learning_rate": 8.833858622242234e-05, "loss": 2.1197, "step": 279 }, { "epoch": 0.3745819397993311, "grad_norm": 0.6064779162406921, "learning_rate": 8.829356145880235e-05, "loss": 2.1686, "step": 280 }, { "epoch": 0.3759197324414716, "grad_norm": 0.9871097207069397, "learning_rate": 8.824853669518235e-05, "loss": 2.1, "step": 281 }, { "epoch": 0.37725752508361204, "grad_norm": 0.7217766642570496, "learning_rate": 8.820351193156236e-05, "loss": 1.8677, "step": 282 }, { "epoch": 0.3785953177257525, "grad_norm": 0.6632222533226013, "learning_rate": 8.815848716794237e-05, "loss": 1.9146, "step": 283 }, { "epoch": 0.37993311036789296, "grad_norm": 1.2834292650222778, "learning_rate": 8.811346240432238e-05, "loss": 2.2682, "step": 284 }, { "epoch": 0.38127090301003347, "grad_norm": 0.6314417719841003, "learning_rate": 8.806843764070239e-05, "loss": 2.0938, "step": 285 }, { "epoch": 0.3826086956521739, "grad_norm": 0.8179631233215332, "learning_rate": 8.80234128770824e-05, "loss": 2.308, "step": 286 }, { "epoch": 0.3839464882943144, "grad_norm": 0.6403224468231201, "learning_rate": 8.79783881134624e-05, "loss": 2.186, "step": 287 }, { "epoch": 0.38528428093645484, "grad_norm": 0.6534366011619568, "learning_rate": 8.793336334984241e-05, "loss": 2.0692, "step": 288 }, { "epoch": 0.3866220735785953, "grad_norm": 0.7949046492576599, "learning_rate": 8.788833858622242e-05, "loss": 2.1176, "step": 289 }, { "epoch": 0.3879598662207358, "grad_norm": 0.5939118266105652, "learning_rate": 8.784331382260243e-05, "loss": 2.3326, "step": 290 }, { "epoch": 0.38929765886287626, "grad_norm": 0.6752482652664185, "learning_rate": 8.779828905898244e-05, "loss": 2.1355, "step": 291 }, { "epoch": 0.3906354515050167, "grad_norm": 1.1655668020248413, "learning_rate": 8.775326429536246e-05, "loss": 1.7848, "step": 292 }, { "epoch": 0.3919732441471572, "grad_norm": 1.0938507318496704, "learning_rate": 8.770823953174246e-05, "loss": 2.0154, "step": 293 }, { "epoch": 0.3933110367892977, "grad_norm": 1.0485789775848389, "learning_rate": 8.766321476812247e-05, "loss": 2.2559, "step": 294 }, { "epoch": 0.39464882943143814, "grad_norm": 1.0306947231292725, "learning_rate": 8.761819000450247e-05, "loss": 2.403, "step": 295 }, { "epoch": 0.3959866220735786, "grad_norm": 2.1360738277435303, "learning_rate": 8.75731652408825e-05, "loss": 2.0527, "step": 296 }, { "epoch": 0.39732441471571905, "grad_norm": 0.8952328562736511, "learning_rate": 8.75281404772625e-05, "loss": 2.2782, "step": 297 }, { "epoch": 0.3986622073578595, "grad_norm": 0.5768120288848877, "learning_rate": 8.74831157136425e-05, "loss": 2.527, "step": 298 }, { "epoch": 0.4, "grad_norm": 0.754932165145874, "learning_rate": 8.743809095002252e-05, "loss": 2.3223, "step": 299 }, { "epoch": 0.4013377926421405, "grad_norm": 1.3988687992095947, "learning_rate": 8.739306618640253e-05, "loss": 2.2288, "step": 300 }, { "epoch": 0.40267558528428093, "grad_norm": 0.6435593366622925, "learning_rate": 8.734804142278254e-05, "loss": 1.9748, "step": 301 }, { "epoch": 0.4040133779264214, "grad_norm": 2.387169361114502, "learning_rate": 8.730301665916253e-05, "loss": 2.2313, "step": 302 }, { "epoch": 0.40535117056856185, "grad_norm": 0.6832499504089355, "learning_rate": 8.725799189554256e-05, "loss": 2.2805, "step": 303 }, { "epoch": 0.40668896321070236, "grad_norm": 0.7087513208389282, "learning_rate": 8.721296713192257e-05, "loss": 2.0363, "step": 304 }, { "epoch": 0.4080267558528428, "grad_norm": 0.9904339909553528, "learning_rate": 8.716794236830257e-05, "loss": 2.4282, "step": 305 }, { "epoch": 0.40936454849498327, "grad_norm": 3.171907901763916, "learning_rate": 8.712291760468257e-05, "loss": 2.412, "step": 306 }, { "epoch": 0.4107023411371237, "grad_norm": 0.6599994897842407, "learning_rate": 8.707789284106259e-05, "loss": 2.0818, "step": 307 }, { "epoch": 0.41204013377926424, "grad_norm": 0.7640592455863953, "learning_rate": 8.70328680774426e-05, "loss": 2.0231, "step": 308 }, { "epoch": 0.4133779264214047, "grad_norm": 0.7022128701210022, "learning_rate": 8.698784331382261e-05, "loss": 1.9285, "step": 309 }, { "epoch": 0.41471571906354515, "grad_norm": 8.844080924987793, "learning_rate": 8.694281855020262e-05, "loss": 2.3078, "step": 310 }, { "epoch": 0.4160535117056856, "grad_norm": 0.6928530335426331, "learning_rate": 8.689779378658263e-05, "loss": 2.276, "step": 311 }, { "epoch": 0.41739130434782606, "grad_norm": 0.808559238910675, "learning_rate": 8.685276902296263e-05, "loss": 2.2516, "step": 312 }, { "epoch": 0.4187290969899666, "grad_norm": 0.5992230176925659, "learning_rate": 8.680774425934264e-05, "loss": 2.3784, "step": 313 }, { "epoch": 0.42006688963210703, "grad_norm": 0.618697464466095, "learning_rate": 8.676271949572265e-05, "loss": 2.0398, "step": 314 }, { "epoch": 0.4214046822742475, "grad_norm": 0.5807810425758362, "learning_rate": 8.671769473210266e-05, "loss": 2.2591, "step": 315 }, { "epoch": 0.42274247491638794, "grad_norm": 1.4132471084594727, "learning_rate": 8.667266996848267e-05, "loss": 2.3054, "step": 316 }, { "epoch": 0.4240802675585284, "grad_norm": 2.2352471351623535, "learning_rate": 8.662764520486268e-05, "loss": 1.8411, "step": 317 }, { "epoch": 0.4254180602006689, "grad_norm": 1.1899638175964355, "learning_rate": 8.658262044124269e-05, "loss": 2.1932, "step": 318 }, { "epoch": 0.42675585284280937, "grad_norm": 0.7901405096054077, "learning_rate": 8.65375956776227e-05, "loss": 2.2161, "step": 319 }, { "epoch": 0.4280936454849498, "grad_norm": 0.5505680441856384, "learning_rate": 8.64925709140027e-05, "loss": 2.1754, "step": 320 }, { "epoch": 0.4294314381270903, "grad_norm": 0.8369104266166687, "learning_rate": 8.644754615038273e-05, "loss": 1.6397, "step": 321 }, { "epoch": 0.4307692307692308, "grad_norm": 0.6455018520355225, "learning_rate": 8.640252138676272e-05, "loss": 2.1317, "step": 322 }, { "epoch": 0.43210702341137125, "grad_norm": 15.443414688110352, "learning_rate": 8.635749662314273e-05, "loss": 2.2802, "step": 323 }, { "epoch": 0.4334448160535117, "grad_norm": 0.7857627272605896, "learning_rate": 8.631247185952274e-05, "loss": 2.2325, "step": 324 }, { "epoch": 0.43478260869565216, "grad_norm": 0.9558283090591431, "learning_rate": 8.626744709590276e-05, "loss": 1.9514, "step": 325 }, { "epoch": 0.4361204013377926, "grad_norm": 2.7177233695983887, "learning_rate": 8.622242233228275e-05, "loss": 2.1328, "step": 326 }, { "epoch": 0.43745819397993313, "grad_norm": 2.361937999725342, "learning_rate": 8.617739756866276e-05, "loss": 2.4997, "step": 327 }, { "epoch": 0.4387959866220736, "grad_norm": 0.877702534198761, "learning_rate": 8.613237280504279e-05, "loss": 2.3157, "step": 328 }, { "epoch": 0.44013377926421404, "grad_norm": 1.4664796590805054, "learning_rate": 8.60873480414228e-05, "loss": 2.024, "step": 329 }, { "epoch": 0.4414715719063545, "grad_norm": 0.627921462059021, "learning_rate": 8.604232327780279e-05, "loss": 2.0251, "step": 330 }, { "epoch": 0.442809364548495, "grad_norm": 0.7147241830825806, "learning_rate": 8.59972985141828e-05, "loss": 1.9449, "step": 331 }, { "epoch": 0.44414715719063547, "grad_norm": 0.7197164297103882, "learning_rate": 8.595227375056282e-05, "loss": 2.1838, "step": 332 }, { "epoch": 0.4454849498327759, "grad_norm": 0.6572954654693604, "learning_rate": 8.590724898694283e-05, "loss": 2.0866, "step": 333 }, { "epoch": 0.4468227424749164, "grad_norm": 1.1570994853973389, "learning_rate": 8.586222422332282e-05, "loss": 2.3039, "step": 334 }, { "epoch": 0.44816053511705684, "grad_norm": 0.6246607899665833, "learning_rate": 8.581719945970285e-05, "loss": 2.1241, "step": 335 }, { "epoch": 0.44949832775919735, "grad_norm": 0.6910645365715027, "learning_rate": 8.577217469608285e-05, "loss": 2.3911, "step": 336 }, { "epoch": 0.4508361204013378, "grad_norm": 0.978664219379425, "learning_rate": 8.572714993246286e-05, "loss": 1.7931, "step": 337 }, { "epoch": 0.45217391304347826, "grad_norm": 0.7342723608016968, "learning_rate": 8.568212516884286e-05, "loss": 1.6711, "step": 338 }, { "epoch": 0.4535117056856187, "grad_norm": 0.8861130475997925, "learning_rate": 8.563710040522288e-05, "loss": 2.1125, "step": 339 }, { "epoch": 0.45484949832775917, "grad_norm": 0.6609489917755127, "learning_rate": 8.559207564160289e-05, "loss": 1.9707, "step": 340 }, { "epoch": 0.4561872909698997, "grad_norm": 0.6270145177841187, "learning_rate": 8.55470508779829e-05, "loss": 2.0042, "step": 341 }, { "epoch": 0.45752508361204014, "grad_norm": 0.9418214559555054, "learning_rate": 8.550202611436289e-05, "loss": 2.2311, "step": 342 }, { "epoch": 0.4588628762541806, "grad_norm": 0.9658025503158569, "learning_rate": 8.545700135074291e-05, "loss": 2.3304, "step": 343 }, { "epoch": 0.46020066889632105, "grad_norm": 0.769410252571106, "learning_rate": 8.541197658712292e-05, "loss": 2.3133, "step": 344 }, { "epoch": 0.46153846153846156, "grad_norm": 0.5739223957061768, "learning_rate": 8.536695182350293e-05, "loss": 2.2093, "step": 345 }, { "epoch": 0.462876254180602, "grad_norm": 0.6792606711387634, "learning_rate": 8.532192705988294e-05, "loss": 2.2756, "step": 346 }, { "epoch": 0.4642140468227425, "grad_norm": 1.3889212608337402, "learning_rate": 8.527690229626295e-05, "loss": 1.7942, "step": 347 }, { "epoch": 0.46555183946488293, "grad_norm": 0.7244587540626526, "learning_rate": 8.523187753264296e-05, "loss": 1.8507, "step": 348 }, { "epoch": 0.4668896321070234, "grad_norm": 1.222921371459961, "learning_rate": 8.518685276902297e-05, "loss": 1.653, "step": 349 }, { "epoch": 0.4682274247491639, "grad_norm": 0.6867721080780029, "learning_rate": 8.514182800540297e-05, "loss": 2.1023, "step": 350 }, { "epoch": 0.46956521739130436, "grad_norm": 0.9452881813049316, "learning_rate": 8.509680324178298e-05, "loss": 2.4444, "step": 351 }, { "epoch": 0.4709030100334448, "grad_norm": 0.6819020509719849, "learning_rate": 8.505177847816299e-05, "loss": 1.7956, "step": 352 }, { "epoch": 0.47224080267558527, "grad_norm": 0.6449626088142395, "learning_rate": 8.500675371454301e-05, "loss": 2.1945, "step": 353 }, { "epoch": 0.4735785953177257, "grad_norm": 0.9270897507667542, "learning_rate": 8.496172895092301e-05, "loss": 2.0861, "step": 354 }, { "epoch": 0.47491638795986624, "grad_norm": 0.8355360627174377, "learning_rate": 8.491670418730302e-05, "loss": 2.0304, "step": 355 }, { "epoch": 0.4762541806020067, "grad_norm": 0.7179239988327026, "learning_rate": 8.487167942368303e-05, "loss": 2.038, "step": 356 }, { "epoch": 0.47759197324414715, "grad_norm": 0.6889925003051758, "learning_rate": 8.482665466006305e-05, "loss": 2.0335, "step": 357 }, { "epoch": 0.4789297658862876, "grad_norm": 0.6966556906700134, "learning_rate": 8.478162989644304e-05, "loss": 2.1169, "step": 358 }, { "epoch": 0.4802675585284281, "grad_norm": 0.6920545697212219, "learning_rate": 8.473660513282305e-05, "loss": 1.5785, "step": 359 }, { "epoch": 0.4816053511705686, "grad_norm": 2.2333033084869385, "learning_rate": 8.469158036920306e-05, "loss": 1.952, "step": 360 }, { "epoch": 0.48294314381270903, "grad_norm": 0.7082642912864685, "learning_rate": 8.464655560558308e-05, "loss": 1.8666, "step": 361 }, { "epoch": 0.4842809364548495, "grad_norm": 0.6602755784988403, "learning_rate": 8.460153084196308e-05, "loss": 2.1019, "step": 362 }, { "epoch": 0.48561872909698994, "grad_norm": 0.5931785106658936, "learning_rate": 8.455650607834309e-05, "loss": 2.1189, "step": 363 }, { "epoch": 0.48695652173913045, "grad_norm": 0.9318033456802368, "learning_rate": 8.451148131472311e-05, "loss": 1.6414, "step": 364 }, { "epoch": 0.4882943143812709, "grad_norm": 0.7986612915992737, "learning_rate": 8.446645655110312e-05, "loss": 2.1038, "step": 365 }, { "epoch": 0.48963210702341137, "grad_norm": 0.7577380537986755, "learning_rate": 8.442143178748311e-05, "loss": 2.2667, "step": 366 }, { "epoch": 0.4909698996655518, "grad_norm": 0.7104119658470154, "learning_rate": 8.437640702386312e-05, "loss": 1.9144, "step": 367 }, { "epoch": 0.49230769230769234, "grad_norm": 0.6269575953483582, "learning_rate": 8.433138226024314e-05, "loss": 2.2801, "step": 368 }, { "epoch": 0.4936454849498328, "grad_norm": 0.6898888349533081, "learning_rate": 8.428635749662315e-05, "loss": 2.0484, "step": 369 }, { "epoch": 0.49498327759197325, "grad_norm": 0.797180712223053, "learning_rate": 8.424133273300316e-05, "loss": 2.0078, "step": 370 }, { "epoch": 0.4963210702341137, "grad_norm": 1.7789745330810547, "learning_rate": 8.419630796938317e-05, "loss": 2.3178, "step": 371 }, { "epoch": 0.49765886287625416, "grad_norm": 0.699863851070404, "learning_rate": 8.415128320576318e-05, "loss": 1.7836, "step": 372 }, { "epoch": 0.49899665551839467, "grad_norm": 0.720728874206543, "learning_rate": 8.410625844214319e-05, "loss": 1.9491, "step": 373 }, { "epoch": 0.5003344481605351, "grad_norm": 0.5678146481513977, "learning_rate": 8.40612336785232e-05, "loss": 2.2176, "step": 374 }, { "epoch": 0.5016722408026756, "grad_norm": 0.5399537682533264, "learning_rate": 8.40162089149032e-05, "loss": 1.6731, "step": 375 }, { "epoch": 0.5030100334448161, "grad_norm": 0.5497046709060669, "learning_rate": 8.397118415128321e-05, "loss": 2.2015, "step": 376 }, { "epoch": 0.5043478260869565, "grad_norm": 0.6270225644111633, "learning_rate": 8.392615938766322e-05, "loss": 2.183, "step": 377 }, { "epoch": 0.505685618729097, "grad_norm": 1.1238292455673218, "learning_rate": 8.388113462404323e-05, "loss": 1.8471, "step": 378 }, { "epoch": 0.5070234113712374, "grad_norm": 1.303147315979004, "learning_rate": 8.383610986042324e-05, "loss": 1.9869, "step": 379 }, { "epoch": 0.5083612040133779, "grad_norm": 1.5513795614242554, "learning_rate": 8.379108509680325e-05, "loss": 2.4587, "step": 380 }, { "epoch": 0.5096989966555184, "grad_norm": 0.7095016241073608, "learning_rate": 8.374606033318325e-05, "loss": 2.2674, "step": 381 }, { "epoch": 0.5110367892976588, "grad_norm": 0.7620111703872681, "learning_rate": 8.370103556956326e-05, "loss": 1.7031, "step": 382 }, { "epoch": 0.5123745819397993, "grad_norm": 0.7550622820854187, "learning_rate": 8.365601080594327e-05, "loss": 1.3312, "step": 383 }, { "epoch": 0.5137123745819397, "grad_norm": 0.5809555053710938, "learning_rate": 8.361098604232328e-05, "loss": 1.9467, "step": 384 }, { "epoch": 0.5150501672240803, "grad_norm": 0.7727431654930115, "learning_rate": 8.356596127870329e-05, "loss": 2.2737, "step": 385 }, { "epoch": 0.5163879598662208, "grad_norm": 0.6559184789657593, "learning_rate": 8.35209365150833e-05, "loss": 1.7666, "step": 386 }, { "epoch": 0.5177257525083612, "grad_norm": 0.5735934376716614, "learning_rate": 8.34759117514633e-05, "loss": 1.666, "step": 387 }, { "epoch": 0.5190635451505017, "grad_norm": 0.7658690810203552, "learning_rate": 8.343088698784331e-05, "loss": 2.1627, "step": 388 }, { "epoch": 0.5204013377926422, "grad_norm": 2.0972108840942383, "learning_rate": 8.338586222422334e-05, "loss": 1.8791, "step": 389 }, { "epoch": 0.5217391304347826, "grad_norm": 1.761249303817749, "learning_rate": 8.334083746060334e-05, "loss": 2.3226, "step": 390 }, { "epoch": 0.5230769230769231, "grad_norm": 0.7268783450126648, "learning_rate": 8.329581269698334e-05, "loss": 1.5883, "step": 391 }, { "epoch": 0.5244147157190635, "grad_norm": 0.6746870279312134, "learning_rate": 8.325078793336335e-05, "loss": 2.0599, "step": 392 }, { "epoch": 0.525752508361204, "grad_norm": 1.0724132061004639, "learning_rate": 8.320576316974337e-05, "loss": 2.1421, "step": 393 }, { "epoch": 0.5270903010033445, "grad_norm": 1.250876545906067, "learning_rate": 8.316073840612338e-05, "loss": 1.8395, "step": 394 }, { "epoch": 0.5284280936454849, "grad_norm": 0.6708439588546753, "learning_rate": 8.311571364250337e-05, "loss": 2.0683, "step": 395 }, { "epoch": 0.5297658862876254, "grad_norm": 0.5887036919593811, "learning_rate": 8.307068887888338e-05, "loss": 2.127, "step": 396 }, { "epoch": 0.5311036789297658, "grad_norm": 0.5943152904510498, "learning_rate": 8.30256641152634e-05, "loss": 1.7854, "step": 397 }, { "epoch": 0.5324414715719064, "grad_norm": 1.3241448402404785, "learning_rate": 8.298063935164341e-05, "loss": 2.1065, "step": 398 }, { "epoch": 0.5337792642140469, "grad_norm": 0.6816858053207397, "learning_rate": 8.293561458802341e-05, "loss": 2.102, "step": 399 }, { "epoch": 0.5351170568561873, "grad_norm": 0.6760022640228271, "learning_rate": 8.289058982440343e-05, "loss": 2.212, "step": 400 }, { "epoch": 0.5364548494983278, "grad_norm": 0.6194477081298828, "learning_rate": 8.284556506078344e-05, "loss": 2.1954, "step": 401 }, { "epoch": 0.5377926421404682, "grad_norm": 0.7603004574775696, "learning_rate": 8.280054029716345e-05, "loss": 1.7049, "step": 402 }, { "epoch": 0.5391304347826087, "grad_norm": 0.7374423146247864, "learning_rate": 8.275551553354344e-05, "loss": 2.2612, "step": 403 }, { "epoch": 0.5404682274247492, "grad_norm": 1.0149405002593994, "learning_rate": 8.271049076992347e-05, "loss": 2.253, "step": 404 }, { "epoch": 0.5418060200668896, "grad_norm": 3.6246485710144043, "learning_rate": 8.266546600630347e-05, "loss": 2.2174, "step": 405 }, { "epoch": 0.5431438127090301, "grad_norm": 0.8437290787696838, "learning_rate": 8.262044124268348e-05, "loss": 2.3438, "step": 406 }, { "epoch": 0.5444816053511705, "grad_norm": 1.9442620277404785, "learning_rate": 8.257541647906349e-05, "loss": 2.1367, "step": 407 }, { "epoch": 0.545819397993311, "grad_norm": 0.688398540019989, "learning_rate": 8.25303917154435e-05, "loss": 2.0249, "step": 408 }, { "epoch": 0.5471571906354515, "grad_norm": 1.0315051078796387, "learning_rate": 8.248536695182351e-05, "loss": 2.2462, "step": 409 }, { "epoch": 0.5484949832775919, "grad_norm": 0.5843492746353149, "learning_rate": 8.244034218820352e-05, "loss": 2.283, "step": 410 }, { "epoch": 0.5498327759197325, "grad_norm": 0.6606890559196472, "learning_rate": 8.239531742458353e-05, "loss": 2.1271, "step": 411 }, { "epoch": 0.551170568561873, "grad_norm": 1.0793790817260742, "learning_rate": 8.235029266096353e-05, "loss": 1.8262, "step": 412 }, { "epoch": 0.5525083612040134, "grad_norm": 0.7052550315856934, "learning_rate": 8.230526789734354e-05, "loss": 2.0957, "step": 413 }, { "epoch": 0.5538461538461539, "grad_norm": 0.5878149271011353, "learning_rate": 8.226024313372355e-05, "loss": 2.0454, "step": 414 }, { "epoch": 0.5551839464882943, "grad_norm": 0.7138032913208008, "learning_rate": 8.221521837010356e-05, "loss": 1.3474, "step": 415 }, { "epoch": 0.5565217391304348, "grad_norm": 0.7557210922241211, "learning_rate": 8.217019360648357e-05, "loss": 1.8938, "step": 416 }, { "epoch": 0.5578595317725753, "grad_norm": 2.0228683948516846, "learning_rate": 8.212516884286358e-05, "loss": 1.7706, "step": 417 }, { "epoch": 0.5591973244147157, "grad_norm": 1.115006685256958, "learning_rate": 8.20801440792436e-05, "loss": 1.9386, "step": 418 }, { "epoch": 0.5605351170568562, "grad_norm": 0.6833808422088623, "learning_rate": 8.20351193156236e-05, "loss": 2.2975, "step": 419 }, { "epoch": 0.5618729096989966, "grad_norm": 0.7261201739311218, "learning_rate": 8.19900945520036e-05, "loss": 1.9896, "step": 420 }, { "epoch": 0.5632107023411371, "grad_norm": 0.5823450088500977, "learning_rate": 8.194506978838361e-05, "loss": 2.0871, "step": 421 }, { "epoch": 0.5645484949832776, "grad_norm": 1.1205475330352783, "learning_rate": 8.190004502476363e-05, "loss": 1.6322, "step": 422 }, { "epoch": 0.565886287625418, "grad_norm": 1.0447696447372437, "learning_rate": 8.185502026114363e-05, "loss": 1.9021, "step": 423 }, { "epoch": 0.5672240802675586, "grad_norm": 0.7044191956520081, "learning_rate": 8.180999549752364e-05, "loss": 2.1777, "step": 424 }, { "epoch": 0.568561872909699, "grad_norm": 0.729425311088562, "learning_rate": 8.176497073390366e-05, "loss": 2.1134, "step": 425 }, { "epoch": 0.5698996655518395, "grad_norm": 0.5844915509223938, "learning_rate": 8.171994597028367e-05, "loss": 2.1628, "step": 426 }, { "epoch": 0.57123745819398, "grad_norm": 0.7304236888885498, "learning_rate": 8.167492120666366e-05, "loss": 2.0648, "step": 427 }, { "epoch": 0.5725752508361204, "grad_norm": 0.6491372585296631, "learning_rate": 8.162989644304367e-05, "loss": 2.2494, "step": 428 }, { "epoch": 0.5739130434782609, "grad_norm": 1.2159571647644043, "learning_rate": 8.15848716794237e-05, "loss": 1.9377, "step": 429 }, { "epoch": 0.5752508361204013, "grad_norm": 0.6987440586090088, "learning_rate": 8.15398469158037e-05, "loss": 2.0905, "step": 430 }, { "epoch": 0.5765886287625418, "grad_norm": 0.7087085843086243, "learning_rate": 8.14948221521837e-05, "loss": 2.0827, "step": 431 }, { "epoch": 0.5779264214046823, "grad_norm": 0.6850048303604126, "learning_rate": 8.14497973885637e-05, "loss": 2.0597, "step": 432 }, { "epoch": 0.5792642140468227, "grad_norm": 0.6765483617782593, "learning_rate": 8.140477262494373e-05, "loss": 2.1179, "step": 433 }, { "epoch": 0.5806020066889632, "grad_norm": 0.7441734075546265, "learning_rate": 8.135974786132374e-05, "loss": 1.92, "step": 434 }, { "epoch": 0.5819397993311036, "grad_norm": 0.5449488759040833, "learning_rate": 8.131472309770373e-05, "loss": 1.6044, "step": 435 }, { "epoch": 0.5832775919732441, "grad_norm": 1.5431065559387207, "learning_rate": 8.126969833408375e-05, "loss": 2.3955, "step": 436 }, { "epoch": 0.5846153846153846, "grad_norm": 0.9129299521446228, "learning_rate": 8.122467357046376e-05, "loss": 2.1519, "step": 437 }, { "epoch": 0.585953177257525, "grad_norm": 0.811627209186554, "learning_rate": 8.117964880684377e-05, "loss": 1.9321, "step": 438 }, { "epoch": 0.5872909698996656, "grad_norm": 0.7428467273712158, "learning_rate": 8.113462404322378e-05, "loss": 1.9116, "step": 439 }, { "epoch": 0.5886287625418061, "grad_norm": 0.639267086982727, "learning_rate": 8.108959927960379e-05, "loss": 2.1715, "step": 440 }, { "epoch": 0.5899665551839465, "grad_norm": 0.6075000762939453, "learning_rate": 8.10445745159838e-05, "loss": 2.1022, "step": 441 }, { "epoch": 0.591304347826087, "grad_norm": 0.9672107696533203, "learning_rate": 8.09995497523638e-05, "loss": 2.2003, "step": 442 }, { "epoch": 0.5926421404682274, "grad_norm": 0.6427491307258606, "learning_rate": 8.095452498874381e-05, "loss": 2.1691, "step": 443 }, { "epoch": 0.5939799331103679, "grad_norm": 0.6234250664710999, "learning_rate": 8.090950022512382e-05, "loss": 2.0638, "step": 444 }, { "epoch": 0.5953177257525084, "grad_norm": 0.7238093018531799, "learning_rate": 8.086447546150383e-05, "loss": 1.9306, "step": 445 }, { "epoch": 0.5966555183946488, "grad_norm": 0.8159562349319458, "learning_rate": 8.081945069788384e-05, "loss": 2.1521, "step": 446 }, { "epoch": 0.5979933110367893, "grad_norm": 0.7235854268074036, "learning_rate": 8.077442593426385e-05, "loss": 2.0037, "step": 447 }, { "epoch": 0.5993311036789297, "grad_norm": 0.6752901673316956, "learning_rate": 8.072940117064386e-05, "loss": 1.8655, "step": 448 }, { "epoch": 0.6006688963210702, "grad_norm": 0.6684075593948364, "learning_rate": 8.068437640702387e-05, "loss": 1.8252, "step": 449 }, { "epoch": 0.6020066889632107, "grad_norm": 1.1071723699569702, "learning_rate": 8.063935164340387e-05, "loss": 1.9672, "step": 450 }, { "epoch": 0.6033444816053511, "grad_norm": 1.1972233057022095, "learning_rate": 8.059432687978388e-05, "loss": 1.9875, "step": 451 }, { "epoch": 0.6046822742474917, "grad_norm": 0.9624909162521362, "learning_rate": 8.054930211616389e-05, "loss": 2.1065, "step": 452 }, { "epoch": 0.6060200668896321, "grad_norm": 1.122995138168335, "learning_rate": 8.05042773525439e-05, "loss": 1.8011, "step": 453 }, { "epoch": 0.6073578595317726, "grad_norm": 0.7040053009986877, "learning_rate": 8.045925258892392e-05, "loss": 2.256, "step": 454 }, { "epoch": 0.6086956521739131, "grad_norm": 0.6881429553031921, "learning_rate": 8.041422782530392e-05, "loss": 2.0402, "step": 455 }, { "epoch": 0.6100334448160535, "grad_norm": 0.7463430166244507, "learning_rate": 8.036920306168393e-05, "loss": 2.0015, "step": 456 }, { "epoch": 0.611371237458194, "grad_norm": 1.0377933979034424, "learning_rate": 8.032417829806393e-05, "loss": 2.1018, "step": 457 }, { "epoch": 0.6127090301003344, "grad_norm": 0.9001895785331726, "learning_rate": 8.027915353444396e-05, "loss": 2.3027, "step": 458 }, { "epoch": 0.6140468227424749, "grad_norm": 5.800236225128174, "learning_rate": 8.023412877082395e-05, "loss": 1.8693, "step": 459 }, { "epoch": 0.6153846153846154, "grad_norm": 0.8842706680297852, "learning_rate": 8.018910400720396e-05, "loss": 2.0381, "step": 460 }, { "epoch": 0.6167224080267558, "grad_norm": 0.8452533483505249, "learning_rate": 8.014407924358398e-05, "loss": 2.3479, "step": 461 }, { "epoch": 0.6180602006688963, "grad_norm": 0.6322413682937622, "learning_rate": 8.009905447996399e-05, "loss": 2.3138, "step": 462 }, { "epoch": 0.6193979933110368, "grad_norm": 0.6229380369186401, "learning_rate": 8.0054029716344e-05, "loss": 2.2976, "step": 463 }, { "epoch": 0.6207357859531772, "grad_norm": 0.6563906073570251, "learning_rate": 8.0009004952724e-05, "loss": 1.8494, "step": 464 }, { "epoch": 0.6220735785953178, "grad_norm": 0.7984792590141296, "learning_rate": 7.996398018910402e-05, "loss": 2.2518, "step": 465 }, { "epoch": 0.6234113712374582, "grad_norm": 0.7305912375450134, "learning_rate": 7.991895542548402e-05, "loss": 2.4169, "step": 466 }, { "epoch": 0.6247491638795987, "grad_norm": 0.6074129343032837, "learning_rate": 7.987393066186403e-05, "loss": 1.9538, "step": 467 }, { "epoch": 0.6260869565217392, "grad_norm": 0.9291728138923645, "learning_rate": 7.982890589824403e-05, "loss": 2.1267, "step": 468 }, { "epoch": 0.6274247491638796, "grad_norm": 0.754958987236023, "learning_rate": 7.978388113462405e-05, "loss": 1.5873, "step": 469 }, { "epoch": 0.6287625418060201, "grad_norm": 0.6374825835227966, "learning_rate": 7.973885637100406e-05, "loss": 2.3097, "step": 470 }, { "epoch": 0.6301003344481605, "grad_norm": 0.6864813566207886, "learning_rate": 7.969383160738407e-05, "loss": 2.144, "step": 471 }, { "epoch": 0.631438127090301, "grad_norm": 0.7135668992996216, "learning_rate": 7.964880684376408e-05, "loss": 2.1547, "step": 472 }, { "epoch": 0.6327759197324415, "grad_norm": 0.7295858263969421, "learning_rate": 7.960378208014408e-05, "loss": 2.1507, "step": 473 }, { "epoch": 0.6341137123745819, "grad_norm": 0.6061326861381531, "learning_rate": 7.95587573165241e-05, "loss": 2.1332, "step": 474 }, { "epoch": 0.6354515050167224, "grad_norm": 0.5801835656166077, "learning_rate": 7.95137325529041e-05, "loss": 2.3511, "step": 475 }, { "epoch": 0.6367892976588628, "grad_norm": 0.880810022354126, "learning_rate": 7.946870778928411e-05, "loss": 1.8928, "step": 476 }, { "epoch": 0.6381270903010033, "grad_norm": 0.6856786012649536, "learning_rate": 7.942368302566412e-05, "loss": 2.2853, "step": 477 }, { "epoch": 0.6394648829431439, "grad_norm": 0.8725216388702393, "learning_rate": 7.937865826204413e-05, "loss": 2.0342, "step": 478 }, { "epoch": 0.6408026755852843, "grad_norm": 0.6242175102233887, "learning_rate": 7.933363349842414e-05, "loss": 2.2387, "step": 479 }, { "epoch": 0.6421404682274248, "grad_norm": 0.6608566045761108, "learning_rate": 7.928860873480414e-05, "loss": 1.9173, "step": 480 }, { "epoch": 0.6434782608695652, "grad_norm": 0.7391323447227478, "learning_rate": 7.924358397118415e-05, "loss": 1.9619, "step": 481 }, { "epoch": 0.6448160535117057, "grad_norm": 3.0505142211914062, "learning_rate": 7.919855920756416e-05, "loss": 1.6748, "step": 482 }, { "epoch": 0.6461538461538462, "grad_norm": 0.6999809145927429, "learning_rate": 7.915353444394417e-05, "loss": 2.0176, "step": 483 }, { "epoch": 0.6474916387959866, "grad_norm": 0.5944308638572693, "learning_rate": 7.910850968032418e-05, "loss": 2.2038, "step": 484 }, { "epoch": 0.6488294314381271, "grad_norm": 0.5410728454589844, "learning_rate": 7.906348491670419e-05, "loss": 2.2845, "step": 485 }, { "epoch": 0.6501672240802676, "grad_norm": 1.1120905876159668, "learning_rate": 7.90184601530842e-05, "loss": 2.2103, "step": 486 }, { "epoch": 0.651505016722408, "grad_norm": 0.7039376497268677, "learning_rate": 7.897343538946422e-05, "loss": 2.0109, "step": 487 }, { "epoch": 0.6528428093645485, "grad_norm": 0.5916752815246582, "learning_rate": 7.892841062584421e-05, "loss": 1.8574, "step": 488 }, { "epoch": 0.6541806020066889, "grad_norm": 0.6692883968353271, "learning_rate": 7.888338586222422e-05, "loss": 2.0059, "step": 489 }, { "epoch": 0.6555183946488294, "grad_norm": 0.7776148319244385, "learning_rate": 7.883836109860424e-05, "loss": 1.5996, "step": 490 }, { "epoch": 0.65685618729097, "grad_norm": 0.6935766935348511, "learning_rate": 7.879333633498425e-05, "loss": 2.0559, "step": 491 }, { "epoch": 0.6581939799331104, "grad_norm": 0.6067877411842346, "learning_rate": 7.874831157136425e-05, "loss": 2.1315, "step": 492 }, { "epoch": 0.6595317725752509, "grad_norm": 0.8041082620620728, "learning_rate": 7.870328680774426e-05, "loss": 2.221, "step": 493 }, { "epoch": 0.6608695652173913, "grad_norm": 0.8216106295585632, "learning_rate": 7.865826204412428e-05, "loss": 1.5927, "step": 494 }, { "epoch": 0.6622073578595318, "grad_norm": 0.6823213696479797, "learning_rate": 7.861323728050429e-05, "loss": 2.0342, "step": 495 }, { "epoch": 0.6635451505016723, "grad_norm": 0.8577303290367126, "learning_rate": 7.856821251688428e-05, "loss": 2.1152, "step": 496 }, { "epoch": 0.6648829431438127, "grad_norm": 0.6728313565254211, "learning_rate": 7.85231877532643e-05, "loss": 2.0798, "step": 497 }, { "epoch": 0.6662207357859532, "grad_norm": 0.7217726707458496, "learning_rate": 7.847816298964431e-05, "loss": 2.0957, "step": 498 }, { "epoch": 0.6675585284280936, "grad_norm": 0.6146303415298462, "learning_rate": 7.843313822602432e-05, "loss": 2.0668, "step": 499 }, { "epoch": 0.6688963210702341, "grad_norm": 0.9603506922721863, "learning_rate": 7.838811346240432e-05, "loss": 2.2326, "step": 500 }, { "epoch": 0.6702341137123746, "grad_norm": 0.5818818807601929, "learning_rate": 7.834308869878434e-05, "loss": 2.182, "step": 501 }, { "epoch": 0.671571906354515, "grad_norm": 0.6627907752990723, "learning_rate": 7.829806393516435e-05, "loss": 2.3262, "step": 502 }, { "epoch": 0.6729096989966555, "grad_norm": 0.7816573977470398, "learning_rate": 7.825303917154436e-05, "loss": 1.7924, "step": 503 }, { "epoch": 0.6742474916387959, "grad_norm": 0.7198929786682129, "learning_rate": 7.820801440792435e-05, "loss": 2.2945, "step": 504 }, { "epoch": 0.6755852842809364, "grad_norm": 0.6396793723106384, "learning_rate": 7.816298964430437e-05, "loss": 2.0723, "step": 505 }, { "epoch": 0.676923076923077, "grad_norm": 1.1741186380386353, "learning_rate": 7.811796488068438e-05, "loss": 1.5519, "step": 506 }, { "epoch": 0.6782608695652174, "grad_norm": 1.1423894166946411, "learning_rate": 7.807294011706439e-05, "loss": 1.9561, "step": 507 }, { "epoch": 0.6795986622073579, "grad_norm": 0.7496652603149414, "learning_rate": 7.80279153534444e-05, "loss": 2.0628, "step": 508 }, { "epoch": 0.6809364548494983, "grad_norm": 0.7691856622695923, "learning_rate": 7.798289058982441e-05, "loss": 2.1493, "step": 509 }, { "epoch": 0.6822742474916388, "grad_norm": 0.7692395448684692, "learning_rate": 7.793786582620442e-05, "loss": 1.9799, "step": 510 }, { "epoch": 0.6836120401337793, "grad_norm": 0.5502052903175354, "learning_rate": 7.789284106258442e-05, "loss": 2.0543, "step": 511 }, { "epoch": 0.6849498327759197, "grad_norm": 0.6818161010742188, "learning_rate": 7.784781629896443e-05, "loss": 1.943, "step": 512 }, { "epoch": 0.6862876254180602, "grad_norm": 0.7240835428237915, "learning_rate": 7.780279153534444e-05, "loss": 1.5104, "step": 513 }, { "epoch": 0.6876254180602007, "grad_norm": 0.7366359829902649, "learning_rate": 7.775776677172445e-05, "loss": 2.0822, "step": 514 }, { "epoch": 0.6889632107023411, "grad_norm": 1.0837323665618896, "learning_rate": 7.771274200810447e-05, "loss": 2.289, "step": 515 }, { "epoch": 0.6903010033444816, "grad_norm": 0.651714026927948, "learning_rate": 7.766771724448447e-05, "loss": 1.8688, "step": 516 }, { "epoch": 0.691638795986622, "grad_norm": 0.6614827513694763, "learning_rate": 7.762269248086448e-05, "loss": 1.6443, "step": 517 }, { "epoch": 0.6929765886287625, "grad_norm": 0.7544852495193481, "learning_rate": 7.757766771724448e-05, "loss": 1.6606, "step": 518 }, { "epoch": 0.6943143812709031, "grad_norm": 0.6697096824645996, "learning_rate": 7.753264295362451e-05, "loss": 1.5946, "step": 519 }, { "epoch": 0.6956521739130435, "grad_norm": 0.7795844674110413, "learning_rate": 7.74876181900045e-05, "loss": 2.059, "step": 520 }, { "epoch": 0.696989966555184, "grad_norm": 0.6018288731575012, "learning_rate": 7.744259342638451e-05, "loss": 2.1916, "step": 521 }, { "epoch": 0.6983277591973244, "grad_norm": 0.7487697601318359, "learning_rate": 7.739756866276452e-05, "loss": 2.1951, "step": 522 }, { "epoch": 0.6996655518394649, "grad_norm": 1.5919585227966309, "learning_rate": 7.735254389914454e-05, "loss": 2.2205, "step": 523 }, { "epoch": 0.7010033444816054, "grad_norm": 0.6275078058242798, "learning_rate": 7.730751913552454e-05, "loss": 2.1698, "step": 524 }, { "epoch": 0.7023411371237458, "grad_norm": 0.7217388153076172, "learning_rate": 7.726249437190454e-05, "loss": 1.9864, "step": 525 }, { "epoch": 0.7036789297658863, "grad_norm": 1.1377520561218262, "learning_rate": 7.721746960828457e-05, "loss": 2.1668, "step": 526 }, { "epoch": 0.7050167224080267, "grad_norm": 0.718572735786438, "learning_rate": 7.717244484466458e-05, "loss": 1.928, "step": 527 }, { "epoch": 0.7063545150501672, "grad_norm": 0.765325665473938, "learning_rate": 7.712742008104457e-05, "loss": 2.1302, "step": 528 }, { "epoch": 0.7076923076923077, "grad_norm": 1.549810528755188, "learning_rate": 7.708239531742458e-05, "loss": 2.189, "step": 529 }, { "epoch": 0.7090301003344481, "grad_norm": 2.3518106937408447, "learning_rate": 7.70373705538046e-05, "loss": 1.9373, "step": 530 }, { "epoch": 0.7103678929765886, "grad_norm": 1.188962697982788, "learning_rate": 7.699234579018461e-05, "loss": 1.6998, "step": 531 }, { "epoch": 0.711705685618729, "grad_norm": 0.9281237721443176, "learning_rate": 7.69473210265646e-05, "loss": 2.1245, "step": 532 }, { "epoch": 0.7130434782608696, "grad_norm": 0.6713751554489136, "learning_rate": 7.690229626294463e-05, "loss": 2.3883, "step": 533 }, { "epoch": 0.7143812709030101, "grad_norm": 1.8563570976257324, "learning_rate": 7.685727149932464e-05, "loss": 2.4456, "step": 534 }, { "epoch": 0.7157190635451505, "grad_norm": 0.6634758710861206, "learning_rate": 7.681224673570464e-05, "loss": 2.2008, "step": 535 }, { "epoch": 0.717056856187291, "grad_norm": 0.5978294014930725, "learning_rate": 7.676722197208465e-05, "loss": 2.0657, "step": 536 }, { "epoch": 0.7183946488294315, "grad_norm": 0.6472600698471069, "learning_rate": 7.672219720846466e-05, "loss": 2.1728, "step": 537 }, { "epoch": 0.7197324414715719, "grad_norm": 0.6482618451118469, "learning_rate": 7.667717244484467e-05, "loss": 2.446, "step": 538 }, { "epoch": 0.7210702341137124, "grad_norm": 0.6306278705596924, "learning_rate": 7.663214768122468e-05, "loss": 1.5909, "step": 539 }, { "epoch": 0.7224080267558528, "grad_norm": 0.6908814907073975, "learning_rate": 7.658712291760469e-05, "loss": 2.1809, "step": 540 }, { "epoch": 0.7237458193979933, "grad_norm": 0.8022807240486145, "learning_rate": 7.65420981539847e-05, "loss": 2.227, "step": 541 }, { "epoch": 0.7250836120401338, "grad_norm": 0.7036264538764954, "learning_rate": 7.64970733903647e-05, "loss": 1.8489, "step": 542 }, { "epoch": 0.7264214046822742, "grad_norm": 0.5983272194862366, "learning_rate": 7.645204862674471e-05, "loss": 2.0517, "step": 543 }, { "epoch": 0.7277591973244147, "grad_norm": 0.6938000917434692, "learning_rate": 7.640702386312472e-05, "loss": 2.0043, "step": 544 }, { "epoch": 0.7290969899665551, "grad_norm": 1.180989384651184, "learning_rate": 7.636199909950473e-05, "loss": 1.6897, "step": 545 }, { "epoch": 0.7304347826086957, "grad_norm": 1.2202868461608887, "learning_rate": 7.631697433588474e-05, "loss": 2.2268, "step": 546 }, { "epoch": 0.7317725752508362, "grad_norm": 0.6938366889953613, "learning_rate": 7.627194957226475e-05, "loss": 1.9375, "step": 547 }, { "epoch": 0.7331103678929766, "grad_norm": 0.8012475371360779, "learning_rate": 7.622692480864476e-05, "loss": 2.5131, "step": 548 }, { "epoch": 0.7344481605351171, "grad_norm": 0.7444472908973694, "learning_rate": 7.618190004502476e-05, "loss": 1.9647, "step": 549 }, { "epoch": 0.7357859531772575, "grad_norm": 0.6282299160957336, "learning_rate": 7.613687528140477e-05, "loss": 2.2758, "step": 550 }, { "epoch": 0.737123745819398, "grad_norm": 0.6200249195098877, "learning_rate": 7.60918505177848e-05, "loss": 2.1907, "step": 551 }, { "epoch": 0.7384615384615385, "grad_norm": 0.7774984836578369, "learning_rate": 7.604682575416479e-05, "loss": 2.2005, "step": 552 }, { "epoch": 0.7397993311036789, "grad_norm": 0.6397766470909119, "learning_rate": 7.60018009905448e-05, "loss": 1.0321, "step": 553 }, { "epoch": 0.7411371237458194, "grad_norm": 0.7659112811088562, "learning_rate": 7.595677622692481e-05, "loss": 1.5511, "step": 554 }, { "epoch": 0.7424749163879598, "grad_norm": 0.540540874004364, "learning_rate": 7.591175146330483e-05, "loss": 2.1449, "step": 555 }, { "epoch": 0.7438127090301003, "grad_norm": 0.6674241423606873, "learning_rate": 7.586672669968482e-05, "loss": 2.1938, "step": 556 }, { "epoch": 0.7451505016722408, "grad_norm": 0.651637077331543, "learning_rate": 7.582170193606483e-05, "loss": 1.995, "step": 557 }, { "epoch": 0.7464882943143812, "grad_norm": 1.0496723651885986, "learning_rate": 7.577667717244484e-05, "loss": 1.5, "step": 558 }, { "epoch": 0.7478260869565218, "grad_norm": 0.6831291317939758, "learning_rate": 7.573165240882486e-05, "loss": 1.8952, "step": 559 }, { "epoch": 0.7491638795986622, "grad_norm": 0.6953374147415161, "learning_rate": 7.568662764520487e-05, "loss": 2.3361, "step": 560 }, { "epoch": 0.7505016722408027, "grad_norm": 0.8107741475105286, "learning_rate": 7.564160288158487e-05, "loss": 1.7492, "step": 561 }, { "epoch": 0.7518394648829432, "grad_norm": 0.6309297680854797, "learning_rate": 7.559657811796489e-05, "loss": 1.9916, "step": 562 }, { "epoch": 0.7531772575250836, "grad_norm": 1.0587294101715088, "learning_rate": 7.55515533543449e-05, "loss": 2.224, "step": 563 }, { "epoch": 0.7545150501672241, "grad_norm": 0.7029111981391907, "learning_rate": 7.550652859072491e-05, "loss": 1.9918, "step": 564 }, { "epoch": 0.7558528428093646, "grad_norm": 0.6372597217559814, "learning_rate": 7.54615038271049e-05, "loss": 1.6158, "step": 565 }, { "epoch": 0.757190635451505, "grad_norm": 0.7088492512702942, "learning_rate": 7.541647906348492e-05, "loss": 2.2011, "step": 566 }, { "epoch": 0.7585284280936455, "grad_norm": 0.6593747735023499, "learning_rate": 7.537145429986493e-05, "loss": 1.5873, "step": 567 }, { "epoch": 0.7598662207357859, "grad_norm": 0.7396959066390991, "learning_rate": 7.532642953624494e-05, "loss": 1.7784, "step": 568 }, { "epoch": 0.7612040133779264, "grad_norm": 0.9161505103111267, "learning_rate": 7.528140477262495e-05, "loss": 1.7815, "step": 569 }, { "epoch": 0.7625418060200669, "grad_norm": 0.6812069416046143, "learning_rate": 7.523638000900496e-05, "loss": 2.1424, "step": 570 }, { "epoch": 0.7638795986622073, "grad_norm": 0.6250039339065552, "learning_rate": 7.519135524538497e-05, "loss": 1.8774, "step": 571 }, { "epoch": 0.7652173913043478, "grad_norm": 0.7947536706924438, "learning_rate": 7.514633048176498e-05, "loss": 2.2003, "step": 572 }, { "epoch": 0.7665551839464882, "grad_norm": 0.6720924377441406, "learning_rate": 7.510130571814498e-05, "loss": 2.2568, "step": 573 }, { "epoch": 0.7678929765886288, "grad_norm": 0.8429411053657532, "learning_rate": 7.505628095452499e-05, "loss": 1.8639, "step": 574 }, { "epoch": 0.7692307692307693, "grad_norm": 0.7361516952514648, "learning_rate": 7.5011256190905e-05, "loss": 1.7005, "step": 575 }, { "epoch": 0.7705685618729097, "grad_norm": 1.273979663848877, "learning_rate": 7.496623142728501e-05, "loss": 2.4655, "step": 576 }, { "epoch": 0.7719063545150502, "grad_norm": 1.8873357772827148, "learning_rate": 7.492120666366502e-05, "loss": 2.338, "step": 577 }, { "epoch": 0.7732441471571906, "grad_norm": 0.571962296962738, "learning_rate": 7.487618190004503e-05, "loss": 2.0726, "step": 578 }, { "epoch": 0.7745819397993311, "grad_norm": 0.6177148818969727, "learning_rate": 7.483115713642504e-05, "loss": 2.1553, "step": 579 }, { "epoch": 0.7759197324414716, "grad_norm": 0.8624194860458374, "learning_rate": 7.478613237280504e-05, "loss": 2.4043, "step": 580 }, { "epoch": 0.777257525083612, "grad_norm": 0.5737015604972839, "learning_rate": 7.474110760918505e-05, "loss": 1.6189, "step": 581 }, { "epoch": 0.7785953177257525, "grad_norm": 0.6640171408653259, "learning_rate": 7.469608284556506e-05, "loss": 2.3091, "step": 582 }, { "epoch": 0.7799331103678929, "grad_norm": 1.1256778240203857, "learning_rate": 7.465105808194507e-05, "loss": 2.1997, "step": 583 }, { "epoch": 0.7812709030100334, "grad_norm": 0.6835268139839172, "learning_rate": 7.460603331832509e-05, "loss": 2.019, "step": 584 }, { "epoch": 0.782608695652174, "grad_norm": 0.6177977919578552, "learning_rate": 7.456100855470509e-05, "loss": 2.3367, "step": 585 }, { "epoch": 0.7839464882943143, "grad_norm": 0.7783511877059937, "learning_rate": 7.45159837910851e-05, "loss": 1.9634, "step": 586 }, { "epoch": 0.7852842809364549, "grad_norm": 0.503692626953125, "learning_rate": 7.447095902746512e-05, "loss": 2.1588, "step": 587 }, { "epoch": 0.7866220735785954, "grad_norm": 0.8090397119522095, "learning_rate": 7.442593426384513e-05, "loss": 1.6442, "step": 588 }, { "epoch": 0.7879598662207358, "grad_norm": 0.6573639512062073, "learning_rate": 7.438090950022512e-05, "loss": 2.2257, "step": 589 }, { "epoch": 0.7892976588628763, "grad_norm": 0.7102932929992676, "learning_rate": 7.433588473660513e-05, "loss": 1.6468, "step": 590 }, { "epoch": 0.7906354515050167, "grad_norm": 0.7322196364402771, "learning_rate": 7.429085997298515e-05, "loss": 2.2053, "step": 591 }, { "epoch": 0.7919732441471572, "grad_norm": 0.7007878422737122, "learning_rate": 7.424583520936516e-05, "loss": 2.0766, "step": 592 }, { "epoch": 0.7933110367892977, "grad_norm": 0.6183981895446777, "learning_rate": 7.420081044574516e-05, "loss": 2.0603, "step": 593 }, { "epoch": 0.7946488294314381, "grad_norm": 0.5837265253067017, "learning_rate": 7.415578568212516e-05, "loss": 2.3202, "step": 594 }, { "epoch": 0.7959866220735786, "grad_norm": 1.104719638824463, "learning_rate": 7.411076091850519e-05, "loss": 1.3282, "step": 595 }, { "epoch": 0.797324414715719, "grad_norm": 0.6628216505050659, "learning_rate": 7.40657361548852e-05, "loss": 2.3223, "step": 596 }, { "epoch": 0.7986622073578595, "grad_norm": 0.5468476414680481, "learning_rate": 7.402071139126519e-05, "loss": 1.7866, "step": 597 }, { "epoch": 0.8, "grad_norm": 0.8073409795761108, "learning_rate": 7.397568662764521e-05, "loss": 1.7743, "step": 598 }, { "epoch": 0.8013377926421404, "grad_norm": 1.0640450716018677, "learning_rate": 7.393066186402522e-05, "loss": 2.1797, "step": 599 }, { "epoch": 0.802675585284281, "grad_norm": 0.7137813568115234, "learning_rate": 7.388563710040523e-05, "loss": 2.0888, "step": 600 }, { "epoch": 0.8040133779264214, "grad_norm": 0.5866499543190002, "learning_rate": 7.384061233678522e-05, "loss": 1.668, "step": 601 }, { "epoch": 0.8053511705685619, "grad_norm": 0.7044905424118042, "learning_rate": 7.379558757316525e-05, "loss": 1.9982, "step": 602 }, { "epoch": 0.8066889632107024, "grad_norm": 0.5737768411636353, "learning_rate": 7.375056280954526e-05, "loss": 2.0302, "step": 603 }, { "epoch": 0.8080267558528428, "grad_norm": 0.5827451348304749, "learning_rate": 7.370553804592526e-05, "loss": 2.3226, "step": 604 }, { "epoch": 0.8093645484949833, "grad_norm": 0.7087153792381287, "learning_rate": 7.366051328230527e-05, "loss": 2.0301, "step": 605 }, { "epoch": 0.8107023411371237, "grad_norm": 0.6226567029953003, "learning_rate": 7.361548851868528e-05, "loss": 1.7037, "step": 606 }, { "epoch": 0.8120401337792642, "grad_norm": 0.7785759568214417, "learning_rate": 7.357046375506529e-05, "loss": 1.7154, "step": 607 }, { "epoch": 0.8133779264214047, "grad_norm": 0.777509868144989, "learning_rate": 7.35254389914453e-05, "loss": 2.0251, "step": 608 }, { "epoch": 0.8147157190635451, "grad_norm": 0.6344072222709656, "learning_rate": 7.348041422782531e-05, "loss": 2.2202, "step": 609 }, { "epoch": 0.8160535117056856, "grad_norm": 0.7604480385780334, "learning_rate": 7.343538946420532e-05, "loss": 1.9614, "step": 610 }, { "epoch": 0.8173913043478261, "grad_norm": 0.6319754719734192, "learning_rate": 7.339036470058532e-05, "loss": 1.6291, "step": 611 }, { "epoch": 0.8187290969899665, "grad_norm": 0.9950410723686218, "learning_rate": 7.334533993696533e-05, "loss": 1.8727, "step": 612 }, { "epoch": 0.820066889632107, "grad_norm": 0.6576692461967468, "learning_rate": 7.330031517334534e-05, "loss": 1.92, "step": 613 }, { "epoch": 0.8214046822742475, "grad_norm": 0.5095275640487671, "learning_rate": 7.325529040972535e-05, "loss": 1.9408, "step": 614 }, { "epoch": 0.822742474916388, "grad_norm": 1.188237190246582, "learning_rate": 7.321026564610536e-05, "loss": 1.9193, "step": 615 }, { "epoch": 0.8240802675585285, "grad_norm": 0.7145055532455444, "learning_rate": 7.316524088248538e-05, "loss": 2.2026, "step": 616 }, { "epoch": 0.8254180602006689, "grad_norm": 0.6884480714797974, "learning_rate": 7.312021611886538e-05, "loss": 2.3884, "step": 617 }, { "epoch": 0.8267558528428094, "grad_norm": 0.6090062856674194, "learning_rate": 7.307519135524538e-05, "loss": 2.1149, "step": 618 }, { "epoch": 0.8280936454849498, "grad_norm": 1.0304170846939087, "learning_rate": 7.303016659162539e-05, "loss": 1.4435, "step": 619 }, { "epoch": 0.8294314381270903, "grad_norm": 0.6356250047683716, "learning_rate": 7.298514182800542e-05, "loss": 1.8632, "step": 620 }, { "epoch": 0.8307692307692308, "grad_norm": 0.6120384931564331, "learning_rate": 7.294011706438541e-05, "loss": 2.0522, "step": 621 }, { "epoch": 0.8321070234113712, "grad_norm": 0.6501190662384033, "learning_rate": 7.289509230076542e-05, "loss": 1.8754, "step": 622 }, { "epoch": 0.8334448160535117, "grad_norm": 1.3946788311004639, "learning_rate": 7.285006753714544e-05, "loss": 2.0818, "step": 623 }, { "epoch": 0.8347826086956521, "grad_norm": 0.6905943751335144, "learning_rate": 7.280504277352545e-05, "loss": 2.4726, "step": 624 }, { "epoch": 0.8361204013377926, "grad_norm": 1.378237247467041, "learning_rate": 7.276001800990544e-05, "loss": 2.0315, "step": 625 }, { "epoch": 0.8374581939799332, "grad_norm": 0.9301738142967224, "learning_rate": 7.271499324628545e-05, "loss": 1.027, "step": 626 }, { "epoch": 0.8387959866220736, "grad_norm": 0.9967605471611023, "learning_rate": 7.266996848266548e-05, "loss": 2.1218, "step": 627 }, { "epoch": 0.8401337792642141, "grad_norm": 1.6421177387237549, "learning_rate": 7.262494371904548e-05, "loss": 1.5805, "step": 628 }, { "epoch": 0.8414715719063545, "grad_norm": 1.0547858476638794, "learning_rate": 7.257991895542548e-05, "loss": 2.3767, "step": 629 }, { "epoch": 0.842809364548495, "grad_norm": 0.9341603517532349, "learning_rate": 7.253489419180549e-05, "loss": 1.8782, "step": 630 }, { "epoch": 0.8441471571906355, "grad_norm": 0.8098312616348267, "learning_rate": 7.248986942818551e-05, "loss": 1.8284, "step": 631 }, { "epoch": 0.8454849498327759, "grad_norm": 1.4215975999832153, "learning_rate": 7.244484466456552e-05, "loss": 1.6072, "step": 632 }, { "epoch": 0.8468227424749164, "grad_norm": 0.6881840229034424, "learning_rate": 7.239981990094553e-05, "loss": 1.9254, "step": 633 }, { "epoch": 0.8481605351170568, "grad_norm": 0.7351963520050049, "learning_rate": 7.235479513732554e-05, "loss": 1.9477, "step": 634 }, { "epoch": 0.8494983277591973, "grad_norm": 0.7751598954200745, "learning_rate": 7.230977037370554e-05, "loss": 2.0508, "step": 635 }, { "epoch": 0.8508361204013378, "grad_norm": 0.7352134585380554, "learning_rate": 7.226474561008555e-05, "loss": 1.9826, "step": 636 }, { "epoch": 0.8521739130434782, "grad_norm": 1.0797083377838135, "learning_rate": 7.221972084646556e-05, "loss": 1.8754, "step": 637 }, { "epoch": 0.8535117056856187, "grad_norm": 0.6877202391624451, "learning_rate": 7.217469608284557e-05, "loss": 2.1698, "step": 638 }, { "epoch": 0.8548494983277592, "grad_norm": 0.5914489030838013, "learning_rate": 7.212967131922558e-05, "loss": 2.1072, "step": 639 }, { "epoch": 0.8561872909698997, "grad_norm": 1.0543609857559204, "learning_rate": 7.208464655560559e-05, "loss": 1.9583, "step": 640 }, { "epoch": 0.8575250836120402, "grad_norm": 0.6509416699409485, "learning_rate": 7.20396217919856e-05, "loss": 2.0584, "step": 641 }, { "epoch": 0.8588628762541806, "grad_norm": 0.6920568943023682, "learning_rate": 7.19945970283656e-05, "loss": 2.0691, "step": 642 }, { "epoch": 0.8602006688963211, "grad_norm": 0.5659633874893188, "learning_rate": 7.194957226474561e-05, "loss": 2.3212, "step": 643 }, { "epoch": 0.8615384615384616, "grad_norm": 0.8833758234977722, "learning_rate": 7.190454750112562e-05, "loss": 1.9745, "step": 644 }, { "epoch": 0.862876254180602, "grad_norm": 1.0847338438034058, "learning_rate": 7.185952273750563e-05, "loss": 2.17, "step": 645 }, { "epoch": 0.8642140468227425, "grad_norm": 0.7263421416282654, "learning_rate": 7.181449797388564e-05, "loss": 1.5802, "step": 646 }, { "epoch": 0.8655518394648829, "grad_norm": 1.1162469387054443, "learning_rate": 7.176947321026565e-05, "loss": 2.0613, "step": 647 }, { "epoch": 0.8668896321070234, "grad_norm": 0.73091721534729, "learning_rate": 7.172444844664566e-05, "loss": 1.8372, "step": 648 }, { "epoch": 0.8682274247491639, "grad_norm": 0.6744244694709778, "learning_rate": 7.167942368302566e-05, "loss": 2.0031, "step": 649 }, { "epoch": 0.8695652173913043, "grad_norm": 0.7309931516647339, "learning_rate": 7.163439891940567e-05, "loss": 1.5731, "step": 650 }, { "epoch": 0.8709030100334448, "grad_norm": 0.6159527897834778, "learning_rate": 7.158937415578568e-05, "loss": 2.2952, "step": 651 }, { "epoch": 0.8722408026755852, "grad_norm": 0.8553061485290527, "learning_rate": 7.15443493921657e-05, "loss": 1.8854, "step": 652 }, { "epoch": 0.8735785953177257, "grad_norm": 0.6783524751663208, "learning_rate": 7.149932462854571e-05, "loss": 1.13, "step": 653 }, { "epoch": 0.8749163879598663, "grad_norm": 1.1411305665969849, "learning_rate": 7.145429986492571e-05, "loss": 1.9138, "step": 654 }, { "epoch": 0.8762541806020067, "grad_norm": 0.7974840998649597, "learning_rate": 7.140927510130572e-05, "loss": 1.3333, "step": 655 }, { "epoch": 0.8775919732441472, "grad_norm": 0.8446215391159058, "learning_rate": 7.136425033768574e-05, "loss": 2.2973, "step": 656 }, { "epoch": 0.8789297658862876, "grad_norm": 0.7412329912185669, "learning_rate": 7.131922557406575e-05, "loss": 1.7935, "step": 657 }, { "epoch": 0.8802675585284281, "grad_norm": 1.7755156755447388, "learning_rate": 7.127420081044574e-05, "loss": 2.0122, "step": 658 }, { "epoch": 0.8816053511705686, "grad_norm": 1.0286380052566528, "learning_rate": 7.122917604682576e-05, "loss": 1.9987, "step": 659 }, { "epoch": 0.882943143812709, "grad_norm": 0.6704300045967102, "learning_rate": 7.118415128320577e-05, "loss": 1.8126, "step": 660 }, { "epoch": 0.8842809364548495, "grad_norm": 0.6966663002967834, "learning_rate": 7.113912651958578e-05, "loss": 1.993, "step": 661 }, { "epoch": 0.88561872909699, "grad_norm": 0.6740978360176086, "learning_rate": 7.109410175596578e-05, "loss": 1.9854, "step": 662 }, { "epoch": 0.8869565217391304, "grad_norm": 0.7223276495933533, "learning_rate": 7.10490769923458e-05, "loss": 2.0942, "step": 663 }, { "epoch": 0.8882943143812709, "grad_norm": 0.9881120324134827, "learning_rate": 7.10040522287258e-05, "loss": 1.9087, "step": 664 }, { "epoch": 0.8896321070234113, "grad_norm": 0.7445497512817383, "learning_rate": 7.095902746510582e-05, "loss": 2.058, "step": 665 }, { "epoch": 0.8909698996655518, "grad_norm": 1.0675506591796875, "learning_rate": 7.091400270148581e-05, "loss": 1.9869, "step": 666 }, { "epoch": 0.8923076923076924, "grad_norm": 1.2335395812988281, "learning_rate": 7.086897793786583e-05, "loss": 2.1621, "step": 667 }, { "epoch": 0.8936454849498328, "grad_norm": 0.6886176466941833, "learning_rate": 7.082395317424584e-05, "loss": 1.5078, "step": 668 }, { "epoch": 0.8949832775919733, "grad_norm": 0.71738600730896, "learning_rate": 7.077892841062585e-05, "loss": 2.3195, "step": 669 }, { "epoch": 0.8963210702341137, "grad_norm": 0.708991527557373, "learning_rate": 7.073390364700586e-05, "loss": 2.059, "step": 670 }, { "epoch": 0.8976588628762542, "grad_norm": 0.6323236227035522, "learning_rate": 7.068887888338587e-05, "loss": 1.9016, "step": 671 }, { "epoch": 0.8989966555183947, "grad_norm": 0.6371700763702393, "learning_rate": 7.064385411976588e-05, "loss": 2.1694, "step": 672 }, { "epoch": 0.9003344481605351, "grad_norm": 0.7130315899848938, "learning_rate": 7.059882935614588e-05, "loss": 1.8981, "step": 673 }, { "epoch": 0.9016722408026756, "grad_norm": 0.8963072299957275, "learning_rate": 7.055380459252589e-05, "loss": 2.056, "step": 674 }, { "epoch": 0.903010033444816, "grad_norm": 0.6768928170204163, "learning_rate": 7.05087798289059e-05, "loss": 1.9999, "step": 675 }, { "epoch": 0.9043478260869565, "grad_norm": 0.7496529221534729, "learning_rate": 7.046375506528591e-05, "loss": 2.2434, "step": 676 }, { "epoch": 0.905685618729097, "grad_norm": 0.5942232608795166, "learning_rate": 7.041873030166593e-05, "loss": 2.3195, "step": 677 }, { "epoch": 0.9070234113712374, "grad_norm": 0.6655867695808411, "learning_rate": 7.037370553804593e-05, "loss": 2.0573, "step": 678 }, { "epoch": 0.9083612040133779, "grad_norm": 0.5738136172294617, "learning_rate": 7.032868077442594e-05, "loss": 1.2416, "step": 679 }, { "epoch": 0.9096989966555183, "grad_norm": 0.8400653004646301, "learning_rate": 7.028365601080594e-05, "loss": 1.8816, "step": 680 }, { "epoch": 0.9110367892976589, "grad_norm": 0.6366567015647888, "learning_rate": 7.023863124718597e-05, "loss": 1.9616, "step": 681 }, { "epoch": 0.9123745819397994, "grad_norm": 0.7917408347129822, "learning_rate": 7.019360648356596e-05, "loss": 2.0806, "step": 682 }, { "epoch": 0.9137123745819398, "grad_norm": 0.669217586517334, "learning_rate": 7.014858171994597e-05, "loss": 2.0481, "step": 683 }, { "epoch": 0.9150501672240803, "grad_norm": 1.5556551218032837, "learning_rate": 7.010355695632598e-05, "loss": 1.5202, "step": 684 }, { "epoch": 0.9163879598662207, "grad_norm": 0.9249728918075562, "learning_rate": 7.0058532192706e-05, "loss": 2.0939, "step": 685 }, { "epoch": 0.9177257525083612, "grad_norm": 0.5922223329544067, "learning_rate": 7.0013507429086e-05, "loss": 2.058, "step": 686 }, { "epoch": 0.9190635451505017, "grad_norm": 0.5889205932617188, "learning_rate": 6.9968482665466e-05, "loss": 1.6894, "step": 687 }, { "epoch": 0.9204013377926421, "grad_norm": 0.7091755270957947, "learning_rate": 6.992345790184603e-05, "loss": 1.9788, "step": 688 }, { "epoch": 0.9217391304347826, "grad_norm": 0.5706994533538818, "learning_rate": 6.987843313822603e-05, "loss": 1.7282, "step": 689 }, { "epoch": 0.9230769230769231, "grad_norm": 0.6377424597740173, "learning_rate": 6.983340837460603e-05, "loss": 2.0029, "step": 690 }, { "epoch": 0.9244147157190635, "grad_norm": 1.2235517501831055, "learning_rate": 6.978838361098604e-05, "loss": 1.6211, "step": 691 }, { "epoch": 0.925752508361204, "grad_norm": 0.7518335580825806, "learning_rate": 6.974335884736606e-05, "loss": 2.0324, "step": 692 }, { "epoch": 0.9270903010033444, "grad_norm": 0.8257300853729248, "learning_rate": 6.969833408374607e-05, "loss": 1.8237, "step": 693 }, { "epoch": 0.928428093645485, "grad_norm": 0.7107146382331848, "learning_rate": 6.965330932012606e-05, "loss": 2.4846, "step": 694 }, { "epoch": 0.9297658862876255, "grad_norm": 0.954113245010376, "learning_rate": 6.960828455650609e-05, "loss": 2.0246, "step": 695 }, { "epoch": 0.9311036789297659, "grad_norm": 0.6087449789047241, "learning_rate": 6.95632597928861e-05, "loss": 1.9656, "step": 696 }, { "epoch": 0.9324414715719064, "grad_norm": 0.6922059059143066, "learning_rate": 6.95182350292661e-05, "loss": 2.3123, "step": 697 }, { "epoch": 0.9337792642140468, "grad_norm": 0.7577765583992004, "learning_rate": 6.94732102656461e-05, "loss": 1.5737, "step": 698 }, { "epoch": 0.9351170568561873, "grad_norm": 0.7660884857177734, "learning_rate": 6.942818550202612e-05, "loss": 2.2649, "step": 699 }, { "epoch": 0.9364548494983278, "grad_norm": 1.2487679719924927, "learning_rate": 6.938316073840613e-05, "loss": 2.0546, "step": 700 }, { "epoch": 0.9377926421404682, "grad_norm": 0.8137415647506714, "learning_rate": 6.933813597478614e-05, "loss": 2.1294, "step": 701 }, { "epoch": 0.9391304347826087, "grad_norm": 0.8820820450782776, "learning_rate": 6.929311121116615e-05, "loss": 2.3508, "step": 702 }, { "epoch": 0.9404682274247491, "grad_norm": 0.5844343900680542, "learning_rate": 6.924808644754615e-05, "loss": 2.024, "step": 703 }, { "epoch": 0.9418060200668896, "grad_norm": 0.9741575121879578, "learning_rate": 6.920306168392616e-05, "loss": 1.9389, "step": 704 }, { "epoch": 0.9431438127090301, "grad_norm": 0.5414264798164368, "learning_rate": 6.915803692030617e-05, "loss": 2.3536, "step": 705 }, { "epoch": 0.9444816053511705, "grad_norm": 0.9424920678138733, "learning_rate": 6.911301215668618e-05, "loss": 1.7482, "step": 706 }, { "epoch": 0.945819397993311, "grad_norm": 1.173538327217102, "learning_rate": 6.906798739306619e-05, "loss": 2.0628, "step": 707 }, { "epoch": 0.9471571906354515, "grad_norm": 0.808866024017334, "learning_rate": 6.90229626294462e-05, "loss": 2.0757, "step": 708 }, { "epoch": 0.948494983277592, "grad_norm": 0.8062437772750854, "learning_rate": 6.89779378658262e-05, "loss": 1.4797, "step": 709 }, { "epoch": 0.9498327759197325, "grad_norm": 0.6568036079406738, "learning_rate": 6.893291310220622e-05, "loss": 2.1079, "step": 710 }, { "epoch": 0.9511705685618729, "grad_norm": 1.0303947925567627, "learning_rate": 6.888788833858622e-05, "loss": 2.1879, "step": 711 }, { "epoch": 0.9525083612040134, "grad_norm": 1.439601182937622, "learning_rate": 6.884286357496623e-05, "loss": 1.5109, "step": 712 }, { "epoch": 0.9538461538461539, "grad_norm": 1.9849913120269775, "learning_rate": 6.879783881134625e-05, "loss": 1.929, "step": 713 }, { "epoch": 0.9551839464882943, "grad_norm": 0.6140236258506775, "learning_rate": 6.875281404772625e-05, "loss": 2.2385, "step": 714 }, { "epoch": 0.9565217391304348, "grad_norm": 0.7368261218070984, "learning_rate": 6.870778928410626e-05, "loss": 1.3549, "step": 715 }, { "epoch": 0.9578595317725752, "grad_norm": 0.8577214479446411, "learning_rate": 6.866276452048627e-05, "loss": 2.1128, "step": 716 }, { "epoch": 0.9591973244147157, "grad_norm": 0.7355601191520691, "learning_rate": 6.861773975686629e-05, "loss": 1.9433, "step": 717 }, { "epoch": 0.9605351170568562, "grad_norm": 1.2294349670410156, "learning_rate": 6.857271499324628e-05, "loss": 2.2753, "step": 718 }, { "epoch": 0.9618729096989966, "grad_norm": 0.574518084526062, "learning_rate": 6.852769022962629e-05, "loss": 2.0646, "step": 719 }, { "epoch": 0.9632107023411371, "grad_norm": 0.715045154094696, "learning_rate": 6.84826654660063e-05, "loss": 2.0687, "step": 720 }, { "epoch": 0.9645484949832775, "grad_norm": 0.8325941562652588, "learning_rate": 6.843764070238632e-05, "loss": 1.85, "step": 721 }, { "epoch": 0.9658862876254181, "grad_norm": 0.6069225668907166, "learning_rate": 6.839261593876632e-05, "loss": 2.1878, "step": 722 }, { "epoch": 0.9672240802675586, "grad_norm": 0.705606997013092, "learning_rate": 6.834759117514633e-05, "loss": 2.2931, "step": 723 }, { "epoch": 0.968561872909699, "grad_norm": 0.6511003375053406, "learning_rate": 6.830256641152635e-05, "loss": 1.9663, "step": 724 }, { "epoch": 0.9698996655518395, "grad_norm": 0.6704803109169006, "learning_rate": 6.825754164790636e-05, "loss": 1.5408, "step": 725 }, { "epoch": 0.9712374581939799, "grad_norm": 1.087570309638977, "learning_rate": 6.821251688428637e-05, "loss": 2.3817, "step": 726 }, { "epoch": 0.9725752508361204, "grad_norm": 0.7136285901069641, "learning_rate": 6.816749212066636e-05, "loss": 2.0755, "step": 727 }, { "epoch": 0.9739130434782609, "grad_norm": 0.7055271863937378, "learning_rate": 6.812246735704638e-05, "loss": 2.1872, "step": 728 }, { "epoch": 0.9752508361204013, "grad_norm": 2.6934823989868164, "learning_rate": 6.807744259342639e-05, "loss": 1.5151, "step": 729 }, { "epoch": 0.9765886287625418, "grad_norm": 0.7742595672607422, "learning_rate": 6.80324178298064e-05, "loss": 2.255, "step": 730 }, { "epoch": 0.9779264214046822, "grad_norm": 0.7820495963096619, "learning_rate": 6.798739306618641e-05, "loss": 2.2767, "step": 731 }, { "epoch": 0.9792642140468227, "grad_norm": 0.8606453537940979, "learning_rate": 6.794236830256642e-05, "loss": 1.7905, "step": 732 }, { "epoch": 0.9806020066889632, "grad_norm": 0.9574686884880066, "learning_rate": 6.789734353894643e-05, "loss": 2.1741, "step": 733 }, { "epoch": 0.9819397993311036, "grad_norm": 1.447963833808899, "learning_rate": 6.785231877532643e-05, "loss": 1.8766, "step": 734 }, { "epoch": 0.9832775919732442, "grad_norm": 3.8072526454925537, "learning_rate": 6.780729401170644e-05, "loss": 2.0198, "step": 735 }, { "epoch": 0.9846153846153847, "grad_norm": 0.5300580263137817, "learning_rate": 6.776226924808645e-05, "loss": 1.6352, "step": 736 }, { "epoch": 0.9859531772575251, "grad_norm": 0.9182571172714233, "learning_rate": 6.771724448446646e-05, "loss": 2.3257, "step": 737 }, { "epoch": 0.9872909698996656, "grad_norm": 1.5257279872894287, "learning_rate": 6.767221972084647e-05, "loss": 2.3333, "step": 738 }, { "epoch": 0.988628762541806, "grad_norm": 0.793891191482544, "learning_rate": 6.762719495722648e-05, "loss": 1.776, "step": 739 }, { "epoch": 0.9899665551839465, "grad_norm": 0.6769539713859558, "learning_rate": 6.758217019360649e-05, "loss": 2.0308, "step": 740 }, { "epoch": 0.991304347826087, "grad_norm": 0.7656939029693604, "learning_rate": 6.75371454299865e-05, "loss": 2.0104, "step": 741 }, { "epoch": 0.9926421404682274, "grad_norm": 1.0917755365371704, "learning_rate": 6.74921206663665e-05, "loss": 2.0117, "step": 742 }, { "epoch": 0.9939799331103679, "grad_norm": 0.5329183340072632, "learning_rate": 6.744709590274651e-05, "loss": 1.6329, "step": 743 }, { "epoch": 0.9953177257525083, "grad_norm": 0.612562358379364, "learning_rate": 6.740207113912652e-05, "loss": 2.2978, "step": 744 }, { "epoch": 0.9966555183946488, "grad_norm": 0.9256593585014343, "learning_rate": 6.735704637550653e-05, "loss": 2.2044, "step": 745 }, { "epoch": 0.9979933110367893, "grad_norm": 0.6008355617523193, "learning_rate": 6.731202161188654e-05, "loss": 2.3045, "step": 746 }, { "epoch": 0.9993311036789297, "grad_norm": 0.7152503728866577, "learning_rate": 6.726699684826655e-05, "loss": 2.0138, "step": 747 }, { "epoch": 1.0006688963210701, "grad_norm": 0.6082767844200134, "learning_rate": 6.722197208464655e-05, "loss": 1.6355, "step": 748 }, { "epoch": 1.0020066889632107, "grad_norm": 0.8323435187339783, "learning_rate": 6.717694732102658e-05, "loss": 1.5551, "step": 749 }, { "epoch": 1.0033444816053512, "grad_norm": 0.777487576007843, "learning_rate": 6.713192255740659e-05, "loss": 2.0041, "step": 750 }, { "epoch": 1.0046822742474917, "grad_norm": 1.172985315322876, "learning_rate": 6.708689779378658e-05, "loss": 0.7001, "step": 751 }, { "epoch": 1.0060200668896322, "grad_norm": 0.7470185160636902, "learning_rate": 6.704187303016659e-05, "loss": 1.7772, "step": 752 }, { "epoch": 1.0073578595317725, "grad_norm": 1.2879897356033325, "learning_rate": 6.699684826654661e-05, "loss": 1.5489, "step": 753 }, { "epoch": 1.008695652173913, "grad_norm": 1.5060155391693115, "learning_rate": 6.695182350292662e-05, "loss": 1.5242, "step": 754 }, { "epoch": 1.0100334448160535, "grad_norm": 0.8411580324172974, "learning_rate": 6.690679873930662e-05, "loss": 1.6664, "step": 755 }, { "epoch": 1.011371237458194, "grad_norm": 0.9809452295303345, "learning_rate": 6.686177397568662e-05, "loss": 1.8596, "step": 756 }, { "epoch": 1.0127090301003345, "grad_norm": 1.2678918838500977, "learning_rate": 6.681674921206665e-05, "loss": 1.6882, "step": 757 }, { "epoch": 1.0140468227424748, "grad_norm": 0.9003990888595581, "learning_rate": 6.677172444844665e-05, "loss": 1.8339, "step": 758 }, { "epoch": 1.0153846153846153, "grad_norm": 0.9132852554321289, "learning_rate": 6.672669968482665e-05, "loss": 1.6924, "step": 759 }, { "epoch": 1.0167224080267558, "grad_norm": 0.8782814741134644, "learning_rate": 6.668167492120667e-05, "loss": 1.2357, "step": 760 }, { "epoch": 1.0180602006688964, "grad_norm": 1.0629113912582397, "learning_rate": 6.663665015758668e-05, "loss": 1.4501, "step": 761 }, { "epoch": 1.0193979933110369, "grad_norm": 0.6594319343566895, "learning_rate": 6.659162539396669e-05, "loss": 1.9228, "step": 762 }, { "epoch": 1.0207357859531772, "grad_norm": 1.1182608604431152, "learning_rate": 6.654660063034668e-05, "loss": 1.562, "step": 763 }, { "epoch": 1.0220735785953177, "grad_norm": 0.6796782612800598, "learning_rate": 6.65015758667267e-05, "loss": 1.8346, "step": 764 }, { "epoch": 1.0234113712374582, "grad_norm": 1.0874909162521362, "learning_rate": 6.645655110310671e-05, "loss": 1.2697, "step": 765 }, { "epoch": 1.0247491638795987, "grad_norm": 1.1255723237991333, "learning_rate": 6.641152633948672e-05, "loss": 1.1482, "step": 766 }, { "epoch": 1.0260869565217392, "grad_norm": 0.7903443574905396, "learning_rate": 6.636650157586673e-05, "loss": 1.5898, "step": 767 }, { "epoch": 1.0274247491638795, "grad_norm": 0.8102263808250427, "learning_rate": 6.632147681224674e-05, "loss": 1.5937, "step": 768 }, { "epoch": 1.02876254180602, "grad_norm": 0.7469982504844666, "learning_rate": 6.627645204862675e-05, "loss": 1.542, "step": 769 }, { "epoch": 1.0301003344481605, "grad_norm": 0.8481405377388, "learning_rate": 6.623142728500676e-05, "loss": 1.0759, "step": 770 }, { "epoch": 1.031438127090301, "grad_norm": 0.8804197907447815, "learning_rate": 6.618640252138677e-05, "loss": 1.4243, "step": 771 }, { "epoch": 1.0327759197324415, "grad_norm": 0.8232119083404541, "learning_rate": 6.614137775776677e-05, "loss": 1.767, "step": 772 }, { "epoch": 1.034113712374582, "grad_norm": 1.0143418312072754, "learning_rate": 6.609635299414678e-05, "loss": 1.6867, "step": 773 }, { "epoch": 1.0354515050167223, "grad_norm": 0.8692298531532288, "learning_rate": 6.605132823052679e-05, "loss": 1.5972, "step": 774 }, { "epoch": 1.0367892976588629, "grad_norm": 0.8228359222412109, "learning_rate": 6.60063034669068e-05, "loss": 1.6464, "step": 775 }, { "epoch": 1.0381270903010034, "grad_norm": 0.9182949662208557, "learning_rate": 6.596127870328681e-05, "loss": 1.2447, "step": 776 }, { "epoch": 1.0394648829431439, "grad_norm": 0.9668188095092773, "learning_rate": 6.591625393966682e-05, "loss": 1.5519, "step": 777 }, { "epoch": 1.0408026755852844, "grad_norm": 0.8035921454429626, "learning_rate": 6.587122917604684e-05, "loss": 1.6115, "step": 778 }, { "epoch": 1.0421404682274247, "grad_norm": 0.7637916803359985, "learning_rate": 6.582620441242683e-05, "loss": 1.5989, "step": 779 }, { "epoch": 1.0434782608695652, "grad_norm": 0.6795792579650879, "learning_rate": 6.578117964880684e-05, "loss": 1.8817, "step": 780 }, { "epoch": 1.0448160535117057, "grad_norm": 0.7362264394760132, "learning_rate": 6.573615488518685e-05, "loss": 1.6863, "step": 781 }, { "epoch": 1.0461538461538462, "grad_norm": 0.7315294742584229, "learning_rate": 6.569113012156687e-05, "loss": 1.8927, "step": 782 }, { "epoch": 1.0474916387959867, "grad_norm": 0.9518543481826782, "learning_rate": 6.564610535794687e-05, "loss": 1.3103, "step": 783 }, { "epoch": 1.048829431438127, "grad_norm": 0.8763629794120789, "learning_rate": 6.560108059432688e-05, "loss": 1.9248, "step": 784 }, { "epoch": 1.0501672240802675, "grad_norm": 0.8956972360610962, "learning_rate": 6.55560558307069e-05, "loss": 2.4056, "step": 785 }, { "epoch": 1.051505016722408, "grad_norm": 1.1922390460968018, "learning_rate": 6.551103106708691e-05, "loss": 1.3751, "step": 786 }, { "epoch": 1.0528428093645485, "grad_norm": 0.8947060108184814, "learning_rate": 6.54660063034669e-05, "loss": 1.7996, "step": 787 }, { "epoch": 1.054180602006689, "grad_norm": 0.7719162702560425, "learning_rate": 6.542098153984691e-05, "loss": 1.7215, "step": 788 }, { "epoch": 1.0555183946488294, "grad_norm": 0.8137053847312927, "learning_rate": 6.537595677622693e-05, "loss": 1.6754, "step": 789 }, { "epoch": 1.0568561872909699, "grad_norm": 0.8411099910736084, "learning_rate": 6.533093201260694e-05, "loss": 1.7742, "step": 790 }, { "epoch": 1.0581939799331104, "grad_norm": 0.9793359041213989, "learning_rate": 6.528590724898694e-05, "loss": 1.5735, "step": 791 }, { "epoch": 1.0595317725752509, "grad_norm": 0.9073910117149353, "learning_rate": 6.524088248536695e-05, "loss": 2.2354, "step": 792 }, { "epoch": 1.0608695652173914, "grad_norm": 0.7567430138587952, "learning_rate": 6.519585772174697e-05, "loss": 1.5626, "step": 793 }, { "epoch": 1.0622073578595317, "grad_norm": 1.06880521774292, "learning_rate": 6.515083295812698e-05, "loss": 1.2051, "step": 794 }, { "epoch": 1.0635451505016722, "grad_norm": 0.8187976479530334, "learning_rate": 6.510580819450697e-05, "loss": 1.631, "step": 795 }, { "epoch": 1.0648829431438127, "grad_norm": 0.7056008577346802, "learning_rate": 6.5060783430887e-05, "loss": 1.5343, "step": 796 }, { "epoch": 1.0662207357859532, "grad_norm": 0.9352847337722778, "learning_rate": 6.5015758667267e-05, "loss": 1.6003, "step": 797 }, { "epoch": 1.0675585284280937, "grad_norm": 1.8865182399749756, "learning_rate": 6.497073390364701e-05, "loss": 1.7215, "step": 798 }, { "epoch": 1.068896321070234, "grad_norm": 1.0380245447158813, "learning_rate": 6.492570914002702e-05, "loss": 1.6079, "step": 799 }, { "epoch": 1.0702341137123745, "grad_norm": 1.001865029335022, "learning_rate": 6.488068437640703e-05, "loss": 1.0845, "step": 800 }, { "epoch": 1.071571906354515, "grad_norm": 1.0492470264434814, "learning_rate": 6.483565961278704e-05, "loss": 1.4899, "step": 801 }, { "epoch": 1.0729096989966556, "grad_norm": 1.1831380128860474, "learning_rate": 6.479063484916705e-05, "loss": 0.7326, "step": 802 }, { "epoch": 1.074247491638796, "grad_norm": 1.1278901100158691, "learning_rate": 6.474561008554705e-05, "loss": 1.5677, "step": 803 }, { "epoch": 1.0755852842809364, "grad_norm": 0.8802807927131653, "learning_rate": 6.470058532192706e-05, "loss": 1.7562, "step": 804 }, { "epoch": 1.0769230769230769, "grad_norm": 1.0205730199813843, "learning_rate": 6.465556055830707e-05, "loss": 0.9363, "step": 805 }, { "epoch": 1.0782608695652174, "grad_norm": 1.9332672357559204, "learning_rate": 6.461053579468708e-05, "loss": 1.2796, "step": 806 }, { "epoch": 1.079598662207358, "grad_norm": 1.5062698125839233, "learning_rate": 6.456551103106709e-05, "loss": 1.3188, "step": 807 }, { "epoch": 1.0809364548494984, "grad_norm": 0.7134535312652588, "learning_rate": 6.45204862674471e-05, "loss": 1.928, "step": 808 }, { "epoch": 1.0822742474916387, "grad_norm": 1.903796911239624, "learning_rate": 6.44754615038271e-05, "loss": 1.1583, "step": 809 }, { "epoch": 1.0836120401337792, "grad_norm": 0.7796040773391724, "learning_rate": 6.443043674020711e-05, "loss": 1.9661, "step": 810 }, { "epoch": 1.0849498327759197, "grad_norm": 0.7826983332633972, "learning_rate": 6.438541197658712e-05, "loss": 1.8202, "step": 811 }, { "epoch": 1.0862876254180602, "grad_norm": 0.7324210405349731, "learning_rate": 6.434038721296713e-05, "loss": 1.5366, "step": 812 }, { "epoch": 1.0876254180602007, "grad_norm": 0.7018693685531616, "learning_rate": 6.429536244934714e-05, "loss": 1.8534, "step": 813 }, { "epoch": 1.088963210702341, "grad_norm": 0.876075029373169, "learning_rate": 6.425033768572716e-05, "loss": 1.2189, "step": 814 }, { "epoch": 1.0903010033444815, "grad_norm": 1.021077275276184, "learning_rate": 6.420531292210716e-05, "loss": 1.4601, "step": 815 }, { "epoch": 1.091638795986622, "grad_norm": 1.157410740852356, "learning_rate": 6.416028815848717e-05, "loss": 0.9841, "step": 816 }, { "epoch": 1.0929765886287626, "grad_norm": 0.6732658743858337, "learning_rate": 6.411526339486717e-05, "loss": 1.9742, "step": 817 }, { "epoch": 1.094314381270903, "grad_norm": 0.6634247899055481, "learning_rate": 6.40702386312472e-05, "loss": 1.5411, "step": 818 }, { "epoch": 1.0956521739130434, "grad_norm": 0.7590674161911011, "learning_rate": 6.402521386762719e-05, "loss": 1.028, "step": 819 }, { "epoch": 1.0969899665551839, "grad_norm": 0.8274590969085693, "learning_rate": 6.39801891040072e-05, "loss": 1.6494, "step": 820 }, { "epoch": 1.0983277591973244, "grad_norm": 0.8210983276367188, "learning_rate": 6.393516434038722e-05, "loss": 1.5706, "step": 821 }, { "epoch": 1.099665551839465, "grad_norm": 0.9263409376144409, "learning_rate": 6.389013957676723e-05, "loss": 1.6716, "step": 822 }, { "epoch": 1.1010033444816054, "grad_norm": 0.8053122162818909, "learning_rate": 6.384511481314724e-05, "loss": 1.5703, "step": 823 }, { "epoch": 1.1023411371237457, "grad_norm": 1.2410510778427124, "learning_rate": 6.380009004952723e-05, "loss": 1.6517, "step": 824 }, { "epoch": 1.1036789297658862, "grad_norm": 0.909188449382782, "learning_rate": 6.375506528590726e-05, "loss": 1.177, "step": 825 }, { "epoch": 1.1050167224080267, "grad_norm": 0.673254132270813, "learning_rate": 6.371004052228727e-05, "loss": 1.5251, "step": 826 }, { "epoch": 1.1063545150501672, "grad_norm": 0.9418015480041504, "learning_rate": 6.366501575866727e-05, "loss": 1.2767, "step": 827 }, { "epoch": 1.1076923076923078, "grad_norm": 0.7347880601882935, "learning_rate": 6.361999099504727e-05, "loss": 1.5897, "step": 828 }, { "epoch": 1.109030100334448, "grad_norm": 0.9692310094833374, "learning_rate": 6.357496623142729e-05, "loss": 1.909, "step": 829 }, { "epoch": 1.1103678929765886, "grad_norm": 0.8513740301132202, "learning_rate": 6.35299414678073e-05, "loss": 1.2062, "step": 830 }, { "epoch": 1.111705685618729, "grad_norm": 1.5622446537017822, "learning_rate": 6.348491670418731e-05, "loss": 0.8553, "step": 831 }, { "epoch": 1.1130434782608696, "grad_norm": 0.8817566633224487, "learning_rate": 6.343989194056732e-05, "loss": 1.1544, "step": 832 }, { "epoch": 1.11438127090301, "grad_norm": 0.8441960215568542, "learning_rate": 6.339486717694733e-05, "loss": 1.2105, "step": 833 }, { "epoch": 1.1157190635451506, "grad_norm": 1.094710350036621, "learning_rate": 6.334984241332733e-05, "loss": 1.3695, "step": 834 }, { "epoch": 1.117056856187291, "grad_norm": 1.8443351984024048, "learning_rate": 6.330481764970734e-05, "loss": 2.2446, "step": 835 }, { "epoch": 1.1183946488294314, "grad_norm": 1.042940616607666, "learning_rate": 6.325979288608735e-05, "loss": 1.9706, "step": 836 }, { "epoch": 1.119732441471572, "grad_norm": 0.7949560880661011, "learning_rate": 6.321476812246736e-05, "loss": 1.7105, "step": 837 }, { "epoch": 1.1210702341137124, "grad_norm": 0.8403007388114929, "learning_rate": 6.316974335884737e-05, "loss": 1.6519, "step": 838 }, { "epoch": 1.122408026755853, "grad_norm": 1.5427777767181396, "learning_rate": 6.312471859522738e-05, "loss": 1.136, "step": 839 }, { "epoch": 1.1237458193979932, "grad_norm": 0.619282603263855, "learning_rate": 6.307969383160739e-05, "loss": 1.308, "step": 840 }, { "epoch": 1.1250836120401337, "grad_norm": 0.8113716244697571, "learning_rate": 6.30346690679874e-05, "loss": 1.9532, "step": 841 }, { "epoch": 1.1264214046822743, "grad_norm": 0.6016692519187927, "learning_rate": 6.29896443043674e-05, "loss": 1.3914, "step": 842 }, { "epoch": 1.1277591973244148, "grad_norm": 0.9051966071128845, "learning_rate": 6.294461954074741e-05, "loss": 1.6739, "step": 843 }, { "epoch": 1.1290969899665553, "grad_norm": 0.8936472535133362, "learning_rate": 6.289959477712742e-05, "loss": 1.5069, "step": 844 }, { "epoch": 1.1304347826086956, "grad_norm": 0.8642289042472839, "learning_rate": 6.285457001350743e-05, "loss": 1.6832, "step": 845 }, { "epoch": 1.131772575250836, "grad_norm": 1.186710238456726, "learning_rate": 6.280954524988744e-05, "loss": 1.6192, "step": 846 }, { "epoch": 1.1331103678929766, "grad_norm": 0.723852813243866, "learning_rate": 6.276452048626746e-05, "loss": 1.8716, "step": 847 }, { "epoch": 1.134448160535117, "grad_norm": 0.9492783546447754, "learning_rate": 6.271949572264745e-05, "loss": 1.6641, "step": 848 }, { "epoch": 1.1357859531772576, "grad_norm": 1.0893625020980835, "learning_rate": 6.267447095902746e-05, "loss": 1.5037, "step": 849 }, { "epoch": 1.137123745819398, "grad_norm": 1.28972589969635, "learning_rate": 6.262944619540749e-05, "loss": 1.0721, "step": 850 }, { "epoch": 1.1384615384615384, "grad_norm": 1.0664037466049194, "learning_rate": 6.25844214317875e-05, "loss": 1.6358, "step": 851 }, { "epoch": 1.139799331103679, "grad_norm": 0.8718981146812439, "learning_rate": 6.253939666816749e-05, "loss": 1.7783, "step": 852 }, { "epoch": 1.1411371237458194, "grad_norm": 0.7964752912521362, "learning_rate": 6.24943719045475e-05, "loss": 1.8482, "step": 853 }, { "epoch": 1.14247491638796, "grad_norm": 1.1163746118545532, "learning_rate": 6.244934714092752e-05, "loss": 1.5371, "step": 854 }, { "epoch": 1.1438127090301002, "grad_norm": 0.8743355870246887, "learning_rate": 6.240432237730753e-05, "loss": 1.5555, "step": 855 }, { "epoch": 1.1451505016722408, "grad_norm": 1.3836225271224976, "learning_rate": 6.235929761368752e-05, "loss": 1.138, "step": 856 }, { "epoch": 1.1464882943143813, "grad_norm": 0.9284037351608276, "learning_rate": 6.231427285006755e-05, "loss": 1.8, "step": 857 }, { "epoch": 1.1478260869565218, "grad_norm": 2.1215121746063232, "learning_rate": 6.226924808644755e-05, "loss": 1.886, "step": 858 }, { "epoch": 1.1491638795986623, "grad_norm": 0.8018773198127747, "learning_rate": 6.222422332282756e-05, "loss": 1.4954, "step": 859 }, { "epoch": 1.1505016722408028, "grad_norm": 0.7150079607963562, "learning_rate": 6.217919855920756e-05, "loss": 1.2708, "step": 860 }, { "epoch": 1.151839464882943, "grad_norm": 3.145190954208374, "learning_rate": 6.213417379558758e-05, "loss": 1.2389, "step": 861 }, { "epoch": 1.1531772575250836, "grad_norm": 0.9456586837768555, "learning_rate": 6.208914903196759e-05, "loss": 2.124, "step": 862 }, { "epoch": 1.154515050167224, "grad_norm": 1.0844643115997314, "learning_rate": 6.20441242683476e-05, "loss": 1.4282, "step": 863 }, { "epoch": 1.1558528428093646, "grad_norm": 0.8809278607368469, "learning_rate": 6.199909950472759e-05, "loss": 1.8585, "step": 864 }, { "epoch": 1.1571906354515051, "grad_norm": 0.9258586764335632, "learning_rate": 6.195407474110761e-05, "loss": 1.1901, "step": 865 }, { "epoch": 1.1585284280936454, "grad_norm": 0.8438937664031982, "learning_rate": 6.190904997748762e-05, "loss": 1.6645, "step": 866 }, { "epoch": 1.159866220735786, "grad_norm": 0.8730605244636536, "learning_rate": 6.186402521386763e-05, "loss": 1.5863, "step": 867 }, { "epoch": 1.1612040133779264, "grad_norm": 0.8524886965751648, "learning_rate": 6.181900045024764e-05, "loss": 1.2708, "step": 868 }, { "epoch": 1.162541806020067, "grad_norm": 0.6810708045959473, "learning_rate": 6.177397568662765e-05, "loss": 1.3026, "step": 869 }, { "epoch": 1.1638795986622075, "grad_norm": 1.1528704166412354, "learning_rate": 6.172895092300766e-05, "loss": 1.7219, "step": 870 }, { "epoch": 1.1652173913043478, "grad_norm": 2.7838613986968994, "learning_rate": 6.168392615938767e-05, "loss": 1.4851, "step": 871 }, { "epoch": 1.1665551839464883, "grad_norm": 0.8822845816612244, "learning_rate": 6.163890139576767e-05, "loss": 1.652, "step": 872 }, { "epoch": 1.1678929765886288, "grad_norm": 0.6918888688087463, "learning_rate": 6.159387663214768e-05, "loss": 1.2737, "step": 873 }, { "epoch": 1.1692307692307693, "grad_norm": 1.1061296463012695, "learning_rate": 6.154885186852769e-05, "loss": 1.4387, "step": 874 }, { "epoch": 1.1705685618729098, "grad_norm": 0.6670346856117249, "learning_rate": 6.150382710490771e-05, "loss": 1.9764, "step": 875 }, { "epoch": 1.17190635451505, "grad_norm": 0.8202233910560608, "learning_rate": 6.145880234128771e-05, "loss": 1.6687, "step": 876 }, { "epoch": 1.1732441471571906, "grad_norm": 0.7462705373764038, "learning_rate": 6.141377757766772e-05, "loss": 1.7787, "step": 877 }, { "epoch": 1.1745819397993311, "grad_norm": 0.8808805346488953, "learning_rate": 6.136875281404773e-05, "loss": 1.2746, "step": 878 }, { "epoch": 1.1759197324414716, "grad_norm": 0.7754594087600708, "learning_rate": 6.132372805042775e-05, "loss": 1.9316, "step": 879 }, { "epoch": 1.1772575250836121, "grad_norm": 1.5388227701187134, "learning_rate": 6.127870328680774e-05, "loss": 1.6537, "step": 880 }, { "epoch": 1.1785953177257524, "grad_norm": 2.2294013500213623, "learning_rate": 6.123367852318775e-05, "loss": 1.3761, "step": 881 }, { "epoch": 1.179933110367893, "grad_norm": 0.7639999389648438, "learning_rate": 6.118865375956776e-05, "loss": 1.522, "step": 882 }, { "epoch": 1.1812709030100335, "grad_norm": 1.0136057138442993, "learning_rate": 6.114362899594778e-05, "loss": 1.7424, "step": 883 }, { "epoch": 1.182608695652174, "grad_norm": 0.8844318985939026, "learning_rate": 6.109860423232778e-05, "loss": 1.1706, "step": 884 }, { "epoch": 1.1839464882943145, "grad_norm": 0.8913040161132812, "learning_rate": 6.105357946870779e-05, "loss": 1.7598, "step": 885 }, { "epoch": 1.1852842809364548, "grad_norm": 1.07845938205719, "learning_rate": 6.100855470508781e-05, "loss": 1.0585, "step": 886 }, { "epoch": 1.1866220735785953, "grad_norm": 0.9585360288619995, "learning_rate": 6.096352994146781e-05, "loss": 1.0684, "step": 887 }, { "epoch": 1.1879598662207358, "grad_norm": 1.028442621231079, "learning_rate": 6.091850517784782e-05, "loss": 1.8733, "step": 888 }, { "epoch": 1.1892976588628763, "grad_norm": 0.9173264503479004, "learning_rate": 6.087348041422783e-05, "loss": 1.7119, "step": 889 }, { "epoch": 1.1906354515050168, "grad_norm": 1.4423737525939941, "learning_rate": 6.082845565060784e-05, "loss": 1.963, "step": 890 }, { "epoch": 1.191973244147157, "grad_norm": 0.9499053955078125, "learning_rate": 6.0783430886987844e-05, "loss": 1.4102, "step": 891 }, { "epoch": 1.1933110367892976, "grad_norm": 0.7227963805198669, "learning_rate": 6.073840612336785e-05, "loss": 1.8352, "step": 892 }, { "epoch": 1.1946488294314381, "grad_norm": 0.8120591044425964, "learning_rate": 6.069338135974787e-05, "loss": 1.6841, "step": 893 }, { "epoch": 1.1959866220735786, "grad_norm": 0.7347455024719238, "learning_rate": 6.0648356596127876e-05, "loss": 1.8131, "step": 894 }, { "epoch": 1.1973244147157192, "grad_norm": 0.9052204489707947, "learning_rate": 6.060333183250788e-05, "loss": 1.4982, "step": 895 }, { "epoch": 1.1986622073578594, "grad_norm": 0.8606255650520325, "learning_rate": 6.055830706888789e-05, "loss": 1.1158, "step": 896 }, { "epoch": 1.2, "grad_norm": 1.5991266965866089, "learning_rate": 6.05132823052679e-05, "loss": 1.4721, "step": 897 }, { "epoch": 1.2013377926421405, "grad_norm": 0.9641832113265991, "learning_rate": 6.046825754164791e-05, "loss": 1.9599, "step": 898 }, { "epoch": 1.202675585284281, "grad_norm": 0.9988728165626526, "learning_rate": 6.042323277802792e-05, "loss": 1.4815, "step": 899 }, { "epoch": 1.2040133779264215, "grad_norm": 0.8313688635826111, "learning_rate": 6.037820801440792e-05, "loss": 1.0745, "step": 900 }, { "epoch": 1.2053511705685618, "grad_norm": 0.6813498735427856, "learning_rate": 6.033318325078794e-05, "loss": 1.3578, "step": 901 }, { "epoch": 1.2066889632107023, "grad_norm": 1.4857420921325684, "learning_rate": 6.0288158487167945e-05, "loss": 1.5083, "step": 902 }, { "epoch": 1.2080267558528428, "grad_norm": 0.7191517353057861, "learning_rate": 6.0243133723547954e-05, "loss": 1.8211, "step": 903 }, { "epoch": 1.2093645484949833, "grad_norm": 0.6989690065383911, "learning_rate": 6.019810895992797e-05, "loss": 1.8502, "step": 904 }, { "epoch": 1.2107023411371238, "grad_norm": 1.0212314128875732, "learning_rate": 6.015308419630797e-05, "loss": 1.9849, "step": 905 }, { "epoch": 1.2120401337792641, "grad_norm": 2.5734026432037354, "learning_rate": 6.010805943268798e-05, "loss": 1.2135, "step": 906 }, { "epoch": 1.2133779264214046, "grad_norm": 0.762624979019165, "learning_rate": 6.006303466906799e-05, "loss": 1.6969, "step": 907 }, { "epoch": 1.2147157190635451, "grad_norm": 0.9556645154953003, "learning_rate": 6.0018009905448004e-05, "loss": 1.4878, "step": 908 }, { "epoch": 1.2160535117056857, "grad_norm": 0.6057252883911133, "learning_rate": 5.9972985141828005e-05, "loss": 1.4873, "step": 909 }, { "epoch": 1.2173913043478262, "grad_norm": 1.1988812685012817, "learning_rate": 5.9927960378208014e-05, "loss": 1.4444, "step": 910 }, { "epoch": 1.2187290969899665, "grad_norm": 0.646827757358551, "learning_rate": 5.988293561458803e-05, "loss": 1.1479, "step": 911 }, { "epoch": 1.220066889632107, "grad_norm": 0.762706458568573, "learning_rate": 5.983791085096804e-05, "loss": 1.5255, "step": 912 }, { "epoch": 1.2214046822742475, "grad_norm": 0.9942954778671265, "learning_rate": 5.9792886087348046e-05, "loss": 1.252, "step": 913 }, { "epoch": 1.222742474916388, "grad_norm": 1.1604199409484863, "learning_rate": 5.974786132372805e-05, "loss": 1.7593, "step": 914 }, { "epoch": 1.2240802675585285, "grad_norm": 1.2815771102905273, "learning_rate": 5.9702836560108064e-05, "loss": 1.7823, "step": 915 }, { "epoch": 1.2254180602006688, "grad_norm": 0.7978178858757019, "learning_rate": 5.965781179648807e-05, "loss": 2.0054, "step": 916 }, { "epoch": 1.2267558528428093, "grad_norm": 0.6075098514556885, "learning_rate": 5.961278703286808e-05, "loss": 2.0769, "step": 917 }, { "epoch": 1.2280936454849498, "grad_norm": 0.7329334020614624, "learning_rate": 5.956776226924808e-05, "loss": 1.7853, "step": 918 }, { "epoch": 1.2294314381270903, "grad_norm": 0.689140260219574, "learning_rate": 5.95227375056281e-05, "loss": 1.9128, "step": 919 }, { "epoch": 1.2307692307692308, "grad_norm": 0.7186599969863892, "learning_rate": 5.9477712742008107e-05, "loss": 1.8959, "step": 920 }, { "epoch": 1.2321070234113711, "grad_norm": 1.0237096548080444, "learning_rate": 5.9432687978388115e-05, "loss": 2.0922, "step": 921 }, { "epoch": 1.2334448160535116, "grad_norm": 1.1331475973129272, "learning_rate": 5.938766321476813e-05, "loss": 1.5555, "step": 922 }, { "epoch": 1.2347826086956522, "grad_norm": 0.5831222534179688, "learning_rate": 5.934263845114814e-05, "loss": 1.3537, "step": 923 }, { "epoch": 1.2361204013377927, "grad_norm": 0.8829436302185059, "learning_rate": 5.929761368752814e-05, "loss": 1.56, "step": 924 }, { "epoch": 1.2374581939799332, "grad_norm": 0.6521071791648865, "learning_rate": 5.925258892390815e-05, "loss": 1.2735, "step": 925 }, { "epoch": 1.2387959866220735, "grad_norm": 0.8493558168411255, "learning_rate": 5.9207564160288165e-05, "loss": 1.8301, "step": 926 }, { "epoch": 1.240133779264214, "grad_norm": 0.889323890209198, "learning_rate": 5.9162539396668173e-05, "loss": 1.47, "step": 927 }, { "epoch": 1.2414715719063545, "grad_norm": 1.5412036180496216, "learning_rate": 5.9117514633048175e-05, "loss": 1.8443, "step": 928 }, { "epoch": 1.242809364548495, "grad_norm": 0.8223730325698853, "learning_rate": 5.907248986942819e-05, "loss": 1.4285, "step": 929 }, { "epoch": 1.2441471571906355, "grad_norm": 0.9287546277046204, "learning_rate": 5.90274651058082e-05, "loss": 1.5424, "step": 930 }, { "epoch": 1.2454849498327758, "grad_norm": 1.4095571041107178, "learning_rate": 5.898244034218821e-05, "loss": 1.1844, "step": 931 }, { "epoch": 1.2468227424749163, "grad_norm": 0.9519053101539612, "learning_rate": 5.893741557856821e-05, "loss": 1.5734, "step": 932 }, { "epoch": 1.2481605351170568, "grad_norm": 1.132487416267395, "learning_rate": 5.889239081494823e-05, "loss": 1.6002, "step": 933 }, { "epoch": 1.2494983277591973, "grad_norm": 0.6693798899650574, "learning_rate": 5.8847366051328234e-05, "loss": 1.8101, "step": 934 }, { "epoch": 1.2508361204013378, "grad_norm": 1.3155882358551025, "learning_rate": 5.880234128770824e-05, "loss": 1.6621, "step": 935 }, { "epoch": 1.2521739130434781, "grad_norm": 2.0933713912963867, "learning_rate": 5.8757316524088244e-05, "loss": 1.4421, "step": 936 }, { "epoch": 1.2535117056856186, "grad_norm": 0.9255110025405884, "learning_rate": 5.8712291760468266e-05, "loss": 2.0006, "step": 937 }, { "epoch": 1.2548494983277592, "grad_norm": 0.7264999151229858, "learning_rate": 5.866726699684827e-05, "loss": 1.9256, "step": 938 }, { "epoch": 1.2561872909698997, "grad_norm": 0.767331600189209, "learning_rate": 5.8622242233228277e-05, "loss": 1.8369, "step": 939 }, { "epoch": 1.2575250836120402, "grad_norm": 1.104103684425354, "learning_rate": 5.857721746960829e-05, "loss": 1.576, "step": 940 }, { "epoch": 1.2588628762541805, "grad_norm": 0.7990514636039734, "learning_rate": 5.85321927059883e-05, "loss": 1.8798, "step": 941 }, { "epoch": 1.2602006688963212, "grad_norm": 0.6627078056335449, "learning_rate": 5.84871679423683e-05, "loss": 1.8592, "step": 942 }, { "epoch": 1.2615384615384615, "grad_norm": 0.730588436126709, "learning_rate": 5.844214317874831e-05, "loss": 0.9581, "step": 943 }, { "epoch": 1.262876254180602, "grad_norm": 0.5782043933868408, "learning_rate": 5.8397118415128326e-05, "loss": 1.5693, "step": 944 }, { "epoch": 1.2642140468227425, "grad_norm": 1.0330079793930054, "learning_rate": 5.8352093651508335e-05, "loss": 1.3259, "step": 945 }, { "epoch": 1.2655518394648828, "grad_norm": 0.7264488935470581, "learning_rate": 5.830706888788834e-05, "loss": 1.8417, "step": 946 }, { "epoch": 1.2668896321070235, "grad_norm": 0.8809522986412048, "learning_rate": 5.826204412426836e-05, "loss": 1.3526, "step": 947 }, { "epoch": 1.2682274247491638, "grad_norm": 0.9869521856307983, "learning_rate": 5.821701936064836e-05, "loss": 1.3958, "step": 948 }, { "epoch": 1.2695652173913043, "grad_norm": 0.7876471281051636, "learning_rate": 5.817199459702837e-05, "loss": 1.5426, "step": 949 }, { "epoch": 1.2709030100334449, "grad_norm": 0.836608350276947, "learning_rate": 5.812696983340837e-05, "loss": 1.8164, "step": 950 }, { "epoch": 1.2722408026755851, "grad_norm": 0.7498127222061157, "learning_rate": 5.808194506978839e-05, "loss": 1.7363, "step": 951 }, { "epoch": 1.2735785953177259, "grad_norm": 0.9935570955276489, "learning_rate": 5.8036920306168395e-05, "loss": 1.5462, "step": 952 }, { "epoch": 1.2749163879598662, "grad_norm": 1.1275084018707275, "learning_rate": 5.7991895542548404e-05, "loss": 1.5394, "step": 953 }, { "epoch": 1.2762541806020067, "grad_norm": 0.996572732925415, "learning_rate": 5.7946870778928405e-05, "loss": 1.5601, "step": 954 }, { "epoch": 1.2775919732441472, "grad_norm": 1.216365933418274, "learning_rate": 5.790184601530843e-05, "loss": 1.4987, "step": 955 }, { "epoch": 1.2789297658862877, "grad_norm": 1.7709954977035522, "learning_rate": 5.785682125168843e-05, "loss": 1.3511, "step": 956 }, { "epoch": 1.2802675585284282, "grad_norm": 0.7698276042938232, "learning_rate": 5.781179648806844e-05, "loss": 1.8515, "step": 957 }, { "epoch": 1.2816053511705685, "grad_norm": 0.8181136846542358, "learning_rate": 5.776677172444845e-05, "loss": 1.5038, "step": 958 }, { "epoch": 1.282943143812709, "grad_norm": 0.9560226798057556, "learning_rate": 5.772174696082846e-05, "loss": 1.8003, "step": 959 }, { "epoch": 1.2842809364548495, "grad_norm": 0.9249825477600098, "learning_rate": 5.7676722197208464e-05, "loss": 1.8096, "step": 960 }, { "epoch": 1.28561872909699, "grad_norm": 0.7355000376701355, "learning_rate": 5.763169743358847e-05, "loss": 1.9474, "step": 961 }, { "epoch": 1.2869565217391306, "grad_norm": 0.7304123640060425, "learning_rate": 5.758667266996849e-05, "loss": 2.0113, "step": 962 }, { "epoch": 1.2882943143812708, "grad_norm": 1.1533622741699219, "learning_rate": 5.7541647906348496e-05, "loss": 1.0177, "step": 963 }, { "epoch": 1.2896321070234114, "grad_norm": 0.8715507984161377, "learning_rate": 5.74966231427285e-05, "loss": 1.4832, "step": 964 }, { "epoch": 1.2909698996655519, "grad_norm": 1.0615633726119995, "learning_rate": 5.745159837910852e-05, "loss": 1.5385, "step": 965 }, { "epoch": 1.2923076923076924, "grad_norm": 1.2473442554473877, "learning_rate": 5.740657361548852e-05, "loss": 1.3534, "step": 966 }, { "epoch": 1.293645484949833, "grad_norm": 1.1255221366882324, "learning_rate": 5.736154885186853e-05, "loss": 1.666, "step": 967 }, { "epoch": 1.2949832775919732, "grad_norm": 1.0651443004608154, "learning_rate": 5.731652408824853e-05, "loss": 1.7063, "step": 968 }, { "epoch": 1.2963210702341137, "grad_norm": 0.7863209247589111, "learning_rate": 5.7271499324628554e-05, "loss": 1.8771, "step": 969 }, { "epoch": 1.2976588628762542, "grad_norm": 1.0300050973892212, "learning_rate": 5.7226474561008556e-05, "loss": 1.6595, "step": 970 }, { "epoch": 1.2989966555183947, "grad_norm": 1.2955358028411865, "learning_rate": 5.7181449797388565e-05, "loss": 1.1926, "step": 971 }, { "epoch": 1.3003344481605352, "grad_norm": 1.4787100553512573, "learning_rate": 5.7136425033768573e-05, "loss": 1.6715, "step": 972 }, { "epoch": 1.3016722408026755, "grad_norm": 1.446374773979187, "learning_rate": 5.709140027014859e-05, "loss": 1.3466, "step": 973 }, { "epoch": 1.303010033444816, "grad_norm": 0.8684519529342651, "learning_rate": 5.704637550652859e-05, "loss": 1.3811, "step": 974 }, { "epoch": 1.3043478260869565, "grad_norm": 4.5600361824035645, "learning_rate": 5.70013507429086e-05, "loss": 1.3829, "step": 975 }, { "epoch": 1.305685618729097, "grad_norm": 1.1232842206954956, "learning_rate": 5.6956325979288615e-05, "loss": 1.7418, "step": 976 }, { "epoch": 1.3070234113712376, "grad_norm": 0.8644034266471863, "learning_rate": 5.691130121566862e-05, "loss": 1.3921, "step": 977 }, { "epoch": 1.3083612040133779, "grad_norm": 1.126865029335022, "learning_rate": 5.6866276452048625e-05, "loss": 1.2891, "step": 978 }, { "epoch": 1.3096989966555184, "grad_norm": 0.7839686870574951, "learning_rate": 5.6821251688428634e-05, "loss": 1.6133, "step": 979 }, { "epoch": 1.3110367892976589, "grad_norm": 0.9094774127006531, "learning_rate": 5.677622692480865e-05, "loss": 1.518, "step": 980 }, { "epoch": 1.3123745819397994, "grad_norm": 1.0354312658309937, "learning_rate": 5.673120216118866e-05, "loss": 1.1613, "step": 981 }, { "epoch": 1.31371237458194, "grad_norm": 1.0073140859603882, "learning_rate": 5.6686177397568666e-05, "loss": 2.0232, "step": 982 }, { "epoch": 1.3150501672240802, "grad_norm": 0.9858584403991699, "learning_rate": 5.664115263394868e-05, "loss": 0.9684, "step": 983 }, { "epoch": 1.3163879598662207, "grad_norm": 0.9003674387931824, "learning_rate": 5.659612787032868e-05, "loss": 1.4406, "step": 984 }, { "epoch": 1.3177257525083612, "grad_norm": 0.771503210067749, "learning_rate": 5.655110310670869e-05, "loss": 1.6696, "step": 985 }, { "epoch": 1.3190635451505017, "grad_norm": 2.726419687271118, "learning_rate": 5.65060783430887e-05, "loss": 1.1897, "step": 986 }, { "epoch": 1.3204013377926422, "grad_norm": 2.211148738861084, "learning_rate": 5.6461053579468716e-05, "loss": 2.0176, "step": 987 }, { "epoch": 1.3217391304347825, "grad_norm": 0.803394615650177, "learning_rate": 5.641602881584872e-05, "loss": 1.6801, "step": 988 }, { "epoch": 1.323076923076923, "grad_norm": 0.9389605522155762, "learning_rate": 5.6371004052228726e-05, "loss": 1.8548, "step": 989 }, { "epoch": 1.3244147157190636, "grad_norm": 1.031009316444397, "learning_rate": 5.6325979288608735e-05, "loss": 1.5096, "step": 990 }, { "epoch": 1.325752508361204, "grad_norm": 1.1561170816421509, "learning_rate": 5.628095452498875e-05, "loss": 1.7523, "step": 991 }, { "epoch": 1.3270903010033446, "grad_norm": 0.7659590840339661, "learning_rate": 5.623592976136875e-05, "loss": 1.0119, "step": 992 }, { "epoch": 1.3284280936454849, "grad_norm": 0.6387937664985657, "learning_rate": 5.619090499774876e-05, "loss": 2.0261, "step": 993 }, { "epoch": 1.3297658862876254, "grad_norm": 0.7081699967384338, "learning_rate": 5.6145880234128776e-05, "loss": 1.602, "step": 994 }, { "epoch": 1.3311036789297659, "grad_norm": 0.6930742263793945, "learning_rate": 5.6100855470508785e-05, "loss": 1.7137, "step": 995 }, { "epoch": 1.3324414715719064, "grad_norm": 0.7945417165756226, "learning_rate": 5.605583070688879e-05, "loss": 1.384, "step": 996 }, { "epoch": 1.333779264214047, "grad_norm": 0.736883282661438, "learning_rate": 5.6010805943268795e-05, "loss": 1.7751, "step": 997 }, { "epoch": 1.3351170568561872, "grad_norm": 0.7430248260498047, "learning_rate": 5.596578117964881e-05, "loss": 1.4957, "step": 998 }, { "epoch": 1.3364548494983277, "grad_norm": 0.7859342694282532, "learning_rate": 5.592075641602882e-05, "loss": 1.6672, "step": 999 }, { "epoch": 1.3377926421404682, "grad_norm": 1.0065340995788574, "learning_rate": 5.587573165240883e-05, "loss": 1.4852, "step": 1000 }, { "epoch": 1.3391304347826087, "grad_norm": 0.8629280924797058, "learning_rate": 5.583070688878884e-05, "loss": 1.6693, "step": 1001 }, { "epoch": 1.3404682274247492, "grad_norm": 0.7851661443710327, "learning_rate": 5.5785682125168845e-05, "loss": 1.319, "step": 1002 }, { "epoch": 1.3418060200668895, "grad_norm": 1.035677433013916, "learning_rate": 5.574065736154885e-05, "loss": 1.6294, "step": 1003 }, { "epoch": 1.34314381270903, "grad_norm": 1.3206753730773926, "learning_rate": 5.569563259792886e-05, "loss": 0.8565, "step": 1004 }, { "epoch": 1.3444816053511706, "grad_norm": 0.796152651309967, "learning_rate": 5.565060783430888e-05, "loss": 1.9462, "step": 1005 }, { "epoch": 1.345819397993311, "grad_norm": 1.0995591878890991, "learning_rate": 5.5605583070688886e-05, "loss": 1.8064, "step": 1006 }, { "epoch": 1.3471571906354516, "grad_norm": 0.8635403513908386, "learning_rate": 5.556055830706889e-05, "loss": 1.411, "step": 1007 }, { "epoch": 1.3484949832775919, "grad_norm": 1.2002208232879639, "learning_rate": 5.5515533543448896e-05, "loss": 1.4774, "step": 1008 }, { "epoch": 1.3498327759197324, "grad_norm": 0.9561333656311035, "learning_rate": 5.547050877982891e-05, "loss": 1.4763, "step": 1009 }, { "epoch": 1.351170568561873, "grad_norm": 0.8307086229324341, "learning_rate": 5.542548401620892e-05, "loss": 1.7697, "step": 1010 }, { "epoch": 1.3525083612040134, "grad_norm": 0.797065794467926, "learning_rate": 5.538045925258892e-05, "loss": 1.6327, "step": 1011 }, { "epoch": 1.353846153846154, "grad_norm": 1.3037409782409668, "learning_rate": 5.533543448896894e-05, "loss": 0.8671, "step": 1012 }, { "epoch": 1.3551839464882942, "grad_norm": 0.8096974492073059, "learning_rate": 5.5290409725348946e-05, "loss": 1.7238, "step": 1013 }, { "epoch": 1.3565217391304347, "grad_norm": 0.9276493191719055, "learning_rate": 5.5245384961728954e-05, "loss": 1.3982, "step": 1014 }, { "epoch": 1.3578595317725752, "grad_norm": 0.7602790594100952, "learning_rate": 5.5200360198108956e-05, "loss": 1.818, "step": 1015 }, { "epoch": 1.3591973244147157, "grad_norm": 0.7128927111625671, "learning_rate": 5.515533543448897e-05, "loss": 1.8052, "step": 1016 }, { "epoch": 1.3605351170568563, "grad_norm": 1.1914445161819458, "learning_rate": 5.511031067086898e-05, "loss": 1.6337, "step": 1017 }, { "epoch": 1.3618729096989965, "grad_norm": 2.203735828399658, "learning_rate": 5.506528590724899e-05, "loss": 1.4609, "step": 1018 }, { "epoch": 1.363210702341137, "grad_norm": 1.0320219993591309, "learning_rate": 5.5020261143629004e-05, "loss": 1.2802, "step": 1019 }, { "epoch": 1.3645484949832776, "grad_norm": 0.956654965877533, "learning_rate": 5.497523638000901e-05, "loss": 1.5686, "step": 1020 }, { "epoch": 1.365886287625418, "grad_norm": 0.8791170716285706, "learning_rate": 5.4930211616389015e-05, "loss": 1.4996, "step": 1021 }, { "epoch": 1.3672240802675586, "grad_norm": 0.8812573552131653, "learning_rate": 5.488518685276902e-05, "loss": 1.5253, "step": 1022 }, { "epoch": 1.3685618729096989, "grad_norm": 1.1747952699661255, "learning_rate": 5.484016208914904e-05, "loss": 1.0431, "step": 1023 }, { "epoch": 1.3698996655518394, "grad_norm": 0.892593502998352, "learning_rate": 5.479513732552905e-05, "loss": 1.8528, "step": 1024 }, { "epoch": 1.37123745819398, "grad_norm": 0.8653114438056946, "learning_rate": 5.475011256190905e-05, "loss": 1.6524, "step": 1025 }, { "epoch": 1.3725752508361204, "grad_norm": 1.1998345851898193, "learning_rate": 5.470508779828906e-05, "loss": 1.6844, "step": 1026 }, { "epoch": 1.373913043478261, "grad_norm": 0.8225812315940857, "learning_rate": 5.466006303466907e-05, "loss": 1.5627, "step": 1027 }, { "epoch": 1.3752508361204012, "grad_norm": 2.0456786155700684, "learning_rate": 5.461503827104908e-05, "loss": 1.7507, "step": 1028 }, { "epoch": 1.3765886287625417, "grad_norm": 0.9611921906471252, "learning_rate": 5.457001350742908e-05, "loss": 1.4718, "step": 1029 }, { "epoch": 1.3779264214046822, "grad_norm": 0.7811959385871887, "learning_rate": 5.4524988743809105e-05, "loss": 1.7484, "step": 1030 }, { "epoch": 1.3792642140468228, "grad_norm": 0.9279390573501587, "learning_rate": 5.447996398018911e-05, "loss": 1.7394, "step": 1031 }, { "epoch": 1.3806020066889633, "grad_norm": 0.9735180139541626, "learning_rate": 5.4434939216569116e-05, "loss": 1.8804, "step": 1032 }, { "epoch": 1.3819397993311036, "grad_norm": 0.8117311000823975, "learning_rate": 5.438991445294912e-05, "loss": 1.2928, "step": 1033 }, { "epoch": 1.383277591973244, "grad_norm": 0.8646334409713745, "learning_rate": 5.434488968932914e-05, "loss": 1.8303, "step": 1034 }, { "epoch": 1.3846153846153846, "grad_norm": 0.9675334692001343, "learning_rate": 5.429986492570914e-05, "loss": 1.8803, "step": 1035 }, { "epoch": 1.385953177257525, "grad_norm": 0.8046660423278809, "learning_rate": 5.425484016208915e-05, "loss": 1.7378, "step": 1036 }, { "epoch": 1.3872909698996656, "grad_norm": 0.6853929758071899, "learning_rate": 5.4209815398469166e-05, "loss": 1.8967, "step": 1037 }, { "epoch": 1.388628762541806, "grad_norm": 0.9253784418106079, "learning_rate": 5.4164790634849174e-05, "loss": 1.3781, "step": 1038 }, { "epoch": 1.3899665551839464, "grad_norm": 0.8796266317367554, "learning_rate": 5.4119765871229176e-05, "loss": 1.478, "step": 1039 }, { "epoch": 1.391304347826087, "grad_norm": 1.2055280208587646, "learning_rate": 5.4074741107609185e-05, "loss": 1.6066, "step": 1040 }, { "epoch": 1.3926421404682274, "grad_norm": 0.8891392946243286, "learning_rate": 5.40297163439892e-05, "loss": 1.4703, "step": 1041 }, { "epoch": 1.393979933110368, "grad_norm": 1.0517340898513794, "learning_rate": 5.398469158036921e-05, "loss": 1.7708, "step": 1042 }, { "epoch": 1.3953177257525082, "grad_norm": 0.7426386475563049, "learning_rate": 5.393966681674921e-05, "loss": 1.6997, "step": 1043 }, { "epoch": 1.396655518394649, "grad_norm": 0.7888435125350952, "learning_rate": 5.389464205312922e-05, "loss": 2.0114, "step": 1044 }, { "epoch": 1.3979933110367893, "grad_norm": 0.8119488954544067, "learning_rate": 5.3849617289509234e-05, "loss": 1.7638, "step": 1045 }, { "epoch": 1.3993311036789298, "grad_norm": 0.8995359539985657, "learning_rate": 5.380459252588924e-05, "loss": 1.9673, "step": 1046 }, { "epoch": 1.4006688963210703, "grad_norm": 0.903998851776123, "learning_rate": 5.3759567762269245e-05, "loss": 1.664, "step": 1047 }, { "epoch": 1.4020066889632106, "grad_norm": 0.7715796828269958, "learning_rate": 5.371454299864927e-05, "loss": 1.9947, "step": 1048 }, { "epoch": 1.4033444816053513, "grad_norm": 1.005541443824768, "learning_rate": 5.366951823502927e-05, "loss": 0.9912, "step": 1049 }, { "epoch": 1.4046822742474916, "grad_norm": 1.4997984170913696, "learning_rate": 5.362449347140928e-05, "loss": 1.5869, "step": 1050 }, { "epoch": 1.406020066889632, "grad_norm": 0.7256458401679993, "learning_rate": 5.357946870778928e-05, "loss": 1.7334, "step": 1051 }, { "epoch": 1.4073578595317726, "grad_norm": 0.9447379112243652, "learning_rate": 5.35344439441693e-05, "loss": 1.5052, "step": 1052 }, { "epoch": 1.4086956521739131, "grad_norm": 0.7913347482681274, "learning_rate": 5.34894191805493e-05, "loss": 2.0669, "step": 1053 }, { "epoch": 1.4100334448160536, "grad_norm": 1.1335333585739136, "learning_rate": 5.344439441692931e-05, "loss": 1.8613, "step": 1054 }, { "epoch": 1.411371237458194, "grad_norm": 0.7583142518997192, "learning_rate": 5.339936965330933e-05, "loss": 2.0301, "step": 1055 }, { "epoch": 1.4127090301003344, "grad_norm": 1.1084086894989014, "learning_rate": 5.3354344889689335e-05, "loss": 1.6319, "step": 1056 }, { "epoch": 1.414046822742475, "grad_norm": 1.1328237056732178, "learning_rate": 5.330932012606934e-05, "loss": 1.2222, "step": 1057 }, { "epoch": 1.4153846153846155, "grad_norm": 1.1727817058563232, "learning_rate": 5.3264295362449346e-05, "loss": 1.4558, "step": 1058 }, { "epoch": 1.416722408026756, "grad_norm": 1.4107069969177246, "learning_rate": 5.321927059882936e-05, "loss": 1.0919, "step": 1059 }, { "epoch": 1.4180602006688963, "grad_norm": 0.8056756854057312, "learning_rate": 5.317424583520937e-05, "loss": 1.9293, "step": 1060 }, { "epoch": 1.4193979933110368, "grad_norm": 0.814609169960022, "learning_rate": 5.312922107158937e-05, "loss": 1.9909, "step": 1061 }, { "epoch": 1.4207357859531773, "grad_norm": 0.8857481479644775, "learning_rate": 5.308419630796938e-05, "loss": 1.7429, "step": 1062 }, { "epoch": 1.4220735785953178, "grad_norm": 0.7829902172088623, "learning_rate": 5.3039171544349396e-05, "loss": 1.7188, "step": 1063 }, { "epoch": 1.4234113712374583, "grad_norm": 0.8884121179580688, "learning_rate": 5.2994146780729404e-05, "loss": 1.4074, "step": 1064 }, { "epoch": 1.4247491638795986, "grad_norm": 1.5147448778152466, "learning_rate": 5.2949122017109406e-05, "loss": 1.2415, "step": 1065 }, { "epoch": 1.4260869565217391, "grad_norm": 1.034368634223938, "learning_rate": 5.290409725348943e-05, "loss": 1.3288, "step": 1066 }, { "epoch": 1.4274247491638796, "grad_norm": 0.6370885372161865, "learning_rate": 5.285907248986943e-05, "loss": 1.3238, "step": 1067 }, { "epoch": 1.4287625418060201, "grad_norm": 0.7260492444038391, "learning_rate": 5.281404772624944e-05, "loss": 1.2832, "step": 1068 }, { "epoch": 1.4301003344481606, "grad_norm": 0.7294813990592957, "learning_rate": 5.276902296262945e-05, "loss": 1.652, "step": 1069 }, { "epoch": 1.431438127090301, "grad_norm": 0.8732017278671265, "learning_rate": 5.272399819900946e-05, "loss": 1.5793, "step": 1070 }, { "epoch": 1.4327759197324414, "grad_norm": 0.734139084815979, "learning_rate": 5.2678973435389464e-05, "loss": 1.6961, "step": 1071 }, { "epoch": 1.434113712374582, "grad_norm": 0.9525502920150757, "learning_rate": 5.263394867176947e-05, "loss": 1.6207, "step": 1072 }, { "epoch": 1.4354515050167225, "grad_norm": 1.0080657005310059, "learning_rate": 5.258892390814949e-05, "loss": 1.4763, "step": 1073 }, { "epoch": 1.436789297658863, "grad_norm": 0.7712482810020447, "learning_rate": 5.25438991445295e-05, "loss": 1.5778, "step": 1074 }, { "epoch": 1.4381270903010033, "grad_norm": 1.0089412927627563, "learning_rate": 5.24988743809095e-05, "loss": 1.7167, "step": 1075 }, { "epoch": 1.4394648829431438, "grad_norm": 0.7854228019714355, "learning_rate": 5.245384961728951e-05, "loss": 1.0463, "step": 1076 }, { "epoch": 1.4408026755852843, "grad_norm": 1.0776352882385254, "learning_rate": 5.240882485366952e-05, "loss": 1.074, "step": 1077 }, { "epoch": 1.4421404682274248, "grad_norm": 1.3480685949325562, "learning_rate": 5.236380009004953e-05, "loss": 1.6723, "step": 1078 }, { "epoch": 1.4434782608695653, "grad_norm": 0.8922518491744995, "learning_rate": 5.231877532642954e-05, "loss": 1.4558, "step": 1079 }, { "epoch": 1.4448160535117056, "grad_norm": 0.832686185836792, "learning_rate": 5.227375056280954e-05, "loss": 1.4743, "step": 1080 }, { "epoch": 1.4461538461538461, "grad_norm": 0.9173155426979065, "learning_rate": 5.222872579918956e-05, "loss": 1.6476, "step": 1081 }, { "epoch": 1.4474916387959866, "grad_norm": 1.1362848281860352, "learning_rate": 5.2183701035569566e-05, "loss": 1.2387, "step": 1082 }, { "epoch": 1.4488294314381271, "grad_norm": 0.9539154171943665, "learning_rate": 5.2138676271949574e-05, "loss": 1.6121, "step": 1083 }, { "epoch": 1.4501672240802677, "grad_norm": 1.0462729930877686, "learning_rate": 5.209365150832959e-05, "loss": 1.3236, "step": 1084 }, { "epoch": 1.451505016722408, "grad_norm": 0.9178979992866516, "learning_rate": 5.204862674470959e-05, "loss": 1.6152, "step": 1085 }, { "epoch": 1.4528428093645485, "grad_norm": 0.9143950939178467, "learning_rate": 5.20036019810896e-05, "loss": 1.7637, "step": 1086 }, { "epoch": 1.454180602006689, "grad_norm": 0.9704089164733887, "learning_rate": 5.195857721746961e-05, "loss": 0.9608, "step": 1087 }, { "epoch": 1.4555183946488295, "grad_norm": 0.6342671513557434, "learning_rate": 5.1913552453849624e-05, "loss": 1.4244, "step": 1088 }, { "epoch": 1.45685618729097, "grad_norm": 1.4497864246368408, "learning_rate": 5.186852769022963e-05, "loss": 1.86, "step": 1089 }, { "epoch": 1.4581939799331103, "grad_norm": 0.6924653649330139, "learning_rate": 5.1823502926609634e-05, "loss": 1.7662, "step": 1090 }, { "epoch": 1.4595317725752508, "grad_norm": 0.9326013922691345, "learning_rate": 5.177847816298965e-05, "loss": 1.2249, "step": 1091 }, { "epoch": 1.4608695652173913, "grad_norm": 0.8724308013916016, "learning_rate": 5.173345339936966e-05, "loss": 1.2829, "step": 1092 }, { "epoch": 1.4622073578595318, "grad_norm": 1.2138267755508423, "learning_rate": 5.168842863574967e-05, "loss": 1.025, "step": 1093 }, { "epoch": 1.4635451505016723, "grad_norm": 0.9376569390296936, "learning_rate": 5.164340387212967e-05, "loss": 1.5824, "step": 1094 }, { "epoch": 1.4648829431438126, "grad_norm": 0.81294184923172, "learning_rate": 5.1598379108509684e-05, "loss": 1.2309, "step": 1095 }, { "epoch": 1.4662207357859531, "grad_norm": 1.194340467453003, "learning_rate": 5.155335434488969e-05, "loss": 1.1125, "step": 1096 }, { "epoch": 1.4675585284280936, "grad_norm": 0.8782414793968201, "learning_rate": 5.15083295812697e-05, "loss": 1.6568, "step": 1097 }, { "epoch": 1.4688963210702342, "grad_norm": 0.653789222240448, "learning_rate": 5.14633048176497e-05, "loss": 1.3559, "step": 1098 }, { "epoch": 1.4702341137123747, "grad_norm": 0.9649755358695984, "learning_rate": 5.141828005402972e-05, "loss": 1.3056, "step": 1099 }, { "epoch": 1.471571906354515, "grad_norm": 0.8330010175704956, "learning_rate": 5.137325529040973e-05, "loss": 1.5524, "step": 1100 }, { "epoch": 1.4729096989966555, "grad_norm": 1.0811066627502441, "learning_rate": 5.1328230526789736e-05, "loss": 1.4753, "step": 1101 }, { "epoch": 1.474247491638796, "grad_norm": 0.8166443705558777, "learning_rate": 5.128320576316975e-05, "loss": 1.7568, "step": 1102 }, { "epoch": 1.4755852842809365, "grad_norm": 1.201744794845581, "learning_rate": 5.123818099954976e-05, "loss": 1.2685, "step": 1103 }, { "epoch": 1.476923076923077, "grad_norm": 0.8646392226219177, "learning_rate": 5.119315623592976e-05, "loss": 1.5086, "step": 1104 }, { "epoch": 1.4782608695652173, "grad_norm": 0.9522072076797485, "learning_rate": 5.114813147230977e-05, "loss": 1.8555, "step": 1105 }, { "epoch": 1.4795986622073578, "grad_norm": 0.8802261352539062, "learning_rate": 5.1103106708689785e-05, "loss": 1.6987, "step": 1106 }, { "epoch": 1.4809364548494983, "grad_norm": 0.8111658096313477, "learning_rate": 5.1058081945069794e-05, "loss": 1.6291, "step": 1107 }, { "epoch": 1.4822742474916388, "grad_norm": 1.1969858407974243, "learning_rate": 5.1013057181449796e-05, "loss": 1.332, "step": 1108 }, { "epoch": 1.4836120401337793, "grad_norm": 0.8505260944366455, "learning_rate": 5.096803241782981e-05, "loss": 1.81, "step": 1109 }, { "epoch": 1.4849498327759196, "grad_norm": 0.7508300542831421, "learning_rate": 5.092300765420982e-05, "loss": 1.7325, "step": 1110 }, { "epoch": 1.4862876254180601, "grad_norm": 0.8055757284164429, "learning_rate": 5.087798289058983e-05, "loss": 1.8016, "step": 1111 }, { "epoch": 1.4876254180602007, "grad_norm": 0.8028207421302795, "learning_rate": 5.083295812696983e-05, "loss": 2.0108, "step": 1112 }, { "epoch": 1.4889632107023412, "grad_norm": 0.8982310891151428, "learning_rate": 5.078793336334985e-05, "loss": 1.8517, "step": 1113 }, { "epoch": 1.4903010033444817, "grad_norm": 1.2262216806411743, "learning_rate": 5.0742908599729854e-05, "loss": 1.5475, "step": 1114 }, { "epoch": 1.491638795986622, "grad_norm": 0.692025899887085, "learning_rate": 5.069788383610986e-05, "loss": 1.2832, "step": 1115 }, { "epoch": 1.4929765886287625, "grad_norm": 1.0912442207336426, "learning_rate": 5.0652859072489864e-05, "loss": 1.778, "step": 1116 }, { "epoch": 1.494314381270903, "grad_norm": 0.8672052025794983, "learning_rate": 5.0607834308869886e-05, "loss": 1.6913, "step": 1117 }, { "epoch": 1.4956521739130435, "grad_norm": 0.9315889477729797, "learning_rate": 5.056280954524989e-05, "loss": 1.6336, "step": 1118 }, { "epoch": 1.496989966555184, "grad_norm": 0.8745747208595276, "learning_rate": 5.05177847816299e-05, "loss": 1.5267, "step": 1119 }, { "epoch": 1.4983277591973243, "grad_norm": 1.3531934022903442, "learning_rate": 5.047276001800991e-05, "loss": 1.3997, "step": 1120 }, { "epoch": 1.4996655518394648, "grad_norm": 0.9693718552589417, "learning_rate": 5.042773525438992e-05, "loss": 1.5723, "step": 1121 }, { "epoch": 1.5010033444816053, "grad_norm": 0.8846158385276794, "learning_rate": 5.038271049076992e-05, "loss": 1.1203, "step": 1122 }, { "epoch": 1.5023411371237458, "grad_norm": 1.0722146034240723, "learning_rate": 5.033768572714993e-05, "loss": 1.3621, "step": 1123 }, { "epoch": 1.5036789297658864, "grad_norm": 0.9507117867469788, "learning_rate": 5.0292660963529947e-05, "loss": 1.768, "step": 1124 }, { "epoch": 1.5050167224080266, "grad_norm": 0.9553461074829102, "learning_rate": 5.0247636199909955e-05, "loss": 2.001, "step": 1125 }, { "epoch": 1.5063545150501674, "grad_norm": 1.0646768808364868, "learning_rate": 5.020261143628996e-05, "loss": 1.7424, "step": 1126 }, { "epoch": 1.5076923076923077, "grad_norm": 0.8669482469558716, "learning_rate": 5.015758667266998e-05, "loss": 1.067, "step": 1127 }, { "epoch": 1.5090301003344482, "grad_norm": 1.4234883785247803, "learning_rate": 5.011256190904998e-05, "loss": 1.1849, "step": 1128 }, { "epoch": 1.5103678929765887, "grad_norm": 1.3873578310012817, "learning_rate": 5.006753714542999e-05, "loss": 1.8282, "step": 1129 }, { "epoch": 1.511705685618729, "grad_norm": 0.8728304505348206, "learning_rate": 5.002251238180999e-05, "loss": 1.8036, "step": 1130 }, { "epoch": 1.5130434782608697, "grad_norm": 0.86270672082901, "learning_rate": 4.997748761819001e-05, "loss": 1.6129, "step": 1131 }, { "epoch": 1.51438127090301, "grad_norm": 1.422959804534912, "learning_rate": 4.9932462854570015e-05, "loss": 1.6149, "step": 1132 }, { "epoch": 1.5157190635451505, "grad_norm": 0.7661011815071106, "learning_rate": 4.9887438090950024e-05, "loss": 2.2417, "step": 1133 }, { "epoch": 1.517056856187291, "grad_norm": 0.9919681549072266, "learning_rate": 4.984241332733003e-05, "loss": 1.4351, "step": 1134 }, { "epoch": 1.5183946488294313, "grad_norm": 0.8810451030731201, "learning_rate": 4.979738856371005e-05, "loss": 1.5639, "step": 1135 }, { "epoch": 1.519732441471572, "grad_norm": 1.5315157175064087, "learning_rate": 4.975236380009005e-05, "loss": 1.5218, "step": 1136 }, { "epoch": 1.5210702341137123, "grad_norm": 1.3847192525863647, "learning_rate": 4.9707339036470065e-05, "loss": 1.18, "step": 1137 }, { "epoch": 1.5224080267558529, "grad_norm": 1.2832859754562378, "learning_rate": 4.966231427285007e-05, "loss": 1.4576, "step": 1138 }, { "epoch": 1.5237458193979934, "grad_norm": 0.918787956237793, "learning_rate": 4.961728950923008e-05, "loss": 1.6287, "step": 1139 }, { "epoch": 1.5250836120401337, "grad_norm": 1.324271559715271, "learning_rate": 4.9572264745610084e-05, "loss": 1.658, "step": 1140 }, { "epoch": 1.5264214046822744, "grad_norm": 1.5736923217773438, "learning_rate": 4.95272399819901e-05, "loss": 1.5404, "step": 1141 }, { "epoch": 1.5277591973244147, "grad_norm": 0.7107148766517639, "learning_rate": 4.94822152183701e-05, "loss": 2.0322, "step": 1142 }, { "epoch": 1.5290969899665552, "grad_norm": 0.8476628065109253, "learning_rate": 4.9437190454750117e-05, "loss": 1.3441, "step": 1143 }, { "epoch": 1.5304347826086957, "grad_norm": 0.6295139193534851, "learning_rate": 4.9392165691130125e-05, "loss": 1.3322, "step": 1144 }, { "epoch": 1.531772575250836, "grad_norm": 0.8430982232093811, "learning_rate": 4.9347140927510134e-05, "loss": 1.0624, "step": 1145 }, { "epoch": 1.5331103678929767, "grad_norm": 1.353960394859314, "learning_rate": 4.930211616389014e-05, "loss": 1.0658, "step": 1146 }, { "epoch": 1.534448160535117, "grad_norm": 1.780779480934143, "learning_rate": 4.925709140027015e-05, "loss": 1.8983, "step": 1147 }, { "epoch": 1.5357859531772575, "grad_norm": 0.9181867241859436, "learning_rate": 4.921206663665016e-05, "loss": 1.3606, "step": 1148 }, { "epoch": 1.537123745819398, "grad_norm": 1.472495675086975, "learning_rate": 4.916704187303017e-05, "loss": 1.5477, "step": 1149 }, { "epoch": 1.5384615384615383, "grad_norm": 1.0969715118408203, "learning_rate": 4.912201710941018e-05, "loss": 1.5731, "step": 1150 }, { "epoch": 1.539799331103679, "grad_norm": 0.8129613995552063, "learning_rate": 4.9076992345790185e-05, "loss": 1.6769, "step": 1151 }, { "epoch": 1.5411371237458193, "grad_norm": 0.8746471405029297, "learning_rate": 4.9031967582170194e-05, "loss": 1.129, "step": 1152 }, { "epoch": 1.5424749163879599, "grad_norm": 1.206492304801941, "learning_rate": 4.898694281855021e-05, "loss": 1.3039, "step": 1153 }, { "epoch": 1.5438127090301004, "grad_norm": 1.562818169593811, "learning_rate": 4.894191805493021e-05, "loss": 1.3924, "step": 1154 }, { "epoch": 1.5451505016722407, "grad_norm": 2.0223031044006348, "learning_rate": 4.8896893291310226e-05, "loss": 1.346, "step": 1155 }, { "epoch": 1.5464882943143814, "grad_norm": 1.0795880556106567, "learning_rate": 4.885186852769023e-05, "loss": 1.4994, "step": 1156 }, { "epoch": 1.5478260869565217, "grad_norm": 1.5896492004394531, "learning_rate": 4.8806843764070244e-05, "loss": 1.2325, "step": 1157 }, { "epoch": 1.5491638795986622, "grad_norm": 0.875981867313385, "learning_rate": 4.8761819000450245e-05, "loss": 2.1033, "step": 1158 }, { "epoch": 1.5505016722408027, "grad_norm": 0.7678603529930115, "learning_rate": 4.871679423683026e-05, "loss": 1.7353, "step": 1159 }, { "epoch": 1.551839464882943, "grad_norm": 1.0514904260635376, "learning_rate": 4.867176947321026e-05, "loss": 1.5402, "step": 1160 }, { "epoch": 1.5531772575250837, "grad_norm": 0.9771605134010315, "learning_rate": 4.862674470959028e-05, "loss": 1.1979, "step": 1161 }, { "epoch": 1.554515050167224, "grad_norm": 0.8701416850090027, "learning_rate": 4.8581719945970286e-05, "loss": 1.6865, "step": 1162 }, { "epoch": 1.5558528428093645, "grad_norm": 1.5773226022720337, "learning_rate": 4.8536695182350295e-05, "loss": 1.3537, "step": 1163 }, { "epoch": 1.557190635451505, "grad_norm": 0.8707898855209351, "learning_rate": 4.8491670418730304e-05, "loss": 1.6703, "step": 1164 }, { "epoch": 1.5585284280936453, "grad_norm": 0.8050997257232666, "learning_rate": 4.844664565511031e-05, "loss": 1.4691, "step": 1165 }, { "epoch": 1.559866220735786, "grad_norm": 0.828519880771637, "learning_rate": 4.840162089149032e-05, "loss": 0.9642, "step": 1166 }, { "epoch": 1.5612040133779264, "grad_norm": 0.8314371109008789, "learning_rate": 4.835659612787033e-05, "loss": 1.7049, "step": 1167 }, { "epoch": 1.5625418060200669, "grad_norm": 1.1783218383789062, "learning_rate": 4.831157136425034e-05, "loss": 1.3039, "step": 1168 }, { "epoch": 1.5638795986622074, "grad_norm": 0.7392192482948303, "learning_rate": 4.8266546600630347e-05, "loss": 2.0674, "step": 1169 }, { "epoch": 1.5652173913043477, "grad_norm": 0.969502866268158, "learning_rate": 4.8221521837010355e-05, "loss": 1.7466, "step": 1170 }, { "epoch": 1.5665551839464884, "grad_norm": 0.8645799160003662, "learning_rate": 4.817649707339037e-05, "loss": 1.6764, "step": 1171 }, { "epoch": 1.5678929765886287, "grad_norm": 1.0396491289138794, "learning_rate": 4.813147230977037e-05, "loss": 1.7447, "step": 1172 }, { "epoch": 1.5692307692307692, "grad_norm": 0.8425093293190002, "learning_rate": 4.808644754615039e-05, "loss": 1.9199, "step": 1173 }, { "epoch": 1.5705685618729097, "grad_norm": 1.2044014930725098, "learning_rate": 4.8041422782530396e-05, "loss": 1.6451, "step": 1174 }, { "epoch": 1.57190635451505, "grad_norm": 0.7370679378509521, "learning_rate": 4.7996398018910405e-05, "loss": 1.9203, "step": 1175 }, { "epoch": 1.5732441471571907, "grad_norm": 0.7956492900848389, "learning_rate": 4.7951373255290413e-05, "loss": 1.6613, "step": 1176 }, { "epoch": 1.574581939799331, "grad_norm": 0.969325065612793, "learning_rate": 4.790634849167042e-05, "loss": 1.775, "step": 1177 }, { "epoch": 1.5759197324414715, "grad_norm": 0.5993706583976746, "learning_rate": 4.786132372805043e-05, "loss": 1.3278, "step": 1178 }, { "epoch": 1.577257525083612, "grad_norm": 1.4955739974975586, "learning_rate": 4.781629896443044e-05, "loss": 1.8495, "step": 1179 }, { "epoch": 1.5785953177257523, "grad_norm": 0.760959804058075, "learning_rate": 4.777127420081045e-05, "loss": 1.1063, "step": 1180 }, { "epoch": 1.579933110367893, "grad_norm": 0.7534853219985962, "learning_rate": 4.7726249437190456e-05, "loss": 1.1722, "step": 1181 }, { "epoch": 1.5812709030100334, "grad_norm": 0.8445656299591064, "learning_rate": 4.7681224673570465e-05, "loss": 1.8389, "step": 1182 }, { "epoch": 1.5826086956521739, "grad_norm": 0.8543136119842529, "learning_rate": 4.7636199909950474e-05, "loss": 1.841, "step": 1183 }, { "epoch": 1.5839464882943144, "grad_norm": 1.0799471139907837, "learning_rate": 4.759117514633049e-05, "loss": 0.7011, "step": 1184 }, { "epoch": 1.585284280936455, "grad_norm": 0.9755014777183533, "learning_rate": 4.754615038271049e-05, "loss": 1.7702, "step": 1185 }, { "epoch": 1.5866220735785954, "grad_norm": 0.6892296671867371, "learning_rate": 4.7501125619090506e-05, "loss": 1.4784, "step": 1186 }, { "epoch": 1.5879598662207357, "grad_norm": 0.9510697722434998, "learning_rate": 4.745610085547051e-05, "loss": 1.5627, "step": 1187 }, { "epoch": 1.5892976588628762, "grad_norm": 1.0505142211914062, "learning_rate": 4.741107609185052e-05, "loss": 1.5736, "step": 1188 }, { "epoch": 1.5906354515050167, "grad_norm": 1.007228970527649, "learning_rate": 4.736605132823053e-05, "loss": 1.0688, "step": 1189 }, { "epoch": 1.5919732441471572, "grad_norm": 1.0539839267730713, "learning_rate": 4.732102656461054e-05, "loss": 1.6078, "step": 1190 }, { "epoch": 1.5933110367892978, "grad_norm": 0.7513359189033508, "learning_rate": 4.727600180099055e-05, "loss": 1.9948, "step": 1191 }, { "epoch": 1.594648829431438, "grad_norm": 0.9461736679077148, "learning_rate": 4.723097703737056e-05, "loss": 1.516, "step": 1192 }, { "epoch": 1.5959866220735786, "grad_norm": 0.9117209315299988, "learning_rate": 4.7185952273750566e-05, "loss": 1.6311, "step": 1193 }, { "epoch": 1.597324414715719, "grad_norm": 0.7895516157150269, "learning_rate": 4.7140927510130575e-05, "loss": 1.808, "step": 1194 }, { "epoch": 1.5986622073578596, "grad_norm": 1.0118502378463745, "learning_rate": 4.7095902746510583e-05, "loss": 1.6352, "step": 1195 }, { "epoch": 1.6, "grad_norm": 1.148454189300537, "learning_rate": 4.705087798289059e-05, "loss": 1.9035, "step": 1196 }, { "epoch": 1.6013377926421404, "grad_norm": 1.7625706195831299, "learning_rate": 4.70058532192706e-05, "loss": 1.3833, "step": 1197 }, { "epoch": 1.602675585284281, "grad_norm": 0.8092257976531982, "learning_rate": 4.6960828455650616e-05, "loss": 1.6618, "step": 1198 }, { "epoch": 1.6040133779264214, "grad_norm": 0.810875654220581, "learning_rate": 4.691580369203062e-05, "loss": 1.3098, "step": 1199 }, { "epoch": 1.605351170568562, "grad_norm": 0.8476257920265198, "learning_rate": 4.687077892841063e-05, "loss": 1.0308, "step": 1200 }, { "epoch": 1.6066889632107024, "grad_norm": 1.0738558769226074, "learning_rate": 4.6825754164790635e-05, "loss": 1.3836, "step": 1201 }, { "epoch": 1.6080267558528427, "grad_norm": 0.9387577772140503, "learning_rate": 4.678072940117065e-05, "loss": 1.3271, "step": 1202 }, { "epoch": 1.6093645484949832, "grad_norm": 1.344914436340332, "learning_rate": 4.673570463755065e-05, "loss": 2.1565, "step": 1203 }, { "epoch": 1.6107023411371237, "grad_norm": 0.9820061922073364, "learning_rate": 4.669067987393067e-05, "loss": 1.6031, "step": 1204 }, { "epoch": 1.6120401337792643, "grad_norm": 0.7256208062171936, "learning_rate": 4.664565511031067e-05, "loss": 1.9967, "step": 1205 }, { "epoch": 1.6133779264214048, "grad_norm": 0.8569954633712769, "learning_rate": 4.6600630346690685e-05, "loss": 1.6004, "step": 1206 }, { "epoch": 1.614715719063545, "grad_norm": 1.1218992471694946, "learning_rate": 4.655560558307069e-05, "loss": 1.5533, "step": 1207 }, { "epoch": 1.6160535117056856, "grad_norm": 1.0124077796936035, "learning_rate": 4.65105808194507e-05, "loss": 1.7343, "step": 1208 }, { "epoch": 1.617391304347826, "grad_norm": 0.9381147623062134, "learning_rate": 4.646555605583071e-05, "loss": 1.2428, "step": 1209 }, { "epoch": 1.6187290969899666, "grad_norm": 1.1799885034561157, "learning_rate": 4.642053129221072e-05, "loss": 1.6861, "step": 1210 }, { "epoch": 1.620066889632107, "grad_norm": 0.9148082733154297, "learning_rate": 4.637550652859073e-05, "loss": 1.3491, "step": 1211 }, { "epoch": 1.6214046822742474, "grad_norm": 1.0270072221755981, "learning_rate": 4.6330481764970736e-05, "loss": 1.4965, "step": 1212 }, { "epoch": 1.6227424749163881, "grad_norm": 0.8440571427345276, "learning_rate": 4.6285457001350745e-05, "loss": 1.9027, "step": 1213 }, { "epoch": 1.6240802675585284, "grad_norm": 0.8079622387886047, "learning_rate": 4.624043223773075e-05, "loss": 1.718, "step": 1214 }, { "epoch": 1.625418060200669, "grad_norm": 0.6909911036491394, "learning_rate": 4.619540747411076e-05, "loss": 1.9922, "step": 1215 }, { "epoch": 1.6267558528428094, "grad_norm": 1.2815625667572021, "learning_rate": 4.615038271049078e-05, "loss": 1.1817, "step": 1216 }, { "epoch": 1.6280936454849497, "grad_norm": 0.7101358771324158, "learning_rate": 4.610535794687078e-05, "loss": 1.6939, "step": 1217 }, { "epoch": 1.6294314381270905, "grad_norm": 0.7537413835525513, "learning_rate": 4.6060333183250794e-05, "loss": 1.9988, "step": 1218 }, { "epoch": 1.6307692307692307, "grad_norm": 0.7323551177978516, "learning_rate": 4.6015308419630796e-05, "loss": 1.9491, "step": 1219 }, { "epoch": 1.6321070234113713, "grad_norm": 1.0128388404846191, "learning_rate": 4.597028365601081e-05, "loss": 1.4372, "step": 1220 }, { "epoch": 1.6334448160535118, "grad_norm": 1.1328787803649902, "learning_rate": 4.5925258892390813e-05, "loss": 2.1449, "step": 1221 }, { "epoch": 1.634782608695652, "grad_norm": 0.9315971732139587, "learning_rate": 4.588023412877083e-05, "loss": 1.7581, "step": 1222 }, { "epoch": 1.6361204013377928, "grad_norm": 0.8285197019577026, "learning_rate": 4.583520936515083e-05, "loss": 1.6367, "step": 1223 }, { "epoch": 1.637458193979933, "grad_norm": 0.6926225423812866, "learning_rate": 4.5790184601530846e-05, "loss": 1.654, "step": 1224 }, { "epoch": 1.6387959866220736, "grad_norm": 0.824735164642334, "learning_rate": 4.5745159837910855e-05, "loss": 1.7624, "step": 1225 }, { "epoch": 1.640133779264214, "grad_norm": 0.9015515446662903, "learning_rate": 4.570013507429086e-05, "loss": 1.2503, "step": 1226 }, { "epoch": 1.6414715719063544, "grad_norm": 1.440433382987976, "learning_rate": 4.565511031067087e-05, "loss": 1.2357, "step": 1227 }, { "epoch": 1.6428093645484951, "grad_norm": 0.8556510806083679, "learning_rate": 4.561008554705088e-05, "loss": 1.3501, "step": 1228 }, { "epoch": 1.6441471571906354, "grad_norm": 0.7644262313842773, "learning_rate": 4.556506078343089e-05, "loss": 1.8273, "step": 1229 }, { "epoch": 1.645484949832776, "grad_norm": 2.027939558029175, "learning_rate": 4.55200360198109e-05, "loss": 1.1069, "step": 1230 }, { "epoch": 1.6468227424749164, "grad_norm": 1.0590338706970215, "learning_rate": 4.5475011256190906e-05, "loss": 1.3507, "step": 1231 }, { "epoch": 1.6481605351170567, "grad_norm": 1.007128119468689, "learning_rate": 4.5429986492570915e-05, "loss": 1.5589, "step": 1232 }, { "epoch": 1.6494983277591975, "grad_norm": 0.8251166343688965, "learning_rate": 4.538496172895092e-05, "loss": 1.7861, "step": 1233 }, { "epoch": 1.6508361204013378, "grad_norm": 0.9109119176864624, "learning_rate": 4.533993696533094e-05, "loss": 1.6144, "step": 1234 }, { "epoch": 1.6521739130434783, "grad_norm": 0.7894558310508728, "learning_rate": 4.529491220171094e-05, "loss": 1.5139, "step": 1235 }, { "epoch": 1.6535117056856188, "grad_norm": 0.7755162715911865, "learning_rate": 4.5249887438090956e-05, "loss": 1.7346, "step": 1236 }, { "epoch": 1.654849498327759, "grad_norm": 2.059093952178955, "learning_rate": 4.520486267447096e-05, "loss": 1.7854, "step": 1237 }, { "epoch": 1.6561872909698998, "grad_norm": 0.908669650554657, "learning_rate": 4.515983791085097e-05, "loss": 1.6148, "step": 1238 }, { "epoch": 1.65752508361204, "grad_norm": 0.776725709438324, "learning_rate": 4.5114813147230975e-05, "loss": 1.6544, "step": 1239 }, { "epoch": 1.6588628762541806, "grad_norm": 0.6670528650283813, "learning_rate": 4.506978838361099e-05, "loss": 1.3139, "step": 1240 }, { "epoch": 1.6602006688963211, "grad_norm": 1.0721896886825562, "learning_rate": 4.502476361999099e-05, "loss": 1.5505, "step": 1241 }, { "epoch": 1.6615384615384614, "grad_norm": 1.018912434577942, "learning_rate": 4.497973885637101e-05, "loss": 1.9343, "step": 1242 }, { "epoch": 1.6628762541806021, "grad_norm": 0.8976886868476868, "learning_rate": 4.4934714092751016e-05, "loss": 1.7946, "step": 1243 }, { "epoch": 1.6642140468227424, "grad_norm": 0.9715926647186279, "learning_rate": 4.4889689329131025e-05, "loss": 1.7167, "step": 1244 }, { "epoch": 1.665551839464883, "grad_norm": 0.8015096783638, "learning_rate": 4.484466456551103e-05, "loss": 1.5664, "step": 1245 }, { "epoch": 1.6668896321070235, "grad_norm": 0.968388020992279, "learning_rate": 4.479963980189104e-05, "loss": 1.4593, "step": 1246 }, { "epoch": 1.6682274247491637, "grad_norm": 0.811292290687561, "learning_rate": 4.475461503827105e-05, "loss": 1.7979, "step": 1247 }, { "epoch": 1.6695652173913045, "grad_norm": 0.7946311235427856, "learning_rate": 4.470959027465106e-05, "loss": 1.8932, "step": 1248 }, { "epoch": 1.6709030100334448, "grad_norm": 0.7939193248748779, "learning_rate": 4.466456551103107e-05, "loss": 1.4773, "step": 1249 }, { "epoch": 1.6722408026755853, "grad_norm": 1.309445858001709, "learning_rate": 4.4619540747411076e-05, "loss": 1.8627, "step": 1250 }, { "epoch": 1.6735785953177258, "grad_norm": 0.7015190720558167, "learning_rate": 4.4574515983791085e-05, "loss": 1.4652, "step": 1251 }, { "epoch": 1.674916387959866, "grad_norm": 1.03852117061615, "learning_rate": 4.45294912201711e-05, "loss": 1.8289, "step": 1252 }, { "epoch": 1.6762541806020068, "grad_norm": 0.6862776279449463, "learning_rate": 4.44844664565511e-05, "loss": 1.3505, "step": 1253 }, { "epoch": 1.677591973244147, "grad_norm": 0.7088447213172913, "learning_rate": 4.443944169293112e-05, "loss": 1.3436, "step": 1254 }, { "epoch": 1.6789297658862876, "grad_norm": 0.6423237919807434, "learning_rate": 4.439441692931112e-05, "loss": 2.086, "step": 1255 }, { "epoch": 1.6802675585284281, "grad_norm": 0.926794171333313, "learning_rate": 4.4349392165691134e-05, "loss": 1.7186, "step": 1256 }, { "epoch": 1.6816053511705684, "grad_norm": 0.6868398189544678, "learning_rate": 4.430436740207114e-05, "loss": 1.7386, "step": 1257 }, { "epoch": 1.6829431438127092, "grad_norm": 1.3709686994552612, "learning_rate": 4.425934263845115e-05, "loss": 1.5422, "step": 1258 }, { "epoch": 1.6842809364548494, "grad_norm": 1.1822702884674072, "learning_rate": 4.421431787483116e-05, "loss": 1.6106, "step": 1259 }, { "epoch": 1.68561872909699, "grad_norm": 0.7609877586364746, "learning_rate": 4.416929311121117e-05, "loss": 0.7967, "step": 1260 }, { "epoch": 1.6869565217391305, "grad_norm": 0.9301615357398987, "learning_rate": 4.412426834759118e-05, "loss": 1.7581, "step": 1261 }, { "epoch": 1.6882943143812708, "grad_norm": 0.9290110468864441, "learning_rate": 4.4079243583971186e-05, "loss": 1.6569, "step": 1262 }, { "epoch": 1.6896321070234115, "grad_norm": 0.904699981212616, "learning_rate": 4.4034218820351194e-05, "loss": 1.3858, "step": 1263 }, { "epoch": 1.6909698996655518, "grad_norm": 0.8743433356285095, "learning_rate": 4.39891940567312e-05, "loss": 0.8902, "step": 1264 }, { "epoch": 1.6923076923076923, "grad_norm": 0.5912821292877197, "learning_rate": 4.394416929311121e-05, "loss": 1.6238, "step": 1265 }, { "epoch": 1.6936454849498328, "grad_norm": 2.2498979568481445, "learning_rate": 4.389914452949122e-05, "loss": 1.8626, "step": 1266 }, { "epoch": 1.694983277591973, "grad_norm": 0.8445337414741516, "learning_rate": 4.385411976587123e-05, "loss": 1.8312, "step": 1267 }, { "epoch": 1.6963210702341138, "grad_norm": 0.9713656902313232, "learning_rate": 4.380909500225124e-05, "loss": 1.3857, "step": 1268 }, { "epoch": 1.6976588628762541, "grad_norm": 0.6458280086517334, "learning_rate": 4.376407023863125e-05, "loss": 1.2622, "step": 1269 }, { "epoch": 1.6989966555183946, "grad_norm": 0.8908253908157349, "learning_rate": 4.371904547501126e-05, "loss": 1.798, "step": 1270 }, { "epoch": 1.7003344481605351, "grad_norm": 0.9305428862571716, "learning_rate": 4.367402071139127e-05, "loss": 1.7508, "step": 1271 }, { "epoch": 1.7016722408026754, "grad_norm": 0.8213643431663513, "learning_rate": 4.362899594777128e-05, "loss": 1.3151, "step": 1272 }, { "epoch": 1.7030100334448162, "grad_norm": 1.0373735427856445, "learning_rate": 4.358397118415129e-05, "loss": 1.4885, "step": 1273 }, { "epoch": 1.7043478260869565, "grad_norm": 0.676937460899353, "learning_rate": 4.3538946420531296e-05, "loss": 1.963, "step": 1274 }, { "epoch": 1.705685618729097, "grad_norm": 0.9667220115661621, "learning_rate": 4.3493921656911304e-05, "loss": 1.647, "step": 1275 }, { "epoch": 1.7070234113712375, "grad_norm": 1.1586287021636963, "learning_rate": 4.344889689329131e-05, "loss": 1.6821, "step": 1276 }, { "epoch": 1.7083612040133778, "grad_norm": 1.7140305042266846, "learning_rate": 4.340387212967132e-05, "loss": 1.6735, "step": 1277 }, { "epoch": 1.7096989966555185, "grad_norm": 1.2710798978805542, "learning_rate": 4.335884736605133e-05, "loss": 1.3605, "step": 1278 }, { "epoch": 1.7110367892976588, "grad_norm": 1.4276046752929688, "learning_rate": 4.331382260243134e-05, "loss": 1.2628, "step": 1279 }, { "epoch": 1.7123745819397993, "grad_norm": 0.8968702554702759, "learning_rate": 4.326879783881135e-05, "loss": 2.0376, "step": 1280 }, { "epoch": 1.7137123745819398, "grad_norm": 0.9779416918754578, "learning_rate": 4.322377307519136e-05, "loss": 1.9223, "step": 1281 }, { "epoch": 1.71505016722408, "grad_norm": 1.2591009140014648, "learning_rate": 4.3178748311571364e-05, "loss": 1.3221, "step": 1282 }, { "epoch": 1.7163879598662208, "grad_norm": 1.216707468032837, "learning_rate": 4.313372354795138e-05, "loss": 1.0194, "step": 1283 }, { "epoch": 1.7177257525083611, "grad_norm": 0.8018258213996887, "learning_rate": 4.308869878433138e-05, "loss": 2.0731, "step": 1284 }, { "epoch": 1.7190635451505016, "grad_norm": 0.8288760781288147, "learning_rate": 4.30436740207114e-05, "loss": 1.9664, "step": 1285 }, { "epoch": 1.7204013377926421, "grad_norm": 2.0245273113250732, "learning_rate": 4.29986492570914e-05, "loss": 1.4066, "step": 1286 }, { "epoch": 1.7217391304347827, "grad_norm": 1.0415517091751099, "learning_rate": 4.2953624493471414e-05, "loss": 1.6966, "step": 1287 }, { "epoch": 1.7230769230769232, "grad_norm": 1.4432035684585571, "learning_rate": 4.290859972985142e-05, "loss": 0.825, "step": 1288 }, { "epoch": 1.7244147157190635, "grad_norm": 0.7454544305801392, "learning_rate": 4.286357496623143e-05, "loss": 1.8722, "step": 1289 }, { "epoch": 1.725752508361204, "grad_norm": 1.177752137184143, "learning_rate": 4.281855020261144e-05, "loss": 1.4524, "step": 1290 }, { "epoch": 1.7270903010033445, "grad_norm": 0.9258737564086914, "learning_rate": 4.277352543899145e-05, "loss": 1.7513, "step": 1291 }, { "epoch": 1.728428093645485, "grad_norm": 0.832943856716156, "learning_rate": 4.272850067537146e-05, "loss": 1.5639, "step": 1292 }, { "epoch": 1.7297658862876255, "grad_norm": 1.6282193660736084, "learning_rate": 4.2683475911751466e-05, "loss": 1.3246, "step": 1293 }, { "epoch": 1.7311036789297658, "grad_norm": 0.7570889592170715, "learning_rate": 4.2638451148131474e-05, "loss": 1.3322, "step": 1294 }, { "epoch": 1.7324414715719063, "grad_norm": 0.8353405594825745, "learning_rate": 4.259342638451148e-05, "loss": 1.2958, "step": 1295 }, { "epoch": 1.7337792642140468, "grad_norm": 1.2555522918701172, "learning_rate": 4.254840162089149e-05, "loss": 1.4349, "step": 1296 }, { "epoch": 1.7351170568561873, "grad_norm": 1.1173676252365112, "learning_rate": 4.250337685727151e-05, "loss": 1.4082, "step": 1297 }, { "epoch": 1.7364548494983278, "grad_norm": 0.9005771279335022, "learning_rate": 4.245835209365151e-05, "loss": 1.9892, "step": 1298 }, { "epoch": 1.7377926421404681, "grad_norm": 0.9720417261123657, "learning_rate": 4.2413327330031524e-05, "loss": 1.5311, "step": 1299 }, { "epoch": 1.7391304347826086, "grad_norm": 0.9608173370361328, "learning_rate": 4.2368302566411526e-05, "loss": 1.4644, "step": 1300 }, { "epoch": 1.7404682274247492, "grad_norm": 1.2789133787155151, "learning_rate": 4.232327780279154e-05, "loss": 2.0233, "step": 1301 }, { "epoch": 1.7418060200668897, "grad_norm": 0.9710900783538818, "learning_rate": 4.227825303917154e-05, "loss": 1.1806, "step": 1302 }, { "epoch": 1.7431438127090302, "grad_norm": 0.8562449812889099, "learning_rate": 4.223322827555156e-05, "loss": 1.6512, "step": 1303 }, { "epoch": 1.7444816053511705, "grad_norm": 0.9922163486480713, "learning_rate": 4.218820351193156e-05, "loss": 1.7093, "step": 1304 }, { "epoch": 1.745819397993311, "grad_norm": 0.8094335794448853, "learning_rate": 4.2143178748311576e-05, "loss": 1.6508, "step": 1305 }, { "epoch": 1.7471571906354515, "grad_norm": 0.8714074492454529, "learning_rate": 4.2098153984691584e-05, "loss": 1.6098, "step": 1306 }, { "epoch": 1.748494983277592, "grad_norm": 1.1597920656204224, "learning_rate": 4.205312922107159e-05, "loss": 1.2336, "step": 1307 }, { "epoch": 1.7498327759197325, "grad_norm": 1.0653190612792969, "learning_rate": 4.20081044574516e-05, "loss": 1.1729, "step": 1308 }, { "epoch": 1.7511705685618728, "grad_norm": 0.941969096660614, "learning_rate": 4.196307969383161e-05, "loss": 1.808, "step": 1309 }, { "epoch": 1.7525083612040135, "grad_norm": 1.3117737770080566, "learning_rate": 4.191805493021162e-05, "loss": 1.0482, "step": 1310 }, { "epoch": 1.7538461538461538, "grad_norm": 0.8123752474784851, "learning_rate": 4.187303016659163e-05, "loss": 1.8081, "step": 1311 }, { "epoch": 1.7551839464882943, "grad_norm": 1.3279463052749634, "learning_rate": 4.1828005402971636e-05, "loss": 1.431, "step": 1312 }, { "epoch": 1.7565217391304349, "grad_norm": 1.044995665550232, "learning_rate": 4.1782980639351644e-05, "loss": 1.5174, "step": 1313 }, { "epoch": 1.7578595317725751, "grad_norm": 0.918773889541626, "learning_rate": 4.173795587573165e-05, "loss": 1.1729, "step": 1314 }, { "epoch": 1.7591973244147159, "grad_norm": 1.465911626815796, "learning_rate": 4.169293111211167e-05, "loss": 1.4036, "step": 1315 }, { "epoch": 1.7605351170568562, "grad_norm": 0.9047989845275879, "learning_rate": 4.164790634849167e-05, "loss": 1.0355, "step": 1316 }, { "epoch": 1.7618729096989967, "grad_norm": 1.1229565143585205, "learning_rate": 4.1602881584871685e-05, "loss": 1.3043, "step": 1317 }, { "epoch": 1.7632107023411372, "grad_norm": 0.7313392162322998, "learning_rate": 4.155785682125169e-05, "loss": 1.5846, "step": 1318 }, { "epoch": 1.7645484949832775, "grad_norm": 0.8520455360412598, "learning_rate": 4.15128320576317e-05, "loss": 1.4211, "step": 1319 }, { "epoch": 1.7658862876254182, "grad_norm": 0.8551596999168396, "learning_rate": 4.1467807294011704e-05, "loss": 1.6073, "step": 1320 }, { "epoch": 1.7672240802675585, "grad_norm": 0.9196562767028809, "learning_rate": 4.142278253039172e-05, "loss": 1.5378, "step": 1321 }, { "epoch": 1.768561872909699, "grad_norm": 1.1131138801574707, "learning_rate": 4.137775776677172e-05, "loss": 1.0312, "step": 1322 }, { "epoch": 1.7698996655518395, "grad_norm": 0.8004671335220337, "learning_rate": 4.133273300315174e-05, "loss": 2.0511, "step": 1323 }, { "epoch": 1.7712374581939798, "grad_norm": 1.1453050374984741, "learning_rate": 4.1287708239531745e-05, "loss": 0.7885, "step": 1324 }, { "epoch": 1.7725752508361206, "grad_norm": 1.051133394241333, "learning_rate": 4.1242683475911754e-05, "loss": 1.4642, "step": 1325 }, { "epoch": 1.7739130434782608, "grad_norm": 0.8506733775138855, "learning_rate": 4.119765871229176e-05, "loss": 1.396, "step": 1326 }, { "epoch": 1.7752508361204014, "grad_norm": 0.800424337387085, "learning_rate": 4.115263394867177e-05, "loss": 1.5075, "step": 1327 }, { "epoch": 1.7765886287625419, "grad_norm": 1.6729928255081177, "learning_rate": 4.110760918505178e-05, "loss": 0.978, "step": 1328 }, { "epoch": 1.7779264214046822, "grad_norm": 0.8187434077262878, "learning_rate": 4.106258442143179e-05, "loss": 1.6308, "step": 1329 }, { "epoch": 1.779264214046823, "grad_norm": 1.0891520977020264, "learning_rate": 4.10175596578118e-05, "loss": 1.4337, "step": 1330 }, { "epoch": 1.7806020066889632, "grad_norm": 0.9008198976516724, "learning_rate": 4.0972534894191806e-05, "loss": 1.476, "step": 1331 }, { "epoch": 1.7819397993311037, "grad_norm": 1.036777377128601, "learning_rate": 4.0927510130571814e-05, "loss": 1.4582, "step": 1332 }, { "epoch": 1.7832775919732442, "grad_norm": 1.3103868961334229, "learning_rate": 4.088248536695183e-05, "loss": 1.6545, "step": 1333 }, { "epoch": 1.7846153846153845, "grad_norm": 1.3254574537277222, "learning_rate": 4.083746060333183e-05, "loss": 1.5745, "step": 1334 }, { "epoch": 1.7859531772575252, "grad_norm": 0.771115243434906, "learning_rate": 4.079243583971185e-05, "loss": 1.9747, "step": 1335 }, { "epoch": 1.7872909698996655, "grad_norm": 0.7478201985359192, "learning_rate": 4.074741107609185e-05, "loss": 1.819, "step": 1336 }, { "epoch": 1.788628762541806, "grad_norm": 0.6987447142601013, "learning_rate": 4.0702386312471864e-05, "loss": 1.9189, "step": 1337 }, { "epoch": 1.7899665551839465, "grad_norm": 0.6925516724586487, "learning_rate": 4.0657361548851866e-05, "loss": 1.8319, "step": 1338 }, { "epoch": 1.7913043478260868, "grad_norm": 0.7896952033042908, "learning_rate": 4.061233678523188e-05, "loss": 0.9144, "step": 1339 }, { "epoch": 1.7926421404682276, "grad_norm": 0.847746729850769, "learning_rate": 4.056731202161189e-05, "loss": 1.4009, "step": 1340 }, { "epoch": 1.7939799331103679, "grad_norm": 0.8629260063171387, "learning_rate": 4.05222872579919e-05, "loss": 1.3139, "step": 1341 }, { "epoch": 1.7953177257525084, "grad_norm": 0.894623875617981, "learning_rate": 4.047726249437191e-05, "loss": 1.2251, "step": 1342 }, { "epoch": 1.7966555183946489, "grad_norm": 1.4640523195266724, "learning_rate": 4.0432237730751915e-05, "loss": 1.5554, "step": 1343 }, { "epoch": 1.7979933110367892, "grad_norm": 0.8280948996543884, "learning_rate": 4.0387212967131924e-05, "loss": 1.7077, "step": 1344 }, { "epoch": 1.79933110367893, "grad_norm": 1.387689471244812, "learning_rate": 4.034218820351193e-05, "loss": 1.4598, "step": 1345 }, { "epoch": 1.8006688963210702, "grad_norm": 0.7571278214454651, "learning_rate": 4.029716343989194e-05, "loss": 1.2455, "step": 1346 }, { "epoch": 1.8020066889632107, "grad_norm": 0.9558732509613037, "learning_rate": 4.025213867627195e-05, "loss": 0.9238, "step": 1347 }, { "epoch": 1.8033444816053512, "grad_norm": 1.0369471311569214, "learning_rate": 4.020711391265196e-05, "loss": 1.3457, "step": 1348 }, { "epoch": 1.8046822742474915, "grad_norm": 0.852948009967804, "learning_rate": 4.016208914903197e-05, "loss": 1.3754, "step": 1349 }, { "epoch": 1.8060200668896322, "grad_norm": 0.912213921546936, "learning_rate": 4.0117064385411976e-05, "loss": 1.8092, "step": 1350 }, { "epoch": 1.8073578595317725, "grad_norm": 0.8572192192077637, "learning_rate": 4.007203962179199e-05, "loss": 1.7593, "step": 1351 }, { "epoch": 1.808695652173913, "grad_norm": 1.1593003273010254, "learning_rate": 4.0027014858172e-05, "loss": 1.6094, "step": 1352 }, { "epoch": 1.8100334448160535, "grad_norm": 0.9357568621635437, "learning_rate": 3.998199009455201e-05, "loss": 0.9808, "step": 1353 }, { "epoch": 1.8113712374581938, "grad_norm": 0.8933264017105103, "learning_rate": 3.993696533093202e-05, "loss": 1.5225, "step": 1354 }, { "epoch": 1.8127090301003346, "grad_norm": 0.9033715724945068, "learning_rate": 3.9891940567312025e-05, "loss": 1.6487, "step": 1355 }, { "epoch": 1.8140468227424749, "grad_norm": 0.9341977834701538, "learning_rate": 3.9846915803692034e-05, "loss": 1.4483, "step": 1356 }, { "epoch": 1.8153846153846154, "grad_norm": 0.7608655095100403, "learning_rate": 3.980189104007204e-05, "loss": 2.0032, "step": 1357 }, { "epoch": 1.8167224080267559, "grad_norm": 1.3053033351898193, "learning_rate": 3.975686627645205e-05, "loss": 1.7181, "step": 1358 }, { "epoch": 1.8180602006688962, "grad_norm": 0.9982928037643433, "learning_rate": 3.971184151283206e-05, "loss": 1.5147, "step": 1359 }, { "epoch": 1.819397993311037, "grad_norm": 1.1024192571640015, "learning_rate": 3.966681674921207e-05, "loss": 1.561, "step": 1360 }, { "epoch": 1.8207357859531772, "grad_norm": 0.8846253752708435, "learning_rate": 3.962179198559208e-05, "loss": 1.4367, "step": 1361 }, { "epoch": 1.8220735785953177, "grad_norm": 0.8675643801689148, "learning_rate": 3.9576767221972085e-05, "loss": 1.8881, "step": 1362 }, { "epoch": 1.8234113712374582, "grad_norm": 0.8549279570579529, "learning_rate": 3.9531742458352094e-05, "loss": 1.6155, "step": 1363 }, { "epoch": 1.8247491638795985, "grad_norm": 0.8130283355712891, "learning_rate": 3.948671769473211e-05, "loss": 1.7252, "step": 1364 }, { "epoch": 1.8260869565217392, "grad_norm": 9.944559097290039, "learning_rate": 3.944169293111211e-05, "loss": 1.2356, "step": 1365 }, { "epoch": 1.8274247491638795, "grad_norm": 1.0192309617996216, "learning_rate": 3.9396668167492126e-05, "loss": 1.6147, "step": 1366 }, { "epoch": 1.82876254180602, "grad_norm": 1.1403286457061768, "learning_rate": 3.935164340387213e-05, "loss": 1.2417, "step": 1367 }, { "epoch": 1.8301003344481606, "grad_norm": 1.0655380487442017, "learning_rate": 3.9306618640252144e-05, "loss": 1.4588, "step": 1368 }, { "epoch": 1.8314381270903008, "grad_norm": 1.3363555669784546, "learning_rate": 3.926159387663215e-05, "loss": 1.5176, "step": 1369 }, { "epoch": 1.8327759197324416, "grad_norm": 0.7391565442085266, "learning_rate": 3.921656911301216e-05, "loss": 2.0288, "step": 1370 }, { "epoch": 1.8341137123745819, "grad_norm": 1.527122974395752, "learning_rate": 3.917154434939217e-05, "loss": 1.439, "step": 1371 }, { "epoch": 1.8354515050167224, "grad_norm": 1.5331617593765259, "learning_rate": 3.912651958577218e-05, "loss": 1.1387, "step": 1372 }, { "epoch": 1.836789297658863, "grad_norm": 0.7895675897598267, "learning_rate": 3.9081494822152187e-05, "loss": 1.7776, "step": 1373 }, { "epoch": 1.8381270903010032, "grad_norm": 1.14493727684021, "learning_rate": 3.9036470058532195e-05, "loss": 1.7798, "step": 1374 }, { "epoch": 1.839464882943144, "grad_norm": 1.087314248085022, "learning_rate": 3.8991445294912204e-05, "loss": 1.3838, "step": 1375 }, { "epoch": 1.8408026755852842, "grad_norm": 0.98976069688797, "learning_rate": 3.894642053129221e-05, "loss": 1.7391, "step": 1376 }, { "epoch": 1.8421404682274247, "grad_norm": 0.7153158783912659, "learning_rate": 3.890139576767222e-05, "loss": 2.0336, "step": 1377 }, { "epoch": 1.8434782608695652, "grad_norm": 1.0000951290130615, "learning_rate": 3.8856371004052236e-05, "loss": 1.6006, "step": 1378 }, { "epoch": 1.8448160535117055, "grad_norm": 0.8586902618408203, "learning_rate": 3.881134624043224e-05, "loss": 1.284, "step": 1379 }, { "epoch": 1.8461538461538463, "grad_norm": 0.8292785882949829, "learning_rate": 3.8766321476812253e-05, "loss": 1.5184, "step": 1380 }, { "epoch": 1.8474916387959865, "grad_norm": 0.7205365300178528, "learning_rate": 3.8721296713192255e-05, "loss": 1.8033, "step": 1381 }, { "epoch": 1.848829431438127, "grad_norm": 0.7597335577011108, "learning_rate": 3.867627194957227e-05, "loss": 1.4228, "step": 1382 }, { "epoch": 1.8501672240802676, "grad_norm": 0.9705151915550232, "learning_rate": 3.863124718595227e-05, "loss": 1.6322, "step": 1383 }, { "epoch": 1.851505016722408, "grad_norm": 0.9061914682388306, "learning_rate": 3.858622242233229e-05, "loss": 1.861, "step": 1384 }, { "epoch": 1.8528428093645486, "grad_norm": 0.9491883516311646, "learning_rate": 3.854119765871229e-05, "loss": 1.8632, "step": 1385 }, { "epoch": 1.8541806020066889, "grad_norm": 0.6613609790802002, "learning_rate": 3.8496172895092305e-05, "loss": 1.3203, "step": 1386 }, { "epoch": 1.8555183946488294, "grad_norm": 1.661139726638794, "learning_rate": 3.8451148131472314e-05, "loss": 0.9495, "step": 1387 }, { "epoch": 1.85685618729097, "grad_norm": 0.946655809879303, "learning_rate": 3.840612336785232e-05, "loss": 1.7735, "step": 1388 }, { "epoch": 1.8581939799331104, "grad_norm": 0.8011321425437927, "learning_rate": 3.836109860423233e-05, "loss": 1.0474, "step": 1389 }, { "epoch": 1.859531772575251, "grad_norm": 0.8001610040664673, "learning_rate": 3.831607384061234e-05, "loss": 1.6964, "step": 1390 }, { "epoch": 1.8608695652173912, "grad_norm": 1.0349432229995728, "learning_rate": 3.827104907699235e-05, "loss": 1.4787, "step": 1391 }, { "epoch": 1.8622073578595317, "grad_norm": 0.804012656211853, "learning_rate": 3.8226024313372357e-05, "loss": 1.9486, "step": 1392 }, { "epoch": 1.8635451505016722, "grad_norm": 0.936322033405304, "learning_rate": 3.8180999549752365e-05, "loss": 1.4181, "step": 1393 }, { "epoch": 1.8648829431438128, "grad_norm": 1.3157624006271362, "learning_rate": 3.8135974786132374e-05, "loss": 1.4829, "step": 1394 }, { "epoch": 1.8662207357859533, "grad_norm": 0.8876410722732544, "learning_rate": 3.809095002251238e-05, "loss": 1.6884, "step": 1395 }, { "epoch": 1.8675585284280936, "grad_norm": 0.6261321902275085, "learning_rate": 3.80459252588924e-05, "loss": 2.0704, "step": 1396 }, { "epoch": 1.868896321070234, "grad_norm": 1.0813729763031006, "learning_rate": 3.80009004952724e-05, "loss": 1.3557, "step": 1397 }, { "epoch": 1.8702341137123746, "grad_norm": 0.9512030482292175, "learning_rate": 3.7955875731652415e-05, "loss": 1.3012, "step": 1398 }, { "epoch": 1.871571906354515, "grad_norm": 0.8181292414665222, "learning_rate": 3.791085096803242e-05, "loss": 1.6619, "step": 1399 }, { "epoch": 1.8729096989966556, "grad_norm": 0.8774016499519348, "learning_rate": 3.786582620441243e-05, "loss": 1.2919, "step": 1400 }, { "epoch": 1.874247491638796, "grad_norm": 0.7443768978118896, "learning_rate": 3.7820801440792434e-05, "loss": 1.4092, "step": 1401 }, { "epoch": 1.8755852842809364, "grad_norm": 0.7041268348693848, "learning_rate": 3.777577667717245e-05, "loss": 1.0577, "step": 1402 }, { "epoch": 1.876923076923077, "grad_norm": 1.1805241107940674, "learning_rate": 3.773075191355245e-05, "loss": 1.4778, "step": 1403 }, { "epoch": 1.8782608695652174, "grad_norm": 1.1446001529693604, "learning_rate": 3.7685727149932466e-05, "loss": 1.3754, "step": 1404 }, { "epoch": 1.879598662207358, "grad_norm": 0.7583310604095459, "learning_rate": 3.7640702386312475e-05, "loss": 1.847, "step": 1405 }, { "epoch": 1.8809364548494982, "grad_norm": 1.094193696975708, "learning_rate": 3.7595677622692484e-05, "loss": 1.1759, "step": 1406 }, { "epoch": 1.8822742474916387, "grad_norm": 0.9978253841400146, "learning_rate": 3.755065285907249e-05, "loss": 1.2676, "step": 1407 }, { "epoch": 1.8836120401337793, "grad_norm": 0.7539755702018738, "learning_rate": 3.75056280954525e-05, "loss": 1.8526, "step": 1408 }, { "epoch": 1.8849498327759198, "grad_norm": 0.8578140735626221, "learning_rate": 3.746060333183251e-05, "loss": 1.4808, "step": 1409 }, { "epoch": 1.8862876254180603, "grad_norm": 0.8148081302642822, "learning_rate": 3.741557856821252e-05, "loss": 1.7241, "step": 1410 }, { "epoch": 1.8876254180602006, "grad_norm": 1.0469331741333008, "learning_rate": 3.7370553804592526e-05, "loss": 1.5271, "step": 1411 }, { "epoch": 1.8889632107023413, "grad_norm": 0.8617160320281982, "learning_rate": 3.7325529040972535e-05, "loss": 0.9646, "step": 1412 }, { "epoch": 1.8903010033444816, "grad_norm": 0.9113197326660156, "learning_rate": 3.7280504277352544e-05, "loss": 1.7127, "step": 1413 }, { "epoch": 1.891638795986622, "grad_norm": 0.8339391946792603, "learning_rate": 3.723547951373256e-05, "loss": 1.2338, "step": 1414 }, { "epoch": 1.8929765886287626, "grad_norm": 0.8415791988372803, "learning_rate": 3.719045475011256e-05, "loss": 1.7855, "step": 1415 }, { "epoch": 1.894314381270903, "grad_norm": 1.0455056428909302, "learning_rate": 3.7145429986492576e-05, "loss": 1.5371, "step": 1416 }, { "epoch": 1.8956521739130436, "grad_norm": 0.9208756685256958, "learning_rate": 3.710040522287258e-05, "loss": 1.5899, "step": 1417 }, { "epoch": 1.896989966555184, "grad_norm": 0.8601641654968262, "learning_rate": 3.705538045925259e-05, "loss": 1.5976, "step": 1418 }, { "epoch": 1.8983277591973244, "grad_norm": 1.0065354108810425, "learning_rate": 3.7010355695632595e-05, "loss": 1.9271, "step": 1419 }, { "epoch": 1.899665551839465, "grad_norm": 1.224617838859558, "learning_rate": 3.696533093201261e-05, "loss": 1.1049, "step": 1420 }, { "epoch": 1.9010033444816052, "grad_norm": 1.2477177381515503, "learning_rate": 3.692030616839261e-05, "loss": 1.1976, "step": 1421 }, { "epoch": 1.902341137123746, "grad_norm": 0.8188490867614746, "learning_rate": 3.687528140477263e-05, "loss": 1.2126, "step": 1422 }, { "epoch": 1.9036789297658863, "grad_norm": 0.927735447883606, "learning_rate": 3.6830256641152636e-05, "loss": 1.6352, "step": 1423 }, { "epoch": 1.9050167224080268, "grad_norm": 0.7117443084716797, "learning_rate": 3.6785231877532645e-05, "loss": 1.1576, "step": 1424 }, { "epoch": 1.9063545150501673, "grad_norm": 0.9709138870239258, "learning_rate": 3.6740207113912653e-05, "loss": 1.7208, "step": 1425 }, { "epoch": 1.9076923076923076, "grad_norm": 0.9803403615951538, "learning_rate": 3.669518235029266e-05, "loss": 1.6992, "step": 1426 }, { "epoch": 1.9090301003344483, "grad_norm": 0.9272592067718506, "learning_rate": 3.665015758667267e-05, "loss": 1.7691, "step": 1427 }, { "epoch": 1.9103678929765886, "grad_norm": 0.8970061540603638, "learning_rate": 3.660513282305268e-05, "loss": 1.6872, "step": 1428 }, { "epoch": 1.9117056856187291, "grad_norm": 1.3868622779846191, "learning_rate": 3.656010805943269e-05, "loss": 1.9104, "step": 1429 }, { "epoch": 1.9130434782608696, "grad_norm": 0.9223452806472778, "learning_rate": 3.6515083295812696e-05, "loss": 1.5378, "step": 1430 }, { "epoch": 1.91438127090301, "grad_norm": 0.8893844485282898, "learning_rate": 3.6470058532192705e-05, "loss": 1.5726, "step": 1431 }, { "epoch": 1.9157190635451506, "grad_norm": 0.8713217377662659, "learning_rate": 3.642503376857272e-05, "loss": 1.3582, "step": 1432 }, { "epoch": 1.917056856187291, "grad_norm": 0.9675815105438232, "learning_rate": 3.638000900495272e-05, "loss": 1.521, "step": 1433 }, { "epoch": 1.9183946488294314, "grad_norm": 1.0128233432769775, "learning_rate": 3.633498424133274e-05, "loss": 1.9584, "step": 1434 }, { "epoch": 1.919732441471572, "grad_norm": 0.7074108719825745, "learning_rate": 3.628995947771274e-05, "loss": 1.9104, "step": 1435 }, { "epoch": 1.9210702341137122, "grad_norm": 0.6000598669052124, "learning_rate": 3.6244934714092755e-05, "loss": 2.0013, "step": 1436 }, { "epoch": 1.922408026755853, "grad_norm": 0.8514284491539001, "learning_rate": 3.619990995047276e-05, "loss": 1.716, "step": 1437 }, { "epoch": 1.9237458193979933, "grad_norm": 0.6201356649398804, "learning_rate": 3.615488518685277e-05, "loss": 1.4419, "step": 1438 }, { "epoch": 1.9250836120401338, "grad_norm": 0.9943525791168213, "learning_rate": 3.610986042323278e-05, "loss": 1.9337, "step": 1439 }, { "epoch": 1.9264214046822743, "grad_norm": 0.9183377623558044, "learning_rate": 3.606483565961279e-05, "loss": 1.5447, "step": 1440 }, { "epoch": 1.9277591973244146, "grad_norm": 0.8715210556983948, "learning_rate": 3.60198108959928e-05, "loss": 1.7298, "step": 1441 }, { "epoch": 1.9290969899665553, "grad_norm": 1.0677322149276733, "learning_rate": 3.5974786132372806e-05, "loss": 1.1434, "step": 1442 }, { "epoch": 1.9304347826086956, "grad_norm": 0.8093985915184021, "learning_rate": 3.5929761368752815e-05, "loss": 1.9209, "step": 1443 }, { "epoch": 1.9317725752508361, "grad_norm": 1.3754236698150635, "learning_rate": 3.5884736605132823e-05, "loss": 1.7104, "step": 1444 }, { "epoch": 1.9331103678929766, "grad_norm": 0.8817523121833801, "learning_rate": 3.583971184151283e-05, "loss": 1.5823, "step": 1445 }, { "epoch": 1.934448160535117, "grad_norm": 1.1452676057815552, "learning_rate": 3.579468707789284e-05, "loss": 1.2673, "step": 1446 }, { "epoch": 1.9357859531772577, "grad_norm": 1.0037072896957397, "learning_rate": 3.5749662314272856e-05, "loss": 1.8337, "step": 1447 }, { "epoch": 1.937123745819398, "grad_norm": 1.1124719381332397, "learning_rate": 3.570463755065286e-05, "loss": 1.5884, "step": 1448 }, { "epoch": 1.9384615384615385, "grad_norm": 0.6976829767227173, "learning_rate": 3.565961278703287e-05, "loss": 2.3163, "step": 1449 }, { "epoch": 1.939799331103679, "grad_norm": 0.9209871888160706, "learning_rate": 3.561458802341288e-05, "loss": 1.5843, "step": 1450 }, { "epoch": 1.9411371237458193, "grad_norm": 1.010508418083191, "learning_rate": 3.556956325979289e-05, "loss": 1.458, "step": 1451 }, { "epoch": 1.94247491638796, "grad_norm": 1.0236648321151733, "learning_rate": 3.55245384961729e-05, "loss": 1.6633, "step": 1452 }, { "epoch": 1.9438127090301003, "grad_norm": 0.8461986184120178, "learning_rate": 3.547951373255291e-05, "loss": 1.7553, "step": 1453 }, { "epoch": 1.9451505016722408, "grad_norm": 1.7666321992874146, "learning_rate": 3.5434488968932916e-05, "loss": 1.8603, "step": 1454 }, { "epoch": 1.9464882943143813, "grad_norm": 1.007495641708374, "learning_rate": 3.5389464205312925e-05, "loss": 1.4298, "step": 1455 }, { "epoch": 1.9478260869565216, "grad_norm": 1.0924739837646484, "learning_rate": 3.534443944169293e-05, "loss": 1.2259, "step": 1456 }, { "epoch": 1.9491638795986623, "grad_norm": 1.3044383525848389, "learning_rate": 3.529941467807294e-05, "loss": 1.0427, "step": 1457 }, { "epoch": 1.9505016722408026, "grad_norm": 1.1054308414459229, "learning_rate": 3.525438991445295e-05, "loss": 1.2018, "step": 1458 }, { "epoch": 1.9518394648829431, "grad_norm": 0.857029378414154, "learning_rate": 3.5209365150832966e-05, "loss": 1.6903, "step": 1459 }, { "epoch": 1.9531772575250836, "grad_norm": 0.8971465826034546, "learning_rate": 3.516434038721297e-05, "loss": 1.8465, "step": 1460 }, { "epoch": 1.954515050167224, "grad_norm": 0.7438530921936035, "learning_rate": 3.511931562359298e-05, "loss": 2.0821, "step": 1461 }, { "epoch": 1.9558528428093647, "grad_norm": 0.9445087909698486, "learning_rate": 3.5074290859972985e-05, "loss": 1.186, "step": 1462 }, { "epoch": 1.957190635451505, "grad_norm": 0.8798640370368958, "learning_rate": 3.5029266096353e-05, "loss": 1.2806, "step": 1463 }, { "epoch": 1.9585284280936455, "grad_norm": 0.7574049830436707, "learning_rate": 3.4984241332733e-05, "loss": 1.9215, "step": 1464 }, { "epoch": 1.959866220735786, "grad_norm": 0.7600085139274597, "learning_rate": 3.493921656911302e-05, "loss": 1.547, "step": 1465 }, { "epoch": 1.9612040133779263, "grad_norm": 1.3530699014663696, "learning_rate": 3.489419180549302e-05, "loss": 1.3317, "step": 1466 }, { "epoch": 1.962541806020067, "grad_norm": 0.9421098232269287, "learning_rate": 3.4849167041873035e-05, "loss": 1.496, "step": 1467 }, { "epoch": 1.9638795986622073, "grad_norm": 0.9440025091171265, "learning_rate": 3.480414227825304e-05, "loss": 1.418, "step": 1468 }, { "epoch": 1.9652173913043478, "grad_norm": 0.8065316081047058, "learning_rate": 3.475911751463305e-05, "loss": 1.5454, "step": 1469 }, { "epoch": 1.9665551839464883, "grad_norm": 1.2114293575286865, "learning_rate": 3.471409275101306e-05, "loss": 1.214, "step": 1470 }, { "epoch": 1.9678929765886286, "grad_norm": 0.8474501371383667, "learning_rate": 3.466906798739307e-05, "loss": 1.4953, "step": 1471 }, { "epoch": 1.9692307692307693, "grad_norm": 1.0701537132263184, "learning_rate": 3.462404322377308e-05, "loss": 1.488, "step": 1472 }, { "epoch": 1.9705685618729096, "grad_norm": 0.6921608448028564, "learning_rate": 3.4579018460153086e-05, "loss": 1.8654, "step": 1473 }, { "epoch": 1.9719063545150501, "grad_norm": 1.0466644763946533, "learning_rate": 3.4533993696533095e-05, "loss": 1.7775, "step": 1474 }, { "epoch": 1.9732441471571907, "grad_norm": 0.8933880925178528, "learning_rate": 3.44889689329131e-05, "loss": 1.5537, "step": 1475 }, { "epoch": 1.974581939799331, "grad_norm": 0.8331461548805237, "learning_rate": 3.444394416929311e-05, "loss": 1.5999, "step": 1476 }, { "epoch": 1.9759197324414717, "grad_norm": 1.0497558116912842, "learning_rate": 3.439891940567313e-05, "loss": 1.2683, "step": 1477 }, { "epoch": 1.977257525083612, "grad_norm": 0.812122642993927, "learning_rate": 3.435389464205313e-05, "loss": 1.3971, "step": 1478 }, { "epoch": 1.9785953177257525, "grad_norm": 0.7708330154418945, "learning_rate": 3.4308869878433144e-05, "loss": 1.4712, "step": 1479 }, { "epoch": 1.979933110367893, "grad_norm": 0.8662110567092896, "learning_rate": 3.4263845114813146e-05, "loss": 1.7005, "step": 1480 }, { "epoch": 1.9812709030100333, "grad_norm": 0.7595421671867371, "learning_rate": 3.421882035119316e-05, "loss": 1.2517, "step": 1481 }, { "epoch": 1.982608695652174, "grad_norm": 0.6963253021240234, "learning_rate": 3.417379558757316e-05, "loss": 1.6449, "step": 1482 }, { "epoch": 1.9839464882943143, "grad_norm": 1.4446080923080444, "learning_rate": 3.412877082395318e-05, "loss": 1.8439, "step": 1483 }, { "epoch": 1.9852842809364548, "grad_norm": 0.791908323764801, "learning_rate": 3.408374606033318e-05, "loss": 1.7487, "step": 1484 }, { "epoch": 1.9866220735785953, "grad_norm": 0.8825159072875977, "learning_rate": 3.4038721296713196e-05, "loss": 1.7007, "step": 1485 }, { "epoch": 1.9879598662207358, "grad_norm": 1.163138747215271, "learning_rate": 3.3993696533093204e-05, "loss": 1.7588, "step": 1486 }, { "epoch": 1.9892976588628764, "grad_norm": 0.8407654166221619, "learning_rate": 3.394867176947321e-05, "loss": 1.7077, "step": 1487 }, { "epoch": 1.9906354515050166, "grad_norm": 1.0220328569412231, "learning_rate": 3.390364700585322e-05, "loss": 1.804, "step": 1488 }, { "epoch": 1.9919732441471572, "grad_norm": 1.0959333181381226, "learning_rate": 3.385862224223323e-05, "loss": 1.3809, "step": 1489 }, { "epoch": 1.9933110367892977, "grad_norm": 1.8417096138000488, "learning_rate": 3.381359747861324e-05, "loss": 1.5266, "step": 1490 }, { "epoch": 1.9946488294314382, "grad_norm": 0.7798132300376892, "learning_rate": 3.376857271499325e-05, "loss": 1.6998, "step": 1491 }, { "epoch": 1.9959866220735787, "grad_norm": 0.9236243963241577, "learning_rate": 3.3723547951373256e-05, "loss": 1.7454, "step": 1492 }, { "epoch": 1.997324414715719, "grad_norm": 0.82822185754776, "learning_rate": 3.3678523187753265e-05, "loss": 1.6078, "step": 1493 }, { "epoch": 1.9986622073578595, "grad_norm": 1.1015570163726807, "learning_rate": 3.363349842413327e-05, "loss": 1.2679, "step": 1494 }, { "epoch": 2.0, "grad_norm": 0.8461845517158508, "learning_rate": 3.358847366051329e-05, "loss": 1.6352, "step": 1495 }, { "epoch": 2.0013377926421403, "grad_norm": 1.279486894607544, "learning_rate": 3.354344889689329e-05, "loss": 1.1376, "step": 1496 }, { "epoch": 2.002675585284281, "grad_norm": 0.776619017124176, "learning_rate": 3.3498424133273306e-05, "loss": 0.4116, "step": 1497 }, { "epoch": 2.0040133779264213, "grad_norm": 1.0830862522125244, "learning_rate": 3.345339936965331e-05, "loss": 0.6109, "step": 1498 }, { "epoch": 2.005351170568562, "grad_norm": 1.01801598072052, "learning_rate": 3.340837460603332e-05, "loss": 0.8726, "step": 1499 }, { "epoch": 2.0066889632107023, "grad_norm": 1.0562922954559326, "learning_rate": 3.3363349842413325e-05, "loss": 1.1031, "step": 1500 }, { "epoch": 2.0080267558528426, "grad_norm": 2.5661964416503906, "learning_rate": 3.331832507879334e-05, "loss": 0.888, "step": 1501 }, { "epoch": 2.0093645484949834, "grad_norm": 0.9264085292816162, "learning_rate": 3.327330031517334e-05, "loss": 0.6023, "step": 1502 }, { "epoch": 2.0107023411371236, "grad_norm": 1.4456504583358765, "learning_rate": 3.322827555155336e-05, "loss": 0.7641, "step": 1503 }, { "epoch": 2.0120401337792644, "grad_norm": 1.4428790807724, "learning_rate": 3.3183250787933366e-05, "loss": 1.1088, "step": 1504 }, { "epoch": 2.0133779264214047, "grad_norm": 1.8238784074783325, "learning_rate": 3.3138226024313374e-05, "loss": 1.26, "step": 1505 }, { "epoch": 2.014715719063545, "grad_norm": 1.0849965810775757, "learning_rate": 3.309320126069338e-05, "loss": 1.3907, "step": 1506 }, { "epoch": 2.0160535117056857, "grad_norm": 1.50315260887146, "learning_rate": 3.304817649707339e-05, "loss": 0.9065, "step": 1507 }, { "epoch": 2.017391304347826, "grad_norm": 1.1935383081436157, "learning_rate": 3.30031517334534e-05, "loss": 1.0666, "step": 1508 }, { "epoch": 2.0187290969899667, "grad_norm": 1.2051082849502563, "learning_rate": 3.295812696983341e-05, "loss": 1.0776, "step": 1509 }, { "epoch": 2.020066889632107, "grad_norm": 0.8955470323562622, "learning_rate": 3.291310220621342e-05, "loss": 1.1224, "step": 1510 }, { "epoch": 2.0214046822742473, "grad_norm": 1.3287490606307983, "learning_rate": 3.2868077442593426e-05, "loss": 0.9316, "step": 1511 }, { "epoch": 2.022742474916388, "grad_norm": 0.9810749292373657, "learning_rate": 3.2823052678973435e-05, "loss": 1.0389, "step": 1512 }, { "epoch": 2.0240802675585283, "grad_norm": 1.1453120708465576, "learning_rate": 3.277802791535345e-05, "loss": 0.862, "step": 1513 }, { "epoch": 2.025418060200669, "grad_norm": 1.0180597305297852, "learning_rate": 3.273300315173345e-05, "loss": 0.9974, "step": 1514 }, { "epoch": 2.0267558528428093, "grad_norm": 0.8884549140930176, "learning_rate": 3.268797838811347e-05, "loss": 1.3196, "step": 1515 }, { "epoch": 2.0280936454849496, "grad_norm": 0.8313940167427063, "learning_rate": 3.264295362449347e-05, "loss": 1.6277, "step": 1516 }, { "epoch": 2.0294314381270904, "grad_norm": 1.1740089654922485, "learning_rate": 3.2597928860873484e-05, "loss": 1.0187, "step": 1517 }, { "epoch": 2.0307692307692307, "grad_norm": 1.3176401853561401, "learning_rate": 3.2552904097253486e-05, "loss": 1.2444, "step": 1518 }, { "epoch": 2.0321070234113714, "grad_norm": 0.8785193562507629, "learning_rate": 3.25078793336335e-05, "loss": 1.7614, "step": 1519 }, { "epoch": 2.0334448160535117, "grad_norm": 1.3594410419464111, "learning_rate": 3.246285457001351e-05, "loss": 1.0677, "step": 1520 }, { "epoch": 2.034782608695652, "grad_norm": 1.3269301652908325, "learning_rate": 3.241782980639352e-05, "loss": 0.4644, "step": 1521 }, { "epoch": 2.0361204013377927, "grad_norm": 1.262554407119751, "learning_rate": 3.237280504277353e-05, "loss": 1.0349, "step": 1522 }, { "epoch": 2.037458193979933, "grad_norm": 1.3310198783874512, "learning_rate": 3.2327780279153536e-05, "loss": 0.6012, "step": 1523 }, { "epoch": 2.0387959866220737, "grad_norm": 1.1281541585922241, "learning_rate": 3.2282755515533544e-05, "loss": 1.2607, "step": 1524 }, { "epoch": 2.040133779264214, "grad_norm": 1.0073496103286743, "learning_rate": 3.223773075191355e-05, "loss": 1.2157, "step": 1525 }, { "epoch": 2.0414715719063543, "grad_norm": 1.030488133430481, "learning_rate": 3.219270598829356e-05, "loss": 0.808, "step": 1526 }, { "epoch": 2.042809364548495, "grad_norm": 1.070237159729004, "learning_rate": 3.214768122467357e-05, "loss": 1.4373, "step": 1527 }, { "epoch": 2.0441471571906353, "grad_norm": 1.4718862771987915, "learning_rate": 3.210265646105358e-05, "loss": 1.0276, "step": 1528 }, { "epoch": 2.045484949832776, "grad_norm": 1.0908019542694092, "learning_rate": 3.205763169743359e-05, "loss": 0.3747, "step": 1529 }, { "epoch": 2.0468227424749164, "grad_norm": 1.0104992389678955, "learning_rate": 3.2012606933813596e-05, "loss": 1.295, "step": 1530 }, { "epoch": 2.0481605351170566, "grad_norm": 1.1235511302947998, "learning_rate": 3.196758217019361e-05, "loss": 1.3199, "step": 1531 }, { "epoch": 2.0494983277591974, "grad_norm": 0.7247384190559387, "learning_rate": 3.192255740657362e-05, "loss": 1.1936, "step": 1532 }, { "epoch": 2.0508361204013377, "grad_norm": 1.0165960788726807, "learning_rate": 3.187753264295363e-05, "loss": 0.6051, "step": 1533 }, { "epoch": 2.0521739130434784, "grad_norm": 1.3204349279403687, "learning_rate": 3.183250787933364e-05, "loss": 0.7344, "step": 1534 }, { "epoch": 2.0535117056856187, "grad_norm": 1.2122710943222046, "learning_rate": 3.1787483115713646e-05, "loss": 0.898, "step": 1535 }, { "epoch": 2.054849498327759, "grad_norm": 1.2342621088027954, "learning_rate": 3.1742458352093654e-05, "loss": 1.1225, "step": 1536 }, { "epoch": 2.0561872909698997, "grad_norm": 0.9262746572494507, "learning_rate": 3.169743358847366e-05, "loss": 1.527, "step": 1537 }, { "epoch": 2.05752508361204, "grad_norm": 1.1269612312316895, "learning_rate": 3.165240882485367e-05, "loss": 1.2711, "step": 1538 }, { "epoch": 2.0588628762541807, "grad_norm": 0.9769323468208313, "learning_rate": 3.160738406123368e-05, "loss": 1.3137, "step": 1539 }, { "epoch": 2.060200668896321, "grad_norm": 1.2974296808242798, "learning_rate": 3.156235929761369e-05, "loss": 0.9984, "step": 1540 }, { "epoch": 2.0615384615384613, "grad_norm": 0.9156148433685303, "learning_rate": 3.15173345339937e-05, "loss": 1.0303, "step": 1541 }, { "epoch": 2.062876254180602, "grad_norm": 1.0938788652420044, "learning_rate": 3.1472309770373706e-05, "loss": 0.6634, "step": 1542 }, { "epoch": 2.0642140468227423, "grad_norm": 1.010788917541504, "learning_rate": 3.1427285006753714e-05, "loss": 0.662, "step": 1543 }, { "epoch": 2.065551839464883, "grad_norm": 1.0869221687316895, "learning_rate": 3.138226024313373e-05, "loss": 1.3342, "step": 1544 }, { "epoch": 2.0668896321070234, "grad_norm": 1.017852783203125, "learning_rate": 3.133723547951373e-05, "loss": 0.608, "step": 1545 }, { "epoch": 2.068227424749164, "grad_norm": 1.0046050548553467, "learning_rate": 3.129221071589375e-05, "loss": 1.1453, "step": 1546 }, { "epoch": 2.0695652173913044, "grad_norm": 1.4137753248214722, "learning_rate": 3.124718595227375e-05, "loss": 0.8363, "step": 1547 }, { "epoch": 2.0709030100334447, "grad_norm": 0.852415919303894, "learning_rate": 3.1202161188653764e-05, "loss": 1.4471, "step": 1548 }, { "epoch": 2.0722408026755854, "grad_norm": 1.1125855445861816, "learning_rate": 3.115713642503377e-05, "loss": 0.9525, "step": 1549 }, { "epoch": 2.0735785953177257, "grad_norm": 1.400482177734375, "learning_rate": 3.111211166141378e-05, "loss": 1.3223, "step": 1550 }, { "epoch": 2.074916387959866, "grad_norm": 1.349930763244629, "learning_rate": 3.106708689779379e-05, "loss": 1.1897, "step": 1551 }, { "epoch": 2.0762541806020067, "grad_norm": 1.397952914237976, "learning_rate": 3.10220621341738e-05, "loss": 0.7269, "step": 1552 }, { "epoch": 2.077591973244147, "grad_norm": 1.566179871559143, "learning_rate": 3.097703737055381e-05, "loss": 0.5265, "step": 1553 }, { "epoch": 2.0789297658862878, "grad_norm": 1.1740292310714722, "learning_rate": 3.0932012606933816e-05, "loss": 0.574, "step": 1554 }, { "epoch": 2.080267558528428, "grad_norm": 1.0403320789337158, "learning_rate": 3.0886987843313824e-05, "loss": 0.9537, "step": 1555 }, { "epoch": 2.0816053511705688, "grad_norm": 0.9628156423568726, "learning_rate": 3.084196307969383e-05, "loss": 0.6196, "step": 1556 }, { "epoch": 2.082943143812709, "grad_norm": 1.2764846086502075, "learning_rate": 3.079693831607384e-05, "loss": 0.9719, "step": 1557 }, { "epoch": 2.0842809364548494, "grad_norm": 1.7418354749679565, "learning_rate": 3.075191355245386e-05, "loss": 0.2701, "step": 1558 }, { "epoch": 2.08561872909699, "grad_norm": 1.3813650608062744, "learning_rate": 3.070688878883386e-05, "loss": 1.0376, "step": 1559 }, { "epoch": 2.0869565217391304, "grad_norm": 1.4286768436431885, "learning_rate": 3.0661864025213874e-05, "loss": 0.4777, "step": 1560 }, { "epoch": 2.088294314381271, "grad_norm": 0.9821745157241821, "learning_rate": 3.0616839261593876e-05, "loss": 1.1567, "step": 1561 }, { "epoch": 2.0896321070234114, "grad_norm": 1.05975341796875, "learning_rate": 3.057181449797389e-05, "loss": 0.9883, "step": 1562 }, { "epoch": 2.0909698996655517, "grad_norm": 1.042338490486145, "learning_rate": 3.052678973435389e-05, "loss": 0.8155, "step": 1563 }, { "epoch": 2.0923076923076924, "grad_norm": 0.7823744416236877, "learning_rate": 3.0481764970733905e-05, "loss": 1.2554, "step": 1564 }, { "epoch": 2.0936454849498327, "grad_norm": 1.068821907043457, "learning_rate": 3.0436740207113913e-05, "loss": 1.263, "step": 1565 }, { "epoch": 2.0949832775919734, "grad_norm": 1.022455096244812, "learning_rate": 3.0391715443493922e-05, "loss": 1.1237, "step": 1566 }, { "epoch": 2.0963210702341137, "grad_norm": 1.326230525970459, "learning_rate": 3.0346690679873934e-05, "loss": 1.033, "step": 1567 }, { "epoch": 2.097658862876254, "grad_norm": 0.8274069428443909, "learning_rate": 3.030166591625394e-05, "loss": 1.544, "step": 1568 }, { "epoch": 2.0989966555183948, "grad_norm": 1.4057950973510742, "learning_rate": 3.025664115263395e-05, "loss": 0.8866, "step": 1569 }, { "epoch": 2.100334448160535, "grad_norm": 1.241181492805481, "learning_rate": 3.021161638901396e-05, "loss": 1.037, "step": 1570 }, { "epoch": 2.101672240802676, "grad_norm": 1.1959797143936157, "learning_rate": 3.016659162539397e-05, "loss": 0.8406, "step": 1571 }, { "epoch": 2.103010033444816, "grad_norm": 1.3782483339309692, "learning_rate": 3.0121566861773977e-05, "loss": 1.0415, "step": 1572 }, { "epoch": 2.1043478260869564, "grad_norm": 0.8823965787887573, "learning_rate": 3.0076542098153985e-05, "loss": 0.612, "step": 1573 }, { "epoch": 2.105685618729097, "grad_norm": 1.0144659280776978, "learning_rate": 3.0031517334533994e-05, "loss": 1.4259, "step": 1574 }, { "epoch": 2.1070234113712374, "grad_norm": 1.094332218170166, "learning_rate": 2.9986492570914003e-05, "loss": 1.2305, "step": 1575 }, { "epoch": 2.108361204013378, "grad_norm": 1.0350522994995117, "learning_rate": 2.9941467807294015e-05, "loss": 0.6495, "step": 1576 }, { "epoch": 2.1096989966555184, "grad_norm": 1.2167094945907593, "learning_rate": 2.9896443043674023e-05, "loss": 0.5618, "step": 1577 }, { "epoch": 2.1110367892976587, "grad_norm": 0.9851863384246826, "learning_rate": 2.9851418280054032e-05, "loss": 1.4429, "step": 1578 }, { "epoch": 2.1123745819397994, "grad_norm": 1.2461366653442383, "learning_rate": 2.980639351643404e-05, "loss": 0.6586, "step": 1579 }, { "epoch": 2.1137123745819397, "grad_norm": 1.1684248447418213, "learning_rate": 2.976136875281405e-05, "loss": 1.109, "step": 1580 }, { "epoch": 2.1150501672240805, "grad_norm": 1.1089441776275635, "learning_rate": 2.9716343989194058e-05, "loss": 1.4378, "step": 1581 }, { "epoch": 2.1163879598662207, "grad_norm": 1.082276463508606, "learning_rate": 2.967131922557407e-05, "loss": 1.0216, "step": 1582 }, { "epoch": 2.117725752508361, "grad_norm": 1.1534171104431152, "learning_rate": 2.9626294461954075e-05, "loss": 0.798, "step": 1583 }, { "epoch": 2.1190635451505018, "grad_norm": 1.5649821758270264, "learning_rate": 2.9581269698334087e-05, "loss": 0.4634, "step": 1584 }, { "epoch": 2.120401337792642, "grad_norm": 0.8794542551040649, "learning_rate": 2.9536244934714095e-05, "loss": 0.8743, "step": 1585 }, { "epoch": 2.121739130434783, "grad_norm": 0.8998076319694519, "learning_rate": 2.9491220171094104e-05, "loss": 1.8453, "step": 1586 }, { "epoch": 2.123076923076923, "grad_norm": 0.9526141285896301, "learning_rate": 2.9446195407474116e-05, "loss": 1.0177, "step": 1587 }, { "epoch": 2.1244147157190634, "grad_norm": 1.1450635194778442, "learning_rate": 2.940117064385412e-05, "loss": 1.2151, "step": 1588 }, { "epoch": 2.125752508361204, "grad_norm": 1.4250377416610718, "learning_rate": 2.9356145880234133e-05, "loss": 0.7914, "step": 1589 }, { "epoch": 2.1270903010033444, "grad_norm": 1.0612053871154785, "learning_rate": 2.9311121116614138e-05, "loss": 1.3046, "step": 1590 }, { "epoch": 2.128428093645485, "grad_norm": 1.2039984464645386, "learning_rate": 2.926609635299415e-05, "loss": 0.5904, "step": 1591 }, { "epoch": 2.1297658862876254, "grad_norm": 1.2266685962677002, "learning_rate": 2.9221071589374155e-05, "loss": 1.2272, "step": 1592 }, { "epoch": 2.1311036789297657, "grad_norm": 1.5131837129592896, "learning_rate": 2.9176046825754167e-05, "loss": 1.1346, "step": 1593 }, { "epoch": 2.1324414715719064, "grad_norm": 1.3815799951553345, "learning_rate": 2.913102206213418e-05, "loss": 0.5999, "step": 1594 }, { "epoch": 2.1337792642140467, "grad_norm": 0.8422326445579529, "learning_rate": 2.9085997298514185e-05, "loss": 0.8242, "step": 1595 }, { "epoch": 2.1351170568561875, "grad_norm": 0.8974445462226868, "learning_rate": 2.9040972534894197e-05, "loss": 1.3765, "step": 1596 }, { "epoch": 2.1364548494983278, "grad_norm": 1.0997154712677002, "learning_rate": 2.8995947771274202e-05, "loss": 0.8954, "step": 1597 }, { "epoch": 2.137792642140468, "grad_norm": 1.0048385858535767, "learning_rate": 2.8950923007654214e-05, "loss": 1.3485, "step": 1598 }, { "epoch": 2.139130434782609, "grad_norm": 0.8595561981201172, "learning_rate": 2.890589824403422e-05, "loss": 0.7611, "step": 1599 }, { "epoch": 2.140468227424749, "grad_norm": 1.1249717473983765, "learning_rate": 2.886087348041423e-05, "loss": 1.0467, "step": 1600 }, { "epoch": 2.14180602006689, "grad_norm": 1.151814341545105, "learning_rate": 2.8815848716794236e-05, "loss": 1.0023, "step": 1601 }, { "epoch": 2.14314381270903, "grad_norm": 0.9440100789070129, "learning_rate": 2.8770823953174248e-05, "loss": 1.3554, "step": 1602 }, { "epoch": 2.1444816053511704, "grad_norm": 1.1240310668945312, "learning_rate": 2.872579918955426e-05, "loss": 1.126, "step": 1603 }, { "epoch": 2.145819397993311, "grad_norm": 1.3048230409622192, "learning_rate": 2.8680774425934265e-05, "loss": 1.0317, "step": 1604 }, { "epoch": 2.1471571906354514, "grad_norm": 1.1779913902282715, "learning_rate": 2.8635749662314277e-05, "loss": 1.0175, "step": 1605 }, { "epoch": 2.148494983277592, "grad_norm": 1.0159926414489746, "learning_rate": 2.8590724898694282e-05, "loss": 0.9013, "step": 1606 }, { "epoch": 2.1498327759197324, "grad_norm": 1.1253591775894165, "learning_rate": 2.8545700135074294e-05, "loss": 1.0637, "step": 1607 }, { "epoch": 2.1511705685618727, "grad_norm": 1.393731713294983, "learning_rate": 2.85006753714543e-05, "loss": 1.4256, "step": 1608 }, { "epoch": 2.1525083612040135, "grad_norm": 1.1206722259521484, "learning_rate": 2.845565060783431e-05, "loss": 1.1309, "step": 1609 }, { "epoch": 2.1538461538461537, "grad_norm": 0.8800863027572632, "learning_rate": 2.8410625844214317e-05, "loss": 0.8318, "step": 1610 }, { "epoch": 2.1551839464882945, "grad_norm": 1.0208646059036255, "learning_rate": 2.836560108059433e-05, "loss": 1.379, "step": 1611 }, { "epoch": 2.1565217391304348, "grad_norm": 1.1892729997634888, "learning_rate": 2.832057631697434e-05, "loss": 0.773, "step": 1612 }, { "epoch": 2.157859531772575, "grad_norm": 1.4659501314163208, "learning_rate": 2.8275551553354346e-05, "loss": 0.659, "step": 1613 }, { "epoch": 2.159197324414716, "grad_norm": 1.2769265174865723, "learning_rate": 2.8230526789734358e-05, "loss": 0.9489, "step": 1614 }, { "epoch": 2.160535117056856, "grad_norm": 1.3630527257919312, "learning_rate": 2.8185502026114363e-05, "loss": 0.8874, "step": 1615 }, { "epoch": 2.161872909698997, "grad_norm": 0.9882266521453857, "learning_rate": 2.8140477262494375e-05, "loss": 1.2838, "step": 1616 }, { "epoch": 2.163210702341137, "grad_norm": 0.9982869625091553, "learning_rate": 2.809545249887438e-05, "loss": 1.5081, "step": 1617 }, { "epoch": 2.1645484949832774, "grad_norm": 1.4063819646835327, "learning_rate": 2.8050427735254392e-05, "loss": 0.9401, "step": 1618 }, { "epoch": 2.165886287625418, "grad_norm": 1.2039049863815308, "learning_rate": 2.8005402971634397e-05, "loss": 1.1665, "step": 1619 }, { "epoch": 2.1672240802675584, "grad_norm": 1.0905070304870605, "learning_rate": 2.796037820801441e-05, "loss": 0.9254, "step": 1620 }, { "epoch": 2.168561872909699, "grad_norm": 1.2284868955612183, "learning_rate": 2.791535344439442e-05, "loss": 0.6904, "step": 1621 }, { "epoch": 2.1698996655518394, "grad_norm": 0.9538512825965881, "learning_rate": 2.7870328680774427e-05, "loss": 0.8834, "step": 1622 }, { "epoch": 2.1712374581939797, "grad_norm": 1.2037982940673828, "learning_rate": 2.782530391715444e-05, "loss": 1.1294, "step": 1623 }, { "epoch": 2.1725752508361205, "grad_norm": 0.953117847442627, "learning_rate": 2.7780279153534444e-05, "loss": 1.4922, "step": 1624 }, { "epoch": 2.1739130434782608, "grad_norm": 0.8379032015800476, "learning_rate": 2.7735254389914456e-05, "loss": 0.9568, "step": 1625 }, { "epoch": 2.1752508361204015, "grad_norm": 1.0359519720077515, "learning_rate": 2.769022962629446e-05, "loss": 0.4698, "step": 1626 }, { "epoch": 2.1765886287625418, "grad_norm": 0.8869040012359619, "learning_rate": 2.7645204862674473e-05, "loss": 1.0776, "step": 1627 }, { "epoch": 2.177926421404682, "grad_norm": 1.293930172920227, "learning_rate": 2.7600180099054478e-05, "loss": 0.8307, "step": 1628 }, { "epoch": 2.179264214046823, "grad_norm": 1.2503477334976196, "learning_rate": 2.755515533543449e-05, "loss": 1.0004, "step": 1629 }, { "epoch": 2.180602006688963, "grad_norm": 1.387004017829895, "learning_rate": 2.7510130571814502e-05, "loss": 1.2621, "step": 1630 }, { "epoch": 2.181939799331104, "grad_norm": 1.4255646467208862, "learning_rate": 2.7465105808194507e-05, "loss": 0.679, "step": 1631 }, { "epoch": 2.183277591973244, "grad_norm": 1.2277337312698364, "learning_rate": 2.742008104457452e-05, "loss": 0.8446, "step": 1632 }, { "epoch": 2.184615384615385, "grad_norm": 1.272117257118225, "learning_rate": 2.7375056280954524e-05, "loss": 1.309, "step": 1633 }, { "epoch": 2.185953177257525, "grad_norm": 0.906496524810791, "learning_rate": 2.7330031517334536e-05, "loss": 0.3541, "step": 1634 }, { "epoch": 2.1872909698996654, "grad_norm": 1.2406556606292725, "learning_rate": 2.728500675371454e-05, "loss": 0.6467, "step": 1635 }, { "epoch": 2.188628762541806, "grad_norm": 1.1095494031906128, "learning_rate": 2.7239981990094554e-05, "loss": 0.9538, "step": 1636 }, { "epoch": 2.1899665551839465, "grad_norm": 1.2391284704208374, "learning_rate": 2.719495722647456e-05, "loss": 1.0218, "step": 1637 }, { "epoch": 2.1913043478260867, "grad_norm": 1.4398272037506104, "learning_rate": 2.714993246285457e-05, "loss": 0.9383, "step": 1638 }, { "epoch": 2.1926421404682275, "grad_norm": 1.0370733737945557, "learning_rate": 2.7104907699234583e-05, "loss": 0.5855, "step": 1639 }, { "epoch": 2.1939799331103678, "grad_norm": 1.026199221611023, "learning_rate": 2.7059882935614588e-05, "loss": 1.1646, "step": 1640 }, { "epoch": 2.1953177257525085, "grad_norm": 0.794501781463623, "learning_rate": 2.70148581719946e-05, "loss": 1.6246, "step": 1641 }, { "epoch": 2.196655518394649, "grad_norm": 1.6995491981506348, "learning_rate": 2.6969833408374605e-05, "loss": 0.8617, "step": 1642 }, { "epoch": 2.1979933110367895, "grad_norm": 0.9656935930252075, "learning_rate": 2.6924808644754617e-05, "loss": 0.6808, "step": 1643 }, { "epoch": 2.19933110367893, "grad_norm": 1.2793020009994507, "learning_rate": 2.6879783881134622e-05, "loss": 0.7165, "step": 1644 }, { "epoch": 2.20066889632107, "grad_norm": 1.0213698148727417, "learning_rate": 2.6834759117514634e-05, "loss": 0.5796, "step": 1645 }, { "epoch": 2.202006688963211, "grad_norm": 0.9900079965591431, "learning_rate": 2.678973435389464e-05, "loss": 0.3005, "step": 1646 }, { "epoch": 2.203344481605351, "grad_norm": 1.314415693283081, "learning_rate": 2.674470959027465e-05, "loss": 0.8062, "step": 1647 }, { "epoch": 2.2046822742474914, "grad_norm": 1.058727502822876, "learning_rate": 2.6699684826654663e-05, "loss": 1.0138, "step": 1648 }, { "epoch": 2.206020066889632, "grad_norm": 1.2562659978866577, "learning_rate": 2.665466006303467e-05, "loss": 0.5364, "step": 1649 }, { "epoch": 2.2073578595317724, "grad_norm": 1.1769195795059204, "learning_rate": 2.660963529941468e-05, "loss": 0.8234, "step": 1650 }, { "epoch": 2.208695652173913, "grad_norm": 1.1072698831558228, "learning_rate": 2.6564610535794686e-05, "loss": 1.0157, "step": 1651 }, { "epoch": 2.2100334448160535, "grad_norm": 1.242130160331726, "learning_rate": 2.6519585772174698e-05, "loss": 0.772, "step": 1652 }, { "epoch": 2.211371237458194, "grad_norm": 1.0669097900390625, "learning_rate": 2.6474561008554703e-05, "loss": 1.1534, "step": 1653 }, { "epoch": 2.2127090301003345, "grad_norm": 1.0712904930114746, "learning_rate": 2.6429536244934715e-05, "loss": 1.1747, "step": 1654 }, { "epoch": 2.2140468227424748, "grad_norm": 1.408160924911499, "learning_rate": 2.6384511481314724e-05, "loss": 1.3222, "step": 1655 }, { "epoch": 2.2153846153846155, "grad_norm": 1.0598413944244385, "learning_rate": 2.6339486717694732e-05, "loss": 0.8916, "step": 1656 }, { "epoch": 2.216722408026756, "grad_norm": 1.279381513595581, "learning_rate": 2.6294461954074744e-05, "loss": 0.7631, "step": 1657 }, { "epoch": 2.218060200668896, "grad_norm": 1.2336814403533936, "learning_rate": 2.624943719045475e-05, "loss": 1.2253, "step": 1658 }, { "epoch": 2.219397993311037, "grad_norm": 1.1152136325836182, "learning_rate": 2.620441242683476e-05, "loss": 1.0768, "step": 1659 }, { "epoch": 2.220735785953177, "grad_norm": 1.0855528116226196, "learning_rate": 2.615938766321477e-05, "loss": 1.3688, "step": 1660 }, { "epoch": 2.222073578595318, "grad_norm": 1.0505130290985107, "learning_rate": 2.611436289959478e-05, "loss": 1.5493, "step": 1661 }, { "epoch": 2.223411371237458, "grad_norm": 1.069624423980713, "learning_rate": 2.6069338135974787e-05, "loss": 1.0145, "step": 1662 }, { "epoch": 2.224749163879599, "grad_norm": 1.2100669145584106, "learning_rate": 2.6024313372354796e-05, "loss": 0.5034, "step": 1663 }, { "epoch": 2.226086956521739, "grad_norm": 0.8809670209884644, "learning_rate": 2.5979288608734804e-05, "loss": 1.7303, "step": 1664 }, { "epoch": 2.2274247491638794, "grad_norm": 1.0502550601959229, "learning_rate": 2.5934263845114816e-05, "loss": 0.9524, "step": 1665 }, { "epoch": 2.22876254180602, "grad_norm": 1.357090711593628, "learning_rate": 2.5889239081494825e-05, "loss": 0.6862, "step": 1666 }, { "epoch": 2.2301003344481605, "grad_norm": 1.0590691566467285, "learning_rate": 2.5844214317874833e-05, "loss": 1.1155, "step": 1667 }, { "epoch": 2.231438127090301, "grad_norm": 1.186330795288086, "learning_rate": 2.5799189554254842e-05, "loss": 0.6492, "step": 1668 }, { "epoch": 2.2327759197324415, "grad_norm": 1.2332799434661865, "learning_rate": 2.575416479063485e-05, "loss": 1.2474, "step": 1669 }, { "epoch": 2.234113712374582, "grad_norm": 1.0606275796890259, "learning_rate": 2.570914002701486e-05, "loss": 0.8702, "step": 1670 }, { "epoch": 2.2354515050167225, "grad_norm": 1.2487508058547974, "learning_rate": 2.5664115263394868e-05, "loss": 0.7746, "step": 1671 }, { "epoch": 2.236789297658863, "grad_norm": 1.1616050004959106, "learning_rate": 2.561909049977488e-05, "loss": 0.7112, "step": 1672 }, { "epoch": 2.2381270903010035, "grad_norm": 1.1081942319869995, "learning_rate": 2.5574065736154885e-05, "loss": 1.0841, "step": 1673 }, { "epoch": 2.239464882943144, "grad_norm": 1.123223900794983, "learning_rate": 2.5529040972534897e-05, "loss": 0.9884, "step": 1674 }, { "epoch": 2.240802675585284, "grad_norm": 1.2441766262054443, "learning_rate": 2.5484016208914905e-05, "loss": 0.9708, "step": 1675 }, { "epoch": 2.242140468227425, "grad_norm": 1.051093578338623, "learning_rate": 2.5438991445294914e-05, "loss": 0.7997, "step": 1676 }, { "epoch": 2.243478260869565, "grad_norm": 1.2590465545654297, "learning_rate": 2.5393966681674926e-05, "loss": 1.4614, "step": 1677 }, { "epoch": 2.244816053511706, "grad_norm": 0.8667411804199219, "learning_rate": 2.534894191805493e-05, "loss": 1.6479, "step": 1678 }, { "epoch": 2.246153846153846, "grad_norm": 1.396817922592163, "learning_rate": 2.5303917154434943e-05, "loss": 0.7776, "step": 1679 }, { "epoch": 2.2474916387959865, "grad_norm": 1.1650742292404175, "learning_rate": 2.525889239081495e-05, "loss": 0.6929, "step": 1680 }, { "epoch": 2.248829431438127, "grad_norm": 0.9426375031471252, "learning_rate": 2.521386762719496e-05, "loss": 0.9831, "step": 1681 }, { "epoch": 2.2501672240802675, "grad_norm": 0.9585577845573425, "learning_rate": 2.5168842863574966e-05, "loss": 1.1836, "step": 1682 }, { "epoch": 2.251505016722408, "grad_norm": 1.1615204811096191, "learning_rate": 2.5123818099954978e-05, "loss": 0.4986, "step": 1683 }, { "epoch": 2.2528428093645485, "grad_norm": 1.2810487747192383, "learning_rate": 2.507879333633499e-05, "loss": 0.6468, "step": 1684 }, { "epoch": 2.254180602006689, "grad_norm": 1.1308361291885376, "learning_rate": 2.5033768572714995e-05, "loss": 0.7336, "step": 1685 }, { "epoch": 2.2555183946488295, "grad_norm": 0.7869702577590942, "learning_rate": 2.4988743809095003e-05, "loss": 1.0697, "step": 1686 }, { "epoch": 2.25685618729097, "grad_norm": 1.1998578310012817, "learning_rate": 2.4943719045475012e-05, "loss": 1.2017, "step": 1687 }, { "epoch": 2.2581939799331106, "grad_norm": 1.0956956148147583, "learning_rate": 2.4898694281855024e-05, "loss": 0.3821, "step": 1688 }, { "epoch": 2.259531772575251, "grad_norm": 1.1351637840270996, "learning_rate": 2.4853669518235032e-05, "loss": 1.1934, "step": 1689 }, { "epoch": 2.260869565217391, "grad_norm": 0.9958216547966003, "learning_rate": 2.480864475461504e-05, "loss": 1.2529, "step": 1690 }, { "epoch": 2.262207357859532, "grad_norm": 1.2681056261062622, "learning_rate": 2.476361999099505e-05, "loss": 1.0012, "step": 1691 }, { "epoch": 2.263545150501672, "grad_norm": 0.9700149893760681, "learning_rate": 2.4718595227375058e-05, "loss": 0.473, "step": 1692 }, { "epoch": 2.264882943143813, "grad_norm": 1.3021841049194336, "learning_rate": 2.4673570463755067e-05, "loss": 1.1692, "step": 1693 }, { "epoch": 2.266220735785953, "grad_norm": 1.036901831626892, "learning_rate": 2.4628545700135075e-05, "loss": 1.1165, "step": 1694 }, { "epoch": 2.2675585284280935, "grad_norm": 1.0550732612609863, "learning_rate": 2.4583520936515084e-05, "loss": 0.7615, "step": 1695 }, { "epoch": 2.268896321070234, "grad_norm": 1.2030292749404907, "learning_rate": 2.4538496172895093e-05, "loss": 1.1545, "step": 1696 }, { "epoch": 2.2702341137123745, "grad_norm": 1.2884608507156372, "learning_rate": 2.4493471409275105e-05, "loss": 1.0744, "step": 1697 }, { "epoch": 2.2715719063545152, "grad_norm": 0.9818195700645447, "learning_rate": 2.4448446645655113e-05, "loss": 0.9492, "step": 1698 }, { "epoch": 2.2729096989966555, "grad_norm": 1.0629665851593018, "learning_rate": 2.4403421882035122e-05, "loss": 0.8669, "step": 1699 }, { "epoch": 2.274247491638796, "grad_norm": 1.0125162601470947, "learning_rate": 2.435839711841513e-05, "loss": 0.9677, "step": 1700 }, { "epoch": 2.2755852842809365, "grad_norm": 1.0470705032348633, "learning_rate": 2.431337235479514e-05, "loss": 1.0364, "step": 1701 }, { "epoch": 2.276923076923077, "grad_norm": 1.2241395711898804, "learning_rate": 2.4268347591175148e-05, "loss": 0.906, "step": 1702 }, { "epoch": 2.2782608695652176, "grad_norm": 1.0276167392730713, "learning_rate": 2.4223322827555156e-05, "loss": 0.8643, "step": 1703 }, { "epoch": 2.279598662207358, "grad_norm": 1.1805189847946167, "learning_rate": 2.4178298063935165e-05, "loss": 0.5515, "step": 1704 }, { "epoch": 2.280936454849498, "grad_norm": 1.0454705953598022, "learning_rate": 2.4133273300315173e-05, "loss": 1.3343, "step": 1705 }, { "epoch": 2.282274247491639, "grad_norm": 0.7291643619537354, "learning_rate": 2.4088248536695185e-05, "loss": 1.178, "step": 1706 }, { "epoch": 2.283612040133779, "grad_norm": 1.2851074934005737, "learning_rate": 2.4043223773075194e-05, "loss": 1.1367, "step": 1707 }, { "epoch": 2.28494983277592, "grad_norm": 1.2466861009597778, "learning_rate": 2.3998199009455202e-05, "loss": 0.6178, "step": 1708 }, { "epoch": 2.28628762541806, "grad_norm": 0.9368792176246643, "learning_rate": 2.395317424583521e-05, "loss": 0.9587, "step": 1709 }, { "epoch": 2.2876254180602005, "grad_norm": 1.3019531965255737, "learning_rate": 2.390814948221522e-05, "loss": 0.7194, "step": 1710 }, { "epoch": 2.288963210702341, "grad_norm": 1.5246587991714478, "learning_rate": 2.3863124718595228e-05, "loss": 0.6984, "step": 1711 }, { "epoch": 2.2903010033444815, "grad_norm": 1.6100928783416748, "learning_rate": 2.3818099954975237e-05, "loss": 0.8573, "step": 1712 }, { "epoch": 2.2916387959866222, "grad_norm": 1.0381380319595337, "learning_rate": 2.3773075191355245e-05, "loss": 0.6743, "step": 1713 }, { "epoch": 2.2929765886287625, "grad_norm": 1.2357443571090698, "learning_rate": 2.3728050427735254e-05, "loss": 1.0916, "step": 1714 }, { "epoch": 2.294314381270903, "grad_norm": 0.9141361713409424, "learning_rate": 2.3683025664115266e-05, "loss": 0.8996, "step": 1715 }, { "epoch": 2.2956521739130435, "grad_norm": 1.232163667678833, "learning_rate": 2.3638000900495275e-05, "loss": 0.6892, "step": 1716 }, { "epoch": 2.296989966555184, "grad_norm": 1.240925908088684, "learning_rate": 2.3592976136875283e-05, "loss": 1.1866, "step": 1717 }, { "epoch": 2.2983277591973246, "grad_norm": 1.143701195716858, "learning_rate": 2.3547951373255292e-05, "loss": 1.1714, "step": 1718 }, { "epoch": 2.299665551839465, "grad_norm": 1.2745702266693115, "learning_rate": 2.35029266096353e-05, "loss": 0.6173, "step": 1719 }, { "epoch": 2.3010033444816056, "grad_norm": 1.3332494497299194, "learning_rate": 2.345790184601531e-05, "loss": 0.6876, "step": 1720 }, { "epoch": 2.302341137123746, "grad_norm": 1.3572496175765991, "learning_rate": 2.3412877082395317e-05, "loss": 0.8975, "step": 1721 }, { "epoch": 2.303678929765886, "grad_norm": 1.0085145235061646, "learning_rate": 2.3367852318775326e-05, "loss": 1.3663, "step": 1722 }, { "epoch": 2.305016722408027, "grad_norm": 1.1505640745162964, "learning_rate": 2.3322827555155335e-05, "loss": 0.8133, "step": 1723 }, { "epoch": 2.306354515050167, "grad_norm": 1.2748997211456299, "learning_rate": 2.3277802791535347e-05, "loss": 0.8408, "step": 1724 }, { "epoch": 2.3076923076923075, "grad_norm": 0.9610079526901245, "learning_rate": 2.3232778027915355e-05, "loss": 0.6906, "step": 1725 }, { "epoch": 2.309030100334448, "grad_norm": 1.1950584650039673, "learning_rate": 2.3187753264295364e-05, "loss": 0.9894, "step": 1726 }, { "epoch": 2.3103678929765885, "grad_norm": 1.2355742454528809, "learning_rate": 2.3142728500675372e-05, "loss": 0.7915, "step": 1727 }, { "epoch": 2.3117056856187292, "grad_norm": 1.3379088640213013, "learning_rate": 2.309770373705538e-05, "loss": 0.9516, "step": 1728 }, { "epoch": 2.3130434782608695, "grad_norm": 1.3441375494003296, "learning_rate": 2.305267897343539e-05, "loss": 0.7547, "step": 1729 }, { "epoch": 2.3143812709030103, "grad_norm": 1.0970510244369507, "learning_rate": 2.3007654209815398e-05, "loss": 1.0687, "step": 1730 }, { "epoch": 2.3157190635451506, "grad_norm": 1.1877330541610718, "learning_rate": 2.2962629446195407e-05, "loss": 1.5353, "step": 1731 }, { "epoch": 2.317056856187291, "grad_norm": 1.1516923904418945, "learning_rate": 2.2917604682575415e-05, "loss": 0.4395, "step": 1732 }, { "epoch": 2.3183946488294316, "grad_norm": 0.7589366436004639, "learning_rate": 2.2872579918955427e-05, "loss": 0.6784, "step": 1733 }, { "epoch": 2.319732441471572, "grad_norm": 1.0079516172409058, "learning_rate": 2.2827555155335436e-05, "loss": 0.5076, "step": 1734 }, { "epoch": 2.321070234113712, "grad_norm": 1.5039020776748657, "learning_rate": 2.2782530391715444e-05, "loss": 0.9259, "step": 1735 }, { "epoch": 2.322408026755853, "grad_norm": 0.8978360295295715, "learning_rate": 2.2737505628095453e-05, "loss": 1.4465, "step": 1736 }, { "epoch": 2.323745819397993, "grad_norm": 1.3363885879516602, "learning_rate": 2.269248086447546e-05, "loss": 0.8954, "step": 1737 }, { "epoch": 2.325083612040134, "grad_norm": 1.2241244316101074, "learning_rate": 2.264745610085547e-05, "loss": 1.3225, "step": 1738 }, { "epoch": 2.326421404682274, "grad_norm": 1.3648251295089722, "learning_rate": 2.260243133723548e-05, "loss": 1.0267, "step": 1739 }, { "epoch": 2.327759197324415, "grad_norm": 1.5363645553588867, "learning_rate": 2.2557406573615487e-05, "loss": 0.9331, "step": 1740 }, { "epoch": 2.3290969899665552, "grad_norm": 1.1219274997711182, "learning_rate": 2.2512381809995496e-05, "loss": 0.9591, "step": 1741 }, { "epoch": 2.3304347826086955, "grad_norm": 1.2403483390808105, "learning_rate": 2.2467357046375508e-05, "loss": 0.8553, "step": 1742 }, { "epoch": 2.3317725752508363, "grad_norm": 1.2419347763061523, "learning_rate": 2.2422332282755517e-05, "loss": 0.9683, "step": 1743 }, { "epoch": 2.3331103678929765, "grad_norm": 1.2146024703979492, "learning_rate": 2.2377307519135525e-05, "loss": 1.1727, "step": 1744 }, { "epoch": 2.334448160535117, "grad_norm": 1.3546491861343384, "learning_rate": 2.2332282755515534e-05, "loss": 0.8247, "step": 1745 }, { "epoch": 2.3357859531772576, "grad_norm": 1.0356041193008423, "learning_rate": 2.2287257991895542e-05, "loss": 0.9872, "step": 1746 }, { "epoch": 2.337123745819398, "grad_norm": 1.2764581441879272, "learning_rate": 2.224223322827555e-05, "loss": 1.1876, "step": 1747 }, { "epoch": 2.3384615384615386, "grad_norm": 1.2266361713409424, "learning_rate": 2.219720846465556e-05, "loss": 0.8758, "step": 1748 }, { "epoch": 2.339799331103679, "grad_norm": 1.0712088346481323, "learning_rate": 2.215218370103557e-05, "loss": 1.3622, "step": 1749 }, { "epoch": 2.3411371237458196, "grad_norm": 1.194495677947998, "learning_rate": 2.210715893741558e-05, "loss": 0.7502, "step": 1750 }, { "epoch": 2.34247491638796, "grad_norm": 1.4769166707992554, "learning_rate": 2.206213417379559e-05, "loss": 1.2554, "step": 1751 }, { "epoch": 2.3438127090301, "grad_norm": 1.2552289962768555, "learning_rate": 2.2017109410175597e-05, "loss": 0.8602, "step": 1752 }, { "epoch": 2.345150501672241, "grad_norm": 0.9770278334617615, "learning_rate": 2.1972084646555606e-05, "loss": 1.5884, "step": 1753 }, { "epoch": 2.346488294314381, "grad_norm": 1.1476101875305176, "learning_rate": 2.1927059882935614e-05, "loss": 0.5061, "step": 1754 }, { "epoch": 2.3478260869565215, "grad_norm": 1.1554605960845947, "learning_rate": 2.1882035119315626e-05, "loss": 0.4689, "step": 1755 }, { "epoch": 2.3491638795986622, "grad_norm": 1.296264886856079, "learning_rate": 2.1837010355695635e-05, "loss": 0.873, "step": 1756 }, { "epoch": 2.3505016722408025, "grad_norm": 1.245503544807434, "learning_rate": 2.1791985592075644e-05, "loss": 1.2616, "step": 1757 }, { "epoch": 2.3518394648829433, "grad_norm": 1.1375371217727661, "learning_rate": 2.1746960828455652e-05, "loss": 1.1452, "step": 1758 }, { "epoch": 2.3531772575250836, "grad_norm": 0.955712080001831, "learning_rate": 2.170193606483566e-05, "loss": 1.4884, "step": 1759 }, { "epoch": 2.3545150501672243, "grad_norm": 1.0706889629364014, "learning_rate": 2.165691130121567e-05, "loss": 0.7082, "step": 1760 }, { "epoch": 2.3558528428093646, "grad_norm": 1.3834351301193237, "learning_rate": 2.161188653759568e-05, "loss": 0.5975, "step": 1761 }, { "epoch": 2.357190635451505, "grad_norm": 1.1080961227416992, "learning_rate": 2.156686177397569e-05, "loss": 0.9926, "step": 1762 }, { "epoch": 2.3585284280936456, "grad_norm": 1.2083733081817627, "learning_rate": 2.15218370103557e-05, "loss": 0.6541, "step": 1763 }, { "epoch": 2.359866220735786, "grad_norm": 1.4147807359695435, "learning_rate": 2.1476812246735707e-05, "loss": 1.1652, "step": 1764 }, { "epoch": 2.361204013377926, "grad_norm": 0.9736313819885254, "learning_rate": 2.1431787483115716e-05, "loss": 1.1962, "step": 1765 }, { "epoch": 2.362541806020067, "grad_norm": 1.0683282613754272, "learning_rate": 2.1386762719495724e-05, "loss": 0.8665, "step": 1766 }, { "epoch": 2.363879598662207, "grad_norm": 0.9101349115371704, "learning_rate": 2.1341737955875733e-05, "loss": 1.4562, "step": 1767 }, { "epoch": 2.365217391304348, "grad_norm": 0.8101057410240173, "learning_rate": 2.129671319225574e-05, "loss": 1.436, "step": 1768 }, { "epoch": 2.3665551839464882, "grad_norm": 1.0484764575958252, "learning_rate": 2.1251688428635753e-05, "loss": 1.1506, "step": 1769 }, { "epoch": 2.367892976588629, "grad_norm": 1.45047926902771, "learning_rate": 2.1206663665015762e-05, "loss": 0.3479, "step": 1770 }, { "epoch": 2.3692307692307693, "grad_norm": 1.1299214363098145, "learning_rate": 2.116163890139577e-05, "loss": 1.1567, "step": 1771 }, { "epoch": 2.3705685618729095, "grad_norm": 1.1455305814743042, "learning_rate": 2.111661413777578e-05, "loss": 0.758, "step": 1772 }, { "epoch": 2.3719063545150503, "grad_norm": 1.1516034603118896, "learning_rate": 2.1071589374155788e-05, "loss": 0.6784, "step": 1773 }, { "epoch": 2.3732441471571906, "grad_norm": 1.4549493789672852, "learning_rate": 2.1026564610535796e-05, "loss": 0.9763, "step": 1774 }, { "epoch": 2.374581939799331, "grad_norm": 1.4241305589675903, "learning_rate": 2.0981539846915805e-05, "loss": 0.6554, "step": 1775 }, { "epoch": 2.3759197324414716, "grad_norm": 1.1807152032852173, "learning_rate": 2.0936515083295814e-05, "loss": 1.0083, "step": 1776 }, { "epoch": 2.377257525083612, "grad_norm": 1.2309315204620361, "learning_rate": 2.0891490319675822e-05, "loss": 1.4019, "step": 1777 }, { "epoch": 2.3785953177257526, "grad_norm": 1.0805270671844482, "learning_rate": 2.0846465556055834e-05, "loss": 0.929, "step": 1778 }, { "epoch": 2.379933110367893, "grad_norm": 1.2308298349380493, "learning_rate": 2.0801440792435843e-05, "loss": 1.3442, "step": 1779 }, { "epoch": 2.3812709030100336, "grad_norm": 1.1787298917770386, "learning_rate": 2.075641602881585e-05, "loss": 1.087, "step": 1780 }, { "epoch": 2.382608695652174, "grad_norm": 1.1068018674850464, "learning_rate": 2.071139126519586e-05, "loss": 1.0062, "step": 1781 }, { "epoch": 2.383946488294314, "grad_norm": 1.109654426574707, "learning_rate": 2.066636650157587e-05, "loss": 0.664, "step": 1782 }, { "epoch": 2.385284280936455, "grad_norm": 1.4103230237960815, "learning_rate": 2.0621341737955877e-05, "loss": 0.9542, "step": 1783 }, { "epoch": 2.3866220735785952, "grad_norm": 1.1636139154434204, "learning_rate": 2.0576316974335886e-05, "loss": 0.713, "step": 1784 }, { "epoch": 2.387959866220736, "grad_norm": 1.1543084383010864, "learning_rate": 2.0531292210715894e-05, "loss": 1.1145, "step": 1785 }, { "epoch": 2.3892976588628763, "grad_norm": 1.088383674621582, "learning_rate": 2.0486267447095903e-05, "loss": 0.852, "step": 1786 }, { "epoch": 2.3906354515050166, "grad_norm": 1.3434123992919922, "learning_rate": 2.0441242683475915e-05, "loss": 0.7964, "step": 1787 }, { "epoch": 2.3919732441471573, "grad_norm": 1.1704776287078857, "learning_rate": 2.0396217919855923e-05, "loss": 1.026, "step": 1788 }, { "epoch": 2.3933110367892976, "grad_norm": 1.2548800706863403, "learning_rate": 2.0351193156235932e-05, "loss": 1.0465, "step": 1789 }, { "epoch": 2.3946488294314383, "grad_norm": 1.332874059677124, "learning_rate": 2.030616839261594e-05, "loss": 0.8655, "step": 1790 }, { "epoch": 2.3959866220735786, "grad_norm": 0.925167977809906, "learning_rate": 2.026114362899595e-05, "loss": 0.9586, "step": 1791 }, { "epoch": 2.397324414715719, "grad_norm": 1.582779884338379, "learning_rate": 2.0216118865375958e-05, "loss": 0.9598, "step": 1792 }, { "epoch": 2.3986622073578596, "grad_norm": 1.0422955751419067, "learning_rate": 2.0171094101755966e-05, "loss": 1.144, "step": 1793 }, { "epoch": 2.4, "grad_norm": 1.4884545803070068, "learning_rate": 2.0126069338135975e-05, "loss": 0.6358, "step": 1794 }, { "epoch": 2.4013377926421406, "grad_norm": 1.151254653930664, "learning_rate": 2.0081044574515983e-05, "loss": 1.5272, "step": 1795 }, { "epoch": 2.402675585284281, "grad_norm": 1.0272475481033325, "learning_rate": 2.0036019810895995e-05, "loss": 1.107, "step": 1796 }, { "epoch": 2.4040133779264212, "grad_norm": 1.1820266246795654, "learning_rate": 1.9990995047276004e-05, "loss": 0.812, "step": 1797 }, { "epoch": 2.405351170568562, "grad_norm": 1.1445200443267822, "learning_rate": 1.9945970283656013e-05, "loss": 1.0356, "step": 1798 }, { "epoch": 2.4066889632107022, "grad_norm": 0.9422075748443604, "learning_rate": 1.990094552003602e-05, "loss": 1.2716, "step": 1799 }, { "epoch": 2.408026755852843, "grad_norm": 1.2195419073104858, "learning_rate": 1.985592075641603e-05, "loss": 1.0983, "step": 1800 }, { "epoch": 2.4093645484949833, "grad_norm": 1.0389440059661865, "learning_rate": 1.981089599279604e-05, "loss": 1.2389, "step": 1801 }, { "epoch": 2.4107023411371236, "grad_norm": 0.952129602432251, "learning_rate": 1.9765871229176047e-05, "loss": 0.6066, "step": 1802 }, { "epoch": 2.4120401337792643, "grad_norm": 1.1310745477676392, "learning_rate": 1.9720846465556056e-05, "loss": 0.7812, "step": 1803 }, { "epoch": 2.4133779264214046, "grad_norm": 1.4534422159194946, "learning_rate": 1.9675821701936064e-05, "loss": 1.0928, "step": 1804 }, { "epoch": 2.4147157190635453, "grad_norm": 1.0552500486373901, "learning_rate": 1.9630796938316076e-05, "loss": 1.0877, "step": 1805 }, { "epoch": 2.4160535117056856, "grad_norm": 1.2675952911376953, "learning_rate": 1.9585772174696085e-05, "loss": 0.4539, "step": 1806 }, { "epoch": 2.417391304347826, "grad_norm": 1.1485333442687988, "learning_rate": 1.9540747411076093e-05, "loss": 0.993, "step": 1807 }, { "epoch": 2.4187290969899666, "grad_norm": 1.238770842552185, "learning_rate": 1.9495722647456102e-05, "loss": 0.8684, "step": 1808 }, { "epoch": 2.420066889632107, "grad_norm": 1.22825026512146, "learning_rate": 1.945069788383611e-05, "loss": 1.023, "step": 1809 }, { "epoch": 2.4214046822742477, "grad_norm": 1.2012650966644287, "learning_rate": 1.940567312021612e-05, "loss": 0.9327, "step": 1810 }, { "epoch": 2.422742474916388, "grad_norm": 1.0674593448638916, "learning_rate": 1.9360648356596128e-05, "loss": 1.2149, "step": 1811 }, { "epoch": 2.4240802675585282, "grad_norm": 1.0509954690933228, "learning_rate": 1.9315623592976136e-05, "loss": 1.4321, "step": 1812 }, { "epoch": 2.425418060200669, "grad_norm": 1.2722355127334595, "learning_rate": 1.9270598829356145e-05, "loss": 0.387, "step": 1813 }, { "epoch": 2.4267558528428093, "grad_norm": 1.2516536712646484, "learning_rate": 1.9225574065736157e-05, "loss": 0.8528, "step": 1814 }, { "epoch": 2.42809364548495, "grad_norm": 1.777623176574707, "learning_rate": 1.9180549302116165e-05, "loss": 0.9748, "step": 1815 }, { "epoch": 2.4294314381270903, "grad_norm": 1.517867922782898, "learning_rate": 1.9135524538496174e-05, "loss": 0.9008, "step": 1816 }, { "epoch": 2.430769230769231, "grad_norm": 1.2444127798080444, "learning_rate": 1.9090499774876183e-05, "loss": 1.0568, "step": 1817 }, { "epoch": 2.4321070234113713, "grad_norm": 1.157259225845337, "learning_rate": 1.904547501125619e-05, "loss": 0.8248, "step": 1818 }, { "epoch": 2.4334448160535116, "grad_norm": 1.0810209512710571, "learning_rate": 1.90004502476362e-05, "loss": 0.9286, "step": 1819 }, { "epoch": 2.4347826086956523, "grad_norm": 1.2783986330032349, "learning_rate": 1.895542548401621e-05, "loss": 1.191, "step": 1820 }, { "epoch": 2.4361204013377926, "grad_norm": 1.5304255485534668, "learning_rate": 1.8910400720396217e-05, "loss": 0.9669, "step": 1821 }, { "epoch": 2.437458193979933, "grad_norm": 0.8730368614196777, "learning_rate": 1.8865375956776226e-05, "loss": 1.8674, "step": 1822 }, { "epoch": 2.4387959866220736, "grad_norm": 0.7612476348876953, "learning_rate": 1.8820351193156237e-05, "loss": 1.1188, "step": 1823 }, { "epoch": 2.440133779264214, "grad_norm": 0.917701005935669, "learning_rate": 1.8775326429536246e-05, "loss": 1.5137, "step": 1824 }, { "epoch": 2.4414715719063547, "grad_norm": 1.051805853843689, "learning_rate": 1.8730301665916255e-05, "loss": 0.7611, "step": 1825 }, { "epoch": 2.442809364548495, "grad_norm": 1.0616856813430786, "learning_rate": 1.8685276902296263e-05, "loss": 1.0647, "step": 1826 }, { "epoch": 2.4441471571906357, "grad_norm": 1.0747015476226807, "learning_rate": 1.8640252138676272e-05, "loss": 0.8052, "step": 1827 }, { "epoch": 2.445484949832776, "grad_norm": 1.0665451288223267, "learning_rate": 1.859522737505628e-05, "loss": 1.3131, "step": 1828 }, { "epoch": 2.4468227424749163, "grad_norm": 1.2551919221878052, "learning_rate": 1.855020261143629e-05, "loss": 1.0341, "step": 1829 }, { "epoch": 2.448160535117057, "grad_norm": 1.1262199878692627, "learning_rate": 1.8505177847816298e-05, "loss": 1.4566, "step": 1830 }, { "epoch": 2.4494983277591973, "grad_norm": 1.1426798105239868, "learning_rate": 1.8460153084196306e-05, "loss": 0.4731, "step": 1831 }, { "epoch": 2.4508361204013376, "grad_norm": 0.8868839740753174, "learning_rate": 1.8415128320576318e-05, "loss": 1.5829, "step": 1832 }, { "epoch": 2.4521739130434783, "grad_norm": 1.0325320959091187, "learning_rate": 1.8370103556956327e-05, "loss": 0.707, "step": 1833 }, { "epoch": 2.4535117056856186, "grad_norm": 1.1204519271850586, "learning_rate": 1.8325078793336335e-05, "loss": 0.58, "step": 1834 }, { "epoch": 2.4548494983277593, "grad_norm": 1.3483015298843384, "learning_rate": 1.8280054029716344e-05, "loss": 0.8913, "step": 1835 }, { "epoch": 2.4561872909698996, "grad_norm": 1.328149676322937, "learning_rate": 1.8235029266096353e-05, "loss": 0.8025, "step": 1836 }, { "epoch": 2.4575250836120404, "grad_norm": 1.5574856996536255, "learning_rate": 1.819000450247636e-05, "loss": 0.856, "step": 1837 }, { "epoch": 2.4588628762541807, "grad_norm": 1.2833364009857178, "learning_rate": 1.814497973885637e-05, "loss": 1.1202, "step": 1838 }, { "epoch": 2.460200668896321, "grad_norm": 1.504071593284607, "learning_rate": 1.809995497523638e-05, "loss": 0.8259, "step": 1839 }, { "epoch": 2.4615384615384617, "grad_norm": 0.9362914562225342, "learning_rate": 1.805493021161639e-05, "loss": 1.1352, "step": 1840 }, { "epoch": 2.462876254180602, "grad_norm": 1.0429112911224365, "learning_rate": 1.80099054479964e-05, "loss": 0.6208, "step": 1841 }, { "epoch": 2.4642140468227423, "grad_norm": 0.8809786438941956, "learning_rate": 1.7964880684376407e-05, "loss": 0.7602, "step": 1842 }, { "epoch": 2.465551839464883, "grad_norm": 0.8987886309623718, "learning_rate": 1.7919855920756416e-05, "loss": 0.3419, "step": 1843 }, { "epoch": 2.4668896321070233, "grad_norm": 1.066552758216858, "learning_rate": 1.7874831157136428e-05, "loss": 1.1363, "step": 1844 }, { "epoch": 2.468227424749164, "grad_norm": 1.1129825115203857, "learning_rate": 1.7829806393516437e-05, "loss": 1.0859, "step": 1845 }, { "epoch": 2.4695652173913043, "grad_norm": 1.2088080644607544, "learning_rate": 1.7784781629896445e-05, "loss": 0.9777, "step": 1846 }, { "epoch": 2.470903010033445, "grad_norm": 1.3047457933425903, "learning_rate": 1.7739756866276454e-05, "loss": 0.7029, "step": 1847 }, { "epoch": 2.4722408026755853, "grad_norm": 1.3074170351028442, "learning_rate": 1.7694732102656462e-05, "loss": 1.1114, "step": 1848 }, { "epoch": 2.4735785953177256, "grad_norm": 1.1256179809570312, "learning_rate": 1.764970733903647e-05, "loss": 1.0348, "step": 1849 }, { "epoch": 2.4749163879598663, "grad_norm": 1.1836167573928833, "learning_rate": 1.7604682575416483e-05, "loss": 1.1177, "step": 1850 }, { "epoch": 2.4762541806020066, "grad_norm": 1.1081132888793945, "learning_rate": 1.755965781179649e-05, "loss": 0.9263, "step": 1851 }, { "epoch": 2.477591973244147, "grad_norm": 0.8676427602767944, "learning_rate": 1.75146330481765e-05, "loss": 1.5092, "step": 1852 }, { "epoch": 2.4789297658862877, "grad_norm": 1.0682950019836426, "learning_rate": 1.746960828455651e-05, "loss": 1.1037, "step": 1853 }, { "epoch": 2.480267558528428, "grad_norm": 0.8002800941467285, "learning_rate": 1.7424583520936517e-05, "loss": 1.4235, "step": 1854 }, { "epoch": 2.4816053511705687, "grad_norm": 3.852647304534912, "learning_rate": 1.7379558757316526e-05, "loss": 0.5508, "step": 1855 }, { "epoch": 2.482943143812709, "grad_norm": 0.9870190024375916, "learning_rate": 1.7334533993696534e-05, "loss": 1.0963, "step": 1856 }, { "epoch": 2.4842809364548497, "grad_norm": 1.1127872467041016, "learning_rate": 1.7289509230076543e-05, "loss": 0.9211, "step": 1857 }, { "epoch": 2.48561872909699, "grad_norm": 1.0517997741699219, "learning_rate": 1.724448446645655e-05, "loss": 0.9661, "step": 1858 }, { "epoch": 2.4869565217391303, "grad_norm": 1.0883008241653442, "learning_rate": 1.7199459702836564e-05, "loss": 1.0869, "step": 1859 }, { "epoch": 2.488294314381271, "grad_norm": 0.9822888374328613, "learning_rate": 1.7154434939216572e-05, "loss": 1.1655, "step": 1860 }, { "epoch": 2.4896321070234113, "grad_norm": 1.487812876701355, "learning_rate": 1.710941017559658e-05, "loss": 0.6279, "step": 1861 }, { "epoch": 2.4909698996655516, "grad_norm": 1.1666566133499146, "learning_rate": 1.706438541197659e-05, "loss": 1.049, "step": 1862 }, { "epoch": 2.4923076923076923, "grad_norm": 1.2897906303405762, "learning_rate": 1.7019360648356598e-05, "loss": 1.3856, "step": 1863 }, { "epoch": 2.4936454849498326, "grad_norm": 1.2394946813583374, "learning_rate": 1.6974335884736607e-05, "loss": 1.0223, "step": 1864 }, { "epoch": 2.4949832775919734, "grad_norm": 1.536291241645813, "learning_rate": 1.6929311121116615e-05, "loss": 0.7075, "step": 1865 }, { "epoch": 2.4963210702341136, "grad_norm": 0.7085421085357666, "learning_rate": 1.6884286357496624e-05, "loss": 0.626, "step": 1866 }, { "epoch": 2.4976588628762544, "grad_norm": 1.1903938055038452, "learning_rate": 1.6839261593876632e-05, "loss": 1.4943, "step": 1867 }, { "epoch": 2.4989966555183947, "grad_norm": 0.9671469330787659, "learning_rate": 1.6794236830256644e-05, "loss": 1.4292, "step": 1868 }, { "epoch": 2.500334448160535, "grad_norm": 1.2777341604232788, "learning_rate": 1.6749212066636653e-05, "loss": 1.371, "step": 1869 }, { "epoch": 2.5016722408026757, "grad_norm": 1.1267082691192627, "learning_rate": 1.670418730301666e-05, "loss": 1.0386, "step": 1870 }, { "epoch": 2.503010033444816, "grad_norm": 1.0690659284591675, "learning_rate": 1.665916253939667e-05, "loss": 0.9463, "step": 1871 }, { "epoch": 2.5043478260869563, "grad_norm": 1.0973694324493408, "learning_rate": 1.661413777577668e-05, "loss": 0.7422, "step": 1872 }, { "epoch": 2.505685618729097, "grad_norm": 1.0973224639892578, "learning_rate": 1.6569113012156687e-05, "loss": 0.6723, "step": 1873 }, { "epoch": 2.5070234113712373, "grad_norm": 1.0682529211044312, "learning_rate": 1.6524088248536696e-05, "loss": 1.0982, "step": 1874 }, { "epoch": 2.508361204013378, "grad_norm": 1.1073600053787231, "learning_rate": 1.6479063484916704e-05, "loss": 0.8828, "step": 1875 }, { "epoch": 2.5096989966555183, "grad_norm": 0.9082316160202026, "learning_rate": 1.6434038721296713e-05, "loss": 1.1856, "step": 1876 }, { "epoch": 2.511036789297659, "grad_norm": 0.7870885729789734, "learning_rate": 1.6389013957676725e-05, "loss": 1.2523, "step": 1877 }, { "epoch": 2.5123745819397993, "grad_norm": 1.105562448501587, "learning_rate": 1.6343989194056734e-05, "loss": 0.5557, "step": 1878 }, { "epoch": 2.5137123745819396, "grad_norm": 1.1683121919631958, "learning_rate": 1.6298964430436742e-05, "loss": 1.0643, "step": 1879 }, { "epoch": 2.5150501672240804, "grad_norm": 1.3365424871444702, "learning_rate": 1.625393966681675e-05, "loss": 1.1505, "step": 1880 }, { "epoch": 2.5163879598662207, "grad_norm": 1.2845288515090942, "learning_rate": 1.620891490319676e-05, "loss": 0.787, "step": 1881 }, { "epoch": 2.517725752508361, "grad_norm": 0.5601235628128052, "learning_rate": 1.6163890139576768e-05, "loss": 0.5122, "step": 1882 }, { "epoch": 2.5190635451505017, "grad_norm": 1.0477005243301392, "learning_rate": 1.6118865375956776e-05, "loss": 0.7563, "step": 1883 }, { "epoch": 2.5204013377926424, "grad_norm": 1.183910608291626, "learning_rate": 1.6073840612336785e-05, "loss": 0.8595, "step": 1884 }, { "epoch": 2.5217391304347827, "grad_norm": 1.1977518796920776, "learning_rate": 1.6028815848716794e-05, "loss": 0.5492, "step": 1885 }, { "epoch": 2.523076923076923, "grad_norm": 1.104203224182129, "learning_rate": 1.5983791085096806e-05, "loss": 1.2017, "step": 1886 }, { "epoch": 2.5244147157190637, "grad_norm": 1.2420964241027832, "learning_rate": 1.5938766321476814e-05, "loss": 1.0783, "step": 1887 }, { "epoch": 2.525752508361204, "grad_norm": 1.2682381868362427, "learning_rate": 1.5893741557856823e-05, "loss": 0.5764, "step": 1888 }, { "epoch": 2.5270903010033443, "grad_norm": 1.061891794204712, "learning_rate": 1.584871679423683e-05, "loss": 0.5618, "step": 1889 }, { "epoch": 2.528428093645485, "grad_norm": 1.5407414436340332, "learning_rate": 1.580369203061684e-05, "loss": 0.88, "step": 1890 }, { "epoch": 2.5297658862876253, "grad_norm": 1.427150845527649, "learning_rate": 1.575866726699685e-05, "loss": 0.8215, "step": 1891 }, { "epoch": 2.5311036789297656, "grad_norm": 1.1984659433364868, "learning_rate": 1.5713642503376857e-05, "loss": 0.9356, "step": 1892 }, { "epoch": 2.5324414715719064, "grad_norm": 1.241512656211853, "learning_rate": 1.5668617739756866e-05, "loss": 0.6812, "step": 1893 }, { "epoch": 2.533779264214047, "grad_norm": 1.1882518529891968, "learning_rate": 1.5623592976136874e-05, "loss": 0.8977, "step": 1894 }, { "epoch": 2.5351170568561874, "grad_norm": 1.09954833984375, "learning_rate": 1.5578568212516886e-05, "loss": 1.3995, "step": 1895 }, { "epoch": 2.5364548494983277, "grad_norm": 1.0280219316482544, "learning_rate": 1.5533543448896895e-05, "loss": 1.048, "step": 1896 }, { "epoch": 2.5377926421404684, "grad_norm": 1.1472655534744263, "learning_rate": 1.5488518685276903e-05, "loss": 0.8748, "step": 1897 }, { "epoch": 2.5391304347826087, "grad_norm": 1.2066268920898438, "learning_rate": 1.5443493921656912e-05, "loss": 1.1831, "step": 1898 }, { "epoch": 2.540468227424749, "grad_norm": 1.6384567022323608, "learning_rate": 1.539846915803692e-05, "loss": 0.5115, "step": 1899 }, { "epoch": 2.5418060200668897, "grad_norm": 1.2105473279953003, "learning_rate": 1.535344439441693e-05, "loss": 1.0732, "step": 1900 }, { "epoch": 2.54314381270903, "grad_norm": 0.9396874904632568, "learning_rate": 1.5308419630796938e-05, "loss": 1.1214, "step": 1901 }, { "epoch": 2.5444816053511703, "grad_norm": 1.2806360721588135, "learning_rate": 1.5263394867176946e-05, "loss": 0.673, "step": 1902 }, { "epoch": 2.545819397993311, "grad_norm": 1.1149977445602417, "learning_rate": 1.5218370103556957e-05, "loss": 0.7108, "step": 1903 }, { "epoch": 2.5471571906354518, "grad_norm": 1.0973268747329712, "learning_rate": 1.5173345339936967e-05, "loss": 1.085, "step": 1904 }, { "epoch": 2.548494983277592, "grad_norm": 1.1979267597198486, "learning_rate": 1.5128320576316976e-05, "loss": 0.7764, "step": 1905 }, { "epoch": 2.5498327759197323, "grad_norm": 1.1910693645477295, "learning_rate": 1.5083295812696984e-05, "loss": 0.8246, "step": 1906 }, { "epoch": 2.551170568561873, "grad_norm": 1.1525737047195435, "learning_rate": 1.5038271049076993e-05, "loss": 1.0309, "step": 1907 }, { "epoch": 2.5525083612040134, "grad_norm": 1.1554816961288452, "learning_rate": 1.4993246285457001e-05, "loss": 1.1261, "step": 1908 }, { "epoch": 2.5538461538461537, "grad_norm": 1.2663363218307495, "learning_rate": 1.4948221521837012e-05, "loss": 0.8414, "step": 1909 }, { "epoch": 2.5551839464882944, "grad_norm": 0.9890291094779968, "learning_rate": 1.490319675821702e-05, "loss": 1.7599, "step": 1910 }, { "epoch": 2.5565217391304347, "grad_norm": 1.3378098011016846, "learning_rate": 1.4858171994597029e-05, "loss": 1.187, "step": 1911 }, { "epoch": 2.5578595317725754, "grad_norm": 0.920430600643158, "learning_rate": 1.4813147230977037e-05, "loss": 0.6821, "step": 1912 }, { "epoch": 2.5591973244147157, "grad_norm": 1.643335223197937, "learning_rate": 1.4768122467357048e-05, "loss": 0.8469, "step": 1913 }, { "epoch": 2.5605351170568564, "grad_norm": 1.3408782482147217, "learning_rate": 1.4723097703737058e-05, "loss": 1.0189, "step": 1914 }, { "epoch": 2.5618729096989967, "grad_norm": 1.2192609310150146, "learning_rate": 1.4678072940117067e-05, "loss": 1.0288, "step": 1915 }, { "epoch": 2.563210702341137, "grad_norm": 0.9527949690818787, "learning_rate": 1.4633048176497075e-05, "loss": 1.3139, "step": 1916 }, { "epoch": 2.5645484949832777, "grad_norm": 1.1736693382263184, "learning_rate": 1.4588023412877084e-05, "loss": 0.9382, "step": 1917 }, { "epoch": 2.565886287625418, "grad_norm": 1.4159702062606812, "learning_rate": 1.4542998649257092e-05, "loss": 0.6853, "step": 1918 }, { "epoch": 2.5672240802675583, "grad_norm": 1.1225204467773438, "learning_rate": 1.4497973885637101e-05, "loss": 1.079, "step": 1919 }, { "epoch": 2.568561872909699, "grad_norm": 1.1435071229934692, "learning_rate": 1.445294912201711e-05, "loss": 1.2635, "step": 1920 }, { "epoch": 2.5698996655518394, "grad_norm": 1.0206356048583984, "learning_rate": 1.4407924358397118e-05, "loss": 1.4776, "step": 1921 }, { "epoch": 2.57123745819398, "grad_norm": 1.7108975648880005, "learning_rate": 1.436289959477713e-05, "loss": 0.9222, "step": 1922 }, { "epoch": 2.5725752508361204, "grad_norm": 1.0113279819488525, "learning_rate": 1.4317874831157139e-05, "loss": 0.9089, "step": 1923 }, { "epoch": 2.573913043478261, "grad_norm": 1.2199150323867798, "learning_rate": 1.4272850067537147e-05, "loss": 0.8124, "step": 1924 }, { "epoch": 2.5752508361204014, "grad_norm": 0.9723790287971497, "learning_rate": 1.4227825303917156e-05, "loss": 1.6872, "step": 1925 }, { "epoch": 2.5765886287625417, "grad_norm": 1.3184490203857422, "learning_rate": 1.4182800540297164e-05, "loss": 0.6796, "step": 1926 }, { "epoch": 2.5779264214046824, "grad_norm": 0.9623216986656189, "learning_rate": 1.4137775776677173e-05, "loss": 0.9223, "step": 1927 }, { "epoch": 2.5792642140468227, "grad_norm": 1.3507014513015747, "learning_rate": 1.4092751013057182e-05, "loss": 0.8685, "step": 1928 }, { "epoch": 2.580602006688963, "grad_norm": 1.5632047653198242, "learning_rate": 1.404772624943719e-05, "loss": 1.086, "step": 1929 }, { "epoch": 2.5819397993311037, "grad_norm": 1.377564787864685, "learning_rate": 1.4002701485817199e-05, "loss": 0.9658, "step": 1930 }, { "epoch": 2.583277591973244, "grad_norm": 1.2215312719345093, "learning_rate": 1.395767672219721e-05, "loss": 1.0862, "step": 1931 }, { "epoch": 2.5846153846153848, "grad_norm": 1.145735740661621, "learning_rate": 1.391265195857722e-05, "loss": 1.2792, "step": 1932 }, { "epoch": 2.585953177257525, "grad_norm": 1.2771897315979004, "learning_rate": 1.3867627194957228e-05, "loss": 1.1728, "step": 1933 }, { "epoch": 2.587290969899666, "grad_norm": 1.2495319843292236, "learning_rate": 1.3822602431337236e-05, "loss": 0.8183, "step": 1934 }, { "epoch": 2.588628762541806, "grad_norm": 1.5561614036560059, "learning_rate": 1.3777577667717245e-05, "loss": 0.9294, "step": 1935 }, { "epoch": 2.5899665551839464, "grad_norm": 1.4182900190353394, "learning_rate": 1.3732552904097254e-05, "loss": 1.146, "step": 1936 }, { "epoch": 2.591304347826087, "grad_norm": 1.2488436698913574, "learning_rate": 1.3687528140477262e-05, "loss": 0.6245, "step": 1937 }, { "epoch": 2.5926421404682274, "grad_norm": 1.273505687713623, "learning_rate": 1.364250337685727e-05, "loss": 0.8321, "step": 1938 }, { "epoch": 2.5939799331103677, "grad_norm": 1.0358428955078125, "learning_rate": 1.359747861323728e-05, "loss": 0.9381, "step": 1939 }, { "epoch": 2.5953177257525084, "grad_norm": 1.0949110984802246, "learning_rate": 1.3552453849617291e-05, "loss": 1.4436, "step": 1940 }, { "epoch": 2.5966555183946487, "grad_norm": 1.0827977657318115, "learning_rate": 1.35074290859973e-05, "loss": 1.3695, "step": 1941 }, { "epoch": 2.5979933110367894, "grad_norm": 1.3221254348754883, "learning_rate": 1.3462404322377309e-05, "loss": 0.883, "step": 1942 }, { "epoch": 2.5993311036789297, "grad_norm": 1.330696702003479, "learning_rate": 1.3417379558757317e-05, "loss": 0.9751, "step": 1943 }, { "epoch": 2.6006688963210705, "grad_norm": 0.9758976101875305, "learning_rate": 1.3372354795137326e-05, "loss": 1.3133, "step": 1944 }, { "epoch": 2.6020066889632107, "grad_norm": 0.8660216331481934, "learning_rate": 1.3327330031517334e-05, "loss": 1.1024, "step": 1945 }, { "epoch": 2.603344481605351, "grad_norm": 1.2315067052841187, "learning_rate": 1.3282305267897343e-05, "loss": 0.7998, "step": 1946 }, { "epoch": 2.6046822742474918, "grad_norm": 1.1562628746032715, "learning_rate": 1.3237280504277352e-05, "loss": 1.3548, "step": 1947 }, { "epoch": 2.606020066889632, "grad_norm": 1.4570300579071045, "learning_rate": 1.3192255740657362e-05, "loss": 1.0354, "step": 1948 }, { "epoch": 2.6073578595317723, "grad_norm": 0.8457752466201782, "learning_rate": 1.3147230977037372e-05, "loss": 0.7525, "step": 1949 }, { "epoch": 2.608695652173913, "grad_norm": 1.1738195419311523, "learning_rate": 1.310220621341738e-05, "loss": 1.3463, "step": 1950 }, { "epoch": 2.6100334448160534, "grad_norm": 1.1737287044525146, "learning_rate": 1.305718144979739e-05, "loss": 0.8589, "step": 1951 }, { "epoch": 2.611371237458194, "grad_norm": 1.2601444721221924, "learning_rate": 1.3012156686177398e-05, "loss": 0.8973, "step": 1952 }, { "epoch": 2.6127090301003344, "grad_norm": 0.9962211847305298, "learning_rate": 1.2967131922557408e-05, "loss": 1.1899, "step": 1953 }, { "epoch": 2.614046822742475, "grad_norm": 1.0661948919296265, "learning_rate": 1.2922107158937417e-05, "loss": 1.2756, "step": 1954 }, { "epoch": 2.6153846153846154, "grad_norm": 1.3405736684799194, "learning_rate": 1.2877082395317425e-05, "loss": 0.7138, "step": 1955 }, { "epoch": 2.6167224080267557, "grad_norm": 1.1791526079177856, "learning_rate": 1.2832057631697434e-05, "loss": 0.9403, "step": 1956 }, { "epoch": 2.6180602006688964, "grad_norm": 1.0099623203277588, "learning_rate": 1.2787032868077442e-05, "loss": 0.8853, "step": 1957 }, { "epoch": 2.6193979933110367, "grad_norm": 1.1736373901367188, "learning_rate": 1.2742008104457453e-05, "loss": 1.0082, "step": 1958 }, { "epoch": 2.620735785953177, "grad_norm": 1.2148722410202026, "learning_rate": 1.2696983340837463e-05, "loss": 1.2064, "step": 1959 }, { "epoch": 2.6220735785953178, "grad_norm": 1.094408392906189, "learning_rate": 1.2651958577217472e-05, "loss": 0.9928, "step": 1960 }, { "epoch": 2.623411371237458, "grad_norm": 1.1903018951416016, "learning_rate": 1.260693381359748e-05, "loss": 0.9445, "step": 1961 }, { "epoch": 2.624749163879599, "grad_norm": 1.101300597190857, "learning_rate": 1.2561909049977489e-05, "loss": 0.6501, "step": 1962 }, { "epoch": 2.626086956521739, "grad_norm": 1.079803466796875, "learning_rate": 1.2516884286357497e-05, "loss": 1.1167, "step": 1963 }, { "epoch": 2.62742474916388, "grad_norm": 0.9757705926895142, "learning_rate": 1.2471859522737506e-05, "loss": 0.7433, "step": 1964 }, { "epoch": 2.62876254180602, "grad_norm": 1.028402328491211, "learning_rate": 1.2426834759117516e-05, "loss": 1.6142, "step": 1965 }, { "epoch": 2.6301003344481604, "grad_norm": 0.6608441472053528, "learning_rate": 1.2381809995497525e-05, "loss": 0.5881, "step": 1966 }, { "epoch": 2.631438127090301, "grad_norm": 1.1534054279327393, "learning_rate": 1.2336785231877533e-05, "loss": 1.2169, "step": 1967 }, { "epoch": 2.6327759197324414, "grad_norm": 1.0863515138626099, "learning_rate": 1.2291760468257542e-05, "loss": 1.288, "step": 1968 }, { "epoch": 2.6341137123745817, "grad_norm": 1.3784714937210083, "learning_rate": 1.2246735704637552e-05, "loss": 1.2264, "step": 1969 }, { "epoch": 2.6354515050167224, "grad_norm": 1.5014134645462036, "learning_rate": 1.2201710941017561e-05, "loss": 0.7316, "step": 1970 }, { "epoch": 2.6367892976588627, "grad_norm": 1.3054801225662231, "learning_rate": 1.215668617739757e-05, "loss": 0.412, "step": 1971 }, { "epoch": 2.6381270903010035, "grad_norm": 0.8251444101333618, "learning_rate": 1.2111661413777578e-05, "loss": 1.0635, "step": 1972 }, { "epoch": 2.6394648829431437, "grad_norm": 1.0586771965026855, "learning_rate": 1.2066636650157587e-05, "loss": 0.681, "step": 1973 }, { "epoch": 2.6408026755852845, "grad_norm": 1.3109058141708374, "learning_rate": 1.2021611886537597e-05, "loss": 0.9528, "step": 1974 }, { "epoch": 2.6421404682274248, "grad_norm": 1.1887587308883667, "learning_rate": 1.1976587122917606e-05, "loss": 0.8634, "step": 1975 }, { "epoch": 2.643478260869565, "grad_norm": 1.2000380754470825, "learning_rate": 1.1931562359297614e-05, "loss": 1.1268, "step": 1976 }, { "epoch": 2.644816053511706, "grad_norm": 1.0287446975708008, "learning_rate": 1.1886537595677623e-05, "loss": 0.678, "step": 1977 }, { "epoch": 2.646153846153846, "grad_norm": 1.051858901977539, "learning_rate": 1.1841512832057633e-05, "loss": 0.6412, "step": 1978 }, { "epoch": 2.6474916387959864, "grad_norm": 1.2014086246490479, "learning_rate": 1.1796488068437642e-05, "loss": 0.9895, "step": 1979 }, { "epoch": 2.648829431438127, "grad_norm": 0.8527210354804993, "learning_rate": 1.175146330481765e-05, "loss": 0.9464, "step": 1980 }, { "epoch": 2.650167224080268, "grad_norm": 1.1812256574630737, "learning_rate": 1.1706438541197659e-05, "loss": 0.7605, "step": 1981 }, { "epoch": 2.651505016722408, "grad_norm": 1.463782787322998, "learning_rate": 1.1661413777577667e-05, "loss": 1.1127, "step": 1982 }, { "epoch": 2.6528428093645484, "grad_norm": 1.2627677917480469, "learning_rate": 1.1616389013957678e-05, "loss": 0.5517, "step": 1983 }, { "epoch": 2.654180602006689, "grad_norm": 1.1560112237930298, "learning_rate": 1.1571364250337686e-05, "loss": 1.1212, "step": 1984 }, { "epoch": 2.6555183946488294, "grad_norm": 1.3743358850479126, "learning_rate": 1.1526339486717695e-05, "loss": 0.9603, "step": 1985 }, { "epoch": 2.6568561872909697, "grad_norm": 1.1926093101501465, "learning_rate": 1.1481314723097703e-05, "loss": 1.112, "step": 1986 }, { "epoch": 2.6581939799331105, "grad_norm": 1.288248896598816, "learning_rate": 1.1436289959477714e-05, "loss": 0.8998, "step": 1987 }, { "epoch": 2.6595317725752508, "grad_norm": 1.1038404703140259, "learning_rate": 1.1391265195857722e-05, "loss": 0.2956, "step": 1988 }, { "epoch": 2.660869565217391, "grad_norm": 1.2742241621017456, "learning_rate": 1.134624043223773e-05, "loss": 1.2681, "step": 1989 }, { "epoch": 2.6622073578595318, "grad_norm": 0.9164013862609863, "learning_rate": 1.130121566861774e-05, "loss": 0.8526, "step": 1990 }, { "epoch": 2.6635451505016725, "grad_norm": 1.274473786354065, "learning_rate": 1.1256190904997748e-05, "loss": 0.9956, "step": 1991 }, { "epoch": 2.664882943143813, "grad_norm": 1.241410255432129, "learning_rate": 1.1211166141377758e-05, "loss": 0.8181, "step": 1992 }, { "epoch": 2.666220735785953, "grad_norm": 1.138505458831787, "learning_rate": 1.1166141377757767e-05, "loss": 1.4564, "step": 1993 }, { "epoch": 2.667558528428094, "grad_norm": 1.180283784866333, "learning_rate": 1.1121116614137775e-05, "loss": 1.3439, "step": 1994 }, { "epoch": 2.668896321070234, "grad_norm": 0.9829315543174744, "learning_rate": 1.1076091850517786e-05, "loss": 1.4337, "step": 1995 }, { "epoch": 2.6702341137123744, "grad_norm": 1.160563588142395, "learning_rate": 1.1031067086897794e-05, "loss": 0.5688, "step": 1996 }, { "epoch": 2.671571906354515, "grad_norm": 1.1791166067123413, "learning_rate": 1.0986042323277803e-05, "loss": 0.4572, "step": 1997 }, { "epoch": 2.6729096989966554, "grad_norm": 1.2810428142547607, "learning_rate": 1.0941017559657813e-05, "loss": 0.5095, "step": 1998 }, { "epoch": 2.6742474916387957, "grad_norm": 1.0259909629821777, "learning_rate": 1.0895992796037822e-05, "loss": 1.3085, "step": 1999 }, { "epoch": 2.6755852842809364, "grad_norm": 0.9915300607681274, "learning_rate": 1.085096803241783e-05, "loss": 0.5626, "step": 2000 } ], "logging_steps": 1, "max_steps": 2241, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.122019867483095e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }