{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0066889632107023, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013377926421404682, "grad_norm": 15.929469108581543, "learning_rate": 5e-06, "loss": 3.4069, "step": 1 }, { "epoch": 0.0026755852842809363, "grad_norm": 15.711925506591797, "learning_rate": 1e-05, "loss": 3.5693, "step": 2 }, { "epoch": 0.004013377926421404, "grad_norm": 15.229619979858398, "learning_rate": 1.5e-05, "loss": 3.3951, "step": 3 }, { "epoch": 0.005351170568561873, "grad_norm": 9.533045768737793, "learning_rate": 2e-05, "loss": 3.2976, "step": 4 }, { "epoch": 0.006688963210702341, "grad_norm": 5.610800266265869, "learning_rate": 2.5e-05, "loss": 2.9262, "step": 5 }, { "epoch": 0.008026755852842809, "grad_norm": 3.5882365703582764, "learning_rate": 3e-05, "loss": 2.4932, "step": 6 }, { "epoch": 0.009364548494983277, "grad_norm": 3.190317392349243, "learning_rate": 3.5e-05, "loss": 2.2106, "step": 7 }, { "epoch": 0.010702341137123745, "grad_norm": 3.090921401977539, "learning_rate": 4e-05, "loss": 2.3511, "step": 8 }, { "epoch": 0.012040133779264214, "grad_norm": 1.6608407497406006, "learning_rate": 4.5e-05, "loss": 2.0331, "step": 9 }, { "epoch": 0.013377926421404682, "grad_norm": 2.3017241954803467, "learning_rate": 5e-05, "loss": 2.4304, "step": 10 }, { "epoch": 0.01471571906354515, "grad_norm": 1.7485525608062744, "learning_rate": 5.500000000000001e-05, "loss": 2.4206, "step": 11 }, { "epoch": 0.016053511705685617, "grad_norm": 1.4397761821746826, "learning_rate": 6e-05, "loss": 1.9572, "step": 12 }, { "epoch": 0.017391304347826087, "grad_norm": 2.1730759143829346, "learning_rate": 6.500000000000001e-05, "loss": 2.0966, "step": 13 }, { "epoch": 0.018729096989966554, "grad_norm": 1.3649462461471558, "learning_rate": 7e-05, "loss": 1.8041, "step": 14 }, { "epoch": 0.020066889632107024, "grad_norm": 2.1070213317871094, "learning_rate": 7.500000000000001e-05, "loss": 2.1222, "step": 15 }, { "epoch": 0.02140468227424749, "grad_norm": 0.7499298453330994, "learning_rate": 8e-05, "loss": 1.9738, "step": 16 }, { "epoch": 0.02274247491638796, "grad_norm": 0.8609057068824768, "learning_rate": 8.5e-05, "loss": 1.8804, "step": 17 }, { "epoch": 0.024080267558528427, "grad_norm": 0.9405566453933716, "learning_rate": 9e-05, "loss": 1.8886, "step": 18 }, { "epoch": 0.025418060200668897, "grad_norm": 0.6386722922325134, "learning_rate": 9.5e-05, "loss": 1.9253, "step": 19 }, { "epoch": 0.026755852842809364, "grad_norm": 0.6493627429008484, "learning_rate": 0.0001, "loss": 1.8284, "step": 20 }, { "epoch": 0.028093645484949834, "grad_norm": 0.81246018409729, "learning_rate": 9.995497523638001e-05, "loss": 1.6218, "step": 21 }, { "epoch": 0.0294314381270903, "grad_norm": 0.7177069187164307, "learning_rate": 9.990995047276002e-05, "loss": 1.8196, "step": 22 }, { "epoch": 0.03076923076923077, "grad_norm": 0.8126530647277832, "learning_rate": 9.986492570914003e-05, "loss": 2.1567, "step": 23 }, { "epoch": 0.032107023411371234, "grad_norm": 0.5535350441932678, "learning_rate": 9.981990094552004e-05, "loss": 2.0801, "step": 24 }, { "epoch": 0.033444816053511704, "grad_norm": 0.6111394762992859, "learning_rate": 9.977487618190005e-05, "loss": 1.8416, "step": 25 }, { "epoch": 0.034782608695652174, "grad_norm": 3.8264172077178955, "learning_rate": 9.972985141828006e-05, "loss": 1.5934, "step": 26 }, { "epoch": 0.036120401337792644, "grad_norm": 0.5423453450202942, "learning_rate": 9.968482665466006e-05, "loss": 1.7254, "step": 27 }, { "epoch": 0.03745819397993311, "grad_norm": 0.5544487237930298, "learning_rate": 9.963980189104007e-05, "loss": 1.9727, "step": 28 }, { "epoch": 0.03879598662207358, "grad_norm": 0.4474441409111023, "learning_rate": 9.95947771274201e-05, "loss": 2.0919, "step": 29 }, { "epoch": 0.04013377926421405, "grad_norm": 0.4968682527542114, "learning_rate": 9.954975236380009e-05, "loss": 1.8365, "step": 30 }, { "epoch": 0.04147157190635452, "grad_norm": 0.8396462798118591, "learning_rate": 9.95047276001801e-05, "loss": 1.5896, "step": 31 }, { "epoch": 0.04280936454849498, "grad_norm": 0.45893028378486633, "learning_rate": 9.945970283656011e-05, "loss": 1.8436, "step": 32 }, { "epoch": 0.04414715719063545, "grad_norm": 0.5614059567451477, "learning_rate": 9.941467807294013e-05, "loss": 1.8984, "step": 33 }, { "epoch": 0.04548494983277592, "grad_norm": 0.4408206343650818, "learning_rate": 9.936965330932014e-05, "loss": 2.1722, "step": 34 }, { "epoch": 0.046822742474916385, "grad_norm": 0.4931108057498932, "learning_rate": 9.932462854570013e-05, "loss": 2.0494, "step": 35 }, { "epoch": 0.048160535117056855, "grad_norm": 1.6200001239776611, "learning_rate": 9.927960378208014e-05, "loss": 1.8094, "step": 36 }, { "epoch": 0.049498327759197325, "grad_norm": 4.759247303009033, "learning_rate": 9.923457901846016e-05, "loss": 1.7833, "step": 37 }, { "epoch": 0.050836120401337795, "grad_norm": 0.5733784437179565, "learning_rate": 9.918955425484017e-05, "loss": 1.7082, "step": 38 }, { "epoch": 0.05217391304347826, "grad_norm": 0.6474048495292664, "learning_rate": 9.914452949122017e-05, "loss": 1.6784, "step": 39 }, { "epoch": 0.05351170568561873, "grad_norm": 0.47228601574897766, "learning_rate": 9.909950472760019e-05, "loss": 1.8055, "step": 40 }, { "epoch": 0.0548494983277592, "grad_norm": 1.4710334539413452, "learning_rate": 9.90544799639802e-05, "loss": 1.8068, "step": 41 }, { "epoch": 0.05618729096989967, "grad_norm": 0.467542439699173, "learning_rate": 9.900945520036021e-05, "loss": 1.6524, "step": 42 }, { "epoch": 0.05752508361204013, "grad_norm": 0.4020284116268158, "learning_rate": 9.89644304367402e-05, "loss": 2.1742, "step": 43 }, { "epoch": 0.0588628762541806, "grad_norm": 0.7365009188652039, "learning_rate": 9.891940567312022e-05, "loss": 1.6893, "step": 44 }, { "epoch": 0.06020066889632107, "grad_norm": 0.46295851469039917, "learning_rate": 9.887438090950023e-05, "loss": 1.9585, "step": 45 }, { "epoch": 0.06153846153846154, "grad_norm": 0.5056664943695068, "learning_rate": 9.882935614588024e-05, "loss": 1.9968, "step": 46 }, { "epoch": 0.06287625418060201, "grad_norm": 0.3968791961669922, "learning_rate": 9.878433138226025e-05, "loss": 2.0845, "step": 47 }, { "epoch": 0.06421404682274247, "grad_norm": 0.4480941891670227, "learning_rate": 9.873930661864026e-05, "loss": 1.7947, "step": 48 }, { "epoch": 0.06555183946488294, "grad_norm": 0.4318523108959198, "learning_rate": 9.869428185502027e-05, "loss": 2.093, "step": 49 }, { "epoch": 0.06688963210702341, "grad_norm": 0.43793708086013794, "learning_rate": 9.864925709140028e-05, "loss": 1.7113, "step": 50 }, { "epoch": 0.06822742474916388, "grad_norm": 0.8287019729614258, "learning_rate": 9.860423232778028e-05, "loss": 1.6314, "step": 51 }, { "epoch": 0.06956521739130435, "grad_norm": 0.36626288294792175, "learning_rate": 9.855920756416029e-05, "loss": 1.9596, "step": 52 }, { "epoch": 0.07090301003344482, "grad_norm": 0.39505261182785034, "learning_rate": 9.85141828005403e-05, "loss": 1.9304, "step": 53 }, { "epoch": 0.07224080267558529, "grad_norm": 0.3602500557899475, "learning_rate": 9.846915803692031e-05, "loss": 1.9283, "step": 54 }, { "epoch": 0.07357859531772576, "grad_norm": 0.4631737470626831, "learning_rate": 9.842413327330032e-05, "loss": 1.88, "step": 55 }, { "epoch": 0.07491638795986622, "grad_norm": 0.8588123321533203, "learning_rate": 9.837910850968033e-05, "loss": 1.9562, "step": 56 }, { "epoch": 0.07625418060200669, "grad_norm": 0.4845622181892395, "learning_rate": 9.833408374606034e-05, "loss": 1.8439, "step": 57 }, { "epoch": 0.07759197324414716, "grad_norm": 0.3816477060317993, "learning_rate": 9.828905898244036e-05, "loss": 1.9877, "step": 58 }, { "epoch": 0.07892976588628763, "grad_norm": 0.4842838943004608, "learning_rate": 9.824403421882035e-05, "loss": 1.3894, "step": 59 }, { "epoch": 0.0802675585284281, "grad_norm": 0.5041085481643677, "learning_rate": 9.819900945520036e-05, "loss": 2.2597, "step": 60 }, { "epoch": 0.08160535117056857, "grad_norm": 0.30196964740753174, "learning_rate": 9.815398469158037e-05, "loss": 1.7422, "step": 61 }, { "epoch": 0.08294314381270904, "grad_norm": 0.3853469491004944, "learning_rate": 9.810895992796039e-05, "loss": 1.9314, "step": 62 }, { "epoch": 0.08428093645484949, "grad_norm": 1.472838044166565, "learning_rate": 9.806393516434039e-05, "loss": 1.8565, "step": 63 }, { "epoch": 0.08561872909698996, "grad_norm": 0.3896331787109375, "learning_rate": 9.80189104007204e-05, "loss": 1.7477, "step": 64 }, { "epoch": 0.08695652173913043, "grad_norm": 0.3785063922405243, "learning_rate": 9.797388563710042e-05, "loss": 2.1115, "step": 65 }, { "epoch": 0.0882943143812709, "grad_norm": 0.3835844397544861, "learning_rate": 9.792886087348043e-05, "loss": 1.8925, "step": 66 }, { "epoch": 0.08963210702341137, "grad_norm": 0.32654905319213867, "learning_rate": 9.788383610986042e-05, "loss": 1.9158, "step": 67 }, { "epoch": 0.09096989966555184, "grad_norm": 0.38295117020606995, "learning_rate": 9.783881134624043e-05, "loss": 1.7556, "step": 68 }, { "epoch": 0.09230769230769231, "grad_norm": 0.4461645781993866, "learning_rate": 9.779378658262045e-05, "loss": 1.4386, "step": 69 }, { "epoch": 0.09364548494983277, "grad_norm": 0.7319120764732361, "learning_rate": 9.774876181900046e-05, "loss": 2.0061, "step": 70 }, { "epoch": 0.09498327759197324, "grad_norm": 0.4447004199028015, "learning_rate": 9.770373705538046e-05, "loss": 1.8502, "step": 71 }, { "epoch": 0.09632107023411371, "grad_norm": 0.3706307113170624, "learning_rate": 9.765871229176046e-05, "loss": 1.9704, "step": 72 }, { "epoch": 0.09765886287625418, "grad_norm": 2.6014416217803955, "learning_rate": 9.761368752814049e-05, "loss": 1.673, "step": 73 }, { "epoch": 0.09899665551839465, "grad_norm": 0.36813560128211975, "learning_rate": 9.75686627645205e-05, "loss": 1.9571, "step": 74 }, { "epoch": 0.10033444816053512, "grad_norm": 0.2965801954269409, "learning_rate": 9.752363800090049e-05, "loss": 1.8679, "step": 75 }, { "epoch": 0.10167224080267559, "grad_norm": 0.36737844347953796, "learning_rate": 9.747861323728051e-05, "loss": 1.914, "step": 76 }, { "epoch": 0.10301003344481606, "grad_norm": 0.5539958477020264, "learning_rate": 9.743358847366052e-05, "loss": 1.7668, "step": 77 }, { "epoch": 0.10434782608695652, "grad_norm": 0.9024600982666016, "learning_rate": 9.738856371004053e-05, "loss": 1.602, "step": 78 }, { "epoch": 0.10568561872909699, "grad_norm": 0.47630879282951355, "learning_rate": 9.734353894642053e-05, "loss": 1.9614, "step": 79 }, { "epoch": 0.10702341137123746, "grad_norm": 0.33962881565093994, "learning_rate": 9.729851418280055e-05, "loss": 2.0143, "step": 80 }, { "epoch": 0.10836120401337793, "grad_norm": 0.34463438391685486, "learning_rate": 9.725348941918056e-05, "loss": 1.8305, "step": 81 }, { "epoch": 0.1096989966555184, "grad_norm": 0.5177729725837708, "learning_rate": 9.720846465556056e-05, "loss": 1.5823, "step": 82 }, { "epoch": 0.11103678929765887, "grad_norm": 0.3883843421936035, "learning_rate": 9.716343989194057e-05, "loss": 1.9247, "step": 83 }, { "epoch": 0.11237458193979934, "grad_norm": 0.3565119206905365, "learning_rate": 9.711841512832058e-05, "loss": 1.8981, "step": 84 }, { "epoch": 0.11371237458193979, "grad_norm": 0.9169286489486694, "learning_rate": 9.707339036470059e-05, "loss": 1.6898, "step": 85 }, { "epoch": 0.11505016722408026, "grad_norm": 2.372044801712036, "learning_rate": 9.70283656010806e-05, "loss": 2.0062, "step": 86 }, { "epoch": 0.11638795986622073, "grad_norm": 0.4511963725090027, "learning_rate": 9.698334083746061e-05, "loss": 1.9983, "step": 87 }, { "epoch": 0.1177257525083612, "grad_norm": 0.2942259907722473, "learning_rate": 9.693831607384062e-05, "loss": 1.9471, "step": 88 }, { "epoch": 0.11906354515050167, "grad_norm": 0.3808842897415161, "learning_rate": 9.689329131022062e-05, "loss": 1.9287, "step": 89 }, { "epoch": 0.12040133779264214, "grad_norm": 0.3474765121936798, "learning_rate": 9.684826654660063e-05, "loss": 2.166, "step": 90 }, { "epoch": 0.12173913043478261, "grad_norm": 0.4045035243034363, "learning_rate": 9.680324178298064e-05, "loss": 1.9044, "step": 91 }, { "epoch": 0.12307692307692308, "grad_norm": 0.3970213532447815, "learning_rate": 9.675821701936065e-05, "loss": 2.023, "step": 92 }, { "epoch": 0.12441471571906354, "grad_norm": 0.3404536843299866, "learning_rate": 9.671319225574066e-05, "loss": 1.7032, "step": 93 }, { "epoch": 0.12575250836120402, "grad_norm": 0.441057413816452, "learning_rate": 9.666816749212068e-05, "loss": 1.5145, "step": 94 }, { "epoch": 0.12709030100334448, "grad_norm": 0.6671501398086548, "learning_rate": 9.662314272850068e-05, "loss": 1.7196, "step": 95 }, { "epoch": 0.12842809364548494, "grad_norm": 0.37828826904296875, "learning_rate": 9.657811796488068e-05, "loss": 1.3918, "step": 96 }, { "epoch": 0.12976588628762542, "grad_norm": 0.4070121645927429, "learning_rate": 9.653309320126069e-05, "loss": 1.7636, "step": 97 }, { "epoch": 0.13110367892976588, "grad_norm": 0.349697470664978, "learning_rate": 9.648806843764072e-05, "loss": 2.383, "step": 98 }, { "epoch": 0.13244147157190636, "grad_norm": 0.3957515358924866, "learning_rate": 9.644304367402071e-05, "loss": 2.0249, "step": 99 }, { "epoch": 0.13377926421404682, "grad_norm": 0.6649832725524902, "learning_rate": 9.639801891040072e-05, "loss": 1.7339, "step": 100 }, { "epoch": 0.1351170568561873, "grad_norm": 0.6055648922920227, "learning_rate": 9.635299414678074e-05, "loss": 1.5403, "step": 101 }, { "epoch": 0.13645484949832776, "grad_norm": 0.4690856635570526, "learning_rate": 9.630796938316075e-05, "loss": 1.7279, "step": 102 }, { "epoch": 0.13779264214046824, "grad_norm": 0.4584232568740845, "learning_rate": 9.626294461954074e-05, "loss": 1.8899, "step": 103 }, { "epoch": 0.1391304347826087, "grad_norm": 0.3669104278087616, "learning_rate": 9.621791985592075e-05, "loss": 2.2324, "step": 104 }, { "epoch": 0.14046822742474915, "grad_norm": 0.506993293762207, "learning_rate": 9.617289509230078e-05, "loss": 1.7735, "step": 105 }, { "epoch": 0.14180602006688964, "grad_norm": 0.405987948179245, "learning_rate": 9.612787032868078e-05, "loss": 1.9026, "step": 106 }, { "epoch": 0.1431438127090301, "grad_norm": 0.8430399298667908, "learning_rate": 9.608284556506079e-05, "loss": 1.4946, "step": 107 }, { "epoch": 0.14448160535117058, "grad_norm": 0.38154295086860657, "learning_rate": 9.603782080144079e-05, "loss": 2.0693, "step": 108 }, { "epoch": 0.14581939799331103, "grad_norm": 0.43963882327079773, "learning_rate": 9.599279603782081e-05, "loss": 1.6948, "step": 109 }, { "epoch": 0.14715719063545152, "grad_norm": 0.46459442377090454, "learning_rate": 9.594777127420082e-05, "loss": 1.8412, "step": 110 }, { "epoch": 0.14849498327759197, "grad_norm": 0.409085214138031, "learning_rate": 9.590274651058083e-05, "loss": 1.7562, "step": 111 }, { "epoch": 0.14983277591973243, "grad_norm": 0.35461780428886414, "learning_rate": 9.585772174696084e-05, "loss": 1.9862, "step": 112 }, { "epoch": 0.15117056856187291, "grad_norm": 0.3632294237613678, "learning_rate": 9.581269698334084e-05, "loss": 2.0089, "step": 113 }, { "epoch": 0.15250836120401337, "grad_norm": 0.6132340431213379, "learning_rate": 9.576767221972085e-05, "loss": 2.116, "step": 114 }, { "epoch": 0.15384615384615385, "grad_norm": 0.4431364834308624, "learning_rate": 9.572264745610086e-05, "loss": 1.76, "step": 115 }, { "epoch": 0.1551839464882943, "grad_norm": 0.3065609037876129, "learning_rate": 9.567762269248087e-05, "loss": 1.7882, "step": 116 }, { "epoch": 0.1565217391304348, "grad_norm": 0.6607733964920044, "learning_rate": 9.563259792886088e-05, "loss": 1.8994, "step": 117 }, { "epoch": 0.15785953177257525, "grad_norm": 0.5508191585540771, "learning_rate": 9.558757316524089e-05, "loss": 2.1155, "step": 118 }, { "epoch": 0.1591973244147157, "grad_norm": 0.3941163420677185, "learning_rate": 9.55425484016209e-05, "loss": 2.2935, "step": 119 }, { "epoch": 0.1605351170568562, "grad_norm": 0.6320101618766785, "learning_rate": 9.54975236380009e-05, "loss": 1.801, "step": 120 }, { "epoch": 0.16187290969899665, "grad_norm": 0.2693134844303131, "learning_rate": 9.545249887438091e-05, "loss": 1.9149, "step": 121 }, { "epoch": 0.16321070234113713, "grad_norm": 0.36482131481170654, "learning_rate": 9.540747411076092e-05, "loss": 1.7871, "step": 122 }, { "epoch": 0.1645484949832776, "grad_norm": 0.3653654456138611, "learning_rate": 9.536244934714093e-05, "loss": 1.5542, "step": 123 }, { "epoch": 0.16588628762541807, "grad_norm": 0.3543652296066284, "learning_rate": 9.531742458352094e-05, "loss": 1.766, "step": 124 }, { "epoch": 0.16722408026755853, "grad_norm": 0.5176847577095032, "learning_rate": 9.527239981990095e-05, "loss": 1.8646, "step": 125 }, { "epoch": 0.16856187290969898, "grad_norm": 0.3467628061771393, "learning_rate": 9.522737505628096e-05, "loss": 2.217, "step": 126 }, { "epoch": 0.16989966555183947, "grad_norm": 0.3651058077812195, "learning_rate": 9.518235029266098e-05, "loss": 1.7298, "step": 127 }, { "epoch": 0.17123745819397992, "grad_norm": 0.3439328968524933, "learning_rate": 9.513732552904097e-05, "loss": 2.0967, "step": 128 }, { "epoch": 0.1725752508361204, "grad_norm": 0.367436945438385, "learning_rate": 9.509230076542098e-05, "loss": 2.0584, "step": 129 }, { "epoch": 0.17391304347826086, "grad_norm": 0.4682665467262268, "learning_rate": 9.5047276001801e-05, "loss": 1.2279, "step": 130 }, { "epoch": 0.17525083612040135, "grad_norm": 0.6343939900398254, "learning_rate": 9.500225123818101e-05, "loss": 1.8301, "step": 131 }, { "epoch": 0.1765886287625418, "grad_norm": 0.37340253591537476, "learning_rate": 9.495722647456101e-05, "loss": 1.6715, "step": 132 }, { "epoch": 0.17792642140468226, "grad_norm": 0.4212535619735718, "learning_rate": 9.491220171094102e-05, "loss": 1.3813, "step": 133 }, { "epoch": 0.17926421404682275, "grad_norm": 0.7796082496643066, "learning_rate": 9.486717694732104e-05, "loss": 2.0301, "step": 134 }, { "epoch": 0.1806020066889632, "grad_norm": 0.3705946207046509, "learning_rate": 9.482215218370105e-05, "loss": 1.9791, "step": 135 }, { "epoch": 0.18193979933110369, "grad_norm": 1.0462645292282104, "learning_rate": 9.477712742008104e-05, "loss": 1.8377, "step": 136 }, { "epoch": 0.18327759197324414, "grad_norm": 0.8452597856521606, "learning_rate": 9.473210265646106e-05, "loss": 1.587, "step": 137 }, { "epoch": 0.18461538461538463, "grad_norm": 0.34128281474113464, "learning_rate": 9.468707789284107e-05, "loss": 1.7248, "step": 138 }, { "epoch": 0.18595317725752508, "grad_norm": 0.3522973656654358, "learning_rate": 9.464205312922108e-05, "loss": 1.4179, "step": 139 }, { "epoch": 0.18729096989966554, "grad_norm": 0.3930247128009796, "learning_rate": 9.459702836560108e-05, "loss": 1.8452, "step": 140 }, { "epoch": 0.18862876254180602, "grad_norm": 0.4119817018508911, "learning_rate": 9.45520036019811e-05, "loss": 2.0599, "step": 141 }, { "epoch": 0.18996655518394648, "grad_norm": 0.31673747301101685, "learning_rate": 9.45069788383611e-05, "loss": 1.9548, "step": 142 }, { "epoch": 0.19130434782608696, "grad_norm": 0.6387179493904114, "learning_rate": 9.446195407474112e-05, "loss": 1.8265, "step": 143 }, { "epoch": 0.19264214046822742, "grad_norm": 0.36570000648498535, "learning_rate": 9.441692931112111e-05, "loss": 1.9025, "step": 144 }, { "epoch": 0.1939799331103679, "grad_norm": 0.8736979365348816, "learning_rate": 9.437190454750113e-05, "loss": 1.7701, "step": 145 }, { "epoch": 0.19531772575250836, "grad_norm": 0.35222160816192627, "learning_rate": 9.432687978388114e-05, "loss": 2.1553, "step": 146 }, { "epoch": 0.19665551839464884, "grad_norm": 0.33907803893089294, "learning_rate": 9.428185502026115e-05, "loss": 1.9887, "step": 147 }, { "epoch": 0.1979933110367893, "grad_norm": 0.5510755181312561, "learning_rate": 9.423683025664116e-05, "loss": 2.0307, "step": 148 }, { "epoch": 0.19933110367892976, "grad_norm": 0.3364030122756958, "learning_rate": 9.419180549302117e-05, "loss": 1.7475, "step": 149 }, { "epoch": 0.20066889632107024, "grad_norm": 0.3714922070503235, "learning_rate": 9.414678072940118e-05, "loss": 1.8964, "step": 150 }, { "epoch": 0.2020066889632107, "grad_norm": 0.3768271803855896, "learning_rate": 9.410175596578118e-05, "loss": 1.8011, "step": 151 }, { "epoch": 0.20334448160535118, "grad_norm": 0.3435843586921692, "learning_rate": 9.405673120216119e-05, "loss": 2.368, "step": 152 }, { "epoch": 0.20468227424749164, "grad_norm": 0.3646228313446045, "learning_rate": 9.40117064385412e-05, "loss": 1.6072, "step": 153 }, { "epoch": 0.20602006688963212, "grad_norm": 0.35950562357902527, "learning_rate": 9.396668167492121e-05, "loss": 1.9689, "step": 154 }, { "epoch": 0.20735785953177258, "grad_norm": 0.40717366337776184, "learning_rate": 9.392165691130123e-05, "loss": 1.6078, "step": 155 }, { "epoch": 0.20869565217391303, "grad_norm": 0.6353041529655457, "learning_rate": 9.387663214768123e-05, "loss": 2.2688, "step": 156 }, { "epoch": 0.21003344481605352, "grad_norm": 0.4215109050273895, "learning_rate": 9.383160738406124e-05, "loss": 1.9392, "step": 157 }, { "epoch": 0.21137123745819397, "grad_norm": 0.37301746010780334, "learning_rate": 9.378658262044124e-05, "loss": 1.9112, "step": 158 }, { "epoch": 0.21270903010033446, "grad_norm": 0.36573272943496704, "learning_rate": 9.374155785682127e-05, "loss": 1.6324, "step": 159 }, { "epoch": 0.2140468227424749, "grad_norm": 0.5235257148742676, "learning_rate": 9.369653309320126e-05, "loss": 1.9713, "step": 160 }, { "epoch": 0.2153846153846154, "grad_norm": 0.34718433022499084, "learning_rate": 9.365150832958127e-05, "loss": 2.0838, "step": 161 }, { "epoch": 0.21672240802675585, "grad_norm": 0.39332470297813416, "learning_rate": 9.360648356596128e-05, "loss": 2.0534, "step": 162 }, { "epoch": 0.2180602006688963, "grad_norm": 0.7983328104019165, "learning_rate": 9.35614588023413e-05, "loss": 1.7208, "step": 163 }, { "epoch": 0.2193979933110368, "grad_norm": 0.4300435185432434, "learning_rate": 9.35164340387213e-05, "loss": 1.39, "step": 164 }, { "epoch": 0.22073578595317725, "grad_norm": 0.377658873796463, "learning_rate": 9.34714092751013e-05, "loss": 1.5722, "step": 165 }, { "epoch": 0.22207357859531773, "grad_norm": 0.4393126964569092, "learning_rate": 9.342638451148133e-05, "loss": 1.5827, "step": 166 }, { "epoch": 0.2234113712374582, "grad_norm": 0.35238978266716003, "learning_rate": 9.338135974786133e-05, "loss": 1.6863, "step": 167 }, { "epoch": 0.22474916387959867, "grad_norm": 0.3524191379547119, "learning_rate": 9.333633498424133e-05, "loss": 1.8321, "step": 168 }, { "epoch": 0.22608695652173913, "grad_norm": 0.6319050788879395, "learning_rate": 9.329131022062134e-05, "loss": 1.7113, "step": 169 }, { "epoch": 0.22742474916387959, "grad_norm": 0.3477349281311035, "learning_rate": 9.324628545700136e-05, "loss": 1.8297, "step": 170 }, { "epoch": 0.22876254180602007, "grad_norm": 1.4175788164138794, "learning_rate": 9.320126069338137e-05, "loss": 1.7897, "step": 171 }, { "epoch": 0.23010033444816053, "grad_norm": 0.3294921815395355, "learning_rate": 9.315623592976136e-05, "loss": 1.9491, "step": 172 }, { "epoch": 0.231438127090301, "grad_norm": 0.35877764225006104, "learning_rate": 9.311121116614139e-05, "loss": 2.0432, "step": 173 }, { "epoch": 0.23277591973244147, "grad_norm": 0.3810911476612091, "learning_rate": 9.30661864025214e-05, "loss": 1.7374, "step": 174 }, { "epoch": 0.23411371237458195, "grad_norm": 0.39322909712791443, "learning_rate": 9.30211616389014e-05, "loss": 1.3096, "step": 175 }, { "epoch": 0.2354515050167224, "grad_norm": 0.3306376039981842, "learning_rate": 9.29761368752814e-05, "loss": 2.0244, "step": 176 }, { "epoch": 0.23678929765886286, "grad_norm": 0.35327744483947754, "learning_rate": 9.293111211166142e-05, "loss": 1.8494, "step": 177 }, { "epoch": 0.23812709030100335, "grad_norm": 0.3139340579509735, "learning_rate": 9.288608734804143e-05, "loss": 2.0111, "step": 178 }, { "epoch": 0.2394648829431438, "grad_norm": 0.29642781615257263, "learning_rate": 9.284106258442144e-05, "loss": 1.8517, "step": 179 }, { "epoch": 0.2408026755852843, "grad_norm": 0.6292489171028137, "learning_rate": 9.279603782080145e-05, "loss": 1.7926, "step": 180 }, { "epoch": 0.24214046822742474, "grad_norm": 0.5784863233566284, "learning_rate": 9.275101305718146e-05, "loss": 1.9574, "step": 181 }, { "epoch": 0.24347826086956523, "grad_norm": 0.5684370398521423, "learning_rate": 9.270598829356146e-05, "loss": 1.7677, "step": 182 }, { "epoch": 0.24481605351170568, "grad_norm": 0.435253769159317, "learning_rate": 9.266096352994147e-05, "loss": 2.0751, "step": 183 }, { "epoch": 0.24615384615384617, "grad_norm": 0.394359827041626, "learning_rate": 9.261593876632148e-05, "loss": 2.0506, "step": 184 }, { "epoch": 0.24749163879598662, "grad_norm": 0.5143195390701294, "learning_rate": 9.257091400270149e-05, "loss": 2.0168, "step": 185 }, { "epoch": 0.24882943143812708, "grad_norm": 0.368755966424942, "learning_rate": 9.25258892390815e-05, "loss": 1.609, "step": 186 }, { "epoch": 0.25016722408026754, "grad_norm": 3.3827755451202393, "learning_rate": 9.24808644754615e-05, "loss": 1.5342, "step": 187 }, { "epoch": 0.25150501672240805, "grad_norm": 0.3961702287197113, "learning_rate": 9.243583971184152e-05, "loss": 2.0325, "step": 188 }, { "epoch": 0.2528428093645485, "grad_norm": 0.4427662193775177, "learning_rate": 9.239081494822152e-05, "loss": 1.8261, "step": 189 }, { "epoch": 0.25418060200668896, "grad_norm": 0.7670131325721741, "learning_rate": 9.234579018460153e-05, "loss": 2.0239, "step": 190 }, { "epoch": 0.2555183946488294, "grad_norm": 0.33620205521583557, "learning_rate": 9.230076542098155e-05, "loss": 1.8774, "step": 191 }, { "epoch": 0.2568561872909699, "grad_norm": 0.3418707549571991, "learning_rate": 9.225574065736155e-05, "loss": 1.8545, "step": 192 }, { "epoch": 0.2581939799331104, "grad_norm": 0.32373929023742676, "learning_rate": 9.221071589374156e-05, "loss": 1.4021, "step": 193 }, { "epoch": 0.25953177257525084, "grad_norm": 0.32591140270233154, "learning_rate": 9.216569113012157e-05, "loss": 1.9349, "step": 194 }, { "epoch": 0.2608695652173913, "grad_norm": 0.8838204741477966, "learning_rate": 9.212066636650159e-05, "loss": 1.9257, "step": 195 }, { "epoch": 0.26220735785953175, "grad_norm": 0.2679453194141388, "learning_rate": 9.207564160288158e-05, "loss": 2.2633, "step": 196 }, { "epoch": 0.26354515050167227, "grad_norm": 0.36055904626846313, "learning_rate": 9.203061683926159e-05, "loss": 1.9155, "step": 197 }, { "epoch": 0.2648829431438127, "grad_norm": 0.28946951031684875, "learning_rate": 9.19855920756416e-05, "loss": 2.0331, "step": 198 }, { "epoch": 0.2662207357859532, "grad_norm": 0.38088685274124146, "learning_rate": 9.194056731202162e-05, "loss": 1.9669, "step": 199 }, { "epoch": 0.26755852842809363, "grad_norm": 0.3809536099433899, "learning_rate": 9.189554254840163e-05, "loss": 2.1654, "step": 200 }, { "epoch": 0.2688963210702341, "grad_norm": 0.3993472754955292, "learning_rate": 9.185051778478163e-05, "loss": 1.4857, "step": 201 }, { "epoch": 0.2702341137123746, "grad_norm": 0.3034169375896454, "learning_rate": 9.180549302116165e-05, "loss": 2.2763, "step": 202 }, { "epoch": 0.27157190635451506, "grad_norm": 0.3558633625507355, "learning_rate": 9.176046825754166e-05, "loss": 2.0766, "step": 203 }, { "epoch": 0.2729096989966555, "grad_norm": 0.5177504420280457, "learning_rate": 9.171544349392167e-05, "loss": 1.7944, "step": 204 }, { "epoch": 0.27424749163879597, "grad_norm": 0.3450024425983429, "learning_rate": 9.167041873030166e-05, "loss": 1.4671, "step": 205 }, { "epoch": 0.2755852842809365, "grad_norm": 0.44552841782569885, "learning_rate": 9.162539396668168e-05, "loss": 1.6058, "step": 206 }, { "epoch": 0.27692307692307694, "grad_norm": 0.41852515935897827, "learning_rate": 9.158036920306169e-05, "loss": 1.9554, "step": 207 }, { "epoch": 0.2782608695652174, "grad_norm": 0.3624606132507324, "learning_rate": 9.15353444394417e-05, "loss": 1.6291, "step": 208 }, { "epoch": 0.27959866220735785, "grad_norm": 0.4587150812149048, "learning_rate": 9.149031967582171e-05, "loss": 1.9435, "step": 209 }, { "epoch": 0.2809364548494983, "grad_norm": 0.3697861433029175, "learning_rate": 9.144529491220172e-05, "loss": 2.0404, "step": 210 }, { "epoch": 0.2822742474916388, "grad_norm": 0.37733131647109985, "learning_rate": 9.140027014858173e-05, "loss": 1.8168, "step": 211 }, { "epoch": 0.2836120401337793, "grad_norm": 0.32481226325035095, "learning_rate": 9.135524538496173e-05, "loss": 1.8865, "step": 212 }, { "epoch": 0.28494983277591973, "grad_norm": 0.3467252552509308, "learning_rate": 9.131022062134174e-05, "loss": 1.5196, "step": 213 }, { "epoch": 0.2862876254180602, "grad_norm": 0.27760034799575806, "learning_rate": 9.126519585772175e-05, "loss": 1.6441, "step": 214 }, { "epoch": 0.28762541806020064, "grad_norm": 0.41142746806144714, "learning_rate": 9.122017109410176e-05, "loss": 1.8563, "step": 215 }, { "epoch": 0.28896321070234116, "grad_norm": 0.504342257976532, "learning_rate": 9.117514633048177e-05, "loss": 1.639, "step": 216 }, { "epoch": 0.2903010033444816, "grad_norm": 0.3190425932407379, "learning_rate": 9.113012156686178e-05, "loss": 2.2112, "step": 217 }, { "epoch": 0.29163879598662207, "grad_norm": 0.2690954804420471, "learning_rate": 9.108509680324179e-05, "loss": 1.9693, "step": 218 }, { "epoch": 0.2929765886287625, "grad_norm": 0.9039385914802551, "learning_rate": 9.10400720396218e-05, "loss": 1.8742, "step": 219 }, { "epoch": 0.29431438127090304, "grad_norm": 0.4258744418621063, "learning_rate": 9.09950472760018e-05, "loss": 1.7538, "step": 220 }, { "epoch": 0.2956521739130435, "grad_norm": 0.5416736006736755, "learning_rate": 9.095002251238181e-05, "loss": 1.4503, "step": 221 }, { "epoch": 0.29698996655518395, "grad_norm": 0.43486952781677246, "learning_rate": 9.090499774876182e-05, "loss": 1.9158, "step": 222 }, { "epoch": 0.2983277591973244, "grad_norm": 0.23655688762664795, "learning_rate": 9.085997298514183e-05, "loss": 1.6036, "step": 223 }, { "epoch": 0.29966555183946486, "grad_norm": 0.3448864221572876, "learning_rate": 9.081494822152185e-05, "loss": 1.7351, "step": 224 }, { "epoch": 0.3010033444816054, "grad_norm": 0.5123642683029175, "learning_rate": 9.076992345790185e-05, "loss": 1.6658, "step": 225 }, { "epoch": 0.30234113712374583, "grad_norm": 0.5703243613243103, "learning_rate": 9.072489869428186e-05, "loss": 1.9601, "step": 226 }, { "epoch": 0.3036789297658863, "grad_norm": 0.37126022577285767, "learning_rate": 9.067987393066188e-05, "loss": 1.6886, "step": 227 }, { "epoch": 0.30501672240802674, "grad_norm": 0.3115082383155823, "learning_rate": 9.063484916704189e-05, "loss": 1.9697, "step": 228 }, { "epoch": 0.3063545150501672, "grad_norm": 0.2590930759906769, "learning_rate": 9.058982440342188e-05, "loss": 2.0054, "step": 229 }, { "epoch": 0.3076923076923077, "grad_norm": 0.28678327798843384, "learning_rate": 9.054479963980189e-05, "loss": 1.8215, "step": 230 }, { "epoch": 0.30903010033444817, "grad_norm": 0.3587772846221924, "learning_rate": 9.049977487618191e-05, "loss": 1.9469, "step": 231 }, { "epoch": 0.3103678929765886, "grad_norm": 0.40721800923347473, "learning_rate": 9.045475011256192e-05, "loss": 1.8971, "step": 232 }, { "epoch": 0.3117056856187291, "grad_norm": 0.395236998796463, "learning_rate": 9.040972534894192e-05, "loss": 1.7274, "step": 233 }, { "epoch": 0.3130434782608696, "grad_norm": 0.327338308095932, "learning_rate": 9.036470058532192e-05, "loss": 1.9656, "step": 234 }, { "epoch": 0.31438127090301005, "grad_norm": 0.35737645626068115, "learning_rate": 9.031967582170195e-05, "loss": 1.7269, "step": 235 }, { "epoch": 0.3157190635451505, "grad_norm": 0.376131147146225, "learning_rate": 9.027465105808195e-05, "loss": 1.6209, "step": 236 }, { "epoch": 0.31705685618729096, "grad_norm": 1.226188063621521, "learning_rate": 9.022962629446195e-05, "loss": 1.7785, "step": 237 }, { "epoch": 0.3183946488294314, "grad_norm": 0.3651161193847656, "learning_rate": 9.018460153084197e-05, "loss": 2.0646, "step": 238 }, { "epoch": 0.3197324414715719, "grad_norm": 0.41171953082084656, "learning_rate": 9.013957676722198e-05, "loss": 1.8584, "step": 239 }, { "epoch": 0.3210702341137124, "grad_norm": 0.29637786746025085, "learning_rate": 9.009455200360199e-05, "loss": 1.9911, "step": 240 }, { "epoch": 0.32240802675585284, "grad_norm": 0.6785081028938293, "learning_rate": 9.004952723998198e-05, "loss": 1.9317, "step": 241 }, { "epoch": 0.3237458193979933, "grad_norm": 0.3170675039291382, "learning_rate": 9.0004502476362e-05, "loss": 1.6286, "step": 242 }, { "epoch": 0.3250836120401338, "grad_norm": 0.39666345715522766, "learning_rate": 8.995947771274201e-05, "loss": 2.0269, "step": 243 }, { "epoch": 0.32642140468227426, "grad_norm": 0.42573514580726624, "learning_rate": 8.991445294912202e-05, "loss": 1.8167, "step": 244 }, { "epoch": 0.3277591973244147, "grad_norm": 0.287176251411438, "learning_rate": 8.986942818550203e-05, "loss": 1.9762, "step": 245 }, { "epoch": 0.3290969899665552, "grad_norm": 0.328698992729187, "learning_rate": 8.982440342188204e-05, "loss": 1.4258, "step": 246 }, { "epoch": 0.33043478260869563, "grad_norm": 0.35098153352737427, "learning_rate": 8.977937865826205e-05, "loss": 1.7426, "step": 247 }, { "epoch": 0.33177257525083614, "grad_norm": 0.3326055705547333, "learning_rate": 8.973435389464206e-05, "loss": 1.8434, "step": 248 }, { "epoch": 0.3331103678929766, "grad_norm": 0.5356029272079468, "learning_rate": 8.968932913102207e-05, "loss": 1.6118, "step": 249 }, { "epoch": 0.33444816053511706, "grad_norm": 0.3759405016899109, "learning_rate": 8.964430436740207e-05, "loss": 1.8523, "step": 250 }, { "epoch": 0.3357859531772575, "grad_norm": 0.327251672744751, "learning_rate": 8.959927960378208e-05, "loss": 1.9094, "step": 251 }, { "epoch": 0.33712374581939797, "grad_norm": 0.34880220890045166, "learning_rate": 8.955425484016209e-05, "loss": 1.9313, "step": 252 }, { "epoch": 0.3384615384615385, "grad_norm": 0.34632259607315063, "learning_rate": 8.95092300765421e-05, "loss": 2.0977, "step": 253 }, { "epoch": 0.33979933110367894, "grad_norm": 0.34165191650390625, "learning_rate": 8.946420531292211e-05, "loss": 1.7803, "step": 254 }, { "epoch": 0.3411371237458194, "grad_norm": 0.4165712296962738, "learning_rate": 8.941918054930212e-05, "loss": 1.9873, "step": 255 }, { "epoch": 0.34247491638795985, "grad_norm": 0.6011919379234314, "learning_rate": 8.937415578568214e-05, "loss": 1.9191, "step": 256 }, { "epoch": 0.34381270903010036, "grad_norm": 1.136637568473816, "learning_rate": 8.932913102206213e-05, "loss": 1.6417, "step": 257 }, { "epoch": 0.3451505016722408, "grad_norm": 0.3873753845691681, "learning_rate": 8.928410625844214e-05, "loss": 1.9526, "step": 258 }, { "epoch": 0.3464882943143813, "grad_norm": 0.38796165585517883, "learning_rate": 8.923908149482215e-05, "loss": 1.9183, "step": 259 }, { "epoch": 0.34782608695652173, "grad_norm": 0.37068337202072144, "learning_rate": 8.919405673120217e-05, "loss": 1.9615, "step": 260 }, { "epoch": 0.3491638795986622, "grad_norm": 0.41028955578804016, "learning_rate": 8.914903196758217e-05, "loss": 1.5558, "step": 261 }, { "epoch": 0.3505016722408027, "grad_norm": 0.6443496942520142, "learning_rate": 8.910400720396218e-05, "loss": 1.8737, "step": 262 }, { "epoch": 0.35183946488294315, "grad_norm": 0.3643587827682495, "learning_rate": 8.90589824403422e-05, "loss": 1.8952, "step": 263 }, { "epoch": 0.3531772575250836, "grad_norm": 0.3416200578212738, "learning_rate": 8.901395767672221e-05, "loss": 2.0677, "step": 264 }, { "epoch": 0.35451505016722407, "grad_norm": 1.0528233051300049, "learning_rate": 8.89689329131022e-05, "loss": 2.3153, "step": 265 }, { "epoch": 0.3558528428093645, "grad_norm": 0.49315449595451355, "learning_rate": 8.892390814948221e-05, "loss": 1.9844, "step": 266 }, { "epoch": 0.35719063545150503, "grad_norm": 0.3391367793083191, "learning_rate": 8.887888338586223e-05, "loss": 1.5993, "step": 267 }, { "epoch": 0.3585284280936455, "grad_norm": 0.3510306477546692, "learning_rate": 8.883385862224224e-05, "loss": 1.8076, "step": 268 }, { "epoch": 0.35986622073578595, "grad_norm": 0.36950451135635376, "learning_rate": 8.878883385862224e-05, "loss": 1.701, "step": 269 }, { "epoch": 0.3612040133779264, "grad_norm": 0.4125894606113434, "learning_rate": 8.874380909500225e-05, "loss": 1.9721, "step": 270 }, { "epoch": 0.3625418060200669, "grad_norm": 0.35017120838165283, "learning_rate": 8.869878433138227e-05, "loss": 1.8645, "step": 271 }, { "epoch": 0.36387959866220737, "grad_norm": 0.30292269587516785, "learning_rate": 8.865375956776228e-05, "loss": 1.758, "step": 272 }, { "epoch": 0.3652173913043478, "grad_norm": 0.8412917256355286, "learning_rate": 8.860873480414229e-05, "loss": 2.1873, "step": 273 }, { "epoch": 0.3665551839464883, "grad_norm": 0.2973708212375641, "learning_rate": 8.85637100405223e-05, "loss": 1.9731, "step": 274 }, { "epoch": 0.36789297658862874, "grad_norm": 1.3152915239334106, "learning_rate": 8.85186852769023e-05, "loss": 2.067, "step": 275 }, { "epoch": 0.36923076923076925, "grad_norm": 0.30133023858070374, "learning_rate": 8.847366051328231e-05, "loss": 1.9556, "step": 276 }, { "epoch": 0.3705685618729097, "grad_norm": 0.3463495671749115, "learning_rate": 8.842863574966232e-05, "loss": 1.9353, "step": 277 }, { "epoch": 0.37190635451505016, "grad_norm": 1.1229948997497559, "learning_rate": 8.838361098604233e-05, "loss": 1.2522, "step": 278 }, { "epoch": 0.3732441471571906, "grad_norm": 0.38608673214912415, "learning_rate": 8.833858622242234e-05, "loss": 1.8587, "step": 279 }, { "epoch": 0.3745819397993311, "grad_norm": 0.31167173385620117, "learning_rate": 8.829356145880235e-05, "loss": 1.9185, "step": 280 }, { "epoch": 0.3759197324414716, "grad_norm": 0.5619071125984192, "learning_rate": 8.824853669518235e-05, "loss": 1.814, "step": 281 }, { "epoch": 0.37725752508361204, "grad_norm": 0.5451438426971436, "learning_rate": 8.820351193156236e-05, "loss": 1.5962, "step": 282 }, { "epoch": 0.3785953177257525, "grad_norm": 0.41030535101890564, "learning_rate": 8.815848716794237e-05, "loss": 1.7109, "step": 283 }, { "epoch": 0.37993311036789296, "grad_norm": 0.8977394104003906, "learning_rate": 8.811346240432238e-05, "loss": 2.0786, "step": 284 }, { "epoch": 0.38127090301003347, "grad_norm": 0.35019567608833313, "learning_rate": 8.806843764070239e-05, "loss": 1.8875, "step": 285 }, { "epoch": 0.3826086956521739, "grad_norm": 0.4626282751560211, "learning_rate": 8.80234128770824e-05, "loss": 2.0935, "step": 286 }, { "epoch": 0.3839464882943144, "grad_norm": 0.3663441240787506, "learning_rate": 8.79783881134624e-05, "loss": 1.9519, "step": 287 }, { "epoch": 0.38528428093645484, "grad_norm": 0.3869033753871918, "learning_rate": 8.793336334984241e-05, "loss": 1.876, "step": 288 }, { "epoch": 0.3866220735785953, "grad_norm": 0.43982234597206116, "learning_rate": 8.788833858622242e-05, "loss": 1.9525, "step": 289 }, { "epoch": 0.3879598662207358, "grad_norm": 0.2982971966266632, "learning_rate": 8.784331382260243e-05, "loss": 2.0443, "step": 290 }, { "epoch": 0.38929765886287626, "grad_norm": 0.3693241477012634, "learning_rate": 8.779828905898244e-05, "loss": 1.9276, "step": 291 }, { "epoch": 0.3906354515050167, "grad_norm": 0.39458420872688293, "learning_rate": 8.775326429536246e-05, "loss": 1.6411, "step": 292 }, { "epoch": 0.3919732441471572, "grad_norm": 1.7858718633651733, "learning_rate": 8.770823953174246e-05, "loss": 1.8085, "step": 293 }, { "epoch": 0.3933110367892977, "grad_norm": 0.4990810751914978, "learning_rate": 8.766321476812247e-05, "loss": 1.9774, "step": 294 }, { "epoch": 0.39464882943143814, "grad_norm": 0.5808725357055664, "learning_rate": 8.761819000450247e-05, "loss": 2.1432, "step": 295 }, { "epoch": 0.3959866220735786, "grad_norm": 1.3691102266311646, "learning_rate": 8.75731652408825e-05, "loss": 1.869, "step": 296 }, { "epoch": 0.39732441471571905, "grad_norm": 0.4151483476161957, "learning_rate": 8.75281404772625e-05, "loss": 2.0903, "step": 297 }, { "epoch": 0.3986622073578595, "grad_norm": 0.34775832295417786, "learning_rate": 8.74831157136425e-05, "loss": 2.2351, "step": 298 }, { "epoch": 0.4, "grad_norm": 0.35934117436408997, "learning_rate": 8.743809095002252e-05, "loss": 2.1257, "step": 299 }, { "epoch": 0.4013377926421405, "grad_norm": 0.5283246636390686, "learning_rate": 8.739306618640253e-05, "loss": 2.0353, "step": 300 }, { "epoch": 0.40267558528428093, "grad_norm": 0.44409871101379395, "learning_rate": 8.734804142278254e-05, "loss": 1.7333, "step": 301 }, { "epoch": 0.4040133779264214, "grad_norm": 0.41795071959495544, "learning_rate": 8.730301665916253e-05, "loss": 1.9624, "step": 302 }, { "epoch": 0.40535117056856185, "grad_norm": 0.338204950094223, "learning_rate": 8.725799189554256e-05, "loss": 1.9867, "step": 303 }, { "epoch": 0.40668896321070236, "grad_norm": 0.3992144763469696, "learning_rate": 8.721296713192257e-05, "loss": 1.7873, "step": 304 }, { "epoch": 0.4080267558528428, "grad_norm": 0.41751959919929504, "learning_rate": 8.716794236830257e-05, "loss": 2.1863, "step": 305 }, { "epoch": 0.40936454849498327, "grad_norm": 1.1421892642974854, "learning_rate": 8.712291760468257e-05, "loss": 2.1859, "step": 306 }, { "epoch": 0.4107023411371237, "grad_norm": 0.35610154271125793, "learning_rate": 8.707789284106259e-05, "loss": 1.8319, "step": 307 }, { "epoch": 0.41204013377926424, "grad_norm": 0.5834404826164246, "learning_rate": 8.70328680774426e-05, "loss": 1.819, "step": 308 }, { "epoch": 0.4133779264214047, "grad_norm": 0.4020130932331085, "learning_rate": 8.698784331382261e-05, "loss": 1.6767, "step": 309 }, { "epoch": 0.41471571906354515, "grad_norm": 0.7889217138290405, "learning_rate": 8.694281855020262e-05, "loss": 1.9374, "step": 310 }, { "epoch": 0.4160535117056856, "grad_norm": 0.37698855996131897, "learning_rate": 8.689779378658263e-05, "loss": 2.0154, "step": 311 }, { "epoch": 0.41739130434782606, "grad_norm": 0.43415555357933044, "learning_rate": 8.685276902296263e-05, "loss": 1.9254, "step": 312 }, { "epoch": 0.4187290969899666, "grad_norm": 0.3221052587032318, "learning_rate": 8.680774425934264e-05, "loss": 2.0957, "step": 313 }, { "epoch": 0.42006688963210703, "grad_norm": 0.3111625909805298, "learning_rate": 8.676271949572265e-05, "loss": 1.7638, "step": 314 }, { "epoch": 0.4214046822742475, "grad_norm": 0.356253445148468, "learning_rate": 8.671769473210266e-05, "loss": 1.9813, "step": 315 }, { "epoch": 0.42274247491638794, "grad_norm": 0.7372254729270935, "learning_rate": 8.667266996848267e-05, "loss": 1.8251, "step": 316 }, { "epoch": 0.4240802675585284, "grad_norm": 1.448208212852478, "learning_rate": 8.662764520486268e-05, "loss": 1.6077, "step": 317 }, { "epoch": 0.4254180602006689, "grad_norm": 0.653560221195221, "learning_rate": 8.658262044124269e-05, "loss": 1.9399, "step": 318 }, { "epoch": 0.42675585284280937, "grad_norm": 0.4218508005142212, "learning_rate": 8.65375956776227e-05, "loss": 2.01, "step": 319 }, { "epoch": 0.4280936454849498, "grad_norm": 0.3056946396827698, "learning_rate": 8.64925709140027e-05, "loss": 1.8975, "step": 320 }, { "epoch": 0.4294314381270903, "grad_norm": 0.5265414118766785, "learning_rate": 8.644754615038273e-05, "loss": 1.406, "step": 321 }, { "epoch": 0.4307692307692308, "grad_norm": 0.34402531385421753, "learning_rate": 8.640252138676272e-05, "loss": 1.8997, "step": 322 }, { "epoch": 0.43210702341137125, "grad_norm": 0.46225470304489136, "learning_rate": 8.635749662314273e-05, "loss": 2.0228, "step": 323 }, { "epoch": 0.4334448160535117, "grad_norm": 0.4547097682952881, "learning_rate": 8.631247185952274e-05, "loss": 2.0001, "step": 324 }, { "epoch": 0.43478260869565216, "grad_norm": 2.0929479598999023, "learning_rate": 8.626744709590276e-05, "loss": 1.9063, "step": 325 }, { "epoch": 0.4361204013377926, "grad_norm": 0.7159757614135742, "learning_rate": 8.622242233228275e-05, "loss": 1.9047, "step": 326 }, { "epoch": 0.43745819397993313, "grad_norm": 3.7366178035736084, "learning_rate": 8.617739756866276e-05, "loss": 2.4446, "step": 327 }, { "epoch": 0.4387959866220736, "grad_norm": 0.5191693902015686, "learning_rate": 8.613237280504279e-05, "loss": 2.0927, "step": 328 }, { "epoch": 0.44013377926421404, "grad_norm": 0.49155277013778687, "learning_rate": 8.60873480414228e-05, "loss": 1.7654, "step": 329 }, { "epoch": 0.4414715719063545, "grad_norm": 0.3525542616844177, "learning_rate": 8.604232327780279e-05, "loss": 1.791, "step": 330 }, { "epoch": 0.442809364548495, "grad_norm": 0.5055109262466431, "learning_rate": 8.59972985141828e-05, "loss": 1.7466, "step": 331 }, { "epoch": 0.44414715719063547, "grad_norm": 0.3834606409072876, "learning_rate": 8.595227375056282e-05, "loss": 1.9207, "step": 332 }, { "epoch": 0.4454849498327759, "grad_norm": 0.48070740699768066, "learning_rate": 8.590724898694283e-05, "loss": 1.899, "step": 333 }, { "epoch": 0.4468227424749164, "grad_norm": 0.30500054359436035, "learning_rate": 8.586222422332282e-05, "loss": 2.1356, "step": 334 }, { "epoch": 0.44816053511705684, "grad_norm": 0.402435839176178, "learning_rate": 8.581719945970285e-05, "loss": 1.8639, "step": 335 }, { "epoch": 0.44949832775919735, "grad_norm": 0.4309161305427551, "learning_rate": 8.577217469608285e-05, "loss": 2.1182, "step": 336 }, { "epoch": 0.4508361204013378, "grad_norm": 0.6321202516555786, "learning_rate": 8.572714993246286e-05, "loss": 1.6111, "step": 337 }, { "epoch": 0.45217391304347826, "grad_norm": 0.37674644589424133, "learning_rate": 8.568212516884286e-05, "loss": 1.4399, "step": 338 }, { "epoch": 0.4535117056856187, "grad_norm": 0.38387489318847656, "learning_rate": 8.563710040522288e-05, "loss": 1.9581, "step": 339 }, { "epoch": 0.45484949832775917, "grad_norm": 0.37002870440483093, "learning_rate": 8.559207564160289e-05, "loss": 1.7161, "step": 340 }, { "epoch": 0.4561872909698997, "grad_norm": 0.3412352502346039, "learning_rate": 8.55470508779829e-05, "loss": 1.8263, "step": 341 }, { "epoch": 0.45752508361204014, "grad_norm": 0.47680556774139404, "learning_rate": 8.550202611436289e-05, "loss": 2.0372, "step": 342 }, { "epoch": 0.4588628762541806, "grad_norm": 0.5688019394874573, "learning_rate": 8.545700135074291e-05, "loss": 2.0338, "step": 343 }, { "epoch": 0.46020066889632105, "grad_norm": 0.4578900635242462, "learning_rate": 8.541197658712292e-05, "loss": 2.1086, "step": 344 }, { "epoch": 0.46153846153846156, "grad_norm": 0.3213759958744049, "learning_rate": 8.536695182350293e-05, "loss": 1.9701, "step": 345 }, { "epoch": 0.462876254180602, "grad_norm": 0.3761240541934967, "learning_rate": 8.532192705988294e-05, "loss": 2.042, "step": 346 }, { "epoch": 0.4642140468227425, "grad_norm": 0.6948366761207581, "learning_rate": 8.527690229626295e-05, "loss": 1.7776, "step": 347 }, { "epoch": 0.46555183946488293, "grad_norm": 0.38398241996765137, "learning_rate": 8.523187753264296e-05, "loss": 1.6168, "step": 348 }, { "epoch": 0.4668896321070234, "grad_norm": 0.4216487407684326, "learning_rate": 8.518685276902297e-05, "loss": 1.5656, "step": 349 }, { "epoch": 0.4682274247491639, "grad_norm": 0.3816230297088623, "learning_rate": 8.514182800540297e-05, "loss": 1.9222, "step": 350 }, { "epoch": 0.46956521739130436, "grad_norm": 0.5072731971740723, "learning_rate": 8.509680324178298e-05, "loss": 2.257, "step": 351 }, { "epoch": 0.4709030100334448, "grad_norm": 0.38990819454193115, "learning_rate": 8.505177847816299e-05, "loss": 1.6184, "step": 352 }, { "epoch": 0.47224080267558527, "grad_norm": 0.3647673726081848, "learning_rate": 8.500675371454301e-05, "loss": 1.9753, "step": 353 }, { "epoch": 0.4735785953177257, "grad_norm": 0.503564715385437, "learning_rate": 8.496172895092301e-05, "loss": 1.88, "step": 354 }, { "epoch": 0.47491638795986624, "grad_norm": 0.4017346501350403, "learning_rate": 8.491670418730302e-05, "loss": 1.8751, "step": 355 }, { "epoch": 0.4762541806020067, "grad_norm": 0.3733857274055481, "learning_rate": 8.487167942368303e-05, "loss": 1.8511, "step": 356 }, { "epoch": 0.47759197324414715, "grad_norm": 0.35360386967658997, "learning_rate": 8.482665466006305e-05, "loss": 1.8221, "step": 357 }, { "epoch": 0.4789297658862876, "grad_norm": 0.38599494099617004, "learning_rate": 8.478162989644304e-05, "loss": 1.8883, "step": 358 }, { "epoch": 0.4802675585284281, "grad_norm": 0.3708478808403015, "learning_rate": 8.473660513282305e-05, "loss": 1.3802, "step": 359 }, { "epoch": 0.4816053511705686, "grad_norm": 0.903266191482544, "learning_rate": 8.469158036920306e-05, "loss": 1.6616, "step": 360 }, { "epoch": 0.48294314381270903, "grad_norm": 0.3742479681968689, "learning_rate": 8.464655560558308e-05, "loss": 1.6105, "step": 361 }, { "epoch": 0.4842809364548495, "grad_norm": 0.34880515933036804, "learning_rate": 8.460153084196308e-05, "loss": 1.8501, "step": 362 }, { "epoch": 0.48561872909698994, "grad_norm": 0.3260308802127838, "learning_rate": 8.455650607834309e-05, "loss": 1.9235, "step": 363 }, { "epoch": 0.48695652173913045, "grad_norm": 0.4487158954143524, "learning_rate": 8.451148131472311e-05, "loss": 1.3566, "step": 364 }, { "epoch": 0.4882943143812709, "grad_norm": 0.6587825417518616, "learning_rate": 8.446645655110312e-05, "loss": 1.9224, "step": 365 }, { "epoch": 0.48963210702341137, "grad_norm": 0.3895570635795593, "learning_rate": 8.442143178748311e-05, "loss": 2.0373, "step": 366 }, { "epoch": 0.4909698996655518, "grad_norm": 0.4009748101234436, "learning_rate": 8.437640702386312e-05, "loss": 1.6818, "step": 367 }, { "epoch": 0.49230769230769234, "grad_norm": 0.36814218759536743, "learning_rate": 8.433138226024314e-05, "loss": 1.9883, "step": 368 }, { "epoch": 0.4936454849498328, "grad_norm": 0.3416324257850647, "learning_rate": 8.428635749662315e-05, "loss": 1.7745, "step": 369 }, { "epoch": 0.49498327759197325, "grad_norm": 0.3150230646133423, "learning_rate": 8.424133273300316e-05, "loss": 1.8217, "step": 370 }, { "epoch": 0.4963210702341137, "grad_norm": 0.6296526193618774, "learning_rate": 8.419630796938317e-05, "loss": 2.105, "step": 371 }, { "epoch": 0.49765886287625416, "grad_norm": 0.37235918641090393, "learning_rate": 8.415128320576318e-05, "loss": 1.5564, "step": 372 }, { "epoch": 0.49899665551839467, "grad_norm": 0.373354434967041, "learning_rate": 8.410625844214319e-05, "loss": 1.7034, "step": 373 }, { "epoch": 0.5003344481605351, "grad_norm": 0.3129545748233795, "learning_rate": 8.40612336785232e-05, "loss": 1.9642, "step": 374 }, { "epoch": 0.5016722408026756, "grad_norm": 0.30649834871292114, "learning_rate": 8.40162089149032e-05, "loss": 1.449, "step": 375 }, { "epoch": 0.5030100334448161, "grad_norm": 0.2958093285560608, "learning_rate": 8.397118415128321e-05, "loss": 1.9685, "step": 376 }, { "epoch": 0.5043478260869565, "grad_norm": 0.336683064699173, "learning_rate": 8.392615938766322e-05, "loss": 1.9817, "step": 377 }, { "epoch": 0.505685618729097, "grad_norm": 0.9705008268356323, "learning_rate": 8.388113462404323e-05, "loss": 1.5965, "step": 378 }, { "epoch": 0.5070234113712374, "grad_norm": 0.4110693037509918, "learning_rate": 8.383610986042324e-05, "loss": 1.7417, "step": 379 }, { "epoch": 0.5083612040133779, "grad_norm": 0.8143429160118103, "learning_rate": 8.379108509680325e-05, "loss": 2.2159, "step": 380 }, { "epoch": 0.5096989966555184, "grad_norm": 0.3616657555103302, "learning_rate": 8.374606033318325e-05, "loss": 2.0832, "step": 381 }, { "epoch": 0.5110367892976588, "grad_norm": 0.500678300857544, "learning_rate": 8.370103556956326e-05, "loss": 1.5084, "step": 382 }, { "epoch": 0.5123745819397993, "grad_norm": 0.4309229254722595, "learning_rate": 8.365601080594327e-05, "loss": 1.1731, "step": 383 }, { "epoch": 0.5137123745819397, "grad_norm": 0.34316661953926086, "learning_rate": 8.361098604232328e-05, "loss": 1.7091, "step": 384 }, { "epoch": 0.5150501672240803, "grad_norm": 0.38358014822006226, "learning_rate": 8.356596127870329e-05, "loss": 2.0595, "step": 385 }, { "epoch": 0.5163879598662208, "grad_norm": 0.3089703321456909, "learning_rate": 8.35209365150833e-05, "loss": 1.6416, "step": 386 }, { "epoch": 0.5177257525083612, "grad_norm": 0.3062998056411743, "learning_rate": 8.34759117514633e-05, "loss": 1.4986, "step": 387 }, { "epoch": 0.5190635451505017, "grad_norm": 0.5895267724990845, "learning_rate": 8.343088698784331e-05, "loss": 1.8914, "step": 388 }, { "epoch": 0.5204013377926422, "grad_norm": 0.46645572781562805, "learning_rate": 8.338586222422334e-05, "loss": 1.6489, "step": 389 }, { "epoch": 0.5217391304347826, "grad_norm": 0.7197579741477966, "learning_rate": 8.334083746060334e-05, "loss": 2.0879, "step": 390 }, { "epoch": 0.5230769230769231, "grad_norm": 0.38837894797325134, "learning_rate": 8.329581269698334e-05, "loss": 1.4096, "step": 391 }, { "epoch": 0.5244147157190635, "grad_norm": 0.3910644054412842, "learning_rate": 8.325078793336335e-05, "loss": 1.8055, "step": 392 }, { "epoch": 0.525752508361204, "grad_norm": 0.5699204206466675, "learning_rate": 8.320576316974337e-05, "loss": 1.9215, "step": 393 }, { "epoch": 0.5270903010033445, "grad_norm": 0.4684099853038788, "learning_rate": 8.316073840612338e-05, "loss": 1.6515, "step": 394 }, { "epoch": 0.5284280936454849, "grad_norm": 0.3977600634098053, "learning_rate": 8.311571364250337e-05, "loss": 1.8285, "step": 395 }, { "epoch": 0.5297658862876254, "grad_norm": 0.3171498775482178, "learning_rate": 8.307068887888338e-05, "loss": 1.8305, "step": 396 }, { "epoch": 0.5311036789297658, "grad_norm": 0.321153849363327, "learning_rate": 8.30256641152634e-05, "loss": 1.582, "step": 397 }, { "epoch": 0.5324414715719064, "grad_norm": 0.8034740090370178, "learning_rate": 8.298063935164341e-05, "loss": 1.816, "step": 398 }, { "epoch": 0.5337792642140469, "grad_norm": 0.3740721642971039, "learning_rate": 8.293561458802341e-05, "loss": 1.9171, "step": 399 }, { "epoch": 0.5351170568561873, "grad_norm": 0.35908418893814087, "learning_rate": 8.289058982440343e-05, "loss": 1.9799, "step": 400 }, { "epoch": 0.5364548494983278, "grad_norm": 0.3337550163269043, "learning_rate": 8.284556506078344e-05, "loss": 1.9837, "step": 401 }, { "epoch": 0.5377926421404682, "grad_norm": 0.47816741466522217, "learning_rate": 8.280054029716345e-05, "loss": 1.4683, "step": 402 }, { "epoch": 0.5391304347826087, "grad_norm": 0.3333340585231781, "learning_rate": 8.275551553354344e-05, "loss": 2.0083, "step": 403 }, { "epoch": 0.5404682274247492, "grad_norm": 0.5132210850715637, "learning_rate": 8.271049076992347e-05, "loss": 1.9655, "step": 404 }, { "epoch": 0.5418060200668896, "grad_norm": 0.49101531505584717, "learning_rate": 8.266546600630347e-05, "loss": 1.9655, "step": 405 }, { "epoch": 0.5431438127090301, "grad_norm": 0.4403785765171051, "learning_rate": 8.262044124268348e-05, "loss": 2.1054, "step": 406 }, { "epoch": 0.5444816053511705, "grad_norm": 0.4216044843196869, "learning_rate": 8.257541647906349e-05, "loss": 1.763, "step": 407 }, { "epoch": 0.545819397993311, "grad_norm": 0.36043408513069153, "learning_rate": 8.25303917154435e-05, "loss": 1.7495, "step": 408 }, { "epoch": 0.5471571906354515, "grad_norm": 0.5740343332290649, "learning_rate": 8.248536695182351e-05, "loss": 2.0926, "step": 409 }, { "epoch": 0.5484949832775919, "grad_norm": 0.3019751012325287, "learning_rate": 8.244034218820352e-05, "loss": 2.029, "step": 410 }, { "epoch": 0.5498327759197325, "grad_norm": 0.3443738520145416, "learning_rate": 8.239531742458353e-05, "loss": 1.9477, "step": 411 }, { "epoch": 0.551170568561873, "grad_norm": 0.6188628673553467, "learning_rate": 8.235029266096353e-05, "loss": 1.6712, "step": 412 }, { "epoch": 0.5525083612040134, "grad_norm": 0.39438313245773315, "learning_rate": 8.230526789734354e-05, "loss": 1.8374, "step": 413 }, { "epoch": 0.5538461538461539, "grad_norm": 0.2995496988296509, "learning_rate": 8.226024313372355e-05, "loss": 1.8306, "step": 414 }, { "epoch": 0.5551839464882943, "grad_norm": 0.3863040804862976, "learning_rate": 8.221521837010356e-05, "loss": 1.175, "step": 415 }, { "epoch": 0.5565217391304348, "grad_norm": 0.41024214029312134, "learning_rate": 8.217019360648357e-05, "loss": 1.6772, "step": 416 }, { "epoch": 0.5578595317725753, "grad_norm": 1.2178558111190796, "learning_rate": 8.212516884286358e-05, "loss": 1.5956, "step": 417 }, { "epoch": 0.5591973244147157, "grad_norm": 0.656810998916626, "learning_rate": 8.20801440792436e-05, "loss": 1.6988, "step": 418 }, { "epoch": 0.5605351170568562, "grad_norm": 0.36660322546958923, "learning_rate": 8.20351193156236e-05, "loss": 2.0613, "step": 419 }, { "epoch": 0.5618729096989966, "grad_norm": 0.4018813967704773, "learning_rate": 8.19900945520036e-05, "loss": 1.7731, "step": 420 }, { "epoch": 0.5632107023411371, "grad_norm": 0.3366681635379791, "learning_rate": 8.194506978838361e-05, "loss": 1.8276, "step": 421 }, { "epoch": 0.5645484949832776, "grad_norm": 0.615424394607544, "learning_rate": 8.190004502476363e-05, "loss": 1.4514, "step": 422 }, { "epoch": 0.565886287625418, "grad_norm": 0.6425555944442749, "learning_rate": 8.185502026114363e-05, "loss": 1.6658, "step": 423 }, { "epoch": 0.5672240802675586, "grad_norm": 0.3678185045719147, "learning_rate": 8.180999549752364e-05, "loss": 1.9313, "step": 424 }, { "epoch": 0.568561872909699, "grad_norm": 0.43040040135383606, "learning_rate": 8.176497073390366e-05, "loss": 1.8427, "step": 425 }, { "epoch": 0.5698996655518395, "grad_norm": 0.3130514323711395, "learning_rate": 8.171994597028367e-05, "loss": 1.8946, "step": 426 }, { "epoch": 0.57123745819398, "grad_norm": 0.8794007301330566, "learning_rate": 8.167492120666366e-05, "loss": 1.8651, "step": 427 }, { "epoch": 0.5725752508361204, "grad_norm": 0.3628743588924408, "learning_rate": 8.162989644304367e-05, "loss": 2.0068, "step": 428 }, { "epoch": 0.5739130434782609, "grad_norm": 0.6228901743888855, "learning_rate": 8.15848716794237e-05, "loss": 1.7479, "step": 429 }, { "epoch": 0.5752508361204013, "grad_norm": 0.3421112298965454, "learning_rate": 8.15398469158037e-05, "loss": 1.92, "step": 430 }, { "epoch": 0.5765886287625418, "grad_norm": 0.40324121713638306, "learning_rate": 8.14948221521837e-05, "loss": 1.8626, "step": 431 }, { "epoch": 0.5779264214046823, "grad_norm": 0.3942056894302368, "learning_rate": 8.14497973885637e-05, "loss": 1.8359, "step": 432 }, { "epoch": 0.5792642140468227, "grad_norm": 0.3615321218967438, "learning_rate": 8.140477262494373e-05, "loss": 1.8924, "step": 433 }, { "epoch": 0.5806020066889632, "grad_norm": 0.41227877140045166, "learning_rate": 8.135974786132374e-05, "loss": 1.6984, "step": 434 }, { "epoch": 0.5819397993311036, "grad_norm": 1.1059563159942627, "learning_rate": 8.131472309770373e-05, "loss": 1.4266, "step": 435 }, { "epoch": 0.5832775919732441, "grad_norm": 0.3503919541835785, "learning_rate": 8.126969833408375e-05, "loss": 2.1598, "step": 436 }, { "epoch": 0.5846153846153846, "grad_norm": 0.5328121781349182, "learning_rate": 8.122467357046376e-05, "loss": 1.9038, "step": 437 }, { "epoch": 0.585953177257525, "grad_norm": 0.3714490234851837, "learning_rate": 8.117964880684377e-05, "loss": 1.7892, "step": 438 }, { "epoch": 0.5872909698996656, "grad_norm": 0.3813803493976593, "learning_rate": 8.113462404322378e-05, "loss": 1.7479, "step": 439 }, { "epoch": 0.5886287625418061, "grad_norm": 0.3410722017288208, "learning_rate": 8.108959927960379e-05, "loss": 1.94, "step": 440 }, { "epoch": 0.5899665551839465, "grad_norm": 0.3214760422706604, "learning_rate": 8.10445745159838e-05, "loss": 1.8526, "step": 441 }, { "epoch": 0.591304347826087, "grad_norm": 0.6665300726890564, "learning_rate": 8.09995497523638e-05, "loss": 1.951, "step": 442 }, { "epoch": 0.5926421404682274, "grad_norm": 0.33864709734916687, "learning_rate": 8.095452498874381e-05, "loss": 1.9193, "step": 443 }, { "epoch": 0.5939799331103679, "grad_norm": 0.3851359188556671, "learning_rate": 8.090950022512382e-05, "loss": 1.8382, "step": 444 }, { "epoch": 0.5953177257525084, "grad_norm": 0.3829728960990906, "learning_rate": 8.086447546150383e-05, "loss": 1.7381, "step": 445 }, { "epoch": 0.5966555183946488, "grad_norm": 0.41027361154556274, "learning_rate": 8.081945069788384e-05, "loss": 1.9236, "step": 446 }, { "epoch": 0.5979933110367893, "grad_norm": 0.4161706864833832, "learning_rate": 8.077442593426385e-05, "loss": 1.7539, "step": 447 }, { "epoch": 0.5993311036789297, "grad_norm": 0.4396213889122009, "learning_rate": 8.072940117064386e-05, "loss": 1.7225, "step": 448 }, { "epoch": 0.6006688963210702, "grad_norm": 0.36475786566734314, "learning_rate": 8.068437640702387e-05, "loss": 1.63, "step": 449 }, { "epoch": 0.6020066889632107, "grad_norm": 0.5781269073486328, "learning_rate": 8.063935164340387e-05, "loss": 1.7179, "step": 450 }, { "epoch": 0.6033444816053511, "grad_norm": 1.0035388469696045, "learning_rate": 8.059432687978388e-05, "loss": 1.7692, "step": 451 }, { "epoch": 0.6046822742474917, "grad_norm": 0.3682602643966675, "learning_rate": 8.054930211616389e-05, "loss": 1.9889, "step": 452 }, { "epoch": 0.6060200668896321, "grad_norm": 0.49924179911613464, "learning_rate": 8.05042773525439e-05, "loss": 1.5572, "step": 453 }, { "epoch": 0.6073578595317726, "grad_norm": 0.34571748971939087, "learning_rate": 8.045925258892392e-05, "loss": 2.0112, "step": 454 }, { "epoch": 0.6086956521739131, "grad_norm": 0.35000765323638916, "learning_rate": 8.041422782530392e-05, "loss": 1.852, "step": 455 }, { "epoch": 0.6100334448160535, "grad_norm": 0.3942291736602783, "learning_rate": 8.036920306168393e-05, "loss": 1.7856, "step": 456 }, { "epoch": 0.611371237458194, "grad_norm": 0.5689213275909424, "learning_rate": 8.032417829806393e-05, "loss": 1.9164, "step": 457 }, { "epoch": 0.6127090301003344, "grad_norm": 0.4623394310474396, "learning_rate": 8.027915353444396e-05, "loss": 2.0554, "step": 458 }, { "epoch": 0.6140468227424749, "grad_norm": 0.5661097168922424, "learning_rate": 8.023412877082395e-05, "loss": 1.6894, "step": 459 }, { "epoch": 0.6153846153846154, "grad_norm": 0.41809728741645813, "learning_rate": 8.018910400720396e-05, "loss": 1.7757, "step": 460 }, { "epoch": 0.6167224080267558, "grad_norm": 0.4237455427646637, "learning_rate": 8.014407924358398e-05, "loss": 2.1467, "step": 461 }, { "epoch": 0.6180602006688963, "grad_norm": 0.3313245177268982, "learning_rate": 8.009905447996399e-05, "loss": 2.072, "step": 462 }, { "epoch": 0.6193979933110368, "grad_norm": 0.3133883476257324, "learning_rate": 8.0054029716344e-05, "loss": 2.0608, "step": 463 }, { "epoch": 0.6207357859531772, "grad_norm": 0.3357526659965515, "learning_rate": 8.0009004952724e-05, "loss": 1.5915, "step": 464 }, { "epoch": 0.6220735785953178, "grad_norm": 0.39856886863708496, "learning_rate": 7.996398018910402e-05, "loss": 2.015, "step": 465 }, { "epoch": 0.6234113712374582, "grad_norm": 0.3311580717563629, "learning_rate": 7.991895542548402e-05, "loss": 2.2078, "step": 466 }, { "epoch": 0.6247491638795987, "grad_norm": 0.28546395897865295, "learning_rate": 7.987393066186403e-05, "loss": 1.7635, "step": 467 }, { "epoch": 0.6260869565217392, "grad_norm": 0.8754998445510864, "learning_rate": 7.982890589824403e-05, "loss": 1.9734, "step": 468 }, { "epoch": 0.6274247491638796, "grad_norm": 0.39375200867652893, "learning_rate": 7.978388113462405e-05, "loss": 1.3764, "step": 469 }, { "epoch": 0.6287625418060201, "grad_norm": 0.3225364089012146, "learning_rate": 7.973885637100406e-05, "loss": 2.1192, "step": 470 }, { "epoch": 0.6301003344481605, "grad_norm": 0.34391409158706665, "learning_rate": 7.969383160738407e-05, "loss": 1.9341, "step": 471 }, { "epoch": 0.631438127090301, "grad_norm": 0.37655723094940186, "learning_rate": 7.964880684376408e-05, "loss": 1.9184, "step": 472 }, { "epoch": 0.6327759197324415, "grad_norm": 0.32347163558006287, "learning_rate": 7.960378208014408e-05, "loss": 1.9153, "step": 473 }, { "epoch": 0.6341137123745819, "grad_norm": 0.31293460726737976, "learning_rate": 7.95587573165241e-05, "loss": 1.9334, "step": 474 }, { "epoch": 0.6354515050167224, "grad_norm": 0.29669955372810364, "learning_rate": 7.95137325529041e-05, "loss": 2.0717, "step": 475 }, { "epoch": 0.6367892976588628, "grad_norm": 0.5345836281776428, "learning_rate": 7.946870778928411e-05, "loss": 1.6862, "step": 476 }, { "epoch": 0.6381270903010033, "grad_norm": 0.3699991703033447, "learning_rate": 7.942368302566412e-05, "loss": 2.0417, "step": 477 }, { "epoch": 0.6394648829431439, "grad_norm": 0.502368152141571, "learning_rate": 7.937865826204413e-05, "loss": 1.8139, "step": 478 }, { "epoch": 0.6408026755852843, "grad_norm": 0.339144766330719, "learning_rate": 7.933363349842414e-05, "loss": 1.9849, "step": 479 }, { "epoch": 0.6421404682274248, "grad_norm": 0.3838237226009369, "learning_rate": 7.928860873480414e-05, "loss": 1.7248, "step": 480 }, { "epoch": 0.6434782608695652, "grad_norm": 0.3984738886356354, "learning_rate": 7.924358397118415e-05, "loss": 1.7374, "step": 481 }, { "epoch": 0.6448160535117057, "grad_norm": 1.3994791507720947, "learning_rate": 7.919855920756416e-05, "loss": 1.3882, "step": 482 }, { "epoch": 0.6461538461538462, "grad_norm": 0.41736477613449097, "learning_rate": 7.915353444394417e-05, "loss": 1.8299, "step": 483 }, { "epoch": 0.6474916387959866, "grad_norm": 0.2818329930305481, "learning_rate": 7.910850968032418e-05, "loss": 1.94, "step": 484 }, { "epoch": 0.6488294314381271, "grad_norm": 0.27325278520584106, "learning_rate": 7.906348491670419e-05, "loss": 2.001, "step": 485 }, { "epoch": 0.6501672240802676, "grad_norm": 0.642238199710846, "learning_rate": 7.90184601530842e-05, "loss": 1.9489, "step": 486 }, { "epoch": 0.651505016722408, "grad_norm": 0.38445815443992615, "learning_rate": 7.897343538946422e-05, "loss": 1.7597, "step": 487 }, { "epoch": 0.6528428093645485, "grad_norm": 0.2907230257987976, "learning_rate": 7.892841062584421e-05, "loss": 1.65, "step": 488 }, { "epoch": 0.6541806020066889, "grad_norm": 0.3835679888725281, "learning_rate": 7.888338586222422e-05, "loss": 1.8158, "step": 489 }, { "epoch": 0.6555183946488294, "grad_norm": 0.44675248861312866, "learning_rate": 7.883836109860424e-05, "loss": 1.4216, "step": 490 }, { "epoch": 0.65685618729097, "grad_norm": 0.385631263256073, "learning_rate": 7.879333633498425e-05, "loss": 1.8526, "step": 491 }, { "epoch": 0.6581939799331104, "grad_norm": 0.3172449469566345, "learning_rate": 7.874831157136425e-05, "loss": 1.9237, "step": 492 }, { "epoch": 0.6595317725752509, "grad_norm": 0.41748154163360596, "learning_rate": 7.870328680774426e-05, "loss": 2.026, "step": 493 }, { "epoch": 0.6608695652173913, "grad_norm": 0.4081355035305023, "learning_rate": 7.865826204412428e-05, "loss": 1.4274, "step": 494 }, { "epoch": 0.6622073578595318, "grad_norm": 0.338257759809494, "learning_rate": 7.861323728050429e-05, "loss": 1.8443, "step": 495 }, { "epoch": 0.6635451505016723, "grad_norm": 0.40070194005966187, "learning_rate": 7.856821251688428e-05, "loss": 1.9438, "step": 496 }, { "epoch": 0.6648829431438127, "grad_norm": 0.2751341462135315, "learning_rate": 7.85231877532643e-05, "loss": 1.9885, "step": 497 }, { "epoch": 0.6662207357859532, "grad_norm": 0.3681652843952179, "learning_rate": 7.847816298964431e-05, "loss": 1.9029, "step": 498 }, { "epoch": 0.6675585284280936, "grad_norm": 0.317685067653656, "learning_rate": 7.843313822602432e-05, "loss": 1.8493, "step": 499 }, { "epoch": 0.6688963210702341, "grad_norm": 0.5167484283447266, "learning_rate": 7.838811346240432e-05, "loss": 2.0053, "step": 500 }, { "epoch": 0.6702341137123746, "grad_norm": 0.283053457736969, "learning_rate": 7.834308869878434e-05, "loss": 1.9312, "step": 501 }, { "epoch": 0.671571906354515, "grad_norm": 0.3423631191253662, "learning_rate": 7.829806393516435e-05, "loss": 2.0661, "step": 502 }, { "epoch": 0.6729096989966555, "grad_norm": 0.41216039657592773, "learning_rate": 7.825303917154436e-05, "loss": 1.6058, "step": 503 }, { "epoch": 0.6742474916387959, "grad_norm": 0.40254294872283936, "learning_rate": 7.820801440792435e-05, "loss": 2.0609, "step": 504 }, { "epoch": 0.6755852842809364, "grad_norm": 0.35236918926239014, "learning_rate": 7.816298964430437e-05, "loss": 1.8236, "step": 505 }, { "epoch": 0.676923076923077, "grad_norm": 0.7102991342544556, "learning_rate": 7.811796488068438e-05, "loss": 1.3943, "step": 506 }, { "epoch": 0.6782608695652174, "grad_norm": 0.4407871961593628, "learning_rate": 7.807294011706439e-05, "loss": 1.7709, "step": 507 }, { "epoch": 0.6795986622073579, "grad_norm": 0.29523321986198425, "learning_rate": 7.80279153534444e-05, "loss": 1.8408, "step": 508 }, { "epoch": 0.6809364548494983, "grad_norm": 0.4053889811038971, "learning_rate": 7.798289058982441e-05, "loss": 1.9194, "step": 509 }, { "epoch": 0.6822742474916388, "grad_norm": 0.4437839686870575, "learning_rate": 7.793786582620442e-05, "loss": 1.7094, "step": 510 }, { "epoch": 0.6836120401337793, "grad_norm": 0.3048234283924103, "learning_rate": 7.789284106258442e-05, "loss": 1.7877, "step": 511 }, { "epoch": 0.6849498327759197, "grad_norm": 0.3253330588340759, "learning_rate": 7.784781629896443e-05, "loss": 1.7939, "step": 512 }, { "epoch": 0.6862876254180602, "grad_norm": 0.4025883674621582, "learning_rate": 7.780279153534444e-05, "loss": 1.3158, "step": 513 }, { "epoch": 0.6876254180602007, "grad_norm": 0.3705989122390747, "learning_rate": 7.775776677172445e-05, "loss": 1.9503, "step": 514 }, { "epoch": 0.6889632107023411, "grad_norm": 1.0391372442245483, "learning_rate": 7.771274200810447e-05, "loss": 1.9089, "step": 515 }, { "epoch": 0.6903010033444816, "grad_norm": 0.33640894293785095, "learning_rate": 7.766771724448447e-05, "loss": 1.6701, "step": 516 }, { "epoch": 0.691638795986622, "grad_norm": 0.34906384348869324, "learning_rate": 7.762269248086448e-05, "loss": 1.4419, "step": 517 }, { "epoch": 0.6929765886287625, "grad_norm": 0.4226134121417999, "learning_rate": 7.757766771724448e-05, "loss": 1.4917, "step": 518 }, { "epoch": 0.6943143812709031, "grad_norm": 0.3549327552318573, "learning_rate": 7.753264295362451e-05, "loss": 1.4032, "step": 519 }, { "epoch": 0.6956521739130435, "grad_norm": 0.4137127995491028, "learning_rate": 7.74876181900045e-05, "loss": 1.8731, "step": 520 }, { "epoch": 0.696989966555184, "grad_norm": 0.3058634102344513, "learning_rate": 7.744259342638451e-05, "loss": 1.8576, "step": 521 }, { "epoch": 0.6983277591973244, "grad_norm": 0.38258033990859985, "learning_rate": 7.739756866276452e-05, "loss": 1.9838, "step": 522 }, { "epoch": 0.6996655518394649, "grad_norm": 0.7790876626968384, "learning_rate": 7.735254389914454e-05, "loss": 2.1705, "step": 523 }, { "epoch": 0.7010033444816054, "grad_norm": 0.3393210768699646, "learning_rate": 7.730751913552454e-05, "loss": 1.949, "step": 524 }, { "epoch": 0.7023411371237458, "grad_norm": 0.3894656300544739, "learning_rate": 7.726249437190454e-05, "loss": 1.7554, "step": 525 }, { "epoch": 0.7036789297658863, "grad_norm": 0.44569316506385803, "learning_rate": 7.721746960828457e-05, "loss": 1.9891, "step": 526 }, { "epoch": 0.7050167224080267, "grad_norm": 0.40504804253578186, "learning_rate": 7.717244484466458e-05, "loss": 1.7219, "step": 527 }, { "epoch": 0.7063545150501672, "grad_norm": 0.40453100204467773, "learning_rate": 7.712742008104457e-05, "loss": 1.8641, "step": 528 }, { "epoch": 0.7076923076923077, "grad_norm": 0.36321282386779785, "learning_rate": 7.708239531742458e-05, "loss": 1.9582, "step": 529 }, { "epoch": 0.7090301003344481, "grad_norm": 0.588949978351593, "learning_rate": 7.70373705538046e-05, "loss": 1.4981, "step": 530 }, { "epoch": 0.7103678929765886, "grad_norm": 0.6625288128852844, "learning_rate": 7.699234579018461e-05, "loss": 1.5321, "step": 531 }, { "epoch": 0.711705685618729, "grad_norm": 0.3740290403366089, "learning_rate": 7.69473210265646e-05, "loss": 1.9171, "step": 532 }, { "epoch": 0.7130434782608696, "grad_norm": 0.30846625566482544, "learning_rate": 7.690229626294463e-05, "loss": 2.1682, "step": 533 }, { "epoch": 0.7143812709030101, "grad_norm": 0.34462276101112366, "learning_rate": 7.685727149932464e-05, "loss": 2.2179, "step": 534 }, { "epoch": 0.7157190635451505, "grad_norm": 0.3851476013660431, "learning_rate": 7.681224673570464e-05, "loss": 1.9401, "step": 535 }, { "epoch": 0.717056856187291, "grad_norm": 0.3371794521808624, "learning_rate": 7.676722197208465e-05, "loss": 1.8142, "step": 536 }, { "epoch": 0.7183946488294315, "grad_norm": 0.34345126152038574, "learning_rate": 7.672219720846466e-05, "loss": 1.8962, "step": 537 }, { "epoch": 0.7197324414715719, "grad_norm": 0.3379805386066437, "learning_rate": 7.667717244484467e-05, "loss": 2.1848, "step": 538 }, { "epoch": 0.7210702341137124, "grad_norm": 0.3815709054470062, "learning_rate": 7.663214768122468e-05, "loss": 1.4092, "step": 539 }, { "epoch": 0.7224080267558528, "grad_norm": 0.3566444516181946, "learning_rate": 7.658712291760469e-05, "loss": 1.9942, "step": 540 }, { "epoch": 0.7237458193979933, "grad_norm": 0.42823100090026855, "learning_rate": 7.65420981539847e-05, "loss": 1.9884, "step": 541 }, { "epoch": 0.7250836120401338, "grad_norm": 0.5952591896057129, "learning_rate": 7.64970733903647e-05, "loss": 1.6506, "step": 542 }, { "epoch": 0.7264214046822742, "grad_norm": 0.321796715259552, "learning_rate": 7.645204862674471e-05, "loss": 1.8377, "step": 543 }, { "epoch": 0.7277591973244147, "grad_norm": 0.3382924497127533, "learning_rate": 7.640702386312472e-05, "loss": 1.8229, "step": 544 }, { "epoch": 0.7290969899665551, "grad_norm": 0.6358975768089294, "learning_rate": 7.636199909950473e-05, "loss": 1.3559, "step": 545 }, { "epoch": 0.7304347826086957, "grad_norm": 2.565281629562378, "learning_rate": 7.631697433588474e-05, "loss": 2.0329, "step": 546 }, { "epoch": 0.7317725752508362, "grad_norm": 0.3889695405960083, "learning_rate": 7.627194957226475e-05, "loss": 1.7389, "step": 547 }, { "epoch": 0.7331103678929766, "grad_norm": 0.3816166818141937, "learning_rate": 7.622692480864476e-05, "loss": 2.3092, "step": 548 }, { "epoch": 0.7344481605351171, "grad_norm": 0.46524176001548767, "learning_rate": 7.618190004502476e-05, "loss": 1.7616, "step": 549 }, { "epoch": 0.7357859531772575, "grad_norm": 0.36591479182243347, "learning_rate": 7.613687528140477e-05, "loss": 2.0946, "step": 550 }, { "epoch": 0.737123745819398, "grad_norm": 0.34079721570014954, "learning_rate": 7.60918505177848e-05, "loss": 2.0089, "step": 551 }, { "epoch": 0.7384615384615385, "grad_norm": 0.3858109712600708, "learning_rate": 7.604682575416479e-05, "loss": 1.9751, "step": 552 }, { "epoch": 0.7397993311036789, "grad_norm": 0.3411719799041748, "learning_rate": 7.60018009905448e-05, "loss": 0.8972, "step": 553 }, { "epoch": 0.7411371237458194, "grad_norm": 0.3922092914581299, "learning_rate": 7.595677622692481e-05, "loss": 1.2931, "step": 554 }, { "epoch": 0.7424749163879598, "grad_norm": 0.27630677819252014, "learning_rate": 7.591175146330483e-05, "loss": 1.8692, "step": 555 }, { "epoch": 0.7438127090301003, "grad_norm": 0.3395419716835022, "learning_rate": 7.586672669968482e-05, "loss": 1.967, "step": 556 }, { "epoch": 0.7451505016722408, "grad_norm": 0.34704309701919556, "learning_rate": 7.582170193606483e-05, "loss": 1.7707, "step": 557 }, { "epoch": 0.7464882943143812, "grad_norm": 0.7625495791435242, "learning_rate": 7.577667717244484e-05, "loss": 1.336, "step": 558 }, { "epoch": 0.7478260869565218, "grad_norm": 0.33455583453178406, "learning_rate": 7.573165240882486e-05, "loss": 1.7007, "step": 559 }, { "epoch": 0.7491638795986622, "grad_norm": 0.3844684064388275, "learning_rate": 7.568662764520487e-05, "loss": 2.0887, "step": 560 }, { "epoch": 0.7505016722408027, "grad_norm": 0.4520154595375061, "learning_rate": 7.564160288158487e-05, "loss": 1.5324, "step": 561 }, { "epoch": 0.7518394648829432, "grad_norm": 0.2970287799835205, "learning_rate": 7.559657811796489e-05, "loss": 1.842, "step": 562 }, { "epoch": 0.7531772575250836, "grad_norm": 0.6159781813621521, "learning_rate": 7.55515533543449e-05, "loss": 1.9882, "step": 563 }, { "epoch": 0.7545150501672241, "grad_norm": 0.3633442521095276, "learning_rate": 7.550652859072491e-05, "loss": 1.7449, "step": 564 }, { "epoch": 0.7558528428093646, "grad_norm": 0.33036932349205017, "learning_rate": 7.54615038271049e-05, "loss": 1.4392, "step": 565 }, { "epoch": 0.757190635451505, "grad_norm": 0.3699447214603424, "learning_rate": 7.541647906348492e-05, "loss": 1.9728, "step": 566 }, { "epoch": 0.7585284280936455, "grad_norm": 0.4786345362663269, "learning_rate": 7.537145429986493e-05, "loss": 1.3553, "step": 567 }, { "epoch": 0.7598662207357859, "grad_norm": 0.3800508677959442, "learning_rate": 7.532642953624494e-05, "loss": 1.6109, "step": 568 }, { "epoch": 0.7612040133779264, "grad_norm": 0.3128887414932251, "learning_rate": 7.528140477262495e-05, "loss": 1.5856, "step": 569 }, { "epoch": 0.7625418060200669, "grad_norm": 0.35543105006217957, "learning_rate": 7.523638000900496e-05, "loss": 1.8994, "step": 570 }, { "epoch": 0.7638795986622073, "grad_norm": 0.33077099919319153, "learning_rate": 7.519135524538497e-05, "loss": 1.6525, "step": 571 }, { "epoch": 0.7652173913043478, "grad_norm": 0.3991314470767975, "learning_rate": 7.514633048176498e-05, "loss": 1.9248, "step": 572 }, { "epoch": 0.7665551839464882, "grad_norm": 0.36139121651649475, "learning_rate": 7.510130571814498e-05, "loss": 2.0326, "step": 573 }, { "epoch": 0.7678929765886288, "grad_norm": 0.35255613923072815, "learning_rate": 7.505628095452499e-05, "loss": 1.8889, "step": 574 }, { "epoch": 0.7692307692307693, "grad_norm": 0.3487282693386078, "learning_rate": 7.5011256190905e-05, "loss": 1.4981, "step": 575 }, { "epoch": 0.7705685618729097, "grad_norm": 0.6345880627632141, "learning_rate": 7.496623142728501e-05, "loss": 2.1821, "step": 576 }, { "epoch": 0.7719063545150502, "grad_norm": 0.5673653483390808, "learning_rate": 7.492120666366502e-05, "loss": 2.1171, "step": 577 }, { "epoch": 0.7732441471571906, "grad_norm": 0.290094256401062, "learning_rate": 7.487618190004503e-05, "loss": 1.8362, "step": 578 }, { "epoch": 0.7745819397993311, "grad_norm": 0.33212703466415405, "learning_rate": 7.483115713642504e-05, "loss": 1.9094, "step": 579 }, { "epoch": 0.7759197324414716, "grad_norm": 0.6631553769111633, "learning_rate": 7.478613237280504e-05, "loss": 2.1565, "step": 580 }, { "epoch": 0.777257525083612, "grad_norm": 0.33475595712661743, "learning_rate": 7.474110760918505e-05, "loss": 1.4291, "step": 581 }, { "epoch": 0.7785953177257525, "grad_norm": 0.3770703673362732, "learning_rate": 7.469608284556506e-05, "loss": 2.0621, "step": 582 }, { "epoch": 0.7799331103678929, "grad_norm": 0.7004806995391846, "learning_rate": 7.465105808194507e-05, "loss": 2.0331, "step": 583 }, { "epoch": 0.7812709030100334, "grad_norm": 0.38309189677238464, "learning_rate": 7.460603331832509e-05, "loss": 1.8102, "step": 584 }, { "epoch": 0.782608695652174, "grad_norm": 0.33871129155158997, "learning_rate": 7.456100855470509e-05, "loss": 2.1456, "step": 585 }, { "epoch": 0.7839464882943143, "grad_norm": 0.4187791347503662, "learning_rate": 7.45159837910851e-05, "loss": 1.7323, "step": 586 }, { "epoch": 0.7852842809364549, "grad_norm": 0.2464730590581894, "learning_rate": 7.447095902746512e-05, "loss": 1.9111, "step": 587 }, { "epoch": 0.7866220735785954, "grad_norm": 0.37835320830345154, "learning_rate": 7.442593426384513e-05, "loss": 1.396, "step": 588 }, { "epoch": 0.7879598662207358, "grad_norm": 0.363475501537323, "learning_rate": 7.438090950022512e-05, "loss": 1.9542, "step": 589 }, { "epoch": 0.7892976588628763, "grad_norm": 0.37294596433639526, "learning_rate": 7.433588473660513e-05, "loss": 1.4258, "step": 590 }, { "epoch": 0.7906354515050167, "grad_norm": 0.3721500039100647, "learning_rate": 7.429085997298515e-05, "loss": 1.987, "step": 591 }, { "epoch": 0.7919732441471572, "grad_norm": 0.32486671209335327, "learning_rate": 7.424583520936516e-05, "loss": 1.8621, "step": 592 }, { "epoch": 0.7933110367892977, "grad_norm": 0.3256564736366272, "learning_rate": 7.420081044574516e-05, "loss": 1.7656, "step": 593 }, { "epoch": 0.7946488294314381, "grad_norm": 0.3214690089225769, "learning_rate": 7.415578568212516e-05, "loss": 2.0624, "step": 594 }, { "epoch": 0.7959866220735786, "grad_norm": 0.6959064602851868, "learning_rate": 7.411076091850519e-05, "loss": 1.1767, "step": 595 }, { "epoch": 0.797324414715719, "grad_norm": 0.36852458119392395, "learning_rate": 7.40657361548852e-05, "loss": 2.0539, "step": 596 }, { "epoch": 0.7986622073578595, "grad_norm": 0.2969950735569, "learning_rate": 7.402071139126519e-05, "loss": 1.5776, "step": 597 }, { "epoch": 0.8, "grad_norm": 0.450792521238327, "learning_rate": 7.397568662764521e-05, "loss": 1.5855, "step": 598 }, { "epoch": 0.8013377926421404, "grad_norm": 0.5402222871780396, "learning_rate": 7.393066186402522e-05, "loss": 1.8693, "step": 599 }, { "epoch": 0.802675585284281, "grad_norm": 0.406260222196579, "learning_rate": 7.388563710040523e-05, "loss": 1.8339, "step": 600 }, { "epoch": 0.8040133779264214, "grad_norm": 0.3280527591705322, "learning_rate": 7.384061233678522e-05, "loss": 1.469, "step": 601 }, { "epoch": 0.8053511705685619, "grad_norm": 0.3896856904029846, "learning_rate": 7.379558757316525e-05, "loss": 1.7457, "step": 602 }, { "epoch": 0.8066889632107024, "grad_norm": 0.3198883831501007, "learning_rate": 7.375056280954526e-05, "loss": 1.8016, "step": 603 }, { "epoch": 0.8080267558528428, "grad_norm": 0.3065289258956909, "learning_rate": 7.370553804592526e-05, "loss": 2.0356, "step": 604 }, { "epoch": 0.8093645484949833, "grad_norm": 0.37292978167533875, "learning_rate": 7.366051328230527e-05, "loss": 1.8988, "step": 605 }, { "epoch": 0.8107023411371237, "grad_norm": 0.3623843193054199, "learning_rate": 7.361548851868528e-05, "loss": 1.5998, "step": 606 }, { "epoch": 0.8120401337792642, "grad_norm": 0.43376150727272034, "learning_rate": 7.357046375506529e-05, "loss": 1.4865, "step": 607 }, { "epoch": 0.8133779264214047, "grad_norm": 0.3577169179916382, "learning_rate": 7.35254389914453e-05, "loss": 1.8154, "step": 608 }, { "epoch": 0.8147157190635451, "grad_norm": 0.33039671182632446, "learning_rate": 7.348041422782531e-05, "loss": 1.9667, "step": 609 }, { "epoch": 0.8160535117056856, "grad_norm": 0.3932124674320221, "learning_rate": 7.343538946420532e-05, "loss": 1.7125, "step": 610 }, { "epoch": 0.8173913043478261, "grad_norm": 0.3652454912662506, "learning_rate": 7.339036470058532e-05, "loss": 1.4416, "step": 611 }, { "epoch": 0.8187290969899665, "grad_norm": 0.6105794310569763, "learning_rate": 7.334533993696533e-05, "loss": 1.7701, "step": 612 }, { "epoch": 0.820066889632107, "grad_norm": 0.3876180946826935, "learning_rate": 7.330031517334534e-05, "loss": 1.6549, "step": 613 }, { "epoch": 0.8214046822742475, "grad_norm": 0.29518094658851624, "learning_rate": 7.325529040972535e-05, "loss": 1.8002, "step": 614 }, { "epoch": 0.822742474916388, "grad_norm": 0.7063753604888916, "learning_rate": 7.321026564610536e-05, "loss": 1.6917, "step": 615 }, { "epoch": 0.8240802675585285, "grad_norm": 0.5985257625579834, "learning_rate": 7.316524088248538e-05, "loss": 1.9754, "step": 616 }, { "epoch": 0.8254180602006689, "grad_norm": 0.36610302329063416, "learning_rate": 7.312021611886538e-05, "loss": 2.1499, "step": 617 }, { "epoch": 0.8267558528428094, "grad_norm": 0.32885974645614624, "learning_rate": 7.307519135524538e-05, "loss": 1.8947, "step": 618 }, { "epoch": 0.8280936454849498, "grad_norm": 0.449900358915329, "learning_rate": 7.303016659162539e-05, "loss": 1.257, "step": 619 }, { "epoch": 0.8294314381270903, "grad_norm": 0.37102293968200684, "learning_rate": 7.298514182800542e-05, "loss": 1.5899, "step": 620 }, { "epoch": 0.8307692307692308, "grad_norm": 0.31158074736595154, "learning_rate": 7.294011706438541e-05, "loss": 1.7936, "step": 621 }, { "epoch": 0.8321070234113712, "grad_norm": 0.3557308614253998, "learning_rate": 7.289509230076542e-05, "loss": 1.6656, "step": 622 }, { "epoch": 0.8334448160535117, "grad_norm": 0.7624556422233582, "learning_rate": 7.285006753714544e-05, "loss": 1.8276, "step": 623 }, { "epoch": 0.8347826086956521, "grad_norm": 0.3486712872982025, "learning_rate": 7.280504277352545e-05, "loss": 2.2463, "step": 624 }, { "epoch": 0.8361204013377926, "grad_norm": 0.5724909901618958, "learning_rate": 7.276001800990544e-05, "loss": 1.8355, "step": 625 }, { "epoch": 0.8374581939799332, "grad_norm": 0.8517188429832458, "learning_rate": 7.271499324628545e-05, "loss": 0.8871, "step": 626 }, { "epoch": 0.8387959866220736, "grad_norm": 0.4913051128387451, "learning_rate": 7.266996848266548e-05, "loss": 1.856, "step": 627 }, { "epoch": 0.8401337792642141, "grad_norm": 0.45800983905792236, "learning_rate": 7.262494371904548e-05, "loss": 1.4221, "step": 628 }, { "epoch": 0.8414715719063545, "grad_norm": 0.6228715777397156, "learning_rate": 7.257991895542548e-05, "loss": 2.224, "step": 629 }, { "epoch": 0.842809364548495, "grad_norm": 0.5099749565124512, "learning_rate": 7.253489419180549e-05, "loss": 1.702, "step": 630 }, { "epoch": 0.8441471571906355, "grad_norm": 0.39042428135871887, "learning_rate": 7.248986942818551e-05, "loss": 1.5742, "step": 631 }, { "epoch": 0.8454849498327759, "grad_norm": 0.7996957898139954, "learning_rate": 7.244484466456552e-05, "loss": 1.328, "step": 632 }, { "epoch": 0.8468227424749164, "grad_norm": 0.35331302881240845, "learning_rate": 7.239981990094553e-05, "loss": 1.6865, "step": 633 }, { "epoch": 0.8481605351170568, "grad_norm": 0.4120136499404907, "learning_rate": 7.235479513732554e-05, "loss": 1.684, "step": 634 }, { "epoch": 0.8494983277591973, "grad_norm": 0.4208261966705322, "learning_rate": 7.230977037370554e-05, "loss": 1.8001, "step": 635 }, { "epoch": 0.8508361204013378, "grad_norm": 0.41124269366264343, "learning_rate": 7.226474561008555e-05, "loss": 1.7426, "step": 636 }, { "epoch": 0.8521739130434782, "grad_norm": 0.5735759139060974, "learning_rate": 7.221972084646556e-05, "loss": 1.6861, "step": 637 }, { "epoch": 0.8535117056856187, "grad_norm": 0.3232871890068054, "learning_rate": 7.217469608284557e-05, "loss": 1.9313, "step": 638 }, { "epoch": 0.8548494983277592, "grad_norm": 0.315378338098526, "learning_rate": 7.212967131922558e-05, "loss": 1.7994, "step": 639 }, { "epoch": 0.8561872909698997, "grad_norm": 0.5437686443328857, "learning_rate": 7.208464655560559e-05, "loss": 1.761, "step": 640 }, { "epoch": 0.8575250836120402, "grad_norm": 0.4012024998664856, "learning_rate": 7.20396217919856e-05, "loss": 1.8082, "step": 641 }, { "epoch": 0.8588628762541806, "grad_norm": 0.35168206691741943, "learning_rate": 7.19945970283656e-05, "loss": 1.8141, "step": 642 }, { "epoch": 0.8602006688963211, "grad_norm": 0.29326000809669495, "learning_rate": 7.194957226474561e-05, "loss": 2.0852, "step": 643 }, { "epoch": 0.8615384615384616, "grad_norm": 0.5200529098510742, "learning_rate": 7.190454750112562e-05, "loss": 1.7179, "step": 644 }, { "epoch": 0.862876254180602, "grad_norm": 0.419899046421051, "learning_rate": 7.185952273750563e-05, "loss": 1.8952, "step": 645 }, { "epoch": 0.8642140468227425, "grad_norm": 0.3791028559207916, "learning_rate": 7.181449797388564e-05, "loss": 1.4306, "step": 646 }, { "epoch": 0.8655518394648829, "grad_norm": 0.3782893121242523, "learning_rate": 7.176947321026565e-05, "loss": 1.8628, "step": 647 }, { "epoch": 0.8668896321070234, "grad_norm": 0.3860854208469391, "learning_rate": 7.172444844664566e-05, "loss": 1.6255, "step": 648 }, { "epoch": 0.8682274247491639, "grad_norm": 0.361833393573761, "learning_rate": 7.167942368302566e-05, "loss": 1.7462, "step": 649 }, { "epoch": 0.8695652173913043, "grad_norm": 0.400254487991333, "learning_rate": 7.163439891940567e-05, "loss": 1.3854, "step": 650 }, { "epoch": 0.8709030100334448, "grad_norm": 0.34192296862602234, "learning_rate": 7.158937415578568e-05, "loss": 2.0707, "step": 651 }, { "epoch": 0.8722408026755852, "grad_norm": 0.5104194283485413, "learning_rate": 7.15443493921657e-05, "loss": 1.6693, "step": 652 }, { "epoch": 0.8735785953177257, "grad_norm": 0.3650829792022705, "learning_rate": 7.149932462854571e-05, "loss": 0.9515, "step": 653 }, { "epoch": 0.8749163879598663, "grad_norm": 0.42598962783813477, "learning_rate": 7.145429986492571e-05, "loss": 1.6728, "step": 654 }, { "epoch": 0.8762541806020067, "grad_norm": 0.4158768057823181, "learning_rate": 7.140927510130572e-05, "loss": 1.2007, "step": 655 }, { "epoch": 0.8775919732441472, "grad_norm": 0.34016454219818115, "learning_rate": 7.136425033768574e-05, "loss": 2.1394, "step": 656 }, { "epoch": 0.8789297658862876, "grad_norm": 0.34736478328704834, "learning_rate": 7.131922557406575e-05, "loss": 1.566, "step": 657 }, { "epoch": 0.8802675585284281, "grad_norm": 0.47918611764907837, "learning_rate": 7.127420081044574e-05, "loss": 1.7246, "step": 658 }, { "epoch": 0.8816053511705686, "grad_norm": 0.41045960783958435, "learning_rate": 7.122917604682576e-05, "loss": 1.7283, "step": 659 }, { "epoch": 0.882943143812709, "grad_norm": 0.3634040057659149, "learning_rate": 7.118415128320577e-05, "loss": 1.6028, "step": 660 }, { "epoch": 0.8842809364548495, "grad_norm": 0.3791142404079437, "learning_rate": 7.113912651958578e-05, "loss": 1.7992, "step": 661 }, { "epoch": 0.88561872909699, "grad_norm": 0.356406569480896, "learning_rate": 7.109410175596578e-05, "loss": 1.7343, "step": 662 }, { "epoch": 0.8869565217391304, "grad_norm": 0.36963218450546265, "learning_rate": 7.10490769923458e-05, "loss": 1.8745, "step": 663 }, { "epoch": 0.8882943143812709, "grad_norm": 0.5904031991958618, "learning_rate": 7.10040522287258e-05, "loss": 1.6621, "step": 664 }, { "epoch": 0.8896321070234113, "grad_norm": 0.40875113010406494, "learning_rate": 7.095902746510582e-05, "loss": 1.826, "step": 665 }, { "epoch": 0.8909698996655518, "grad_norm": 0.6849672198295593, "learning_rate": 7.091400270148581e-05, "loss": 1.7498, "step": 666 }, { "epoch": 0.8923076923076924, "grad_norm": 0.8226297497749329, "learning_rate": 7.086897793786583e-05, "loss": 1.8779, "step": 667 }, { "epoch": 0.8936454849498328, "grad_norm": 0.39494749903678894, "learning_rate": 7.082395317424584e-05, "loss": 1.3091, "step": 668 }, { "epoch": 0.8949832775919733, "grad_norm": 0.36774224042892456, "learning_rate": 7.077892841062585e-05, "loss": 2.0947, "step": 669 }, { "epoch": 0.8963210702341137, "grad_norm": 0.38955172896385193, "learning_rate": 7.073390364700586e-05, "loss": 1.8579, "step": 670 }, { "epoch": 0.8976588628762542, "grad_norm": 0.3742164671421051, "learning_rate": 7.068887888338587e-05, "loss": 1.6201, "step": 671 }, { "epoch": 0.8989966555183947, "grad_norm": 0.3513646721839905, "learning_rate": 7.064385411976588e-05, "loss": 1.9323, "step": 672 }, { "epoch": 0.9003344481605351, "grad_norm": 0.3952341079711914, "learning_rate": 7.059882935614588e-05, "loss": 1.6883, "step": 673 }, { "epoch": 0.9016722408026756, "grad_norm": 0.42206764221191406, "learning_rate": 7.055380459252589e-05, "loss": 1.9177, "step": 674 }, { "epoch": 0.903010033444816, "grad_norm": 0.36615172028541565, "learning_rate": 7.05087798289059e-05, "loss": 1.7579, "step": 675 }, { "epoch": 0.9043478260869565, "grad_norm": 0.3826829791069031, "learning_rate": 7.046375506528591e-05, "loss": 2.0081, "step": 676 }, { "epoch": 0.905685618729097, "grad_norm": 0.3107741177082062, "learning_rate": 7.041873030166593e-05, "loss": 2.1041, "step": 677 }, { "epoch": 0.9070234113712374, "grad_norm": 0.33083420991897583, "learning_rate": 7.037370553804593e-05, "loss": 1.8097, "step": 678 }, { "epoch": 0.9083612040133779, "grad_norm": 0.3041648864746094, "learning_rate": 7.032868077442594e-05, "loss": 1.0782, "step": 679 }, { "epoch": 0.9096989966555183, "grad_norm": 0.5441336035728455, "learning_rate": 7.028365601080594e-05, "loss": 1.6318, "step": 680 }, { "epoch": 0.9110367892976589, "grad_norm": 0.363393634557724, "learning_rate": 7.023863124718597e-05, "loss": 1.7169, "step": 681 }, { "epoch": 0.9123745819397994, "grad_norm": 0.6789020299911499, "learning_rate": 7.019360648356596e-05, "loss": 1.9805, "step": 682 }, { "epoch": 0.9137123745819398, "grad_norm": 0.3467765748500824, "learning_rate": 7.014858171994597e-05, "loss": 1.8415, "step": 683 }, { "epoch": 0.9150501672240803, "grad_norm": 0.7270785570144653, "learning_rate": 7.010355695632598e-05, "loss": 1.3206, "step": 684 }, { "epoch": 0.9163879598662207, "grad_norm": 0.5416232943534851, "learning_rate": 7.0058532192706e-05, "loss": 1.8754, "step": 685 }, { "epoch": 0.9177257525083612, "grad_norm": 0.3280985653400421, "learning_rate": 7.0013507429086e-05, "loss": 1.8428, "step": 686 }, { "epoch": 0.9190635451505017, "grad_norm": 0.3461412787437439, "learning_rate": 6.9968482665466e-05, "loss": 1.5161, "step": 687 }, { "epoch": 0.9204013377926421, "grad_norm": 0.3746154308319092, "learning_rate": 6.992345790184603e-05, "loss": 1.8658, "step": 688 }, { "epoch": 0.9217391304347826, "grad_norm": 0.3138766288757324, "learning_rate": 6.987843313822603e-05, "loss": 1.4987, "step": 689 }, { "epoch": 0.9230769230769231, "grad_norm": 0.34391963481903076, "learning_rate": 6.983340837460603e-05, "loss": 1.7672, "step": 690 }, { "epoch": 0.9244147157190635, "grad_norm": 0.7809226512908936, "learning_rate": 6.978838361098604e-05, "loss": 1.6137, "step": 691 }, { "epoch": 0.925752508361204, "grad_norm": 0.3945732116699219, "learning_rate": 6.974335884736606e-05, "loss": 1.7833, "step": 692 }, { "epoch": 0.9270903010033444, "grad_norm": 0.41453108191490173, "learning_rate": 6.969833408374607e-05, "loss": 1.5949, "step": 693 }, { "epoch": 0.928428093645485, "grad_norm": 0.3755939304828644, "learning_rate": 6.965330932012606e-05, "loss": 2.3068, "step": 694 }, { "epoch": 0.9297658862876255, "grad_norm": 0.345302015542984, "learning_rate": 6.960828455650609e-05, "loss": 1.7721, "step": 695 }, { "epoch": 0.9311036789297659, "grad_norm": 0.3220181465148926, "learning_rate": 6.95632597928861e-05, "loss": 1.6998, "step": 696 }, { "epoch": 0.9324414715719064, "grad_norm": 0.366615891456604, "learning_rate": 6.95182350292661e-05, "loss": 2.0303, "step": 697 }, { "epoch": 0.9337792642140468, "grad_norm": 0.4134201407432556, "learning_rate": 6.94732102656461e-05, "loss": 1.3428, "step": 698 }, { "epoch": 0.9351170568561873, "grad_norm": 0.38845616579055786, "learning_rate": 6.942818550202612e-05, "loss": 2.0773, "step": 699 }, { "epoch": 0.9364548494983278, "grad_norm": 0.4948274493217468, "learning_rate": 6.938316073840613e-05, "loss": 1.8458, "step": 700 }, { "epoch": 0.9377926421404682, "grad_norm": 0.43470853567123413, "learning_rate": 6.933813597478614e-05, "loss": 1.9182, "step": 701 }, { "epoch": 0.9391304347826087, "grad_norm": 0.503555417060852, "learning_rate": 6.929311121116615e-05, "loss": 2.1079, "step": 702 }, { "epoch": 0.9404682274247491, "grad_norm": 0.31471186876296997, "learning_rate": 6.924808644754615e-05, "loss": 1.8166, "step": 703 }, { "epoch": 0.9418060200668896, "grad_norm": 0.5391733646392822, "learning_rate": 6.920306168392616e-05, "loss": 1.6998, "step": 704 }, { "epoch": 0.9431438127090301, "grad_norm": 0.3019644618034363, "learning_rate": 6.915803692030617e-05, "loss": 2.0563, "step": 705 }, { "epoch": 0.9444816053511705, "grad_norm": 0.5476101040840149, "learning_rate": 6.911301215668618e-05, "loss": 1.5407, "step": 706 }, { "epoch": 0.945819397993311, "grad_norm": 0.5264822244644165, "learning_rate": 6.906798739306619e-05, "loss": 1.7866, "step": 707 }, { "epoch": 0.9471571906354515, "grad_norm": 0.2899741232395172, "learning_rate": 6.90229626294462e-05, "loss": 1.8482, "step": 708 }, { "epoch": 0.948494983277592, "grad_norm": 0.45578619837760925, "learning_rate": 6.89779378658262e-05, "loss": 1.3086, "step": 709 }, { "epoch": 0.9498327759197325, "grad_norm": 0.3755444586277008, "learning_rate": 6.893291310220622e-05, "loss": 1.9272, "step": 710 }, { "epoch": 0.9511705685618729, "grad_norm": 0.5911352634429932, "learning_rate": 6.888788833858622e-05, "loss": 1.9175, "step": 711 }, { "epoch": 0.9525083612040134, "grad_norm": 0.2892666757106781, "learning_rate": 6.884286357496623e-05, "loss": 1.3744, "step": 712 }, { "epoch": 0.9538461538461539, "grad_norm": 0.4964825212955475, "learning_rate": 6.879783881134625e-05, "loss": 1.8149, "step": 713 }, { "epoch": 0.9551839464882943, "grad_norm": 0.3442051410675049, "learning_rate": 6.875281404772625e-05, "loss": 1.9712, "step": 714 }, { "epoch": 0.9565217391304348, "grad_norm": 0.5018149614334106, "learning_rate": 6.870778928410626e-05, "loss": 1.2699, "step": 715 }, { "epoch": 0.9578595317725752, "grad_norm": 0.45120319724082947, "learning_rate": 6.866276452048627e-05, "loss": 1.871, "step": 716 }, { "epoch": 0.9591973244147157, "grad_norm": 0.3946061134338379, "learning_rate": 6.861773975686629e-05, "loss": 1.749, "step": 717 }, { "epoch": 0.9605351170568562, "grad_norm": 0.7921074628829956, "learning_rate": 6.857271499324628e-05, "loss": 1.9881, "step": 718 }, { "epoch": 0.9618729096989966, "grad_norm": 0.29882094264030457, "learning_rate": 6.852769022962629e-05, "loss": 1.8386, "step": 719 }, { "epoch": 0.9632107023411371, "grad_norm": 0.38971996307373047, "learning_rate": 6.84826654660063e-05, "loss": 1.8224, "step": 720 }, { "epoch": 0.9645484949832775, "grad_norm": 0.435881644487381, "learning_rate": 6.843764070238632e-05, "loss": 1.5976, "step": 721 }, { "epoch": 0.9658862876254181, "grad_norm": 0.3308292329311371, "learning_rate": 6.839261593876632e-05, "loss": 1.9759, "step": 722 }, { "epoch": 0.9672240802675586, "grad_norm": 0.38128048181533813, "learning_rate": 6.834759117514633e-05, "loss": 2.0457, "step": 723 }, { "epoch": 0.968561872909699, "grad_norm": 0.37816083431243896, "learning_rate": 6.830256641152635e-05, "loss": 1.7379, "step": 724 }, { "epoch": 0.9698996655518395, "grad_norm": 0.3599185049533844, "learning_rate": 6.825754164790636e-05, "loss": 1.3529, "step": 725 }, { "epoch": 0.9712374581939799, "grad_norm": 0.34246405959129333, "learning_rate": 6.821251688428637e-05, "loss": 2.2177, "step": 726 }, { "epoch": 0.9725752508361204, "grad_norm": 0.36853083968162537, "learning_rate": 6.816749212066636e-05, "loss": 1.8784, "step": 727 }, { "epoch": 0.9739130434782609, "grad_norm": 0.39494502544403076, "learning_rate": 6.812246735704638e-05, "loss": 2.0329, "step": 728 }, { "epoch": 0.9752508361204013, "grad_norm": 1.9131022691726685, "learning_rate": 6.807744259342639e-05, "loss": 1.391, "step": 729 }, { "epoch": 0.9765886287625418, "grad_norm": 0.4040629267692566, "learning_rate": 6.80324178298064e-05, "loss": 2.0042, "step": 730 }, { "epoch": 0.9779264214046822, "grad_norm": 0.38302701711654663, "learning_rate": 6.798739306618641e-05, "loss": 2.1064, "step": 731 }, { "epoch": 0.9792642140468227, "grad_norm": 0.3989606201648712, "learning_rate": 6.794236830256642e-05, "loss": 1.6208, "step": 732 }, { "epoch": 0.9806020066889632, "grad_norm": 0.5049397945404053, "learning_rate": 6.789734353894643e-05, "loss": 1.9809, "step": 733 }, { "epoch": 0.9819397993311036, "grad_norm": 0.5564031004905701, "learning_rate": 6.785231877532643e-05, "loss": 1.9769, "step": 734 }, { "epoch": 0.9832775919732442, "grad_norm": 1.37478506565094, "learning_rate": 6.780729401170644e-05, "loss": 1.6798, "step": 735 }, { "epoch": 0.9846153846153847, "grad_norm": 0.2812560498714447, "learning_rate": 6.776226924808645e-05, "loss": 1.4855, "step": 736 }, { "epoch": 0.9859531772575251, "grad_norm": 0.5258631706237793, "learning_rate": 6.771724448446646e-05, "loss": 2.1563, "step": 737 }, { "epoch": 0.9872909698996656, "grad_norm": 0.7295839190483093, "learning_rate": 6.767221972084647e-05, "loss": 2.0624, "step": 738 }, { "epoch": 0.988628762541806, "grad_norm": 0.4331284463405609, "learning_rate": 6.762719495722648e-05, "loss": 1.5767, "step": 739 }, { "epoch": 0.9899665551839465, "grad_norm": 0.38064879179000854, "learning_rate": 6.758217019360649e-05, "loss": 1.8441, "step": 740 }, { "epoch": 0.991304347826087, "grad_norm": 0.3991696834564209, "learning_rate": 6.75371454299865e-05, "loss": 1.769, "step": 741 }, { "epoch": 0.9926421404682274, "grad_norm": 0.8858866691589355, "learning_rate": 6.74921206663665e-05, "loss": 1.6091, "step": 742 }, { "epoch": 0.9939799331103679, "grad_norm": 0.2829599380493164, "learning_rate": 6.744709590274651e-05, "loss": 1.4218, "step": 743 }, { "epoch": 0.9953177257525083, "grad_norm": 0.3447091579437256, "learning_rate": 6.740207113912652e-05, "loss": 2.08, "step": 744 }, { "epoch": 0.9966555183946488, "grad_norm": 0.5587038397789001, "learning_rate": 6.735704637550653e-05, "loss": 1.9443, "step": 745 }, { "epoch": 0.9979933110367893, "grad_norm": 0.29927799105644226, "learning_rate": 6.731202161188654e-05, "loss": 2.1269, "step": 746 }, { "epoch": 0.9993311036789297, "grad_norm": 0.3965597152709961, "learning_rate": 6.726699684826655e-05, "loss": 1.7172, "step": 747 }, { "epoch": 1.0006688963210701, "grad_norm": 0.36948877573013306, "learning_rate": 6.722197208464655e-05, "loss": 1.3854, "step": 748 }, { "epoch": 1.0020066889632107, "grad_norm": 0.36516493558883667, "learning_rate": 6.717694732102658e-05, "loss": 1.4223, "step": 749 }, { "epoch": 1.0033444816053512, "grad_norm": 0.4343159794807434, "learning_rate": 6.713192255740659e-05, "loss": 1.8986, "step": 750 }, { "epoch": 1.0046822742474917, "grad_norm": 0.4338577091693878, "learning_rate": 6.708689779378658e-05, "loss": 0.5871, "step": 751 }, { "epoch": 1.0060200668896322, "grad_norm": 0.41223862767219543, "learning_rate": 6.704187303016659e-05, "loss": 1.5983, "step": 752 }, { "epoch": 1.0073578595317725, "grad_norm": 0.7298925518989563, "learning_rate": 6.699684826654661e-05, "loss": 1.4322, "step": 753 }, { "epoch": 1.008695652173913, "grad_norm": 1.1361639499664307, "learning_rate": 6.695182350292662e-05, "loss": 1.4163, "step": 754 }, { "epoch": 1.0100334448160535, "grad_norm": 0.5715144276618958, "learning_rate": 6.690679873930662e-05, "loss": 1.498, "step": 755 }, { "epoch": 1.011371237458194, "grad_norm": 0.6086817383766174, "learning_rate": 6.686177397568662e-05, "loss": 1.7753, "step": 756 }, { "epoch": 1.0127090301003345, "grad_norm": 0.8653783202171326, "learning_rate": 6.681674921206665e-05, "loss": 1.5424, "step": 757 }, { "epoch": 1.0140468227424748, "grad_norm": 0.5191454291343689, "learning_rate": 6.677172444844665e-05, "loss": 1.7054, "step": 758 }, { "epoch": 1.0153846153846153, "grad_norm": 0.5097001791000366, "learning_rate": 6.672669968482665e-05, "loss": 1.4023, "step": 759 }, { "epoch": 1.0167224080267558, "grad_norm": 0.5936307311058044, "learning_rate": 6.668167492120667e-05, "loss": 1.1395, "step": 760 }, { "epoch": 1.0180602006688964, "grad_norm": 0.5675538778305054, "learning_rate": 6.663665015758668e-05, "loss": 1.3809, "step": 761 }, { "epoch": 1.0193979933110369, "grad_norm": 0.407277911901474, "learning_rate": 6.659162539396669e-05, "loss": 1.7331, "step": 762 }, { "epoch": 1.0207357859531772, "grad_norm": 1.353188157081604, "learning_rate": 6.654660063034668e-05, "loss": 1.409, "step": 763 }, { "epoch": 1.0220735785953177, "grad_norm": 0.3952758312225342, "learning_rate": 6.65015758667267e-05, "loss": 1.6219, "step": 764 }, { "epoch": 1.0234113712374582, "grad_norm": 0.6428177356719971, "learning_rate": 6.645655110310671e-05, "loss": 1.1543, "step": 765 }, { "epoch": 1.0247491638795987, "grad_norm": 1.0934861898422241, "learning_rate": 6.641152633948672e-05, "loss": 0.9668, "step": 766 }, { "epoch": 1.0260869565217392, "grad_norm": 0.4791591763496399, "learning_rate": 6.636650157586673e-05, "loss": 1.4271, "step": 767 }, { "epoch": 1.0274247491638795, "grad_norm": 0.5504617094993591, "learning_rate": 6.632147681224674e-05, "loss": 1.4648, "step": 768 }, { "epoch": 1.02876254180602, "grad_norm": 0.45765021443367004, "learning_rate": 6.627645204862675e-05, "loss": 1.3732, "step": 769 }, { "epoch": 1.0301003344481605, "grad_norm": 0.46277210116386414, "learning_rate": 6.623142728500676e-05, "loss": 1.0407, "step": 770 }, { "epoch": 1.031438127090301, "grad_norm": 0.5813681483268738, "learning_rate": 6.618640252138677e-05, "loss": 1.2612, "step": 771 }, { "epoch": 1.0327759197324415, "grad_norm": 0.408017635345459, "learning_rate": 6.614137775776677e-05, "loss": 1.5765, "step": 772 }, { "epoch": 1.034113712374582, "grad_norm": 0.6632124185562134, "learning_rate": 6.609635299414678e-05, "loss": 1.6674, "step": 773 }, { "epoch": 1.0354515050167223, "grad_norm": 0.563066303730011, "learning_rate": 6.605132823052679e-05, "loss": 1.4702, "step": 774 }, { "epoch": 1.0367892976588629, "grad_norm": 0.4824022948741913, "learning_rate": 6.60063034669068e-05, "loss": 1.4301, "step": 775 }, { "epoch": 1.0381270903010034, "grad_norm": 0.5674872398376465, "learning_rate": 6.596127870328681e-05, "loss": 1.1038, "step": 776 }, { "epoch": 1.0394648829431439, "grad_norm": 0.5925496816635132, "learning_rate": 6.591625393966682e-05, "loss": 1.3891, "step": 777 }, { "epoch": 1.0408026755852844, "grad_norm": 0.45445653796195984, "learning_rate": 6.587122917604684e-05, "loss": 1.4653, "step": 778 }, { "epoch": 1.0421404682274247, "grad_norm": 0.44357824325561523, "learning_rate": 6.582620441242683e-05, "loss": 1.4065, "step": 779 }, { "epoch": 1.0434782608695652, "grad_norm": 0.4146457612514496, "learning_rate": 6.578117964880684e-05, "loss": 1.6831, "step": 780 }, { "epoch": 1.0448160535117057, "grad_norm": 0.48196661472320557, "learning_rate": 6.573615488518685e-05, "loss": 1.5082, "step": 781 }, { "epoch": 1.0461538461538462, "grad_norm": 0.4578336179256439, "learning_rate": 6.569113012156687e-05, "loss": 1.7227, "step": 782 }, { "epoch": 1.0474916387959867, "grad_norm": 0.5198684334754944, "learning_rate": 6.564610535794687e-05, "loss": 1.1455, "step": 783 }, { "epoch": 1.048829431438127, "grad_norm": 0.497288316488266, "learning_rate": 6.560108059432688e-05, "loss": 1.6872, "step": 784 }, { "epoch": 1.0501672240802675, "grad_norm": 0.36458608508110046, "learning_rate": 6.55560558307069e-05, "loss": 2.2226, "step": 785 }, { "epoch": 1.051505016722408, "grad_norm": 0.8122197985649109, "learning_rate": 6.551103106708691e-05, "loss": 1.3132, "step": 786 }, { "epoch": 1.0528428093645485, "grad_norm": 0.4065374732017517, "learning_rate": 6.54660063034669e-05, "loss": 1.7364, "step": 787 }, { "epoch": 1.054180602006689, "grad_norm": 0.5058987140655518, "learning_rate": 6.542098153984691e-05, "loss": 1.6037, "step": 788 }, { "epoch": 1.0555183946488294, "grad_norm": 0.5093656778335571, "learning_rate": 6.537595677622693e-05, "loss": 1.4635, "step": 789 }, { "epoch": 1.0568561872909699, "grad_norm": 0.4434761106967926, "learning_rate": 6.533093201260694e-05, "loss": 1.7289, "step": 790 }, { "epoch": 1.0581939799331104, "grad_norm": 0.8123733997344971, "learning_rate": 6.528590724898694e-05, "loss": 1.3977, "step": 791 }, { "epoch": 1.0595317725752509, "grad_norm": 0.419659286737442, "learning_rate": 6.524088248536695e-05, "loss": 1.9813, "step": 792 }, { "epoch": 1.0608695652173914, "grad_norm": 0.4884895980358124, "learning_rate": 6.519585772174697e-05, "loss": 1.388, "step": 793 }, { "epoch": 1.0622073578595317, "grad_norm": 0.6140890717506409, "learning_rate": 6.515083295812698e-05, "loss": 1.0599, "step": 794 }, { "epoch": 1.0635451505016722, "grad_norm": 0.45217812061309814, "learning_rate": 6.510580819450697e-05, "loss": 1.5021, "step": 795 }, { "epoch": 1.0648829431438127, "grad_norm": 0.45027461647987366, "learning_rate": 6.5060783430887e-05, "loss": 1.4057, "step": 796 }, { "epoch": 1.0662207357859532, "grad_norm": 0.6308950185775757, "learning_rate": 6.5015758667267e-05, "loss": 1.3602, "step": 797 }, { "epoch": 1.0675585284280937, "grad_norm": 1.535835862159729, "learning_rate": 6.497073390364701e-05, "loss": 1.6195, "step": 798 }, { "epoch": 1.068896321070234, "grad_norm": 0.4812445640563965, "learning_rate": 6.492570914002702e-05, "loss": 1.6331, "step": 799 }, { "epoch": 1.0702341137123745, "grad_norm": 0.5334980487823486, "learning_rate": 6.488068437640703e-05, "loss": 1.1138, "step": 800 }, { "epoch": 1.071571906354515, "grad_norm": 0.6437579989433289, "learning_rate": 6.483565961278704e-05, "loss": 1.4453, "step": 801 }, { "epoch": 1.0729096989966556, "grad_norm": 1.0662591457366943, "learning_rate": 6.479063484916705e-05, "loss": 0.603, "step": 802 }, { "epoch": 1.074247491638796, "grad_norm": 0.6081445217132568, "learning_rate": 6.474561008554705e-05, "loss": 1.4157, "step": 803 }, { "epoch": 1.0755852842809364, "grad_norm": 0.5122383832931519, "learning_rate": 6.470058532192706e-05, "loss": 1.6912, "step": 804 }, { "epoch": 1.0769230769230769, "grad_norm": 0.5406622290611267, "learning_rate": 6.465556055830707e-05, "loss": 0.8931, "step": 805 }, { "epoch": 1.0782608695652174, "grad_norm": 0.6800340414047241, "learning_rate": 6.461053579468708e-05, "loss": 1.2054, "step": 806 }, { "epoch": 1.079598662207358, "grad_norm": 0.891855001449585, "learning_rate": 6.456551103106709e-05, "loss": 1.2156, "step": 807 }, { "epoch": 1.0809364548494984, "grad_norm": 0.4451705515384674, "learning_rate": 6.45204862674471e-05, "loss": 1.7379, "step": 808 }, { "epoch": 1.0822742474916387, "grad_norm": 1.2329106330871582, "learning_rate": 6.44754615038271e-05, "loss": 1.0769, "step": 809 }, { "epoch": 1.0836120401337792, "grad_norm": 0.46149975061416626, "learning_rate": 6.443043674020711e-05, "loss": 1.7287, "step": 810 }, { "epoch": 1.0849498327759197, "grad_norm": 0.4032789170742035, "learning_rate": 6.438541197658712e-05, "loss": 1.6798, "step": 811 }, { "epoch": 1.0862876254180602, "grad_norm": 0.5288923978805542, "learning_rate": 6.434038721296713e-05, "loss": 1.3158, "step": 812 }, { "epoch": 1.0876254180602007, "grad_norm": 0.4517413377761841, "learning_rate": 6.429536244934714e-05, "loss": 1.7064, "step": 813 }, { "epoch": 1.088963210702341, "grad_norm": 0.5029974579811096, "learning_rate": 6.425033768572716e-05, "loss": 1.0985, "step": 814 }, { "epoch": 1.0903010033444815, "grad_norm": 0.649246096611023, "learning_rate": 6.420531292210716e-05, "loss": 1.3266, "step": 815 }, { "epoch": 1.091638795986622, "grad_norm": 0.8336004614830017, "learning_rate": 6.416028815848717e-05, "loss": 0.9194, "step": 816 }, { "epoch": 1.0929765886287626, "grad_norm": 0.3941144347190857, "learning_rate": 6.411526339486717e-05, "loss": 1.7471, "step": 817 }, { "epoch": 1.094314381270903, "grad_norm": 0.38524457812309265, "learning_rate": 6.40702386312472e-05, "loss": 1.3742, "step": 818 }, { "epoch": 1.0956521739130434, "grad_norm": 0.45431381464004517, "learning_rate": 6.402521386762719e-05, "loss": 0.9257, "step": 819 }, { "epoch": 1.0969899665551839, "grad_norm": 0.5474141240119934, "learning_rate": 6.39801891040072e-05, "loss": 1.437, "step": 820 }, { "epoch": 1.0983277591973244, "grad_norm": 0.5354152321815491, "learning_rate": 6.393516434038722e-05, "loss": 1.4539, "step": 821 }, { "epoch": 1.099665551839465, "grad_norm": 0.6111787557601929, "learning_rate": 6.389013957676723e-05, "loss": 1.3616, "step": 822 }, { "epoch": 1.1010033444816054, "grad_norm": 0.5158681273460388, "learning_rate": 6.384511481314724e-05, "loss": 1.4201, "step": 823 }, { "epoch": 1.1023411371237457, "grad_norm": 0.6790710687637329, "learning_rate": 6.380009004952723e-05, "loss": 1.4573, "step": 824 }, { "epoch": 1.1036789297658862, "grad_norm": 0.6283074021339417, "learning_rate": 6.375506528590726e-05, "loss": 1.0444, "step": 825 }, { "epoch": 1.1050167224080267, "grad_norm": 0.41498756408691406, "learning_rate": 6.371004052228727e-05, "loss": 1.3567, "step": 826 }, { "epoch": 1.1063545150501672, "grad_norm": 0.60197913646698, "learning_rate": 6.366501575866727e-05, "loss": 1.1496, "step": 827 }, { "epoch": 1.1076923076923078, "grad_norm": 0.4228304922580719, "learning_rate": 6.361999099504727e-05, "loss": 1.3917, "step": 828 }, { "epoch": 1.109030100334448, "grad_norm": 0.5080685019493103, "learning_rate": 6.357496623142729e-05, "loss": 1.8623, "step": 829 }, { "epoch": 1.1103678929765886, "grad_norm": 0.49745386838912964, "learning_rate": 6.35299414678073e-05, "loss": 1.421, "step": 830 }, { "epoch": 1.111705685618729, "grad_norm": 0.8473727107048035, "learning_rate": 6.348491670418731e-05, "loss": 0.7793, "step": 831 }, { "epoch": 1.1130434782608696, "grad_norm": 0.5693299174308777, "learning_rate": 6.343989194056732e-05, "loss": 1.0338, "step": 832 }, { "epoch": 1.11438127090301, "grad_norm": 0.5039275884628296, "learning_rate": 6.339486717694733e-05, "loss": 1.0676, "step": 833 }, { "epoch": 1.1157190635451506, "grad_norm": 0.5785884261131287, "learning_rate": 6.334984241332733e-05, "loss": 1.4532, "step": 834 }, { "epoch": 1.117056856187291, "grad_norm": 0.4384928047657013, "learning_rate": 6.330481764970734e-05, "loss": 1.9992, "step": 835 }, { "epoch": 1.1183946488294314, "grad_norm": 0.6519571542739868, "learning_rate": 6.325979288608735e-05, "loss": 1.8105, "step": 836 }, { "epoch": 1.119732441471572, "grad_norm": 0.5303328633308411, "learning_rate": 6.321476812246736e-05, "loss": 1.5492, "step": 837 }, { "epoch": 1.1210702341137124, "grad_norm": 0.5411662459373474, "learning_rate": 6.316974335884737e-05, "loss": 1.4728, "step": 838 }, { "epoch": 1.122408026755853, "grad_norm": 0.8510624170303345, "learning_rate": 6.312471859522738e-05, "loss": 1.0466, "step": 839 }, { "epoch": 1.1237458193979932, "grad_norm": 0.3836539685726166, "learning_rate": 6.307969383160739e-05, "loss": 1.1762, "step": 840 }, { "epoch": 1.1250836120401337, "grad_norm": 0.4980509877204895, "learning_rate": 6.30346690679874e-05, "loss": 1.7758, "step": 841 }, { "epoch": 1.1264214046822743, "grad_norm": 0.364563524723053, "learning_rate": 6.29896443043674e-05, "loss": 1.2367, "step": 842 }, { "epoch": 1.1277591973244148, "grad_norm": 0.7302637100219727, "learning_rate": 6.294461954074741e-05, "loss": 1.5548, "step": 843 }, { "epoch": 1.1290969899665553, "grad_norm": 0.5207593441009521, "learning_rate": 6.289959477712742e-05, "loss": 1.3097, "step": 844 }, { "epoch": 1.1304347826086956, "grad_norm": 0.519754946231842, "learning_rate": 6.285457001350743e-05, "loss": 1.5021, "step": 845 }, { "epoch": 1.131772575250836, "grad_norm": 0.5610724687576294, "learning_rate": 6.280954524988744e-05, "loss": 1.4954, "step": 846 }, { "epoch": 1.1331103678929766, "grad_norm": 0.44261258840560913, "learning_rate": 6.276452048626746e-05, "loss": 1.6946, "step": 847 }, { "epoch": 1.134448160535117, "grad_norm": 0.5406861305236816, "learning_rate": 6.271949572264745e-05, "loss": 1.5805, "step": 848 }, { "epoch": 1.1357859531772576, "grad_norm": 0.6939743757247925, "learning_rate": 6.267447095902746e-05, "loss": 1.3863, "step": 849 }, { "epoch": 1.137123745819398, "grad_norm": 0.7074916362762451, "learning_rate": 6.262944619540749e-05, "loss": 0.8737, "step": 850 }, { "epoch": 1.1384615384615384, "grad_norm": 0.5969648957252502, "learning_rate": 6.25844214317875e-05, "loss": 1.5225, "step": 851 }, { "epoch": 1.139799331103679, "grad_norm": 0.542970597743988, "learning_rate": 6.253939666816749e-05, "loss": 1.5206, "step": 852 }, { "epoch": 1.1411371237458194, "grad_norm": 0.4956679344177246, "learning_rate": 6.24943719045475e-05, "loss": 1.6725, "step": 853 }, { "epoch": 1.14247491638796, "grad_norm": 0.620997965335846, "learning_rate": 6.244934714092752e-05, "loss": 1.3671, "step": 854 }, { "epoch": 1.1438127090301002, "grad_norm": 0.5618900060653687, "learning_rate": 6.240432237730753e-05, "loss": 1.4067, "step": 855 }, { "epoch": 1.1451505016722408, "grad_norm": 0.847754716873169, "learning_rate": 6.235929761368752e-05, "loss": 1.1148, "step": 856 }, { "epoch": 1.1464882943143813, "grad_norm": 0.5353214144706726, "learning_rate": 6.231427285006755e-05, "loss": 1.5326, "step": 857 }, { "epoch": 1.1478260869565218, "grad_norm": 0.6438226699829102, "learning_rate": 6.226924808644755e-05, "loss": 1.651, "step": 858 }, { "epoch": 1.1491638795986623, "grad_norm": 0.5264010429382324, "learning_rate": 6.222422332282756e-05, "loss": 1.3338, "step": 859 }, { "epoch": 1.1505016722408028, "grad_norm": 0.4603655934333801, "learning_rate": 6.217919855920756e-05, "loss": 1.1312, "step": 860 }, { "epoch": 1.151839464882943, "grad_norm": 0.850045919418335, "learning_rate": 6.213417379558758e-05, "loss": 1.1957, "step": 861 }, { "epoch": 1.1531772575250836, "grad_norm": 0.5338031053543091, "learning_rate": 6.208914903196759e-05, "loss": 1.9606, "step": 862 }, { "epoch": 1.154515050167224, "grad_norm": 0.7752164006233215, "learning_rate": 6.20441242683476e-05, "loss": 1.2606, "step": 863 }, { "epoch": 1.1558528428093646, "grad_norm": 0.532210111618042, "learning_rate": 6.199909950472759e-05, "loss": 1.7095, "step": 864 }, { "epoch": 1.1571906354515051, "grad_norm": 0.5509344339370728, "learning_rate": 6.195407474110761e-05, "loss": 1.047, "step": 865 }, { "epoch": 1.1585284280936454, "grad_norm": 0.43656522035598755, "learning_rate": 6.190904997748762e-05, "loss": 1.5622, "step": 866 }, { "epoch": 1.159866220735786, "grad_norm": 0.5709621906280518, "learning_rate": 6.186402521386763e-05, "loss": 1.425, "step": 867 }, { "epoch": 1.1612040133779264, "grad_norm": 0.4696739912033081, "learning_rate": 6.181900045024764e-05, "loss": 1.4856, "step": 868 }, { "epoch": 1.162541806020067, "grad_norm": 0.4259323179721832, "learning_rate": 6.177397568662765e-05, "loss": 1.1235, "step": 869 }, { "epoch": 1.1638795986622075, "grad_norm": 0.5247973799705505, "learning_rate": 6.172895092300766e-05, "loss": 1.5293, "step": 870 }, { "epoch": 1.1652173913043478, "grad_norm": 0.510625422000885, "learning_rate": 6.168392615938767e-05, "loss": 1.5001, "step": 871 }, { "epoch": 1.1665551839464883, "grad_norm": 0.5341862440109253, "learning_rate": 6.163890139576767e-05, "loss": 1.4726, "step": 872 }, { "epoch": 1.1678929765886288, "grad_norm": 0.42538025975227356, "learning_rate": 6.159387663214768e-05, "loss": 1.1921, "step": 873 }, { "epoch": 1.1692307692307693, "grad_norm": 0.5730955600738525, "learning_rate": 6.154885186852769e-05, "loss": 1.2907, "step": 874 }, { "epoch": 1.1705685618729098, "grad_norm": 0.3825736939907074, "learning_rate": 6.150382710490771e-05, "loss": 1.7252, "step": 875 }, { "epoch": 1.17190635451505, "grad_norm": 0.4969604015350342, "learning_rate": 6.145880234128771e-05, "loss": 1.4712, "step": 876 }, { "epoch": 1.1732441471571906, "grad_norm": 0.4480053782463074, "learning_rate": 6.141377757766772e-05, "loss": 1.5548, "step": 877 }, { "epoch": 1.1745819397993311, "grad_norm": 1.1541557312011719, "learning_rate": 6.136875281404773e-05, "loss": 1.3099, "step": 878 }, { "epoch": 1.1759197324414716, "grad_norm": 0.5004495978355408, "learning_rate": 6.132372805042775e-05, "loss": 1.7068, "step": 879 }, { "epoch": 1.1772575250836121, "grad_norm": 0.7697363495826721, "learning_rate": 6.127870328680774e-05, "loss": 1.5432, "step": 880 }, { "epoch": 1.1785953177257524, "grad_norm": 0.9075251221656799, "learning_rate": 6.123367852318775e-05, "loss": 1.0909, "step": 881 }, { "epoch": 1.179933110367893, "grad_norm": 0.509414553642273, "learning_rate": 6.118865375956776e-05, "loss": 1.3679, "step": 882 }, { "epoch": 1.1812709030100335, "grad_norm": 0.924888014793396, "learning_rate": 6.114362899594778e-05, "loss": 1.5587, "step": 883 }, { "epoch": 1.182608695652174, "grad_norm": 0.6355368494987488, "learning_rate": 6.109860423232778e-05, "loss": 1.1649, "step": 884 }, { "epoch": 1.1839464882943145, "grad_norm": 0.550225019454956, "learning_rate": 6.105357946870779e-05, "loss": 1.5418, "step": 885 }, { "epoch": 1.1852842809364548, "grad_norm": 0.5702582001686096, "learning_rate": 6.100855470508781e-05, "loss": 0.8423, "step": 886 }, { "epoch": 1.1866220735785953, "grad_norm": 0.4942728877067566, "learning_rate": 6.096352994146781e-05, "loss": 0.9389, "step": 887 }, { "epoch": 1.1879598662207358, "grad_norm": 0.5035423636436462, "learning_rate": 6.091850517784782e-05, "loss": 1.659, "step": 888 }, { "epoch": 1.1892976588628763, "grad_norm": 0.3537347614765167, "learning_rate": 6.087348041422783e-05, "loss": 1.6727, "step": 889 }, { "epoch": 1.1906354515050168, "grad_norm": 0.4293750822544098, "learning_rate": 6.082845565060784e-05, "loss": 1.9057, "step": 890 }, { "epoch": 1.191973244147157, "grad_norm": 0.6442123055458069, "learning_rate": 6.0783430886987844e-05, "loss": 1.2067, "step": 891 }, { "epoch": 1.1933110367892976, "grad_norm": 0.48610013723373413, "learning_rate": 6.073840612336785e-05, "loss": 1.6491, "step": 892 }, { "epoch": 1.1946488294314381, "grad_norm": 0.5059003233909607, "learning_rate": 6.069338135974787e-05, "loss": 1.5232, "step": 893 }, { "epoch": 1.1959866220735786, "grad_norm": 0.4608552157878876, "learning_rate": 6.0648356596127876e-05, "loss": 1.652, "step": 894 }, { "epoch": 1.1973244147157192, "grad_norm": 0.5745627284049988, "learning_rate": 6.060333183250788e-05, "loss": 1.3177, "step": 895 }, { "epoch": 1.1986622073578594, "grad_norm": 0.5850013494491577, "learning_rate": 6.055830706888789e-05, "loss": 0.9728, "step": 896 }, { "epoch": 1.2, "grad_norm": 0.595733642578125, "learning_rate": 6.05132823052679e-05, "loss": 1.4109, "step": 897 }, { "epoch": 1.2013377926421405, "grad_norm": 0.5972495675086975, "learning_rate": 6.046825754164791e-05, "loss": 1.762, "step": 898 }, { "epoch": 1.202675585284281, "grad_norm": 0.7657443881034851, "learning_rate": 6.042323277802792e-05, "loss": 1.2932, "step": 899 }, { "epoch": 1.2040133779264215, "grad_norm": 0.5386178493499756, "learning_rate": 6.037820801440792e-05, "loss": 0.8722, "step": 900 }, { "epoch": 1.2053511705685618, "grad_norm": 0.4514966607093811, "learning_rate": 6.033318325078794e-05, "loss": 1.2226, "step": 901 }, { "epoch": 1.2066889632107023, "grad_norm": 0.561952531337738, "learning_rate": 6.0288158487167945e-05, "loss": 1.3083, "step": 902 }, { "epoch": 1.2080267558528428, "grad_norm": 0.46882307529449463, "learning_rate": 6.0243133723547954e-05, "loss": 1.6229, "step": 903 }, { "epoch": 1.2093645484949833, "grad_norm": 0.41895994544029236, "learning_rate": 6.019810895992797e-05, "loss": 1.6748, "step": 904 }, { "epoch": 1.2107023411371238, "grad_norm": 0.4929127097129822, "learning_rate": 6.015308419630797e-05, "loss": 1.7661, "step": 905 }, { "epoch": 1.2120401337792641, "grad_norm": 0.5943595170974731, "learning_rate": 6.010805943268798e-05, "loss": 1.3203, "step": 906 }, { "epoch": 1.2133779264214046, "grad_norm": 0.5031788945198059, "learning_rate": 6.006303466906799e-05, "loss": 1.5386, "step": 907 }, { "epoch": 1.2147157190635451, "grad_norm": 0.5768786072731018, "learning_rate": 6.0018009905448004e-05, "loss": 1.1965, "step": 908 }, { "epoch": 1.2160535117056857, "grad_norm": 0.3699272871017456, "learning_rate": 5.9972985141828005e-05, "loss": 1.2933, "step": 909 }, { "epoch": 1.2173913043478262, "grad_norm": 0.7275379300117493, "learning_rate": 5.9927960378208014e-05, "loss": 1.3549, "step": 910 }, { "epoch": 1.2187290969899665, "grad_norm": 0.48716142773628235, "learning_rate": 5.988293561458803e-05, "loss": 1.0621, "step": 911 }, { "epoch": 1.220066889632107, "grad_norm": 0.8821597695350647, "learning_rate": 5.983791085096804e-05, "loss": 1.3591, "step": 912 }, { "epoch": 1.2214046822742475, "grad_norm": 0.48447397351264954, "learning_rate": 5.9792886087348046e-05, "loss": 1.1019, "step": 913 }, { "epoch": 1.222742474916388, "grad_norm": 0.5043441653251648, "learning_rate": 5.974786132372805e-05, "loss": 1.7618, "step": 914 }, { "epoch": 1.2240802675585285, "grad_norm": 0.6771908402442932, "learning_rate": 5.9702836560108064e-05, "loss": 1.3763, "step": 915 }, { "epoch": 1.2254180602006688, "grad_norm": 0.4996180832386017, "learning_rate": 5.965781179648807e-05, "loss": 1.7422, "step": 916 }, { "epoch": 1.2267558528428093, "grad_norm": 0.32585346698760986, "learning_rate": 5.961278703286808e-05, "loss": 1.9686, "step": 917 }, { "epoch": 1.2280936454849498, "grad_norm": 0.4633427560329437, "learning_rate": 5.956776226924808e-05, "loss": 1.619, "step": 918 }, { "epoch": 1.2294314381270903, "grad_norm": 0.47458308935165405, "learning_rate": 5.95227375056281e-05, "loss": 1.8054, "step": 919 }, { "epoch": 1.2307692307692308, "grad_norm": 0.5091044902801514, "learning_rate": 5.9477712742008107e-05, "loss": 1.7261, "step": 920 }, { "epoch": 1.2321070234113711, "grad_norm": 0.5245359539985657, "learning_rate": 5.9432687978388115e-05, "loss": 1.9436, "step": 921 }, { "epoch": 1.2334448160535116, "grad_norm": 0.6491963267326355, "learning_rate": 5.938766321476813e-05, "loss": 1.4365, "step": 922 }, { "epoch": 1.2347826086956522, "grad_norm": 0.3472338616847992, "learning_rate": 5.934263845114814e-05, "loss": 1.1998, "step": 923 }, { "epoch": 1.2361204013377927, "grad_norm": 0.520018458366394, "learning_rate": 5.929761368752814e-05, "loss": 1.409, "step": 924 }, { "epoch": 1.2374581939799332, "grad_norm": 0.3718913197517395, "learning_rate": 5.925258892390815e-05, "loss": 1.1047, "step": 925 }, { "epoch": 1.2387959866220735, "grad_norm": 0.5046524405479431, "learning_rate": 5.9207564160288165e-05, "loss": 1.6533, "step": 926 }, { "epoch": 1.240133779264214, "grad_norm": 0.49469617009162903, "learning_rate": 5.9162539396668173e-05, "loss": 1.3681, "step": 927 }, { "epoch": 1.2414715719063545, "grad_norm": 0.5294374227523804, "learning_rate": 5.9117514633048175e-05, "loss": 1.8672, "step": 928 }, { "epoch": 1.242809364548495, "grad_norm": 0.44988906383514404, "learning_rate": 5.907248986942819e-05, "loss": 1.3822, "step": 929 }, { "epoch": 1.2441471571906355, "grad_norm": 0.4916761517524719, "learning_rate": 5.90274651058082e-05, "loss": 1.3785, "step": 930 }, { "epoch": 1.2454849498327758, "grad_norm": 0.7246065139770508, "learning_rate": 5.898244034218821e-05, "loss": 1.0528, "step": 931 }, { "epoch": 1.2468227424749163, "grad_norm": 0.5096299648284912, "learning_rate": 5.893741557856821e-05, "loss": 1.5648, "step": 932 }, { "epoch": 1.2481605351170568, "grad_norm": 0.5564544796943665, "learning_rate": 5.889239081494823e-05, "loss": 1.4249, "step": 933 }, { "epoch": 1.2494983277591973, "grad_norm": 0.4242939352989197, "learning_rate": 5.8847366051328234e-05, "loss": 1.6266, "step": 934 }, { "epoch": 1.2508361204013378, "grad_norm": 0.965453565120697, "learning_rate": 5.880234128770824e-05, "loss": 1.1993, "step": 935 }, { "epoch": 1.2521739130434781, "grad_norm": 0.8625237941741943, "learning_rate": 5.8757316524088244e-05, "loss": 1.2209, "step": 936 }, { "epoch": 1.2535117056856186, "grad_norm": 0.542377769947052, "learning_rate": 5.8712291760468266e-05, "loss": 1.7549, "step": 937 }, { "epoch": 1.2548494983277592, "grad_norm": 0.5045947432518005, "learning_rate": 5.866726699684827e-05, "loss": 1.6812, "step": 938 }, { "epoch": 1.2561872909698997, "grad_norm": 0.5573652982711792, "learning_rate": 5.8622242233228277e-05, "loss": 1.5869, "step": 939 }, { "epoch": 1.2575250836120402, "grad_norm": 0.771754264831543, "learning_rate": 5.857721746960829e-05, "loss": 1.502, "step": 940 }, { "epoch": 1.2588628762541805, "grad_norm": 0.5458933711051941, "learning_rate": 5.85321927059883e-05, "loss": 1.7175, "step": 941 }, { "epoch": 1.2602006688963212, "grad_norm": 0.4187781810760498, "learning_rate": 5.84871679423683e-05, "loss": 1.7161, "step": 942 }, { "epoch": 1.2615384615384615, "grad_norm": 0.5237634778022766, "learning_rate": 5.844214317874831e-05, "loss": 0.8686, "step": 943 }, { "epoch": 1.262876254180602, "grad_norm": 0.32764026522636414, "learning_rate": 5.8397118415128326e-05, "loss": 1.4472, "step": 944 }, { "epoch": 1.2642140468227425, "grad_norm": 0.7707194685935974, "learning_rate": 5.8352093651508335e-05, "loss": 1.1732, "step": 945 }, { "epoch": 1.2655518394648828, "grad_norm": 0.4281066656112671, "learning_rate": 5.830706888788834e-05, "loss": 1.6155, "step": 946 }, { "epoch": 1.2668896321070235, "grad_norm": 0.5570145845413208, "learning_rate": 5.826204412426836e-05, "loss": 1.1536, "step": 947 }, { "epoch": 1.2682274247491638, "grad_norm": 0.7634395360946655, "learning_rate": 5.821701936064836e-05, "loss": 1.1469, "step": 948 }, { "epoch": 1.2695652173913043, "grad_norm": 0.5208802819252014, "learning_rate": 5.817199459702837e-05, "loss": 1.382, "step": 949 }, { "epoch": 1.2709030100334449, "grad_norm": 0.5326187610626221, "learning_rate": 5.812696983340837e-05, "loss": 1.5838, "step": 950 }, { "epoch": 1.2722408026755851, "grad_norm": 0.506121814250946, "learning_rate": 5.808194506978839e-05, "loss": 1.5555, "step": 951 }, { "epoch": 1.2735785953177259, "grad_norm": 0.6669555902481079, "learning_rate": 5.8036920306168395e-05, "loss": 1.3553, "step": 952 }, { "epoch": 1.2749163879598662, "grad_norm": 0.7633807063102722, "learning_rate": 5.7991895542548404e-05, "loss": 1.3975, "step": 953 }, { "epoch": 1.2762541806020067, "grad_norm": 0.6958717107772827, "learning_rate": 5.7946870778928405e-05, "loss": 1.3524, "step": 954 }, { "epoch": 1.2775919732441472, "grad_norm": 0.688834011554718, "learning_rate": 5.790184601530843e-05, "loss": 1.2323, "step": 955 }, { "epoch": 1.2789297658862877, "grad_norm": 0.8541237115859985, "learning_rate": 5.785682125168843e-05, "loss": 1.172, "step": 956 }, { "epoch": 1.2802675585284282, "grad_norm": 0.4427731931209564, "learning_rate": 5.781179648806844e-05, "loss": 1.6397, "step": 957 }, { "epoch": 1.2816053511705685, "grad_norm": 0.5415353775024414, "learning_rate": 5.776677172444845e-05, "loss": 1.3547, "step": 958 }, { "epoch": 1.282943143812709, "grad_norm": 0.6246786117553711, "learning_rate": 5.772174696082846e-05, "loss": 1.523, "step": 959 }, { "epoch": 1.2842809364548495, "grad_norm": 0.477402001619339, "learning_rate": 5.7676722197208464e-05, "loss": 1.652, "step": 960 }, { "epoch": 1.28561872909699, "grad_norm": 0.4590649902820587, "learning_rate": 5.763169743358847e-05, "loss": 1.7355, "step": 961 }, { "epoch": 1.2869565217391306, "grad_norm": 0.43109437823295593, "learning_rate": 5.758667266996849e-05, "loss": 1.8118, "step": 962 }, { "epoch": 1.2882943143812708, "grad_norm": 0.7176820635795593, "learning_rate": 5.7541647906348496e-05, "loss": 0.9641, "step": 963 }, { "epoch": 1.2896321070234114, "grad_norm": 0.5545156598091125, "learning_rate": 5.74966231427285e-05, "loss": 1.3134, "step": 964 }, { "epoch": 1.2909698996655519, "grad_norm": 0.7610632181167603, "learning_rate": 5.745159837910852e-05, "loss": 1.5356, "step": 965 }, { "epoch": 1.2923076923076924, "grad_norm": 0.5686385035514832, "learning_rate": 5.740657361548852e-05, "loss": 1.4585, "step": 966 }, { "epoch": 1.293645484949833, "grad_norm": 0.7597028613090515, "learning_rate": 5.736154885186853e-05, "loss": 1.5304, "step": 967 }, { "epoch": 1.2949832775919732, "grad_norm": 0.5620704889297485, "learning_rate": 5.731652408824853e-05, "loss": 1.4065, "step": 968 }, { "epoch": 1.2963210702341137, "grad_norm": 0.46487486362457275, "learning_rate": 5.7271499324628554e-05, "loss": 1.6403, "step": 969 }, { "epoch": 1.2976588628762542, "grad_norm": 0.6293439865112305, "learning_rate": 5.7226474561008556e-05, "loss": 1.4635, "step": 970 }, { "epoch": 1.2989966555183947, "grad_norm": 0.9160875678062439, "learning_rate": 5.7181449797388565e-05, "loss": 1.0431, "step": 971 }, { "epoch": 1.3003344481605352, "grad_norm": 0.8708162903785706, "learning_rate": 5.7136425033768573e-05, "loss": 1.5403, "step": 972 }, { "epoch": 1.3016722408026755, "grad_norm": 0.9074984788894653, "learning_rate": 5.709140027014859e-05, "loss": 1.2068, "step": 973 }, { "epoch": 1.303010033444816, "grad_norm": 0.6425647139549255, "learning_rate": 5.704637550652859e-05, "loss": 1.2269, "step": 974 }, { "epoch": 1.3043478260869565, "grad_norm": 0.57787024974823, "learning_rate": 5.70013507429086e-05, "loss": 1.5209, "step": 975 }, { "epoch": 1.305685618729097, "grad_norm": 0.6588714122772217, "learning_rate": 5.6956325979288615e-05, "loss": 1.4408, "step": 976 }, { "epoch": 1.3070234113712376, "grad_norm": 0.4849056899547577, "learning_rate": 5.691130121566862e-05, "loss": 1.416, "step": 977 }, { "epoch": 1.3083612040133779, "grad_norm": 0.5900976061820984, "learning_rate": 5.6866276452048625e-05, "loss": 1.3661, "step": 978 }, { "epoch": 1.3096989966555184, "grad_norm": 0.528537392616272, "learning_rate": 5.6821251688428634e-05, "loss": 1.4337, "step": 979 }, { "epoch": 1.3110367892976589, "grad_norm": 0.6017236113548279, "learning_rate": 5.677622692480865e-05, "loss": 1.3365, "step": 980 }, { "epoch": 1.3123745819397994, "grad_norm": 0.5581938624382019, "learning_rate": 5.673120216118866e-05, "loss": 1.0921, "step": 981 }, { "epoch": 1.31371237458194, "grad_norm": 0.4984164535999298, "learning_rate": 5.6686177397568666e-05, "loss": 1.7687, "step": 982 }, { "epoch": 1.3150501672240802, "grad_norm": 0.6263704895973206, "learning_rate": 5.664115263394868e-05, "loss": 0.8377, "step": 983 }, { "epoch": 1.3163879598662207, "grad_norm": 0.5864318013191223, "learning_rate": 5.659612787032868e-05, "loss": 1.2958, "step": 984 }, { "epoch": 1.3177257525083612, "grad_norm": 0.5671847462654114, "learning_rate": 5.655110310670869e-05, "loss": 1.4972, "step": 985 }, { "epoch": 1.3190635451505017, "grad_norm": 1.1298816204071045, "learning_rate": 5.65060783430887e-05, "loss": 0.9593, "step": 986 }, { "epoch": 1.3204013377926422, "grad_norm": 0.5613873600959778, "learning_rate": 5.6461053579468716e-05, "loss": 1.7714, "step": 987 }, { "epoch": 1.3217391304347825, "grad_norm": 0.4778936207294464, "learning_rate": 5.641602881584872e-05, "loss": 1.4784, "step": 988 }, { "epoch": 1.323076923076923, "grad_norm": 0.5867298245429993, "learning_rate": 5.6371004052228726e-05, "loss": 1.6855, "step": 989 }, { "epoch": 1.3244147157190636, "grad_norm": 0.67825847864151, "learning_rate": 5.6325979288608735e-05, "loss": 1.3786, "step": 990 }, { "epoch": 1.325752508361204, "grad_norm": 0.566182017326355, "learning_rate": 5.628095452498875e-05, "loss": 1.5857, "step": 991 }, { "epoch": 1.3270903010033446, "grad_norm": 0.6108747124671936, "learning_rate": 5.623592976136875e-05, "loss": 0.9231, "step": 992 }, { "epoch": 1.3284280936454849, "grad_norm": 0.3762817680835724, "learning_rate": 5.619090499774876e-05, "loss": 1.8333, "step": 993 }, { "epoch": 1.3297658862876254, "grad_norm": 0.42532092332839966, "learning_rate": 5.6145880234128776e-05, "loss": 1.4002, "step": 994 }, { "epoch": 1.3311036789297659, "grad_norm": 0.409871369600296, "learning_rate": 5.6100855470508785e-05, "loss": 1.5429, "step": 995 }, { "epoch": 1.3324414715719064, "grad_norm": 0.4747501313686371, "learning_rate": 5.605583070688879e-05, "loss": 1.2592, "step": 996 }, { "epoch": 1.333779264214047, "grad_norm": 0.4586211144924164, "learning_rate": 5.6010805943268795e-05, "loss": 1.5638, "step": 997 }, { "epoch": 1.3351170568561872, "grad_norm": 0.48024147748947144, "learning_rate": 5.596578117964881e-05, "loss": 1.3406, "step": 998 }, { "epoch": 1.3364548494983277, "grad_norm": 0.5290399193763733, "learning_rate": 5.592075641602882e-05, "loss": 1.4827, "step": 999 }, { "epoch": 1.3377926421404682, "grad_norm": 0.6649457812309265, "learning_rate": 5.587573165240883e-05, "loss": 1.3084, "step": 1000 }, { "epoch": 1.3391304347826087, "grad_norm": 0.5224519371986389, "learning_rate": 5.583070688878884e-05, "loss": 1.4575, "step": 1001 }, { "epoch": 1.3404682274247492, "grad_norm": 0.5149902105331421, "learning_rate": 5.5785682125168845e-05, "loss": 1.2122, "step": 1002 }, { "epoch": 1.3418060200668895, "grad_norm": 0.5006117820739746, "learning_rate": 5.574065736154885e-05, "loss": 1.6049, "step": 1003 }, { "epoch": 1.34314381270903, "grad_norm": 1.1320693492889404, "learning_rate": 5.569563259792886e-05, "loss": 1.076, "step": 1004 }, { "epoch": 1.3444816053511706, "grad_norm": 0.5004568099975586, "learning_rate": 5.565060783430888e-05, "loss": 1.7695, "step": 1005 }, { "epoch": 1.345819397993311, "grad_norm": 0.7288807034492493, "learning_rate": 5.5605583070688886e-05, "loss": 1.5639, "step": 1006 }, { "epoch": 1.3471571906354516, "grad_norm": 0.5516156554222107, "learning_rate": 5.556055830706889e-05, "loss": 1.2222, "step": 1007 }, { "epoch": 1.3484949832775919, "grad_norm": 0.5309849381446838, "learning_rate": 5.5515533543448896e-05, "loss": 1.451, "step": 1008 }, { "epoch": 1.3498327759197324, "grad_norm": 0.5378895401954651, "learning_rate": 5.547050877982891e-05, "loss": 1.4105, "step": 1009 }, { "epoch": 1.351170568561873, "grad_norm": 0.5446533560752869, "learning_rate": 5.542548401620892e-05, "loss": 1.6093, "step": 1010 }, { "epoch": 1.3525083612040134, "grad_norm": 0.4849577844142914, "learning_rate": 5.538045925258892e-05, "loss": 1.4458, "step": 1011 }, { "epoch": 1.353846153846154, "grad_norm": 0.7450141906738281, "learning_rate": 5.533543448896894e-05, "loss": 0.6454, "step": 1012 }, { "epoch": 1.3551839464882942, "grad_norm": 0.4997408986091614, "learning_rate": 5.5290409725348946e-05, "loss": 1.5501, "step": 1013 }, { "epoch": 1.3565217391304347, "grad_norm": 0.49663230776786804, "learning_rate": 5.5245384961728954e-05, "loss": 1.2957, "step": 1014 }, { "epoch": 1.3578595317725752, "grad_norm": 0.47518983483314514, "learning_rate": 5.5200360198108956e-05, "loss": 1.5983, "step": 1015 }, { "epoch": 1.3591973244147157, "grad_norm": 0.3959956765174866, "learning_rate": 5.515533543448897e-05, "loss": 1.5998, "step": 1016 }, { "epoch": 1.3605351170568563, "grad_norm": 0.5632640719413757, "learning_rate": 5.511031067086898e-05, "loss": 1.4659, "step": 1017 }, { "epoch": 1.3618729096989965, "grad_norm": 0.6305011510848999, "learning_rate": 5.506528590724899e-05, "loss": 1.1801, "step": 1018 }, { "epoch": 1.363210702341137, "grad_norm": 0.6637885570526123, "learning_rate": 5.5020261143629004e-05, "loss": 1.137, "step": 1019 }, { "epoch": 1.3645484949832776, "grad_norm": 0.5782528519630432, "learning_rate": 5.497523638000901e-05, "loss": 1.4629, "step": 1020 }, { "epoch": 1.365886287625418, "grad_norm": 0.5602366924285889, "learning_rate": 5.4930211616389015e-05, "loss": 1.3326, "step": 1021 }, { "epoch": 1.3672240802675586, "grad_norm": 0.6684134006500244, "learning_rate": 5.488518685276902e-05, "loss": 1.3521, "step": 1022 }, { "epoch": 1.3685618729096989, "grad_norm": 0.7985564470291138, "learning_rate": 5.484016208914904e-05, "loss": 0.8858, "step": 1023 }, { "epoch": 1.3698996655518394, "grad_norm": 0.5301754474639893, "learning_rate": 5.479513732552905e-05, "loss": 1.5861, "step": 1024 }, { "epoch": 1.37123745819398, "grad_norm": 0.5510058403015137, "learning_rate": 5.475011256190905e-05, "loss": 1.4559, "step": 1025 }, { "epoch": 1.3725752508361204, "grad_norm": 0.8531842827796936, "learning_rate": 5.470508779828906e-05, "loss": 1.4512, "step": 1026 }, { "epoch": 1.373913043478261, "grad_norm": 0.5832063555717468, "learning_rate": 5.466006303466907e-05, "loss": 1.392, "step": 1027 }, { "epoch": 1.3752508361204012, "grad_norm": 0.587691068649292, "learning_rate": 5.461503827104908e-05, "loss": 1.5835, "step": 1028 }, { "epoch": 1.3765886287625417, "grad_norm": 0.5541633367538452, "learning_rate": 5.457001350742908e-05, "loss": 1.5491, "step": 1029 }, { "epoch": 1.3779264214046822, "grad_norm": 0.46723368763923645, "learning_rate": 5.4524988743809105e-05, "loss": 1.5469, "step": 1030 }, { "epoch": 1.3792642140468228, "grad_norm": 0.5442748069763184, "learning_rate": 5.447996398018911e-05, "loss": 1.4914, "step": 1031 }, { "epoch": 1.3806020066889633, "grad_norm": 0.5219207406044006, "learning_rate": 5.4434939216569116e-05, "loss": 1.7056, "step": 1032 }, { "epoch": 1.3819397993311036, "grad_norm": 0.5391073822975159, "learning_rate": 5.438991445294912e-05, "loss": 1.1345, "step": 1033 }, { "epoch": 1.383277591973244, "grad_norm": 0.5431692004203796, "learning_rate": 5.434488968932914e-05, "loss": 1.6605, "step": 1034 }, { "epoch": 1.3846153846153846, "grad_norm": 0.5454650521278381, "learning_rate": 5.429986492570914e-05, "loss": 1.6891, "step": 1035 }, { "epoch": 1.385953177257525, "grad_norm": 0.5413758754730225, "learning_rate": 5.425484016208915e-05, "loss": 1.5395, "step": 1036 }, { "epoch": 1.3872909698996656, "grad_norm": 0.44604915380477905, "learning_rate": 5.4209815398469166e-05, "loss": 1.6368, "step": 1037 }, { "epoch": 1.388628762541806, "grad_norm": 0.6125640869140625, "learning_rate": 5.4164790634849174e-05, "loss": 1.2193, "step": 1038 }, { "epoch": 1.3899665551839464, "grad_norm": 0.5223219990730286, "learning_rate": 5.4119765871229176e-05, "loss": 1.511, "step": 1039 }, { "epoch": 1.391304347826087, "grad_norm": 0.7312715649604797, "learning_rate": 5.4074741107609185e-05, "loss": 1.4429, "step": 1040 }, { "epoch": 1.3926421404682274, "grad_norm": 0.5728110671043396, "learning_rate": 5.40297163439892e-05, "loss": 1.2849, "step": 1041 }, { "epoch": 1.393979933110368, "grad_norm": 0.475210040807724, "learning_rate": 5.398469158036921e-05, "loss": 1.5608, "step": 1042 }, { "epoch": 1.3953177257525082, "grad_norm": 0.4216032326221466, "learning_rate": 5.393966681674921e-05, "loss": 1.4798, "step": 1043 }, { "epoch": 1.396655518394649, "grad_norm": 0.45767804980278015, "learning_rate": 5.389464205312922e-05, "loss": 1.8409, "step": 1044 }, { "epoch": 1.3979933110367893, "grad_norm": 0.46793442964553833, "learning_rate": 5.3849617289509234e-05, "loss": 1.5628, "step": 1045 }, { "epoch": 1.3993311036789298, "grad_norm": 0.5342912673950195, "learning_rate": 5.380459252588924e-05, "loss": 1.6847, "step": 1046 }, { "epoch": 1.4006688963210703, "grad_norm": 0.6303462386131287, "learning_rate": 5.3759567762269245e-05, "loss": 1.4752, "step": 1047 }, { "epoch": 1.4020066889632106, "grad_norm": 0.4938918650150299, "learning_rate": 5.371454299864927e-05, "loss": 1.8291, "step": 1048 }, { "epoch": 1.4033444816053513, "grad_norm": 0.6580408215522766, "learning_rate": 5.366951823502927e-05, "loss": 0.8986, "step": 1049 }, { "epoch": 1.4046822742474916, "grad_norm": 0.600391685962677, "learning_rate": 5.362449347140928e-05, "loss": 1.4318, "step": 1050 }, { "epoch": 1.406020066889632, "grad_norm": 0.4521050453186035, "learning_rate": 5.357946870778928e-05, "loss": 1.4837, "step": 1051 }, { "epoch": 1.4073578595317726, "grad_norm": 0.6474264860153198, "learning_rate": 5.35344439441693e-05, "loss": 1.3424, "step": 1052 }, { "epoch": 1.4086956521739131, "grad_norm": 0.500771701335907, "learning_rate": 5.34894191805493e-05, "loss": 1.8707, "step": 1053 }, { "epoch": 1.4100334448160536, "grad_norm": 0.5102487206459045, "learning_rate": 5.344439441692931e-05, "loss": 1.8188, "step": 1054 }, { "epoch": 1.411371237458194, "grad_norm": 0.406031996011734, "learning_rate": 5.339936965330933e-05, "loss": 1.854, "step": 1055 }, { "epoch": 1.4127090301003344, "grad_norm": 0.415464848279953, "learning_rate": 5.3354344889689335e-05, "loss": 1.8053, "step": 1056 }, { "epoch": 1.414046822742475, "grad_norm": 0.5517300963401794, "learning_rate": 5.330932012606934e-05, "loss": 1.227, "step": 1057 }, { "epoch": 1.4153846153846155, "grad_norm": 0.6167062520980835, "learning_rate": 5.3264295362449346e-05, "loss": 1.31, "step": 1058 }, { "epoch": 1.416722408026756, "grad_norm": 0.5428354740142822, "learning_rate": 5.321927059882936e-05, "loss": 1.2271, "step": 1059 }, { "epoch": 1.4180602006688963, "grad_norm": 0.5453240871429443, "learning_rate": 5.317424583520937e-05, "loss": 1.759, "step": 1060 }, { "epoch": 1.4193979933110368, "grad_norm": 0.4308525323867798, "learning_rate": 5.312922107158937e-05, "loss": 1.8145, "step": 1061 }, { "epoch": 1.4207357859531773, "grad_norm": 0.5427666902542114, "learning_rate": 5.308419630796938e-05, "loss": 1.4224, "step": 1062 }, { "epoch": 1.4220735785953178, "grad_norm": 0.46834635734558105, "learning_rate": 5.3039171544349396e-05, "loss": 1.4914, "step": 1063 }, { "epoch": 1.4234113712374583, "grad_norm": 0.615035355091095, "learning_rate": 5.2994146780729404e-05, "loss": 1.2526, "step": 1064 }, { "epoch": 1.4247491638795986, "grad_norm": 0.7380486130714417, "learning_rate": 5.2949122017109406e-05, "loss": 1.4013, "step": 1065 }, { "epoch": 1.4260869565217391, "grad_norm": 0.5666106939315796, "learning_rate": 5.290409725348943e-05, "loss": 1.1257, "step": 1066 }, { "epoch": 1.4274247491638796, "grad_norm": 0.3648187220096588, "learning_rate": 5.285907248986943e-05, "loss": 1.1758, "step": 1067 }, { "epoch": 1.4287625418060201, "grad_norm": 0.4655036926269531, "learning_rate": 5.281404772624944e-05, "loss": 1.0849, "step": 1068 }, { "epoch": 1.4301003344481606, "grad_norm": 0.4767707586288452, "learning_rate": 5.276902296262945e-05, "loss": 1.4827, "step": 1069 }, { "epoch": 1.431438127090301, "grad_norm": 0.5286468267440796, "learning_rate": 5.272399819900946e-05, "loss": 1.4783, "step": 1070 }, { "epoch": 1.4327759197324414, "grad_norm": 0.47978368401527405, "learning_rate": 5.2678973435389464e-05, "loss": 1.4588, "step": 1071 }, { "epoch": 1.434113712374582, "grad_norm": 0.6320868730545044, "learning_rate": 5.263394867176947e-05, "loss": 1.4445, "step": 1072 }, { "epoch": 1.4354515050167225, "grad_norm": 0.7280179262161255, "learning_rate": 5.258892390814949e-05, "loss": 1.33, "step": 1073 }, { "epoch": 1.436789297658863, "grad_norm": 0.5280917286872864, "learning_rate": 5.25438991445295e-05, "loss": 1.3857, "step": 1074 }, { "epoch": 1.4381270903010033, "grad_norm": 0.6721616387367249, "learning_rate": 5.24988743809095e-05, "loss": 1.5877, "step": 1075 }, { "epoch": 1.4394648829431438, "grad_norm": 0.5173552632331848, "learning_rate": 5.245384961728951e-05, "loss": 0.9419, "step": 1076 }, { "epoch": 1.4408026755852843, "grad_norm": 0.6093307137489319, "learning_rate": 5.240882485366952e-05, "loss": 0.8124, "step": 1077 }, { "epoch": 1.4421404682274248, "grad_norm": 0.8187419772148132, "learning_rate": 5.236380009004953e-05, "loss": 1.5013, "step": 1078 }, { "epoch": 1.4434782608695653, "grad_norm": 0.5983606576919556, "learning_rate": 5.231877532642954e-05, "loss": 1.2655, "step": 1079 }, { "epoch": 1.4448160535117056, "grad_norm": 0.5444653630256653, "learning_rate": 5.227375056280954e-05, "loss": 1.2253, "step": 1080 }, { "epoch": 1.4461538461538461, "grad_norm": 0.563248336315155, "learning_rate": 5.222872579918956e-05, "loss": 1.414, "step": 1081 }, { "epoch": 1.4474916387959866, "grad_norm": 0.6191748976707458, "learning_rate": 5.2183701035569566e-05, "loss": 1.1588, "step": 1082 }, { "epoch": 1.4488294314381271, "grad_norm": 0.604679524898529, "learning_rate": 5.2138676271949574e-05, "loss": 1.4095, "step": 1083 }, { "epoch": 1.4501672240802677, "grad_norm": 0.5420505404472351, "learning_rate": 5.209365150832959e-05, "loss": 1.1935, "step": 1084 }, { "epoch": 1.451505016722408, "grad_norm": 0.651980996131897, "learning_rate": 5.204862674470959e-05, "loss": 1.4503, "step": 1085 }, { "epoch": 1.4528428093645485, "grad_norm": 0.6438544988632202, "learning_rate": 5.20036019810896e-05, "loss": 1.5319, "step": 1086 }, { "epoch": 1.454180602006689, "grad_norm": 0.5814422965049744, "learning_rate": 5.195857721746961e-05, "loss": 0.9881, "step": 1087 }, { "epoch": 1.4555183946488295, "grad_norm": 0.4369351863861084, "learning_rate": 5.1913552453849624e-05, "loss": 1.2893, "step": 1088 }, { "epoch": 1.45685618729097, "grad_norm": 0.8631730079650879, "learning_rate": 5.186852769022963e-05, "loss": 1.6095, "step": 1089 }, { "epoch": 1.4581939799331103, "grad_norm": 0.4449801743030548, "learning_rate": 5.1823502926609634e-05, "loss": 1.5782, "step": 1090 }, { "epoch": 1.4595317725752508, "grad_norm": 0.6127569079399109, "learning_rate": 5.177847816298965e-05, "loss": 1.1066, "step": 1091 }, { "epoch": 1.4608695652173913, "grad_norm": 0.5010789632797241, "learning_rate": 5.173345339936966e-05, "loss": 1.103, "step": 1092 }, { "epoch": 1.4622073578595318, "grad_norm": 0.6753023862838745, "learning_rate": 5.168842863574967e-05, "loss": 0.9127, "step": 1093 }, { "epoch": 1.4635451505016723, "grad_norm": 0.6370785236358643, "learning_rate": 5.164340387212967e-05, "loss": 1.4349, "step": 1094 }, { "epoch": 1.4648829431438126, "grad_norm": 0.5292558073997498, "learning_rate": 5.1598379108509684e-05, "loss": 1.0429, "step": 1095 }, { "epoch": 1.4662207357859531, "grad_norm": 0.6851668953895569, "learning_rate": 5.155335434488969e-05, "loss": 1.0644, "step": 1096 }, { "epoch": 1.4675585284280936, "grad_norm": 0.5523104071617126, "learning_rate": 5.15083295812697e-05, "loss": 1.436, "step": 1097 }, { "epoch": 1.4688963210702342, "grad_norm": 0.44476109743118286, "learning_rate": 5.14633048176497e-05, "loss": 1.2197, "step": 1098 }, { "epoch": 1.4702341137123747, "grad_norm": 0.6210783123970032, "learning_rate": 5.141828005402972e-05, "loss": 1.1594, "step": 1099 }, { "epoch": 1.471571906354515, "grad_norm": 0.5576224327087402, "learning_rate": 5.137325529040973e-05, "loss": 1.379, "step": 1100 }, { "epoch": 1.4729096989966555, "grad_norm": 0.7876810431480408, "learning_rate": 5.1328230526789736e-05, "loss": 1.3052, "step": 1101 }, { "epoch": 1.474247491638796, "grad_norm": 0.5204209685325623, "learning_rate": 5.128320576316975e-05, "loss": 1.5625, "step": 1102 }, { "epoch": 1.4755852842809365, "grad_norm": 0.6800068020820618, "learning_rate": 5.123818099954976e-05, "loss": 1.1098, "step": 1103 }, { "epoch": 1.476923076923077, "grad_norm": 0.5767846703529358, "learning_rate": 5.119315623592976e-05, "loss": 1.3146, "step": 1104 }, { "epoch": 1.4782608695652173, "grad_norm": 0.6460472941398621, "learning_rate": 5.114813147230977e-05, "loss": 1.5605, "step": 1105 }, { "epoch": 1.4795986622073578, "grad_norm": 0.5904108285903931, "learning_rate": 5.1103106708689785e-05, "loss": 1.5344, "step": 1106 }, { "epoch": 1.4809364548494983, "grad_norm": 0.5445310473442078, "learning_rate": 5.1058081945069794e-05, "loss": 1.4232, "step": 1107 }, { "epoch": 1.4822742474916388, "grad_norm": 0.642361044883728, "learning_rate": 5.1013057181449796e-05, "loss": 1.259, "step": 1108 }, { "epoch": 1.4836120401337793, "grad_norm": 0.5225979089736938, "learning_rate": 5.096803241782981e-05, "loss": 1.6267, "step": 1109 }, { "epoch": 1.4849498327759196, "grad_norm": 0.467866450548172, "learning_rate": 5.092300765420982e-05, "loss": 1.5308, "step": 1110 }, { "epoch": 1.4862876254180601, "grad_norm": 0.5182499289512634, "learning_rate": 5.087798289058983e-05, "loss": 1.6366, "step": 1111 }, { "epoch": 1.4876254180602007, "grad_norm": 0.4528636932373047, "learning_rate": 5.083295812696983e-05, "loss": 1.7337, "step": 1112 }, { "epoch": 1.4889632107023412, "grad_norm": 0.6056678295135498, "learning_rate": 5.078793336334985e-05, "loss": 1.6116, "step": 1113 }, { "epoch": 1.4903010033444817, "grad_norm": 0.7801780700683594, "learning_rate": 5.0742908599729854e-05, "loss": 1.3719, "step": 1114 }, { "epoch": 1.491638795986622, "grad_norm": 0.37030741572380066, "learning_rate": 5.069788383610986e-05, "loss": 1.3142, "step": 1115 }, { "epoch": 1.4929765886287625, "grad_norm": 0.43180251121520996, "learning_rate": 5.0652859072489864e-05, "loss": 1.6445, "step": 1116 }, { "epoch": 1.494314381270903, "grad_norm": 0.5785216689109802, "learning_rate": 5.0607834308869886e-05, "loss": 1.4886, "step": 1117 }, { "epoch": 1.4956521739130435, "grad_norm": 0.6844804286956787, "learning_rate": 5.056280954524989e-05, "loss": 1.5056, "step": 1118 }, { "epoch": 1.496989966555184, "grad_norm": 0.5532545447349548, "learning_rate": 5.05177847816299e-05, "loss": 1.4305, "step": 1119 }, { "epoch": 1.4983277591973243, "grad_norm": 1.0775946378707886, "learning_rate": 5.047276001800991e-05, "loss": 1.1778, "step": 1120 }, { "epoch": 1.4996655518394648, "grad_norm": 0.6081984043121338, "learning_rate": 5.042773525438992e-05, "loss": 1.4213, "step": 1121 }, { "epoch": 1.5010033444816053, "grad_norm": 0.664418637752533, "learning_rate": 5.038271049076992e-05, "loss": 0.9045, "step": 1122 }, { "epoch": 1.5023411371237458, "grad_norm": 0.7059425711631775, "learning_rate": 5.033768572714993e-05, "loss": 1.2277, "step": 1123 }, { "epoch": 1.5036789297658864, "grad_norm": 0.5413299202919006, "learning_rate": 5.0292660963529947e-05, "loss": 1.5393, "step": 1124 }, { "epoch": 1.5050167224080266, "grad_norm": 0.47964879870414734, "learning_rate": 5.0247636199909955e-05, "loss": 1.7589, "step": 1125 }, { "epoch": 1.5063545150501674, "grad_norm": 0.6963114142417908, "learning_rate": 5.020261143628996e-05, "loss": 1.5982, "step": 1126 }, { "epoch": 1.5076923076923077, "grad_norm": 0.567061722278595, "learning_rate": 5.015758667266998e-05, "loss": 0.9174, "step": 1127 }, { "epoch": 1.5090301003344482, "grad_norm": 1.0022151470184326, "learning_rate": 5.011256190904998e-05, "loss": 1.0001, "step": 1128 }, { "epoch": 1.5103678929765887, "grad_norm": 0.49206629395484924, "learning_rate": 5.006753714542999e-05, "loss": 1.7532, "step": 1129 }, { "epoch": 1.511705685618729, "grad_norm": 0.6257492899894714, "learning_rate": 5.002251238180999e-05, "loss": 1.6988, "step": 1130 }, { "epoch": 1.5130434782608697, "grad_norm": 0.4502313435077667, "learning_rate": 4.997748761819001e-05, "loss": 1.5575, "step": 1131 }, { "epoch": 1.51438127090301, "grad_norm": 0.7328041195869446, "learning_rate": 4.9932462854570015e-05, "loss": 1.3456, "step": 1132 }, { "epoch": 1.5157190635451505, "grad_norm": 0.5006700158119202, "learning_rate": 4.9887438090950024e-05, "loss": 1.962, "step": 1133 }, { "epoch": 1.517056856187291, "grad_norm": 0.6119201183319092, "learning_rate": 4.984241332733003e-05, "loss": 1.2587, "step": 1134 }, { "epoch": 1.5183946488294313, "grad_norm": 0.4837782382965088, "learning_rate": 4.979738856371005e-05, "loss": 1.5549, "step": 1135 }, { "epoch": 1.519732441471572, "grad_norm": 0.7072848677635193, "learning_rate": 4.975236380009005e-05, "loss": 1.1933, "step": 1136 }, { "epoch": 1.5210702341137123, "grad_norm": 0.9050494432449341, "learning_rate": 4.9707339036470065e-05, "loss": 0.9648, "step": 1137 }, { "epoch": 1.5224080267558529, "grad_norm": 0.9395582675933838, "learning_rate": 4.966231427285007e-05, "loss": 1.3409, "step": 1138 }, { "epoch": 1.5237458193979934, "grad_norm": 0.6518849730491638, "learning_rate": 4.961728950923008e-05, "loss": 1.3857, "step": 1139 }, { "epoch": 1.5250836120401337, "grad_norm": 0.6363665461540222, "learning_rate": 4.9572264745610084e-05, "loss": 1.6827, "step": 1140 }, { "epoch": 1.5264214046822744, "grad_norm": 0.6692220568656921, "learning_rate": 4.95272399819901e-05, "loss": 1.2984, "step": 1141 }, { "epoch": 1.5277591973244147, "grad_norm": 0.4842371940612793, "learning_rate": 4.94822152183701e-05, "loss": 1.8894, "step": 1142 }, { "epoch": 1.5290969899665552, "grad_norm": 0.6643278002738953, "learning_rate": 4.9437190454750117e-05, "loss": 1.4444, "step": 1143 }, { "epoch": 1.5304347826086957, "grad_norm": 0.3972679674625397, "learning_rate": 4.9392165691130125e-05, "loss": 1.1412, "step": 1144 }, { "epoch": 1.531772575250836, "grad_norm": 0.5554966926574707, "learning_rate": 4.9347140927510134e-05, "loss": 0.9472, "step": 1145 }, { "epoch": 1.5331103678929767, "grad_norm": 0.6864800453186035, "learning_rate": 4.930211616389014e-05, "loss": 0.91, "step": 1146 }, { "epoch": 1.534448160535117, "grad_norm": 0.45306283235549927, "learning_rate": 4.925709140027015e-05, "loss": 1.8113, "step": 1147 }, { "epoch": 1.5357859531772575, "grad_norm": 0.6085485219955444, "learning_rate": 4.921206663665016e-05, "loss": 1.1981, "step": 1148 }, { "epoch": 1.537123745819398, "grad_norm": 0.5183812975883484, "learning_rate": 4.916704187303017e-05, "loss": 1.6182, "step": 1149 }, { "epoch": 1.5384615384615383, "grad_norm": 0.5374182462692261, "learning_rate": 4.912201710941018e-05, "loss": 1.467, "step": 1150 }, { "epoch": 1.539799331103679, "grad_norm": 0.5360819101333618, "learning_rate": 4.9076992345790185e-05, "loss": 1.5612, "step": 1151 }, { "epoch": 1.5411371237458193, "grad_norm": 0.5663394927978516, "learning_rate": 4.9031967582170194e-05, "loss": 0.9437, "step": 1152 }, { "epoch": 1.5424749163879599, "grad_norm": 0.8458664417266846, "learning_rate": 4.898694281855021e-05, "loss": 1.1492, "step": 1153 }, { "epoch": 1.5438127090301004, "grad_norm": 0.8276277184486389, "learning_rate": 4.894191805493021e-05, "loss": 1.2647, "step": 1154 }, { "epoch": 1.5451505016722407, "grad_norm": 0.9039729833602905, "learning_rate": 4.8896893291310226e-05, "loss": 1.2929, "step": 1155 }, { "epoch": 1.5464882943143814, "grad_norm": 0.63777095079422, "learning_rate": 4.885186852769023e-05, "loss": 1.3589, "step": 1156 }, { "epoch": 1.5478260869565217, "grad_norm": 0.825036883354187, "learning_rate": 4.8806843764070244e-05, "loss": 1.233, "step": 1157 }, { "epoch": 1.5491638795986622, "grad_norm": 0.5583025217056274, "learning_rate": 4.8761819000450245e-05, "loss": 1.8935, "step": 1158 }, { "epoch": 1.5505016722408027, "grad_norm": 0.4850707948207855, "learning_rate": 4.871679423683026e-05, "loss": 1.5355, "step": 1159 }, { "epoch": 1.551839464882943, "grad_norm": 0.5671193599700928, "learning_rate": 4.867176947321026e-05, "loss": 1.3592, "step": 1160 }, { "epoch": 1.5531772575250837, "grad_norm": 0.6245211362838745, "learning_rate": 4.862674470959028e-05, "loss": 1.1835, "step": 1161 }, { "epoch": 1.554515050167224, "grad_norm": 0.5358390808105469, "learning_rate": 4.8581719945970286e-05, "loss": 1.4703, "step": 1162 }, { "epoch": 1.5558528428093645, "grad_norm": 0.7337758541107178, "learning_rate": 4.8536695182350295e-05, "loss": 1.0353, "step": 1163 }, { "epoch": 1.557190635451505, "grad_norm": 0.5729077458381653, "learning_rate": 4.8491670418730304e-05, "loss": 1.495, "step": 1164 }, { "epoch": 1.5585284280936453, "grad_norm": 0.5456716418266296, "learning_rate": 4.844664565511031e-05, "loss": 1.3311, "step": 1165 }, { "epoch": 1.559866220735786, "grad_norm": 0.5501260757446289, "learning_rate": 4.840162089149032e-05, "loss": 0.8144, "step": 1166 }, { "epoch": 1.5612040133779264, "grad_norm": 0.5962331295013428, "learning_rate": 4.835659612787033e-05, "loss": 1.5534, "step": 1167 }, { "epoch": 1.5625418060200669, "grad_norm": 0.6607121825218201, "learning_rate": 4.831157136425034e-05, "loss": 1.2124, "step": 1168 }, { "epoch": 1.5638795986622074, "grad_norm": 0.46615132689476013, "learning_rate": 4.8266546600630347e-05, "loss": 1.846, "step": 1169 }, { "epoch": 1.5652173913043477, "grad_norm": 0.5132423639297485, "learning_rate": 4.8221521837010355e-05, "loss": 1.5602, "step": 1170 }, { "epoch": 1.5665551839464884, "grad_norm": 0.5841774344444275, "learning_rate": 4.817649707339037e-05, "loss": 1.4619, "step": 1171 }, { "epoch": 1.5678929765886287, "grad_norm": 0.5244891047477722, "learning_rate": 4.813147230977037e-05, "loss": 1.54, "step": 1172 }, { "epoch": 1.5692307692307692, "grad_norm": 0.5597742199897766, "learning_rate": 4.808644754615039e-05, "loss": 1.6573, "step": 1173 }, { "epoch": 1.5705685618729097, "grad_norm": 0.7487589716911316, "learning_rate": 4.8041422782530396e-05, "loss": 1.5697, "step": 1174 }, { "epoch": 1.57190635451505, "grad_norm": 0.3977908790111542, "learning_rate": 4.7996398018910405e-05, "loss": 1.7967, "step": 1175 }, { "epoch": 1.5732441471571907, "grad_norm": 0.5278039574623108, "learning_rate": 4.7951373255290413e-05, "loss": 1.4762, "step": 1176 }, { "epoch": 1.574581939799331, "grad_norm": 0.6334072947502136, "learning_rate": 4.790634849167042e-05, "loss": 1.527, "step": 1177 }, { "epoch": 1.5759197324414715, "grad_norm": 0.4393516182899475, "learning_rate": 4.786132372805043e-05, "loss": 1.1867, "step": 1178 }, { "epoch": 1.577257525083612, "grad_norm": 1.0068846940994263, "learning_rate": 4.781629896443044e-05, "loss": 1.5931, "step": 1179 }, { "epoch": 1.5785953177257523, "grad_norm": 0.53587406873703, "learning_rate": 4.777127420081045e-05, "loss": 0.9545, "step": 1180 }, { "epoch": 1.579933110367893, "grad_norm": 0.39035940170288086, "learning_rate": 4.7726249437190456e-05, "loss": 1.255, "step": 1181 }, { "epoch": 1.5812709030100334, "grad_norm": 0.4179837703704834, "learning_rate": 4.7681224673570465e-05, "loss": 1.7384, "step": 1182 }, { "epoch": 1.5826086956521739, "grad_norm": 0.5482913255691528, "learning_rate": 4.7636199909950474e-05, "loss": 1.6497, "step": 1183 }, { "epoch": 1.5839464882943144, "grad_norm": 0.7347548007965088, "learning_rate": 4.759117514633049e-05, "loss": 0.603, "step": 1184 }, { "epoch": 1.585284280936455, "grad_norm": 0.6300942897796631, "learning_rate": 4.754615038271049e-05, "loss": 1.6236, "step": 1185 }, { "epoch": 1.5866220735785954, "grad_norm": 0.39102619886398315, "learning_rate": 4.7501125619090506e-05, "loss": 1.2623, "step": 1186 }, { "epoch": 1.5879598662207357, "grad_norm": 0.6081557273864746, "learning_rate": 4.745610085547051e-05, "loss": 1.4035, "step": 1187 }, { "epoch": 1.5892976588628762, "grad_norm": 0.7682766318321228, "learning_rate": 4.741107609185052e-05, "loss": 1.4293, "step": 1188 }, { "epoch": 1.5906354515050167, "grad_norm": 0.6367042660713196, "learning_rate": 4.736605132823053e-05, "loss": 0.9255, "step": 1189 }, { "epoch": 1.5919732441471572, "grad_norm": 0.5380322337150574, "learning_rate": 4.732102656461054e-05, "loss": 1.5305, "step": 1190 }, { "epoch": 1.5933110367892978, "grad_norm": 0.4204621911048889, "learning_rate": 4.727600180099055e-05, "loss": 1.7735, "step": 1191 }, { "epoch": 1.594648829431438, "grad_norm": 0.6201006770133972, "learning_rate": 4.723097703737056e-05, "loss": 1.2954, "step": 1192 }, { "epoch": 1.5959866220735786, "grad_norm": 0.5950494408607483, "learning_rate": 4.7185952273750566e-05, "loss": 1.3848, "step": 1193 }, { "epoch": 1.597324414715719, "grad_norm": 0.5504522919654846, "learning_rate": 4.7140927510130575e-05, "loss": 1.5958, "step": 1194 }, { "epoch": 1.5986622073578596, "grad_norm": 0.4658716320991516, "learning_rate": 4.7095902746510583e-05, "loss": 1.5919, "step": 1195 }, { "epoch": 1.6, "grad_norm": 0.7538139820098877, "learning_rate": 4.705087798289059e-05, "loss": 1.4238, "step": 1196 }, { "epoch": 1.6013377926421404, "grad_norm": 0.6065263152122498, "learning_rate": 4.70058532192706e-05, "loss": 1.4358, "step": 1197 }, { "epoch": 1.602675585284281, "grad_norm": 0.481280118227005, "learning_rate": 4.6960828455650616e-05, "loss": 1.5438, "step": 1198 }, { "epoch": 1.6040133779264214, "grad_norm": 0.5536462664604187, "learning_rate": 4.691580369203062e-05, "loss": 1.1397, "step": 1199 }, { "epoch": 1.605351170568562, "grad_norm": 0.6622151732444763, "learning_rate": 4.687077892841063e-05, "loss": 0.8969, "step": 1200 }, { "epoch": 1.6066889632107024, "grad_norm": 0.758965015411377, "learning_rate": 4.6825754164790635e-05, "loss": 1.1739, "step": 1201 }, { "epoch": 1.6080267558528427, "grad_norm": 0.7237421274185181, "learning_rate": 4.678072940117065e-05, "loss": 1.1649, "step": 1202 }, { "epoch": 1.6093645484949832, "grad_norm": 0.7297394871711731, "learning_rate": 4.673570463755065e-05, "loss": 1.9839, "step": 1203 }, { "epoch": 1.6107023411371237, "grad_norm": 0.6325135231018066, "learning_rate": 4.669067987393067e-05, "loss": 1.4049, "step": 1204 }, { "epoch": 1.6120401337792643, "grad_norm": 0.46899211406707764, "learning_rate": 4.664565511031067e-05, "loss": 1.7795, "step": 1205 }, { "epoch": 1.6133779264214048, "grad_norm": 0.6146332025527954, "learning_rate": 4.6600630346690685e-05, "loss": 1.4186, "step": 1206 }, { "epoch": 1.614715719063545, "grad_norm": 0.6527016758918762, "learning_rate": 4.655560558307069e-05, "loss": 1.5647, "step": 1207 }, { "epoch": 1.6160535117056856, "grad_norm": 0.6520981192588806, "learning_rate": 4.65105808194507e-05, "loss": 1.5652, "step": 1208 }, { "epoch": 1.617391304347826, "grad_norm": 0.6226683855056763, "learning_rate": 4.646555605583071e-05, "loss": 0.9986, "step": 1209 }, { "epoch": 1.6187290969899666, "grad_norm": 0.6789349317550659, "learning_rate": 4.642053129221072e-05, "loss": 1.499, "step": 1210 }, { "epoch": 1.620066889632107, "grad_norm": 0.5669140815734863, "learning_rate": 4.637550652859073e-05, "loss": 1.2055, "step": 1211 }, { "epoch": 1.6214046822742474, "grad_norm": 0.5172103047370911, "learning_rate": 4.6330481764970736e-05, "loss": 1.4732, "step": 1212 }, { "epoch": 1.6227424749163881, "grad_norm": 0.6144703030586243, "learning_rate": 4.6285457001350745e-05, "loss": 1.6759, "step": 1213 }, { "epoch": 1.6240802675585284, "grad_norm": 0.5319096446037292, "learning_rate": 4.624043223773075e-05, "loss": 1.5875, "step": 1214 }, { "epoch": 1.625418060200669, "grad_norm": 0.37053182721138, "learning_rate": 4.619540747411076e-05, "loss": 1.8184, "step": 1215 }, { "epoch": 1.6267558528428094, "grad_norm": 0.5792490243911743, "learning_rate": 4.615038271049078e-05, "loss": 1.0232, "step": 1216 }, { "epoch": 1.6280936454849497, "grad_norm": 0.4057672619819641, "learning_rate": 4.610535794687078e-05, "loss": 1.5673, "step": 1217 }, { "epoch": 1.6294314381270905, "grad_norm": 0.49478086829185486, "learning_rate": 4.6060333183250794e-05, "loss": 1.7108, "step": 1218 }, { "epoch": 1.6307692307692307, "grad_norm": 0.4574749171733856, "learning_rate": 4.6015308419630796e-05, "loss": 1.7347, "step": 1219 }, { "epoch": 1.6321070234113713, "grad_norm": 0.6416162252426147, "learning_rate": 4.597028365601081e-05, "loss": 1.2321, "step": 1220 }, { "epoch": 1.6334448160535118, "grad_norm": 0.6849283576011658, "learning_rate": 4.5925258892390813e-05, "loss": 1.7808, "step": 1221 }, { "epoch": 1.634782608695652, "grad_norm": 0.639849841594696, "learning_rate": 4.588023412877083e-05, "loss": 1.4418, "step": 1222 }, { "epoch": 1.6361204013377928, "grad_norm": 0.5444992780685425, "learning_rate": 4.583520936515083e-05, "loss": 1.3752, "step": 1223 }, { "epoch": 1.637458193979933, "grad_norm": 0.47308918833732605, "learning_rate": 4.5790184601530846e-05, "loss": 1.396, "step": 1224 }, { "epoch": 1.6387959866220736, "grad_norm": 0.5986432433128357, "learning_rate": 4.5745159837910855e-05, "loss": 1.548, "step": 1225 }, { "epoch": 1.640133779264214, "grad_norm": 0.499828577041626, "learning_rate": 4.570013507429086e-05, "loss": 1.0529, "step": 1226 }, { "epoch": 1.6414715719063544, "grad_norm": 1.0466457605361938, "learning_rate": 4.565511031067087e-05, "loss": 1.0108, "step": 1227 }, { "epoch": 1.6428093645484951, "grad_norm": 0.5736595988273621, "learning_rate": 4.561008554705088e-05, "loss": 1.1951, "step": 1228 }, { "epoch": 1.6441471571906354, "grad_norm": 0.5176533460617065, "learning_rate": 4.556506078343089e-05, "loss": 1.6408, "step": 1229 }, { "epoch": 1.645484949832776, "grad_norm": 0.823546290397644, "learning_rate": 4.55200360198109e-05, "loss": 0.8908, "step": 1230 }, { "epoch": 1.6468227424749164, "grad_norm": 0.6029638051986694, "learning_rate": 4.5475011256190906e-05, "loss": 1.3513, "step": 1231 }, { "epoch": 1.6481605351170567, "grad_norm": 0.5313596129417419, "learning_rate": 4.5429986492570915e-05, "loss": 1.3504, "step": 1232 }, { "epoch": 1.6494983277591975, "grad_norm": 0.4931012988090515, "learning_rate": 4.538496172895092e-05, "loss": 1.5954, "step": 1233 }, { "epoch": 1.6508361204013378, "grad_norm": 0.6142311692237854, "learning_rate": 4.533993696533094e-05, "loss": 1.4441, "step": 1234 }, { "epoch": 1.6521739130434783, "grad_norm": 0.5163131952285767, "learning_rate": 4.529491220171094e-05, "loss": 1.3505, "step": 1235 }, { "epoch": 1.6535117056856188, "grad_norm": 0.4623430669307709, "learning_rate": 4.5249887438090956e-05, "loss": 1.5548, "step": 1236 }, { "epoch": 1.654849498327759, "grad_norm": 0.7020666599273682, "learning_rate": 4.520486267447096e-05, "loss": 1.3642, "step": 1237 }, { "epoch": 1.6561872909698998, "grad_norm": 0.6045846343040466, "learning_rate": 4.515983791085097e-05, "loss": 1.4035, "step": 1238 }, { "epoch": 1.65752508361204, "grad_norm": 0.5127092003822327, "learning_rate": 4.5114813147230975e-05, "loss": 1.4297, "step": 1239 }, { "epoch": 1.6588628762541806, "grad_norm": 0.37600478529930115, "learning_rate": 4.506978838361099e-05, "loss": 1.2362, "step": 1240 }, { "epoch": 1.6602006688963211, "grad_norm": 0.8099076747894287, "learning_rate": 4.502476361999099e-05, "loss": 1.3902, "step": 1241 }, { "epoch": 1.6615384615384614, "grad_norm": 0.6966046690940857, "learning_rate": 4.497973885637101e-05, "loss": 1.63, "step": 1242 }, { "epoch": 1.6628762541806021, "grad_norm": 0.607402503490448, "learning_rate": 4.4934714092751016e-05, "loss": 1.5215, "step": 1243 }, { "epoch": 1.6642140468227424, "grad_norm": 0.5723284482955933, "learning_rate": 4.4889689329131025e-05, "loss": 1.3004, "step": 1244 }, { "epoch": 1.665551839464883, "grad_norm": 0.5554037690162659, "learning_rate": 4.484466456551103e-05, "loss": 1.3593, "step": 1245 }, { "epoch": 1.6668896321070235, "grad_norm": 0.7105173468589783, "learning_rate": 4.479963980189104e-05, "loss": 1.2843, "step": 1246 }, { "epoch": 1.6682274247491637, "grad_norm": 0.604043185710907, "learning_rate": 4.475461503827105e-05, "loss": 1.6447, "step": 1247 }, { "epoch": 1.6695652173913045, "grad_norm": 0.6090781688690186, "learning_rate": 4.470959027465106e-05, "loss": 1.6954, "step": 1248 }, { "epoch": 1.6709030100334448, "grad_norm": 0.5769763588905334, "learning_rate": 4.466456551103107e-05, "loss": 1.3212, "step": 1249 }, { "epoch": 1.6722408026755853, "grad_norm": 0.5302979350090027, "learning_rate": 4.4619540747411076e-05, "loss": 1.6354, "step": 1250 }, { "epoch": 1.6735785953177258, "grad_norm": 0.5180411338806152, "learning_rate": 4.4574515983791085e-05, "loss": 1.55, "step": 1251 }, { "epoch": 1.674916387959866, "grad_norm": 0.8196110725402832, "learning_rate": 4.45294912201711e-05, "loss": 1.4945, "step": 1252 }, { "epoch": 1.6762541806020068, "grad_norm": 0.4334547519683838, "learning_rate": 4.44844664565511e-05, "loss": 1.2096, "step": 1253 }, { "epoch": 1.677591973244147, "grad_norm": 0.4448462128639221, "learning_rate": 4.443944169293112e-05, "loss": 1.1928, "step": 1254 }, { "epoch": 1.6789297658862876, "grad_norm": 0.4100378155708313, "learning_rate": 4.439441692931112e-05, "loss": 1.8475, "step": 1255 }, { "epoch": 1.6802675585284281, "grad_norm": 0.5713425278663635, "learning_rate": 4.4349392165691134e-05, "loss": 1.5135, "step": 1256 }, { "epoch": 1.6816053511705684, "grad_norm": 0.42964500188827515, "learning_rate": 4.430436740207114e-05, "loss": 1.522, "step": 1257 }, { "epoch": 1.6829431438127092, "grad_norm": 0.7717701196670532, "learning_rate": 4.425934263845115e-05, "loss": 1.3358, "step": 1258 }, { "epoch": 1.6842809364548494, "grad_norm": 0.5887270569801331, "learning_rate": 4.421431787483116e-05, "loss": 1.272, "step": 1259 }, { "epoch": 1.68561872909699, "grad_norm": 0.8065038323402405, "learning_rate": 4.416929311121117e-05, "loss": 0.8129, "step": 1260 }, { "epoch": 1.6869565217391305, "grad_norm": 0.6046693325042725, "learning_rate": 4.412426834759118e-05, "loss": 1.5241, "step": 1261 }, { "epoch": 1.6882943143812708, "grad_norm": 0.4960588812828064, "learning_rate": 4.4079243583971186e-05, "loss": 1.4725, "step": 1262 }, { "epoch": 1.6896321070234115, "grad_norm": 0.6210783123970032, "learning_rate": 4.4034218820351194e-05, "loss": 1.2133, "step": 1263 }, { "epoch": 1.6909698996655518, "grad_norm": 0.5066959857940674, "learning_rate": 4.39891940567312e-05, "loss": 0.7267, "step": 1264 }, { "epoch": 1.6923076923076923, "grad_norm": 0.3616136610507965, "learning_rate": 4.394416929311121e-05, "loss": 1.4818, "step": 1265 }, { "epoch": 1.6936454849498328, "grad_norm": 0.6788431406021118, "learning_rate": 4.389914452949122e-05, "loss": 1.4515, "step": 1266 }, { "epoch": 1.694983277591973, "grad_norm": 0.5391234755516052, "learning_rate": 4.385411976587123e-05, "loss": 1.5275, "step": 1267 }, { "epoch": 1.6963210702341138, "grad_norm": 0.6781142354011536, "learning_rate": 4.380909500225124e-05, "loss": 1.1785, "step": 1268 }, { "epoch": 1.6976588628762541, "grad_norm": 0.45886972546577454, "learning_rate": 4.376407023863125e-05, "loss": 1.0886, "step": 1269 }, { "epoch": 1.6989966555183946, "grad_norm": 0.6447398662567139, "learning_rate": 4.371904547501126e-05, "loss": 1.5353, "step": 1270 }, { "epoch": 1.7003344481605351, "grad_norm": 0.6216405034065247, "learning_rate": 4.367402071139127e-05, "loss": 1.5214, "step": 1271 }, { "epoch": 1.7016722408026754, "grad_norm": 0.6292665600776672, "learning_rate": 4.362899594777128e-05, "loss": 1.1899, "step": 1272 }, { "epoch": 1.7030100334448162, "grad_norm": 0.7419617176055908, "learning_rate": 4.358397118415129e-05, "loss": 1.2945, "step": 1273 }, { "epoch": 1.7043478260869565, "grad_norm": 0.49025335907936096, "learning_rate": 4.3538946420531296e-05, "loss": 1.7496, "step": 1274 }, { "epoch": 1.705685618729097, "grad_norm": 0.659555196762085, "learning_rate": 4.3493921656911304e-05, "loss": 1.5394, "step": 1275 }, { "epoch": 1.7070234113712375, "grad_norm": 0.7264639735221863, "learning_rate": 4.344889689329131e-05, "loss": 1.5763, "step": 1276 }, { "epoch": 1.7083612040133778, "grad_norm": 0.9294725656509399, "learning_rate": 4.340387212967132e-05, "loss": 1.3261, "step": 1277 }, { "epoch": 1.7096989966555185, "grad_norm": 0.6634767651557922, "learning_rate": 4.335884736605133e-05, "loss": 1.1431, "step": 1278 }, { "epoch": 1.7110367892976588, "grad_norm": 1.48710036277771, "learning_rate": 4.331382260243134e-05, "loss": 1.3194, "step": 1279 }, { "epoch": 1.7123745819397993, "grad_norm": 0.4782906770706177, "learning_rate": 4.326879783881135e-05, "loss": 1.8644, "step": 1280 }, { "epoch": 1.7137123745819398, "grad_norm": 0.47573214769363403, "learning_rate": 4.322377307519136e-05, "loss": 1.7005, "step": 1281 }, { "epoch": 1.71505016722408, "grad_norm": 0.5692914724349976, "learning_rate": 4.3178748311571364e-05, "loss": 1.1717, "step": 1282 }, { "epoch": 1.7163879598662208, "grad_norm": 0.6902774572372437, "learning_rate": 4.313372354795138e-05, "loss": 0.8005, "step": 1283 }, { "epoch": 1.7177257525083611, "grad_norm": 0.4654804766178131, "learning_rate": 4.308869878433138e-05, "loss": 1.8657, "step": 1284 }, { "epoch": 1.7190635451505016, "grad_norm": 0.525980532169342, "learning_rate": 4.30436740207114e-05, "loss": 1.7472, "step": 1285 }, { "epoch": 1.7204013377926421, "grad_norm": 3.5633068084716797, "learning_rate": 4.29986492570914e-05, "loss": 1.1038, "step": 1286 }, { "epoch": 1.7217391304347827, "grad_norm": 0.6150617599487305, "learning_rate": 4.2953624493471414e-05, "loss": 1.5504, "step": 1287 }, { "epoch": 1.7230769230769232, "grad_norm": 0.8399518728256226, "learning_rate": 4.290859972985142e-05, "loss": 0.5775, "step": 1288 }, { "epoch": 1.7244147157190635, "grad_norm": 0.47936901450157166, "learning_rate": 4.286357496623143e-05, "loss": 1.5882, "step": 1289 }, { "epoch": 1.725752508361204, "grad_norm": 0.7554563879966736, "learning_rate": 4.281855020261144e-05, "loss": 1.2668, "step": 1290 }, { "epoch": 1.7270903010033445, "grad_norm": 0.6163820624351501, "learning_rate": 4.277352543899145e-05, "loss": 1.7666, "step": 1291 }, { "epoch": 1.728428093645485, "grad_norm": 0.5814427733421326, "learning_rate": 4.272850067537146e-05, "loss": 1.3781, "step": 1292 }, { "epoch": 1.7297658862876255, "grad_norm": 1.035375714302063, "learning_rate": 4.2683475911751466e-05, "loss": 1.128, "step": 1293 }, { "epoch": 1.7311036789297658, "grad_norm": 0.4975506067276001, "learning_rate": 4.2638451148131474e-05, "loss": 1.2066, "step": 1294 }, { "epoch": 1.7324414715719063, "grad_norm": 0.533572793006897, "learning_rate": 4.259342638451148e-05, "loss": 1.076, "step": 1295 }, { "epoch": 1.7337792642140468, "grad_norm": 0.5955677628517151, "learning_rate": 4.254840162089149e-05, "loss": 1.6552, "step": 1296 }, { "epoch": 1.7351170568561873, "grad_norm": 0.7461806535720825, "learning_rate": 4.250337685727151e-05, "loss": 1.2706, "step": 1297 }, { "epoch": 1.7364548494983278, "grad_norm": 0.5620076060295105, "learning_rate": 4.245835209365151e-05, "loss": 1.8955, "step": 1298 }, { "epoch": 1.7377926421404681, "grad_norm": 0.5752491354942322, "learning_rate": 4.2413327330031524e-05, "loss": 1.3457, "step": 1299 }, { "epoch": 1.7391304347826086, "grad_norm": 0.6129872798919678, "learning_rate": 4.2368302566411526e-05, "loss": 1.2544, "step": 1300 }, { "epoch": 1.7404682274247492, "grad_norm": 0.43887007236480713, "learning_rate": 4.232327780279154e-05, "loss": 1.8382, "step": 1301 }, { "epoch": 1.7418060200668897, "grad_norm": 0.7264611124992371, "learning_rate": 4.227825303917154e-05, "loss": 1.0174, "step": 1302 }, { "epoch": 1.7431438127090302, "grad_norm": 0.570402979850769, "learning_rate": 4.223322827555156e-05, "loss": 1.4562, "step": 1303 }, { "epoch": 1.7444816053511705, "grad_norm": 0.6455773115158081, "learning_rate": 4.218820351193156e-05, "loss": 1.4974, "step": 1304 }, { "epoch": 1.745819397993311, "grad_norm": 0.5429325699806213, "learning_rate": 4.2143178748311576e-05, "loss": 1.4276, "step": 1305 }, { "epoch": 1.7471571906354515, "grad_norm": 0.5509850978851318, "learning_rate": 4.2098153984691584e-05, "loss": 1.436, "step": 1306 }, { "epoch": 1.748494983277592, "grad_norm": 0.7787244915962219, "learning_rate": 4.205312922107159e-05, "loss": 0.9241, "step": 1307 }, { "epoch": 1.7498327759197325, "grad_norm": 0.7184525728225708, "learning_rate": 4.20081044574516e-05, "loss": 1.0473, "step": 1308 }, { "epoch": 1.7511705685618728, "grad_norm": 0.6182901859283447, "learning_rate": 4.196307969383161e-05, "loss": 1.6895, "step": 1309 }, { "epoch": 1.7525083612040135, "grad_norm": 0.6682157516479492, "learning_rate": 4.191805493021162e-05, "loss": 0.8872, "step": 1310 }, { "epoch": 1.7538461538461538, "grad_norm": 0.5342267155647278, "learning_rate": 4.187303016659163e-05, "loss": 1.5721, "step": 1311 }, { "epoch": 1.7551839464882943, "grad_norm": 0.8530276417732239, "learning_rate": 4.1828005402971636e-05, "loss": 1.1826, "step": 1312 }, { "epoch": 1.7565217391304349, "grad_norm": 0.5931107997894287, "learning_rate": 4.1782980639351644e-05, "loss": 1.4665, "step": 1313 }, { "epoch": 1.7578595317725751, "grad_norm": 0.5947359204292297, "learning_rate": 4.173795587573165e-05, "loss": 0.9617, "step": 1314 }, { "epoch": 1.7591973244147159, "grad_norm": 1.004927396774292, "learning_rate": 4.169293111211167e-05, "loss": 1.2389, "step": 1315 }, { "epoch": 1.7605351170568562, "grad_norm": 0.5625318288803101, "learning_rate": 4.164790634849167e-05, "loss": 0.8856, "step": 1316 }, { "epoch": 1.7618729096989967, "grad_norm": 0.6681520342826843, "learning_rate": 4.1602881584871685e-05, "loss": 1.0649, "step": 1317 }, { "epoch": 1.7632107023411372, "grad_norm": 0.45618459582328796, "learning_rate": 4.155785682125169e-05, "loss": 1.3875, "step": 1318 }, { "epoch": 1.7645484949832775, "grad_norm": 0.5873126983642578, "learning_rate": 4.15128320576317e-05, "loss": 1.1558, "step": 1319 }, { "epoch": 1.7658862876254182, "grad_norm": 0.5614002346992493, "learning_rate": 4.1467807294011704e-05, "loss": 1.4631, "step": 1320 }, { "epoch": 1.7672240802675585, "grad_norm": 0.5961179733276367, "learning_rate": 4.142278253039172e-05, "loss": 1.3621, "step": 1321 }, { "epoch": 1.768561872909699, "grad_norm": 0.6985737085342407, "learning_rate": 4.137775776677172e-05, "loss": 0.8297, "step": 1322 }, { "epoch": 1.7698996655518395, "grad_norm": 0.5106129050254822, "learning_rate": 4.133273300315174e-05, "loss": 1.7857, "step": 1323 }, { "epoch": 1.7712374581939798, "grad_norm": 0.6078426837921143, "learning_rate": 4.1287708239531745e-05, "loss": 0.6776, "step": 1324 }, { "epoch": 1.7725752508361206, "grad_norm": 0.7135141491889954, "learning_rate": 4.1242683475911754e-05, "loss": 1.3022, "step": 1325 }, { "epoch": 1.7739130434782608, "grad_norm": 0.5969687104225159, "learning_rate": 4.119765871229176e-05, "loss": 1.2663, "step": 1326 }, { "epoch": 1.7752508361204014, "grad_norm": 0.519609272480011, "learning_rate": 4.115263394867177e-05, "loss": 1.3102, "step": 1327 }, { "epoch": 1.7765886287625419, "grad_norm": 0.907187283039093, "learning_rate": 4.110760918505178e-05, "loss": 0.7346, "step": 1328 }, { "epoch": 1.7779264214046822, "grad_norm": 0.5560885667800903, "learning_rate": 4.106258442143179e-05, "loss": 1.4642, "step": 1329 }, { "epoch": 1.779264214046823, "grad_norm": 0.701776921749115, "learning_rate": 4.10175596578118e-05, "loss": 1.2355, "step": 1330 }, { "epoch": 1.7806020066889632, "grad_norm": 0.5565826892852783, "learning_rate": 4.0972534894191806e-05, "loss": 1.3008, "step": 1331 }, { "epoch": 1.7819397993311037, "grad_norm": 0.6748178005218506, "learning_rate": 4.0927510130571814e-05, "loss": 1.2313, "step": 1332 }, { "epoch": 1.7832775919732442, "grad_norm": 0.9739630222320557, "learning_rate": 4.088248536695183e-05, "loss": 1.4087, "step": 1333 }, { "epoch": 1.7846153846153845, "grad_norm": 0.7524449825286865, "learning_rate": 4.083746060333183e-05, "loss": 1.4471, "step": 1334 }, { "epoch": 1.7859531772575252, "grad_norm": 0.5693138241767883, "learning_rate": 4.079243583971185e-05, "loss": 1.7757, "step": 1335 }, { "epoch": 1.7872909698996655, "grad_norm": 0.4918235242366791, "learning_rate": 4.074741107609185e-05, "loss": 1.5435, "step": 1336 }, { "epoch": 1.788628762541806, "grad_norm": 0.39673417806625366, "learning_rate": 4.0702386312471864e-05, "loss": 1.6191, "step": 1337 }, { "epoch": 1.7899665551839465, "grad_norm": 0.487466424703598, "learning_rate": 4.0657361548851866e-05, "loss": 1.6805, "step": 1338 }, { "epoch": 1.7913043478260868, "grad_norm": 0.5584820508956909, "learning_rate": 4.061233678523188e-05, "loss": 0.7761, "step": 1339 }, { "epoch": 1.7926421404682276, "grad_norm": 0.5846370458602905, "learning_rate": 4.056731202161189e-05, "loss": 1.2145, "step": 1340 }, { "epoch": 1.7939799331103679, "grad_norm": 0.7206812500953674, "learning_rate": 4.05222872579919e-05, "loss": 1.1692, "step": 1341 }, { "epoch": 1.7953177257525084, "grad_norm": 0.5454712510108948, "learning_rate": 4.047726249437191e-05, "loss": 1.0397, "step": 1342 }, { "epoch": 1.7966555183946489, "grad_norm": 1.1169034242630005, "learning_rate": 4.0432237730751915e-05, "loss": 1.1179, "step": 1343 }, { "epoch": 1.7979933110367892, "grad_norm": 0.5023759007453918, "learning_rate": 4.0387212967131924e-05, "loss": 1.4789, "step": 1344 }, { "epoch": 1.79933110367893, "grad_norm": 1.1506235599517822, "learning_rate": 4.034218820351193e-05, "loss": 1.2992, "step": 1345 }, { "epoch": 1.8006688963210702, "grad_norm": 0.44863954186439514, "learning_rate": 4.029716343989194e-05, "loss": 1.1604, "step": 1346 }, { "epoch": 1.8020066889632107, "grad_norm": 0.6827251315116882, "learning_rate": 4.025213867627195e-05, "loss": 0.756, "step": 1347 }, { "epoch": 1.8033444816053512, "grad_norm": 0.688870906829834, "learning_rate": 4.020711391265196e-05, "loss": 1.17, "step": 1348 }, { "epoch": 1.8046822742474915, "grad_norm": 0.5064193606376648, "learning_rate": 4.016208914903197e-05, "loss": 1.2382, "step": 1349 }, { "epoch": 1.8060200668896322, "grad_norm": 0.4319298565387726, "learning_rate": 4.0117064385411976e-05, "loss": 1.8041, "step": 1350 }, { "epoch": 1.8073578595317725, "grad_norm": 0.5849002003669739, "learning_rate": 4.007203962179199e-05, "loss": 1.5873, "step": 1351 }, { "epoch": 1.808695652173913, "grad_norm": 0.6112298369407654, "learning_rate": 4.0027014858172e-05, "loss": 1.5176, "step": 1352 }, { "epoch": 1.8100334448160535, "grad_norm": 0.839145302772522, "learning_rate": 3.998199009455201e-05, "loss": 0.8495, "step": 1353 }, { "epoch": 1.8113712374581938, "grad_norm": 0.6453105807304382, "learning_rate": 3.993696533093202e-05, "loss": 1.3811, "step": 1354 }, { "epoch": 1.8127090301003346, "grad_norm": 0.5794005990028381, "learning_rate": 3.9891940567312025e-05, "loss": 1.411, "step": 1355 }, { "epoch": 1.8140468227424749, "grad_norm": 0.6016961932182312, "learning_rate": 3.9846915803692034e-05, "loss": 1.2842, "step": 1356 }, { "epoch": 1.8153846153846154, "grad_norm": 0.4260030686855316, "learning_rate": 3.980189104007204e-05, "loss": 1.8403, "step": 1357 }, { "epoch": 1.8167224080267559, "grad_norm": 0.8129619359970093, "learning_rate": 3.975686627645205e-05, "loss": 1.4538, "step": 1358 }, { "epoch": 1.8180602006688962, "grad_norm": 0.7668693661689758, "learning_rate": 3.971184151283206e-05, "loss": 1.329, "step": 1359 }, { "epoch": 1.819397993311037, "grad_norm": 0.9522974491119385, "learning_rate": 3.966681674921207e-05, "loss": 1.4439, "step": 1360 }, { "epoch": 1.8207357859531772, "grad_norm": 0.6404708027839661, "learning_rate": 3.962179198559208e-05, "loss": 1.2675, "step": 1361 }, { "epoch": 1.8220735785953177, "grad_norm": 0.5133899450302124, "learning_rate": 3.9576767221972085e-05, "loss": 1.8803, "step": 1362 }, { "epoch": 1.8234113712374582, "grad_norm": 0.5757606625556946, "learning_rate": 3.9531742458352094e-05, "loss": 1.4274, "step": 1363 }, { "epoch": 1.8247491638795985, "grad_norm": 0.5684844255447388, "learning_rate": 3.948671769473211e-05, "loss": 1.5116, "step": 1364 }, { "epoch": 1.8260869565217392, "grad_norm": 0.5964487791061401, "learning_rate": 3.944169293111211e-05, "loss": 1.0454, "step": 1365 }, { "epoch": 1.8274247491638795, "grad_norm": 0.6252050399780273, "learning_rate": 3.9396668167492126e-05, "loss": 1.5453, "step": 1366 }, { "epoch": 1.82876254180602, "grad_norm": 1.144151210784912, "learning_rate": 3.935164340387213e-05, "loss": 1.0064, "step": 1367 }, { "epoch": 1.8301003344481606, "grad_norm": 0.716434121131897, "learning_rate": 3.9306618640252144e-05, "loss": 1.2945, "step": 1368 }, { "epoch": 1.8314381270903008, "grad_norm": 0.9308441281318665, "learning_rate": 3.926159387663215e-05, "loss": 1.5683, "step": 1369 }, { "epoch": 1.8327759197324416, "grad_norm": 0.5083184838294983, "learning_rate": 3.921656911301216e-05, "loss": 1.859, "step": 1370 }, { "epoch": 1.8341137123745819, "grad_norm": 0.5718291997909546, "learning_rate": 3.917154434939217e-05, "loss": 1.5812, "step": 1371 }, { "epoch": 1.8354515050167224, "grad_norm": 1.204251766204834, "learning_rate": 3.912651958577218e-05, "loss": 0.9265, "step": 1372 }, { "epoch": 1.836789297658863, "grad_norm": 0.5010527968406677, "learning_rate": 3.9081494822152187e-05, "loss": 1.5297, "step": 1373 }, { "epoch": 1.8381270903010032, "grad_norm": 0.5558527708053589, "learning_rate": 3.9036470058532195e-05, "loss": 1.5666, "step": 1374 }, { "epoch": 1.839464882943144, "grad_norm": 0.66669762134552, "learning_rate": 3.8991445294912204e-05, "loss": 1.2004, "step": 1375 }, { "epoch": 1.8408026755852842, "grad_norm": 0.5700245499610901, "learning_rate": 3.894642053129221e-05, "loss": 1.5689, "step": 1376 }, { "epoch": 1.8421404682274247, "grad_norm": 0.4422018826007843, "learning_rate": 3.890139576767222e-05, "loss": 1.7777, "step": 1377 }, { "epoch": 1.8434782608695652, "grad_norm": 0.6519413590431213, "learning_rate": 3.8856371004052236e-05, "loss": 1.372, "step": 1378 }, { "epoch": 1.8448160535117055, "grad_norm": 0.591915488243103, "learning_rate": 3.881134624043224e-05, "loss": 1.1428, "step": 1379 }, { "epoch": 1.8461538461538463, "grad_norm": 0.5624032020568848, "learning_rate": 3.8766321476812253e-05, "loss": 1.3378, "step": 1380 }, { "epoch": 1.8474916387959865, "grad_norm": 0.5132156014442444, "learning_rate": 3.8721296713192255e-05, "loss": 1.5724, "step": 1381 }, { "epoch": 1.848829431438127, "grad_norm": 0.49696585536003113, "learning_rate": 3.867627194957227e-05, "loss": 1.2503, "step": 1382 }, { "epoch": 1.8501672240802676, "grad_norm": 0.6636870503425598, "learning_rate": 3.863124718595227e-05, "loss": 1.4385, "step": 1383 }, { "epoch": 1.851505016722408, "grad_norm": 0.4480172097682953, "learning_rate": 3.858622242233229e-05, "loss": 1.7326, "step": 1384 }, { "epoch": 1.8528428093645486, "grad_norm": 0.893785297870636, "learning_rate": 3.854119765871229e-05, "loss": 1.7338, "step": 1385 }, { "epoch": 1.8541806020066889, "grad_norm": 0.41819554567337036, "learning_rate": 3.8496172895092305e-05, "loss": 1.1574, "step": 1386 }, { "epoch": 1.8555183946488294, "grad_norm": 1.0737574100494385, "learning_rate": 3.8451148131472314e-05, "loss": 0.7989, "step": 1387 }, { "epoch": 1.85685618729097, "grad_norm": 0.5694720149040222, "learning_rate": 3.840612336785232e-05, "loss": 1.5935, "step": 1388 }, { "epoch": 1.8581939799331104, "grad_norm": 0.5654177069664001, "learning_rate": 3.836109860423233e-05, "loss": 0.8953, "step": 1389 }, { "epoch": 1.859531772575251, "grad_norm": 0.5609742403030396, "learning_rate": 3.831607384061234e-05, "loss": 1.5239, "step": 1390 }, { "epoch": 1.8608695652173912, "grad_norm": 0.6583299040794373, "learning_rate": 3.827104907699235e-05, "loss": 1.2912, "step": 1391 }, { "epoch": 1.8622073578595317, "grad_norm": 0.5440747141838074, "learning_rate": 3.8226024313372357e-05, "loss": 1.6956, "step": 1392 }, { "epoch": 1.8635451505016722, "grad_norm": 0.6128625869750977, "learning_rate": 3.8180999549752365e-05, "loss": 1.3703, "step": 1393 }, { "epoch": 1.8648829431438128, "grad_norm": 0.643354058265686, "learning_rate": 3.8135974786132374e-05, "loss": 1.2531, "step": 1394 }, { "epoch": 1.8662207357859533, "grad_norm": 0.6170260310173035, "learning_rate": 3.809095002251238e-05, "loss": 1.5198, "step": 1395 }, { "epoch": 1.8675585284280936, "grad_norm": 0.4028455317020416, "learning_rate": 3.80459252588924e-05, "loss": 1.8615, "step": 1396 }, { "epoch": 1.868896321070234, "grad_norm": 0.7505510449409485, "learning_rate": 3.80009004952724e-05, "loss": 1.2079, "step": 1397 }, { "epoch": 1.8702341137123746, "grad_norm": 0.6865562796592712, "learning_rate": 3.7955875731652415e-05, "loss": 1.1446, "step": 1398 }, { "epoch": 1.871571906354515, "grad_norm": 0.5812001824378967, "learning_rate": 3.791085096803242e-05, "loss": 1.4853, "step": 1399 }, { "epoch": 1.8729096989966556, "grad_norm": 0.6272473335266113, "learning_rate": 3.786582620441243e-05, "loss": 1.1798, "step": 1400 }, { "epoch": 1.874247491638796, "grad_norm": 0.5115850567817688, "learning_rate": 3.7820801440792434e-05, "loss": 1.2559, "step": 1401 }, { "epoch": 1.8755852842809364, "grad_norm": 0.43357017636299133, "learning_rate": 3.777577667717245e-05, "loss": 0.9326, "step": 1402 }, { "epoch": 1.876923076923077, "grad_norm": 0.5632050633430481, "learning_rate": 3.773075191355245e-05, "loss": 1.388, "step": 1403 }, { "epoch": 1.8782608695652174, "grad_norm": 0.6570121049880981, "learning_rate": 3.7685727149932466e-05, "loss": 1.1719, "step": 1404 }, { "epoch": 1.879598662207358, "grad_norm": 0.480442613363266, "learning_rate": 3.7640702386312475e-05, "loss": 1.6521, "step": 1405 }, { "epoch": 1.8809364548494982, "grad_norm": 0.7593757510185242, "learning_rate": 3.7595677622692484e-05, "loss": 1.0407, "step": 1406 }, { "epoch": 1.8822742474916387, "grad_norm": 0.6240037679672241, "learning_rate": 3.755065285907249e-05, "loss": 1.1501, "step": 1407 }, { "epoch": 1.8836120401337793, "grad_norm": 0.48465749621391296, "learning_rate": 3.75056280954525e-05, "loss": 1.6513, "step": 1408 }, { "epoch": 1.8849498327759198, "grad_norm": 0.7941508889198303, "learning_rate": 3.746060333183251e-05, "loss": 1.3305, "step": 1409 }, { "epoch": 1.8862876254180603, "grad_norm": 0.574839174747467, "learning_rate": 3.741557856821252e-05, "loss": 1.4869, "step": 1410 }, { "epoch": 1.8876254180602006, "grad_norm": 0.6008003354072571, "learning_rate": 3.7370553804592526e-05, "loss": 1.5017, "step": 1411 }, { "epoch": 1.8889632107023413, "grad_norm": 0.5762261152267456, "learning_rate": 3.7325529040972535e-05, "loss": 0.7915, "step": 1412 }, { "epoch": 1.8903010033444816, "grad_norm": 0.6438276767730713, "learning_rate": 3.7280504277352544e-05, "loss": 1.5426, "step": 1413 }, { "epoch": 1.891638795986622, "grad_norm": 0.5164512395858765, "learning_rate": 3.723547951373256e-05, "loss": 1.1212, "step": 1414 }, { "epoch": 1.8929765886287626, "grad_norm": 0.5601268410682678, "learning_rate": 3.719045475011256e-05, "loss": 1.6185, "step": 1415 }, { "epoch": 1.894314381270903, "grad_norm": 0.6093222498893738, "learning_rate": 3.7145429986492576e-05, "loss": 1.4573, "step": 1416 }, { "epoch": 1.8956521739130436, "grad_norm": 0.6153028607368469, "learning_rate": 3.710040522287258e-05, "loss": 1.4644, "step": 1417 }, { "epoch": 1.896989966555184, "grad_norm": 0.5739652514457703, "learning_rate": 3.705538045925259e-05, "loss": 1.4324, "step": 1418 }, { "epoch": 1.8983277591973244, "grad_norm": 0.6016227602958679, "learning_rate": 3.7010355695632595e-05, "loss": 1.7096, "step": 1419 }, { "epoch": 1.899665551839465, "grad_norm": 0.9013569355010986, "learning_rate": 3.696533093201261e-05, "loss": 1.0067, "step": 1420 }, { "epoch": 1.9010033444816052, "grad_norm": 0.7423526644706726, "learning_rate": 3.692030616839261e-05, "loss": 1.1336, "step": 1421 }, { "epoch": 1.902341137123746, "grad_norm": 0.5514466762542725, "learning_rate": 3.687528140477263e-05, "loss": 1.097, "step": 1422 }, { "epoch": 1.9036789297658863, "grad_norm": 0.6179841160774231, "learning_rate": 3.6830256641152636e-05, "loss": 1.5022, "step": 1423 }, { "epoch": 1.9050167224080268, "grad_norm": 0.48712772130966187, "learning_rate": 3.6785231877532645e-05, "loss": 1.0286, "step": 1424 }, { "epoch": 1.9063545150501673, "grad_norm": 0.6644452214241028, "learning_rate": 3.6740207113912653e-05, "loss": 1.4727, "step": 1425 }, { "epoch": 1.9076923076923076, "grad_norm": 0.6408343315124512, "learning_rate": 3.669518235029266e-05, "loss": 1.5816, "step": 1426 }, { "epoch": 1.9090301003344483, "grad_norm": 0.6295036673545837, "learning_rate": 3.665015758667267e-05, "loss": 1.5896, "step": 1427 }, { "epoch": 1.9103678929765886, "grad_norm": 0.6161107420921326, "learning_rate": 3.660513282305268e-05, "loss": 1.397, "step": 1428 }, { "epoch": 1.9117056856187291, "grad_norm": 0.8119416832923889, "learning_rate": 3.656010805943269e-05, "loss": 1.6998, "step": 1429 }, { "epoch": 1.9130434782608696, "grad_norm": 0.504453718662262, "learning_rate": 3.6515083295812696e-05, "loss": 1.4708, "step": 1430 }, { "epoch": 1.91438127090301, "grad_norm": 0.5680826902389526, "learning_rate": 3.6470058532192705e-05, "loss": 1.416, "step": 1431 }, { "epoch": 1.9157190635451506, "grad_norm": 0.5165196657180786, "learning_rate": 3.642503376857272e-05, "loss": 1.153, "step": 1432 }, { "epoch": 1.917056856187291, "grad_norm": 0.640468418598175, "learning_rate": 3.638000900495272e-05, "loss": 1.3228, "step": 1433 }, { "epoch": 1.9183946488294314, "grad_norm": 0.6464949250221252, "learning_rate": 3.633498424133274e-05, "loss": 1.5836, "step": 1434 }, { "epoch": 1.919732441471572, "grad_norm": 0.48071908950805664, "learning_rate": 3.628995947771274e-05, "loss": 1.7179, "step": 1435 }, { "epoch": 1.9210702341137122, "grad_norm": 0.3788856565952301, "learning_rate": 3.6244934714092755e-05, "loss": 1.8167, "step": 1436 }, { "epoch": 1.922408026755853, "grad_norm": 0.563204824924469, "learning_rate": 3.619990995047276e-05, "loss": 1.5081, "step": 1437 }, { "epoch": 1.9237458193979933, "grad_norm": 0.4034525156021118, "learning_rate": 3.615488518685277e-05, "loss": 1.2595, "step": 1438 }, { "epoch": 1.9250836120401338, "grad_norm": 0.7248520255088806, "learning_rate": 3.610986042323278e-05, "loss": 1.7668, "step": 1439 }, { "epoch": 1.9264214046822743, "grad_norm": 0.612653911113739, "learning_rate": 3.606483565961279e-05, "loss": 1.294, "step": 1440 }, { "epoch": 1.9277591973244146, "grad_norm": 0.49420690536499023, "learning_rate": 3.60198108959928e-05, "loss": 1.5397, "step": 1441 }, { "epoch": 1.9290969899665553, "grad_norm": 0.679892897605896, "learning_rate": 3.5974786132372806e-05, "loss": 1.0175, "step": 1442 }, { "epoch": 1.9304347826086956, "grad_norm": 0.44175776839256287, "learning_rate": 3.5929761368752815e-05, "loss": 1.7368, "step": 1443 }, { "epoch": 1.9317725752508361, "grad_norm": 0.634681224822998, "learning_rate": 3.5884736605132823e-05, "loss": 1.4975, "step": 1444 }, { "epoch": 1.9331103678929766, "grad_norm": 0.5663192272186279, "learning_rate": 3.583971184151283e-05, "loss": 1.4068, "step": 1445 }, { "epoch": 1.934448160535117, "grad_norm": 0.6057150959968567, "learning_rate": 3.579468707789284e-05, "loss": 1.2817, "step": 1446 }, { "epoch": 1.9357859531772577, "grad_norm": 0.595527708530426, "learning_rate": 3.5749662314272856e-05, "loss": 1.6711, "step": 1447 }, { "epoch": 1.937123745819398, "grad_norm": 0.5907280445098877, "learning_rate": 3.570463755065286e-05, "loss": 1.5177, "step": 1448 }, { "epoch": 1.9384615384615385, "grad_norm": 0.39169877767562866, "learning_rate": 3.565961278703287e-05, "loss": 2.0893, "step": 1449 }, { "epoch": 1.939799331103679, "grad_norm": 0.5973325967788696, "learning_rate": 3.561458802341288e-05, "loss": 1.421, "step": 1450 }, { "epoch": 1.9411371237458193, "grad_norm": 0.6645547151565552, "learning_rate": 3.556956325979289e-05, "loss": 1.3091, "step": 1451 }, { "epoch": 1.94247491638796, "grad_norm": 0.7206888198852539, "learning_rate": 3.55245384961729e-05, "loss": 1.5102, "step": 1452 }, { "epoch": 1.9438127090301003, "grad_norm": 0.5224019289016724, "learning_rate": 3.547951373255291e-05, "loss": 1.5029, "step": 1453 }, { "epoch": 1.9451505016722408, "grad_norm": 0.8365053534507751, "learning_rate": 3.5434488968932916e-05, "loss": 1.2303, "step": 1454 }, { "epoch": 1.9464882943143813, "grad_norm": 0.6158286929130554, "learning_rate": 3.5389464205312925e-05, "loss": 1.2886, "step": 1455 }, { "epoch": 1.9478260869565216, "grad_norm": 0.7675470113754272, "learning_rate": 3.534443944169293e-05, "loss": 1.0725, "step": 1456 }, { "epoch": 1.9491638795986623, "grad_norm": 1.0587267875671387, "learning_rate": 3.529941467807294e-05, "loss": 1.0416, "step": 1457 }, { "epoch": 1.9505016722408026, "grad_norm": 0.7863774299621582, "learning_rate": 3.525438991445295e-05, "loss": 1.0111, "step": 1458 }, { "epoch": 1.9518394648829431, "grad_norm": 0.47847363352775574, "learning_rate": 3.5209365150832966e-05, "loss": 1.6936, "step": 1459 }, { "epoch": 1.9531772575250836, "grad_norm": 0.5850685834884644, "learning_rate": 3.516434038721297e-05, "loss": 1.5939, "step": 1460 }, { "epoch": 1.954515050167224, "grad_norm": 0.47978919744491577, "learning_rate": 3.511931562359298e-05, "loss": 1.8567, "step": 1461 }, { "epoch": 1.9558528428093647, "grad_norm": 0.5951332449913025, "learning_rate": 3.5074290859972985e-05, "loss": 1.0255, "step": 1462 }, { "epoch": 1.957190635451505, "grad_norm": 0.8558800220489502, "learning_rate": 3.5029266096353e-05, "loss": 1.1657, "step": 1463 }, { "epoch": 1.9585284280936455, "grad_norm": 0.4947897791862488, "learning_rate": 3.4984241332733e-05, "loss": 1.6925, "step": 1464 }, { "epoch": 1.959866220735786, "grad_norm": 0.48866933584213257, "learning_rate": 3.493921656911302e-05, "loss": 1.6474, "step": 1465 }, { "epoch": 1.9612040133779263, "grad_norm": 0.6075807213783264, "learning_rate": 3.489419180549302e-05, "loss": 1.3366, "step": 1466 }, { "epoch": 1.962541806020067, "grad_norm": 0.6691599488258362, "learning_rate": 3.4849167041873035e-05, "loss": 1.3393, "step": 1467 }, { "epoch": 1.9638795986622073, "grad_norm": 0.656430721282959, "learning_rate": 3.480414227825304e-05, "loss": 1.2173, "step": 1468 }, { "epoch": 1.9652173913043478, "grad_norm": 0.540453314781189, "learning_rate": 3.475911751463305e-05, "loss": 1.3623, "step": 1469 }, { "epoch": 1.9665551839464883, "grad_norm": 0.6606508493423462, "learning_rate": 3.471409275101306e-05, "loss": 1.2194, "step": 1470 }, { "epoch": 1.9678929765886286, "grad_norm": 0.6008053421974182, "learning_rate": 3.466906798739307e-05, "loss": 1.3245, "step": 1471 }, { "epoch": 1.9692307692307693, "grad_norm": 0.7815794348716736, "learning_rate": 3.462404322377308e-05, "loss": 1.3401, "step": 1472 }, { "epoch": 1.9705685618729096, "grad_norm": 0.4332338869571686, "learning_rate": 3.4579018460153086e-05, "loss": 1.6547, "step": 1473 }, { "epoch": 1.9719063545150501, "grad_norm": 0.7275242209434509, "learning_rate": 3.4533993696533095e-05, "loss": 1.5228, "step": 1474 }, { "epoch": 1.9732441471571907, "grad_norm": 0.5694137811660767, "learning_rate": 3.44889689329131e-05, "loss": 1.3479, "step": 1475 }, { "epoch": 1.974581939799331, "grad_norm": 0.529672384262085, "learning_rate": 3.444394416929311e-05, "loss": 1.4186, "step": 1476 }, { "epoch": 1.9759197324414717, "grad_norm": 0.6662195920944214, "learning_rate": 3.439891940567313e-05, "loss": 1.3579, "step": 1477 }, { "epoch": 1.977257525083612, "grad_norm": 0.47519707679748535, "learning_rate": 3.435389464205313e-05, "loss": 1.2978, "step": 1478 }, { "epoch": 1.9785953177257525, "grad_norm": 1.7963173389434814, "learning_rate": 3.4308869878433144e-05, "loss": 1.3871, "step": 1479 }, { "epoch": 1.979933110367893, "grad_norm": 0.5129130482673645, "learning_rate": 3.4263845114813146e-05, "loss": 1.5933, "step": 1480 }, { "epoch": 1.9812709030100333, "grad_norm": 0.5007113218307495, "learning_rate": 3.421882035119316e-05, "loss": 1.1155, "step": 1481 }, { "epoch": 1.982608695652174, "grad_norm": 0.46495723724365234, "learning_rate": 3.417379558757316e-05, "loss": 1.408, "step": 1482 }, { "epoch": 1.9839464882943143, "grad_norm": 0.5745284557342529, "learning_rate": 3.412877082395318e-05, "loss": 1.6946, "step": 1483 }, { "epoch": 1.9852842809364548, "grad_norm": 0.48399198055267334, "learning_rate": 3.408374606033318e-05, "loss": 1.5874, "step": 1484 }, { "epoch": 1.9866220735785953, "grad_norm": 0.5783731937408447, "learning_rate": 3.4038721296713196e-05, "loss": 1.537, "step": 1485 }, { "epoch": 1.9879598662207358, "grad_norm": 0.6687440276145935, "learning_rate": 3.3993696533093204e-05, "loss": 1.5465, "step": 1486 }, { "epoch": 1.9892976588628764, "grad_norm": 0.5151572227478027, "learning_rate": 3.394867176947321e-05, "loss": 1.471, "step": 1487 }, { "epoch": 1.9906354515050166, "grad_norm": 0.5424696207046509, "learning_rate": 3.390364700585322e-05, "loss": 1.5837, "step": 1488 }, { "epoch": 1.9919732441471572, "grad_norm": 0.6752240061759949, "learning_rate": 3.385862224223323e-05, "loss": 1.3542, "step": 1489 }, { "epoch": 1.9933110367892977, "grad_norm": 0.9526898264884949, "learning_rate": 3.381359747861324e-05, "loss": 1.255, "step": 1490 }, { "epoch": 1.9946488294314382, "grad_norm": 0.5398581624031067, "learning_rate": 3.376857271499325e-05, "loss": 1.468, "step": 1491 }, { "epoch": 1.9959866220735787, "grad_norm": 0.5538991093635559, "learning_rate": 3.3723547951373256e-05, "loss": 1.5927, "step": 1492 }, { "epoch": 1.997324414715719, "grad_norm": 0.5554574728012085, "learning_rate": 3.3678523187753265e-05, "loss": 1.3998, "step": 1493 }, { "epoch": 1.9986622073578595, "grad_norm": 0.7692258358001709, "learning_rate": 3.363349842413327e-05, "loss": 1.0331, "step": 1494 }, { "epoch": 2.0, "grad_norm": 0.6130909323692322, "learning_rate": 3.358847366051329e-05, "loss": 1.4259, "step": 1495 }, { "epoch": 2.0013377926421403, "grad_norm": 0.8359543085098267, "learning_rate": 3.354344889689329e-05, "loss": 0.8013, "step": 1496 }, { "epoch": 2.002675585284281, "grad_norm": 0.4600941240787506, "learning_rate": 3.3498424133273306e-05, "loss": 0.3069, "step": 1497 }, { "epoch": 2.0040133779264213, "grad_norm": 0.6617846488952637, "learning_rate": 3.345339936965331e-05, "loss": 0.604, "step": 1498 }, { "epoch": 2.005351170568562, "grad_norm": 0.7170037627220154, "learning_rate": 3.340837460603332e-05, "loss": 0.7151, "step": 1499 }, { "epoch": 2.0066889632107023, "grad_norm": 0.6578344106674194, "learning_rate": 3.3363349842413325e-05, "loss": 1.1236, "step": 1500 } ], "logging_steps": 1, "max_steps": 2241, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.694569146514203e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }