{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 6992, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.635512828826904, "learning_rate": 2.9995709382151033e-05, "loss": 6.0242, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.262711524963379, "learning_rate": 2.9991418764302058e-05, "loss": 5.8131, "step": 2 }, { "epoch": 0.0, "grad_norm": 4.200323581695557, "learning_rate": 2.998712814645309e-05, "loss": 5.6225, "step": 3 }, { "epoch": 0.0, "grad_norm": 4.810133457183838, "learning_rate": 2.998283752860412e-05, "loss": 5.4304, "step": 4 }, { "epoch": 0.0, "grad_norm": 4.93181037902832, "learning_rate": 2.997854691075515e-05, "loss": 5.2684, "step": 5 }, { "epoch": 0.0, "grad_norm": 5.074693202972412, "learning_rate": 2.9974256292906177e-05, "loss": 5.0533, "step": 6 }, { "epoch": 0.0, "grad_norm": 4.967318534851074, "learning_rate": 2.996996567505721e-05, "loss": 4.9104, "step": 7 }, { "epoch": 0.0, "grad_norm": 5.453105926513672, "learning_rate": 2.9965675057208238e-05, "loss": 4.6064, "step": 8 }, { "epoch": 0.0, "grad_norm": 5.421672344207764, "learning_rate": 2.996138443935927e-05, "loss": 4.5984, "step": 9 }, { "epoch": 0.0, "grad_norm": 5.02965784072876, "learning_rate": 2.99570938215103e-05, "loss": 4.449, "step": 10 }, { "epoch": 0.0, "grad_norm": 5.782529830932617, "learning_rate": 2.9952803203661327e-05, "loss": 4.1912, "step": 11 }, { "epoch": 0.0, "grad_norm": 6.099738121032715, "learning_rate": 2.994851258581236e-05, "loss": 4.1198, "step": 12 }, { "epoch": 0.0, "grad_norm": 5.961019515991211, "learning_rate": 2.9944221967963388e-05, "loss": 4.0437, "step": 13 }, { "epoch": 0.0, "grad_norm": 10.967097282409668, "learning_rate": 2.9939931350114417e-05, "loss": 3.885, "step": 14 }, { "epoch": 0.0, "grad_norm": 8.03209114074707, "learning_rate": 2.9935640732265446e-05, "loss": 3.6933, "step": 15 }, { "epoch": 0.0, "grad_norm": 8.347334861755371, "learning_rate": 2.9931350114416478e-05, "loss": 3.859, "step": 16 }, { "epoch": 0.0, "grad_norm": 9.741003036499023, "learning_rate": 2.9927059496567503e-05, "loss": 3.4883, "step": 17 }, { "epoch": 0.01, "grad_norm": 11.666119575500488, "learning_rate": 2.9922768878718536e-05, "loss": 3.4243, "step": 18 }, { "epoch": 0.01, "grad_norm": 9.64477252960205, "learning_rate": 2.9918478260869568e-05, "loss": 3.2306, "step": 19 }, { "epoch": 0.01, "grad_norm": 22.396427154541016, "learning_rate": 2.9914187643020596e-05, "loss": 3.3432, "step": 20 }, { "epoch": 0.01, "grad_norm": 14.657906532287598, "learning_rate": 2.9909897025171625e-05, "loss": 3.2188, "step": 21 }, { "epoch": 0.01, "grad_norm": 11.172964096069336, "learning_rate": 2.9905606407322654e-05, "loss": 3.009, "step": 22 }, { "epoch": 0.01, "grad_norm": 30.494609832763672, "learning_rate": 2.9901315789473686e-05, "loss": 3.1648, "step": 23 }, { "epoch": 0.01, "grad_norm": 20.13739776611328, "learning_rate": 2.9897025171624715e-05, "loss": 2.8155, "step": 24 }, { "epoch": 0.01, "grad_norm": 10.68649959564209, "learning_rate": 2.9892734553775744e-05, "loss": 2.8085, "step": 25 }, { "epoch": 0.01, "grad_norm": 13.222569465637207, "learning_rate": 2.9888443935926776e-05, "loss": 2.8157, "step": 26 }, { "epoch": 0.01, "grad_norm": 21.290855407714844, "learning_rate": 2.9884153318077805e-05, "loss": 2.6447, "step": 27 }, { "epoch": 0.01, "grad_norm": 20.91508674621582, "learning_rate": 2.9879862700228833e-05, "loss": 2.6235, "step": 28 }, { "epoch": 0.01, "grad_norm": 14.453760147094727, "learning_rate": 2.9875572082379862e-05, "loss": 2.6736, "step": 29 }, { "epoch": 0.01, "grad_norm": 15.384343147277832, "learning_rate": 2.9871281464530894e-05, "loss": 2.6873, "step": 30 }, { "epoch": 0.01, "grad_norm": 11.872452735900879, "learning_rate": 2.9866990846681923e-05, "loss": 2.6731, "step": 31 }, { "epoch": 0.01, "grad_norm": 7.314385890960693, "learning_rate": 2.9862700228832952e-05, "loss": 2.2646, "step": 32 }, { "epoch": 0.01, "grad_norm": 8.31044864654541, "learning_rate": 2.985840961098398e-05, "loss": 2.5389, "step": 33 }, { "epoch": 0.01, "grad_norm": 11.034132957458496, "learning_rate": 2.9854118993135013e-05, "loss": 2.2643, "step": 34 }, { "epoch": 0.01, "grad_norm": 10.82011604309082, "learning_rate": 2.9849828375286045e-05, "loss": 2.0751, "step": 35 }, { "epoch": 0.01, "grad_norm": 8.5283842086792, "learning_rate": 2.984553775743707e-05, "loss": 2.4416, "step": 36 }, { "epoch": 0.01, "grad_norm": 8.773575782775879, "learning_rate": 2.9841247139588102e-05, "loss": 2.417, "step": 37 }, { "epoch": 0.01, "grad_norm": 17.709125518798828, "learning_rate": 2.983695652173913e-05, "loss": 2.5134, "step": 38 }, { "epoch": 0.01, "grad_norm": 11.986088752746582, "learning_rate": 2.9832665903890163e-05, "loss": 2.5418, "step": 39 }, { "epoch": 0.01, "grad_norm": 8.830208778381348, "learning_rate": 2.982837528604119e-05, "loss": 2.4333, "step": 40 }, { "epoch": 0.01, "grad_norm": 8.927085876464844, "learning_rate": 2.982408466819222e-05, "loss": 2.3037, "step": 41 }, { "epoch": 0.01, "grad_norm": 9.560665130615234, "learning_rate": 2.981979405034325e-05, "loss": 2.5077, "step": 42 }, { "epoch": 0.01, "grad_norm": 9.039464950561523, "learning_rate": 2.981550343249428e-05, "loss": 2.2192, "step": 43 }, { "epoch": 0.01, "grad_norm": 9.975371360778809, "learning_rate": 2.981121281464531e-05, "loss": 2.6494, "step": 44 }, { "epoch": 0.01, "grad_norm": 8.994516372680664, "learning_rate": 2.980692219679634e-05, "loss": 2.2144, "step": 45 }, { "epoch": 0.01, "grad_norm": 9.323376655578613, "learning_rate": 2.980263157894737e-05, "loss": 2.2742, "step": 46 }, { "epoch": 0.01, "grad_norm": 10.652460098266602, "learning_rate": 2.9798340961098397e-05, "loss": 2.1642, "step": 47 }, { "epoch": 0.01, "grad_norm": 10.230737686157227, "learning_rate": 2.979405034324943e-05, "loss": 2.2399, "step": 48 }, { "epoch": 0.01, "grad_norm": 9.706355094909668, "learning_rate": 2.9789759725400458e-05, "loss": 2.2218, "step": 49 }, { "epoch": 0.01, "grad_norm": 9.502559661865234, "learning_rate": 2.978546910755149e-05, "loss": 2.0737, "step": 50 }, { "epoch": 0.01, "grad_norm": 11.392340660095215, "learning_rate": 2.9781178489702515e-05, "loss": 2.5131, "step": 51 }, { "epoch": 0.01, "grad_norm": 14.368525505065918, "learning_rate": 2.9776887871853548e-05, "loss": 1.8392, "step": 52 }, { "epoch": 0.02, "grad_norm": 12.000808715820312, "learning_rate": 2.977259725400458e-05, "loss": 2.0387, "step": 53 }, { "epoch": 0.02, "grad_norm": 10.566619873046875, "learning_rate": 2.9768306636155605e-05, "loss": 2.2464, "step": 54 }, { "epoch": 0.02, "grad_norm": 10.068693161010742, "learning_rate": 2.9764016018306637e-05, "loss": 2.0293, "step": 55 }, { "epoch": 0.02, "grad_norm": 12.956381797790527, "learning_rate": 2.9759725400457666e-05, "loss": 2.3687, "step": 56 }, { "epoch": 0.02, "grad_norm": 9.81608772277832, "learning_rate": 2.9755434782608698e-05, "loss": 2.377, "step": 57 }, { "epoch": 0.02, "grad_norm": 11.117986679077148, "learning_rate": 2.9751144164759724e-05, "loss": 2.439, "step": 58 }, { "epoch": 0.02, "grad_norm": 9.488683700561523, "learning_rate": 2.9746853546910756e-05, "loss": 1.8631, "step": 59 }, { "epoch": 0.02, "grad_norm": 9.672380447387695, "learning_rate": 2.9742562929061788e-05, "loss": 2.0851, "step": 60 }, { "epoch": 0.02, "grad_norm": 12.200733184814453, "learning_rate": 2.9738272311212817e-05, "loss": 2.5484, "step": 61 }, { "epoch": 0.02, "grad_norm": 9.255772590637207, "learning_rate": 2.9733981693363845e-05, "loss": 2.1288, "step": 62 }, { "epoch": 0.02, "grad_norm": 10.493276596069336, "learning_rate": 2.9729691075514874e-05, "loss": 2.2297, "step": 63 }, { "epoch": 0.02, "grad_norm": 9.244399070739746, "learning_rate": 2.9725400457665906e-05, "loss": 2.4804, "step": 64 }, { "epoch": 0.02, "grad_norm": 9.426789283752441, "learning_rate": 2.9721109839816935e-05, "loss": 2.1489, "step": 65 }, { "epoch": 0.02, "grad_norm": 11.235112190246582, "learning_rate": 2.9716819221967964e-05, "loss": 2.0655, "step": 66 }, { "epoch": 0.02, "grad_norm": 9.057259559631348, "learning_rate": 2.9712528604118993e-05, "loss": 2.2147, "step": 67 }, { "epoch": 0.02, "grad_norm": 8.027843475341797, "learning_rate": 2.9708237986270025e-05, "loss": 1.9357, "step": 68 }, { "epoch": 0.02, "grad_norm": 9.6011962890625, "learning_rate": 2.9703947368421054e-05, "loss": 2.1391, "step": 69 }, { "epoch": 0.02, "grad_norm": 9.837102890014648, "learning_rate": 2.9699656750572082e-05, "loss": 2.1937, "step": 70 }, { "epoch": 0.02, "grad_norm": 9.933879852294922, "learning_rate": 2.9695366132723114e-05, "loss": 2.4057, "step": 71 }, { "epoch": 0.02, "grad_norm": 9.57646369934082, "learning_rate": 2.9691075514874143e-05, "loss": 2.2708, "step": 72 }, { "epoch": 0.02, "grad_norm": 8.571762084960938, "learning_rate": 2.9686784897025172e-05, "loss": 1.9102, "step": 73 }, { "epoch": 0.02, "grad_norm": 9.619646072387695, "learning_rate": 2.96824942791762e-05, "loss": 1.9027, "step": 74 }, { "epoch": 0.02, "grad_norm": 8.471129417419434, "learning_rate": 2.9678203661327233e-05, "loss": 2.2281, "step": 75 }, { "epoch": 0.02, "grad_norm": 12.377062797546387, "learning_rate": 2.967391304347826e-05, "loss": 2.3651, "step": 76 }, { "epoch": 0.02, "grad_norm": 9.500938415527344, "learning_rate": 2.966962242562929e-05, "loss": 2.1969, "step": 77 }, { "epoch": 0.02, "grad_norm": 8.627216339111328, "learning_rate": 2.9665331807780323e-05, "loss": 2.1825, "step": 78 }, { "epoch": 0.02, "grad_norm": 10.074145317077637, "learning_rate": 2.966104118993135e-05, "loss": 2.0536, "step": 79 }, { "epoch": 0.02, "grad_norm": 9.057881355285645, "learning_rate": 2.965675057208238e-05, "loss": 1.9442, "step": 80 }, { "epoch": 0.02, "grad_norm": 8.898153305053711, "learning_rate": 2.965245995423341e-05, "loss": 1.9726, "step": 81 }, { "epoch": 0.02, "grad_norm": 8.749974250793457, "learning_rate": 2.964816933638444e-05, "loss": 1.8927, "step": 82 }, { "epoch": 0.02, "grad_norm": 9.17689037322998, "learning_rate": 2.964387871853547e-05, "loss": 2.2203, "step": 83 }, { "epoch": 0.02, "grad_norm": 8.687392234802246, "learning_rate": 2.96395881006865e-05, "loss": 2.314, "step": 84 }, { "epoch": 0.02, "grad_norm": 9.856606483459473, "learning_rate": 2.9635297482837527e-05, "loss": 1.8409, "step": 85 }, { "epoch": 0.02, "grad_norm": 11.169034004211426, "learning_rate": 2.963100686498856e-05, "loss": 2.1135, "step": 86 }, { "epoch": 0.02, "grad_norm": 12.441506385803223, "learning_rate": 2.962671624713959e-05, "loss": 1.8989, "step": 87 }, { "epoch": 0.03, "grad_norm": 10.590865135192871, "learning_rate": 2.9622425629290617e-05, "loss": 2.1097, "step": 88 }, { "epoch": 0.03, "grad_norm": 10.905450820922852, "learning_rate": 2.961813501144165e-05, "loss": 1.9331, "step": 89 }, { "epoch": 0.03, "grad_norm": 10.653109550476074, "learning_rate": 2.9613844393592678e-05, "loss": 2.4247, "step": 90 }, { "epoch": 0.03, "grad_norm": 10.621190071105957, "learning_rate": 2.960955377574371e-05, "loss": 2.0484, "step": 91 }, { "epoch": 0.03, "grad_norm": 9.247668266296387, "learning_rate": 2.9605263157894735e-05, "loss": 1.8694, "step": 92 }, { "epoch": 0.03, "grad_norm": 11.075560569763184, "learning_rate": 2.9600972540045768e-05, "loss": 1.7545, "step": 93 }, { "epoch": 0.03, "grad_norm": 9.714777946472168, "learning_rate": 2.9596681922196796e-05, "loss": 1.961, "step": 94 }, { "epoch": 0.03, "grad_norm": 8.880008697509766, "learning_rate": 2.9592391304347825e-05, "loss": 1.727, "step": 95 }, { "epoch": 0.03, "grad_norm": 9.815738677978516, "learning_rate": 2.9588100686498857e-05, "loss": 1.6125, "step": 96 }, { "epoch": 0.03, "grad_norm": 10.082423210144043, "learning_rate": 2.9583810068649886e-05, "loss": 2.1206, "step": 97 }, { "epoch": 0.03, "grad_norm": 10.187947273254395, "learning_rate": 2.9579519450800918e-05, "loss": 2.1751, "step": 98 }, { "epoch": 0.03, "grad_norm": 9.888360977172852, "learning_rate": 2.9575228832951944e-05, "loss": 1.8519, "step": 99 }, { "epoch": 0.03, "grad_norm": 9.402551651000977, "learning_rate": 2.9570938215102976e-05, "loss": 1.8692, "step": 100 }, { "epoch": 0.03, "grad_norm": 8.821449279785156, "learning_rate": 2.9566647597254005e-05, "loss": 2.0974, "step": 101 }, { "epoch": 0.03, "grad_norm": 9.82812213897705, "learning_rate": 2.9562356979405037e-05, "loss": 2.039, "step": 102 }, { "epoch": 0.03, "grad_norm": 8.911670684814453, "learning_rate": 2.9558066361556065e-05, "loss": 1.8222, "step": 103 }, { "epoch": 0.03, "grad_norm": 8.715346336364746, "learning_rate": 2.9553775743707094e-05, "loss": 1.9212, "step": 104 }, { "epoch": 0.03, "grad_norm": 8.142864227294922, "learning_rate": 2.9549485125858126e-05, "loss": 1.8101, "step": 105 }, { "epoch": 0.03, "grad_norm": 8.892821311950684, "learning_rate": 2.9545194508009152e-05, "loss": 1.7335, "step": 106 }, { "epoch": 0.03, "grad_norm": 9.593453407287598, "learning_rate": 2.9540903890160184e-05, "loss": 1.8757, "step": 107 }, { "epoch": 0.03, "grad_norm": 9.683908462524414, "learning_rate": 2.9536613272311213e-05, "loss": 1.7976, "step": 108 }, { "epoch": 0.03, "grad_norm": 9.749051094055176, "learning_rate": 2.9532322654462245e-05, "loss": 1.7942, "step": 109 }, { "epoch": 0.03, "grad_norm": 9.038222312927246, "learning_rate": 2.952803203661327e-05, "loss": 1.9111, "step": 110 }, { "epoch": 0.03, "grad_norm": 9.059216499328613, "learning_rate": 2.9523741418764302e-05, "loss": 1.7355, "step": 111 }, { "epoch": 0.03, "grad_norm": 12.66389274597168, "learning_rate": 2.9519450800915335e-05, "loss": 1.9505, "step": 112 }, { "epoch": 0.03, "grad_norm": 9.865212440490723, "learning_rate": 2.9515160183066363e-05, "loss": 1.878, "step": 113 }, { "epoch": 0.03, "grad_norm": 11.242400169372559, "learning_rate": 2.9510869565217392e-05, "loss": 1.8862, "step": 114 }, { "epoch": 0.03, "grad_norm": 10.39486312866211, "learning_rate": 2.950657894736842e-05, "loss": 2.164, "step": 115 }, { "epoch": 0.03, "grad_norm": 7.808732986450195, "learning_rate": 2.9502288329519453e-05, "loss": 1.5214, "step": 116 }, { "epoch": 0.03, "grad_norm": 8.637123107910156, "learning_rate": 2.9497997711670482e-05, "loss": 1.8034, "step": 117 }, { "epoch": 0.03, "grad_norm": 8.61000919342041, "learning_rate": 2.949370709382151e-05, "loss": 1.8804, "step": 118 }, { "epoch": 0.03, "grad_norm": 9.153660774230957, "learning_rate": 2.948941647597254e-05, "loss": 1.757, "step": 119 }, { "epoch": 0.03, "grad_norm": 11.368372917175293, "learning_rate": 2.948512585812357e-05, "loss": 2.169, "step": 120 }, { "epoch": 0.03, "grad_norm": 9.239725112915039, "learning_rate": 2.94808352402746e-05, "loss": 1.7511, "step": 121 }, { "epoch": 0.03, "grad_norm": 9.863459587097168, "learning_rate": 2.947654462242563e-05, "loss": 1.6704, "step": 122 }, { "epoch": 0.04, "grad_norm": 9.673696517944336, "learning_rate": 2.947225400457666e-05, "loss": 2.0968, "step": 123 }, { "epoch": 0.04, "grad_norm": 9.66123104095459, "learning_rate": 2.946796338672769e-05, "loss": 2.0148, "step": 124 }, { "epoch": 0.04, "grad_norm": 11.710880279541016, "learning_rate": 2.946367276887872e-05, "loss": 1.8219, "step": 125 }, { "epoch": 0.04, "grad_norm": 13.148943901062012, "learning_rate": 2.9459382151029747e-05, "loss": 2.0196, "step": 126 }, { "epoch": 0.04, "grad_norm": 10.845335960388184, "learning_rate": 2.945509153318078e-05, "loss": 1.9038, "step": 127 }, { "epoch": 0.04, "grad_norm": 9.211142539978027, "learning_rate": 2.945080091533181e-05, "loss": 1.4489, "step": 128 }, { "epoch": 0.04, "grad_norm": 13.306626319885254, "learning_rate": 2.9446510297482837e-05, "loss": 2.1193, "step": 129 }, { "epoch": 0.04, "grad_norm": 9.279850959777832, "learning_rate": 2.944221967963387e-05, "loss": 1.6406, "step": 130 }, { "epoch": 0.04, "grad_norm": 10.513463020324707, "learning_rate": 2.9437929061784898e-05, "loss": 1.4844, "step": 131 }, { "epoch": 0.04, "grad_norm": 10.108253479003906, "learning_rate": 2.9433638443935927e-05, "loss": 1.935, "step": 132 }, { "epoch": 0.04, "grad_norm": 10.302982330322266, "learning_rate": 2.9429347826086956e-05, "loss": 1.9848, "step": 133 }, { "epoch": 0.04, "grad_norm": 11.062186241149902, "learning_rate": 2.9425057208237988e-05, "loss": 1.8631, "step": 134 }, { "epoch": 0.04, "grad_norm": 10.099900245666504, "learning_rate": 2.9420766590389017e-05, "loss": 1.9532, "step": 135 }, { "epoch": 0.04, "grad_norm": 10.016757011413574, "learning_rate": 2.9416475972540045e-05, "loss": 2.2666, "step": 136 }, { "epoch": 0.04, "grad_norm": 10.943782806396484, "learning_rate": 2.9412185354691074e-05, "loss": 1.8332, "step": 137 }, { "epoch": 0.04, "grad_norm": 10.009693145751953, "learning_rate": 2.9407894736842106e-05, "loss": 1.7901, "step": 138 }, { "epoch": 0.04, "grad_norm": 8.308258056640625, "learning_rate": 2.940360411899314e-05, "loss": 1.5903, "step": 139 }, { "epoch": 0.04, "grad_norm": 8.762723922729492, "learning_rate": 2.9399313501144164e-05, "loss": 1.933, "step": 140 }, { "epoch": 0.04, "grad_norm": 9.699069023132324, "learning_rate": 2.9395022883295196e-05, "loss": 1.631, "step": 141 }, { "epoch": 0.04, "grad_norm": 9.041118621826172, "learning_rate": 2.9390732265446225e-05, "loss": 1.4745, "step": 142 }, { "epoch": 0.04, "grad_norm": 9.30391788482666, "learning_rate": 2.9386441647597257e-05, "loss": 1.8366, "step": 143 }, { "epoch": 0.04, "grad_norm": 12.04333209991455, "learning_rate": 2.9382151029748282e-05, "loss": 1.8336, "step": 144 }, { "epoch": 0.04, "grad_norm": 10.09054946899414, "learning_rate": 2.9377860411899314e-05, "loss": 1.7866, "step": 145 }, { "epoch": 0.04, "grad_norm": 13.490518569946289, "learning_rate": 2.9373569794050347e-05, "loss": 2.1837, "step": 146 }, { "epoch": 0.04, "grad_norm": 9.580062866210938, "learning_rate": 2.9369279176201372e-05, "loss": 1.9534, "step": 147 }, { "epoch": 0.04, "grad_norm": 9.711172103881836, "learning_rate": 2.9364988558352404e-05, "loss": 1.7171, "step": 148 }, { "epoch": 0.04, "grad_norm": 10.504125595092773, "learning_rate": 2.9360697940503433e-05, "loss": 1.5971, "step": 149 }, { "epoch": 0.04, "grad_norm": 10.39430046081543, "learning_rate": 2.9356407322654465e-05, "loss": 1.8896, "step": 150 }, { "epoch": 0.04, "grad_norm": 8.143174171447754, "learning_rate": 2.935211670480549e-05, "loss": 1.7462, "step": 151 }, { "epoch": 0.04, "grad_norm": 9.526236534118652, "learning_rate": 2.9347826086956523e-05, "loss": 2.1149, "step": 152 }, { "epoch": 0.04, "grad_norm": 8.047273635864258, "learning_rate": 2.934353546910755e-05, "loss": 1.7804, "step": 153 }, { "epoch": 0.04, "grad_norm": 9.352036476135254, "learning_rate": 2.9339244851258583e-05, "loss": 1.6782, "step": 154 }, { "epoch": 0.04, "grad_norm": 8.56297492980957, "learning_rate": 2.9334954233409612e-05, "loss": 1.9109, "step": 155 }, { "epoch": 0.04, "grad_norm": 9.026894569396973, "learning_rate": 2.933066361556064e-05, "loss": 1.9137, "step": 156 }, { "epoch": 0.04, "grad_norm": 10.315428733825684, "learning_rate": 2.9326372997711673e-05, "loss": 1.9549, "step": 157 }, { "epoch": 0.05, "grad_norm": 10.20797348022461, "learning_rate": 2.93220823798627e-05, "loss": 1.947, "step": 158 }, { "epoch": 0.05, "grad_norm": 9.273527145385742, "learning_rate": 2.931779176201373e-05, "loss": 1.8877, "step": 159 }, { "epoch": 0.05, "grad_norm": 10.411182403564453, "learning_rate": 2.931350114416476e-05, "loss": 2.0504, "step": 160 }, { "epoch": 0.05, "grad_norm": 9.119146347045898, "learning_rate": 2.930921052631579e-05, "loss": 1.782, "step": 161 }, { "epoch": 0.05, "grad_norm": 9.15261173248291, "learning_rate": 2.9304919908466817e-05, "loss": 1.5476, "step": 162 }, { "epoch": 0.05, "grad_norm": 8.623429298400879, "learning_rate": 2.930062929061785e-05, "loss": 1.5317, "step": 163 }, { "epoch": 0.05, "grad_norm": 9.154732704162598, "learning_rate": 2.929633867276888e-05, "loss": 1.6405, "step": 164 }, { "epoch": 0.05, "grad_norm": 11.059431076049805, "learning_rate": 2.929204805491991e-05, "loss": 1.8032, "step": 165 }, { "epoch": 0.05, "grad_norm": 9.126934051513672, "learning_rate": 2.928775743707094e-05, "loss": 1.7491, "step": 166 }, { "epoch": 0.05, "grad_norm": 10.09865665435791, "learning_rate": 2.9283466819221968e-05, "loss": 1.8666, "step": 167 }, { "epoch": 0.05, "grad_norm": 10.668212890625, "learning_rate": 2.9279176201373e-05, "loss": 1.6601, "step": 168 }, { "epoch": 0.05, "grad_norm": 8.207748413085938, "learning_rate": 2.927488558352403e-05, "loss": 1.6901, "step": 169 }, { "epoch": 0.05, "grad_norm": 9.090766906738281, "learning_rate": 2.9270594965675057e-05, "loss": 1.809, "step": 170 }, { "epoch": 0.05, "grad_norm": 8.604634284973145, "learning_rate": 2.9266304347826086e-05, "loss": 1.5706, "step": 171 }, { "epoch": 0.05, "grad_norm": 8.801182746887207, "learning_rate": 2.9262013729977118e-05, "loss": 1.6983, "step": 172 }, { "epoch": 0.05, "grad_norm": 9.632198333740234, "learning_rate": 2.9257723112128147e-05, "loss": 1.6291, "step": 173 }, { "epoch": 0.05, "grad_norm": 7.52984094619751, "learning_rate": 2.9253432494279176e-05, "loss": 1.6256, "step": 174 }, { "epoch": 0.05, "grad_norm": 8.251009941101074, "learning_rate": 2.9249141876430208e-05, "loss": 1.4832, "step": 175 }, { "epoch": 0.05, "grad_norm": 9.106322288513184, "learning_rate": 2.9244851258581237e-05, "loss": 1.7677, "step": 176 }, { "epoch": 0.05, "grad_norm": 10.289727210998535, "learning_rate": 2.9240560640732265e-05, "loss": 1.3136, "step": 177 }, { "epoch": 0.05, "grad_norm": 9.70762825012207, "learning_rate": 2.9236270022883294e-05, "loss": 1.9693, "step": 178 }, { "epoch": 0.05, "grad_norm": 9.674822807312012, "learning_rate": 2.9231979405034326e-05, "loss": 1.9052, "step": 179 }, { "epoch": 0.05, "grad_norm": 7.9198899269104, "learning_rate": 2.922768878718536e-05, "loss": 1.299, "step": 180 }, { "epoch": 0.05, "grad_norm": 8.938362121582031, "learning_rate": 2.9223398169336384e-05, "loss": 1.724, "step": 181 }, { "epoch": 0.05, "grad_norm": 10.080430030822754, "learning_rate": 2.9219107551487416e-05, "loss": 1.8977, "step": 182 }, { "epoch": 0.05, "grad_norm": 11.959859848022461, "learning_rate": 2.9214816933638445e-05, "loss": 2.1547, "step": 183 }, { "epoch": 0.05, "grad_norm": 9.7039794921875, "learning_rate": 2.9210526315789474e-05, "loss": 2.1874, "step": 184 }, { "epoch": 0.05, "grad_norm": 11.266067504882812, "learning_rate": 2.9206235697940502e-05, "loss": 1.5548, "step": 185 }, { "epoch": 0.05, "grad_norm": 9.631067276000977, "learning_rate": 2.9201945080091535e-05, "loss": 1.9972, "step": 186 }, { "epoch": 0.05, "grad_norm": 9.844793319702148, "learning_rate": 2.9197654462242563e-05, "loss": 1.6902, "step": 187 }, { "epoch": 0.05, "grad_norm": 8.674718856811523, "learning_rate": 2.9193363844393592e-05, "loss": 1.5891, "step": 188 }, { "epoch": 0.05, "grad_norm": 8.092727661132812, "learning_rate": 2.9189073226544624e-05, "loss": 1.8592, "step": 189 }, { "epoch": 0.05, "grad_norm": 8.948922157287598, "learning_rate": 2.9184782608695653e-05, "loss": 1.5874, "step": 190 }, { "epoch": 0.05, "grad_norm": 9.707277297973633, "learning_rate": 2.9180491990846685e-05, "loss": 2.122, "step": 191 }, { "epoch": 0.05, "grad_norm": 8.720427513122559, "learning_rate": 2.917620137299771e-05, "loss": 1.9793, "step": 192 }, { "epoch": 0.06, "grad_norm": 10.036628723144531, "learning_rate": 2.9171910755148743e-05, "loss": 2.1604, "step": 193 }, { "epoch": 0.06, "grad_norm": 8.703857421875, "learning_rate": 2.916762013729977e-05, "loss": 1.735, "step": 194 }, { "epoch": 0.06, "grad_norm": 9.382275581359863, "learning_rate": 2.9163329519450804e-05, "loss": 1.3775, "step": 195 }, { "epoch": 0.06, "grad_norm": 10.020468711853027, "learning_rate": 2.915903890160183e-05, "loss": 1.5763, "step": 196 }, { "epoch": 0.06, "grad_norm": 9.866108894348145, "learning_rate": 2.915474828375286e-05, "loss": 1.6297, "step": 197 }, { "epoch": 0.06, "grad_norm": 7.478298664093018, "learning_rate": 2.9150457665903893e-05, "loss": 1.4512, "step": 198 }, { "epoch": 0.06, "grad_norm": 9.695619583129883, "learning_rate": 2.914616704805492e-05, "loss": 2.2167, "step": 199 }, { "epoch": 0.06, "grad_norm": 9.859832763671875, "learning_rate": 2.914187643020595e-05, "loss": 1.7297, "step": 200 }, { "epoch": 0.06, "grad_norm": 9.221556663513184, "learning_rate": 2.913758581235698e-05, "loss": 1.4514, "step": 201 }, { "epoch": 0.06, "grad_norm": 8.9334135055542, "learning_rate": 2.9133295194508012e-05, "loss": 1.8826, "step": 202 }, { "epoch": 0.06, "grad_norm": 8.83910846710205, "learning_rate": 2.9129004576659037e-05, "loss": 1.6077, "step": 203 }, { "epoch": 0.06, "grad_norm": 9.432626724243164, "learning_rate": 2.912471395881007e-05, "loss": 1.4681, "step": 204 }, { "epoch": 0.06, "grad_norm": 10.136731147766113, "learning_rate": 2.9120423340961098e-05, "loss": 1.6358, "step": 205 }, { "epoch": 0.06, "grad_norm": 9.665613174438477, "learning_rate": 2.911613272311213e-05, "loss": 1.7486, "step": 206 }, { "epoch": 0.06, "grad_norm": 8.559819221496582, "learning_rate": 2.911184210526316e-05, "loss": 1.5329, "step": 207 }, { "epoch": 0.06, "grad_norm": 9.54000473022461, "learning_rate": 2.9107551487414188e-05, "loss": 1.7634, "step": 208 }, { "epoch": 0.06, "grad_norm": 9.584062576293945, "learning_rate": 2.910326086956522e-05, "loss": 1.6295, "step": 209 }, { "epoch": 0.06, "grad_norm": 15.094372749328613, "learning_rate": 2.909897025171625e-05, "loss": 1.7466, "step": 210 }, { "epoch": 0.06, "grad_norm": 11.30350112915039, "learning_rate": 2.9094679633867277e-05, "loss": 1.8303, "step": 211 }, { "epoch": 0.06, "grad_norm": 11.015275955200195, "learning_rate": 2.9090389016018306e-05, "loss": 2.1698, "step": 212 }, { "epoch": 0.06, "grad_norm": 8.419334411621094, "learning_rate": 2.908609839816934e-05, "loss": 1.6318, "step": 213 }, { "epoch": 0.06, "grad_norm": 9.208695411682129, "learning_rate": 2.9081807780320364e-05, "loss": 1.4457, "step": 214 }, { "epoch": 0.06, "grad_norm": 11.84401798248291, "learning_rate": 2.9077517162471396e-05, "loss": 1.888, "step": 215 }, { "epoch": 0.06, "grad_norm": 10.923035621643066, "learning_rate": 2.9073226544622428e-05, "loss": 1.5284, "step": 216 }, { "epoch": 0.06, "grad_norm": 9.151190757751465, "learning_rate": 2.9068935926773457e-05, "loss": 1.7259, "step": 217 }, { "epoch": 0.06, "grad_norm": 10.143967628479004, "learning_rate": 2.9064645308924486e-05, "loss": 1.5389, "step": 218 }, { "epoch": 0.06, "grad_norm": 8.866828918457031, "learning_rate": 2.9060354691075514e-05, "loss": 1.7653, "step": 219 }, { "epoch": 0.06, "grad_norm": 9.750387191772461, "learning_rate": 2.9056064073226547e-05, "loss": 1.9439, "step": 220 }, { "epoch": 0.06, "grad_norm": 8.95814323425293, "learning_rate": 2.9051773455377575e-05, "loss": 1.7061, "step": 221 }, { "epoch": 0.06, "grad_norm": 8.123912811279297, "learning_rate": 2.9047482837528604e-05, "loss": 1.8644, "step": 222 }, { "epoch": 0.06, "grad_norm": 8.334152221679688, "learning_rate": 2.9043192219679636e-05, "loss": 1.7821, "step": 223 }, { "epoch": 0.06, "grad_norm": 8.544561386108398, "learning_rate": 2.9038901601830665e-05, "loss": 1.6721, "step": 224 }, { "epoch": 0.06, "grad_norm": 10.224461555480957, "learning_rate": 2.9034610983981694e-05, "loss": 1.3896, "step": 225 }, { "epoch": 0.06, "grad_norm": 9.83475112915039, "learning_rate": 2.9030320366132723e-05, "loss": 1.8461, "step": 226 }, { "epoch": 0.06, "grad_norm": 8.36566162109375, "learning_rate": 2.9026029748283755e-05, "loss": 1.8513, "step": 227 }, { "epoch": 0.07, "grad_norm": 11.508695602416992, "learning_rate": 2.9021739130434783e-05, "loss": 1.4862, "step": 228 }, { "epoch": 0.07, "grad_norm": 12.102386474609375, "learning_rate": 2.9017448512585812e-05, "loss": 1.9282, "step": 229 }, { "epoch": 0.07, "grad_norm": 9.708913803100586, "learning_rate": 2.901315789473684e-05, "loss": 1.4771, "step": 230 }, { "epoch": 0.07, "grad_norm": 9.158058166503906, "learning_rate": 2.9008867276887873e-05, "loss": 1.5567, "step": 231 }, { "epoch": 0.07, "grad_norm": 7.920383930206299, "learning_rate": 2.9004576659038905e-05, "loss": 1.7236, "step": 232 }, { "epoch": 0.07, "grad_norm": 10.36752986907959, "learning_rate": 2.900028604118993e-05, "loss": 1.3592, "step": 233 }, { "epoch": 0.07, "grad_norm": 11.605936050415039, "learning_rate": 2.8995995423340963e-05, "loss": 1.6096, "step": 234 }, { "epoch": 0.07, "grad_norm": 11.090018272399902, "learning_rate": 2.899170480549199e-05, "loss": 1.9197, "step": 235 }, { "epoch": 0.07, "grad_norm": 11.563491821289062, "learning_rate": 2.8987414187643024e-05, "loss": 1.7927, "step": 236 }, { "epoch": 0.07, "grad_norm": 11.237104415893555, "learning_rate": 2.898312356979405e-05, "loss": 2.0423, "step": 237 }, { "epoch": 0.07, "grad_norm": 12.092026710510254, "learning_rate": 2.897883295194508e-05, "loss": 1.6959, "step": 238 }, { "epoch": 0.07, "grad_norm": 8.514479637145996, "learning_rate": 2.897454233409611e-05, "loss": 1.4793, "step": 239 }, { "epoch": 0.07, "grad_norm": 9.548802375793457, "learning_rate": 2.897025171624714e-05, "loss": 1.7667, "step": 240 }, { "epoch": 0.07, "grad_norm": 8.590652465820312, "learning_rate": 2.896596109839817e-05, "loss": 1.9237, "step": 241 }, { "epoch": 0.07, "grad_norm": 9.465312957763672, "learning_rate": 2.89616704805492e-05, "loss": 1.4994, "step": 242 }, { "epoch": 0.07, "grad_norm": 11.648931503295898, "learning_rate": 2.8957379862700232e-05, "loss": 2.0277, "step": 243 }, { "epoch": 0.07, "grad_norm": 10.328200340270996, "learning_rate": 2.8953089244851257e-05, "loss": 1.9486, "step": 244 }, { "epoch": 0.07, "grad_norm": 8.387717247009277, "learning_rate": 2.894879862700229e-05, "loss": 1.6182, "step": 245 }, { "epoch": 0.07, "grad_norm": 10.021469116210938, "learning_rate": 2.8944508009153318e-05, "loss": 1.6038, "step": 246 }, { "epoch": 0.07, "grad_norm": 14.117218017578125, "learning_rate": 2.894021739130435e-05, "loss": 1.7402, "step": 247 }, { "epoch": 0.07, "grad_norm": 9.381802558898926, "learning_rate": 2.8935926773455376e-05, "loss": 1.7406, "step": 248 }, { "epoch": 0.07, "grad_norm": 9.9871826171875, "learning_rate": 2.8931636155606408e-05, "loss": 1.7055, "step": 249 }, { "epoch": 0.07, "grad_norm": 9.729340553283691, "learning_rate": 2.892734553775744e-05, "loss": 1.6221, "step": 250 }, { "epoch": 0.07, "grad_norm": 8.504460334777832, "learning_rate": 2.8923054919908465e-05, "loss": 1.7229, "step": 251 }, { "epoch": 0.07, "grad_norm": 7.916970252990723, "learning_rate": 2.8918764302059498e-05, "loss": 1.6216, "step": 252 }, { "epoch": 0.07, "grad_norm": 12.042543411254883, "learning_rate": 2.8914473684210526e-05, "loss": 2.0739, "step": 253 }, { "epoch": 0.07, "grad_norm": 8.228246688842773, "learning_rate": 2.891018306636156e-05, "loss": 1.5692, "step": 254 }, { "epoch": 0.07, "grad_norm": 10.877127647399902, "learning_rate": 2.8905892448512584e-05, "loss": 1.7809, "step": 255 }, { "epoch": 0.07, "grad_norm": 8.196961402893066, "learning_rate": 2.8901601830663616e-05, "loss": 1.421, "step": 256 }, { "epoch": 0.07, "grad_norm": 9.786067962646484, "learning_rate": 2.8897311212814645e-05, "loss": 2.0212, "step": 257 }, { "epoch": 0.07, "grad_norm": 9.010289192199707, "learning_rate": 2.8893020594965677e-05, "loss": 1.58, "step": 258 }, { "epoch": 0.07, "grad_norm": 9.008468627929688, "learning_rate": 2.8888729977116706e-05, "loss": 1.931, "step": 259 }, { "epoch": 0.07, "grad_norm": 8.551482200622559, "learning_rate": 2.8884439359267735e-05, "loss": 1.5659, "step": 260 }, { "epoch": 0.07, "grad_norm": 9.659241676330566, "learning_rate": 2.8880148741418767e-05, "loss": 1.8359, "step": 261 }, { "epoch": 0.07, "grad_norm": 8.562986373901367, "learning_rate": 2.8875858123569795e-05, "loss": 1.5877, "step": 262 }, { "epoch": 0.08, "grad_norm": 7.784165382385254, "learning_rate": 2.8871567505720824e-05, "loss": 1.4274, "step": 263 }, { "epoch": 0.08, "grad_norm": 10.632110595703125, "learning_rate": 2.8867276887871853e-05, "loss": 1.8839, "step": 264 }, { "epoch": 0.08, "grad_norm": 7.901241779327393, "learning_rate": 2.8862986270022885e-05, "loss": 1.3047, "step": 265 }, { "epoch": 0.08, "grad_norm": 8.041234970092773, "learning_rate": 2.8858695652173914e-05, "loss": 1.5787, "step": 266 }, { "epoch": 0.08, "grad_norm": 8.147648811340332, "learning_rate": 2.8854405034324943e-05, "loss": 1.3792, "step": 267 }, { "epoch": 0.08, "grad_norm": 8.332252502441406, "learning_rate": 2.8850114416475975e-05, "loss": 2.0235, "step": 268 }, { "epoch": 0.08, "grad_norm": 7.136639595031738, "learning_rate": 2.8845823798627004e-05, "loss": 1.5239, "step": 269 }, { "epoch": 0.08, "grad_norm": 21.16472053527832, "learning_rate": 2.8841533180778032e-05, "loss": 1.4307, "step": 270 }, { "epoch": 0.08, "grad_norm": 8.72244930267334, "learning_rate": 2.883724256292906e-05, "loss": 1.7088, "step": 271 }, { "epoch": 0.08, "grad_norm": 9.103704452514648, "learning_rate": 2.8832951945080093e-05, "loss": 1.8197, "step": 272 }, { "epoch": 0.08, "grad_norm": 8.807337760925293, "learning_rate": 2.8828661327231122e-05, "loss": 1.8819, "step": 273 }, { "epoch": 0.08, "grad_norm": 9.339899063110352, "learning_rate": 2.882437070938215e-05, "loss": 1.7452, "step": 274 }, { "epoch": 0.08, "grad_norm": 9.583975791931152, "learning_rate": 2.8820080091533183e-05, "loss": 2.0151, "step": 275 }, { "epoch": 0.08, "grad_norm": 8.562725067138672, "learning_rate": 2.8815789473684212e-05, "loss": 1.3846, "step": 276 }, { "epoch": 0.08, "grad_norm": 9.248943328857422, "learning_rate": 2.881149885583524e-05, "loss": 1.6616, "step": 277 }, { "epoch": 0.08, "grad_norm": 8.356354713439941, "learning_rate": 2.880720823798627e-05, "loss": 1.9607, "step": 278 }, { "epoch": 0.08, "grad_norm": 8.86824893951416, "learning_rate": 2.88029176201373e-05, "loss": 1.5309, "step": 279 }, { "epoch": 0.08, "grad_norm": 9.779897689819336, "learning_rate": 2.879862700228833e-05, "loss": 1.8248, "step": 280 }, { "epoch": 0.08, "grad_norm": 9.615009307861328, "learning_rate": 2.879433638443936e-05, "loss": 1.8228, "step": 281 }, { "epoch": 0.08, "grad_norm": 9.457856178283691, "learning_rate": 2.8790045766590388e-05, "loss": 1.4264, "step": 282 }, { "epoch": 0.08, "grad_norm": 8.006612777709961, "learning_rate": 2.878575514874142e-05, "loss": 1.1521, "step": 283 }, { "epoch": 0.08, "grad_norm": 9.314384460449219, "learning_rate": 2.8781464530892452e-05, "loss": 1.5094, "step": 284 }, { "epoch": 0.08, "grad_norm": 8.521162033081055, "learning_rate": 2.8777173913043477e-05, "loss": 1.5872, "step": 285 }, { "epoch": 0.08, "grad_norm": 9.152196884155273, "learning_rate": 2.877288329519451e-05, "loss": 1.8608, "step": 286 }, { "epoch": 0.08, "grad_norm": 10.426616668701172, "learning_rate": 2.876859267734554e-05, "loss": 1.3441, "step": 287 }, { "epoch": 0.08, "grad_norm": 10.633772850036621, "learning_rate": 2.876430205949657e-05, "loss": 2.0061, "step": 288 }, { "epoch": 0.08, "grad_norm": 11.46512508392334, "learning_rate": 2.8760011441647596e-05, "loss": 1.9394, "step": 289 }, { "epoch": 0.08, "grad_norm": 8.515049934387207, "learning_rate": 2.8755720823798628e-05, "loss": 1.1297, "step": 290 }, { "epoch": 0.08, "grad_norm": 9.623251914978027, "learning_rate": 2.8751430205949657e-05, "loss": 1.7703, "step": 291 }, { "epoch": 0.08, "grad_norm": 9.966745376586914, "learning_rate": 2.8747139588100686e-05, "loss": 1.7564, "step": 292 }, { "epoch": 0.08, "grad_norm": 10.016712188720703, "learning_rate": 2.8742848970251718e-05, "loss": 1.4858, "step": 293 }, { "epoch": 0.08, "grad_norm": 8.469600677490234, "learning_rate": 2.8738558352402747e-05, "loss": 1.7119, "step": 294 }, { "epoch": 0.08, "grad_norm": 8.720199584960938, "learning_rate": 2.873426773455378e-05, "loss": 1.6103, "step": 295 }, { "epoch": 0.08, "grad_norm": 11.37694263458252, "learning_rate": 2.8729977116704804e-05, "loss": 1.9161, "step": 296 }, { "epoch": 0.08, "grad_norm": 8.98237419128418, "learning_rate": 2.8725686498855836e-05, "loss": 1.5475, "step": 297 }, { "epoch": 0.09, "grad_norm": 9.324840545654297, "learning_rate": 2.8721395881006865e-05, "loss": 1.4995, "step": 298 }, { "epoch": 0.09, "grad_norm": 8.735411643981934, "learning_rate": 2.8717105263157897e-05, "loss": 1.3443, "step": 299 }, { "epoch": 0.09, "grad_norm": 8.82810115814209, "learning_rate": 2.8712814645308926e-05, "loss": 1.7164, "step": 300 }, { "epoch": 0.09, "grad_norm": 9.84345817565918, "learning_rate": 2.8708524027459955e-05, "loss": 1.4979, "step": 301 }, { "epoch": 0.09, "grad_norm": 9.750595092773438, "learning_rate": 2.8704233409610987e-05, "loss": 1.506, "step": 302 }, { "epoch": 0.09, "grad_norm": 10.426441192626953, "learning_rate": 2.8699942791762012e-05, "loss": 1.6819, "step": 303 }, { "epoch": 0.09, "grad_norm": 9.954815864562988, "learning_rate": 2.8695652173913044e-05, "loss": 1.628, "step": 304 }, { "epoch": 0.09, "grad_norm": 10.592462539672852, "learning_rate": 2.8691361556064073e-05, "loss": 1.6014, "step": 305 }, { "epoch": 0.09, "grad_norm": 9.672579765319824, "learning_rate": 2.8687070938215105e-05, "loss": 1.7661, "step": 306 }, { "epoch": 0.09, "grad_norm": 8.73773193359375, "learning_rate": 2.868278032036613e-05, "loss": 1.7942, "step": 307 }, { "epoch": 0.09, "grad_norm": 9.554476737976074, "learning_rate": 2.8678489702517163e-05, "loss": 1.2383, "step": 308 }, { "epoch": 0.09, "grad_norm": 9.73338508605957, "learning_rate": 2.8674199084668195e-05, "loss": 1.3876, "step": 309 }, { "epoch": 0.09, "grad_norm": 8.714142799377441, "learning_rate": 2.8669908466819224e-05, "loss": 1.3542, "step": 310 }, { "epoch": 0.09, "grad_norm": 8.723358154296875, "learning_rate": 2.8665617848970253e-05, "loss": 1.5286, "step": 311 }, { "epoch": 0.09, "grad_norm": 7.836288928985596, "learning_rate": 2.866132723112128e-05, "loss": 1.4391, "step": 312 }, { "epoch": 0.09, "grad_norm": 8.699174880981445, "learning_rate": 2.8657036613272313e-05, "loss": 1.4725, "step": 313 }, { "epoch": 0.09, "grad_norm": 9.351971626281738, "learning_rate": 2.8652745995423342e-05, "loss": 1.5725, "step": 314 }, { "epoch": 0.09, "grad_norm": 10.71751880645752, "learning_rate": 2.864845537757437e-05, "loss": 1.8646, "step": 315 }, { "epoch": 0.09, "grad_norm": 10.017102241516113, "learning_rate": 2.86441647597254e-05, "loss": 1.8136, "step": 316 }, { "epoch": 0.09, "grad_norm": 9.702308654785156, "learning_rate": 2.8639874141876432e-05, "loss": 1.6153, "step": 317 }, { "epoch": 0.09, "grad_norm": 9.080348014831543, "learning_rate": 2.863558352402746e-05, "loss": 1.6787, "step": 318 }, { "epoch": 0.09, "grad_norm": 10.15158462524414, "learning_rate": 2.863129290617849e-05, "loss": 1.8533, "step": 319 }, { "epoch": 0.09, "grad_norm": 11.515812873840332, "learning_rate": 2.862700228832952e-05, "loss": 1.6914, "step": 320 }, { "epoch": 0.09, "grad_norm": 10.957345962524414, "learning_rate": 2.862271167048055e-05, "loss": 1.8418, "step": 321 }, { "epoch": 0.09, "grad_norm": 8.494919776916504, "learning_rate": 2.861842105263158e-05, "loss": 1.6915, "step": 322 }, { "epoch": 0.09, "grad_norm": 8.075895309448242, "learning_rate": 2.8614130434782608e-05, "loss": 1.4509, "step": 323 }, { "epoch": 0.09, "grad_norm": 8.955622673034668, "learning_rate": 2.860983981693364e-05, "loss": 1.4193, "step": 324 }, { "epoch": 0.09, "grad_norm": 9.788787841796875, "learning_rate": 2.860554919908467e-05, "loss": 1.5942, "step": 325 }, { "epoch": 0.09, "grad_norm": 9.214378356933594, "learning_rate": 2.8601258581235698e-05, "loss": 1.6611, "step": 326 }, { "epoch": 0.09, "grad_norm": 8.840919494628906, "learning_rate": 2.859696796338673e-05, "loss": 1.3909, "step": 327 }, { "epoch": 0.09, "grad_norm": 9.53697681427002, "learning_rate": 2.859267734553776e-05, "loss": 1.5233, "step": 328 }, { "epoch": 0.09, "grad_norm": 8.874279975891113, "learning_rate": 2.8588386727688787e-05, "loss": 1.8105, "step": 329 }, { "epoch": 0.09, "grad_norm": 8.041751861572266, "learning_rate": 2.8584096109839816e-05, "loss": 1.6273, "step": 330 }, { "epoch": 0.09, "grad_norm": 10.59816837310791, "learning_rate": 2.8579805491990848e-05, "loss": 1.6535, "step": 331 }, { "epoch": 0.09, "grad_norm": 9.560877799987793, "learning_rate": 2.8575514874141877e-05, "loss": 1.5892, "step": 332 }, { "epoch": 0.1, "grad_norm": 10.164031028747559, "learning_rate": 2.8571224256292906e-05, "loss": 1.7095, "step": 333 }, { "epoch": 0.1, "grad_norm": 9.156442642211914, "learning_rate": 2.8566933638443934e-05, "loss": 1.4494, "step": 334 }, { "epoch": 0.1, "grad_norm": 8.107510566711426, "learning_rate": 2.8562643020594967e-05, "loss": 1.6335, "step": 335 }, { "epoch": 0.1, "grad_norm": 8.041921615600586, "learning_rate": 2.8558352402746e-05, "loss": 1.3935, "step": 336 }, { "epoch": 0.1, "grad_norm": 8.365302085876465, "learning_rate": 2.8554061784897024e-05, "loss": 1.5283, "step": 337 }, { "epoch": 0.1, "grad_norm": 8.143136024475098, "learning_rate": 2.8549771167048056e-05, "loss": 1.3668, "step": 338 }, { "epoch": 0.1, "grad_norm": 12.083857536315918, "learning_rate": 2.8545480549199085e-05, "loss": 2.0169, "step": 339 }, { "epoch": 0.1, "grad_norm": 9.328901290893555, "learning_rate": 2.8541189931350117e-05, "loss": 1.3423, "step": 340 }, { "epoch": 0.1, "grad_norm": 9.404691696166992, "learning_rate": 2.8536899313501143e-05, "loss": 1.5616, "step": 341 }, { "epoch": 0.1, "grad_norm": 9.255928993225098, "learning_rate": 2.8532608695652175e-05, "loss": 1.4001, "step": 342 }, { "epoch": 0.1, "grad_norm": 13.580086708068848, "learning_rate": 2.8528318077803207e-05, "loss": 1.5508, "step": 343 }, { "epoch": 0.1, "grad_norm": 10.312821388244629, "learning_rate": 2.8524027459954232e-05, "loss": 2.1192, "step": 344 }, { "epoch": 0.1, "grad_norm": 11.666476249694824, "learning_rate": 2.8519736842105264e-05, "loss": 1.8555, "step": 345 }, { "epoch": 0.1, "grad_norm": 11.251701354980469, "learning_rate": 2.8515446224256293e-05, "loss": 1.9988, "step": 346 }, { "epoch": 0.1, "grad_norm": 9.151383399963379, "learning_rate": 2.8511155606407325e-05, "loss": 1.602, "step": 347 }, { "epoch": 0.1, "grad_norm": 9.019693374633789, "learning_rate": 2.850686498855835e-05, "loss": 1.6737, "step": 348 }, { "epoch": 0.1, "grad_norm": 8.890886306762695, "learning_rate": 2.8502574370709383e-05, "loss": 1.5644, "step": 349 }, { "epoch": 0.1, "grad_norm": 7.539163112640381, "learning_rate": 2.8498283752860412e-05, "loss": 1.6877, "step": 350 }, { "epoch": 0.1, "grad_norm": 8.853219032287598, "learning_rate": 2.8493993135011444e-05, "loss": 1.6493, "step": 351 }, { "epoch": 0.1, "grad_norm": 8.162408828735352, "learning_rate": 2.8489702517162473e-05, "loss": 1.8122, "step": 352 }, { "epoch": 0.1, "grad_norm": 8.233209609985352, "learning_rate": 2.84854118993135e-05, "loss": 1.7432, "step": 353 }, { "epoch": 0.1, "grad_norm": 7.457662105560303, "learning_rate": 2.8481121281464534e-05, "loss": 1.689, "step": 354 }, { "epoch": 0.1, "grad_norm": 7.5489397048950195, "learning_rate": 2.8476830663615562e-05, "loss": 1.6797, "step": 355 }, { "epoch": 0.1, "grad_norm": 8.018563270568848, "learning_rate": 2.847254004576659e-05, "loss": 1.7935, "step": 356 }, { "epoch": 0.1, "grad_norm": 8.723430633544922, "learning_rate": 2.846824942791762e-05, "loss": 1.6617, "step": 357 }, { "epoch": 0.1, "grad_norm": 7.9907636642456055, "learning_rate": 2.8463958810068652e-05, "loss": 1.7775, "step": 358 }, { "epoch": 0.1, "grad_norm": 8.556412696838379, "learning_rate": 2.8459668192219677e-05, "loss": 1.5166, "step": 359 }, { "epoch": 0.1, "grad_norm": 8.768579483032227, "learning_rate": 2.845537757437071e-05, "loss": 1.4092, "step": 360 }, { "epoch": 0.1, "grad_norm": 8.49852466583252, "learning_rate": 2.8451086956521742e-05, "loss": 1.5011, "step": 361 }, { "epoch": 0.1, "grad_norm": 9.627189636230469, "learning_rate": 2.844679633867277e-05, "loss": 1.5046, "step": 362 }, { "epoch": 0.1, "grad_norm": 11.643542289733887, "learning_rate": 2.84425057208238e-05, "loss": 2.18, "step": 363 }, { "epoch": 0.1, "grad_norm": 10.84292221069336, "learning_rate": 2.8438215102974828e-05, "loss": 2.0504, "step": 364 }, { "epoch": 0.1, "grad_norm": 10.566253662109375, "learning_rate": 2.843392448512586e-05, "loss": 1.7854, "step": 365 }, { "epoch": 0.1, "grad_norm": 9.069684028625488, "learning_rate": 2.842963386727689e-05, "loss": 1.7755, "step": 366 }, { "epoch": 0.1, "grad_norm": 9.103590965270996, "learning_rate": 2.8425343249427918e-05, "loss": 1.5551, "step": 367 }, { "epoch": 0.11, "grad_norm": 9.507101058959961, "learning_rate": 2.8421052631578946e-05, "loss": 1.3767, "step": 368 }, { "epoch": 0.11, "grad_norm": 10.058374404907227, "learning_rate": 2.841676201372998e-05, "loss": 1.5317, "step": 369 }, { "epoch": 0.11, "grad_norm": 11.702919960021973, "learning_rate": 2.8412471395881007e-05, "loss": 1.9569, "step": 370 }, { "epoch": 0.11, "grad_norm": 10.480443954467773, "learning_rate": 2.8408180778032036e-05, "loss": 1.8785, "step": 371 }, { "epoch": 0.11, "grad_norm": 8.140588760375977, "learning_rate": 2.840389016018307e-05, "loss": 1.6059, "step": 372 }, { "epoch": 0.11, "grad_norm": 7.487027168273926, "learning_rate": 2.8399599542334097e-05, "loss": 1.6194, "step": 373 }, { "epoch": 0.11, "grad_norm": 9.53647518157959, "learning_rate": 2.8395308924485126e-05, "loss": 1.826, "step": 374 }, { "epoch": 0.11, "grad_norm": 7.119506359100342, "learning_rate": 2.8391018306636155e-05, "loss": 1.5237, "step": 375 }, { "epoch": 0.11, "grad_norm": 8.412140846252441, "learning_rate": 2.8386727688787187e-05, "loss": 1.2996, "step": 376 }, { "epoch": 0.11, "grad_norm": 9.240374565124512, "learning_rate": 2.8382437070938216e-05, "loss": 1.8373, "step": 377 }, { "epoch": 0.11, "grad_norm": 8.49670124053955, "learning_rate": 2.8378146453089244e-05, "loss": 1.649, "step": 378 }, { "epoch": 0.11, "grad_norm": 10.530879974365234, "learning_rate": 2.8373855835240276e-05, "loss": 1.9523, "step": 379 }, { "epoch": 0.11, "grad_norm": 9.252795219421387, "learning_rate": 2.8369565217391305e-05, "loss": 1.3662, "step": 380 }, { "epoch": 0.11, "grad_norm": 10.121156692504883, "learning_rate": 2.8365274599542334e-05, "loss": 1.8007, "step": 381 }, { "epoch": 0.11, "grad_norm": 8.110308647155762, "learning_rate": 2.8360983981693363e-05, "loss": 1.6888, "step": 382 }, { "epoch": 0.11, "grad_norm": 13.776968955993652, "learning_rate": 2.8356693363844395e-05, "loss": 1.5352, "step": 383 }, { "epoch": 0.11, "grad_norm": 9.49256706237793, "learning_rate": 2.8352402745995424e-05, "loss": 1.5955, "step": 384 }, { "epoch": 0.11, "grad_norm": 8.58328914642334, "learning_rate": 2.8348112128146452e-05, "loss": 1.6714, "step": 385 }, { "epoch": 0.11, "grad_norm": 9.3377103805542, "learning_rate": 2.8343821510297485e-05, "loss": 1.6089, "step": 386 }, { "epoch": 0.11, "grad_norm": 8.08781623840332, "learning_rate": 2.8339530892448513e-05, "loss": 1.3559, "step": 387 }, { "epoch": 0.11, "grad_norm": 9.012792587280273, "learning_rate": 2.8335240274599546e-05, "loss": 1.5669, "step": 388 }, { "epoch": 0.11, "grad_norm": 8.003541946411133, "learning_rate": 2.833094965675057e-05, "loss": 1.7334, "step": 389 }, { "epoch": 0.11, "grad_norm": 7.730262279510498, "learning_rate": 2.8326659038901603e-05, "loss": 1.648, "step": 390 }, { "epoch": 0.11, "grad_norm": 8.558412551879883, "learning_rate": 2.8322368421052632e-05, "loss": 1.681, "step": 391 }, { "epoch": 0.11, "grad_norm": 7.935482978820801, "learning_rate": 2.8318077803203664e-05, "loss": 1.5282, "step": 392 }, { "epoch": 0.11, "grad_norm": 7.9830098152160645, "learning_rate": 2.831378718535469e-05, "loss": 1.7982, "step": 393 }, { "epoch": 0.11, "grad_norm": 8.142667770385742, "learning_rate": 2.830949656750572e-05, "loss": 1.767, "step": 394 }, { "epoch": 0.11, "grad_norm": 7.926479339599609, "learning_rate": 2.8305205949656754e-05, "loss": 1.6133, "step": 395 }, { "epoch": 0.11, "grad_norm": 8.327887535095215, "learning_rate": 2.830091533180778e-05, "loss": 1.3836, "step": 396 }, { "epoch": 0.11, "grad_norm": 7.93806266784668, "learning_rate": 2.829662471395881e-05, "loss": 1.5477, "step": 397 }, { "epoch": 0.11, "grad_norm": 8.302078247070312, "learning_rate": 2.829233409610984e-05, "loss": 1.6192, "step": 398 }, { "epoch": 0.11, "grad_norm": 9.20541000366211, "learning_rate": 2.8288043478260872e-05, "loss": 1.7687, "step": 399 }, { "epoch": 0.11, "grad_norm": 8.753988265991211, "learning_rate": 2.8283752860411898e-05, "loss": 1.6086, "step": 400 }, { "epoch": 0.11, "grad_norm": 8.764487266540527, "learning_rate": 2.827946224256293e-05, "loss": 1.6797, "step": 401 }, { "epoch": 0.11, "grad_norm": 9.942675590515137, "learning_rate": 2.827517162471396e-05, "loss": 1.7611, "step": 402 }, { "epoch": 0.12, "grad_norm": 9.709769248962402, "learning_rate": 2.827088100686499e-05, "loss": 1.6783, "step": 403 }, { "epoch": 0.12, "grad_norm": 12.350006103515625, "learning_rate": 2.826659038901602e-05, "loss": 1.5502, "step": 404 }, { "epoch": 0.12, "grad_norm": 9.543091773986816, "learning_rate": 2.8262299771167048e-05, "loss": 1.5901, "step": 405 }, { "epoch": 0.12, "grad_norm": 9.05308723449707, "learning_rate": 2.825800915331808e-05, "loss": 1.5929, "step": 406 }, { "epoch": 0.12, "grad_norm": 9.204638481140137, "learning_rate": 2.825371853546911e-05, "loss": 1.4633, "step": 407 }, { "epoch": 0.12, "grad_norm": 9.998212814331055, "learning_rate": 2.8249427917620138e-05, "loss": 1.7381, "step": 408 }, { "epoch": 0.12, "grad_norm": 9.445399284362793, "learning_rate": 2.8245137299771167e-05, "loss": 1.8509, "step": 409 }, { "epoch": 0.12, "grad_norm": 7.8274736404418945, "learning_rate": 2.82408466819222e-05, "loss": 1.6, "step": 410 }, { "epoch": 0.12, "grad_norm": 8.54792308807373, "learning_rate": 2.8236556064073224e-05, "loss": 1.5708, "step": 411 }, { "epoch": 0.12, "grad_norm": 9.415521621704102, "learning_rate": 2.8232265446224256e-05, "loss": 1.4578, "step": 412 }, { "epoch": 0.12, "grad_norm": 8.425575256347656, "learning_rate": 2.822797482837529e-05, "loss": 1.4646, "step": 413 }, { "epoch": 0.12, "grad_norm": 8.011249542236328, "learning_rate": 2.8223684210526317e-05, "loss": 1.742, "step": 414 }, { "epoch": 0.12, "grad_norm": 7.653934478759766, "learning_rate": 2.8219393592677346e-05, "loss": 1.5293, "step": 415 }, { "epoch": 0.12, "grad_norm": 8.901880264282227, "learning_rate": 2.8215102974828375e-05, "loss": 1.6321, "step": 416 }, { "epoch": 0.12, "grad_norm": 8.286306381225586, "learning_rate": 2.8210812356979407e-05, "loss": 1.6388, "step": 417 }, { "epoch": 0.12, "grad_norm": 9.622949600219727, "learning_rate": 2.8206521739130436e-05, "loss": 1.664, "step": 418 }, { "epoch": 0.12, "grad_norm": 9.048681259155273, "learning_rate": 2.8202231121281464e-05, "loss": 1.4288, "step": 419 }, { "epoch": 0.12, "grad_norm": 9.250710487365723, "learning_rate": 2.8197940503432497e-05, "loss": 1.5068, "step": 420 }, { "epoch": 0.12, "grad_norm": 9.357898712158203, "learning_rate": 2.8193649885583525e-05, "loss": 1.489, "step": 421 }, { "epoch": 0.12, "grad_norm": 9.524295806884766, "learning_rate": 2.8189359267734554e-05, "loss": 1.5555, "step": 422 }, { "epoch": 0.12, "grad_norm": 10.393868446350098, "learning_rate": 2.8185068649885583e-05, "loss": 1.8444, "step": 423 }, { "epoch": 0.12, "grad_norm": 8.325084686279297, "learning_rate": 2.8180778032036615e-05, "loss": 1.4135, "step": 424 }, { "epoch": 0.12, "grad_norm": 8.550622940063477, "learning_rate": 2.8176487414187644e-05, "loss": 1.4797, "step": 425 }, { "epoch": 0.12, "grad_norm": 9.385437965393066, "learning_rate": 2.8172196796338673e-05, "loss": 1.6019, "step": 426 }, { "epoch": 0.12, "grad_norm": 8.729028701782227, "learning_rate": 2.81679061784897e-05, "loss": 1.2063, "step": 427 }, { "epoch": 0.12, "grad_norm": 8.351651191711426, "learning_rate": 2.8163615560640734e-05, "loss": 1.6706, "step": 428 }, { "epoch": 0.12, "grad_norm": 8.507067680358887, "learning_rate": 2.8159324942791766e-05, "loss": 1.5326, "step": 429 }, { "epoch": 0.12, "grad_norm": 8.760214805603027, "learning_rate": 2.815503432494279e-05, "loss": 1.6172, "step": 430 }, { "epoch": 0.12, "grad_norm": 8.832893371582031, "learning_rate": 2.8150743707093823e-05, "loss": 1.4825, "step": 431 }, { "epoch": 0.12, "grad_norm": 8.570629119873047, "learning_rate": 2.8146453089244852e-05, "loss": 1.4647, "step": 432 }, { "epoch": 0.12, "grad_norm": 9.038146018981934, "learning_rate": 2.8142162471395884e-05, "loss": 1.4906, "step": 433 }, { "epoch": 0.12, "grad_norm": 9.179692268371582, "learning_rate": 2.813787185354691e-05, "loss": 1.7981, "step": 434 }, { "epoch": 0.12, "grad_norm": 9.589019775390625, "learning_rate": 2.8133581235697942e-05, "loss": 1.8932, "step": 435 }, { "epoch": 0.12, "grad_norm": 9.438563346862793, "learning_rate": 2.812929061784897e-05, "loss": 1.5164, "step": 436 }, { "epoch": 0.12, "grad_norm": 7.712190628051758, "learning_rate": 2.8125e-05, "loss": 1.587, "step": 437 }, { "epoch": 0.13, "grad_norm": 8.640191078186035, "learning_rate": 2.812070938215103e-05, "loss": 1.3491, "step": 438 }, { "epoch": 0.13, "grad_norm": 7.796834468841553, "learning_rate": 2.811641876430206e-05, "loss": 1.533, "step": 439 }, { "epoch": 0.13, "grad_norm": 8.263230323791504, "learning_rate": 2.8112128146453092e-05, "loss": 1.4814, "step": 440 }, { "epoch": 0.13, "grad_norm": 7.591869354248047, "learning_rate": 2.8107837528604118e-05, "loss": 1.4957, "step": 441 }, { "epoch": 0.13, "grad_norm": 7.961359024047852, "learning_rate": 2.810354691075515e-05, "loss": 1.5566, "step": 442 }, { "epoch": 0.13, "grad_norm": 8.94115924835205, "learning_rate": 2.809925629290618e-05, "loss": 1.4563, "step": 443 }, { "epoch": 0.13, "grad_norm": 8.587944984436035, "learning_rate": 2.809496567505721e-05, "loss": 1.3679, "step": 444 }, { "epoch": 0.13, "grad_norm": 8.851431846618652, "learning_rate": 2.8090675057208236e-05, "loss": 1.1945, "step": 445 }, { "epoch": 0.13, "grad_norm": 9.290416717529297, "learning_rate": 2.8086384439359268e-05, "loss": 1.3487, "step": 446 }, { "epoch": 0.13, "grad_norm": 9.352265357971191, "learning_rate": 2.80820938215103e-05, "loss": 1.4586, "step": 447 }, { "epoch": 0.13, "grad_norm": 9.46808910369873, "learning_rate": 2.8077803203661326e-05, "loss": 1.5198, "step": 448 }, { "epoch": 0.13, "grad_norm": 9.68099594116211, "learning_rate": 2.8073512585812358e-05, "loss": 1.8168, "step": 449 }, { "epoch": 0.13, "grad_norm": 11.353328704833984, "learning_rate": 2.8069221967963387e-05, "loss": 1.9091, "step": 450 }, { "epoch": 0.13, "grad_norm": 9.7789888381958, "learning_rate": 2.806493135011442e-05, "loss": 1.6881, "step": 451 }, { "epoch": 0.13, "grad_norm": 10.931682586669922, "learning_rate": 2.8060640732265444e-05, "loss": 1.4275, "step": 452 }, { "epoch": 0.13, "grad_norm": 9.01468563079834, "learning_rate": 2.8056350114416476e-05, "loss": 1.5962, "step": 453 }, { "epoch": 0.13, "grad_norm": 10.376738548278809, "learning_rate": 2.8052059496567505e-05, "loss": 1.6438, "step": 454 }, { "epoch": 0.13, "grad_norm": 10.467251777648926, "learning_rate": 2.8047768878718537e-05, "loss": 1.5796, "step": 455 }, { "epoch": 0.13, "grad_norm": 8.759889602661133, "learning_rate": 2.8043478260869566e-05, "loss": 1.4211, "step": 456 }, { "epoch": 0.13, "grad_norm": 9.308679580688477, "learning_rate": 2.8039187643020595e-05, "loss": 1.2328, "step": 457 }, { "epoch": 0.13, "grad_norm": 9.23669147491455, "learning_rate": 2.8034897025171627e-05, "loss": 1.3872, "step": 458 }, { "epoch": 0.13, "grad_norm": 8.825939178466797, "learning_rate": 2.8030606407322656e-05, "loss": 1.4965, "step": 459 }, { "epoch": 0.13, "grad_norm": 8.422708511352539, "learning_rate": 2.8026315789473685e-05, "loss": 1.4893, "step": 460 }, { "epoch": 0.13, "grad_norm": 9.007925987243652, "learning_rate": 2.8022025171624713e-05, "loss": 1.698, "step": 461 }, { "epoch": 0.13, "grad_norm": 8.067597389221191, "learning_rate": 2.8017734553775746e-05, "loss": 1.4106, "step": 462 }, { "epoch": 0.13, "grad_norm": 7.50868034362793, "learning_rate": 2.8013443935926774e-05, "loss": 1.1772, "step": 463 }, { "epoch": 0.13, "grad_norm": 8.10647201538086, "learning_rate": 2.8009153318077803e-05, "loss": 1.7043, "step": 464 }, { "epoch": 0.13, "grad_norm": 9.774900436401367, "learning_rate": 2.8004862700228835e-05, "loss": 1.5012, "step": 465 }, { "epoch": 0.13, "grad_norm": 9.257354736328125, "learning_rate": 2.8000572082379864e-05, "loss": 1.6215, "step": 466 }, { "epoch": 0.13, "grad_norm": 8.386637687683105, "learning_rate": 2.7996281464530893e-05, "loss": 1.5138, "step": 467 }, { "epoch": 0.13, "grad_norm": 10.336791038513184, "learning_rate": 2.799199084668192e-05, "loss": 1.8383, "step": 468 }, { "epoch": 0.13, "grad_norm": 8.72229290008545, "learning_rate": 2.7987700228832954e-05, "loss": 1.3901, "step": 469 }, { "epoch": 0.13, "grad_norm": 7.802308559417725, "learning_rate": 2.7983409610983982e-05, "loss": 1.6891, "step": 470 }, { "epoch": 0.13, "grad_norm": 9.691417694091797, "learning_rate": 2.797911899313501e-05, "loss": 1.3613, "step": 471 }, { "epoch": 0.14, "grad_norm": 9.308101654052734, "learning_rate": 2.7974828375286043e-05, "loss": 1.5225, "step": 472 }, { "epoch": 0.14, "grad_norm": 8.286172866821289, "learning_rate": 2.7970537757437072e-05, "loss": 1.186, "step": 473 }, { "epoch": 0.14, "grad_norm": 9.888103485107422, "learning_rate": 2.79662471395881e-05, "loss": 1.5228, "step": 474 }, { "epoch": 0.14, "grad_norm": 9.277830123901367, "learning_rate": 2.796195652173913e-05, "loss": 1.6649, "step": 475 }, { "epoch": 0.14, "grad_norm": 10.21792221069336, "learning_rate": 2.7957665903890162e-05, "loss": 1.6197, "step": 476 }, { "epoch": 0.14, "grad_norm": 10.857234954833984, "learning_rate": 2.795337528604119e-05, "loss": 1.6978, "step": 477 }, { "epoch": 0.14, "grad_norm": 9.541655540466309, "learning_rate": 2.794908466819222e-05, "loss": 1.5044, "step": 478 }, { "epoch": 0.14, "grad_norm": 9.415315628051758, "learning_rate": 2.7944794050343248e-05, "loss": 1.839, "step": 479 }, { "epoch": 0.14, "grad_norm": 7.52134370803833, "learning_rate": 2.794050343249428e-05, "loss": 1.3303, "step": 480 }, { "epoch": 0.14, "grad_norm": 9.852075576782227, "learning_rate": 2.7936212814645312e-05, "loss": 1.5964, "step": 481 }, { "epoch": 0.14, "grad_norm": 10.195798873901367, "learning_rate": 2.7931922196796338e-05, "loss": 1.6181, "step": 482 }, { "epoch": 0.14, "grad_norm": 8.980742454528809, "learning_rate": 2.792763157894737e-05, "loss": 1.5698, "step": 483 }, { "epoch": 0.14, "grad_norm": 7.269761085510254, "learning_rate": 2.79233409610984e-05, "loss": 1.1187, "step": 484 }, { "epoch": 0.14, "grad_norm": 10.480413436889648, "learning_rate": 2.791905034324943e-05, "loss": 1.2166, "step": 485 }, { "epoch": 0.14, "grad_norm": 9.4518404006958, "learning_rate": 2.7914759725400456e-05, "loss": 1.8096, "step": 486 }, { "epoch": 0.14, "grad_norm": 9.00314998626709, "learning_rate": 2.791046910755149e-05, "loss": 1.7498, "step": 487 }, { "epoch": 0.14, "grad_norm": 9.233830451965332, "learning_rate": 2.7906178489702517e-05, "loss": 1.6336, "step": 488 }, { "epoch": 0.14, "grad_norm": 7.957357406616211, "learning_rate": 2.7901887871853546e-05, "loss": 1.3371, "step": 489 }, { "epoch": 0.14, "grad_norm": 7.830927848815918, "learning_rate": 2.7897597254004578e-05, "loss": 1.3703, "step": 490 }, { "epoch": 0.14, "grad_norm": 8.449617385864258, "learning_rate": 2.7893306636155607e-05, "loss": 1.3393, "step": 491 }, { "epoch": 0.14, "grad_norm": 8.626500129699707, "learning_rate": 2.788901601830664e-05, "loss": 1.4369, "step": 492 }, { "epoch": 0.14, "grad_norm": 9.729392051696777, "learning_rate": 2.7884725400457664e-05, "loss": 1.7288, "step": 493 }, { "epoch": 0.14, "grad_norm": 9.956872940063477, "learning_rate": 2.7880434782608697e-05, "loss": 1.5276, "step": 494 }, { "epoch": 0.14, "grad_norm": 8.511049270629883, "learning_rate": 2.7876144164759725e-05, "loss": 1.3991, "step": 495 }, { "epoch": 0.14, "grad_norm": 10.387262344360352, "learning_rate": 2.7871853546910758e-05, "loss": 1.3234, "step": 496 }, { "epoch": 0.14, "grad_norm": 9.903482437133789, "learning_rate": 2.7867562929061786e-05, "loss": 1.6989, "step": 497 }, { "epoch": 0.14, "grad_norm": 9.461954116821289, "learning_rate": 2.7863272311212815e-05, "loss": 1.65, "step": 498 }, { "epoch": 0.14, "grad_norm": 10.389571189880371, "learning_rate": 2.7858981693363847e-05, "loss": 1.8413, "step": 499 }, { "epoch": 0.14, "grad_norm": 10.60290241241455, "learning_rate": 2.7854691075514873e-05, "loss": 1.5862, "step": 500 }, { "epoch": 0.14, "grad_norm": 8.807363510131836, "learning_rate": 2.7850400457665905e-05, "loss": 1.4559, "step": 501 }, { "epoch": 0.14, "grad_norm": 8.981348037719727, "learning_rate": 2.7846109839816934e-05, "loss": 1.4053, "step": 502 }, { "epoch": 0.14, "grad_norm": 12.610603332519531, "learning_rate": 2.7841819221967966e-05, "loss": 1.7157, "step": 503 }, { "epoch": 0.14, "grad_norm": 8.09223461151123, "learning_rate": 2.783752860411899e-05, "loss": 1.4528, "step": 504 }, { "epoch": 0.14, "grad_norm": 9.06646728515625, "learning_rate": 2.7833237986270023e-05, "loss": 1.5101, "step": 505 }, { "epoch": 0.14, "grad_norm": 9.109045028686523, "learning_rate": 2.7828947368421055e-05, "loss": 1.6101, "step": 506 }, { "epoch": 0.15, "grad_norm": 9.117020606994629, "learning_rate": 2.7824656750572084e-05, "loss": 1.4038, "step": 507 }, { "epoch": 0.15, "grad_norm": 8.422054290771484, "learning_rate": 2.7820366132723113e-05, "loss": 1.6549, "step": 508 }, { "epoch": 0.15, "grad_norm": 9.228119850158691, "learning_rate": 2.781607551487414e-05, "loss": 1.4346, "step": 509 }, { "epoch": 0.15, "grad_norm": 11.748756408691406, "learning_rate": 2.7811784897025174e-05, "loss": 1.9742, "step": 510 }, { "epoch": 0.15, "grad_norm": 8.695389747619629, "learning_rate": 2.7807494279176203e-05, "loss": 1.3546, "step": 511 }, { "epoch": 0.15, "grad_norm": 7.655311584472656, "learning_rate": 2.780320366132723e-05, "loss": 1.2804, "step": 512 }, { "epoch": 0.15, "grad_norm": 7.489044189453125, "learning_rate": 2.779891304347826e-05, "loss": 1.5239, "step": 513 }, { "epoch": 0.15, "grad_norm": 7.734864711761475, "learning_rate": 2.7794622425629292e-05, "loss": 1.516, "step": 514 }, { "epoch": 0.15, "grad_norm": 7.818589687347412, "learning_rate": 2.779033180778032e-05, "loss": 1.2432, "step": 515 }, { "epoch": 0.15, "grad_norm": 9.193716049194336, "learning_rate": 2.778604118993135e-05, "loss": 1.3179, "step": 516 }, { "epoch": 0.15, "grad_norm": 8.570880889892578, "learning_rate": 2.7781750572082382e-05, "loss": 1.2127, "step": 517 }, { "epoch": 0.15, "grad_norm": 10.106489181518555, "learning_rate": 2.777745995423341e-05, "loss": 1.4325, "step": 518 }, { "epoch": 0.15, "grad_norm": 9.232028007507324, "learning_rate": 2.777316933638444e-05, "loss": 1.4359, "step": 519 }, { "epoch": 0.15, "grad_norm": 8.200241088867188, "learning_rate": 2.7768878718535468e-05, "loss": 1.346, "step": 520 }, { "epoch": 0.15, "grad_norm": 9.4496488571167, "learning_rate": 2.77645881006865e-05, "loss": 1.5329, "step": 521 }, { "epoch": 0.15, "grad_norm": 10.742775917053223, "learning_rate": 2.776029748283753e-05, "loss": 1.599, "step": 522 }, { "epoch": 0.15, "grad_norm": 10.676810264587402, "learning_rate": 2.7756006864988558e-05, "loss": 1.6097, "step": 523 }, { "epoch": 0.15, "grad_norm": 9.049821853637695, "learning_rate": 2.775171624713959e-05, "loss": 1.6242, "step": 524 }, { "epoch": 0.15, "grad_norm": 9.311460494995117, "learning_rate": 2.774742562929062e-05, "loss": 1.613, "step": 525 }, { "epoch": 0.15, "grad_norm": 10.265384674072266, "learning_rate": 2.7743135011441648e-05, "loss": 1.8818, "step": 526 }, { "epoch": 0.15, "grad_norm": 8.79378890991211, "learning_rate": 2.7738844393592676e-05, "loss": 1.8994, "step": 527 }, { "epoch": 0.15, "grad_norm": 8.35362434387207, "learning_rate": 2.773455377574371e-05, "loss": 1.4752, "step": 528 }, { "epoch": 0.15, "grad_norm": 8.710142135620117, "learning_rate": 2.7730263157894737e-05, "loss": 1.3721, "step": 529 }, { "epoch": 0.15, "grad_norm": 9.677940368652344, "learning_rate": 2.7725972540045766e-05, "loss": 1.3834, "step": 530 }, { "epoch": 0.15, "grad_norm": 6.871583461761475, "learning_rate": 2.7721681922196795e-05, "loss": 1.3109, "step": 531 }, { "epoch": 0.15, "grad_norm": 9.098319053649902, "learning_rate": 2.7717391304347827e-05, "loss": 1.5228, "step": 532 }, { "epoch": 0.15, "grad_norm": 9.34329891204834, "learning_rate": 2.771310068649886e-05, "loss": 1.6127, "step": 533 }, { "epoch": 0.15, "grad_norm": 8.98932933807373, "learning_rate": 2.7708810068649885e-05, "loss": 1.3145, "step": 534 }, { "epoch": 0.15, "grad_norm": 9.540974617004395, "learning_rate": 2.7704519450800917e-05, "loss": 1.638, "step": 535 }, { "epoch": 0.15, "grad_norm": 8.818389892578125, "learning_rate": 2.7700228832951946e-05, "loss": 1.4179, "step": 536 }, { "epoch": 0.15, "grad_norm": 10.52511978149414, "learning_rate": 2.7695938215102978e-05, "loss": 1.481, "step": 537 }, { "epoch": 0.15, "grad_norm": 11.235359191894531, "learning_rate": 2.7691647597254003e-05, "loss": 1.6016, "step": 538 }, { "epoch": 0.15, "grad_norm": 9.686572074890137, "learning_rate": 2.7687356979405035e-05, "loss": 1.7741, "step": 539 }, { "epoch": 0.15, "grad_norm": 8.403671264648438, "learning_rate": 2.7683066361556067e-05, "loss": 1.4, "step": 540 }, { "epoch": 0.15, "grad_norm": 9.480093002319336, "learning_rate": 2.7678775743707093e-05, "loss": 1.3759, "step": 541 }, { "epoch": 0.16, "grad_norm": 8.01067066192627, "learning_rate": 2.7674485125858125e-05, "loss": 1.2315, "step": 542 }, { "epoch": 0.16, "grad_norm": 9.176029205322266, "learning_rate": 2.7670194508009154e-05, "loss": 1.6042, "step": 543 }, { "epoch": 0.16, "grad_norm": 9.995184898376465, "learning_rate": 2.7665903890160186e-05, "loss": 2.0453, "step": 544 }, { "epoch": 0.16, "grad_norm": 9.106335639953613, "learning_rate": 2.766161327231121e-05, "loss": 1.4097, "step": 545 }, { "epoch": 0.16, "grad_norm": 8.503254890441895, "learning_rate": 2.7657322654462243e-05, "loss": 1.2176, "step": 546 }, { "epoch": 0.16, "grad_norm": 8.362741470336914, "learning_rate": 2.7653032036613272e-05, "loss": 1.1075, "step": 547 }, { "epoch": 0.16, "grad_norm": 7.65399169921875, "learning_rate": 2.7648741418764304e-05, "loss": 1.4227, "step": 548 }, { "epoch": 0.16, "grad_norm": 7.963006973266602, "learning_rate": 2.7644450800915333e-05, "loss": 1.3591, "step": 549 }, { "epoch": 0.16, "grad_norm": 8.329169273376465, "learning_rate": 2.7640160183066362e-05, "loss": 1.6284, "step": 550 }, { "epoch": 0.16, "grad_norm": 7.6844916343688965, "learning_rate": 2.7635869565217394e-05, "loss": 1.4227, "step": 551 }, { "epoch": 0.16, "grad_norm": 8.27899169921875, "learning_rate": 2.7631578947368423e-05, "loss": 1.5949, "step": 552 }, { "epoch": 0.16, "grad_norm": 9.054556846618652, "learning_rate": 2.762728832951945e-05, "loss": 1.1804, "step": 553 }, { "epoch": 0.16, "grad_norm": 7.904552459716797, "learning_rate": 2.762299771167048e-05, "loss": 1.2641, "step": 554 }, { "epoch": 0.16, "grad_norm": 9.288396835327148, "learning_rate": 2.7618707093821512e-05, "loss": 1.6544, "step": 555 }, { "epoch": 0.16, "grad_norm": 10.109333038330078, "learning_rate": 2.7614416475972538e-05, "loss": 1.5606, "step": 556 }, { "epoch": 0.16, "grad_norm": 9.692341804504395, "learning_rate": 2.761012585812357e-05, "loss": 1.3516, "step": 557 }, { "epoch": 0.16, "grad_norm": 9.641256332397461, "learning_rate": 2.7605835240274602e-05, "loss": 1.3218, "step": 558 }, { "epoch": 0.16, "grad_norm": 8.773728370666504, "learning_rate": 2.760154462242563e-05, "loss": 1.1629, "step": 559 }, { "epoch": 0.16, "grad_norm": 12.425092697143555, "learning_rate": 2.759725400457666e-05, "loss": 1.296, "step": 560 }, { "epoch": 0.16, "grad_norm": 9.03546142578125, "learning_rate": 2.759296338672769e-05, "loss": 1.6785, "step": 561 }, { "epoch": 0.16, "grad_norm": 9.400121688842773, "learning_rate": 2.758867276887872e-05, "loss": 1.8052, "step": 562 }, { "epoch": 0.16, "grad_norm": 8.625514030456543, "learning_rate": 2.758438215102975e-05, "loss": 1.3596, "step": 563 }, { "epoch": 0.16, "grad_norm": 10.135568618774414, "learning_rate": 2.7580091533180778e-05, "loss": 1.7402, "step": 564 }, { "epoch": 0.16, "grad_norm": 8.666523933410645, "learning_rate": 2.7575800915331807e-05, "loss": 1.4847, "step": 565 }, { "epoch": 0.16, "grad_norm": 9.05019760131836, "learning_rate": 2.757151029748284e-05, "loss": 1.4296, "step": 566 }, { "epoch": 0.16, "grad_norm": 8.563323020935059, "learning_rate": 2.7567219679633868e-05, "loss": 1.6986, "step": 567 }, { "epoch": 0.16, "grad_norm": 8.601005554199219, "learning_rate": 2.7562929061784897e-05, "loss": 1.8225, "step": 568 }, { "epoch": 0.16, "grad_norm": 8.396276473999023, "learning_rate": 2.755863844393593e-05, "loss": 1.2789, "step": 569 }, { "epoch": 0.16, "grad_norm": 9.241303443908691, "learning_rate": 2.7554347826086957e-05, "loss": 1.5369, "step": 570 }, { "epoch": 0.16, "grad_norm": 7.413241386413574, "learning_rate": 2.7550057208237986e-05, "loss": 1.015, "step": 571 }, { "epoch": 0.16, "grad_norm": 8.75148868560791, "learning_rate": 2.7545766590389015e-05, "loss": 1.4863, "step": 572 }, { "epoch": 0.16, "grad_norm": 9.57811450958252, "learning_rate": 2.7541475972540047e-05, "loss": 1.589, "step": 573 }, { "epoch": 0.16, "grad_norm": 8.320711135864258, "learning_rate": 2.7537185354691076e-05, "loss": 1.5479, "step": 574 }, { "epoch": 0.16, "grad_norm": 8.287501335144043, "learning_rate": 2.7532894736842105e-05, "loss": 1.1933, "step": 575 }, { "epoch": 0.16, "grad_norm": 8.462252616882324, "learning_rate": 2.7528604118993137e-05, "loss": 1.4716, "step": 576 }, { "epoch": 0.17, "grad_norm": 9.69751262664795, "learning_rate": 2.7524313501144166e-05, "loss": 1.789, "step": 577 }, { "epoch": 0.17, "grad_norm": 8.301849365234375, "learning_rate": 2.7520022883295198e-05, "loss": 1.5647, "step": 578 }, { "epoch": 0.17, "grad_norm": 10.18299674987793, "learning_rate": 2.7515732265446223e-05, "loss": 1.4232, "step": 579 }, { "epoch": 0.17, "grad_norm": 8.621585845947266, "learning_rate": 2.7511441647597255e-05, "loss": 1.6084, "step": 580 }, { "epoch": 0.17, "grad_norm": 7.825051784515381, "learning_rate": 2.7507151029748284e-05, "loss": 1.4396, "step": 581 }, { "epoch": 0.17, "grad_norm": 8.34615707397461, "learning_rate": 2.7502860411899313e-05, "loss": 1.7879, "step": 582 }, { "epoch": 0.17, "grad_norm": 8.576688766479492, "learning_rate": 2.7498569794050345e-05, "loss": 1.1719, "step": 583 }, { "epoch": 0.17, "grad_norm": 8.739412307739258, "learning_rate": 2.7494279176201374e-05, "loss": 1.6384, "step": 584 }, { "epoch": 0.17, "grad_norm": 9.907939910888672, "learning_rate": 2.7489988558352406e-05, "loss": 1.5864, "step": 585 }, { "epoch": 0.17, "grad_norm": 8.430566787719727, "learning_rate": 2.748569794050343e-05, "loss": 1.5615, "step": 586 }, { "epoch": 0.17, "grad_norm": 8.017333030700684, "learning_rate": 2.7481407322654463e-05, "loss": 1.4049, "step": 587 }, { "epoch": 0.17, "grad_norm": 10.431910514831543, "learning_rate": 2.7477116704805492e-05, "loss": 1.5352, "step": 588 }, { "epoch": 0.17, "grad_norm": 9.034849166870117, "learning_rate": 2.7472826086956524e-05, "loss": 1.742, "step": 589 }, { "epoch": 0.17, "grad_norm": 8.847888946533203, "learning_rate": 2.746853546910755e-05, "loss": 1.4581, "step": 590 }, { "epoch": 0.17, "grad_norm": 9.845882415771484, "learning_rate": 2.7464244851258582e-05, "loss": 1.4175, "step": 591 }, { "epoch": 0.17, "grad_norm": 9.636897087097168, "learning_rate": 2.7459954233409614e-05, "loss": 1.5935, "step": 592 }, { "epoch": 0.17, "grad_norm": 9.328896522521973, "learning_rate": 2.745566361556064e-05, "loss": 1.4, "step": 593 }, { "epoch": 0.17, "grad_norm": 9.179758071899414, "learning_rate": 2.745137299771167e-05, "loss": 1.4922, "step": 594 }, { "epoch": 0.17, "grad_norm": 8.562376976013184, "learning_rate": 2.74470823798627e-05, "loss": 1.4231, "step": 595 }, { "epoch": 0.17, "grad_norm": 9.743012428283691, "learning_rate": 2.7442791762013733e-05, "loss": 1.457, "step": 596 }, { "epoch": 0.17, "grad_norm": 10.17732048034668, "learning_rate": 2.7438501144164758e-05, "loss": 1.7462, "step": 597 }, { "epoch": 0.17, "grad_norm": 8.325789451599121, "learning_rate": 2.743421052631579e-05, "loss": 1.6907, "step": 598 }, { "epoch": 0.17, "grad_norm": 9.406829833984375, "learning_rate": 2.742991990846682e-05, "loss": 1.3209, "step": 599 }, { "epoch": 0.17, "grad_norm": 9.432525634765625, "learning_rate": 2.742562929061785e-05, "loss": 1.827, "step": 600 }, { "epoch": 0.17, "grad_norm": 10.047086715698242, "learning_rate": 2.742133867276888e-05, "loss": 1.529, "step": 601 }, { "epoch": 0.17, "grad_norm": 10.171759605407715, "learning_rate": 2.741704805491991e-05, "loss": 1.5158, "step": 602 }, { "epoch": 0.17, "grad_norm": 7.706381320953369, "learning_rate": 2.741275743707094e-05, "loss": 1.468, "step": 603 }, { "epoch": 0.17, "grad_norm": 8.306461334228516, "learning_rate": 2.740846681922197e-05, "loss": 1.7419, "step": 604 }, { "epoch": 0.17, "grad_norm": 10.386027336120605, "learning_rate": 2.7404176201372998e-05, "loss": 1.3714, "step": 605 }, { "epoch": 0.17, "grad_norm": 10.559309959411621, "learning_rate": 2.7399885583524027e-05, "loss": 1.453, "step": 606 }, { "epoch": 0.17, "grad_norm": 10.220772743225098, "learning_rate": 2.739559496567506e-05, "loss": 1.5189, "step": 607 }, { "epoch": 0.17, "grad_norm": 10.321388244628906, "learning_rate": 2.7391304347826085e-05, "loss": 1.783, "step": 608 }, { "epoch": 0.17, "grad_norm": 9.188323020935059, "learning_rate": 2.7387013729977117e-05, "loss": 1.6611, "step": 609 }, { "epoch": 0.17, "grad_norm": 8.511995315551758, "learning_rate": 2.738272311212815e-05, "loss": 1.4972, "step": 610 }, { "epoch": 0.17, "grad_norm": 9.255911827087402, "learning_rate": 2.7378432494279178e-05, "loss": 1.6587, "step": 611 }, { "epoch": 0.18, "grad_norm": 9.88244342803955, "learning_rate": 2.7374141876430206e-05, "loss": 1.8485, "step": 612 }, { "epoch": 0.18, "grad_norm": 8.155802726745605, "learning_rate": 2.7369851258581235e-05, "loss": 1.3883, "step": 613 }, { "epoch": 0.18, "grad_norm": 10.00084114074707, "learning_rate": 2.7365560640732267e-05, "loss": 1.6237, "step": 614 }, { "epoch": 0.18, "grad_norm": 8.468073844909668, "learning_rate": 2.7361270022883296e-05, "loss": 1.3699, "step": 615 }, { "epoch": 0.18, "grad_norm": 10.218586921691895, "learning_rate": 2.7356979405034325e-05, "loss": 1.4842, "step": 616 }, { "epoch": 0.18, "grad_norm": 9.44926643371582, "learning_rate": 2.7352688787185357e-05, "loss": 1.5308, "step": 617 }, { "epoch": 0.18, "grad_norm": 8.894407272338867, "learning_rate": 2.7348398169336386e-05, "loss": 1.4603, "step": 618 }, { "epoch": 0.18, "grad_norm": 8.344779014587402, "learning_rate": 2.7344107551487415e-05, "loss": 1.4788, "step": 619 }, { "epoch": 0.18, "grad_norm": 10.086723327636719, "learning_rate": 2.7339816933638443e-05, "loss": 1.5314, "step": 620 }, { "epoch": 0.18, "grad_norm": 8.903287887573242, "learning_rate": 2.7335526315789475e-05, "loss": 1.2463, "step": 621 }, { "epoch": 0.18, "grad_norm": 9.488832473754883, "learning_rate": 2.7331235697940504e-05, "loss": 1.4667, "step": 622 }, { "epoch": 0.18, "grad_norm": 8.60849666595459, "learning_rate": 2.7326945080091533e-05, "loss": 1.4363, "step": 623 }, { "epoch": 0.18, "grad_norm": 8.344590187072754, "learning_rate": 2.7322654462242562e-05, "loss": 1.2329, "step": 624 }, { "epoch": 0.18, "grad_norm": 9.900020599365234, "learning_rate": 2.7318363844393594e-05, "loss": 1.6858, "step": 625 }, { "epoch": 0.18, "grad_norm": 8.698345184326172, "learning_rate": 2.7314073226544626e-05, "loss": 1.6498, "step": 626 }, { "epoch": 0.18, "grad_norm": 10.291359901428223, "learning_rate": 2.730978260869565e-05, "loss": 1.7879, "step": 627 }, { "epoch": 0.18, "grad_norm": 8.507843017578125, "learning_rate": 2.7305491990846684e-05, "loss": 1.7091, "step": 628 }, { "epoch": 0.18, "grad_norm": 9.516185760498047, "learning_rate": 2.7301201372997712e-05, "loss": 1.5036, "step": 629 }, { "epoch": 0.18, "grad_norm": 8.028081893920898, "learning_rate": 2.7296910755148745e-05, "loss": 1.2918, "step": 630 }, { "epoch": 0.18, "grad_norm": 8.034631729125977, "learning_rate": 2.729262013729977e-05, "loss": 1.7479, "step": 631 }, { "epoch": 0.18, "grad_norm": 8.290336608886719, "learning_rate": 2.7288329519450802e-05, "loss": 1.8074, "step": 632 }, { "epoch": 0.18, "grad_norm": 8.031563758850098, "learning_rate": 2.728403890160183e-05, "loss": 1.2922, "step": 633 }, { "epoch": 0.18, "grad_norm": 8.388860702514648, "learning_rate": 2.727974828375286e-05, "loss": 1.2661, "step": 634 }, { "epoch": 0.18, "grad_norm": 8.607510566711426, "learning_rate": 2.7275457665903892e-05, "loss": 1.5673, "step": 635 }, { "epoch": 0.18, "grad_norm": 9.935588836669922, "learning_rate": 2.727116704805492e-05, "loss": 1.7149, "step": 636 }, { "epoch": 0.18, "grad_norm": 6.709554672241211, "learning_rate": 2.7266876430205953e-05, "loss": 1.3551, "step": 637 }, { "epoch": 0.18, "grad_norm": 8.180497169494629, "learning_rate": 2.7262585812356978e-05, "loss": 1.473, "step": 638 }, { "epoch": 0.18, "grad_norm": 8.450583457946777, "learning_rate": 2.725829519450801e-05, "loss": 1.2896, "step": 639 }, { "epoch": 0.18, "grad_norm": 7.966728210449219, "learning_rate": 2.725400457665904e-05, "loss": 1.405, "step": 640 }, { "epoch": 0.18, "grad_norm": 10.327277183532715, "learning_rate": 2.724971395881007e-05, "loss": 1.6359, "step": 641 }, { "epoch": 0.18, "grad_norm": 9.075959205627441, "learning_rate": 2.7245423340961097e-05, "loss": 1.4942, "step": 642 }, { "epoch": 0.18, "grad_norm": 11.307262420654297, "learning_rate": 2.724113272311213e-05, "loss": 1.6728, "step": 643 }, { "epoch": 0.18, "grad_norm": 8.996954917907715, "learning_rate": 2.723684210526316e-05, "loss": 1.2933, "step": 644 }, { "epoch": 0.18, "grad_norm": 8.015362739562988, "learning_rate": 2.7232551487414186e-05, "loss": 1.2742, "step": 645 }, { "epoch": 0.18, "grad_norm": 8.029914855957031, "learning_rate": 2.722826086956522e-05, "loss": 1.5412, "step": 646 }, { "epoch": 0.19, "grad_norm": 8.644845962524414, "learning_rate": 2.7223970251716247e-05, "loss": 1.2903, "step": 647 }, { "epoch": 0.19, "grad_norm": 8.480746269226074, "learning_rate": 2.721967963386728e-05, "loss": 1.2119, "step": 648 }, { "epoch": 0.19, "grad_norm": 9.632102966308594, "learning_rate": 2.7215389016018305e-05, "loss": 1.7093, "step": 649 }, { "epoch": 0.19, "grad_norm": 10.944215774536133, "learning_rate": 2.7211098398169337e-05, "loss": 1.5005, "step": 650 }, { "epoch": 0.19, "grad_norm": 9.142459869384766, "learning_rate": 2.7206807780320366e-05, "loss": 1.3795, "step": 651 }, { "epoch": 0.19, "grad_norm": 7.930737495422363, "learning_rate": 2.7202517162471398e-05, "loss": 1.2816, "step": 652 }, { "epoch": 0.19, "grad_norm": 8.225480079650879, "learning_rate": 2.7198226544622427e-05, "loss": 1.3025, "step": 653 }, { "epoch": 0.19, "grad_norm": 10.08369255065918, "learning_rate": 2.7193935926773455e-05, "loss": 1.4614, "step": 654 }, { "epoch": 0.19, "grad_norm": 9.468962669372559, "learning_rate": 2.7189645308924487e-05, "loss": 1.5527, "step": 655 }, { "epoch": 0.19, "grad_norm": 10.897669792175293, "learning_rate": 2.7185354691075516e-05, "loss": 1.3351, "step": 656 }, { "epoch": 0.19, "grad_norm": 10.082230567932129, "learning_rate": 2.7181064073226545e-05, "loss": 1.416, "step": 657 }, { "epoch": 0.19, "grad_norm": 9.202457427978516, "learning_rate": 2.7176773455377574e-05, "loss": 1.7083, "step": 658 }, { "epoch": 0.19, "grad_norm": 8.446024894714355, "learning_rate": 2.7172482837528606e-05, "loss": 1.126, "step": 659 }, { "epoch": 0.19, "grad_norm": 8.490036010742188, "learning_rate": 2.7168192219679635e-05, "loss": 1.6602, "step": 660 }, { "epoch": 0.19, "grad_norm": 7.171084880828857, "learning_rate": 2.7163901601830663e-05, "loss": 1.2008, "step": 661 }, { "epoch": 0.19, "grad_norm": 7.329586982727051, "learning_rate": 2.7159610983981696e-05, "loss": 1.1091, "step": 662 }, { "epoch": 0.19, "grad_norm": 7.757038116455078, "learning_rate": 2.7155320366132724e-05, "loss": 1.4675, "step": 663 }, { "epoch": 0.19, "grad_norm": 10.028850555419922, "learning_rate": 2.7151029748283753e-05, "loss": 1.7565, "step": 664 }, { "epoch": 0.19, "grad_norm": 8.8215970993042, "learning_rate": 2.7146739130434782e-05, "loss": 1.3142, "step": 665 }, { "epoch": 0.19, "grad_norm": 9.601076126098633, "learning_rate": 2.7142448512585814e-05, "loss": 1.6848, "step": 666 }, { "epoch": 0.19, "grad_norm": 10.196307182312012, "learning_rate": 2.7138157894736843e-05, "loss": 1.879, "step": 667 }, { "epoch": 0.19, "grad_norm": 9.428377151489258, "learning_rate": 2.713386727688787e-05, "loss": 1.6253, "step": 668 }, { "epoch": 0.19, "grad_norm": 9.526021003723145, "learning_rate": 2.7129576659038904e-05, "loss": 1.2186, "step": 669 }, { "epoch": 0.19, "grad_norm": 9.465749740600586, "learning_rate": 2.7125286041189933e-05, "loss": 1.4507, "step": 670 }, { "epoch": 0.19, "grad_norm": 11.018233299255371, "learning_rate": 2.712099542334096e-05, "loss": 1.5466, "step": 671 }, { "epoch": 0.19, "grad_norm": 8.881890296936035, "learning_rate": 2.711670480549199e-05, "loss": 1.8524, "step": 672 }, { "epoch": 0.19, "grad_norm": 8.963903427124023, "learning_rate": 2.7112414187643022e-05, "loss": 1.3904, "step": 673 }, { "epoch": 0.19, "grad_norm": 8.687522888183594, "learning_rate": 2.710812356979405e-05, "loss": 1.1201, "step": 674 }, { "epoch": 0.19, "grad_norm": 8.622756004333496, "learning_rate": 2.710383295194508e-05, "loss": 1.2027, "step": 675 }, { "epoch": 0.19, "grad_norm": 9.560112953186035, "learning_rate": 2.709954233409611e-05, "loss": 1.7021, "step": 676 }, { "epoch": 0.19, "grad_norm": 7.986423015594482, "learning_rate": 2.709525171624714e-05, "loss": 1.2896, "step": 677 }, { "epoch": 0.19, "grad_norm": 9.79885482788086, "learning_rate": 2.7090961098398173e-05, "loss": 1.608, "step": 678 }, { "epoch": 0.19, "grad_norm": 8.915427207946777, "learning_rate": 2.7086670480549198e-05, "loss": 1.2062, "step": 679 }, { "epoch": 0.19, "grad_norm": 10.137808799743652, "learning_rate": 2.708237986270023e-05, "loss": 1.3713, "step": 680 }, { "epoch": 0.19, "grad_norm": 10.706426620483398, "learning_rate": 2.707808924485126e-05, "loss": 1.5678, "step": 681 }, { "epoch": 0.2, "grad_norm": 8.411629676818848, "learning_rate": 2.707379862700229e-05, "loss": 1.2383, "step": 682 }, { "epoch": 0.2, "grad_norm": 9.0653715133667, "learning_rate": 2.7069508009153317e-05, "loss": 1.2453, "step": 683 }, { "epoch": 0.2, "grad_norm": 9.046063423156738, "learning_rate": 2.706521739130435e-05, "loss": 1.2131, "step": 684 }, { "epoch": 0.2, "grad_norm": 8.591142654418945, "learning_rate": 2.7060926773455378e-05, "loss": 1.4589, "step": 685 }, { "epoch": 0.2, "grad_norm": 7.216357231140137, "learning_rate": 2.7056636155606406e-05, "loss": 1.0575, "step": 686 }, { "epoch": 0.2, "grad_norm": 11.10873031616211, "learning_rate": 2.705234553775744e-05, "loss": 1.6542, "step": 687 }, { "epoch": 0.2, "grad_norm": 11.519617080688477, "learning_rate": 2.7048054919908467e-05, "loss": 1.6119, "step": 688 }, { "epoch": 0.2, "grad_norm": 9.64222240447998, "learning_rate": 2.70437643020595e-05, "loss": 1.3678, "step": 689 }, { "epoch": 0.2, "grad_norm": 9.729552268981934, "learning_rate": 2.7039473684210525e-05, "loss": 1.2114, "step": 690 }, { "epoch": 0.2, "grad_norm": 9.77514362335205, "learning_rate": 2.7035183066361557e-05, "loss": 1.6144, "step": 691 }, { "epoch": 0.2, "grad_norm": 10.376748085021973, "learning_rate": 2.7030892448512586e-05, "loss": 1.4726, "step": 692 }, { "epoch": 0.2, "grad_norm": 9.687190055847168, "learning_rate": 2.7026601830663618e-05, "loss": 1.3794, "step": 693 }, { "epoch": 0.2, "grad_norm": 9.168863296508789, "learning_rate": 2.7022311212814643e-05, "loss": 1.573, "step": 694 }, { "epoch": 0.2, "grad_norm": 9.668581008911133, "learning_rate": 2.7018020594965675e-05, "loss": 1.5868, "step": 695 }, { "epoch": 0.2, "grad_norm": 8.229246139526367, "learning_rate": 2.7013729977116708e-05, "loss": 1.1747, "step": 696 }, { "epoch": 0.2, "grad_norm": 7.377071857452393, "learning_rate": 2.7009439359267733e-05, "loss": 1.3782, "step": 697 }, { "epoch": 0.2, "grad_norm": 9.349214553833008, "learning_rate": 2.7005148741418765e-05, "loss": 1.5642, "step": 698 }, { "epoch": 0.2, "grad_norm": 8.807302474975586, "learning_rate": 2.7000858123569794e-05, "loss": 1.4453, "step": 699 }, { "epoch": 0.2, "grad_norm": 8.276571273803711, "learning_rate": 2.6996567505720826e-05, "loss": 1.4524, "step": 700 }, { "epoch": 0.2, "grad_norm": 8.345519065856934, "learning_rate": 2.699227688787185e-05, "loss": 1.4265, "step": 701 }, { "epoch": 0.2, "grad_norm": 9.080941200256348, "learning_rate": 2.6987986270022884e-05, "loss": 1.5395, "step": 702 }, { "epoch": 0.2, "grad_norm": 7.670738697052002, "learning_rate": 2.6983695652173916e-05, "loss": 1.2509, "step": 703 }, { "epoch": 0.2, "grad_norm": 8.05324935913086, "learning_rate": 2.6979405034324945e-05, "loss": 1.3125, "step": 704 }, { "epoch": 0.2, "grad_norm": 7.438810348510742, "learning_rate": 2.6975114416475973e-05, "loss": 1.5452, "step": 705 }, { "epoch": 0.2, "grad_norm": 7.855247974395752, "learning_rate": 2.6970823798627002e-05, "loss": 1.6452, "step": 706 }, { "epoch": 0.2, "grad_norm": 8.98528003692627, "learning_rate": 2.6966533180778034e-05, "loss": 1.4263, "step": 707 }, { "epoch": 0.2, "grad_norm": 9.596749305725098, "learning_rate": 2.6962242562929063e-05, "loss": 1.4357, "step": 708 }, { "epoch": 0.2, "grad_norm": 8.83573055267334, "learning_rate": 2.6957951945080092e-05, "loss": 1.4635, "step": 709 }, { "epoch": 0.2, "grad_norm": 9.183515548706055, "learning_rate": 2.695366132723112e-05, "loss": 1.6072, "step": 710 }, { "epoch": 0.2, "grad_norm": 9.77737808227539, "learning_rate": 2.6949370709382153e-05, "loss": 1.5869, "step": 711 }, { "epoch": 0.2, "grad_norm": 8.03734302520752, "learning_rate": 2.694508009153318e-05, "loss": 1.3849, "step": 712 }, { "epoch": 0.2, "grad_norm": 10.255045890808105, "learning_rate": 2.694078947368421e-05, "loss": 1.3344, "step": 713 }, { "epoch": 0.2, "grad_norm": 8.329828262329102, "learning_rate": 2.6936498855835242e-05, "loss": 1.3703, "step": 714 }, { "epoch": 0.2, "grad_norm": 9.793808937072754, "learning_rate": 2.693220823798627e-05, "loss": 1.7685, "step": 715 }, { "epoch": 0.2, "grad_norm": 9.194530487060547, "learning_rate": 2.69279176201373e-05, "loss": 1.7428, "step": 716 }, { "epoch": 0.21, "grad_norm": 7.332390308380127, "learning_rate": 2.692362700228833e-05, "loss": 1.2059, "step": 717 }, { "epoch": 0.21, "grad_norm": 9.699625015258789, "learning_rate": 2.691933638443936e-05, "loss": 1.542, "step": 718 }, { "epoch": 0.21, "grad_norm": 8.5438232421875, "learning_rate": 2.691504576659039e-05, "loss": 1.3377, "step": 719 }, { "epoch": 0.21, "grad_norm": 10.025996208190918, "learning_rate": 2.691075514874142e-05, "loss": 1.7135, "step": 720 }, { "epoch": 0.21, "grad_norm": 9.530084609985352, "learning_rate": 2.690646453089245e-05, "loss": 1.1862, "step": 721 }, { "epoch": 0.21, "grad_norm": 9.832947731018066, "learning_rate": 2.690217391304348e-05, "loss": 1.5505, "step": 722 }, { "epoch": 0.21, "grad_norm": 9.562239646911621, "learning_rate": 2.6897883295194508e-05, "loss": 1.4249, "step": 723 }, { "epoch": 0.21, "grad_norm": 8.007452011108398, "learning_rate": 2.6893592677345537e-05, "loss": 1.3717, "step": 724 }, { "epoch": 0.21, "grad_norm": 9.98516845703125, "learning_rate": 2.688930205949657e-05, "loss": 1.7065, "step": 725 }, { "epoch": 0.21, "grad_norm": 7.55573844909668, "learning_rate": 2.6885011441647598e-05, "loss": 1.4353, "step": 726 }, { "epoch": 0.21, "grad_norm": 9.175204277038574, "learning_rate": 2.6880720823798627e-05, "loss": 1.2867, "step": 727 }, { "epoch": 0.21, "grad_norm": 9.250204086303711, "learning_rate": 2.6876430205949655e-05, "loss": 1.3647, "step": 728 }, { "epoch": 0.21, "grad_norm": 8.163433074951172, "learning_rate": 2.6872139588100687e-05, "loss": 1.4372, "step": 729 }, { "epoch": 0.21, "grad_norm": 9.446063041687012, "learning_rate": 2.686784897025172e-05, "loss": 1.7786, "step": 730 }, { "epoch": 0.21, "grad_norm": 7.554766654968262, "learning_rate": 2.6863558352402745e-05, "loss": 1.2214, "step": 731 }, { "epoch": 0.21, "grad_norm": 10.089883804321289, "learning_rate": 2.6859267734553777e-05, "loss": 1.451, "step": 732 }, { "epoch": 0.21, "grad_norm": 9.008309364318848, "learning_rate": 2.6854977116704806e-05, "loss": 1.8082, "step": 733 }, { "epoch": 0.21, "grad_norm": 8.668070793151855, "learning_rate": 2.6850686498855838e-05, "loss": 1.5068, "step": 734 }, { "epoch": 0.21, "grad_norm": 7.68413782119751, "learning_rate": 2.6846395881006863e-05, "loss": 1.3725, "step": 735 }, { "epoch": 0.21, "grad_norm": 7.327756881713867, "learning_rate": 2.6842105263157896e-05, "loss": 1.1768, "step": 736 }, { "epoch": 0.21, "grad_norm": 8.70918083190918, "learning_rate": 2.6837814645308928e-05, "loss": 1.3466, "step": 737 }, { "epoch": 0.21, "grad_norm": 8.751984596252441, "learning_rate": 2.6833524027459953e-05, "loss": 1.2856, "step": 738 }, { "epoch": 0.21, "grad_norm": 9.377949714660645, "learning_rate": 2.6829233409610985e-05, "loss": 1.7009, "step": 739 }, { "epoch": 0.21, "grad_norm": 10.846776962280273, "learning_rate": 2.6824942791762014e-05, "loss": 1.2112, "step": 740 }, { "epoch": 0.21, "grad_norm": 8.490901947021484, "learning_rate": 2.6820652173913046e-05, "loss": 1.3016, "step": 741 }, { "epoch": 0.21, "grad_norm": 9.792441368103027, "learning_rate": 2.681636155606407e-05, "loss": 1.6748, "step": 742 }, { "epoch": 0.21, "grad_norm": 9.022385597229004, "learning_rate": 2.6812070938215104e-05, "loss": 1.2802, "step": 743 }, { "epoch": 0.21, "grad_norm": 9.235421180725098, "learning_rate": 2.6807780320366133e-05, "loss": 1.1258, "step": 744 }, { "epoch": 0.21, "grad_norm": 8.907036781311035, "learning_rate": 2.6803489702517165e-05, "loss": 1.1832, "step": 745 }, { "epoch": 0.21, "grad_norm": 9.597955703735352, "learning_rate": 2.6799199084668193e-05, "loss": 1.4996, "step": 746 }, { "epoch": 0.21, "grad_norm": 9.799057006835938, "learning_rate": 2.6794908466819222e-05, "loss": 1.5921, "step": 747 }, { "epoch": 0.21, "grad_norm": 10.515116691589355, "learning_rate": 2.6790617848970254e-05, "loss": 1.8032, "step": 748 }, { "epoch": 0.21, "grad_norm": 8.887858390808105, "learning_rate": 2.6786327231121283e-05, "loss": 1.3823, "step": 749 }, { "epoch": 0.21, "grad_norm": 10.281638145446777, "learning_rate": 2.6782036613272312e-05, "loss": 1.9007, "step": 750 }, { "epoch": 0.21, "grad_norm": 7.96937370300293, "learning_rate": 2.677774599542334e-05, "loss": 1.3861, "step": 751 }, { "epoch": 0.22, "grad_norm": 8.151745796203613, "learning_rate": 2.6773455377574373e-05, "loss": 1.2723, "step": 752 }, { "epoch": 0.22, "grad_norm": 8.529725074768066, "learning_rate": 2.6769164759725398e-05, "loss": 1.5717, "step": 753 }, { "epoch": 0.22, "grad_norm": 7.393907070159912, "learning_rate": 2.676487414187643e-05, "loss": 1.1694, "step": 754 }, { "epoch": 0.22, "grad_norm": 8.489725112915039, "learning_rate": 2.6760583524027463e-05, "loss": 1.5208, "step": 755 }, { "epoch": 0.22, "grad_norm": 8.4804048538208, "learning_rate": 2.675629290617849e-05, "loss": 1.3057, "step": 756 }, { "epoch": 0.22, "grad_norm": 8.823626518249512, "learning_rate": 2.675200228832952e-05, "loss": 1.4457, "step": 757 }, { "epoch": 0.22, "grad_norm": 9.953473091125488, "learning_rate": 2.674771167048055e-05, "loss": 1.7318, "step": 758 }, { "epoch": 0.22, "grad_norm": 10.457352638244629, "learning_rate": 2.674342105263158e-05, "loss": 1.4098, "step": 759 }, { "epoch": 0.22, "grad_norm": 7.706247329711914, "learning_rate": 2.673913043478261e-05, "loss": 1.2625, "step": 760 }, { "epoch": 0.22, "grad_norm": 9.632022857666016, "learning_rate": 2.673483981693364e-05, "loss": 1.5531, "step": 761 }, { "epoch": 0.22, "grad_norm": 9.971721649169922, "learning_rate": 2.6730549199084667e-05, "loss": 1.4319, "step": 762 }, { "epoch": 0.22, "grad_norm": 7.860352516174316, "learning_rate": 2.67262585812357e-05, "loss": 1.2889, "step": 763 }, { "epoch": 0.22, "grad_norm": 8.265642166137695, "learning_rate": 2.6721967963386728e-05, "loss": 1.2929, "step": 764 }, { "epoch": 0.22, "grad_norm": 9.112139701843262, "learning_rate": 2.6717677345537757e-05, "loss": 1.632, "step": 765 }, { "epoch": 0.22, "grad_norm": 8.007532119750977, "learning_rate": 2.671338672768879e-05, "loss": 1.4319, "step": 766 }, { "epoch": 0.22, "grad_norm": 8.495431900024414, "learning_rate": 2.6709096109839818e-05, "loss": 1.3153, "step": 767 }, { "epoch": 0.22, "grad_norm": 8.244626998901367, "learning_rate": 2.6704805491990847e-05, "loss": 1.2545, "step": 768 }, { "epoch": 0.22, "grad_norm": 10.376943588256836, "learning_rate": 2.6700514874141875e-05, "loss": 1.3685, "step": 769 }, { "epoch": 0.22, "grad_norm": 12.076695442199707, "learning_rate": 2.6696224256292908e-05, "loss": 1.494, "step": 770 }, { "epoch": 0.22, "grad_norm": 9.511319160461426, "learning_rate": 2.6691933638443936e-05, "loss": 1.387, "step": 771 }, { "epoch": 0.22, "grad_norm": 8.889599800109863, "learning_rate": 2.6687643020594965e-05, "loss": 1.5296, "step": 772 }, { "epoch": 0.22, "grad_norm": 8.285345077514648, "learning_rate": 2.6683352402745997e-05, "loss": 1.578, "step": 773 }, { "epoch": 0.22, "grad_norm": 8.711544036865234, "learning_rate": 2.6679061784897026e-05, "loss": 1.3207, "step": 774 }, { "epoch": 0.22, "grad_norm": 9.246892929077148, "learning_rate": 2.6674771167048058e-05, "loss": 1.5279, "step": 775 }, { "epoch": 0.22, "grad_norm": 9.31828498840332, "learning_rate": 2.6670480549199084e-05, "loss": 1.5191, "step": 776 }, { "epoch": 0.22, "grad_norm": 10.230610847473145, "learning_rate": 2.6666189931350116e-05, "loss": 1.9263, "step": 777 }, { "epoch": 0.22, "grad_norm": 8.435502052307129, "learning_rate": 2.6661899313501145e-05, "loss": 1.5, "step": 778 }, { "epoch": 0.22, "grad_norm": 8.353205680847168, "learning_rate": 2.6657608695652173e-05, "loss": 1.1672, "step": 779 }, { "epoch": 0.22, "grad_norm": 9.82955265045166, "learning_rate": 2.6653318077803205e-05, "loss": 1.7323, "step": 780 }, { "epoch": 0.22, "grad_norm": 8.477160453796387, "learning_rate": 2.6649027459954234e-05, "loss": 1.5587, "step": 781 }, { "epoch": 0.22, "grad_norm": 7.85951566696167, "learning_rate": 2.6644736842105266e-05, "loss": 1.4767, "step": 782 }, { "epoch": 0.22, "grad_norm": 8.957002639770508, "learning_rate": 2.6640446224256292e-05, "loss": 1.2401, "step": 783 }, { "epoch": 0.22, "grad_norm": 9.295096397399902, "learning_rate": 2.6636155606407324e-05, "loss": 1.7779, "step": 784 }, { "epoch": 0.22, "grad_norm": 10.103282928466797, "learning_rate": 2.6631864988558353e-05, "loss": 1.6127, "step": 785 }, { "epoch": 0.22, "grad_norm": 8.313076972961426, "learning_rate": 2.6627574370709385e-05, "loss": 1.2176, "step": 786 }, { "epoch": 0.23, "grad_norm": 8.133856773376465, "learning_rate": 2.662328375286041e-05, "loss": 1.2762, "step": 787 }, { "epoch": 0.23, "grad_norm": 8.65207290649414, "learning_rate": 2.6618993135011442e-05, "loss": 1.6066, "step": 788 }, { "epoch": 0.23, "grad_norm": 10.28887939453125, "learning_rate": 2.6614702517162475e-05, "loss": 1.7862, "step": 789 }, { "epoch": 0.23, "grad_norm": 10.515134811401367, "learning_rate": 2.66104118993135e-05, "loss": 1.5071, "step": 790 }, { "epoch": 0.23, "grad_norm": 8.611674308776855, "learning_rate": 2.6606121281464532e-05, "loss": 1.5397, "step": 791 }, { "epoch": 0.23, "grad_norm": 10.166763305664062, "learning_rate": 2.660183066361556e-05, "loss": 1.294, "step": 792 }, { "epoch": 0.23, "grad_norm": 8.607548713684082, "learning_rate": 2.6597540045766593e-05, "loss": 1.3341, "step": 793 }, { "epoch": 0.23, "grad_norm": 9.909729957580566, "learning_rate": 2.659324942791762e-05, "loss": 1.4576, "step": 794 }, { "epoch": 0.23, "grad_norm": 9.688549995422363, "learning_rate": 2.658895881006865e-05, "loss": 1.3312, "step": 795 }, { "epoch": 0.23, "grad_norm": 9.35120677947998, "learning_rate": 2.658466819221968e-05, "loss": 1.3555, "step": 796 }, { "epoch": 0.23, "grad_norm": 10.456376075744629, "learning_rate": 2.658037757437071e-05, "loss": 1.6144, "step": 797 }, { "epoch": 0.23, "grad_norm": 9.537264823913574, "learning_rate": 2.657608695652174e-05, "loss": 1.5542, "step": 798 }, { "epoch": 0.23, "grad_norm": 10.017496109008789, "learning_rate": 2.657179633867277e-05, "loss": 1.5115, "step": 799 }, { "epoch": 0.23, "grad_norm": 9.876407623291016, "learning_rate": 2.65675057208238e-05, "loss": 1.461, "step": 800 }, { "epoch": 0.23, "grad_norm": 10.00885009765625, "learning_rate": 2.656321510297483e-05, "loss": 1.514, "step": 801 }, { "epoch": 0.23, "grad_norm": 8.821441650390625, "learning_rate": 2.655892448512586e-05, "loss": 1.4214, "step": 802 }, { "epoch": 0.23, "grad_norm": 8.728100776672363, "learning_rate": 2.6554633867276887e-05, "loss": 1.4235, "step": 803 }, { "epoch": 0.23, "grad_norm": 8.904654502868652, "learning_rate": 2.655034324942792e-05, "loss": 1.6595, "step": 804 }, { "epoch": 0.23, "grad_norm": 8.324999809265137, "learning_rate": 2.6546052631578945e-05, "loss": 1.4202, "step": 805 }, { "epoch": 0.23, "grad_norm": 8.588123321533203, "learning_rate": 2.6541762013729977e-05, "loss": 1.3708, "step": 806 }, { "epoch": 0.23, "grad_norm": 8.563580513000488, "learning_rate": 2.653747139588101e-05, "loss": 1.3736, "step": 807 }, { "epoch": 0.23, "grad_norm": 7.648112773895264, "learning_rate": 2.6533180778032038e-05, "loss": 1.3695, "step": 808 }, { "epoch": 0.23, "grad_norm": 8.872638702392578, "learning_rate": 2.6528890160183067e-05, "loss": 1.3779, "step": 809 }, { "epoch": 0.23, "grad_norm": 9.684102058410645, "learning_rate": 2.6524599542334096e-05, "loss": 1.7289, "step": 810 }, { "epoch": 0.23, "grad_norm": 9.175305366516113, "learning_rate": 2.6520308924485128e-05, "loss": 1.5158, "step": 811 }, { "epoch": 0.23, "grad_norm": 9.868097305297852, "learning_rate": 2.6516018306636156e-05, "loss": 1.7496, "step": 812 }, { "epoch": 0.23, "grad_norm": 8.471468925476074, "learning_rate": 2.6511727688787185e-05, "loss": 1.6277, "step": 813 }, { "epoch": 0.23, "grad_norm": 9.10570240020752, "learning_rate": 2.6507437070938214e-05, "loss": 1.6749, "step": 814 }, { "epoch": 0.23, "grad_norm": 8.941532135009766, "learning_rate": 2.6503146453089246e-05, "loss": 1.5023, "step": 815 }, { "epoch": 0.23, "grad_norm": 8.747757911682129, "learning_rate": 2.6498855835240275e-05, "loss": 1.4065, "step": 816 }, { "epoch": 0.23, "grad_norm": 10.095454216003418, "learning_rate": 2.6494565217391304e-05, "loss": 1.1893, "step": 817 }, { "epoch": 0.23, "grad_norm": 8.078791618347168, "learning_rate": 2.6490274599542336e-05, "loss": 1.3243, "step": 818 }, { "epoch": 0.23, "grad_norm": 8.536273002624512, "learning_rate": 2.6485983981693365e-05, "loss": 1.4985, "step": 819 }, { "epoch": 0.23, "grad_norm": 8.09534740447998, "learning_rate": 2.6481693363844393e-05, "loss": 1.3751, "step": 820 }, { "epoch": 0.23, "grad_norm": 9.298988342285156, "learning_rate": 2.6477402745995422e-05, "loss": 1.444, "step": 821 }, { "epoch": 0.24, "grad_norm": 7.982908725738525, "learning_rate": 2.6473112128146454e-05, "loss": 1.3142, "step": 822 }, { "epoch": 0.24, "grad_norm": 8.453125, "learning_rate": 2.6468821510297486e-05, "loss": 1.2621, "step": 823 }, { "epoch": 0.24, "grad_norm": 8.977948188781738, "learning_rate": 2.6464530892448512e-05, "loss": 1.2767, "step": 824 }, { "epoch": 0.24, "grad_norm": 8.755941390991211, "learning_rate": 2.6460240274599544e-05, "loss": 1.4843, "step": 825 }, { "epoch": 0.24, "grad_norm": 8.676729202270508, "learning_rate": 2.6455949656750573e-05, "loss": 1.3186, "step": 826 }, { "epoch": 0.24, "grad_norm": 8.526939392089844, "learning_rate": 2.6451659038901605e-05, "loss": 1.7197, "step": 827 }, { "epoch": 0.24, "grad_norm": 9.150410652160645, "learning_rate": 2.644736842105263e-05, "loss": 1.5229, "step": 828 }, { "epoch": 0.24, "grad_norm": 7.937966823577881, "learning_rate": 2.6443077803203662e-05, "loss": 1.1528, "step": 829 }, { "epoch": 0.24, "grad_norm": 8.968094825744629, "learning_rate": 2.643878718535469e-05, "loss": 1.2425, "step": 830 }, { "epoch": 0.24, "grad_norm": 9.370745658874512, "learning_rate": 2.643449656750572e-05, "loss": 1.5296, "step": 831 }, { "epoch": 0.24, "grad_norm": 10.294328689575195, "learning_rate": 2.6430205949656752e-05, "loss": 1.5658, "step": 832 }, { "epoch": 0.24, "grad_norm": 8.228590965270996, "learning_rate": 2.642591533180778e-05, "loss": 1.1133, "step": 833 }, { "epoch": 0.24, "grad_norm": 8.272261619567871, "learning_rate": 2.6421624713958813e-05, "loss": 1.2029, "step": 834 }, { "epoch": 0.24, "grad_norm": 10.820258140563965, "learning_rate": 2.641733409610984e-05, "loss": 1.517, "step": 835 }, { "epoch": 0.24, "grad_norm": 8.859976768493652, "learning_rate": 2.641304347826087e-05, "loss": 1.2971, "step": 836 }, { "epoch": 0.24, "grad_norm": 9.648213386535645, "learning_rate": 2.64087528604119e-05, "loss": 1.1995, "step": 837 }, { "epoch": 0.24, "grad_norm": 9.68163013458252, "learning_rate": 2.640446224256293e-05, "loss": 1.3573, "step": 838 }, { "epoch": 0.24, "grad_norm": 7.964151382446289, "learning_rate": 2.6400171624713957e-05, "loss": 0.964, "step": 839 }, { "epoch": 0.24, "grad_norm": 9.399771690368652, "learning_rate": 2.639588100686499e-05, "loss": 1.5236, "step": 840 }, { "epoch": 0.24, "grad_norm": 10.44339370727539, "learning_rate": 2.639159038901602e-05, "loss": 1.5872, "step": 841 }, { "epoch": 0.24, "grad_norm": 9.737083435058594, "learning_rate": 2.6387299771167047e-05, "loss": 1.6335, "step": 842 }, { "epoch": 0.24, "grad_norm": 9.311802864074707, "learning_rate": 2.638300915331808e-05, "loss": 1.254, "step": 843 }, { "epoch": 0.24, "grad_norm": 9.597392082214355, "learning_rate": 2.6378718535469108e-05, "loss": 1.2899, "step": 844 }, { "epoch": 0.24, "grad_norm": 8.301457405090332, "learning_rate": 2.637442791762014e-05, "loss": 1.1914, "step": 845 }, { "epoch": 0.24, "grad_norm": 9.583595275878906, "learning_rate": 2.6370137299771165e-05, "loss": 1.6079, "step": 846 }, { "epoch": 0.24, "grad_norm": 8.2188138961792, "learning_rate": 2.6365846681922197e-05, "loss": 1.4473, "step": 847 }, { "epoch": 0.24, "grad_norm": 9.318650245666504, "learning_rate": 2.6361556064073226e-05, "loss": 1.409, "step": 848 }, { "epoch": 0.24, "grad_norm": 8.837665557861328, "learning_rate": 2.6357265446224258e-05, "loss": 1.3829, "step": 849 }, { "epoch": 0.24, "grad_norm": 7.132809638977051, "learning_rate": 2.6352974828375287e-05, "loss": 1.2159, "step": 850 }, { "epoch": 0.24, "grad_norm": 7.9716339111328125, "learning_rate": 2.6348684210526316e-05, "loss": 1.5728, "step": 851 }, { "epoch": 0.24, "grad_norm": 7.7947492599487305, "learning_rate": 2.6344393592677348e-05, "loss": 1.3865, "step": 852 }, { "epoch": 0.24, "grad_norm": 7.062242031097412, "learning_rate": 2.6340102974828377e-05, "loss": 1.3908, "step": 853 }, { "epoch": 0.24, "grad_norm": 8.27694320678711, "learning_rate": 2.6335812356979405e-05, "loss": 1.6951, "step": 854 }, { "epoch": 0.24, "grad_norm": 7.277055263519287, "learning_rate": 2.6331521739130434e-05, "loss": 1.3148, "step": 855 }, { "epoch": 0.24, "grad_norm": 7.7327470779418945, "learning_rate": 2.6327231121281466e-05, "loss": 1.2293, "step": 856 }, { "epoch": 0.25, "grad_norm": 8.443324089050293, "learning_rate": 2.6322940503432495e-05, "loss": 1.2151, "step": 857 }, { "epoch": 0.25, "grad_norm": 8.355253219604492, "learning_rate": 2.6318649885583524e-05, "loss": 1.5138, "step": 858 }, { "epoch": 0.25, "grad_norm": 9.991596221923828, "learning_rate": 2.6314359267734556e-05, "loss": 1.4703, "step": 859 }, { "epoch": 0.25, "grad_norm": 8.560808181762695, "learning_rate": 2.6310068649885585e-05, "loss": 1.1305, "step": 860 }, { "epoch": 0.25, "grad_norm": 8.362797737121582, "learning_rate": 2.6305778032036614e-05, "loss": 1.2677, "step": 861 }, { "epoch": 0.25, "grad_norm": 8.149765014648438, "learning_rate": 2.6301487414187642e-05, "loss": 1.2291, "step": 862 }, { "epoch": 0.25, "grad_norm": 10.845412254333496, "learning_rate": 2.6297196796338674e-05, "loss": 1.2982, "step": 863 }, { "epoch": 0.25, "grad_norm": 7.772393226623535, "learning_rate": 2.6292906178489703e-05, "loss": 1.272, "step": 864 }, { "epoch": 0.25, "grad_norm": 8.768017768859863, "learning_rate": 2.6288615560640732e-05, "loss": 1.5599, "step": 865 }, { "epoch": 0.25, "grad_norm": 9.420010566711426, "learning_rate": 2.6284324942791764e-05, "loss": 1.1816, "step": 866 }, { "epoch": 0.25, "grad_norm": 8.9022798538208, "learning_rate": 2.6280034324942793e-05, "loss": 1.6228, "step": 867 }, { "epoch": 0.25, "grad_norm": 9.873797416687012, "learning_rate": 2.6275743707093822e-05, "loss": 1.3512, "step": 868 }, { "epoch": 0.25, "grad_norm": 7.665680408477783, "learning_rate": 2.627145308924485e-05, "loss": 1.1994, "step": 869 }, { "epoch": 0.25, "grad_norm": 9.856553077697754, "learning_rate": 2.6267162471395883e-05, "loss": 1.6137, "step": 870 }, { "epoch": 0.25, "grad_norm": 9.588093757629395, "learning_rate": 2.626287185354691e-05, "loss": 1.1853, "step": 871 }, { "epoch": 0.25, "grad_norm": 7.5686163902282715, "learning_rate": 2.625858123569794e-05, "loss": 1.3191, "step": 872 }, { "epoch": 0.25, "grad_norm": 8.717270851135254, "learning_rate": 2.625429061784897e-05, "loss": 1.556, "step": 873 }, { "epoch": 0.25, "grad_norm": 8.204094886779785, "learning_rate": 2.625e-05, "loss": 1.4066, "step": 874 }, { "epoch": 0.25, "grad_norm": 7.699957370758057, "learning_rate": 2.6245709382151033e-05, "loss": 1.2625, "step": 875 }, { "epoch": 0.25, "grad_norm": 6.8762640953063965, "learning_rate": 2.624141876430206e-05, "loss": 1.1352, "step": 876 }, { "epoch": 0.25, "grad_norm": 9.54216480255127, "learning_rate": 2.623712814645309e-05, "loss": 1.3852, "step": 877 }, { "epoch": 0.25, "grad_norm": 8.11685848236084, "learning_rate": 2.623283752860412e-05, "loss": 1.0808, "step": 878 }, { "epoch": 0.25, "grad_norm": 8.630313873291016, "learning_rate": 2.6228546910755152e-05, "loss": 1.2204, "step": 879 }, { "epoch": 0.25, "grad_norm": 9.078722953796387, "learning_rate": 2.6224256292906177e-05, "loss": 1.3517, "step": 880 }, { "epoch": 0.25, "grad_norm": 9.79880428314209, "learning_rate": 2.621996567505721e-05, "loss": 1.533, "step": 881 }, { "epoch": 0.25, "grad_norm": 8.823820114135742, "learning_rate": 2.6215675057208238e-05, "loss": 1.3513, "step": 882 }, { "epoch": 0.25, "grad_norm": 9.656079292297363, "learning_rate": 2.6211384439359267e-05, "loss": 1.3914, "step": 883 }, { "epoch": 0.25, "grad_norm": 9.426806449890137, "learning_rate": 2.62070938215103e-05, "loss": 1.4989, "step": 884 }, { "epoch": 0.25, "grad_norm": 8.181078910827637, "learning_rate": 2.6202803203661328e-05, "loss": 1.4191, "step": 885 }, { "epoch": 0.25, "grad_norm": 8.820877075195312, "learning_rate": 2.619851258581236e-05, "loss": 1.0893, "step": 886 }, { "epoch": 0.25, "grad_norm": 10.27674388885498, "learning_rate": 2.6194221967963385e-05, "loss": 1.4704, "step": 887 }, { "epoch": 0.25, "grad_norm": 8.574858665466309, "learning_rate": 2.6189931350114417e-05, "loss": 1.5897, "step": 888 }, { "epoch": 0.25, "grad_norm": 8.53704833984375, "learning_rate": 2.6185640732265446e-05, "loss": 1.2895, "step": 889 }, { "epoch": 0.25, "grad_norm": 7.642971515655518, "learning_rate": 2.618135011441648e-05, "loss": 1.3837, "step": 890 }, { "epoch": 0.25, "grad_norm": 8.106728553771973, "learning_rate": 2.6177059496567504e-05, "loss": 1.2967, "step": 891 }, { "epoch": 0.26, "grad_norm": 8.707520484924316, "learning_rate": 2.6172768878718536e-05, "loss": 1.6252, "step": 892 }, { "epoch": 0.26, "grad_norm": 9.524190902709961, "learning_rate": 2.6168478260869568e-05, "loss": 1.4453, "step": 893 }, { "epoch": 0.26, "grad_norm": 7.845490455627441, "learning_rate": 2.6164187643020593e-05, "loss": 1.2811, "step": 894 }, { "epoch": 0.26, "grad_norm": 9.159387588500977, "learning_rate": 2.6159897025171626e-05, "loss": 1.409, "step": 895 }, { "epoch": 0.26, "grad_norm": 7.53691291809082, "learning_rate": 2.6155606407322654e-05, "loss": 1.3078, "step": 896 }, { "epoch": 0.26, "grad_norm": 9.199991226196289, "learning_rate": 2.6151315789473686e-05, "loss": 1.3968, "step": 897 }, { "epoch": 0.26, "grad_norm": 9.692501068115234, "learning_rate": 2.6147025171624712e-05, "loss": 1.3681, "step": 898 }, { "epoch": 0.26, "grad_norm": 10.840760231018066, "learning_rate": 2.6142734553775744e-05, "loss": 1.4837, "step": 899 }, { "epoch": 0.26, "grad_norm": 9.615287780761719, "learning_rate": 2.6138443935926776e-05, "loss": 1.153, "step": 900 }, { "epoch": 0.26, "grad_norm": 11.4331693649292, "learning_rate": 2.6134153318077805e-05, "loss": 1.2511, "step": 901 }, { "epoch": 0.26, "grad_norm": 12.56297779083252, "learning_rate": 2.6129862700228834e-05, "loss": 1.4341, "step": 902 }, { "epoch": 0.26, "grad_norm": 10.176109313964844, "learning_rate": 2.6125572082379862e-05, "loss": 1.1976, "step": 903 }, { "epoch": 0.26, "grad_norm": 10.312824249267578, "learning_rate": 2.6121281464530895e-05, "loss": 1.3812, "step": 904 }, { "epoch": 0.26, "grad_norm": 9.968500137329102, "learning_rate": 2.6116990846681923e-05, "loss": 1.3292, "step": 905 }, { "epoch": 0.26, "grad_norm": 10.515392303466797, "learning_rate": 2.6112700228832952e-05, "loss": 1.5927, "step": 906 }, { "epoch": 0.26, "grad_norm": 10.473443031311035, "learning_rate": 2.610840961098398e-05, "loss": 1.4326, "step": 907 }, { "epoch": 0.26, "grad_norm": 10.10062313079834, "learning_rate": 2.6104118993135013e-05, "loss": 1.4284, "step": 908 }, { "epoch": 0.26, "grad_norm": 13.7451753616333, "learning_rate": 2.6099828375286042e-05, "loss": 1.7868, "step": 909 }, { "epoch": 0.26, "grad_norm": 8.696174621582031, "learning_rate": 2.609553775743707e-05, "loss": 1.1353, "step": 910 }, { "epoch": 0.26, "grad_norm": 7.880374908447266, "learning_rate": 2.6091247139588103e-05, "loss": 1.0382, "step": 911 }, { "epoch": 0.26, "grad_norm": 8.875225067138672, "learning_rate": 2.608695652173913e-05, "loss": 1.1643, "step": 912 }, { "epoch": 0.26, "grad_norm": 7.7749247550964355, "learning_rate": 2.608266590389016e-05, "loss": 1.1447, "step": 913 }, { "epoch": 0.26, "grad_norm": 10.65074634552002, "learning_rate": 2.607837528604119e-05, "loss": 1.2644, "step": 914 }, { "epoch": 0.26, "grad_norm": 9.620262145996094, "learning_rate": 2.607408466819222e-05, "loss": 1.668, "step": 915 }, { "epoch": 0.26, "grad_norm": 8.444010734558105, "learning_rate": 2.606979405034325e-05, "loss": 1.4725, "step": 916 }, { "epoch": 0.26, "grad_norm": 8.710188865661621, "learning_rate": 2.606550343249428e-05, "loss": 1.5552, "step": 917 }, { "epoch": 0.26, "grad_norm": 10.831197738647461, "learning_rate": 2.606121281464531e-05, "loss": 1.6034, "step": 918 }, { "epoch": 0.26, "grad_norm": 10.13759708404541, "learning_rate": 2.605692219679634e-05, "loss": 1.0555, "step": 919 }, { "epoch": 0.26, "grad_norm": 9.175713539123535, "learning_rate": 2.605263157894737e-05, "loss": 1.2767, "step": 920 }, { "epoch": 0.26, "grad_norm": 7.387826442718506, "learning_rate": 2.6048340961098397e-05, "loss": 1.0568, "step": 921 }, { "epoch": 0.26, "grad_norm": 9.601868629455566, "learning_rate": 2.604405034324943e-05, "loss": 1.578, "step": 922 }, { "epoch": 0.26, "grad_norm": 8.701878547668457, "learning_rate": 2.6039759725400458e-05, "loss": 1.486, "step": 923 }, { "epoch": 0.26, "grad_norm": 8.450980186462402, "learning_rate": 2.6035469107551487e-05, "loss": 1.3934, "step": 924 }, { "epoch": 0.26, "grad_norm": 8.329426765441895, "learning_rate": 2.6031178489702516e-05, "loss": 1.2754, "step": 925 }, { "epoch": 0.26, "grad_norm": 7.788487434387207, "learning_rate": 2.6026887871853548e-05, "loss": 1.0489, "step": 926 }, { "epoch": 0.27, "grad_norm": 10.402837753295898, "learning_rate": 2.602259725400458e-05, "loss": 1.487, "step": 927 }, { "epoch": 0.27, "grad_norm": 7.651103973388672, "learning_rate": 2.6018306636155605e-05, "loss": 1.3695, "step": 928 }, { "epoch": 0.27, "grad_norm": 8.581132888793945, "learning_rate": 2.6014016018306638e-05, "loss": 1.6594, "step": 929 }, { "epoch": 0.27, "grad_norm": 9.17892074584961, "learning_rate": 2.6009725400457666e-05, "loss": 1.5891, "step": 930 }, { "epoch": 0.27, "grad_norm": 8.242602348327637, "learning_rate": 2.60054347826087e-05, "loss": 1.2397, "step": 931 }, { "epoch": 0.27, "grad_norm": 7.623937129974365, "learning_rate": 2.6001144164759724e-05, "loss": 1.2555, "step": 932 }, { "epoch": 0.27, "grad_norm": 8.100603103637695, "learning_rate": 2.5996853546910756e-05, "loss": 1.2265, "step": 933 }, { "epoch": 0.27, "grad_norm": 8.403155326843262, "learning_rate": 2.5992562929061788e-05, "loss": 1.2792, "step": 934 }, { "epoch": 0.27, "grad_norm": 9.048784255981445, "learning_rate": 2.5988272311212814e-05, "loss": 1.5941, "step": 935 }, { "epoch": 0.27, "grad_norm": 9.061111450195312, "learning_rate": 2.5983981693363846e-05, "loss": 1.4503, "step": 936 }, { "epoch": 0.27, "grad_norm": 9.425355911254883, "learning_rate": 2.5979691075514874e-05, "loss": 1.3135, "step": 937 }, { "epoch": 0.27, "grad_norm": 7.993234634399414, "learning_rate": 2.5975400457665907e-05, "loss": 1.3449, "step": 938 }, { "epoch": 0.27, "grad_norm": 8.822248458862305, "learning_rate": 2.5971109839816932e-05, "loss": 1.2058, "step": 939 }, { "epoch": 0.27, "grad_norm": 8.437129974365234, "learning_rate": 2.5966819221967964e-05, "loss": 1.2206, "step": 940 }, { "epoch": 0.27, "grad_norm": 9.871660232543945, "learning_rate": 2.5962528604118993e-05, "loss": 1.4596, "step": 941 }, { "epoch": 0.27, "grad_norm": 8.384592056274414, "learning_rate": 2.5958237986270025e-05, "loss": 1.3168, "step": 942 }, { "epoch": 0.27, "grad_norm": 9.543497085571289, "learning_rate": 2.5953947368421054e-05, "loss": 1.3973, "step": 943 }, { "epoch": 0.27, "grad_norm": 9.770606994628906, "learning_rate": 2.5949656750572083e-05, "loss": 1.4777, "step": 944 }, { "epoch": 0.27, "grad_norm": 8.465770721435547, "learning_rate": 2.5945366132723115e-05, "loss": 1.2012, "step": 945 }, { "epoch": 0.27, "grad_norm": 8.79576587677002, "learning_rate": 2.5941075514874144e-05, "loss": 1.4603, "step": 946 }, { "epoch": 0.27, "grad_norm": 7.772630214691162, "learning_rate": 2.5936784897025172e-05, "loss": 1.4308, "step": 947 }, { "epoch": 0.27, "grad_norm": 9.234416961669922, "learning_rate": 2.59324942791762e-05, "loss": 1.3327, "step": 948 }, { "epoch": 0.27, "grad_norm": 8.109087944030762, "learning_rate": 2.5928203661327233e-05, "loss": 1.4196, "step": 949 }, { "epoch": 0.27, "grad_norm": 9.199943542480469, "learning_rate": 2.592391304347826e-05, "loss": 1.6369, "step": 950 }, { "epoch": 0.27, "grad_norm": 9.374139785766602, "learning_rate": 2.591962242562929e-05, "loss": 1.2285, "step": 951 }, { "epoch": 0.27, "grad_norm": 8.24150276184082, "learning_rate": 2.5915331807780323e-05, "loss": 1.117, "step": 952 }, { "epoch": 0.27, "grad_norm": 8.536650657653809, "learning_rate": 2.591104118993135e-05, "loss": 1.1041, "step": 953 }, { "epoch": 0.27, "grad_norm": 8.652554512023926, "learning_rate": 2.590675057208238e-05, "loss": 1.2805, "step": 954 }, { "epoch": 0.27, "grad_norm": 10.050910949707031, "learning_rate": 2.590245995423341e-05, "loss": 1.4703, "step": 955 }, { "epoch": 0.27, "grad_norm": 9.267804145812988, "learning_rate": 2.589816933638444e-05, "loss": 1.4239, "step": 956 }, { "epoch": 0.27, "grad_norm": 7.391750812530518, "learning_rate": 2.589387871853547e-05, "loss": 1.4491, "step": 957 }, { "epoch": 0.27, "grad_norm": 8.495930671691895, "learning_rate": 2.58895881006865e-05, "loss": 1.156, "step": 958 }, { "epoch": 0.27, "grad_norm": 8.142219543457031, "learning_rate": 2.5885297482837528e-05, "loss": 1.2461, "step": 959 }, { "epoch": 0.27, "grad_norm": 8.015151977539062, "learning_rate": 2.588100686498856e-05, "loss": 1.2013, "step": 960 }, { "epoch": 0.27, "grad_norm": 7.377295017242432, "learning_rate": 2.587671624713959e-05, "loss": 1.0362, "step": 961 }, { "epoch": 0.28, "grad_norm": 9.156243324279785, "learning_rate": 2.5872425629290617e-05, "loss": 1.4888, "step": 962 }, { "epoch": 0.28, "grad_norm": 8.230779647827148, "learning_rate": 2.586813501144165e-05, "loss": 1.4112, "step": 963 }, { "epoch": 0.28, "grad_norm": 9.809797286987305, "learning_rate": 2.5863844393592678e-05, "loss": 1.3778, "step": 964 }, { "epoch": 0.28, "grad_norm": 8.639060974121094, "learning_rate": 2.5859553775743707e-05, "loss": 1.2252, "step": 965 }, { "epoch": 0.28, "grad_norm": 8.97224235534668, "learning_rate": 2.5855263157894736e-05, "loss": 1.5233, "step": 966 }, { "epoch": 0.28, "grad_norm": 8.486656188964844, "learning_rate": 2.5850972540045768e-05, "loss": 1.5144, "step": 967 }, { "epoch": 0.28, "grad_norm": 8.242863655090332, "learning_rate": 2.5846681922196797e-05, "loss": 1.4416, "step": 968 }, { "epoch": 0.28, "grad_norm": 8.781201362609863, "learning_rate": 2.5842391304347826e-05, "loss": 1.1063, "step": 969 }, { "epoch": 0.28, "grad_norm": 7.582833290100098, "learning_rate": 2.5838100686498858e-05, "loss": 1.0988, "step": 970 }, { "epoch": 0.28, "grad_norm": 7.837419033050537, "learning_rate": 2.5833810068649886e-05, "loss": 1.3314, "step": 971 }, { "epoch": 0.28, "grad_norm": 9.807422637939453, "learning_rate": 2.582951945080092e-05, "loss": 1.3859, "step": 972 }, { "epoch": 0.28, "grad_norm": 9.319928169250488, "learning_rate": 2.5825228832951944e-05, "loss": 1.3565, "step": 973 }, { "epoch": 0.28, "grad_norm": 9.010677337646484, "learning_rate": 2.5820938215102976e-05, "loss": 1.4269, "step": 974 }, { "epoch": 0.28, "grad_norm": 9.191767692565918, "learning_rate": 2.5816647597254005e-05, "loss": 1.5395, "step": 975 }, { "epoch": 0.28, "grad_norm": 8.80266284942627, "learning_rate": 2.5812356979405034e-05, "loss": 1.4944, "step": 976 }, { "epoch": 0.28, "grad_norm": 9.184325218200684, "learning_rate": 2.5808066361556066e-05, "loss": 1.2486, "step": 977 }, { "epoch": 0.28, "grad_norm": 9.970193862915039, "learning_rate": 2.5803775743707095e-05, "loss": 1.3057, "step": 978 }, { "epoch": 0.28, "grad_norm": 8.319640159606934, "learning_rate": 2.5799485125858127e-05, "loss": 1.4045, "step": 979 }, { "epoch": 0.28, "grad_norm": 9.836695671081543, "learning_rate": 2.5795194508009152e-05, "loss": 1.3089, "step": 980 }, { "epoch": 0.28, "grad_norm": 8.435751914978027, "learning_rate": 2.5790903890160184e-05, "loss": 1.311, "step": 981 }, { "epoch": 0.28, "grad_norm": 8.068690299987793, "learning_rate": 2.5786613272311213e-05, "loss": 1.6868, "step": 982 }, { "epoch": 0.28, "grad_norm": 8.13770866394043, "learning_rate": 2.5782322654462245e-05, "loss": 1.5963, "step": 983 }, { "epoch": 0.28, "grad_norm": 7.724429607391357, "learning_rate": 2.577803203661327e-05, "loss": 1.4556, "step": 984 }, { "epoch": 0.28, "grad_norm": 7.583844184875488, "learning_rate": 2.5773741418764303e-05, "loss": 1.4503, "step": 985 }, { "epoch": 0.28, "grad_norm": 8.158873558044434, "learning_rate": 2.5769450800915335e-05, "loss": 1.3628, "step": 986 }, { "epoch": 0.28, "grad_norm": 7.549899578094482, "learning_rate": 2.576516018306636e-05, "loss": 1.2796, "step": 987 }, { "epoch": 0.28, "grad_norm": 8.432297706604004, "learning_rate": 2.5760869565217392e-05, "loss": 1.6099, "step": 988 }, { "epoch": 0.28, "grad_norm": 7.539736270904541, "learning_rate": 2.575657894736842e-05, "loss": 1.3312, "step": 989 }, { "epoch": 0.28, "grad_norm": 7.980632781982422, "learning_rate": 2.5752288329519453e-05, "loss": 1.3764, "step": 990 }, { "epoch": 0.28, "grad_norm": 7.134516716003418, "learning_rate": 2.574799771167048e-05, "loss": 1.0205, "step": 991 }, { "epoch": 0.28, "grad_norm": 8.90495777130127, "learning_rate": 2.574370709382151e-05, "loss": 1.597, "step": 992 }, { "epoch": 0.28, "grad_norm": 9.75139331817627, "learning_rate": 2.573941647597254e-05, "loss": 1.2573, "step": 993 }, { "epoch": 0.28, "grad_norm": 9.376870155334473, "learning_rate": 2.5735125858123572e-05, "loss": 1.8324, "step": 994 }, { "epoch": 0.28, "grad_norm": 8.90761947631836, "learning_rate": 2.57308352402746e-05, "loss": 1.2191, "step": 995 }, { "epoch": 0.28, "grad_norm": 8.86811351776123, "learning_rate": 2.572654462242563e-05, "loss": 1.4312, "step": 996 }, { "epoch": 0.29, "grad_norm": 9.165155410766602, "learning_rate": 2.572225400457666e-05, "loss": 1.4856, "step": 997 }, { "epoch": 0.29, "grad_norm": 8.089405059814453, "learning_rate": 2.571796338672769e-05, "loss": 1.3299, "step": 998 }, { "epoch": 0.29, "grad_norm": 8.177542686462402, "learning_rate": 2.571367276887872e-05, "loss": 1.3916, "step": 999 }, { "epoch": 0.29, "grad_norm": 8.137677192687988, "learning_rate": 2.5709382151029748e-05, "loss": 1.2121, "step": 1000 }, { "epoch": 0.29, "grad_norm": 8.527697563171387, "learning_rate": 2.570509153318078e-05, "loss": 1.4384, "step": 1001 }, { "epoch": 0.29, "grad_norm": 8.89223861694336, "learning_rate": 2.5700800915331805e-05, "loss": 1.1825, "step": 1002 }, { "epoch": 0.29, "grad_norm": 9.23388671875, "learning_rate": 2.5696510297482838e-05, "loss": 1.8242, "step": 1003 }, { "epoch": 0.29, "grad_norm": 8.942590713500977, "learning_rate": 2.569221967963387e-05, "loss": 1.5872, "step": 1004 }, { "epoch": 0.29, "grad_norm": 8.871176719665527, "learning_rate": 2.56879290617849e-05, "loss": 1.4072, "step": 1005 }, { "epoch": 0.29, "grad_norm": 8.461094856262207, "learning_rate": 2.5683638443935927e-05, "loss": 1.4736, "step": 1006 }, { "epoch": 0.29, "grad_norm": 7.951471328735352, "learning_rate": 2.5679347826086956e-05, "loss": 1.2968, "step": 1007 }, { "epoch": 0.29, "grad_norm": 7.7858171463012695, "learning_rate": 2.5675057208237988e-05, "loss": 1.3835, "step": 1008 }, { "epoch": 0.29, "grad_norm": 7.943772792816162, "learning_rate": 2.5670766590389017e-05, "loss": 1.8021, "step": 1009 }, { "epoch": 0.29, "grad_norm": 6.024533748626709, "learning_rate": 2.5666475972540046e-05, "loss": 0.9488, "step": 1010 }, { "epoch": 0.29, "grad_norm": 7.342346668243408, "learning_rate": 2.5662185354691074e-05, "loss": 1.1327, "step": 1011 }, { "epoch": 0.29, "grad_norm": 8.056375503540039, "learning_rate": 2.5657894736842107e-05, "loss": 1.4397, "step": 1012 }, { "epoch": 0.29, "grad_norm": 7.417347431182861, "learning_rate": 2.5653604118993135e-05, "loss": 1.2529, "step": 1013 }, { "epoch": 0.29, "grad_norm": 8.64712905883789, "learning_rate": 2.5649313501144164e-05, "loss": 1.3253, "step": 1014 }, { "epoch": 0.29, "grad_norm": 8.166894912719727, "learning_rate": 2.5645022883295196e-05, "loss": 1.5091, "step": 1015 }, { "epoch": 0.29, "grad_norm": 8.005392074584961, "learning_rate": 2.5640732265446225e-05, "loss": 1.226, "step": 1016 }, { "epoch": 0.29, "grad_norm": 7.414338111877441, "learning_rate": 2.5636441647597254e-05, "loss": 1.3174, "step": 1017 }, { "epoch": 0.29, "grad_norm": 8.061562538146973, "learning_rate": 2.5632151029748283e-05, "loss": 1.4758, "step": 1018 }, { "epoch": 0.29, "grad_norm": 8.46532917022705, "learning_rate": 2.5627860411899315e-05, "loss": 1.2356, "step": 1019 }, { "epoch": 0.29, "grad_norm": 7.5079026222229, "learning_rate": 2.5623569794050347e-05, "loss": 1.2108, "step": 1020 }, { "epoch": 0.29, "grad_norm": 8.118891716003418, "learning_rate": 2.5619279176201372e-05, "loss": 1.2809, "step": 1021 }, { "epoch": 0.29, "grad_norm": 9.336339950561523, "learning_rate": 2.5614988558352404e-05, "loss": 1.3214, "step": 1022 }, { "epoch": 0.29, "grad_norm": 7.880213737487793, "learning_rate": 2.5610697940503433e-05, "loss": 1.0074, "step": 1023 }, { "epoch": 0.29, "grad_norm": 10.576152801513672, "learning_rate": 2.5606407322654465e-05, "loss": 1.7207, "step": 1024 }, { "epoch": 0.29, "grad_norm": 8.325366973876953, "learning_rate": 2.560211670480549e-05, "loss": 1.353, "step": 1025 }, { "epoch": 0.29, "grad_norm": 7.758729457855225, "learning_rate": 2.5597826086956523e-05, "loss": 1.4028, "step": 1026 }, { "epoch": 0.29, "grad_norm": 8.77721118927002, "learning_rate": 2.559353546910755e-05, "loss": 1.5293, "step": 1027 }, { "epoch": 0.29, "grad_norm": 8.359360694885254, "learning_rate": 2.558924485125858e-05, "loss": 1.1269, "step": 1028 }, { "epoch": 0.29, "grad_norm": 8.833972930908203, "learning_rate": 2.5584954233409613e-05, "loss": 1.3712, "step": 1029 }, { "epoch": 0.29, "grad_norm": 8.514402389526367, "learning_rate": 2.558066361556064e-05, "loss": 1.204, "step": 1030 }, { "epoch": 0.29, "grad_norm": 9.313460350036621, "learning_rate": 2.5576372997711674e-05, "loss": 1.5063, "step": 1031 }, { "epoch": 0.3, "grad_norm": 10.018709182739258, "learning_rate": 2.55720823798627e-05, "loss": 1.2818, "step": 1032 }, { "epoch": 0.3, "grad_norm": 7.012324810028076, "learning_rate": 2.556779176201373e-05, "loss": 1.1244, "step": 1033 }, { "epoch": 0.3, "grad_norm": 8.470826148986816, "learning_rate": 2.556350114416476e-05, "loss": 1.2546, "step": 1034 }, { "epoch": 0.3, "grad_norm": 7.6571269035339355, "learning_rate": 2.5559210526315792e-05, "loss": 1.2892, "step": 1035 }, { "epoch": 0.3, "grad_norm": 8.890312194824219, "learning_rate": 2.5554919908466817e-05, "loss": 1.521, "step": 1036 }, { "epoch": 0.3, "grad_norm": 7.642049789428711, "learning_rate": 2.555062929061785e-05, "loss": 1.1408, "step": 1037 }, { "epoch": 0.3, "grad_norm": 8.882678985595703, "learning_rate": 2.554633867276888e-05, "loss": 1.2596, "step": 1038 }, { "epoch": 0.3, "grad_norm": 9.036952018737793, "learning_rate": 2.5542048054919907e-05, "loss": 1.331, "step": 1039 }, { "epoch": 0.3, "grad_norm": 8.739363670349121, "learning_rate": 2.553775743707094e-05, "loss": 1.2239, "step": 1040 }, { "epoch": 0.3, "grad_norm": 8.373655319213867, "learning_rate": 2.5533466819221968e-05, "loss": 1.2434, "step": 1041 }, { "epoch": 0.3, "grad_norm": 10.343050956726074, "learning_rate": 2.5529176201373e-05, "loss": 1.4349, "step": 1042 }, { "epoch": 0.3, "grad_norm": 8.710387229919434, "learning_rate": 2.5524885583524025e-05, "loss": 1.4334, "step": 1043 }, { "epoch": 0.3, "grad_norm": 9.289206504821777, "learning_rate": 2.5520594965675058e-05, "loss": 1.1788, "step": 1044 }, { "epoch": 0.3, "grad_norm": 10.75119400024414, "learning_rate": 2.5516304347826086e-05, "loss": 1.4688, "step": 1045 }, { "epoch": 0.3, "grad_norm": 8.680585861206055, "learning_rate": 2.551201372997712e-05, "loss": 1.449, "step": 1046 }, { "epoch": 0.3, "grad_norm": 7.56112813949585, "learning_rate": 2.5507723112128147e-05, "loss": 1.2321, "step": 1047 }, { "epoch": 0.3, "grad_norm": 8.956106185913086, "learning_rate": 2.5503432494279176e-05, "loss": 1.3308, "step": 1048 }, { "epoch": 0.3, "grad_norm": 8.079631805419922, "learning_rate": 2.5499141876430208e-05, "loss": 1.1529, "step": 1049 }, { "epoch": 0.3, "grad_norm": 9.40312671661377, "learning_rate": 2.5494851258581237e-05, "loss": 1.5271, "step": 1050 }, { "epoch": 0.3, "grad_norm": 9.621920585632324, "learning_rate": 2.5490560640732266e-05, "loss": 1.1845, "step": 1051 }, { "epoch": 0.3, "grad_norm": 8.569252967834473, "learning_rate": 2.5486270022883295e-05, "loss": 1.3228, "step": 1052 }, { "epoch": 0.3, "grad_norm": 8.784046173095703, "learning_rate": 2.5481979405034327e-05, "loss": 1.3341, "step": 1053 }, { "epoch": 0.3, "grad_norm": 9.730287551879883, "learning_rate": 2.5477688787185355e-05, "loss": 1.2053, "step": 1054 }, { "epoch": 0.3, "grad_norm": 9.498429298400879, "learning_rate": 2.5473398169336384e-05, "loss": 1.1694, "step": 1055 }, { "epoch": 0.3, "grad_norm": 7.870926856994629, "learning_rate": 2.5469107551487416e-05, "loss": 1.4959, "step": 1056 }, { "epoch": 0.3, "grad_norm": 7.9740214347839355, "learning_rate": 2.5464816933638445e-05, "loss": 1.4715, "step": 1057 }, { "epoch": 0.3, "grad_norm": 8.216447830200195, "learning_rate": 2.5460526315789474e-05, "loss": 1.1949, "step": 1058 }, { "epoch": 0.3, "grad_norm": 9.495555877685547, "learning_rate": 2.5456235697940503e-05, "loss": 1.5628, "step": 1059 }, { "epoch": 0.3, "grad_norm": 8.50665283203125, "learning_rate": 2.5451945080091535e-05, "loss": 1.275, "step": 1060 }, { "epoch": 0.3, "grad_norm": 10.58385181427002, "learning_rate": 2.5447654462242564e-05, "loss": 1.6663, "step": 1061 }, { "epoch": 0.3, "grad_norm": 7.590288162231445, "learning_rate": 2.5443363844393592e-05, "loss": 1.3047, "step": 1062 }, { "epoch": 0.3, "grad_norm": 10.212004661560059, "learning_rate": 2.5439073226544625e-05, "loss": 1.6188, "step": 1063 }, { "epoch": 0.3, "grad_norm": 9.003531455993652, "learning_rate": 2.5434782608695653e-05, "loss": 1.2925, "step": 1064 }, { "epoch": 0.3, "grad_norm": 7.413034439086914, "learning_rate": 2.5430491990846682e-05, "loss": 1.0456, "step": 1065 }, { "epoch": 0.3, "grad_norm": 8.396681785583496, "learning_rate": 2.542620137299771e-05, "loss": 1.2743, "step": 1066 }, { "epoch": 0.31, "grad_norm": 9.070167541503906, "learning_rate": 2.5421910755148743e-05, "loss": 1.4599, "step": 1067 }, { "epoch": 0.31, "grad_norm": 8.388463973999023, "learning_rate": 2.5417620137299772e-05, "loss": 1.3985, "step": 1068 }, { "epoch": 0.31, "grad_norm": 9.501354217529297, "learning_rate": 2.54133295194508e-05, "loss": 1.5014, "step": 1069 }, { "epoch": 0.31, "grad_norm": 8.798341751098633, "learning_rate": 2.540903890160183e-05, "loss": 1.1759, "step": 1070 }, { "epoch": 0.31, "grad_norm": 8.82196044921875, "learning_rate": 2.540474828375286e-05, "loss": 1.3665, "step": 1071 }, { "epoch": 0.31, "grad_norm": 10.14842700958252, "learning_rate": 2.5400457665903894e-05, "loss": 1.2128, "step": 1072 }, { "epoch": 0.31, "grad_norm": 8.154882431030273, "learning_rate": 2.539616704805492e-05, "loss": 1.2736, "step": 1073 }, { "epoch": 0.31, "grad_norm": 9.904618263244629, "learning_rate": 2.539187643020595e-05, "loss": 1.3211, "step": 1074 }, { "epoch": 0.31, "grad_norm": 9.278619766235352, "learning_rate": 2.538758581235698e-05, "loss": 1.3248, "step": 1075 }, { "epoch": 0.31, "grad_norm": 9.660553932189941, "learning_rate": 2.5383295194508012e-05, "loss": 1.2732, "step": 1076 }, { "epoch": 0.31, "grad_norm": 10.31358528137207, "learning_rate": 2.5379004576659037e-05, "loss": 1.4625, "step": 1077 }, { "epoch": 0.31, "grad_norm": 7.699115753173828, "learning_rate": 2.537471395881007e-05, "loss": 1.1086, "step": 1078 }, { "epoch": 0.31, "grad_norm": 7.902919292449951, "learning_rate": 2.53704233409611e-05, "loss": 1.0589, "step": 1079 }, { "epoch": 0.31, "grad_norm": 9.76630973815918, "learning_rate": 2.5366132723112127e-05, "loss": 1.1594, "step": 1080 }, { "epoch": 0.31, "grad_norm": 9.40144157409668, "learning_rate": 2.536184210526316e-05, "loss": 1.4975, "step": 1081 }, { "epoch": 0.31, "grad_norm": 8.331470489501953, "learning_rate": 2.5357551487414188e-05, "loss": 1.1938, "step": 1082 }, { "epoch": 0.31, "grad_norm": 7.63892126083374, "learning_rate": 2.535326086956522e-05, "loss": 1.3181, "step": 1083 }, { "epoch": 0.31, "grad_norm": 8.823601722717285, "learning_rate": 2.5348970251716246e-05, "loss": 1.4152, "step": 1084 }, { "epoch": 0.31, "grad_norm": 8.027241706848145, "learning_rate": 2.5344679633867278e-05, "loss": 1.2235, "step": 1085 }, { "epoch": 0.31, "grad_norm": 8.12237548828125, "learning_rate": 2.5340389016018307e-05, "loss": 1.3252, "step": 1086 }, { "epoch": 0.31, "grad_norm": 10.51408576965332, "learning_rate": 2.533609839816934e-05, "loss": 1.6956, "step": 1087 }, { "epoch": 0.31, "grad_norm": 8.42678451538086, "learning_rate": 2.5331807780320364e-05, "loss": 1.3338, "step": 1088 }, { "epoch": 0.31, "grad_norm": 8.596070289611816, "learning_rate": 2.5327517162471396e-05, "loss": 1.2526, "step": 1089 }, { "epoch": 0.31, "grad_norm": 7.929720401763916, "learning_rate": 2.532322654462243e-05, "loss": 1.1503, "step": 1090 }, { "epoch": 0.31, "grad_norm": 8.295254707336426, "learning_rate": 2.5318935926773457e-05, "loss": 1.6479, "step": 1091 }, { "epoch": 0.31, "grad_norm": 8.760801315307617, "learning_rate": 2.5314645308924486e-05, "loss": 1.492, "step": 1092 }, { "epoch": 0.31, "grad_norm": 8.011282920837402, "learning_rate": 2.5310354691075515e-05, "loss": 1.1961, "step": 1093 }, { "epoch": 0.31, "grad_norm": 8.240226745605469, "learning_rate": 2.5306064073226547e-05, "loss": 0.9983, "step": 1094 }, { "epoch": 0.31, "grad_norm": 8.712852478027344, "learning_rate": 2.5301773455377572e-05, "loss": 1.1892, "step": 1095 }, { "epoch": 0.31, "grad_norm": 11.721887588500977, "learning_rate": 2.5297482837528604e-05, "loss": 1.4409, "step": 1096 }, { "epoch": 0.31, "grad_norm": 9.104716300964355, "learning_rate": 2.5293192219679637e-05, "loss": 1.3911, "step": 1097 }, { "epoch": 0.31, "grad_norm": 8.200728416442871, "learning_rate": 2.5288901601830665e-05, "loss": 1.0253, "step": 1098 }, { "epoch": 0.31, "grad_norm": 9.017943382263184, "learning_rate": 2.5284610983981694e-05, "loss": 1.0505, "step": 1099 }, { "epoch": 0.31, "grad_norm": 10.048181533813477, "learning_rate": 2.5280320366132723e-05, "loss": 1.6158, "step": 1100 }, { "epoch": 0.31, "grad_norm": 9.796588897705078, "learning_rate": 2.5276029748283755e-05, "loss": 1.4182, "step": 1101 }, { "epoch": 0.32, "grad_norm": 8.351282119750977, "learning_rate": 2.5271739130434784e-05, "loss": 0.9927, "step": 1102 }, { "epoch": 0.32, "grad_norm": 9.599355697631836, "learning_rate": 2.5267448512585813e-05, "loss": 1.0907, "step": 1103 }, { "epoch": 0.32, "grad_norm": 10.412554740905762, "learning_rate": 2.526315789473684e-05, "loss": 1.4171, "step": 1104 }, { "epoch": 0.32, "grad_norm": 8.77164077758789, "learning_rate": 2.5258867276887873e-05, "loss": 1.3462, "step": 1105 }, { "epoch": 0.32, "grad_norm": 10.209586143493652, "learning_rate": 2.5254576659038902e-05, "loss": 1.2277, "step": 1106 }, { "epoch": 0.32, "grad_norm": 9.908252716064453, "learning_rate": 2.525028604118993e-05, "loss": 1.5864, "step": 1107 }, { "epoch": 0.32, "grad_norm": 7.8651933670043945, "learning_rate": 2.5245995423340963e-05, "loss": 1.3012, "step": 1108 }, { "epoch": 0.32, "grad_norm": 9.069273948669434, "learning_rate": 2.5241704805491992e-05, "loss": 1.1916, "step": 1109 }, { "epoch": 0.32, "grad_norm": 9.798157691955566, "learning_rate": 2.523741418764302e-05, "loss": 1.4687, "step": 1110 }, { "epoch": 0.32, "grad_norm": 8.591025352478027, "learning_rate": 2.523312356979405e-05, "loss": 1.2547, "step": 1111 }, { "epoch": 0.32, "grad_norm": 8.359914779663086, "learning_rate": 2.522883295194508e-05, "loss": 1.1862, "step": 1112 }, { "epoch": 0.32, "grad_norm": 8.099730491638184, "learning_rate": 2.522454233409611e-05, "loss": 1.4387, "step": 1113 }, { "epoch": 0.32, "grad_norm": 7.6268157958984375, "learning_rate": 2.522025171624714e-05, "loss": 1.2851, "step": 1114 }, { "epoch": 0.32, "grad_norm": 8.440269470214844, "learning_rate": 2.521596109839817e-05, "loss": 1.4515, "step": 1115 }, { "epoch": 0.32, "grad_norm": 8.150582313537598, "learning_rate": 2.52116704805492e-05, "loss": 1.4764, "step": 1116 }, { "epoch": 0.32, "grad_norm": 7.502106666564941, "learning_rate": 2.520737986270023e-05, "loss": 1.1288, "step": 1117 }, { "epoch": 0.32, "grad_norm": 8.629725456237793, "learning_rate": 2.5203089244851258e-05, "loss": 1.3621, "step": 1118 }, { "epoch": 0.32, "grad_norm": 8.673270225524902, "learning_rate": 2.519879862700229e-05, "loss": 1.2357, "step": 1119 }, { "epoch": 0.32, "grad_norm": 9.973155975341797, "learning_rate": 2.519450800915332e-05, "loss": 1.3026, "step": 1120 }, { "epoch": 0.32, "grad_norm": 8.627095222473145, "learning_rate": 2.5190217391304347e-05, "loss": 1.1334, "step": 1121 }, { "epoch": 0.32, "grad_norm": 7.878874778747559, "learning_rate": 2.5185926773455376e-05, "loss": 1.1232, "step": 1122 }, { "epoch": 0.32, "grad_norm": 9.94869613647461, "learning_rate": 2.5181636155606408e-05, "loss": 1.6974, "step": 1123 }, { "epoch": 0.32, "grad_norm": 10.141195297241211, "learning_rate": 2.517734553775744e-05, "loss": 1.6349, "step": 1124 }, { "epoch": 0.32, "grad_norm": 9.594303131103516, "learning_rate": 2.5173054919908466e-05, "loss": 1.6864, "step": 1125 }, { "epoch": 0.32, "grad_norm": 9.13325023651123, "learning_rate": 2.5168764302059498e-05, "loss": 1.2408, "step": 1126 }, { "epoch": 0.32, "grad_norm": 9.727703094482422, "learning_rate": 2.5164473684210527e-05, "loss": 1.5705, "step": 1127 }, { "epoch": 0.32, "grad_norm": 11.216474533081055, "learning_rate": 2.516018306636156e-05, "loss": 1.6617, "step": 1128 }, { "epoch": 0.32, "grad_norm": 8.66234016418457, "learning_rate": 2.5155892448512584e-05, "loss": 1.2739, "step": 1129 }, { "epoch": 0.32, "grad_norm": 8.064190864562988, "learning_rate": 2.5151601830663616e-05, "loss": 1.0814, "step": 1130 }, { "epoch": 0.32, "grad_norm": 9.823426246643066, "learning_rate": 2.5147311212814645e-05, "loss": 1.7493, "step": 1131 }, { "epoch": 0.32, "grad_norm": 9.140297889709473, "learning_rate": 2.5143020594965674e-05, "loss": 1.3828, "step": 1132 }, { "epoch": 0.32, "grad_norm": 8.560859680175781, "learning_rate": 2.5138729977116706e-05, "loss": 1.3355, "step": 1133 }, { "epoch": 0.32, "grad_norm": 8.429637908935547, "learning_rate": 2.5134439359267735e-05, "loss": 1.2483, "step": 1134 }, { "epoch": 0.32, "grad_norm": 9.343366622924805, "learning_rate": 2.5130148741418767e-05, "loss": 1.6462, "step": 1135 }, { "epoch": 0.32, "grad_norm": 7.374827861785889, "learning_rate": 2.5125858123569792e-05, "loss": 1.0401, "step": 1136 }, { "epoch": 0.33, "grad_norm": 8.14588737487793, "learning_rate": 2.5121567505720825e-05, "loss": 1.1931, "step": 1137 }, { "epoch": 0.33, "grad_norm": 9.2505521774292, "learning_rate": 2.5117276887871853e-05, "loss": 1.2012, "step": 1138 }, { "epoch": 0.33, "grad_norm": 8.059476852416992, "learning_rate": 2.5112986270022885e-05, "loss": 1.1364, "step": 1139 }, { "epoch": 0.33, "grad_norm": 7.84462308883667, "learning_rate": 2.5108695652173914e-05, "loss": 1.1497, "step": 1140 }, { "epoch": 0.33, "grad_norm": 8.2100830078125, "learning_rate": 2.5104405034324943e-05, "loss": 1.0862, "step": 1141 }, { "epoch": 0.33, "grad_norm": 9.621990203857422, "learning_rate": 2.5100114416475975e-05, "loss": 1.2482, "step": 1142 }, { "epoch": 0.33, "grad_norm": 7.5615925788879395, "learning_rate": 2.5095823798627004e-05, "loss": 0.9791, "step": 1143 }, { "epoch": 0.33, "grad_norm": 9.368245124816895, "learning_rate": 2.5091533180778033e-05, "loss": 1.5939, "step": 1144 }, { "epoch": 0.33, "grad_norm": 8.730009078979492, "learning_rate": 2.508724256292906e-05, "loss": 1.2557, "step": 1145 }, { "epoch": 0.33, "grad_norm": 10.07455062866211, "learning_rate": 2.5082951945080094e-05, "loss": 1.5275, "step": 1146 }, { "epoch": 0.33, "grad_norm": 8.566425323486328, "learning_rate": 2.507866132723112e-05, "loss": 1.1481, "step": 1147 }, { "epoch": 0.33, "grad_norm": 9.685506820678711, "learning_rate": 2.507437070938215e-05, "loss": 1.4072, "step": 1148 }, { "epoch": 0.33, "grad_norm": 10.160717010498047, "learning_rate": 2.5070080091533183e-05, "loss": 1.4458, "step": 1149 }, { "epoch": 0.33, "grad_norm": 9.259299278259277, "learning_rate": 2.5065789473684212e-05, "loss": 1.6841, "step": 1150 }, { "epoch": 0.33, "grad_norm": 9.61259937286377, "learning_rate": 2.506149885583524e-05, "loss": 1.4721, "step": 1151 }, { "epoch": 0.33, "grad_norm": 9.85054874420166, "learning_rate": 2.505720823798627e-05, "loss": 1.352, "step": 1152 }, { "epoch": 0.33, "grad_norm": 9.074386596679688, "learning_rate": 2.5052917620137302e-05, "loss": 1.4526, "step": 1153 }, { "epoch": 0.33, "grad_norm": 7.661362648010254, "learning_rate": 2.504862700228833e-05, "loss": 1.3133, "step": 1154 }, { "epoch": 0.33, "grad_norm": 8.18763256072998, "learning_rate": 2.504433638443936e-05, "loss": 1.5691, "step": 1155 }, { "epoch": 0.33, "grad_norm": 7.982461929321289, "learning_rate": 2.5040045766590388e-05, "loss": 1.3467, "step": 1156 }, { "epoch": 0.33, "grad_norm": 9.239413261413574, "learning_rate": 2.503575514874142e-05, "loss": 1.4486, "step": 1157 }, { "epoch": 0.33, "grad_norm": 7.908697605133057, "learning_rate": 2.503146453089245e-05, "loss": 1.4239, "step": 1158 }, { "epoch": 0.33, "grad_norm": 7.104166030883789, "learning_rate": 2.5027173913043478e-05, "loss": 1.1813, "step": 1159 }, { "epoch": 0.33, "grad_norm": 7.924987316131592, "learning_rate": 2.502288329519451e-05, "loss": 1.0608, "step": 1160 }, { "epoch": 0.33, "grad_norm": 7.379941940307617, "learning_rate": 2.501859267734554e-05, "loss": 1.3629, "step": 1161 }, { "epoch": 0.33, "grad_norm": 9.123723983764648, "learning_rate": 2.5014302059496567e-05, "loss": 1.3557, "step": 1162 }, { "epoch": 0.33, "grad_norm": 8.908727645874023, "learning_rate": 2.5010011441647596e-05, "loss": 1.0705, "step": 1163 }, { "epoch": 0.33, "grad_norm": 8.93742847442627, "learning_rate": 2.500572082379863e-05, "loss": 1.1082, "step": 1164 }, { "epoch": 0.33, "grad_norm": 10.610589981079102, "learning_rate": 2.5001430205949657e-05, "loss": 1.4131, "step": 1165 }, { "epoch": 0.33, "grad_norm": 8.712764739990234, "learning_rate": 2.4997139588100686e-05, "loss": 1.1069, "step": 1166 }, { "epoch": 0.33, "grad_norm": 9.058858871459961, "learning_rate": 2.4992848970251718e-05, "loss": 1.2948, "step": 1167 }, { "epoch": 0.33, "grad_norm": 7.90199613571167, "learning_rate": 2.4988558352402747e-05, "loss": 1.1113, "step": 1168 }, { "epoch": 0.33, "grad_norm": 10.635540008544922, "learning_rate": 2.498426773455378e-05, "loss": 1.4564, "step": 1169 }, { "epoch": 0.33, "grad_norm": 10.573737144470215, "learning_rate": 2.4979977116704804e-05, "loss": 1.4524, "step": 1170 }, { "epoch": 0.33, "grad_norm": 13.97392749786377, "learning_rate": 2.4975686498855837e-05, "loss": 1.7515, "step": 1171 }, { "epoch": 0.34, "grad_norm": 7.978637218475342, "learning_rate": 2.4971395881006865e-05, "loss": 0.9382, "step": 1172 }, { "epoch": 0.34, "grad_norm": 9.72246265411377, "learning_rate": 2.4967105263157894e-05, "loss": 1.3806, "step": 1173 }, { "epoch": 0.34, "grad_norm": 11.520387649536133, "learning_rate": 2.4962814645308926e-05, "loss": 1.3977, "step": 1174 }, { "epoch": 0.34, "grad_norm": 10.564435958862305, "learning_rate": 2.4958524027459955e-05, "loss": 1.323, "step": 1175 }, { "epoch": 0.34, "grad_norm": 10.678526878356934, "learning_rate": 2.4954233409610987e-05, "loss": 1.7055, "step": 1176 }, { "epoch": 0.34, "grad_norm": 10.193828582763672, "learning_rate": 2.4949942791762013e-05, "loss": 1.4728, "step": 1177 }, { "epoch": 0.34, "grad_norm": 9.24264144897461, "learning_rate": 2.4945652173913045e-05, "loss": 1.2945, "step": 1178 }, { "epoch": 0.34, "grad_norm": 9.4100341796875, "learning_rate": 2.4941361556064073e-05, "loss": 1.5619, "step": 1179 }, { "epoch": 0.34, "grad_norm": 9.36609172821045, "learning_rate": 2.4937070938215106e-05, "loss": 1.1721, "step": 1180 }, { "epoch": 0.34, "grad_norm": 7.610232353210449, "learning_rate": 2.493278032036613e-05, "loss": 1.3444, "step": 1181 }, { "epoch": 0.34, "grad_norm": 9.46660327911377, "learning_rate": 2.4928489702517163e-05, "loss": 1.6277, "step": 1182 }, { "epoch": 0.34, "grad_norm": 9.28674602508545, "learning_rate": 2.4924199084668195e-05, "loss": 1.2887, "step": 1183 }, { "epoch": 0.34, "grad_norm": 9.585891723632812, "learning_rate": 2.491990846681922e-05, "loss": 1.5058, "step": 1184 }, { "epoch": 0.34, "grad_norm": 7.711112976074219, "learning_rate": 2.4915617848970253e-05, "loss": 1.364, "step": 1185 }, { "epoch": 0.34, "grad_norm": 7.428664684295654, "learning_rate": 2.491132723112128e-05, "loss": 1.2072, "step": 1186 }, { "epoch": 0.34, "grad_norm": 7.880519866943359, "learning_rate": 2.4907036613272314e-05, "loss": 1.3054, "step": 1187 }, { "epoch": 0.34, "grad_norm": 8.168986320495605, "learning_rate": 2.490274599542334e-05, "loss": 1.3597, "step": 1188 }, { "epoch": 0.34, "grad_norm": 7.571588039398193, "learning_rate": 2.489845537757437e-05, "loss": 1.4104, "step": 1189 }, { "epoch": 0.34, "grad_norm": 8.356108665466309, "learning_rate": 2.48941647597254e-05, "loss": 1.1608, "step": 1190 }, { "epoch": 0.34, "grad_norm": 9.247628211975098, "learning_rate": 2.4889874141876432e-05, "loss": 1.4549, "step": 1191 }, { "epoch": 0.34, "grad_norm": 7.7991461753845215, "learning_rate": 2.488558352402746e-05, "loss": 1.1564, "step": 1192 }, { "epoch": 0.34, "grad_norm": 7.154600620269775, "learning_rate": 2.488129290617849e-05, "loss": 1.1743, "step": 1193 }, { "epoch": 0.34, "grad_norm": 8.795459747314453, "learning_rate": 2.4877002288329522e-05, "loss": 1.2105, "step": 1194 }, { "epoch": 0.34, "grad_norm": 7.458930969238281, "learning_rate": 2.487271167048055e-05, "loss": 1.2433, "step": 1195 }, { "epoch": 0.34, "grad_norm": 7.928001880645752, "learning_rate": 2.486842105263158e-05, "loss": 1.0818, "step": 1196 }, { "epoch": 0.34, "grad_norm": 8.688636779785156, "learning_rate": 2.4864130434782608e-05, "loss": 1.4069, "step": 1197 }, { "epoch": 0.34, "grad_norm": 8.312451362609863, "learning_rate": 2.485983981693364e-05, "loss": 1.1773, "step": 1198 }, { "epoch": 0.34, "grad_norm": 8.169563293457031, "learning_rate": 2.4855549199084666e-05, "loss": 1.1708, "step": 1199 }, { "epoch": 0.34, "grad_norm": 10.276508331298828, "learning_rate": 2.4851258581235698e-05, "loss": 1.6543, "step": 1200 }, { "epoch": 0.34, "grad_norm": 8.86247730255127, "learning_rate": 2.484696796338673e-05, "loss": 1.5157, "step": 1201 }, { "epoch": 0.34, "grad_norm": 7.955330848693848, "learning_rate": 2.484267734553776e-05, "loss": 1.0957, "step": 1202 }, { "epoch": 0.34, "grad_norm": 8.047369956970215, "learning_rate": 2.4838386727688788e-05, "loss": 1.135, "step": 1203 }, { "epoch": 0.34, "grad_norm": 8.094965934753418, "learning_rate": 2.4834096109839816e-05, "loss": 1.055, "step": 1204 }, { "epoch": 0.34, "grad_norm": 7.789654731750488, "learning_rate": 2.482980549199085e-05, "loss": 1.3808, "step": 1205 }, { "epoch": 0.34, "grad_norm": 7.9882707595825195, "learning_rate": 2.4825514874141877e-05, "loss": 1.4581, "step": 1206 }, { "epoch": 0.35, "grad_norm": 9.175067901611328, "learning_rate": 2.4821224256292906e-05, "loss": 1.4699, "step": 1207 }, { "epoch": 0.35, "grad_norm": 7.715549945831299, "learning_rate": 2.4816933638443935e-05, "loss": 1.2313, "step": 1208 }, { "epoch": 0.35, "grad_norm": 8.82292366027832, "learning_rate": 2.4812643020594967e-05, "loss": 1.3491, "step": 1209 }, { "epoch": 0.35, "grad_norm": 9.679081916809082, "learning_rate": 2.4808352402745996e-05, "loss": 1.3, "step": 1210 }, { "epoch": 0.35, "grad_norm": 8.480063438415527, "learning_rate": 2.4804061784897025e-05, "loss": 0.9899, "step": 1211 }, { "epoch": 0.35, "grad_norm": 8.44324016571045, "learning_rate": 2.4799771167048057e-05, "loss": 1.2802, "step": 1212 }, { "epoch": 0.35, "grad_norm": 8.433918952941895, "learning_rate": 2.4795480549199085e-05, "loss": 1.2748, "step": 1213 }, { "epoch": 0.35, "grad_norm": 8.200996398925781, "learning_rate": 2.4791189931350114e-05, "loss": 1.2814, "step": 1214 }, { "epoch": 0.35, "grad_norm": 9.791327476501465, "learning_rate": 2.4786899313501143e-05, "loss": 1.4816, "step": 1215 }, { "epoch": 0.35, "grad_norm": 8.12392520904541, "learning_rate": 2.4782608695652175e-05, "loss": 0.9496, "step": 1216 }, { "epoch": 0.35, "grad_norm": 8.058268547058105, "learning_rate": 2.4778318077803207e-05, "loss": 1.0657, "step": 1217 }, { "epoch": 0.35, "grad_norm": 8.287860870361328, "learning_rate": 2.4774027459954233e-05, "loss": 1.3242, "step": 1218 }, { "epoch": 0.35, "grad_norm": 9.007309913635254, "learning_rate": 2.4769736842105265e-05, "loss": 1.3125, "step": 1219 }, { "epoch": 0.35, "grad_norm": 8.979630470275879, "learning_rate": 2.4765446224256294e-05, "loss": 1.0767, "step": 1220 }, { "epoch": 0.35, "grad_norm": 9.455676078796387, "learning_rate": 2.4761155606407326e-05, "loss": 1.4545, "step": 1221 }, { "epoch": 0.35, "grad_norm": 10.137964248657227, "learning_rate": 2.475686498855835e-05, "loss": 1.5658, "step": 1222 }, { "epoch": 0.35, "grad_norm": 8.38525104522705, "learning_rate": 2.4752574370709383e-05, "loss": 1.3243, "step": 1223 }, { "epoch": 0.35, "grad_norm": 7.730583190917969, "learning_rate": 2.4748283752860412e-05, "loss": 1.1342, "step": 1224 }, { "epoch": 0.35, "grad_norm": 8.241644859313965, "learning_rate": 2.474399313501144e-05, "loss": 1.2161, "step": 1225 }, { "epoch": 0.35, "grad_norm": 8.795563697814941, "learning_rate": 2.4739702517162473e-05, "loss": 1.3466, "step": 1226 }, { "epoch": 0.35, "grad_norm": 7.525288105010986, "learning_rate": 2.4735411899313502e-05, "loss": 1.4963, "step": 1227 }, { "epoch": 0.35, "grad_norm": 8.251082420349121, "learning_rate": 2.4731121281464534e-05, "loss": 1.1266, "step": 1228 }, { "epoch": 0.35, "grad_norm": 8.439674377441406, "learning_rate": 2.472683066361556e-05, "loss": 1.0338, "step": 1229 }, { "epoch": 0.35, "grad_norm": 9.983979225158691, "learning_rate": 2.472254004576659e-05, "loss": 1.6099, "step": 1230 }, { "epoch": 0.35, "grad_norm": 9.698434829711914, "learning_rate": 2.471824942791762e-05, "loss": 1.6213, "step": 1231 }, { "epoch": 0.35, "grad_norm": 8.603226661682129, "learning_rate": 2.4713958810068652e-05, "loss": 1.5249, "step": 1232 }, { "epoch": 0.35, "grad_norm": 9.249344825744629, "learning_rate": 2.4709668192219678e-05, "loss": 1.0933, "step": 1233 }, { "epoch": 0.35, "grad_norm": 8.643524169921875, "learning_rate": 2.470537757437071e-05, "loss": 1.2073, "step": 1234 }, { "epoch": 0.35, "grad_norm": 8.731209754943848, "learning_rate": 2.4701086956521742e-05, "loss": 1.2141, "step": 1235 }, { "epoch": 0.35, "grad_norm": 8.993982315063477, "learning_rate": 2.4696796338672767e-05, "loss": 1.3216, "step": 1236 }, { "epoch": 0.35, "grad_norm": 8.99123764038086, "learning_rate": 2.46925057208238e-05, "loss": 1.4136, "step": 1237 }, { "epoch": 0.35, "grad_norm": 8.950716018676758, "learning_rate": 2.468821510297483e-05, "loss": 1.3142, "step": 1238 }, { "epoch": 0.35, "grad_norm": 9.027567863464355, "learning_rate": 2.468392448512586e-05, "loss": 1.1953, "step": 1239 }, { "epoch": 0.35, "grad_norm": 7.93207311630249, "learning_rate": 2.4679633867276886e-05, "loss": 0.9217, "step": 1240 }, { "epoch": 0.35, "grad_norm": 7.474572658538818, "learning_rate": 2.4675343249427918e-05, "loss": 0.9354, "step": 1241 }, { "epoch": 0.36, "grad_norm": 10.033756256103516, "learning_rate": 2.4671052631578947e-05, "loss": 1.4035, "step": 1242 }, { "epoch": 0.36, "grad_norm": 9.002933502197266, "learning_rate": 2.466676201372998e-05, "loss": 0.9634, "step": 1243 }, { "epoch": 0.36, "grad_norm": 8.01943302154541, "learning_rate": 2.4662471395881008e-05, "loss": 1.0456, "step": 1244 }, { "epoch": 0.36, "grad_norm": 10.665780067443848, "learning_rate": 2.4658180778032037e-05, "loss": 1.4181, "step": 1245 }, { "epoch": 0.36, "grad_norm": 9.919100761413574, "learning_rate": 2.465389016018307e-05, "loss": 1.3317, "step": 1246 }, { "epoch": 0.36, "grad_norm": 9.487162590026855, "learning_rate": 2.4649599542334097e-05, "loss": 1.3828, "step": 1247 }, { "epoch": 0.36, "grad_norm": 9.3540678024292, "learning_rate": 2.4645308924485126e-05, "loss": 1.1949, "step": 1248 }, { "epoch": 0.36, "grad_norm": 9.808308601379395, "learning_rate": 2.4641018306636155e-05, "loss": 1.156, "step": 1249 }, { "epoch": 0.36, "grad_norm": 9.10568904876709, "learning_rate": 2.4636727688787187e-05, "loss": 1.387, "step": 1250 }, { "epoch": 0.36, "grad_norm": 9.04727840423584, "learning_rate": 2.4632437070938212e-05, "loss": 1.1991, "step": 1251 }, { "epoch": 0.36, "grad_norm": 9.775404930114746, "learning_rate": 2.4628146453089245e-05, "loss": 1.5236, "step": 1252 }, { "epoch": 0.36, "grad_norm": 9.491841316223145, "learning_rate": 2.4623855835240277e-05, "loss": 1.1701, "step": 1253 }, { "epoch": 0.36, "grad_norm": 9.250205039978027, "learning_rate": 2.4619565217391306e-05, "loss": 1.4561, "step": 1254 }, { "epoch": 0.36, "grad_norm": 8.8140230178833, "learning_rate": 2.4615274599542334e-05, "loss": 1.1453, "step": 1255 }, { "epoch": 0.36, "grad_norm": 10.00520133972168, "learning_rate": 2.4610983981693363e-05, "loss": 1.4822, "step": 1256 }, { "epoch": 0.36, "grad_norm": 8.018996238708496, "learning_rate": 2.4606693363844395e-05, "loss": 1.0052, "step": 1257 }, { "epoch": 0.36, "grad_norm": 9.236218452453613, "learning_rate": 2.4602402745995424e-05, "loss": 1.6975, "step": 1258 }, { "epoch": 0.36, "grad_norm": 7.647527694702148, "learning_rate": 2.4598112128146453e-05, "loss": 1.2796, "step": 1259 }, { "epoch": 0.36, "grad_norm": 8.628692626953125, "learning_rate": 2.4593821510297485e-05, "loss": 1.2854, "step": 1260 }, { "epoch": 0.36, "grad_norm": 8.866012573242188, "learning_rate": 2.4589530892448514e-05, "loss": 1.3675, "step": 1261 }, { "epoch": 0.36, "grad_norm": 10.024090766906738, "learning_rate": 2.4585240274599542e-05, "loss": 1.3188, "step": 1262 }, { "epoch": 0.36, "grad_norm": 9.781789779663086, "learning_rate": 2.458094965675057e-05, "loss": 1.4322, "step": 1263 }, { "epoch": 0.36, "grad_norm": 8.879053115844727, "learning_rate": 2.4576659038901603e-05, "loss": 1.437, "step": 1264 }, { "epoch": 0.36, "grad_norm": 8.417719841003418, "learning_rate": 2.4572368421052632e-05, "loss": 1.0245, "step": 1265 }, { "epoch": 0.36, "grad_norm": 11.67907428741455, "learning_rate": 2.456807780320366e-05, "loss": 1.1642, "step": 1266 }, { "epoch": 0.36, "grad_norm": 8.326916694641113, "learning_rate": 2.456378718535469e-05, "loss": 1.547, "step": 1267 }, { "epoch": 0.36, "grad_norm": 8.553854942321777, "learning_rate": 2.4559496567505722e-05, "loss": 1.4601, "step": 1268 }, { "epoch": 0.36, "grad_norm": 8.98757266998291, "learning_rate": 2.4555205949656754e-05, "loss": 1.1181, "step": 1269 }, { "epoch": 0.36, "grad_norm": 8.45922565460205, "learning_rate": 2.455091533180778e-05, "loss": 1.0405, "step": 1270 }, { "epoch": 0.36, "grad_norm": 10.106505393981934, "learning_rate": 2.454662471395881e-05, "loss": 1.4776, "step": 1271 }, { "epoch": 0.36, "grad_norm": 9.02731990814209, "learning_rate": 2.454233409610984e-05, "loss": 1.6652, "step": 1272 }, { "epoch": 0.36, "grad_norm": 10.727587699890137, "learning_rate": 2.4538043478260872e-05, "loss": 1.2804, "step": 1273 }, { "epoch": 0.36, "grad_norm": 9.161126136779785, "learning_rate": 2.4533752860411898e-05, "loss": 1.3514, "step": 1274 }, { "epoch": 0.36, "grad_norm": 9.589289665222168, "learning_rate": 2.452946224256293e-05, "loss": 1.3186, "step": 1275 }, { "epoch": 0.36, "grad_norm": 7.296161651611328, "learning_rate": 2.452517162471396e-05, "loss": 1.2932, "step": 1276 }, { "epoch": 0.37, "grad_norm": 8.590826034545898, "learning_rate": 2.4520881006864988e-05, "loss": 1.1958, "step": 1277 }, { "epoch": 0.37, "grad_norm": 9.288223266601562, "learning_rate": 2.451659038901602e-05, "loss": 1.1643, "step": 1278 }, { "epoch": 0.37, "grad_norm": 8.862256050109863, "learning_rate": 2.451229977116705e-05, "loss": 1.5331, "step": 1279 }, { "epoch": 0.37, "grad_norm": 8.638525009155273, "learning_rate": 2.450800915331808e-05, "loss": 1.3783, "step": 1280 }, { "epoch": 0.37, "grad_norm": 8.808732986450195, "learning_rate": 2.4503718535469106e-05, "loss": 1.4159, "step": 1281 }, { "epoch": 0.37, "grad_norm": 8.5829496383667, "learning_rate": 2.4499427917620138e-05, "loss": 1.4327, "step": 1282 }, { "epoch": 0.37, "grad_norm": 9.102815628051758, "learning_rate": 2.4495137299771167e-05, "loss": 1.4051, "step": 1283 }, { "epoch": 0.37, "grad_norm": 9.11328125, "learning_rate": 2.44908466819222e-05, "loss": 1.4893, "step": 1284 }, { "epoch": 0.37, "grad_norm": 8.082062721252441, "learning_rate": 2.4486556064073224e-05, "loss": 1.1809, "step": 1285 }, { "epoch": 0.37, "grad_norm": 9.857675552368164, "learning_rate": 2.4482265446224257e-05, "loss": 1.3247, "step": 1286 }, { "epoch": 0.37, "grad_norm": 7.963157653808594, "learning_rate": 2.447797482837529e-05, "loss": 1.0194, "step": 1287 }, { "epoch": 0.37, "grad_norm": 7.50498628616333, "learning_rate": 2.4473684210526318e-05, "loss": 0.9924, "step": 1288 }, { "epoch": 0.37, "grad_norm": 8.573609352111816, "learning_rate": 2.4469393592677346e-05, "loss": 1.6141, "step": 1289 }, { "epoch": 0.37, "grad_norm": 9.218179702758789, "learning_rate": 2.4465102974828375e-05, "loss": 1.365, "step": 1290 }, { "epoch": 0.37, "grad_norm": 8.033930778503418, "learning_rate": 2.4460812356979407e-05, "loss": 1.4356, "step": 1291 }, { "epoch": 0.37, "grad_norm": 8.373608589172363, "learning_rate": 2.4456521739130433e-05, "loss": 1.4214, "step": 1292 }, { "epoch": 0.37, "grad_norm": 9.133663177490234, "learning_rate": 2.4452231121281465e-05, "loss": 1.2398, "step": 1293 }, { "epoch": 0.37, "grad_norm": 9.034195899963379, "learning_rate": 2.4447940503432497e-05, "loss": 1.3809, "step": 1294 }, { "epoch": 0.37, "grad_norm": 9.040098190307617, "learning_rate": 2.4443649885583526e-05, "loss": 1.3916, "step": 1295 }, { "epoch": 0.37, "grad_norm": 7.705639362335205, "learning_rate": 2.4439359267734554e-05, "loss": 1.3414, "step": 1296 }, { "epoch": 0.37, "grad_norm": 9.405379295349121, "learning_rate": 2.4435068649885583e-05, "loss": 1.5481, "step": 1297 }, { "epoch": 0.37, "grad_norm": 7.484687805175781, "learning_rate": 2.4430778032036615e-05, "loss": 1.2209, "step": 1298 }, { "epoch": 0.37, "grad_norm": 7.746870994567871, "learning_rate": 2.4426487414187644e-05, "loss": 1.2595, "step": 1299 }, { "epoch": 0.37, "grad_norm": 9.075613975524902, "learning_rate": 2.4422196796338673e-05, "loss": 1.2402, "step": 1300 }, { "epoch": 0.37, "grad_norm": 9.48637580871582, "learning_rate": 2.4417906178489702e-05, "loss": 1.4072, "step": 1301 }, { "epoch": 0.37, "grad_norm": 8.419695854187012, "learning_rate": 2.4413615560640734e-05, "loss": 0.9975, "step": 1302 }, { "epoch": 0.37, "grad_norm": 8.64478874206543, "learning_rate": 2.4409324942791763e-05, "loss": 1.2916, "step": 1303 }, { "epoch": 0.37, "grad_norm": 8.342888832092285, "learning_rate": 2.440503432494279e-05, "loss": 1.3963, "step": 1304 }, { "epoch": 0.37, "grad_norm": 7.8090362548828125, "learning_rate": 2.4400743707093824e-05, "loss": 1.1569, "step": 1305 }, { "epoch": 0.37, "grad_norm": 8.666332244873047, "learning_rate": 2.4396453089244852e-05, "loss": 1.4286, "step": 1306 }, { "epoch": 0.37, "grad_norm": 10.178092956542969, "learning_rate": 2.439216247139588e-05, "loss": 1.7242, "step": 1307 }, { "epoch": 0.37, "grad_norm": 8.932698249816895, "learning_rate": 2.438787185354691e-05, "loss": 1.2023, "step": 1308 }, { "epoch": 0.37, "grad_norm": 8.65407657623291, "learning_rate": 2.4383581235697942e-05, "loss": 1.299, "step": 1309 }, { "epoch": 0.37, "grad_norm": 8.978166580200195, "learning_rate": 2.437929061784897e-05, "loss": 1.3771, "step": 1310 }, { "epoch": 0.38, "grad_norm": 7.935179710388184, "learning_rate": 2.4375e-05, "loss": 1.3486, "step": 1311 }, { "epoch": 0.38, "grad_norm": 7.84268045425415, "learning_rate": 2.4370709382151032e-05, "loss": 1.0263, "step": 1312 }, { "epoch": 0.38, "grad_norm": 7.6278228759765625, "learning_rate": 2.436641876430206e-05, "loss": 1.2904, "step": 1313 }, { "epoch": 0.38, "grad_norm": 8.676783561706543, "learning_rate": 2.436212814645309e-05, "loss": 0.9318, "step": 1314 }, { "epoch": 0.38, "grad_norm": 7.841330528259277, "learning_rate": 2.4357837528604118e-05, "loss": 1.1714, "step": 1315 }, { "epoch": 0.38, "grad_norm": 8.406579971313477, "learning_rate": 2.435354691075515e-05, "loss": 1.2257, "step": 1316 }, { "epoch": 0.38, "grad_norm": 8.868280410766602, "learning_rate": 2.434925629290618e-05, "loss": 1.3, "step": 1317 }, { "epoch": 0.38, "grad_norm": 8.387145042419434, "learning_rate": 2.4344965675057208e-05, "loss": 1.1957, "step": 1318 }, { "epoch": 0.38, "grad_norm": 8.952069282531738, "learning_rate": 2.4340675057208236e-05, "loss": 1.0876, "step": 1319 }, { "epoch": 0.38, "grad_norm": 9.023176193237305, "learning_rate": 2.433638443935927e-05, "loss": 1.3813, "step": 1320 }, { "epoch": 0.38, "grad_norm": 8.673901557922363, "learning_rate": 2.43320938215103e-05, "loss": 1.179, "step": 1321 }, { "epoch": 0.38, "grad_norm": 11.203038215637207, "learning_rate": 2.4327803203661326e-05, "loss": 1.0637, "step": 1322 }, { "epoch": 0.38, "grad_norm": 8.70146656036377, "learning_rate": 2.432351258581236e-05, "loss": 1.3067, "step": 1323 }, { "epoch": 0.38, "grad_norm": 9.276034355163574, "learning_rate": 2.4319221967963387e-05, "loss": 1.3299, "step": 1324 }, { "epoch": 0.38, "grad_norm": 9.101346015930176, "learning_rate": 2.431493135011442e-05, "loss": 1.3958, "step": 1325 }, { "epoch": 0.38, "grad_norm": 8.510576248168945, "learning_rate": 2.4310640732265445e-05, "loss": 1.1045, "step": 1326 }, { "epoch": 0.38, "grad_norm": 9.810791969299316, "learning_rate": 2.4306350114416477e-05, "loss": 1.2913, "step": 1327 }, { "epoch": 0.38, "grad_norm": 8.729816436767578, "learning_rate": 2.4302059496567506e-05, "loss": 1.2609, "step": 1328 }, { "epoch": 0.38, "grad_norm": 8.372541427612305, "learning_rate": 2.4297768878718534e-05, "loss": 1.4533, "step": 1329 }, { "epoch": 0.38, "grad_norm": 8.771547317504883, "learning_rate": 2.4293478260869566e-05, "loss": 1.376, "step": 1330 }, { "epoch": 0.38, "grad_norm": 9.162940979003906, "learning_rate": 2.4289187643020595e-05, "loss": 1.2187, "step": 1331 }, { "epoch": 0.38, "grad_norm": 9.148791313171387, "learning_rate": 2.4284897025171627e-05, "loss": 1.3383, "step": 1332 }, { "epoch": 0.38, "grad_norm": 8.023940086364746, "learning_rate": 2.4280606407322653e-05, "loss": 1.0984, "step": 1333 }, { "epoch": 0.38, "grad_norm": 7.9747843742370605, "learning_rate": 2.4276315789473685e-05, "loss": 1.1721, "step": 1334 }, { "epoch": 0.38, "grad_norm": 9.558140754699707, "learning_rate": 2.4272025171624714e-05, "loss": 1.2776, "step": 1335 }, { "epoch": 0.38, "grad_norm": 11.381139755249023, "learning_rate": 2.4267734553775746e-05, "loss": 1.1888, "step": 1336 }, { "epoch": 0.38, "grad_norm": 10.013339042663574, "learning_rate": 2.4263443935926775e-05, "loss": 1.5181, "step": 1337 }, { "epoch": 0.38, "grad_norm": 9.366135597229004, "learning_rate": 2.4259153318077803e-05, "loss": 1.1374, "step": 1338 }, { "epoch": 0.38, "grad_norm": 8.258301734924316, "learning_rate": 2.4254862700228836e-05, "loss": 1.1101, "step": 1339 }, { "epoch": 0.38, "grad_norm": 8.093093872070312, "learning_rate": 2.4250572082379864e-05, "loss": 1.0627, "step": 1340 }, { "epoch": 0.38, "grad_norm": 8.383330345153809, "learning_rate": 2.4246281464530893e-05, "loss": 1.1353, "step": 1341 }, { "epoch": 0.38, "grad_norm": 10.459028244018555, "learning_rate": 2.4241990846681922e-05, "loss": 1.2926, "step": 1342 }, { "epoch": 0.38, "grad_norm": 8.990187644958496, "learning_rate": 2.4237700228832954e-05, "loss": 1.4097, "step": 1343 }, { "epoch": 0.38, "grad_norm": 11.132209777832031, "learning_rate": 2.423340961098398e-05, "loss": 1.603, "step": 1344 }, { "epoch": 0.38, "grad_norm": 9.05010986328125, "learning_rate": 2.422911899313501e-05, "loss": 1.1662, "step": 1345 }, { "epoch": 0.39, "grad_norm": 9.722626686096191, "learning_rate": 2.4224828375286044e-05, "loss": 1.3975, "step": 1346 }, { "epoch": 0.39, "grad_norm": 11.301931381225586, "learning_rate": 2.4220537757437072e-05, "loss": 1.6098, "step": 1347 }, { "epoch": 0.39, "grad_norm": 8.93022346496582, "learning_rate": 2.42162471395881e-05, "loss": 1.299, "step": 1348 }, { "epoch": 0.39, "grad_norm": 7.965664386749268, "learning_rate": 2.421195652173913e-05, "loss": 1.1455, "step": 1349 }, { "epoch": 0.39, "grad_norm": 8.106415748596191, "learning_rate": 2.4207665903890162e-05, "loss": 1.146, "step": 1350 }, { "epoch": 0.39, "grad_norm": 8.92429256439209, "learning_rate": 2.420337528604119e-05, "loss": 1.5329, "step": 1351 }, { "epoch": 0.39, "grad_norm": 8.482048988342285, "learning_rate": 2.419908466819222e-05, "loss": 1.4407, "step": 1352 }, { "epoch": 0.39, "grad_norm": 7.7144904136657715, "learning_rate": 2.419479405034325e-05, "loss": 1.1205, "step": 1353 }, { "epoch": 0.39, "grad_norm": 9.001365661621094, "learning_rate": 2.419050343249428e-05, "loss": 1.1997, "step": 1354 }, { "epoch": 0.39, "grad_norm": 9.95022201538086, "learning_rate": 2.418621281464531e-05, "loss": 1.3606, "step": 1355 }, { "epoch": 0.39, "grad_norm": 7.322661876678467, "learning_rate": 2.4181922196796338e-05, "loss": 1.0748, "step": 1356 }, { "epoch": 0.39, "grad_norm": 7.953974723815918, "learning_rate": 2.417763157894737e-05, "loss": 1.0571, "step": 1357 }, { "epoch": 0.39, "grad_norm": 9.130672454833984, "learning_rate": 2.41733409610984e-05, "loss": 1.2434, "step": 1358 }, { "epoch": 0.39, "grad_norm": 9.41163444519043, "learning_rate": 2.4169050343249428e-05, "loss": 1.505, "step": 1359 }, { "epoch": 0.39, "grad_norm": 8.805686950683594, "learning_rate": 2.4164759725400457e-05, "loss": 1.3325, "step": 1360 }, { "epoch": 0.39, "grad_norm": 8.738990783691406, "learning_rate": 2.416046910755149e-05, "loss": 1.116, "step": 1361 }, { "epoch": 0.39, "grad_norm": 9.374129295349121, "learning_rate": 2.4156178489702518e-05, "loss": 1.4156, "step": 1362 }, { "epoch": 0.39, "grad_norm": 9.01900863647461, "learning_rate": 2.4151887871853546e-05, "loss": 1.3289, "step": 1363 }, { "epoch": 0.39, "grad_norm": 9.995376586914062, "learning_rate": 2.414759725400458e-05, "loss": 1.7106, "step": 1364 }, { "epoch": 0.39, "grad_norm": 9.745902061462402, "learning_rate": 2.4143306636155607e-05, "loss": 1.6264, "step": 1365 }, { "epoch": 0.39, "grad_norm": 8.528644561767578, "learning_rate": 2.413901601830664e-05, "loss": 1.1799, "step": 1366 }, { "epoch": 0.39, "grad_norm": 8.402424812316895, "learning_rate": 2.4134725400457665e-05, "loss": 1.2642, "step": 1367 }, { "epoch": 0.39, "grad_norm": 9.838375091552734, "learning_rate": 2.4130434782608697e-05, "loss": 1.5943, "step": 1368 }, { "epoch": 0.39, "grad_norm": 9.3024320602417, "learning_rate": 2.4126144164759726e-05, "loss": 1.3896, "step": 1369 }, { "epoch": 0.39, "grad_norm": 9.33349323272705, "learning_rate": 2.4121853546910754e-05, "loss": 1.4866, "step": 1370 }, { "epoch": 0.39, "grad_norm": 9.199357986450195, "learning_rate": 2.4117562929061787e-05, "loss": 1.2041, "step": 1371 }, { "epoch": 0.39, "grad_norm": 8.57052230834961, "learning_rate": 2.4113272311212815e-05, "loss": 1.2684, "step": 1372 }, { "epoch": 0.39, "grad_norm": 9.016642570495605, "learning_rate": 2.4108981693363848e-05, "loss": 0.9034, "step": 1373 }, { "epoch": 0.39, "grad_norm": 7.731619358062744, "learning_rate": 2.4104691075514873e-05, "loss": 0.9991, "step": 1374 }, { "epoch": 0.39, "grad_norm": 8.050274848937988, "learning_rate": 2.4100400457665905e-05, "loss": 1.1917, "step": 1375 }, { "epoch": 0.39, "grad_norm": 9.060944557189941, "learning_rate": 2.4096109839816934e-05, "loss": 1.1226, "step": 1376 }, { "epoch": 0.39, "grad_norm": 9.330954551696777, "learning_rate": 2.4091819221967966e-05, "loss": 1.2132, "step": 1377 }, { "epoch": 0.39, "grad_norm": 8.34937858581543, "learning_rate": 2.408752860411899e-05, "loss": 1.2008, "step": 1378 }, { "epoch": 0.39, "grad_norm": 10.294057846069336, "learning_rate": 2.4083237986270024e-05, "loss": 1.51, "step": 1379 }, { "epoch": 0.39, "grad_norm": 9.819464683532715, "learning_rate": 2.4078947368421056e-05, "loss": 1.3065, "step": 1380 }, { "epoch": 0.4, "grad_norm": 11.210472106933594, "learning_rate": 2.407465675057208e-05, "loss": 1.3412, "step": 1381 }, { "epoch": 0.4, "grad_norm": 9.68340015411377, "learning_rate": 2.4070366132723113e-05, "loss": 1.2003, "step": 1382 }, { "epoch": 0.4, "grad_norm": 9.639593124389648, "learning_rate": 2.4066075514874142e-05, "loss": 1.1033, "step": 1383 }, { "epoch": 0.4, "grad_norm": 8.134466171264648, "learning_rate": 2.4061784897025174e-05, "loss": 1.2797, "step": 1384 }, { "epoch": 0.4, "grad_norm": 9.370076179504395, "learning_rate": 2.40574942791762e-05, "loss": 1.6027, "step": 1385 }, { "epoch": 0.4, "grad_norm": 10.915976524353027, "learning_rate": 2.405320366132723e-05, "loss": 1.5843, "step": 1386 }, { "epoch": 0.4, "grad_norm": 11.412590026855469, "learning_rate": 2.404891304347826e-05, "loss": 1.6253, "step": 1387 }, { "epoch": 0.4, "grad_norm": 8.945582389831543, "learning_rate": 2.4044622425629293e-05, "loss": 1.3744, "step": 1388 }, { "epoch": 0.4, "grad_norm": 9.530177116394043, "learning_rate": 2.404033180778032e-05, "loss": 0.8811, "step": 1389 }, { "epoch": 0.4, "grad_norm": 7.074493885040283, "learning_rate": 2.403604118993135e-05, "loss": 0.9554, "step": 1390 }, { "epoch": 0.4, "grad_norm": 9.388182640075684, "learning_rate": 2.4031750572082382e-05, "loss": 1.1566, "step": 1391 }, { "epoch": 0.4, "grad_norm": 8.897048950195312, "learning_rate": 2.402745995423341e-05, "loss": 1.041, "step": 1392 }, { "epoch": 0.4, "grad_norm": 8.569944381713867, "learning_rate": 2.402316933638444e-05, "loss": 1.4443, "step": 1393 }, { "epoch": 0.4, "grad_norm": 9.350441932678223, "learning_rate": 2.401887871853547e-05, "loss": 1.2276, "step": 1394 }, { "epoch": 0.4, "grad_norm": 8.618452072143555, "learning_rate": 2.40145881006865e-05, "loss": 1.2577, "step": 1395 }, { "epoch": 0.4, "grad_norm": 9.834698677062988, "learning_rate": 2.4010297482837526e-05, "loss": 1.4529, "step": 1396 }, { "epoch": 0.4, "grad_norm": 9.187966346740723, "learning_rate": 2.4006006864988558e-05, "loss": 1.1598, "step": 1397 }, { "epoch": 0.4, "grad_norm": 9.153619766235352, "learning_rate": 2.400171624713959e-05, "loss": 1.5876, "step": 1398 }, { "epoch": 0.4, "grad_norm": 9.023094177246094, "learning_rate": 2.399742562929062e-05, "loss": 1.1627, "step": 1399 }, { "epoch": 0.4, "grad_norm": 9.275665283203125, "learning_rate": 2.3993135011441648e-05, "loss": 1.2592, "step": 1400 }, { "epoch": 0.4, "grad_norm": 8.038787841796875, "learning_rate": 2.3988844393592677e-05, "loss": 1.0274, "step": 1401 }, { "epoch": 0.4, "grad_norm": 9.471991539001465, "learning_rate": 2.398455377574371e-05, "loss": 1.2368, "step": 1402 }, { "epoch": 0.4, "grad_norm": 8.320343017578125, "learning_rate": 2.3980263157894738e-05, "loss": 1.0732, "step": 1403 }, { "epoch": 0.4, "grad_norm": 8.846017837524414, "learning_rate": 2.3975972540045766e-05, "loss": 1.1362, "step": 1404 }, { "epoch": 0.4, "grad_norm": 11.177179336547852, "learning_rate": 2.3971681922196795e-05, "loss": 1.3263, "step": 1405 }, { "epoch": 0.4, "grad_norm": 8.469406127929688, "learning_rate": 2.3967391304347827e-05, "loss": 1.2119, "step": 1406 }, { "epoch": 0.4, "grad_norm": 9.226251602172852, "learning_rate": 2.3963100686498856e-05, "loss": 1.5254, "step": 1407 }, { "epoch": 0.4, "grad_norm": 9.160237312316895, "learning_rate": 2.3958810068649885e-05, "loss": 1.2009, "step": 1408 }, { "epoch": 0.4, "grad_norm": 9.037147521972656, "learning_rate": 2.3954519450800917e-05, "loss": 1.2019, "step": 1409 }, { "epoch": 0.4, "grad_norm": 8.3545560836792, "learning_rate": 2.3950228832951946e-05, "loss": 1.0934, "step": 1410 }, { "epoch": 0.4, "grad_norm": 7.292141437530518, "learning_rate": 2.3945938215102975e-05, "loss": 1.057, "step": 1411 }, { "epoch": 0.4, "grad_norm": 8.759928703308105, "learning_rate": 2.3941647597254003e-05, "loss": 1.2076, "step": 1412 }, { "epoch": 0.4, "grad_norm": 8.506291389465332, "learning_rate": 2.3937356979405036e-05, "loss": 1.3331, "step": 1413 }, { "epoch": 0.4, "grad_norm": 9.5624418258667, "learning_rate": 2.3933066361556068e-05, "loss": 1.3577, "step": 1414 }, { "epoch": 0.4, "grad_norm": 7.361660003662109, "learning_rate": 2.3928775743707093e-05, "loss": 0.9031, "step": 1415 }, { "epoch": 0.41, "grad_norm": 8.643257141113281, "learning_rate": 2.3924485125858125e-05, "loss": 1.2242, "step": 1416 }, { "epoch": 0.41, "grad_norm": 8.304159164428711, "learning_rate": 2.3920194508009154e-05, "loss": 1.1468, "step": 1417 }, { "epoch": 0.41, "grad_norm": 9.026140213012695, "learning_rate": 2.3915903890160186e-05, "loss": 1.1251, "step": 1418 }, { "epoch": 0.41, "grad_norm": 8.157430648803711, "learning_rate": 2.391161327231121e-05, "loss": 1.2867, "step": 1419 }, { "epoch": 0.41, "grad_norm": 8.46143627166748, "learning_rate": 2.3907322654462244e-05, "loss": 1.1906, "step": 1420 }, { "epoch": 0.41, "grad_norm": 8.104385375976562, "learning_rate": 2.3903032036613272e-05, "loss": 1.0685, "step": 1421 }, { "epoch": 0.41, "grad_norm": 7.670660018920898, "learning_rate": 2.38987414187643e-05, "loss": 1.0857, "step": 1422 }, { "epoch": 0.41, "grad_norm": 8.949851989746094, "learning_rate": 2.3894450800915333e-05, "loss": 1.1966, "step": 1423 }, { "epoch": 0.41, "grad_norm": 8.842041015625, "learning_rate": 2.3890160183066362e-05, "loss": 1.4296, "step": 1424 }, { "epoch": 0.41, "grad_norm": 9.33974838256836, "learning_rate": 2.3885869565217394e-05, "loss": 1.6068, "step": 1425 }, { "epoch": 0.41, "grad_norm": 8.253230094909668, "learning_rate": 2.388157894736842e-05, "loss": 1.0953, "step": 1426 }, { "epoch": 0.41, "grad_norm": 10.492298126220703, "learning_rate": 2.3877288329519452e-05, "loss": 1.6289, "step": 1427 }, { "epoch": 0.41, "grad_norm": 8.839768409729004, "learning_rate": 2.387299771167048e-05, "loss": 1.2073, "step": 1428 }, { "epoch": 0.41, "grad_norm": 9.604236602783203, "learning_rate": 2.3868707093821513e-05, "loss": 1.5861, "step": 1429 }, { "epoch": 0.41, "grad_norm": 9.197074890136719, "learning_rate": 2.3864416475972538e-05, "loss": 1.3949, "step": 1430 }, { "epoch": 0.41, "grad_norm": 7.777657985687256, "learning_rate": 2.386012585812357e-05, "loss": 1.1687, "step": 1431 }, { "epoch": 0.41, "grad_norm": 8.555583953857422, "learning_rate": 2.3855835240274602e-05, "loss": 1.2248, "step": 1432 }, { "epoch": 0.41, "grad_norm": 8.336581230163574, "learning_rate": 2.3851544622425628e-05, "loss": 1.2914, "step": 1433 }, { "epoch": 0.41, "grad_norm": 8.059797286987305, "learning_rate": 2.384725400457666e-05, "loss": 1.1301, "step": 1434 }, { "epoch": 0.41, "grad_norm": 8.82107925415039, "learning_rate": 2.384296338672769e-05, "loss": 1.3305, "step": 1435 }, { "epoch": 0.41, "grad_norm": 6.868978023529053, "learning_rate": 2.383867276887872e-05, "loss": 1.1569, "step": 1436 }, { "epoch": 0.41, "grad_norm": 7.062217712402344, "learning_rate": 2.3834382151029746e-05, "loss": 1.2047, "step": 1437 }, { "epoch": 0.41, "grad_norm": 9.001330375671387, "learning_rate": 2.383009153318078e-05, "loss": 1.5471, "step": 1438 }, { "epoch": 0.41, "grad_norm": 9.982261657714844, "learning_rate": 2.3825800915331807e-05, "loss": 1.5064, "step": 1439 }, { "epoch": 0.41, "grad_norm": 9.749502182006836, "learning_rate": 2.382151029748284e-05, "loss": 1.3297, "step": 1440 }, { "epoch": 0.41, "grad_norm": 9.250558853149414, "learning_rate": 2.3817219679633868e-05, "loss": 1.2343, "step": 1441 }, { "epoch": 0.41, "grad_norm": 9.083165168762207, "learning_rate": 2.3812929061784897e-05, "loss": 1.1752, "step": 1442 }, { "epoch": 0.41, "grad_norm": 9.45124626159668, "learning_rate": 2.380863844393593e-05, "loss": 1.3145, "step": 1443 }, { "epoch": 0.41, "grad_norm": 7.276574611663818, "learning_rate": 2.3804347826086958e-05, "loss": 1.0405, "step": 1444 }, { "epoch": 0.41, "grad_norm": 9.243409156799316, "learning_rate": 2.3800057208237987e-05, "loss": 1.2529, "step": 1445 }, { "epoch": 0.41, "grad_norm": 8.41983699798584, "learning_rate": 2.3795766590389015e-05, "loss": 1.2946, "step": 1446 }, { "epoch": 0.41, "grad_norm": 8.295051574707031, "learning_rate": 2.3791475972540048e-05, "loss": 1.2942, "step": 1447 }, { "epoch": 0.41, "grad_norm": 8.93205738067627, "learning_rate": 2.3787185354691073e-05, "loss": 1.3353, "step": 1448 }, { "epoch": 0.41, "grad_norm": 9.91295051574707, "learning_rate": 2.3782894736842105e-05, "loss": 1.3193, "step": 1449 }, { "epoch": 0.41, "grad_norm": 8.965539932250977, "learning_rate": 2.3778604118993137e-05, "loss": 1.2203, "step": 1450 }, { "epoch": 0.42, "grad_norm": 8.239827156066895, "learning_rate": 2.3774313501144166e-05, "loss": 1.2644, "step": 1451 }, { "epoch": 0.42, "grad_norm": 9.137190818786621, "learning_rate": 2.3770022883295195e-05, "loss": 1.3303, "step": 1452 }, { "epoch": 0.42, "grad_norm": 9.793137550354004, "learning_rate": 2.3765732265446224e-05, "loss": 1.3811, "step": 1453 }, { "epoch": 0.42, "grad_norm": 9.519445419311523, "learning_rate": 2.3761441647597256e-05, "loss": 1.4462, "step": 1454 }, { "epoch": 0.42, "grad_norm": 8.207239151000977, "learning_rate": 2.3757151029748284e-05, "loss": 1.3354, "step": 1455 }, { "epoch": 0.42, "grad_norm": 9.833036422729492, "learning_rate": 2.3752860411899313e-05, "loss": 1.5372, "step": 1456 }, { "epoch": 0.42, "grad_norm": 10.283196449279785, "learning_rate": 2.3748569794050345e-05, "loss": 1.1521, "step": 1457 }, { "epoch": 0.42, "grad_norm": 9.274222373962402, "learning_rate": 2.3744279176201374e-05, "loss": 1.4175, "step": 1458 }, { "epoch": 0.42, "grad_norm": 9.186198234558105, "learning_rate": 2.3739988558352403e-05, "loss": 1.3154, "step": 1459 }, { "epoch": 0.42, "grad_norm": 8.352169036865234, "learning_rate": 2.373569794050343e-05, "loss": 1.2017, "step": 1460 }, { "epoch": 0.42, "grad_norm": 8.348889350891113, "learning_rate": 2.3731407322654464e-05, "loss": 1.2981, "step": 1461 }, { "epoch": 0.42, "grad_norm": 8.70347785949707, "learning_rate": 2.3727116704805493e-05, "loss": 1.5687, "step": 1462 }, { "epoch": 0.42, "grad_norm": 8.238858222961426, "learning_rate": 2.372282608695652e-05, "loss": 1.2132, "step": 1463 }, { "epoch": 0.42, "grad_norm": 8.277292251586914, "learning_rate": 2.371853546910755e-05, "loss": 1.0626, "step": 1464 }, { "epoch": 0.42, "grad_norm": 9.16200065612793, "learning_rate": 2.3714244851258582e-05, "loss": 1.5139, "step": 1465 }, { "epoch": 0.42, "grad_norm": 7.976869583129883, "learning_rate": 2.3709954233409614e-05, "loss": 1.353, "step": 1466 }, { "epoch": 0.42, "grad_norm": 8.776309967041016, "learning_rate": 2.370566361556064e-05, "loss": 1.3921, "step": 1467 }, { "epoch": 0.42, "grad_norm": 8.482604026794434, "learning_rate": 2.3701372997711672e-05, "loss": 1.5642, "step": 1468 }, { "epoch": 0.42, "grad_norm": 8.380767822265625, "learning_rate": 2.36970823798627e-05, "loss": 1.3062, "step": 1469 }, { "epoch": 0.42, "grad_norm": 8.498199462890625, "learning_rate": 2.3692791762013733e-05, "loss": 1.4111, "step": 1470 }, { "epoch": 0.42, "grad_norm": 9.320439338684082, "learning_rate": 2.3688501144164758e-05, "loss": 1.3205, "step": 1471 }, { "epoch": 0.42, "grad_norm": 9.131627082824707, "learning_rate": 2.368421052631579e-05, "loss": 1.4386, "step": 1472 }, { "epoch": 0.42, "grad_norm": 7.26507043838501, "learning_rate": 2.367991990846682e-05, "loss": 1.2513, "step": 1473 }, { "epoch": 0.42, "grad_norm": 8.468502044677734, "learning_rate": 2.3675629290617848e-05, "loss": 1.0528, "step": 1474 }, { "epoch": 0.42, "grad_norm": 9.420333862304688, "learning_rate": 2.367133867276888e-05, "loss": 1.3885, "step": 1475 }, { "epoch": 0.42, "grad_norm": 8.6135892868042, "learning_rate": 2.366704805491991e-05, "loss": 1.2761, "step": 1476 }, { "epoch": 0.42, "grad_norm": 7.293272972106934, "learning_rate": 2.366275743707094e-05, "loss": 0.8257, "step": 1477 }, { "epoch": 0.42, "grad_norm": 7.460020065307617, "learning_rate": 2.3658466819221966e-05, "loss": 1.0788, "step": 1478 }, { "epoch": 0.42, "grad_norm": 9.232573509216309, "learning_rate": 2.3654176201373e-05, "loss": 1.4714, "step": 1479 }, { "epoch": 0.42, "grad_norm": 8.258026123046875, "learning_rate": 2.3649885583524027e-05, "loss": 1.1303, "step": 1480 }, { "epoch": 0.42, "grad_norm": 10.65665340423584, "learning_rate": 2.364559496567506e-05, "loss": 1.3665, "step": 1481 }, { "epoch": 0.42, "grad_norm": 11.075395584106445, "learning_rate": 2.3641304347826085e-05, "loss": 1.1579, "step": 1482 }, { "epoch": 0.42, "grad_norm": 9.51037311553955, "learning_rate": 2.3637013729977117e-05, "loss": 1.1477, "step": 1483 }, { "epoch": 0.42, "grad_norm": 8.741596221923828, "learning_rate": 2.363272311212815e-05, "loss": 1.14, "step": 1484 }, { "epoch": 0.42, "grad_norm": 11.098268508911133, "learning_rate": 2.3628432494279178e-05, "loss": 1.4584, "step": 1485 }, { "epoch": 0.43, "grad_norm": 8.322772026062012, "learning_rate": 2.3624141876430207e-05, "loss": 1.26, "step": 1486 }, { "epoch": 0.43, "grad_norm": 8.70244312286377, "learning_rate": 2.3619851258581236e-05, "loss": 1.3188, "step": 1487 }, { "epoch": 0.43, "grad_norm": 9.393972396850586, "learning_rate": 2.3615560640732268e-05, "loss": 1.3285, "step": 1488 }, { "epoch": 0.43, "grad_norm": 9.17244815826416, "learning_rate": 2.3611270022883293e-05, "loss": 1.2333, "step": 1489 }, { "epoch": 0.43, "grad_norm": 8.737652778625488, "learning_rate": 2.3606979405034325e-05, "loss": 1.4713, "step": 1490 }, { "epoch": 0.43, "grad_norm": 9.080338478088379, "learning_rate": 2.3602688787185357e-05, "loss": 1.0657, "step": 1491 }, { "epoch": 0.43, "grad_norm": 9.210623741149902, "learning_rate": 2.3598398169336386e-05, "loss": 1.0281, "step": 1492 }, { "epoch": 0.43, "grad_norm": 8.44229507446289, "learning_rate": 2.3594107551487415e-05, "loss": 1.3468, "step": 1493 }, { "epoch": 0.43, "grad_norm": 9.192740440368652, "learning_rate": 2.3589816933638444e-05, "loss": 1.4332, "step": 1494 }, { "epoch": 0.43, "grad_norm": 9.328465461730957, "learning_rate": 2.3585526315789476e-05, "loss": 1.2152, "step": 1495 }, { "epoch": 0.43, "grad_norm": 8.111274719238281, "learning_rate": 2.3581235697940505e-05, "loss": 1.1548, "step": 1496 }, { "epoch": 0.43, "grad_norm": 8.120012283325195, "learning_rate": 2.3576945080091533e-05, "loss": 1.2525, "step": 1497 }, { "epoch": 0.43, "grad_norm": 8.653223991394043, "learning_rate": 2.3572654462242562e-05, "loss": 1.5593, "step": 1498 }, { "epoch": 0.43, "grad_norm": 8.163555145263672, "learning_rate": 2.3568363844393594e-05, "loss": 1.1074, "step": 1499 }, { "epoch": 0.43, "grad_norm": 8.843320846557617, "learning_rate": 2.3564073226544623e-05, "loss": 1.6794, "step": 1500 }, { "epoch": 0.43, "grad_norm": 8.1924409866333, "learning_rate": 2.3559782608695652e-05, "loss": 1.1617, "step": 1501 }, { "epoch": 0.43, "grad_norm": 8.335710525512695, "learning_rate": 2.3555491990846684e-05, "loss": 1.2226, "step": 1502 }, { "epoch": 0.43, "grad_norm": 8.478708267211914, "learning_rate": 2.3551201372997713e-05, "loss": 1.3615, "step": 1503 }, { "epoch": 0.43, "grad_norm": 8.279967308044434, "learning_rate": 2.354691075514874e-05, "loss": 1.2645, "step": 1504 }, { "epoch": 0.43, "grad_norm": 8.7472562789917, "learning_rate": 2.354262013729977e-05, "loss": 1.4243, "step": 1505 }, { "epoch": 0.43, "grad_norm": 10.629935264587402, "learning_rate": 2.3538329519450802e-05, "loss": 1.7176, "step": 1506 }, { "epoch": 0.43, "grad_norm": 9.287959098815918, "learning_rate": 2.353403890160183e-05, "loss": 1.2294, "step": 1507 }, { "epoch": 0.43, "grad_norm": 8.736032485961914, "learning_rate": 2.352974828375286e-05, "loss": 1.3285, "step": 1508 }, { "epoch": 0.43, "grad_norm": 7.25554895401001, "learning_rate": 2.3525457665903892e-05, "loss": 1.0597, "step": 1509 }, { "epoch": 0.43, "grad_norm": 8.317910194396973, "learning_rate": 2.352116704805492e-05, "loss": 1.2206, "step": 1510 }, { "epoch": 0.43, "grad_norm": 7.878781318664551, "learning_rate": 2.3516876430205953e-05, "loss": 1.3455, "step": 1511 }, { "epoch": 0.43, "grad_norm": 8.274806022644043, "learning_rate": 2.351258581235698e-05, "loss": 1.1493, "step": 1512 }, { "epoch": 0.43, "grad_norm": 8.328680038452148, "learning_rate": 2.350829519450801e-05, "loss": 1.0924, "step": 1513 }, { "epoch": 0.43, "grad_norm": 9.172496795654297, "learning_rate": 2.350400457665904e-05, "loss": 1.3245, "step": 1514 }, { "epoch": 0.43, "grad_norm": 7.963389873504639, "learning_rate": 2.3499713958810068e-05, "loss": 1.1043, "step": 1515 }, { "epoch": 0.43, "grad_norm": 9.72868537902832, "learning_rate": 2.3495423340961097e-05, "loss": 1.3213, "step": 1516 }, { "epoch": 0.43, "grad_norm": 11.010828018188477, "learning_rate": 2.349113272311213e-05, "loss": 1.3094, "step": 1517 }, { "epoch": 0.43, "grad_norm": 9.427931785583496, "learning_rate": 2.348684210526316e-05, "loss": 1.4155, "step": 1518 }, { "epoch": 0.43, "grad_norm": 10.318192481994629, "learning_rate": 2.3482551487414187e-05, "loss": 1.3738, "step": 1519 }, { "epoch": 0.43, "grad_norm": 8.963619232177734, "learning_rate": 2.347826086956522e-05, "loss": 1.1076, "step": 1520 }, { "epoch": 0.44, "grad_norm": 8.685846328735352, "learning_rate": 2.3473970251716247e-05, "loss": 1.2785, "step": 1521 }, { "epoch": 0.44, "grad_norm": 9.562762260437012, "learning_rate": 2.346967963386728e-05, "loss": 1.2463, "step": 1522 }, { "epoch": 0.44, "grad_norm": 8.336532592773438, "learning_rate": 2.3465389016018305e-05, "loss": 1.1253, "step": 1523 }, { "epoch": 0.44, "grad_norm": 8.443828582763672, "learning_rate": 2.3461098398169337e-05, "loss": 1.0381, "step": 1524 }, { "epoch": 0.44, "grad_norm": 9.334905624389648, "learning_rate": 2.3456807780320366e-05, "loss": 1.1564, "step": 1525 }, { "epoch": 0.44, "grad_norm": 9.118339538574219, "learning_rate": 2.3452517162471395e-05, "loss": 1.3133, "step": 1526 }, { "epoch": 0.44, "grad_norm": 9.523225784301758, "learning_rate": 2.3448226544622427e-05, "loss": 1.266, "step": 1527 }, { "epoch": 0.44, "grad_norm": 10.204797744750977, "learning_rate": 2.3443935926773456e-05, "loss": 1.5447, "step": 1528 }, { "epoch": 0.44, "grad_norm": 8.629857063293457, "learning_rate": 2.3439645308924488e-05, "loss": 1.2232, "step": 1529 }, { "epoch": 0.44, "grad_norm": 8.885933876037598, "learning_rate": 2.3435354691075513e-05, "loss": 1.245, "step": 1530 }, { "epoch": 0.44, "grad_norm": 8.119367599487305, "learning_rate": 2.3431064073226545e-05, "loss": 1.1223, "step": 1531 }, { "epoch": 0.44, "grad_norm": 8.569046020507812, "learning_rate": 2.3426773455377574e-05, "loss": 1.3702, "step": 1532 }, { "epoch": 0.44, "grad_norm": 9.757391929626465, "learning_rate": 2.3422482837528606e-05, "loss": 1.8398, "step": 1533 }, { "epoch": 0.44, "grad_norm": 7.991138935089111, "learning_rate": 2.3418192219679635e-05, "loss": 1.3525, "step": 1534 }, { "epoch": 0.44, "grad_norm": 9.762513160705566, "learning_rate": 2.3413901601830664e-05, "loss": 1.3359, "step": 1535 }, { "epoch": 0.44, "grad_norm": 10.335434913635254, "learning_rate": 2.3409610983981696e-05, "loss": 1.1716, "step": 1536 }, { "epoch": 0.44, "grad_norm": 8.94473934173584, "learning_rate": 2.3405320366132725e-05, "loss": 1.079, "step": 1537 }, { "epoch": 0.44, "grad_norm": 8.816618919372559, "learning_rate": 2.3401029748283753e-05, "loss": 1.1536, "step": 1538 }, { "epoch": 0.44, "grad_norm": 9.002206802368164, "learning_rate": 2.3396739130434782e-05, "loss": 1.268, "step": 1539 }, { "epoch": 0.44, "grad_norm": 8.662298202514648, "learning_rate": 2.3392448512585814e-05, "loss": 1.3461, "step": 1540 }, { "epoch": 0.44, "grad_norm": 10.412079811096191, "learning_rate": 2.338815789473684e-05, "loss": 1.4632, "step": 1541 }, { "epoch": 0.44, "grad_norm": 8.586483001708984, "learning_rate": 2.3383867276887872e-05, "loss": 1.5167, "step": 1542 }, { "epoch": 0.44, "grad_norm": 8.466395378112793, "learning_rate": 2.3379576659038904e-05, "loss": 1.1714, "step": 1543 }, { "epoch": 0.44, "grad_norm": 8.513823509216309, "learning_rate": 2.3375286041189933e-05, "loss": 1.2773, "step": 1544 }, { "epoch": 0.44, "grad_norm": 8.646080017089844, "learning_rate": 2.337099542334096e-05, "loss": 1.1845, "step": 1545 }, { "epoch": 0.44, "grad_norm": 9.359091758728027, "learning_rate": 2.336670480549199e-05, "loss": 1.3177, "step": 1546 }, { "epoch": 0.44, "grad_norm": 8.305411338806152, "learning_rate": 2.3362414187643023e-05, "loss": 1.159, "step": 1547 }, { "epoch": 0.44, "grad_norm": 10.000143051147461, "learning_rate": 2.335812356979405e-05, "loss": 1.1044, "step": 1548 }, { "epoch": 0.44, "grad_norm": 9.8117094039917, "learning_rate": 2.335383295194508e-05, "loss": 1.4982, "step": 1549 }, { "epoch": 0.44, "grad_norm": 9.323227882385254, "learning_rate": 2.334954233409611e-05, "loss": 1.4575, "step": 1550 }, { "epoch": 0.44, "grad_norm": 8.354408264160156, "learning_rate": 2.334525171624714e-05, "loss": 1.2588, "step": 1551 }, { "epoch": 0.44, "grad_norm": 8.50368881225586, "learning_rate": 2.334096109839817e-05, "loss": 1.0109, "step": 1552 }, { "epoch": 0.44, "grad_norm": 10.090970039367676, "learning_rate": 2.33366704805492e-05, "loss": 1.4676, "step": 1553 }, { "epoch": 0.44, "grad_norm": 9.197575569152832, "learning_rate": 2.333237986270023e-05, "loss": 1.4296, "step": 1554 }, { "epoch": 0.44, "grad_norm": 8.461068153381348, "learning_rate": 2.332808924485126e-05, "loss": 1.3344, "step": 1555 }, { "epoch": 0.45, "grad_norm": 7.606350421905518, "learning_rate": 2.3323798627002288e-05, "loss": 1.157, "step": 1556 }, { "epoch": 0.45, "grad_norm": 8.888628005981445, "learning_rate": 2.3319508009153317e-05, "loss": 1.1213, "step": 1557 }, { "epoch": 0.45, "grad_norm": 8.940152168273926, "learning_rate": 2.331521739130435e-05, "loss": 1.3451, "step": 1558 }, { "epoch": 0.45, "grad_norm": 8.469507217407227, "learning_rate": 2.3310926773455378e-05, "loss": 1.1574, "step": 1559 }, { "epoch": 0.45, "grad_norm": 8.728058815002441, "learning_rate": 2.3306636155606407e-05, "loss": 1.2855, "step": 1560 }, { "epoch": 0.45, "grad_norm": 8.956009864807129, "learning_rate": 2.330234553775744e-05, "loss": 0.8661, "step": 1561 }, { "epoch": 0.45, "grad_norm": 8.254283905029297, "learning_rate": 2.3298054919908468e-05, "loss": 1.4645, "step": 1562 }, { "epoch": 0.45, "grad_norm": 9.068936347961426, "learning_rate": 2.32937643020595e-05, "loss": 1.2349, "step": 1563 }, { "epoch": 0.45, "grad_norm": 10.539466857910156, "learning_rate": 2.3289473684210525e-05, "loss": 1.2601, "step": 1564 }, { "epoch": 0.45, "grad_norm": 8.409708023071289, "learning_rate": 2.3285183066361557e-05, "loss": 1.0722, "step": 1565 }, { "epoch": 0.45, "grad_norm": 8.638346672058105, "learning_rate": 2.3280892448512586e-05, "loss": 1.2543, "step": 1566 }, { "epoch": 0.45, "grad_norm": 9.373037338256836, "learning_rate": 2.3276601830663615e-05, "loss": 1.4724, "step": 1567 }, { "epoch": 0.45, "grad_norm": 8.4254789352417, "learning_rate": 2.3272311212814644e-05, "loss": 1.3384, "step": 1568 }, { "epoch": 0.45, "grad_norm": 8.436665534973145, "learning_rate": 2.3268020594965676e-05, "loss": 1.5132, "step": 1569 }, { "epoch": 0.45, "grad_norm": 8.459665298461914, "learning_rate": 2.3263729977116708e-05, "loss": 1.1472, "step": 1570 }, { "epoch": 0.45, "grad_norm": 9.826032638549805, "learning_rate": 2.3259439359267733e-05, "loss": 1.2903, "step": 1571 }, { "epoch": 0.45, "grad_norm": 9.004020690917969, "learning_rate": 2.3255148741418765e-05, "loss": 1.4222, "step": 1572 }, { "epoch": 0.45, "grad_norm": 8.417764663696289, "learning_rate": 2.3250858123569794e-05, "loss": 1.1339, "step": 1573 }, { "epoch": 0.45, "grad_norm": 9.473125457763672, "learning_rate": 2.3246567505720826e-05, "loss": 1.3293, "step": 1574 }, { "epoch": 0.45, "grad_norm": 7.595772743225098, "learning_rate": 2.3242276887871852e-05, "loss": 1.0717, "step": 1575 }, { "epoch": 0.45, "grad_norm": 7.648061275482178, "learning_rate": 2.3237986270022884e-05, "loss": 0.9319, "step": 1576 }, { "epoch": 0.45, "grad_norm": 10.227629661560059, "learning_rate": 2.3233695652173916e-05, "loss": 1.2729, "step": 1577 }, { "epoch": 0.45, "grad_norm": 9.110548973083496, "learning_rate": 2.322940503432494e-05, "loss": 1.2656, "step": 1578 }, { "epoch": 0.45, "grad_norm": 8.993183135986328, "learning_rate": 2.3225114416475974e-05, "loss": 1.2294, "step": 1579 }, { "epoch": 0.45, "grad_norm": 7.090939044952393, "learning_rate": 2.3220823798627002e-05, "loss": 1.021, "step": 1580 }, { "epoch": 0.45, "grad_norm": 8.64580249786377, "learning_rate": 2.3216533180778035e-05, "loss": 1.2793, "step": 1581 }, { "epoch": 0.45, "grad_norm": 9.000699996948242, "learning_rate": 2.321224256292906e-05, "loss": 1.3211, "step": 1582 }, { "epoch": 0.45, "grad_norm": 7.053379535675049, "learning_rate": 2.3207951945080092e-05, "loss": 1.1912, "step": 1583 }, { "epoch": 0.45, "grad_norm": 10.279417991638184, "learning_rate": 2.320366132723112e-05, "loss": 1.4488, "step": 1584 }, { "epoch": 0.45, "grad_norm": 8.077784538269043, "learning_rate": 2.3199370709382153e-05, "loss": 1.1453, "step": 1585 }, { "epoch": 0.45, "grad_norm": 9.773016929626465, "learning_rate": 2.3195080091533182e-05, "loss": 1.1198, "step": 1586 }, { "epoch": 0.45, "grad_norm": 9.63460636138916, "learning_rate": 2.319078947368421e-05, "loss": 1.0544, "step": 1587 }, { "epoch": 0.45, "grad_norm": 9.71286678314209, "learning_rate": 2.3186498855835243e-05, "loss": 1.1635, "step": 1588 }, { "epoch": 0.45, "grad_norm": 9.21727466583252, "learning_rate": 2.318220823798627e-05, "loss": 1.38, "step": 1589 }, { "epoch": 0.45, "grad_norm": 9.806991577148438, "learning_rate": 2.31779176201373e-05, "loss": 1.1062, "step": 1590 }, { "epoch": 0.46, "grad_norm": 9.155527114868164, "learning_rate": 2.317362700228833e-05, "loss": 1.2349, "step": 1591 }, { "epoch": 0.46, "grad_norm": 7.316340923309326, "learning_rate": 2.316933638443936e-05, "loss": 0.7638, "step": 1592 }, { "epoch": 0.46, "grad_norm": 9.659943580627441, "learning_rate": 2.3165045766590387e-05, "loss": 1.1853, "step": 1593 }, { "epoch": 0.46, "grad_norm": 9.813520431518555, "learning_rate": 2.316075514874142e-05, "loss": 1.4322, "step": 1594 }, { "epoch": 0.46, "grad_norm": 7.801672458648682, "learning_rate": 2.315646453089245e-05, "loss": 1.1196, "step": 1595 }, { "epoch": 0.46, "grad_norm": 9.470723152160645, "learning_rate": 2.315217391304348e-05, "loss": 1.3101, "step": 1596 }, { "epoch": 0.46, "grad_norm": 8.638633728027344, "learning_rate": 2.314788329519451e-05, "loss": 1.0461, "step": 1597 }, { "epoch": 0.46, "grad_norm": 9.282573699951172, "learning_rate": 2.3143592677345537e-05, "loss": 1.2249, "step": 1598 }, { "epoch": 0.46, "grad_norm": 12.078672409057617, "learning_rate": 2.313930205949657e-05, "loss": 1.575, "step": 1599 }, { "epoch": 0.46, "grad_norm": 9.369669914245605, "learning_rate": 2.3135011441647598e-05, "loss": 1.1217, "step": 1600 }, { "epoch": 0.46, "grad_norm": 10.193273544311523, "learning_rate": 2.3130720823798627e-05, "loss": 1.385, "step": 1601 }, { "epoch": 0.46, "grad_norm": 8.902130126953125, "learning_rate": 2.3126430205949656e-05, "loss": 1.3013, "step": 1602 }, { "epoch": 0.46, "grad_norm": 9.053253173828125, "learning_rate": 2.3122139588100688e-05, "loss": 1.3588, "step": 1603 }, { "epoch": 0.46, "grad_norm": 9.413948059082031, "learning_rate": 2.3117848970251717e-05, "loss": 1.1728, "step": 1604 }, { "epoch": 0.46, "grad_norm": 10.024253845214844, "learning_rate": 2.3113558352402745e-05, "loss": 1.3468, "step": 1605 }, { "epoch": 0.46, "grad_norm": 6.907523155212402, "learning_rate": 2.3109267734553777e-05, "loss": 0.9856, "step": 1606 }, { "epoch": 0.46, "grad_norm": 8.771257400512695, "learning_rate": 2.3104977116704806e-05, "loss": 1.0424, "step": 1607 }, { "epoch": 0.46, "grad_norm": 10.07789421081543, "learning_rate": 2.3100686498855835e-05, "loss": 1.3329, "step": 1608 }, { "epoch": 0.46, "grad_norm": 7.559316635131836, "learning_rate": 2.3096395881006864e-05, "loss": 1.291, "step": 1609 }, { "epoch": 0.46, "grad_norm": 8.47673225402832, "learning_rate": 2.3092105263157896e-05, "loss": 1.2689, "step": 1610 }, { "epoch": 0.46, "grad_norm": 7.403980731964111, "learning_rate": 2.3087814645308928e-05, "loss": 1.046, "step": 1611 }, { "epoch": 0.46, "grad_norm": 8.600950241088867, "learning_rate": 2.3083524027459953e-05, "loss": 1.0703, "step": 1612 }, { "epoch": 0.46, "grad_norm": 8.569875717163086, "learning_rate": 2.3079233409610986e-05, "loss": 1.1255, "step": 1613 }, { "epoch": 0.46, "grad_norm": 10.12997055053711, "learning_rate": 2.3074942791762014e-05, "loss": 1.2578, "step": 1614 }, { "epoch": 0.46, "grad_norm": 10.162885665893555, "learning_rate": 2.3070652173913047e-05, "loss": 1.234, "step": 1615 }, { "epoch": 0.46, "grad_norm": 7.93971061706543, "learning_rate": 2.3066361556064072e-05, "loss": 0.9553, "step": 1616 }, { "epoch": 0.46, "grad_norm": 9.405335426330566, "learning_rate": 2.3062070938215104e-05, "loss": 1.3859, "step": 1617 }, { "epoch": 0.46, "grad_norm": 8.421310424804688, "learning_rate": 2.3057780320366133e-05, "loss": 1.0926, "step": 1618 }, { "epoch": 0.46, "grad_norm": 8.382375717163086, "learning_rate": 2.305348970251716e-05, "loss": 0.9399, "step": 1619 }, { "epoch": 0.46, "grad_norm": 9.15948486328125, "learning_rate": 2.3049199084668194e-05, "loss": 1.4505, "step": 1620 }, { "epoch": 0.46, "grad_norm": 9.814046859741211, "learning_rate": 2.3044908466819223e-05, "loss": 0.9856, "step": 1621 }, { "epoch": 0.46, "grad_norm": 9.627074241638184, "learning_rate": 2.3040617848970255e-05, "loss": 1.349, "step": 1622 }, { "epoch": 0.46, "grad_norm": 9.516486167907715, "learning_rate": 2.303632723112128e-05, "loss": 1.1915, "step": 1623 }, { "epoch": 0.46, "grad_norm": 8.72075080871582, "learning_rate": 2.3032036613272312e-05, "loss": 1.2033, "step": 1624 }, { "epoch": 0.46, "grad_norm": 11.18325138092041, "learning_rate": 2.302774599542334e-05, "loss": 1.2994, "step": 1625 }, { "epoch": 0.47, "grad_norm": 8.222212791442871, "learning_rate": 2.3023455377574373e-05, "loss": 1.2269, "step": 1626 }, { "epoch": 0.47, "grad_norm": 9.96588134765625, "learning_rate": 2.30191647597254e-05, "loss": 1.5958, "step": 1627 }, { "epoch": 0.47, "grad_norm": 8.952670097351074, "learning_rate": 2.301487414187643e-05, "loss": 1.1666, "step": 1628 }, { "epoch": 0.47, "grad_norm": 9.964648246765137, "learning_rate": 2.3010583524027463e-05, "loss": 1.2652, "step": 1629 }, { "epoch": 0.47, "grad_norm": 8.549927711486816, "learning_rate": 2.3006292906178488e-05, "loss": 1.2339, "step": 1630 }, { "epoch": 0.47, "grad_norm": 7.86664342880249, "learning_rate": 2.300200228832952e-05, "loss": 1.2895, "step": 1631 }, { "epoch": 0.47, "grad_norm": 8.924127578735352, "learning_rate": 2.299771167048055e-05, "loss": 1.2733, "step": 1632 }, { "epoch": 0.47, "grad_norm": 8.034481048583984, "learning_rate": 2.299342105263158e-05, "loss": 1.2594, "step": 1633 }, { "epoch": 0.47, "grad_norm": 7.503042221069336, "learning_rate": 2.2989130434782607e-05, "loss": 1.1939, "step": 1634 }, { "epoch": 0.47, "grad_norm": 7.935110092163086, "learning_rate": 2.298483981693364e-05, "loss": 1.0546, "step": 1635 }, { "epoch": 0.47, "grad_norm": 8.795965194702148, "learning_rate": 2.2980549199084668e-05, "loss": 1.2768, "step": 1636 }, { "epoch": 0.47, "grad_norm": 8.024333953857422, "learning_rate": 2.29762585812357e-05, "loss": 1.0955, "step": 1637 }, { "epoch": 0.47, "grad_norm": 9.787989616394043, "learning_rate": 2.297196796338673e-05, "loss": 1.6545, "step": 1638 }, { "epoch": 0.47, "grad_norm": 7.520974159240723, "learning_rate": 2.2967677345537757e-05, "loss": 0.9123, "step": 1639 }, { "epoch": 0.47, "grad_norm": 9.345480918884277, "learning_rate": 2.296338672768879e-05, "loss": 1.2796, "step": 1640 }, { "epoch": 0.47, "grad_norm": 7.186126232147217, "learning_rate": 2.2959096109839818e-05, "loss": 0.8431, "step": 1641 }, { "epoch": 0.47, "grad_norm": 9.361184120178223, "learning_rate": 2.2954805491990847e-05, "loss": 1.4357, "step": 1642 }, { "epoch": 0.47, "grad_norm": 8.415900230407715, "learning_rate": 2.2950514874141876e-05, "loss": 1.1054, "step": 1643 }, { "epoch": 0.47, "grad_norm": 9.558908462524414, "learning_rate": 2.2946224256292908e-05, "loss": 1.2839, "step": 1644 }, { "epoch": 0.47, "grad_norm": 9.749784469604492, "learning_rate": 2.2941933638443933e-05, "loss": 1.2448, "step": 1645 }, { "epoch": 0.47, "grad_norm": 9.153518676757812, "learning_rate": 2.2937643020594965e-05, "loss": 1.1404, "step": 1646 }, { "epoch": 0.47, "grad_norm": 10.492255210876465, "learning_rate": 2.2933352402745998e-05, "loss": 1.421, "step": 1647 }, { "epoch": 0.47, "grad_norm": 9.747618675231934, "learning_rate": 2.2929061784897026e-05, "loss": 1.4087, "step": 1648 }, { "epoch": 0.47, "grad_norm": 9.309370994567871, "learning_rate": 2.2924771167048055e-05, "loss": 1.2397, "step": 1649 }, { "epoch": 0.47, "grad_norm": 8.79931926727295, "learning_rate": 2.2920480549199084e-05, "loss": 1.3496, "step": 1650 }, { "epoch": 0.47, "grad_norm": 9.985559463500977, "learning_rate": 2.2916189931350116e-05, "loss": 1.1182, "step": 1651 }, { "epoch": 0.47, "grad_norm": 8.381940841674805, "learning_rate": 2.2911899313501145e-05, "loss": 1.0647, "step": 1652 }, { "epoch": 0.47, "grad_norm": 9.11472225189209, "learning_rate": 2.2907608695652174e-05, "loss": 1.1354, "step": 1653 }, { "epoch": 0.47, "grad_norm": 7.868236064910889, "learning_rate": 2.2903318077803206e-05, "loss": 0.9615, "step": 1654 }, { "epoch": 0.47, "grad_norm": 10.499838829040527, "learning_rate": 2.2899027459954235e-05, "loss": 1.3577, "step": 1655 }, { "epoch": 0.47, "grad_norm": 9.916804313659668, "learning_rate": 2.2894736842105263e-05, "loss": 1.3944, "step": 1656 }, { "epoch": 0.47, "grad_norm": 9.581201553344727, "learning_rate": 2.2890446224256292e-05, "loss": 1.4577, "step": 1657 }, { "epoch": 0.47, "grad_norm": 10.742766380310059, "learning_rate": 2.2886155606407324e-05, "loss": 1.4189, "step": 1658 }, { "epoch": 0.47, "grad_norm": 9.715123176574707, "learning_rate": 2.2881864988558353e-05, "loss": 1.3955, "step": 1659 }, { "epoch": 0.47, "grad_norm": 8.724863052368164, "learning_rate": 2.2877574370709382e-05, "loss": 1.4059, "step": 1660 }, { "epoch": 0.48, "grad_norm": 8.264730453491211, "learning_rate": 2.287328375286041e-05, "loss": 1.1103, "step": 1661 }, { "epoch": 0.48, "grad_norm": 8.868542671203613, "learning_rate": 2.2868993135011443e-05, "loss": 1.1557, "step": 1662 }, { "epoch": 0.48, "grad_norm": 9.071462631225586, "learning_rate": 2.2864702517162475e-05, "loss": 1.3348, "step": 1663 }, { "epoch": 0.48, "grad_norm": 8.672259330749512, "learning_rate": 2.28604118993135e-05, "loss": 1.2869, "step": 1664 }, { "epoch": 0.48, "grad_norm": 8.83873176574707, "learning_rate": 2.2856121281464532e-05, "loss": 1.3988, "step": 1665 }, { "epoch": 0.48, "grad_norm": 9.405563354492188, "learning_rate": 2.285183066361556e-05, "loss": 1.3591, "step": 1666 }, { "epoch": 0.48, "grad_norm": 8.799030303955078, "learning_rate": 2.2847540045766593e-05, "loss": 1.2546, "step": 1667 }, { "epoch": 0.48, "grad_norm": 9.277997970581055, "learning_rate": 2.284324942791762e-05, "loss": 0.9324, "step": 1668 }, { "epoch": 0.48, "grad_norm": 8.408809661865234, "learning_rate": 2.283895881006865e-05, "loss": 1.2105, "step": 1669 }, { "epoch": 0.48, "grad_norm": 9.046985626220703, "learning_rate": 2.283466819221968e-05, "loss": 1.4227, "step": 1670 }, { "epoch": 0.48, "grad_norm": 8.309938430786133, "learning_rate": 2.283037757437071e-05, "loss": 1.0991, "step": 1671 }, { "epoch": 0.48, "grad_norm": 8.683304786682129, "learning_rate": 2.282608695652174e-05, "loss": 1.2208, "step": 1672 }, { "epoch": 0.48, "grad_norm": 9.217636108398438, "learning_rate": 2.282179633867277e-05, "loss": 1.1511, "step": 1673 }, { "epoch": 0.48, "grad_norm": 8.621679306030273, "learning_rate": 2.28175057208238e-05, "loss": 1.2189, "step": 1674 }, { "epoch": 0.48, "grad_norm": 9.081939697265625, "learning_rate": 2.2813215102974827e-05, "loss": 0.9736, "step": 1675 }, { "epoch": 0.48, "grad_norm": 10.959156036376953, "learning_rate": 2.280892448512586e-05, "loss": 1.1892, "step": 1676 }, { "epoch": 0.48, "grad_norm": 10.01351261138916, "learning_rate": 2.2804633867276888e-05, "loss": 1.378, "step": 1677 }, { "epoch": 0.48, "grad_norm": 7.741083145141602, "learning_rate": 2.280034324942792e-05, "loss": 1.1976, "step": 1678 }, { "epoch": 0.48, "grad_norm": 9.066365242004395, "learning_rate": 2.2796052631578945e-05, "loss": 1.0487, "step": 1679 }, { "epoch": 0.48, "grad_norm": 9.079463958740234, "learning_rate": 2.2791762013729977e-05, "loss": 1.03, "step": 1680 }, { "epoch": 0.48, "grad_norm": 11.072271347045898, "learning_rate": 2.278747139588101e-05, "loss": 1.2092, "step": 1681 }, { "epoch": 0.48, "grad_norm": 9.641533851623535, "learning_rate": 2.278318077803204e-05, "loss": 1.4444, "step": 1682 }, { "epoch": 0.48, "grad_norm": 8.743915557861328, "learning_rate": 2.2778890160183067e-05, "loss": 0.9681, "step": 1683 }, { "epoch": 0.48, "grad_norm": 9.499055862426758, "learning_rate": 2.2774599542334096e-05, "loss": 1.3324, "step": 1684 }, { "epoch": 0.48, "grad_norm": 9.612502098083496, "learning_rate": 2.2770308924485128e-05, "loss": 1.2739, "step": 1685 }, { "epoch": 0.48, "grad_norm": 9.890045166015625, "learning_rate": 2.2766018306636153e-05, "loss": 1.3744, "step": 1686 }, { "epoch": 0.48, "grad_norm": 9.541918754577637, "learning_rate": 2.2761727688787186e-05, "loss": 1.3663, "step": 1687 }, { "epoch": 0.48, "grad_norm": 8.16054630279541, "learning_rate": 2.2757437070938214e-05, "loss": 1.1082, "step": 1688 }, { "epoch": 0.48, "grad_norm": 9.091609954833984, "learning_rate": 2.2753146453089247e-05, "loss": 1.1344, "step": 1689 }, { "epoch": 0.48, "grad_norm": 9.458251953125, "learning_rate": 2.2748855835240275e-05, "loss": 1.228, "step": 1690 }, { "epoch": 0.48, "grad_norm": 10.517223358154297, "learning_rate": 2.2744565217391304e-05, "loss": 1.081, "step": 1691 }, { "epoch": 0.48, "grad_norm": 9.296842575073242, "learning_rate": 2.2740274599542336e-05, "loss": 1.0767, "step": 1692 }, { "epoch": 0.48, "grad_norm": 9.35966682434082, "learning_rate": 2.2735983981693365e-05, "loss": 1.2591, "step": 1693 }, { "epoch": 0.48, "grad_norm": 9.021504402160645, "learning_rate": 2.2731693363844394e-05, "loss": 1.0929, "step": 1694 }, { "epoch": 0.48, "grad_norm": 9.004847526550293, "learning_rate": 2.2727402745995423e-05, "loss": 1.0958, "step": 1695 }, { "epoch": 0.49, "grad_norm": 9.843135833740234, "learning_rate": 2.2723112128146455e-05, "loss": 1.2846, "step": 1696 }, { "epoch": 0.49, "grad_norm": 7.5560221672058105, "learning_rate": 2.2718821510297483e-05, "loss": 0.871, "step": 1697 }, { "epoch": 0.49, "grad_norm": 10.070405960083008, "learning_rate": 2.2714530892448512e-05, "loss": 1.2432, "step": 1698 }, { "epoch": 0.49, "grad_norm": 9.583444595336914, "learning_rate": 2.2710240274599544e-05, "loss": 1.4108, "step": 1699 }, { "epoch": 0.49, "grad_norm": 10.146078109741211, "learning_rate": 2.2705949656750573e-05, "loss": 1.1774, "step": 1700 }, { "epoch": 0.49, "grad_norm": 10.606247901916504, "learning_rate": 2.2701659038901602e-05, "loss": 1.2373, "step": 1701 }, { "epoch": 0.49, "grad_norm": 9.760239601135254, "learning_rate": 2.269736842105263e-05, "loss": 1.2494, "step": 1702 }, { "epoch": 0.49, "grad_norm": 10.328970909118652, "learning_rate": 2.2693077803203663e-05, "loss": 1.3952, "step": 1703 }, { "epoch": 0.49, "grad_norm": 15.020970344543457, "learning_rate": 2.268878718535469e-05, "loss": 1.6676, "step": 1704 }, { "epoch": 0.49, "grad_norm": 8.3358793258667, "learning_rate": 2.268449656750572e-05, "loss": 0.8649, "step": 1705 }, { "epoch": 0.49, "grad_norm": 8.585587501525879, "learning_rate": 2.2680205949656753e-05, "loss": 1.1444, "step": 1706 }, { "epoch": 0.49, "grad_norm": 8.351394653320312, "learning_rate": 2.267591533180778e-05, "loss": 1.1239, "step": 1707 }, { "epoch": 0.49, "grad_norm": 8.965201377868652, "learning_rate": 2.2671624713958813e-05, "loss": 1.2742, "step": 1708 }, { "epoch": 0.49, "grad_norm": 9.27197265625, "learning_rate": 2.266733409610984e-05, "loss": 1.1194, "step": 1709 }, { "epoch": 0.49, "grad_norm": 9.04944133758545, "learning_rate": 2.266304347826087e-05, "loss": 0.8732, "step": 1710 }, { "epoch": 0.49, "grad_norm": 8.792150497436523, "learning_rate": 2.26587528604119e-05, "loss": 1.1253, "step": 1711 }, { "epoch": 0.49, "grad_norm": 8.72215747833252, "learning_rate": 2.265446224256293e-05, "loss": 1.014, "step": 1712 }, { "epoch": 0.49, "grad_norm": 11.412017822265625, "learning_rate": 2.2650171624713957e-05, "loss": 1.4195, "step": 1713 }, { "epoch": 0.49, "grad_norm": 6.746893882751465, "learning_rate": 2.264588100686499e-05, "loss": 0.8599, "step": 1714 }, { "epoch": 0.49, "grad_norm": 9.354917526245117, "learning_rate": 2.264159038901602e-05, "loss": 1.2557, "step": 1715 }, { "epoch": 0.49, "grad_norm": 9.321457862854004, "learning_rate": 2.2637299771167047e-05, "loss": 1.4941, "step": 1716 }, { "epoch": 0.49, "grad_norm": 9.840455055236816, "learning_rate": 2.263300915331808e-05, "loss": 1.1555, "step": 1717 }, { "epoch": 0.49, "grad_norm": 9.907840728759766, "learning_rate": 2.2628718535469108e-05, "loss": 1.311, "step": 1718 }, { "epoch": 0.49, "grad_norm": 7.54386568069458, "learning_rate": 2.262442791762014e-05, "loss": 0.913, "step": 1719 }, { "epoch": 0.49, "grad_norm": 9.386448860168457, "learning_rate": 2.2620137299771165e-05, "loss": 0.9293, "step": 1720 }, { "epoch": 0.49, "grad_norm": 8.661848068237305, "learning_rate": 2.2615846681922198e-05, "loss": 1.1228, "step": 1721 }, { "epoch": 0.49, "grad_norm": 8.990273475646973, "learning_rate": 2.2611556064073226e-05, "loss": 1.1689, "step": 1722 }, { "epoch": 0.49, "grad_norm": 9.665467262268066, "learning_rate": 2.2607265446224255e-05, "loss": 1.448, "step": 1723 }, { "epoch": 0.49, "grad_norm": 11.016701698303223, "learning_rate": 2.2602974828375287e-05, "loss": 1.0422, "step": 1724 }, { "epoch": 0.49, "grad_norm": 8.635202407836914, "learning_rate": 2.2598684210526316e-05, "loss": 1.0084, "step": 1725 }, { "epoch": 0.49, "grad_norm": 7.417308807373047, "learning_rate": 2.2594393592677348e-05, "loss": 1.0566, "step": 1726 }, { "epoch": 0.49, "grad_norm": 8.810213088989258, "learning_rate": 2.2590102974828374e-05, "loss": 1.0975, "step": 1727 }, { "epoch": 0.49, "grad_norm": 9.824432373046875, "learning_rate": 2.2585812356979406e-05, "loss": 1.2073, "step": 1728 }, { "epoch": 0.49, "grad_norm": 11.824142456054688, "learning_rate": 2.2581521739130434e-05, "loss": 1.5631, "step": 1729 }, { "epoch": 0.49, "grad_norm": 9.870973587036133, "learning_rate": 2.2577231121281467e-05, "loss": 1.0913, "step": 1730 }, { "epoch": 0.5, "grad_norm": 9.540595054626465, "learning_rate": 2.2572940503432495e-05, "loss": 1.1692, "step": 1731 }, { "epoch": 0.5, "grad_norm": 11.168774604797363, "learning_rate": 2.2568649885583524e-05, "loss": 1.4247, "step": 1732 }, { "epoch": 0.5, "grad_norm": 10.752882957458496, "learning_rate": 2.2564359267734556e-05, "loss": 1.3847, "step": 1733 }, { "epoch": 0.5, "grad_norm": 8.558868408203125, "learning_rate": 2.2560068649885585e-05, "loss": 1.2971, "step": 1734 }, { "epoch": 0.5, "grad_norm": 9.694822311401367, "learning_rate": 2.2555778032036614e-05, "loss": 1.2184, "step": 1735 }, { "epoch": 0.5, "grad_norm": 8.882486343383789, "learning_rate": 2.2551487414187643e-05, "loss": 0.9044, "step": 1736 }, { "epoch": 0.5, "grad_norm": 8.602743148803711, "learning_rate": 2.2547196796338675e-05, "loss": 1.1263, "step": 1737 }, { "epoch": 0.5, "grad_norm": 8.551252365112305, "learning_rate": 2.25429061784897e-05, "loss": 1.3499, "step": 1738 }, { "epoch": 0.5, "grad_norm": 8.753378868103027, "learning_rate": 2.2538615560640732e-05, "loss": 1.0904, "step": 1739 }, { "epoch": 0.5, "grad_norm": 9.857767105102539, "learning_rate": 2.2534324942791764e-05, "loss": 1.3323, "step": 1740 }, { "epoch": 0.5, "grad_norm": 8.807705879211426, "learning_rate": 2.2530034324942793e-05, "loss": 1.0568, "step": 1741 }, { "epoch": 0.5, "grad_norm": 9.36620044708252, "learning_rate": 2.2525743707093822e-05, "loss": 1.0966, "step": 1742 }, { "epoch": 0.5, "grad_norm": 10.410367012023926, "learning_rate": 2.252145308924485e-05, "loss": 1.236, "step": 1743 }, { "epoch": 0.5, "grad_norm": 8.877896308898926, "learning_rate": 2.2517162471395883e-05, "loss": 0.9688, "step": 1744 }, { "epoch": 0.5, "grad_norm": 9.445262908935547, "learning_rate": 2.2512871853546912e-05, "loss": 1.2386, "step": 1745 }, { "epoch": 0.5, "grad_norm": 8.056120872497559, "learning_rate": 2.250858123569794e-05, "loss": 1.109, "step": 1746 }, { "epoch": 0.5, "grad_norm": 8.643233299255371, "learning_rate": 2.250429061784897e-05, "loss": 1.3352, "step": 1747 }, { "epoch": 0.5, "grad_norm": 10.496377944946289, "learning_rate": 2.25e-05, "loss": 1.5072, "step": 1748 }, { "epoch": 0.5, "grad_norm": 8.46739673614502, "learning_rate": 2.249570938215103e-05, "loss": 1.3112, "step": 1749 }, { "epoch": 0.5, "grad_norm": 8.37043571472168, "learning_rate": 2.249141876430206e-05, "loss": 1.2561, "step": 1750 }, { "epoch": 0.5, "grad_norm": 8.924062728881836, "learning_rate": 2.248712814645309e-05, "loss": 1.3819, "step": 1751 }, { "epoch": 0.5, "grad_norm": 8.79951286315918, "learning_rate": 2.248283752860412e-05, "loss": 1.2067, "step": 1752 }, { "epoch": 0.5, "grad_norm": 8.621842384338379, "learning_rate": 2.247854691075515e-05, "loss": 1.3184, "step": 1753 }, { "epoch": 0.5, "grad_norm": 8.6490478515625, "learning_rate": 2.2474256292906177e-05, "loss": 1.1118, "step": 1754 }, { "epoch": 0.5, "grad_norm": 8.803616523742676, "learning_rate": 2.246996567505721e-05, "loss": 1.2251, "step": 1755 }, { "epoch": 0.5, "grad_norm": 7.9771599769592285, "learning_rate": 2.246567505720824e-05, "loss": 1.0104, "step": 1756 }, { "epoch": 0.5, "grad_norm": 8.340866088867188, "learning_rate": 2.2461384439359267e-05, "loss": 0.8333, "step": 1757 }, { "epoch": 0.5, "grad_norm": 8.675475120544434, "learning_rate": 2.24570938215103e-05, "loss": 1.1501, "step": 1758 }, { "epoch": 0.5, "grad_norm": 8.976713180541992, "learning_rate": 2.2452803203661328e-05, "loss": 1.4332, "step": 1759 }, { "epoch": 0.5, "grad_norm": 9.324586868286133, "learning_rate": 2.244851258581236e-05, "loss": 1.2735, "step": 1760 }, { "epoch": 0.5, "grad_norm": 9.389039039611816, "learning_rate": 2.2444221967963386e-05, "loss": 1.3965, "step": 1761 }, { "epoch": 0.5, "grad_norm": 7.7743425369262695, "learning_rate": 2.2439931350114418e-05, "loss": 1.0439, "step": 1762 }, { "epoch": 0.5, "grad_norm": 8.806344032287598, "learning_rate": 2.2435640732265446e-05, "loss": 1.2115, "step": 1763 }, { "epoch": 0.5, "grad_norm": 8.250657081604004, "learning_rate": 2.2431350114416475e-05, "loss": 1.3243, "step": 1764 }, { "epoch": 0.5, "grad_norm": 8.528936386108398, "learning_rate": 2.2427059496567504e-05, "loss": 1.2346, "step": 1765 }, { "epoch": 0.51, "grad_norm": 9.796095848083496, "learning_rate": 2.2422768878718536e-05, "loss": 1.3457, "step": 1766 }, { "epoch": 0.51, "grad_norm": 6.960087776184082, "learning_rate": 2.241847826086957e-05, "loss": 1.1521, "step": 1767 }, { "epoch": 0.51, "grad_norm": 6.853693962097168, "learning_rate": 2.2414187643020594e-05, "loss": 1.1208, "step": 1768 }, { "epoch": 0.51, "grad_norm": 8.802952766418457, "learning_rate": 2.2409897025171626e-05, "loss": 1.188, "step": 1769 }, { "epoch": 0.51, "grad_norm": 7.594489574432373, "learning_rate": 2.2405606407322655e-05, "loss": 1.5319, "step": 1770 }, { "epoch": 0.51, "grad_norm": 8.9003267288208, "learning_rate": 2.2401315789473687e-05, "loss": 1.1745, "step": 1771 }, { "epoch": 0.51, "grad_norm": 8.605582237243652, "learning_rate": 2.2397025171624712e-05, "loss": 1.03, "step": 1772 }, { "epoch": 0.51, "grad_norm": 8.687684059143066, "learning_rate": 2.2392734553775744e-05, "loss": 1.2067, "step": 1773 }, { "epoch": 0.51, "grad_norm": 8.256669044494629, "learning_rate": 2.2388443935926776e-05, "loss": 0.9294, "step": 1774 }, { "epoch": 0.51, "grad_norm": 9.63094711303711, "learning_rate": 2.2384153318077802e-05, "loss": 1.2256, "step": 1775 }, { "epoch": 0.51, "grad_norm": 7.976043224334717, "learning_rate": 2.2379862700228834e-05, "loss": 1.191, "step": 1776 }, { "epoch": 0.51, "grad_norm": 9.502062797546387, "learning_rate": 2.2375572082379863e-05, "loss": 1.106, "step": 1777 }, { "epoch": 0.51, "grad_norm": 8.629040718078613, "learning_rate": 2.2371281464530895e-05, "loss": 1.1471, "step": 1778 }, { "epoch": 0.51, "grad_norm": 10.833641052246094, "learning_rate": 2.236699084668192e-05, "loss": 1.3265, "step": 1779 }, { "epoch": 0.51, "grad_norm": 9.588047981262207, "learning_rate": 2.2362700228832952e-05, "loss": 1.1669, "step": 1780 }, { "epoch": 0.51, "grad_norm": 9.60359001159668, "learning_rate": 2.235840961098398e-05, "loss": 1.1454, "step": 1781 }, { "epoch": 0.51, "grad_norm": 10.990739822387695, "learning_rate": 2.2354118993135013e-05, "loss": 1.4634, "step": 1782 }, { "epoch": 0.51, "grad_norm": 10.82275390625, "learning_rate": 2.2349828375286042e-05, "loss": 1.3927, "step": 1783 }, { "epoch": 0.51, "grad_norm": 10.499259948730469, "learning_rate": 2.234553775743707e-05, "loss": 1.3967, "step": 1784 }, { "epoch": 0.51, "grad_norm": 10.63853645324707, "learning_rate": 2.2341247139588103e-05, "loss": 1.1029, "step": 1785 }, { "epoch": 0.51, "grad_norm": 9.913352012634277, "learning_rate": 2.2336956521739132e-05, "loss": 1.507, "step": 1786 }, { "epoch": 0.51, "grad_norm": 9.286478042602539, "learning_rate": 2.233266590389016e-05, "loss": 1.3428, "step": 1787 }, { "epoch": 0.51, "grad_norm": 8.3721342086792, "learning_rate": 2.232837528604119e-05, "loss": 1.061, "step": 1788 }, { "epoch": 0.51, "grad_norm": 9.989459037780762, "learning_rate": 2.232408466819222e-05, "loss": 1.2538, "step": 1789 }, { "epoch": 0.51, "grad_norm": 10.688898086547852, "learning_rate": 2.2319794050343247e-05, "loss": 1.0603, "step": 1790 }, { "epoch": 0.51, "grad_norm": 8.396010398864746, "learning_rate": 2.231550343249428e-05, "loss": 1.0253, "step": 1791 }, { "epoch": 0.51, "grad_norm": 7.522763252258301, "learning_rate": 2.231121281464531e-05, "loss": 0.966, "step": 1792 }, { "epoch": 0.51, "grad_norm": 10.1019287109375, "learning_rate": 2.230692219679634e-05, "loss": 1.3999, "step": 1793 }, { "epoch": 0.51, "grad_norm": 7.7475266456604, "learning_rate": 2.230263157894737e-05, "loss": 1.3943, "step": 1794 }, { "epoch": 0.51, "grad_norm": 9.222977638244629, "learning_rate": 2.2298340961098398e-05, "loss": 1.4151, "step": 1795 }, { "epoch": 0.51, "grad_norm": 8.31693172454834, "learning_rate": 2.229405034324943e-05, "loss": 1.0597, "step": 1796 }, { "epoch": 0.51, "grad_norm": 9.179215431213379, "learning_rate": 2.228975972540046e-05, "loss": 1.2418, "step": 1797 }, { "epoch": 0.51, "grad_norm": 9.257237434387207, "learning_rate": 2.2285469107551487e-05, "loss": 0.9888, "step": 1798 }, { "epoch": 0.51, "grad_norm": 9.153144836425781, "learning_rate": 2.2281178489702516e-05, "loss": 1.4073, "step": 1799 }, { "epoch": 0.51, "grad_norm": 7.4600982666015625, "learning_rate": 2.2276887871853548e-05, "loss": 0.9782, "step": 1800 }, { "epoch": 0.52, "grad_norm": 7.916541576385498, "learning_rate": 2.2272597254004577e-05, "loss": 1.159, "step": 1801 }, { "epoch": 0.52, "grad_norm": 8.717126846313477, "learning_rate": 2.2268306636155606e-05, "loss": 1.1931, "step": 1802 }, { "epoch": 0.52, "grad_norm": 9.714426040649414, "learning_rate": 2.2264016018306638e-05, "loss": 1.3754, "step": 1803 }, { "epoch": 0.52, "grad_norm": 10.216131210327148, "learning_rate": 2.2259725400457667e-05, "loss": 1.1484, "step": 1804 }, { "epoch": 0.52, "grad_norm": 10.28882122039795, "learning_rate": 2.2255434782608695e-05, "loss": 1.3258, "step": 1805 }, { "epoch": 0.52, "grad_norm": 8.439464569091797, "learning_rate": 2.2251144164759724e-05, "loss": 1.3977, "step": 1806 }, { "epoch": 0.52, "grad_norm": 9.396167755126953, "learning_rate": 2.2246853546910756e-05, "loss": 1.4009, "step": 1807 }, { "epoch": 0.52, "grad_norm": 7.737992763519287, "learning_rate": 2.224256292906179e-05, "loss": 1.2623, "step": 1808 }, { "epoch": 0.52, "grad_norm": 9.101402282714844, "learning_rate": 2.2238272311212814e-05, "loss": 1.458, "step": 1809 }, { "epoch": 0.52, "grad_norm": 8.564967155456543, "learning_rate": 2.2233981693363846e-05, "loss": 1.1027, "step": 1810 }, { "epoch": 0.52, "grad_norm": 8.401267051696777, "learning_rate": 2.2229691075514875e-05, "loss": 0.9862, "step": 1811 }, { "epoch": 0.52, "grad_norm": 9.633465766906738, "learning_rate": 2.2225400457665907e-05, "loss": 1.23, "step": 1812 }, { "epoch": 0.52, "grad_norm": 10.191815376281738, "learning_rate": 2.2221109839816932e-05, "loss": 1.1127, "step": 1813 }, { "epoch": 0.52, "grad_norm": 7.706086158752441, "learning_rate": 2.2216819221967964e-05, "loss": 0.9632, "step": 1814 }, { "epoch": 0.52, "grad_norm": 8.303768157958984, "learning_rate": 2.2212528604118993e-05, "loss": 0.9504, "step": 1815 }, { "epoch": 0.52, "grad_norm": 10.180074691772461, "learning_rate": 2.2208237986270022e-05, "loss": 1.0633, "step": 1816 }, { "epoch": 0.52, "grad_norm": 9.964677810668945, "learning_rate": 2.2203947368421054e-05, "loss": 1.4001, "step": 1817 }, { "epoch": 0.52, "grad_norm": 7.654510021209717, "learning_rate": 2.2199656750572083e-05, "loss": 1.0369, "step": 1818 }, { "epoch": 0.52, "grad_norm": 9.303197860717773, "learning_rate": 2.2195366132723115e-05, "loss": 1.2726, "step": 1819 }, { "epoch": 0.52, "grad_norm": 9.660150527954102, "learning_rate": 2.219107551487414e-05, "loss": 0.9767, "step": 1820 }, { "epoch": 0.52, "grad_norm": 10.521946907043457, "learning_rate": 2.2186784897025173e-05, "loss": 1.317, "step": 1821 }, { "epoch": 0.52, "grad_norm": 10.280237197875977, "learning_rate": 2.21824942791762e-05, "loss": 1.302, "step": 1822 }, { "epoch": 0.52, "grad_norm": 10.660154342651367, "learning_rate": 2.2178203661327234e-05, "loss": 1.4013, "step": 1823 }, { "epoch": 0.52, "grad_norm": 10.585969924926758, "learning_rate": 2.217391304347826e-05, "loss": 1.3267, "step": 1824 }, { "epoch": 0.52, "grad_norm": 8.46932315826416, "learning_rate": 2.216962242562929e-05, "loss": 1.2359, "step": 1825 }, { "epoch": 0.52, "grad_norm": 11.864045143127441, "learning_rate": 2.2165331807780323e-05, "loss": 1.5453, "step": 1826 }, { "epoch": 0.52, "grad_norm": 10.174424171447754, "learning_rate": 2.216104118993135e-05, "loss": 1.1714, "step": 1827 }, { "epoch": 0.52, "grad_norm": 9.30581283569336, "learning_rate": 2.215675057208238e-05, "loss": 1.2256, "step": 1828 }, { "epoch": 0.52, "grad_norm": 9.124938011169434, "learning_rate": 2.215245995423341e-05, "loss": 1.3058, "step": 1829 }, { "epoch": 0.52, "grad_norm": 7.997518539428711, "learning_rate": 2.2148169336384442e-05, "loss": 1.1819, "step": 1830 }, { "epoch": 0.52, "grad_norm": 8.968917846679688, "learning_rate": 2.2143878718535467e-05, "loss": 1.4171, "step": 1831 }, { "epoch": 0.52, "grad_norm": 8.303597450256348, "learning_rate": 2.21395881006865e-05, "loss": 1.0651, "step": 1832 }, { "epoch": 0.52, "grad_norm": 9.020977020263672, "learning_rate": 2.2135297482837528e-05, "loss": 0.999, "step": 1833 }, { "epoch": 0.52, "grad_norm": 10.523795127868652, "learning_rate": 2.213100686498856e-05, "loss": 1.1654, "step": 1834 }, { "epoch": 0.52, "grad_norm": 9.899744987487793, "learning_rate": 2.212671624713959e-05, "loss": 1.2683, "step": 1835 }, { "epoch": 0.53, "grad_norm": 8.98214054107666, "learning_rate": 2.2122425629290618e-05, "loss": 0.9926, "step": 1836 }, { "epoch": 0.53, "grad_norm": 8.100053787231445, "learning_rate": 2.211813501144165e-05, "loss": 1.0374, "step": 1837 }, { "epoch": 0.53, "grad_norm": 9.72776985168457, "learning_rate": 2.211384439359268e-05, "loss": 1.5318, "step": 1838 }, { "epoch": 0.53, "grad_norm": 8.794493675231934, "learning_rate": 2.2109553775743707e-05, "loss": 1.1, "step": 1839 }, { "epoch": 0.53, "grad_norm": 8.37671184539795, "learning_rate": 2.2105263157894736e-05, "loss": 1.0562, "step": 1840 }, { "epoch": 0.53, "grad_norm": 7.808015823364258, "learning_rate": 2.210097254004577e-05, "loss": 0.8847, "step": 1841 }, { "epoch": 0.53, "grad_norm": 9.175644874572754, "learning_rate": 2.2096681922196794e-05, "loss": 1.527, "step": 1842 }, { "epoch": 0.53, "grad_norm": 7.95731782913208, "learning_rate": 2.2092391304347826e-05, "loss": 1.2384, "step": 1843 }, { "epoch": 0.53, "grad_norm": 9.053457260131836, "learning_rate": 2.2088100686498858e-05, "loss": 1.4655, "step": 1844 }, { "epoch": 0.53, "grad_norm": 9.427711486816406, "learning_rate": 2.2083810068649887e-05, "loss": 1.2248, "step": 1845 }, { "epoch": 0.53, "grad_norm": 9.888148307800293, "learning_rate": 2.2079519450800916e-05, "loss": 1.3585, "step": 1846 }, { "epoch": 0.53, "grad_norm": 9.449552536010742, "learning_rate": 2.2075228832951944e-05, "loss": 1.2309, "step": 1847 }, { "epoch": 0.53, "grad_norm": 9.936347007751465, "learning_rate": 2.2070938215102976e-05, "loss": 1.3455, "step": 1848 }, { "epoch": 0.53, "grad_norm": 8.431443214416504, "learning_rate": 2.2066647597254005e-05, "loss": 1.172, "step": 1849 }, { "epoch": 0.53, "grad_norm": 8.189772605895996, "learning_rate": 2.2062356979405034e-05, "loss": 1.1464, "step": 1850 }, { "epoch": 0.53, "grad_norm": 8.310342788696289, "learning_rate": 2.2058066361556066e-05, "loss": 1.2688, "step": 1851 }, { "epoch": 0.53, "grad_norm": 8.572994232177734, "learning_rate": 2.2053775743707095e-05, "loss": 1.2116, "step": 1852 }, { "epoch": 0.53, "grad_norm": 9.292588233947754, "learning_rate": 2.2049485125858124e-05, "loss": 1.2581, "step": 1853 }, { "epoch": 0.53, "grad_norm": 8.470903396606445, "learning_rate": 2.2045194508009152e-05, "loss": 1.0894, "step": 1854 }, { "epoch": 0.53, "grad_norm": 8.987678527832031, "learning_rate": 2.2040903890160185e-05, "loss": 1.5021, "step": 1855 }, { "epoch": 0.53, "grad_norm": 8.505946159362793, "learning_rate": 2.2036613272311213e-05, "loss": 1.1892, "step": 1856 }, { "epoch": 0.53, "grad_norm": 9.59296989440918, "learning_rate": 2.2032322654462242e-05, "loss": 1.4473, "step": 1857 }, { "epoch": 0.53, "grad_norm": 7.823615074157715, "learning_rate": 2.202803203661327e-05, "loss": 0.9165, "step": 1858 }, { "epoch": 0.53, "grad_norm": 9.013300895690918, "learning_rate": 2.2023741418764303e-05, "loss": 1.3646, "step": 1859 }, { "epoch": 0.53, "grad_norm": 8.896345138549805, "learning_rate": 2.2019450800915335e-05, "loss": 1.0996, "step": 1860 }, { "epoch": 0.53, "grad_norm": 8.645607948303223, "learning_rate": 2.201516018306636e-05, "loss": 0.8644, "step": 1861 }, { "epoch": 0.53, "grad_norm": 8.687626838684082, "learning_rate": 2.2010869565217393e-05, "loss": 0.887, "step": 1862 }, { "epoch": 0.53, "grad_norm": 10.690119743347168, "learning_rate": 2.200657894736842e-05, "loss": 1.3183, "step": 1863 }, { "epoch": 0.53, "grad_norm": 9.361321449279785, "learning_rate": 2.2002288329519454e-05, "loss": 1.3161, "step": 1864 }, { "epoch": 0.53, "grad_norm": 10.281274795532227, "learning_rate": 2.199799771167048e-05, "loss": 1.4265, "step": 1865 }, { "epoch": 0.53, "grad_norm": 9.622797012329102, "learning_rate": 2.199370709382151e-05, "loss": 1.3256, "step": 1866 }, { "epoch": 0.53, "grad_norm": 11.24836254119873, "learning_rate": 2.198941647597254e-05, "loss": 1.2944, "step": 1867 }, { "epoch": 0.53, "grad_norm": 9.081978797912598, "learning_rate": 2.198512585812357e-05, "loss": 1.021, "step": 1868 }, { "epoch": 0.53, "grad_norm": 8.716439247131348, "learning_rate": 2.19808352402746e-05, "loss": 1.2928, "step": 1869 }, { "epoch": 0.53, "grad_norm": 9.750237464904785, "learning_rate": 2.197654462242563e-05, "loss": 1.3407, "step": 1870 }, { "epoch": 0.54, "grad_norm": 8.843461036682129, "learning_rate": 2.1972254004576662e-05, "loss": 1.195, "step": 1871 }, { "epoch": 0.54, "grad_norm": 9.61376953125, "learning_rate": 2.1967963386727687e-05, "loss": 1.1895, "step": 1872 }, { "epoch": 0.54, "grad_norm": 10.649840354919434, "learning_rate": 2.196367276887872e-05, "loss": 1.6845, "step": 1873 }, { "epoch": 0.54, "grad_norm": 9.97451400756836, "learning_rate": 2.1959382151029748e-05, "loss": 1.136, "step": 1874 }, { "epoch": 0.54, "grad_norm": 10.086967468261719, "learning_rate": 2.195509153318078e-05, "loss": 1.3111, "step": 1875 }, { "epoch": 0.54, "grad_norm": 8.95392894744873, "learning_rate": 2.1950800915331806e-05, "loss": 1.0321, "step": 1876 }, { "epoch": 0.54, "grad_norm": 7.904888153076172, "learning_rate": 2.1946510297482838e-05, "loss": 1.1564, "step": 1877 }, { "epoch": 0.54, "grad_norm": 8.450617790222168, "learning_rate": 2.194221967963387e-05, "loss": 1.2575, "step": 1878 }, { "epoch": 0.54, "grad_norm": 8.170722961425781, "learning_rate": 2.19379290617849e-05, "loss": 1.0407, "step": 1879 }, { "epoch": 0.54, "grad_norm": 9.442112922668457, "learning_rate": 2.1933638443935928e-05, "loss": 1.2719, "step": 1880 }, { "epoch": 0.54, "grad_norm": 9.413675308227539, "learning_rate": 2.1929347826086956e-05, "loss": 1.1307, "step": 1881 }, { "epoch": 0.54, "grad_norm": 11.167388916015625, "learning_rate": 2.192505720823799e-05, "loss": 1.8384, "step": 1882 }, { "epoch": 0.54, "grad_norm": 9.069658279418945, "learning_rate": 2.1920766590389014e-05, "loss": 1.1637, "step": 1883 }, { "epoch": 0.54, "grad_norm": 8.350651741027832, "learning_rate": 2.1916475972540046e-05, "loss": 1.2792, "step": 1884 }, { "epoch": 0.54, "grad_norm": 8.218442916870117, "learning_rate": 2.1912185354691075e-05, "loss": 1.0882, "step": 1885 }, { "epoch": 0.54, "grad_norm": 10.116166114807129, "learning_rate": 2.1907894736842107e-05, "loss": 1.3364, "step": 1886 }, { "epoch": 0.54, "grad_norm": 7.011248588562012, "learning_rate": 2.1903604118993136e-05, "loss": 0.9083, "step": 1887 }, { "epoch": 0.54, "grad_norm": 8.695975303649902, "learning_rate": 2.1899313501144164e-05, "loss": 1.2792, "step": 1888 }, { "epoch": 0.54, "grad_norm": 8.591291427612305, "learning_rate": 2.1895022883295197e-05, "loss": 1.2621, "step": 1889 }, { "epoch": 0.54, "grad_norm": 8.328107833862305, "learning_rate": 2.1890732265446225e-05, "loss": 0.9975, "step": 1890 }, { "epoch": 0.54, "grad_norm": 8.86084270477295, "learning_rate": 2.1886441647597254e-05, "loss": 1.6399, "step": 1891 }, { "epoch": 0.54, "grad_norm": 7.860621929168701, "learning_rate": 2.1882151029748283e-05, "loss": 1.0072, "step": 1892 }, { "epoch": 0.54, "grad_norm": 9.932562828063965, "learning_rate": 2.1877860411899315e-05, "loss": 1.2332, "step": 1893 }, { "epoch": 0.54, "grad_norm": 8.443490982055664, "learning_rate": 2.1873569794050344e-05, "loss": 1.2122, "step": 1894 }, { "epoch": 0.54, "grad_norm": 7.739225387573242, "learning_rate": 2.1869279176201373e-05, "loss": 1.0659, "step": 1895 }, { "epoch": 0.54, "grad_norm": 9.519488334655762, "learning_rate": 2.1864988558352405e-05, "loss": 1.5054, "step": 1896 }, { "epoch": 0.54, "grad_norm": 9.8134765625, "learning_rate": 2.1860697940503434e-05, "loss": 1.3584, "step": 1897 }, { "epoch": 0.54, "grad_norm": 8.263157844543457, "learning_rate": 2.1856407322654462e-05, "loss": 0.9603, "step": 1898 }, { "epoch": 0.54, "grad_norm": 9.455352783203125, "learning_rate": 2.185211670480549e-05, "loss": 1.1942, "step": 1899 }, { "epoch": 0.54, "grad_norm": 9.731051445007324, "learning_rate": 2.1847826086956523e-05, "loss": 1.1219, "step": 1900 }, { "epoch": 0.54, "grad_norm": 11.34709358215332, "learning_rate": 2.1843535469107552e-05, "loss": 1.1467, "step": 1901 }, { "epoch": 0.54, "grad_norm": 7.7822394371032715, "learning_rate": 2.183924485125858e-05, "loss": 1.0081, "step": 1902 }, { "epoch": 0.54, "grad_norm": 10.143342971801758, "learning_rate": 2.1834954233409613e-05, "loss": 1.25, "step": 1903 }, { "epoch": 0.54, "grad_norm": 8.767507553100586, "learning_rate": 2.183066361556064e-05, "loss": 0.8448, "step": 1904 }, { "epoch": 0.54, "grad_norm": 9.370316505432129, "learning_rate": 2.1826372997711674e-05, "loss": 1.0835, "step": 1905 }, { "epoch": 0.55, "grad_norm": 8.88417911529541, "learning_rate": 2.18220823798627e-05, "loss": 1.1458, "step": 1906 }, { "epoch": 0.55, "grad_norm": 10.507073402404785, "learning_rate": 2.181779176201373e-05, "loss": 1.2188, "step": 1907 }, { "epoch": 0.55, "grad_norm": 8.445672035217285, "learning_rate": 2.181350114416476e-05, "loss": 1.1862, "step": 1908 }, { "epoch": 0.55, "grad_norm": 9.55842399597168, "learning_rate": 2.180921052631579e-05, "loss": 1.4181, "step": 1909 }, { "epoch": 0.55, "grad_norm": 9.664935111999512, "learning_rate": 2.1804919908466818e-05, "loss": 1.1623, "step": 1910 }, { "epoch": 0.55, "grad_norm": 9.136595726013184, "learning_rate": 2.180062929061785e-05, "loss": 1.3494, "step": 1911 }, { "epoch": 0.55, "grad_norm": 7.968899250030518, "learning_rate": 2.1796338672768882e-05, "loss": 0.9201, "step": 1912 }, { "epoch": 0.55, "grad_norm": 7.745423793792725, "learning_rate": 2.1792048054919907e-05, "loss": 1.2271, "step": 1913 }, { "epoch": 0.55, "grad_norm": 7.834641933441162, "learning_rate": 2.178775743707094e-05, "loss": 0.9795, "step": 1914 }, { "epoch": 0.55, "grad_norm": 9.000736236572266, "learning_rate": 2.1783466819221968e-05, "loss": 1.1041, "step": 1915 }, { "epoch": 0.55, "grad_norm": 9.695572853088379, "learning_rate": 2.1779176201373e-05, "loss": 1.176, "step": 1916 }, { "epoch": 0.55, "grad_norm": 8.908391952514648, "learning_rate": 2.1774885583524026e-05, "loss": 1.1249, "step": 1917 }, { "epoch": 0.55, "grad_norm": 9.691105842590332, "learning_rate": 2.1770594965675058e-05, "loss": 1.1521, "step": 1918 }, { "epoch": 0.55, "grad_norm": 9.583081245422363, "learning_rate": 2.1766304347826087e-05, "loss": 1.1156, "step": 1919 }, { "epoch": 0.55, "grad_norm": 12.590463638305664, "learning_rate": 2.1762013729977116e-05, "loss": 1.264, "step": 1920 }, { "epoch": 0.55, "grad_norm": 8.349695205688477, "learning_rate": 2.1757723112128148e-05, "loss": 1.1436, "step": 1921 }, { "epoch": 0.55, "grad_norm": 8.978100776672363, "learning_rate": 2.1753432494279176e-05, "loss": 0.9863, "step": 1922 }, { "epoch": 0.55, "grad_norm": 9.360097885131836, "learning_rate": 2.174914187643021e-05, "loss": 1.1557, "step": 1923 }, { "epoch": 0.55, "grad_norm": 9.199853897094727, "learning_rate": 2.1744851258581234e-05, "loss": 1.3216, "step": 1924 }, { "epoch": 0.55, "grad_norm": 8.022394180297852, "learning_rate": 2.1740560640732266e-05, "loss": 1.2244, "step": 1925 }, { "epoch": 0.55, "grad_norm": 9.001893997192383, "learning_rate": 2.1736270022883295e-05, "loss": 1.3324, "step": 1926 }, { "epoch": 0.55, "grad_norm": 9.297123908996582, "learning_rate": 2.1731979405034327e-05, "loss": 1.1158, "step": 1927 }, { "epoch": 0.55, "grad_norm": 10.891748428344727, "learning_rate": 2.1727688787185356e-05, "loss": 1.2127, "step": 1928 }, { "epoch": 0.55, "grad_norm": 9.588907241821289, "learning_rate": 2.1723398169336385e-05, "loss": 1.2623, "step": 1929 }, { "epoch": 0.55, "grad_norm": 8.479381561279297, "learning_rate": 2.1719107551487417e-05, "loss": 1.1961, "step": 1930 }, { "epoch": 0.55, "grad_norm": 9.2621431350708, "learning_rate": 2.1714816933638446e-05, "loss": 1.3056, "step": 1931 }, { "epoch": 0.55, "grad_norm": 8.670048713684082, "learning_rate": 2.1710526315789474e-05, "loss": 1.2837, "step": 1932 }, { "epoch": 0.55, "grad_norm": 8.699110984802246, "learning_rate": 2.1706235697940503e-05, "loss": 1.2077, "step": 1933 }, { "epoch": 0.55, "grad_norm": 7.799557685852051, "learning_rate": 2.1701945080091535e-05, "loss": 0.8993, "step": 1934 }, { "epoch": 0.55, "grad_norm": 7.742098808288574, "learning_rate": 2.169765446224256e-05, "loss": 1.2505, "step": 1935 }, { "epoch": 0.55, "grad_norm": 9.196280479431152, "learning_rate": 2.1693363844393593e-05, "loss": 1.3756, "step": 1936 }, { "epoch": 0.55, "grad_norm": 6.625796318054199, "learning_rate": 2.1689073226544625e-05, "loss": 0.8229, "step": 1937 }, { "epoch": 0.55, "grad_norm": 8.674047470092773, "learning_rate": 2.1684782608695654e-05, "loss": 0.9824, "step": 1938 }, { "epoch": 0.55, "grad_norm": 7.702921390533447, "learning_rate": 2.1680491990846682e-05, "loss": 1.0809, "step": 1939 }, { "epoch": 0.55, "grad_norm": 9.056543350219727, "learning_rate": 2.167620137299771e-05, "loss": 1.1688, "step": 1940 }, { "epoch": 0.56, "grad_norm": 9.157732963562012, "learning_rate": 2.1671910755148743e-05, "loss": 1.201, "step": 1941 }, { "epoch": 0.56, "grad_norm": 10.283062934875488, "learning_rate": 2.1667620137299772e-05, "loss": 1.4347, "step": 1942 }, { "epoch": 0.56, "grad_norm": 8.770071029663086, "learning_rate": 2.16633295194508e-05, "loss": 0.9662, "step": 1943 }, { "epoch": 0.56, "grad_norm": 8.866783142089844, "learning_rate": 2.165903890160183e-05, "loss": 1.1569, "step": 1944 }, { "epoch": 0.56, "grad_norm": 9.816003799438477, "learning_rate": 2.1654748283752862e-05, "loss": 1.1886, "step": 1945 }, { "epoch": 0.56, "grad_norm": 11.375457763671875, "learning_rate": 2.165045766590389e-05, "loss": 1.307, "step": 1946 }, { "epoch": 0.56, "grad_norm": 9.87993335723877, "learning_rate": 2.164616704805492e-05, "loss": 1.2194, "step": 1947 }, { "epoch": 0.56, "grad_norm": 9.221055030822754, "learning_rate": 2.164187643020595e-05, "loss": 1.1166, "step": 1948 }, { "epoch": 0.56, "grad_norm": 10.571791648864746, "learning_rate": 2.163758581235698e-05, "loss": 1.2822, "step": 1949 }, { "epoch": 0.56, "grad_norm": 8.576422691345215, "learning_rate": 2.163329519450801e-05, "loss": 0.8849, "step": 1950 }, { "epoch": 0.56, "grad_norm": 9.687113761901855, "learning_rate": 2.1629004576659038e-05, "loss": 1.1576, "step": 1951 }, { "epoch": 0.56, "grad_norm": 8.230101585388184, "learning_rate": 2.162471395881007e-05, "loss": 0.8704, "step": 1952 }, { "epoch": 0.56, "grad_norm": 10.570345878601074, "learning_rate": 2.16204233409611e-05, "loss": 0.9873, "step": 1953 }, { "epoch": 0.56, "grad_norm": 10.308501243591309, "learning_rate": 2.1616132723112128e-05, "loss": 1.5126, "step": 1954 }, { "epoch": 0.56, "grad_norm": 7.877004146575928, "learning_rate": 2.161184210526316e-05, "loss": 1.0293, "step": 1955 }, { "epoch": 0.56, "grad_norm": 9.860822677612305, "learning_rate": 2.160755148741419e-05, "loss": 1.1995, "step": 1956 }, { "epoch": 0.56, "grad_norm": 8.284786224365234, "learning_rate": 2.160326086956522e-05, "loss": 1.1766, "step": 1957 }, { "epoch": 0.56, "grad_norm": 10.226141929626465, "learning_rate": 2.1598970251716246e-05, "loss": 1.4969, "step": 1958 }, { "epoch": 0.56, "grad_norm": 11.073193550109863, "learning_rate": 2.1594679633867278e-05, "loss": 1.326, "step": 1959 }, { "epoch": 0.56, "grad_norm": 8.278326988220215, "learning_rate": 2.1590389016018307e-05, "loss": 1.1496, "step": 1960 }, { "epoch": 0.56, "grad_norm": 8.75911808013916, "learning_rate": 2.1586098398169336e-05, "loss": 1.302, "step": 1961 }, { "epoch": 0.56, "grad_norm": 9.249210357666016, "learning_rate": 2.1581807780320364e-05, "loss": 1.4291, "step": 1962 }, { "epoch": 0.56, "grad_norm": 8.406425476074219, "learning_rate": 2.1577517162471397e-05, "loss": 1.1631, "step": 1963 }, { "epoch": 0.56, "grad_norm": 9.443185806274414, "learning_rate": 2.157322654462243e-05, "loss": 1.7803, "step": 1964 }, { "epoch": 0.56, "grad_norm": 6.937527656555176, "learning_rate": 2.1568935926773454e-05, "loss": 0.8683, "step": 1965 }, { "epoch": 0.56, "grad_norm": 8.713139533996582, "learning_rate": 2.1564645308924486e-05, "loss": 1.0959, "step": 1966 }, { "epoch": 0.56, "grad_norm": 8.732977867126465, "learning_rate": 2.1560354691075515e-05, "loss": 1.103, "step": 1967 }, { "epoch": 0.56, "grad_norm": 9.105290412902832, "learning_rate": 2.1556064073226547e-05, "loss": 1.1155, "step": 1968 }, { "epoch": 0.56, "grad_norm": 8.14785385131836, "learning_rate": 2.1551773455377573e-05, "loss": 1.0435, "step": 1969 }, { "epoch": 0.56, "grad_norm": 8.170175552368164, "learning_rate": 2.1547482837528605e-05, "loss": 0.8512, "step": 1970 }, { "epoch": 0.56, "grad_norm": 12.869532585144043, "learning_rate": 2.1543192219679637e-05, "loss": 1.489, "step": 1971 }, { "epoch": 0.56, "grad_norm": 9.479695320129395, "learning_rate": 2.1538901601830662e-05, "loss": 1.2053, "step": 1972 }, { "epoch": 0.56, "grad_norm": 9.297906875610352, "learning_rate": 2.1534610983981694e-05, "loss": 1.0794, "step": 1973 }, { "epoch": 0.56, "grad_norm": 8.7311429977417, "learning_rate": 2.1530320366132723e-05, "loss": 1.1172, "step": 1974 }, { "epoch": 0.56, "grad_norm": 7.909209251403809, "learning_rate": 2.1526029748283755e-05, "loss": 0.8978, "step": 1975 }, { "epoch": 0.57, "grad_norm": 8.352812767028809, "learning_rate": 2.152173913043478e-05, "loss": 1.1804, "step": 1976 }, { "epoch": 0.57, "grad_norm": 10.12589168548584, "learning_rate": 2.1517448512585813e-05, "loss": 1.1711, "step": 1977 }, { "epoch": 0.57, "grad_norm": 9.779499053955078, "learning_rate": 2.151315789473684e-05, "loss": 1.2041, "step": 1978 }, { "epoch": 0.57, "grad_norm": 8.799803733825684, "learning_rate": 2.1508867276887874e-05, "loss": 1.0511, "step": 1979 }, { "epoch": 0.57, "grad_norm": 8.467108726501465, "learning_rate": 2.1504576659038903e-05, "loss": 1.2735, "step": 1980 }, { "epoch": 0.57, "grad_norm": 8.35204792022705, "learning_rate": 2.150028604118993e-05, "loss": 1.0897, "step": 1981 }, { "epoch": 0.57, "grad_norm": 8.898812294006348, "learning_rate": 2.1495995423340963e-05, "loss": 1.101, "step": 1982 }, { "epoch": 0.57, "grad_norm": 8.413460731506348, "learning_rate": 2.1491704805491992e-05, "loss": 1.1917, "step": 1983 }, { "epoch": 0.57, "grad_norm": 9.989850997924805, "learning_rate": 2.148741418764302e-05, "loss": 1.2358, "step": 1984 }, { "epoch": 0.57, "grad_norm": 7.671233654022217, "learning_rate": 2.148312356979405e-05, "loss": 1.2357, "step": 1985 }, { "epoch": 0.57, "grad_norm": 10.515417098999023, "learning_rate": 2.1478832951945082e-05, "loss": 1.1486, "step": 1986 }, { "epoch": 0.57, "grad_norm": 8.841854095458984, "learning_rate": 2.1474542334096107e-05, "loss": 1.3288, "step": 1987 }, { "epoch": 0.57, "grad_norm": 8.639217376708984, "learning_rate": 2.147025171624714e-05, "loss": 1.1916, "step": 1988 }, { "epoch": 0.57, "grad_norm": 8.984121322631836, "learning_rate": 2.146596109839817e-05, "loss": 1.2535, "step": 1989 }, { "epoch": 0.57, "grad_norm": 8.637347221374512, "learning_rate": 2.14616704805492e-05, "loss": 1.0519, "step": 1990 }, { "epoch": 0.57, "grad_norm": 9.064813613891602, "learning_rate": 2.145737986270023e-05, "loss": 1.2175, "step": 1991 }, { "epoch": 0.57, "grad_norm": 9.587684631347656, "learning_rate": 2.1453089244851258e-05, "loss": 0.9991, "step": 1992 }, { "epoch": 0.57, "grad_norm": 8.530813217163086, "learning_rate": 2.144879862700229e-05, "loss": 0.9494, "step": 1993 }, { "epoch": 0.57, "grad_norm": 8.605618476867676, "learning_rate": 2.144450800915332e-05, "loss": 1.2244, "step": 1994 }, { "epoch": 0.57, "grad_norm": 8.557844161987305, "learning_rate": 2.1440217391304348e-05, "loss": 1.0231, "step": 1995 }, { "epoch": 0.57, "grad_norm": 7.781949043273926, "learning_rate": 2.1435926773455376e-05, "loss": 1.192, "step": 1996 }, { "epoch": 0.57, "grad_norm": 8.588788986206055, "learning_rate": 2.143163615560641e-05, "loss": 1.2064, "step": 1997 }, { "epoch": 0.57, "grad_norm": 10.51740550994873, "learning_rate": 2.1427345537757437e-05, "loss": 1.3746, "step": 1998 }, { "epoch": 0.57, "grad_norm": 10.21289348602295, "learning_rate": 2.1423054919908466e-05, "loss": 1.1652, "step": 1999 }, { "epoch": 0.57, "grad_norm": 7.317837715148926, "learning_rate": 2.1418764302059498e-05, "loss": 1.2338, "step": 2000 }, { "epoch": 0.57, "grad_norm": 9.614209175109863, "learning_rate": 2.1414473684210527e-05, "loss": 1.1308, "step": 2001 }, { "epoch": 0.57, "grad_norm": 8.713343620300293, "learning_rate": 2.1410183066361556e-05, "loss": 1.0924, "step": 2002 }, { "epoch": 0.57, "grad_norm": 9.420449256896973, "learning_rate": 2.1405892448512585e-05, "loss": 1.2524, "step": 2003 }, { "epoch": 0.57, "grad_norm": 10.03909683227539, "learning_rate": 2.1401601830663617e-05, "loss": 1.0411, "step": 2004 }, { "epoch": 0.57, "grad_norm": 10.105528831481934, "learning_rate": 2.1397311212814645e-05, "loss": 1.0933, "step": 2005 }, { "epoch": 0.57, "grad_norm": 11.266928672790527, "learning_rate": 2.1393020594965674e-05, "loss": 1.2867, "step": 2006 }, { "epoch": 0.57, "grad_norm": 8.959481239318848, "learning_rate": 2.1388729977116706e-05, "loss": 0.9824, "step": 2007 }, { "epoch": 0.57, "grad_norm": 11.12039852142334, "learning_rate": 2.1384439359267735e-05, "loss": 1.3624, "step": 2008 }, { "epoch": 0.57, "grad_norm": 9.747989654541016, "learning_rate": 2.1380148741418767e-05, "loss": 0.9588, "step": 2009 }, { "epoch": 0.57, "grad_norm": 9.255857467651367, "learning_rate": 2.1375858123569793e-05, "loss": 1.1861, "step": 2010 }, { "epoch": 0.58, "grad_norm": 9.196959495544434, "learning_rate": 2.1371567505720825e-05, "loss": 1.1194, "step": 2011 }, { "epoch": 0.58, "grad_norm": 7.997098922729492, "learning_rate": 2.1367276887871854e-05, "loss": 1.2389, "step": 2012 }, { "epoch": 0.58, "grad_norm": 8.245403289794922, "learning_rate": 2.1362986270022882e-05, "loss": 1.0143, "step": 2013 }, { "epoch": 0.58, "grad_norm": 9.781437873840332, "learning_rate": 2.1358695652173915e-05, "loss": 1.1881, "step": 2014 }, { "epoch": 0.58, "grad_norm": 9.052122116088867, "learning_rate": 2.1354405034324943e-05, "loss": 0.8558, "step": 2015 }, { "epoch": 0.58, "grad_norm": 8.379697799682617, "learning_rate": 2.1350114416475975e-05, "loss": 1.1805, "step": 2016 }, { "epoch": 0.58, "grad_norm": 8.204866409301758, "learning_rate": 2.1345823798627e-05, "loss": 0.9265, "step": 2017 }, { "epoch": 0.58, "grad_norm": 8.82898235321045, "learning_rate": 2.1341533180778033e-05, "loss": 1.2062, "step": 2018 }, { "epoch": 0.58, "grad_norm": 8.235489845275879, "learning_rate": 2.1337242562929062e-05, "loss": 1.1957, "step": 2019 }, { "epoch": 0.58, "grad_norm": 9.028446197509766, "learning_rate": 2.1332951945080094e-05, "loss": 1.0049, "step": 2020 }, { "epoch": 0.58, "grad_norm": 8.168365478515625, "learning_rate": 2.132866132723112e-05, "loss": 0.9962, "step": 2021 }, { "epoch": 0.58, "grad_norm": 9.710518836975098, "learning_rate": 2.132437070938215e-05, "loss": 1.1069, "step": 2022 }, { "epoch": 0.58, "grad_norm": 9.715204238891602, "learning_rate": 2.1320080091533184e-05, "loss": 1.3578, "step": 2023 }, { "epoch": 0.58, "grad_norm": 9.091570854187012, "learning_rate": 2.1315789473684212e-05, "loss": 0.8876, "step": 2024 }, { "epoch": 0.58, "grad_norm": 9.888350486755371, "learning_rate": 2.131149885583524e-05, "loss": 1.1702, "step": 2025 }, { "epoch": 0.58, "grad_norm": 11.44349479675293, "learning_rate": 2.130720823798627e-05, "loss": 1.715, "step": 2026 }, { "epoch": 0.58, "grad_norm": 9.71235466003418, "learning_rate": 2.1302917620137302e-05, "loss": 1.2066, "step": 2027 }, { "epoch": 0.58, "grad_norm": 11.094636917114258, "learning_rate": 2.1298627002288327e-05, "loss": 0.91, "step": 2028 }, { "epoch": 0.58, "grad_norm": 8.74800968170166, "learning_rate": 2.129433638443936e-05, "loss": 1.0944, "step": 2029 }, { "epoch": 0.58, "grad_norm": 9.274219512939453, "learning_rate": 2.129004576659039e-05, "loss": 0.9759, "step": 2030 }, { "epoch": 0.58, "grad_norm": 10.601633071899414, "learning_rate": 2.128575514874142e-05, "loss": 1.491, "step": 2031 }, { "epoch": 0.58, "grad_norm": 7.937924385070801, "learning_rate": 2.128146453089245e-05, "loss": 1.0133, "step": 2032 }, { "epoch": 0.58, "grad_norm": 7.562763214111328, "learning_rate": 2.1277173913043478e-05, "loss": 0.9443, "step": 2033 }, { "epoch": 0.58, "grad_norm": 10.2804536819458, "learning_rate": 2.127288329519451e-05, "loss": 1.473, "step": 2034 }, { "epoch": 0.58, "grad_norm": 9.761529922485352, "learning_rate": 2.126859267734554e-05, "loss": 1.2093, "step": 2035 }, { "epoch": 0.58, "grad_norm": 8.06948471069336, "learning_rate": 2.1264302059496568e-05, "loss": 1.2422, "step": 2036 }, { "epoch": 0.58, "grad_norm": 7.157027244567871, "learning_rate": 2.1260011441647597e-05, "loss": 0.8158, "step": 2037 }, { "epoch": 0.58, "grad_norm": 9.515137672424316, "learning_rate": 2.125572082379863e-05, "loss": 1.424, "step": 2038 }, { "epoch": 0.58, "grad_norm": 9.041302680969238, "learning_rate": 2.1251430205949654e-05, "loss": 0.908, "step": 2039 }, { "epoch": 0.58, "grad_norm": 9.615650177001953, "learning_rate": 2.1247139588100686e-05, "loss": 1.4213, "step": 2040 }, { "epoch": 0.58, "grad_norm": 9.955256462097168, "learning_rate": 2.124284897025172e-05, "loss": 1.3073, "step": 2041 }, { "epoch": 0.58, "grad_norm": 7.355934143066406, "learning_rate": 2.1238558352402747e-05, "loss": 0.8669, "step": 2042 }, { "epoch": 0.58, "grad_norm": 10.485574722290039, "learning_rate": 2.1234267734553776e-05, "loss": 1.4687, "step": 2043 }, { "epoch": 0.58, "grad_norm": 8.188101768493652, "learning_rate": 2.1229977116704805e-05, "loss": 1.083, "step": 2044 }, { "epoch": 0.58, "grad_norm": 8.055419921875, "learning_rate": 2.1225686498855837e-05, "loss": 1.189, "step": 2045 }, { "epoch": 0.59, "grad_norm": 8.69979190826416, "learning_rate": 2.1221395881006866e-05, "loss": 0.8412, "step": 2046 }, { "epoch": 0.59, "grad_norm": 8.15188217163086, "learning_rate": 2.1217105263157894e-05, "loss": 1.0282, "step": 2047 }, { "epoch": 0.59, "grad_norm": 11.337603569030762, "learning_rate": 2.1212814645308927e-05, "loss": 1.1298, "step": 2048 }, { "epoch": 0.59, "grad_norm": 11.604207038879395, "learning_rate": 2.1208524027459955e-05, "loss": 1.26, "step": 2049 }, { "epoch": 0.59, "grad_norm": 10.798079490661621, "learning_rate": 2.1204233409610984e-05, "loss": 1.3073, "step": 2050 }, { "epoch": 0.59, "grad_norm": 9.075469017028809, "learning_rate": 2.1199942791762013e-05, "loss": 1.1813, "step": 2051 }, { "epoch": 0.59, "grad_norm": 9.19278335571289, "learning_rate": 2.1195652173913045e-05, "loss": 1.0175, "step": 2052 }, { "epoch": 0.59, "grad_norm": 8.641410827636719, "learning_rate": 2.1191361556064074e-05, "loss": 1.0907, "step": 2053 }, { "epoch": 0.59, "grad_norm": 10.665210723876953, "learning_rate": 2.1187070938215103e-05, "loss": 1.5654, "step": 2054 }, { "epoch": 0.59, "grad_norm": 9.677375793457031, "learning_rate": 2.118278032036613e-05, "loss": 1.3304, "step": 2055 }, { "epoch": 0.59, "grad_norm": 10.193497657775879, "learning_rate": 2.1178489702517163e-05, "loss": 1.1239, "step": 2056 }, { "epoch": 0.59, "grad_norm": 10.984728813171387, "learning_rate": 2.1174199084668196e-05, "loss": 1.3757, "step": 2057 }, { "epoch": 0.59, "grad_norm": 9.717297554016113, "learning_rate": 2.116990846681922e-05, "loss": 1.147, "step": 2058 }, { "epoch": 0.59, "grad_norm": 8.830130577087402, "learning_rate": 2.1165617848970253e-05, "loss": 0.9154, "step": 2059 }, { "epoch": 0.59, "grad_norm": 8.209005355834961, "learning_rate": 2.1161327231121282e-05, "loss": 1.4371, "step": 2060 }, { "epoch": 0.59, "grad_norm": 9.334105491638184, "learning_rate": 2.1157036613272314e-05, "loss": 1.055, "step": 2061 }, { "epoch": 0.59, "grad_norm": 10.272869110107422, "learning_rate": 2.115274599542334e-05, "loss": 1.049, "step": 2062 }, { "epoch": 0.59, "grad_norm": 9.7098388671875, "learning_rate": 2.114845537757437e-05, "loss": 1.1033, "step": 2063 }, { "epoch": 0.59, "grad_norm": 8.774618148803711, "learning_rate": 2.11441647597254e-05, "loss": 0.8308, "step": 2064 }, { "epoch": 0.59, "grad_norm": 8.327656745910645, "learning_rate": 2.113987414187643e-05, "loss": 1.0889, "step": 2065 }, { "epoch": 0.59, "grad_norm": 7.887519359588623, "learning_rate": 2.113558352402746e-05, "loss": 1.0136, "step": 2066 }, { "epoch": 0.59, "grad_norm": 8.921797752380371, "learning_rate": 2.113129290617849e-05, "loss": 1.0613, "step": 2067 }, { "epoch": 0.59, "grad_norm": 9.537035942077637, "learning_rate": 2.1127002288329522e-05, "loss": 1.2971, "step": 2068 }, { "epoch": 0.59, "grad_norm": 10.419792175292969, "learning_rate": 2.1122711670480548e-05, "loss": 1.504, "step": 2069 }, { "epoch": 0.59, "grad_norm": 9.048227310180664, "learning_rate": 2.111842105263158e-05, "loss": 1.1806, "step": 2070 }, { "epoch": 0.59, "grad_norm": 10.106603622436523, "learning_rate": 2.111413043478261e-05, "loss": 1.4529, "step": 2071 }, { "epoch": 0.59, "grad_norm": 9.84475040435791, "learning_rate": 2.110983981693364e-05, "loss": 0.9912, "step": 2072 }, { "epoch": 0.59, "grad_norm": 9.732037544250488, "learning_rate": 2.1105549199084666e-05, "loss": 1.0068, "step": 2073 }, { "epoch": 0.59, "grad_norm": 9.757185935974121, "learning_rate": 2.1101258581235698e-05, "loss": 1.2628, "step": 2074 }, { "epoch": 0.59, "grad_norm": 9.87586498260498, "learning_rate": 2.109696796338673e-05, "loss": 1.5249, "step": 2075 }, { "epoch": 0.59, "grad_norm": 9.647394180297852, "learning_rate": 2.109267734553776e-05, "loss": 1.3032, "step": 2076 }, { "epoch": 0.59, "grad_norm": 9.906692504882812, "learning_rate": 2.1088386727688788e-05, "loss": 1.1063, "step": 2077 }, { "epoch": 0.59, "grad_norm": 6.8456926345825195, "learning_rate": 2.1084096109839817e-05, "loss": 0.728, "step": 2078 }, { "epoch": 0.59, "grad_norm": 10.565463066101074, "learning_rate": 2.107980549199085e-05, "loss": 1.2561, "step": 2079 }, { "epoch": 0.59, "grad_norm": 8.672847747802734, "learning_rate": 2.1075514874141874e-05, "loss": 1.2062, "step": 2080 }, { "epoch": 0.6, "grad_norm": 10.219789505004883, "learning_rate": 2.1071224256292906e-05, "loss": 1.1501, "step": 2081 }, { "epoch": 0.6, "grad_norm": 10.40511417388916, "learning_rate": 2.1066933638443935e-05, "loss": 1.0968, "step": 2082 }, { "epoch": 0.6, "grad_norm": 8.463879585266113, "learning_rate": 2.1062643020594967e-05, "loss": 0.9518, "step": 2083 }, { "epoch": 0.6, "grad_norm": 9.839200973510742, "learning_rate": 2.1058352402745996e-05, "loss": 1.1395, "step": 2084 }, { "epoch": 0.6, "grad_norm": 8.488741874694824, "learning_rate": 2.1054061784897025e-05, "loss": 1.0916, "step": 2085 }, { "epoch": 0.6, "grad_norm": 10.752121925354004, "learning_rate": 2.1049771167048057e-05, "loss": 1.1249, "step": 2086 }, { "epoch": 0.6, "grad_norm": 10.362652778625488, "learning_rate": 2.1045480549199086e-05, "loss": 1.4336, "step": 2087 }, { "epoch": 0.6, "grad_norm": 8.308162689208984, "learning_rate": 2.1041189931350115e-05, "loss": 0.9118, "step": 2088 }, { "epoch": 0.6, "grad_norm": 9.194893836975098, "learning_rate": 2.1036899313501143e-05, "loss": 0.9576, "step": 2089 }, { "epoch": 0.6, "grad_norm": 8.937129020690918, "learning_rate": 2.1032608695652175e-05, "loss": 1.1631, "step": 2090 }, { "epoch": 0.6, "grad_norm": 9.854636192321777, "learning_rate": 2.1028318077803204e-05, "loss": 1.2521, "step": 2091 }, { "epoch": 0.6, "grad_norm": 9.050986289978027, "learning_rate": 2.1024027459954233e-05, "loss": 1.081, "step": 2092 }, { "epoch": 0.6, "grad_norm": 9.640084266662598, "learning_rate": 2.1019736842105265e-05, "loss": 1.0249, "step": 2093 }, { "epoch": 0.6, "grad_norm": 10.013104438781738, "learning_rate": 2.1015446224256294e-05, "loss": 0.9533, "step": 2094 }, { "epoch": 0.6, "grad_norm": 10.025662422180176, "learning_rate": 2.1011155606407323e-05, "loss": 1.1224, "step": 2095 }, { "epoch": 0.6, "grad_norm": 8.688165664672852, "learning_rate": 2.100686498855835e-05, "loss": 1.0492, "step": 2096 }, { "epoch": 0.6, "grad_norm": 8.700384140014648, "learning_rate": 2.1002574370709384e-05, "loss": 1.0879, "step": 2097 }, { "epoch": 0.6, "grad_norm": 11.119878768920898, "learning_rate": 2.0998283752860412e-05, "loss": 1.4303, "step": 2098 }, { "epoch": 0.6, "grad_norm": 7.985533714294434, "learning_rate": 2.099399313501144e-05, "loss": 1.0919, "step": 2099 }, { "epoch": 0.6, "grad_norm": 9.751565933227539, "learning_rate": 2.0989702517162473e-05, "loss": 1.2801, "step": 2100 }, { "epoch": 0.6, "grad_norm": 9.29434585571289, "learning_rate": 2.0985411899313502e-05, "loss": 1.0066, "step": 2101 }, { "epoch": 0.6, "grad_norm": 8.124516487121582, "learning_rate": 2.0981121281464534e-05, "loss": 1.2844, "step": 2102 }, { "epoch": 0.6, "grad_norm": 8.318306922912598, "learning_rate": 2.097683066361556e-05, "loss": 1.5056, "step": 2103 }, { "epoch": 0.6, "grad_norm": 8.364325523376465, "learning_rate": 2.0972540045766592e-05, "loss": 1.052, "step": 2104 }, { "epoch": 0.6, "grad_norm": 7.576369285583496, "learning_rate": 2.096824942791762e-05, "loss": 1.0159, "step": 2105 }, { "epoch": 0.6, "grad_norm": 8.611291885375977, "learning_rate": 2.096395881006865e-05, "loss": 1.1869, "step": 2106 }, { "epoch": 0.6, "grad_norm": 7.1158061027526855, "learning_rate": 2.0959668192219678e-05, "loss": 1.0589, "step": 2107 }, { "epoch": 0.6, "grad_norm": 8.34353256225586, "learning_rate": 2.095537757437071e-05, "loss": 1.1575, "step": 2108 }, { "epoch": 0.6, "grad_norm": 7.847663879394531, "learning_rate": 2.0951086956521742e-05, "loss": 1.0464, "step": 2109 }, { "epoch": 0.6, "grad_norm": 9.272878646850586, "learning_rate": 2.0946796338672768e-05, "loss": 1.2564, "step": 2110 }, { "epoch": 0.6, "grad_norm": 10.065635681152344, "learning_rate": 2.09425057208238e-05, "loss": 1.5392, "step": 2111 }, { "epoch": 0.6, "grad_norm": 9.396218299865723, "learning_rate": 2.093821510297483e-05, "loss": 1.1943, "step": 2112 }, { "epoch": 0.6, "grad_norm": 6.927637577056885, "learning_rate": 2.093392448512586e-05, "loss": 0.9002, "step": 2113 }, { "epoch": 0.6, "grad_norm": 9.358610153198242, "learning_rate": 2.0929633867276886e-05, "loss": 1.3487, "step": 2114 }, { "epoch": 0.6, "grad_norm": 8.768491744995117, "learning_rate": 2.092534324942792e-05, "loss": 1.263, "step": 2115 }, { "epoch": 0.61, "grad_norm": 9.752141952514648, "learning_rate": 2.0921052631578947e-05, "loss": 1.1306, "step": 2116 }, { "epoch": 0.61, "grad_norm": 8.661421775817871, "learning_rate": 2.0916762013729976e-05, "loss": 1.0151, "step": 2117 }, { "epoch": 0.61, "grad_norm": 10.647745132446289, "learning_rate": 2.0912471395881008e-05, "loss": 1.0757, "step": 2118 }, { "epoch": 0.61, "grad_norm": 9.90361213684082, "learning_rate": 2.0908180778032037e-05, "loss": 1.1973, "step": 2119 }, { "epoch": 0.61, "grad_norm": 10.638439178466797, "learning_rate": 2.090389016018307e-05, "loss": 1.0734, "step": 2120 }, { "epoch": 0.61, "grad_norm": 10.39708423614502, "learning_rate": 2.0899599542334094e-05, "loss": 1.1697, "step": 2121 }, { "epoch": 0.61, "grad_norm": 11.243378639221191, "learning_rate": 2.0895308924485127e-05, "loss": 1.4658, "step": 2122 }, { "epoch": 0.61, "grad_norm": 10.966980934143066, "learning_rate": 2.0891018306636155e-05, "loss": 1.3194, "step": 2123 }, { "epoch": 0.61, "grad_norm": 8.226202011108398, "learning_rate": 2.0886727688787187e-05, "loss": 1.2252, "step": 2124 }, { "epoch": 0.61, "grad_norm": 7.807674407958984, "learning_rate": 2.0882437070938213e-05, "loss": 0.936, "step": 2125 }, { "epoch": 0.61, "grad_norm": 9.516277313232422, "learning_rate": 2.0878146453089245e-05, "loss": 1.1853, "step": 2126 }, { "epoch": 0.61, "grad_norm": 8.409971237182617, "learning_rate": 2.0873855835240277e-05, "loss": 1.1543, "step": 2127 }, { "epoch": 0.61, "grad_norm": 8.993494987487793, "learning_rate": 2.0869565217391306e-05, "loss": 1.4076, "step": 2128 }, { "epoch": 0.61, "grad_norm": 8.260432243347168, "learning_rate": 2.0865274599542335e-05, "loss": 1.1792, "step": 2129 }, { "epoch": 0.61, "grad_norm": 7.629167556762695, "learning_rate": 2.0860983981693363e-05, "loss": 1.1718, "step": 2130 }, { "epoch": 0.61, "grad_norm": 9.276948928833008, "learning_rate": 2.0856693363844396e-05, "loss": 1.2379, "step": 2131 }, { "epoch": 0.61, "grad_norm": 8.391092300415039, "learning_rate": 2.085240274599542e-05, "loss": 1.0704, "step": 2132 }, { "epoch": 0.61, "grad_norm": 7.370236396789551, "learning_rate": 2.0848112128146453e-05, "loss": 1.2056, "step": 2133 }, { "epoch": 0.61, "grad_norm": 9.195940971374512, "learning_rate": 2.0843821510297485e-05, "loss": 1.1338, "step": 2134 }, { "epoch": 0.61, "grad_norm": 7.95945930480957, "learning_rate": 2.0839530892448514e-05, "loss": 1.0615, "step": 2135 }, { "epoch": 0.61, "grad_norm": 7.3939056396484375, "learning_rate": 2.0835240274599543e-05, "loss": 0.9169, "step": 2136 }, { "epoch": 0.61, "grad_norm": 8.680707931518555, "learning_rate": 2.083094965675057e-05, "loss": 1.3699, "step": 2137 }, { "epoch": 0.61, "grad_norm": 9.372149467468262, "learning_rate": 2.0826659038901604e-05, "loss": 1.2526, "step": 2138 }, { "epoch": 0.61, "grad_norm": 9.262450218200684, "learning_rate": 2.0822368421052633e-05, "loss": 1.2425, "step": 2139 }, { "epoch": 0.61, "grad_norm": 7.113325119018555, "learning_rate": 2.081807780320366e-05, "loss": 0.7992, "step": 2140 }, { "epoch": 0.61, "grad_norm": 9.704121589660645, "learning_rate": 2.081378718535469e-05, "loss": 1.351, "step": 2141 }, { "epoch": 0.61, "grad_norm": 10.085105895996094, "learning_rate": 2.0809496567505722e-05, "loss": 1.2512, "step": 2142 }, { "epoch": 0.61, "grad_norm": 9.638779640197754, "learning_rate": 2.080520594965675e-05, "loss": 1.1392, "step": 2143 }, { "epoch": 0.61, "grad_norm": 9.373291015625, "learning_rate": 2.080091533180778e-05, "loss": 1.312, "step": 2144 }, { "epoch": 0.61, "grad_norm": 8.3704833984375, "learning_rate": 2.0796624713958812e-05, "loss": 1.0496, "step": 2145 }, { "epoch": 0.61, "grad_norm": 8.622896194458008, "learning_rate": 2.079233409610984e-05, "loss": 1.1862, "step": 2146 }, { "epoch": 0.61, "grad_norm": 10.123673439025879, "learning_rate": 2.078804347826087e-05, "loss": 1.1337, "step": 2147 }, { "epoch": 0.61, "grad_norm": 8.257584571838379, "learning_rate": 2.0783752860411898e-05, "loss": 1.0569, "step": 2148 }, { "epoch": 0.61, "grad_norm": 9.453557968139648, "learning_rate": 2.077946224256293e-05, "loss": 1.0547, "step": 2149 }, { "epoch": 0.61, "grad_norm": 11.368425369262695, "learning_rate": 2.077517162471396e-05, "loss": 1.3893, "step": 2150 }, { "epoch": 0.62, "grad_norm": 6.905810356140137, "learning_rate": 2.0770881006864988e-05, "loss": 0.8847, "step": 2151 }, { "epoch": 0.62, "grad_norm": 9.193455696105957, "learning_rate": 2.076659038901602e-05, "loss": 1.1597, "step": 2152 }, { "epoch": 0.62, "grad_norm": 8.975447654724121, "learning_rate": 2.076229977116705e-05, "loss": 1.4165, "step": 2153 }, { "epoch": 0.62, "grad_norm": 8.44798469543457, "learning_rate": 2.075800915331808e-05, "loss": 0.8604, "step": 2154 }, { "epoch": 0.62, "grad_norm": 9.232073783874512, "learning_rate": 2.0753718535469106e-05, "loss": 0.9972, "step": 2155 }, { "epoch": 0.62, "grad_norm": 9.040815353393555, "learning_rate": 2.074942791762014e-05, "loss": 1.0944, "step": 2156 }, { "epoch": 0.62, "grad_norm": 8.155389785766602, "learning_rate": 2.0745137299771167e-05, "loss": 0.9028, "step": 2157 }, { "epoch": 0.62, "grad_norm": 7.844616413116455, "learning_rate": 2.0740846681922196e-05, "loss": 1.0819, "step": 2158 }, { "epoch": 0.62, "grad_norm": 10.479866027832031, "learning_rate": 2.0736556064073225e-05, "loss": 1.247, "step": 2159 }, { "epoch": 0.62, "grad_norm": 9.226712226867676, "learning_rate": 2.0732265446224257e-05, "loss": 1.2943, "step": 2160 }, { "epoch": 0.62, "grad_norm": 10.073928833007812, "learning_rate": 2.072797482837529e-05, "loss": 1.1224, "step": 2161 }, { "epoch": 0.62, "grad_norm": 9.664060592651367, "learning_rate": 2.0723684210526315e-05, "loss": 1.1303, "step": 2162 }, { "epoch": 0.62, "grad_norm": 7.801047325134277, "learning_rate": 2.0719393592677347e-05, "loss": 0.9249, "step": 2163 }, { "epoch": 0.62, "grad_norm": 9.232869148254395, "learning_rate": 2.0715102974828375e-05, "loss": 1.2656, "step": 2164 }, { "epoch": 0.62, "grad_norm": 9.818890571594238, "learning_rate": 2.0710812356979408e-05, "loss": 1.2511, "step": 2165 }, { "epoch": 0.62, "grad_norm": 8.902132987976074, "learning_rate": 2.0706521739130433e-05, "loss": 0.9535, "step": 2166 }, { "epoch": 0.62, "grad_norm": 9.043094635009766, "learning_rate": 2.0702231121281465e-05, "loss": 1.0496, "step": 2167 }, { "epoch": 0.62, "grad_norm": 9.18312931060791, "learning_rate": 2.0697940503432497e-05, "loss": 1.0329, "step": 2168 }, { "epoch": 0.62, "grad_norm": 8.758755683898926, "learning_rate": 2.0693649885583523e-05, "loss": 0.928, "step": 2169 }, { "epoch": 0.62, "grad_norm": 8.533914566040039, "learning_rate": 2.0689359267734555e-05, "loss": 0.9896, "step": 2170 }, { "epoch": 0.62, "grad_norm": 10.245708465576172, "learning_rate": 2.0685068649885584e-05, "loss": 0.9935, "step": 2171 }, { "epoch": 0.62, "grad_norm": 9.271585464477539, "learning_rate": 2.0680778032036616e-05, "loss": 1.0515, "step": 2172 }, { "epoch": 0.62, "grad_norm": 9.259747505187988, "learning_rate": 2.067648741418764e-05, "loss": 1.3014, "step": 2173 }, { "epoch": 0.62, "grad_norm": 8.592206954956055, "learning_rate": 2.0672196796338673e-05, "loss": 1.0706, "step": 2174 }, { "epoch": 0.62, "grad_norm": 8.149291038513184, "learning_rate": 2.0667906178489702e-05, "loss": 1.0591, "step": 2175 }, { "epoch": 0.62, "grad_norm": 9.54188346862793, "learning_rate": 2.0663615560640734e-05, "loss": 1.1857, "step": 2176 }, { "epoch": 0.62, "grad_norm": 8.516733169555664, "learning_rate": 2.0659324942791763e-05, "loss": 1.0539, "step": 2177 }, { "epoch": 0.62, "grad_norm": 8.936399459838867, "learning_rate": 2.0655034324942792e-05, "loss": 1.0294, "step": 2178 }, { "epoch": 0.62, "grad_norm": 8.519046783447266, "learning_rate": 2.0650743707093824e-05, "loss": 0.9475, "step": 2179 }, { "epoch": 0.62, "grad_norm": 9.646597862243652, "learning_rate": 2.0646453089244853e-05, "loss": 1.376, "step": 2180 }, { "epoch": 0.62, "grad_norm": 10.472565650939941, "learning_rate": 2.064216247139588e-05, "loss": 0.9122, "step": 2181 }, { "epoch": 0.62, "grad_norm": 9.792078971862793, "learning_rate": 2.063787185354691e-05, "loss": 1.2285, "step": 2182 }, { "epoch": 0.62, "grad_norm": 9.674057006835938, "learning_rate": 2.0633581235697942e-05, "loss": 1.0658, "step": 2183 }, { "epoch": 0.62, "grad_norm": 8.935258865356445, "learning_rate": 2.0629290617848968e-05, "loss": 0.9981, "step": 2184 }, { "epoch": 0.62, "grad_norm": 8.39003849029541, "learning_rate": 2.0625e-05, "loss": 1.0571, "step": 2185 }, { "epoch": 0.63, "grad_norm": 8.747468948364258, "learning_rate": 2.0620709382151032e-05, "loss": 0.9429, "step": 2186 }, { "epoch": 0.63, "grad_norm": 12.674715042114258, "learning_rate": 2.061641876430206e-05, "loss": 1.3938, "step": 2187 }, { "epoch": 0.63, "grad_norm": 8.458878517150879, "learning_rate": 2.061212814645309e-05, "loss": 1.1022, "step": 2188 }, { "epoch": 0.63, "grad_norm": 8.754960060119629, "learning_rate": 2.060783752860412e-05, "loss": 1.3661, "step": 2189 }, { "epoch": 0.63, "grad_norm": 8.990206718444824, "learning_rate": 2.060354691075515e-05, "loss": 1.08, "step": 2190 }, { "epoch": 0.63, "grad_norm": 9.211421012878418, "learning_rate": 2.059925629290618e-05, "loss": 1.1255, "step": 2191 }, { "epoch": 0.63, "grad_norm": 10.06490707397461, "learning_rate": 2.0594965675057208e-05, "loss": 1.3158, "step": 2192 }, { "epoch": 0.63, "grad_norm": 7.738185405731201, "learning_rate": 2.0590675057208237e-05, "loss": 0.8872, "step": 2193 }, { "epoch": 0.63, "grad_norm": 7.422226905822754, "learning_rate": 2.058638443935927e-05, "loss": 1.1407, "step": 2194 }, { "epoch": 0.63, "grad_norm": 9.108756065368652, "learning_rate": 2.0582093821510298e-05, "loss": 1.0932, "step": 2195 }, { "epoch": 0.63, "grad_norm": 9.866443634033203, "learning_rate": 2.0577803203661326e-05, "loss": 1.3087, "step": 2196 }, { "epoch": 0.63, "grad_norm": 9.107356071472168, "learning_rate": 2.057351258581236e-05, "loss": 1.3113, "step": 2197 }, { "epoch": 0.63, "grad_norm": 9.157082557678223, "learning_rate": 2.0569221967963387e-05, "loss": 1.143, "step": 2198 }, { "epoch": 0.63, "grad_norm": 7.636971473693848, "learning_rate": 2.0564931350114416e-05, "loss": 1.2641, "step": 2199 }, { "epoch": 0.63, "grad_norm": 8.182448387145996, "learning_rate": 2.0560640732265445e-05, "loss": 1.1857, "step": 2200 }, { "epoch": 0.63, "grad_norm": 8.556438446044922, "learning_rate": 2.0556350114416477e-05, "loss": 1.225, "step": 2201 }, { "epoch": 0.63, "grad_norm": 7.437405109405518, "learning_rate": 2.0552059496567506e-05, "loss": 0.8973, "step": 2202 }, { "epoch": 0.63, "grad_norm": 9.594822883605957, "learning_rate": 2.0547768878718535e-05, "loss": 1.1858, "step": 2203 }, { "epoch": 0.63, "grad_norm": 10.339887619018555, "learning_rate": 2.0543478260869567e-05, "loss": 1.3691, "step": 2204 }, { "epoch": 0.63, "grad_norm": 7.668767929077148, "learning_rate": 2.0539187643020596e-05, "loss": 1.1917, "step": 2205 }, { "epoch": 0.63, "grad_norm": 8.312116622924805, "learning_rate": 2.0534897025171628e-05, "loss": 1.2876, "step": 2206 }, { "epoch": 0.63, "grad_norm": 7.3700151443481445, "learning_rate": 2.0530606407322653e-05, "loss": 0.9395, "step": 2207 }, { "epoch": 0.63, "grad_norm": 8.707545280456543, "learning_rate": 2.0526315789473685e-05, "loss": 0.936, "step": 2208 }, { "epoch": 0.63, "grad_norm": 9.36646842956543, "learning_rate": 2.0522025171624714e-05, "loss": 1.0538, "step": 2209 }, { "epoch": 0.63, "grad_norm": 9.574594497680664, "learning_rate": 2.0517734553775743e-05, "loss": 1.2534, "step": 2210 }, { "epoch": 0.63, "grad_norm": 8.412030220031738, "learning_rate": 2.0513443935926775e-05, "loss": 0.9075, "step": 2211 }, { "epoch": 0.63, "grad_norm": 9.844574928283691, "learning_rate": 2.0509153318077804e-05, "loss": 1.0429, "step": 2212 }, { "epoch": 0.63, "grad_norm": 9.063969612121582, "learning_rate": 2.0504862700228836e-05, "loss": 1.1042, "step": 2213 }, { "epoch": 0.63, "grad_norm": 9.998139381408691, "learning_rate": 2.050057208237986e-05, "loss": 1.2145, "step": 2214 }, { "epoch": 0.63, "grad_norm": 8.991023063659668, "learning_rate": 2.0496281464530893e-05, "loss": 1.0048, "step": 2215 }, { "epoch": 0.63, "grad_norm": 9.947741508483887, "learning_rate": 2.0491990846681922e-05, "loss": 1.3416, "step": 2216 }, { "epoch": 0.63, "grad_norm": 8.170355796813965, "learning_rate": 2.0487700228832954e-05, "loss": 0.8943, "step": 2217 }, { "epoch": 0.63, "grad_norm": 8.764482498168945, "learning_rate": 2.048340961098398e-05, "loss": 0.9741, "step": 2218 }, { "epoch": 0.63, "grad_norm": 11.503988265991211, "learning_rate": 2.0479118993135012e-05, "loss": 1.3174, "step": 2219 }, { "epoch": 0.64, "grad_norm": 8.56205940246582, "learning_rate": 2.0474828375286044e-05, "loss": 1.0041, "step": 2220 }, { "epoch": 0.64, "grad_norm": 8.355834007263184, "learning_rate": 2.0470537757437073e-05, "loss": 1.0502, "step": 2221 }, { "epoch": 0.64, "grad_norm": 11.749591827392578, "learning_rate": 2.04662471395881e-05, "loss": 1.0767, "step": 2222 }, { "epoch": 0.64, "grad_norm": 8.654799461364746, "learning_rate": 2.046195652173913e-05, "loss": 1.28, "step": 2223 }, { "epoch": 0.64, "grad_norm": 9.626363754272461, "learning_rate": 2.0457665903890162e-05, "loss": 0.9684, "step": 2224 }, { "epoch": 0.64, "grad_norm": 7.304511070251465, "learning_rate": 2.0453375286041188e-05, "loss": 0.9854, "step": 2225 }, { "epoch": 0.64, "grad_norm": 8.409841537475586, "learning_rate": 2.044908466819222e-05, "loss": 1.0368, "step": 2226 }, { "epoch": 0.64, "grad_norm": 8.896282196044922, "learning_rate": 2.044479405034325e-05, "loss": 1.0266, "step": 2227 }, { "epoch": 0.64, "grad_norm": 9.600053787231445, "learning_rate": 2.044050343249428e-05, "loss": 1.1626, "step": 2228 }, { "epoch": 0.64, "grad_norm": 11.362189292907715, "learning_rate": 2.043621281464531e-05, "loss": 1.2641, "step": 2229 }, { "epoch": 0.64, "grad_norm": 9.800779342651367, "learning_rate": 2.043192219679634e-05, "loss": 1.3698, "step": 2230 }, { "epoch": 0.64, "grad_norm": 9.380990982055664, "learning_rate": 2.042763157894737e-05, "loss": 1.2739, "step": 2231 }, { "epoch": 0.64, "grad_norm": 8.903562545776367, "learning_rate": 2.04233409610984e-05, "loss": 1.1876, "step": 2232 }, { "epoch": 0.64, "grad_norm": 9.238883972167969, "learning_rate": 2.0419050343249428e-05, "loss": 1.1984, "step": 2233 }, { "epoch": 0.64, "grad_norm": 11.166067123413086, "learning_rate": 2.0414759725400457e-05, "loss": 1.215, "step": 2234 }, { "epoch": 0.64, "grad_norm": 8.688907623291016, "learning_rate": 2.041046910755149e-05, "loss": 1.1086, "step": 2235 }, { "epoch": 0.64, "grad_norm": 9.190163612365723, "learning_rate": 2.0406178489702514e-05, "loss": 0.9761, "step": 2236 }, { "epoch": 0.64, "grad_norm": 8.373835563659668, "learning_rate": 2.0401887871853547e-05, "loss": 1.1574, "step": 2237 }, { "epoch": 0.64, "grad_norm": 10.375309944152832, "learning_rate": 2.039759725400458e-05, "loss": 1.4149, "step": 2238 }, { "epoch": 0.64, "grad_norm": 9.415925979614258, "learning_rate": 2.0393306636155608e-05, "loss": 1.238, "step": 2239 }, { "epoch": 0.64, "grad_norm": 9.052910804748535, "learning_rate": 2.0389016018306636e-05, "loss": 1.2935, "step": 2240 }, { "epoch": 0.64, "grad_norm": 10.221535682678223, "learning_rate": 2.0384725400457665e-05, "loss": 1.6285, "step": 2241 }, { "epoch": 0.64, "grad_norm": 9.205775260925293, "learning_rate": 2.0380434782608697e-05, "loss": 1.2095, "step": 2242 }, { "epoch": 0.64, "grad_norm": 8.49085521697998, "learning_rate": 2.0376144164759726e-05, "loss": 1.1456, "step": 2243 }, { "epoch": 0.64, "grad_norm": 10.231178283691406, "learning_rate": 2.0371853546910755e-05, "loss": 0.8561, "step": 2244 }, { "epoch": 0.64, "grad_norm": 8.529679298400879, "learning_rate": 2.0367562929061787e-05, "loss": 0.9702, "step": 2245 }, { "epoch": 0.64, "grad_norm": 9.257425308227539, "learning_rate": 2.0363272311212816e-05, "loss": 1.1129, "step": 2246 }, { "epoch": 0.64, "grad_norm": 8.610989570617676, "learning_rate": 2.0358981693363848e-05, "loss": 1.0532, "step": 2247 }, { "epoch": 0.64, "grad_norm": 8.476094245910645, "learning_rate": 2.0354691075514873e-05, "loss": 0.966, "step": 2248 }, { "epoch": 0.64, "grad_norm": 9.928940773010254, "learning_rate": 2.0350400457665905e-05, "loss": 1.1766, "step": 2249 }, { "epoch": 0.64, "grad_norm": 9.589329719543457, "learning_rate": 2.0346109839816934e-05, "loss": 1.0415, "step": 2250 }, { "epoch": 0.64, "grad_norm": 10.908214569091797, "learning_rate": 2.0341819221967963e-05, "loss": 1.4506, "step": 2251 }, { "epoch": 0.64, "grad_norm": 10.8147554397583, "learning_rate": 2.0337528604118992e-05, "loss": 1.2983, "step": 2252 }, { "epoch": 0.64, "grad_norm": 9.463642120361328, "learning_rate": 2.0333237986270024e-05, "loss": 1.0882, "step": 2253 }, { "epoch": 0.64, "grad_norm": 9.379366874694824, "learning_rate": 2.0328947368421056e-05, "loss": 0.9897, "step": 2254 }, { "epoch": 0.65, "grad_norm": 8.763785362243652, "learning_rate": 2.032465675057208e-05, "loss": 1.098, "step": 2255 }, { "epoch": 0.65, "grad_norm": 9.14033031463623, "learning_rate": 2.0320366132723114e-05, "loss": 1.1363, "step": 2256 }, { "epoch": 0.65, "grad_norm": 9.258912086486816, "learning_rate": 2.0316075514874142e-05, "loss": 1.1787, "step": 2257 }, { "epoch": 0.65, "grad_norm": 9.259736061096191, "learning_rate": 2.0311784897025174e-05, "loss": 1.1174, "step": 2258 }, { "epoch": 0.65, "grad_norm": 8.554972648620605, "learning_rate": 2.03074942791762e-05, "loss": 1.0788, "step": 2259 }, { "epoch": 0.65, "grad_norm": 8.994354248046875, "learning_rate": 2.0303203661327232e-05, "loss": 1.2247, "step": 2260 }, { "epoch": 0.65, "grad_norm": 8.856390953063965, "learning_rate": 2.029891304347826e-05, "loss": 1.203, "step": 2261 }, { "epoch": 0.65, "grad_norm": 8.640615463256836, "learning_rate": 2.029462242562929e-05, "loss": 1.0098, "step": 2262 }, { "epoch": 0.65, "grad_norm": 8.442652702331543, "learning_rate": 2.0290331807780322e-05, "loss": 0.8984, "step": 2263 }, { "epoch": 0.65, "grad_norm": 9.604318618774414, "learning_rate": 2.028604118993135e-05, "loss": 1.2527, "step": 2264 }, { "epoch": 0.65, "grad_norm": 8.059185981750488, "learning_rate": 2.0281750572082383e-05, "loss": 1.1368, "step": 2265 }, { "epoch": 0.65, "grad_norm": 9.449300765991211, "learning_rate": 2.0277459954233408e-05, "loss": 1.3336, "step": 2266 }, { "epoch": 0.65, "grad_norm": 9.19174861907959, "learning_rate": 2.027316933638444e-05, "loss": 0.9381, "step": 2267 }, { "epoch": 0.65, "grad_norm": 7.822282791137695, "learning_rate": 2.026887871853547e-05, "loss": 1.2018, "step": 2268 }, { "epoch": 0.65, "grad_norm": 8.758184432983398, "learning_rate": 2.02645881006865e-05, "loss": 1.0484, "step": 2269 }, { "epoch": 0.65, "grad_norm": 9.32040786743164, "learning_rate": 2.0260297482837526e-05, "loss": 1.2123, "step": 2270 }, { "epoch": 0.65, "grad_norm": 7.464560031890869, "learning_rate": 2.025600686498856e-05, "loss": 1.0035, "step": 2271 }, { "epoch": 0.65, "grad_norm": 10.008631706237793, "learning_rate": 2.025171624713959e-05, "loss": 1.3727, "step": 2272 }, { "epoch": 0.65, "grad_norm": 8.217758178710938, "learning_rate": 2.024742562929062e-05, "loss": 0.9811, "step": 2273 }, { "epoch": 0.65, "grad_norm": 9.442007064819336, "learning_rate": 2.024313501144165e-05, "loss": 1.1444, "step": 2274 }, { "epoch": 0.65, "grad_norm": 8.960583686828613, "learning_rate": 2.0238844393592677e-05, "loss": 1.1571, "step": 2275 }, { "epoch": 0.65, "grad_norm": 8.48130989074707, "learning_rate": 2.023455377574371e-05, "loss": 1.0325, "step": 2276 }, { "epoch": 0.65, "grad_norm": 7.487373352050781, "learning_rate": 2.0230263157894735e-05, "loss": 0.8956, "step": 2277 }, { "epoch": 0.65, "grad_norm": 7.44172477722168, "learning_rate": 2.0225972540045767e-05, "loss": 0.9372, "step": 2278 }, { "epoch": 0.65, "grad_norm": 9.773127555847168, "learning_rate": 2.0221681922196796e-05, "loss": 1.227, "step": 2279 }, { "epoch": 0.65, "grad_norm": 8.245462417602539, "learning_rate": 2.0217391304347828e-05, "loss": 1.1996, "step": 2280 }, { "epoch": 0.65, "grad_norm": 8.477191925048828, "learning_rate": 2.0213100686498856e-05, "loss": 0.8538, "step": 2281 }, { "epoch": 0.65, "grad_norm": 9.498077392578125, "learning_rate": 2.0208810068649885e-05, "loss": 0.8543, "step": 2282 }, { "epoch": 0.65, "grad_norm": 8.986138343811035, "learning_rate": 2.0204519450800917e-05, "loss": 1.061, "step": 2283 }, { "epoch": 0.65, "grad_norm": 8.769342422485352, "learning_rate": 2.0200228832951946e-05, "loss": 1.0352, "step": 2284 }, { "epoch": 0.65, "grad_norm": 9.644377708435059, "learning_rate": 2.0195938215102975e-05, "loss": 1.1557, "step": 2285 }, { "epoch": 0.65, "grad_norm": 11.438213348388672, "learning_rate": 2.0191647597254004e-05, "loss": 1.0977, "step": 2286 }, { "epoch": 0.65, "grad_norm": 9.189215660095215, "learning_rate": 2.0187356979405036e-05, "loss": 0.9457, "step": 2287 }, { "epoch": 0.65, "grad_norm": 6.1353864669799805, "learning_rate": 2.0183066361556065e-05, "loss": 0.4329, "step": 2288 }, { "epoch": 0.65, "grad_norm": 11.20683765411377, "learning_rate": 2.0178775743707093e-05, "loss": 1.3686, "step": 2289 }, { "epoch": 0.66, "grad_norm": 10.71774673461914, "learning_rate": 2.0174485125858126e-05, "loss": 1.1133, "step": 2290 }, { "epoch": 0.66, "grad_norm": 8.056065559387207, "learning_rate": 2.0170194508009154e-05, "loss": 0.834, "step": 2291 }, { "epoch": 0.66, "grad_norm": 11.788326263427734, "learning_rate": 2.0165903890160183e-05, "loss": 1.2508, "step": 2292 }, { "epoch": 0.66, "grad_norm": 9.932804107666016, "learning_rate": 2.0161613272311212e-05, "loss": 1.5201, "step": 2293 }, { "epoch": 0.66, "grad_norm": 8.786749839782715, "learning_rate": 2.0157322654462244e-05, "loss": 1.0267, "step": 2294 }, { "epoch": 0.66, "grad_norm": 9.666580200195312, "learning_rate": 2.0153032036613273e-05, "loss": 1.1708, "step": 2295 }, { "epoch": 0.66, "grad_norm": 8.67428970336914, "learning_rate": 2.01487414187643e-05, "loss": 0.9788, "step": 2296 }, { "epoch": 0.66, "grad_norm": 8.6431303024292, "learning_rate": 2.0144450800915334e-05, "loss": 1.0471, "step": 2297 }, { "epoch": 0.66, "grad_norm": 7.736600399017334, "learning_rate": 2.0140160183066362e-05, "loss": 0.8203, "step": 2298 }, { "epoch": 0.66, "grad_norm": 10.18701457977295, "learning_rate": 2.0135869565217395e-05, "loss": 1.3264, "step": 2299 }, { "epoch": 0.66, "grad_norm": 9.586442947387695, "learning_rate": 2.013157894736842e-05, "loss": 1.2081, "step": 2300 }, { "epoch": 0.66, "grad_norm": 9.979724884033203, "learning_rate": 2.0127288329519452e-05, "loss": 1.2535, "step": 2301 }, { "epoch": 0.66, "grad_norm": 10.856149673461914, "learning_rate": 2.012299771167048e-05, "loss": 1.1634, "step": 2302 }, { "epoch": 0.66, "grad_norm": 8.824821472167969, "learning_rate": 2.011870709382151e-05, "loss": 1.2392, "step": 2303 }, { "epoch": 0.66, "grad_norm": 8.485241889953613, "learning_rate": 2.011441647597254e-05, "loss": 0.8693, "step": 2304 }, { "epoch": 0.66, "grad_norm": 8.472940444946289, "learning_rate": 2.011012585812357e-05, "loss": 0.999, "step": 2305 }, { "epoch": 0.66, "grad_norm": 8.946276664733887, "learning_rate": 2.0105835240274603e-05, "loss": 1.2392, "step": 2306 }, { "epoch": 0.66, "grad_norm": 9.315030097961426, "learning_rate": 2.0101544622425628e-05, "loss": 0.996, "step": 2307 }, { "epoch": 0.66, "grad_norm": 9.195350646972656, "learning_rate": 2.009725400457666e-05, "loss": 1.2306, "step": 2308 }, { "epoch": 0.66, "grad_norm": 9.057229042053223, "learning_rate": 2.009296338672769e-05, "loss": 1.4311, "step": 2309 }, { "epoch": 0.66, "grad_norm": 8.28529167175293, "learning_rate": 2.008867276887872e-05, "loss": 0.9281, "step": 2310 }, { "epoch": 0.66, "grad_norm": 8.697407722473145, "learning_rate": 2.0084382151029747e-05, "loss": 1.0057, "step": 2311 }, { "epoch": 0.66, "grad_norm": 9.405632972717285, "learning_rate": 2.008009153318078e-05, "loss": 1.4378, "step": 2312 }, { "epoch": 0.66, "grad_norm": 8.110221862792969, "learning_rate": 2.0075800915331808e-05, "loss": 1.1495, "step": 2313 }, { "epoch": 0.66, "grad_norm": 7.584765434265137, "learning_rate": 2.0071510297482836e-05, "loss": 0.9162, "step": 2314 }, { "epoch": 0.66, "grad_norm": 8.562564849853516, "learning_rate": 2.006721967963387e-05, "loss": 0.9467, "step": 2315 }, { "epoch": 0.66, "grad_norm": 8.545437812805176, "learning_rate": 2.0062929061784897e-05, "loss": 1.062, "step": 2316 }, { "epoch": 0.66, "grad_norm": 9.241497993469238, "learning_rate": 2.005863844393593e-05, "loss": 1.2093, "step": 2317 }, { "epoch": 0.66, "grad_norm": 10.11286449432373, "learning_rate": 2.0054347826086955e-05, "loss": 1.2354, "step": 2318 }, { "epoch": 0.66, "grad_norm": 9.947376251220703, "learning_rate": 2.0050057208237987e-05, "loss": 1.1555, "step": 2319 }, { "epoch": 0.66, "grad_norm": 8.167878150939941, "learning_rate": 2.0045766590389016e-05, "loss": 0.8684, "step": 2320 }, { "epoch": 0.66, "grad_norm": 8.431657791137695, "learning_rate": 2.0041475972540048e-05, "loss": 1.1477, "step": 2321 }, { "epoch": 0.66, "grad_norm": 9.346592903137207, "learning_rate": 2.0037185354691073e-05, "loss": 1.0756, "step": 2322 }, { "epoch": 0.66, "grad_norm": 9.329963684082031, "learning_rate": 2.0032894736842105e-05, "loss": 0.8836, "step": 2323 }, { "epoch": 0.66, "grad_norm": 10.168521881103516, "learning_rate": 2.0028604118993138e-05, "loss": 1.2817, "step": 2324 }, { "epoch": 0.67, "grad_norm": 8.805412292480469, "learning_rate": 2.0024313501144166e-05, "loss": 0.9593, "step": 2325 }, { "epoch": 0.67, "grad_norm": 10.176719665527344, "learning_rate": 2.0020022883295195e-05, "loss": 0.9256, "step": 2326 }, { "epoch": 0.67, "grad_norm": 10.037216186523438, "learning_rate": 2.0015732265446224e-05, "loss": 1.1016, "step": 2327 }, { "epoch": 0.67, "grad_norm": 8.835389137268066, "learning_rate": 2.0011441647597256e-05, "loss": 1.1988, "step": 2328 }, { "epoch": 0.67, "grad_norm": 7.627316951751709, "learning_rate": 2.000715102974828e-05, "loss": 0.8773, "step": 2329 }, { "epoch": 0.67, "grad_norm": 11.330472946166992, "learning_rate": 2.0002860411899314e-05, "loss": 1.3436, "step": 2330 }, { "epoch": 0.67, "grad_norm": 9.78010082244873, "learning_rate": 1.9998569794050346e-05, "loss": 0.9646, "step": 2331 }, { "epoch": 0.67, "grad_norm": 9.319540023803711, "learning_rate": 1.9994279176201374e-05, "loss": 1.1373, "step": 2332 }, { "epoch": 0.67, "grad_norm": 9.710355758666992, "learning_rate": 1.9989988558352403e-05, "loss": 1.2371, "step": 2333 }, { "epoch": 0.67, "grad_norm": 9.212077140808105, "learning_rate": 1.9985697940503432e-05, "loss": 0.9714, "step": 2334 }, { "epoch": 0.67, "grad_norm": 8.847437858581543, "learning_rate": 1.9981407322654464e-05, "loss": 1.0738, "step": 2335 }, { "epoch": 0.67, "grad_norm": 10.038082122802734, "learning_rate": 1.9977116704805493e-05, "loss": 1.1797, "step": 2336 }, { "epoch": 0.67, "grad_norm": 9.931341171264648, "learning_rate": 1.997282608695652e-05, "loss": 1.0681, "step": 2337 }, { "epoch": 0.67, "grad_norm": 8.347209930419922, "learning_rate": 1.996853546910755e-05, "loss": 1.0247, "step": 2338 }, { "epoch": 0.67, "grad_norm": 8.950143814086914, "learning_rate": 1.9964244851258583e-05, "loss": 1.0472, "step": 2339 }, { "epoch": 0.67, "grad_norm": 8.463066101074219, "learning_rate": 1.995995423340961e-05, "loss": 1.0845, "step": 2340 }, { "epoch": 0.67, "grad_norm": 8.918933868408203, "learning_rate": 1.995566361556064e-05, "loss": 1.355, "step": 2341 }, { "epoch": 0.67, "grad_norm": 9.00894546508789, "learning_rate": 1.9951372997711672e-05, "loss": 0.8562, "step": 2342 }, { "epoch": 0.67, "grad_norm": 10.099607467651367, "learning_rate": 1.99470823798627e-05, "loss": 0.9076, "step": 2343 }, { "epoch": 0.67, "grad_norm": 8.967351913452148, "learning_rate": 1.994279176201373e-05, "loss": 0.95, "step": 2344 }, { "epoch": 0.67, "grad_norm": 8.86395263671875, "learning_rate": 1.993850114416476e-05, "loss": 1.2044, "step": 2345 }, { "epoch": 0.67, "grad_norm": 10.624259948730469, "learning_rate": 1.993421052631579e-05, "loss": 1.4937, "step": 2346 }, { "epoch": 0.67, "grad_norm": 8.641493797302246, "learning_rate": 1.992991990846682e-05, "loss": 1.1664, "step": 2347 }, { "epoch": 0.67, "grad_norm": 9.341193199157715, "learning_rate": 1.9925629290617848e-05, "loss": 1.0786, "step": 2348 }, { "epoch": 0.67, "grad_norm": 8.933123588562012, "learning_rate": 1.992133867276888e-05, "loss": 1.1971, "step": 2349 }, { "epoch": 0.67, "grad_norm": 10.406205177307129, "learning_rate": 1.991704805491991e-05, "loss": 1.466, "step": 2350 }, { "epoch": 0.67, "grad_norm": 7.680570125579834, "learning_rate": 1.991275743707094e-05, "loss": 0.8666, "step": 2351 }, { "epoch": 0.67, "grad_norm": 10.177651405334473, "learning_rate": 1.9908466819221967e-05, "loss": 1.0843, "step": 2352 }, { "epoch": 0.67, "grad_norm": 10.994171142578125, "learning_rate": 1.9904176201373e-05, "loss": 1.3342, "step": 2353 }, { "epoch": 0.67, "grad_norm": 9.462268829345703, "learning_rate": 1.9899885583524028e-05, "loss": 1.1493, "step": 2354 }, { "epoch": 0.67, "grad_norm": 10.827287673950195, "learning_rate": 1.9895594965675056e-05, "loss": 1.3075, "step": 2355 }, { "epoch": 0.67, "grad_norm": 9.858643531799316, "learning_rate": 1.9891304347826085e-05, "loss": 1.3137, "step": 2356 }, { "epoch": 0.67, "grad_norm": 8.442103385925293, "learning_rate": 1.9887013729977117e-05, "loss": 1.0156, "step": 2357 }, { "epoch": 0.67, "grad_norm": 9.429743766784668, "learning_rate": 1.988272311212815e-05, "loss": 1.12, "step": 2358 }, { "epoch": 0.67, "grad_norm": 9.051727294921875, "learning_rate": 1.9878432494279175e-05, "loss": 1.1413, "step": 2359 }, { "epoch": 0.68, "grad_norm": 9.427308082580566, "learning_rate": 1.9874141876430207e-05, "loss": 1.2045, "step": 2360 }, { "epoch": 0.68, "grad_norm": 9.644721984863281, "learning_rate": 1.9869851258581236e-05, "loss": 1.4366, "step": 2361 }, { "epoch": 0.68, "grad_norm": 7.993911266326904, "learning_rate": 1.9865560640732268e-05, "loss": 1.0485, "step": 2362 }, { "epoch": 0.68, "grad_norm": 9.51898193359375, "learning_rate": 1.9861270022883293e-05, "loss": 1.2335, "step": 2363 }, { "epoch": 0.68, "grad_norm": 8.08745002746582, "learning_rate": 1.9856979405034326e-05, "loss": 1.0672, "step": 2364 }, { "epoch": 0.68, "grad_norm": 9.468375205993652, "learning_rate": 1.9852688787185358e-05, "loss": 1.0735, "step": 2365 }, { "epoch": 0.68, "grad_norm": 7.964073657989502, "learning_rate": 1.9848398169336383e-05, "loss": 1.038, "step": 2366 }, { "epoch": 0.68, "grad_norm": 8.643996238708496, "learning_rate": 1.9844107551487415e-05, "loss": 1.1209, "step": 2367 }, { "epoch": 0.68, "grad_norm": 9.548465728759766, "learning_rate": 1.9839816933638444e-05, "loss": 1.2388, "step": 2368 }, { "epoch": 0.68, "grad_norm": 10.564348220825195, "learning_rate": 1.9835526315789476e-05, "loss": 1.4411, "step": 2369 }, { "epoch": 0.68, "grad_norm": 9.36286735534668, "learning_rate": 1.98312356979405e-05, "loss": 1.1672, "step": 2370 }, { "epoch": 0.68, "grad_norm": 9.827733039855957, "learning_rate": 1.9826945080091534e-05, "loss": 1.4856, "step": 2371 }, { "epoch": 0.68, "grad_norm": 8.963473320007324, "learning_rate": 1.9822654462242562e-05, "loss": 0.9815, "step": 2372 }, { "epoch": 0.68, "grad_norm": 8.298812866210938, "learning_rate": 1.9818363844393595e-05, "loss": 1.0033, "step": 2373 }, { "epoch": 0.68, "grad_norm": 7.578042984008789, "learning_rate": 1.9814073226544623e-05, "loss": 1.0352, "step": 2374 }, { "epoch": 0.68, "grad_norm": 10.863141059875488, "learning_rate": 1.9809782608695652e-05, "loss": 1.0734, "step": 2375 }, { "epoch": 0.68, "grad_norm": 10.756592750549316, "learning_rate": 1.9805491990846684e-05, "loss": 1.3603, "step": 2376 }, { "epoch": 0.68, "grad_norm": 8.858885765075684, "learning_rate": 1.9801201372997713e-05, "loss": 0.6397, "step": 2377 }, { "epoch": 0.68, "grad_norm": 8.354283332824707, "learning_rate": 1.9796910755148742e-05, "loss": 1.2313, "step": 2378 }, { "epoch": 0.68, "grad_norm": 10.650423049926758, "learning_rate": 1.979262013729977e-05, "loss": 1.2583, "step": 2379 }, { "epoch": 0.68, "grad_norm": 9.179424285888672, "learning_rate": 1.9788329519450803e-05, "loss": 1.1419, "step": 2380 }, { "epoch": 0.68, "grad_norm": 9.642839431762695, "learning_rate": 1.9784038901601828e-05, "loss": 1.2686, "step": 2381 }, { "epoch": 0.68, "grad_norm": 7.6793646812438965, "learning_rate": 1.977974828375286e-05, "loss": 1.0472, "step": 2382 }, { "epoch": 0.68, "grad_norm": 9.839323043823242, "learning_rate": 1.9775457665903892e-05, "loss": 1.3391, "step": 2383 }, { "epoch": 0.68, "grad_norm": 8.746617317199707, "learning_rate": 1.977116704805492e-05, "loss": 1.2077, "step": 2384 }, { "epoch": 0.68, "grad_norm": 8.517253875732422, "learning_rate": 1.976687643020595e-05, "loss": 0.8728, "step": 2385 }, { "epoch": 0.68, "grad_norm": 9.364797592163086, "learning_rate": 1.976258581235698e-05, "loss": 1.2451, "step": 2386 }, { "epoch": 0.68, "grad_norm": 9.547454833984375, "learning_rate": 1.975829519450801e-05, "loss": 1.151, "step": 2387 }, { "epoch": 0.68, "grad_norm": 7.6331095695495605, "learning_rate": 1.975400457665904e-05, "loss": 0.9652, "step": 2388 }, { "epoch": 0.68, "grad_norm": 11.37586784362793, "learning_rate": 1.974971395881007e-05, "loss": 1.3211, "step": 2389 }, { "epoch": 0.68, "grad_norm": 11.770246505737305, "learning_rate": 1.9745423340961097e-05, "loss": 1.4752, "step": 2390 }, { "epoch": 0.68, "grad_norm": 9.619356155395508, "learning_rate": 1.974113272311213e-05, "loss": 1.4053, "step": 2391 }, { "epoch": 0.68, "grad_norm": 9.411015510559082, "learning_rate": 1.9736842105263158e-05, "loss": 1.1664, "step": 2392 }, { "epoch": 0.68, "grad_norm": 9.58008098602295, "learning_rate": 1.9732551487414187e-05, "loss": 1.1125, "step": 2393 }, { "epoch": 0.68, "grad_norm": 8.968585014343262, "learning_rate": 1.972826086956522e-05, "loss": 1.1154, "step": 2394 }, { "epoch": 0.69, "grad_norm": 8.26370620727539, "learning_rate": 1.9723970251716248e-05, "loss": 1.1691, "step": 2395 }, { "epoch": 0.69, "grad_norm": 7.678633689880371, "learning_rate": 1.9719679633867277e-05, "loss": 0.8724, "step": 2396 }, { "epoch": 0.69, "grad_norm": 8.350838661193848, "learning_rate": 1.9715389016018305e-05, "loss": 0.9113, "step": 2397 }, { "epoch": 0.69, "grad_norm": 8.200567245483398, "learning_rate": 1.9711098398169338e-05, "loss": 0.8809, "step": 2398 }, { "epoch": 0.69, "grad_norm": 9.043484687805176, "learning_rate": 1.9706807780320366e-05, "loss": 1.0817, "step": 2399 }, { "epoch": 0.69, "grad_norm": 8.50027084350586, "learning_rate": 1.9702517162471395e-05, "loss": 0.8939, "step": 2400 }, { "epoch": 0.69, "grad_norm": 9.82008171081543, "learning_rate": 1.9698226544622427e-05, "loss": 1.2579, "step": 2401 }, { "epoch": 0.69, "grad_norm": 10.053723335266113, "learning_rate": 1.9693935926773456e-05, "loss": 1.164, "step": 2402 }, { "epoch": 0.69, "grad_norm": 9.12060260772705, "learning_rate": 1.9689645308924488e-05, "loss": 0.9384, "step": 2403 }, { "epoch": 0.69, "grad_norm": 9.901167869567871, "learning_rate": 1.9685354691075514e-05, "loss": 1.109, "step": 2404 }, { "epoch": 0.69, "grad_norm": 9.169709205627441, "learning_rate": 1.9681064073226546e-05, "loss": 1.381, "step": 2405 }, { "epoch": 0.69, "grad_norm": 9.790369033813477, "learning_rate": 1.9676773455377574e-05, "loss": 1.131, "step": 2406 }, { "epoch": 0.69, "grad_norm": 9.024384498596191, "learning_rate": 1.9672482837528603e-05, "loss": 1.0487, "step": 2407 }, { "epoch": 0.69, "grad_norm": 8.600336074829102, "learning_rate": 1.9668192219679635e-05, "loss": 1.0277, "step": 2408 }, { "epoch": 0.69, "grad_norm": 11.553108215332031, "learning_rate": 1.9663901601830664e-05, "loss": 1.0997, "step": 2409 }, { "epoch": 0.69, "grad_norm": 8.540474891662598, "learning_rate": 1.9659610983981696e-05, "loss": 1.004, "step": 2410 }, { "epoch": 0.69, "grad_norm": 10.915793418884277, "learning_rate": 1.965532036613272e-05, "loss": 1.3838, "step": 2411 }, { "epoch": 0.69, "grad_norm": 8.809085845947266, "learning_rate": 1.9651029748283754e-05, "loss": 1.1397, "step": 2412 }, { "epoch": 0.69, "grad_norm": 10.364714622497559, "learning_rate": 1.9646739130434783e-05, "loss": 1.2425, "step": 2413 }, { "epoch": 0.69, "grad_norm": 9.270904541015625, "learning_rate": 1.9642448512585815e-05, "loss": 1.0459, "step": 2414 }, { "epoch": 0.69, "grad_norm": 9.43685245513916, "learning_rate": 1.963815789473684e-05, "loss": 1.1709, "step": 2415 }, { "epoch": 0.69, "grad_norm": 8.97892951965332, "learning_rate": 1.9633867276887872e-05, "loss": 0.9688, "step": 2416 }, { "epoch": 0.69, "grad_norm": 9.369361877441406, "learning_rate": 1.9629576659038904e-05, "loss": 1.4883, "step": 2417 }, { "epoch": 0.69, "grad_norm": 10.247814178466797, "learning_rate": 1.9625286041189933e-05, "loss": 1.2236, "step": 2418 }, { "epoch": 0.69, "grad_norm": 10.082172393798828, "learning_rate": 1.9620995423340962e-05, "loss": 1.2265, "step": 2419 }, { "epoch": 0.69, "grad_norm": 9.495353698730469, "learning_rate": 1.961670480549199e-05, "loss": 1.4157, "step": 2420 }, { "epoch": 0.69, "grad_norm": 9.695307731628418, "learning_rate": 1.9612414187643023e-05, "loss": 1.0561, "step": 2421 }, { "epoch": 0.69, "grad_norm": 9.251786231994629, "learning_rate": 1.9608123569794048e-05, "loss": 1.1198, "step": 2422 }, { "epoch": 0.69, "grad_norm": 7.915933132171631, "learning_rate": 1.960383295194508e-05, "loss": 1.0593, "step": 2423 }, { "epoch": 0.69, "grad_norm": 8.137659072875977, "learning_rate": 1.959954233409611e-05, "loss": 1.0115, "step": 2424 }, { "epoch": 0.69, "grad_norm": 8.613964080810547, "learning_rate": 1.959525171624714e-05, "loss": 1.2495, "step": 2425 }, { "epoch": 0.69, "grad_norm": 9.045031547546387, "learning_rate": 1.959096109839817e-05, "loss": 1.1832, "step": 2426 }, { "epoch": 0.69, "grad_norm": 7.762406349182129, "learning_rate": 1.95866704805492e-05, "loss": 1.0719, "step": 2427 }, { "epoch": 0.69, "grad_norm": 9.389142990112305, "learning_rate": 1.958237986270023e-05, "loss": 1.2699, "step": 2428 }, { "epoch": 0.69, "grad_norm": 9.382674217224121, "learning_rate": 1.957808924485126e-05, "loss": 1.2774, "step": 2429 }, { "epoch": 0.7, "grad_norm": 9.801231384277344, "learning_rate": 1.957379862700229e-05, "loss": 1.0567, "step": 2430 }, { "epoch": 0.7, "grad_norm": 9.806095123291016, "learning_rate": 1.9569508009153317e-05, "loss": 1.3174, "step": 2431 }, { "epoch": 0.7, "grad_norm": 8.43411922454834, "learning_rate": 1.956521739130435e-05, "loss": 0.9739, "step": 2432 }, { "epoch": 0.7, "grad_norm": 8.108588218688965, "learning_rate": 1.9560926773455375e-05, "loss": 0.9199, "step": 2433 }, { "epoch": 0.7, "grad_norm": 8.779991149902344, "learning_rate": 1.9556636155606407e-05, "loss": 0.967, "step": 2434 }, { "epoch": 0.7, "grad_norm": 9.319437026977539, "learning_rate": 1.955234553775744e-05, "loss": 1.3314, "step": 2435 }, { "epoch": 0.7, "grad_norm": 8.56344223022461, "learning_rate": 1.9548054919908468e-05, "loss": 1.0371, "step": 2436 }, { "epoch": 0.7, "grad_norm": 10.121199607849121, "learning_rate": 1.9543764302059497e-05, "loss": 1.1355, "step": 2437 }, { "epoch": 0.7, "grad_norm": 8.621366500854492, "learning_rate": 1.9539473684210525e-05, "loss": 1.1594, "step": 2438 }, { "epoch": 0.7, "grad_norm": 10.054593086242676, "learning_rate": 1.9535183066361558e-05, "loss": 1.084, "step": 2439 }, { "epoch": 0.7, "grad_norm": 9.424495697021484, "learning_rate": 1.9530892448512586e-05, "loss": 1.0958, "step": 2440 }, { "epoch": 0.7, "grad_norm": 8.96681022644043, "learning_rate": 1.9526601830663615e-05, "loss": 1.1376, "step": 2441 }, { "epoch": 0.7, "grad_norm": 8.197810173034668, "learning_rate": 1.9522311212814644e-05, "loss": 1.1704, "step": 2442 }, { "epoch": 0.7, "grad_norm": 8.515543937683105, "learning_rate": 1.9518020594965676e-05, "loss": 0.9871, "step": 2443 }, { "epoch": 0.7, "grad_norm": 9.73293399810791, "learning_rate": 1.9513729977116708e-05, "loss": 1.22, "step": 2444 }, { "epoch": 0.7, "grad_norm": 7.523548603057861, "learning_rate": 1.9509439359267734e-05, "loss": 0.8579, "step": 2445 }, { "epoch": 0.7, "grad_norm": 8.210893630981445, "learning_rate": 1.9505148741418766e-05, "loss": 1.0666, "step": 2446 }, { "epoch": 0.7, "grad_norm": 7.820652008056641, "learning_rate": 1.9500858123569795e-05, "loss": 1.2288, "step": 2447 }, { "epoch": 0.7, "grad_norm": 7.922313213348389, "learning_rate": 1.9496567505720823e-05, "loss": 1.1147, "step": 2448 }, { "epoch": 0.7, "grad_norm": 9.569446563720703, "learning_rate": 1.9492276887871852e-05, "loss": 1.1903, "step": 2449 }, { "epoch": 0.7, "grad_norm": 8.225363731384277, "learning_rate": 1.9487986270022884e-05, "loss": 1.2157, "step": 2450 }, { "epoch": 0.7, "grad_norm": 9.372381210327148, "learning_rate": 1.9483695652173916e-05, "loss": 1.2702, "step": 2451 }, { "epoch": 0.7, "grad_norm": 9.420974731445312, "learning_rate": 1.9479405034324942e-05, "loss": 1.4587, "step": 2452 }, { "epoch": 0.7, "grad_norm": 8.934778213500977, "learning_rate": 1.9475114416475974e-05, "loss": 1.4302, "step": 2453 }, { "epoch": 0.7, "grad_norm": 8.482074737548828, "learning_rate": 1.9470823798627003e-05, "loss": 1.1006, "step": 2454 }, { "epoch": 0.7, "grad_norm": 7.829884052276611, "learning_rate": 1.9466533180778035e-05, "loss": 1.1241, "step": 2455 }, { "epoch": 0.7, "grad_norm": 10.12347412109375, "learning_rate": 1.946224256292906e-05, "loss": 0.8159, "step": 2456 }, { "epoch": 0.7, "grad_norm": 9.303497314453125, "learning_rate": 1.9457951945080092e-05, "loss": 1.3549, "step": 2457 }, { "epoch": 0.7, "grad_norm": 8.074723243713379, "learning_rate": 1.945366132723112e-05, "loss": 0.9796, "step": 2458 }, { "epoch": 0.7, "grad_norm": 9.593305587768555, "learning_rate": 1.944937070938215e-05, "loss": 1.0156, "step": 2459 }, { "epoch": 0.7, "grad_norm": 8.930562019348145, "learning_rate": 1.9445080091533182e-05, "loss": 0.9765, "step": 2460 }, { "epoch": 0.7, "grad_norm": 9.386600494384766, "learning_rate": 1.944078947368421e-05, "loss": 1.1518, "step": 2461 }, { "epoch": 0.7, "grad_norm": 10.876976013183594, "learning_rate": 1.9436498855835243e-05, "loss": 1.2208, "step": 2462 }, { "epoch": 0.7, "grad_norm": 8.179274559020996, "learning_rate": 1.943220823798627e-05, "loss": 0.8473, "step": 2463 }, { "epoch": 0.7, "grad_norm": 8.907797813415527, "learning_rate": 1.94279176201373e-05, "loss": 1.1434, "step": 2464 }, { "epoch": 0.71, "grad_norm": 12.280512809753418, "learning_rate": 1.942362700228833e-05, "loss": 1.4368, "step": 2465 }, { "epoch": 0.71, "grad_norm": 8.636651992797852, "learning_rate": 1.941933638443936e-05, "loss": 0.9767, "step": 2466 }, { "epoch": 0.71, "grad_norm": 9.658784866333008, "learning_rate": 1.9415045766590387e-05, "loss": 0.9644, "step": 2467 }, { "epoch": 0.71, "grad_norm": 9.097394943237305, "learning_rate": 1.941075514874142e-05, "loss": 0.9025, "step": 2468 }, { "epoch": 0.71, "grad_norm": 11.89475154876709, "learning_rate": 1.940646453089245e-05, "loss": 1.2886, "step": 2469 }, { "epoch": 0.71, "grad_norm": 8.268407821655273, "learning_rate": 1.940217391304348e-05, "loss": 0.9651, "step": 2470 }, { "epoch": 0.71, "grad_norm": 9.751604080200195, "learning_rate": 1.939788329519451e-05, "loss": 1.0763, "step": 2471 }, { "epoch": 0.71, "grad_norm": 9.457395553588867, "learning_rate": 1.9393592677345537e-05, "loss": 0.9043, "step": 2472 }, { "epoch": 0.71, "grad_norm": 9.714011192321777, "learning_rate": 1.938930205949657e-05, "loss": 0.9692, "step": 2473 }, { "epoch": 0.71, "grad_norm": 8.668814659118652, "learning_rate": 1.9385011441647595e-05, "loss": 1.0284, "step": 2474 }, { "epoch": 0.71, "grad_norm": 13.269686698913574, "learning_rate": 1.9380720823798627e-05, "loss": 1.5412, "step": 2475 }, { "epoch": 0.71, "grad_norm": 8.09195327758789, "learning_rate": 1.9376430205949656e-05, "loss": 0.8828, "step": 2476 }, { "epoch": 0.71, "grad_norm": 8.99887752532959, "learning_rate": 1.9372139588100688e-05, "loss": 1.1266, "step": 2477 }, { "epoch": 0.71, "grad_norm": 9.050641059875488, "learning_rate": 1.9367848970251717e-05, "loss": 0.8672, "step": 2478 }, { "epoch": 0.71, "grad_norm": 9.21500301361084, "learning_rate": 1.9363558352402746e-05, "loss": 1.0729, "step": 2479 }, { "epoch": 0.71, "grad_norm": 9.102481842041016, "learning_rate": 1.9359267734553778e-05, "loss": 1.1911, "step": 2480 }, { "epoch": 0.71, "grad_norm": 8.278970718383789, "learning_rate": 1.9354977116704807e-05, "loss": 0.7449, "step": 2481 }, { "epoch": 0.71, "grad_norm": 8.626091003417969, "learning_rate": 1.9350686498855835e-05, "loss": 1.1365, "step": 2482 }, { "epoch": 0.71, "grad_norm": 9.949851989746094, "learning_rate": 1.9346395881006864e-05, "loss": 1.0653, "step": 2483 }, { "epoch": 0.71, "grad_norm": 8.16932201385498, "learning_rate": 1.9342105263157896e-05, "loss": 0.9872, "step": 2484 }, { "epoch": 0.71, "grad_norm": 8.62385368347168, "learning_rate": 1.9337814645308925e-05, "loss": 1.0783, "step": 2485 }, { "epoch": 0.71, "grad_norm": 9.550857543945312, "learning_rate": 1.9333524027459954e-05, "loss": 1.2684, "step": 2486 }, { "epoch": 0.71, "grad_norm": 9.422028541564941, "learning_rate": 1.9329233409610986e-05, "loss": 0.9019, "step": 2487 }, { "epoch": 0.71, "grad_norm": 9.581709861755371, "learning_rate": 1.9324942791762015e-05, "loss": 1.0272, "step": 2488 }, { "epoch": 0.71, "grad_norm": 8.081588745117188, "learning_rate": 1.9320652173913043e-05, "loss": 1.0151, "step": 2489 }, { "epoch": 0.71, "grad_norm": 9.836494445800781, "learning_rate": 1.9316361556064072e-05, "loss": 0.8711, "step": 2490 }, { "epoch": 0.71, "grad_norm": 8.66437816619873, "learning_rate": 1.9312070938215104e-05, "loss": 0.9395, "step": 2491 }, { "epoch": 0.71, "grad_norm": 8.79299259185791, "learning_rate": 1.9307780320366133e-05, "loss": 0.7948, "step": 2492 }, { "epoch": 0.71, "grad_norm": 8.855372428894043, "learning_rate": 1.9303489702517162e-05, "loss": 1.1007, "step": 2493 }, { "epoch": 0.71, "grad_norm": 9.21322250366211, "learning_rate": 1.9299199084668194e-05, "loss": 0.8631, "step": 2494 }, { "epoch": 0.71, "grad_norm": 10.277508735656738, "learning_rate": 1.9294908466819223e-05, "loss": 1.1876, "step": 2495 }, { "epoch": 0.71, "grad_norm": 9.104170799255371, "learning_rate": 1.9290617848970255e-05, "loss": 1.0572, "step": 2496 }, { "epoch": 0.71, "grad_norm": 13.10643196105957, "learning_rate": 1.928632723112128e-05, "loss": 1.5775, "step": 2497 }, { "epoch": 0.71, "grad_norm": 9.110145568847656, "learning_rate": 1.9282036613272313e-05, "loss": 0.9785, "step": 2498 }, { "epoch": 0.71, "grad_norm": 7.77860164642334, "learning_rate": 1.927774599542334e-05, "loss": 0.939, "step": 2499 }, { "epoch": 0.72, "grad_norm": 10.14551830291748, "learning_rate": 1.927345537757437e-05, "loss": 1.3206, "step": 2500 }, { "epoch": 0.72, "grad_norm": 12.676076889038086, "learning_rate": 1.92691647597254e-05, "loss": 1.3357, "step": 2501 }, { "epoch": 0.72, "grad_norm": 9.927719116210938, "learning_rate": 1.926487414187643e-05, "loss": 1.3232, "step": 2502 }, { "epoch": 0.72, "grad_norm": 8.037900924682617, "learning_rate": 1.9260583524027463e-05, "loss": 1.0347, "step": 2503 }, { "epoch": 0.72, "grad_norm": 8.319453239440918, "learning_rate": 1.925629290617849e-05, "loss": 1.0187, "step": 2504 }, { "epoch": 0.72, "grad_norm": 9.605573654174805, "learning_rate": 1.925200228832952e-05, "loss": 1.2331, "step": 2505 }, { "epoch": 0.72, "grad_norm": 9.671036720275879, "learning_rate": 1.924771167048055e-05, "loss": 1.2035, "step": 2506 }, { "epoch": 0.72, "grad_norm": 7.830979824066162, "learning_rate": 1.924342105263158e-05, "loss": 0.9673, "step": 2507 }, { "epoch": 0.72, "grad_norm": 10.420891761779785, "learning_rate": 1.9239130434782607e-05, "loss": 1.2931, "step": 2508 }, { "epoch": 0.72, "grad_norm": 8.493072509765625, "learning_rate": 1.923483981693364e-05, "loss": 1.0137, "step": 2509 }, { "epoch": 0.72, "grad_norm": 9.534939765930176, "learning_rate": 1.9230549199084668e-05, "loss": 0.9068, "step": 2510 }, { "epoch": 0.72, "grad_norm": 8.904736518859863, "learning_rate": 1.9226258581235697e-05, "loss": 1.1091, "step": 2511 }, { "epoch": 0.72, "grad_norm": 8.013572692871094, "learning_rate": 1.922196796338673e-05, "loss": 0.912, "step": 2512 }, { "epoch": 0.72, "grad_norm": 9.072867393493652, "learning_rate": 1.9217677345537758e-05, "loss": 1.2602, "step": 2513 }, { "epoch": 0.72, "grad_norm": 9.154093742370605, "learning_rate": 1.921338672768879e-05, "loss": 1.0201, "step": 2514 }, { "epoch": 0.72, "grad_norm": 8.744938850402832, "learning_rate": 1.9209096109839815e-05, "loss": 1.1458, "step": 2515 }, { "epoch": 0.72, "grad_norm": 9.348825454711914, "learning_rate": 1.9204805491990847e-05, "loss": 1.0297, "step": 2516 }, { "epoch": 0.72, "grad_norm": 9.790005683898926, "learning_rate": 1.9200514874141876e-05, "loss": 1.063, "step": 2517 }, { "epoch": 0.72, "grad_norm": 11.721095085144043, "learning_rate": 1.9196224256292908e-05, "loss": 1.13, "step": 2518 }, { "epoch": 0.72, "grad_norm": 9.300861358642578, "learning_rate": 1.9191933638443934e-05, "loss": 0.8934, "step": 2519 }, { "epoch": 0.72, "grad_norm": 8.523219108581543, "learning_rate": 1.9187643020594966e-05, "loss": 0.9684, "step": 2520 }, { "epoch": 0.72, "grad_norm": 9.683642387390137, "learning_rate": 1.9183352402745998e-05, "loss": 1.0538, "step": 2521 }, { "epoch": 0.72, "grad_norm": 8.01683235168457, "learning_rate": 1.9179061784897027e-05, "loss": 0.9628, "step": 2522 }, { "epoch": 0.72, "grad_norm": 7.2914652824401855, "learning_rate": 1.9174771167048055e-05, "loss": 0.7033, "step": 2523 }, { "epoch": 0.72, "grad_norm": 9.02861499786377, "learning_rate": 1.9170480549199084e-05, "loss": 0.8338, "step": 2524 }, { "epoch": 0.72, "grad_norm": 10.126641273498535, "learning_rate": 1.9166189931350116e-05, "loss": 1.1902, "step": 2525 }, { "epoch": 0.72, "grad_norm": 9.803444862365723, "learning_rate": 1.9161899313501142e-05, "loss": 1.1011, "step": 2526 }, { "epoch": 0.72, "grad_norm": 8.839347839355469, "learning_rate": 1.9157608695652174e-05, "loss": 1.0522, "step": 2527 }, { "epoch": 0.72, "grad_norm": 9.325223922729492, "learning_rate": 1.9153318077803206e-05, "loss": 1.219, "step": 2528 }, { "epoch": 0.72, "grad_norm": 8.535804748535156, "learning_rate": 1.9149027459954235e-05, "loss": 0.9931, "step": 2529 }, { "epoch": 0.72, "grad_norm": 8.813097953796387, "learning_rate": 1.9144736842105264e-05, "loss": 0.8771, "step": 2530 }, { "epoch": 0.72, "grad_norm": 9.58659839630127, "learning_rate": 1.9140446224256292e-05, "loss": 1.0303, "step": 2531 }, { "epoch": 0.72, "grad_norm": 9.635456085205078, "learning_rate": 1.9136155606407325e-05, "loss": 1.0285, "step": 2532 }, { "epoch": 0.72, "grad_norm": 9.991578102111816, "learning_rate": 1.9131864988558353e-05, "loss": 1.0554, "step": 2533 }, { "epoch": 0.72, "grad_norm": 8.071062088012695, "learning_rate": 1.9127574370709382e-05, "loss": 1.0673, "step": 2534 }, { "epoch": 0.73, "grad_norm": 8.62812328338623, "learning_rate": 1.912328375286041e-05, "loss": 1.0919, "step": 2535 }, { "epoch": 0.73, "grad_norm": 7.468624114990234, "learning_rate": 1.9118993135011443e-05, "loss": 0.8146, "step": 2536 }, { "epoch": 0.73, "grad_norm": 9.206171035766602, "learning_rate": 1.9114702517162472e-05, "loss": 1.1295, "step": 2537 }, { "epoch": 0.73, "grad_norm": 8.525646209716797, "learning_rate": 1.91104118993135e-05, "loss": 1.3128, "step": 2538 }, { "epoch": 0.73, "grad_norm": 8.496814727783203, "learning_rate": 1.9106121281464533e-05, "loss": 1.0191, "step": 2539 }, { "epoch": 0.73, "grad_norm": 10.425469398498535, "learning_rate": 1.910183066361556e-05, "loss": 1.268, "step": 2540 }, { "epoch": 0.73, "grad_norm": 7.101430416107178, "learning_rate": 1.909754004576659e-05, "loss": 0.8288, "step": 2541 }, { "epoch": 0.73, "grad_norm": 10.26478385925293, "learning_rate": 1.909324942791762e-05, "loss": 0.9117, "step": 2542 }, { "epoch": 0.73, "grad_norm": 9.921600341796875, "learning_rate": 1.908895881006865e-05, "loss": 1.3147, "step": 2543 }, { "epoch": 0.73, "grad_norm": 8.800148963928223, "learning_rate": 1.908466819221968e-05, "loss": 1.1436, "step": 2544 }, { "epoch": 0.73, "grad_norm": 8.311388969421387, "learning_rate": 1.908037757437071e-05, "loss": 0.9736, "step": 2545 }, { "epoch": 0.73, "grad_norm": 9.316076278686523, "learning_rate": 1.907608695652174e-05, "loss": 0.9719, "step": 2546 }, { "epoch": 0.73, "grad_norm": 9.000842094421387, "learning_rate": 1.907179633867277e-05, "loss": 1.0308, "step": 2547 }, { "epoch": 0.73, "grad_norm": 10.87099552154541, "learning_rate": 1.9067505720823802e-05, "loss": 1.3402, "step": 2548 }, { "epoch": 0.73, "grad_norm": 10.852824211120605, "learning_rate": 1.9063215102974827e-05, "loss": 1.2251, "step": 2549 }, { "epoch": 0.73, "grad_norm": 8.490774154663086, "learning_rate": 1.905892448512586e-05, "loss": 0.7745, "step": 2550 }, { "epoch": 0.73, "grad_norm": 10.485783576965332, "learning_rate": 1.9054633867276888e-05, "loss": 1.4443, "step": 2551 }, { "epoch": 0.73, "grad_norm": 9.42505168914795, "learning_rate": 1.9050343249427917e-05, "loss": 0.9396, "step": 2552 }, { "epoch": 0.73, "grad_norm": 10.281153678894043, "learning_rate": 1.9046052631578946e-05, "loss": 1.1538, "step": 2553 }, { "epoch": 0.73, "grad_norm": 7.801650524139404, "learning_rate": 1.9041762013729978e-05, "loss": 0.842, "step": 2554 }, { "epoch": 0.73, "grad_norm": 8.858254432678223, "learning_rate": 1.903747139588101e-05, "loss": 1.0788, "step": 2555 }, { "epoch": 0.73, "grad_norm": 9.263640403747559, "learning_rate": 1.9033180778032035e-05, "loss": 1.0237, "step": 2556 }, { "epoch": 0.73, "grad_norm": 10.844314575195312, "learning_rate": 1.9028890160183067e-05, "loss": 1.4164, "step": 2557 }, { "epoch": 0.73, "grad_norm": 7.440629005432129, "learning_rate": 1.9024599542334096e-05, "loss": 0.874, "step": 2558 }, { "epoch": 0.73, "grad_norm": 8.15101146697998, "learning_rate": 1.902030892448513e-05, "loss": 1.1855, "step": 2559 }, { "epoch": 0.73, "grad_norm": 7.89607572555542, "learning_rate": 1.9016018306636154e-05, "loss": 0.8347, "step": 2560 }, { "epoch": 0.73, "grad_norm": 9.365269660949707, "learning_rate": 1.9011727688787186e-05, "loss": 1.2891, "step": 2561 }, { "epoch": 0.73, "grad_norm": 8.910431861877441, "learning_rate": 1.9007437070938215e-05, "loss": 1.0829, "step": 2562 }, { "epoch": 0.73, "grad_norm": 8.414385795593262, "learning_rate": 1.9003146453089243e-05, "loss": 1.0729, "step": 2563 }, { "epoch": 0.73, "grad_norm": 11.398804664611816, "learning_rate": 1.8998855835240276e-05, "loss": 1.4494, "step": 2564 }, { "epoch": 0.73, "grad_norm": 8.427895545959473, "learning_rate": 1.8994565217391304e-05, "loss": 0.9459, "step": 2565 }, { "epoch": 0.73, "grad_norm": 9.052424430847168, "learning_rate": 1.8990274599542337e-05, "loss": 1.0575, "step": 2566 }, { "epoch": 0.73, "grad_norm": 10.641477584838867, "learning_rate": 1.8985983981693362e-05, "loss": 1.0919, "step": 2567 }, { "epoch": 0.73, "grad_norm": 10.028521537780762, "learning_rate": 1.8981693363844394e-05, "loss": 1.0165, "step": 2568 }, { "epoch": 0.73, "grad_norm": 8.408310890197754, "learning_rate": 1.8977402745995423e-05, "loss": 1.1676, "step": 2569 }, { "epoch": 0.74, "grad_norm": 9.366040229797363, "learning_rate": 1.8973112128146455e-05, "loss": 1.1194, "step": 2570 }, { "epoch": 0.74, "grad_norm": 8.971992492675781, "learning_rate": 1.8968821510297484e-05, "loss": 1.0539, "step": 2571 }, { "epoch": 0.74, "grad_norm": 9.06114387512207, "learning_rate": 1.8964530892448513e-05, "loss": 1.0163, "step": 2572 }, { "epoch": 0.74, "grad_norm": 8.988670349121094, "learning_rate": 1.8960240274599545e-05, "loss": 1.0297, "step": 2573 }, { "epoch": 0.74, "grad_norm": 8.470457077026367, "learning_rate": 1.8955949656750573e-05, "loss": 0.9485, "step": 2574 }, { "epoch": 0.74, "grad_norm": 9.652470588684082, "learning_rate": 1.8951659038901602e-05, "loss": 1.1597, "step": 2575 }, { "epoch": 0.74, "grad_norm": 10.958063125610352, "learning_rate": 1.894736842105263e-05, "loss": 1.3988, "step": 2576 }, { "epoch": 0.74, "grad_norm": 9.248319625854492, "learning_rate": 1.8943077803203663e-05, "loss": 1.0791, "step": 2577 }, { "epoch": 0.74, "grad_norm": 9.106576919555664, "learning_rate": 1.893878718535469e-05, "loss": 1.1785, "step": 2578 }, { "epoch": 0.74, "grad_norm": 8.481927871704102, "learning_rate": 1.893449656750572e-05, "loss": 1.0148, "step": 2579 }, { "epoch": 0.74, "grad_norm": 8.416252136230469, "learning_rate": 1.8930205949656753e-05, "loss": 1.0448, "step": 2580 }, { "epoch": 0.74, "grad_norm": 9.41560173034668, "learning_rate": 1.892591533180778e-05, "loss": 1.0477, "step": 2581 }, { "epoch": 0.74, "grad_norm": 9.394129753112793, "learning_rate": 1.892162471395881e-05, "loss": 0.9718, "step": 2582 }, { "epoch": 0.74, "grad_norm": 7.814067363739014, "learning_rate": 1.891733409610984e-05, "loss": 0.7597, "step": 2583 }, { "epoch": 0.74, "grad_norm": 9.573678970336914, "learning_rate": 1.891304347826087e-05, "loss": 1.1839, "step": 2584 }, { "epoch": 0.74, "grad_norm": 8.720595359802246, "learning_rate": 1.89087528604119e-05, "loss": 1.3163, "step": 2585 }, { "epoch": 0.74, "grad_norm": 8.662771224975586, "learning_rate": 1.890446224256293e-05, "loss": 0.8793, "step": 2586 }, { "epoch": 0.74, "grad_norm": 9.156516075134277, "learning_rate": 1.8900171624713958e-05, "loss": 1.0649, "step": 2587 }, { "epoch": 0.74, "grad_norm": 8.830260276794434, "learning_rate": 1.889588100686499e-05, "loss": 1.3595, "step": 2588 }, { "epoch": 0.74, "grad_norm": 8.842612266540527, "learning_rate": 1.889159038901602e-05, "loss": 1.3127, "step": 2589 }, { "epoch": 0.74, "grad_norm": 9.004518508911133, "learning_rate": 1.8887299771167047e-05, "loss": 0.791, "step": 2590 }, { "epoch": 0.74, "grad_norm": 9.46159839630127, "learning_rate": 1.888300915331808e-05, "loss": 1.0763, "step": 2591 }, { "epoch": 0.74, "grad_norm": 9.404709815979004, "learning_rate": 1.8878718535469108e-05, "loss": 1.0805, "step": 2592 }, { "epoch": 0.74, "grad_norm": 8.421338081359863, "learning_rate": 1.8874427917620137e-05, "loss": 1.2548, "step": 2593 }, { "epoch": 0.74, "grad_norm": 8.607651710510254, "learning_rate": 1.8870137299771166e-05, "loss": 1.2314, "step": 2594 }, { "epoch": 0.74, "grad_norm": 8.027852058410645, "learning_rate": 1.8865846681922198e-05, "loss": 0.9529, "step": 2595 }, { "epoch": 0.74, "grad_norm": 9.327431678771973, "learning_rate": 1.8861556064073227e-05, "loss": 1.162, "step": 2596 }, { "epoch": 0.74, "grad_norm": 9.700788497924805, "learning_rate": 1.8857265446224255e-05, "loss": 1.2408, "step": 2597 }, { "epoch": 0.74, "grad_norm": 8.374039649963379, "learning_rate": 1.8852974828375288e-05, "loss": 1.115, "step": 2598 }, { "epoch": 0.74, "grad_norm": 8.619352340698242, "learning_rate": 1.8848684210526316e-05, "loss": 1.0719, "step": 2599 }, { "epoch": 0.74, "grad_norm": 9.652650833129883, "learning_rate": 1.884439359267735e-05, "loss": 1.0184, "step": 2600 }, { "epoch": 0.74, "grad_norm": 8.914128303527832, "learning_rate": 1.8840102974828374e-05, "loss": 1.37, "step": 2601 }, { "epoch": 0.74, "grad_norm": 10.957221031188965, "learning_rate": 1.8835812356979406e-05, "loss": 1.1816, "step": 2602 }, { "epoch": 0.74, "grad_norm": 9.956828117370605, "learning_rate": 1.8831521739130435e-05, "loss": 1.1326, "step": 2603 }, { "epoch": 0.74, "grad_norm": 8.612574577331543, "learning_rate": 1.8827231121281464e-05, "loss": 0.9619, "step": 2604 }, { "epoch": 0.75, "grad_norm": 8.865391731262207, "learning_rate": 1.8822940503432496e-05, "loss": 1.0499, "step": 2605 }, { "epoch": 0.75, "grad_norm": 8.714326858520508, "learning_rate": 1.8818649885583525e-05, "loss": 1.0391, "step": 2606 }, { "epoch": 0.75, "grad_norm": 8.581421852111816, "learning_rate": 1.8814359267734557e-05, "loss": 1.0204, "step": 2607 }, { "epoch": 0.75, "grad_norm": 8.185111045837402, "learning_rate": 1.8810068649885582e-05, "loss": 1.0015, "step": 2608 }, { "epoch": 0.75, "grad_norm": 8.517500877380371, "learning_rate": 1.8805778032036614e-05, "loss": 0.9711, "step": 2609 }, { "epoch": 0.75, "grad_norm": 9.859618186950684, "learning_rate": 1.8801487414187643e-05, "loss": 1.0174, "step": 2610 }, { "epoch": 0.75, "grad_norm": 7.156263828277588, "learning_rate": 1.8797196796338675e-05, "loss": 0.8121, "step": 2611 }, { "epoch": 0.75, "grad_norm": 9.95913314819336, "learning_rate": 1.87929061784897e-05, "loss": 1.1522, "step": 2612 }, { "epoch": 0.75, "grad_norm": 7.466686248779297, "learning_rate": 1.8788615560640733e-05, "loss": 0.883, "step": 2613 }, { "epoch": 0.75, "grad_norm": 9.566696166992188, "learning_rate": 1.8784324942791765e-05, "loss": 1.1206, "step": 2614 }, { "epoch": 0.75, "grad_norm": 9.486233711242676, "learning_rate": 1.8780034324942794e-05, "loss": 1.0718, "step": 2615 }, { "epoch": 0.75, "grad_norm": 10.664968490600586, "learning_rate": 1.8775743707093822e-05, "loss": 0.9513, "step": 2616 }, { "epoch": 0.75, "grad_norm": 9.688613891601562, "learning_rate": 1.877145308924485e-05, "loss": 1.0027, "step": 2617 }, { "epoch": 0.75, "grad_norm": 9.05102825164795, "learning_rate": 1.8767162471395883e-05, "loss": 1.1885, "step": 2618 }, { "epoch": 0.75, "grad_norm": 8.73543643951416, "learning_rate": 1.876287185354691e-05, "loss": 1.0659, "step": 2619 }, { "epoch": 0.75, "grad_norm": 10.386043548583984, "learning_rate": 1.875858123569794e-05, "loss": 1.2505, "step": 2620 }, { "epoch": 0.75, "grad_norm": 12.936104774475098, "learning_rate": 1.875429061784897e-05, "loss": 1.1552, "step": 2621 }, { "epoch": 0.75, "grad_norm": 8.49905014038086, "learning_rate": 1.8750000000000002e-05, "loss": 1.1317, "step": 2622 }, { "epoch": 0.75, "grad_norm": 9.787158012390137, "learning_rate": 1.874570938215103e-05, "loss": 0.9795, "step": 2623 }, { "epoch": 0.75, "grad_norm": 10.00570297241211, "learning_rate": 1.874141876430206e-05, "loss": 1.0594, "step": 2624 }, { "epoch": 0.75, "grad_norm": 8.770262718200684, "learning_rate": 1.873712814645309e-05, "loss": 1.1636, "step": 2625 }, { "epoch": 0.75, "grad_norm": 9.585314750671387, "learning_rate": 1.873283752860412e-05, "loss": 1.1635, "step": 2626 }, { "epoch": 0.75, "grad_norm": 11.837164878845215, "learning_rate": 1.872854691075515e-05, "loss": 1.4008, "step": 2627 }, { "epoch": 0.75, "grad_norm": 10.564339637756348, "learning_rate": 1.8724256292906178e-05, "loss": 1.2256, "step": 2628 }, { "epoch": 0.75, "grad_norm": 9.10994815826416, "learning_rate": 1.871996567505721e-05, "loss": 1.3112, "step": 2629 }, { "epoch": 0.75, "grad_norm": 8.335712432861328, "learning_rate": 1.8715675057208235e-05, "loss": 1.1398, "step": 2630 }, { "epoch": 0.75, "grad_norm": 8.341137886047363, "learning_rate": 1.8711384439359267e-05, "loss": 0.8713, "step": 2631 }, { "epoch": 0.75, "grad_norm": 10.362038612365723, "learning_rate": 1.87070938215103e-05, "loss": 1.4944, "step": 2632 }, { "epoch": 0.75, "grad_norm": 7.386731147766113, "learning_rate": 1.870280320366133e-05, "loss": 0.8924, "step": 2633 }, { "epoch": 0.75, "grad_norm": 7.392826557159424, "learning_rate": 1.8698512585812357e-05, "loss": 0.9767, "step": 2634 }, { "epoch": 0.75, "grad_norm": 8.432779312133789, "learning_rate": 1.8694221967963386e-05, "loss": 1.155, "step": 2635 }, { "epoch": 0.75, "grad_norm": 9.268701553344727, "learning_rate": 1.8689931350114418e-05, "loss": 1.2737, "step": 2636 }, { "epoch": 0.75, "grad_norm": 8.36255931854248, "learning_rate": 1.8685640732265447e-05, "loss": 0.8501, "step": 2637 }, { "epoch": 0.75, "grad_norm": 8.937508583068848, "learning_rate": 1.8681350114416476e-05, "loss": 1.2618, "step": 2638 }, { "epoch": 0.75, "grad_norm": 6.815885543823242, "learning_rate": 1.8677059496567504e-05, "loss": 1.0871, "step": 2639 }, { "epoch": 0.76, "grad_norm": 8.406804084777832, "learning_rate": 1.8672768878718537e-05, "loss": 1.1689, "step": 2640 }, { "epoch": 0.76, "grad_norm": 7.972434043884277, "learning_rate": 1.866847826086957e-05, "loss": 0.8999, "step": 2641 }, { "epoch": 0.76, "grad_norm": 11.70073127746582, "learning_rate": 1.8664187643020594e-05, "loss": 1.4695, "step": 2642 }, { "epoch": 0.76, "grad_norm": 8.955720901489258, "learning_rate": 1.8659897025171626e-05, "loss": 1.1373, "step": 2643 }, { "epoch": 0.76, "grad_norm": 9.123139381408691, "learning_rate": 1.8655606407322655e-05, "loss": 1.0841, "step": 2644 }, { "epoch": 0.76, "grad_norm": 10.323467254638672, "learning_rate": 1.8651315789473684e-05, "loss": 1.1008, "step": 2645 }, { "epoch": 0.76, "grad_norm": 9.747201919555664, "learning_rate": 1.8647025171624713e-05, "loss": 1.024, "step": 2646 }, { "epoch": 0.76, "grad_norm": 8.707141876220703, "learning_rate": 1.8642734553775745e-05, "loss": 1.0112, "step": 2647 }, { "epoch": 0.76, "grad_norm": 9.492386817932129, "learning_rate": 1.8638443935926777e-05, "loss": 1.1169, "step": 2648 }, { "epoch": 0.76, "grad_norm": 8.571977615356445, "learning_rate": 1.8634153318077802e-05, "loss": 1.0638, "step": 2649 }, { "epoch": 0.76, "grad_norm": 10.807567596435547, "learning_rate": 1.8629862700228834e-05, "loss": 1.3333, "step": 2650 }, { "epoch": 0.76, "grad_norm": 8.413228034973145, "learning_rate": 1.8625572082379863e-05, "loss": 1.0567, "step": 2651 }, { "epoch": 0.76, "grad_norm": 12.146622657775879, "learning_rate": 1.8621281464530895e-05, "loss": 1.1632, "step": 2652 }, { "epoch": 0.76, "grad_norm": 8.922531127929688, "learning_rate": 1.861699084668192e-05, "loss": 0.8596, "step": 2653 }, { "epoch": 0.76, "grad_norm": 9.631738662719727, "learning_rate": 1.8612700228832953e-05, "loss": 1.17, "step": 2654 }, { "epoch": 0.76, "grad_norm": 11.031031608581543, "learning_rate": 1.860840961098398e-05, "loss": 1.0493, "step": 2655 }, { "epoch": 0.76, "grad_norm": 9.348603248596191, "learning_rate": 1.860411899313501e-05, "loss": 1.1414, "step": 2656 }, { "epoch": 0.76, "grad_norm": 10.053489685058594, "learning_rate": 1.8599828375286043e-05, "loss": 0.9839, "step": 2657 }, { "epoch": 0.76, "grad_norm": 9.80137825012207, "learning_rate": 1.859553775743707e-05, "loss": 1.2351, "step": 2658 }, { "epoch": 0.76, "grad_norm": 10.180928230285645, "learning_rate": 1.8591247139588103e-05, "loss": 0.9798, "step": 2659 }, { "epoch": 0.76, "grad_norm": 9.522939682006836, "learning_rate": 1.858695652173913e-05, "loss": 1.1515, "step": 2660 }, { "epoch": 0.76, "grad_norm": 9.786197662353516, "learning_rate": 1.858266590389016e-05, "loss": 1.0778, "step": 2661 }, { "epoch": 0.76, "grad_norm": 10.08316707611084, "learning_rate": 1.857837528604119e-05, "loss": 1.0474, "step": 2662 }, { "epoch": 0.76, "grad_norm": 8.703034400939941, "learning_rate": 1.8574084668192222e-05, "loss": 0.894, "step": 2663 }, { "epoch": 0.76, "grad_norm": 9.297806739807129, "learning_rate": 1.8569794050343247e-05, "loss": 1.0639, "step": 2664 }, { "epoch": 0.76, "grad_norm": 8.80936336517334, "learning_rate": 1.856550343249428e-05, "loss": 0.9902, "step": 2665 }, { "epoch": 0.76, "grad_norm": 8.111929893493652, "learning_rate": 1.856121281464531e-05, "loss": 1.1252, "step": 2666 }, { "epoch": 0.76, "grad_norm": 9.989709854125977, "learning_rate": 1.855692219679634e-05, "loss": 1.1853, "step": 2667 }, { "epoch": 0.76, "grad_norm": 10.384547233581543, "learning_rate": 1.855263157894737e-05, "loss": 0.946, "step": 2668 }, { "epoch": 0.76, "grad_norm": 7.475389003753662, "learning_rate": 1.8548340961098398e-05, "loss": 0.704, "step": 2669 }, { "epoch": 0.76, "grad_norm": 10.378499031066895, "learning_rate": 1.854405034324943e-05, "loss": 1.3423, "step": 2670 }, { "epoch": 0.76, "grad_norm": 10.349032402038574, "learning_rate": 1.8539759725400455e-05, "loss": 1.032, "step": 2671 }, { "epoch": 0.76, "grad_norm": 8.30756950378418, "learning_rate": 1.8535469107551488e-05, "loss": 1.0812, "step": 2672 }, { "epoch": 0.76, "grad_norm": 11.054231643676758, "learning_rate": 1.8531178489702516e-05, "loss": 1.12, "step": 2673 }, { "epoch": 0.76, "grad_norm": 9.871310234069824, "learning_rate": 1.852688787185355e-05, "loss": 0.9755, "step": 2674 }, { "epoch": 0.77, "grad_norm": 7.793604373931885, "learning_rate": 1.8522597254004577e-05, "loss": 0.9116, "step": 2675 }, { "epoch": 0.77, "grad_norm": 9.23965072631836, "learning_rate": 1.8518306636155606e-05, "loss": 0.8213, "step": 2676 }, { "epoch": 0.77, "grad_norm": 10.072052001953125, "learning_rate": 1.8514016018306638e-05, "loss": 1.0072, "step": 2677 }, { "epoch": 0.77, "grad_norm": 10.18537712097168, "learning_rate": 1.8509725400457667e-05, "loss": 0.925, "step": 2678 }, { "epoch": 0.77, "grad_norm": 13.233726501464844, "learning_rate": 1.8505434782608696e-05, "loss": 1.1301, "step": 2679 }, { "epoch": 0.77, "grad_norm": 10.48590087890625, "learning_rate": 1.8501144164759724e-05, "loss": 1.0966, "step": 2680 }, { "epoch": 0.77, "grad_norm": 9.115750312805176, "learning_rate": 1.8496853546910757e-05, "loss": 1.0654, "step": 2681 }, { "epoch": 0.77, "grad_norm": 8.910179138183594, "learning_rate": 1.8492562929061785e-05, "loss": 0.8975, "step": 2682 }, { "epoch": 0.77, "grad_norm": 10.317663192749023, "learning_rate": 1.8488272311212814e-05, "loss": 1.1109, "step": 2683 }, { "epoch": 0.77, "grad_norm": 9.628530502319336, "learning_rate": 1.8483981693363846e-05, "loss": 1.0068, "step": 2684 }, { "epoch": 0.77, "grad_norm": 10.652451515197754, "learning_rate": 1.8479691075514875e-05, "loss": 1.124, "step": 2685 }, { "epoch": 0.77, "grad_norm": 8.596827507019043, "learning_rate": 1.8475400457665904e-05, "loss": 0.9857, "step": 2686 }, { "epoch": 0.77, "grad_norm": 9.025288581848145, "learning_rate": 1.8471109839816933e-05, "loss": 1.1873, "step": 2687 }, { "epoch": 0.77, "grad_norm": 9.74854564666748, "learning_rate": 1.8466819221967965e-05, "loss": 1.0938, "step": 2688 }, { "epoch": 0.77, "grad_norm": 9.616058349609375, "learning_rate": 1.8462528604118994e-05, "loss": 1.0013, "step": 2689 }, { "epoch": 0.77, "grad_norm": 8.085488319396973, "learning_rate": 1.8458237986270022e-05, "loss": 0.9281, "step": 2690 }, { "epoch": 0.77, "grad_norm": 7.250451564788818, "learning_rate": 1.8453947368421054e-05, "loss": 1.0076, "step": 2691 }, { "epoch": 0.77, "grad_norm": 9.059502601623535, "learning_rate": 1.8449656750572083e-05, "loss": 0.9559, "step": 2692 }, { "epoch": 0.77, "grad_norm": 8.862520217895508, "learning_rate": 1.8445366132723115e-05, "loss": 1.0068, "step": 2693 }, { "epoch": 0.77, "grad_norm": 9.757304191589355, "learning_rate": 1.844107551487414e-05, "loss": 1.0841, "step": 2694 }, { "epoch": 0.77, "grad_norm": 12.453340530395508, "learning_rate": 1.8436784897025173e-05, "loss": 1.4574, "step": 2695 }, { "epoch": 0.77, "grad_norm": 9.447304725646973, "learning_rate": 1.8432494279176202e-05, "loss": 1.1475, "step": 2696 }, { "epoch": 0.77, "grad_norm": 10.128987312316895, "learning_rate": 1.842820366132723e-05, "loss": 1.3936, "step": 2697 }, { "epoch": 0.77, "grad_norm": 8.295130729675293, "learning_rate": 1.842391304347826e-05, "loss": 1.0786, "step": 2698 }, { "epoch": 0.77, "grad_norm": 8.610480308532715, "learning_rate": 1.841962242562929e-05, "loss": 1.0077, "step": 2699 }, { "epoch": 0.77, "grad_norm": 7.891595840454102, "learning_rate": 1.8415331807780324e-05, "loss": 0.8981, "step": 2700 }, { "epoch": 0.77, "grad_norm": 8.902894020080566, "learning_rate": 1.841104118993135e-05, "loss": 0.9923, "step": 2701 }, { "epoch": 0.77, "grad_norm": 8.750622749328613, "learning_rate": 1.840675057208238e-05, "loss": 0.8989, "step": 2702 }, { "epoch": 0.77, "grad_norm": 8.808574676513672, "learning_rate": 1.840245995423341e-05, "loss": 0.848, "step": 2703 }, { "epoch": 0.77, "grad_norm": 8.473341941833496, "learning_rate": 1.8398169336384442e-05, "loss": 1.1411, "step": 2704 }, { "epoch": 0.77, "grad_norm": 9.206725120544434, "learning_rate": 1.8393878718535467e-05, "loss": 1.0971, "step": 2705 }, { "epoch": 0.77, "grad_norm": 8.211832046508789, "learning_rate": 1.83895881006865e-05, "loss": 0.8216, "step": 2706 }, { "epoch": 0.77, "grad_norm": 10.205649375915527, "learning_rate": 1.838529748283753e-05, "loss": 1.5003, "step": 2707 }, { "epoch": 0.77, "grad_norm": 9.92562198638916, "learning_rate": 1.8381006864988557e-05, "loss": 1.094, "step": 2708 }, { "epoch": 0.77, "grad_norm": 8.90312671661377, "learning_rate": 1.837671624713959e-05, "loss": 1.1677, "step": 2709 }, { "epoch": 0.78, "grad_norm": 11.466680526733398, "learning_rate": 1.8372425629290618e-05, "loss": 1.1583, "step": 2710 }, { "epoch": 0.78, "grad_norm": 9.08590030670166, "learning_rate": 1.836813501144165e-05, "loss": 1.0724, "step": 2711 }, { "epoch": 0.78, "grad_norm": 9.549266815185547, "learning_rate": 1.8363844393592676e-05, "loss": 1.1368, "step": 2712 }, { "epoch": 0.78, "grad_norm": 9.550378799438477, "learning_rate": 1.8359553775743708e-05, "loss": 1.1726, "step": 2713 }, { "epoch": 0.78, "grad_norm": 9.870880126953125, "learning_rate": 1.8355263157894736e-05, "loss": 1.0843, "step": 2714 }, { "epoch": 0.78, "grad_norm": 8.207592964172363, "learning_rate": 1.835097254004577e-05, "loss": 1.1149, "step": 2715 }, { "epoch": 0.78, "grad_norm": 10.710161209106445, "learning_rate": 1.8346681922196794e-05, "loss": 1.188, "step": 2716 }, { "epoch": 0.78, "grad_norm": 9.842384338378906, "learning_rate": 1.8342391304347826e-05, "loss": 1.2707, "step": 2717 }, { "epoch": 0.78, "grad_norm": 10.725979804992676, "learning_rate": 1.833810068649886e-05, "loss": 1.1073, "step": 2718 }, { "epoch": 0.78, "grad_norm": 8.74438190460205, "learning_rate": 1.8333810068649887e-05, "loss": 1.0223, "step": 2719 }, { "epoch": 0.78, "grad_norm": 10.196919441223145, "learning_rate": 1.8329519450800916e-05, "loss": 1.0389, "step": 2720 }, { "epoch": 0.78, "grad_norm": 9.852926254272461, "learning_rate": 1.8325228832951945e-05, "loss": 1.3454, "step": 2721 }, { "epoch": 0.78, "grad_norm": 9.271720886230469, "learning_rate": 1.8320938215102977e-05, "loss": 1.0826, "step": 2722 }, { "epoch": 0.78, "grad_norm": 8.834572792053223, "learning_rate": 1.8316647597254002e-05, "loss": 0.9662, "step": 2723 }, { "epoch": 0.78, "grad_norm": 8.742707252502441, "learning_rate": 1.8312356979405034e-05, "loss": 1.0828, "step": 2724 }, { "epoch": 0.78, "grad_norm": 10.010408401489258, "learning_rate": 1.8308066361556066e-05, "loss": 1.0087, "step": 2725 }, { "epoch": 0.78, "grad_norm": 10.239311218261719, "learning_rate": 1.8303775743707095e-05, "loss": 1.086, "step": 2726 }, { "epoch": 0.78, "grad_norm": 8.413912773132324, "learning_rate": 1.8299485125858124e-05, "loss": 1.0647, "step": 2727 }, { "epoch": 0.78, "grad_norm": 8.625919342041016, "learning_rate": 1.8295194508009153e-05, "loss": 1.1359, "step": 2728 }, { "epoch": 0.78, "grad_norm": 9.683062553405762, "learning_rate": 1.8290903890160185e-05, "loss": 1.1086, "step": 2729 }, { "epoch": 0.78, "grad_norm": 8.346321105957031, "learning_rate": 1.8286613272311214e-05, "loss": 0.9379, "step": 2730 }, { "epoch": 0.78, "grad_norm": 9.237349510192871, "learning_rate": 1.8282322654462242e-05, "loss": 0.968, "step": 2731 }, { "epoch": 0.78, "grad_norm": 8.374894142150879, "learning_rate": 1.827803203661327e-05, "loss": 0.9907, "step": 2732 }, { "epoch": 0.78, "grad_norm": 9.64543342590332, "learning_rate": 1.8273741418764303e-05, "loss": 1.0265, "step": 2733 }, { "epoch": 0.78, "grad_norm": 9.248924255371094, "learning_rate": 1.8269450800915332e-05, "loss": 1.1036, "step": 2734 }, { "epoch": 0.78, "grad_norm": 8.953285217285156, "learning_rate": 1.826516018306636e-05, "loss": 0.8402, "step": 2735 }, { "epoch": 0.78, "grad_norm": 11.745101928710938, "learning_rate": 1.8260869565217393e-05, "loss": 1.0951, "step": 2736 }, { "epoch": 0.78, "grad_norm": 11.503557205200195, "learning_rate": 1.8256578947368422e-05, "loss": 1.2745, "step": 2737 }, { "epoch": 0.78, "grad_norm": 7.832923412322998, "learning_rate": 1.825228832951945e-05, "loss": 0.9448, "step": 2738 }, { "epoch": 0.78, "grad_norm": 9.32075023651123, "learning_rate": 1.824799771167048e-05, "loss": 1.1564, "step": 2739 }, { "epoch": 0.78, "grad_norm": 10.57994556427002, "learning_rate": 1.824370709382151e-05, "loss": 1.0131, "step": 2740 }, { "epoch": 0.78, "grad_norm": 10.00416374206543, "learning_rate": 1.823941647597254e-05, "loss": 1.0168, "step": 2741 }, { "epoch": 0.78, "grad_norm": 9.534642219543457, "learning_rate": 1.823512585812357e-05, "loss": 1.1624, "step": 2742 }, { "epoch": 0.78, "grad_norm": 9.412590980529785, "learning_rate": 1.82308352402746e-05, "loss": 1.0281, "step": 2743 }, { "epoch": 0.78, "grad_norm": 7.895086765289307, "learning_rate": 1.822654462242563e-05, "loss": 0.911, "step": 2744 }, { "epoch": 0.79, "grad_norm": 10.005386352539062, "learning_rate": 1.8222254004576662e-05, "loss": 1.0018, "step": 2745 }, { "epoch": 0.79, "grad_norm": 9.550594329833984, "learning_rate": 1.8217963386727688e-05, "loss": 0.9918, "step": 2746 }, { "epoch": 0.79, "grad_norm": 6.900699138641357, "learning_rate": 1.821367276887872e-05, "loss": 0.8453, "step": 2747 }, { "epoch": 0.79, "grad_norm": 8.093343734741211, "learning_rate": 1.820938215102975e-05, "loss": 0.7882, "step": 2748 }, { "epoch": 0.79, "grad_norm": 9.774897575378418, "learning_rate": 1.8205091533180777e-05, "loss": 1.0871, "step": 2749 }, { "epoch": 0.79, "grad_norm": 9.677360534667969, "learning_rate": 1.8200800915331806e-05, "loss": 0.8507, "step": 2750 }, { "epoch": 0.79, "grad_norm": 11.09919548034668, "learning_rate": 1.8196510297482838e-05, "loss": 1.1706, "step": 2751 }, { "epoch": 0.79, "grad_norm": 10.05200481414795, "learning_rate": 1.819221967963387e-05, "loss": 0.9521, "step": 2752 }, { "epoch": 0.79, "grad_norm": 9.532218933105469, "learning_rate": 1.8187929061784896e-05, "loss": 1.1341, "step": 2753 }, { "epoch": 0.79, "grad_norm": 9.517202377319336, "learning_rate": 1.8183638443935928e-05, "loss": 0.9918, "step": 2754 }, { "epoch": 0.79, "grad_norm": 9.410207748413086, "learning_rate": 1.8179347826086957e-05, "loss": 1.1503, "step": 2755 }, { "epoch": 0.79, "grad_norm": 9.514126777648926, "learning_rate": 1.817505720823799e-05, "loss": 0.9599, "step": 2756 }, { "epoch": 0.79, "grad_norm": 9.519126892089844, "learning_rate": 1.8170766590389014e-05, "loss": 0.9525, "step": 2757 }, { "epoch": 0.79, "grad_norm": 11.100128173828125, "learning_rate": 1.8166475972540046e-05, "loss": 1.4367, "step": 2758 }, { "epoch": 0.79, "grad_norm": 9.024754524230957, "learning_rate": 1.8162185354691075e-05, "loss": 0.8391, "step": 2759 }, { "epoch": 0.79, "grad_norm": 9.76130485534668, "learning_rate": 1.8157894736842107e-05, "loss": 1.2622, "step": 2760 }, { "epoch": 0.79, "grad_norm": 10.94499397277832, "learning_rate": 1.8153604118993136e-05, "loss": 1.3885, "step": 2761 }, { "epoch": 0.79, "grad_norm": 10.015135765075684, "learning_rate": 1.8149313501144165e-05, "loss": 1.5093, "step": 2762 }, { "epoch": 0.79, "grad_norm": 8.107895851135254, "learning_rate": 1.8145022883295197e-05, "loss": 0.9366, "step": 2763 }, { "epoch": 0.79, "grad_norm": 10.041106224060059, "learning_rate": 1.8140732265446222e-05, "loss": 1.1956, "step": 2764 }, { "epoch": 0.79, "grad_norm": 10.189181327819824, "learning_rate": 1.8136441647597254e-05, "loss": 1.0976, "step": 2765 }, { "epoch": 0.79, "grad_norm": 10.372639656066895, "learning_rate": 1.8132151029748283e-05, "loss": 1.0334, "step": 2766 }, { "epoch": 0.79, "grad_norm": 9.486791610717773, "learning_rate": 1.8127860411899315e-05, "loss": 1.115, "step": 2767 }, { "epoch": 0.79, "grad_norm": 8.912895202636719, "learning_rate": 1.8123569794050344e-05, "loss": 0.761, "step": 2768 }, { "epoch": 0.79, "grad_norm": 8.6273832321167, "learning_rate": 1.8119279176201373e-05, "loss": 0.8588, "step": 2769 }, { "epoch": 0.79, "grad_norm": 9.135920524597168, "learning_rate": 1.8114988558352405e-05, "loss": 1.069, "step": 2770 }, { "epoch": 0.79, "grad_norm": 9.210888862609863, "learning_rate": 1.8110697940503434e-05, "loss": 0.9714, "step": 2771 }, { "epoch": 0.79, "grad_norm": 8.39043140411377, "learning_rate": 1.8106407322654463e-05, "loss": 1.0827, "step": 2772 }, { "epoch": 0.79, "grad_norm": 9.327219009399414, "learning_rate": 1.810211670480549e-05, "loss": 1.0137, "step": 2773 }, { "epoch": 0.79, "grad_norm": 11.347169876098633, "learning_rate": 1.8097826086956524e-05, "loss": 1.0051, "step": 2774 }, { "epoch": 0.79, "grad_norm": 7.7931671142578125, "learning_rate": 1.809353546910755e-05, "loss": 0.9233, "step": 2775 }, { "epoch": 0.79, "grad_norm": 8.85224437713623, "learning_rate": 1.808924485125858e-05, "loss": 1.0101, "step": 2776 }, { "epoch": 0.79, "grad_norm": 9.604768753051758, "learning_rate": 1.8084954233409613e-05, "loss": 1.0067, "step": 2777 }, { "epoch": 0.79, "grad_norm": 8.965160369873047, "learning_rate": 1.8080663615560642e-05, "loss": 0.8081, "step": 2778 }, { "epoch": 0.79, "grad_norm": 9.196931838989258, "learning_rate": 1.807637299771167e-05, "loss": 1.1284, "step": 2779 }, { "epoch": 0.8, "grad_norm": 10.866255760192871, "learning_rate": 1.80720823798627e-05, "loss": 1.2934, "step": 2780 }, { "epoch": 0.8, "grad_norm": 11.388153076171875, "learning_rate": 1.806779176201373e-05, "loss": 1.2407, "step": 2781 }, { "epoch": 0.8, "grad_norm": 7.777901649475098, "learning_rate": 1.806350114416476e-05, "loss": 0.78, "step": 2782 }, { "epoch": 0.8, "grad_norm": 12.280560493469238, "learning_rate": 1.805921052631579e-05, "loss": 1.2513, "step": 2783 }, { "epoch": 0.8, "grad_norm": 8.953399658203125, "learning_rate": 1.8054919908466818e-05, "loss": 0.8567, "step": 2784 }, { "epoch": 0.8, "grad_norm": 8.849992752075195, "learning_rate": 1.805062929061785e-05, "loss": 1.1959, "step": 2785 }, { "epoch": 0.8, "grad_norm": 8.891160011291504, "learning_rate": 1.804633867276888e-05, "loss": 1.1964, "step": 2786 }, { "epoch": 0.8, "grad_norm": 8.540994644165039, "learning_rate": 1.8042048054919908e-05, "loss": 1.0048, "step": 2787 }, { "epoch": 0.8, "grad_norm": 7.833096027374268, "learning_rate": 1.803775743707094e-05, "loss": 0.9079, "step": 2788 }, { "epoch": 0.8, "grad_norm": 9.545397758483887, "learning_rate": 1.803346681922197e-05, "loss": 1.137, "step": 2789 }, { "epoch": 0.8, "grad_norm": 8.271461486816406, "learning_rate": 1.8029176201372997e-05, "loss": 0.7457, "step": 2790 }, { "epoch": 0.8, "grad_norm": 7.652635097503662, "learning_rate": 1.8024885583524026e-05, "loss": 0.6924, "step": 2791 }, { "epoch": 0.8, "grad_norm": 8.800633430480957, "learning_rate": 1.8020594965675058e-05, "loss": 0.8786, "step": 2792 }, { "epoch": 0.8, "grad_norm": 9.03191089630127, "learning_rate": 1.8016304347826087e-05, "loss": 1.047, "step": 2793 }, { "epoch": 0.8, "grad_norm": 9.993693351745605, "learning_rate": 1.8012013729977116e-05, "loss": 0.8651, "step": 2794 }, { "epoch": 0.8, "grad_norm": 8.549789428710938, "learning_rate": 1.8007723112128148e-05, "loss": 0.8024, "step": 2795 }, { "epoch": 0.8, "grad_norm": 12.151975631713867, "learning_rate": 1.8003432494279177e-05, "loss": 1.3883, "step": 2796 }, { "epoch": 0.8, "grad_norm": 8.671039581298828, "learning_rate": 1.799914187643021e-05, "loss": 0.9327, "step": 2797 }, { "epoch": 0.8, "grad_norm": 10.68326187133789, "learning_rate": 1.7994851258581234e-05, "loss": 1.0669, "step": 2798 }, { "epoch": 0.8, "grad_norm": 8.871553421020508, "learning_rate": 1.7990560640732266e-05, "loss": 0.989, "step": 2799 }, { "epoch": 0.8, "grad_norm": 10.067877769470215, "learning_rate": 1.7986270022883295e-05, "loss": 1.0254, "step": 2800 }, { "epoch": 0.8, "grad_norm": 9.065559387207031, "learning_rate": 1.7981979405034324e-05, "loss": 1.0136, "step": 2801 }, { "epoch": 0.8, "grad_norm": 9.589637756347656, "learning_rate": 1.7977688787185356e-05, "loss": 1.0668, "step": 2802 }, { "epoch": 0.8, "grad_norm": 9.31413745880127, "learning_rate": 1.7973398169336385e-05, "loss": 0.8831, "step": 2803 }, { "epoch": 0.8, "grad_norm": 8.096830368041992, "learning_rate": 1.7969107551487417e-05, "loss": 0.702, "step": 2804 }, { "epoch": 0.8, "grad_norm": 10.909378051757812, "learning_rate": 1.7964816933638442e-05, "loss": 1.2209, "step": 2805 }, { "epoch": 0.8, "grad_norm": 8.658547401428223, "learning_rate": 1.7960526315789475e-05, "loss": 1.0153, "step": 2806 }, { "epoch": 0.8, "grad_norm": 11.248679161071777, "learning_rate": 1.7956235697940503e-05, "loss": 1.302, "step": 2807 }, { "epoch": 0.8, "grad_norm": 12.030311584472656, "learning_rate": 1.7951945080091536e-05, "loss": 1.3551, "step": 2808 }, { "epoch": 0.8, "grad_norm": 8.535387992858887, "learning_rate": 1.794765446224256e-05, "loss": 1.0262, "step": 2809 }, { "epoch": 0.8, "grad_norm": 11.121785163879395, "learning_rate": 1.7943363844393593e-05, "loss": 1.2911, "step": 2810 }, { "epoch": 0.8, "grad_norm": 8.692615509033203, "learning_rate": 1.7939073226544625e-05, "loss": 1.0072, "step": 2811 }, { "epoch": 0.8, "grad_norm": 10.252633094787598, "learning_rate": 1.7934782608695654e-05, "loss": 1.1589, "step": 2812 }, { "epoch": 0.8, "grad_norm": 10.307209968566895, "learning_rate": 1.7930491990846683e-05, "loss": 1.133, "step": 2813 }, { "epoch": 0.8, "grad_norm": 8.116240501403809, "learning_rate": 1.792620137299771e-05, "loss": 0.8728, "step": 2814 }, { "epoch": 0.81, "grad_norm": 8.729217529296875, "learning_rate": 1.7921910755148744e-05, "loss": 1.0193, "step": 2815 }, { "epoch": 0.81, "grad_norm": 9.098910331726074, "learning_rate": 1.791762013729977e-05, "loss": 1.291, "step": 2816 }, { "epoch": 0.81, "grad_norm": 9.680808067321777, "learning_rate": 1.79133295194508e-05, "loss": 1.5833, "step": 2817 }, { "epoch": 0.81, "grad_norm": 8.461689949035645, "learning_rate": 1.790903890160183e-05, "loss": 1.0234, "step": 2818 }, { "epoch": 0.81, "grad_norm": 8.697273254394531, "learning_rate": 1.7904748283752862e-05, "loss": 1.3734, "step": 2819 }, { "epoch": 0.81, "grad_norm": 6.797390937805176, "learning_rate": 1.790045766590389e-05, "loss": 0.6928, "step": 2820 }, { "epoch": 0.81, "grad_norm": 9.431440353393555, "learning_rate": 1.789616704805492e-05, "loss": 1.086, "step": 2821 }, { "epoch": 0.81, "grad_norm": 8.511075019836426, "learning_rate": 1.7891876430205952e-05, "loss": 1.0503, "step": 2822 }, { "epoch": 0.81, "grad_norm": 10.25367546081543, "learning_rate": 1.788758581235698e-05, "loss": 1.3238, "step": 2823 }, { "epoch": 0.81, "grad_norm": 9.629301071166992, "learning_rate": 1.788329519450801e-05, "loss": 1.2745, "step": 2824 }, { "epoch": 0.81, "grad_norm": 9.358258247375488, "learning_rate": 1.7879004576659038e-05, "loss": 1.1041, "step": 2825 }, { "epoch": 0.81, "grad_norm": 9.950977325439453, "learning_rate": 1.787471395881007e-05, "loss": 1.126, "step": 2826 }, { "epoch": 0.81, "grad_norm": 8.868396759033203, "learning_rate": 1.7870423340961096e-05, "loss": 0.8953, "step": 2827 }, { "epoch": 0.81, "grad_norm": 9.405526161193848, "learning_rate": 1.7866132723112128e-05, "loss": 1.1577, "step": 2828 }, { "epoch": 0.81, "grad_norm": 8.505644798278809, "learning_rate": 1.786184210526316e-05, "loss": 1.0532, "step": 2829 }, { "epoch": 0.81, "grad_norm": 8.55901050567627, "learning_rate": 1.785755148741419e-05, "loss": 1.044, "step": 2830 }, { "epoch": 0.81, "grad_norm": 11.979047775268555, "learning_rate": 1.7853260869565218e-05, "loss": 1.2484, "step": 2831 }, { "epoch": 0.81, "grad_norm": 8.85673713684082, "learning_rate": 1.7848970251716246e-05, "loss": 1.0276, "step": 2832 }, { "epoch": 0.81, "grad_norm": 8.023272514343262, "learning_rate": 1.784467963386728e-05, "loss": 0.9118, "step": 2833 }, { "epoch": 0.81, "grad_norm": 10.74399471282959, "learning_rate": 1.7840389016018307e-05, "loss": 1.2764, "step": 2834 }, { "epoch": 0.81, "grad_norm": 10.298340797424316, "learning_rate": 1.7836098398169336e-05, "loss": 1.1011, "step": 2835 }, { "epoch": 0.81, "grad_norm": 11.09740924835205, "learning_rate": 1.7831807780320365e-05, "loss": 1.3755, "step": 2836 }, { "epoch": 0.81, "grad_norm": 11.25870132446289, "learning_rate": 1.7827517162471397e-05, "loss": 1.5211, "step": 2837 }, { "epoch": 0.81, "grad_norm": 9.853792190551758, "learning_rate": 1.782322654462243e-05, "loss": 1.031, "step": 2838 }, { "epoch": 0.81, "grad_norm": 8.365081787109375, "learning_rate": 1.7818935926773454e-05, "loss": 1.1502, "step": 2839 }, { "epoch": 0.81, "grad_norm": 7.9195556640625, "learning_rate": 1.7814645308924487e-05, "loss": 0.8347, "step": 2840 }, { "epoch": 0.81, "grad_norm": 9.461267471313477, "learning_rate": 1.7810354691075515e-05, "loss": 0.959, "step": 2841 }, { "epoch": 0.81, "grad_norm": 7.771499156951904, "learning_rate": 1.7806064073226544e-05, "loss": 0.9366, "step": 2842 }, { "epoch": 0.81, "grad_norm": 9.530256271362305, "learning_rate": 1.7801773455377573e-05, "loss": 1.1998, "step": 2843 }, { "epoch": 0.81, "grad_norm": 9.167468070983887, "learning_rate": 1.7797482837528605e-05, "loss": 1.0393, "step": 2844 }, { "epoch": 0.81, "grad_norm": 10.303680419921875, "learning_rate": 1.7793192219679637e-05, "loss": 0.9507, "step": 2845 }, { "epoch": 0.81, "grad_norm": 8.955582618713379, "learning_rate": 1.7788901601830663e-05, "loss": 0.9256, "step": 2846 }, { "epoch": 0.81, "grad_norm": 12.868518829345703, "learning_rate": 1.7784610983981695e-05, "loss": 1.414, "step": 2847 }, { "epoch": 0.81, "grad_norm": 10.280289649963379, "learning_rate": 1.7780320366132724e-05, "loss": 1.2262, "step": 2848 }, { "epoch": 0.81, "grad_norm": 9.211709022521973, "learning_rate": 1.7776029748283756e-05, "loss": 0.9237, "step": 2849 }, { "epoch": 0.82, "grad_norm": 10.173371315002441, "learning_rate": 1.777173913043478e-05, "loss": 1.1584, "step": 2850 }, { "epoch": 0.82, "grad_norm": 8.673914909362793, "learning_rate": 1.7767448512585813e-05, "loss": 0.9677, "step": 2851 }, { "epoch": 0.82, "grad_norm": 10.421575546264648, "learning_rate": 1.7763157894736842e-05, "loss": 1.2843, "step": 2852 }, { "epoch": 0.82, "grad_norm": 8.3233060836792, "learning_rate": 1.775886727688787e-05, "loss": 0.9641, "step": 2853 }, { "epoch": 0.82, "grad_norm": 9.67033863067627, "learning_rate": 1.7754576659038903e-05, "loss": 0.9924, "step": 2854 }, { "epoch": 0.82, "grad_norm": 9.184983253479004, "learning_rate": 1.775028604118993e-05, "loss": 0.8973, "step": 2855 }, { "epoch": 0.82, "grad_norm": 10.033408164978027, "learning_rate": 1.7745995423340964e-05, "loss": 1.0007, "step": 2856 }, { "epoch": 0.82, "grad_norm": 10.438945770263672, "learning_rate": 1.774170480549199e-05, "loss": 1.0585, "step": 2857 }, { "epoch": 0.82, "grad_norm": 9.126609802246094, "learning_rate": 1.773741418764302e-05, "loss": 1.0277, "step": 2858 }, { "epoch": 0.82, "grad_norm": 9.337931632995605, "learning_rate": 1.773312356979405e-05, "loss": 0.9812, "step": 2859 }, { "epoch": 0.82, "grad_norm": 8.967913627624512, "learning_rate": 1.7728832951945082e-05, "loss": 1.3706, "step": 2860 }, { "epoch": 0.82, "grad_norm": 9.8753023147583, "learning_rate": 1.7724542334096108e-05, "loss": 1.0067, "step": 2861 }, { "epoch": 0.82, "grad_norm": 8.703413009643555, "learning_rate": 1.772025171624714e-05, "loss": 1.0373, "step": 2862 }, { "epoch": 0.82, "grad_norm": 9.074624061584473, "learning_rate": 1.7715961098398172e-05, "loss": 1.3182, "step": 2863 }, { "epoch": 0.82, "grad_norm": 9.192731857299805, "learning_rate": 1.77116704805492e-05, "loss": 0.897, "step": 2864 }, { "epoch": 0.82, "grad_norm": 8.995240211486816, "learning_rate": 1.770737986270023e-05, "loss": 1.1138, "step": 2865 }, { "epoch": 0.82, "grad_norm": 8.425854682922363, "learning_rate": 1.7703089244851258e-05, "loss": 1.1363, "step": 2866 }, { "epoch": 0.82, "grad_norm": 7.753910541534424, "learning_rate": 1.769879862700229e-05, "loss": 0.824, "step": 2867 }, { "epoch": 0.82, "grad_norm": 8.238604545593262, "learning_rate": 1.7694508009153316e-05, "loss": 0.8746, "step": 2868 }, { "epoch": 0.82, "grad_norm": 9.719847679138184, "learning_rate": 1.7690217391304348e-05, "loss": 1.1737, "step": 2869 }, { "epoch": 0.82, "grad_norm": 8.411002159118652, "learning_rate": 1.7685926773455377e-05, "loss": 0.995, "step": 2870 }, { "epoch": 0.82, "grad_norm": 7.016213893890381, "learning_rate": 1.768163615560641e-05, "loss": 0.8299, "step": 2871 }, { "epoch": 0.82, "grad_norm": 8.941311836242676, "learning_rate": 1.7677345537757438e-05, "loss": 1.0729, "step": 2872 }, { "epoch": 0.82, "grad_norm": 9.35824203491211, "learning_rate": 1.7673054919908466e-05, "loss": 1.0629, "step": 2873 }, { "epoch": 0.82, "grad_norm": 8.965079307556152, "learning_rate": 1.76687643020595e-05, "loss": 0.9683, "step": 2874 }, { "epoch": 0.82, "grad_norm": 8.929749488830566, "learning_rate": 1.7664473684210527e-05, "loss": 0.8733, "step": 2875 }, { "epoch": 0.82, "grad_norm": 6.842812538146973, "learning_rate": 1.7660183066361556e-05, "loss": 0.6962, "step": 2876 }, { "epoch": 0.82, "grad_norm": 8.920101165771484, "learning_rate": 1.7655892448512585e-05, "loss": 0.9707, "step": 2877 }, { "epoch": 0.82, "grad_norm": 7.8547139167785645, "learning_rate": 1.7651601830663617e-05, "loss": 0.8205, "step": 2878 }, { "epoch": 0.82, "grad_norm": 8.265000343322754, "learning_rate": 1.7647311212814642e-05, "loss": 1.1583, "step": 2879 }, { "epoch": 0.82, "grad_norm": 10.203521728515625, "learning_rate": 1.7643020594965675e-05, "loss": 0.9276, "step": 2880 }, { "epoch": 0.82, "grad_norm": 10.345846176147461, "learning_rate": 1.7638729977116707e-05, "loss": 0.7885, "step": 2881 }, { "epoch": 0.82, "grad_norm": 9.39890193939209, "learning_rate": 1.7634439359267736e-05, "loss": 1.0957, "step": 2882 }, { "epoch": 0.82, "grad_norm": 10.122784614562988, "learning_rate": 1.7630148741418764e-05, "loss": 1.0031, "step": 2883 }, { "epoch": 0.82, "grad_norm": 8.748235702514648, "learning_rate": 1.7625858123569793e-05, "loss": 0.785, "step": 2884 }, { "epoch": 0.83, "grad_norm": 7.878302097320557, "learning_rate": 1.7621567505720825e-05, "loss": 0.7446, "step": 2885 }, { "epoch": 0.83, "grad_norm": 12.413397789001465, "learning_rate": 1.7617276887871854e-05, "loss": 1.0913, "step": 2886 }, { "epoch": 0.83, "grad_norm": 9.62625503540039, "learning_rate": 1.7612986270022883e-05, "loss": 0.729, "step": 2887 }, { "epoch": 0.83, "grad_norm": 9.879027366638184, "learning_rate": 1.7608695652173915e-05, "loss": 1.1488, "step": 2888 }, { "epoch": 0.83, "grad_norm": 12.591888427734375, "learning_rate": 1.7604405034324944e-05, "loss": 1.2602, "step": 2889 }, { "epoch": 0.83, "grad_norm": 9.228856086730957, "learning_rate": 1.7600114416475976e-05, "loss": 0.954, "step": 2890 }, { "epoch": 0.83, "grad_norm": 9.087937355041504, "learning_rate": 1.7595823798627e-05, "loss": 0.8813, "step": 2891 }, { "epoch": 0.83, "grad_norm": 11.661734580993652, "learning_rate": 1.7591533180778033e-05, "loss": 1.2382, "step": 2892 }, { "epoch": 0.83, "grad_norm": 10.572035789489746, "learning_rate": 1.7587242562929062e-05, "loss": 0.7731, "step": 2893 }, { "epoch": 0.83, "grad_norm": 9.623916625976562, "learning_rate": 1.758295194508009e-05, "loss": 1.0266, "step": 2894 }, { "epoch": 0.83, "grad_norm": 10.228483200073242, "learning_rate": 1.757866132723112e-05, "loss": 0.8791, "step": 2895 }, { "epoch": 0.83, "grad_norm": 9.038342475891113, "learning_rate": 1.7574370709382152e-05, "loss": 0.9342, "step": 2896 }, { "epoch": 0.83, "grad_norm": 8.881660461425781, "learning_rate": 1.7570080091533184e-05, "loss": 1.0093, "step": 2897 }, { "epoch": 0.83, "grad_norm": 11.040688514709473, "learning_rate": 1.756578947368421e-05, "loss": 1.0726, "step": 2898 }, { "epoch": 0.83, "grad_norm": 7.727034568786621, "learning_rate": 1.756149885583524e-05, "loss": 0.9336, "step": 2899 }, { "epoch": 0.83, "grad_norm": 10.47050666809082, "learning_rate": 1.755720823798627e-05, "loss": 1.1571, "step": 2900 }, { "epoch": 0.83, "grad_norm": 8.488235473632812, "learning_rate": 1.7552917620137302e-05, "loss": 0.917, "step": 2901 }, { "epoch": 0.83, "grad_norm": 9.848282814025879, "learning_rate": 1.7548627002288328e-05, "loss": 1.1838, "step": 2902 }, { "epoch": 0.83, "grad_norm": 8.979755401611328, "learning_rate": 1.754433638443936e-05, "loss": 1.234, "step": 2903 }, { "epoch": 0.83, "grad_norm": 9.087435722351074, "learning_rate": 1.754004576659039e-05, "loss": 1.1591, "step": 2904 }, { "epoch": 0.83, "grad_norm": 7.905564308166504, "learning_rate": 1.7535755148741417e-05, "loss": 0.9567, "step": 2905 }, { "epoch": 0.83, "grad_norm": 8.611047744750977, "learning_rate": 1.753146453089245e-05, "loss": 1.0315, "step": 2906 }, { "epoch": 0.83, "grad_norm": 8.980897903442383, "learning_rate": 1.752717391304348e-05, "loss": 1.0776, "step": 2907 }, { "epoch": 0.83, "grad_norm": 8.368359565734863, "learning_rate": 1.752288329519451e-05, "loss": 0.922, "step": 2908 }, { "epoch": 0.83, "grad_norm": 9.533286094665527, "learning_rate": 1.7518592677345536e-05, "loss": 1.0402, "step": 2909 }, { "epoch": 0.83, "grad_norm": 9.625426292419434, "learning_rate": 1.7514302059496568e-05, "loss": 1.0357, "step": 2910 }, { "epoch": 0.83, "grad_norm": 7.755105018615723, "learning_rate": 1.7510011441647597e-05, "loss": 0.9851, "step": 2911 }, { "epoch": 0.83, "grad_norm": 10.236310005187988, "learning_rate": 1.750572082379863e-05, "loss": 1.1758, "step": 2912 }, { "epoch": 0.83, "grad_norm": 8.932103157043457, "learning_rate": 1.7501430205949654e-05, "loss": 0.7994, "step": 2913 }, { "epoch": 0.83, "grad_norm": 9.338969230651855, "learning_rate": 1.7497139588100687e-05, "loss": 1.0408, "step": 2914 }, { "epoch": 0.83, "grad_norm": 8.797367095947266, "learning_rate": 1.749284897025172e-05, "loss": 1.1807, "step": 2915 }, { "epoch": 0.83, "grad_norm": 8.847677230834961, "learning_rate": 1.7488558352402747e-05, "loss": 0.8965, "step": 2916 }, { "epoch": 0.83, "grad_norm": 7.4715447425842285, "learning_rate": 1.7484267734553776e-05, "loss": 0.9171, "step": 2917 }, { "epoch": 0.83, "grad_norm": 8.547280311584473, "learning_rate": 1.7479977116704805e-05, "loss": 1.1989, "step": 2918 }, { "epoch": 0.83, "grad_norm": 8.58836841583252, "learning_rate": 1.7475686498855837e-05, "loss": 0.7985, "step": 2919 }, { "epoch": 0.84, "grad_norm": 12.026870727539062, "learning_rate": 1.7471395881006863e-05, "loss": 1.15, "step": 2920 }, { "epoch": 0.84, "grad_norm": 8.279645919799805, "learning_rate": 1.7467105263157895e-05, "loss": 0.9435, "step": 2921 }, { "epoch": 0.84, "grad_norm": 8.120729446411133, "learning_rate": 1.7462814645308927e-05, "loss": 0.8074, "step": 2922 }, { "epoch": 0.84, "grad_norm": 11.065886497497559, "learning_rate": 1.7458524027459956e-05, "loss": 1.1931, "step": 2923 }, { "epoch": 0.84, "grad_norm": 12.420193672180176, "learning_rate": 1.7454233409610984e-05, "loss": 1.3434, "step": 2924 }, { "epoch": 0.84, "grad_norm": 9.91895866394043, "learning_rate": 1.7449942791762013e-05, "loss": 1.0267, "step": 2925 }, { "epoch": 0.84, "grad_norm": 9.360223770141602, "learning_rate": 1.7445652173913045e-05, "loss": 0.9777, "step": 2926 }, { "epoch": 0.84, "grad_norm": 9.885186195373535, "learning_rate": 1.7441361556064074e-05, "loss": 1.035, "step": 2927 }, { "epoch": 0.84, "grad_norm": 10.056436538696289, "learning_rate": 1.7437070938215103e-05, "loss": 1.3135, "step": 2928 }, { "epoch": 0.84, "grad_norm": 11.684242248535156, "learning_rate": 1.743278032036613e-05, "loss": 1.1643, "step": 2929 }, { "epoch": 0.84, "grad_norm": 7.900113582611084, "learning_rate": 1.7428489702517164e-05, "loss": 0.8937, "step": 2930 }, { "epoch": 0.84, "grad_norm": 8.781208992004395, "learning_rate": 1.7424199084668193e-05, "loss": 0.6792, "step": 2931 }, { "epoch": 0.84, "grad_norm": 8.980362892150879, "learning_rate": 1.741990846681922e-05, "loss": 1.0824, "step": 2932 }, { "epoch": 0.84, "grad_norm": 10.64148235321045, "learning_rate": 1.7415617848970253e-05, "loss": 1.1883, "step": 2933 }, { "epoch": 0.84, "grad_norm": 9.602545738220215, "learning_rate": 1.7411327231121282e-05, "loss": 1.1871, "step": 2934 }, { "epoch": 0.84, "grad_norm": 12.133111000061035, "learning_rate": 1.740703661327231e-05, "loss": 1.5245, "step": 2935 }, { "epoch": 0.84, "grad_norm": 10.329170227050781, "learning_rate": 1.740274599542334e-05, "loss": 1.2061, "step": 2936 }, { "epoch": 0.84, "grad_norm": 8.698854446411133, "learning_rate": 1.7398455377574372e-05, "loss": 0.9893, "step": 2937 }, { "epoch": 0.84, "grad_norm": 9.945671081542969, "learning_rate": 1.73941647597254e-05, "loss": 1.381, "step": 2938 }, { "epoch": 0.84, "grad_norm": 8.756754875183105, "learning_rate": 1.738987414187643e-05, "loss": 0.9526, "step": 2939 }, { "epoch": 0.84, "grad_norm": 9.349870681762695, "learning_rate": 1.738558352402746e-05, "loss": 1.3003, "step": 2940 }, { "epoch": 0.84, "grad_norm": 9.652379035949707, "learning_rate": 1.738129290617849e-05, "loss": 1.1248, "step": 2941 }, { "epoch": 0.84, "grad_norm": 9.066186904907227, "learning_rate": 1.7377002288329523e-05, "loss": 1.0749, "step": 2942 }, { "epoch": 0.84, "grad_norm": 7.665074825286865, "learning_rate": 1.7372711670480548e-05, "loss": 0.9764, "step": 2943 }, { "epoch": 0.84, "grad_norm": 6.827920436859131, "learning_rate": 1.736842105263158e-05, "loss": 0.9367, "step": 2944 }, { "epoch": 0.84, "grad_norm": 9.536245346069336, "learning_rate": 1.736413043478261e-05, "loss": 1.0384, "step": 2945 }, { "epoch": 0.84, "grad_norm": 8.041460037231445, "learning_rate": 1.7359839816933638e-05, "loss": 0.9429, "step": 2946 }, { "epoch": 0.84, "grad_norm": 7.036596298217773, "learning_rate": 1.7355549199084666e-05, "loss": 0.8475, "step": 2947 }, { "epoch": 0.84, "grad_norm": 9.844472885131836, "learning_rate": 1.73512585812357e-05, "loss": 1.017, "step": 2948 }, { "epoch": 0.84, "grad_norm": 9.281776428222656, "learning_rate": 1.734696796338673e-05, "loss": 1.1786, "step": 2949 }, { "epoch": 0.84, "grad_norm": 9.428088188171387, "learning_rate": 1.7342677345537756e-05, "loss": 1.0895, "step": 2950 }, { "epoch": 0.84, "grad_norm": 9.562387466430664, "learning_rate": 1.7338386727688788e-05, "loss": 1.2424, "step": 2951 }, { "epoch": 0.84, "grad_norm": 9.42207145690918, "learning_rate": 1.7334096109839817e-05, "loss": 1.0374, "step": 2952 }, { "epoch": 0.84, "grad_norm": 9.31929874420166, "learning_rate": 1.732980549199085e-05, "loss": 1.0619, "step": 2953 }, { "epoch": 0.84, "grad_norm": 10.772723197937012, "learning_rate": 1.7325514874141875e-05, "loss": 1.2016, "step": 2954 }, { "epoch": 0.85, "grad_norm": 10.2816743850708, "learning_rate": 1.7321224256292907e-05, "loss": 1.1468, "step": 2955 }, { "epoch": 0.85, "grad_norm": 10.321672439575195, "learning_rate": 1.7316933638443935e-05, "loss": 1.0652, "step": 2956 }, { "epoch": 0.85, "grad_norm": 9.434853553771973, "learning_rate": 1.7312643020594968e-05, "loss": 1.1972, "step": 2957 }, { "epoch": 0.85, "grad_norm": 9.571502685546875, "learning_rate": 1.7308352402745996e-05, "loss": 0.9115, "step": 2958 }, { "epoch": 0.85, "grad_norm": 9.985664367675781, "learning_rate": 1.7304061784897025e-05, "loss": 1.0932, "step": 2959 }, { "epoch": 0.85, "grad_norm": 9.816332817077637, "learning_rate": 1.7299771167048057e-05, "loss": 1.2125, "step": 2960 }, { "epoch": 0.85, "grad_norm": 9.004415512084961, "learning_rate": 1.7295480549199083e-05, "loss": 1.018, "step": 2961 }, { "epoch": 0.85, "grad_norm": 9.268218040466309, "learning_rate": 1.7291189931350115e-05, "loss": 1.0142, "step": 2962 }, { "epoch": 0.85, "grad_norm": 7.872369766235352, "learning_rate": 1.7286899313501144e-05, "loss": 0.9751, "step": 2963 }, { "epoch": 0.85, "grad_norm": 8.151576042175293, "learning_rate": 1.7282608695652176e-05, "loss": 0.9471, "step": 2964 }, { "epoch": 0.85, "grad_norm": 9.960821151733398, "learning_rate": 1.7278318077803205e-05, "loss": 1.2366, "step": 2965 }, { "epoch": 0.85, "grad_norm": 8.932233810424805, "learning_rate": 1.7274027459954233e-05, "loss": 0.9703, "step": 2966 }, { "epoch": 0.85, "grad_norm": 8.246437072753906, "learning_rate": 1.7269736842105265e-05, "loss": 0.7479, "step": 2967 }, { "epoch": 0.85, "grad_norm": 11.880038261413574, "learning_rate": 1.7265446224256294e-05, "loss": 1.344, "step": 2968 }, { "epoch": 0.85, "grad_norm": 9.149515151977539, "learning_rate": 1.7261155606407323e-05, "loss": 1.066, "step": 2969 }, { "epoch": 0.85, "grad_norm": 9.845952033996582, "learning_rate": 1.7256864988558352e-05, "loss": 1.1182, "step": 2970 }, { "epoch": 0.85, "grad_norm": 10.139139175415039, "learning_rate": 1.7252574370709384e-05, "loss": 1.2844, "step": 2971 }, { "epoch": 0.85, "grad_norm": 8.441656112670898, "learning_rate": 1.724828375286041e-05, "loss": 0.835, "step": 2972 }, { "epoch": 0.85, "grad_norm": 10.01870346069336, "learning_rate": 1.724399313501144e-05, "loss": 1.1136, "step": 2973 }, { "epoch": 0.85, "grad_norm": 8.709890365600586, "learning_rate": 1.7239702517162474e-05, "loss": 1.0586, "step": 2974 }, { "epoch": 0.85, "grad_norm": 9.84835147857666, "learning_rate": 1.7235411899313502e-05, "loss": 1.1279, "step": 2975 }, { "epoch": 0.85, "grad_norm": 9.617959022521973, "learning_rate": 1.723112128146453e-05, "loss": 1.2462, "step": 2976 }, { "epoch": 0.85, "grad_norm": 8.675017356872559, "learning_rate": 1.722683066361556e-05, "loss": 0.9196, "step": 2977 }, { "epoch": 0.85, "grad_norm": 8.21192455291748, "learning_rate": 1.7222540045766592e-05, "loss": 1.1072, "step": 2978 }, { "epoch": 0.85, "grad_norm": 7.8568902015686035, "learning_rate": 1.721824942791762e-05, "loss": 1.0281, "step": 2979 }, { "epoch": 0.85, "grad_norm": 7.6694488525390625, "learning_rate": 1.721395881006865e-05, "loss": 0.902, "step": 2980 }, { "epoch": 0.85, "grad_norm": 8.478190422058105, "learning_rate": 1.720966819221968e-05, "loss": 0.9622, "step": 2981 }, { "epoch": 0.85, "grad_norm": 9.412214279174805, "learning_rate": 1.720537757437071e-05, "loss": 1.0104, "step": 2982 }, { "epoch": 0.85, "grad_norm": 9.078675270080566, "learning_rate": 1.7201086956521743e-05, "loss": 1.0179, "step": 2983 }, { "epoch": 0.85, "grad_norm": 9.115436553955078, "learning_rate": 1.7196796338672768e-05, "loss": 1.1088, "step": 2984 }, { "epoch": 0.85, "grad_norm": 7.480807781219482, "learning_rate": 1.71925057208238e-05, "loss": 0.8795, "step": 2985 }, { "epoch": 0.85, "grad_norm": 8.407920837402344, "learning_rate": 1.718821510297483e-05, "loss": 0.9394, "step": 2986 }, { "epoch": 0.85, "grad_norm": 9.731978416442871, "learning_rate": 1.7183924485125858e-05, "loss": 1.0389, "step": 2987 }, { "epoch": 0.85, "grad_norm": 8.553013801574707, "learning_rate": 1.7179633867276887e-05, "loss": 0.96, "step": 2988 }, { "epoch": 0.85, "grad_norm": 9.58366870880127, "learning_rate": 1.717534324942792e-05, "loss": 1.2158, "step": 2989 }, { "epoch": 0.86, "grad_norm": 12.59498119354248, "learning_rate": 1.7171052631578947e-05, "loss": 0.9116, "step": 2990 }, { "epoch": 0.86, "grad_norm": 8.889739036560059, "learning_rate": 1.7166762013729976e-05, "loss": 0.7415, "step": 2991 }, { "epoch": 0.86, "grad_norm": 7.628733158111572, "learning_rate": 1.716247139588101e-05, "loss": 0.7047, "step": 2992 }, { "epoch": 0.86, "grad_norm": 9.510764122009277, "learning_rate": 1.7158180778032037e-05, "loss": 0.9974, "step": 2993 }, { "epoch": 0.86, "grad_norm": 11.699542999267578, "learning_rate": 1.715389016018307e-05, "loss": 1.1202, "step": 2994 }, { "epoch": 0.86, "grad_norm": 11.28286075592041, "learning_rate": 1.7149599542334095e-05, "loss": 0.9915, "step": 2995 }, { "epoch": 0.86, "grad_norm": 9.383990287780762, "learning_rate": 1.7145308924485127e-05, "loss": 0.9895, "step": 2996 }, { "epoch": 0.86, "grad_norm": 9.332915306091309, "learning_rate": 1.7141018306636156e-05, "loss": 0.9564, "step": 2997 }, { "epoch": 0.86, "grad_norm": 8.677774429321289, "learning_rate": 1.7136727688787184e-05, "loss": 0.7396, "step": 2998 }, { "epoch": 0.86, "grad_norm": 10.549933433532715, "learning_rate": 1.7132437070938213e-05, "loss": 0.9943, "step": 2999 }, { "epoch": 0.86, "grad_norm": 10.350984573364258, "learning_rate": 1.7128146453089245e-05, "loss": 1.2945, "step": 3000 }, { "epoch": 0.86, "grad_norm": 11.026261329650879, "learning_rate": 1.7123855835240277e-05, "loss": 1.1736, "step": 3001 }, { "epoch": 0.86, "grad_norm": 9.979823112487793, "learning_rate": 1.7119565217391303e-05, "loss": 1.0384, "step": 3002 }, { "epoch": 0.86, "grad_norm": 8.890838623046875, "learning_rate": 1.7115274599542335e-05, "loss": 1.1439, "step": 3003 }, { "epoch": 0.86, "grad_norm": 9.048349380493164, "learning_rate": 1.7110983981693364e-05, "loss": 1.0301, "step": 3004 }, { "epoch": 0.86, "grad_norm": 9.576517105102539, "learning_rate": 1.7106693363844396e-05, "loss": 0.9483, "step": 3005 }, { "epoch": 0.86, "grad_norm": 11.636025428771973, "learning_rate": 1.710240274599542e-05, "loss": 1.1323, "step": 3006 }, { "epoch": 0.86, "grad_norm": 9.963531494140625, "learning_rate": 1.7098112128146453e-05, "loss": 1.1524, "step": 3007 }, { "epoch": 0.86, "grad_norm": 10.237224578857422, "learning_rate": 1.7093821510297486e-05, "loss": 1.2993, "step": 3008 }, { "epoch": 0.86, "grad_norm": 8.78339672088623, "learning_rate": 1.7089530892448514e-05, "loss": 0.8406, "step": 3009 }, { "epoch": 0.86, "grad_norm": 10.362381935119629, "learning_rate": 1.7085240274599543e-05, "loss": 1.0079, "step": 3010 }, { "epoch": 0.86, "grad_norm": 10.179791450500488, "learning_rate": 1.7080949656750572e-05, "loss": 1.2726, "step": 3011 }, { "epoch": 0.86, "grad_norm": 9.268647193908691, "learning_rate": 1.7076659038901604e-05, "loss": 1.1014, "step": 3012 }, { "epoch": 0.86, "grad_norm": 11.005672454833984, "learning_rate": 1.707236842105263e-05, "loss": 1.3472, "step": 3013 }, { "epoch": 0.86, "grad_norm": 8.85457706451416, "learning_rate": 1.706807780320366e-05, "loss": 1.0802, "step": 3014 }, { "epoch": 0.86, "grad_norm": 10.423368453979492, "learning_rate": 1.706378718535469e-05, "loss": 1.0567, "step": 3015 }, { "epoch": 0.86, "grad_norm": 8.43641471862793, "learning_rate": 1.7059496567505723e-05, "loss": 1.0063, "step": 3016 }, { "epoch": 0.86, "grad_norm": 8.766290664672852, "learning_rate": 1.705520594965675e-05, "loss": 1.1333, "step": 3017 }, { "epoch": 0.86, "grad_norm": 8.754609107971191, "learning_rate": 1.705091533180778e-05, "loss": 1.0378, "step": 3018 }, { "epoch": 0.86, "grad_norm": 8.543985366821289, "learning_rate": 1.7046624713958812e-05, "loss": 1.1846, "step": 3019 }, { "epoch": 0.86, "grad_norm": 7.664855480194092, "learning_rate": 1.704233409610984e-05, "loss": 1.0171, "step": 3020 }, { "epoch": 0.86, "grad_norm": 9.609197616577148, "learning_rate": 1.703804347826087e-05, "loss": 1.0862, "step": 3021 }, { "epoch": 0.86, "grad_norm": 8.459212303161621, "learning_rate": 1.70337528604119e-05, "loss": 0.9658, "step": 3022 }, { "epoch": 0.86, "grad_norm": 8.076066017150879, "learning_rate": 1.702946224256293e-05, "loss": 0.9613, "step": 3023 }, { "epoch": 0.86, "grad_norm": 8.68406867980957, "learning_rate": 1.7025171624713956e-05, "loss": 0.8566, "step": 3024 }, { "epoch": 0.87, "grad_norm": 9.512566566467285, "learning_rate": 1.7020881006864988e-05, "loss": 0.8524, "step": 3025 }, { "epoch": 0.87, "grad_norm": 8.096138954162598, "learning_rate": 1.701659038901602e-05, "loss": 0.682, "step": 3026 }, { "epoch": 0.87, "grad_norm": 8.064615249633789, "learning_rate": 1.701229977116705e-05, "loss": 1.1461, "step": 3027 }, { "epoch": 0.87, "grad_norm": 8.561189651489258, "learning_rate": 1.7008009153318078e-05, "loss": 1.0293, "step": 3028 }, { "epoch": 0.87, "grad_norm": 7.9466447830200195, "learning_rate": 1.7003718535469107e-05, "loss": 0.8737, "step": 3029 }, { "epoch": 0.87, "grad_norm": 8.754287719726562, "learning_rate": 1.699942791762014e-05, "loss": 0.9984, "step": 3030 }, { "epoch": 0.87, "grad_norm": 11.927494049072266, "learning_rate": 1.6995137299771168e-05, "loss": 1.2555, "step": 3031 }, { "epoch": 0.87, "grad_norm": 9.53317642211914, "learning_rate": 1.6990846681922196e-05, "loss": 1.0106, "step": 3032 }, { "epoch": 0.87, "grad_norm": 10.192111015319824, "learning_rate": 1.6986556064073225e-05, "loss": 1.1324, "step": 3033 }, { "epoch": 0.87, "grad_norm": 9.124427795410156, "learning_rate": 1.6982265446224257e-05, "loss": 0.9526, "step": 3034 }, { "epoch": 0.87, "grad_norm": 11.455728530883789, "learning_rate": 1.697797482837529e-05, "loss": 1.2767, "step": 3035 }, { "epoch": 0.87, "grad_norm": 9.49399471282959, "learning_rate": 1.6973684210526315e-05, "loss": 0.9035, "step": 3036 }, { "epoch": 0.87, "grad_norm": 8.540650367736816, "learning_rate": 1.6969393592677347e-05, "loss": 1.011, "step": 3037 }, { "epoch": 0.87, "grad_norm": 10.606213569641113, "learning_rate": 1.6965102974828376e-05, "loss": 1.1867, "step": 3038 }, { "epoch": 0.87, "grad_norm": 9.003400802612305, "learning_rate": 1.6960812356979405e-05, "loss": 1.0107, "step": 3039 }, { "epoch": 0.87, "grad_norm": 8.365703582763672, "learning_rate": 1.6956521739130433e-05, "loss": 0.8612, "step": 3040 }, { "epoch": 0.87, "grad_norm": 8.514734268188477, "learning_rate": 1.6952231121281465e-05, "loss": 0.8481, "step": 3041 }, { "epoch": 0.87, "grad_norm": 10.031387329101562, "learning_rate": 1.6947940503432498e-05, "loss": 0.8515, "step": 3042 }, { "epoch": 0.87, "grad_norm": 10.178349494934082, "learning_rate": 1.6943649885583523e-05, "loss": 1.3057, "step": 3043 }, { "epoch": 0.87, "grad_norm": 8.990435600280762, "learning_rate": 1.6939359267734555e-05, "loss": 0.7206, "step": 3044 }, { "epoch": 0.87, "grad_norm": 9.12608814239502, "learning_rate": 1.6935068649885584e-05, "loss": 0.9431, "step": 3045 }, { "epoch": 0.87, "grad_norm": 9.1451416015625, "learning_rate": 1.6930778032036616e-05, "loss": 0.8445, "step": 3046 }, { "epoch": 0.87, "grad_norm": 8.905163764953613, "learning_rate": 1.692648741418764e-05, "loss": 1.088, "step": 3047 }, { "epoch": 0.87, "grad_norm": 9.993717193603516, "learning_rate": 1.6922196796338674e-05, "loss": 1.2007, "step": 3048 }, { "epoch": 0.87, "grad_norm": 11.483057975769043, "learning_rate": 1.6917906178489702e-05, "loss": 1.0004, "step": 3049 }, { "epoch": 0.87, "grad_norm": 9.861003875732422, "learning_rate": 1.691361556064073e-05, "loss": 1.049, "step": 3050 }, { "epoch": 0.87, "grad_norm": 8.530832290649414, "learning_rate": 1.6909324942791763e-05, "loss": 0.7936, "step": 3051 }, { "epoch": 0.87, "grad_norm": 9.132295608520508, "learning_rate": 1.6905034324942792e-05, "loss": 0.6706, "step": 3052 }, { "epoch": 0.87, "grad_norm": 11.055174827575684, "learning_rate": 1.6900743707093824e-05, "loss": 0.957, "step": 3053 }, { "epoch": 0.87, "grad_norm": 11.633831977844238, "learning_rate": 1.689645308924485e-05, "loss": 1.2538, "step": 3054 }, { "epoch": 0.87, "grad_norm": 9.796125411987305, "learning_rate": 1.6892162471395882e-05, "loss": 1.0641, "step": 3055 }, { "epoch": 0.87, "grad_norm": 9.294535636901855, "learning_rate": 1.688787185354691e-05, "loss": 0.9855, "step": 3056 }, { "epoch": 0.87, "grad_norm": 10.125809669494629, "learning_rate": 1.6883581235697943e-05, "loss": 0.9899, "step": 3057 }, { "epoch": 0.87, "grad_norm": 8.996604919433594, "learning_rate": 1.6879290617848968e-05, "loss": 0.9472, "step": 3058 }, { "epoch": 0.88, "grad_norm": 10.40684986114502, "learning_rate": 1.6875e-05, "loss": 1.0968, "step": 3059 }, { "epoch": 0.88, "grad_norm": 10.77299976348877, "learning_rate": 1.6870709382151032e-05, "loss": 1.3063, "step": 3060 }, { "epoch": 0.88, "grad_norm": 9.238236427307129, "learning_rate": 1.686641876430206e-05, "loss": 1.0473, "step": 3061 }, { "epoch": 0.88, "grad_norm": 9.003403663635254, "learning_rate": 1.686212814645309e-05, "loss": 1.09, "step": 3062 }, { "epoch": 0.88, "grad_norm": 11.033190727233887, "learning_rate": 1.685783752860412e-05, "loss": 1.3033, "step": 3063 }, { "epoch": 0.88, "grad_norm": 12.842031478881836, "learning_rate": 1.685354691075515e-05, "loss": 1.1193, "step": 3064 }, { "epoch": 0.88, "grad_norm": 10.233874320983887, "learning_rate": 1.6849256292906176e-05, "loss": 0.924, "step": 3065 }, { "epoch": 0.88, "grad_norm": 9.496210098266602, "learning_rate": 1.684496567505721e-05, "loss": 1.2856, "step": 3066 }, { "epoch": 0.88, "grad_norm": 9.07898998260498, "learning_rate": 1.6840675057208237e-05, "loss": 0.8656, "step": 3067 }, { "epoch": 0.88, "grad_norm": 9.06139850616455, "learning_rate": 1.683638443935927e-05, "loss": 0.9459, "step": 3068 }, { "epoch": 0.88, "grad_norm": 10.527409553527832, "learning_rate": 1.6832093821510298e-05, "loss": 0.9977, "step": 3069 }, { "epoch": 0.88, "grad_norm": 10.252501487731934, "learning_rate": 1.6827803203661327e-05, "loss": 1.0964, "step": 3070 }, { "epoch": 0.88, "grad_norm": 11.455581665039062, "learning_rate": 1.682351258581236e-05, "loss": 1.286, "step": 3071 }, { "epoch": 0.88, "grad_norm": 9.602471351623535, "learning_rate": 1.6819221967963388e-05, "loss": 1.1101, "step": 3072 }, { "epoch": 0.88, "grad_norm": 8.360219955444336, "learning_rate": 1.6814931350114417e-05, "loss": 0.9595, "step": 3073 }, { "epoch": 0.88, "grad_norm": 8.391324043273926, "learning_rate": 1.6810640732265445e-05, "loss": 0.8692, "step": 3074 }, { "epoch": 0.88, "grad_norm": 9.762292861938477, "learning_rate": 1.6806350114416477e-05, "loss": 1.068, "step": 3075 }, { "epoch": 0.88, "grad_norm": 9.805365562438965, "learning_rate": 1.6802059496567503e-05, "loss": 1.0729, "step": 3076 }, { "epoch": 0.88, "grad_norm": 8.31454086303711, "learning_rate": 1.6797768878718535e-05, "loss": 0.7873, "step": 3077 }, { "epoch": 0.88, "grad_norm": 11.559967041015625, "learning_rate": 1.6793478260869567e-05, "loss": 1.0795, "step": 3078 }, { "epoch": 0.88, "grad_norm": 11.343534469604492, "learning_rate": 1.6789187643020596e-05, "loss": 1.1779, "step": 3079 }, { "epoch": 0.88, "grad_norm": 7.209791660308838, "learning_rate": 1.6784897025171625e-05, "loss": 0.788, "step": 3080 }, { "epoch": 0.88, "grad_norm": 9.449609756469727, "learning_rate": 1.6780606407322653e-05, "loss": 0.9448, "step": 3081 }, { "epoch": 0.88, "grad_norm": 8.622262001037598, "learning_rate": 1.6776315789473686e-05, "loss": 0.788, "step": 3082 }, { "epoch": 0.88, "grad_norm": 10.25172233581543, "learning_rate": 1.6772025171624714e-05, "loss": 0.9932, "step": 3083 }, { "epoch": 0.88, "grad_norm": 8.239367485046387, "learning_rate": 1.6767734553775743e-05, "loss": 0.8375, "step": 3084 }, { "epoch": 0.88, "grad_norm": 8.967516899108887, "learning_rate": 1.6763443935926775e-05, "loss": 0.8182, "step": 3085 }, { "epoch": 0.88, "grad_norm": 12.356708526611328, "learning_rate": 1.6759153318077804e-05, "loss": 1.0941, "step": 3086 }, { "epoch": 0.88, "grad_norm": 8.643538475036621, "learning_rate": 1.6754862700228836e-05, "loss": 1.0097, "step": 3087 }, { "epoch": 0.88, "grad_norm": 9.382425308227539, "learning_rate": 1.675057208237986e-05, "loss": 0.9607, "step": 3088 }, { "epoch": 0.88, "grad_norm": 10.312171936035156, "learning_rate": 1.6746281464530894e-05, "loss": 1.2677, "step": 3089 }, { "epoch": 0.88, "grad_norm": 10.411649703979492, "learning_rate": 1.6741990846681923e-05, "loss": 0.9854, "step": 3090 }, { "epoch": 0.88, "grad_norm": 10.293243408203125, "learning_rate": 1.673770022883295e-05, "loss": 1.0462, "step": 3091 }, { "epoch": 0.88, "grad_norm": 8.277262687683105, "learning_rate": 1.673340961098398e-05, "loss": 1.1064, "step": 3092 }, { "epoch": 0.88, "grad_norm": 12.135811805725098, "learning_rate": 1.6729118993135012e-05, "loss": 0.8687, "step": 3093 }, { "epoch": 0.89, "grad_norm": 12.491722106933594, "learning_rate": 1.6724828375286044e-05, "loss": 1.0355, "step": 3094 }, { "epoch": 0.89, "grad_norm": 7.733823299407959, "learning_rate": 1.672053775743707e-05, "loss": 0.7278, "step": 3095 }, { "epoch": 0.89, "grad_norm": 10.659414291381836, "learning_rate": 1.6716247139588102e-05, "loss": 1.0786, "step": 3096 }, { "epoch": 0.89, "grad_norm": 9.9557466506958, "learning_rate": 1.671195652173913e-05, "loss": 1.2499, "step": 3097 }, { "epoch": 0.89, "grad_norm": 10.577733039855957, "learning_rate": 1.6707665903890163e-05, "loss": 0.9821, "step": 3098 }, { "epoch": 0.89, "grad_norm": 8.668160438537598, "learning_rate": 1.6703375286041188e-05, "loss": 0.8874, "step": 3099 }, { "epoch": 0.89, "grad_norm": 9.793705940246582, "learning_rate": 1.669908466819222e-05, "loss": 0.9216, "step": 3100 }, { "epoch": 0.89, "grad_norm": 8.16748332977295, "learning_rate": 1.669479405034325e-05, "loss": 0.7774, "step": 3101 }, { "epoch": 0.89, "grad_norm": 10.827296257019043, "learning_rate": 1.6690503432494278e-05, "loss": 0.965, "step": 3102 }, { "epoch": 0.89, "grad_norm": 11.990835189819336, "learning_rate": 1.668621281464531e-05, "loss": 0.8418, "step": 3103 }, { "epoch": 0.89, "grad_norm": 12.614843368530273, "learning_rate": 1.668192219679634e-05, "loss": 0.8315, "step": 3104 }, { "epoch": 0.89, "grad_norm": 10.358224868774414, "learning_rate": 1.667763157894737e-05, "loss": 1.1432, "step": 3105 }, { "epoch": 0.89, "grad_norm": 10.044696807861328, "learning_rate": 1.6673340961098396e-05, "loss": 1.0201, "step": 3106 }, { "epoch": 0.89, "grad_norm": 10.316402435302734, "learning_rate": 1.666905034324943e-05, "loss": 0.946, "step": 3107 }, { "epoch": 0.89, "grad_norm": 9.924317359924316, "learning_rate": 1.6664759725400457e-05, "loss": 1.1153, "step": 3108 }, { "epoch": 0.89, "grad_norm": 8.860552787780762, "learning_rate": 1.666046910755149e-05, "loss": 1.0023, "step": 3109 }, { "epoch": 0.89, "grad_norm": 9.352194786071777, "learning_rate": 1.6656178489702515e-05, "loss": 0.8491, "step": 3110 }, { "epoch": 0.89, "grad_norm": 8.415346145629883, "learning_rate": 1.6651887871853547e-05, "loss": 0.6951, "step": 3111 }, { "epoch": 0.89, "grad_norm": 7.790891647338867, "learning_rate": 1.664759725400458e-05, "loss": 0.9119, "step": 3112 }, { "epoch": 0.89, "grad_norm": 8.764174461364746, "learning_rate": 1.6643306636155608e-05, "loss": 1.2364, "step": 3113 }, { "epoch": 0.89, "grad_norm": 8.090877532958984, "learning_rate": 1.6639016018306637e-05, "loss": 0.8456, "step": 3114 }, { "epoch": 0.89, "grad_norm": 8.34622859954834, "learning_rate": 1.6634725400457665e-05, "loss": 0.8178, "step": 3115 }, { "epoch": 0.89, "grad_norm": 10.495920181274414, "learning_rate": 1.6630434782608698e-05, "loss": 1.2764, "step": 3116 }, { "epoch": 0.89, "grad_norm": 10.262067794799805, "learning_rate": 1.6626144164759723e-05, "loss": 1.2381, "step": 3117 }, { "epoch": 0.89, "grad_norm": 10.799932479858398, "learning_rate": 1.6621853546910755e-05, "loss": 1.1879, "step": 3118 }, { "epoch": 0.89, "grad_norm": 12.72973346710205, "learning_rate": 1.6617562929061787e-05, "loss": 1.2686, "step": 3119 }, { "epoch": 0.89, "grad_norm": 10.531723976135254, "learning_rate": 1.6613272311212816e-05, "loss": 1.1524, "step": 3120 }, { "epoch": 0.89, "grad_norm": 10.594547271728516, "learning_rate": 1.6608981693363845e-05, "loss": 1.0767, "step": 3121 }, { "epoch": 0.89, "grad_norm": 9.052020072937012, "learning_rate": 1.6604691075514874e-05, "loss": 0.8348, "step": 3122 }, { "epoch": 0.89, "grad_norm": 9.655191421508789, "learning_rate": 1.6600400457665906e-05, "loss": 1.0825, "step": 3123 }, { "epoch": 0.89, "grad_norm": 11.176694869995117, "learning_rate": 1.6596109839816935e-05, "loss": 1.1807, "step": 3124 }, { "epoch": 0.89, "grad_norm": 9.434635162353516, "learning_rate": 1.6591819221967963e-05, "loss": 0.836, "step": 3125 }, { "epoch": 0.89, "grad_norm": 9.345967292785645, "learning_rate": 1.6587528604118992e-05, "loss": 1.0104, "step": 3126 }, { "epoch": 0.89, "grad_norm": 10.331924438476562, "learning_rate": 1.6583237986270024e-05, "loss": 1.0878, "step": 3127 }, { "epoch": 0.89, "grad_norm": 9.606528282165527, "learning_rate": 1.6578947368421053e-05, "loss": 1.068, "step": 3128 }, { "epoch": 0.9, "grad_norm": 7.378060817718506, "learning_rate": 1.6574656750572082e-05, "loss": 0.7387, "step": 3129 }, { "epoch": 0.9, "grad_norm": 9.276107788085938, "learning_rate": 1.6570366132723114e-05, "loss": 1.0986, "step": 3130 }, { "epoch": 0.9, "grad_norm": 7.8492631912231445, "learning_rate": 1.6566075514874143e-05, "loss": 1.0775, "step": 3131 }, { "epoch": 0.9, "grad_norm": 9.005735397338867, "learning_rate": 1.656178489702517e-05, "loss": 1.2657, "step": 3132 }, { "epoch": 0.9, "grad_norm": 9.524003982543945, "learning_rate": 1.65574942791762e-05, "loss": 1.1501, "step": 3133 }, { "epoch": 0.9, "grad_norm": 9.842630386352539, "learning_rate": 1.6553203661327232e-05, "loss": 1.3252, "step": 3134 }, { "epoch": 0.9, "grad_norm": 8.478659629821777, "learning_rate": 1.654891304347826e-05, "loss": 0.8981, "step": 3135 }, { "epoch": 0.9, "grad_norm": 8.528156280517578, "learning_rate": 1.654462242562929e-05, "loss": 0.9483, "step": 3136 }, { "epoch": 0.9, "grad_norm": 8.961124420166016, "learning_rate": 1.6540331807780322e-05, "loss": 1.2126, "step": 3137 }, { "epoch": 0.9, "grad_norm": 9.386552810668945, "learning_rate": 1.653604118993135e-05, "loss": 0.9615, "step": 3138 }, { "epoch": 0.9, "grad_norm": 8.276663780212402, "learning_rate": 1.6531750572082383e-05, "loss": 0.8203, "step": 3139 }, { "epoch": 0.9, "grad_norm": 9.044401168823242, "learning_rate": 1.652745995423341e-05, "loss": 1.0648, "step": 3140 }, { "epoch": 0.9, "grad_norm": 9.254098892211914, "learning_rate": 1.652316933638444e-05, "loss": 1.0039, "step": 3141 }, { "epoch": 0.9, "grad_norm": 7.073532581329346, "learning_rate": 1.651887871853547e-05, "loss": 0.7994, "step": 3142 }, { "epoch": 0.9, "grad_norm": 8.611063003540039, "learning_rate": 1.6514588100686498e-05, "loss": 1.0628, "step": 3143 }, { "epoch": 0.9, "grad_norm": 9.973498344421387, "learning_rate": 1.6510297482837527e-05, "loss": 0.9726, "step": 3144 }, { "epoch": 0.9, "grad_norm": 8.836945533752441, "learning_rate": 1.650600686498856e-05, "loss": 0.9552, "step": 3145 }, { "epoch": 0.9, "grad_norm": 9.754205703735352, "learning_rate": 1.650171624713959e-05, "loss": 0.8553, "step": 3146 }, { "epoch": 0.9, "grad_norm": 8.891960144042969, "learning_rate": 1.6497425629290616e-05, "loss": 0.9489, "step": 3147 }, { "epoch": 0.9, "grad_norm": 11.262051582336426, "learning_rate": 1.649313501144165e-05, "loss": 0.9542, "step": 3148 }, { "epoch": 0.9, "grad_norm": 7.30917501449585, "learning_rate": 1.6488844393592677e-05, "loss": 0.8196, "step": 3149 }, { "epoch": 0.9, "grad_norm": 9.353693008422852, "learning_rate": 1.648455377574371e-05, "loss": 0.9017, "step": 3150 }, { "epoch": 0.9, "grad_norm": 11.002487182617188, "learning_rate": 1.6480263157894735e-05, "loss": 0.8348, "step": 3151 }, { "epoch": 0.9, "grad_norm": 10.591752052307129, "learning_rate": 1.6475972540045767e-05, "loss": 0.7949, "step": 3152 }, { "epoch": 0.9, "grad_norm": 9.392352104187012, "learning_rate": 1.6471681922196796e-05, "loss": 0.8932, "step": 3153 }, { "epoch": 0.9, "grad_norm": 9.849568367004395, "learning_rate": 1.6467391304347828e-05, "loss": 1.1319, "step": 3154 }, { "epoch": 0.9, "grad_norm": 11.07901668548584, "learning_rate": 1.6463100686498857e-05, "loss": 1.2838, "step": 3155 }, { "epoch": 0.9, "grad_norm": 10.085216522216797, "learning_rate": 1.6458810068649886e-05, "loss": 0.9914, "step": 3156 }, { "epoch": 0.9, "grad_norm": 9.623714447021484, "learning_rate": 1.6454519450800918e-05, "loss": 0.9696, "step": 3157 }, { "epoch": 0.9, "grad_norm": 8.515621185302734, "learning_rate": 1.6450228832951943e-05, "loss": 0.8009, "step": 3158 }, { "epoch": 0.9, "grad_norm": 9.103911399841309, "learning_rate": 1.6445938215102975e-05, "loss": 0.8285, "step": 3159 }, { "epoch": 0.9, "grad_norm": 9.101774215698242, "learning_rate": 1.6441647597254004e-05, "loss": 0.7519, "step": 3160 }, { "epoch": 0.9, "grad_norm": 8.230019569396973, "learning_rate": 1.6437356979405036e-05, "loss": 0.9152, "step": 3161 }, { "epoch": 0.9, "grad_norm": 9.370148658752441, "learning_rate": 1.6433066361556065e-05, "loss": 0.8063, "step": 3162 }, { "epoch": 0.9, "grad_norm": 10.779153823852539, "learning_rate": 1.6428775743707094e-05, "loss": 1.3406, "step": 3163 }, { "epoch": 0.91, "grad_norm": 9.598963737487793, "learning_rate": 1.6424485125858126e-05, "loss": 0.8718, "step": 3164 }, { "epoch": 0.91, "grad_norm": 9.773757934570312, "learning_rate": 1.6420194508009155e-05, "loss": 1.1121, "step": 3165 }, { "epoch": 0.91, "grad_norm": 9.981141090393066, "learning_rate": 1.6415903890160183e-05, "loss": 0.881, "step": 3166 }, { "epoch": 0.91, "grad_norm": 9.617985725402832, "learning_rate": 1.6411613272311212e-05, "loss": 1.0474, "step": 3167 }, { "epoch": 0.91, "grad_norm": 8.892661094665527, "learning_rate": 1.6407322654462244e-05, "loss": 1.0152, "step": 3168 }, { "epoch": 0.91, "grad_norm": 8.366165161132812, "learning_rate": 1.640303203661327e-05, "loss": 1.1588, "step": 3169 }, { "epoch": 0.91, "grad_norm": 10.079965591430664, "learning_rate": 1.6398741418764302e-05, "loss": 0.9844, "step": 3170 }, { "epoch": 0.91, "grad_norm": 7.300412178039551, "learning_rate": 1.6394450800915334e-05, "loss": 0.7623, "step": 3171 }, { "epoch": 0.91, "grad_norm": 10.555304527282715, "learning_rate": 1.6390160183066363e-05, "loss": 0.8854, "step": 3172 }, { "epoch": 0.91, "grad_norm": 10.230918884277344, "learning_rate": 1.638586956521739e-05, "loss": 1.1895, "step": 3173 }, { "epoch": 0.91, "grad_norm": 10.42735481262207, "learning_rate": 1.638157894736842e-05, "loss": 1.2141, "step": 3174 }, { "epoch": 0.91, "grad_norm": 11.082991600036621, "learning_rate": 1.6377288329519452e-05, "loss": 1.0251, "step": 3175 }, { "epoch": 0.91, "grad_norm": 8.29589557647705, "learning_rate": 1.637299771167048e-05, "loss": 1.1099, "step": 3176 }, { "epoch": 0.91, "grad_norm": 8.441349029541016, "learning_rate": 1.636870709382151e-05, "loss": 0.9099, "step": 3177 }, { "epoch": 0.91, "grad_norm": 10.149102210998535, "learning_rate": 1.636441647597254e-05, "loss": 1.2221, "step": 3178 }, { "epoch": 0.91, "grad_norm": 11.996493339538574, "learning_rate": 1.636012585812357e-05, "loss": 1.2849, "step": 3179 }, { "epoch": 0.91, "grad_norm": 9.45579719543457, "learning_rate": 1.6355835240274603e-05, "loss": 1.152, "step": 3180 }, { "epoch": 0.91, "grad_norm": 10.456029891967773, "learning_rate": 1.635154462242563e-05, "loss": 1.0496, "step": 3181 }, { "epoch": 0.91, "grad_norm": 9.552337646484375, "learning_rate": 1.634725400457666e-05, "loss": 0.7783, "step": 3182 }, { "epoch": 0.91, "grad_norm": 8.918163299560547, "learning_rate": 1.634296338672769e-05, "loss": 1.0866, "step": 3183 }, { "epoch": 0.91, "grad_norm": 8.836033821105957, "learning_rate": 1.6338672768878718e-05, "loss": 1.0125, "step": 3184 }, { "epoch": 0.91, "grad_norm": 9.861259460449219, "learning_rate": 1.6334382151029747e-05, "loss": 1.1258, "step": 3185 }, { "epoch": 0.91, "grad_norm": 10.794419288635254, "learning_rate": 1.633009153318078e-05, "loss": 1.0022, "step": 3186 }, { "epoch": 0.91, "grad_norm": 7.860095500946045, "learning_rate": 1.6325800915331808e-05, "loss": 0.9663, "step": 3187 }, { "epoch": 0.91, "grad_norm": 9.200760841369629, "learning_rate": 1.6321510297482837e-05, "loss": 0.9666, "step": 3188 }, { "epoch": 0.91, "grad_norm": 7.224365234375, "learning_rate": 1.631721967963387e-05, "loss": 0.7837, "step": 3189 }, { "epoch": 0.91, "grad_norm": 10.976065635681152, "learning_rate": 1.6312929061784898e-05, "loss": 1.1979, "step": 3190 }, { "epoch": 0.91, "grad_norm": 9.322733879089355, "learning_rate": 1.630863844393593e-05, "loss": 0.828, "step": 3191 }, { "epoch": 0.91, "grad_norm": 10.57866096496582, "learning_rate": 1.6304347826086955e-05, "loss": 0.9835, "step": 3192 }, { "epoch": 0.91, "grad_norm": 8.49717903137207, "learning_rate": 1.6300057208237987e-05, "loss": 0.9419, "step": 3193 }, { "epoch": 0.91, "grad_norm": 7.884578227996826, "learning_rate": 1.6295766590389016e-05, "loss": 0.6729, "step": 3194 }, { "epoch": 0.91, "grad_norm": 8.000340461730957, "learning_rate": 1.6291475972540045e-05, "loss": 0.8283, "step": 3195 }, { "epoch": 0.91, "grad_norm": 12.245285987854004, "learning_rate": 1.6287185354691074e-05, "loss": 1.363, "step": 3196 }, { "epoch": 0.91, "grad_norm": 10.48442268371582, "learning_rate": 1.6282894736842106e-05, "loss": 0.944, "step": 3197 }, { "epoch": 0.91, "grad_norm": 8.397787094116211, "learning_rate": 1.6278604118993138e-05, "loss": 0.9394, "step": 3198 }, { "epoch": 0.92, "grad_norm": 10.015027046203613, "learning_rate": 1.6274313501144163e-05, "loss": 0.8126, "step": 3199 }, { "epoch": 0.92, "grad_norm": 9.795433044433594, "learning_rate": 1.6270022883295195e-05, "loss": 1.1504, "step": 3200 }, { "epoch": 0.92, "grad_norm": 8.850866317749023, "learning_rate": 1.6265732265446224e-05, "loss": 1.0913, "step": 3201 }, { "epoch": 0.92, "grad_norm": 10.196776390075684, "learning_rate": 1.6261441647597256e-05, "loss": 0.9343, "step": 3202 }, { "epoch": 0.92, "grad_norm": 10.463024139404297, "learning_rate": 1.6257151029748282e-05, "loss": 1.1631, "step": 3203 }, { "epoch": 0.92, "grad_norm": 12.322643280029297, "learning_rate": 1.6252860411899314e-05, "loss": 1.1612, "step": 3204 }, { "epoch": 0.92, "grad_norm": 10.773918151855469, "learning_rate": 1.6248569794050346e-05, "loss": 1.2507, "step": 3205 }, { "epoch": 0.92, "grad_norm": 9.977486610412598, "learning_rate": 1.6244279176201375e-05, "loss": 0.9428, "step": 3206 }, { "epoch": 0.92, "grad_norm": 9.164650917053223, "learning_rate": 1.6239988558352404e-05, "loss": 0.9418, "step": 3207 }, { "epoch": 0.92, "grad_norm": 10.065773010253906, "learning_rate": 1.6235697940503432e-05, "loss": 0.9398, "step": 3208 }, { "epoch": 0.92, "grad_norm": 9.81015682220459, "learning_rate": 1.6231407322654464e-05, "loss": 0.8495, "step": 3209 }, { "epoch": 0.92, "grad_norm": 8.081978797912598, "learning_rate": 1.622711670480549e-05, "loss": 0.8546, "step": 3210 }, { "epoch": 0.92, "grad_norm": 10.246928215026855, "learning_rate": 1.6222826086956522e-05, "loss": 0.8802, "step": 3211 }, { "epoch": 0.92, "grad_norm": 11.4380521774292, "learning_rate": 1.621853546910755e-05, "loss": 1.0022, "step": 3212 }, { "epoch": 0.92, "grad_norm": 10.336760520935059, "learning_rate": 1.6214244851258583e-05, "loss": 1.0296, "step": 3213 }, { "epoch": 0.92, "grad_norm": 10.019309997558594, "learning_rate": 1.6209954233409612e-05, "loss": 0.8747, "step": 3214 }, { "epoch": 0.92, "grad_norm": 8.206274032592773, "learning_rate": 1.620566361556064e-05, "loss": 0.9604, "step": 3215 }, { "epoch": 0.92, "grad_norm": 10.94852066040039, "learning_rate": 1.6201372997711673e-05, "loss": 0.9252, "step": 3216 }, { "epoch": 0.92, "grad_norm": 9.24874496459961, "learning_rate": 1.61970823798627e-05, "loss": 0.9816, "step": 3217 }, { "epoch": 0.92, "grad_norm": 9.955107688903809, "learning_rate": 1.619279176201373e-05, "loss": 1.4561, "step": 3218 }, { "epoch": 0.92, "grad_norm": 10.70648193359375, "learning_rate": 1.618850114416476e-05, "loss": 1.0059, "step": 3219 }, { "epoch": 0.92, "grad_norm": 11.166799545288086, "learning_rate": 1.618421052631579e-05, "loss": 1.0511, "step": 3220 }, { "epoch": 0.92, "grad_norm": 7.510572910308838, "learning_rate": 1.6179919908466816e-05, "loss": 0.8416, "step": 3221 }, { "epoch": 0.92, "grad_norm": 8.313467979431152, "learning_rate": 1.617562929061785e-05, "loss": 0.9154, "step": 3222 }, { "epoch": 0.92, "grad_norm": 8.768488883972168, "learning_rate": 1.617133867276888e-05, "loss": 0.7979, "step": 3223 }, { "epoch": 0.92, "grad_norm": 8.637980461120605, "learning_rate": 1.616704805491991e-05, "loss": 0.9436, "step": 3224 }, { "epoch": 0.92, "grad_norm": 11.256755828857422, "learning_rate": 1.616275743707094e-05, "loss": 1.0568, "step": 3225 }, { "epoch": 0.92, "grad_norm": 11.269331932067871, "learning_rate": 1.6158466819221967e-05, "loss": 1.2881, "step": 3226 }, { "epoch": 0.92, "grad_norm": 8.064380645751953, "learning_rate": 1.6154176201373e-05, "loss": 0.8013, "step": 3227 }, { "epoch": 0.92, "grad_norm": 9.082005500793457, "learning_rate": 1.6149885583524028e-05, "loss": 1.0026, "step": 3228 }, { "epoch": 0.92, "grad_norm": 8.47630500793457, "learning_rate": 1.6145594965675057e-05, "loss": 1.1559, "step": 3229 }, { "epoch": 0.92, "grad_norm": 8.413256645202637, "learning_rate": 1.6141304347826086e-05, "loss": 0.9758, "step": 3230 }, { "epoch": 0.92, "grad_norm": 9.38217830657959, "learning_rate": 1.6137013729977118e-05, "loss": 1.1137, "step": 3231 }, { "epoch": 0.92, "grad_norm": 8.091568946838379, "learning_rate": 1.613272311212815e-05, "loss": 1.0385, "step": 3232 }, { "epoch": 0.92, "grad_norm": 11.839828491210938, "learning_rate": 1.6128432494279175e-05, "loss": 1.0217, "step": 3233 }, { "epoch": 0.93, "grad_norm": 9.954550743103027, "learning_rate": 1.6124141876430207e-05, "loss": 0.9288, "step": 3234 }, { "epoch": 0.93, "grad_norm": 9.699662208557129, "learning_rate": 1.6119851258581236e-05, "loss": 1.166, "step": 3235 }, { "epoch": 0.93, "grad_norm": 9.94688892364502, "learning_rate": 1.6115560640732265e-05, "loss": 0.8658, "step": 3236 }, { "epoch": 0.93, "grad_norm": 8.275080680847168, "learning_rate": 1.6111270022883294e-05, "loss": 0.801, "step": 3237 }, { "epoch": 0.93, "grad_norm": 7.638550758361816, "learning_rate": 1.6106979405034326e-05, "loss": 0.8034, "step": 3238 }, { "epoch": 0.93, "grad_norm": 9.947699546813965, "learning_rate": 1.6102688787185358e-05, "loss": 1.234, "step": 3239 }, { "epoch": 0.93, "grad_norm": 9.809272766113281, "learning_rate": 1.6098398169336383e-05, "loss": 1.0888, "step": 3240 }, { "epoch": 0.93, "grad_norm": 9.042963027954102, "learning_rate": 1.6094107551487416e-05, "loss": 1.2458, "step": 3241 }, { "epoch": 0.93, "grad_norm": 9.463010787963867, "learning_rate": 1.6089816933638444e-05, "loss": 0.8898, "step": 3242 }, { "epoch": 0.93, "grad_norm": 10.513991355895996, "learning_rate": 1.6085526315789476e-05, "loss": 1.0433, "step": 3243 }, { "epoch": 0.93, "grad_norm": 11.259737968444824, "learning_rate": 1.6081235697940502e-05, "loss": 1.1836, "step": 3244 }, { "epoch": 0.93, "grad_norm": 11.464911460876465, "learning_rate": 1.6076945080091534e-05, "loss": 1.3609, "step": 3245 }, { "epoch": 0.93, "grad_norm": 7.19293212890625, "learning_rate": 1.6072654462242563e-05, "loss": 0.8884, "step": 3246 }, { "epoch": 0.93, "grad_norm": 7.9146037101745605, "learning_rate": 1.606836384439359e-05, "loss": 0.7844, "step": 3247 }, { "epoch": 0.93, "grad_norm": 8.42154598236084, "learning_rate": 1.6064073226544624e-05, "loss": 0.9371, "step": 3248 }, { "epoch": 0.93, "grad_norm": 8.096179008483887, "learning_rate": 1.6059782608695652e-05, "loss": 0.8173, "step": 3249 }, { "epoch": 0.93, "grad_norm": 9.047220230102539, "learning_rate": 1.6055491990846685e-05, "loss": 1.1358, "step": 3250 }, { "epoch": 0.93, "grad_norm": 9.304205894470215, "learning_rate": 1.605120137299771e-05, "loss": 0.8731, "step": 3251 }, { "epoch": 0.93, "grad_norm": 9.897394180297852, "learning_rate": 1.6046910755148742e-05, "loss": 1.0724, "step": 3252 }, { "epoch": 0.93, "grad_norm": 9.924748420715332, "learning_rate": 1.604262013729977e-05, "loss": 0.9685, "step": 3253 }, { "epoch": 0.93, "grad_norm": 9.466620445251465, "learning_rate": 1.6038329519450803e-05, "loss": 0.8894, "step": 3254 }, { "epoch": 0.93, "grad_norm": 8.961726188659668, "learning_rate": 1.603403890160183e-05, "loss": 0.9971, "step": 3255 }, { "epoch": 0.93, "grad_norm": 9.545747756958008, "learning_rate": 1.602974828375286e-05, "loss": 0.8442, "step": 3256 }, { "epoch": 0.93, "grad_norm": 9.017826080322266, "learning_rate": 1.6025457665903893e-05, "loss": 0.9271, "step": 3257 }, { "epoch": 0.93, "grad_norm": 7.684579372406006, "learning_rate": 1.602116704805492e-05, "loss": 0.547, "step": 3258 }, { "epoch": 0.93, "grad_norm": 8.903583526611328, "learning_rate": 1.601687643020595e-05, "loss": 0.9456, "step": 3259 }, { "epoch": 0.93, "grad_norm": 11.6437349319458, "learning_rate": 1.601258581235698e-05, "loss": 1.0433, "step": 3260 }, { "epoch": 0.93, "grad_norm": 8.919283866882324, "learning_rate": 1.600829519450801e-05, "loss": 0.9232, "step": 3261 }, { "epoch": 0.93, "grad_norm": 11.192386627197266, "learning_rate": 1.6004004576659037e-05, "loss": 1.0861, "step": 3262 }, { "epoch": 0.93, "grad_norm": 10.518370628356934, "learning_rate": 1.599971395881007e-05, "loss": 1.1146, "step": 3263 }, { "epoch": 0.93, "grad_norm": 10.677868843078613, "learning_rate": 1.5995423340961098e-05, "loss": 1.271, "step": 3264 }, { "epoch": 0.93, "grad_norm": 10.702676773071289, "learning_rate": 1.599113272311213e-05, "loss": 1.2026, "step": 3265 }, { "epoch": 0.93, "grad_norm": 8.851753234863281, "learning_rate": 1.598684210526316e-05, "loss": 0.9492, "step": 3266 }, { "epoch": 0.93, "grad_norm": 9.764097213745117, "learning_rate": 1.5982551487414187e-05, "loss": 0.9993, "step": 3267 }, { "epoch": 0.93, "grad_norm": 7.402448654174805, "learning_rate": 1.597826086956522e-05, "loss": 0.7815, "step": 3268 }, { "epoch": 0.94, "grad_norm": 9.05008316040039, "learning_rate": 1.5973970251716248e-05, "loss": 0.7609, "step": 3269 }, { "epoch": 0.94, "grad_norm": 10.128369331359863, "learning_rate": 1.5969679633867277e-05, "loss": 1.1545, "step": 3270 }, { "epoch": 0.94, "grad_norm": 10.443517684936523, "learning_rate": 1.5965389016018306e-05, "loss": 1.0197, "step": 3271 }, { "epoch": 0.94, "grad_norm": 11.30739688873291, "learning_rate": 1.5961098398169338e-05, "loss": 1.1871, "step": 3272 }, { "epoch": 0.94, "grad_norm": 9.866897583007812, "learning_rate": 1.5956807780320367e-05, "loss": 0.9183, "step": 3273 }, { "epoch": 0.94, "grad_norm": 10.537786483764648, "learning_rate": 1.5952517162471395e-05, "loss": 1.3296, "step": 3274 }, { "epoch": 0.94, "grad_norm": 8.425891876220703, "learning_rate": 1.5948226544622428e-05, "loss": 0.8341, "step": 3275 }, { "epoch": 0.94, "grad_norm": 8.038627624511719, "learning_rate": 1.5943935926773456e-05, "loss": 0.8743, "step": 3276 }, { "epoch": 0.94, "grad_norm": 8.650530815124512, "learning_rate": 1.5939645308924485e-05, "loss": 0.9197, "step": 3277 }, { "epoch": 0.94, "grad_norm": 10.73486042022705, "learning_rate": 1.5935354691075514e-05, "loss": 1.2104, "step": 3278 }, { "epoch": 0.94, "grad_norm": 10.41913890838623, "learning_rate": 1.5931064073226546e-05, "loss": 1.0029, "step": 3279 }, { "epoch": 0.94, "grad_norm": 126.80941772460938, "learning_rate": 1.5926773455377575e-05, "loss": 1.2048, "step": 3280 }, { "epoch": 0.94, "grad_norm": 11.67647933959961, "learning_rate": 1.5922482837528604e-05, "loss": 1.3911, "step": 3281 }, { "epoch": 0.94, "grad_norm": 11.207754135131836, "learning_rate": 1.5918192219679636e-05, "loss": 1.153, "step": 3282 }, { "epoch": 0.94, "grad_norm": 11.849954605102539, "learning_rate": 1.5913901601830664e-05, "loss": 1.009, "step": 3283 }, { "epoch": 0.94, "grad_norm": 10.543679237365723, "learning_rate": 1.5909610983981697e-05, "loss": 0.8047, "step": 3284 }, { "epoch": 0.94, "grad_norm": 9.641677856445312, "learning_rate": 1.5905320366132722e-05, "loss": 0.6823, "step": 3285 }, { "epoch": 0.94, "grad_norm": 12.406120300292969, "learning_rate": 1.5901029748283754e-05, "loss": 1.2721, "step": 3286 }, { "epoch": 0.94, "grad_norm": 10.462618827819824, "learning_rate": 1.5896739130434783e-05, "loss": 1.0017, "step": 3287 }, { "epoch": 0.94, "grad_norm": 8.184019088745117, "learning_rate": 1.589244851258581e-05, "loss": 0.9553, "step": 3288 }, { "epoch": 0.94, "grad_norm": 7.911831855773926, "learning_rate": 1.588815789473684e-05, "loss": 0.8236, "step": 3289 }, { "epoch": 0.94, "grad_norm": 8.127483367919922, "learning_rate": 1.5883867276887873e-05, "loss": 0.8052, "step": 3290 }, { "epoch": 0.94, "grad_norm": 10.355116844177246, "learning_rate": 1.5879576659038905e-05, "loss": 0.9767, "step": 3291 }, { "epoch": 0.94, "grad_norm": 10.917792320251465, "learning_rate": 1.587528604118993e-05, "loss": 1.0647, "step": 3292 }, { "epoch": 0.94, "grad_norm": 10.53119945526123, "learning_rate": 1.5870995423340962e-05, "loss": 1.1415, "step": 3293 }, { "epoch": 0.94, "grad_norm": 9.361116409301758, "learning_rate": 1.586670480549199e-05, "loss": 1.0765, "step": 3294 }, { "epoch": 0.94, "grad_norm": 9.876181602478027, "learning_rate": 1.5862414187643023e-05, "loss": 1.2355, "step": 3295 }, { "epoch": 0.94, "grad_norm": 7.987552642822266, "learning_rate": 1.585812356979405e-05, "loss": 0.7288, "step": 3296 }, { "epoch": 0.94, "grad_norm": 16.646928787231445, "learning_rate": 1.585383295194508e-05, "loss": 1.0366, "step": 3297 }, { "epoch": 0.94, "grad_norm": 10.816709518432617, "learning_rate": 1.584954233409611e-05, "loss": 1.3376, "step": 3298 }, { "epoch": 0.94, "grad_norm": 7.854706764221191, "learning_rate": 1.5845251716247138e-05, "loss": 0.9233, "step": 3299 }, { "epoch": 0.94, "grad_norm": 7.691450119018555, "learning_rate": 1.584096109839817e-05, "loss": 0.8012, "step": 3300 }, { "epoch": 0.94, "grad_norm": 9.379831314086914, "learning_rate": 1.58366704805492e-05, "loss": 0.8886, "step": 3301 }, { "epoch": 0.94, "grad_norm": 9.529094696044922, "learning_rate": 1.583237986270023e-05, "loss": 1.0188, "step": 3302 }, { "epoch": 0.94, "grad_norm": 10.709331512451172, "learning_rate": 1.5828089244851257e-05, "loss": 1.0348, "step": 3303 }, { "epoch": 0.95, "grad_norm": 10.2379732131958, "learning_rate": 1.582379862700229e-05, "loss": 0.8678, "step": 3304 }, { "epoch": 0.95, "grad_norm": 9.242757797241211, "learning_rate": 1.5819508009153318e-05, "loss": 1.1564, "step": 3305 }, { "epoch": 0.95, "grad_norm": 8.454890251159668, "learning_rate": 1.581521739130435e-05, "loss": 0.9922, "step": 3306 }, { "epoch": 0.95, "grad_norm": 8.57014274597168, "learning_rate": 1.5810926773455375e-05, "loss": 1.048, "step": 3307 }, { "epoch": 0.95, "grad_norm": 8.023551940917969, "learning_rate": 1.5806636155606407e-05, "loss": 0.8361, "step": 3308 }, { "epoch": 0.95, "grad_norm": 9.4293794631958, "learning_rate": 1.580234553775744e-05, "loss": 1.1393, "step": 3309 }, { "epoch": 0.95, "grad_norm": 9.56303882598877, "learning_rate": 1.5798054919908468e-05, "loss": 1.1764, "step": 3310 }, { "epoch": 0.95, "grad_norm": 9.185225486755371, "learning_rate": 1.5793764302059497e-05, "loss": 0.869, "step": 3311 }, { "epoch": 0.95, "grad_norm": 9.167194366455078, "learning_rate": 1.5789473684210526e-05, "loss": 0.9115, "step": 3312 }, { "epoch": 0.95, "grad_norm": 9.10361385345459, "learning_rate": 1.5785183066361558e-05, "loss": 1.1696, "step": 3313 }, { "epoch": 0.95, "grad_norm": 9.448793411254883, "learning_rate": 1.5780892448512583e-05, "loss": 0.8851, "step": 3314 }, { "epoch": 0.95, "grad_norm": 8.414605140686035, "learning_rate": 1.5776601830663616e-05, "loss": 0.7982, "step": 3315 }, { "epoch": 0.95, "grad_norm": 9.551800727844238, "learning_rate": 1.5772311212814644e-05, "loss": 0.8693, "step": 3316 }, { "epoch": 0.95, "grad_norm": 9.259560585021973, "learning_rate": 1.5768020594965676e-05, "loss": 0.8606, "step": 3317 }, { "epoch": 0.95, "grad_norm": 9.371599197387695, "learning_rate": 1.5763729977116705e-05, "loss": 0.9186, "step": 3318 }, { "epoch": 0.95, "grad_norm": 9.043257713317871, "learning_rate": 1.5759439359267734e-05, "loss": 0.7495, "step": 3319 }, { "epoch": 0.95, "grad_norm": 9.614897727966309, "learning_rate": 1.5755148741418766e-05, "loss": 0.7925, "step": 3320 }, { "epoch": 0.95, "grad_norm": 9.677974700927734, "learning_rate": 1.5750858123569795e-05, "loss": 1.086, "step": 3321 }, { "epoch": 0.95, "grad_norm": 11.341986656188965, "learning_rate": 1.5746567505720824e-05, "loss": 1.1042, "step": 3322 }, { "epoch": 0.95, "grad_norm": 10.899744033813477, "learning_rate": 1.5742276887871852e-05, "loss": 0.9006, "step": 3323 }, { "epoch": 0.95, "grad_norm": 9.07625961303711, "learning_rate": 1.5737986270022885e-05, "loss": 0.9405, "step": 3324 }, { "epoch": 0.95, "grad_norm": 10.768837928771973, "learning_rate": 1.5733695652173913e-05, "loss": 1.0818, "step": 3325 }, { "epoch": 0.95, "grad_norm": 10.189533233642578, "learning_rate": 1.5729405034324942e-05, "loss": 1.2383, "step": 3326 }, { "epoch": 0.95, "grad_norm": 9.930407524108887, "learning_rate": 1.5725114416475974e-05, "loss": 0.8071, "step": 3327 }, { "epoch": 0.95, "grad_norm": 7.634292125701904, "learning_rate": 1.5720823798627003e-05, "loss": 0.8753, "step": 3328 }, { "epoch": 0.95, "grad_norm": 9.883581161499023, "learning_rate": 1.5716533180778032e-05, "loss": 0.8969, "step": 3329 }, { "epoch": 0.95, "grad_norm": 9.58691120147705, "learning_rate": 1.571224256292906e-05, "loss": 1.2702, "step": 3330 }, { "epoch": 0.95, "grad_norm": 11.525691986083984, "learning_rate": 1.5707951945080093e-05, "loss": 0.9529, "step": 3331 }, { "epoch": 0.95, "grad_norm": 10.9685640335083, "learning_rate": 1.570366132723112e-05, "loss": 0.9871, "step": 3332 }, { "epoch": 0.95, "grad_norm": 10.324636459350586, "learning_rate": 1.569937070938215e-05, "loss": 1.0842, "step": 3333 }, { "epoch": 0.95, "grad_norm": 8.318097114562988, "learning_rate": 1.5695080091533182e-05, "loss": 0.8873, "step": 3334 }, { "epoch": 0.95, "grad_norm": 9.906346321105957, "learning_rate": 1.569078947368421e-05, "loss": 0.9217, "step": 3335 }, { "epoch": 0.95, "grad_norm": 9.697980880737305, "learning_rate": 1.5686498855835243e-05, "loss": 1.0565, "step": 3336 }, { "epoch": 0.95, "grad_norm": 8.229022026062012, "learning_rate": 1.568220823798627e-05, "loss": 0.7354, "step": 3337 }, { "epoch": 0.95, "grad_norm": 9.6383695602417, "learning_rate": 1.56779176201373e-05, "loss": 1.015, "step": 3338 }, { "epoch": 0.96, "grad_norm": 10.653667449951172, "learning_rate": 1.567362700228833e-05, "loss": 1.5088, "step": 3339 }, { "epoch": 0.96, "grad_norm": 8.933518409729004, "learning_rate": 1.566933638443936e-05, "loss": 1.1612, "step": 3340 }, { "epoch": 0.96, "grad_norm": 10.447750091552734, "learning_rate": 1.5665045766590387e-05, "loss": 1.19, "step": 3341 }, { "epoch": 0.96, "grad_norm": 8.314963340759277, "learning_rate": 1.566075514874142e-05, "loss": 0.6001, "step": 3342 }, { "epoch": 0.96, "grad_norm": 9.529887199401855, "learning_rate": 1.565646453089245e-05, "loss": 0.9961, "step": 3343 }, { "epoch": 0.96, "grad_norm": 9.230010986328125, "learning_rate": 1.5652173913043477e-05, "loss": 0.7727, "step": 3344 }, { "epoch": 0.96, "grad_norm": 9.142366409301758, "learning_rate": 1.564788329519451e-05, "loss": 0.8415, "step": 3345 }, { "epoch": 0.96, "grad_norm": 8.853986740112305, "learning_rate": 1.5643592677345538e-05, "loss": 1.0927, "step": 3346 }, { "epoch": 0.96, "grad_norm": 9.466641426086426, "learning_rate": 1.563930205949657e-05, "loss": 1.0912, "step": 3347 }, { "epoch": 0.96, "grad_norm": 8.603524208068848, "learning_rate": 1.5635011441647595e-05, "loss": 0.9035, "step": 3348 }, { "epoch": 0.96, "grad_norm": 12.175256729125977, "learning_rate": 1.5630720823798628e-05, "loss": 1.3209, "step": 3349 }, { "epoch": 0.96, "grad_norm": 9.855653762817383, "learning_rate": 1.5626430205949656e-05, "loss": 0.9801, "step": 3350 }, { "epoch": 0.96, "grad_norm": 9.72407054901123, "learning_rate": 1.562213958810069e-05, "loss": 0.819, "step": 3351 }, { "epoch": 0.96, "grad_norm": 11.676117897033691, "learning_rate": 1.5617848970251717e-05, "loss": 1.2892, "step": 3352 }, { "epoch": 0.96, "grad_norm": 9.289527893066406, "learning_rate": 1.5613558352402746e-05, "loss": 1.0813, "step": 3353 }, { "epoch": 0.96, "grad_norm": 10.125395774841309, "learning_rate": 1.5609267734553778e-05, "loss": 0.9076, "step": 3354 }, { "epoch": 0.96, "grad_norm": 10.02776050567627, "learning_rate": 1.5604977116704804e-05, "loss": 1.1723, "step": 3355 }, { "epoch": 0.96, "grad_norm": 9.241559982299805, "learning_rate": 1.5600686498855836e-05, "loss": 0.8031, "step": 3356 }, { "epoch": 0.96, "grad_norm": 10.31569766998291, "learning_rate": 1.5596395881006864e-05, "loss": 0.8867, "step": 3357 }, { "epoch": 0.96, "grad_norm": 8.171711921691895, "learning_rate": 1.5592105263157897e-05, "loss": 1.0547, "step": 3358 }, { "epoch": 0.96, "grad_norm": 9.938399314880371, "learning_rate": 1.5587814645308925e-05, "loss": 0.9277, "step": 3359 }, { "epoch": 0.96, "grad_norm": 9.381302833557129, "learning_rate": 1.5583524027459954e-05, "loss": 0.9156, "step": 3360 }, { "epoch": 0.96, "grad_norm": 10.064274787902832, "learning_rate": 1.5579233409610986e-05, "loss": 0.897, "step": 3361 }, { "epoch": 0.96, "grad_norm": 8.624159812927246, "learning_rate": 1.5574942791762015e-05, "loss": 0.8896, "step": 3362 }, { "epoch": 0.96, "grad_norm": 10.71406078338623, "learning_rate": 1.5570652173913044e-05, "loss": 0.977, "step": 3363 }, { "epoch": 0.96, "grad_norm": 10.768048286437988, "learning_rate": 1.5566361556064073e-05, "loss": 1.2492, "step": 3364 }, { "epoch": 0.96, "grad_norm": 9.973236083984375, "learning_rate": 1.5562070938215105e-05, "loss": 0.8089, "step": 3365 }, { "epoch": 0.96, "grad_norm": 8.230417251586914, "learning_rate": 1.555778032036613e-05, "loss": 0.8676, "step": 3366 }, { "epoch": 0.96, "grad_norm": 9.360048294067383, "learning_rate": 1.5553489702517162e-05, "loss": 1.0663, "step": 3367 }, { "epoch": 0.96, "grad_norm": 8.129524230957031, "learning_rate": 1.5549199084668194e-05, "loss": 0.8034, "step": 3368 }, { "epoch": 0.96, "grad_norm": 9.040216445922852, "learning_rate": 1.5544908466819223e-05, "loss": 0.9049, "step": 3369 }, { "epoch": 0.96, "grad_norm": 9.51258373260498, "learning_rate": 1.5540617848970252e-05, "loss": 0.9196, "step": 3370 }, { "epoch": 0.96, "grad_norm": 9.116039276123047, "learning_rate": 1.553632723112128e-05, "loss": 0.9509, "step": 3371 }, { "epoch": 0.96, "grad_norm": 7.990875244140625, "learning_rate": 1.5532036613272313e-05, "loss": 0.7907, "step": 3372 }, { "epoch": 0.96, "grad_norm": 10.509187698364258, "learning_rate": 1.552774599542334e-05, "loss": 1.0079, "step": 3373 }, { "epoch": 0.97, "grad_norm": 9.599989891052246, "learning_rate": 1.552345537757437e-05, "loss": 1.0179, "step": 3374 }, { "epoch": 0.97, "grad_norm": 10.02453899383545, "learning_rate": 1.55191647597254e-05, "loss": 1.0442, "step": 3375 }, { "epoch": 0.97, "grad_norm": 9.042299270629883, "learning_rate": 1.551487414187643e-05, "loss": 0.9454, "step": 3376 }, { "epoch": 0.97, "grad_norm": 10.32194995880127, "learning_rate": 1.5510583524027463e-05, "loss": 1.0094, "step": 3377 }, { "epoch": 0.97, "grad_norm": 8.976984024047852, "learning_rate": 1.550629290617849e-05, "loss": 0.8772, "step": 3378 }, { "epoch": 0.97, "grad_norm": 10.98599910736084, "learning_rate": 1.550200228832952e-05, "loss": 0.8978, "step": 3379 }, { "epoch": 0.97, "grad_norm": 10.110823631286621, "learning_rate": 1.549771167048055e-05, "loss": 1.0381, "step": 3380 }, { "epoch": 0.97, "grad_norm": 10.129533767700195, "learning_rate": 1.549342105263158e-05, "loss": 1.132, "step": 3381 }, { "epoch": 0.97, "grad_norm": 9.173952102661133, "learning_rate": 1.5489130434782607e-05, "loss": 1.1657, "step": 3382 }, { "epoch": 0.97, "grad_norm": 9.895243644714355, "learning_rate": 1.548483981693364e-05, "loss": 0.975, "step": 3383 }, { "epoch": 0.97, "grad_norm": 12.187675476074219, "learning_rate": 1.5480549199084668e-05, "loss": 1.4564, "step": 3384 }, { "epoch": 0.97, "grad_norm": 11.068359375, "learning_rate": 1.5476258581235697e-05, "loss": 1.3282, "step": 3385 }, { "epoch": 0.97, "grad_norm": 10.907172203063965, "learning_rate": 1.547196796338673e-05, "loss": 1.0901, "step": 3386 }, { "epoch": 0.97, "grad_norm": 8.23672103881836, "learning_rate": 1.5467677345537758e-05, "loss": 0.6549, "step": 3387 }, { "epoch": 0.97, "grad_norm": 12.269115447998047, "learning_rate": 1.546338672768879e-05, "loss": 1.1592, "step": 3388 }, { "epoch": 0.97, "grad_norm": 11.227182388305664, "learning_rate": 1.5459096109839815e-05, "loss": 0.8964, "step": 3389 }, { "epoch": 0.97, "grad_norm": 7.9429402351379395, "learning_rate": 1.5454805491990848e-05, "loss": 0.9423, "step": 3390 }, { "epoch": 0.97, "grad_norm": 8.185820579528809, "learning_rate": 1.5450514874141876e-05, "loss": 0.8259, "step": 3391 }, { "epoch": 0.97, "grad_norm": 11.577701568603516, "learning_rate": 1.5446224256292905e-05, "loss": 0.9428, "step": 3392 }, { "epoch": 0.97, "grad_norm": 9.812365531921387, "learning_rate": 1.5441933638443934e-05, "loss": 0.8602, "step": 3393 }, { "epoch": 0.97, "grad_norm": 8.900385856628418, "learning_rate": 1.5437643020594966e-05, "loss": 0.8757, "step": 3394 }, { "epoch": 0.97, "grad_norm": 8.60604190826416, "learning_rate": 1.5433352402745998e-05, "loss": 0.8252, "step": 3395 }, { "epoch": 0.97, "grad_norm": 8.94503402709961, "learning_rate": 1.5429061784897024e-05, "loss": 0.9966, "step": 3396 }, { "epoch": 0.97, "grad_norm": 7.996914863586426, "learning_rate": 1.5424771167048056e-05, "loss": 0.6504, "step": 3397 }, { "epoch": 0.97, "grad_norm": 8.389142036437988, "learning_rate": 1.5420480549199085e-05, "loss": 0.9272, "step": 3398 }, { "epoch": 0.97, "grad_norm": 9.937409400939941, "learning_rate": 1.5416189931350117e-05, "loss": 0.9442, "step": 3399 }, { "epoch": 0.97, "grad_norm": 12.677313804626465, "learning_rate": 1.5411899313501142e-05, "loss": 1.4754, "step": 3400 }, { "epoch": 0.97, "grad_norm": 10.936812400817871, "learning_rate": 1.5407608695652174e-05, "loss": 0.9896, "step": 3401 }, { "epoch": 0.97, "grad_norm": 10.202360153198242, "learning_rate": 1.5403318077803206e-05, "loss": 1.2344, "step": 3402 }, { "epoch": 0.97, "grad_norm": 8.682063102722168, "learning_rate": 1.5399027459954235e-05, "loss": 0.9496, "step": 3403 }, { "epoch": 0.97, "grad_norm": 9.442776679992676, "learning_rate": 1.5394736842105264e-05, "loss": 0.826, "step": 3404 }, { "epoch": 0.97, "grad_norm": 9.357182502746582, "learning_rate": 1.5390446224256293e-05, "loss": 1.0239, "step": 3405 }, { "epoch": 0.97, "grad_norm": 7.865712642669678, "learning_rate": 1.5386155606407325e-05, "loss": 0.9239, "step": 3406 }, { "epoch": 0.97, "grad_norm": 9.808480262756348, "learning_rate": 1.538186498855835e-05, "loss": 0.7902, "step": 3407 }, { "epoch": 0.97, "grad_norm": 10.461572647094727, "learning_rate": 1.5377574370709382e-05, "loss": 1.0959, "step": 3408 }, { "epoch": 0.98, "grad_norm": 8.114765167236328, "learning_rate": 1.537328375286041e-05, "loss": 0.8807, "step": 3409 }, { "epoch": 0.98, "grad_norm": 8.87446117401123, "learning_rate": 1.5368993135011443e-05, "loss": 1.1426, "step": 3410 }, { "epoch": 0.98, "grad_norm": 11.126185417175293, "learning_rate": 1.5364702517162472e-05, "loss": 1.0115, "step": 3411 }, { "epoch": 0.98, "grad_norm": 10.193495750427246, "learning_rate": 1.53604118993135e-05, "loss": 0.8523, "step": 3412 }, { "epoch": 0.98, "grad_norm": 8.045726776123047, "learning_rate": 1.5356121281464533e-05, "loss": 0.6918, "step": 3413 }, { "epoch": 0.98, "grad_norm": 9.752435684204102, "learning_rate": 1.5351830663615562e-05, "loss": 1.0277, "step": 3414 }, { "epoch": 0.98, "grad_norm": 9.08300495147705, "learning_rate": 1.534754004576659e-05, "loss": 0.9409, "step": 3415 }, { "epoch": 0.98, "grad_norm": 9.115248680114746, "learning_rate": 1.534324942791762e-05, "loss": 0.916, "step": 3416 }, { "epoch": 0.98, "grad_norm": 9.535004615783691, "learning_rate": 1.533895881006865e-05, "loss": 0.7505, "step": 3417 }, { "epoch": 0.98, "grad_norm": 7.965742588043213, "learning_rate": 1.5334668192219677e-05, "loss": 0.9348, "step": 3418 }, { "epoch": 0.98, "grad_norm": 9.332640647888184, "learning_rate": 1.533037757437071e-05, "loss": 1.0244, "step": 3419 }, { "epoch": 0.98, "grad_norm": 9.236135482788086, "learning_rate": 1.532608695652174e-05, "loss": 0.884, "step": 3420 }, { "epoch": 0.98, "grad_norm": 7.922878742218018, "learning_rate": 1.532179633867277e-05, "loss": 0.8087, "step": 3421 }, { "epoch": 0.98, "grad_norm": 9.578164100646973, "learning_rate": 1.53175057208238e-05, "loss": 1.1014, "step": 3422 }, { "epoch": 0.98, "grad_norm": 10.6139554977417, "learning_rate": 1.5313215102974827e-05, "loss": 0.9526, "step": 3423 }, { "epoch": 0.98, "grad_norm": 10.56572151184082, "learning_rate": 1.530892448512586e-05, "loss": 0.8512, "step": 3424 }, { "epoch": 0.98, "grad_norm": 10.807348251342773, "learning_rate": 1.530463386727689e-05, "loss": 0.9336, "step": 3425 }, { "epoch": 0.98, "grad_norm": 10.48486328125, "learning_rate": 1.5300343249427917e-05, "loss": 0.7069, "step": 3426 }, { "epoch": 0.98, "grad_norm": 8.4706392288208, "learning_rate": 1.5296052631578946e-05, "loss": 1.0151, "step": 3427 }, { "epoch": 0.98, "grad_norm": 10.076640129089355, "learning_rate": 1.5291762013729978e-05, "loss": 0.9837, "step": 3428 }, { "epoch": 0.98, "grad_norm": 10.002256393432617, "learning_rate": 1.528747139588101e-05, "loss": 0.9437, "step": 3429 }, { "epoch": 0.98, "grad_norm": 10.732633590698242, "learning_rate": 1.5283180778032036e-05, "loss": 0.8993, "step": 3430 }, { "epoch": 0.98, "grad_norm": 9.509393692016602, "learning_rate": 1.5278890160183068e-05, "loss": 0.9594, "step": 3431 }, { "epoch": 0.98, "grad_norm": 10.0447359085083, "learning_rate": 1.5274599542334097e-05, "loss": 0.873, "step": 3432 }, { "epoch": 0.98, "grad_norm": 12.14205551147461, "learning_rate": 1.5270308924485125e-05, "loss": 0.9494, "step": 3433 }, { "epoch": 0.98, "grad_norm": 10.456064224243164, "learning_rate": 1.5266018306636154e-05, "loss": 1.0558, "step": 3434 }, { "epoch": 0.98, "grad_norm": 9.385796546936035, "learning_rate": 1.5261727688787186e-05, "loss": 0.9873, "step": 3435 }, { "epoch": 0.98, "grad_norm": 9.608353614807129, "learning_rate": 1.5257437070938213e-05, "loss": 0.8733, "step": 3436 }, { "epoch": 0.98, "grad_norm": 8.533060073852539, "learning_rate": 1.5253146453089245e-05, "loss": 0.7835, "step": 3437 }, { "epoch": 0.98, "grad_norm": 8.630105972290039, "learning_rate": 1.5248855835240276e-05, "loss": 0.8252, "step": 3438 }, { "epoch": 0.98, "grad_norm": 12.451923370361328, "learning_rate": 1.5244565217391305e-05, "loss": 1.3839, "step": 3439 }, { "epoch": 0.98, "grad_norm": 9.088960647583008, "learning_rate": 1.5240274599542335e-05, "loss": 0.9431, "step": 3440 }, { "epoch": 0.98, "grad_norm": 11.612284660339355, "learning_rate": 1.5235983981693364e-05, "loss": 1.2605, "step": 3441 }, { "epoch": 0.98, "grad_norm": 10.245793342590332, "learning_rate": 1.5231693363844394e-05, "loss": 1.1037, "step": 3442 }, { "epoch": 0.98, "grad_norm": 9.433359146118164, "learning_rate": 1.5227402745995423e-05, "loss": 0.8219, "step": 3443 }, { "epoch": 0.99, "grad_norm": 9.079718589782715, "learning_rate": 1.5223112128146454e-05, "loss": 0.955, "step": 3444 }, { "epoch": 0.99, "grad_norm": 10.114764213562012, "learning_rate": 1.5218821510297484e-05, "loss": 1.1642, "step": 3445 }, { "epoch": 0.99, "grad_norm": 9.784605979919434, "learning_rate": 1.5214530892448513e-05, "loss": 0.8729, "step": 3446 }, { "epoch": 0.99, "grad_norm": 11.648603439331055, "learning_rate": 1.5210240274599543e-05, "loss": 1.0002, "step": 3447 }, { "epoch": 0.99, "grad_norm": 10.44534969329834, "learning_rate": 1.5205949656750572e-05, "loss": 1.2613, "step": 3448 }, { "epoch": 0.99, "grad_norm": 8.184114456176758, "learning_rate": 1.5201659038901603e-05, "loss": 1.0381, "step": 3449 }, { "epoch": 0.99, "grad_norm": 8.278879165649414, "learning_rate": 1.5197368421052631e-05, "loss": 0.8383, "step": 3450 }, { "epoch": 0.99, "grad_norm": 10.543951034545898, "learning_rate": 1.5193077803203662e-05, "loss": 1.1907, "step": 3451 }, { "epoch": 0.99, "grad_norm": 9.217336654663086, "learning_rate": 1.518878718535469e-05, "loss": 1.1273, "step": 3452 }, { "epoch": 0.99, "grad_norm": 8.791984558105469, "learning_rate": 1.5184496567505721e-05, "loss": 1.0187, "step": 3453 }, { "epoch": 0.99, "grad_norm": 9.331918716430664, "learning_rate": 1.5180205949656753e-05, "loss": 0.8774, "step": 3454 }, { "epoch": 0.99, "grad_norm": 9.732755661010742, "learning_rate": 1.517591533180778e-05, "loss": 1.2942, "step": 3455 }, { "epoch": 0.99, "grad_norm": 9.513022422790527, "learning_rate": 1.5171624713958812e-05, "loss": 1.0173, "step": 3456 }, { "epoch": 0.99, "grad_norm": 7.993738651275635, "learning_rate": 1.516733409610984e-05, "loss": 0.9076, "step": 3457 }, { "epoch": 0.99, "grad_norm": 10.794379234313965, "learning_rate": 1.516304347826087e-05, "loss": 1.0986, "step": 3458 }, { "epoch": 0.99, "grad_norm": 11.232872009277344, "learning_rate": 1.5158752860411899e-05, "loss": 1.0131, "step": 3459 }, { "epoch": 0.99, "grad_norm": 8.671607971191406, "learning_rate": 1.515446224256293e-05, "loss": 0.7477, "step": 3460 }, { "epoch": 0.99, "grad_norm": 9.410494804382324, "learning_rate": 1.5150171624713958e-05, "loss": 0.6941, "step": 3461 }, { "epoch": 0.99, "grad_norm": 9.8814058303833, "learning_rate": 1.5145881006864988e-05, "loss": 0.9058, "step": 3462 }, { "epoch": 0.99, "grad_norm": 8.776352882385254, "learning_rate": 1.514159038901602e-05, "loss": 0.9837, "step": 3463 }, { "epoch": 0.99, "grad_norm": 10.406237602233887, "learning_rate": 1.5137299771167048e-05, "loss": 0.9577, "step": 3464 }, { "epoch": 0.99, "grad_norm": 9.833565711975098, "learning_rate": 1.513300915331808e-05, "loss": 1.0214, "step": 3465 }, { "epoch": 0.99, "grad_norm": 7.488353729248047, "learning_rate": 1.5128718535469107e-05, "loss": 0.7898, "step": 3466 }, { "epoch": 0.99, "grad_norm": 9.927220344543457, "learning_rate": 1.5124427917620139e-05, "loss": 0.8986, "step": 3467 }, { "epoch": 0.99, "grad_norm": 9.887399673461914, "learning_rate": 1.5120137299771166e-05, "loss": 0.9155, "step": 3468 }, { "epoch": 0.99, "grad_norm": 9.517233848571777, "learning_rate": 1.5115846681922198e-05, "loss": 0.8387, "step": 3469 }, { "epoch": 0.99, "grad_norm": 9.98740291595459, "learning_rate": 1.5111556064073225e-05, "loss": 1.0168, "step": 3470 }, { "epoch": 0.99, "grad_norm": 10.123132705688477, "learning_rate": 1.5107265446224256e-05, "loss": 0.9903, "step": 3471 }, { "epoch": 0.99, "grad_norm": 8.628500938415527, "learning_rate": 1.5102974828375288e-05, "loss": 0.5802, "step": 3472 }, { "epoch": 0.99, "grad_norm": 9.079924583435059, "learning_rate": 1.5098684210526315e-05, "loss": 0.9983, "step": 3473 }, { "epoch": 0.99, "grad_norm": 9.199273109436035, "learning_rate": 1.5094393592677347e-05, "loss": 1.0829, "step": 3474 }, { "epoch": 0.99, "grad_norm": 9.835076332092285, "learning_rate": 1.5090102974828374e-05, "loss": 1.0947, "step": 3475 }, { "epoch": 0.99, "grad_norm": 10.03668212890625, "learning_rate": 1.5085812356979406e-05, "loss": 0.7865, "step": 3476 }, { "epoch": 0.99, "grad_norm": 10.380677223205566, "learning_rate": 1.5081521739130433e-05, "loss": 1.2301, "step": 3477 }, { "epoch": 0.99, "grad_norm": 12.602004051208496, "learning_rate": 1.5077231121281466e-05, "loss": 1.2464, "step": 3478 }, { "epoch": 1.0, "grad_norm": 9.200499534606934, "learning_rate": 1.5072940503432496e-05, "loss": 0.9179, "step": 3479 }, { "epoch": 1.0, "grad_norm": 8.837831497192383, "learning_rate": 1.5068649885583525e-05, "loss": 0.6146, "step": 3480 }, { "epoch": 1.0, "grad_norm": 9.19485092163086, "learning_rate": 1.5064359267734555e-05, "loss": 0.7807, "step": 3481 }, { "epoch": 1.0, "grad_norm": 10.346631050109863, "learning_rate": 1.5060068649885584e-05, "loss": 1.0513, "step": 3482 }, { "epoch": 1.0, "grad_norm": 9.652012825012207, "learning_rate": 1.5055778032036615e-05, "loss": 0.9422, "step": 3483 }, { "epoch": 1.0, "grad_norm": 10.3707857131958, "learning_rate": 1.5051487414187642e-05, "loss": 0.8258, "step": 3484 }, { "epoch": 1.0, "grad_norm": 9.806299209594727, "learning_rate": 1.5047196796338674e-05, "loss": 0.8674, "step": 3485 }, { "epoch": 1.0, "grad_norm": 10.051835060119629, "learning_rate": 1.50429061784897e-05, "loss": 1.0132, "step": 3486 }, { "epoch": 1.0, "grad_norm": 8.189953804016113, "learning_rate": 1.5038615560640733e-05, "loss": 0.8535, "step": 3487 }, { "epoch": 1.0, "grad_norm": 8.710848808288574, "learning_rate": 1.5034324942791763e-05, "loss": 0.8356, "step": 3488 }, { "epoch": 1.0, "grad_norm": 9.190397262573242, "learning_rate": 1.5030034324942792e-05, "loss": 0.9986, "step": 3489 }, { "epoch": 1.0, "grad_norm": 10.301618576049805, "learning_rate": 1.5025743707093823e-05, "loss": 0.9548, "step": 3490 }, { "epoch": 1.0, "grad_norm": 8.565505027770996, "learning_rate": 1.5021453089244851e-05, "loss": 0.9822, "step": 3491 }, { "epoch": 1.0, "grad_norm": 9.500279426574707, "learning_rate": 1.5017162471395882e-05, "loss": 0.8476, "step": 3492 }, { "epoch": 1.0, "grad_norm": 9.195255279541016, "learning_rate": 1.501287185354691e-05, "loss": 0.8558, "step": 3493 }, { "epoch": 1.0, "grad_norm": 8.99519157409668, "learning_rate": 1.5008581235697941e-05, "loss": 0.9316, "step": 3494 }, { "epoch": 1.0, "grad_norm": 7.147590637207031, "learning_rate": 1.500429061784897e-05, "loss": 0.7436, "step": 3495 }, { "epoch": 1.0, "grad_norm": 11.322429656982422, "learning_rate": 1.5e-05, "loss": 1.0799, "step": 3496 }, { "epoch": 1.0, "grad_norm": 8.729336738586426, "learning_rate": 1.4995709382151029e-05, "loss": 0.9429, "step": 3497 }, { "epoch": 1.0, "grad_norm": 8.235333442687988, "learning_rate": 1.499141876430206e-05, "loss": 0.9913, "step": 3498 }, { "epoch": 1.0, "grad_norm": 10.02188777923584, "learning_rate": 1.4987128146453088e-05, "loss": 0.9376, "step": 3499 }, { "epoch": 1.0, "grad_norm": 8.01506233215332, "learning_rate": 1.4982837528604119e-05, "loss": 0.6847, "step": 3500 }, { "epoch": 1.0, "grad_norm": 8.915964126586914, "learning_rate": 1.497854691075515e-05, "loss": 1.0377, "step": 3501 }, { "epoch": 1.0, "grad_norm": 9.767047882080078, "learning_rate": 1.497425629290618e-05, "loss": 1.0724, "step": 3502 }, { "epoch": 1.0, "grad_norm": 10.36973762512207, "learning_rate": 1.4969965675057209e-05, "loss": 1.0589, "step": 3503 }, { "epoch": 1.0, "grad_norm": 10.49530029296875, "learning_rate": 1.4965675057208239e-05, "loss": 0.9688, "step": 3504 }, { "epoch": 1.0, "grad_norm": 7.333240509033203, "learning_rate": 1.4961384439359268e-05, "loss": 0.7759, "step": 3505 }, { "epoch": 1.0, "grad_norm": 7.345729827880859, "learning_rate": 1.4957093821510298e-05, "loss": 0.5091, "step": 3506 }, { "epoch": 1.0, "grad_norm": 8.10158634185791, "learning_rate": 1.4952803203661327e-05, "loss": 0.602, "step": 3507 }, { "epoch": 1.0, "grad_norm": 8.788975715637207, "learning_rate": 1.4948512585812357e-05, "loss": 0.8844, "step": 3508 }, { "epoch": 1.0, "grad_norm": 8.100739479064941, "learning_rate": 1.4944221967963388e-05, "loss": 0.7641, "step": 3509 }, { "epoch": 1.0, "grad_norm": 7.1900248527526855, "learning_rate": 1.4939931350114417e-05, "loss": 0.7606, "step": 3510 }, { "epoch": 1.0, "grad_norm": 10.251717567443848, "learning_rate": 1.4935640732265447e-05, "loss": 0.792, "step": 3511 }, { "epoch": 1.0, "grad_norm": 9.145462036132812, "learning_rate": 1.4931350114416476e-05, "loss": 1.0039, "step": 3512 }, { "epoch": 1.0, "grad_norm": 7.771153450012207, "learning_rate": 1.4927059496567506e-05, "loss": 0.8965, "step": 3513 }, { "epoch": 1.01, "grad_norm": 9.03454303741455, "learning_rate": 1.4922768878718535e-05, "loss": 0.8384, "step": 3514 }, { "epoch": 1.01, "grad_norm": 8.778411865234375, "learning_rate": 1.4918478260869566e-05, "loss": 0.8014, "step": 3515 }, { "epoch": 1.01, "grad_norm": 9.974286079406738, "learning_rate": 1.4914187643020594e-05, "loss": 0.7165, "step": 3516 }, { "epoch": 1.01, "grad_norm": 7.651914596557617, "learning_rate": 1.4909897025171625e-05, "loss": 0.7693, "step": 3517 }, { "epoch": 1.01, "grad_norm": 11.590794563293457, "learning_rate": 1.4905606407322655e-05, "loss": 0.8939, "step": 3518 }, { "epoch": 1.01, "grad_norm": 11.48332691192627, "learning_rate": 1.4901315789473686e-05, "loss": 0.8586, "step": 3519 }, { "epoch": 1.01, "grad_norm": 10.365842819213867, "learning_rate": 1.4897025171624715e-05, "loss": 0.8985, "step": 3520 }, { "epoch": 1.01, "grad_norm": 9.330724716186523, "learning_rate": 1.4892734553775745e-05, "loss": 0.8827, "step": 3521 }, { "epoch": 1.01, "grad_norm": 8.041494369506836, "learning_rate": 1.4888443935926774e-05, "loss": 0.7185, "step": 3522 }, { "epoch": 1.01, "grad_norm": 8.145724296569824, "learning_rate": 1.4884153318077803e-05, "loss": 0.7699, "step": 3523 }, { "epoch": 1.01, "grad_norm": 7.616659164428711, "learning_rate": 1.4879862700228833e-05, "loss": 0.6846, "step": 3524 }, { "epoch": 1.01, "grad_norm": 7.798200607299805, "learning_rate": 1.4875572082379862e-05, "loss": 0.7173, "step": 3525 }, { "epoch": 1.01, "grad_norm": 10.310019493103027, "learning_rate": 1.4871281464530894e-05, "loss": 0.7357, "step": 3526 }, { "epoch": 1.01, "grad_norm": 11.527127265930176, "learning_rate": 1.4866990846681923e-05, "loss": 0.9704, "step": 3527 }, { "epoch": 1.01, "grad_norm": 11.12434196472168, "learning_rate": 1.4862700228832953e-05, "loss": 0.918, "step": 3528 }, { "epoch": 1.01, "grad_norm": 11.516888618469238, "learning_rate": 1.4858409610983982e-05, "loss": 1.0105, "step": 3529 }, { "epoch": 1.01, "grad_norm": 7.838921546936035, "learning_rate": 1.4854118993135012e-05, "loss": 0.7692, "step": 3530 }, { "epoch": 1.01, "grad_norm": 9.907757759094238, "learning_rate": 1.4849828375286041e-05, "loss": 0.7239, "step": 3531 }, { "epoch": 1.01, "grad_norm": 9.133485794067383, "learning_rate": 1.4845537757437072e-05, "loss": 0.9211, "step": 3532 }, { "epoch": 1.01, "grad_norm": 11.23843002319336, "learning_rate": 1.48412471395881e-05, "loss": 1.0158, "step": 3533 }, { "epoch": 1.01, "grad_norm": 9.841829299926758, "learning_rate": 1.483695652173913e-05, "loss": 0.6117, "step": 3534 }, { "epoch": 1.01, "grad_norm": 10.754142761230469, "learning_rate": 1.4832665903890161e-05, "loss": 0.6776, "step": 3535 }, { "epoch": 1.01, "grad_norm": 13.35550308227539, "learning_rate": 1.482837528604119e-05, "loss": 0.8379, "step": 3536 }, { "epoch": 1.01, "grad_norm": 11.992549896240234, "learning_rate": 1.482408466819222e-05, "loss": 1.0228, "step": 3537 }, { "epoch": 1.01, "grad_norm": 11.021371841430664, "learning_rate": 1.481979405034325e-05, "loss": 0.7198, "step": 3538 }, { "epoch": 1.01, "grad_norm": 9.923230171203613, "learning_rate": 1.481550343249428e-05, "loss": 0.9317, "step": 3539 }, { "epoch": 1.01, "grad_norm": 8.832412719726562, "learning_rate": 1.4811212814645309e-05, "loss": 0.8334, "step": 3540 }, { "epoch": 1.01, "grad_norm": 9.36916732788086, "learning_rate": 1.4806922196796339e-05, "loss": 0.7994, "step": 3541 }, { "epoch": 1.01, "grad_norm": 9.33375358581543, "learning_rate": 1.4802631578947368e-05, "loss": 0.8682, "step": 3542 }, { "epoch": 1.01, "grad_norm": 10.338713645935059, "learning_rate": 1.4798340961098398e-05, "loss": 0.7042, "step": 3543 }, { "epoch": 1.01, "grad_norm": 9.743721008300781, "learning_rate": 1.4794050343249429e-05, "loss": 0.8741, "step": 3544 }, { "epoch": 1.01, "grad_norm": 6.583962917327881, "learning_rate": 1.4789759725400459e-05, "loss": 0.5621, "step": 3545 }, { "epoch": 1.01, "grad_norm": 9.400547981262207, "learning_rate": 1.4785469107551488e-05, "loss": 0.7926, "step": 3546 }, { "epoch": 1.01, "grad_norm": 10.095444679260254, "learning_rate": 1.4781178489702518e-05, "loss": 0.7835, "step": 3547 }, { "epoch": 1.01, "grad_norm": 10.908488273620605, "learning_rate": 1.4776887871853547e-05, "loss": 0.8011, "step": 3548 }, { "epoch": 1.02, "grad_norm": 9.587437629699707, "learning_rate": 1.4772597254004576e-05, "loss": 0.876, "step": 3549 }, { "epoch": 1.02, "grad_norm": 9.30736255645752, "learning_rate": 1.4768306636155606e-05, "loss": 1.0483, "step": 3550 }, { "epoch": 1.02, "grad_norm": 7.909506320953369, "learning_rate": 1.4764016018306635e-05, "loss": 0.7863, "step": 3551 }, { "epoch": 1.02, "grad_norm": 8.779753684997559, "learning_rate": 1.4759725400457667e-05, "loss": 0.998, "step": 3552 }, { "epoch": 1.02, "grad_norm": 9.129301071166992, "learning_rate": 1.4755434782608696e-05, "loss": 0.7694, "step": 3553 }, { "epoch": 1.02, "grad_norm": 9.928060531616211, "learning_rate": 1.4751144164759727e-05, "loss": 0.7666, "step": 3554 }, { "epoch": 1.02, "grad_norm": 10.944051742553711, "learning_rate": 1.4746853546910755e-05, "loss": 0.9608, "step": 3555 }, { "epoch": 1.02, "grad_norm": 7.6837239265441895, "learning_rate": 1.4742562929061786e-05, "loss": 0.5508, "step": 3556 }, { "epoch": 1.02, "grad_norm": 10.845468521118164, "learning_rate": 1.4738272311212815e-05, "loss": 0.8979, "step": 3557 }, { "epoch": 1.02, "grad_norm": 8.706075668334961, "learning_rate": 1.4733981693363845e-05, "loss": 0.6102, "step": 3558 }, { "epoch": 1.02, "grad_norm": 13.22905158996582, "learning_rate": 1.4729691075514874e-05, "loss": 1.0806, "step": 3559 }, { "epoch": 1.02, "grad_norm": 7.789815902709961, "learning_rate": 1.4725400457665904e-05, "loss": 0.5796, "step": 3560 }, { "epoch": 1.02, "grad_norm": 8.726973533630371, "learning_rate": 1.4721109839816935e-05, "loss": 0.7599, "step": 3561 }, { "epoch": 1.02, "grad_norm": 10.404717445373535, "learning_rate": 1.4716819221967963e-05, "loss": 0.8246, "step": 3562 }, { "epoch": 1.02, "grad_norm": 11.025307655334473, "learning_rate": 1.4712528604118994e-05, "loss": 0.8303, "step": 3563 }, { "epoch": 1.02, "grad_norm": 11.1351318359375, "learning_rate": 1.4708237986270023e-05, "loss": 0.8676, "step": 3564 }, { "epoch": 1.02, "grad_norm": 8.169400215148926, "learning_rate": 1.4703947368421053e-05, "loss": 0.5322, "step": 3565 }, { "epoch": 1.02, "grad_norm": 9.377978324890137, "learning_rate": 1.4699656750572082e-05, "loss": 0.8039, "step": 3566 }, { "epoch": 1.02, "grad_norm": 10.150505065917969, "learning_rate": 1.4695366132723112e-05, "loss": 0.9011, "step": 3567 }, { "epoch": 1.02, "grad_norm": 8.142455101013184, "learning_rate": 1.4691075514874141e-05, "loss": 0.8546, "step": 3568 }, { "epoch": 1.02, "grad_norm": 8.890527725219727, "learning_rate": 1.4686784897025173e-05, "loss": 0.901, "step": 3569 }, { "epoch": 1.02, "grad_norm": 10.418173789978027, "learning_rate": 1.4682494279176202e-05, "loss": 0.8699, "step": 3570 }, { "epoch": 1.02, "grad_norm": 10.354543685913086, "learning_rate": 1.4678203661327233e-05, "loss": 1.0417, "step": 3571 }, { "epoch": 1.02, "grad_norm": 11.536067008972168, "learning_rate": 1.4673913043478261e-05, "loss": 0.8563, "step": 3572 }, { "epoch": 1.02, "grad_norm": 10.119474411010742, "learning_rate": 1.4669622425629292e-05, "loss": 1.0292, "step": 3573 }, { "epoch": 1.02, "grad_norm": 7.384093284606934, "learning_rate": 1.466533180778032e-05, "loss": 0.6153, "step": 3574 }, { "epoch": 1.02, "grad_norm": 10.39474105834961, "learning_rate": 1.466104118993135e-05, "loss": 1.0311, "step": 3575 }, { "epoch": 1.02, "grad_norm": 9.86563777923584, "learning_rate": 1.465675057208238e-05, "loss": 0.743, "step": 3576 }, { "epoch": 1.02, "grad_norm": 9.815017700195312, "learning_rate": 1.4652459954233409e-05, "loss": 0.8117, "step": 3577 }, { "epoch": 1.02, "grad_norm": 8.892045974731445, "learning_rate": 1.464816933638444e-05, "loss": 0.7091, "step": 3578 }, { "epoch": 1.02, "grad_norm": 11.2473726272583, "learning_rate": 1.464387871853547e-05, "loss": 0.8584, "step": 3579 }, { "epoch": 1.02, "grad_norm": 8.565177917480469, "learning_rate": 1.46395881006865e-05, "loss": 0.7814, "step": 3580 }, { "epoch": 1.02, "grad_norm": 9.64526653289795, "learning_rate": 1.4635297482837529e-05, "loss": 0.9121, "step": 3581 }, { "epoch": 1.02, "grad_norm": 8.384381294250488, "learning_rate": 1.4631006864988559e-05, "loss": 0.6535, "step": 3582 }, { "epoch": 1.02, "grad_norm": 10.528741836547852, "learning_rate": 1.4626716247139588e-05, "loss": 0.5912, "step": 3583 }, { "epoch": 1.03, "grad_norm": 8.734308242797852, "learning_rate": 1.4622425629290618e-05, "loss": 0.6136, "step": 3584 }, { "epoch": 1.03, "grad_norm": 10.390425682067871, "learning_rate": 1.4618135011441647e-05, "loss": 0.9535, "step": 3585 }, { "epoch": 1.03, "grad_norm": 9.493962287902832, "learning_rate": 1.461384439359268e-05, "loss": 1.1426, "step": 3586 }, { "epoch": 1.03, "grad_norm": 10.703402519226074, "learning_rate": 1.4609553775743708e-05, "loss": 0.9597, "step": 3587 }, { "epoch": 1.03, "grad_norm": 10.467942237854004, "learning_rate": 1.4605263157894737e-05, "loss": 1.0131, "step": 3588 }, { "epoch": 1.03, "grad_norm": 9.35848331451416, "learning_rate": 1.4600972540045767e-05, "loss": 0.8164, "step": 3589 }, { "epoch": 1.03, "grad_norm": 9.839441299438477, "learning_rate": 1.4596681922196796e-05, "loss": 0.6921, "step": 3590 }, { "epoch": 1.03, "grad_norm": 10.550750732421875, "learning_rate": 1.4592391304347827e-05, "loss": 0.8487, "step": 3591 }, { "epoch": 1.03, "grad_norm": 8.606681823730469, "learning_rate": 1.4588100686498855e-05, "loss": 0.7856, "step": 3592 }, { "epoch": 1.03, "grad_norm": 9.010801315307617, "learning_rate": 1.4583810068649886e-05, "loss": 0.689, "step": 3593 }, { "epoch": 1.03, "grad_norm": 8.278401374816895, "learning_rate": 1.4579519450800915e-05, "loss": 0.6683, "step": 3594 }, { "epoch": 1.03, "grad_norm": 10.394222259521484, "learning_rate": 1.4575228832951947e-05, "loss": 0.896, "step": 3595 }, { "epoch": 1.03, "grad_norm": 9.620981216430664, "learning_rate": 1.4570938215102975e-05, "loss": 0.7403, "step": 3596 }, { "epoch": 1.03, "grad_norm": 9.585387229919434, "learning_rate": 1.4566647597254006e-05, "loss": 0.7583, "step": 3597 }, { "epoch": 1.03, "grad_norm": 11.001717567443848, "learning_rate": 1.4562356979405035e-05, "loss": 0.9145, "step": 3598 }, { "epoch": 1.03, "grad_norm": 8.327040672302246, "learning_rate": 1.4558066361556065e-05, "loss": 0.6258, "step": 3599 }, { "epoch": 1.03, "grad_norm": 9.499870300292969, "learning_rate": 1.4553775743707094e-05, "loss": 0.6328, "step": 3600 }, { "epoch": 1.03, "grad_norm": 10.079899787902832, "learning_rate": 1.4549485125858124e-05, "loss": 0.6817, "step": 3601 }, { "epoch": 1.03, "grad_norm": 14.0177640914917, "learning_rate": 1.4545194508009153e-05, "loss": 0.916, "step": 3602 }, { "epoch": 1.03, "grad_norm": 10.041084289550781, "learning_rate": 1.4540903890160182e-05, "loss": 0.9139, "step": 3603 }, { "epoch": 1.03, "grad_norm": 8.739937782287598, "learning_rate": 1.4536613272311214e-05, "loss": 0.5609, "step": 3604 }, { "epoch": 1.03, "grad_norm": 8.561568260192871, "learning_rate": 1.4532322654462243e-05, "loss": 0.6168, "step": 3605 }, { "epoch": 1.03, "grad_norm": 9.857020378112793, "learning_rate": 1.4528032036613273e-05, "loss": 0.733, "step": 3606 }, { "epoch": 1.03, "grad_norm": 11.430785179138184, "learning_rate": 1.4523741418764302e-05, "loss": 0.9336, "step": 3607 }, { "epoch": 1.03, "grad_norm": 10.511173248291016, "learning_rate": 1.4519450800915332e-05, "loss": 1.0044, "step": 3608 }, { "epoch": 1.03, "grad_norm": 10.712701797485352, "learning_rate": 1.4515160183066361e-05, "loss": 1.0262, "step": 3609 }, { "epoch": 1.03, "grad_norm": 11.696800231933594, "learning_rate": 1.4510869565217392e-05, "loss": 1.197, "step": 3610 }, { "epoch": 1.03, "grad_norm": 8.272732734680176, "learning_rate": 1.450657894736842e-05, "loss": 0.7798, "step": 3611 }, { "epoch": 1.03, "grad_norm": 12.144145965576172, "learning_rate": 1.4502288329519453e-05, "loss": 0.6809, "step": 3612 }, { "epoch": 1.03, "grad_norm": 10.280263900756836, "learning_rate": 1.4497997711670481e-05, "loss": 0.9479, "step": 3613 }, { "epoch": 1.03, "grad_norm": 9.201637268066406, "learning_rate": 1.4493707093821512e-05, "loss": 0.6101, "step": 3614 }, { "epoch": 1.03, "grad_norm": 7.813424110412598, "learning_rate": 1.448941647597254e-05, "loss": 0.7455, "step": 3615 }, { "epoch": 1.03, "grad_norm": 9.607821464538574, "learning_rate": 1.448512585812357e-05, "loss": 0.7986, "step": 3616 }, { "epoch": 1.03, "grad_norm": 10.124610900878906, "learning_rate": 1.44808352402746e-05, "loss": 0.7356, "step": 3617 }, { "epoch": 1.03, "grad_norm": 10.535299301147461, "learning_rate": 1.4476544622425629e-05, "loss": 0.8917, "step": 3618 }, { "epoch": 1.04, "grad_norm": 10.994709014892578, "learning_rate": 1.4472254004576659e-05, "loss": 0.8014, "step": 3619 }, { "epoch": 1.04, "grad_norm": 9.829028129577637, "learning_rate": 1.4467963386727688e-05, "loss": 0.7732, "step": 3620 }, { "epoch": 1.04, "grad_norm": 9.724833488464355, "learning_rate": 1.446367276887872e-05, "loss": 0.8182, "step": 3621 }, { "epoch": 1.04, "grad_norm": 10.023293495178223, "learning_rate": 1.4459382151029749e-05, "loss": 0.9182, "step": 3622 }, { "epoch": 1.04, "grad_norm": 11.00589656829834, "learning_rate": 1.445509153318078e-05, "loss": 0.9998, "step": 3623 }, { "epoch": 1.04, "grad_norm": 9.75013256072998, "learning_rate": 1.4450800915331808e-05, "loss": 1.0815, "step": 3624 }, { "epoch": 1.04, "grad_norm": 9.099947929382324, "learning_rate": 1.4446510297482838e-05, "loss": 0.9197, "step": 3625 }, { "epoch": 1.04, "grad_norm": 11.304966926574707, "learning_rate": 1.4442219679633867e-05, "loss": 1.0328, "step": 3626 }, { "epoch": 1.04, "grad_norm": 11.483154296875, "learning_rate": 1.4437929061784898e-05, "loss": 0.98, "step": 3627 }, { "epoch": 1.04, "grad_norm": 8.709813117980957, "learning_rate": 1.4433638443935926e-05, "loss": 0.7196, "step": 3628 }, { "epoch": 1.04, "grad_norm": 10.48593521118164, "learning_rate": 1.4429347826086957e-05, "loss": 0.864, "step": 3629 }, { "epoch": 1.04, "grad_norm": 7.9823737144470215, "learning_rate": 1.4425057208237987e-05, "loss": 0.5869, "step": 3630 }, { "epoch": 1.04, "grad_norm": 8.290487289428711, "learning_rate": 1.4420766590389016e-05, "loss": 0.7856, "step": 3631 }, { "epoch": 1.04, "grad_norm": 10.442862510681152, "learning_rate": 1.4416475972540047e-05, "loss": 0.8337, "step": 3632 }, { "epoch": 1.04, "grad_norm": 8.83592700958252, "learning_rate": 1.4412185354691075e-05, "loss": 0.5868, "step": 3633 }, { "epoch": 1.04, "grad_norm": 11.119686126708984, "learning_rate": 1.4407894736842106e-05, "loss": 0.8916, "step": 3634 }, { "epoch": 1.04, "grad_norm": 9.67123031616211, "learning_rate": 1.4403604118993135e-05, "loss": 0.8316, "step": 3635 }, { "epoch": 1.04, "grad_norm": 9.490544319152832, "learning_rate": 1.4399313501144165e-05, "loss": 0.999, "step": 3636 }, { "epoch": 1.04, "grad_norm": 11.423625946044922, "learning_rate": 1.4395022883295194e-05, "loss": 1.1596, "step": 3637 }, { "epoch": 1.04, "grad_norm": 8.861817359924316, "learning_rate": 1.4390732265446226e-05, "loss": 0.8421, "step": 3638 }, { "epoch": 1.04, "grad_norm": 8.873979568481445, "learning_rate": 1.4386441647597255e-05, "loss": 0.8049, "step": 3639 }, { "epoch": 1.04, "grad_norm": 10.359341621398926, "learning_rate": 1.4382151029748285e-05, "loss": 0.7929, "step": 3640 }, { "epoch": 1.04, "grad_norm": 9.25731372833252, "learning_rate": 1.4377860411899314e-05, "loss": 0.7527, "step": 3641 }, { "epoch": 1.04, "grad_norm": 10.850419998168945, "learning_rate": 1.4373569794050343e-05, "loss": 0.8118, "step": 3642 }, { "epoch": 1.04, "grad_norm": 12.61296272277832, "learning_rate": 1.4369279176201373e-05, "loss": 0.9118, "step": 3643 }, { "epoch": 1.04, "grad_norm": 9.669021606445312, "learning_rate": 1.4364988558352402e-05, "loss": 0.8012, "step": 3644 }, { "epoch": 1.04, "grad_norm": 10.008260726928711, "learning_rate": 1.4360697940503432e-05, "loss": 0.9706, "step": 3645 }, { "epoch": 1.04, "grad_norm": 10.625000953674316, "learning_rate": 1.4356407322654463e-05, "loss": 1.0906, "step": 3646 }, { "epoch": 1.04, "grad_norm": 10.33679485321045, "learning_rate": 1.4352116704805493e-05, "loss": 0.7084, "step": 3647 }, { "epoch": 1.04, "grad_norm": 9.466826438903809, "learning_rate": 1.4347826086956522e-05, "loss": 0.6289, "step": 3648 }, { "epoch": 1.04, "grad_norm": 10.775632858276367, "learning_rate": 1.4343535469107553e-05, "loss": 1.0995, "step": 3649 }, { "epoch": 1.04, "grad_norm": 9.4617280960083, "learning_rate": 1.4339244851258581e-05, "loss": 0.9055, "step": 3650 }, { "epoch": 1.04, "grad_norm": 8.862994194030762, "learning_rate": 1.4334954233409612e-05, "loss": 0.7649, "step": 3651 }, { "epoch": 1.04, "grad_norm": 9.316967010498047, "learning_rate": 1.433066361556064e-05, "loss": 0.8565, "step": 3652 }, { "epoch": 1.04, "grad_norm": 9.595606803894043, "learning_rate": 1.4326372997711671e-05, "loss": 0.9249, "step": 3653 }, { "epoch": 1.05, "grad_norm": 11.789463996887207, "learning_rate": 1.43220823798627e-05, "loss": 1.1813, "step": 3654 }, { "epoch": 1.05, "grad_norm": 10.194077491760254, "learning_rate": 1.431779176201373e-05, "loss": 0.7936, "step": 3655 }, { "epoch": 1.05, "grad_norm": 9.44644832611084, "learning_rate": 1.431350114416476e-05, "loss": 0.7989, "step": 3656 }, { "epoch": 1.05, "grad_norm": 9.425652503967285, "learning_rate": 1.430921052631579e-05, "loss": 0.7062, "step": 3657 }, { "epoch": 1.05, "grad_norm": 10.893973350524902, "learning_rate": 1.430491990846682e-05, "loss": 0.846, "step": 3658 }, { "epoch": 1.05, "grad_norm": 9.67643928527832, "learning_rate": 1.4300629290617849e-05, "loss": 0.7771, "step": 3659 }, { "epoch": 1.05, "grad_norm": 8.7727689743042, "learning_rate": 1.429633867276888e-05, "loss": 0.6081, "step": 3660 }, { "epoch": 1.05, "grad_norm": 10.030600547790527, "learning_rate": 1.4292048054919908e-05, "loss": 0.5844, "step": 3661 }, { "epoch": 1.05, "grad_norm": 8.970498085021973, "learning_rate": 1.4287757437070938e-05, "loss": 0.8009, "step": 3662 }, { "epoch": 1.05, "grad_norm": 12.848243713378906, "learning_rate": 1.4283466819221967e-05, "loss": 0.8377, "step": 3663 }, { "epoch": 1.05, "grad_norm": 10.326622009277344, "learning_rate": 1.4279176201373e-05, "loss": 0.7998, "step": 3664 }, { "epoch": 1.05, "grad_norm": 7.992002010345459, "learning_rate": 1.4274885583524028e-05, "loss": 0.6024, "step": 3665 }, { "epoch": 1.05, "grad_norm": 9.728657722473145, "learning_rate": 1.4270594965675059e-05, "loss": 0.8022, "step": 3666 }, { "epoch": 1.05, "grad_norm": 10.044332504272461, "learning_rate": 1.4266304347826087e-05, "loss": 0.7803, "step": 3667 }, { "epoch": 1.05, "grad_norm": 11.209845542907715, "learning_rate": 1.4262013729977116e-05, "loss": 0.7259, "step": 3668 }, { "epoch": 1.05, "grad_norm": 11.56793212890625, "learning_rate": 1.4257723112128147e-05, "loss": 0.8163, "step": 3669 }, { "epoch": 1.05, "grad_norm": 11.417867660522461, "learning_rate": 1.4253432494279175e-05, "loss": 0.8054, "step": 3670 }, { "epoch": 1.05, "grad_norm": 10.159390449523926, "learning_rate": 1.4249141876430206e-05, "loss": 0.9439, "step": 3671 }, { "epoch": 1.05, "grad_norm": 12.502946853637695, "learning_rate": 1.4244851258581236e-05, "loss": 1.1544, "step": 3672 }, { "epoch": 1.05, "grad_norm": 8.187594413757324, "learning_rate": 1.4240560640732267e-05, "loss": 0.6972, "step": 3673 }, { "epoch": 1.05, "grad_norm": 10.784448623657227, "learning_rate": 1.4236270022883296e-05, "loss": 1.0062, "step": 3674 }, { "epoch": 1.05, "grad_norm": 10.996561050415039, "learning_rate": 1.4231979405034326e-05, "loss": 0.9502, "step": 3675 }, { "epoch": 1.05, "grad_norm": 8.331461906433105, "learning_rate": 1.4227688787185355e-05, "loss": 0.9851, "step": 3676 }, { "epoch": 1.05, "grad_norm": 7.585824012756348, "learning_rate": 1.4223398169336385e-05, "loss": 0.8282, "step": 3677 }, { "epoch": 1.05, "grad_norm": 7.308041572570801, "learning_rate": 1.4219107551487414e-05, "loss": 0.6192, "step": 3678 }, { "epoch": 1.05, "grad_norm": 11.896117210388184, "learning_rate": 1.4214816933638444e-05, "loss": 1.153, "step": 3679 }, { "epoch": 1.05, "grad_norm": 11.243952751159668, "learning_rate": 1.4210526315789473e-05, "loss": 0.8675, "step": 3680 }, { "epoch": 1.05, "grad_norm": 10.377750396728516, "learning_rate": 1.4206235697940504e-05, "loss": 0.8831, "step": 3681 }, { "epoch": 1.05, "grad_norm": 9.671457290649414, "learning_rate": 1.4201945080091534e-05, "loss": 0.7269, "step": 3682 }, { "epoch": 1.05, "grad_norm": 11.001798629760742, "learning_rate": 1.4197654462242563e-05, "loss": 0.8837, "step": 3683 }, { "epoch": 1.05, "grad_norm": 9.80832290649414, "learning_rate": 1.4193363844393593e-05, "loss": 0.8081, "step": 3684 }, { "epoch": 1.05, "grad_norm": 9.946518898010254, "learning_rate": 1.4189073226544622e-05, "loss": 0.8903, "step": 3685 }, { "epoch": 1.05, "grad_norm": 10.565032005310059, "learning_rate": 1.4184782608695653e-05, "loss": 0.9812, "step": 3686 }, { "epoch": 1.05, "grad_norm": 10.964146614074707, "learning_rate": 1.4180491990846681e-05, "loss": 0.9207, "step": 3687 }, { "epoch": 1.05, "grad_norm": 10.63786792755127, "learning_rate": 1.4176201372997712e-05, "loss": 0.9114, "step": 3688 }, { "epoch": 1.06, "grad_norm": 9.37463665008545, "learning_rate": 1.4171910755148742e-05, "loss": 0.8679, "step": 3689 }, { "epoch": 1.06, "grad_norm": 11.379569053649902, "learning_rate": 1.4167620137299773e-05, "loss": 0.923, "step": 3690 }, { "epoch": 1.06, "grad_norm": 10.972661018371582, "learning_rate": 1.4163329519450802e-05, "loss": 0.7566, "step": 3691 }, { "epoch": 1.06, "grad_norm": 9.997492790222168, "learning_rate": 1.4159038901601832e-05, "loss": 0.9684, "step": 3692 }, { "epoch": 1.06, "grad_norm": 9.21652603149414, "learning_rate": 1.415474828375286e-05, "loss": 0.723, "step": 3693 }, { "epoch": 1.06, "grad_norm": 9.660791397094727, "learning_rate": 1.415045766590389e-05, "loss": 0.8044, "step": 3694 }, { "epoch": 1.06, "grad_norm": 9.309823036193848, "learning_rate": 1.414616704805492e-05, "loss": 0.8231, "step": 3695 }, { "epoch": 1.06, "grad_norm": 9.975504875183105, "learning_rate": 1.4141876430205949e-05, "loss": 0.8977, "step": 3696 }, { "epoch": 1.06, "grad_norm": 10.991817474365234, "learning_rate": 1.413758581235698e-05, "loss": 0.6957, "step": 3697 }, { "epoch": 1.06, "grad_norm": 7.990804672241211, "learning_rate": 1.413329519450801e-05, "loss": 0.6785, "step": 3698 }, { "epoch": 1.06, "grad_norm": 9.592016220092773, "learning_rate": 1.412900457665904e-05, "loss": 1.0659, "step": 3699 }, { "epoch": 1.06, "grad_norm": 9.965288162231445, "learning_rate": 1.4124713958810069e-05, "loss": 0.8429, "step": 3700 }, { "epoch": 1.06, "grad_norm": 8.703361511230469, "learning_rate": 1.41204233409611e-05, "loss": 0.8845, "step": 3701 }, { "epoch": 1.06, "grad_norm": 10.193435668945312, "learning_rate": 1.4116132723112128e-05, "loss": 0.8447, "step": 3702 }, { "epoch": 1.06, "grad_norm": 9.333001136779785, "learning_rate": 1.4111842105263159e-05, "loss": 0.8867, "step": 3703 }, { "epoch": 1.06, "grad_norm": 10.7531156539917, "learning_rate": 1.4107551487414187e-05, "loss": 1.0541, "step": 3704 }, { "epoch": 1.06, "grad_norm": 10.385746955871582, "learning_rate": 1.4103260869565218e-05, "loss": 0.852, "step": 3705 }, { "epoch": 1.06, "grad_norm": 10.889044761657715, "learning_rate": 1.4098970251716248e-05, "loss": 0.9843, "step": 3706 }, { "epoch": 1.06, "grad_norm": 8.673348426818848, "learning_rate": 1.4094679633867277e-05, "loss": 0.8038, "step": 3707 }, { "epoch": 1.06, "grad_norm": 10.381312370300293, "learning_rate": 1.4090389016018308e-05, "loss": 0.7471, "step": 3708 }, { "epoch": 1.06, "grad_norm": 9.749486923217773, "learning_rate": 1.4086098398169336e-05, "loss": 0.7541, "step": 3709 }, { "epoch": 1.06, "grad_norm": 10.783663749694824, "learning_rate": 1.4081807780320367e-05, "loss": 0.8046, "step": 3710 }, { "epoch": 1.06, "grad_norm": 10.428963661193848, "learning_rate": 1.4077517162471396e-05, "loss": 0.8263, "step": 3711 }, { "epoch": 1.06, "grad_norm": 8.74447250366211, "learning_rate": 1.4073226544622426e-05, "loss": 0.7661, "step": 3712 }, { "epoch": 1.06, "grad_norm": 10.322054862976074, "learning_rate": 1.4068935926773455e-05, "loss": 0.9435, "step": 3713 }, { "epoch": 1.06, "grad_norm": 9.705693244934082, "learning_rate": 1.4064645308924485e-05, "loss": 0.8276, "step": 3714 }, { "epoch": 1.06, "grad_norm": 9.739495277404785, "learning_rate": 1.4060354691075516e-05, "loss": 0.5656, "step": 3715 }, { "epoch": 1.06, "grad_norm": 11.19046401977539, "learning_rate": 1.4056064073226546e-05, "loss": 0.8381, "step": 3716 }, { "epoch": 1.06, "grad_norm": 9.727036476135254, "learning_rate": 1.4051773455377575e-05, "loss": 0.8366, "step": 3717 }, { "epoch": 1.06, "grad_norm": 7.8415207862854, "learning_rate": 1.4047482837528605e-05, "loss": 0.7503, "step": 3718 }, { "epoch": 1.06, "grad_norm": 9.288435935974121, "learning_rate": 1.4043192219679634e-05, "loss": 0.7301, "step": 3719 }, { "epoch": 1.06, "grad_norm": 8.485249519348145, "learning_rate": 1.4038901601830663e-05, "loss": 0.8689, "step": 3720 }, { "epoch": 1.06, "grad_norm": 10.322175025939941, "learning_rate": 1.4034610983981693e-05, "loss": 0.9206, "step": 3721 }, { "epoch": 1.06, "grad_norm": 9.204500198364258, "learning_rate": 1.4030320366132722e-05, "loss": 0.6163, "step": 3722 }, { "epoch": 1.06, "grad_norm": 12.585383415222168, "learning_rate": 1.4026029748283753e-05, "loss": 0.6983, "step": 3723 }, { "epoch": 1.07, "grad_norm": 10.98762321472168, "learning_rate": 1.4021739130434783e-05, "loss": 0.6787, "step": 3724 }, { "epoch": 1.07, "grad_norm": 15.406034469604492, "learning_rate": 1.4017448512585814e-05, "loss": 0.947, "step": 3725 }, { "epoch": 1.07, "grad_norm": 10.489848136901855, "learning_rate": 1.4013157894736842e-05, "loss": 0.7061, "step": 3726 }, { "epoch": 1.07, "grad_norm": 8.680780410766602, "learning_rate": 1.4008867276887873e-05, "loss": 0.6413, "step": 3727 }, { "epoch": 1.07, "grad_norm": 10.098986625671387, "learning_rate": 1.4004576659038902e-05, "loss": 0.8731, "step": 3728 }, { "epoch": 1.07, "grad_norm": 9.972673416137695, "learning_rate": 1.4000286041189932e-05, "loss": 0.6916, "step": 3729 }, { "epoch": 1.07, "grad_norm": 10.532546997070312, "learning_rate": 1.399599542334096e-05, "loss": 0.8237, "step": 3730 }, { "epoch": 1.07, "grad_norm": 9.868332862854004, "learning_rate": 1.3991704805491991e-05, "loss": 0.5668, "step": 3731 }, { "epoch": 1.07, "grad_norm": 9.763664245605469, "learning_rate": 1.3987414187643022e-05, "loss": 0.7665, "step": 3732 }, { "epoch": 1.07, "grad_norm": 8.648012161254883, "learning_rate": 1.398312356979405e-05, "loss": 0.526, "step": 3733 }, { "epoch": 1.07, "grad_norm": 10.330743789672852, "learning_rate": 1.3978832951945081e-05, "loss": 0.838, "step": 3734 }, { "epoch": 1.07, "grad_norm": 10.847214698791504, "learning_rate": 1.397454233409611e-05, "loss": 0.8665, "step": 3735 }, { "epoch": 1.07, "grad_norm": 12.673944473266602, "learning_rate": 1.397025171624714e-05, "loss": 0.9717, "step": 3736 }, { "epoch": 1.07, "grad_norm": 12.03902530670166, "learning_rate": 1.3965961098398169e-05, "loss": 1.05, "step": 3737 }, { "epoch": 1.07, "grad_norm": 10.373015403747559, "learning_rate": 1.39616704805492e-05, "loss": 0.7497, "step": 3738 }, { "epoch": 1.07, "grad_norm": 8.553824424743652, "learning_rate": 1.3957379862700228e-05, "loss": 0.6724, "step": 3739 }, { "epoch": 1.07, "grad_norm": 10.366500854492188, "learning_rate": 1.3953089244851259e-05, "loss": 0.8859, "step": 3740 }, { "epoch": 1.07, "grad_norm": 10.80486011505127, "learning_rate": 1.3948798627002289e-05, "loss": 1.0033, "step": 3741 }, { "epoch": 1.07, "grad_norm": 11.4472017288208, "learning_rate": 1.394450800915332e-05, "loss": 0.9502, "step": 3742 }, { "epoch": 1.07, "grad_norm": 10.950733184814453, "learning_rate": 1.3940217391304348e-05, "loss": 0.8685, "step": 3743 }, { "epoch": 1.07, "grad_norm": 11.05859661102295, "learning_rate": 1.3935926773455379e-05, "loss": 0.868, "step": 3744 }, { "epoch": 1.07, "grad_norm": 10.811240196228027, "learning_rate": 1.3931636155606408e-05, "loss": 0.9819, "step": 3745 }, { "epoch": 1.07, "grad_norm": 9.162971496582031, "learning_rate": 1.3927345537757436e-05, "loss": 0.643, "step": 3746 }, { "epoch": 1.07, "grad_norm": 9.291269302368164, "learning_rate": 1.3923054919908467e-05, "loss": 0.8225, "step": 3747 }, { "epoch": 1.07, "grad_norm": 12.17906379699707, "learning_rate": 1.3918764302059496e-05, "loss": 1.1213, "step": 3748 }, { "epoch": 1.07, "grad_norm": 11.068907737731934, "learning_rate": 1.3914473684210528e-05, "loss": 1.0983, "step": 3749 }, { "epoch": 1.07, "grad_norm": 11.328256607055664, "learning_rate": 1.3910183066361556e-05, "loss": 1.0155, "step": 3750 }, { "epoch": 1.07, "grad_norm": 10.168045997619629, "learning_rate": 1.3905892448512587e-05, "loss": 0.8069, "step": 3751 }, { "epoch": 1.07, "grad_norm": 9.096185684204102, "learning_rate": 1.3901601830663616e-05, "loss": 0.7577, "step": 3752 }, { "epoch": 1.07, "grad_norm": 7.600979328155518, "learning_rate": 1.3897311212814646e-05, "loss": 0.5877, "step": 3753 }, { "epoch": 1.07, "grad_norm": 9.016931533813477, "learning_rate": 1.3893020594965675e-05, "loss": 0.6699, "step": 3754 }, { "epoch": 1.07, "grad_norm": 7.853875160217285, "learning_rate": 1.3888729977116705e-05, "loss": 0.6426, "step": 3755 }, { "epoch": 1.07, "grad_norm": 9.79909896850586, "learning_rate": 1.3884439359267734e-05, "loss": 0.8809, "step": 3756 }, { "epoch": 1.07, "grad_norm": 9.951688766479492, "learning_rate": 1.3880148741418765e-05, "loss": 0.9389, "step": 3757 }, { "epoch": 1.07, "grad_norm": 10.226069450378418, "learning_rate": 1.3875858123569795e-05, "loss": 1.003, "step": 3758 }, { "epoch": 1.08, "grad_norm": 7.558326244354248, "learning_rate": 1.3871567505720824e-05, "loss": 0.787, "step": 3759 }, { "epoch": 1.08, "grad_norm": 9.79404067993164, "learning_rate": 1.3867276887871854e-05, "loss": 0.8167, "step": 3760 }, { "epoch": 1.08, "grad_norm": 10.235540390014648, "learning_rate": 1.3862986270022883e-05, "loss": 0.886, "step": 3761 }, { "epoch": 1.08, "grad_norm": 9.964478492736816, "learning_rate": 1.3858695652173914e-05, "loss": 0.7499, "step": 3762 }, { "epoch": 1.08, "grad_norm": 10.020101547241211, "learning_rate": 1.3854405034324942e-05, "loss": 0.6242, "step": 3763 }, { "epoch": 1.08, "grad_norm": 12.146797180175781, "learning_rate": 1.3850114416475973e-05, "loss": 0.9343, "step": 3764 }, { "epoch": 1.08, "grad_norm": 8.966434478759766, "learning_rate": 1.3845823798627002e-05, "loss": 0.9949, "step": 3765 }, { "epoch": 1.08, "grad_norm": 8.525348663330078, "learning_rate": 1.3841533180778034e-05, "loss": 0.5102, "step": 3766 }, { "epoch": 1.08, "grad_norm": 10.42138957977295, "learning_rate": 1.3837242562929062e-05, "loss": 0.8386, "step": 3767 }, { "epoch": 1.08, "grad_norm": 12.72672176361084, "learning_rate": 1.3832951945080093e-05, "loss": 1.0639, "step": 3768 }, { "epoch": 1.08, "grad_norm": 11.651678085327148, "learning_rate": 1.3828661327231122e-05, "loss": 0.8836, "step": 3769 }, { "epoch": 1.08, "grad_norm": 10.607648849487305, "learning_rate": 1.3824370709382152e-05, "loss": 0.9617, "step": 3770 }, { "epoch": 1.08, "grad_norm": 10.19911003112793, "learning_rate": 1.3820080091533181e-05, "loss": 0.7797, "step": 3771 }, { "epoch": 1.08, "grad_norm": 8.973006248474121, "learning_rate": 1.3815789473684211e-05, "loss": 0.7331, "step": 3772 }, { "epoch": 1.08, "grad_norm": 10.524520874023438, "learning_rate": 1.381149885583524e-05, "loss": 0.834, "step": 3773 }, { "epoch": 1.08, "grad_norm": 9.342079162597656, "learning_rate": 1.3807208237986269e-05, "loss": 0.6822, "step": 3774 }, { "epoch": 1.08, "grad_norm": 10.43412971496582, "learning_rate": 1.3802917620137301e-05, "loss": 0.8916, "step": 3775 }, { "epoch": 1.08, "grad_norm": 10.923917770385742, "learning_rate": 1.379862700228833e-05, "loss": 1.0817, "step": 3776 }, { "epoch": 1.08, "grad_norm": 10.417581558227539, "learning_rate": 1.379433638443936e-05, "loss": 0.6987, "step": 3777 }, { "epoch": 1.08, "grad_norm": 9.958463668823242, "learning_rate": 1.3790045766590389e-05, "loss": 0.8642, "step": 3778 }, { "epoch": 1.08, "grad_norm": 10.143387794494629, "learning_rate": 1.378575514874142e-05, "loss": 0.6766, "step": 3779 }, { "epoch": 1.08, "grad_norm": 8.822731971740723, "learning_rate": 1.3781464530892448e-05, "loss": 0.7229, "step": 3780 }, { "epoch": 1.08, "grad_norm": 9.64212417602539, "learning_rate": 1.3777173913043479e-05, "loss": 0.7974, "step": 3781 }, { "epoch": 1.08, "grad_norm": 9.49821949005127, "learning_rate": 1.3772883295194508e-05, "loss": 0.5882, "step": 3782 }, { "epoch": 1.08, "grad_norm": 9.604161262512207, "learning_rate": 1.3768592677345538e-05, "loss": 0.7547, "step": 3783 }, { "epoch": 1.08, "grad_norm": 10.2879638671875, "learning_rate": 1.3764302059496568e-05, "loss": 0.9607, "step": 3784 }, { "epoch": 1.08, "grad_norm": 8.067282676696777, "learning_rate": 1.3760011441647599e-05, "loss": 0.797, "step": 3785 }, { "epoch": 1.08, "grad_norm": 11.269867897033691, "learning_rate": 1.3755720823798628e-05, "loss": 0.9492, "step": 3786 }, { "epoch": 1.08, "grad_norm": 9.921648979187012, "learning_rate": 1.3751430205949656e-05, "loss": 0.6434, "step": 3787 }, { "epoch": 1.08, "grad_norm": 8.82890510559082, "learning_rate": 1.3747139588100687e-05, "loss": 0.8586, "step": 3788 }, { "epoch": 1.08, "grad_norm": 9.924644470214844, "learning_rate": 1.3742848970251716e-05, "loss": 0.7732, "step": 3789 }, { "epoch": 1.08, "grad_norm": 8.59892463684082, "learning_rate": 1.3738558352402746e-05, "loss": 0.7436, "step": 3790 }, { "epoch": 1.08, "grad_norm": 10.175561904907227, "learning_rate": 1.3734267734553775e-05, "loss": 0.8205, "step": 3791 }, { "epoch": 1.08, "grad_norm": 8.727319717407227, "learning_rate": 1.3729977116704807e-05, "loss": 0.6489, "step": 3792 }, { "epoch": 1.08, "grad_norm": 9.501527786254883, "learning_rate": 1.3725686498855836e-05, "loss": 0.7864, "step": 3793 }, { "epoch": 1.09, "grad_norm": 9.407328605651855, "learning_rate": 1.3721395881006866e-05, "loss": 0.707, "step": 3794 }, { "epoch": 1.09, "grad_norm": 9.251099586486816, "learning_rate": 1.3717105263157895e-05, "loss": 0.7515, "step": 3795 }, { "epoch": 1.09, "grad_norm": 13.685705184936523, "learning_rate": 1.3712814645308926e-05, "loss": 1.0767, "step": 3796 }, { "epoch": 1.09, "grad_norm": 10.53074836730957, "learning_rate": 1.3708524027459954e-05, "loss": 0.8211, "step": 3797 }, { "epoch": 1.09, "grad_norm": 9.404327392578125, "learning_rate": 1.3704233409610985e-05, "loss": 0.7364, "step": 3798 }, { "epoch": 1.09, "grad_norm": 8.201828956604004, "learning_rate": 1.3699942791762014e-05, "loss": 0.8571, "step": 3799 }, { "epoch": 1.09, "grad_norm": 11.381919860839844, "learning_rate": 1.3695652173913042e-05, "loss": 1.0725, "step": 3800 }, { "epoch": 1.09, "grad_norm": 11.355937957763672, "learning_rate": 1.3691361556064074e-05, "loss": 0.769, "step": 3801 }, { "epoch": 1.09, "grad_norm": 10.03324031829834, "learning_rate": 1.3687070938215103e-05, "loss": 0.7663, "step": 3802 }, { "epoch": 1.09, "grad_norm": 8.717913627624512, "learning_rate": 1.3682780320366134e-05, "loss": 0.6451, "step": 3803 }, { "epoch": 1.09, "grad_norm": 13.435998916625977, "learning_rate": 1.3678489702517162e-05, "loss": 0.9202, "step": 3804 }, { "epoch": 1.09, "grad_norm": 13.894830703735352, "learning_rate": 1.3674199084668193e-05, "loss": 0.9243, "step": 3805 }, { "epoch": 1.09, "grad_norm": 11.40660285949707, "learning_rate": 1.3669908466819222e-05, "loss": 1.0794, "step": 3806 }, { "epoch": 1.09, "grad_norm": 9.202834129333496, "learning_rate": 1.3665617848970252e-05, "loss": 0.6797, "step": 3807 }, { "epoch": 1.09, "grad_norm": 9.211841583251953, "learning_rate": 1.3661327231121281e-05, "loss": 0.7329, "step": 3808 }, { "epoch": 1.09, "grad_norm": 11.23617172241211, "learning_rate": 1.3657036613272313e-05, "loss": 0.8237, "step": 3809 }, { "epoch": 1.09, "grad_norm": 11.655969619750977, "learning_rate": 1.3652745995423342e-05, "loss": 0.9963, "step": 3810 }, { "epoch": 1.09, "grad_norm": 9.310397148132324, "learning_rate": 1.3648455377574372e-05, "loss": 0.7312, "step": 3811 }, { "epoch": 1.09, "grad_norm": 9.105588912963867, "learning_rate": 1.3644164759725401e-05, "loss": 0.6932, "step": 3812 }, { "epoch": 1.09, "grad_norm": 8.658004760742188, "learning_rate": 1.363987414187643e-05, "loss": 0.8755, "step": 3813 }, { "epoch": 1.09, "grad_norm": 11.68152904510498, "learning_rate": 1.363558352402746e-05, "loss": 1.0816, "step": 3814 }, { "epoch": 1.09, "grad_norm": 13.390514373779297, "learning_rate": 1.3631292906178489e-05, "loss": 1.0396, "step": 3815 }, { "epoch": 1.09, "grad_norm": 10.660036087036133, "learning_rate": 1.362700228832952e-05, "loss": 0.8675, "step": 3816 }, { "epoch": 1.09, "grad_norm": 9.080789566040039, "learning_rate": 1.3622711670480548e-05, "loss": 0.7688, "step": 3817 }, { "epoch": 1.09, "grad_norm": 14.014163970947266, "learning_rate": 1.361842105263158e-05, "loss": 0.9729, "step": 3818 }, { "epoch": 1.09, "grad_norm": 10.439164161682129, "learning_rate": 1.361413043478261e-05, "loss": 0.8642, "step": 3819 }, { "epoch": 1.09, "grad_norm": 8.424198150634766, "learning_rate": 1.360983981693364e-05, "loss": 0.8619, "step": 3820 }, { "epoch": 1.09, "grad_norm": 11.352283477783203, "learning_rate": 1.3605549199084668e-05, "loss": 0.728, "step": 3821 }, { "epoch": 1.09, "grad_norm": 10.121688842773438, "learning_rate": 1.3601258581235699e-05, "loss": 0.7932, "step": 3822 }, { "epoch": 1.09, "grad_norm": 8.25421142578125, "learning_rate": 1.3596967963386728e-05, "loss": 0.7211, "step": 3823 }, { "epoch": 1.09, "grad_norm": 8.372735977172852, "learning_rate": 1.3592677345537758e-05, "loss": 0.8354, "step": 3824 }, { "epoch": 1.09, "grad_norm": 8.384273529052734, "learning_rate": 1.3588386727688787e-05, "loss": 0.7215, "step": 3825 }, { "epoch": 1.09, "grad_norm": 12.669160842895508, "learning_rate": 1.3584096109839817e-05, "loss": 0.9718, "step": 3826 }, { "epoch": 1.09, "grad_norm": 9.039535522460938, "learning_rate": 1.3579805491990848e-05, "loss": 0.8118, "step": 3827 }, { "epoch": 1.09, "grad_norm": 9.365995407104492, "learning_rate": 1.3575514874141877e-05, "loss": 0.7145, "step": 3828 }, { "epoch": 1.1, "grad_norm": 11.246047973632812, "learning_rate": 1.3571224256292907e-05, "loss": 0.9481, "step": 3829 }, { "epoch": 1.1, "grad_norm": 10.856829643249512, "learning_rate": 1.3566933638443936e-05, "loss": 0.9542, "step": 3830 }, { "epoch": 1.1, "grad_norm": 10.257083892822266, "learning_rate": 1.3562643020594966e-05, "loss": 0.9543, "step": 3831 }, { "epoch": 1.1, "grad_norm": 8.790023803710938, "learning_rate": 1.3558352402745995e-05, "loss": 0.6408, "step": 3832 }, { "epoch": 1.1, "grad_norm": 11.7524995803833, "learning_rate": 1.3554061784897025e-05, "loss": 1.1864, "step": 3833 }, { "epoch": 1.1, "grad_norm": 11.273727416992188, "learning_rate": 1.3549771167048054e-05, "loss": 1.1412, "step": 3834 }, { "epoch": 1.1, "grad_norm": 9.125393867492676, "learning_rate": 1.3545480549199086e-05, "loss": 0.9648, "step": 3835 }, { "epoch": 1.1, "grad_norm": 9.272759437561035, "learning_rate": 1.3541189931350115e-05, "loss": 0.6963, "step": 3836 }, { "epoch": 1.1, "grad_norm": 8.003986358642578, "learning_rate": 1.3536899313501146e-05, "loss": 0.8362, "step": 3837 }, { "epoch": 1.1, "grad_norm": 7.971385955810547, "learning_rate": 1.3532608695652174e-05, "loss": 0.6797, "step": 3838 }, { "epoch": 1.1, "grad_norm": 8.665493965148926, "learning_rate": 1.3528318077803203e-05, "loss": 0.8865, "step": 3839 }, { "epoch": 1.1, "grad_norm": 10.29703426361084, "learning_rate": 1.3524027459954234e-05, "loss": 1.0045, "step": 3840 }, { "epoch": 1.1, "grad_norm": 8.791654586791992, "learning_rate": 1.3519736842105262e-05, "loss": 0.8541, "step": 3841 }, { "epoch": 1.1, "grad_norm": 9.257181167602539, "learning_rate": 1.3515446224256293e-05, "loss": 0.8603, "step": 3842 }, { "epoch": 1.1, "grad_norm": 10.332367897033691, "learning_rate": 1.3511155606407322e-05, "loss": 0.8712, "step": 3843 }, { "epoch": 1.1, "grad_norm": 11.061684608459473, "learning_rate": 1.3506864988558354e-05, "loss": 1.1324, "step": 3844 }, { "epoch": 1.1, "grad_norm": 9.152527809143066, "learning_rate": 1.3502574370709383e-05, "loss": 0.7468, "step": 3845 }, { "epoch": 1.1, "grad_norm": 11.406342506408691, "learning_rate": 1.3498283752860413e-05, "loss": 0.8362, "step": 3846 }, { "epoch": 1.1, "grad_norm": 8.86299991607666, "learning_rate": 1.3493993135011442e-05, "loss": 0.591, "step": 3847 }, { "epoch": 1.1, "grad_norm": 9.53094482421875, "learning_rate": 1.3489702517162472e-05, "loss": 0.7772, "step": 3848 }, { "epoch": 1.1, "grad_norm": 10.275636672973633, "learning_rate": 1.3485411899313501e-05, "loss": 0.8751, "step": 3849 }, { "epoch": 1.1, "grad_norm": 11.548718452453613, "learning_rate": 1.3481121281464531e-05, "loss": 0.709, "step": 3850 }, { "epoch": 1.1, "grad_norm": 10.843782424926758, "learning_rate": 1.347683066361556e-05, "loss": 0.8233, "step": 3851 }, { "epoch": 1.1, "grad_norm": 10.66096305847168, "learning_rate": 1.347254004576659e-05, "loss": 0.8643, "step": 3852 }, { "epoch": 1.1, "grad_norm": 7.803994178771973, "learning_rate": 1.3468249427917621e-05, "loss": 0.6668, "step": 3853 }, { "epoch": 1.1, "grad_norm": 10.722987174987793, "learning_rate": 1.346395881006865e-05, "loss": 0.8912, "step": 3854 }, { "epoch": 1.1, "grad_norm": 8.521932601928711, "learning_rate": 1.345966819221968e-05, "loss": 0.8668, "step": 3855 }, { "epoch": 1.1, "grad_norm": 10.58981990814209, "learning_rate": 1.345537757437071e-05, "loss": 0.8814, "step": 3856 }, { "epoch": 1.1, "grad_norm": 10.837814331054688, "learning_rate": 1.345108695652174e-05, "loss": 0.9011, "step": 3857 }, { "epoch": 1.1, "grad_norm": 9.734378814697266, "learning_rate": 1.3446796338672768e-05, "loss": 0.8212, "step": 3858 }, { "epoch": 1.1, "grad_norm": 9.41157341003418, "learning_rate": 1.3442505720823799e-05, "loss": 0.7862, "step": 3859 }, { "epoch": 1.1, "grad_norm": 10.327248573303223, "learning_rate": 1.3438215102974828e-05, "loss": 0.8898, "step": 3860 }, { "epoch": 1.1, "grad_norm": 11.293153762817383, "learning_rate": 1.343392448512586e-05, "loss": 0.8931, "step": 3861 }, { "epoch": 1.1, "grad_norm": 8.565530776977539, "learning_rate": 1.3429633867276889e-05, "loss": 0.7467, "step": 3862 }, { "epoch": 1.1, "grad_norm": 9.503556251525879, "learning_rate": 1.3425343249427919e-05, "loss": 0.7474, "step": 3863 }, { "epoch": 1.11, "grad_norm": 11.677658081054688, "learning_rate": 1.3421052631578948e-05, "loss": 0.9569, "step": 3864 }, { "epoch": 1.11, "grad_norm": 10.089754104614258, "learning_rate": 1.3416762013729977e-05, "loss": 0.8821, "step": 3865 }, { "epoch": 1.11, "grad_norm": 9.392486572265625, "learning_rate": 1.3412471395881007e-05, "loss": 0.6777, "step": 3866 }, { "epoch": 1.11, "grad_norm": 9.124673843383789, "learning_rate": 1.3408180778032036e-05, "loss": 0.7258, "step": 3867 }, { "epoch": 1.11, "grad_norm": 10.929582595825195, "learning_rate": 1.3403890160183066e-05, "loss": 1.1245, "step": 3868 }, { "epoch": 1.11, "grad_norm": 13.617674827575684, "learning_rate": 1.3399599542334097e-05, "loss": 1.2321, "step": 3869 }, { "epoch": 1.11, "grad_norm": 6.998832702636719, "learning_rate": 1.3395308924485127e-05, "loss": 0.5102, "step": 3870 }, { "epoch": 1.11, "grad_norm": 8.23576545715332, "learning_rate": 1.3391018306636156e-05, "loss": 0.6787, "step": 3871 }, { "epoch": 1.11, "grad_norm": 9.127097129821777, "learning_rate": 1.3386727688787186e-05, "loss": 0.8956, "step": 3872 }, { "epoch": 1.11, "grad_norm": 8.985982894897461, "learning_rate": 1.3382437070938215e-05, "loss": 0.8104, "step": 3873 }, { "epoch": 1.11, "grad_norm": 10.850921630859375, "learning_rate": 1.3378146453089246e-05, "loss": 0.8172, "step": 3874 }, { "epoch": 1.11, "grad_norm": 11.072077751159668, "learning_rate": 1.3373855835240274e-05, "loss": 0.8077, "step": 3875 }, { "epoch": 1.11, "grad_norm": 9.985342025756836, "learning_rate": 1.3369565217391305e-05, "loss": 0.7859, "step": 3876 }, { "epoch": 1.11, "grad_norm": 10.39843463897705, "learning_rate": 1.3365274599542334e-05, "loss": 0.9875, "step": 3877 }, { "epoch": 1.11, "grad_norm": 10.44593620300293, "learning_rate": 1.3360983981693364e-05, "loss": 0.7658, "step": 3878 }, { "epoch": 1.11, "grad_norm": 7.363400936126709, "learning_rate": 1.3356693363844395e-05, "loss": 0.7387, "step": 3879 }, { "epoch": 1.11, "grad_norm": 8.5333251953125, "learning_rate": 1.3352402745995423e-05, "loss": 0.6563, "step": 3880 }, { "epoch": 1.11, "grad_norm": 11.564074516296387, "learning_rate": 1.3348112128146454e-05, "loss": 0.7479, "step": 3881 }, { "epoch": 1.11, "grad_norm": 11.394442558288574, "learning_rate": 1.3343821510297483e-05, "loss": 0.8078, "step": 3882 }, { "epoch": 1.11, "grad_norm": 11.571203231811523, "learning_rate": 1.3339530892448513e-05, "loss": 0.9504, "step": 3883 }, { "epoch": 1.11, "grad_norm": 9.84980297088623, "learning_rate": 1.3335240274599542e-05, "loss": 0.6538, "step": 3884 }, { "epoch": 1.11, "grad_norm": 9.279945373535156, "learning_rate": 1.3330949656750572e-05, "loss": 0.8315, "step": 3885 }, { "epoch": 1.11, "grad_norm": 8.108189582824707, "learning_rate": 1.3326659038901603e-05, "loss": 0.6845, "step": 3886 }, { "epoch": 1.11, "grad_norm": 11.106013298034668, "learning_rate": 1.3322368421052633e-05, "loss": 0.9725, "step": 3887 }, { "epoch": 1.11, "grad_norm": 9.482634544372559, "learning_rate": 1.3318077803203662e-05, "loss": 0.7943, "step": 3888 }, { "epoch": 1.11, "grad_norm": 8.352679252624512, "learning_rate": 1.3313787185354692e-05, "loss": 0.7034, "step": 3889 }, { "epoch": 1.11, "grad_norm": 9.596867561340332, "learning_rate": 1.3309496567505721e-05, "loss": 0.7767, "step": 3890 }, { "epoch": 1.11, "grad_norm": 10.758633613586426, "learning_rate": 1.330520594965675e-05, "loss": 1.1519, "step": 3891 }, { "epoch": 1.11, "grad_norm": 8.963069915771484, "learning_rate": 1.330091533180778e-05, "loss": 0.7092, "step": 3892 }, { "epoch": 1.11, "grad_norm": 10.518741607666016, "learning_rate": 1.329662471395881e-05, "loss": 1.0363, "step": 3893 }, { "epoch": 1.11, "grad_norm": 10.884881973266602, "learning_rate": 1.329233409610984e-05, "loss": 0.9321, "step": 3894 }, { "epoch": 1.11, "grad_norm": 8.422544479370117, "learning_rate": 1.328804347826087e-05, "loss": 0.5192, "step": 3895 }, { "epoch": 1.11, "grad_norm": 11.745040893554688, "learning_rate": 1.32837528604119e-05, "loss": 1.0652, "step": 3896 }, { "epoch": 1.11, "grad_norm": 8.32690143585205, "learning_rate": 1.327946224256293e-05, "loss": 0.7271, "step": 3897 }, { "epoch": 1.11, "grad_norm": 10.290936470031738, "learning_rate": 1.327517162471396e-05, "loss": 0.7431, "step": 3898 }, { "epoch": 1.12, "grad_norm": 11.083048820495605, "learning_rate": 1.3270881006864989e-05, "loss": 0.831, "step": 3899 }, { "epoch": 1.12, "grad_norm": 10.073370933532715, "learning_rate": 1.3266590389016019e-05, "loss": 0.874, "step": 3900 }, { "epoch": 1.12, "grad_norm": 12.049382209777832, "learning_rate": 1.3262299771167048e-05, "loss": 0.8641, "step": 3901 }, { "epoch": 1.12, "grad_norm": 11.350683212280273, "learning_rate": 1.3258009153318078e-05, "loss": 0.5894, "step": 3902 }, { "epoch": 1.12, "grad_norm": 8.89463996887207, "learning_rate": 1.3253718535469107e-05, "loss": 0.699, "step": 3903 }, { "epoch": 1.12, "grad_norm": 8.856133460998535, "learning_rate": 1.3249427917620137e-05, "loss": 0.7376, "step": 3904 }, { "epoch": 1.12, "grad_norm": 8.090983390808105, "learning_rate": 1.3245137299771168e-05, "loss": 0.4932, "step": 3905 }, { "epoch": 1.12, "grad_norm": 12.358712196350098, "learning_rate": 1.3240846681922197e-05, "loss": 0.8241, "step": 3906 }, { "epoch": 1.12, "grad_norm": 8.918922424316406, "learning_rate": 1.3236556064073227e-05, "loss": 0.661, "step": 3907 }, { "epoch": 1.12, "grad_norm": 13.80139446258545, "learning_rate": 1.3232265446224256e-05, "loss": 1.0576, "step": 3908 }, { "epoch": 1.12, "grad_norm": 8.336238861083984, "learning_rate": 1.3227974828375286e-05, "loss": 0.6813, "step": 3909 }, { "epoch": 1.12, "grad_norm": 8.881771087646484, "learning_rate": 1.3223684210526315e-05, "loss": 0.5574, "step": 3910 }, { "epoch": 1.12, "grad_norm": 10.341936111450195, "learning_rate": 1.3219393592677346e-05, "loss": 0.779, "step": 3911 }, { "epoch": 1.12, "grad_norm": 10.47202205657959, "learning_rate": 1.3215102974828376e-05, "loss": 0.7365, "step": 3912 }, { "epoch": 1.12, "grad_norm": 10.373587608337402, "learning_rate": 1.3210812356979407e-05, "loss": 0.9122, "step": 3913 }, { "epoch": 1.12, "grad_norm": 11.215059280395508, "learning_rate": 1.3206521739130435e-05, "loss": 0.8947, "step": 3914 }, { "epoch": 1.12, "grad_norm": 10.4517822265625, "learning_rate": 1.3202231121281466e-05, "loss": 0.9194, "step": 3915 }, { "epoch": 1.12, "grad_norm": 13.125057220458984, "learning_rate": 1.3197940503432495e-05, "loss": 1.114, "step": 3916 }, { "epoch": 1.12, "grad_norm": 10.323071479797363, "learning_rate": 1.3193649885583523e-05, "loss": 0.9372, "step": 3917 }, { "epoch": 1.12, "grad_norm": 9.739777565002441, "learning_rate": 1.3189359267734554e-05, "loss": 0.9486, "step": 3918 }, { "epoch": 1.12, "grad_norm": 9.97629451751709, "learning_rate": 1.3185068649885583e-05, "loss": 0.7643, "step": 3919 }, { "epoch": 1.12, "grad_norm": 10.741100311279297, "learning_rate": 1.3180778032036613e-05, "loss": 0.8237, "step": 3920 }, { "epoch": 1.12, "grad_norm": 10.449416160583496, "learning_rate": 1.3176487414187643e-05, "loss": 1.0419, "step": 3921 }, { "epoch": 1.12, "grad_norm": 9.64289665222168, "learning_rate": 1.3172196796338674e-05, "loss": 0.7974, "step": 3922 }, { "epoch": 1.12, "grad_norm": 8.52686595916748, "learning_rate": 1.3167906178489703e-05, "loss": 0.791, "step": 3923 }, { "epoch": 1.12, "grad_norm": 10.28368091583252, "learning_rate": 1.3163615560640733e-05, "loss": 0.7411, "step": 3924 }, { "epoch": 1.12, "grad_norm": 8.14837646484375, "learning_rate": 1.3159324942791762e-05, "loss": 0.805, "step": 3925 }, { "epoch": 1.12, "grad_norm": 10.000300407409668, "learning_rate": 1.3155034324942792e-05, "loss": 0.7476, "step": 3926 }, { "epoch": 1.12, "grad_norm": 11.533475875854492, "learning_rate": 1.3150743707093821e-05, "loss": 0.8103, "step": 3927 }, { "epoch": 1.12, "grad_norm": 10.855892181396484, "learning_rate": 1.3146453089244852e-05, "loss": 0.7256, "step": 3928 }, { "epoch": 1.12, "grad_norm": 10.99836254119873, "learning_rate": 1.3142162471395882e-05, "loss": 1.0138, "step": 3929 }, { "epoch": 1.12, "grad_norm": 9.347951889038086, "learning_rate": 1.3137871853546911e-05, "loss": 0.7863, "step": 3930 }, { "epoch": 1.12, "grad_norm": 10.57369613647461, "learning_rate": 1.3133581235697941e-05, "loss": 0.7666, "step": 3931 }, { "epoch": 1.12, "grad_norm": 9.703213691711426, "learning_rate": 1.312929061784897e-05, "loss": 0.822, "step": 3932 }, { "epoch": 1.12, "grad_norm": 8.332062721252441, "learning_rate": 1.3125e-05, "loss": 0.5574, "step": 3933 }, { "epoch": 1.13, "grad_norm": 9.98759651184082, "learning_rate": 1.312070938215103e-05, "loss": 0.7092, "step": 3934 }, { "epoch": 1.13, "grad_norm": 9.835576057434082, "learning_rate": 1.311641876430206e-05, "loss": 0.7832, "step": 3935 }, { "epoch": 1.13, "grad_norm": 10.170145988464355, "learning_rate": 1.3112128146453089e-05, "loss": 0.6981, "step": 3936 }, { "epoch": 1.13, "grad_norm": 10.585335731506348, "learning_rate": 1.3107837528604119e-05, "loss": 0.8255, "step": 3937 }, { "epoch": 1.13, "grad_norm": 8.866064071655273, "learning_rate": 1.310354691075515e-05, "loss": 0.7015, "step": 3938 }, { "epoch": 1.13, "grad_norm": 11.070013999938965, "learning_rate": 1.309925629290618e-05, "loss": 0.899, "step": 3939 }, { "epoch": 1.13, "grad_norm": 9.384458541870117, "learning_rate": 1.3094965675057209e-05, "loss": 0.4722, "step": 3940 }, { "epoch": 1.13, "grad_norm": 9.862875938415527, "learning_rate": 1.309067505720824e-05, "loss": 0.8842, "step": 3941 }, { "epoch": 1.13, "grad_norm": 10.934288024902344, "learning_rate": 1.3086384439359268e-05, "loss": 0.7596, "step": 3942 }, { "epoch": 1.13, "grad_norm": 10.560894012451172, "learning_rate": 1.3082093821510297e-05, "loss": 0.6473, "step": 3943 }, { "epoch": 1.13, "grad_norm": 10.554158210754395, "learning_rate": 1.3077803203661327e-05, "loss": 0.8742, "step": 3944 }, { "epoch": 1.13, "grad_norm": 10.599578857421875, "learning_rate": 1.3073512585812356e-05, "loss": 0.8191, "step": 3945 }, { "epoch": 1.13, "grad_norm": 9.034960746765137, "learning_rate": 1.3069221967963388e-05, "loss": 0.7124, "step": 3946 }, { "epoch": 1.13, "grad_norm": 11.03193473815918, "learning_rate": 1.3064931350114417e-05, "loss": 0.6531, "step": 3947 }, { "epoch": 1.13, "grad_norm": 12.117925643920898, "learning_rate": 1.3060640732265447e-05, "loss": 1.0116, "step": 3948 }, { "epoch": 1.13, "grad_norm": 9.007270812988281, "learning_rate": 1.3056350114416476e-05, "loss": 0.7321, "step": 3949 }, { "epoch": 1.13, "grad_norm": 10.332066535949707, "learning_rate": 1.3052059496567507e-05, "loss": 1.1249, "step": 3950 }, { "epoch": 1.13, "grad_norm": 8.80511474609375, "learning_rate": 1.3047768878718535e-05, "loss": 0.6779, "step": 3951 }, { "epoch": 1.13, "grad_norm": 10.973099708557129, "learning_rate": 1.3043478260869566e-05, "loss": 0.6649, "step": 3952 }, { "epoch": 1.13, "grad_norm": 8.528388977050781, "learning_rate": 1.3039187643020595e-05, "loss": 0.7054, "step": 3953 }, { "epoch": 1.13, "grad_norm": 9.05822467803955, "learning_rate": 1.3034897025171625e-05, "loss": 0.7759, "step": 3954 }, { "epoch": 1.13, "grad_norm": 9.46092414855957, "learning_rate": 1.3030606407322655e-05, "loss": 0.6742, "step": 3955 }, { "epoch": 1.13, "grad_norm": 9.287985801696777, "learning_rate": 1.3026315789473684e-05, "loss": 0.7852, "step": 3956 }, { "epoch": 1.13, "grad_norm": 10.732928276062012, "learning_rate": 1.3022025171624715e-05, "loss": 0.8557, "step": 3957 }, { "epoch": 1.13, "grad_norm": 9.83497142791748, "learning_rate": 1.3017734553775743e-05, "loss": 0.6626, "step": 3958 }, { "epoch": 1.13, "grad_norm": 8.530620574951172, "learning_rate": 1.3013443935926774e-05, "loss": 0.6577, "step": 3959 }, { "epoch": 1.13, "grad_norm": 8.171707153320312, "learning_rate": 1.3009153318077803e-05, "loss": 0.6615, "step": 3960 }, { "epoch": 1.13, "grad_norm": 9.138986587524414, "learning_rate": 1.3004862700228833e-05, "loss": 0.6291, "step": 3961 }, { "epoch": 1.13, "grad_norm": 9.378610610961914, "learning_rate": 1.3000572082379862e-05, "loss": 0.8419, "step": 3962 }, { "epoch": 1.13, "grad_norm": 10.207708358764648, "learning_rate": 1.2996281464530894e-05, "loss": 0.7725, "step": 3963 }, { "epoch": 1.13, "grad_norm": 12.150755882263184, "learning_rate": 1.2991990846681923e-05, "loss": 1.2711, "step": 3964 }, { "epoch": 1.13, "grad_norm": 11.433258056640625, "learning_rate": 1.2987700228832953e-05, "loss": 0.8698, "step": 3965 }, { "epoch": 1.13, "grad_norm": 12.114084243774414, "learning_rate": 1.2983409610983982e-05, "loss": 0.9742, "step": 3966 }, { "epoch": 1.13, "grad_norm": 8.968756675720215, "learning_rate": 1.2979118993135013e-05, "loss": 0.6515, "step": 3967 }, { "epoch": 1.14, "grad_norm": 9.046331405639648, "learning_rate": 1.2974828375286041e-05, "loss": 0.7982, "step": 3968 }, { "epoch": 1.14, "grad_norm": 11.625608444213867, "learning_rate": 1.2970537757437072e-05, "loss": 1.0168, "step": 3969 }, { "epoch": 1.14, "grad_norm": 8.300956726074219, "learning_rate": 1.29662471395881e-05, "loss": 0.7453, "step": 3970 }, { "epoch": 1.14, "grad_norm": 7.171285629272461, "learning_rate": 1.296195652173913e-05, "loss": 0.5139, "step": 3971 }, { "epoch": 1.14, "grad_norm": 7.901523590087891, "learning_rate": 1.2957665903890161e-05, "loss": 0.6808, "step": 3972 }, { "epoch": 1.14, "grad_norm": 10.088178634643555, "learning_rate": 1.295337528604119e-05, "loss": 1.0606, "step": 3973 }, { "epoch": 1.14, "grad_norm": 10.532071113586426, "learning_rate": 1.294908466819222e-05, "loss": 0.6841, "step": 3974 }, { "epoch": 1.14, "grad_norm": 10.62282943725586, "learning_rate": 1.294479405034325e-05, "loss": 0.9119, "step": 3975 }, { "epoch": 1.14, "grad_norm": 9.230091094970703, "learning_rate": 1.294050343249428e-05, "loss": 0.5679, "step": 3976 }, { "epoch": 1.14, "grad_norm": 10.999259948730469, "learning_rate": 1.2936212814645309e-05, "loss": 0.8994, "step": 3977 }, { "epoch": 1.14, "grad_norm": 8.278515815734863, "learning_rate": 1.2931922196796339e-05, "loss": 0.5793, "step": 3978 }, { "epoch": 1.14, "grad_norm": 13.591418266296387, "learning_rate": 1.2927631578947368e-05, "loss": 0.8064, "step": 3979 }, { "epoch": 1.14, "grad_norm": 10.123757362365723, "learning_rate": 1.2923340961098398e-05, "loss": 0.8161, "step": 3980 }, { "epoch": 1.14, "grad_norm": 11.282858848571777, "learning_rate": 1.2919050343249429e-05, "loss": 0.8874, "step": 3981 }, { "epoch": 1.14, "grad_norm": 11.874940872192383, "learning_rate": 1.291475972540046e-05, "loss": 0.6845, "step": 3982 }, { "epoch": 1.14, "grad_norm": 9.55794906616211, "learning_rate": 1.2910469107551488e-05, "loss": 0.767, "step": 3983 }, { "epoch": 1.14, "grad_norm": 8.992429733276367, "learning_rate": 1.2906178489702517e-05, "loss": 0.5362, "step": 3984 }, { "epoch": 1.14, "grad_norm": 9.76063060760498, "learning_rate": 1.2901887871853547e-05, "loss": 0.6032, "step": 3985 }, { "epoch": 1.14, "grad_norm": 11.053279876708984, "learning_rate": 1.2897597254004576e-05, "loss": 0.8658, "step": 3986 }, { "epoch": 1.14, "grad_norm": 9.731647491455078, "learning_rate": 1.2893306636155607e-05, "loss": 0.6615, "step": 3987 }, { "epoch": 1.14, "grad_norm": 9.753975868225098, "learning_rate": 1.2889016018306635e-05, "loss": 0.702, "step": 3988 }, { "epoch": 1.14, "grad_norm": 8.342461585998535, "learning_rate": 1.2884725400457667e-05, "loss": 0.7509, "step": 3989 }, { "epoch": 1.14, "grad_norm": 10.290438652038574, "learning_rate": 1.2880434782608696e-05, "loss": 0.7519, "step": 3990 }, { "epoch": 1.14, "grad_norm": 9.696769714355469, "learning_rate": 1.2876144164759727e-05, "loss": 0.8955, "step": 3991 }, { "epoch": 1.14, "grad_norm": 10.913823127746582, "learning_rate": 1.2871853546910755e-05, "loss": 0.909, "step": 3992 }, { "epoch": 1.14, "grad_norm": 9.27153205871582, "learning_rate": 1.2867562929061786e-05, "loss": 0.7404, "step": 3993 }, { "epoch": 1.14, "grad_norm": 9.592728614807129, "learning_rate": 1.2863272311212815e-05, "loss": 1.0155, "step": 3994 }, { "epoch": 1.14, "grad_norm": 10.578533172607422, "learning_rate": 1.2858981693363845e-05, "loss": 0.7963, "step": 3995 }, { "epoch": 1.14, "grad_norm": 9.958884239196777, "learning_rate": 1.2854691075514874e-05, "loss": 0.9383, "step": 3996 }, { "epoch": 1.14, "grad_norm": 12.885809898376465, "learning_rate": 1.2850400457665903e-05, "loss": 0.9006, "step": 3997 }, { "epoch": 1.14, "grad_norm": 10.383514404296875, "learning_rate": 1.2846109839816935e-05, "loss": 0.8054, "step": 3998 }, { "epoch": 1.14, "grad_norm": 9.011198997497559, "learning_rate": 1.2841819221967964e-05, "loss": 0.7757, "step": 3999 }, { "epoch": 1.14, "grad_norm": 10.90754222869873, "learning_rate": 1.2837528604118994e-05, "loss": 0.667, "step": 4000 }, { "epoch": 1.14, "grad_norm": 8.191333770751953, "learning_rate": 1.2833237986270023e-05, "loss": 0.7656, "step": 4001 }, { "epoch": 1.14, "grad_norm": 11.077473640441895, "learning_rate": 1.2828947368421053e-05, "loss": 0.7835, "step": 4002 }, { "epoch": 1.15, "grad_norm": 9.449070930480957, "learning_rate": 1.2824656750572082e-05, "loss": 0.7441, "step": 4003 }, { "epoch": 1.15, "grad_norm": 11.44480037689209, "learning_rate": 1.2820366132723113e-05, "loss": 0.7525, "step": 4004 }, { "epoch": 1.15, "grad_norm": 9.595183372497559, "learning_rate": 1.2816075514874141e-05, "loss": 0.7875, "step": 4005 }, { "epoch": 1.15, "grad_norm": 9.679591178894043, "learning_rate": 1.2811784897025173e-05, "loss": 1.0367, "step": 4006 }, { "epoch": 1.15, "grad_norm": 11.832993507385254, "learning_rate": 1.2807494279176202e-05, "loss": 0.7952, "step": 4007 }, { "epoch": 1.15, "grad_norm": 10.265448570251465, "learning_rate": 1.2803203661327233e-05, "loss": 0.8162, "step": 4008 }, { "epoch": 1.15, "grad_norm": 9.365416526794434, "learning_rate": 1.2798913043478261e-05, "loss": 0.8788, "step": 4009 }, { "epoch": 1.15, "grad_norm": 8.634854316711426, "learning_rate": 1.279462242562929e-05, "loss": 0.688, "step": 4010 }, { "epoch": 1.15, "grad_norm": 9.54733657836914, "learning_rate": 1.279033180778032e-05, "loss": 0.729, "step": 4011 }, { "epoch": 1.15, "grad_norm": 9.37033462524414, "learning_rate": 1.278604118993135e-05, "loss": 0.7131, "step": 4012 }, { "epoch": 1.15, "grad_norm": 11.617125511169434, "learning_rate": 1.278175057208238e-05, "loss": 0.951, "step": 4013 }, { "epoch": 1.15, "grad_norm": 11.0932035446167, "learning_rate": 1.2777459954233409e-05, "loss": 0.7694, "step": 4014 }, { "epoch": 1.15, "grad_norm": 11.520350456237793, "learning_rate": 1.277316933638444e-05, "loss": 1.0484, "step": 4015 }, { "epoch": 1.15, "grad_norm": 11.6951322555542, "learning_rate": 1.276887871853547e-05, "loss": 0.8313, "step": 4016 }, { "epoch": 1.15, "grad_norm": 10.18323802947998, "learning_rate": 1.27645881006865e-05, "loss": 0.7908, "step": 4017 }, { "epoch": 1.15, "grad_norm": 15.108692169189453, "learning_rate": 1.2760297482837529e-05, "loss": 1.0901, "step": 4018 }, { "epoch": 1.15, "grad_norm": 10.572484016418457, "learning_rate": 1.275600686498856e-05, "loss": 0.8613, "step": 4019 }, { "epoch": 1.15, "grad_norm": 9.657601356506348, "learning_rate": 1.2751716247139588e-05, "loss": 0.9282, "step": 4020 }, { "epoch": 1.15, "grad_norm": 11.38335132598877, "learning_rate": 1.2747425629290619e-05, "loss": 0.7404, "step": 4021 }, { "epoch": 1.15, "grad_norm": 8.653599739074707, "learning_rate": 1.2743135011441647e-05, "loss": 0.6312, "step": 4022 }, { "epoch": 1.15, "grad_norm": 11.255392074584961, "learning_rate": 1.2738844393592678e-05, "loss": 1.0732, "step": 4023 }, { "epoch": 1.15, "grad_norm": 10.099267959594727, "learning_rate": 1.2734553775743708e-05, "loss": 0.9457, "step": 4024 }, { "epoch": 1.15, "grad_norm": 9.576580047607422, "learning_rate": 1.2730263157894737e-05, "loss": 0.8892, "step": 4025 }, { "epoch": 1.15, "grad_norm": 11.603577613830566, "learning_rate": 1.2725972540045767e-05, "loss": 0.7765, "step": 4026 }, { "epoch": 1.15, "grad_norm": 10.528868675231934, "learning_rate": 1.2721681922196796e-05, "loss": 0.9207, "step": 4027 }, { "epoch": 1.15, "grad_norm": 12.098111152648926, "learning_rate": 1.2717391304347827e-05, "loss": 1.0071, "step": 4028 }, { "epoch": 1.15, "grad_norm": 13.240303039550781, "learning_rate": 1.2713100686498855e-05, "loss": 0.7886, "step": 4029 }, { "epoch": 1.15, "grad_norm": 9.420174598693848, "learning_rate": 1.2708810068649886e-05, "loss": 0.7186, "step": 4030 }, { "epoch": 1.15, "grad_norm": 11.674076080322266, "learning_rate": 1.2704519450800915e-05, "loss": 0.8748, "step": 4031 }, { "epoch": 1.15, "grad_norm": 11.141865730285645, "learning_rate": 1.2700228832951947e-05, "loss": 0.8274, "step": 4032 }, { "epoch": 1.15, "grad_norm": 10.610071182250977, "learning_rate": 1.2695938215102976e-05, "loss": 0.8465, "step": 4033 }, { "epoch": 1.15, "grad_norm": 10.964436531066895, "learning_rate": 1.2691647597254006e-05, "loss": 0.7164, "step": 4034 }, { "epoch": 1.15, "grad_norm": 10.76241397857666, "learning_rate": 1.2687356979405035e-05, "loss": 0.8646, "step": 4035 }, { "epoch": 1.15, "grad_norm": 12.814397811889648, "learning_rate": 1.2683066361556064e-05, "loss": 0.9439, "step": 4036 }, { "epoch": 1.15, "grad_norm": 10.622892379760742, "learning_rate": 1.2678775743707094e-05, "loss": 1.0646, "step": 4037 }, { "epoch": 1.16, "grad_norm": 12.848695755004883, "learning_rate": 1.2674485125858123e-05, "loss": 1.0784, "step": 4038 }, { "epoch": 1.16, "grad_norm": 9.831666946411133, "learning_rate": 1.2670194508009153e-05, "loss": 0.7179, "step": 4039 }, { "epoch": 1.16, "grad_norm": 8.572142601013184, "learning_rate": 1.2665903890160182e-05, "loss": 0.5997, "step": 4040 }, { "epoch": 1.16, "grad_norm": 10.928911209106445, "learning_rate": 1.2661613272311214e-05, "loss": 1.0008, "step": 4041 }, { "epoch": 1.16, "grad_norm": 9.064866065979004, "learning_rate": 1.2657322654462243e-05, "loss": 0.5705, "step": 4042 }, { "epoch": 1.16, "grad_norm": 10.2470121383667, "learning_rate": 1.2653032036613273e-05, "loss": 1.0596, "step": 4043 }, { "epoch": 1.16, "grad_norm": 10.076916694641113, "learning_rate": 1.2648741418764302e-05, "loss": 0.7603, "step": 4044 }, { "epoch": 1.16, "grad_norm": 10.133118629455566, "learning_rate": 1.2644450800915333e-05, "loss": 0.9168, "step": 4045 }, { "epoch": 1.16, "grad_norm": 11.268158912658691, "learning_rate": 1.2640160183066361e-05, "loss": 0.8666, "step": 4046 }, { "epoch": 1.16, "grad_norm": 9.203883171081543, "learning_rate": 1.2635869565217392e-05, "loss": 0.6609, "step": 4047 }, { "epoch": 1.16, "grad_norm": 12.575746536254883, "learning_rate": 1.263157894736842e-05, "loss": 0.7837, "step": 4048 }, { "epoch": 1.16, "grad_norm": 11.600099563598633, "learning_rate": 1.2627288329519451e-05, "loss": 0.9264, "step": 4049 }, { "epoch": 1.16, "grad_norm": 10.319470405578613, "learning_rate": 1.2622997711670482e-05, "loss": 0.602, "step": 4050 }, { "epoch": 1.16, "grad_norm": 10.774476051330566, "learning_rate": 1.261870709382151e-05, "loss": 0.7665, "step": 4051 }, { "epoch": 1.16, "grad_norm": 10.500500679016113, "learning_rate": 1.261441647597254e-05, "loss": 0.9282, "step": 4052 }, { "epoch": 1.16, "grad_norm": 9.612297058105469, "learning_rate": 1.261012585812357e-05, "loss": 0.8067, "step": 4053 }, { "epoch": 1.16, "grad_norm": 9.459089279174805, "learning_rate": 1.26058352402746e-05, "loss": 0.6766, "step": 4054 }, { "epoch": 1.16, "grad_norm": 12.293682098388672, "learning_rate": 1.2601544622425629e-05, "loss": 0.8211, "step": 4055 }, { "epoch": 1.16, "grad_norm": 8.434786796569824, "learning_rate": 1.259725400457666e-05, "loss": 0.6012, "step": 4056 }, { "epoch": 1.16, "grad_norm": 11.257843017578125, "learning_rate": 1.2592963386727688e-05, "loss": 0.7561, "step": 4057 }, { "epoch": 1.16, "grad_norm": 11.848149299621582, "learning_rate": 1.258867276887872e-05, "loss": 0.8905, "step": 4058 }, { "epoch": 1.16, "grad_norm": 8.863398551940918, "learning_rate": 1.2584382151029749e-05, "loss": 0.8206, "step": 4059 }, { "epoch": 1.16, "grad_norm": 7.729502201080322, "learning_rate": 1.258009153318078e-05, "loss": 0.6038, "step": 4060 }, { "epoch": 1.16, "grad_norm": 10.019165992736816, "learning_rate": 1.2575800915331808e-05, "loss": 0.7282, "step": 4061 }, { "epoch": 1.16, "grad_norm": 8.591156959533691, "learning_rate": 1.2571510297482837e-05, "loss": 0.7666, "step": 4062 }, { "epoch": 1.16, "grad_norm": 8.982990264892578, "learning_rate": 1.2567219679633867e-05, "loss": 0.8294, "step": 4063 }, { "epoch": 1.16, "grad_norm": 9.198776245117188, "learning_rate": 1.2562929061784896e-05, "loss": 0.8153, "step": 4064 }, { "epoch": 1.16, "grad_norm": 7.7439422607421875, "learning_rate": 1.2558638443935927e-05, "loss": 0.65, "step": 4065 }, { "epoch": 1.16, "grad_norm": 8.913726806640625, "learning_rate": 1.2554347826086957e-05, "loss": 0.6944, "step": 4066 }, { "epoch": 1.16, "grad_norm": 9.340245246887207, "learning_rate": 1.2550057208237988e-05, "loss": 0.7589, "step": 4067 }, { "epoch": 1.16, "grad_norm": 11.276741027832031, "learning_rate": 1.2545766590389016e-05, "loss": 0.862, "step": 4068 }, { "epoch": 1.16, "grad_norm": 10.607254028320312, "learning_rate": 1.2541475972540047e-05, "loss": 0.9057, "step": 4069 }, { "epoch": 1.16, "grad_norm": 12.084604263305664, "learning_rate": 1.2537185354691076e-05, "loss": 0.7923, "step": 4070 }, { "epoch": 1.16, "grad_norm": 12.961553573608398, "learning_rate": 1.2532894736842106e-05, "loss": 0.8188, "step": 4071 }, { "epoch": 1.16, "grad_norm": 9.144914627075195, "learning_rate": 1.2528604118993135e-05, "loss": 0.5782, "step": 4072 }, { "epoch": 1.17, "grad_norm": 8.926443099975586, "learning_rate": 1.2524313501144165e-05, "loss": 0.8521, "step": 4073 }, { "epoch": 1.17, "grad_norm": 7.8484344482421875, "learning_rate": 1.2520022883295194e-05, "loss": 0.5207, "step": 4074 }, { "epoch": 1.17, "grad_norm": 12.259339332580566, "learning_rate": 1.2515732265446224e-05, "loss": 1.2073, "step": 4075 }, { "epoch": 1.17, "grad_norm": 10.956076622009277, "learning_rate": 1.2511441647597255e-05, "loss": 0.9287, "step": 4076 }, { "epoch": 1.17, "grad_norm": 11.137741088867188, "learning_rate": 1.2507151029748284e-05, "loss": 0.9011, "step": 4077 }, { "epoch": 1.17, "grad_norm": 11.437335014343262, "learning_rate": 1.2502860411899314e-05, "loss": 0.7987, "step": 4078 }, { "epoch": 1.17, "grad_norm": 9.757527351379395, "learning_rate": 1.2498569794050343e-05, "loss": 0.835, "step": 4079 }, { "epoch": 1.17, "grad_norm": 10.70904541015625, "learning_rate": 1.2494279176201373e-05, "loss": 0.9434, "step": 4080 }, { "epoch": 1.17, "grad_norm": 9.849149703979492, "learning_rate": 1.2489988558352402e-05, "loss": 0.8466, "step": 4081 }, { "epoch": 1.17, "grad_norm": 9.947101593017578, "learning_rate": 1.2485697940503433e-05, "loss": 0.8026, "step": 4082 }, { "epoch": 1.17, "grad_norm": 9.381686210632324, "learning_rate": 1.2481407322654463e-05, "loss": 0.5783, "step": 4083 }, { "epoch": 1.17, "grad_norm": 12.77807331085205, "learning_rate": 1.2477116704805494e-05, "loss": 0.7914, "step": 4084 }, { "epoch": 1.17, "grad_norm": 13.345623970031738, "learning_rate": 1.2472826086956522e-05, "loss": 0.7986, "step": 4085 }, { "epoch": 1.17, "grad_norm": 9.868369102478027, "learning_rate": 1.2468535469107553e-05, "loss": 0.825, "step": 4086 }, { "epoch": 1.17, "grad_norm": 9.543618202209473, "learning_rate": 1.2464244851258582e-05, "loss": 0.794, "step": 4087 }, { "epoch": 1.17, "grad_norm": 8.939046859741211, "learning_rate": 1.245995423340961e-05, "loss": 0.6565, "step": 4088 }, { "epoch": 1.17, "grad_norm": 9.811325073242188, "learning_rate": 1.245566361556064e-05, "loss": 0.7062, "step": 4089 }, { "epoch": 1.17, "grad_norm": 10.338138580322266, "learning_rate": 1.245137299771167e-05, "loss": 0.773, "step": 4090 }, { "epoch": 1.17, "grad_norm": 10.663650512695312, "learning_rate": 1.24470823798627e-05, "loss": 0.7764, "step": 4091 }, { "epoch": 1.17, "grad_norm": 10.51228141784668, "learning_rate": 1.244279176201373e-05, "loss": 0.7541, "step": 4092 }, { "epoch": 1.17, "grad_norm": 11.164430618286133, "learning_rate": 1.2438501144164761e-05, "loss": 0.8271, "step": 4093 }, { "epoch": 1.17, "grad_norm": 7.714865207672119, "learning_rate": 1.243421052631579e-05, "loss": 0.6045, "step": 4094 }, { "epoch": 1.17, "grad_norm": 10.589447021484375, "learning_rate": 1.242991990846682e-05, "loss": 0.7164, "step": 4095 }, { "epoch": 1.17, "grad_norm": 11.344781875610352, "learning_rate": 1.2425629290617849e-05, "loss": 0.7694, "step": 4096 }, { "epoch": 1.17, "grad_norm": 13.170841217041016, "learning_rate": 1.242133867276888e-05, "loss": 0.9305, "step": 4097 }, { "epoch": 1.17, "grad_norm": 9.428088188171387, "learning_rate": 1.2417048054919908e-05, "loss": 0.8694, "step": 4098 }, { "epoch": 1.17, "grad_norm": 11.26711654663086, "learning_rate": 1.2412757437070939e-05, "loss": 0.8744, "step": 4099 }, { "epoch": 1.17, "grad_norm": 11.067695617675781, "learning_rate": 1.2408466819221967e-05, "loss": 0.8843, "step": 4100 }, { "epoch": 1.17, "grad_norm": 14.268261909484863, "learning_rate": 1.2404176201372998e-05, "loss": 0.8601, "step": 4101 }, { "epoch": 1.17, "grad_norm": 9.639678001403809, "learning_rate": 1.2399885583524028e-05, "loss": 0.7101, "step": 4102 }, { "epoch": 1.17, "grad_norm": 8.908231735229492, "learning_rate": 1.2395594965675057e-05, "loss": 0.7455, "step": 4103 }, { "epoch": 1.17, "grad_norm": 11.500310897827148, "learning_rate": 1.2391304347826088e-05, "loss": 0.9985, "step": 4104 }, { "epoch": 1.17, "grad_norm": 10.697529792785645, "learning_rate": 1.2387013729977116e-05, "loss": 0.6407, "step": 4105 }, { "epoch": 1.17, "grad_norm": 7.432031154632568, "learning_rate": 1.2382723112128147e-05, "loss": 0.6257, "step": 4106 }, { "epoch": 1.17, "grad_norm": 11.1350679397583, "learning_rate": 1.2378432494279176e-05, "loss": 0.7523, "step": 4107 }, { "epoch": 1.18, "grad_norm": 10.981945037841797, "learning_rate": 1.2374141876430206e-05, "loss": 1.1006, "step": 4108 }, { "epoch": 1.18, "grad_norm": 8.67753791809082, "learning_rate": 1.2369851258581236e-05, "loss": 0.573, "step": 4109 }, { "epoch": 1.18, "grad_norm": 9.949068069458008, "learning_rate": 1.2365560640732267e-05, "loss": 0.9602, "step": 4110 }, { "epoch": 1.18, "grad_norm": 12.82435417175293, "learning_rate": 1.2361270022883296e-05, "loss": 1.0123, "step": 4111 }, { "epoch": 1.18, "grad_norm": 10.570842742919922, "learning_rate": 1.2356979405034326e-05, "loss": 0.735, "step": 4112 }, { "epoch": 1.18, "grad_norm": 10.396074295043945, "learning_rate": 1.2352688787185355e-05, "loss": 0.7812, "step": 4113 }, { "epoch": 1.18, "grad_norm": 10.221151351928711, "learning_rate": 1.2348398169336384e-05, "loss": 0.732, "step": 4114 }, { "epoch": 1.18, "grad_norm": 10.070449829101562, "learning_rate": 1.2344107551487414e-05, "loss": 0.8596, "step": 4115 }, { "epoch": 1.18, "grad_norm": 9.374528884887695, "learning_rate": 1.2339816933638443e-05, "loss": 0.8476, "step": 4116 }, { "epoch": 1.18, "grad_norm": 9.961380958557129, "learning_rate": 1.2335526315789473e-05, "loss": 0.8881, "step": 4117 }, { "epoch": 1.18, "grad_norm": 8.791508674621582, "learning_rate": 1.2331235697940504e-05, "loss": 0.7018, "step": 4118 }, { "epoch": 1.18, "grad_norm": 9.854459762573242, "learning_rate": 1.2326945080091534e-05, "loss": 0.7528, "step": 4119 }, { "epoch": 1.18, "grad_norm": 11.351456642150879, "learning_rate": 1.2322654462242563e-05, "loss": 0.5635, "step": 4120 }, { "epoch": 1.18, "grad_norm": 7.799431324005127, "learning_rate": 1.2318363844393594e-05, "loss": 0.5938, "step": 4121 }, { "epoch": 1.18, "grad_norm": 12.781096458435059, "learning_rate": 1.2314073226544622e-05, "loss": 0.9506, "step": 4122 }, { "epoch": 1.18, "grad_norm": 9.58182430267334, "learning_rate": 1.2309782608695653e-05, "loss": 0.756, "step": 4123 }, { "epoch": 1.18, "grad_norm": 10.1422119140625, "learning_rate": 1.2305491990846682e-05, "loss": 1.0917, "step": 4124 }, { "epoch": 1.18, "grad_norm": 8.120306015014648, "learning_rate": 1.2301201372997712e-05, "loss": 0.8838, "step": 4125 }, { "epoch": 1.18, "grad_norm": 9.112457275390625, "learning_rate": 1.2296910755148742e-05, "loss": 0.805, "step": 4126 }, { "epoch": 1.18, "grad_norm": 10.73107624053955, "learning_rate": 1.2292620137299771e-05, "loss": 1.0341, "step": 4127 }, { "epoch": 1.18, "grad_norm": 9.700551986694336, "learning_rate": 1.2288329519450802e-05, "loss": 0.9188, "step": 4128 }, { "epoch": 1.18, "grad_norm": 15.175979614257812, "learning_rate": 1.228403890160183e-05, "loss": 1.0461, "step": 4129 }, { "epoch": 1.18, "grad_norm": 8.576842308044434, "learning_rate": 1.2279748283752861e-05, "loss": 0.8233, "step": 4130 }, { "epoch": 1.18, "grad_norm": 9.577347755432129, "learning_rate": 1.227545766590389e-05, "loss": 0.7652, "step": 4131 }, { "epoch": 1.18, "grad_norm": 12.247037887573242, "learning_rate": 1.227116704805492e-05, "loss": 0.9427, "step": 4132 }, { "epoch": 1.18, "grad_norm": 9.787284851074219, "learning_rate": 1.2266876430205949e-05, "loss": 0.8011, "step": 4133 }, { "epoch": 1.18, "grad_norm": 10.73867416381836, "learning_rate": 1.226258581235698e-05, "loss": 0.5968, "step": 4134 }, { "epoch": 1.18, "grad_norm": 8.938671112060547, "learning_rate": 1.225829519450801e-05, "loss": 0.8767, "step": 4135 }, { "epoch": 1.18, "grad_norm": 10.706457138061523, "learning_rate": 1.225400457665904e-05, "loss": 0.8142, "step": 4136 }, { "epoch": 1.18, "grad_norm": 11.404316902160645, "learning_rate": 1.2249713958810069e-05, "loss": 0.8691, "step": 4137 }, { "epoch": 1.18, "grad_norm": 9.415704727172852, "learning_rate": 1.22454233409611e-05, "loss": 0.7746, "step": 4138 }, { "epoch": 1.18, "grad_norm": 10.434395790100098, "learning_rate": 1.2241132723112128e-05, "loss": 0.9429, "step": 4139 }, { "epoch": 1.18, "grad_norm": 8.272794723510742, "learning_rate": 1.2236842105263159e-05, "loss": 0.646, "step": 4140 }, { "epoch": 1.18, "grad_norm": 9.81921100616455, "learning_rate": 1.2232551487414188e-05, "loss": 0.8917, "step": 4141 }, { "epoch": 1.18, "grad_norm": 9.18488597869873, "learning_rate": 1.2228260869565216e-05, "loss": 0.759, "step": 4142 }, { "epoch": 1.19, "grad_norm": 9.562060356140137, "learning_rate": 1.2223970251716248e-05, "loss": 0.8422, "step": 4143 }, { "epoch": 1.19, "grad_norm": 7.701141834259033, "learning_rate": 1.2219679633867277e-05, "loss": 0.5721, "step": 4144 }, { "epoch": 1.19, "grad_norm": 11.071125030517578, "learning_rate": 1.2215389016018308e-05, "loss": 0.6635, "step": 4145 }, { "epoch": 1.19, "grad_norm": 12.54422664642334, "learning_rate": 1.2211098398169336e-05, "loss": 0.8234, "step": 4146 }, { "epoch": 1.19, "grad_norm": 11.236760139465332, "learning_rate": 1.2206807780320367e-05, "loss": 0.8189, "step": 4147 }, { "epoch": 1.19, "grad_norm": 10.954216003417969, "learning_rate": 1.2202517162471396e-05, "loss": 0.8914, "step": 4148 }, { "epoch": 1.19, "grad_norm": 10.158648490905762, "learning_rate": 1.2198226544622426e-05, "loss": 0.7184, "step": 4149 }, { "epoch": 1.19, "grad_norm": 11.025486946105957, "learning_rate": 1.2193935926773455e-05, "loss": 0.7505, "step": 4150 }, { "epoch": 1.19, "grad_norm": 10.360795974731445, "learning_rate": 1.2189645308924485e-05, "loss": 0.7453, "step": 4151 }, { "epoch": 1.19, "grad_norm": 10.180240631103516, "learning_rate": 1.2185354691075516e-05, "loss": 0.8649, "step": 4152 }, { "epoch": 1.19, "grad_norm": 11.01577091217041, "learning_rate": 1.2181064073226545e-05, "loss": 0.9037, "step": 4153 }, { "epoch": 1.19, "grad_norm": 9.540740013122559, "learning_rate": 1.2176773455377575e-05, "loss": 0.7046, "step": 4154 }, { "epoch": 1.19, "grad_norm": 9.411883354187012, "learning_rate": 1.2172482837528604e-05, "loss": 0.8421, "step": 4155 }, { "epoch": 1.19, "grad_norm": 12.0674467086792, "learning_rate": 1.2168192219679634e-05, "loss": 0.6131, "step": 4156 }, { "epoch": 1.19, "grad_norm": 10.145977020263672, "learning_rate": 1.2163901601830663e-05, "loss": 0.932, "step": 4157 }, { "epoch": 1.19, "grad_norm": 7.649431228637695, "learning_rate": 1.2159610983981694e-05, "loss": 0.5755, "step": 4158 }, { "epoch": 1.19, "grad_norm": 10.664711952209473, "learning_rate": 1.2155320366132722e-05, "loss": 0.8075, "step": 4159 }, { "epoch": 1.19, "grad_norm": 11.5469970703125, "learning_rate": 1.2151029748283753e-05, "loss": 0.8425, "step": 4160 }, { "epoch": 1.19, "grad_norm": 9.75073528289795, "learning_rate": 1.2146739130434783e-05, "loss": 0.6543, "step": 4161 }, { "epoch": 1.19, "grad_norm": 9.785978317260742, "learning_rate": 1.2142448512585814e-05, "loss": 0.6521, "step": 4162 }, { "epoch": 1.19, "grad_norm": 10.929224014282227, "learning_rate": 1.2138157894736842e-05, "loss": 0.9284, "step": 4163 }, { "epoch": 1.19, "grad_norm": 7.091762542724609, "learning_rate": 1.2133867276887873e-05, "loss": 0.4471, "step": 4164 }, { "epoch": 1.19, "grad_norm": 11.026457786560059, "learning_rate": 1.2129576659038902e-05, "loss": 0.8445, "step": 4165 }, { "epoch": 1.19, "grad_norm": 13.157074928283691, "learning_rate": 1.2125286041189932e-05, "loss": 0.8575, "step": 4166 }, { "epoch": 1.19, "grad_norm": 9.50782299041748, "learning_rate": 1.2120995423340961e-05, "loss": 0.614, "step": 4167 }, { "epoch": 1.19, "grad_norm": 12.477818489074707, "learning_rate": 1.211670480549199e-05, "loss": 0.8306, "step": 4168 }, { "epoch": 1.19, "grad_norm": 12.342202186584473, "learning_rate": 1.2112414187643022e-05, "loss": 0.6937, "step": 4169 }, { "epoch": 1.19, "grad_norm": 8.733111381530762, "learning_rate": 1.210812356979405e-05, "loss": 0.6343, "step": 4170 }, { "epoch": 1.19, "grad_norm": 11.11361312866211, "learning_rate": 1.2103832951945081e-05, "loss": 0.9674, "step": 4171 }, { "epoch": 1.19, "grad_norm": 10.384431838989258, "learning_rate": 1.209954233409611e-05, "loss": 0.7703, "step": 4172 }, { "epoch": 1.19, "grad_norm": 10.025187492370605, "learning_rate": 1.209525171624714e-05, "loss": 0.7246, "step": 4173 }, { "epoch": 1.19, "grad_norm": 9.30639934539795, "learning_rate": 1.2090961098398169e-05, "loss": 0.713, "step": 4174 }, { "epoch": 1.19, "grad_norm": 10.196172714233398, "learning_rate": 1.20866704805492e-05, "loss": 0.8763, "step": 4175 }, { "epoch": 1.19, "grad_norm": 12.444104194641113, "learning_rate": 1.2082379862700228e-05, "loss": 0.971, "step": 4176 }, { "epoch": 1.19, "grad_norm": 10.587760925292969, "learning_rate": 1.2078089244851259e-05, "loss": 0.7235, "step": 4177 }, { "epoch": 1.2, "grad_norm": 7.7796478271484375, "learning_rate": 1.207379862700229e-05, "loss": 0.6905, "step": 4178 }, { "epoch": 1.2, "grad_norm": 10.688580513000488, "learning_rate": 1.206950800915332e-05, "loss": 1.1034, "step": 4179 }, { "epoch": 1.2, "grad_norm": 9.650777816772461, "learning_rate": 1.2065217391304348e-05, "loss": 0.7908, "step": 4180 }, { "epoch": 1.2, "grad_norm": 9.831267356872559, "learning_rate": 1.2060926773455377e-05, "loss": 0.7006, "step": 4181 }, { "epoch": 1.2, "grad_norm": 11.56423282623291, "learning_rate": 1.2056636155606408e-05, "loss": 0.7525, "step": 4182 }, { "epoch": 1.2, "grad_norm": 9.822474479675293, "learning_rate": 1.2052345537757436e-05, "loss": 1.1696, "step": 4183 }, { "epoch": 1.2, "grad_norm": 8.155306816101074, "learning_rate": 1.2048054919908467e-05, "loss": 0.6272, "step": 4184 }, { "epoch": 1.2, "grad_norm": 9.39324951171875, "learning_rate": 1.2043764302059496e-05, "loss": 0.7688, "step": 4185 }, { "epoch": 1.2, "grad_norm": 11.637955665588379, "learning_rate": 1.2039473684210528e-05, "loss": 1.0111, "step": 4186 }, { "epoch": 1.2, "grad_norm": 13.19721794128418, "learning_rate": 1.2035183066361557e-05, "loss": 0.9972, "step": 4187 }, { "epoch": 1.2, "grad_norm": 9.880034446716309, "learning_rate": 1.2030892448512587e-05, "loss": 0.7787, "step": 4188 }, { "epoch": 1.2, "grad_norm": 9.659402847290039, "learning_rate": 1.2026601830663616e-05, "loss": 0.71, "step": 4189 }, { "epoch": 1.2, "grad_norm": 7.872838973999023, "learning_rate": 1.2022311212814646e-05, "loss": 0.7305, "step": 4190 }, { "epoch": 1.2, "grad_norm": 8.332547187805176, "learning_rate": 1.2018020594965675e-05, "loss": 1.1115, "step": 4191 }, { "epoch": 1.2, "grad_norm": 8.256744384765625, "learning_rate": 1.2013729977116706e-05, "loss": 0.902, "step": 4192 }, { "epoch": 1.2, "grad_norm": 8.371477127075195, "learning_rate": 1.2009439359267734e-05, "loss": 0.875, "step": 4193 }, { "epoch": 1.2, "grad_norm": 10.548712730407715, "learning_rate": 1.2005148741418763e-05, "loss": 0.8353, "step": 4194 }, { "epoch": 1.2, "grad_norm": 11.617545127868652, "learning_rate": 1.2000858123569795e-05, "loss": 1.303, "step": 4195 }, { "epoch": 1.2, "grad_norm": 9.448934555053711, "learning_rate": 1.1996567505720824e-05, "loss": 0.8609, "step": 4196 }, { "epoch": 1.2, "grad_norm": 9.004947662353516, "learning_rate": 1.1992276887871854e-05, "loss": 0.7202, "step": 4197 }, { "epoch": 1.2, "grad_norm": 8.329684257507324, "learning_rate": 1.1987986270022883e-05, "loss": 0.863, "step": 4198 }, { "epoch": 1.2, "grad_norm": 10.899600982666016, "learning_rate": 1.1983695652173914e-05, "loss": 0.9036, "step": 4199 }, { "epoch": 1.2, "grad_norm": 7.325217247009277, "learning_rate": 1.1979405034324942e-05, "loss": 0.601, "step": 4200 }, { "epoch": 1.2, "grad_norm": 9.30616283416748, "learning_rate": 1.1975114416475973e-05, "loss": 0.9284, "step": 4201 }, { "epoch": 1.2, "grad_norm": 10.972339630126953, "learning_rate": 1.1970823798627002e-05, "loss": 0.9204, "step": 4202 }, { "epoch": 1.2, "grad_norm": 11.110115051269531, "learning_rate": 1.1966533180778034e-05, "loss": 0.6842, "step": 4203 }, { "epoch": 1.2, "grad_norm": 12.855428695678711, "learning_rate": 1.1962242562929063e-05, "loss": 0.8472, "step": 4204 }, { "epoch": 1.2, "grad_norm": 8.137256622314453, "learning_rate": 1.1957951945080093e-05, "loss": 0.5689, "step": 4205 }, { "epoch": 1.2, "grad_norm": 9.706517219543457, "learning_rate": 1.1953661327231122e-05, "loss": 0.7754, "step": 4206 }, { "epoch": 1.2, "grad_norm": 9.710511207580566, "learning_rate": 1.194937070938215e-05, "loss": 0.9112, "step": 4207 }, { "epoch": 1.2, "grad_norm": 11.372818946838379, "learning_rate": 1.1945080091533181e-05, "loss": 0.9699, "step": 4208 }, { "epoch": 1.2, "grad_norm": 10.132120132446289, "learning_rate": 1.194078947368421e-05, "loss": 0.641, "step": 4209 }, { "epoch": 1.2, "grad_norm": 10.642168998718262, "learning_rate": 1.193649885583524e-05, "loss": 0.776, "step": 4210 }, { "epoch": 1.2, "grad_norm": 12.64185905456543, "learning_rate": 1.1932208237986269e-05, "loss": 0.7297, "step": 4211 }, { "epoch": 1.2, "grad_norm": 13.01512622833252, "learning_rate": 1.1927917620137301e-05, "loss": 1.3426, "step": 4212 }, { "epoch": 1.21, "grad_norm": 10.130009651184082, "learning_rate": 1.192362700228833e-05, "loss": 0.6614, "step": 4213 }, { "epoch": 1.21, "grad_norm": 8.871742248535156, "learning_rate": 1.191933638443936e-05, "loss": 0.5409, "step": 4214 }, { "epoch": 1.21, "grad_norm": 10.790398597717285, "learning_rate": 1.191504576659039e-05, "loss": 0.7948, "step": 4215 }, { "epoch": 1.21, "grad_norm": 10.162729263305664, "learning_rate": 1.191075514874142e-05, "loss": 0.7729, "step": 4216 }, { "epoch": 1.21, "grad_norm": 7.943366050720215, "learning_rate": 1.1906464530892448e-05, "loss": 0.7385, "step": 4217 }, { "epoch": 1.21, "grad_norm": 9.41102409362793, "learning_rate": 1.1902173913043479e-05, "loss": 0.6734, "step": 4218 }, { "epoch": 1.21, "grad_norm": 10.491739273071289, "learning_rate": 1.1897883295194508e-05, "loss": 0.6161, "step": 4219 }, { "epoch": 1.21, "grad_norm": 11.373291015625, "learning_rate": 1.1893592677345536e-05, "loss": 0.8434, "step": 4220 }, { "epoch": 1.21, "grad_norm": 8.466850280761719, "learning_rate": 1.1889302059496569e-05, "loss": 0.7244, "step": 4221 }, { "epoch": 1.21, "grad_norm": 13.56641960144043, "learning_rate": 1.1885011441647597e-05, "loss": 0.8512, "step": 4222 }, { "epoch": 1.21, "grad_norm": 12.589375495910645, "learning_rate": 1.1880720823798628e-05, "loss": 0.9807, "step": 4223 }, { "epoch": 1.21, "grad_norm": 9.531851768493652, "learning_rate": 1.1876430205949657e-05, "loss": 0.6452, "step": 4224 }, { "epoch": 1.21, "grad_norm": 11.992056846618652, "learning_rate": 1.1872139588100687e-05, "loss": 0.8185, "step": 4225 }, { "epoch": 1.21, "grad_norm": 9.233867645263672, "learning_rate": 1.1867848970251716e-05, "loss": 0.802, "step": 4226 }, { "epoch": 1.21, "grad_norm": 10.06538200378418, "learning_rate": 1.1863558352402746e-05, "loss": 0.6751, "step": 4227 }, { "epoch": 1.21, "grad_norm": 11.67239761352539, "learning_rate": 1.1859267734553775e-05, "loss": 0.7702, "step": 4228 }, { "epoch": 1.21, "grad_norm": 11.357415199279785, "learning_rate": 1.1854977116704807e-05, "loss": 0.8018, "step": 4229 }, { "epoch": 1.21, "grad_norm": 9.972132682800293, "learning_rate": 1.1850686498855836e-05, "loss": 0.6667, "step": 4230 }, { "epoch": 1.21, "grad_norm": 10.696549415588379, "learning_rate": 1.1846395881006866e-05, "loss": 0.8272, "step": 4231 }, { "epoch": 1.21, "grad_norm": 10.538824081420898, "learning_rate": 1.1842105263157895e-05, "loss": 0.7871, "step": 4232 }, { "epoch": 1.21, "grad_norm": 10.483213424682617, "learning_rate": 1.1837814645308924e-05, "loss": 0.6551, "step": 4233 }, { "epoch": 1.21, "grad_norm": 10.283119201660156, "learning_rate": 1.1833524027459954e-05, "loss": 0.7317, "step": 4234 }, { "epoch": 1.21, "grad_norm": 10.398222923278809, "learning_rate": 1.1829233409610983e-05, "loss": 0.7929, "step": 4235 }, { "epoch": 1.21, "grad_norm": 10.281462669372559, "learning_rate": 1.1824942791762014e-05, "loss": 0.7468, "step": 4236 }, { "epoch": 1.21, "grad_norm": 8.925246238708496, "learning_rate": 1.1820652173913042e-05, "loss": 0.5831, "step": 4237 }, { "epoch": 1.21, "grad_norm": 9.566448211669922, "learning_rate": 1.1816361556064075e-05, "loss": 0.7537, "step": 4238 }, { "epoch": 1.21, "grad_norm": 13.305020332336426, "learning_rate": 1.1812070938215103e-05, "loss": 0.7581, "step": 4239 }, { "epoch": 1.21, "grad_norm": 11.126168251037598, "learning_rate": 1.1807780320366134e-05, "loss": 0.5523, "step": 4240 }, { "epoch": 1.21, "grad_norm": 8.93124771118164, "learning_rate": 1.1803489702517163e-05, "loss": 0.5531, "step": 4241 }, { "epoch": 1.21, "grad_norm": 10.155959129333496, "learning_rate": 1.1799199084668193e-05, "loss": 0.9744, "step": 4242 }, { "epoch": 1.21, "grad_norm": 10.217401504516602, "learning_rate": 1.1794908466819222e-05, "loss": 0.9487, "step": 4243 }, { "epoch": 1.21, "grad_norm": 10.667862892150879, "learning_rate": 1.1790617848970252e-05, "loss": 0.7737, "step": 4244 }, { "epoch": 1.21, "grad_norm": 11.557846069335938, "learning_rate": 1.1786327231121281e-05, "loss": 1.1461, "step": 4245 }, { "epoch": 1.21, "grad_norm": 10.171220779418945, "learning_rate": 1.1782036613272312e-05, "loss": 0.8439, "step": 4246 }, { "epoch": 1.21, "grad_norm": 12.268500328063965, "learning_rate": 1.1777745995423342e-05, "loss": 0.8422, "step": 4247 }, { "epoch": 1.22, "grad_norm": 8.701354026794434, "learning_rate": 1.177345537757437e-05, "loss": 0.6734, "step": 4248 }, { "epoch": 1.22, "grad_norm": 10.68353271484375, "learning_rate": 1.1769164759725401e-05, "loss": 0.6657, "step": 4249 }, { "epoch": 1.22, "grad_norm": 12.258493423461914, "learning_rate": 1.176487414187643e-05, "loss": 1.1132, "step": 4250 }, { "epoch": 1.22, "grad_norm": 12.400599479675293, "learning_rate": 1.176058352402746e-05, "loss": 1.0322, "step": 4251 }, { "epoch": 1.22, "grad_norm": 10.623896598815918, "learning_rate": 1.175629290617849e-05, "loss": 0.8161, "step": 4252 }, { "epoch": 1.22, "grad_norm": 9.813971519470215, "learning_rate": 1.175200228832952e-05, "loss": 0.6358, "step": 4253 }, { "epoch": 1.22, "grad_norm": 8.067031860351562, "learning_rate": 1.1747711670480548e-05, "loss": 0.8396, "step": 4254 }, { "epoch": 1.22, "grad_norm": 9.504332542419434, "learning_rate": 1.174342105263158e-05, "loss": 0.8028, "step": 4255 }, { "epoch": 1.22, "grad_norm": 9.022904396057129, "learning_rate": 1.173913043478261e-05, "loss": 0.814, "step": 4256 }, { "epoch": 1.22, "grad_norm": 10.11242961883545, "learning_rate": 1.173483981693364e-05, "loss": 0.6788, "step": 4257 }, { "epoch": 1.22, "grad_norm": 10.316167831420898, "learning_rate": 1.1730549199084669e-05, "loss": 0.7584, "step": 4258 }, { "epoch": 1.22, "grad_norm": 10.539634704589844, "learning_rate": 1.1726258581235697e-05, "loss": 0.6962, "step": 4259 }, { "epoch": 1.22, "grad_norm": 6.796972751617432, "learning_rate": 1.1721967963386728e-05, "loss": 0.5701, "step": 4260 }, { "epoch": 1.22, "grad_norm": 10.749701499938965, "learning_rate": 1.1717677345537757e-05, "loss": 0.946, "step": 4261 }, { "epoch": 1.22, "grad_norm": 8.778101921081543, "learning_rate": 1.1713386727688787e-05, "loss": 0.8784, "step": 4262 }, { "epoch": 1.22, "grad_norm": 9.367880821228027, "learning_rate": 1.1709096109839818e-05, "loss": 0.8687, "step": 4263 }, { "epoch": 1.22, "grad_norm": 10.653621673583984, "learning_rate": 1.1704805491990848e-05, "loss": 1.0333, "step": 4264 }, { "epoch": 1.22, "grad_norm": 10.247994422912598, "learning_rate": 1.1700514874141877e-05, "loss": 0.8605, "step": 4265 }, { "epoch": 1.22, "grad_norm": 9.010552406311035, "learning_rate": 1.1696224256292907e-05, "loss": 0.8025, "step": 4266 }, { "epoch": 1.22, "grad_norm": 8.36340618133545, "learning_rate": 1.1691933638443936e-05, "loss": 0.6301, "step": 4267 }, { "epoch": 1.22, "grad_norm": 9.572992324829102, "learning_rate": 1.1687643020594966e-05, "loss": 0.6753, "step": 4268 }, { "epoch": 1.22, "grad_norm": 9.083409309387207, "learning_rate": 1.1683352402745995e-05, "loss": 0.7244, "step": 4269 }, { "epoch": 1.22, "grad_norm": 11.072820663452148, "learning_rate": 1.1679061784897026e-05, "loss": 0.6236, "step": 4270 }, { "epoch": 1.22, "grad_norm": 12.434370994567871, "learning_rate": 1.1674771167048054e-05, "loss": 0.9262, "step": 4271 }, { "epoch": 1.22, "grad_norm": 13.024633407592773, "learning_rate": 1.1670480549199085e-05, "loss": 0.748, "step": 4272 }, { "epoch": 1.22, "grad_norm": 7.6458659172058105, "learning_rate": 1.1666189931350115e-05, "loss": 0.7682, "step": 4273 }, { "epoch": 1.22, "grad_norm": 9.794417381286621, "learning_rate": 1.1661899313501144e-05, "loss": 0.7265, "step": 4274 }, { "epoch": 1.22, "grad_norm": 10.18636703491211, "learning_rate": 1.1657608695652175e-05, "loss": 0.7181, "step": 4275 }, { "epoch": 1.22, "grad_norm": 11.913507461547852, "learning_rate": 1.1653318077803203e-05, "loss": 0.7299, "step": 4276 }, { "epoch": 1.22, "grad_norm": 9.873882293701172, "learning_rate": 1.1649027459954234e-05, "loss": 0.8151, "step": 4277 }, { "epoch": 1.22, "grad_norm": 11.573633193969727, "learning_rate": 1.1644736842105263e-05, "loss": 1.019, "step": 4278 }, { "epoch": 1.22, "grad_norm": 9.217798233032227, "learning_rate": 1.1640446224256293e-05, "loss": 0.6498, "step": 4279 }, { "epoch": 1.22, "grad_norm": 9.954983711242676, "learning_rate": 1.1636155606407322e-05, "loss": 0.7433, "step": 4280 }, { "epoch": 1.22, "grad_norm": 11.062023162841797, "learning_rate": 1.1631864988558354e-05, "loss": 0.9304, "step": 4281 }, { "epoch": 1.22, "grad_norm": 9.869750022888184, "learning_rate": 1.1627574370709383e-05, "loss": 0.7542, "step": 4282 }, { "epoch": 1.23, "grad_norm": 11.585716247558594, "learning_rate": 1.1623283752860413e-05, "loss": 0.8442, "step": 4283 }, { "epoch": 1.23, "grad_norm": 8.067532539367676, "learning_rate": 1.1618993135011442e-05, "loss": 0.6195, "step": 4284 }, { "epoch": 1.23, "grad_norm": 12.927903175354004, "learning_rate": 1.161470251716247e-05, "loss": 0.8351, "step": 4285 }, { "epoch": 1.23, "grad_norm": 9.903480529785156, "learning_rate": 1.1610411899313501e-05, "loss": 0.7832, "step": 4286 }, { "epoch": 1.23, "grad_norm": 9.87097454071045, "learning_rate": 1.160612128146453e-05, "loss": 1.1323, "step": 4287 }, { "epoch": 1.23, "grad_norm": 9.815967559814453, "learning_rate": 1.160183066361556e-05, "loss": 0.784, "step": 4288 }, { "epoch": 1.23, "grad_norm": 9.69139575958252, "learning_rate": 1.1597540045766591e-05, "loss": 0.7694, "step": 4289 }, { "epoch": 1.23, "grad_norm": 10.718193054199219, "learning_rate": 1.1593249427917621e-05, "loss": 0.9341, "step": 4290 }, { "epoch": 1.23, "grad_norm": 8.258038520812988, "learning_rate": 1.158895881006865e-05, "loss": 0.6007, "step": 4291 }, { "epoch": 1.23, "grad_norm": 7.3089823722839355, "learning_rate": 1.158466819221968e-05, "loss": 0.7345, "step": 4292 }, { "epoch": 1.23, "grad_norm": 7.901584625244141, "learning_rate": 1.158037757437071e-05, "loss": 0.7332, "step": 4293 }, { "epoch": 1.23, "grad_norm": 7.5747785568237305, "learning_rate": 1.157608695652174e-05, "loss": 0.6251, "step": 4294 }, { "epoch": 1.23, "grad_norm": 8.1260347366333, "learning_rate": 1.1571796338672769e-05, "loss": 0.7314, "step": 4295 }, { "epoch": 1.23, "grad_norm": 8.920598030090332, "learning_rate": 1.1567505720823799e-05, "loss": 0.6244, "step": 4296 }, { "epoch": 1.23, "grad_norm": 9.813675880432129, "learning_rate": 1.1563215102974828e-05, "loss": 0.8088, "step": 4297 }, { "epoch": 1.23, "grad_norm": 12.650906562805176, "learning_rate": 1.1558924485125858e-05, "loss": 0.9075, "step": 4298 }, { "epoch": 1.23, "grad_norm": 12.57913589477539, "learning_rate": 1.1554633867276889e-05, "loss": 0.9788, "step": 4299 }, { "epoch": 1.23, "grad_norm": 11.513996124267578, "learning_rate": 1.1550343249427917e-05, "loss": 0.9212, "step": 4300 }, { "epoch": 1.23, "grad_norm": 8.296212196350098, "learning_rate": 1.1546052631578948e-05, "loss": 0.7903, "step": 4301 }, { "epoch": 1.23, "grad_norm": 11.32043743133545, "learning_rate": 1.1541762013729977e-05, "loss": 0.8166, "step": 4302 }, { "epoch": 1.23, "grad_norm": 10.928311347961426, "learning_rate": 1.1537471395881007e-05, "loss": 1.1405, "step": 4303 }, { "epoch": 1.23, "grad_norm": 9.672231674194336, "learning_rate": 1.1533180778032036e-05, "loss": 0.8346, "step": 4304 }, { "epoch": 1.23, "grad_norm": 11.749388694763184, "learning_rate": 1.1528890160183066e-05, "loss": 0.8113, "step": 4305 }, { "epoch": 1.23, "grad_norm": 9.33349609375, "learning_rate": 1.1524599542334097e-05, "loss": 0.7047, "step": 4306 }, { "epoch": 1.23, "grad_norm": 10.378589630126953, "learning_rate": 1.1520308924485127e-05, "loss": 0.9374, "step": 4307 }, { "epoch": 1.23, "grad_norm": 12.265445709228516, "learning_rate": 1.1516018306636156e-05, "loss": 0.7387, "step": 4308 }, { "epoch": 1.23, "grad_norm": 9.64606761932373, "learning_rate": 1.1511727688787187e-05, "loss": 0.9858, "step": 4309 }, { "epoch": 1.23, "grad_norm": 10.293717384338379, "learning_rate": 1.1507437070938215e-05, "loss": 1.4012, "step": 4310 }, { "epoch": 1.23, "grad_norm": 10.404987335205078, "learning_rate": 1.1503146453089244e-05, "loss": 1.0246, "step": 4311 }, { "epoch": 1.23, "grad_norm": 6.542437553405762, "learning_rate": 1.1498855835240275e-05, "loss": 0.6179, "step": 4312 }, { "epoch": 1.23, "grad_norm": 10.147354125976562, "learning_rate": 1.1494565217391303e-05, "loss": 0.8771, "step": 4313 }, { "epoch": 1.23, "grad_norm": 9.722268104553223, "learning_rate": 1.1490274599542334e-05, "loss": 0.8938, "step": 4314 }, { "epoch": 1.23, "grad_norm": 8.905896186828613, "learning_rate": 1.1485983981693364e-05, "loss": 0.6041, "step": 4315 }, { "epoch": 1.23, "grad_norm": 11.898765563964844, "learning_rate": 1.1481693363844395e-05, "loss": 0.9426, "step": 4316 }, { "epoch": 1.23, "grad_norm": 13.410457611083984, "learning_rate": 1.1477402745995423e-05, "loss": 1.0473, "step": 4317 }, { "epoch": 1.24, "grad_norm": 9.72824478149414, "learning_rate": 1.1473112128146454e-05, "loss": 0.8856, "step": 4318 }, { "epoch": 1.24, "grad_norm": 10.007899284362793, "learning_rate": 1.1468821510297483e-05, "loss": 0.7087, "step": 4319 }, { "epoch": 1.24, "grad_norm": 12.402856826782227, "learning_rate": 1.1464530892448513e-05, "loss": 0.7871, "step": 4320 }, { "epoch": 1.24, "grad_norm": 9.839303970336914, "learning_rate": 1.1460240274599542e-05, "loss": 0.9303, "step": 4321 }, { "epoch": 1.24, "grad_norm": 10.47752857208252, "learning_rate": 1.1455949656750572e-05, "loss": 0.9723, "step": 4322 }, { "epoch": 1.24, "grad_norm": 10.317313194274902, "learning_rate": 1.1451659038901603e-05, "loss": 0.7985, "step": 4323 }, { "epoch": 1.24, "grad_norm": 9.262114524841309, "learning_rate": 1.1447368421052632e-05, "loss": 0.8299, "step": 4324 }, { "epoch": 1.24, "grad_norm": 7.015544414520264, "learning_rate": 1.1443077803203662e-05, "loss": 0.5288, "step": 4325 }, { "epoch": 1.24, "grad_norm": 8.641091346740723, "learning_rate": 1.1438787185354691e-05, "loss": 0.7868, "step": 4326 }, { "epoch": 1.24, "grad_norm": 10.731534957885742, "learning_rate": 1.1434496567505721e-05, "loss": 0.8198, "step": 4327 }, { "epoch": 1.24, "grad_norm": 12.458267211914062, "learning_rate": 1.143020594965675e-05, "loss": 0.9479, "step": 4328 }, { "epoch": 1.24, "grad_norm": 9.34188175201416, "learning_rate": 1.142591533180778e-05, "loss": 0.8897, "step": 4329 }, { "epoch": 1.24, "grad_norm": 10.190241813659668, "learning_rate": 1.142162471395881e-05, "loss": 0.5841, "step": 4330 }, { "epoch": 1.24, "grad_norm": 14.47514820098877, "learning_rate": 1.141733409610984e-05, "loss": 1.1723, "step": 4331 }, { "epoch": 1.24, "grad_norm": 11.83283519744873, "learning_rate": 1.141304347826087e-05, "loss": 0.8721, "step": 4332 }, { "epoch": 1.24, "grad_norm": 11.338597297668457, "learning_rate": 1.14087528604119e-05, "loss": 0.6735, "step": 4333 }, { "epoch": 1.24, "grad_norm": 9.52934741973877, "learning_rate": 1.140446224256293e-05, "loss": 0.7374, "step": 4334 }, { "epoch": 1.24, "grad_norm": 9.10802936553955, "learning_rate": 1.140017162471396e-05, "loss": 0.73, "step": 4335 }, { "epoch": 1.24, "grad_norm": 9.148085594177246, "learning_rate": 1.1395881006864989e-05, "loss": 0.5268, "step": 4336 }, { "epoch": 1.24, "grad_norm": 9.599064826965332, "learning_rate": 1.139159038901602e-05, "loss": 0.8209, "step": 4337 }, { "epoch": 1.24, "grad_norm": 10.743919372558594, "learning_rate": 1.1387299771167048e-05, "loss": 0.7522, "step": 4338 }, { "epoch": 1.24, "grad_norm": 11.263919830322266, "learning_rate": 1.1383009153318077e-05, "loss": 0.6177, "step": 4339 }, { "epoch": 1.24, "grad_norm": 10.132713317871094, "learning_rate": 1.1378718535469107e-05, "loss": 0.8611, "step": 4340 }, { "epoch": 1.24, "grad_norm": 10.401273727416992, "learning_rate": 1.1374427917620138e-05, "loss": 0.7318, "step": 4341 }, { "epoch": 1.24, "grad_norm": 10.272201538085938, "learning_rate": 1.1370137299771168e-05, "loss": 0.8656, "step": 4342 }, { "epoch": 1.24, "grad_norm": 9.658712387084961, "learning_rate": 1.1365846681922197e-05, "loss": 0.7594, "step": 4343 }, { "epoch": 1.24, "grad_norm": 10.106707572937012, "learning_rate": 1.1361556064073227e-05, "loss": 0.7132, "step": 4344 }, { "epoch": 1.24, "grad_norm": 9.241507530212402, "learning_rate": 1.1357265446224256e-05, "loss": 0.588, "step": 4345 }, { "epoch": 1.24, "grad_norm": 10.850719451904297, "learning_rate": 1.1352974828375287e-05, "loss": 0.6053, "step": 4346 }, { "epoch": 1.24, "grad_norm": 9.792867660522461, "learning_rate": 1.1348684210526315e-05, "loss": 0.8155, "step": 4347 }, { "epoch": 1.24, "grad_norm": 7.734493255615234, "learning_rate": 1.1344393592677346e-05, "loss": 0.6452, "step": 4348 }, { "epoch": 1.24, "grad_norm": 9.958784103393555, "learning_rate": 1.1340102974828376e-05, "loss": 0.734, "step": 4349 }, { "epoch": 1.24, "grad_norm": 9.508255958557129, "learning_rate": 1.1335812356979407e-05, "loss": 0.8112, "step": 4350 }, { "epoch": 1.24, "grad_norm": 9.391280174255371, "learning_rate": 1.1331521739130435e-05, "loss": 0.6283, "step": 4351 }, { "epoch": 1.24, "grad_norm": 9.672481536865234, "learning_rate": 1.1327231121281464e-05, "loss": 0.8086, "step": 4352 }, { "epoch": 1.25, "grad_norm": 10.470000267028809, "learning_rate": 1.1322940503432495e-05, "loss": 0.7252, "step": 4353 }, { "epoch": 1.25, "grad_norm": 11.01935863494873, "learning_rate": 1.1318649885583523e-05, "loss": 0.9348, "step": 4354 }, { "epoch": 1.25, "grad_norm": 9.3633451461792, "learning_rate": 1.1314359267734554e-05, "loss": 0.722, "step": 4355 }, { "epoch": 1.25, "grad_norm": 13.133733749389648, "learning_rate": 1.1310068649885583e-05, "loss": 0.6562, "step": 4356 }, { "epoch": 1.25, "grad_norm": 8.972723007202148, "learning_rate": 1.1305778032036613e-05, "loss": 0.8306, "step": 4357 }, { "epoch": 1.25, "grad_norm": 8.94922161102295, "learning_rate": 1.1301487414187644e-05, "loss": 0.9429, "step": 4358 }, { "epoch": 1.25, "grad_norm": 10.160902976989746, "learning_rate": 1.1297196796338674e-05, "loss": 0.7064, "step": 4359 }, { "epoch": 1.25, "grad_norm": 11.051900863647461, "learning_rate": 1.1292906178489703e-05, "loss": 0.7791, "step": 4360 }, { "epoch": 1.25, "grad_norm": 10.453458786010742, "learning_rate": 1.1288615560640733e-05, "loss": 0.6881, "step": 4361 }, { "epoch": 1.25, "grad_norm": 9.616094589233398, "learning_rate": 1.1284324942791762e-05, "loss": 0.7485, "step": 4362 }, { "epoch": 1.25, "grad_norm": 10.842020034790039, "learning_rate": 1.1280034324942793e-05, "loss": 0.8852, "step": 4363 }, { "epoch": 1.25, "grad_norm": 10.710325241088867, "learning_rate": 1.1275743707093821e-05, "loss": 1.0494, "step": 4364 }, { "epoch": 1.25, "grad_norm": 10.418954849243164, "learning_rate": 1.127145308924485e-05, "loss": 0.7913, "step": 4365 }, { "epoch": 1.25, "grad_norm": 10.215678215026855, "learning_rate": 1.1267162471395882e-05, "loss": 0.8389, "step": 4366 }, { "epoch": 1.25, "grad_norm": 11.755205154418945, "learning_rate": 1.1262871853546911e-05, "loss": 0.6641, "step": 4367 }, { "epoch": 1.25, "grad_norm": 8.971964836120605, "learning_rate": 1.1258581235697941e-05, "loss": 0.707, "step": 4368 }, { "epoch": 1.25, "grad_norm": 10.31049919128418, "learning_rate": 1.125429061784897e-05, "loss": 0.6869, "step": 4369 }, { "epoch": 1.25, "grad_norm": 9.721508979797363, "learning_rate": 1.125e-05, "loss": 0.9134, "step": 4370 }, { "epoch": 1.25, "grad_norm": 14.146642684936523, "learning_rate": 1.124570938215103e-05, "loss": 0.8786, "step": 4371 }, { "epoch": 1.25, "grad_norm": 10.55115795135498, "learning_rate": 1.124141876430206e-05, "loss": 0.8725, "step": 4372 }, { "epoch": 1.25, "grad_norm": 9.048384666442871, "learning_rate": 1.1237128146453089e-05, "loss": 0.7171, "step": 4373 }, { "epoch": 1.25, "grad_norm": 9.084287643432617, "learning_rate": 1.123283752860412e-05, "loss": 0.6334, "step": 4374 }, { "epoch": 1.25, "grad_norm": 10.687078475952148, "learning_rate": 1.122854691075515e-05, "loss": 0.798, "step": 4375 }, { "epoch": 1.25, "grad_norm": 10.394928932189941, "learning_rate": 1.122425629290618e-05, "loss": 0.823, "step": 4376 }, { "epoch": 1.25, "grad_norm": 11.122708320617676, "learning_rate": 1.1219965675057209e-05, "loss": 0.6799, "step": 4377 }, { "epoch": 1.25, "grad_norm": 11.894579887390137, "learning_rate": 1.1215675057208238e-05, "loss": 0.7922, "step": 4378 }, { "epoch": 1.25, "grad_norm": 10.686917304992676, "learning_rate": 1.1211384439359268e-05, "loss": 0.9087, "step": 4379 }, { "epoch": 1.25, "grad_norm": 11.511585235595703, "learning_rate": 1.1207093821510297e-05, "loss": 0.919, "step": 4380 }, { "epoch": 1.25, "grad_norm": 6.411844253540039, "learning_rate": 1.1202803203661327e-05, "loss": 0.449, "step": 4381 }, { "epoch": 1.25, "grad_norm": 9.558112144470215, "learning_rate": 1.1198512585812356e-05, "loss": 0.717, "step": 4382 }, { "epoch": 1.25, "grad_norm": 9.505937576293945, "learning_rate": 1.1194221967963388e-05, "loss": 0.7318, "step": 4383 }, { "epoch": 1.25, "grad_norm": 8.222380638122559, "learning_rate": 1.1189931350114417e-05, "loss": 0.7322, "step": 4384 }, { "epoch": 1.25, "grad_norm": 12.940840721130371, "learning_rate": 1.1185640732265447e-05, "loss": 0.8371, "step": 4385 }, { "epoch": 1.25, "grad_norm": 10.579774856567383, "learning_rate": 1.1181350114416476e-05, "loss": 0.8169, "step": 4386 }, { "epoch": 1.25, "grad_norm": 8.442181587219238, "learning_rate": 1.1177059496567507e-05, "loss": 0.6415, "step": 4387 }, { "epoch": 1.26, "grad_norm": 11.065545082092285, "learning_rate": 1.1172768878718535e-05, "loss": 0.8782, "step": 4388 }, { "epoch": 1.26, "grad_norm": 9.941202163696289, "learning_rate": 1.1168478260869566e-05, "loss": 0.5738, "step": 4389 }, { "epoch": 1.26, "grad_norm": 10.161408424377441, "learning_rate": 1.1164187643020595e-05, "loss": 0.6929, "step": 4390 }, { "epoch": 1.26, "grad_norm": 8.900789260864258, "learning_rate": 1.1159897025171623e-05, "loss": 0.5595, "step": 4391 }, { "epoch": 1.26, "grad_norm": 9.961406707763672, "learning_rate": 1.1155606407322656e-05, "loss": 0.4831, "step": 4392 }, { "epoch": 1.26, "grad_norm": 10.347709655761719, "learning_rate": 1.1151315789473684e-05, "loss": 0.7459, "step": 4393 }, { "epoch": 1.26, "grad_norm": 10.460041046142578, "learning_rate": 1.1147025171624715e-05, "loss": 0.599, "step": 4394 }, { "epoch": 1.26, "grad_norm": 12.151421546936035, "learning_rate": 1.1142734553775744e-05, "loss": 0.9976, "step": 4395 }, { "epoch": 1.26, "grad_norm": 10.167964935302734, "learning_rate": 1.1138443935926774e-05, "loss": 0.6846, "step": 4396 }, { "epoch": 1.26, "grad_norm": 11.677679061889648, "learning_rate": 1.1134153318077803e-05, "loss": 1.1699, "step": 4397 }, { "epoch": 1.26, "grad_norm": 11.125264167785645, "learning_rate": 1.1129862700228833e-05, "loss": 0.7343, "step": 4398 }, { "epoch": 1.26, "grad_norm": 8.717630386352539, "learning_rate": 1.1125572082379862e-05, "loss": 0.5695, "step": 4399 }, { "epoch": 1.26, "grad_norm": 11.196426391601562, "learning_rate": 1.1121281464530894e-05, "loss": 0.963, "step": 4400 }, { "epoch": 1.26, "grad_norm": 9.22913646697998, "learning_rate": 1.1116990846681923e-05, "loss": 0.6797, "step": 4401 }, { "epoch": 1.26, "grad_norm": 12.953048706054688, "learning_rate": 1.1112700228832953e-05, "loss": 0.9139, "step": 4402 }, { "epoch": 1.26, "grad_norm": 10.623172760009766, "learning_rate": 1.1108409610983982e-05, "loss": 1.0378, "step": 4403 }, { "epoch": 1.26, "grad_norm": 7.46474552154541, "learning_rate": 1.1104118993135011e-05, "loss": 0.5835, "step": 4404 }, { "epoch": 1.26, "grad_norm": 9.413641929626465, "learning_rate": 1.1099828375286041e-05, "loss": 0.634, "step": 4405 }, { "epoch": 1.26, "grad_norm": 10.446019172668457, "learning_rate": 1.109553775743707e-05, "loss": 0.7857, "step": 4406 }, { "epoch": 1.26, "grad_norm": 10.171029090881348, "learning_rate": 1.10912471395881e-05, "loss": 0.6332, "step": 4407 }, { "epoch": 1.26, "grad_norm": 9.081442832946777, "learning_rate": 1.108695652173913e-05, "loss": 0.6324, "step": 4408 }, { "epoch": 1.26, "grad_norm": 10.221150398254395, "learning_rate": 1.1082665903890162e-05, "loss": 0.6282, "step": 4409 }, { "epoch": 1.26, "grad_norm": 9.756031036376953, "learning_rate": 1.107837528604119e-05, "loss": 0.8952, "step": 4410 }, { "epoch": 1.26, "grad_norm": 9.257505416870117, "learning_rate": 1.1074084668192221e-05, "loss": 0.6357, "step": 4411 }, { "epoch": 1.26, "grad_norm": 10.139008522033691, "learning_rate": 1.106979405034325e-05, "loss": 0.855, "step": 4412 }, { "epoch": 1.26, "grad_norm": 13.11406421661377, "learning_rate": 1.106550343249428e-05, "loss": 1.1138, "step": 4413 }, { "epoch": 1.26, "grad_norm": 9.236861228942871, "learning_rate": 1.1061212814645309e-05, "loss": 0.5939, "step": 4414 }, { "epoch": 1.26, "grad_norm": 10.148466110229492, "learning_rate": 1.105692219679634e-05, "loss": 0.9314, "step": 4415 }, { "epoch": 1.26, "grad_norm": 10.256353378295898, "learning_rate": 1.1052631578947368e-05, "loss": 0.8695, "step": 4416 }, { "epoch": 1.26, "grad_norm": 9.436090469360352, "learning_rate": 1.1048340961098397e-05, "loss": 0.8043, "step": 4417 }, { "epoch": 1.26, "grad_norm": 9.323995590209961, "learning_rate": 1.1044050343249429e-05, "loss": 0.8311, "step": 4418 }, { "epoch": 1.26, "grad_norm": 10.637639045715332, "learning_rate": 1.1039759725400458e-05, "loss": 0.961, "step": 4419 }, { "epoch": 1.26, "grad_norm": 10.672082901000977, "learning_rate": 1.1035469107551488e-05, "loss": 0.8238, "step": 4420 }, { "epoch": 1.26, "grad_norm": 9.179261207580566, "learning_rate": 1.1031178489702517e-05, "loss": 0.8912, "step": 4421 }, { "epoch": 1.26, "grad_norm": 9.817282676696777, "learning_rate": 1.1026887871853547e-05, "loss": 0.8843, "step": 4422 }, { "epoch": 1.27, "grad_norm": 8.61851692199707, "learning_rate": 1.1022597254004576e-05, "loss": 0.7468, "step": 4423 }, { "epoch": 1.27, "grad_norm": 7.733014106750488, "learning_rate": 1.1018306636155607e-05, "loss": 0.6755, "step": 4424 }, { "epoch": 1.27, "grad_norm": 9.36716079711914, "learning_rate": 1.1014016018306635e-05, "loss": 0.8969, "step": 4425 }, { "epoch": 1.27, "grad_norm": 10.766349792480469, "learning_rate": 1.1009725400457668e-05, "loss": 0.9567, "step": 4426 }, { "epoch": 1.27, "grad_norm": 11.445676803588867, "learning_rate": 1.1005434782608696e-05, "loss": 0.9534, "step": 4427 }, { "epoch": 1.27, "grad_norm": 9.643470764160156, "learning_rate": 1.1001144164759727e-05, "loss": 0.7908, "step": 4428 }, { "epoch": 1.27, "grad_norm": 9.62116813659668, "learning_rate": 1.0996853546910756e-05, "loss": 0.6876, "step": 4429 }, { "epoch": 1.27, "grad_norm": 11.832209587097168, "learning_rate": 1.0992562929061784e-05, "loss": 0.9371, "step": 4430 }, { "epoch": 1.27, "grad_norm": 9.72209358215332, "learning_rate": 1.0988272311212815e-05, "loss": 0.6439, "step": 4431 }, { "epoch": 1.27, "grad_norm": 7.92345666885376, "learning_rate": 1.0983981693363844e-05, "loss": 0.5952, "step": 4432 }, { "epoch": 1.27, "grad_norm": 8.540083885192871, "learning_rate": 1.0979691075514874e-05, "loss": 0.7791, "step": 4433 }, { "epoch": 1.27, "grad_norm": 9.119351387023926, "learning_rate": 1.0975400457665903e-05, "loss": 0.7931, "step": 4434 }, { "epoch": 1.27, "grad_norm": 11.509077072143555, "learning_rate": 1.0971109839816935e-05, "loss": 0.9935, "step": 4435 }, { "epoch": 1.27, "grad_norm": 11.747727394104004, "learning_rate": 1.0966819221967964e-05, "loss": 0.647, "step": 4436 }, { "epoch": 1.27, "grad_norm": 9.080245018005371, "learning_rate": 1.0962528604118994e-05, "loss": 0.5636, "step": 4437 }, { "epoch": 1.27, "grad_norm": 8.393102645874023, "learning_rate": 1.0958237986270023e-05, "loss": 0.698, "step": 4438 }, { "epoch": 1.27, "grad_norm": 10.01490592956543, "learning_rate": 1.0953947368421053e-05, "loss": 0.4403, "step": 4439 }, { "epoch": 1.27, "grad_norm": 9.193498611450195, "learning_rate": 1.0949656750572082e-05, "loss": 0.8632, "step": 4440 }, { "epoch": 1.27, "grad_norm": 10.101186752319336, "learning_rate": 1.0945366132723113e-05, "loss": 0.7359, "step": 4441 }, { "epoch": 1.27, "grad_norm": 11.3381929397583, "learning_rate": 1.0941075514874141e-05, "loss": 0.9855, "step": 4442 }, { "epoch": 1.27, "grad_norm": 10.720212936401367, "learning_rate": 1.0936784897025172e-05, "loss": 0.8431, "step": 4443 }, { "epoch": 1.27, "grad_norm": 10.518932342529297, "learning_rate": 1.0932494279176202e-05, "loss": 1.0291, "step": 4444 }, { "epoch": 1.27, "grad_norm": 9.96078109741211, "learning_rate": 1.0928203661327231e-05, "loss": 0.7174, "step": 4445 }, { "epoch": 1.27, "grad_norm": 9.446786880493164, "learning_rate": 1.0923913043478262e-05, "loss": 0.9162, "step": 4446 }, { "epoch": 1.27, "grad_norm": 9.563207626342773, "learning_rate": 1.091962242562929e-05, "loss": 0.7722, "step": 4447 }, { "epoch": 1.27, "grad_norm": 10.334098815917969, "learning_rate": 1.091533180778032e-05, "loss": 0.8665, "step": 4448 }, { "epoch": 1.27, "grad_norm": 9.159623146057129, "learning_rate": 1.091104118993135e-05, "loss": 0.6491, "step": 4449 }, { "epoch": 1.27, "grad_norm": 8.697135925292969, "learning_rate": 1.090675057208238e-05, "loss": 0.7193, "step": 4450 }, { "epoch": 1.27, "grad_norm": 10.513727188110352, "learning_rate": 1.0902459954233409e-05, "loss": 0.7611, "step": 4451 }, { "epoch": 1.27, "grad_norm": 8.544404983520508, "learning_rate": 1.0898169336384441e-05, "loss": 0.9147, "step": 4452 }, { "epoch": 1.27, "grad_norm": 9.933435440063477, "learning_rate": 1.089387871853547e-05, "loss": 0.9955, "step": 4453 }, { "epoch": 1.27, "grad_norm": 11.88568115234375, "learning_rate": 1.08895881006865e-05, "loss": 0.8911, "step": 4454 }, { "epoch": 1.27, "grad_norm": 11.408987998962402, "learning_rate": 1.0885297482837529e-05, "loss": 1.0424, "step": 4455 }, { "epoch": 1.27, "grad_norm": 9.65211296081543, "learning_rate": 1.0881006864988558e-05, "loss": 0.684, "step": 4456 }, { "epoch": 1.27, "grad_norm": 10.292887687683105, "learning_rate": 1.0876716247139588e-05, "loss": 0.623, "step": 4457 }, { "epoch": 1.28, "grad_norm": 8.282696723937988, "learning_rate": 1.0872425629290617e-05, "loss": 0.6805, "step": 4458 }, { "epoch": 1.28, "grad_norm": 11.467270851135254, "learning_rate": 1.0868135011441647e-05, "loss": 0.7225, "step": 4459 }, { "epoch": 1.28, "grad_norm": 13.024991035461426, "learning_rate": 1.0863844393592678e-05, "loss": 0.7548, "step": 4460 }, { "epoch": 1.28, "grad_norm": 9.389389991760254, "learning_rate": 1.0859553775743708e-05, "loss": 0.7311, "step": 4461 }, { "epoch": 1.28, "grad_norm": 10.110858917236328, "learning_rate": 1.0855263157894737e-05, "loss": 0.5706, "step": 4462 }, { "epoch": 1.28, "grad_norm": 8.631179809570312, "learning_rate": 1.0850972540045768e-05, "loss": 0.6806, "step": 4463 }, { "epoch": 1.28, "grad_norm": 10.76997184753418, "learning_rate": 1.0846681922196796e-05, "loss": 1.02, "step": 4464 }, { "epoch": 1.28, "grad_norm": 11.195799827575684, "learning_rate": 1.0842391304347827e-05, "loss": 0.7281, "step": 4465 }, { "epoch": 1.28, "grad_norm": 11.992496490478516, "learning_rate": 1.0838100686498856e-05, "loss": 0.938, "step": 4466 }, { "epoch": 1.28, "grad_norm": 10.878107070922852, "learning_rate": 1.0833810068649886e-05, "loss": 0.8651, "step": 4467 }, { "epoch": 1.28, "grad_norm": 11.575766563415527, "learning_rate": 1.0829519450800915e-05, "loss": 0.7973, "step": 4468 }, { "epoch": 1.28, "grad_norm": 8.012828826904297, "learning_rate": 1.0825228832951945e-05, "loss": 0.7017, "step": 4469 }, { "epoch": 1.28, "grad_norm": 9.665970802307129, "learning_rate": 1.0820938215102976e-05, "loss": 0.7088, "step": 4470 }, { "epoch": 1.28, "grad_norm": 9.927847862243652, "learning_rate": 1.0816647597254005e-05, "loss": 0.8645, "step": 4471 }, { "epoch": 1.28, "grad_norm": 9.595250129699707, "learning_rate": 1.0812356979405035e-05, "loss": 0.8021, "step": 4472 }, { "epoch": 1.28, "grad_norm": 11.20498275756836, "learning_rate": 1.0808066361556064e-05, "loss": 0.8367, "step": 4473 }, { "epoch": 1.28, "grad_norm": 10.3494234085083, "learning_rate": 1.0803775743707094e-05, "loss": 0.7279, "step": 4474 }, { "epoch": 1.28, "grad_norm": 8.78724193572998, "learning_rate": 1.0799485125858123e-05, "loss": 0.7025, "step": 4475 }, { "epoch": 1.28, "grad_norm": 9.069414138793945, "learning_rate": 1.0795194508009153e-05, "loss": 0.6999, "step": 4476 }, { "epoch": 1.28, "grad_norm": 12.110283851623535, "learning_rate": 1.0790903890160182e-05, "loss": 0.7431, "step": 4477 }, { "epoch": 1.28, "grad_norm": 10.447746276855469, "learning_rate": 1.0786613272311214e-05, "loss": 0.8934, "step": 4478 }, { "epoch": 1.28, "grad_norm": 9.537714958190918, "learning_rate": 1.0782322654462243e-05, "loss": 0.7105, "step": 4479 }, { "epoch": 1.28, "grad_norm": 10.018208503723145, "learning_rate": 1.0778032036613274e-05, "loss": 0.8458, "step": 4480 }, { "epoch": 1.28, "grad_norm": 10.869131088256836, "learning_rate": 1.0773741418764302e-05, "loss": 0.6133, "step": 4481 }, { "epoch": 1.28, "grad_norm": 10.848339080810547, "learning_rate": 1.0769450800915331e-05, "loss": 0.8584, "step": 4482 }, { "epoch": 1.28, "grad_norm": 12.082961082458496, "learning_rate": 1.0765160183066362e-05, "loss": 0.9047, "step": 4483 }, { "epoch": 1.28, "grad_norm": 10.2700834274292, "learning_rate": 1.076086956521739e-05, "loss": 0.4975, "step": 4484 }, { "epoch": 1.28, "grad_norm": 10.529144287109375, "learning_rate": 1.075657894736842e-05, "loss": 0.9198, "step": 4485 }, { "epoch": 1.28, "grad_norm": 11.378095626831055, "learning_rate": 1.0752288329519451e-05, "loss": 0.7958, "step": 4486 }, { "epoch": 1.28, "grad_norm": 10.02692985534668, "learning_rate": 1.0747997711670482e-05, "loss": 0.9134, "step": 4487 }, { "epoch": 1.28, "grad_norm": 8.611089706420898, "learning_rate": 1.074370709382151e-05, "loss": 0.7818, "step": 4488 }, { "epoch": 1.28, "grad_norm": 8.577508926391602, "learning_rate": 1.0739416475972541e-05, "loss": 0.5102, "step": 4489 }, { "epoch": 1.28, "grad_norm": 10.593514442443848, "learning_rate": 1.073512585812357e-05, "loss": 0.7386, "step": 4490 }, { "epoch": 1.28, "grad_norm": 8.785354614257812, "learning_rate": 1.07308352402746e-05, "loss": 0.4635, "step": 4491 }, { "epoch": 1.28, "grad_norm": 11.214665412902832, "learning_rate": 1.0726544622425629e-05, "loss": 1.0071, "step": 4492 }, { "epoch": 1.29, "grad_norm": 10.643991470336914, "learning_rate": 1.072225400457666e-05, "loss": 0.7866, "step": 4493 }, { "epoch": 1.29, "grad_norm": 11.196982383728027, "learning_rate": 1.0717963386727688e-05, "loss": 0.9896, "step": 4494 }, { "epoch": 1.29, "grad_norm": 9.391378402709961, "learning_rate": 1.0713672768878719e-05, "loss": 0.7679, "step": 4495 }, { "epoch": 1.29, "grad_norm": 8.35649585723877, "learning_rate": 1.0709382151029749e-05, "loss": 0.5818, "step": 4496 }, { "epoch": 1.29, "grad_norm": 12.581069946289062, "learning_rate": 1.0705091533180778e-05, "loss": 0.7659, "step": 4497 }, { "epoch": 1.29, "grad_norm": 8.290343284606934, "learning_rate": 1.0700800915331808e-05, "loss": 0.7201, "step": 4498 }, { "epoch": 1.29, "grad_norm": 10.348004341125488, "learning_rate": 1.0696510297482837e-05, "loss": 0.811, "step": 4499 }, { "epoch": 1.29, "grad_norm": 12.528030395507812, "learning_rate": 1.0692219679633868e-05, "loss": 0.9127, "step": 4500 }, { "epoch": 1.29, "grad_norm": 10.638726234436035, "learning_rate": 1.0687929061784896e-05, "loss": 0.5826, "step": 4501 }, { "epoch": 1.29, "grad_norm": 11.5818452835083, "learning_rate": 1.0683638443935927e-05, "loss": 0.792, "step": 4502 }, { "epoch": 1.29, "grad_norm": 12.89651107788086, "learning_rate": 1.0679347826086957e-05, "loss": 0.7994, "step": 4503 }, { "epoch": 1.29, "grad_norm": 11.800990104675293, "learning_rate": 1.0675057208237988e-05, "loss": 1.1297, "step": 4504 }, { "epoch": 1.29, "grad_norm": 11.796900749206543, "learning_rate": 1.0670766590389017e-05, "loss": 0.8113, "step": 4505 }, { "epoch": 1.29, "grad_norm": 12.544075012207031, "learning_rate": 1.0666475972540047e-05, "loss": 0.8221, "step": 4506 }, { "epoch": 1.29, "grad_norm": 9.659513473510742, "learning_rate": 1.0662185354691076e-05, "loss": 0.7253, "step": 4507 }, { "epoch": 1.29, "grad_norm": 12.628800392150879, "learning_rate": 1.0657894736842106e-05, "loss": 0.9197, "step": 4508 }, { "epoch": 1.29, "grad_norm": 9.06775951385498, "learning_rate": 1.0653604118993135e-05, "loss": 0.7415, "step": 4509 }, { "epoch": 1.29, "grad_norm": 8.520203590393066, "learning_rate": 1.0649313501144164e-05, "loss": 0.7182, "step": 4510 }, { "epoch": 1.29, "grad_norm": 8.279836654663086, "learning_rate": 1.0645022883295194e-05, "loss": 0.5338, "step": 4511 }, { "epoch": 1.29, "grad_norm": 10.691319465637207, "learning_rate": 1.0640732265446225e-05, "loss": 0.6707, "step": 4512 }, { "epoch": 1.29, "grad_norm": 11.865562438964844, "learning_rate": 1.0636441647597255e-05, "loss": 0.9687, "step": 4513 }, { "epoch": 1.29, "grad_norm": 8.937037467956543, "learning_rate": 1.0632151029748284e-05, "loss": 0.6894, "step": 4514 }, { "epoch": 1.29, "grad_norm": 12.400564193725586, "learning_rate": 1.0627860411899314e-05, "loss": 1.2345, "step": 4515 }, { "epoch": 1.29, "grad_norm": 8.49405288696289, "learning_rate": 1.0623569794050343e-05, "loss": 0.4766, "step": 4516 }, { "epoch": 1.29, "grad_norm": 8.88058853149414, "learning_rate": 1.0619279176201374e-05, "loss": 0.6505, "step": 4517 }, { "epoch": 1.29, "grad_norm": 8.913090705871582, "learning_rate": 1.0614988558352402e-05, "loss": 0.6876, "step": 4518 }, { "epoch": 1.29, "grad_norm": 10.349099159240723, "learning_rate": 1.0610697940503433e-05, "loss": 0.7209, "step": 4519 }, { "epoch": 1.29, "grad_norm": 11.539715766906738, "learning_rate": 1.0606407322654463e-05, "loss": 0.9389, "step": 4520 }, { "epoch": 1.29, "grad_norm": 9.029921531677246, "learning_rate": 1.0602116704805492e-05, "loss": 0.6983, "step": 4521 }, { "epoch": 1.29, "grad_norm": 10.217121124267578, "learning_rate": 1.0597826086956523e-05, "loss": 1.0282, "step": 4522 }, { "epoch": 1.29, "grad_norm": 9.556905746459961, "learning_rate": 1.0593535469107551e-05, "loss": 0.6441, "step": 4523 }, { "epoch": 1.29, "grad_norm": 13.417732238769531, "learning_rate": 1.0589244851258582e-05, "loss": 0.9225, "step": 4524 }, { "epoch": 1.29, "grad_norm": 10.074112892150879, "learning_rate": 1.058495423340961e-05, "loss": 0.7527, "step": 4525 }, { "epoch": 1.29, "grad_norm": 10.604214668273926, "learning_rate": 1.0580663615560641e-05, "loss": 0.7732, "step": 4526 }, { "epoch": 1.29, "grad_norm": 10.65685749053955, "learning_rate": 1.057637299771167e-05, "loss": 1.0403, "step": 4527 }, { "epoch": 1.3, "grad_norm": 8.894769668579102, "learning_rate": 1.05720823798627e-05, "loss": 0.6774, "step": 4528 }, { "epoch": 1.3, "grad_norm": 9.923067092895508, "learning_rate": 1.056779176201373e-05, "loss": 0.5607, "step": 4529 }, { "epoch": 1.3, "grad_norm": 8.149468421936035, "learning_rate": 1.0563501144164761e-05, "loss": 0.5877, "step": 4530 }, { "epoch": 1.3, "grad_norm": 10.948150634765625, "learning_rate": 1.055921052631579e-05, "loss": 1.0175, "step": 4531 }, { "epoch": 1.3, "grad_norm": 12.211588859558105, "learning_rate": 1.055491990846682e-05, "loss": 0.8954, "step": 4532 }, { "epoch": 1.3, "grad_norm": 12.116756439208984, "learning_rate": 1.0550629290617849e-05, "loss": 0.9302, "step": 4533 }, { "epoch": 1.3, "grad_norm": 9.606791496276855, "learning_rate": 1.054633867276888e-05, "loss": 0.7397, "step": 4534 }, { "epoch": 1.3, "grad_norm": 8.567057609558105, "learning_rate": 1.0542048054919908e-05, "loss": 0.5171, "step": 4535 }, { "epoch": 1.3, "grad_norm": 10.134928703308105, "learning_rate": 1.0537757437070937e-05, "loss": 0.7642, "step": 4536 }, { "epoch": 1.3, "grad_norm": 12.260796546936035, "learning_rate": 1.0533466819221968e-05, "loss": 0.7348, "step": 4537 }, { "epoch": 1.3, "grad_norm": 8.267999649047852, "learning_rate": 1.0529176201372998e-05, "loss": 0.6745, "step": 4538 }, { "epoch": 1.3, "grad_norm": 10.089160919189453, "learning_rate": 1.0524885583524028e-05, "loss": 0.7294, "step": 4539 }, { "epoch": 1.3, "grad_norm": 11.858624458312988, "learning_rate": 1.0520594965675057e-05, "loss": 1.0909, "step": 4540 }, { "epoch": 1.3, "grad_norm": 9.252303123474121, "learning_rate": 1.0516304347826088e-05, "loss": 0.5499, "step": 4541 }, { "epoch": 1.3, "grad_norm": 10.532364845275879, "learning_rate": 1.0512013729977116e-05, "loss": 0.716, "step": 4542 }, { "epoch": 1.3, "grad_norm": 10.907944679260254, "learning_rate": 1.0507723112128147e-05, "loss": 0.8436, "step": 4543 }, { "epoch": 1.3, "grad_norm": 9.34953498840332, "learning_rate": 1.0503432494279176e-05, "loss": 0.9408, "step": 4544 }, { "epoch": 1.3, "grad_norm": 8.764162063598633, "learning_rate": 1.0499141876430206e-05, "loss": 0.7812, "step": 4545 }, { "epoch": 1.3, "grad_norm": 10.903592109680176, "learning_rate": 1.0494851258581237e-05, "loss": 0.9772, "step": 4546 }, { "epoch": 1.3, "grad_norm": 9.145940780639648, "learning_rate": 1.0490560640732267e-05, "loss": 0.5603, "step": 4547 }, { "epoch": 1.3, "grad_norm": 8.859952926635742, "learning_rate": 1.0486270022883296e-05, "loss": 0.493, "step": 4548 }, { "epoch": 1.3, "grad_norm": 10.92986011505127, "learning_rate": 1.0481979405034325e-05, "loss": 0.8482, "step": 4549 }, { "epoch": 1.3, "grad_norm": 8.599848747253418, "learning_rate": 1.0477688787185355e-05, "loss": 0.7271, "step": 4550 }, { "epoch": 1.3, "grad_norm": 9.906673431396484, "learning_rate": 1.0473398169336384e-05, "loss": 0.7742, "step": 4551 }, { "epoch": 1.3, "grad_norm": 12.310548782348633, "learning_rate": 1.0469107551487414e-05, "loss": 0.6319, "step": 4552 }, { "epoch": 1.3, "grad_norm": 11.670842170715332, "learning_rate": 1.0464816933638443e-05, "loss": 0.8078, "step": 4553 }, { "epoch": 1.3, "grad_norm": 11.398588180541992, "learning_rate": 1.0460526315789474e-05, "loss": 0.9609, "step": 4554 }, { "epoch": 1.3, "grad_norm": 11.271353721618652, "learning_rate": 1.0456235697940504e-05, "loss": 0.8449, "step": 4555 }, { "epoch": 1.3, "grad_norm": 10.082358360290527, "learning_rate": 1.0451945080091534e-05, "loss": 0.7383, "step": 4556 }, { "epoch": 1.3, "grad_norm": 9.931964874267578, "learning_rate": 1.0447654462242563e-05, "loss": 0.6965, "step": 4557 }, { "epoch": 1.3, "grad_norm": 10.41787052154541, "learning_rate": 1.0443363844393594e-05, "loss": 0.934, "step": 4558 }, { "epoch": 1.3, "grad_norm": 12.855107307434082, "learning_rate": 1.0439073226544622e-05, "loss": 0.8262, "step": 4559 }, { "epoch": 1.3, "grad_norm": 11.924995422363281, "learning_rate": 1.0434782608695653e-05, "loss": 1.0188, "step": 4560 }, { "epoch": 1.3, "grad_norm": 10.832833290100098, "learning_rate": 1.0430491990846682e-05, "loss": 0.8195, "step": 4561 }, { "epoch": 1.3, "grad_norm": 7.587615489959717, "learning_rate": 1.042620137299771e-05, "loss": 0.539, "step": 4562 }, { "epoch": 1.31, "grad_norm": 8.638379096984863, "learning_rate": 1.0421910755148743e-05, "loss": 0.574, "step": 4563 }, { "epoch": 1.31, "grad_norm": 8.017379760742188, "learning_rate": 1.0417620137299771e-05, "loss": 0.685, "step": 4564 }, { "epoch": 1.31, "grad_norm": 9.898755073547363, "learning_rate": 1.0413329519450802e-05, "loss": 0.8604, "step": 4565 }, { "epoch": 1.31, "grad_norm": 7.718481063842773, "learning_rate": 1.040903890160183e-05, "loss": 0.7614, "step": 4566 }, { "epoch": 1.31, "grad_norm": 10.322260856628418, "learning_rate": 1.0404748283752861e-05, "loss": 0.7968, "step": 4567 }, { "epoch": 1.31, "grad_norm": 13.542582511901855, "learning_rate": 1.040045766590389e-05, "loss": 0.8681, "step": 4568 }, { "epoch": 1.31, "grad_norm": 9.584424018859863, "learning_rate": 1.039616704805492e-05, "loss": 0.9568, "step": 4569 }, { "epoch": 1.31, "grad_norm": 12.441803932189941, "learning_rate": 1.0391876430205949e-05, "loss": 0.9867, "step": 4570 }, { "epoch": 1.31, "grad_norm": 8.160896301269531, "learning_rate": 1.038758581235698e-05, "loss": 0.569, "step": 4571 }, { "epoch": 1.31, "grad_norm": 12.349885940551758, "learning_rate": 1.038329519450801e-05, "loss": 0.8415, "step": 4572 }, { "epoch": 1.31, "grad_norm": 10.131543159484863, "learning_rate": 1.037900457665904e-05, "loss": 0.8312, "step": 4573 }, { "epoch": 1.31, "grad_norm": 10.335249900817871, "learning_rate": 1.037471395881007e-05, "loss": 0.916, "step": 4574 }, { "epoch": 1.31, "grad_norm": 9.307873725891113, "learning_rate": 1.0370423340961098e-05, "loss": 0.5624, "step": 4575 }, { "epoch": 1.31, "grad_norm": 10.228752136230469, "learning_rate": 1.0366132723112128e-05, "loss": 0.7126, "step": 4576 }, { "epoch": 1.31, "grad_norm": 12.114232063293457, "learning_rate": 1.0361842105263157e-05, "loss": 0.8773, "step": 4577 }, { "epoch": 1.31, "grad_norm": 10.18767261505127, "learning_rate": 1.0357551487414188e-05, "loss": 0.8004, "step": 4578 }, { "epoch": 1.31, "grad_norm": 10.70250129699707, "learning_rate": 1.0353260869565216e-05, "loss": 0.8739, "step": 4579 }, { "epoch": 1.31, "grad_norm": 10.009703636169434, "learning_rate": 1.0348970251716249e-05, "loss": 0.8867, "step": 4580 }, { "epoch": 1.31, "grad_norm": 8.726730346679688, "learning_rate": 1.0344679633867277e-05, "loss": 0.6418, "step": 4581 }, { "epoch": 1.31, "grad_norm": 10.187596321105957, "learning_rate": 1.0340389016018308e-05, "loss": 0.6098, "step": 4582 }, { "epoch": 1.31, "grad_norm": 11.032061576843262, "learning_rate": 1.0336098398169337e-05, "loss": 0.84, "step": 4583 }, { "epoch": 1.31, "grad_norm": 9.076355934143066, "learning_rate": 1.0331807780320367e-05, "loss": 0.9079, "step": 4584 }, { "epoch": 1.31, "grad_norm": 9.244355201721191, "learning_rate": 1.0327517162471396e-05, "loss": 0.6617, "step": 4585 }, { "epoch": 1.31, "grad_norm": 8.106156349182129, "learning_rate": 1.0323226544622426e-05, "loss": 0.5986, "step": 4586 }, { "epoch": 1.31, "grad_norm": 10.858281135559082, "learning_rate": 1.0318935926773455e-05, "loss": 0.9367, "step": 4587 }, { "epoch": 1.31, "grad_norm": 9.763096809387207, "learning_rate": 1.0314645308924484e-05, "loss": 0.8339, "step": 4588 }, { "epoch": 1.31, "grad_norm": 9.7815580368042, "learning_rate": 1.0310354691075516e-05, "loss": 0.808, "step": 4589 }, { "epoch": 1.31, "grad_norm": 7.393942832946777, "learning_rate": 1.0306064073226545e-05, "loss": 0.4532, "step": 4590 }, { "epoch": 1.31, "grad_norm": 11.11086368560791, "learning_rate": 1.0301773455377575e-05, "loss": 0.7224, "step": 4591 }, { "epoch": 1.31, "grad_norm": 9.721107482910156, "learning_rate": 1.0297482837528604e-05, "loss": 0.8394, "step": 4592 }, { "epoch": 1.31, "grad_norm": 10.71251392364502, "learning_rate": 1.0293192219679634e-05, "loss": 0.9271, "step": 4593 }, { "epoch": 1.31, "grad_norm": 7.635794639587402, "learning_rate": 1.0288901601830663e-05, "loss": 0.597, "step": 4594 }, { "epoch": 1.31, "grad_norm": 10.38532829284668, "learning_rate": 1.0284610983981694e-05, "loss": 0.8118, "step": 4595 }, { "epoch": 1.31, "grad_norm": 10.115407943725586, "learning_rate": 1.0280320366132722e-05, "loss": 0.7534, "step": 4596 }, { "epoch": 1.31, "grad_norm": 10.365483283996582, "learning_rate": 1.0276029748283753e-05, "loss": 1.1437, "step": 4597 }, { "epoch": 1.32, "grad_norm": 10.19611644744873, "learning_rate": 1.0271739130434783e-05, "loss": 0.7595, "step": 4598 }, { "epoch": 1.32, "grad_norm": 11.781787872314453, "learning_rate": 1.0267448512585814e-05, "loss": 0.8448, "step": 4599 }, { "epoch": 1.32, "grad_norm": 10.059295654296875, "learning_rate": 1.0263157894736843e-05, "loss": 0.6639, "step": 4600 }, { "epoch": 1.32, "grad_norm": 10.477483749389648, "learning_rate": 1.0258867276887871e-05, "loss": 0.7552, "step": 4601 }, { "epoch": 1.32, "grad_norm": 9.580055236816406, "learning_rate": 1.0254576659038902e-05, "loss": 0.9293, "step": 4602 }, { "epoch": 1.32, "grad_norm": 9.448568344116211, "learning_rate": 1.025028604118993e-05, "loss": 0.716, "step": 4603 }, { "epoch": 1.32, "grad_norm": 13.001675605773926, "learning_rate": 1.0245995423340961e-05, "loss": 0.9883, "step": 4604 }, { "epoch": 1.32, "grad_norm": 9.869972229003906, "learning_rate": 1.024170480549199e-05, "loss": 0.6913, "step": 4605 }, { "epoch": 1.32, "grad_norm": 9.498185157775879, "learning_rate": 1.0237414187643022e-05, "loss": 0.7251, "step": 4606 }, { "epoch": 1.32, "grad_norm": 12.074101448059082, "learning_rate": 1.023312356979405e-05, "loss": 0.6859, "step": 4607 }, { "epoch": 1.32, "grad_norm": 9.54678726196289, "learning_rate": 1.0228832951945081e-05, "loss": 0.7507, "step": 4608 }, { "epoch": 1.32, "grad_norm": 9.271499633789062, "learning_rate": 1.022454233409611e-05, "loss": 0.6925, "step": 4609 }, { "epoch": 1.32, "grad_norm": 12.213305473327637, "learning_rate": 1.022025171624714e-05, "loss": 0.965, "step": 4610 }, { "epoch": 1.32, "grad_norm": 11.851221084594727, "learning_rate": 1.021596109839817e-05, "loss": 0.6818, "step": 4611 }, { "epoch": 1.32, "grad_norm": 10.92638111114502, "learning_rate": 1.02116704805492e-05, "loss": 0.8467, "step": 4612 }, { "epoch": 1.32, "grad_norm": 9.772537231445312, "learning_rate": 1.0207379862700228e-05, "loss": 0.6508, "step": 4613 }, { "epoch": 1.32, "grad_norm": 10.279985427856445, "learning_rate": 1.0203089244851257e-05, "loss": 1.0279, "step": 4614 }, { "epoch": 1.32, "grad_norm": 9.189204216003418, "learning_rate": 1.019879862700229e-05, "loss": 0.8367, "step": 4615 }, { "epoch": 1.32, "grad_norm": 10.385782241821289, "learning_rate": 1.0194508009153318e-05, "loss": 0.9493, "step": 4616 }, { "epoch": 1.32, "grad_norm": 10.314265251159668, "learning_rate": 1.0190217391304349e-05, "loss": 0.7412, "step": 4617 }, { "epoch": 1.32, "grad_norm": 11.677253723144531, "learning_rate": 1.0185926773455377e-05, "loss": 0.6281, "step": 4618 }, { "epoch": 1.32, "grad_norm": 9.144380569458008, "learning_rate": 1.0181636155606408e-05, "loss": 0.7098, "step": 4619 }, { "epoch": 1.32, "grad_norm": 12.974599838256836, "learning_rate": 1.0177345537757437e-05, "loss": 1.1372, "step": 4620 }, { "epoch": 1.32, "grad_norm": 11.553279876708984, "learning_rate": 1.0173054919908467e-05, "loss": 0.9577, "step": 4621 }, { "epoch": 1.32, "grad_norm": 10.324438095092773, "learning_rate": 1.0168764302059496e-05, "loss": 0.6501, "step": 4622 }, { "epoch": 1.32, "grad_norm": 13.246345520019531, "learning_rate": 1.0164473684210528e-05, "loss": 1.1245, "step": 4623 }, { "epoch": 1.32, "grad_norm": 9.062743186950684, "learning_rate": 1.0160183066361557e-05, "loss": 0.6808, "step": 4624 }, { "epoch": 1.32, "grad_norm": 10.063706398010254, "learning_rate": 1.0155892448512587e-05, "loss": 0.8209, "step": 4625 }, { "epoch": 1.32, "grad_norm": 8.378332138061523, "learning_rate": 1.0151601830663616e-05, "loss": 0.4858, "step": 4626 }, { "epoch": 1.32, "grad_norm": 9.984187126159668, "learning_rate": 1.0147311212814645e-05, "loss": 0.7595, "step": 4627 }, { "epoch": 1.32, "grad_norm": 12.664240837097168, "learning_rate": 1.0143020594965675e-05, "loss": 0.8782, "step": 4628 }, { "epoch": 1.32, "grad_norm": 8.787510871887207, "learning_rate": 1.0138729977116704e-05, "loss": 0.7938, "step": 4629 }, { "epoch": 1.32, "grad_norm": 9.451238632202148, "learning_rate": 1.0134439359267734e-05, "loss": 0.7193, "step": 4630 }, { "epoch": 1.32, "grad_norm": 10.58932876586914, "learning_rate": 1.0130148741418763e-05, "loss": 0.6507, "step": 4631 }, { "epoch": 1.32, "grad_norm": 9.03365707397461, "learning_rate": 1.0125858123569795e-05, "loss": 0.7437, "step": 4632 }, { "epoch": 1.33, "grad_norm": 8.374075889587402, "learning_rate": 1.0121567505720824e-05, "loss": 0.6675, "step": 4633 }, { "epoch": 1.33, "grad_norm": 8.497538566589355, "learning_rate": 1.0117276887871855e-05, "loss": 0.6729, "step": 4634 }, { "epoch": 1.33, "grad_norm": 9.651091575622559, "learning_rate": 1.0112986270022883e-05, "loss": 0.6992, "step": 4635 }, { "epoch": 1.33, "grad_norm": 10.816292762756348, "learning_rate": 1.0108695652173914e-05, "loss": 0.8129, "step": 4636 }, { "epoch": 1.33, "grad_norm": 9.455986022949219, "learning_rate": 1.0104405034324943e-05, "loss": 0.7505, "step": 4637 }, { "epoch": 1.33, "grad_norm": 9.267904281616211, "learning_rate": 1.0100114416475973e-05, "loss": 0.6653, "step": 4638 }, { "epoch": 1.33, "grad_norm": 10.088541984558105, "learning_rate": 1.0095823798627002e-05, "loss": 0.5034, "step": 4639 }, { "epoch": 1.33, "grad_norm": 9.960984230041504, "learning_rate": 1.0091533180778032e-05, "loss": 0.634, "step": 4640 }, { "epoch": 1.33, "grad_norm": 10.747187614440918, "learning_rate": 1.0087242562929063e-05, "loss": 0.791, "step": 4641 }, { "epoch": 1.33, "grad_norm": 8.487506866455078, "learning_rate": 1.0082951945080092e-05, "loss": 0.7181, "step": 4642 }, { "epoch": 1.33, "grad_norm": 8.82575798034668, "learning_rate": 1.0078661327231122e-05, "loss": 0.7306, "step": 4643 }, { "epoch": 1.33, "grad_norm": 9.52674674987793, "learning_rate": 1.007437070938215e-05, "loss": 0.6141, "step": 4644 }, { "epoch": 1.33, "grad_norm": 11.346635818481445, "learning_rate": 1.0070080091533181e-05, "loss": 0.7195, "step": 4645 }, { "epoch": 1.33, "grad_norm": 9.532054901123047, "learning_rate": 1.006578947368421e-05, "loss": 0.5254, "step": 4646 }, { "epoch": 1.33, "grad_norm": 9.944281578063965, "learning_rate": 1.006149885583524e-05, "loss": 0.8458, "step": 4647 }, { "epoch": 1.33, "grad_norm": 10.513945579528809, "learning_rate": 1.005720823798627e-05, "loss": 0.8692, "step": 4648 }, { "epoch": 1.33, "grad_norm": 10.14871883392334, "learning_rate": 1.0052917620137301e-05, "loss": 0.6585, "step": 4649 }, { "epoch": 1.33, "grad_norm": 9.397774696350098, "learning_rate": 1.004862700228833e-05, "loss": 0.5747, "step": 4650 }, { "epoch": 1.33, "grad_norm": 10.431659698486328, "learning_rate": 1.004433638443936e-05, "loss": 0.7969, "step": 4651 }, { "epoch": 1.33, "grad_norm": 11.2437105178833, "learning_rate": 1.004004576659039e-05, "loss": 0.6863, "step": 4652 }, { "epoch": 1.33, "grad_norm": 11.774871826171875, "learning_rate": 1.0035755148741418e-05, "loss": 1.0087, "step": 4653 }, { "epoch": 1.33, "grad_norm": 12.120447158813477, "learning_rate": 1.0031464530892449e-05, "loss": 1.0396, "step": 4654 }, { "epoch": 1.33, "grad_norm": 9.248188018798828, "learning_rate": 1.0027173913043477e-05, "loss": 0.6938, "step": 4655 }, { "epoch": 1.33, "grad_norm": 8.619288444519043, "learning_rate": 1.0022883295194508e-05, "loss": 0.6464, "step": 4656 }, { "epoch": 1.33, "grad_norm": 11.303775787353516, "learning_rate": 1.0018592677345537e-05, "loss": 0.7225, "step": 4657 }, { "epoch": 1.33, "grad_norm": 11.534278869628906, "learning_rate": 1.0014302059496569e-05, "loss": 0.7771, "step": 4658 }, { "epoch": 1.33, "grad_norm": 10.095863342285156, "learning_rate": 1.0010011441647598e-05, "loss": 0.6307, "step": 4659 }, { "epoch": 1.33, "grad_norm": 9.884389877319336, "learning_rate": 1.0005720823798628e-05, "loss": 0.593, "step": 4660 }, { "epoch": 1.33, "grad_norm": 10.065033912658691, "learning_rate": 1.0001430205949657e-05, "loss": 0.6567, "step": 4661 }, { "epoch": 1.33, "grad_norm": 9.432774543762207, "learning_rate": 9.997139588100687e-06, "loss": 0.575, "step": 4662 }, { "epoch": 1.33, "grad_norm": 10.539703369140625, "learning_rate": 9.992848970251716e-06, "loss": 0.556, "step": 4663 }, { "epoch": 1.33, "grad_norm": 8.618727684020996, "learning_rate": 9.988558352402746e-06, "loss": 0.5203, "step": 4664 }, { "epoch": 1.33, "grad_norm": 8.832174301147461, "learning_rate": 9.984267734553775e-06, "loss": 0.7284, "step": 4665 }, { "epoch": 1.33, "grad_norm": 10.143733978271484, "learning_rate": 9.979977116704806e-06, "loss": 0.6738, "step": 4666 }, { "epoch": 1.33, "grad_norm": 9.429113388061523, "learning_rate": 9.975686498855836e-06, "loss": 0.5794, "step": 4667 }, { "epoch": 1.34, "grad_norm": 8.873308181762695, "learning_rate": 9.971395881006865e-06, "loss": 0.5068, "step": 4668 }, { "epoch": 1.34, "grad_norm": 9.968809127807617, "learning_rate": 9.967105263157895e-06, "loss": 0.6062, "step": 4669 }, { "epoch": 1.34, "grad_norm": 10.752883911132812, "learning_rate": 9.962814645308924e-06, "loss": 0.6434, "step": 4670 }, { "epoch": 1.34, "grad_norm": 8.815367698669434, "learning_rate": 9.958524027459955e-06, "loss": 0.6924, "step": 4671 }, { "epoch": 1.34, "grad_norm": 9.490862846374512, "learning_rate": 9.954233409610983e-06, "loss": 0.9181, "step": 4672 }, { "epoch": 1.34, "grad_norm": 10.75500202178955, "learning_rate": 9.949942791762014e-06, "loss": 0.9343, "step": 4673 }, { "epoch": 1.34, "grad_norm": 12.002522468566895, "learning_rate": 9.945652173913043e-06, "loss": 0.8571, "step": 4674 }, { "epoch": 1.34, "grad_norm": 9.769218444824219, "learning_rate": 9.941361556064075e-06, "loss": 0.7145, "step": 4675 }, { "epoch": 1.34, "grad_norm": 8.448454856872559, "learning_rate": 9.937070938215104e-06, "loss": 0.6893, "step": 4676 }, { "epoch": 1.34, "grad_norm": 11.79710865020752, "learning_rate": 9.932780320366134e-06, "loss": 0.8783, "step": 4677 }, { "epoch": 1.34, "grad_norm": 11.979585647583008, "learning_rate": 9.928489702517163e-06, "loss": 0.7278, "step": 4678 }, { "epoch": 1.34, "grad_norm": 8.608783721923828, "learning_rate": 9.924199084668192e-06, "loss": 0.567, "step": 4679 }, { "epoch": 1.34, "grad_norm": 8.807727813720703, "learning_rate": 9.919908466819222e-06, "loss": 0.5357, "step": 4680 }, { "epoch": 1.34, "grad_norm": 6.912734031677246, "learning_rate": 9.91561784897025e-06, "loss": 0.4204, "step": 4681 }, { "epoch": 1.34, "grad_norm": 10.059368133544922, "learning_rate": 9.911327231121281e-06, "loss": 0.7899, "step": 4682 }, { "epoch": 1.34, "grad_norm": 11.73801040649414, "learning_rate": 9.907036613272312e-06, "loss": 0.7755, "step": 4683 }, { "epoch": 1.34, "grad_norm": 9.313863754272461, "learning_rate": 9.902745995423342e-06, "loss": 0.7283, "step": 4684 }, { "epoch": 1.34, "grad_norm": 10.158074378967285, "learning_rate": 9.898455377574371e-06, "loss": 0.6462, "step": 4685 }, { "epoch": 1.34, "grad_norm": 10.189724922180176, "learning_rate": 9.894164759725401e-06, "loss": 0.9259, "step": 4686 }, { "epoch": 1.34, "grad_norm": 11.322792053222656, "learning_rate": 9.88987414187643e-06, "loss": 0.5923, "step": 4687 }, { "epoch": 1.34, "grad_norm": 8.962874412536621, "learning_rate": 9.88558352402746e-06, "loss": 0.7261, "step": 4688 }, { "epoch": 1.34, "grad_norm": 10.788043975830078, "learning_rate": 9.88129290617849e-06, "loss": 0.8296, "step": 4689 }, { "epoch": 1.34, "grad_norm": 13.454601287841797, "learning_rate": 9.87700228832952e-06, "loss": 0.7895, "step": 4690 }, { "epoch": 1.34, "grad_norm": 10.123590469360352, "learning_rate": 9.872711670480549e-06, "loss": 0.6016, "step": 4691 }, { "epoch": 1.34, "grad_norm": 7.896024227142334, "learning_rate": 9.868421052631579e-06, "loss": 0.6095, "step": 4692 }, { "epoch": 1.34, "grad_norm": 10.664581298828125, "learning_rate": 9.86413043478261e-06, "loss": 0.7457, "step": 4693 }, { "epoch": 1.34, "grad_norm": 11.129467964172363, "learning_rate": 9.859839816933638e-06, "loss": 0.7181, "step": 4694 }, { "epoch": 1.34, "grad_norm": 11.753436088562012, "learning_rate": 9.855549199084669e-06, "loss": 0.8247, "step": 4695 }, { "epoch": 1.34, "grad_norm": 8.845498085021973, "learning_rate": 9.851258581235698e-06, "loss": 0.5541, "step": 4696 }, { "epoch": 1.34, "grad_norm": 12.261682510375977, "learning_rate": 9.846967963386728e-06, "loss": 0.9145, "step": 4697 }, { "epoch": 1.34, "grad_norm": 11.292670249938965, "learning_rate": 9.842677345537757e-06, "loss": 0.757, "step": 4698 }, { "epoch": 1.34, "grad_norm": 13.406408309936523, "learning_rate": 9.838386727688787e-06, "loss": 1.1041, "step": 4699 }, { "epoch": 1.34, "grad_norm": 10.857704162597656, "learning_rate": 9.834096109839818e-06, "loss": 0.669, "step": 4700 }, { "epoch": 1.34, "grad_norm": 11.81068229675293, "learning_rate": 9.829805491990848e-06, "loss": 0.9058, "step": 4701 }, { "epoch": 1.34, "grad_norm": 9.678082466125488, "learning_rate": 9.825514874141877e-06, "loss": 0.7138, "step": 4702 }, { "epoch": 1.35, "grad_norm": 9.629570007324219, "learning_rate": 9.821224256292907e-06, "loss": 0.7891, "step": 4703 }, { "epoch": 1.35, "grad_norm": 8.208858489990234, "learning_rate": 9.816933638443936e-06, "loss": 0.7526, "step": 4704 }, { "epoch": 1.35, "grad_norm": 11.03001880645752, "learning_rate": 9.812643020594967e-06, "loss": 0.9222, "step": 4705 }, { "epoch": 1.35, "grad_norm": 13.611908912658691, "learning_rate": 9.808352402745995e-06, "loss": 0.8673, "step": 4706 }, { "epoch": 1.35, "grad_norm": 11.656730651855469, "learning_rate": 9.804061784897024e-06, "loss": 0.6069, "step": 4707 }, { "epoch": 1.35, "grad_norm": 10.210548400878906, "learning_rate": 9.799771167048055e-06, "loss": 0.7342, "step": 4708 }, { "epoch": 1.35, "grad_norm": 10.285316467285156, "learning_rate": 9.795480549199085e-06, "loss": 0.9595, "step": 4709 }, { "epoch": 1.35, "grad_norm": 10.383859634399414, "learning_rate": 9.791189931350116e-06, "loss": 0.6653, "step": 4710 }, { "epoch": 1.35, "grad_norm": 9.29322338104248, "learning_rate": 9.786899313501144e-06, "loss": 0.6616, "step": 4711 }, { "epoch": 1.35, "grad_norm": 11.92971134185791, "learning_rate": 9.782608695652175e-06, "loss": 0.9191, "step": 4712 }, { "epoch": 1.35, "grad_norm": 9.166658401489258, "learning_rate": 9.778318077803204e-06, "loss": 0.6697, "step": 4713 }, { "epoch": 1.35, "grad_norm": 10.312832832336426, "learning_rate": 9.774027459954234e-06, "loss": 0.8248, "step": 4714 }, { "epoch": 1.35, "grad_norm": 12.214032173156738, "learning_rate": 9.769736842105263e-06, "loss": 0.7471, "step": 4715 }, { "epoch": 1.35, "grad_norm": 11.253389358520508, "learning_rate": 9.765446224256293e-06, "loss": 0.7021, "step": 4716 }, { "epoch": 1.35, "grad_norm": 11.42955493927002, "learning_rate": 9.761155606407322e-06, "loss": 0.7276, "step": 4717 }, { "epoch": 1.35, "grad_norm": 9.202191352844238, "learning_rate": 9.756864988558354e-06, "loss": 0.6752, "step": 4718 }, { "epoch": 1.35, "grad_norm": 10.934903144836426, "learning_rate": 9.752574370709383e-06, "loss": 0.7681, "step": 4719 }, { "epoch": 1.35, "grad_norm": 10.316263198852539, "learning_rate": 9.748283752860412e-06, "loss": 0.7321, "step": 4720 }, { "epoch": 1.35, "grad_norm": 9.91502857208252, "learning_rate": 9.743993135011442e-06, "loss": 0.8845, "step": 4721 }, { "epoch": 1.35, "grad_norm": 9.166141510009766, "learning_rate": 9.739702517162471e-06, "loss": 0.7623, "step": 4722 }, { "epoch": 1.35, "grad_norm": 9.540265083312988, "learning_rate": 9.735411899313501e-06, "loss": 0.8254, "step": 4723 }, { "epoch": 1.35, "grad_norm": 10.553126335144043, "learning_rate": 9.73112128146453e-06, "loss": 0.6477, "step": 4724 }, { "epoch": 1.35, "grad_norm": 10.808560371398926, "learning_rate": 9.72683066361556e-06, "loss": 0.8561, "step": 4725 }, { "epoch": 1.35, "grad_norm": 8.343021392822266, "learning_rate": 9.722540045766591e-06, "loss": 0.7708, "step": 4726 }, { "epoch": 1.35, "grad_norm": 8.422737121582031, "learning_rate": 9.718249427917622e-06, "loss": 0.5716, "step": 4727 }, { "epoch": 1.35, "grad_norm": 9.480222702026367, "learning_rate": 9.71395881006865e-06, "loss": 0.7948, "step": 4728 }, { "epoch": 1.35, "grad_norm": 12.601561546325684, "learning_rate": 9.70966819221968e-06, "loss": 0.9539, "step": 4729 }, { "epoch": 1.35, "grad_norm": 8.475797653198242, "learning_rate": 9.70537757437071e-06, "loss": 0.597, "step": 4730 }, { "epoch": 1.35, "grad_norm": 9.0745267868042, "learning_rate": 9.70108695652174e-06, "loss": 0.6777, "step": 4731 }, { "epoch": 1.35, "grad_norm": 9.286697387695312, "learning_rate": 9.696796338672769e-06, "loss": 0.6527, "step": 4732 }, { "epoch": 1.35, "grad_norm": 10.54956340789795, "learning_rate": 9.692505720823798e-06, "loss": 0.6245, "step": 4733 }, { "epoch": 1.35, "grad_norm": 9.258973121643066, "learning_rate": 9.688215102974828e-06, "loss": 0.6062, "step": 4734 }, { "epoch": 1.35, "grad_norm": 9.817070007324219, "learning_rate": 9.683924485125858e-06, "loss": 0.6548, "step": 4735 }, { "epoch": 1.35, "grad_norm": 7.8161301612854, "learning_rate": 9.679633867276889e-06, "loss": 0.6321, "step": 4736 }, { "epoch": 1.35, "grad_norm": 8.645702362060547, "learning_rate": 9.675343249427918e-06, "loss": 0.5972, "step": 4737 }, { "epoch": 1.36, "grad_norm": 10.85690689086914, "learning_rate": 9.671052631578948e-06, "loss": 0.7775, "step": 4738 }, { "epoch": 1.36, "grad_norm": 10.943389892578125, "learning_rate": 9.666762013729977e-06, "loss": 0.8891, "step": 4739 }, { "epoch": 1.36, "grad_norm": 9.511855125427246, "learning_rate": 9.662471395881007e-06, "loss": 0.6053, "step": 4740 }, { "epoch": 1.36, "grad_norm": 11.409897804260254, "learning_rate": 9.658180778032036e-06, "loss": 0.8391, "step": 4741 }, { "epoch": 1.36, "grad_norm": 10.824045181274414, "learning_rate": 9.653890160183067e-06, "loss": 0.7154, "step": 4742 }, { "epoch": 1.36, "grad_norm": 11.163288116455078, "learning_rate": 9.649599542334097e-06, "loss": 0.9376, "step": 4743 }, { "epoch": 1.36, "grad_norm": 12.635465621948242, "learning_rate": 9.645308924485128e-06, "loss": 0.8153, "step": 4744 }, { "epoch": 1.36, "grad_norm": 9.781368255615234, "learning_rate": 9.641018306636156e-06, "loss": 0.7844, "step": 4745 }, { "epoch": 1.36, "grad_norm": 10.899678230285645, "learning_rate": 9.636727688787185e-06, "loss": 0.8468, "step": 4746 }, { "epoch": 1.36, "grad_norm": 10.143167495727539, "learning_rate": 9.632437070938216e-06, "loss": 0.7934, "step": 4747 }, { "epoch": 1.36, "grad_norm": 10.395912170410156, "learning_rate": 9.628146453089244e-06, "loss": 0.6913, "step": 4748 }, { "epoch": 1.36, "grad_norm": 11.43149471282959, "learning_rate": 9.623855835240275e-06, "loss": 0.8784, "step": 4749 }, { "epoch": 1.36, "grad_norm": 9.007231712341309, "learning_rate": 9.619565217391304e-06, "loss": 0.671, "step": 4750 }, { "epoch": 1.36, "grad_norm": 9.453091621398926, "learning_rate": 9.615274599542334e-06, "loss": 0.7037, "step": 4751 }, { "epoch": 1.36, "grad_norm": 9.27692985534668, "learning_rate": 9.610983981693364e-06, "loss": 0.6657, "step": 4752 }, { "epoch": 1.36, "grad_norm": 10.366813659667969, "learning_rate": 9.606693363844395e-06, "loss": 0.5964, "step": 4753 }, { "epoch": 1.36, "grad_norm": 9.110732078552246, "learning_rate": 9.602402745995424e-06, "loss": 0.6431, "step": 4754 }, { "epoch": 1.36, "grad_norm": 8.982587814331055, "learning_rate": 9.598112128146454e-06, "loss": 0.5971, "step": 4755 }, { "epoch": 1.36, "grad_norm": 10.255237579345703, "learning_rate": 9.593821510297483e-06, "loss": 0.794, "step": 4756 }, { "epoch": 1.36, "grad_norm": 9.470309257507324, "learning_rate": 9.589530892448513e-06, "loss": 0.6336, "step": 4757 }, { "epoch": 1.36, "grad_norm": 10.870800018310547, "learning_rate": 9.585240274599542e-06, "loss": 1.029, "step": 4758 }, { "epoch": 1.36, "grad_norm": 10.282632827758789, "learning_rate": 9.580949656750571e-06, "loss": 1.0672, "step": 4759 }, { "epoch": 1.36, "grad_norm": 10.640077590942383, "learning_rate": 9.576659038901603e-06, "loss": 0.6764, "step": 4760 }, { "epoch": 1.36, "grad_norm": 8.993135452270508, "learning_rate": 9.572368421052632e-06, "loss": 0.6048, "step": 4761 }, { "epoch": 1.36, "grad_norm": 13.737961769104004, "learning_rate": 9.568077803203662e-06, "loss": 0.9287, "step": 4762 }, { "epoch": 1.36, "grad_norm": 11.905489921569824, "learning_rate": 9.563787185354691e-06, "loss": 0.7215, "step": 4763 }, { "epoch": 1.36, "grad_norm": 8.13138484954834, "learning_rate": 9.559496567505722e-06, "loss": 0.8252, "step": 4764 }, { "epoch": 1.36, "grad_norm": 11.043505668640137, "learning_rate": 9.55520594965675e-06, "loss": 0.8947, "step": 4765 }, { "epoch": 1.36, "grad_norm": 10.608139038085938, "learning_rate": 9.55091533180778e-06, "loss": 0.5847, "step": 4766 }, { "epoch": 1.36, "grad_norm": 8.512394905090332, "learning_rate": 9.54662471395881e-06, "loss": 0.7247, "step": 4767 }, { "epoch": 1.36, "grad_norm": 9.464174270629883, "learning_rate": 9.54233409610984e-06, "loss": 0.6077, "step": 4768 }, { "epoch": 1.36, "grad_norm": 14.590190887451172, "learning_rate": 9.53804347826087e-06, "loss": 0.8994, "step": 4769 }, { "epoch": 1.36, "grad_norm": 9.813478469848633, "learning_rate": 9.533752860411901e-06, "loss": 0.7307, "step": 4770 }, { "epoch": 1.36, "grad_norm": 9.709108352661133, "learning_rate": 9.52946224256293e-06, "loss": 0.7144, "step": 4771 }, { "epoch": 1.36, "grad_norm": 11.180831909179688, "learning_rate": 9.525171624713958e-06, "loss": 0.9568, "step": 4772 }, { "epoch": 1.37, "grad_norm": 8.907102584838867, "learning_rate": 9.520881006864989e-06, "loss": 0.6992, "step": 4773 }, { "epoch": 1.37, "grad_norm": 10.115985870361328, "learning_rate": 9.516590389016018e-06, "loss": 0.8351, "step": 4774 }, { "epoch": 1.37, "grad_norm": 10.451964378356934, "learning_rate": 9.512299771167048e-06, "loss": 1.1227, "step": 4775 }, { "epoch": 1.37, "grad_norm": 9.790129661560059, "learning_rate": 9.508009153318077e-06, "loss": 0.5994, "step": 4776 }, { "epoch": 1.37, "grad_norm": 9.342123985290527, "learning_rate": 9.503718535469107e-06, "loss": 0.5986, "step": 4777 }, { "epoch": 1.37, "grad_norm": 10.826268196105957, "learning_rate": 9.499427917620138e-06, "loss": 0.6164, "step": 4778 }, { "epoch": 1.37, "grad_norm": 12.784324645996094, "learning_rate": 9.495137299771168e-06, "loss": 0.7825, "step": 4779 }, { "epoch": 1.37, "grad_norm": 10.142254829406738, "learning_rate": 9.490846681922197e-06, "loss": 0.7858, "step": 4780 }, { "epoch": 1.37, "grad_norm": 10.85872745513916, "learning_rate": 9.486556064073227e-06, "loss": 0.8585, "step": 4781 }, { "epoch": 1.37, "grad_norm": 12.147515296936035, "learning_rate": 9.482265446224256e-06, "loss": 0.932, "step": 4782 }, { "epoch": 1.37, "grad_norm": 10.450340270996094, "learning_rate": 9.477974828375287e-06, "loss": 0.7762, "step": 4783 }, { "epoch": 1.37, "grad_norm": 10.950103759765625, "learning_rate": 9.473684210526315e-06, "loss": 0.5991, "step": 4784 }, { "epoch": 1.37, "grad_norm": 11.839818954467773, "learning_rate": 9.469393592677344e-06, "loss": 0.7055, "step": 4785 }, { "epoch": 1.37, "grad_norm": 10.563735008239746, "learning_rate": 9.465102974828376e-06, "loss": 0.8403, "step": 4786 }, { "epoch": 1.37, "grad_norm": 10.265862464904785, "learning_rate": 9.460812356979405e-06, "loss": 0.6591, "step": 4787 }, { "epoch": 1.37, "grad_norm": 9.833613395690918, "learning_rate": 9.456521739130436e-06, "loss": 0.7097, "step": 4788 }, { "epoch": 1.37, "grad_norm": 10.166386604309082, "learning_rate": 9.452231121281464e-06, "loss": 0.7283, "step": 4789 }, { "epoch": 1.37, "grad_norm": 10.88394832611084, "learning_rate": 9.447940503432495e-06, "loss": 0.67, "step": 4790 }, { "epoch": 1.37, "grad_norm": 10.023024559020996, "learning_rate": 9.443649885583524e-06, "loss": 0.7822, "step": 4791 }, { "epoch": 1.37, "grad_norm": 11.198458671569824, "learning_rate": 9.439359267734554e-06, "loss": 0.7167, "step": 4792 }, { "epoch": 1.37, "grad_norm": 8.456743240356445, "learning_rate": 9.435068649885583e-06, "loss": 0.7078, "step": 4793 }, { "epoch": 1.37, "grad_norm": 8.988679885864258, "learning_rate": 9.430778032036613e-06, "loss": 0.5257, "step": 4794 }, { "epoch": 1.37, "grad_norm": 8.99492073059082, "learning_rate": 9.426487414187644e-06, "loss": 0.7596, "step": 4795 }, { "epoch": 1.37, "grad_norm": 11.329377174377441, "learning_rate": 9.422196796338674e-06, "loss": 0.741, "step": 4796 }, { "epoch": 1.37, "grad_norm": 9.76819896697998, "learning_rate": 9.417906178489703e-06, "loss": 0.661, "step": 4797 }, { "epoch": 1.37, "grad_norm": 10.368531227111816, "learning_rate": 9.413615560640732e-06, "loss": 0.6498, "step": 4798 }, { "epoch": 1.37, "grad_norm": 13.120408058166504, "learning_rate": 9.409324942791762e-06, "loss": 0.7884, "step": 4799 }, { "epoch": 1.37, "grad_norm": 11.418519973754883, "learning_rate": 9.405034324942791e-06, "loss": 0.73, "step": 4800 }, { "epoch": 1.37, "grad_norm": 8.889922142028809, "learning_rate": 9.400743707093821e-06, "loss": 0.6517, "step": 4801 }, { "epoch": 1.37, "grad_norm": 8.791597366333008, "learning_rate": 9.39645308924485e-06, "loss": 0.5533, "step": 4802 }, { "epoch": 1.37, "grad_norm": 10.5639009475708, "learning_rate": 9.392162471395882e-06, "loss": 0.8333, "step": 4803 }, { "epoch": 1.37, "grad_norm": 10.931742668151855, "learning_rate": 9.387871853546911e-06, "loss": 0.6539, "step": 4804 }, { "epoch": 1.37, "grad_norm": 10.472341537475586, "learning_rate": 9.383581235697942e-06, "loss": 0.762, "step": 4805 }, { "epoch": 1.37, "grad_norm": 12.587716102600098, "learning_rate": 9.37929061784897e-06, "loss": 0.7897, "step": 4806 }, { "epoch": 1.38, "grad_norm": 11.926453590393066, "learning_rate": 9.375000000000001e-06, "loss": 0.9278, "step": 4807 }, { "epoch": 1.38, "grad_norm": 13.291482925415039, "learning_rate": 9.37070938215103e-06, "loss": 0.8511, "step": 4808 }, { "epoch": 1.38, "grad_norm": 10.288946151733398, "learning_rate": 9.36641876430206e-06, "loss": 0.929, "step": 4809 }, { "epoch": 1.38, "grad_norm": 10.623122215270996, "learning_rate": 9.362128146453089e-06, "loss": 0.6458, "step": 4810 }, { "epoch": 1.38, "grad_norm": 10.468912124633789, "learning_rate": 9.357837528604118e-06, "loss": 0.6873, "step": 4811 }, { "epoch": 1.38, "grad_norm": 11.56838607788086, "learning_rate": 9.35354691075515e-06, "loss": 0.7875, "step": 4812 }, { "epoch": 1.38, "grad_norm": 11.725969314575195, "learning_rate": 9.349256292906179e-06, "loss": 0.8911, "step": 4813 }, { "epoch": 1.38, "grad_norm": 9.419934272766113, "learning_rate": 9.344965675057209e-06, "loss": 0.7614, "step": 4814 }, { "epoch": 1.38, "grad_norm": 11.268380165100098, "learning_rate": 9.340675057208238e-06, "loss": 0.7354, "step": 4815 }, { "epoch": 1.38, "grad_norm": 10.358317375183105, "learning_rate": 9.336384439359268e-06, "loss": 0.9548, "step": 4816 }, { "epoch": 1.38, "grad_norm": 8.72009563446045, "learning_rate": 9.332093821510297e-06, "loss": 0.582, "step": 4817 }, { "epoch": 1.38, "grad_norm": 10.13125991821289, "learning_rate": 9.327803203661327e-06, "loss": 0.6966, "step": 4818 }, { "epoch": 1.38, "grad_norm": 9.3549222946167, "learning_rate": 9.323512585812356e-06, "loss": 0.6896, "step": 4819 }, { "epoch": 1.38, "grad_norm": 10.324894905090332, "learning_rate": 9.319221967963388e-06, "loss": 0.7683, "step": 4820 }, { "epoch": 1.38, "grad_norm": 8.656771659851074, "learning_rate": 9.314931350114417e-06, "loss": 0.5893, "step": 4821 }, { "epoch": 1.38, "grad_norm": 11.520030975341797, "learning_rate": 9.310640732265448e-06, "loss": 1.0155, "step": 4822 }, { "epoch": 1.38, "grad_norm": 10.97884464263916, "learning_rate": 9.306350114416476e-06, "loss": 0.7444, "step": 4823 }, { "epoch": 1.38, "grad_norm": 13.074646949768066, "learning_rate": 9.302059496567505e-06, "loss": 0.9445, "step": 4824 }, { "epoch": 1.38, "grad_norm": 12.072041511535645, "learning_rate": 9.297768878718536e-06, "loss": 0.9269, "step": 4825 }, { "epoch": 1.38, "grad_norm": 8.852041244506836, "learning_rate": 9.293478260869564e-06, "loss": 0.585, "step": 4826 }, { "epoch": 1.38, "grad_norm": 10.590184211730957, "learning_rate": 9.289187643020595e-06, "loss": 0.7778, "step": 4827 }, { "epoch": 1.38, "grad_norm": 11.690543174743652, "learning_rate": 9.284897025171624e-06, "loss": 0.878, "step": 4828 }, { "epoch": 1.38, "grad_norm": 10.135167121887207, "learning_rate": 9.280606407322656e-06, "loss": 0.9343, "step": 4829 }, { "epoch": 1.38, "grad_norm": 8.094252586364746, "learning_rate": 9.276315789473685e-06, "loss": 0.6903, "step": 4830 }, { "epoch": 1.38, "grad_norm": 10.056294441223145, "learning_rate": 9.272025171624715e-06, "loss": 0.8418, "step": 4831 }, { "epoch": 1.38, "grad_norm": 11.848834037780762, "learning_rate": 9.267734553775744e-06, "loss": 0.6992, "step": 4832 }, { "epoch": 1.38, "grad_norm": 10.331584930419922, "learning_rate": 9.263443935926774e-06, "loss": 0.9414, "step": 4833 }, { "epoch": 1.38, "grad_norm": 9.208252906799316, "learning_rate": 9.259153318077803e-06, "loss": 0.5844, "step": 4834 }, { "epoch": 1.38, "grad_norm": 9.118937492370605, "learning_rate": 9.254862700228833e-06, "loss": 0.9003, "step": 4835 }, { "epoch": 1.38, "grad_norm": 9.917899131774902, "learning_rate": 9.250572082379862e-06, "loss": 0.6907, "step": 4836 }, { "epoch": 1.38, "grad_norm": 11.160566329956055, "learning_rate": 9.246281464530893e-06, "loss": 0.7855, "step": 4837 }, { "epoch": 1.38, "grad_norm": 9.2199125289917, "learning_rate": 9.241990846681923e-06, "loss": 0.7515, "step": 4838 }, { "epoch": 1.38, "grad_norm": 10.523056983947754, "learning_rate": 9.237700228832952e-06, "loss": 0.862, "step": 4839 }, { "epoch": 1.38, "grad_norm": 8.743433952331543, "learning_rate": 9.233409610983982e-06, "loss": 0.5145, "step": 4840 }, { "epoch": 1.38, "grad_norm": 10.881135940551758, "learning_rate": 9.229118993135011e-06, "loss": 0.8192, "step": 4841 }, { "epoch": 1.39, "grad_norm": 7.766360282897949, "learning_rate": 9.224828375286042e-06, "loss": 0.528, "step": 4842 }, { "epoch": 1.39, "grad_norm": 10.022210121154785, "learning_rate": 9.22053775743707e-06, "loss": 0.7403, "step": 4843 }, { "epoch": 1.39, "grad_norm": 10.161805152893066, "learning_rate": 9.216247139588101e-06, "loss": 0.8515, "step": 4844 }, { "epoch": 1.39, "grad_norm": 10.7399263381958, "learning_rate": 9.21195652173913e-06, "loss": 0.7869, "step": 4845 }, { "epoch": 1.39, "grad_norm": 10.652128219604492, "learning_rate": 9.207665903890162e-06, "loss": 0.6249, "step": 4846 }, { "epoch": 1.39, "grad_norm": 13.797903060913086, "learning_rate": 9.20337528604119e-06, "loss": 0.8755, "step": 4847 }, { "epoch": 1.39, "grad_norm": 10.382393836975098, "learning_rate": 9.199084668192221e-06, "loss": 1.107, "step": 4848 }, { "epoch": 1.39, "grad_norm": 11.453968048095703, "learning_rate": 9.19479405034325e-06, "loss": 0.9609, "step": 4849 }, { "epoch": 1.39, "grad_norm": 9.114245414733887, "learning_rate": 9.190503432494279e-06, "loss": 0.5484, "step": 4850 }, { "epoch": 1.39, "grad_norm": 10.182549476623535, "learning_rate": 9.186212814645309e-06, "loss": 0.6344, "step": 4851 }, { "epoch": 1.39, "grad_norm": 12.673012733459473, "learning_rate": 9.181922196796338e-06, "loss": 0.9958, "step": 4852 }, { "epoch": 1.39, "grad_norm": 9.4663667678833, "learning_rate": 9.177631578947368e-06, "loss": 0.7675, "step": 4853 }, { "epoch": 1.39, "grad_norm": 9.954926490783691, "learning_rate": 9.173340961098397e-06, "loss": 0.7907, "step": 4854 }, { "epoch": 1.39, "grad_norm": 10.192892074584961, "learning_rate": 9.16905034324943e-06, "loss": 0.6709, "step": 4855 }, { "epoch": 1.39, "grad_norm": 10.08601188659668, "learning_rate": 9.164759725400458e-06, "loss": 0.6938, "step": 4856 }, { "epoch": 1.39, "grad_norm": 11.993782997131348, "learning_rate": 9.160469107551488e-06, "loss": 0.9492, "step": 4857 }, { "epoch": 1.39, "grad_norm": 8.177780151367188, "learning_rate": 9.156178489702517e-06, "loss": 0.512, "step": 4858 }, { "epoch": 1.39, "grad_norm": 9.7926025390625, "learning_rate": 9.151887871853548e-06, "loss": 0.6743, "step": 4859 }, { "epoch": 1.39, "grad_norm": 13.589574813842773, "learning_rate": 9.147597254004576e-06, "loss": 0.7877, "step": 4860 }, { "epoch": 1.39, "grad_norm": 8.5984525680542, "learning_rate": 9.143306636155607e-06, "loss": 0.6091, "step": 4861 }, { "epoch": 1.39, "grad_norm": 9.036418914794922, "learning_rate": 9.139016018306636e-06, "loss": 0.7091, "step": 4862 }, { "epoch": 1.39, "grad_norm": 9.252519607543945, "learning_rate": 9.134725400457666e-06, "loss": 0.7672, "step": 4863 }, { "epoch": 1.39, "grad_norm": 11.83324146270752, "learning_rate": 9.130434782608697e-06, "loss": 1.0583, "step": 4864 }, { "epoch": 1.39, "grad_norm": 9.944408416748047, "learning_rate": 9.126144164759725e-06, "loss": 0.9297, "step": 4865 }, { "epoch": 1.39, "grad_norm": 9.997650146484375, "learning_rate": 9.121853546910756e-06, "loss": 1.0245, "step": 4866 }, { "epoch": 1.39, "grad_norm": 8.75586223602295, "learning_rate": 9.117562929061785e-06, "loss": 0.5483, "step": 4867 }, { "epoch": 1.39, "grad_norm": 11.224337577819824, "learning_rate": 9.113272311212815e-06, "loss": 0.6652, "step": 4868 }, { "epoch": 1.39, "grad_norm": 7.3334479331970215, "learning_rate": 9.108981693363844e-06, "loss": 0.634, "step": 4869 }, { "epoch": 1.39, "grad_norm": 10.251147270202637, "learning_rate": 9.104691075514874e-06, "loss": 0.5847, "step": 4870 }, { "epoch": 1.39, "grad_norm": 9.901888847351074, "learning_rate": 9.100400457665903e-06, "loss": 0.6054, "step": 4871 }, { "epoch": 1.39, "grad_norm": 10.24634075164795, "learning_rate": 9.096109839816935e-06, "loss": 0.8071, "step": 4872 }, { "epoch": 1.39, "grad_norm": 10.110356330871582, "learning_rate": 9.091819221967964e-06, "loss": 0.7466, "step": 4873 }, { "epoch": 1.39, "grad_norm": 9.721631050109863, "learning_rate": 9.087528604118994e-06, "loss": 0.6514, "step": 4874 }, { "epoch": 1.39, "grad_norm": 7.648428916931152, "learning_rate": 9.083237986270023e-06, "loss": 0.5112, "step": 4875 }, { "epoch": 1.39, "grad_norm": 11.041352272033691, "learning_rate": 9.078947368421054e-06, "loss": 1.0079, "step": 4876 }, { "epoch": 1.4, "grad_norm": 7.543818950653076, "learning_rate": 9.074656750572082e-06, "loss": 0.5, "step": 4877 }, { "epoch": 1.4, "grad_norm": 7.59186315536499, "learning_rate": 9.070366132723111e-06, "loss": 0.6879, "step": 4878 }, { "epoch": 1.4, "grad_norm": 12.302193641662598, "learning_rate": 9.066075514874142e-06, "loss": 0.8182, "step": 4879 }, { "epoch": 1.4, "grad_norm": 7.864506244659424, "learning_rate": 9.061784897025172e-06, "loss": 0.7759, "step": 4880 }, { "epoch": 1.4, "grad_norm": 11.609857559204102, "learning_rate": 9.057494279176203e-06, "loss": 0.5229, "step": 4881 }, { "epoch": 1.4, "grad_norm": 7.691769123077393, "learning_rate": 9.053203661327231e-06, "loss": 0.6559, "step": 4882 }, { "epoch": 1.4, "grad_norm": 9.675695419311523, "learning_rate": 9.048913043478262e-06, "loss": 0.4562, "step": 4883 }, { "epoch": 1.4, "grad_norm": 10.433341026306152, "learning_rate": 9.04462242562929e-06, "loss": 0.8334, "step": 4884 }, { "epoch": 1.4, "grad_norm": 11.0515775680542, "learning_rate": 9.040331807780321e-06, "loss": 0.8456, "step": 4885 }, { "epoch": 1.4, "grad_norm": 10.080611228942871, "learning_rate": 9.03604118993135e-06, "loss": 0.718, "step": 4886 }, { "epoch": 1.4, "grad_norm": 11.835367202758789, "learning_rate": 9.03175057208238e-06, "loss": 0.9327, "step": 4887 }, { "epoch": 1.4, "grad_norm": 10.179398536682129, "learning_rate": 9.027459954233409e-06, "loss": 0.7596, "step": 4888 }, { "epoch": 1.4, "grad_norm": 10.481108665466309, "learning_rate": 9.02316933638444e-06, "loss": 0.7859, "step": 4889 }, { "epoch": 1.4, "grad_norm": 9.284229278564453, "learning_rate": 9.01887871853547e-06, "loss": 0.745, "step": 4890 }, { "epoch": 1.4, "grad_norm": 11.10659408569336, "learning_rate": 9.014588100686499e-06, "loss": 0.8434, "step": 4891 }, { "epoch": 1.4, "grad_norm": 9.546857833862305, "learning_rate": 9.010297482837529e-06, "loss": 0.8202, "step": 4892 }, { "epoch": 1.4, "grad_norm": 12.39665699005127, "learning_rate": 9.006006864988558e-06, "loss": 0.8723, "step": 4893 }, { "epoch": 1.4, "grad_norm": 12.502572059631348, "learning_rate": 9.001716247139588e-06, "loss": 1.0049, "step": 4894 }, { "epoch": 1.4, "grad_norm": 8.685144424438477, "learning_rate": 8.997425629290617e-06, "loss": 0.6399, "step": 4895 }, { "epoch": 1.4, "grad_norm": 12.4518404006958, "learning_rate": 8.993135011441648e-06, "loss": 0.9697, "step": 4896 }, { "epoch": 1.4, "grad_norm": 8.57666015625, "learning_rate": 8.988844393592678e-06, "loss": 0.5471, "step": 4897 }, { "epoch": 1.4, "grad_norm": 8.991373062133789, "learning_rate": 8.984553775743709e-06, "loss": 0.6502, "step": 4898 }, { "epoch": 1.4, "grad_norm": 8.511014938354492, "learning_rate": 8.980263157894737e-06, "loss": 0.5548, "step": 4899 }, { "epoch": 1.4, "grad_norm": 9.343938827514648, "learning_rate": 8.975972540045768e-06, "loss": 0.8212, "step": 4900 }, { "epoch": 1.4, "grad_norm": 12.516120910644531, "learning_rate": 8.971681922196797e-06, "loss": 0.8418, "step": 4901 }, { "epoch": 1.4, "grad_norm": 9.186525344848633, "learning_rate": 8.967391304347827e-06, "loss": 0.6415, "step": 4902 }, { "epoch": 1.4, "grad_norm": 9.080911636352539, "learning_rate": 8.963100686498856e-06, "loss": 0.7878, "step": 4903 }, { "epoch": 1.4, "grad_norm": 9.257999420166016, "learning_rate": 8.958810068649885e-06, "loss": 0.702, "step": 4904 }, { "epoch": 1.4, "grad_norm": 11.845600128173828, "learning_rate": 8.954519450800915e-06, "loss": 0.7782, "step": 4905 }, { "epoch": 1.4, "grad_norm": 12.40800666809082, "learning_rate": 8.950228832951945e-06, "loss": 0.9072, "step": 4906 }, { "epoch": 1.4, "grad_norm": 10.087108612060547, "learning_rate": 8.945938215102976e-06, "loss": 0.9709, "step": 4907 }, { "epoch": 1.4, "grad_norm": 10.10132884979248, "learning_rate": 8.941647597254005e-06, "loss": 1.0056, "step": 4908 }, { "epoch": 1.4, "grad_norm": 12.176788330078125, "learning_rate": 8.937356979405035e-06, "loss": 0.7682, "step": 4909 }, { "epoch": 1.4, "grad_norm": 7.847817420959473, "learning_rate": 8.933066361556064e-06, "loss": 0.5401, "step": 4910 }, { "epoch": 1.4, "grad_norm": 6.8500895500183105, "learning_rate": 8.928775743707094e-06, "loss": 0.4209, "step": 4911 }, { "epoch": 1.41, "grad_norm": 11.234733581542969, "learning_rate": 8.924485125858123e-06, "loss": 0.6664, "step": 4912 }, { "epoch": 1.41, "grad_norm": 9.983650207519531, "learning_rate": 8.920194508009154e-06, "loss": 0.5712, "step": 4913 }, { "epoch": 1.41, "grad_norm": 11.488990783691406, "learning_rate": 8.915903890160182e-06, "loss": 0.9202, "step": 4914 }, { "epoch": 1.41, "grad_norm": 10.480536460876465, "learning_rate": 8.911613272311215e-06, "loss": 0.5609, "step": 4915 }, { "epoch": 1.41, "grad_norm": 10.097199440002441, "learning_rate": 8.907322654462243e-06, "loss": 0.6498, "step": 4916 }, { "epoch": 1.41, "grad_norm": 12.0875825881958, "learning_rate": 8.903032036613272e-06, "loss": 0.7561, "step": 4917 }, { "epoch": 1.41, "grad_norm": 9.851820945739746, "learning_rate": 8.898741418764303e-06, "loss": 0.609, "step": 4918 }, { "epoch": 1.41, "grad_norm": 10.984374046325684, "learning_rate": 8.894450800915331e-06, "loss": 0.6938, "step": 4919 }, { "epoch": 1.41, "grad_norm": 10.526418685913086, "learning_rate": 8.890160183066362e-06, "loss": 0.9217, "step": 4920 }, { "epoch": 1.41, "grad_norm": 11.407646179199219, "learning_rate": 8.88586956521739e-06, "loss": 0.7912, "step": 4921 }, { "epoch": 1.41, "grad_norm": 13.315333366394043, "learning_rate": 8.881578947368421e-06, "loss": 0.7204, "step": 4922 }, { "epoch": 1.41, "grad_norm": 11.823431015014648, "learning_rate": 8.877288329519451e-06, "loss": 0.8202, "step": 4923 }, { "epoch": 1.41, "grad_norm": 12.348017692565918, "learning_rate": 8.872997711670482e-06, "loss": 0.8588, "step": 4924 }, { "epoch": 1.41, "grad_norm": 9.585533142089844, "learning_rate": 8.86870709382151e-06, "loss": 0.7297, "step": 4925 }, { "epoch": 1.41, "grad_norm": 10.81173324584961, "learning_rate": 8.864416475972541e-06, "loss": 0.7518, "step": 4926 }, { "epoch": 1.41, "grad_norm": 10.336341857910156, "learning_rate": 8.86012585812357e-06, "loss": 0.7924, "step": 4927 }, { "epoch": 1.41, "grad_norm": 10.460423469543457, "learning_rate": 8.8558352402746e-06, "loss": 0.7537, "step": 4928 }, { "epoch": 1.41, "grad_norm": 10.195754051208496, "learning_rate": 8.851544622425629e-06, "loss": 0.7238, "step": 4929 }, { "epoch": 1.41, "grad_norm": 11.2444429397583, "learning_rate": 8.847254004576658e-06, "loss": 0.7621, "step": 4930 }, { "epoch": 1.41, "grad_norm": 10.065338134765625, "learning_rate": 8.842963386727688e-06, "loss": 0.5536, "step": 4931 }, { "epoch": 1.41, "grad_norm": 11.432260513305664, "learning_rate": 8.838672768878719e-06, "loss": 0.8951, "step": 4932 }, { "epoch": 1.41, "grad_norm": 12.106633186340332, "learning_rate": 8.83438215102975e-06, "loss": 1.0926, "step": 4933 }, { "epoch": 1.41, "grad_norm": 9.704801559448242, "learning_rate": 8.830091533180778e-06, "loss": 0.8252, "step": 4934 }, { "epoch": 1.41, "grad_norm": 8.296677589416504, "learning_rate": 8.825800915331809e-06, "loss": 0.4684, "step": 4935 }, { "epoch": 1.41, "grad_norm": 10.15492057800293, "learning_rate": 8.821510297482837e-06, "loss": 0.6786, "step": 4936 }, { "epoch": 1.41, "grad_norm": 8.982447624206543, "learning_rate": 8.817219679633868e-06, "loss": 0.6677, "step": 4937 }, { "epoch": 1.41, "grad_norm": 8.942950248718262, "learning_rate": 8.812929061784897e-06, "loss": 0.6398, "step": 4938 }, { "epoch": 1.41, "grad_norm": 8.501655578613281, "learning_rate": 8.808638443935927e-06, "loss": 0.6015, "step": 4939 }, { "epoch": 1.41, "grad_norm": 9.412524223327637, "learning_rate": 8.804347826086957e-06, "loss": 0.8239, "step": 4940 }, { "epoch": 1.41, "grad_norm": 8.406737327575684, "learning_rate": 8.800057208237988e-06, "loss": 0.5314, "step": 4941 }, { "epoch": 1.41, "grad_norm": 9.219732284545898, "learning_rate": 8.795766590389017e-06, "loss": 0.6431, "step": 4942 }, { "epoch": 1.41, "grad_norm": 8.9622220993042, "learning_rate": 8.791475972540045e-06, "loss": 0.6074, "step": 4943 }, { "epoch": 1.41, "grad_norm": 10.373034477233887, "learning_rate": 8.787185354691076e-06, "loss": 0.7059, "step": 4944 }, { "epoch": 1.41, "grad_norm": 10.128436088562012, "learning_rate": 8.782894736842105e-06, "loss": 0.6946, "step": 4945 }, { "epoch": 1.41, "grad_norm": 13.03666877746582, "learning_rate": 8.778604118993135e-06, "loss": 0.7183, "step": 4946 }, { "epoch": 1.42, "grad_norm": 10.098986625671387, "learning_rate": 8.774313501144164e-06, "loss": 0.7109, "step": 4947 }, { "epoch": 1.42, "grad_norm": 10.848021507263184, "learning_rate": 8.770022883295194e-06, "loss": 0.6452, "step": 4948 }, { "epoch": 1.42, "grad_norm": 8.350137710571289, "learning_rate": 8.765732265446225e-06, "loss": 0.6083, "step": 4949 }, { "epoch": 1.42, "grad_norm": 9.368001937866211, "learning_rate": 8.761441647597255e-06, "loss": 0.7013, "step": 4950 }, { "epoch": 1.42, "grad_norm": 11.08501148223877, "learning_rate": 8.757151029748284e-06, "loss": 0.7593, "step": 4951 }, { "epoch": 1.42, "grad_norm": 12.226153373718262, "learning_rate": 8.752860411899315e-06, "loss": 0.6093, "step": 4952 }, { "epoch": 1.42, "grad_norm": 10.081727981567383, "learning_rate": 8.748569794050343e-06, "loss": 0.798, "step": 4953 }, { "epoch": 1.42, "grad_norm": 7.829829692840576, "learning_rate": 8.744279176201374e-06, "loss": 0.4565, "step": 4954 }, { "epoch": 1.42, "grad_norm": 11.475992202758789, "learning_rate": 8.739988558352403e-06, "loss": 0.6535, "step": 4955 }, { "epoch": 1.42, "grad_norm": 11.957167625427246, "learning_rate": 8.735697940503431e-06, "loss": 0.9409, "step": 4956 }, { "epoch": 1.42, "grad_norm": 8.489669799804688, "learning_rate": 8.731407322654463e-06, "loss": 0.7229, "step": 4957 }, { "epoch": 1.42, "grad_norm": 12.282538414001465, "learning_rate": 8.727116704805492e-06, "loss": 0.7623, "step": 4958 }, { "epoch": 1.42, "grad_norm": 8.948768615722656, "learning_rate": 8.722826086956523e-06, "loss": 0.6361, "step": 4959 }, { "epoch": 1.42, "grad_norm": 10.072561264038086, "learning_rate": 8.718535469107551e-06, "loss": 0.7876, "step": 4960 }, { "epoch": 1.42, "grad_norm": 9.630600929260254, "learning_rate": 8.714244851258582e-06, "loss": 0.5635, "step": 4961 }, { "epoch": 1.42, "grad_norm": 9.96124267578125, "learning_rate": 8.70995423340961e-06, "loss": 0.8819, "step": 4962 }, { "epoch": 1.42, "grad_norm": 10.096244812011719, "learning_rate": 8.705663615560641e-06, "loss": 0.8045, "step": 4963 }, { "epoch": 1.42, "grad_norm": 8.456628799438477, "learning_rate": 8.70137299771167e-06, "loss": 0.546, "step": 4964 }, { "epoch": 1.42, "grad_norm": 11.215423583984375, "learning_rate": 8.6970823798627e-06, "loss": 0.7391, "step": 4965 }, { "epoch": 1.42, "grad_norm": 10.309953689575195, "learning_rate": 8.69279176201373e-06, "loss": 0.7174, "step": 4966 }, { "epoch": 1.42, "grad_norm": 12.189631462097168, "learning_rate": 8.688501144164761e-06, "loss": 1.013, "step": 4967 }, { "epoch": 1.42, "grad_norm": 10.816985130310059, "learning_rate": 8.68421052631579e-06, "loss": 0.8628, "step": 4968 }, { "epoch": 1.42, "grad_norm": 9.234234809875488, "learning_rate": 8.679919908466819e-06, "loss": 0.8081, "step": 4969 }, { "epoch": 1.42, "grad_norm": 11.34066390991211, "learning_rate": 8.67562929061785e-06, "loss": 0.8229, "step": 4970 }, { "epoch": 1.42, "grad_norm": 10.21949291229248, "learning_rate": 8.671338672768878e-06, "loss": 0.5068, "step": 4971 }, { "epoch": 1.42, "grad_norm": 10.102566719055176, "learning_rate": 8.667048054919909e-06, "loss": 0.7015, "step": 4972 }, { "epoch": 1.42, "grad_norm": 8.445645332336426, "learning_rate": 8.662757437070937e-06, "loss": 0.6165, "step": 4973 }, { "epoch": 1.42, "grad_norm": 8.391802787780762, "learning_rate": 8.658466819221968e-06, "loss": 0.6852, "step": 4974 }, { "epoch": 1.42, "grad_norm": 10.527009963989258, "learning_rate": 8.654176201372998e-06, "loss": 0.6214, "step": 4975 }, { "epoch": 1.42, "grad_norm": 9.231841087341309, "learning_rate": 8.649885583524029e-06, "loss": 0.7137, "step": 4976 }, { "epoch": 1.42, "grad_norm": 9.761481285095215, "learning_rate": 8.645594965675057e-06, "loss": 0.8636, "step": 4977 }, { "epoch": 1.42, "grad_norm": 8.89303970336914, "learning_rate": 8.641304347826088e-06, "loss": 0.5767, "step": 4978 }, { "epoch": 1.42, "grad_norm": 12.108701705932617, "learning_rate": 8.637013729977117e-06, "loss": 0.6581, "step": 4979 }, { "epoch": 1.42, "grad_norm": 9.690510749816895, "learning_rate": 8.632723112128147e-06, "loss": 0.6257, "step": 4980 }, { "epoch": 1.42, "grad_norm": 11.89633560180664, "learning_rate": 8.628432494279176e-06, "loss": 0.8569, "step": 4981 }, { "epoch": 1.43, "grad_norm": 11.14169979095459, "learning_rate": 8.624141876430205e-06, "loss": 0.7619, "step": 4982 }, { "epoch": 1.43, "grad_norm": 10.043008804321289, "learning_rate": 8.619851258581237e-06, "loss": 0.6162, "step": 4983 }, { "epoch": 1.43, "grad_norm": 8.896636009216309, "learning_rate": 8.615560640732266e-06, "loss": 0.5833, "step": 4984 }, { "epoch": 1.43, "grad_norm": 9.469889640808105, "learning_rate": 8.611270022883296e-06, "loss": 0.8342, "step": 4985 }, { "epoch": 1.43, "grad_norm": 11.050342559814453, "learning_rate": 8.606979405034325e-06, "loss": 0.6791, "step": 4986 }, { "epoch": 1.43, "grad_norm": 11.315376281738281, "learning_rate": 8.602688787185355e-06, "loss": 0.7415, "step": 4987 }, { "epoch": 1.43, "grad_norm": 11.068825721740723, "learning_rate": 8.598398169336384e-06, "loss": 0.6817, "step": 4988 }, { "epoch": 1.43, "grad_norm": 8.905261993408203, "learning_rate": 8.594107551487415e-06, "loss": 0.4884, "step": 4989 }, { "epoch": 1.43, "grad_norm": 11.311631202697754, "learning_rate": 8.589816933638443e-06, "loss": 0.8217, "step": 4990 }, { "epoch": 1.43, "grad_norm": 10.067758560180664, "learning_rate": 8.585526315789474e-06, "loss": 0.7261, "step": 4991 }, { "epoch": 1.43, "grad_norm": 11.24259090423584, "learning_rate": 8.581235697940504e-06, "loss": 0.7978, "step": 4992 }, { "epoch": 1.43, "grad_norm": 10.914911270141602, "learning_rate": 8.576945080091535e-06, "loss": 0.8629, "step": 4993 }, { "epoch": 1.43, "grad_norm": 10.073575019836426, "learning_rate": 8.572654462242563e-06, "loss": 0.7815, "step": 4994 }, { "epoch": 1.43, "grad_norm": 9.616877555847168, "learning_rate": 8.568363844393592e-06, "loss": 0.639, "step": 4995 }, { "epoch": 1.43, "grad_norm": 11.003323554992676, "learning_rate": 8.564073226544623e-06, "loss": 0.5851, "step": 4996 }, { "epoch": 1.43, "grad_norm": 10.080148696899414, "learning_rate": 8.559782608695651e-06, "loss": 0.6707, "step": 4997 }, { "epoch": 1.43, "grad_norm": 8.979016304016113, "learning_rate": 8.555491990846682e-06, "loss": 0.7034, "step": 4998 }, { "epoch": 1.43, "grad_norm": 12.307140350341797, "learning_rate": 8.55120137299771e-06, "loss": 0.9493, "step": 4999 }, { "epoch": 1.43, "grad_norm": 10.767085075378418, "learning_rate": 8.546910755148743e-06, "loss": 0.6093, "step": 5000 }, { "epoch": 1.43, "grad_norm": 10.213473320007324, "learning_rate": 8.542620137299772e-06, "loss": 0.6934, "step": 5001 }, { "epoch": 1.43, "grad_norm": 9.938896179199219, "learning_rate": 8.538329519450802e-06, "loss": 0.7864, "step": 5002 }, { "epoch": 1.43, "grad_norm": 9.872451782226562, "learning_rate": 8.53403890160183e-06, "loss": 0.7673, "step": 5003 }, { "epoch": 1.43, "grad_norm": 10.641152381896973, "learning_rate": 8.529748283752861e-06, "loss": 1.0319, "step": 5004 }, { "epoch": 1.43, "grad_norm": 10.038233757019043, "learning_rate": 8.52545766590389e-06, "loss": 0.6311, "step": 5005 }, { "epoch": 1.43, "grad_norm": 9.689718246459961, "learning_rate": 8.52116704805492e-06, "loss": 0.8566, "step": 5006 }, { "epoch": 1.43, "grad_norm": 9.882425308227539, "learning_rate": 8.51687643020595e-06, "loss": 0.5638, "step": 5007 }, { "epoch": 1.43, "grad_norm": 9.150846481323242, "learning_rate": 8.512585812356978e-06, "loss": 0.5935, "step": 5008 }, { "epoch": 1.43, "grad_norm": 10.776080131530762, "learning_rate": 8.50829519450801e-06, "loss": 0.8544, "step": 5009 }, { "epoch": 1.43, "grad_norm": 10.461644172668457, "learning_rate": 8.504004576659039e-06, "loss": 0.832, "step": 5010 }, { "epoch": 1.43, "grad_norm": 11.013406753540039, "learning_rate": 8.49971395881007e-06, "loss": 0.7076, "step": 5011 }, { "epoch": 1.43, "grad_norm": 8.372725486755371, "learning_rate": 8.495423340961098e-06, "loss": 0.6376, "step": 5012 }, { "epoch": 1.43, "grad_norm": 13.2879638671875, "learning_rate": 8.491132723112129e-06, "loss": 0.7582, "step": 5013 }, { "epoch": 1.43, "grad_norm": 11.984237670898438, "learning_rate": 8.486842105263157e-06, "loss": 0.8337, "step": 5014 }, { "epoch": 1.43, "grad_norm": 8.97717571258545, "learning_rate": 8.482551487414188e-06, "loss": 0.7792, "step": 5015 }, { "epoch": 1.43, "grad_norm": 11.499009132385254, "learning_rate": 8.478260869565217e-06, "loss": 0.8281, "step": 5016 }, { "epoch": 1.44, "grad_norm": 8.801410675048828, "learning_rate": 8.473970251716249e-06, "loss": 0.7705, "step": 5017 }, { "epoch": 1.44, "grad_norm": 9.285280227661133, "learning_rate": 8.469679633867278e-06, "loss": 0.557, "step": 5018 }, { "epoch": 1.44, "grad_norm": 8.304474830627441, "learning_rate": 8.465389016018308e-06, "loss": 0.5849, "step": 5019 }, { "epoch": 1.44, "grad_norm": 9.440895080566406, "learning_rate": 8.461098398169337e-06, "loss": 0.5966, "step": 5020 }, { "epoch": 1.44, "grad_norm": 16.106412887573242, "learning_rate": 8.456807780320366e-06, "loss": 0.8179, "step": 5021 }, { "epoch": 1.44, "grad_norm": 11.817154884338379, "learning_rate": 8.452517162471396e-06, "loss": 0.7659, "step": 5022 }, { "epoch": 1.44, "grad_norm": 11.135847091674805, "learning_rate": 8.448226544622425e-06, "loss": 0.762, "step": 5023 }, { "epoch": 1.44, "grad_norm": 11.009657859802246, "learning_rate": 8.443935926773455e-06, "loss": 0.8869, "step": 5024 }, { "epoch": 1.44, "grad_norm": 9.861078262329102, "learning_rate": 8.439645308924484e-06, "loss": 0.6355, "step": 5025 }, { "epoch": 1.44, "grad_norm": 8.525294303894043, "learning_rate": 8.435354691075516e-06, "loss": 0.6068, "step": 5026 }, { "epoch": 1.44, "grad_norm": 15.591021537780762, "learning_rate": 8.431064073226545e-06, "loss": 1.1226, "step": 5027 }, { "epoch": 1.44, "grad_norm": 8.61467456817627, "learning_rate": 8.426773455377575e-06, "loss": 0.4673, "step": 5028 }, { "epoch": 1.44, "grad_norm": 11.177366256713867, "learning_rate": 8.422482837528604e-06, "loss": 0.7149, "step": 5029 }, { "epoch": 1.44, "grad_norm": 10.839001655578613, "learning_rate": 8.418192219679635e-06, "loss": 0.7335, "step": 5030 }, { "epoch": 1.44, "grad_norm": 10.47923755645752, "learning_rate": 8.413901601830663e-06, "loss": 0.77, "step": 5031 }, { "epoch": 1.44, "grad_norm": 10.473682403564453, "learning_rate": 8.409610983981694e-06, "loss": 0.5729, "step": 5032 }, { "epoch": 1.44, "grad_norm": 13.123830795288086, "learning_rate": 8.405320366132723e-06, "loss": 0.6412, "step": 5033 }, { "epoch": 1.44, "grad_norm": 13.672239303588867, "learning_rate": 8.401029748283751e-06, "loss": 0.8529, "step": 5034 }, { "epoch": 1.44, "grad_norm": 9.248321533203125, "learning_rate": 8.396739130434784e-06, "loss": 0.5481, "step": 5035 }, { "epoch": 1.44, "grad_norm": 11.924641609191895, "learning_rate": 8.392448512585812e-06, "loss": 0.9527, "step": 5036 }, { "epoch": 1.44, "grad_norm": 8.101284980773926, "learning_rate": 8.388157894736843e-06, "loss": 0.603, "step": 5037 }, { "epoch": 1.44, "grad_norm": 9.7366361618042, "learning_rate": 8.383867276887872e-06, "loss": 0.677, "step": 5038 }, { "epoch": 1.44, "grad_norm": 7.6654229164123535, "learning_rate": 8.379576659038902e-06, "loss": 0.68, "step": 5039 }, { "epoch": 1.44, "grad_norm": 12.373343467712402, "learning_rate": 8.37528604118993e-06, "loss": 0.9479, "step": 5040 }, { "epoch": 1.44, "grad_norm": 10.893401145935059, "learning_rate": 8.370995423340961e-06, "loss": 0.8434, "step": 5041 }, { "epoch": 1.44, "grad_norm": 10.966062545776367, "learning_rate": 8.36670480549199e-06, "loss": 0.5505, "step": 5042 }, { "epoch": 1.44, "grad_norm": 11.359344482421875, "learning_rate": 8.362414187643022e-06, "loss": 0.6423, "step": 5043 }, { "epoch": 1.44, "grad_norm": 10.55596923828125, "learning_rate": 8.358123569794051e-06, "loss": 0.7672, "step": 5044 }, { "epoch": 1.44, "grad_norm": 11.82313346862793, "learning_rate": 8.353832951945081e-06, "loss": 0.7701, "step": 5045 }, { "epoch": 1.44, "grad_norm": 9.508740425109863, "learning_rate": 8.34954233409611e-06, "loss": 0.6773, "step": 5046 }, { "epoch": 1.44, "grad_norm": 11.016886711120605, "learning_rate": 8.345251716247139e-06, "loss": 0.8646, "step": 5047 }, { "epoch": 1.44, "grad_norm": 10.397418022155762, "learning_rate": 8.34096109839817e-06, "loss": 0.684, "step": 5048 }, { "epoch": 1.44, "grad_norm": 10.650434494018555, "learning_rate": 8.336670480549198e-06, "loss": 0.8008, "step": 5049 }, { "epoch": 1.44, "grad_norm": 9.770722389221191, "learning_rate": 8.332379862700229e-06, "loss": 0.7797, "step": 5050 }, { "epoch": 1.44, "grad_norm": 9.314210891723633, "learning_rate": 8.328089244851257e-06, "loss": 0.7631, "step": 5051 }, { "epoch": 1.45, "grad_norm": 11.673943519592285, "learning_rate": 8.32379862700229e-06, "loss": 0.8589, "step": 5052 }, { "epoch": 1.45, "grad_norm": 12.262770652770996, "learning_rate": 8.319508009153318e-06, "loss": 0.8789, "step": 5053 }, { "epoch": 1.45, "grad_norm": 10.737245559692383, "learning_rate": 8.315217391304349e-06, "loss": 0.8933, "step": 5054 }, { "epoch": 1.45, "grad_norm": 12.311223030090332, "learning_rate": 8.310926773455378e-06, "loss": 0.5966, "step": 5055 }, { "epoch": 1.45, "grad_norm": 10.886199951171875, "learning_rate": 8.306636155606408e-06, "loss": 0.7343, "step": 5056 }, { "epoch": 1.45, "grad_norm": 12.238615036010742, "learning_rate": 8.302345537757437e-06, "loss": 0.8853, "step": 5057 }, { "epoch": 1.45, "grad_norm": 9.038883209228516, "learning_rate": 8.298054919908467e-06, "loss": 0.7911, "step": 5058 }, { "epoch": 1.45, "grad_norm": 7.761969089508057, "learning_rate": 8.293764302059496e-06, "loss": 0.6182, "step": 5059 }, { "epoch": 1.45, "grad_norm": 9.099241256713867, "learning_rate": 8.289473684210526e-06, "loss": 0.4603, "step": 5060 }, { "epoch": 1.45, "grad_norm": 10.834635734558105, "learning_rate": 8.285183066361557e-06, "loss": 0.7656, "step": 5061 }, { "epoch": 1.45, "grad_norm": 13.24924373626709, "learning_rate": 8.280892448512586e-06, "loss": 0.786, "step": 5062 }, { "epoch": 1.45, "grad_norm": 13.369250297546387, "learning_rate": 8.276601830663616e-06, "loss": 1.0903, "step": 5063 }, { "epoch": 1.45, "grad_norm": 13.29235553741455, "learning_rate": 8.272311212814645e-06, "loss": 0.793, "step": 5064 }, { "epoch": 1.45, "grad_norm": 9.594003677368164, "learning_rate": 8.268020594965675e-06, "loss": 0.5489, "step": 5065 }, { "epoch": 1.45, "grad_norm": 10.180651664733887, "learning_rate": 8.263729977116704e-06, "loss": 0.5971, "step": 5066 }, { "epoch": 1.45, "grad_norm": 10.12555980682373, "learning_rate": 8.259439359267735e-06, "loss": 0.6838, "step": 5067 }, { "epoch": 1.45, "grad_norm": 10.411796569824219, "learning_rate": 8.255148741418763e-06, "loss": 0.726, "step": 5068 }, { "epoch": 1.45, "grad_norm": 11.773266792297363, "learning_rate": 8.250858123569796e-06, "loss": 1.0189, "step": 5069 }, { "epoch": 1.45, "grad_norm": 9.530159950256348, "learning_rate": 8.246567505720824e-06, "loss": 0.6832, "step": 5070 }, { "epoch": 1.45, "grad_norm": 9.325148582458496, "learning_rate": 8.242276887871855e-06, "loss": 0.7125, "step": 5071 }, { "epoch": 1.45, "grad_norm": 10.458324432373047, "learning_rate": 8.237986270022884e-06, "loss": 0.6311, "step": 5072 }, { "epoch": 1.45, "grad_norm": 8.303791999816895, "learning_rate": 8.233695652173914e-06, "loss": 0.6966, "step": 5073 }, { "epoch": 1.45, "grad_norm": 12.064472198486328, "learning_rate": 8.229405034324943e-06, "loss": 0.7607, "step": 5074 }, { "epoch": 1.45, "grad_norm": 8.434898376464844, "learning_rate": 8.225114416475972e-06, "loss": 0.6423, "step": 5075 }, { "epoch": 1.45, "grad_norm": 10.684408187866211, "learning_rate": 8.220823798627002e-06, "loss": 0.7354, "step": 5076 }, { "epoch": 1.45, "grad_norm": 9.874177932739258, "learning_rate": 8.216533180778032e-06, "loss": 0.6514, "step": 5077 }, { "epoch": 1.45, "grad_norm": 12.246417045593262, "learning_rate": 8.212242562929063e-06, "loss": 0.8711, "step": 5078 }, { "epoch": 1.45, "grad_norm": 12.1536283493042, "learning_rate": 8.207951945080092e-06, "loss": 0.9474, "step": 5079 }, { "epoch": 1.45, "grad_norm": 6.930532932281494, "learning_rate": 8.203661327231122e-06, "loss": 0.3914, "step": 5080 }, { "epoch": 1.45, "grad_norm": 10.783819198608398, "learning_rate": 8.199370709382151e-06, "loss": 0.8458, "step": 5081 }, { "epoch": 1.45, "grad_norm": 8.25786304473877, "learning_rate": 8.195080091533181e-06, "loss": 0.5712, "step": 5082 }, { "epoch": 1.45, "grad_norm": 9.214256286621094, "learning_rate": 8.19078947368421e-06, "loss": 0.7327, "step": 5083 }, { "epoch": 1.45, "grad_norm": 10.542569160461426, "learning_rate": 8.18649885583524e-06, "loss": 0.7113, "step": 5084 }, { "epoch": 1.45, "grad_norm": 9.573670387268066, "learning_rate": 8.18220823798627e-06, "loss": 0.6155, "step": 5085 }, { "epoch": 1.45, "grad_norm": 9.836844444274902, "learning_rate": 8.177917620137302e-06, "loss": 0.6262, "step": 5086 }, { "epoch": 1.46, "grad_norm": 11.307541847229004, "learning_rate": 8.17362700228833e-06, "loss": 0.8254, "step": 5087 }, { "epoch": 1.46, "grad_norm": 6.5441107749938965, "learning_rate": 8.169336384439359e-06, "loss": 0.3907, "step": 5088 }, { "epoch": 1.46, "grad_norm": 9.281217575073242, "learning_rate": 8.16504576659039e-06, "loss": 0.5852, "step": 5089 }, { "epoch": 1.46, "grad_norm": 12.437320709228516, "learning_rate": 8.160755148741418e-06, "loss": 0.7752, "step": 5090 }, { "epoch": 1.46, "grad_norm": 9.643728256225586, "learning_rate": 8.156464530892449e-06, "loss": 0.5487, "step": 5091 }, { "epoch": 1.46, "grad_norm": 10.50843620300293, "learning_rate": 8.152173913043478e-06, "loss": 0.5361, "step": 5092 }, { "epoch": 1.46, "grad_norm": 9.972748756408691, "learning_rate": 8.147883295194508e-06, "loss": 0.6879, "step": 5093 }, { "epoch": 1.46, "grad_norm": 10.782821655273438, "learning_rate": 8.143592677345537e-06, "loss": 0.6245, "step": 5094 }, { "epoch": 1.46, "grad_norm": 7.436230182647705, "learning_rate": 8.139302059496569e-06, "loss": 0.4944, "step": 5095 }, { "epoch": 1.46, "grad_norm": 10.277985572814941, "learning_rate": 8.135011441647598e-06, "loss": 0.7533, "step": 5096 }, { "epoch": 1.46, "grad_norm": 9.366806030273438, "learning_rate": 8.130720823798628e-06, "loss": 0.666, "step": 5097 }, { "epoch": 1.46, "grad_norm": 13.114858627319336, "learning_rate": 8.126430205949657e-06, "loss": 0.7336, "step": 5098 }, { "epoch": 1.46, "grad_norm": 12.340123176574707, "learning_rate": 8.122139588100687e-06, "loss": 0.7865, "step": 5099 }, { "epoch": 1.46, "grad_norm": 10.428196907043457, "learning_rate": 8.117848970251716e-06, "loss": 0.59, "step": 5100 }, { "epoch": 1.46, "grad_norm": 16.39594841003418, "learning_rate": 8.113558352402745e-06, "loss": 0.8344, "step": 5101 }, { "epoch": 1.46, "grad_norm": 9.768933296203613, "learning_rate": 8.109267734553775e-06, "loss": 0.7913, "step": 5102 }, { "epoch": 1.46, "grad_norm": 11.725458145141602, "learning_rate": 8.104977116704806e-06, "loss": 0.585, "step": 5103 }, { "epoch": 1.46, "grad_norm": 12.467772483825684, "learning_rate": 8.100686498855836e-06, "loss": 0.8799, "step": 5104 }, { "epoch": 1.46, "grad_norm": 10.282450675964355, "learning_rate": 8.096395881006865e-06, "loss": 0.7503, "step": 5105 }, { "epoch": 1.46, "grad_norm": 12.790483474731445, "learning_rate": 8.092105263157896e-06, "loss": 1.0285, "step": 5106 }, { "epoch": 1.46, "grad_norm": 9.784635543823242, "learning_rate": 8.087814645308924e-06, "loss": 0.5898, "step": 5107 }, { "epoch": 1.46, "grad_norm": 10.346814155578613, "learning_rate": 8.083524027459955e-06, "loss": 0.7253, "step": 5108 }, { "epoch": 1.46, "grad_norm": 12.714737892150879, "learning_rate": 8.079233409610984e-06, "loss": 0.8393, "step": 5109 }, { "epoch": 1.46, "grad_norm": 11.371176719665527, "learning_rate": 8.074942791762014e-06, "loss": 0.8245, "step": 5110 }, { "epoch": 1.46, "grad_norm": 10.148053169250488, "learning_rate": 8.070652173913043e-06, "loss": 0.6768, "step": 5111 }, { "epoch": 1.46, "grad_norm": 11.980246543884277, "learning_rate": 8.066361556064075e-06, "loss": 0.8608, "step": 5112 }, { "epoch": 1.46, "grad_norm": 9.12360954284668, "learning_rate": 8.062070938215104e-06, "loss": 0.7646, "step": 5113 }, { "epoch": 1.46, "grad_norm": 11.374464988708496, "learning_rate": 8.057780320366132e-06, "loss": 0.8281, "step": 5114 }, { "epoch": 1.46, "grad_norm": 12.122367858886719, "learning_rate": 8.053489702517163e-06, "loss": 0.737, "step": 5115 }, { "epoch": 1.46, "grad_norm": 9.022582054138184, "learning_rate": 8.049199084668192e-06, "loss": 0.7077, "step": 5116 }, { "epoch": 1.46, "grad_norm": 7.594146251678467, "learning_rate": 8.044908466819222e-06, "loss": 0.5956, "step": 5117 }, { "epoch": 1.46, "grad_norm": 10.771967887878418, "learning_rate": 8.040617848970251e-06, "loss": 0.8169, "step": 5118 }, { "epoch": 1.46, "grad_norm": 12.040303230285645, "learning_rate": 8.036327231121281e-06, "loss": 0.7885, "step": 5119 }, { "epoch": 1.46, "grad_norm": 11.240238189697266, "learning_rate": 8.032036613272312e-06, "loss": 0.8766, "step": 5120 }, { "epoch": 1.46, "grad_norm": 9.21816635131836, "learning_rate": 8.027745995423342e-06, "loss": 0.6309, "step": 5121 }, { "epoch": 1.47, "grad_norm": 9.237491607666016, "learning_rate": 8.023455377574371e-06, "loss": 0.6262, "step": 5122 }, { "epoch": 1.47, "grad_norm": 12.814543724060059, "learning_rate": 8.019164759725402e-06, "loss": 0.862, "step": 5123 }, { "epoch": 1.47, "grad_norm": 10.67367172241211, "learning_rate": 8.01487414187643e-06, "loss": 1.0106, "step": 5124 }, { "epoch": 1.47, "grad_norm": 9.877201080322266, "learning_rate": 8.01058352402746e-06, "loss": 0.7026, "step": 5125 }, { "epoch": 1.47, "grad_norm": 11.837648391723633, "learning_rate": 8.00629290617849e-06, "loss": 0.9552, "step": 5126 }, { "epoch": 1.47, "grad_norm": 12.015484809875488, "learning_rate": 8.002002288329518e-06, "loss": 0.6225, "step": 5127 }, { "epoch": 1.47, "grad_norm": 8.69874095916748, "learning_rate": 7.997711670480549e-06, "loss": 0.6455, "step": 5128 }, { "epoch": 1.47, "grad_norm": 10.47046184539795, "learning_rate": 7.99342105263158e-06, "loss": 0.7297, "step": 5129 }, { "epoch": 1.47, "grad_norm": 10.689979553222656, "learning_rate": 7.98913043478261e-06, "loss": 0.8392, "step": 5130 }, { "epoch": 1.47, "grad_norm": 9.730806350708008, "learning_rate": 7.984839816933638e-06, "loss": 0.4623, "step": 5131 }, { "epoch": 1.47, "grad_norm": 12.895252227783203, "learning_rate": 7.980549199084669e-06, "loss": 0.9518, "step": 5132 }, { "epoch": 1.47, "grad_norm": 9.70818042755127, "learning_rate": 7.976258581235698e-06, "loss": 0.8398, "step": 5133 }, { "epoch": 1.47, "grad_norm": 10.01799488067627, "learning_rate": 7.971967963386728e-06, "loss": 0.8637, "step": 5134 }, { "epoch": 1.47, "grad_norm": 10.447866439819336, "learning_rate": 7.967677345537757e-06, "loss": 0.6831, "step": 5135 }, { "epoch": 1.47, "grad_norm": 11.744251251220703, "learning_rate": 7.963386727688787e-06, "loss": 0.9694, "step": 5136 }, { "epoch": 1.47, "grad_norm": 11.017406463623047, "learning_rate": 7.959096109839818e-06, "loss": 0.5433, "step": 5137 }, { "epoch": 1.47, "grad_norm": 8.464219093322754, "learning_rate": 7.954805491990848e-06, "loss": 0.5909, "step": 5138 }, { "epoch": 1.47, "grad_norm": 11.348611831665039, "learning_rate": 7.950514874141877e-06, "loss": 0.5935, "step": 5139 }, { "epoch": 1.47, "grad_norm": 10.337952613830566, "learning_rate": 7.946224256292906e-06, "loss": 0.5608, "step": 5140 }, { "epoch": 1.47, "grad_norm": 14.805388450622559, "learning_rate": 7.941933638443936e-06, "loss": 0.7561, "step": 5141 }, { "epoch": 1.47, "grad_norm": 9.155672073364258, "learning_rate": 7.937643020594965e-06, "loss": 0.5781, "step": 5142 }, { "epoch": 1.47, "grad_norm": 9.741296768188477, "learning_rate": 7.933352402745996e-06, "loss": 0.6299, "step": 5143 }, { "epoch": 1.47, "grad_norm": 10.216283798217773, "learning_rate": 7.929061784897024e-06, "loss": 0.8591, "step": 5144 }, { "epoch": 1.47, "grad_norm": 9.9946928024292, "learning_rate": 7.924771167048055e-06, "loss": 0.6312, "step": 5145 }, { "epoch": 1.47, "grad_norm": 9.108634948730469, "learning_rate": 7.920480549199085e-06, "loss": 0.602, "step": 5146 }, { "epoch": 1.47, "grad_norm": 12.92469596862793, "learning_rate": 7.916189931350116e-06, "loss": 0.6813, "step": 5147 }, { "epoch": 1.47, "grad_norm": 9.77869701385498, "learning_rate": 7.911899313501144e-06, "loss": 0.7651, "step": 5148 }, { "epoch": 1.47, "grad_norm": 11.956109046936035, "learning_rate": 7.907608695652175e-06, "loss": 0.7607, "step": 5149 }, { "epoch": 1.47, "grad_norm": 10.98892879486084, "learning_rate": 7.903318077803204e-06, "loss": 0.5443, "step": 5150 }, { "epoch": 1.47, "grad_norm": 10.866950988769531, "learning_rate": 7.899027459954234e-06, "loss": 0.6851, "step": 5151 }, { "epoch": 1.47, "grad_norm": 13.141809463500977, "learning_rate": 7.894736842105263e-06, "loss": 0.8302, "step": 5152 }, { "epoch": 1.47, "grad_norm": 10.317466735839844, "learning_rate": 7.890446224256292e-06, "loss": 0.7601, "step": 5153 }, { "epoch": 1.47, "grad_norm": 10.443330764770508, "learning_rate": 7.886155606407322e-06, "loss": 0.8592, "step": 5154 }, { "epoch": 1.47, "grad_norm": 10.033818244934082, "learning_rate": 7.881864988558353e-06, "loss": 0.891, "step": 5155 }, { "epoch": 1.47, "grad_norm": 12.123933792114258, "learning_rate": 7.877574370709383e-06, "loss": 1.1241, "step": 5156 }, { "epoch": 1.48, "grad_norm": 10.95120620727539, "learning_rate": 7.873283752860412e-06, "loss": 0.5987, "step": 5157 }, { "epoch": 1.48, "grad_norm": 9.693603515625, "learning_rate": 7.868993135011442e-06, "loss": 0.4539, "step": 5158 }, { "epoch": 1.48, "grad_norm": 9.845906257629395, "learning_rate": 7.864702517162471e-06, "loss": 0.7684, "step": 5159 }, { "epoch": 1.48, "grad_norm": 9.617039680480957, "learning_rate": 7.860411899313502e-06, "loss": 0.9473, "step": 5160 }, { "epoch": 1.48, "grad_norm": 11.552712440490723, "learning_rate": 7.85612128146453e-06, "loss": 0.8603, "step": 5161 }, { "epoch": 1.48, "grad_norm": 9.689935684204102, "learning_rate": 7.85183066361556e-06, "loss": 0.5713, "step": 5162 }, { "epoch": 1.48, "grad_norm": 10.119417190551758, "learning_rate": 7.847540045766591e-06, "loss": 0.5921, "step": 5163 }, { "epoch": 1.48, "grad_norm": 9.15572738647461, "learning_rate": 7.843249427917622e-06, "loss": 0.5496, "step": 5164 }, { "epoch": 1.48, "grad_norm": 15.126708984375, "learning_rate": 7.83895881006865e-06, "loss": 1.1661, "step": 5165 }, { "epoch": 1.48, "grad_norm": 12.176010131835938, "learning_rate": 7.83466819221968e-06, "loss": 0.8296, "step": 5166 }, { "epoch": 1.48, "grad_norm": 8.255621910095215, "learning_rate": 7.83037757437071e-06, "loss": 0.5731, "step": 5167 }, { "epoch": 1.48, "grad_norm": 11.428735733032227, "learning_rate": 7.826086956521738e-06, "loss": 0.7782, "step": 5168 }, { "epoch": 1.48, "grad_norm": 8.800541877746582, "learning_rate": 7.821796338672769e-06, "loss": 0.6463, "step": 5169 }, { "epoch": 1.48, "grad_norm": 12.368282318115234, "learning_rate": 7.817505720823798e-06, "loss": 0.6011, "step": 5170 }, { "epoch": 1.48, "grad_norm": 9.917195320129395, "learning_rate": 7.813215102974828e-06, "loss": 0.7234, "step": 5171 }, { "epoch": 1.48, "grad_norm": 9.662066459655762, "learning_rate": 7.808924485125859e-06, "loss": 0.7493, "step": 5172 }, { "epoch": 1.48, "grad_norm": 10.145095825195312, "learning_rate": 7.804633867276889e-06, "loss": 0.8596, "step": 5173 }, { "epoch": 1.48, "grad_norm": 11.008843421936035, "learning_rate": 7.800343249427918e-06, "loss": 0.6049, "step": 5174 }, { "epoch": 1.48, "grad_norm": 11.558208465576172, "learning_rate": 7.796052631578948e-06, "loss": 0.7109, "step": 5175 }, { "epoch": 1.48, "grad_norm": 10.150371551513672, "learning_rate": 7.791762013729977e-06, "loss": 0.8938, "step": 5176 }, { "epoch": 1.48, "grad_norm": 10.215923309326172, "learning_rate": 7.787471395881008e-06, "loss": 0.5018, "step": 5177 }, { "epoch": 1.48, "grad_norm": 8.31438159942627, "learning_rate": 7.783180778032036e-06, "loss": 0.4783, "step": 5178 }, { "epoch": 1.48, "grad_norm": 10.255351066589355, "learning_rate": 7.778890160183065e-06, "loss": 0.6412, "step": 5179 }, { "epoch": 1.48, "grad_norm": 10.33067512512207, "learning_rate": 7.774599542334097e-06, "loss": 0.7935, "step": 5180 }, { "epoch": 1.48, "grad_norm": 10.240641593933105, "learning_rate": 7.770308924485126e-06, "loss": 0.8504, "step": 5181 }, { "epoch": 1.48, "grad_norm": 10.01538372039795, "learning_rate": 7.766018306636156e-06, "loss": 0.771, "step": 5182 }, { "epoch": 1.48, "grad_norm": 8.917338371276855, "learning_rate": 7.761727688787185e-06, "loss": 0.7849, "step": 5183 }, { "epoch": 1.48, "grad_norm": 10.52344036102295, "learning_rate": 7.757437070938216e-06, "loss": 0.9558, "step": 5184 }, { "epoch": 1.48, "grad_norm": 8.942107200622559, "learning_rate": 7.753146453089244e-06, "loss": 0.7475, "step": 5185 }, { "epoch": 1.48, "grad_norm": 9.920065879821777, "learning_rate": 7.748855835240275e-06, "loss": 0.5781, "step": 5186 }, { "epoch": 1.48, "grad_norm": 9.799785614013672, "learning_rate": 7.744565217391304e-06, "loss": 0.7346, "step": 5187 }, { "epoch": 1.48, "grad_norm": 11.251009941101074, "learning_rate": 7.740274599542334e-06, "loss": 0.682, "step": 5188 }, { "epoch": 1.48, "grad_norm": 11.464249610900879, "learning_rate": 7.735983981693365e-06, "loss": 0.649, "step": 5189 }, { "epoch": 1.48, "grad_norm": 9.01133918762207, "learning_rate": 7.731693363844395e-06, "loss": 0.4474, "step": 5190 }, { "epoch": 1.48, "grad_norm": 9.98456859588623, "learning_rate": 7.727402745995424e-06, "loss": 0.5277, "step": 5191 }, { "epoch": 1.49, "grad_norm": 13.69474983215332, "learning_rate": 7.723112128146453e-06, "loss": 0.7136, "step": 5192 }, { "epoch": 1.49, "grad_norm": 10.09440803527832, "learning_rate": 7.718821510297483e-06, "loss": 0.601, "step": 5193 }, { "epoch": 1.49, "grad_norm": 11.429454803466797, "learning_rate": 7.714530892448512e-06, "loss": 0.7693, "step": 5194 }, { "epoch": 1.49, "grad_norm": 12.033621788024902, "learning_rate": 7.710240274599542e-06, "loss": 0.905, "step": 5195 }, { "epoch": 1.49, "grad_norm": 8.883831024169922, "learning_rate": 7.705949656750571e-06, "loss": 0.7298, "step": 5196 }, { "epoch": 1.49, "grad_norm": 12.658049583435059, "learning_rate": 7.701659038901603e-06, "loss": 0.9474, "step": 5197 }, { "epoch": 1.49, "grad_norm": 10.020696640014648, "learning_rate": 7.697368421052632e-06, "loss": 0.7189, "step": 5198 }, { "epoch": 1.49, "grad_norm": 9.961935043334961, "learning_rate": 7.693077803203662e-06, "loss": 0.6584, "step": 5199 }, { "epoch": 1.49, "grad_norm": 8.704461097717285, "learning_rate": 7.688787185354691e-06, "loss": 0.5271, "step": 5200 }, { "epoch": 1.49, "grad_norm": 10.55123233795166, "learning_rate": 7.684496567505722e-06, "loss": 0.6096, "step": 5201 }, { "epoch": 1.49, "grad_norm": 13.534534454345703, "learning_rate": 7.68020594965675e-06, "loss": 0.7524, "step": 5202 }, { "epoch": 1.49, "grad_norm": 10.84149169921875, "learning_rate": 7.675915331807781e-06, "loss": 0.7198, "step": 5203 }, { "epoch": 1.49, "grad_norm": 12.128814697265625, "learning_rate": 7.67162471395881e-06, "loss": 0.7491, "step": 5204 }, { "epoch": 1.49, "grad_norm": 12.675615310668945, "learning_rate": 7.667334096109838e-06, "loss": 1.0454, "step": 5205 }, { "epoch": 1.49, "grad_norm": 10.708645820617676, "learning_rate": 7.66304347826087e-06, "loss": 0.7547, "step": 5206 }, { "epoch": 1.49, "grad_norm": 9.514225959777832, "learning_rate": 7.6587528604119e-06, "loss": 0.6761, "step": 5207 }, { "epoch": 1.49, "grad_norm": 10.71254825592041, "learning_rate": 7.65446224256293e-06, "loss": 0.7642, "step": 5208 }, { "epoch": 1.49, "grad_norm": 8.257457733154297, "learning_rate": 7.650171624713959e-06, "loss": 0.6241, "step": 5209 }, { "epoch": 1.49, "grad_norm": 11.02060604095459, "learning_rate": 7.645881006864989e-06, "loss": 0.7695, "step": 5210 }, { "epoch": 1.49, "grad_norm": 7.717381477355957, "learning_rate": 7.641590389016018e-06, "loss": 0.6437, "step": 5211 }, { "epoch": 1.49, "grad_norm": 10.31637954711914, "learning_rate": 7.637299771167048e-06, "loss": 0.9296, "step": 5212 }, { "epoch": 1.49, "grad_norm": 14.598742485046387, "learning_rate": 7.633009153318077e-06, "loss": 0.8934, "step": 5213 }, { "epoch": 1.49, "grad_norm": 10.441706657409668, "learning_rate": 7.628718535469107e-06, "loss": 0.8108, "step": 5214 }, { "epoch": 1.49, "grad_norm": 19.70823860168457, "learning_rate": 7.624427917620138e-06, "loss": 0.7806, "step": 5215 }, { "epoch": 1.49, "grad_norm": 10.273853302001953, "learning_rate": 7.620137299771168e-06, "loss": 0.6576, "step": 5216 }, { "epoch": 1.49, "grad_norm": 8.733698844909668, "learning_rate": 7.615846681922197e-06, "loss": 0.5898, "step": 5217 }, { "epoch": 1.49, "grad_norm": 12.016242027282715, "learning_rate": 7.611556064073227e-06, "loss": 0.6687, "step": 5218 }, { "epoch": 1.49, "grad_norm": 8.932428359985352, "learning_rate": 7.6072654462242564e-06, "loss": 0.8076, "step": 5219 }, { "epoch": 1.49, "grad_norm": 10.967382431030273, "learning_rate": 7.602974828375286e-06, "loss": 0.8116, "step": 5220 }, { "epoch": 1.49, "grad_norm": 10.108834266662598, "learning_rate": 7.598684210526316e-06, "loss": 0.5624, "step": 5221 }, { "epoch": 1.49, "grad_norm": 12.46695327758789, "learning_rate": 7.594393592677345e-06, "loss": 0.9826, "step": 5222 }, { "epoch": 1.49, "grad_norm": 12.058609962463379, "learning_rate": 7.590102974828377e-06, "loss": 0.6397, "step": 5223 }, { "epoch": 1.49, "grad_norm": 10.250239372253418, "learning_rate": 7.585812356979406e-06, "loss": 0.5533, "step": 5224 }, { "epoch": 1.49, "grad_norm": 10.57835578918457, "learning_rate": 7.581521739130435e-06, "loss": 0.5701, "step": 5225 }, { "epoch": 1.49, "grad_norm": 11.8040189743042, "learning_rate": 7.577231121281465e-06, "loss": 0.8717, "step": 5226 }, { "epoch": 1.5, "grad_norm": 9.846546173095703, "learning_rate": 7.572940503432494e-06, "loss": 0.7043, "step": 5227 }, { "epoch": 1.5, "grad_norm": 9.991266250610352, "learning_rate": 7.568649885583524e-06, "loss": 0.7032, "step": 5228 }, { "epoch": 1.5, "grad_norm": 10.166804313659668, "learning_rate": 7.5643592677345534e-06, "loss": 0.8109, "step": 5229 }, { "epoch": 1.5, "grad_norm": 9.002490997314453, "learning_rate": 7.560068649885583e-06, "loss": 0.4385, "step": 5230 }, { "epoch": 1.5, "grad_norm": 11.902188301086426, "learning_rate": 7.555778032036613e-06, "loss": 0.7558, "step": 5231 }, { "epoch": 1.5, "grad_norm": 12.483781814575195, "learning_rate": 7.551487414187644e-06, "loss": 0.6895, "step": 5232 }, { "epoch": 1.5, "grad_norm": 9.413661003112793, "learning_rate": 7.547196796338674e-06, "loss": 0.6075, "step": 5233 }, { "epoch": 1.5, "grad_norm": 9.486357688903809, "learning_rate": 7.542906178489703e-06, "loss": 0.5532, "step": 5234 }, { "epoch": 1.5, "grad_norm": 9.101693153381348, "learning_rate": 7.538615560640733e-06, "loss": 0.6248, "step": 5235 }, { "epoch": 1.5, "grad_norm": 11.137267112731934, "learning_rate": 7.5343249427917624e-06, "loss": 0.8225, "step": 5236 }, { "epoch": 1.5, "grad_norm": 11.395636558532715, "learning_rate": 7.530034324942792e-06, "loss": 0.6358, "step": 5237 }, { "epoch": 1.5, "grad_norm": 10.854764938354492, "learning_rate": 7.525743707093821e-06, "loss": 0.6775, "step": 5238 }, { "epoch": 1.5, "grad_norm": 14.450782775878906, "learning_rate": 7.52145308924485e-06, "loss": 0.865, "step": 5239 }, { "epoch": 1.5, "grad_norm": 10.17805290222168, "learning_rate": 7.517162471395882e-06, "loss": 0.5995, "step": 5240 }, { "epoch": 1.5, "grad_norm": 12.272109985351562, "learning_rate": 7.512871853546911e-06, "loss": 0.9173, "step": 5241 }, { "epoch": 1.5, "grad_norm": 11.942349433898926, "learning_rate": 7.508581235697941e-06, "loss": 0.8059, "step": 5242 }, { "epoch": 1.5, "grad_norm": 10.477256774902344, "learning_rate": 7.5042906178489706e-06, "loss": 0.6005, "step": 5243 }, { "epoch": 1.5, "grad_norm": 10.549176216125488, "learning_rate": 7.5e-06, "loss": 0.8263, "step": 5244 }, { "epoch": 1.5, "grad_norm": 9.667335510253906, "learning_rate": 7.49570938215103e-06, "loss": 0.6895, "step": 5245 }, { "epoch": 1.5, "grad_norm": 8.928720474243164, "learning_rate": 7.491418764302059e-06, "loss": 0.647, "step": 5246 }, { "epoch": 1.5, "grad_norm": 10.364776611328125, "learning_rate": 7.48712814645309e-06, "loss": 0.6653, "step": 5247 }, { "epoch": 1.5, "grad_norm": 13.162821769714355, "learning_rate": 7.4828375286041195e-06, "loss": 1.0826, "step": 5248 }, { "epoch": 1.5, "grad_norm": 8.7410249710083, "learning_rate": 7.478546910755149e-06, "loss": 0.5761, "step": 5249 }, { "epoch": 1.5, "grad_norm": 9.24797534942627, "learning_rate": 7.474256292906179e-06, "loss": 0.4566, "step": 5250 }, { "epoch": 1.5, "grad_norm": 9.684334754943848, "learning_rate": 7.469965675057208e-06, "loss": 0.6107, "step": 5251 }, { "epoch": 1.5, "grad_norm": 12.080406188964844, "learning_rate": 7.465675057208238e-06, "loss": 0.9, "step": 5252 }, { "epoch": 1.5, "grad_norm": 13.190532684326172, "learning_rate": 7.4613844393592676e-06, "loss": 0.9134, "step": 5253 }, { "epoch": 1.5, "grad_norm": 11.988898277282715, "learning_rate": 7.457093821510297e-06, "loss": 0.8086, "step": 5254 }, { "epoch": 1.5, "grad_norm": 12.422467231750488, "learning_rate": 7.452803203661328e-06, "loss": 0.809, "step": 5255 }, { "epoch": 1.5, "grad_norm": 13.21649169921875, "learning_rate": 7.448512585812357e-06, "loss": 0.9793, "step": 5256 }, { "epoch": 1.5, "grad_norm": 11.757281303405762, "learning_rate": 7.444221967963387e-06, "loss": 0.8449, "step": 5257 }, { "epoch": 1.5, "grad_norm": 10.680924415588379, "learning_rate": 7.4399313501144165e-06, "loss": 0.7947, "step": 5258 }, { "epoch": 1.5, "grad_norm": 10.09373664855957, "learning_rate": 7.435640732265447e-06, "loss": 0.7494, "step": 5259 }, { "epoch": 1.5, "grad_norm": 11.1739501953125, "learning_rate": 7.4313501144164766e-06, "loss": 0.682, "step": 5260 }, { "epoch": 1.5, "grad_norm": 10.319697380065918, "learning_rate": 7.427059496567506e-06, "loss": 0.8318, "step": 5261 }, { "epoch": 1.51, "grad_norm": 9.602401733398438, "learning_rate": 7.422768878718536e-06, "loss": 0.5827, "step": 5262 }, { "epoch": 1.51, "grad_norm": 8.633907318115234, "learning_rate": 7.418478260869565e-06, "loss": 0.5389, "step": 5263 }, { "epoch": 1.51, "grad_norm": 8.840450286865234, "learning_rate": 7.414187643020595e-06, "loss": 0.7338, "step": 5264 }, { "epoch": 1.51, "grad_norm": 9.036559104919434, "learning_rate": 7.409897025171625e-06, "loss": 0.6257, "step": 5265 }, { "epoch": 1.51, "grad_norm": 12.07451057434082, "learning_rate": 7.405606407322654e-06, "loss": 0.6456, "step": 5266 }, { "epoch": 1.51, "grad_norm": 9.394609451293945, "learning_rate": 7.401315789473684e-06, "loss": 0.7133, "step": 5267 }, { "epoch": 1.51, "grad_norm": 9.017657279968262, "learning_rate": 7.397025171624714e-06, "loss": 0.532, "step": 5268 }, { "epoch": 1.51, "grad_norm": 11.069826126098633, "learning_rate": 7.392734553775744e-06, "loss": 0.4487, "step": 5269 }, { "epoch": 1.51, "grad_norm": 11.50070571899414, "learning_rate": 7.3884439359267736e-06, "loss": 0.7551, "step": 5270 }, { "epoch": 1.51, "grad_norm": 8.076889038085938, "learning_rate": 7.384153318077803e-06, "loss": 0.52, "step": 5271 }, { "epoch": 1.51, "grad_norm": 9.858548164367676, "learning_rate": 7.379862700228834e-06, "loss": 0.7357, "step": 5272 }, { "epoch": 1.51, "grad_norm": 11.235546112060547, "learning_rate": 7.375572082379863e-06, "loss": 0.6642, "step": 5273 }, { "epoch": 1.51, "grad_norm": 12.630517959594727, "learning_rate": 7.371281464530893e-06, "loss": 0.8561, "step": 5274 }, { "epoch": 1.51, "grad_norm": 11.6703462600708, "learning_rate": 7.3669908466819225e-06, "loss": 0.9949, "step": 5275 }, { "epoch": 1.51, "grad_norm": 13.284300804138184, "learning_rate": 7.362700228832952e-06, "loss": 0.6989, "step": 5276 }, { "epoch": 1.51, "grad_norm": 10.491209030151367, "learning_rate": 7.358409610983982e-06, "loss": 0.7176, "step": 5277 }, { "epoch": 1.51, "grad_norm": 8.57515811920166, "learning_rate": 7.354118993135011e-06, "loss": 0.5642, "step": 5278 }, { "epoch": 1.51, "grad_norm": 11.295157432556152, "learning_rate": 7.349828375286041e-06, "loss": 0.5904, "step": 5279 }, { "epoch": 1.51, "grad_norm": 10.756020545959473, "learning_rate": 7.3455377574370706e-06, "loss": 0.5272, "step": 5280 }, { "epoch": 1.51, "grad_norm": 14.983441352844238, "learning_rate": 7.341247139588101e-06, "loss": 0.6953, "step": 5281 }, { "epoch": 1.51, "grad_norm": 10.309775352478027, "learning_rate": 7.336956521739131e-06, "loss": 0.6887, "step": 5282 }, { "epoch": 1.51, "grad_norm": 12.075322151184082, "learning_rate": 7.33266590389016e-06, "loss": 0.9896, "step": 5283 }, { "epoch": 1.51, "grad_norm": 11.666068077087402, "learning_rate": 7.32837528604119e-06, "loss": 0.7826, "step": 5284 }, { "epoch": 1.51, "grad_norm": 13.928191184997559, "learning_rate": 7.32408466819222e-06, "loss": 0.883, "step": 5285 }, { "epoch": 1.51, "grad_norm": 11.600682258605957, "learning_rate": 7.31979405034325e-06, "loss": 0.666, "step": 5286 }, { "epoch": 1.51, "grad_norm": 10.639303207397461, "learning_rate": 7.3155034324942796e-06, "loss": 0.7518, "step": 5287 }, { "epoch": 1.51, "grad_norm": 13.749682426452637, "learning_rate": 7.311212814645309e-06, "loss": 0.9626, "step": 5288 }, { "epoch": 1.51, "grad_norm": 8.073677062988281, "learning_rate": 7.30692219679634e-06, "loss": 0.59, "step": 5289 }, { "epoch": 1.51, "grad_norm": 10.90993881225586, "learning_rate": 7.302631578947368e-06, "loss": 0.6807, "step": 5290 }, { "epoch": 1.51, "grad_norm": 10.742498397827148, "learning_rate": 7.298340961098398e-06, "loss": 0.5879, "step": 5291 }, { "epoch": 1.51, "grad_norm": 10.239691734313965, "learning_rate": 7.294050343249428e-06, "loss": 0.7894, "step": 5292 }, { "epoch": 1.51, "grad_norm": 10.508283615112305, "learning_rate": 7.289759725400457e-06, "loss": 0.7049, "step": 5293 }, { "epoch": 1.51, "grad_norm": 10.425931930541992, "learning_rate": 7.285469107551488e-06, "loss": 0.7857, "step": 5294 }, { "epoch": 1.51, "grad_norm": 12.55759334564209, "learning_rate": 7.281178489702517e-06, "loss": 0.8908, "step": 5295 }, { "epoch": 1.51, "grad_norm": 9.504537582397461, "learning_rate": 7.276887871853547e-06, "loss": 0.6734, "step": 5296 }, { "epoch": 1.52, "grad_norm": 9.230053901672363, "learning_rate": 7.2725972540045766e-06, "loss": 0.7552, "step": 5297 }, { "epoch": 1.52, "grad_norm": 9.277558326721191, "learning_rate": 7.268306636155607e-06, "loss": 0.5683, "step": 5298 }, { "epoch": 1.52, "grad_norm": 9.663713455200195, "learning_rate": 7.264016018306637e-06, "loss": 0.8314, "step": 5299 }, { "epoch": 1.52, "grad_norm": 11.926158905029297, "learning_rate": 7.259725400457666e-06, "loss": 0.9163, "step": 5300 }, { "epoch": 1.52, "grad_norm": 10.892145156860352, "learning_rate": 7.255434782608696e-06, "loss": 0.5672, "step": 5301 }, { "epoch": 1.52, "grad_norm": 12.315850257873535, "learning_rate": 7.251144164759726e-06, "loss": 0.8399, "step": 5302 }, { "epoch": 1.52, "grad_norm": 11.595917701721191, "learning_rate": 7.246853546910756e-06, "loss": 0.8012, "step": 5303 }, { "epoch": 1.52, "grad_norm": 10.428421974182129, "learning_rate": 7.242562929061785e-06, "loss": 0.7003, "step": 5304 }, { "epoch": 1.52, "grad_norm": 10.854645729064941, "learning_rate": 7.238272311212814e-06, "loss": 0.8191, "step": 5305 }, { "epoch": 1.52, "grad_norm": 12.198843955993652, "learning_rate": 7.233981693363844e-06, "loss": 0.8671, "step": 5306 }, { "epoch": 1.52, "grad_norm": 9.188078880310059, "learning_rate": 7.229691075514874e-06, "loss": 0.6152, "step": 5307 }, { "epoch": 1.52, "grad_norm": 9.002267837524414, "learning_rate": 7.225400457665904e-06, "loss": 0.7284, "step": 5308 }, { "epoch": 1.52, "grad_norm": 9.587257385253906, "learning_rate": 7.221109839816934e-06, "loss": 0.8177, "step": 5309 }, { "epoch": 1.52, "grad_norm": 11.889808654785156, "learning_rate": 7.216819221967963e-06, "loss": 0.7482, "step": 5310 }, { "epoch": 1.52, "grad_norm": 11.84115982055664, "learning_rate": 7.212528604118994e-06, "loss": 0.7478, "step": 5311 }, { "epoch": 1.52, "grad_norm": 11.276606559753418, "learning_rate": 7.208237986270023e-06, "loss": 0.7999, "step": 5312 }, { "epoch": 1.52, "grad_norm": 9.208023071289062, "learning_rate": 7.203947368421053e-06, "loss": 0.6271, "step": 5313 }, { "epoch": 1.52, "grad_norm": 8.481173515319824, "learning_rate": 7.1996567505720826e-06, "loss": 0.5259, "step": 5314 }, { "epoch": 1.52, "grad_norm": 10.612635612487793, "learning_rate": 7.195366132723113e-06, "loss": 0.8156, "step": 5315 }, { "epoch": 1.52, "grad_norm": 9.746124267578125, "learning_rate": 7.191075514874143e-06, "loss": 0.7464, "step": 5316 }, { "epoch": 1.52, "grad_norm": 11.440072059631348, "learning_rate": 7.186784897025171e-06, "loss": 0.5033, "step": 5317 }, { "epoch": 1.52, "grad_norm": 10.415870666503906, "learning_rate": 7.182494279176201e-06, "loss": 0.6788, "step": 5318 }, { "epoch": 1.52, "grad_norm": 9.202872276306152, "learning_rate": 7.1782036613272315e-06, "loss": 0.7637, "step": 5319 }, { "epoch": 1.52, "grad_norm": 10.398765563964844, "learning_rate": 7.173913043478261e-06, "loss": 0.8702, "step": 5320 }, { "epoch": 1.52, "grad_norm": 11.210211753845215, "learning_rate": 7.169622425629291e-06, "loss": 1.0157, "step": 5321 }, { "epoch": 1.52, "grad_norm": 8.577627182006836, "learning_rate": 7.16533180778032e-06, "loss": 0.5621, "step": 5322 }, { "epoch": 1.52, "grad_norm": 9.694450378417969, "learning_rate": 7.16104118993135e-06, "loss": 0.8201, "step": 5323 }, { "epoch": 1.52, "grad_norm": 9.710101127624512, "learning_rate": 7.15675057208238e-06, "loss": 0.591, "step": 5324 }, { "epoch": 1.52, "grad_norm": 9.863205909729004, "learning_rate": 7.15245995423341e-06, "loss": 0.6081, "step": 5325 }, { "epoch": 1.52, "grad_norm": 10.321282386779785, "learning_rate": 7.14816933638444e-06, "loss": 0.7331, "step": 5326 }, { "epoch": 1.52, "grad_norm": 7.0798258781433105, "learning_rate": 7.143878718535469e-06, "loss": 0.4342, "step": 5327 }, { "epoch": 1.52, "grad_norm": 10.438071250915527, "learning_rate": 7.1395881006865e-06, "loss": 0.8411, "step": 5328 }, { "epoch": 1.52, "grad_norm": 8.400115013122559, "learning_rate": 7.135297482837529e-06, "loss": 0.6325, "step": 5329 }, { "epoch": 1.52, "grad_norm": 11.314515113830566, "learning_rate": 7.131006864988558e-06, "loss": 0.516, "step": 5330 }, { "epoch": 1.52, "grad_norm": 10.420086860656738, "learning_rate": 7.126716247139588e-06, "loss": 0.8869, "step": 5331 }, { "epoch": 1.53, "grad_norm": 10.788599014282227, "learning_rate": 7.122425629290618e-06, "loss": 0.6434, "step": 5332 }, { "epoch": 1.53, "grad_norm": 10.751967430114746, "learning_rate": 7.118135011441648e-06, "loss": 0.8512, "step": 5333 }, { "epoch": 1.53, "grad_norm": 9.534699440002441, "learning_rate": 7.113844393592677e-06, "loss": 0.718, "step": 5334 }, { "epoch": 1.53, "grad_norm": 9.03892707824707, "learning_rate": 7.109553775743707e-06, "loss": 0.6529, "step": 5335 }, { "epoch": 1.53, "grad_norm": 10.13648796081543, "learning_rate": 7.105263157894737e-06, "loss": 0.6326, "step": 5336 }, { "epoch": 1.53, "grad_norm": 9.899974822998047, "learning_rate": 7.100972540045767e-06, "loss": 0.5475, "step": 5337 }, { "epoch": 1.53, "grad_norm": 11.557777404785156, "learning_rate": 7.096681922196797e-06, "loss": 0.8745, "step": 5338 }, { "epoch": 1.53, "grad_norm": 11.886506080627441, "learning_rate": 7.092391304347826e-06, "loss": 0.9558, "step": 5339 }, { "epoch": 1.53, "grad_norm": 7.03980016708374, "learning_rate": 7.088100686498856e-06, "loss": 0.4913, "step": 5340 }, { "epoch": 1.53, "grad_norm": 10.639065742492676, "learning_rate": 7.083810068649886e-06, "loss": 0.5013, "step": 5341 }, { "epoch": 1.53, "grad_norm": 16.474300384521484, "learning_rate": 7.079519450800916e-06, "loss": 0.9308, "step": 5342 }, { "epoch": 1.53, "grad_norm": 10.744767189025879, "learning_rate": 7.075228832951945e-06, "loss": 0.7818, "step": 5343 }, { "epoch": 1.53, "grad_norm": 11.738082885742188, "learning_rate": 7.070938215102974e-06, "loss": 0.7811, "step": 5344 }, { "epoch": 1.53, "grad_norm": 11.322881698608398, "learning_rate": 7.066647597254005e-06, "loss": 0.7869, "step": 5345 }, { "epoch": 1.53, "grad_norm": 11.683786392211914, "learning_rate": 7.0623569794050345e-06, "loss": 0.938, "step": 5346 }, { "epoch": 1.53, "grad_norm": 9.391820907592773, "learning_rate": 7.058066361556064e-06, "loss": 0.5965, "step": 5347 }, { "epoch": 1.53, "grad_norm": 13.156702041625977, "learning_rate": 7.053775743707094e-06, "loss": 0.8922, "step": 5348 }, { "epoch": 1.53, "grad_norm": 13.145834922790527, "learning_rate": 7.049485125858124e-06, "loss": 0.8078, "step": 5349 }, { "epoch": 1.53, "grad_norm": 8.215470314025879, "learning_rate": 7.045194508009154e-06, "loss": 0.4871, "step": 5350 }, { "epoch": 1.53, "grad_norm": 11.24267578125, "learning_rate": 7.040903890160183e-06, "loss": 0.8555, "step": 5351 }, { "epoch": 1.53, "grad_norm": 10.632845878601074, "learning_rate": 7.036613272311213e-06, "loss": 0.5971, "step": 5352 }, { "epoch": 1.53, "grad_norm": 13.724236488342285, "learning_rate": 7.032322654462243e-06, "loss": 0.7376, "step": 5353 }, { "epoch": 1.53, "grad_norm": 10.956292152404785, "learning_rate": 7.028032036613273e-06, "loss": 0.6456, "step": 5354 }, { "epoch": 1.53, "grad_norm": 8.754426002502441, "learning_rate": 7.023741418764303e-06, "loss": 0.5247, "step": 5355 }, { "epoch": 1.53, "grad_norm": 9.3694486618042, "learning_rate": 7.0194508009153315e-06, "loss": 0.6017, "step": 5356 }, { "epoch": 1.53, "grad_norm": 9.64429759979248, "learning_rate": 7.015160183066361e-06, "loss": 0.5578, "step": 5357 }, { "epoch": 1.53, "grad_norm": 11.855950355529785, "learning_rate": 7.0108695652173915e-06, "loss": 0.8204, "step": 5358 }, { "epoch": 1.53, "grad_norm": 10.41483211517334, "learning_rate": 7.006578947368421e-06, "loss": 0.5388, "step": 5359 }, { "epoch": 1.53, "grad_norm": 10.524611473083496, "learning_rate": 7.002288329519451e-06, "loss": 0.5432, "step": 5360 }, { "epoch": 1.53, "grad_norm": 13.025598526000977, "learning_rate": 6.99799771167048e-06, "loss": 0.961, "step": 5361 }, { "epoch": 1.53, "grad_norm": 9.626395225524902, "learning_rate": 6.993707093821511e-06, "loss": 0.6729, "step": 5362 }, { "epoch": 1.53, "grad_norm": 11.571722984313965, "learning_rate": 6.9894164759725405e-06, "loss": 0.7831, "step": 5363 }, { "epoch": 1.53, "grad_norm": 13.247154235839844, "learning_rate": 6.98512585812357e-06, "loss": 0.7105, "step": 5364 }, { "epoch": 1.53, "grad_norm": 9.18348503112793, "learning_rate": 6.9808352402746e-06, "loss": 0.8202, "step": 5365 }, { "epoch": 1.53, "grad_norm": 10.731383323669434, "learning_rate": 6.976544622425629e-06, "loss": 0.711, "step": 5366 }, { "epoch": 1.54, "grad_norm": 10.938879013061523, "learning_rate": 6.97225400457666e-06, "loss": 0.7683, "step": 5367 }, { "epoch": 1.54, "grad_norm": 9.101683616638184, "learning_rate": 6.967963386727689e-06, "loss": 0.5983, "step": 5368 }, { "epoch": 1.54, "grad_norm": 8.889518737792969, "learning_rate": 6.963672768878718e-06, "loss": 0.636, "step": 5369 }, { "epoch": 1.54, "grad_norm": 10.405555725097656, "learning_rate": 6.959382151029748e-06, "loss": 0.6939, "step": 5370 }, { "epoch": 1.54, "grad_norm": 10.569013595581055, "learning_rate": 6.955091533180778e-06, "loss": 0.8429, "step": 5371 }, { "epoch": 1.54, "grad_norm": 9.115777969360352, "learning_rate": 6.950800915331808e-06, "loss": 0.5183, "step": 5372 }, { "epoch": 1.54, "grad_norm": 9.988287925720215, "learning_rate": 6.9465102974828375e-06, "loss": 0.7661, "step": 5373 }, { "epoch": 1.54, "grad_norm": 11.403606414794922, "learning_rate": 6.942219679633867e-06, "loss": 0.8205, "step": 5374 }, { "epoch": 1.54, "grad_norm": 12.239830017089844, "learning_rate": 6.9379290617848975e-06, "loss": 1.0035, "step": 5375 }, { "epoch": 1.54, "grad_norm": 9.13248348236084, "learning_rate": 6.933638443935927e-06, "loss": 0.6676, "step": 5376 }, { "epoch": 1.54, "grad_norm": 11.136905670166016, "learning_rate": 6.929347826086957e-06, "loss": 0.8766, "step": 5377 }, { "epoch": 1.54, "grad_norm": 7.92160701751709, "learning_rate": 6.925057208237986e-06, "loss": 0.6367, "step": 5378 }, { "epoch": 1.54, "grad_norm": 10.290059089660645, "learning_rate": 6.920766590389017e-06, "loss": 0.6464, "step": 5379 }, { "epoch": 1.54, "grad_norm": 10.359493255615234, "learning_rate": 6.9164759725400465e-06, "loss": 0.7502, "step": 5380 }, { "epoch": 1.54, "grad_norm": 9.81421184539795, "learning_rate": 6.912185354691076e-06, "loss": 0.8105, "step": 5381 }, { "epoch": 1.54, "grad_norm": 10.404888153076172, "learning_rate": 6.907894736842106e-06, "loss": 0.796, "step": 5382 }, { "epoch": 1.54, "grad_norm": 9.707366943359375, "learning_rate": 6.9036041189931345e-06, "loss": 0.7516, "step": 5383 }, { "epoch": 1.54, "grad_norm": 10.13158893585205, "learning_rate": 6.899313501144165e-06, "loss": 0.802, "step": 5384 }, { "epoch": 1.54, "grad_norm": 11.407342910766602, "learning_rate": 6.8950228832951945e-06, "loss": 0.6751, "step": 5385 }, { "epoch": 1.54, "grad_norm": 13.483102798461914, "learning_rate": 6.890732265446224e-06, "loss": 0.8375, "step": 5386 }, { "epoch": 1.54, "grad_norm": 11.197330474853516, "learning_rate": 6.886441647597254e-06, "loss": 0.662, "step": 5387 }, { "epoch": 1.54, "grad_norm": 11.573650360107422, "learning_rate": 6.882151029748284e-06, "loss": 0.85, "step": 5388 }, { "epoch": 1.54, "grad_norm": 13.032238006591797, "learning_rate": 6.877860411899314e-06, "loss": 1.016, "step": 5389 }, { "epoch": 1.54, "grad_norm": 10.803162574768066, "learning_rate": 6.8735697940503434e-06, "loss": 0.56, "step": 5390 }, { "epoch": 1.54, "grad_norm": 10.913394927978516, "learning_rate": 6.869279176201373e-06, "loss": 0.6293, "step": 5391 }, { "epoch": 1.54, "grad_norm": 10.91557502746582, "learning_rate": 6.8649885583524035e-06, "loss": 0.8244, "step": 5392 }, { "epoch": 1.54, "grad_norm": 10.184675216674805, "learning_rate": 6.860697940503433e-06, "loss": 0.6208, "step": 5393 }, { "epoch": 1.54, "grad_norm": 10.222221374511719, "learning_rate": 6.856407322654463e-06, "loss": 0.6482, "step": 5394 }, { "epoch": 1.54, "grad_norm": 8.793624877929688, "learning_rate": 6.852116704805492e-06, "loss": 0.5475, "step": 5395 }, { "epoch": 1.54, "grad_norm": 11.794410705566406, "learning_rate": 6.847826086956521e-06, "loss": 0.5631, "step": 5396 }, { "epoch": 1.54, "grad_norm": 13.063423156738281, "learning_rate": 6.843535469107552e-06, "loss": 0.5843, "step": 5397 }, { "epoch": 1.54, "grad_norm": 12.135796546936035, "learning_rate": 6.839244851258581e-06, "loss": 0.6722, "step": 5398 }, { "epoch": 1.54, "grad_norm": 11.323772430419922, "learning_rate": 6.834954233409611e-06, "loss": 0.5577, "step": 5399 }, { "epoch": 1.54, "grad_norm": 11.501399993896484, "learning_rate": 6.8306636155606404e-06, "loss": 0.8055, "step": 5400 }, { "epoch": 1.54, "grad_norm": 9.352534294128418, "learning_rate": 6.826372997711671e-06, "loss": 0.7704, "step": 5401 }, { "epoch": 1.55, "grad_norm": 11.826374053955078, "learning_rate": 6.8220823798627005e-06, "loss": 0.6599, "step": 5402 }, { "epoch": 1.55, "grad_norm": 9.877493858337402, "learning_rate": 6.81779176201373e-06, "loss": 0.76, "step": 5403 }, { "epoch": 1.55, "grad_norm": 11.737428665161133, "learning_rate": 6.81350114416476e-06, "loss": 0.7593, "step": 5404 }, { "epoch": 1.55, "grad_norm": 10.759040832519531, "learning_rate": 6.80921052631579e-06, "loss": 0.9183, "step": 5405 }, { "epoch": 1.55, "grad_norm": 10.864048957824707, "learning_rate": 6.80491990846682e-06, "loss": 0.7596, "step": 5406 }, { "epoch": 1.55, "grad_norm": 9.24080753326416, "learning_rate": 6.8006292906178494e-06, "loss": 0.6632, "step": 5407 }, { "epoch": 1.55, "grad_norm": 11.46875, "learning_rate": 6.796338672768879e-06, "loss": 0.6237, "step": 5408 }, { "epoch": 1.55, "grad_norm": 11.83765697479248, "learning_rate": 6.792048054919909e-06, "loss": 0.7213, "step": 5409 }, { "epoch": 1.55, "grad_norm": 12.011991500854492, "learning_rate": 6.787757437070938e-06, "loss": 0.9464, "step": 5410 }, { "epoch": 1.55, "grad_norm": 10.703045845031738, "learning_rate": 6.783466819221968e-06, "loss": 0.7118, "step": 5411 }, { "epoch": 1.55, "grad_norm": 10.701443672180176, "learning_rate": 6.7791762013729975e-06, "loss": 0.5642, "step": 5412 }, { "epoch": 1.55, "grad_norm": 10.329233169555664, "learning_rate": 6.774885583524027e-06, "loss": 0.6452, "step": 5413 }, { "epoch": 1.55, "grad_norm": 8.791563034057617, "learning_rate": 6.770594965675058e-06, "loss": 0.5704, "step": 5414 }, { "epoch": 1.55, "grad_norm": 9.633209228515625, "learning_rate": 6.766304347826087e-06, "loss": 0.8627, "step": 5415 }, { "epoch": 1.55, "grad_norm": 9.805323600769043, "learning_rate": 6.762013729977117e-06, "loss": 0.6023, "step": 5416 }, { "epoch": 1.55, "grad_norm": 10.737568855285645, "learning_rate": 6.7577231121281464e-06, "loss": 0.708, "step": 5417 }, { "epoch": 1.55, "grad_norm": 8.404878616333008, "learning_rate": 6.753432494279177e-06, "loss": 0.6025, "step": 5418 }, { "epoch": 1.55, "grad_norm": 10.053535461425781, "learning_rate": 6.7491418764302065e-06, "loss": 0.8781, "step": 5419 }, { "epoch": 1.55, "grad_norm": 13.944951057434082, "learning_rate": 6.744851258581236e-06, "loss": 1.1078, "step": 5420 }, { "epoch": 1.55, "grad_norm": 9.490021705627441, "learning_rate": 6.740560640732266e-06, "loss": 0.6951, "step": 5421 }, { "epoch": 1.55, "grad_norm": 9.664155960083008, "learning_rate": 6.736270022883295e-06, "loss": 0.8254, "step": 5422 }, { "epoch": 1.55, "grad_norm": 11.093666076660156, "learning_rate": 6.731979405034325e-06, "loss": 0.6202, "step": 5423 }, { "epoch": 1.55, "grad_norm": 11.165013313293457, "learning_rate": 6.727688787185355e-06, "loss": 0.7902, "step": 5424 }, { "epoch": 1.55, "grad_norm": 10.131797790527344, "learning_rate": 6.723398169336384e-06, "loss": 0.596, "step": 5425 }, { "epoch": 1.55, "grad_norm": 9.271760940551758, "learning_rate": 6.719107551487414e-06, "loss": 0.4633, "step": 5426 }, { "epoch": 1.55, "grad_norm": 11.775052070617676, "learning_rate": 6.714816933638444e-06, "loss": 0.8464, "step": 5427 }, { "epoch": 1.55, "grad_norm": 8.292208671569824, "learning_rate": 6.710526315789474e-06, "loss": 0.5077, "step": 5428 }, { "epoch": 1.55, "grad_norm": 10.974455833435059, "learning_rate": 6.7062356979405035e-06, "loss": 0.721, "step": 5429 }, { "epoch": 1.55, "grad_norm": 10.512555122375488, "learning_rate": 6.701945080091533e-06, "loss": 0.961, "step": 5430 }, { "epoch": 1.55, "grad_norm": 15.095166206359863, "learning_rate": 6.697654462242564e-06, "loss": 0.8586, "step": 5431 }, { "epoch": 1.55, "grad_norm": 11.317481994628906, "learning_rate": 6.693363844393593e-06, "loss": 0.9296, "step": 5432 }, { "epoch": 1.55, "grad_norm": 10.495819091796875, "learning_rate": 6.689073226544623e-06, "loss": 0.8478, "step": 5433 }, { "epoch": 1.55, "grad_norm": 11.36196231842041, "learning_rate": 6.6847826086956524e-06, "loss": 0.9135, "step": 5434 }, { "epoch": 1.55, "grad_norm": 11.978899002075195, "learning_rate": 6.680491990846682e-06, "loss": 0.8818, "step": 5435 }, { "epoch": 1.55, "grad_norm": 10.200531005859375, "learning_rate": 6.676201372997712e-06, "loss": 0.8606, "step": 5436 }, { "epoch": 1.56, "grad_norm": 11.20178508758545, "learning_rate": 6.671910755148741e-06, "loss": 0.5133, "step": 5437 }, { "epoch": 1.56, "grad_norm": 10.661775588989258, "learning_rate": 6.667620137299771e-06, "loss": 0.6154, "step": 5438 }, { "epoch": 1.56, "grad_norm": 10.742033958435059, "learning_rate": 6.663329519450801e-06, "loss": 0.4946, "step": 5439 }, { "epoch": 1.56, "grad_norm": 11.584891319274902, "learning_rate": 6.659038901601831e-06, "loss": 0.8595, "step": 5440 }, { "epoch": 1.56, "grad_norm": 9.102738380432129, "learning_rate": 6.654748283752861e-06, "loss": 0.7664, "step": 5441 }, { "epoch": 1.56, "grad_norm": 10.778501510620117, "learning_rate": 6.65045766590389e-06, "loss": 0.6236, "step": 5442 }, { "epoch": 1.56, "grad_norm": 9.565971374511719, "learning_rate": 6.64616704805492e-06, "loss": 0.6284, "step": 5443 }, { "epoch": 1.56, "grad_norm": 14.338422775268555, "learning_rate": 6.64187643020595e-06, "loss": 1.364, "step": 5444 }, { "epoch": 1.56, "grad_norm": 9.952033996582031, "learning_rate": 6.63758581235698e-06, "loss": 0.6922, "step": 5445 }, { "epoch": 1.56, "grad_norm": 11.656608581542969, "learning_rate": 6.6332951945080095e-06, "loss": 0.8932, "step": 5446 }, { "epoch": 1.56, "grad_norm": 10.877176284790039, "learning_rate": 6.629004576659039e-06, "loss": 0.7369, "step": 5447 }, { "epoch": 1.56, "grad_norm": 9.53266429901123, "learning_rate": 6.624713958810069e-06, "loss": 0.7248, "step": 5448 }, { "epoch": 1.56, "grad_norm": 10.719442367553711, "learning_rate": 6.620423340961098e-06, "loss": 0.5127, "step": 5449 }, { "epoch": 1.56, "grad_norm": 9.9187593460083, "learning_rate": 6.616132723112128e-06, "loss": 0.7927, "step": 5450 }, { "epoch": 1.56, "grad_norm": 10.836196899414062, "learning_rate": 6.611842105263158e-06, "loss": 0.9459, "step": 5451 }, { "epoch": 1.56, "grad_norm": 9.903067588806152, "learning_rate": 6.607551487414188e-06, "loss": 0.7539, "step": 5452 }, { "epoch": 1.56, "grad_norm": 9.817303657531738, "learning_rate": 6.603260869565218e-06, "loss": 0.5441, "step": 5453 }, { "epoch": 1.56, "grad_norm": 8.64751148223877, "learning_rate": 6.598970251716247e-06, "loss": 0.5629, "step": 5454 }, { "epoch": 1.56, "grad_norm": 8.386674880981445, "learning_rate": 6.594679633867277e-06, "loss": 0.5778, "step": 5455 }, { "epoch": 1.56, "grad_norm": 8.778495788574219, "learning_rate": 6.5903890160183065e-06, "loss": 0.7115, "step": 5456 }, { "epoch": 1.56, "grad_norm": 9.286686897277832, "learning_rate": 6.586098398169337e-06, "loss": 0.6961, "step": 5457 }, { "epoch": 1.56, "grad_norm": 9.776361465454102, "learning_rate": 6.581807780320367e-06, "loss": 0.7327, "step": 5458 }, { "epoch": 1.56, "grad_norm": 13.709680557250977, "learning_rate": 6.577517162471396e-06, "loss": 0.9498, "step": 5459 }, { "epoch": 1.56, "grad_norm": 11.222743034362793, "learning_rate": 6.573226544622426e-06, "loss": 0.5232, "step": 5460 }, { "epoch": 1.56, "grad_norm": 12.961259841918945, "learning_rate": 6.5689359267734554e-06, "loss": 0.7338, "step": 5461 }, { "epoch": 1.56, "grad_norm": 9.301464080810547, "learning_rate": 6.564645308924485e-06, "loss": 0.7528, "step": 5462 }, { "epoch": 1.56, "grad_norm": 19.588850021362305, "learning_rate": 6.560354691075515e-06, "loss": 0.807, "step": 5463 }, { "epoch": 1.56, "grad_norm": 10.187501907348633, "learning_rate": 6.556064073226544e-06, "loss": 1.0132, "step": 5464 }, { "epoch": 1.56, "grad_norm": 10.580513954162598, "learning_rate": 6.551773455377575e-06, "loss": 0.5971, "step": 5465 }, { "epoch": 1.56, "grad_norm": 11.745843887329102, "learning_rate": 6.547482837528604e-06, "loss": 0.9573, "step": 5466 }, { "epoch": 1.56, "grad_norm": 8.349631309509277, "learning_rate": 6.543192219679634e-06, "loss": 0.4772, "step": 5467 }, { "epoch": 1.56, "grad_norm": 9.672638893127441, "learning_rate": 6.538901601830664e-06, "loss": 0.6586, "step": 5468 }, { "epoch": 1.56, "grad_norm": 10.535762786865234, "learning_rate": 6.534610983981694e-06, "loss": 0.7865, "step": 5469 }, { "epoch": 1.56, "grad_norm": 10.90261459350586, "learning_rate": 6.530320366132724e-06, "loss": 0.5622, "step": 5470 }, { "epoch": 1.56, "grad_norm": 10.270242691040039, "learning_rate": 6.526029748283753e-06, "loss": 0.7503, "step": 5471 }, { "epoch": 1.57, "grad_norm": 10.051499366760254, "learning_rate": 6.521739130434783e-06, "loss": 0.8852, "step": 5472 }, { "epoch": 1.57, "grad_norm": 15.139245986938477, "learning_rate": 6.5174485125858125e-06, "loss": 0.7186, "step": 5473 }, { "epoch": 1.57, "grad_norm": 12.09343433380127, "learning_rate": 6.513157894736842e-06, "loss": 0.8723, "step": 5474 }, { "epoch": 1.57, "grad_norm": 8.4949312210083, "learning_rate": 6.508867276887872e-06, "loss": 0.5647, "step": 5475 }, { "epoch": 1.57, "grad_norm": 12.268529891967773, "learning_rate": 6.504576659038901e-06, "loss": 0.8052, "step": 5476 }, { "epoch": 1.57, "grad_norm": 12.441484451293945, "learning_rate": 6.500286041189931e-06, "loss": 0.5815, "step": 5477 }, { "epoch": 1.57, "grad_norm": 10.255128860473633, "learning_rate": 6.495995423340961e-06, "loss": 0.6331, "step": 5478 }, { "epoch": 1.57, "grad_norm": 11.862914085388184, "learning_rate": 6.491704805491991e-06, "loss": 0.6901, "step": 5479 }, { "epoch": 1.57, "grad_norm": 10.020846366882324, "learning_rate": 6.487414187643021e-06, "loss": 0.7216, "step": 5480 }, { "epoch": 1.57, "grad_norm": 10.554515838623047, "learning_rate": 6.48312356979405e-06, "loss": 0.7237, "step": 5481 }, { "epoch": 1.57, "grad_norm": 12.486430168151855, "learning_rate": 6.478832951945081e-06, "loss": 0.6598, "step": 5482 }, { "epoch": 1.57, "grad_norm": 10.92392349243164, "learning_rate": 6.47454233409611e-06, "loss": 0.7801, "step": 5483 }, { "epoch": 1.57, "grad_norm": 10.194019317626953, "learning_rate": 6.47025171624714e-06, "loss": 0.6593, "step": 5484 }, { "epoch": 1.57, "grad_norm": 10.077391624450684, "learning_rate": 6.4659610983981696e-06, "loss": 0.6658, "step": 5485 }, { "epoch": 1.57, "grad_norm": 10.685776710510254, "learning_rate": 6.461670480549199e-06, "loss": 0.6464, "step": 5486 }, { "epoch": 1.57, "grad_norm": 9.115002632141113, "learning_rate": 6.45737986270023e-06, "loss": 0.7477, "step": 5487 }, { "epoch": 1.57, "grad_norm": 8.218844413757324, "learning_rate": 6.453089244851258e-06, "loss": 0.6765, "step": 5488 }, { "epoch": 1.57, "grad_norm": 12.361523628234863, "learning_rate": 6.448798627002288e-06, "loss": 0.749, "step": 5489 }, { "epoch": 1.57, "grad_norm": 8.853835105895996, "learning_rate": 6.444508009153318e-06, "loss": 0.6066, "step": 5490 }, { "epoch": 1.57, "grad_norm": 10.084883689880371, "learning_rate": 6.440217391304348e-06, "loss": 0.7014, "step": 5491 }, { "epoch": 1.57, "grad_norm": 12.231773376464844, "learning_rate": 6.435926773455378e-06, "loss": 0.9233, "step": 5492 }, { "epoch": 1.57, "grad_norm": 12.060643196105957, "learning_rate": 6.431636155606407e-06, "loss": 0.8505, "step": 5493 }, { "epoch": 1.57, "grad_norm": 12.02497386932373, "learning_rate": 6.427345537757437e-06, "loss": 0.6959, "step": 5494 }, { "epoch": 1.57, "grad_norm": 10.105432510375977, "learning_rate": 6.423054919908467e-06, "loss": 0.6364, "step": 5495 }, { "epoch": 1.57, "grad_norm": 11.441240310668945, "learning_rate": 6.418764302059497e-06, "loss": 0.7262, "step": 5496 }, { "epoch": 1.57, "grad_norm": 10.952935218811035, "learning_rate": 6.414473684210527e-06, "loss": 0.6929, "step": 5497 }, { "epoch": 1.57, "grad_norm": 11.399803161621094, "learning_rate": 6.410183066361556e-06, "loss": 0.857, "step": 5498 }, { "epoch": 1.57, "grad_norm": 16.05520248413086, "learning_rate": 6.405892448512587e-06, "loss": 0.8272, "step": 5499 }, { "epoch": 1.57, "grad_norm": 10.909257888793945, "learning_rate": 6.401601830663616e-06, "loss": 0.8576, "step": 5500 }, { "epoch": 1.57, "grad_norm": 9.228020668029785, "learning_rate": 6.397311212814645e-06, "loss": 0.4105, "step": 5501 }, { "epoch": 1.57, "grad_norm": 10.026965141296387, "learning_rate": 6.393020594965675e-06, "loss": 0.5532, "step": 5502 }, { "epoch": 1.57, "grad_norm": 12.771260261535645, "learning_rate": 6.388729977116704e-06, "loss": 0.7664, "step": 5503 }, { "epoch": 1.57, "grad_norm": 11.88595962524414, "learning_rate": 6.384439359267735e-06, "loss": 0.8299, "step": 5504 }, { "epoch": 1.57, "grad_norm": 9.56600570678711, "learning_rate": 6.380148741418764e-06, "loss": 0.7204, "step": 5505 }, { "epoch": 1.57, "grad_norm": 9.454607009887695, "learning_rate": 6.375858123569794e-06, "loss": 0.5083, "step": 5506 }, { "epoch": 1.58, "grad_norm": 11.280576705932617, "learning_rate": 6.371567505720824e-06, "loss": 0.7152, "step": 5507 }, { "epoch": 1.58, "grad_norm": 10.759710311889648, "learning_rate": 6.367276887871854e-06, "loss": 0.8317, "step": 5508 }, { "epoch": 1.58, "grad_norm": 9.474405288696289, "learning_rate": 6.362986270022884e-06, "loss": 0.7682, "step": 5509 }, { "epoch": 1.58, "grad_norm": 11.654435157775879, "learning_rate": 6.358695652173913e-06, "loss": 0.7855, "step": 5510 }, { "epoch": 1.58, "grad_norm": 11.794870376586914, "learning_rate": 6.354405034324943e-06, "loss": 0.7901, "step": 5511 }, { "epoch": 1.58, "grad_norm": 11.223348617553711, "learning_rate": 6.350114416475973e-06, "loss": 0.6573, "step": 5512 }, { "epoch": 1.58, "grad_norm": 7.961227893829346, "learning_rate": 6.345823798627003e-06, "loss": 0.5521, "step": 5513 }, { "epoch": 1.58, "grad_norm": 9.095694541931152, "learning_rate": 6.341533180778032e-06, "loss": 0.7054, "step": 5514 }, { "epoch": 1.58, "grad_norm": 10.895547866821289, "learning_rate": 6.337242562929061e-06, "loss": 0.9474, "step": 5515 }, { "epoch": 1.58, "grad_norm": 12.356182098388672, "learning_rate": 6.332951945080091e-06, "loss": 0.7122, "step": 5516 }, { "epoch": 1.58, "grad_norm": 9.5460786819458, "learning_rate": 6.3286613272311215e-06, "loss": 0.6152, "step": 5517 }, { "epoch": 1.58, "grad_norm": 9.811149597167969, "learning_rate": 6.324370709382151e-06, "loss": 0.7623, "step": 5518 }, { "epoch": 1.58, "grad_norm": 8.240185737609863, "learning_rate": 6.320080091533181e-06, "loss": 0.6884, "step": 5519 }, { "epoch": 1.58, "grad_norm": 11.257939338684082, "learning_rate": 6.31578947368421e-06, "loss": 0.481, "step": 5520 }, { "epoch": 1.58, "grad_norm": 10.784560203552246, "learning_rate": 6.311498855835241e-06, "loss": 0.5507, "step": 5521 }, { "epoch": 1.58, "grad_norm": 11.940464973449707, "learning_rate": 6.30720823798627e-06, "loss": 0.7215, "step": 5522 }, { "epoch": 1.58, "grad_norm": 11.188111305236816, "learning_rate": 6.3029176201373e-06, "loss": 0.5997, "step": 5523 }, { "epoch": 1.58, "grad_norm": 7.8785223960876465, "learning_rate": 6.29862700228833e-06, "loss": 0.592, "step": 5524 }, { "epoch": 1.58, "grad_norm": 11.687405586242676, "learning_rate": 6.29433638443936e-06, "loss": 0.679, "step": 5525 }, { "epoch": 1.58, "grad_norm": 12.611087799072266, "learning_rate": 6.29004576659039e-06, "loss": 1.0554, "step": 5526 }, { "epoch": 1.58, "grad_norm": 9.295668601989746, "learning_rate": 6.2857551487414185e-06, "loss": 0.4928, "step": 5527 }, { "epoch": 1.58, "grad_norm": 11.037130355834961, "learning_rate": 6.281464530892448e-06, "loss": 0.6082, "step": 5528 }, { "epoch": 1.58, "grad_norm": 9.137391090393066, "learning_rate": 6.2771739130434786e-06, "loss": 0.5593, "step": 5529 }, { "epoch": 1.58, "grad_norm": 12.662400245666504, "learning_rate": 6.272883295194508e-06, "loss": 0.8509, "step": 5530 }, { "epoch": 1.58, "grad_norm": 11.213057518005371, "learning_rate": 6.268592677345538e-06, "loss": 0.7416, "step": 5531 }, { "epoch": 1.58, "grad_norm": 10.6245698928833, "learning_rate": 6.264302059496567e-06, "loss": 0.5605, "step": 5532 }, { "epoch": 1.58, "grad_norm": 8.723930358886719, "learning_rate": 6.260011441647597e-06, "loss": 0.5844, "step": 5533 }, { "epoch": 1.58, "grad_norm": 10.928689002990723, "learning_rate": 6.2557208237986275e-06, "loss": 0.6429, "step": 5534 }, { "epoch": 1.58, "grad_norm": 11.049684524536133, "learning_rate": 6.251430205949657e-06, "loss": 0.8551, "step": 5535 }, { "epoch": 1.58, "grad_norm": 9.772027969360352, "learning_rate": 6.247139588100687e-06, "loss": 0.7677, "step": 5536 }, { "epoch": 1.58, "grad_norm": 9.763443946838379, "learning_rate": 6.242848970251716e-06, "loss": 0.7714, "step": 5537 }, { "epoch": 1.58, "grad_norm": 10.62363338470459, "learning_rate": 6.238558352402747e-06, "loss": 0.5815, "step": 5538 }, { "epoch": 1.58, "grad_norm": 8.02758502960205, "learning_rate": 6.234267734553776e-06, "loss": 0.5845, "step": 5539 }, { "epoch": 1.58, "grad_norm": 9.214056015014648, "learning_rate": 6.229977116704805e-06, "loss": 0.6295, "step": 5540 }, { "epoch": 1.58, "grad_norm": 11.316672325134277, "learning_rate": 6.225686498855835e-06, "loss": 0.6987, "step": 5541 }, { "epoch": 1.59, "grad_norm": 10.765800476074219, "learning_rate": 6.221395881006865e-06, "loss": 0.6417, "step": 5542 }, { "epoch": 1.59, "grad_norm": 9.062914848327637, "learning_rate": 6.217105263157895e-06, "loss": 0.6407, "step": 5543 }, { "epoch": 1.59, "grad_norm": 9.839356422424316, "learning_rate": 6.2128146453089245e-06, "loss": 0.5741, "step": 5544 }, { "epoch": 1.59, "grad_norm": 10.004151344299316, "learning_rate": 6.208524027459954e-06, "loss": 0.5263, "step": 5545 }, { "epoch": 1.59, "grad_norm": 9.841792106628418, "learning_rate": 6.204233409610984e-06, "loss": 0.624, "step": 5546 }, { "epoch": 1.59, "grad_norm": 13.122528076171875, "learning_rate": 6.199942791762014e-06, "loss": 1.1056, "step": 5547 }, { "epoch": 1.59, "grad_norm": 9.106522560119629, "learning_rate": 6.195652173913044e-06, "loss": 0.6107, "step": 5548 }, { "epoch": 1.59, "grad_norm": 9.179657936096191, "learning_rate": 6.191361556064073e-06, "loss": 0.6582, "step": 5549 }, { "epoch": 1.59, "grad_norm": 11.946647644042969, "learning_rate": 6.187070938215103e-06, "loss": 0.7773, "step": 5550 }, { "epoch": 1.59, "grad_norm": 11.384345054626465, "learning_rate": 6.1827803203661335e-06, "loss": 0.86, "step": 5551 }, { "epoch": 1.59, "grad_norm": 10.477992057800293, "learning_rate": 6.178489702517163e-06, "loss": 0.8906, "step": 5552 }, { "epoch": 1.59, "grad_norm": 14.003066062927246, "learning_rate": 6.174199084668192e-06, "loss": 0.8674, "step": 5553 }, { "epoch": 1.59, "grad_norm": 12.793807983398438, "learning_rate": 6.1699084668192215e-06, "loss": 0.7947, "step": 5554 }, { "epoch": 1.59, "grad_norm": 10.329636573791504, "learning_rate": 6.165617848970252e-06, "loss": 0.9208, "step": 5555 }, { "epoch": 1.59, "grad_norm": 10.548229217529297, "learning_rate": 6.1613272311212815e-06, "loss": 0.6526, "step": 5556 }, { "epoch": 1.59, "grad_norm": 9.881653785705566, "learning_rate": 6.157036613272311e-06, "loss": 0.7158, "step": 5557 }, { "epoch": 1.59, "grad_norm": 10.707265853881836, "learning_rate": 6.152745995423341e-06, "loss": 0.6928, "step": 5558 }, { "epoch": 1.59, "grad_norm": 9.42878532409668, "learning_rate": 6.148455377574371e-06, "loss": 0.6004, "step": 5559 }, { "epoch": 1.59, "grad_norm": 11.764908790588379, "learning_rate": 6.144164759725401e-06, "loss": 0.8836, "step": 5560 }, { "epoch": 1.59, "grad_norm": 10.387475967407227, "learning_rate": 6.1398741418764305e-06, "loss": 0.7904, "step": 5561 }, { "epoch": 1.59, "grad_norm": 11.080256462097168, "learning_rate": 6.13558352402746e-06, "loss": 0.6863, "step": 5562 }, { "epoch": 1.59, "grad_norm": 10.54133129119873, "learning_rate": 6.13129290617849e-06, "loss": 0.8626, "step": 5563 }, { "epoch": 1.59, "grad_norm": 9.514528274536133, "learning_rate": 6.12700228832952e-06, "loss": 0.6721, "step": 5564 }, { "epoch": 1.59, "grad_norm": 12.685568809509277, "learning_rate": 6.12271167048055e-06, "loss": 0.9852, "step": 5565 }, { "epoch": 1.59, "grad_norm": 10.1242036819458, "learning_rate": 6.118421052631579e-06, "loss": 0.7792, "step": 5566 }, { "epoch": 1.59, "grad_norm": 14.765839576721191, "learning_rate": 6.114130434782608e-06, "loss": 0.8978, "step": 5567 }, { "epoch": 1.59, "grad_norm": 9.789997100830078, "learning_rate": 6.109839816933639e-06, "loss": 0.6368, "step": 5568 }, { "epoch": 1.59, "grad_norm": 8.29202938079834, "learning_rate": 6.105549199084668e-06, "loss": 0.455, "step": 5569 }, { "epoch": 1.59, "grad_norm": 10.819106101989746, "learning_rate": 6.101258581235698e-06, "loss": 0.6887, "step": 5570 }, { "epoch": 1.59, "grad_norm": 10.95679759979248, "learning_rate": 6.0969679633867275e-06, "loss": 0.792, "step": 5571 }, { "epoch": 1.59, "grad_norm": 11.622454643249512, "learning_rate": 6.092677345537758e-06, "loss": 0.6522, "step": 5572 }, { "epoch": 1.59, "grad_norm": 9.316247940063477, "learning_rate": 6.0883867276887875e-06, "loss": 0.5935, "step": 5573 }, { "epoch": 1.59, "grad_norm": 11.921667098999023, "learning_rate": 6.084096109839817e-06, "loss": 0.7116, "step": 5574 }, { "epoch": 1.59, "grad_norm": 11.171442031860352, "learning_rate": 6.079805491990847e-06, "loss": 0.8473, "step": 5575 }, { "epoch": 1.59, "grad_norm": 17.87783432006836, "learning_rate": 6.075514874141876e-06, "loss": 0.8636, "step": 5576 }, { "epoch": 1.6, "grad_norm": 9.572556495666504, "learning_rate": 6.071224256292907e-06, "loss": 0.6621, "step": 5577 }, { "epoch": 1.6, "grad_norm": 10.472742080688477, "learning_rate": 6.0669336384439365e-06, "loss": 0.7571, "step": 5578 }, { "epoch": 1.6, "grad_norm": 9.450839042663574, "learning_rate": 6.062643020594966e-06, "loss": 0.5779, "step": 5579 }, { "epoch": 1.6, "grad_norm": 12.35782527923584, "learning_rate": 6.058352402745995e-06, "loss": 0.7953, "step": 5580 }, { "epoch": 1.6, "grad_norm": 10.768641471862793, "learning_rate": 6.054061784897025e-06, "loss": 0.8999, "step": 5581 }, { "epoch": 1.6, "grad_norm": 11.072747230529785, "learning_rate": 6.049771167048055e-06, "loss": 0.5263, "step": 5582 }, { "epoch": 1.6, "grad_norm": 8.882772445678711, "learning_rate": 6.0454805491990845e-06, "loss": 0.6575, "step": 5583 }, { "epoch": 1.6, "grad_norm": 11.697036743164062, "learning_rate": 6.041189931350114e-06, "loss": 0.9594, "step": 5584 }, { "epoch": 1.6, "grad_norm": 10.463035583496094, "learning_rate": 6.036899313501145e-06, "loss": 0.6768, "step": 5585 }, { "epoch": 1.6, "grad_norm": 11.004870414733887, "learning_rate": 6.032608695652174e-06, "loss": 0.5053, "step": 5586 }, { "epoch": 1.6, "grad_norm": 10.203104972839355, "learning_rate": 6.028318077803204e-06, "loss": 0.5423, "step": 5587 }, { "epoch": 1.6, "grad_norm": 10.33894157409668, "learning_rate": 6.0240274599542335e-06, "loss": 0.6638, "step": 5588 }, { "epoch": 1.6, "grad_norm": 9.002959251403809, "learning_rate": 6.019736842105264e-06, "loss": 0.7116, "step": 5589 }, { "epoch": 1.6, "grad_norm": 9.3934907913208, "learning_rate": 6.0154462242562935e-06, "loss": 0.6622, "step": 5590 }, { "epoch": 1.6, "grad_norm": 10.514039993286133, "learning_rate": 6.011155606407323e-06, "loss": 0.7518, "step": 5591 }, { "epoch": 1.6, "grad_norm": 9.802041053771973, "learning_rate": 6.006864988558353e-06, "loss": 0.9095, "step": 5592 }, { "epoch": 1.6, "grad_norm": 9.759675979614258, "learning_rate": 6.0025743707093815e-06, "loss": 0.7053, "step": 5593 }, { "epoch": 1.6, "grad_norm": 10.36616039276123, "learning_rate": 5.998283752860412e-06, "loss": 0.707, "step": 5594 }, { "epoch": 1.6, "grad_norm": 9.417503356933594, "learning_rate": 5.993993135011442e-06, "loss": 0.7954, "step": 5595 }, { "epoch": 1.6, "grad_norm": 10.612123489379883, "learning_rate": 5.989702517162471e-06, "loss": 0.5458, "step": 5596 }, { "epoch": 1.6, "grad_norm": 11.943852424621582, "learning_rate": 5.985411899313501e-06, "loss": 0.782, "step": 5597 }, { "epoch": 1.6, "grad_norm": 9.845441818237305, "learning_rate": 5.981121281464531e-06, "loss": 0.6633, "step": 5598 }, { "epoch": 1.6, "grad_norm": 9.53563117980957, "learning_rate": 5.976830663615561e-06, "loss": 0.7448, "step": 5599 }, { "epoch": 1.6, "grad_norm": 13.612221717834473, "learning_rate": 5.9725400457665905e-06, "loss": 0.9435, "step": 5600 }, { "epoch": 1.6, "grad_norm": 10.986820220947266, "learning_rate": 5.96824942791762e-06, "loss": 0.593, "step": 5601 }, { "epoch": 1.6, "grad_norm": 12.078536033630371, "learning_rate": 5.963958810068651e-06, "loss": 0.7624, "step": 5602 }, { "epoch": 1.6, "grad_norm": 9.601693153381348, "learning_rate": 5.95966819221968e-06, "loss": 0.5876, "step": 5603 }, { "epoch": 1.6, "grad_norm": 11.859810829162598, "learning_rate": 5.95537757437071e-06, "loss": 0.5815, "step": 5604 }, { "epoch": 1.6, "grad_norm": 8.395353317260742, "learning_rate": 5.9510869565217395e-06, "loss": 0.5982, "step": 5605 }, { "epoch": 1.6, "grad_norm": 11.383386611938477, "learning_rate": 5.946796338672768e-06, "loss": 0.8622, "step": 5606 }, { "epoch": 1.6, "grad_norm": 10.532876968383789, "learning_rate": 5.942505720823799e-06, "loss": 0.6901, "step": 5607 }, { "epoch": 1.6, "grad_norm": 12.721446990966797, "learning_rate": 5.938215102974828e-06, "loss": 0.5373, "step": 5608 }, { "epoch": 1.6, "grad_norm": 10.015861511230469, "learning_rate": 5.933924485125858e-06, "loss": 0.6831, "step": 5609 }, { "epoch": 1.6, "grad_norm": 10.524345397949219, "learning_rate": 5.9296338672768875e-06, "loss": 0.5984, "step": 5610 }, { "epoch": 1.6, "grad_norm": 10.858034133911133, "learning_rate": 5.925343249427918e-06, "loss": 0.608, "step": 5611 }, { "epoch": 1.61, "grad_norm": 8.153266906738281, "learning_rate": 5.921052631578948e-06, "loss": 0.5376, "step": 5612 }, { "epoch": 1.61, "grad_norm": 7.881692886352539, "learning_rate": 5.916762013729977e-06, "loss": 0.6893, "step": 5613 }, { "epoch": 1.61, "grad_norm": 14.807575225830078, "learning_rate": 5.912471395881007e-06, "loss": 0.9757, "step": 5614 }, { "epoch": 1.61, "grad_norm": 12.7268705368042, "learning_rate": 5.908180778032037e-06, "loss": 0.9282, "step": 5615 }, { "epoch": 1.61, "grad_norm": 8.834338188171387, "learning_rate": 5.903890160183067e-06, "loss": 0.6952, "step": 5616 }, { "epoch": 1.61, "grad_norm": 11.114319801330566, "learning_rate": 5.8995995423340965e-06, "loss": 0.9001, "step": 5617 }, { "epoch": 1.61, "grad_norm": 10.926636695861816, "learning_rate": 5.895308924485126e-06, "loss": 0.4992, "step": 5618 }, { "epoch": 1.61, "grad_norm": 12.872359275817871, "learning_rate": 5.891018306636156e-06, "loss": 0.9402, "step": 5619 }, { "epoch": 1.61, "grad_norm": 12.638120651245117, "learning_rate": 5.886727688787185e-06, "loss": 0.8616, "step": 5620 }, { "epoch": 1.61, "grad_norm": 11.012920379638672, "learning_rate": 5.882437070938215e-06, "loss": 0.83, "step": 5621 }, { "epoch": 1.61, "grad_norm": 12.366905212402344, "learning_rate": 5.878146453089245e-06, "loss": 0.7689, "step": 5622 }, { "epoch": 1.61, "grad_norm": 12.612177848815918, "learning_rate": 5.873855835240274e-06, "loss": 0.937, "step": 5623 }, { "epoch": 1.61, "grad_norm": 7.9943718910217285, "learning_rate": 5.869565217391305e-06, "loss": 0.5223, "step": 5624 }, { "epoch": 1.61, "grad_norm": 10.88665771484375, "learning_rate": 5.865274599542334e-06, "loss": 0.5805, "step": 5625 }, { "epoch": 1.61, "grad_norm": 8.64239501953125, "learning_rate": 5.860983981693364e-06, "loss": 0.5296, "step": 5626 }, { "epoch": 1.61, "grad_norm": 9.842996597290039, "learning_rate": 5.8566933638443935e-06, "loss": 0.5918, "step": 5627 }, { "epoch": 1.61, "grad_norm": 8.364131927490234, "learning_rate": 5.852402745995424e-06, "loss": 0.5218, "step": 5628 }, { "epoch": 1.61, "grad_norm": 12.647870063781738, "learning_rate": 5.848112128146454e-06, "loss": 0.7731, "step": 5629 }, { "epoch": 1.61, "grad_norm": 11.33095645904541, "learning_rate": 5.843821510297483e-06, "loss": 0.8719, "step": 5630 }, { "epoch": 1.61, "grad_norm": 9.10512924194336, "learning_rate": 5.839530892448513e-06, "loss": 0.4263, "step": 5631 }, { "epoch": 1.61, "grad_norm": 8.400649070739746, "learning_rate": 5.8352402745995424e-06, "loss": 0.6073, "step": 5632 }, { "epoch": 1.61, "grad_norm": 11.631400108337402, "learning_rate": 5.830949656750572e-06, "loss": 0.7929, "step": 5633 }, { "epoch": 1.61, "grad_norm": 11.168889045715332, "learning_rate": 5.826659038901602e-06, "loss": 0.7255, "step": 5634 }, { "epoch": 1.61, "grad_norm": 11.418717384338379, "learning_rate": 5.822368421052631e-06, "loss": 0.5485, "step": 5635 }, { "epoch": 1.61, "grad_norm": 14.530598640441895, "learning_rate": 5.818077803203661e-06, "loss": 0.7852, "step": 5636 }, { "epoch": 1.61, "grad_norm": 11.886549949645996, "learning_rate": 5.813787185354691e-06, "loss": 0.7901, "step": 5637 }, { "epoch": 1.61, "grad_norm": 14.043811798095703, "learning_rate": 5.809496567505721e-06, "loss": 1.0616, "step": 5638 }, { "epoch": 1.61, "grad_norm": 9.342390060424805, "learning_rate": 5.805205949656751e-06, "loss": 0.6468, "step": 5639 }, { "epoch": 1.61, "grad_norm": 9.720380783081055, "learning_rate": 5.80091533180778e-06, "loss": 0.6197, "step": 5640 }, { "epoch": 1.61, "grad_norm": 11.170440673828125, "learning_rate": 5.796624713958811e-06, "loss": 0.7108, "step": 5641 }, { "epoch": 1.61, "grad_norm": 9.893743515014648, "learning_rate": 5.79233409610984e-06, "loss": 0.7752, "step": 5642 }, { "epoch": 1.61, "grad_norm": 12.05979061126709, "learning_rate": 5.78804347826087e-06, "loss": 0.53, "step": 5643 }, { "epoch": 1.61, "grad_norm": 9.423600196838379, "learning_rate": 5.7837528604118995e-06, "loss": 0.7921, "step": 5644 }, { "epoch": 1.61, "grad_norm": 12.006379127502441, "learning_rate": 5.779462242562929e-06, "loss": 0.7143, "step": 5645 }, { "epoch": 1.61, "grad_norm": 13.299084663391113, "learning_rate": 5.775171624713959e-06, "loss": 0.7702, "step": 5646 }, { "epoch": 1.62, "grad_norm": 10.986786842346191, "learning_rate": 5.770881006864988e-06, "loss": 0.6752, "step": 5647 }, { "epoch": 1.62, "grad_norm": 10.160089492797852, "learning_rate": 5.766590389016018e-06, "loss": 0.6384, "step": 5648 }, { "epoch": 1.62, "grad_norm": 10.388351440429688, "learning_rate": 5.7622997711670484e-06, "loss": 0.8869, "step": 5649 }, { "epoch": 1.62, "grad_norm": 11.499975204467773, "learning_rate": 5.758009153318078e-06, "loss": 0.7715, "step": 5650 }, { "epoch": 1.62, "grad_norm": 11.53027629852295, "learning_rate": 5.753718535469108e-06, "loss": 0.6374, "step": 5651 }, { "epoch": 1.62, "grad_norm": 9.61728286743164, "learning_rate": 5.749427917620137e-06, "loss": 0.4762, "step": 5652 }, { "epoch": 1.62, "grad_norm": 10.64439582824707, "learning_rate": 5.745137299771167e-06, "loss": 0.874, "step": 5653 }, { "epoch": 1.62, "grad_norm": 10.544897079467773, "learning_rate": 5.740846681922197e-06, "loss": 0.7644, "step": 5654 }, { "epoch": 1.62, "grad_norm": 11.613075256347656, "learning_rate": 5.736556064073227e-06, "loss": 0.5905, "step": 5655 }, { "epoch": 1.62, "grad_norm": 13.937162399291992, "learning_rate": 5.732265446224257e-06, "loss": 0.8385, "step": 5656 }, { "epoch": 1.62, "grad_norm": 11.547974586486816, "learning_rate": 5.727974828375286e-06, "loss": 0.788, "step": 5657 }, { "epoch": 1.62, "grad_norm": 10.030085563659668, "learning_rate": 5.723684210526316e-06, "loss": 0.6463, "step": 5658 }, { "epoch": 1.62, "grad_norm": 11.306310653686523, "learning_rate": 5.7193935926773454e-06, "loss": 0.7094, "step": 5659 }, { "epoch": 1.62, "grad_norm": 12.70578670501709, "learning_rate": 5.715102974828375e-06, "loss": 0.7308, "step": 5660 }, { "epoch": 1.62, "grad_norm": 10.362273216247559, "learning_rate": 5.710812356979405e-06, "loss": 0.6183, "step": 5661 }, { "epoch": 1.62, "grad_norm": 11.30966854095459, "learning_rate": 5.706521739130435e-06, "loss": 0.561, "step": 5662 }, { "epoch": 1.62, "grad_norm": 9.392801284790039, "learning_rate": 5.702231121281465e-06, "loss": 0.6767, "step": 5663 }, { "epoch": 1.62, "grad_norm": 9.160632133483887, "learning_rate": 5.697940503432494e-06, "loss": 0.7882, "step": 5664 }, { "epoch": 1.62, "grad_norm": 9.824238777160645, "learning_rate": 5.693649885583524e-06, "loss": 0.5345, "step": 5665 }, { "epoch": 1.62, "grad_norm": 9.859192848205566, "learning_rate": 5.689359267734554e-06, "loss": 0.5978, "step": 5666 }, { "epoch": 1.62, "grad_norm": 10.57384967803955, "learning_rate": 5.685068649885584e-06, "loss": 0.6203, "step": 5667 }, { "epoch": 1.62, "grad_norm": 8.914021492004395, "learning_rate": 5.680778032036614e-06, "loss": 0.6637, "step": 5668 }, { "epoch": 1.62, "grad_norm": 9.800065994262695, "learning_rate": 5.676487414187643e-06, "loss": 0.6351, "step": 5669 }, { "epoch": 1.62, "grad_norm": 10.23118782043457, "learning_rate": 5.672196796338673e-06, "loss": 0.678, "step": 5670 }, { "epoch": 1.62, "grad_norm": 8.032459259033203, "learning_rate": 5.667906178489703e-06, "loss": 0.5446, "step": 5671 }, { "epoch": 1.62, "grad_norm": 10.34920883178711, "learning_rate": 5.663615560640732e-06, "loss": 0.8721, "step": 5672 }, { "epoch": 1.62, "grad_norm": 10.661069869995117, "learning_rate": 5.659324942791762e-06, "loss": 0.9113, "step": 5673 }, { "epoch": 1.62, "grad_norm": 10.502165794372559, "learning_rate": 5.655034324942791e-06, "loss": 0.9143, "step": 5674 }, { "epoch": 1.62, "grad_norm": 10.990413665771484, "learning_rate": 5.650743707093822e-06, "loss": 0.8208, "step": 5675 }, { "epoch": 1.62, "grad_norm": 9.408350944519043, "learning_rate": 5.6464530892448514e-06, "loss": 0.6148, "step": 5676 }, { "epoch": 1.62, "grad_norm": 10.625914573669434, "learning_rate": 5.642162471395881e-06, "loss": 0.6522, "step": 5677 }, { "epoch": 1.62, "grad_norm": 9.551958084106445, "learning_rate": 5.637871853546911e-06, "loss": 0.6935, "step": 5678 }, { "epoch": 1.62, "grad_norm": 13.339814186096191, "learning_rate": 5.633581235697941e-06, "loss": 0.5454, "step": 5679 }, { "epoch": 1.62, "grad_norm": 11.620394706726074, "learning_rate": 5.629290617848971e-06, "loss": 0.5537, "step": 5680 }, { "epoch": 1.62, "grad_norm": 8.314926147460938, "learning_rate": 5.625e-06, "loss": 0.6268, "step": 5681 }, { "epoch": 1.63, "grad_norm": 8.93075180053711, "learning_rate": 5.62070938215103e-06, "loss": 0.6914, "step": 5682 }, { "epoch": 1.63, "grad_norm": 10.87057113647461, "learning_rate": 5.61641876430206e-06, "loss": 0.6301, "step": 5683 }, { "epoch": 1.63, "grad_norm": 10.709811210632324, "learning_rate": 5.61212814645309e-06, "loss": 0.5426, "step": 5684 }, { "epoch": 1.63, "grad_norm": 10.641079902648926, "learning_rate": 5.607837528604119e-06, "loss": 0.9742, "step": 5685 }, { "epoch": 1.63, "grad_norm": 12.158310890197754, "learning_rate": 5.6035469107551484e-06, "loss": 0.7012, "step": 5686 }, { "epoch": 1.63, "grad_norm": 12.088842391967773, "learning_rate": 5.599256292906178e-06, "loss": 0.9431, "step": 5687 }, { "epoch": 1.63, "grad_norm": 9.152239799499512, "learning_rate": 5.5949656750572085e-06, "loss": 0.6064, "step": 5688 }, { "epoch": 1.63, "grad_norm": 8.571259498596191, "learning_rate": 5.590675057208238e-06, "loss": 0.5959, "step": 5689 }, { "epoch": 1.63, "grad_norm": 12.594555854797363, "learning_rate": 5.586384439359268e-06, "loss": 0.8432, "step": 5690 }, { "epoch": 1.63, "grad_norm": 7.2420654296875, "learning_rate": 5.582093821510297e-06, "loss": 0.5177, "step": 5691 }, { "epoch": 1.63, "grad_norm": 9.37486457824707, "learning_rate": 5.577803203661328e-06, "loss": 0.7414, "step": 5692 }, { "epoch": 1.63, "grad_norm": 11.70945930480957, "learning_rate": 5.5735125858123574e-06, "loss": 0.6451, "step": 5693 }, { "epoch": 1.63, "grad_norm": 9.084478378295898, "learning_rate": 5.569221967963387e-06, "loss": 0.5428, "step": 5694 }, { "epoch": 1.63, "grad_norm": 10.824153900146484, "learning_rate": 5.564931350114417e-06, "loss": 0.7393, "step": 5695 }, { "epoch": 1.63, "grad_norm": 11.970434188842773, "learning_rate": 5.560640732265447e-06, "loss": 0.8567, "step": 5696 }, { "epoch": 1.63, "grad_norm": 16.701438903808594, "learning_rate": 5.556350114416477e-06, "loss": 0.7511, "step": 5697 }, { "epoch": 1.63, "grad_norm": 11.472614288330078, "learning_rate": 5.5520594965675055e-06, "loss": 1.0356, "step": 5698 }, { "epoch": 1.63, "grad_norm": 8.926335334777832, "learning_rate": 5.547768878718535e-06, "loss": 0.6318, "step": 5699 }, { "epoch": 1.63, "grad_norm": 9.65076732635498, "learning_rate": 5.543478260869565e-06, "loss": 0.6234, "step": 5700 }, { "epoch": 1.63, "grad_norm": 9.989914894104004, "learning_rate": 5.539187643020595e-06, "loss": 0.5944, "step": 5701 }, { "epoch": 1.63, "grad_norm": 13.871895790100098, "learning_rate": 5.534897025171625e-06, "loss": 0.7599, "step": 5702 }, { "epoch": 1.63, "grad_norm": 9.64797592163086, "learning_rate": 5.5306064073226544e-06, "loss": 0.5855, "step": 5703 }, { "epoch": 1.63, "grad_norm": 12.163445472717285, "learning_rate": 5.526315789473684e-06, "loss": 0.7724, "step": 5704 }, { "epoch": 1.63, "grad_norm": 8.04688835144043, "learning_rate": 5.5220251716247145e-06, "loss": 0.6141, "step": 5705 }, { "epoch": 1.63, "grad_norm": 8.863974571228027, "learning_rate": 5.517734553775744e-06, "loss": 0.7573, "step": 5706 }, { "epoch": 1.63, "grad_norm": 11.304844856262207, "learning_rate": 5.513443935926774e-06, "loss": 0.9782, "step": 5707 }, { "epoch": 1.63, "grad_norm": 9.068216323852539, "learning_rate": 5.509153318077803e-06, "loss": 0.7531, "step": 5708 }, { "epoch": 1.63, "grad_norm": 10.172918319702148, "learning_rate": 5.504862700228834e-06, "loss": 0.6192, "step": 5709 }, { "epoch": 1.63, "grad_norm": 12.894253730773926, "learning_rate": 5.500572082379863e-06, "loss": 0.9341, "step": 5710 }, { "epoch": 1.63, "grad_norm": 13.358455657958984, "learning_rate": 5.496281464530892e-06, "loss": 0.8522, "step": 5711 }, { "epoch": 1.63, "grad_norm": 10.213536262512207, "learning_rate": 5.491990846681922e-06, "loss": 0.6582, "step": 5712 }, { "epoch": 1.63, "grad_norm": 12.736071586608887, "learning_rate": 5.487700228832951e-06, "loss": 0.6622, "step": 5713 }, { "epoch": 1.63, "grad_norm": 10.614320755004883, "learning_rate": 5.483409610983982e-06, "loss": 0.7729, "step": 5714 }, { "epoch": 1.63, "grad_norm": 10.915679931640625, "learning_rate": 5.4791189931350115e-06, "loss": 0.5835, "step": 5715 }, { "epoch": 1.64, "grad_norm": 10.88956356048584, "learning_rate": 5.474828375286041e-06, "loss": 0.7071, "step": 5716 }, { "epoch": 1.64, "grad_norm": 8.267797470092773, "learning_rate": 5.470537757437071e-06, "loss": 0.5224, "step": 5717 }, { "epoch": 1.64, "grad_norm": 8.929670333862305, "learning_rate": 5.466247139588101e-06, "loss": 0.4938, "step": 5718 }, { "epoch": 1.64, "grad_norm": 9.463593482971191, "learning_rate": 5.461956521739131e-06, "loss": 0.6707, "step": 5719 }, { "epoch": 1.64, "grad_norm": 7.468658924102783, "learning_rate": 5.45766590389016e-06, "loss": 0.5084, "step": 5720 }, { "epoch": 1.64, "grad_norm": 9.18384838104248, "learning_rate": 5.45337528604119e-06, "loss": 0.4628, "step": 5721 }, { "epoch": 1.64, "grad_norm": 11.797301292419434, "learning_rate": 5.4490846681922205e-06, "loss": 0.7877, "step": 5722 }, { "epoch": 1.64, "grad_norm": 10.957298278808594, "learning_rate": 5.44479405034325e-06, "loss": 0.9386, "step": 5723 }, { "epoch": 1.64, "grad_norm": 11.238849639892578, "learning_rate": 5.440503432494279e-06, "loss": 0.5708, "step": 5724 }, { "epoch": 1.64, "grad_norm": 11.021093368530273, "learning_rate": 5.4362128146453085e-06, "loss": 0.6834, "step": 5725 }, { "epoch": 1.64, "grad_norm": 9.856903076171875, "learning_rate": 5.431922196796339e-06, "loss": 0.5821, "step": 5726 }, { "epoch": 1.64, "grad_norm": 11.011984825134277, "learning_rate": 5.4276315789473686e-06, "loss": 0.7455, "step": 5727 }, { "epoch": 1.64, "grad_norm": 9.175439834594727, "learning_rate": 5.423340961098398e-06, "loss": 0.6383, "step": 5728 }, { "epoch": 1.64, "grad_norm": 11.727838516235352, "learning_rate": 5.419050343249428e-06, "loss": 0.6758, "step": 5729 }, { "epoch": 1.64, "grad_norm": 10.31438159942627, "learning_rate": 5.414759725400457e-06, "loss": 0.7227, "step": 5730 }, { "epoch": 1.64, "grad_norm": 10.410078048706055, "learning_rate": 5.410469107551488e-06, "loss": 0.6846, "step": 5731 }, { "epoch": 1.64, "grad_norm": 10.888920783996582, "learning_rate": 5.4061784897025175e-06, "loss": 0.6959, "step": 5732 }, { "epoch": 1.64, "grad_norm": 11.906867980957031, "learning_rate": 5.401887871853547e-06, "loss": 0.8296, "step": 5733 }, { "epoch": 1.64, "grad_norm": 8.357414245605469, "learning_rate": 5.397597254004577e-06, "loss": 0.4034, "step": 5734 }, { "epoch": 1.64, "grad_norm": 11.187140464782715, "learning_rate": 5.393306636155607e-06, "loss": 0.716, "step": 5735 }, { "epoch": 1.64, "grad_norm": 7.655837059020996, "learning_rate": 5.389016018306637e-06, "loss": 0.7081, "step": 5736 }, { "epoch": 1.64, "grad_norm": 10.159165382385254, "learning_rate": 5.3847254004576656e-06, "loss": 0.6526, "step": 5737 }, { "epoch": 1.64, "grad_norm": 9.793972969055176, "learning_rate": 5.380434782608695e-06, "loss": 0.7996, "step": 5738 }, { "epoch": 1.64, "grad_norm": 11.443796157836914, "learning_rate": 5.376144164759726e-06, "loss": 0.6855, "step": 5739 }, { "epoch": 1.64, "grad_norm": 9.832064628601074, "learning_rate": 5.371853546910755e-06, "loss": 0.7701, "step": 5740 }, { "epoch": 1.64, "grad_norm": 13.623041152954102, "learning_rate": 5.367562929061785e-06, "loss": 1.0157, "step": 5741 }, { "epoch": 1.64, "grad_norm": 10.20119571685791, "learning_rate": 5.3632723112128145e-06, "loss": 0.628, "step": 5742 }, { "epoch": 1.64, "grad_norm": 9.81611442565918, "learning_rate": 5.358981693363844e-06, "loss": 0.5421, "step": 5743 }, { "epoch": 1.64, "grad_norm": 8.950174331665039, "learning_rate": 5.3546910755148746e-06, "loss": 0.5966, "step": 5744 }, { "epoch": 1.64, "grad_norm": 9.24271011352539, "learning_rate": 5.350400457665904e-06, "loss": 0.6289, "step": 5745 }, { "epoch": 1.64, "grad_norm": 11.46676254272461, "learning_rate": 5.346109839816934e-06, "loss": 0.5403, "step": 5746 }, { "epoch": 1.64, "grad_norm": 8.326277732849121, "learning_rate": 5.341819221967963e-06, "loss": 0.4409, "step": 5747 }, { "epoch": 1.64, "grad_norm": 11.634745597839355, "learning_rate": 5.337528604118994e-06, "loss": 0.6893, "step": 5748 }, { "epoch": 1.64, "grad_norm": 8.160537719726562, "learning_rate": 5.3332379862700235e-06, "loss": 0.6457, "step": 5749 }, { "epoch": 1.64, "grad_norm": 12.94668197631836, "learning_rate": 5.328947368421053e-06, "loss": 0.5905, "step": 5750 }, { "epoch": 1.65, "grad_norm": 11.275633811950684, "learning_rate": 5.324656750572082e-06, "loss": 0.7235, "step": 5751 }, { "epoch": 1.65, "grad_norm": 12.722701072692871, "learning_rate": 5.320366132723112e-06, "loss": 0.6949, "step": 5752 }, { "epoch": 1.65, "grad_norm": 11.349908828735352, "learning_rate": 5.316075514874142e-06, "loss": 0.5982, "step": 5753 }, { "epoch": 1.65, "grad_norm": 9.948938369750977, "learning_rate": 5.3117848970251716e-06, "loss": 0.7043, "step": 5754 }, { "epoch": 1.65, "grad_norm": 10.155777931213379, "learning_rate": 5.307494279176201e-06, "loss": 0.6262, "step": 5755 }, { "epoch": 1.65, "grad_norm": 12.634535789489746, "learning_rate": 5.303203661327232e-06, "loss": 0.8843, "step": 5756 }, { "epoch": 1.65, "grad_norm": 12.780898094177246, "learning_rate": 5.298913043478261e-06, "loss": 0.864, "step": 5757 }, { "epoch": 1.65, "grad_norm": 9.863545417785645, "learning_rate": 5.294622425629291e-06, "loss": 0.6517, "step": 5758 }, { "epoch": 1.65, "grad_norm": 10.639693260192871, "learning_rate": 5.2903318077803205e-06, "loss": 0.6594, "step": 5759 }, { "epoch": 1.65, "grad_norm": 10.80581283569336, "learning_rate": 5.28604118993135e-06, "loss": 0.6584, "step": 5760 }, { "epoch": 1.65, "grad_norm": 9.108868598937988, "learning_rate": 5.2817505720823806e-06, "loss": 0.5799, "step": 5761 }, { "epoch": 1.65, "grad_norm": 11.019648551940918, "learning_rate": 5.27745995423341e-06, "loss": 0.7017, "step": 5762 }, { "epoch": 1.65, "grad_norm": 9.318804740905762, "learning_rate": 5.27316933638444e-06, "loss": 0.6385, "step": 5763 }, { "epoch": 1.65, "grad_norm": 10.86406421661377, "learning_rate": 5.2688787185354686e-06, "loss": 0.5914, "step": 5764 }, { "epoch": 1.65, "grad_norm": 10.691460609436035, "learning_rate": 5.264588100686499e-06, "loss": 0.845, "step": 5765 }, { "epoch": 1.65, "grad_norm": 10.869109153747559, "learning_rate": 5.260297482837529e-06, "loss": 0.9187, "step": 5766 }, { "epoch": 1.65, "grad_norm": 8.999061584472656, "learning_rate": 5.256006864988558e-06, "loss": 0.82, "step": 5767 }, { "epoch": 1.65, "grad_norm": 9.673580169677734, "learning_rate": 5.251716247139588e-06, "loss": 0.851, "step": 5768 }, { "epoch": 1.65, "grad_norm": 10.771028518676758, "learning_rate": 5.247425629290618e-06, "loss": 0.7403, "step": 5769 }, { "epoch": 1.65, "grad_norm": 9.66604232788086, "learning_rate": 5.243135011441648e-06, "loss": 0.8523, "step": 5770 }, { "epoch": 1.65, "grad_norm": 9.262810707092285, "learning_rate": 5.2388443935926776e-06, "loss": 0.5099, "step": 5771 }, { "epoch": 1.65, "grad_norm": 9.349327087402344, "learning_rate": 5.234553775743707e-06, "loss": 0.6424, "step": 5772 }, { "epoch": 1.65, "grad_norm": 10.648466110229492, "learning_rate": 5.230263157894737e-06, "loss": 0.8119, "step": 5773 }, { "epoch": 1.65, "grad_norm": 10.655963897705078, "learning_rate": 5.225972540045767e-06, "loss": 0.6532, "step": 5774 }, { "epoch": 1.65, "grad_norm": 9.538254737854004, "learning_rate": 5.221681922196797e-06, "loss": 0.6325, "step": 5775 }, { "epoch": 1.65, "grad_norm": 11.388775825500488, "learning_rate": 5.2173913043478265e-06, "loss": 0.8202, "step": 5776 }, { "epoch": 1.65, "grad_norm": 8.295978546142578, "learning_rate": 5.213100686498855e-06, "loss": 0.6304, "step": 5777 }, { "epoch": 1.65, "grad_norm": 9.392183303833008, "learning_rate": 5.208810068649886e-06, "loss": 0.9357, "step": 5778 }, { "epoch": 1.65, "grad_norm": 12.010884284973145, "learning_rate": 5.204519450800915e-06, "loss": 0.8542, "step": 5779 }, { "epoch": 1.65, "grad_norm": 9.261788368225098, "learning_rate": 5.200228832951945e-06, "loss": 0.709, "step": 5780 }, { "epoch": 1.65, "grad_norm": 11.460758209228516, "learning_rate": 5.1959382151029746e-06, "loss": 0.7145, "step": 5781 }, { "epoch": 1.65, "grad_norm": 11.312665939331055, "learning_rate": 5.191647597254005e-06, "loss": 0.7533, "step": 5782 }, { "epoch": 1.65, "grad_norm": 10.184603691101074, "learning_rate": 5.187356979405035e-06, "loss": 0.6701, "step": 5783 }, { "epoch": 1.65, "grad_norm": 10.924507141113281, "learning_rate": 5.183066361556064e-06, "loss": 0.5879, "step": 5784 }, { "epoch": 1.65, "grad_norm": 11.757448196411133, "learning_rate": 5.178775743707094e-06, "loss": 0.9316, "step": 5785 }, { "epoch": 1.66, "grad_norm": 9.38831901550293, "learning_rate": 5.174485125858124e-06, "loss": 0.5362, "step": 5786 }, { "epoch": 1.66, "grad_norm": 10.152300834655762, "learning_rate": 5.170194508009154e-06, "loss": 0.5182, "step": 5787 }, { "epoch": 1.66, "grad_norm": 10.15249252319336, "learning_rate": 5.1659038901601836e-06, "loss": 0.6635, "step": 5788 }, { "epoch": 1.66, "grad_norm": 10.151712417602539, "learning_rate": 5.161613272311213e-06, "loss": 0.9518, "step": 5789 }, { "epoch": 1.66, "grad_norm": 12.894401550292969, "learning_rate": 5.157322654462242e-06, "loss": 0.8927, "step": 5790 }, { "epoch": 1.66, "grad_norm": 10.738083839416504, "learning_rate": 5.153032036613272e-06, "loss": 0.701, "step": 5791 }, { "epoch": 1.66, "grad_norm": 12.905317306518555, "learning_rate": 5.148741418764302e-06, "loss": 1.1686, "step": 5792 }, { "epoch": 1.66, "grad_norm": 9.421422004699707, "learning_rate": 5.144450800915332e-06, "loss": 0.8037, "step": 5793 }, { "epoch": 1.66, "grad_norm": 10.57293701171875, "learning_rate": 5.140160183066361e-06, "loss": 0.8017, "step": 5794 }, { "epoch": 1.66, "grad_norm": 9.552532196044922, "learning_rate": 5.135869565217392e-06, "loss": 0.7185, "step": 5795 }, { "epoch": 1.66, "grad_norm": 9.750856399536133, "learning_rate": 5.131578947368421e-06, "loss": 0.7244, "step": 5796 }, { "epoch": 1.66, "grad_norm": 12.330699920654297, "learning_rate": 5.127288329519451e-06, "loss": 0.6662, "step": 5797 }, { "epoch": 1.66, "grad_norm": 10.388833045959473, "learning_rate": 5.1229977116704805e-06, "loss": 0.885, "step": 5798 }, { "epoch": 1.66, "grad_norm": 10.175448417663574, "learning_rate": 5.118707093821511e-06, "loss": 0.6561, "step": 5799 }, { "epoch": 1.66, "grad_norm": 9.187387466430664, "learning_rate": 5.114416475972541e-06, "loss": 0.646, "step": 5800 }, { "epoch": 1.66, "grad_norm": 8.872142791748047, "learning_rate": 5.11012585812357e-06, "loss": 0.7365, "step": 5801 }, { "epoch": 1.66, "grad_norm": 8.377429008483887, "learning_rate": 5.1058352402746e-06, "loss": 0.5354, "step": 5802 }, { "epoch": 1.66, "grad_norm": 7.479408264160156, "learning_rate": 5.101544622425629e-06, "loss": 0.5004, "step": 5803 }, { "epoch": 1.66, "grad_norm": 11.968086242675781, "learning_rate": 5.097254004576659e-06, "loss": 0.8115, "step": 5804 }, { "epoch": 1.66, "grad_norm": 8.832465171813965, "learning_rate": 5.092963386727689e-06, "loss": 0.5674, "step": 5805 }, { "epoch": 1.66, "grad_norm": 10.427408218383789, "learning_rate": 5.088672768878718e-06, "loss": 0.6168, "step": 5806 }, { "epoch": 1.66, "grad_norm": 10.2507963180542, "learning_rate": 5.084382151029748e-06, "loss": 0.6153, "step": 5807 }, { "epoch": 1.66, "grad_norm": 8.970211029052734, "learning_rate": 5.080091533180778e-06, "loss": 0.6329, "step": 5808 }, { "epoch": 1.66, "grad_norm": 12.040018081665039, "learning_rate": 5.075800915331808e-06, "loss": 0.8813, "step": 5809 }, { "epoch": 1.66, "grad_norm": 12.211027145385742, "learning_rate": 5.071510297482838e-06, "loss": 0.6981, "step": 5810 }, { "epoch": 1.66, "grad_norm": 12.553165435791016, "learning_rate": 5.067219679633867e-06, "loss": 0.6293, "step": 5811 }, { "epoch": 1.66, "grad_norm": 9.366776466369629, "learning_rate": 5.062929061784898e-06, "loss": 0.6127, "step": 5812 }, { "epoch": 1.66, "grad_norm": 10.16841983795166, "learning_rate": 5.058638443935927e-06, "loss": 0.6729, "step": 5813 }, { "epoch": 1.66, "grad_norm": 8.510993003845215, "learning_rate": 5.054347826086957e-06, "loss": 0.7226, "step": 5814 }, { "epoch": 1.66, "grad_norm": 12.433296203613281, "learning_rate": 5.0500572082379865e-06, "loss": 0.9685, "step": 5815 }, { "epoch": 1.66, "grad_norm": 11.985654830932617, "learning_rate": 5.045766590389016e-06, "loss": 0.7879, "step": 5816 }, { "epoch": 1.66, "grad_norm": 10.938732147216797, "learning_rate": 5.041475972540046e-06, "loss": 0.72, "step": 5817 }, { "epoch": 1.66, "grad_norm": 9.93817138671875, "learning_rate": 5.037185354691075e-06, "loss": 0.7446, "step": 5818 }, { "epoch": 1.66, "grad_norm": 11.764874458312988, "learning_rate": 5.032894736842105e-06, "loss": 0.7629, "step": 5819 }, { "epoch": 1.66, "grad_norm": 12.215780258178711, "learning_rate": 5.028604118993135e-06, "loss": 0.6836, "step": 5820 }, { "epoch": 1.67, "grad_norm": 10.806153297424316, "learning_rate": 5.024313501144165e-06, "loss": 0.6898, "step": 5821 }, { "epoch": 1.67, "grad_norm": 7.726294040679932, "learning_rate": 5.020022883295195e-06, "loss": 0.4908, "step": 5822 }, { "epoch": 1.67, "grad_norm": 11.043843269348145, "learning_rate": 5.015732265446224e-06, "loss": 0.6357, "step": 5823 }, { "epoch": 1.67, "grad_norm": 9.391953468322754, "learning_rate": 5.011441647597254e-06, "loss": 0.6654, "step": 5824 }, { "epoch": 1.67, "grad_norm": 9.231965065002441, "learning_rate": 5.007151029748284e-06, "loss": 0.5421, "step": 5825 }, { "epoch": 1.67, "grad_norm": 8.77448844909668, "learning_rate": 5.002860411899314e-06, "loss": 0.5603, "step": 5826 }, { "epoch": 1.67, "grad_norm": 9.12179946899414, "learning_rate": 4.998569794050344e-06, "loss": 0.6626, "step": 5827 }, { "epoch": 1.67, "grad_norm": 10.378664016723633, "learning_rate": 4.994279176201373e-06, "loss": 0.681, "step": 5828 }, { "epoch": 1.67, "grad_norm": 8.726974487304688, "learning_rate": 4.989988558352403e-06, "loss": 0.4317, "step": 5829 }, { "epoch": 1.67, "grad_norm": 10.938841819763184, "learning_rate": 4.9856979405034325e-06, "loss": 0.9883, "step": 5830 }, { "epoch": 1.67, "grad_norm": 11.10641860961914, "learning_rate": 4.981407322654462e-06, "loss": 0.9498, "step": 5831 }, { "epoch": 1.67, "grad_norm": 10.560961723327637, "learning_rate": 4.977116704805492e-06, "loss": 0.8008, "step": 5832 }, { "epoch": 1.67, "grad_norm": 11.211451530456543, "learning_rate": 4.972826086956521e-06, "loss": 0.5645, "step": 5833 }, { "epoch": 1.67, "grad_norm": 9.279936790466309, "learning_rate": 4.968535469107552e-06, "loss": 0.6464, "step": 5834 }, { "epoch": 1.67, "grad_norm": 13.09455394744873, "learning_rate": 4.964244851258581e-06, "loss": 0.8324, "step": 5835 }, { "epoch": 1.67, "grad_norm": 12.164727210998535, "learning_rate": 4.959954233409611e-06, "loss": 0.6953, "step": 5836 }, { "epoch": 1.67, "grad_norm": 10.946770668029785, "learning_rate": 4.955663615560641e-06, "loss": 0.5145, "step": 5837 }, { "epoch": 1.67, "grad_norm": 11.676422119140625, "learning_rate": 4.951372997711671e-06, "loss": 0.7639, "step": 5838 }, { "epoch": 1.67, "grad_norm": 9.145784378051758, "learning_rate": 4.947082379862701e-06, "loss": 0.6324, "step": 5839 }, { "epoch": 1.67, "grad_norm": 10.441269874572754, "learning_rate": 4.94279176201373e-06, "loss": 0.5452, "step": 5840 }, { "epoch": 1.67, "grad_norm": 9.933587074279785, "learning_rate": 4.93850114416476e-06, "loss": 0.6534, "step": 5841 }, { "epoch": 1.67, "grad_norm": 14.15119743347168, "learning_rate": 4.9342105263157895e-06, "loss": 0.829, "step": 5842 }, { "epoch": 1.67, "grad_norm": 10.811209678649902, "learning_rate": 4.929919908466819e-06, "loss": 0.5488, "step": 5843 }, { "epoch": 1.67, "grad_norm": 12.233796119689941, "learning_rate": 4.925629290617849e-06, "loss": 0.8341, "step": 5844 }, { "epoch": 1.67, "grad_norm": 14.163995742797852, "learning_rate": 4.921338672768878e-06, "loss": 0.8841, "step": 5845 }, { "epoch": 1.67, "grad_norm": 10.326066970825195, "learning_rate": 4.917048054919909e-06, "loss": 0.8647, "step": 5846 }, { "epoch": 1.67, "grad_norm": 11.209264755249023, "learning_rate": 4.9127574370709385e-06, "loss": 0.6987, "step": 5847 }, { "epoch": 1.67, "grad_norm": 11.691764831542969, "learning_rate": 4.908466819221968e-06, "loss": 0.7641, "step": 5848 }, { "epoch": 1.67, "grad_norm": 11.83598804473877, "learning_rate": 4.904176201372998e-06, "loss": 0.5928, "step": 5849 }, { "epoch": 1.67, "grad_norm": 9.098348617553711, "learning_rate": 4.899885583524027e-06, "loss": 0.5905, "step": 5850 }, { "epoch": 1.67, "grad_norm": 9.325675010681152, "learning_rate": 4.895594965675058e-06, "loss": 0.6716, "step": 5851 }, { "epoch": 1.67, "grad_norm": 9.184008598327637, "learning_rate": 4.891304347826087e-06, "loss": 0.4886, "step": 5852 }, { "epoch": 1.67, "grad_norm": 11.029022216796875, "learning_rate": 4.887013729977117e-06, "loss": 0.7218, "step": 5853 }, { "epoch": 1.67, "grad_norm": 10.732136726379395, "learning_rate": 4.882723112128147e-06, "loss": 0.6508, "step": 5854 }, { "epoch": 1.67, "grad_norm": 10.072098731994629, "learning_rate": 4.878432494279177e-06, "loss": 0.6364, "step": 5855 }, { "epoch": 1.68, "grad_norm": 7.013856410980225, "learning_rate": 4.874141876430206e-06, "loss": 0.4318, "step": 5856 }, { "epoch": 1.68, "grad_norm": 12.017395973205566, "learning_rate": 4.8698512585812354e-06, "loss": 0.7872, "step": 5857 }, { "epoch": 1.68, "grad_norm": 9.142374992370605, "learning_rate": 4.865560640732265e-06, "loss": 0.5402, "step": 5858 }, { "epoch": 1.68, "grad_norm": 10.501347541809082, "learning_rate": 4.8612700228832955e-06, "loss": 0.6895, "step": 5859 }, { "epoch": 1.68, "grad_norm": 12.26644515991211, "learning_rate": 4.856979405034325e-06, "loss": 0.8957, "step": 5860 }, { "epoch": 1.68, "grad_norm": 9.183009147644043, "learning_rate": 4.852688787185355e-06, "loss": 0.6938, "step": 5861 }, { "epoch": 1.68, "grad_norm": 13.985980033874512, "learning_rate": 4.848398169336384e-06, "loss": 0.8486, "step": 5862 }, { "epoch": 1.68, "grad_norm": 9.614011764526367, "learning_rate": 4.844107551487414e-06, "loss": 0.7999, "step": 5863 }, { "epoch": 1.68, "grad_norm": 9.026040077209473, "learning_rate": 4.8398169336384444e-06, "loss": 0.7847, "step": 5864 }, { "epoch": 1.68, "grad_norm": 11.509770393371582, "learning_rate": 4.835526315789474e-06, "loss": 0.7551, "step": 5865 }, { "epoch": 1.68, "grad_norm": 8.606500625610352, "learning_rate": 4.831235697940504e-06, "loss": 0.6628, "step": 5866 }, { "epoch": 1.68, "grad_norm": 9.944982528686523, "learning_rate": 4.826945080091533e-06, "loss": 0.7259, "step": 5867 }, { "epoch": 1.68, "grad_norm": 9.284320831298828, "learning_rate": 4.822654462242564e-06, "loss": 0.6398, "step": 5868 }, { "epoch": 1.68, "grad_norm": 10.699448585510254, "learning_rate": 4.8183638443935925e-06, "loss": 0.6661, "step": 5869 }, { "epoch": 1.68, "grad_norm": 10.851250648498535, "learning_rate": 4.814073226544622e-06, "loss": 0.5445, "step": 5870 }, { "epoch": 1.68, "grad_norm": 10.146505355834961, "learning_rate": 4.809782608695652e-06, "loss": 0.5736, "step": 5871 }, { "epoch": 1.68, "grad_norm": 9.085628509521484, "learning_rate": 4.805491990846682e-06, "loss": 0.7359, "step": 5872 }, { "epoch": 1.68, "grad_norm": 10.133440017700195, "learning_rate": 4.801201372997712e-06, "loss": 0.6137, "step": 5873 }, { "epoch": 1.68, "grad_norm": 8.663152694702148, "learning_rate": 4.7969107551487414e-06, "loss": 0.7389, "step": 5874 }, { "epoch": 1.68, "grad_norm": 10.566811561584473, "learning_rate": 4.792620137299771e-06, "loss": 0.7826, "step": 5875 }, { "epoch": 1.68, "grad_norm": 9.916440963745117, "learning_rate": 4.7883295194508015e-06, "loss": 0.5214, "step": 5876 }, { "epoch": 1.68, "grad_norm": 11.84753704071045, "learning_rate": 4.784038901601831e-06, "loss": 0.7269, "step": 5877 }, { "epoch": 1.68, "grad_norm": 11.298389434814453, "learning_rate": 4.779748283752861e-06, "loss": 0.7548, "step": 5878 }, { "epoch": 1.68, "grad_norm": 9.253491401672363, "learning_rate": 4.77545766590389e-06, "loss": 0.7006, "step": 5879 }, { "epoch": 1.68, "grad_norm": 10.172586441040039, "learning_rate": 4.77116704805492e-06, "loss": 0.7238, "step": 5880 }, { "epoch": 1.68, "grad_norm": 11.111824035644531, "learning_rate": 4.7668764302059504e-06, "loss": 0.6726, "step": 5881 }, { "epoch": 1.68, "grad_norm": 9.476541519165039, "learning_rate": 4.762585812356979e-06, "loss": 0.3624, "step": 5882 }, { "epoch": 1.68, "grad_norm": 8.951101303100586, "learning_rate": 4.758295194508009e-06, "loss": 0.6643, "step": 5883 }, { "epoch": 1.68, "grad_norm": 9.434000015258789, "learning_rate": 4.7540045766590384e-06, "loss": 0.5429, "step": 5884 }, { "epoch": 1.68, "grad_norm": 9.742546081542969, "learning_rate": 4.749713958810069e-06, "loss": 0.6604, "step": 5885 }, { "epoch": 1.68, "grad_norm": 8.147375106811523, "learning_rate": 4.7454233409610985e-06, "loss": 0.6377, "step": 5886 }, { "epoch": 1.68, "grad_norm": 11.91827392578125, "learning_rate": 4.741132723112128e-06, "loss": 0.7549, "step": 5887 }, { "epoch": 1.68, "grad_norm": 9.47176742553711, "learning_rate": 4.736842105263158e-06, "loss": 0.5754, "step": 5888 }, { "epoch": 1.68, "grad_norm": 11.063567161560059, "learning_rate": 4.732551487414188e-06, "loss": 0.7022, "step": 5889 }, { "epoch": 1.68, "grad_norm": 8.001396179199219, "learning_rate": 4.728260869565218e-06, "loss": 0.646, "step": 5890 }, { "epoch": 1.69, "grad_norm": 10.043627738952637, "learning_rate": 4.7239702517162474e-06, "loss": 0.943, "step": 5891 }, { "epoch": 1.69, "grad_norm": 9.53686237335205, "learning_rate": 4.719679633867277e-06, "loss": 0.5179, "step": 5892 }, { "epoch": 1.69, "grad_norm": 9.575545310974121, "learning_rate": 4.715389016018307e-06, "loss": 0.6462, "step": 5893 }, { "epoch": 1.69, "grad_norm": 13.83825397491455, "learning_rate": 4.711098398169337e-06, "loss": 0.736, "step": 5894 }, { "epoch": 1.69, "grad_norm": 10.548954963684082, "learning_rate": 4.706807780320366e-06, "loss": 0.6661, "step": 5895 }, { "epoch": 1.69, "grad_norm": 9.620893478393555, "learning_rate": 4.7025171624713955e-06, "loss": 0.6117, "step": 5896 }, { "epoch": 1.69, "grad_norm": 9.700591087341309, "learning_rate": 4.698226544622425e-06, "loss": 0.6078, "step": 5897 }, { "epoch": 1.69, "grad_norm": 11.308130264282227, "learning_rate": 4.693935926773456e-06, "loss": 0.6275, "step": 5898 }, { "epoch": 1.69, "grad_norm": 10.247930526733398, "learning_rate": 4.689645308924485e-06, "loss": 0.8355, "step": 5899 }, { "epoch": 1.69, "grad_norm": 11.837489128112793, "learning_rate": 4.685354691075515e-06, "loss": 0.7833, "step": 5900 }, { "epoch": 1.69, "grad_norm": 11.13876724243164, "learning_rate": 4.6810640732265444e-06, "loss": 0.5052, "step": 5901 }, { "epoch": 1.69, "grad_norm": 11.996820449829102, "learning_rate": 4.676773455377575e-06, "loss": 0.7055, "step": 5902 }, { "epoch": 1.69, "grad_norm": 13.874919891357422, "learning_rate": 4.6724828375286045e-06, "loss": 0.7694, "step": 5903 }, { "epoch": 1.69, "grad_norm": 13.17946720123291, "learning_rate": 4.668192219679634e-06, "loss": 0.8438, "step": 5904 }, { "epoch": 1.69, "grad_norm": 12.827901840209961, "learning_rate": 4.663901601830664e-06, "loss": 0.7192, "step": 5905 }, { "epoch": 1.69, "grad_norm": 12.541082382202148, "learning_rate": 4.659610983981694e-06, "loss": 0.8147, "step": 5906 }, { "epoch": 1.69, "grad_norm": 8.027711868286133, "learning_rate": 4.655320366132724e-06, "loss": 0.568, "step": 5907 }, { "epoch": 1.69, "grad_norm": 10.079437255859375, "learning_rate": 4.651029748283753e-06, "loss": 0.632, "step": 5908 }, { "epoch": 1.69, "grad_norm": 9.83368968963623, "learning_rate": 4.646739130434782e-06, "loss": 0.5738, "step": 5909 }, { "epoch": 1.69, "grad_norm": 14.067094802856445, "learning_rate": 4.642448512585812e-06, "loss": 0.954, "step": 5910 }, { "epoch": 1.69, "grad_norm": 6.759878635406494, "learning_rate": 4.638157894736842e-06, "loss": 0.5277, "step": 5911 }, { "epoch": 1.69, "grad_norm": 8.464375495910645, "learning_rate": 4.633867276887872e-06, "loss": 0.5855, "step": 5912 }, { "epoch": 1.69, "grad_norm": 11.092123031616211, "learning_rate": 4.6295766590389015e-06, "loss": 0.7731, "step": 5913 }, { "epoch": 1.69, "grad_norm": 11.064888954162598, "learning_rate": 4.625286041189931e-06, "loss": 0.9506, "step": 5914 }, { "epoch": 1.69, "grad_norm": 9.002968788146973, "learning_rate": 4.620995423340962e-06, "loss": 0.5487, "step": 5915 }, { "epoch": 1.69, "grad_norm": 12.4066162109375, "learning_rate": 4.616704805491991e-06, "loss": 0.9604, "step": 5916 }, { "epoch": 1.69, "grad_norm": 8.316583633422852, "learning_rate": 4.612414187643021e-06, "loss": 0.583, "step": 5917 }, { "epoch": 1.69, "grad_norm": 12.206084251403809, "learning_rate": 4.6081235697940504e-06, "loss": 0.7596, "step": 5918 }, { "epoch": 1.69, "grad_norm": 9.902289390563965, "learning_rate": 4.603832951945081e-06, "loss": 0.7456, "step": 5919 }, { "epoch": 1.69, "grad_norm": 14.221868515014648, "learning_rate": 4.5995423340961105e-06, "loss": 0.778, "step": 5920 }, { "epoch": 1.69, "grad_norm": 13.835212707519531, "learning_rate": 4.595251716247139e-06, "loss": 0.5654, "step": 5921 }, { "epoch": 1.69, "grad_norm": 10.850911140441895, "learning_rate": 4.590961098398169e-06, "loss": 0.5314, "step": 5922 }, { "epoch": 1.69, "grad_norm": 12.394707679748535, "learning_rate": 4.5866704805491985e-06, "loss": 0.5975, "step": 5923 }, { "epoch": 1.69, "grad_norm": 10.528264045715332, "learning_rate": 4.582379862700229e-06, "loss": 0.4408, "step": 5924 }, { "epoch": 1.69, "grad_norm": 13.076751708984375, "learning_rate": 4.578089244851259e-06, "loss": 0.9047, "step": 5925 }, { "epoch": 1.7, "grad_norm": 11.11559009552002, "learning_rate": 4.573798627002288e-06, "loss": 0.6955, "step": 5926 }, { "epoch": 1.7, "grad_norm": 11.401023864746094, "learning_rate": 4.569508009153318e-06, "loss": 0.4923, "step": 5927 }, { "epoch": 1.7, "grad_norm": 11.230006217956543, "learning_rate": 4.565217391304348e-06, "loss": 0.7525, "step": 5928 }, { "epoch": 1.7, "grad_norm": 9.576884269714355, "learning_rate": 4.560926773455378e-06, "loss": 0.4744, "step": 5929 }, { "epoch": 1.7, "grad_norm": 13.80195140838623, "learning_rate": 4.5566361556064075e-06, "loss": 0.8665, "step": 5930 }, { "epoch": 1.7, "grad_norm": 8.915005683898926, "learning_rate": 4.552345537757437e-06, "loss": 0.6348, "step": 5931 }, { "epoch": 1.7, "grad_norm": 11.691276550292969, "learning_rate": 4.548054919908468e-06, "loss": 0.6432, "step": 5932 }, { "epoch": 1.7, "grad_norm": 12.980295181274414, "learning_rate": 4.543764302059497e-06, "loss": 0.9116, "step": 5933 }, { "epoch": 1.7, "grad_norm": 9.581231117248535, "learning_rate": 4.539473684210527e-06, "loss": 0.6034, "step": 5934 }, { "epoch": 1.7, "grad_norm": 8.7820405960083, "learning_rate": 4.535183066361556e-06, "loss": 0.6066, "step": 5935 }, { "epoch": 1.7, "grad_norm": 11.12899398803711, "learning_rate": 4.530892448512586e-06, "loss": 0.6293, "step": 5936 }, { "epoch": 1.7, "grad_norm": 8.969178199768066, "learning_rate": 4.526601830663616e-06, "loss": 0.5033, "step": 5937 }, { "epoch": 1.7, "grad_norm": 11.642123222351074, "learning_rate": 4.522311212814645e-06, "loss": 1.0285, "step": 5938 }, { "epoch": 1.7, "grad_norm": 10.58486557006836, "learning_rate": 4.518020594965675e-06, "loss": 0.8515, "step": 5939 }, { "epoch": 1.7, "grad_norm": 13.439935684204102, "learning_rate": 4.5137299771167045e-06, "loss": 0.9082, "step": 5940 }, { "epoch": 1.7, "grad_norm": 11.394495964050293, "learning_rate": 4.509439359267735e-06, "loss": 0.8086, "step": 5941 }, { "epoch": 1.7, "grad_norm": 8.398945808410645, "learning_rate": 4.5051487414187646e-06, "loss": 0.8295, "step": 5942 }, { "epoch": 1.7, "grad_norm": 8.517271995544434, "learning_rate": 4.500858123569794e-06, "loss": 0.7168, "step": 5943 }, { "epoch": 1.7, "grad_norm": 9.0183744430542, "learning_rate": 4.496567505720824e-06, "loss": 0.6702, "step": 5944 }, { "epoch": 1.7, "grad_norm": 10.215263366699219, "learning_rate": 4.492276887871854e-06, "loss": 0.6134, "step": 5945 }, { "epoch": 1.7, "grad_norm": 12.507948875427246, "learning_rate": 4.487986270022884e-06, "loss": 0.7977, "step": 5946 }, { "epoch": 1.7, "grad_norm": 9.09069538116455, "learning_rate": 4.4836956521739135e-06, "loss": 0.629, "step": 5947 }, { "epoch": 1.7, "grad_norm": 10.303925514221191, "learning_rate": 4.479405034324942e-06, "loss": 0.6987, "step": 5948 }, { "epoch": 1.7, "grad_norm": 9.196348190307617, "learning_rate": 4.475114416475973e-06, "loss": 0.6685, "step": 5949 }, { "epoch": 1.7, "grad_norm": 11.39057445526123, "learning_rate": 4.470823798627002e-06, "loss": 0.6844, "step": 5950 }, { "epoch": 1.7, "grad_norm": 11.751300811767578, "learning_rate": 4.466533180778032e-06, "loss": 0.6349, "step": 5951 }, { "epoch": 1.7, "grad_norm": 10.53183650970459, "learning_rate": 4.4622425629290616e-06, "loss": 0.7825, "step": 5952 }, { "epoch": 1.7, "grad_norm": 9.162747383117676, "learning_rate": 4.457951945080091e-06, "loss": 0.6018, "step": 5953 }, { "epoch": 1.7, "grad_norm": 9.915216445922852, "learning_rate": 4.453661327231122e-06, "loss": 0.5699, "step": 5954 }, { "epoch": 1.7, "grad_norm": 8.251471519470215, "learning_rate": 4.449370709382151e-06, "loss": 0.5714, "step": 5955 }, { "epoch": 1.7, "grad_norm": 11.356422424316406, "learning_rate": 4.445080091533181e-06, "loss": 0.9157, "step": 5956 }, { "epoch": 1.7, "grad_norm": 7.7874836921691895, "learning_rate": 4.4407894736842105e-06, "loss": 0.5033, "step": 5957 }, { "epoch": 1.7, "grad_norm": 9.747831344604492, "learning_rate": 4.436498855835241e-06, "loss": 0.7735, "step": 5958 }, { "epoch": 1.7, "grad_norm": 9.569536209106445, "learning_rate": 4.4322082379862706e-06, "loss": 0.4177, "step": 5959 }, { "epoch": 1.7, "grad_norm": 14.832538604736328, "learning_rate": 4.4279176201373e-06, "loss": 0.9468, "step": 5960 }, { "epoch": 1.71, "grad_norm": 11.02159595489502, "learning_rate": 4.423627002288329e-06, "loss": 0.7396, "step": 5961 }, { "epoch": 1.71, "grad_norm": 8.493553161621094, "learning_rate": 4.419336384439359e-06, "loss": 0.5594, "step": 5962 }, { "epoch": 1.71, "grad_norm": 9.55902099609375, "learning_rate": 4.415045766590389e-06, "loss": 0.6982, "step": 5963 }, { "epoch": 1.71, "grad_norm": 10.69473934173584, "learning_rate": 4.410755148741419e-06, "loss": 0.7358, "step": 5964 }, { "epoch": 1.71, "grad_norm": 10.646027565002441, "learning_rate": 4.406464530892448e-06, "loss": 0.6932, "step": 5965 }, { "epoch": 1.71, "grad_norm": 10.105545043945312, "learning_rate": 4.402173913043479e-06, "loss": 0.5455, "step": 5966 }, { "epoch": 1.71, "grad_norm": 10.997332572937012, "learning_rate": 4.397883295194508e-06, "loss": 0.6765, "step": 5967 }, { "epoch": 1.71, "grad_norm": 11.80391788482666, "learning_rate": 4.393592677345538e-06, "loss": 0.9207, "step": 5968 }, { "epoch": 1.71, "grad_norm": 9.650821685791016, "learning_rate": 4.3893020594965676e-06, "loss": 0.8558, "step": 5969 }, { "epoch": 1.71, "grad_norm": 10.389674186706543, "learning_rate": 4.385011441647597e-06, "loss": 0.6678, "step": 5970 }, { "epoch": 1.71, "grad_norm": 12.466364860534668, "learning_rate": 4.380720823798628e-06, "loss": 0.8399, "step": 5971 }, { "epoch": 1.71, "grad_norm": 10.856403350830078, "learning_rate": 4.376430205949657e-06, "loss": 0.737, "step": 5972 }, { "epoch": 1.71, "grad_norm": 9.363508224487305, "learning_rate": 4.372139588100687e-06, "loss": 0.4673, "step": 5973 }, { "epoch": 1.71, "grad_norm": 11.127074241638184, "learning_rate": 4.367848970251716e-06, "loss": 1.125, "step": 5974 }, { "epoch": 1.71, "grad_norm": 9.285826683044434, "learning_rate": 4.363558352402746e-06, "loss": 0.5807, "step": 5975 }, { "epoch": 1.71, "grad_norm": 10.394722938537598, "learning_rate": 4.359267734553776e-06, "loss": 0.7673, "step": 5976 }, { "epoch": 1.71, "grad_norm": 13.34463119506836, "learning_rate": 4.354977116704805e-06, "loss": 0.5349, "step": 5977 }, { "epoch": 1.71, "grad_norm": 8.6209716796875, "learning_rate": 4.350686498855835e-06, "loss": 0.5501, "step": 5978 }, { "epoch": 1.71, "grad_norm": 13.577221870422363, "learning_rate": 4.346395881006865e-06, "loss": 0.8501, "step": 5979 }, { "epoch": 1.71, "grad_norm": 10.219799995422363, "learning_rate": 4.342105263157895e-06, "loss": 0.7215, "step": 5980 }, { "epoch": 1.71, "grad_norm": 13.079976081848145, "learning_rate": 4.337814645308925e-06, "loss": 0.7579, "step": 5981 }, { "epoch": 1.71, "grad_norm": 11.152042388916016, "learning_rate": 4.333524027459954e-06, "loss": 0.6373, "step": 5982 }, { "epoch": 1.71, "grad_norm": 11.564728736877441, "learning_rate": 4.329233409610984e-06, "loss": 0.7111, "step": 5983 }, { "epoch": 1.71, "grad_norm": 10.477561950683594, "learning_rate": 4.324942791762014e-06, "loss": 0.7411, "step": 5984 }, { "epoch": 1.71, "grad_norm": 10.553227424621582, "learning_rate": 4.320652173913044e-06, "loss": 0.8103, "step": 5985 }, { "epoch": 1.71, "grad_norm": 10.41597843170166, "learning_rate": 4.3163615560640736e-06, "loss": 0.6798, "step": 5986 }, { "epoch": 1.71, "grad_norm": 8.1062650680542, "learning_rate": 4.312070938215102e-06, "loss": 0.6382, "step": 5987 }, { "epoch": 1.71, "grad_norm": 8.41385269165039, "learning_rate": 4.307780320366133e-06, "loss": 0.5842, "step": 5988 }, { "epoch": 1.71, "grad_norm": 11.125850677490234, "learning_rate": 4.303489702517162e-06, "loss": 0.6911, "step": 5989 }, { "epoch": 1.71, "grad_norm": 10.93230152130127, "learning_rate": 4.299199084668192e-06, "loss": 0.5723, "step": 5990 }, { "epoch": 1.71, "grad_norm": 8.094341278076172, "learning_rate": 4.294908466819222e-06, "loss": 0.6396, "step": 5991 }, { "epoch": 1.71, "grad_norm": 13.452651977539062, "learning_rate": 4.290617848970252e-06, "loss": 0.7873, "step": 5992 }, { "epoch": 1.71, "grad_norm": 10.352450370788574, "learning_rate": 4.286327231121282e-06, "loss": 0.5889, "step": 5993 }, { "epoch": 1.71, "grad_norm": 9.265152931213379, "learning_rate": 4.282036613272311e-06, "loss": 0.7391, "step": 5994 }, { "epoch": 1.71, "grad_norm": 10.378124237060547, "learning_rate": 4.277745995423341e-06, "loss": 0.7037, "step": 5995 }, { "epoch": 1.72, "grad_norm": 11.603124618530273, "learning_rate": 4.273455377574371e-06, "loss": 0.73, "step": 5996 }, { "epoch": 1.72, "grad_norm": 12.37160587310791, "learning_rate": 4.269164759725401e-06, "loss": 0.5829, "step": 5997 }, { "epoch": 1.72, "grad_norm": 9.701993942260742, "learning_rate": 4.264874141876431e-06, "loss": 0.7356, "step": 5998 }, { "epoch": 1.72, "grad_norm": 10.731022834777832, "learning_rate": 4.26058352402746e-06, "loss": 0.7306, "step": 5999 }, { "epoch": 1.72, "grad_norm": 11.346711158752441, "learning_rate": 4.256292906178489e-06, "loss": 0.7553, "step": 6000 }, { "epoch": 1.72, "grad_norm": 10.900642395019531, "learning_rate": 4.2520022883295195e-06, "loss": 0.5546, "step": 6001 }, { "epoch": 1.72, "grad_norm": 13.566597938537598, "learning_rate": 4.247711670480549e-06, "loss": 0.8884, "step": 6002 }, { "epoch": 1.72, "grad_norm": 14.55006217956543, "learning_rate": 4.243421052631579e-06, "loss": 0.8857, "step": 6003 }, { "epoch": 1.72, "grad_norm": 12.602134704589844, "learning_rate": 4.239130434782608e-06, "loss": 0.8392, "step": 6004 }, { "epoch": 1.72, "grad_norm": 13.23162841796875, "learning_rate": 4.234839816933639e-06, "loss": 0.6262, "step": 6005 }, { "epoch": 1.72, "grad_norm": 11.039629936218262, "learning_rate": 4.230549199084668e-06, "loss": 0.6168, "step": 6006 }, { "epoch": 1.72, "grad_norm": 8.362481117248535, "learning_rate": 4.226258581235698e-06, "loss": 0.6102, "step": 6007 }, { "epoch": 1.72, "grad_norm": 7.873233318328857, "learning_rate": 4.221967963386728e-06, "loss": 0.6211, "step": 6008 }, { "epoch": 1.72, "grad_norm": 9.244438171386719, "learning_rate": 4.217677345537758e-06, "loss": 0.5464, "step": 6009 }, { "epoch": 1.72, "grad_norm": 11.47413444519043, "learning_rate": 4.213386727688788e-06, "loss": 0.7942, "step": 6010 }, { "epoch": 1.72, "grad_norm": 12.936746597290039, "learning_rate": 4.209096109839817e-06, "loss": 0.7059, "step": 6011 }, { "epoch": 1.72, "grad_norm": 11.094941139221191, "learning_rate": 4.204805491990847e-06, "loss": 0.4952, "step": 6012 }, { "epoch": 1.72, "grad_norm": 7.792799472808838, "learning_rate": 4.200514874141876e-06, "loss": 0.7102, "step": 6013 }, { "epoch": 1.72, "grad_norm": 11.127025604248047, "learning_rate": 4.196224256292906e-06, "loss": 0.78, "step": 6014 }, { "epoch": 1.72, "grad_norm": 9.540066719055176, "learning_rate": 4.191933638443936e-06, "loss": 0.7395, "step": 6015 }, { "epoch": 1.72, "grad_norm": 10.446444511413574, "learning_rate": 4.187643020594965e-06, "loss": 0.5893, "step": 6016 }, { "epoch": 1.72, "grad_norm": 10.989967346191406, "learning_rate": 4.183352402745995e-06, "loss": 0.58, "step": 6017 }, { "epoch": 1.72, "grad_norm": 10.281612396240234, "learning_rate": 4.1790617848970255e-06, "loss": 0.7098, "step": 6018 }, { "epoch": 1.72, "grad_norm": 11.980088233947754, "learning_rate": 4.174771167048055e-06, "loss": 0.5153, "step": 6019 }, { "epoch": 1.72, "grad_norm": 9.9744234085083, "learning_rate": 4.170480549199085e-06, "loss": 0.5124, "step": 6020 }, { "epoch": 1.72, "grad_norm": 9.606974601745605, "learning_rate": 4.166189931350114e-06, "loss": 0.6363, "step": 6021 }, { "epoch": 1.72, "grad_norm": 8.587697982788086, "learning_rate": 4.161899313501145e-06, "loss": 0.5235, "step": 6022 }, { "epoch": 1.72, "grad_norm": 10.671334266662598, "learning_rate": 4.157608695652174e-06, "loss": 0.7783, "step": 6023 }, { "epoch": 1.72, "grad_norm": 11.862448692321777, "learning_rate": 4.153318077803204e-06, "loss": 0.9054, "step": 6024 }, { "epoch": 1.72, "grad_norm": 9.895891189575195, "learning_rate": 4.149027459954234e-06, "loss": 0.5098, "step": 6025 }, { "epoch": 1.72, "grad_norm": 8.942290306091309, "learning_rate": 4.144736842105263e-06, "loss": 0.7021, "step": 6026 }, { "epoch": 1.72, "grad_norm": 10.103818893432617, "learning_rate": 4.140446224256293e-06, "loss": 0.5956, "step": 6027 }, { "epoch": 1.72, "grad_norm": 11.1176118850708, "learning_rate": 4.1361556064073225e-06, "loss": 0.8294, "step": 6028 }, { "epoch": 1.72, "grad_norm": 8.09962272644043, "learning_rate": 4.131864988558352e-06, "loss": 0.602, "step": 6029 }, { "epoch": 1.72, "grad_norm": 13.282578468322754, "learning_rate": 4.127574370709382e-06, "loss": 0.674, "step": 6030 }, { "epoch": 1.73, "grad_norm": 8.672886848449707, "learning_rate": 4.123283752860412e-06, "loss": 0.5272, "step": 6031 }, { "epoch": 1.73, "grad_norm": 11.450350761413574, "learning_rate": 4.118993135011442e-06, "loss": 0.9202, "step": 6032 }, { "epoch": 1.73, "grad_norm": 10.484188079833984, "learning_rate": 4.114702517162471e-06, "loss": 0.7453, "step": 6033 }, { "epoch": 1.73, "grad_norm": 11.441675186157227, "learning_rate": 4.110411899313501e-06, "loss": 0.716, "step": 6034 }, { "epoch": 1.73, "grad_norm": 9.349542617797852, "learning_rate": 4.1061212814645315e-06, "loss": 0.4711, "step": 6035 }, { "epoch": 1.73, "grad_norm": 7.599945068359375, "learning_rate": 4.101830663615561e-06, "loss": 0.5928, "step": 6036 }, { "epoch": 1.73, "grad_norm": 9.424768447875977, "learning_rate": 4.097540045766591e-06, "loss": 0.6022, "step": 6037 }, { "epoch": 1.73, "grad_norm": 16.08043670654297, "learning_rate": 4.09324942791762e-06, "loss": 0.7528, "step": 6038 }, { "epoch": 1.73, "grad_norm": 11.866475105285645, "learning_rate": 4.088958810068651e-06, "loss": 0.7812, "step": 6039 }, { "epoch": 1.73, "grad_norm": 7.43369197845459, "learning_rate": 4.0846681922196795e-06, "loss": 0.565, "step": 6040 }, { "epoch": 1.73, "grad_norm": 7.482633590698242, "learning_rate": 4.080377574370709e-06, "loss": 0.4834, "step": 6041 }, { "epoch": 1.73, "grad_norm": 12.829164505004883, "learning_rate": 4.076086956521739e-06, "loss": 0.6553, "step": 6042 }, { "epoch": 1.73, "grad_norm": 10.891663551330566, "learning_rate": 4.071796338672768e-06, "loss": 0.7566, "step": 6043 }, { "epoch": 1.73, "grad_norm": 10.123371124267578, "learning_rate": 4.067505720823799e-06, "loss": 0.7981, "step": 6044 }, { "epoch": 1.73, "grad_norm": 10.865095138549805, "learning_rate": 4.0632151029748285e-06, "loss": 0.9449, "step": 6045 }, { "epoch": 1.73, "grad_norm": 9.813224792480469, "learning_rate": 4.058924485125858e-06, "loss": 0.6667, "step": 6046 }, { "epoch": 1.73, "grad_norm": 10.570638656616211, "learning_rate": 4.054633867276888e-06, "loss": 0.7178, "step": 6047 }, { "epoch": 1.73, "grad_norm": 13.028220176696777, "learning_rate": 4.050343249427918e-06, "loss": 0.6327, "step": 6048 }, { "epoch": 1.73, "grad_norm": 9.264501571655273, "learning_rate": 4.046052631578948e-06, "loss": 0.6126, "step": 6049 }, { "epoch": 1.73, "grad_norm": 11.550825119018555, "learning_rate": 4.041762013729977e-06, "loss": 0.6428, "step": 6050 }, { "epoch": 1.73, "grad_norm": 10.864901542663574, "learning_rate": 4.037471395881007e-06, "loss": 0.5461, "step": 6051 }, { "epoch": 1.73, "grad_norm": 8.91755485534668, "learning_rate": 4.0331807780320375e-06, "loss": 0.3539, "step": 6052 }, { "epoch": 1.73, "grad_norm": 8.895212173461914, "learning_rate": 4.028890160183066e-06, "loss": 0.6364, "step": 6053 }, { "epoch": 1.73, "grad_norm": 10.656241416931152, "learning_rate": 4.024599542334096e-06, "loss": 0.8206, "step": 6054 }, { "epoch": 1.73, "grad_norm": 10.321805000305176, "learning_rate": 4.0203089244851255e-06, "loss": 0.6592, "step": 6055 }, { "epoch": 1.73, "grad_norm": 8.235122680664062, "learning_rate": 4.016018306636156e-06, "loss": 0.5673, "step": 6056 }, { "epoch": 1.73, "grad_norm": 10.012003898620605, "learning_rate": 4.0117276887871855e-06, "loss": 0.7307, "step": 6057 }, { "epoch": 1.73, "grad_norm": 13.317085266113281, "learning_rate": 4.007437070938215e-06, "loss": 0.8539, "step": 6058 }, { "epoch": 1.73, "grad_norm": 11.325658798217773, "learning_rate": 4.003146453089245e-06, "loss": 0.5153, "step": 6059 }, { "epoch": 1.73, "grad_norm": 11.494180679321289, "learning_rate": 3.998855835240274e-06, "loss": 0.863, "step": 6060 }, { "epoch": 1.73, "grad_norm": 11.76332950592041, "learning_rate": 3.994565217391305e-06, "loss": 0.9077, "step": 6061 }, { "epoch": 1.73, "grad_norm": 8.529455184936523, "learning_rate": 3.9902745995423345e-06, "loss": 0.5149, "step": 6062 }, { "epoch": 1.73, "grad_norm": 8.060349464416504, "learning_rate": 3.985983981693364e-06, "loss": 0.5088, "step": 6063 }, { "epoch": 1.73, "grad_norm": 9.382649421691895, "learning_rate": 3.981693363844394e-06, "loss": 0.5403, "step": 6064 }, { "epoch": 1.73, "grad_norm": 10.23032283782959, "learning_rate": 3.977402745995424e-06, "loss": 0.7574, "step": 6065 }, { "epoch": 1.74, "grad_norm": 11.511918067932129, "learning_rate": 3.973112128146453e-06, "loss": 0.7237, "step": 6066 }, { "epoch": 1.74, "grad_norm": 9.262679100036621, "learning_rate": 3.9688215102974825e-06, "loss": 0.7081, "step": 6067 }, { "epoch": 1.74, "grad_norm": 10.52244758605957, "learning_rate": 3.964530892448512e-06, "loss": 0.8418, "step": 6068 }, { "epoch": 1.74, "grad_norm": 8.32150936126709, "learning_rate": 3.960240274599543e-06, "loss": 0.5113, "step": 6069 }, { "epoch": 1.74, "grad_norm": 10.85783576965332, "learning_rate": 3.955949656750572e-06, "loss": 0.5971, "step": 6070 }, { "epoch": 1.74, "grad_norm": 12.290858268737793, "learning_rate": 3.951659038901602e-06, "loss": 0.7818, "step": 6071 }, { "epoch": 1.74, "grad_norm": 10.787084579467773, "learning_rate": 3.9473684210526315e-06, "loss": 0.5963, "step": 6072 }, { "epoch": 1.74, "grad_norm": 9.790975570678711, "learning_rate": 3.943077803203661e-06, "loss": 0.7175, "step": 6073 }, { "epoch": 1.74, "grad_norm": 10.893580436706543, "learning_rate": 3.9387871853546915e-06, "loss": 0.8964, "step": 6074 }, { "epoch": 1.74, "grad_norm": 11.846661567687988, "learning_rate": 3.934496567505721e-06, "loss": 0.6185, "step": 6075 }, { "epoch": 1.74, "grad_norm": 10.663214683532715, "learning_rate": 3.930205949656751e-06, "loss": 0.5727, "step": 6076 }, { "epoch": 1.74, "grad_norm": 11.027164459228516, "learning_rate": 3.92591533180778e-06, "loss": 0.7161, "step": 6077 }, { "epoch": 1.74, "grad_norm": 7.9825592041015625, "learning_rate": 3.921624713958811e-06, "loss": 0.3647, "step": 6078 }, { "epoch": 1.74, "grad_norm": 8.15426254272461, "learning_rate": 3.91733409610984e-06, "loss": 0.4139, "step": 6079 }, { "epoch": 1.74, "grad_norm": 8.603719711303711, "learning_rate": 3.913043478260869e-06, "loss": 0.6229, "step": 6080 }, { "epoch": 1.74, "grad_norm": 9.28713607788086, "learning_rate": 3.908752860411899e-06, "loss": 0.5782, "step": 6081 }, { "epoch": 1.74, "grad_norm": 9.771931648254395, "learning_rate": 3.904462242562929e-06, "loss": 0.5709, "step": 6082 }, { "epoch": 1.74, "grad_norm": 10.751740455627441, "learning_rate": 3.900171624713959e-06, "loss": 0.6636, "step": 6083 }, { "epoch": 1.74, "grad_norm": 11.382894515991211, "learning_rate": 3.8958810068649885e-06, "loss": 0.5822, "step": 6084 }, { "epoch": 1.74, "grad_norm": 13.858905792236328, "learning_rate": 3.891590389016018e-06, "loss": 0.9624, "step": 6085 }, { "epoch": 1.74, "grad_norm": 10.064650535583496, "learning_rate": 3.887299771167049e-06, "loss": 0.6448, "step": 6086 }, { "epoch": 1.74, "grad_norm": 8.4977445602417, "learning_rate": 3.883009153318078e-06, "loss": 0.466, "step": 6087 }, { "epoch": 1.74, "grad_norm": 13.306116104125977, "learning_rate": 3.878718535469108e-06, "loss": 0.7704, "step": 6088 }, { "epoch": 1.74, "grad_norm": 9.628791809082031, "learning_rate": 3.8744279176201375e-06, "loss": 0.4445, "step": 6089 }, { "epoch": 1.74, "grad_norm": 9.335134506225586, "learning_rate": 3.870137299771167e-06, "loss": 0.6053, "step": 6090 }, { "epoch": 1.74, "grad_norm": 13.88830852508545, "learning_rate": 3.8658466819221975e-06, "loss": 0.7347, "step": 6091 }, { "epoch": 1.74, "grad_norm": 8.299972534179688, "learning_rate": 3.861556064073226e-06, "loss": 0.3696, "step": 6092 }, { "epoch": 1.74, "grad_norm": 12.010845184326172, "learning_rate": 3.857265446224256e-06, "loss": 0.7255, "step": 6093 }, { "epoch": 1.74, "grad_norm": 12.07990837097168, "learning_rate": 3.8529748283752855e-06, "loss": 0.6944, "step": 6094 }, { "epoch": 1.74, "grad_norm": 10.700096130371094, "learning_rate": 3.848684210526316e-06, "loss": 0.764, "step": 6095 }, { "epoch": 1.74, "grad_norm": 9.16130542755127, "learning_rate": 3.844393592677346e-06, "loss": 0.536, "step": 6096 }, { "epoch": 1.74, "grad_norm": 10.576375007629395, "learning_rate": 3.840102974828375e-06, "loss": 0.6338, "step": 6097 }, { "epoch": 1.74, "grad_norm": 12.437286376953125, "learning_rate": 3.835812356979405e-06, "loss": 0.8559, "step": 6098 }, { "epoch": 1.74, "grad_norm": 10.215106964111328, "learning_rate": 3.831521739130435e-06, "loss": 0.7449, "step": 6099 }, { "epoch": 1.74, "grad_norm": 10.478534698486328, "learning_rate": 3.827231121281465e-06, "loss": 0.6246, "step": 6100 }, { "epoch": 1.75, "grad_norm": 12.23443603515625, "learning_rate": 3.8229405034324945e-06, "loss": 0.582, "step": 6101 }, { "epoch": 1.75, "grad_norm": 11.197013854980469, "learning_rate": 3.818649885583524e-06, "loss": 0.6055, "step": 6102 }, { "epoch": 1.75, "grad_norm": 9.7366361618042, "learning_rate": 3.8143592677345533e-06, "loss": 0.7919, "step": 6103 }, { "epoch": 1.75, "grad_norm": 10.818952560424805, "learning_rate": 3.810068649885584e-06, "loss": 0.7185, "step": 6104 }, { "epoch": 1.75, "grad_norm": 13.971001625061035, "learning_rate": 3.8057780320366134e-06, "loss": 1.0051, "step": 6105 }, { "epoch": 1.75, "grad_norm": 9.492419242858887, "learning_rate": 3.801487414187643e-06, "loss": 0.5068, "step": 6106 }, { "epoch": 1.75, "grad_norm": 9.137680053710938, "learning_rate": 3.7971967963386726e-06, "loss": 0.5911, "step": 6107 }, { "epoch": 1.75, "grad_norm": 10.991887092590332, "learning_rate": 3.792906178489703e-06, "loss": 0.6473, "step": 6108 }, { "epoch": 1.75, "grad_norm": 8.027948379516602, "learning_rate": 3.7886155606407323e-06, "loss": 0.5314, "step": 6109 }, { "epoch": 1.75, "grad_norm": 11.149654388427734, "learning_rate": 3.784324942791762e-06, "loss": 0.6914, "step": 6110 }, { "epoch": 1.75, "grad_norm": 9.29753589630127, "learning_rate": 3.7800343249427915e-06, "loss": 0.4906, "step": 6111 }, { "epoch": 1.75, "grad_norm": 10.15299129486084, "learning_rate": 3.775743707093822e-06, "loss": 0.5583, "step": 6112 }, { "epoch": 1.75, "grad_norm": 10.863149642944336, "learning_rate": 3.7714530892448516e-06, "loss": 0.7952, "step": 6113 }, { "epoch": 1.75, "grad_norm": 13.928953170776367, "learning_rate": 3.7671624713958812e-06, "loss": 0.6771, "step": 6114 }, { "epoch": 1.75, "grad_norm": 12.386027336120605, "learning_rate": 3.7628718535469104e-06, "loss": 0.8803, "step": 6115 }, { "epoch": 1.75, "grad_norm": 9.897882461547852, "learning_rate": 3.758581235697941e-06, "loss": 0.7667, "step": 6116 }, { "epoch": 1.75, "grad_norm": 10.644085884094238, "learning_rate": 3.7542906178489705e-06, "loss": 0.7295, "step": 6117 }, { "epoch": 1.75, "grad_norm": 8.818511009216309, "learning_rate": 3.75e-06, "loss": 0.6042, "step": 6118 }, { "epoch": 1.75, "grad_norm": 9.307840347290039, "learning_rate": 3.7457093821510297e-06, "loss": 0.6812, "step": 6119 }, { "epoch": 1.75, "grad_norm": 13.10869026184082, "learning_rate": 3.7414187643020597e-06, "loss": 0.6901, "step": 6120 }, { "epoch": 1.75, "grad_norm": 12.035422325134277, "learning_rate": 3.7371281464530894e-06, "loss": 0.6088, "step": 6121 }, { "epoch": 1.75, "grad_norm": 9.764453887939453, "learning_rate": 3.732837528604119e-06, "loss": 0.7152, "step": 6122 }, { "epoch": 1.75, "grad_norm": 12.01958179473877, "learning_rate": 3.7285469107551486e-06, "loss": 0.7195, "step": 6123 }, { "epoch": 1.75, "grad_norm": 11.711091041564941, "learning_rate": 3.7242562929061786e-06, "loss": 0.7594, "step": 6124 }, { "epoch": 1.75, "grad_norm": 12.01634407043457, "learning_rate": 3.7199656750572082e-06, "loss": 0.5769, "step": 6125 }, { "epoch": 1.75, "grad_norm": 10.122314453125, "learning_rate": 3.7156750572082383e-06, "loss": 0.6724, "step": 6126 }, { "epoch": 1.75, "grad_norm": 11.244257926940918, "learning_rate": 3.711384439359268e-06, "loss": 0.5739, "step": 6127 }, { "epoch": 1.75, "grad_norm": 11.775614738464355, "learning_rate": 3.7070938215102975e-06, "loss": 0.6301, "step": 6128 }, { "epoch": 1.75, "grad_norm": 9.514596939086914, "learning_rate": 3.702803203661327e-06, "loss": 0.6, "step": 6129 }, { "epoch": 1.75, "grad_norm": 10.512495040893555, "learning_rate": 3.698512585812357e-06, "loss": 0.7625, "step": 6130 }, { "epoch": 1.75, "grad_norm": 9.708354949951172, "learning_rate": 3.6942219679633868e-06, "loss": 0.6674, "step": 6131 }, { "epoch": 1.75, "grad_norm": 11.710132598876953, "learning_rate": 3.689931350114417e-06, "loss": 0.7033, "step": 6132 }, { "epoch": 1.75, "grad_norm": 15.638635635375977, "learning_rate": 3.6856407322654464e-06, "loss": 0.7225, "step": 6133 }, { "epoch": 1.75, "grad_norm": 10.383947372436523, "learning_rate": 3.681350114416476e-06, "loss": 0.4643, "step": 6134 }, { "epoch": 1.75, "grad_norm": 12.879046440124512, "learning_rate": 3.6770594965675057e-06, "loss": 0.6736, "step": 6135 }, { "epoch": 1.76, "grad_norm": 10.509659767150879, "learning_rate": 3.6727688787185353e-06, "loss": 0.532, "step": 6136 }, { "epoch": 1.76, "grad_norm": 8.082070350646973, "learning_rate": 3.6684782608695653e-06, "loss": 0.5275, "step": 6137 }, { "epoch": 1.76, "grad_norm": 11.578171730041504, "learning_rate": 3.664187643020595e-06, "loss": 0.7137, "step": 6138 }, { "epoch": 1.76, "grad_norm": 10.211572647094727, "learning_rate": 3.659897025171625e-06, "loss": 0.5826, "step": 6139 }, { "epoch": 1.76, "grad_norm": 9.431300163269043, "learning_rate": 3.6556064073226546e-06, "loss": 0.5547, "step": 6140 }, { "epoch": 1.76, "grad_norm": 11.96328353881836, "learning_rate": 3.651315789473684e-06, "loss": 0.7601, "step": 6141 }, { "epoch": 1.76, "grad_norm": 11.621810913085938, "learning_rate": 3.647025171624714e-06, "loss": 0.7721, "step": 6142 }, { "epoch": 1.76, "grad_norm": 11.324017524719238, "learning_rate": 3.642734553775744e-06, "loss": 0.7112, "step": 6143 }, { "epoch": 1.76, "grad_norm": 10.572038650512695, "learning_rate": 3.6384439359267735e-06, "loss": 0.7062, "step": 6144 }, { "epoch": 1.76, "grad_norm": 12.122496604919434, "learning_rate": 3.6341533180778035e-06, "loss": 0.6158, "step": 6145 }, { "epoch": 1.76, "grad_norm": 13.798746109008789, "learning_rate": 3.629862700228833e-06, "loss": 0.9658, "step": 6146 }, { "epoch": 1.76, "grad_norm": 12.25180435180664, "learning_rate": 3.625572082379863e-06, "loss": 0.6471, "step": 6147 }, { "epoch": 1.76, "grad_norm": 8.908062934875488, "learning_rate": 3.6212814645308924e-06, "loss": 0.5552, "step": 6148 }, { "epoch": 1.76, "grad_norm": 11.949036598205566, "learning_rate": 3.616990846681922e-06, "loss": 0.6593, "step": 6149 }, { "epoch": 1.76, "grad_norm": 9.419204711914062, "learning_rate": 3.612700228832952e-06, "loss": 0.6012, "step": 6150 }, { "epoch": 1.76, "grad_norm": 10.818522453308105, "learning_rate": 3.6084096109839816e-06, "loss": 0.7202, "step": 6151 }, { "epoch": 1.76, "grad_norm": 10.00659465789795, "learning_rate": 3.6041189931350117e-06, "loss": 0.4724, "step": 6152 }, { "epoch": 1.76, "grad_norm": 9.23212718963623, "learning_rate": 3.5998283752860413e-06, "loss": 0.7177, "step": 6153 }, { "epoch": 1.76, "grad_norm": 10.0756196975708, "learning_rate": 3.5955377574370713e-06, "loss": 0.714, "step": 6154 }, { "epoch": 1.76, "grad_norm": 10.584141731262207, "learning_rate": 3.5912471395881005e-06, "loss": 0.4958, "step": 6155 }, { "epoch": 1.76, "grad_norm": 9.411657333374023, "learning_rate": 3.5869565217391305e-06, "loss": 0.578, "step": 6156 }, { "epoch": 1.76, "grad_norm": 10.629715919494629, "learning_rate": 3.58266590389016e-06, "loss": 0.5276, "step": 6157 }, { "epoch": 1.76, "grad_norm": 10.674521446228027, "learning_rate": 3.57837528604119e-06, "loss": 0.6469, "step": 6158 }, { "epoch": 1.76, "grad_norm": 8.006171226501465, "learning_rate": 3.57408466819222e-06, "loss": 0.4589, "step": 6159 }, { "epoch": 1.76, "grad_norm": 10.794471740722656, "learning_rate": 3.56979405034325e-06, "loss": 0.7086, "step": 6160 }, { "epoch": 1.76, "grad_norm": 9.56449031829834, "learning_rate": 3.565503432494279e-06, "loss": 0.6166, "step": 6161 }, { "epoch": 1.76, "grad_norm": 11.872832298278809, "learning_rate": 3.561212814645309e-06, "loss": 0.5607, "step": 6162 }, { "epoch": 1.76, "grad_norm": 11.3371000289917, "learning_rate": 3.5569221967963387e-06, "loss": 0.6424, "step": 6163 }, { "epoch": 1.76, "grad_norm": 9.539777755737305, "learning_rate": 3.5526315789473683e-06, "loss": 0.5611, "step": 6164 }, { "epoch": 1.76, "grad_norm": 10.240815162658691, "learning_rate": 3.5483409610983983e-06, "loss": 0.7481, "step": 6165 }, { "epoch": 1.76, "grad_norm": 11.382087707519531, "learning_rate": 3.544050343249428e-06, "loss": 0.6187, "step": 6166 }, { "epoch": 1.76, "grad_norm": 10.351128578186035, "learning_rate": 3.539759725400458e-06, "loss": 0.6702, "step": 6167 }, { "epoch": 1.76, "grad_norm": 8.965996742248535, "learning_rate": 3.535469107551487e-06, "loss": 0.7342, "step": 6168 }, { "epoch": 1.76, "grad_norm": 12.034710884094238, "learning_rate": 3.5311784897025172e-06, "loss": 0.7361, "step": 6169 }, { "epoch": 1.76, "grad_norm": 11.237125396728516, "learning_rate": 3.526887871853547e-06, "loss": 0.7888, "step": 6170 }, { "epoch": 1.77, "grad_norm": 11.04775333404541, "learning_rate": 3.522597254004577e-06, "loss": 0.7907, "step": 6171 }, { "epoch": 1.77, "grad_norm": 11.389314651489258, "learning_rate": 3.5183066361556065e-06, "loss": 0.7913, "step": 6172 }, { "epoch": 1.77, "grad_norm": 10.503778457641602, "learning_rate": 3.5140160183066365e-06, "loss": 0.5925, "step": 6173 }, { "epoch": 1.77, "grad_norm": 11.094098091125488, "learning_rate": 3.5097254004576657e-06, "loss": 0.6376, "step": 6174 }, { "epoch": 1.77, "grad_norm": 9.925131797790527, "learning_rate": 3.5054347826086958e-06, "loss": 0.4822, "step": 6175 }, { "epoch": 1.77, "grad_norm": 9.414146423339844, "learning_rate": 3.5011441647597254e-06, "loss": 0.5317, "step": 6176 }, { "epoch": 1.77, "grad_norm": 13.774288177490234, "learning_rate": 3.4968535469107554e-06, "loss": 0.861, "step": 6177 }, { "epoch": 1.77, "grad_norm": 10.283080101013184, "learning_rate": 3.492562929061785e-06, "loss": 0.6522, "step": 6178 }, { "epoch": 1.77, "grad_norm": 10.067567825317383, "learning_rate": 3.4882723112128147e-06, "loss": 0.5531, "step": 6179 }, { "epoch": 1.77, "grad_norm": 12.848753929138184, "learning_rate": 3.4839816933638447e-06, "loss": 0.7347, "step": 6180 }, { "epoch": 1.77, "grad_norm": 9.71597671508789, "learning_rate": 3.479691075514874e-06, "loss": 0.5949, "step": 6181 }, { "epoch": 1.77, "grad_norm": 13.107844352722168, "learning_rate": 3.475400457665904e-06, "loss": 0.5409, "step": 6182 }, { "epoch": 1.77, "grad_norm": 7.763223648071289, "learning_rate": 3.4711098398169335e-06, "loss": 0.564, "step": 6183 }, { "epoch": 1.77, "grad_norm": 12.058732032775879, "learning_rate": 3.4668192219679636e-06, "loss": 0.7095, "step": 6184 }, { "epoch": 1.77, "grad_norm": 10.082940101623535, "learning_rate": 3.462528604118993e-06, "loss": 0.586, "step": 6185 }, { "epoch": 1.77, "grad_norm": 11.730283737182617, "learning_rate": 3.4582379862700232e-06, "loss": 0.6136, "step": 6186 }, { "epoch": 1.77, "grad_norm": 7.0045928955078125, "learning_rate": 3.453947368421053e-06, "loss": 0.5827, "step": 6187 }, { "epoch": 1.77, "grad_norm": 15.763656616210938, "learning_rate": 3.4496567505720825e-06, "loss": 0.8763, "step": 6188 }, { "epoch": 1.77, "grad_norm": 9.833060264587402, "learning_rate": 3.445366132723112e-06, "loss": 0.8132, "step": 6189 }, { "epoch": 1.77, "grad_norm": 10.767908096313477, "learning_rate": 3.441075514874142e-06, "loss": 0.7116, "step": 6190 }, { "epoch": 1.77, "grad_norm": 9.660247802734375, "learning_rate": 3.4367848970251717e-06, "loss": 0.6864, "step": 6191 }, { "epoch": 1.77, "grad_norm": 9.962508201599121, "learning_rate": 3.4324942791762018e-06, "loss": 0.7094, "step": 6192 }, { "epoch": 1.77, "grad_norm": 9.406431198120117, "learning_rate": 3.4282036613272314e-06, "loss": 0.4943, "step": 6193 }, { "epoch": 1.77, "grad_norm": 13.485053062438965, "learning_rate": 3.4239130434782606e-06, "loss": 1.0711, "step": 6194 }, { "epoch": 1.77, "grad_norm": 11.053961753845215, "learning_rate": 3.4196224256292906e-06, "loss": 0.6344, "step": 6195 }, { "epoch": 1.77, "grad_norm": 9.669668197631836, "learning_rate": 3.4153318077803202e-06, "loss": 0.5333, "step": 6196 }, { "epoch": 1.77, "grad_norm": 9.433085441589355, "learning_rate": 3.4110411899313503e-06, "loss": 0.6433, "step": 6197 }, { "epoch": 1.77, "grad_norm": 8.06063175201416, "learning_rate": 3.40675057208238e-06, "loss": 0.4725, "step": 6198 }, { "epoch": 1.77, "grad_norm": 11.81443977355957, "learning_rate": 3.40245995423341e-06, "loss": 0.8544, "step": 6199 }, { "epoch": 1.77, "grad_norm": 8.285453796386719, "learning_rate": 3.3981693363844395e-06, "loss": 0.5734, "step": 6200 }, { "epoch": 1.77, "grad_norm": 10.867969512939453, "learning_rate": 3.393878718535469e-06, "loss": 0.7148, "step": 6201 }, { "epoch": 1.77, "grad_norm": 10.558439254760742, "learning_rate": 3.3895881006864988e-06, "loss": 0.8534, "step": 6202 }, { "epoch": 1.77, "grad_norm": 9.229147911071777, "learning_rate": 3.385297482837529e-06, "loss": 0.6502, "step": 6203 }, { "epoch": 1.77, "grad_norm": 10.644229888916016, "learning_rate": 3.3810068649885584e-06, "loss": 0.8017, "step": 6204 }, { "epoch": 1.77, "grad_norm": 10.682366371154785, "learning_rate": 3.3767162471395885e-06, "loss": 0.6322, "step": 6205 }, { "epoch": 1.78, "grad_norm": 13.435039520263672, "learning_rate": 3.372425629290618e-06, "loss": 0.6487, "step": 6206 }, { "epoch": 1.78, "grad_norm": 11.46129035949707, "learning_rate": 3.3681350114416477e-06, "loss": 0.607, "step": 6207 }, { "epoch": 1.78, "grad_norm": 10.996201515197754, "learning_rate": 3.3638443935926773e-06, "loss": 0.5365, "step": 6208 }, { "epoch": 1.78, "grad_norm": 11.303156852722168, "learning_rate": 3.359553775743707e-06, "loss": 0.6161, "step": 6209 }, { "epoch": 1.78, "grad_norm": 10.77161693572998, "learning_rate": 3.355263157894737e-06, "loss": 0.5759, "step": 6210 }, { "epoch": 1.78, "grad_norm": 10.97195816040039, "learning_rate": 3.3509725400457666e-06, "loss": 0.5865, "step": 6211 }, { "epoch": 1.78, "grad_norm": 12.53463077545166, "learning_rate": 3.3466819221967966e-06, "loss": 0.7462, "step": 6212 }, { "epoch": 1.78, "grad_norm": 10.610153198242188, "learning_rate": 3.3423913043478262e-06, "loss": 0.65, "step": 6213 }, { "epoch": 1.78, "grad_norm": 10.48507022857666, "learning_rate": 3.338100686498856e-06, "loss": 0.6284, "step": 6214 }, { "epoch": 1.78, "grad_norm": 10.51944351196289, "learning_rate": 3.3338100686498854e-06, "loss": 0.7416, "step": 6215 }, { "epoch": 1.78, "grad_norm": 12.190573692321777, "learning_rate": 3.3295194508009155e-06, "loss": 0.6409, "step": 6216 }, { "epoch": 1.78, "grad_norm": 9.088427543640137, "learning_rate": 3.325228832951945e-06, "loss": 0.6849, "step": 6217 }, { "epoch": 1.78, "grad_norm": 7.932781219482422, "learning_rate": 3.320938215102975e-06, "loss": 0.5349, "step": 6218 }, { "epoch": 1.78, "grad_norm": 10.231762886047363, "learning_rate": 3.3166475972540048e-06, "loss": 0.6828, "step": 6219 }, { "epoch": 1.78, "grad_norm": 11.04877758026123, "learning_rate": 3.3123569794050344e-06, "loss": 0.6749, "step": 6220 }, { "epoch": 1.78, "grad_norm": 6.348419189453125, "learning_rate": 3.308066361556064e-06, "loss": 0.3623, "step": 6221 }, { "epoch": 1.78, "grad_norm": 9.415176391601562, "learning_rate": 3.303775743707094e-06, "loss": 0.5816, "step": 6222 }, { "epoch": 1.78, "grad_norm": 9.620346069335938, "learning_rate": 3.2994851258581236e-06, "loss": 0.5649, "step": 6223 }, { "epoch": 1.78, "grad_norm": 11.188264846801758, "learning_rate": 3.2951945080091533e-06, "loss": 0.7468, "step": 6224 }, { "epoch": 1.78, "grad_norm": 11.869451522827148, "learning_rate": 3.2909038901601833e-06, "loss": 0.9742, "step": 6225 }, { "epoch": 1.78, "grad_norm": 12.979801177978516, "learning_rate": 3.286613272311213e-06, "loss": 0.6048, "step": 6226 }, { "epoch": 1.78, "grad_norm": 12.555614471435547, "learning_rate": 3.2823226544622425e-06, "loss": 0.7426, "step": 6227 }, { "epoch": 1.78, "grad_norm": 9.1586275100708, "learning_rate": 3.278032036613272e-06, "loss": 0.5555, "step": 6228 }, { "epoch": 1.78, "grad_norm": 10.315823554992676, "learning_rate": 3.273741418764302e-06, "loss": 0.5908, "step": 6229 }, { "epoch": 1.78, "grad_norm": 8.977620124816895, "learning_rate": 3.269450800915332e-06, "loss": 0.638, "step": 6230 }, { "epoch": 1.78, "grad_norm": 12.179141998291016, "learning_rate": 3.265160183066362e-06, "loss": 0.6091, "step": 6231 }, { "epoch": 1.78, "grad_norm": 10.679584503173828, "learning_rate": 3.2608695652173914e-06, "loss": 0.546, "step": 6232 }, { "epoch": 1.78, "grad_norm": 8.82325267791748, "learning_rate": 3.256578947368421e-06, "loss": 0.7694, "step": 6233 }, { "epoch": 1.78, "grad_norm": 7.902207374572754, "learning_rate": 3.2522883295194507e-06, "loss": 0.6916, "step": 6234 }, { "epoch": 1.78, "grad_norm": 9.770059585571289, "learning_rate": 3.2479977116704807e-06, "loss": 0.6605, "step": 6235 }, { "epoch": 1.78, "grad_norm": 9.278421401977539, "learning_rate": 3.2437070938215103e-06, "loss": 0.5572, "step": 6236 }, { "epoch": 1.78, "grad_norm": 10.935799598693848, "learning_rate": 3.2394164759725404e-06, "loss": 0.6734, "step": 6237 }, { "epoch": 1.78, "grad_norm": 13.630748748779297, "learning_rate": 3.23512585812357e-06, "loss": 0.744, "step": 6238 }, { "epoch": 1.78, "grad_norm": 10.820865631103516, "learning_rate": 3.2308352402745996e-06, "loss": 0.6496, "step": 6239 }, { "epoch": 1.78, "grad_norm": 12.565092086791992, "learning_rate": 3.226544622425629e-06, "loss": 0.8417, "step": 6240 }, { "epoch": 1.79, "grad_norm": 10.59786319732666, "learning_rate": 3.222254004576659e-06, "loss": 0.6452, "step": 6241 }, { "epoch": 1.79, "grad_norm": 11.53398323059082, "learning_rate": 3.217963386727689e-06, "loss": 0.6539, "step": 6242 }, { "epoch": 1.79, "grad_norm": 9.137362480163574, "learning_rate": 3.2136727688787185e-06, "loss": 0.6683, "step": 6243 }, { "epoch": 1.79, "grad_norm": 9.848520278930664, "learning_rate": 3.2093821510297485e-06, "loss": 0.7281, "step": 6244 }, { "epoch": 1.79, "grad_norm": 12.271465301513672, "learning_rate": 3.205091533180778e-06, "loss": 0.5905, "step": 6245 }, { "epoch": 1.79, "grad_norm": 8.549403190612793, "learning_rate": 3.200800915331808e-06, "loss": 0.7741, "step": 6246 }, { "epoch": 1.79, "grad_norm": 10.232514381408691, "learning_rate": 3.1965102974828374e-06, "loss": 0.5319, "step": 6247 }, { "epoch": 1.79, "grad_norm": 8.45359992980957, "learning_rate": 3.1922196796338674e-06, "loss": 0.6075, "step": 6248 }, { "epoch": 1.79, "grad_norm": 9.884468078613281, "learning_rate": 3.187929061784897e-06, "loss": 0.6882, "step": 6249 }, { "epoch": 1.79, "grad_norm": 11.395963668823242, "learning_rate": 3.183638443935927e-06, "loss": 0.5281, "step": 6250 }, { "epoch": 1.79, "grad_norm": 10.683157920837402, "learning_rate": 3.1793478260869567e-06, "loss": 0.7669, "step": 6251 }, { "epoch": 1.79, "grad_norm": 10.354310989379883, "learning_rate": 3.1750572082379867e-06, "loss": 0.6731, "step": 6252 }, { "epoch": 1.79, "grad_norm": 11.60689640045166, "learning_rate": 3.170766590389016e-06, "loss": 0.6892, "step": 6253 }, { "epoch": 1.79, "grad_norm": 10.987874984741211, "learning_rate": 3.1664759725400455e-06, "loss": 0.6374, "step": 6254 }, { "epoch": 1.79, "grad_norm": 10.786839485168457, "learning_rate": 3.1621853546910755e-06, "loss": 0.5039, "step": 6255 }, { "epoch": 1.79, "grad_norm": 13.27176284790039, "learning_rate": 3.157894736842105e-06, "loss": 0.6787, "step": 6256 }, { "epoch": 1.79, "grad_norm": 11.333096504211426, "learning_rate": 3.153604118993135e-06, "loss": 0.664, "step": 6257 }, { "epoch": 1.79, "grad_norm": 10.735307693481445, "learning_rate": 3.149313501144165e-06, "loss": 0.7826, "step": 6258 }, { "epoch": 1.79, "grad_norm": 10.006080627441406, "learning_rate": 3.145022883295195e-06, "loss": 0.5373, "step": 6259 }, { "epoch": 1.79, "grad_norm": 10.427276611328125, "learning_rate": 3.140732265446224e-06, "loss": 0.9635, "step": 6260 }, { "epoch": 1.79, "grad_norm": 11.013106346130371, "learning_rate": 3.136441647597254e-06, "loss": 0.8264, "step": 6261 }, { "epoch": 1.79, "grad_norm": 9.57888126373291, "learning_rate": 3.1321510297482837e-06, "loss": 0.6285, "step": 6262 }, { "epoch": 1.79, "grad_norm": 11.295920372009277, "learning_rate": 3.1278604118993137e-06, "loss": 0.9006, "step": 6263 }, { "epoch": 1.79, "grad_norm": 11.340927124023438, "learning_rate": 3.1235697940503434e-06, "loss": 0.6735, "step": 6264 }, { "epoch": 1.79, "grad_norm": 8.415022850036621, "learning_rate": 3.1192791762013734e-06, "loss": 0.6388, "step": 6265 }, { "epoch": 1.79, "grad_norm": 11.195077896118164, "learning_rate": 3.1149885583524026e-06, "loss": 0.6975, "step": 6266 }, { "epoch": 1.79, "grad_norm": 9.73503589630127, "learning_rate": 3.1106979405034326e-06, "loss": 0.7999, "step": 6267 }, { "epoch": 1.79, "grad_norm": 13.010418891906738, "learning_rate": 3.1064073226544622e-06, "loss": 0.7029, "step": 6268 }, { "epoch": 1.79, "grad_norm": 10.655797958374023, "learning_rate": 3.102116704805492e-06, "loss": 0.6374, "step": 6269 }, { "epoch": 1.79, "grad_norm": 6.9743852615356445, "learning_rate": 3.097826086956522e-06, "loss": 0.5332, "step": 6270 }, { "epoch": 1.79, "grad_norm": 8.947936058044434, "learning_rate": 3.0935354691075515e-06, "loss": 0.5842, "step": 6271 }, { "epoch": 1.79, "grad_norm": 10.933826446533203, "learning_rate": 3.0892448512585815e-06, "loss": 0.9045, "step": 6272 }, { "epoch": 1.79, "grad_norm": 9.997200965881348, "learning_rate": 3.0849542334096107e-06, "loss": 0.7061, "step": 6273 }, { "epoch": 1.79, "grad_norm": 8.821471214294434, "learning_rate": 3.0806636155606408e-06, "loss": 0.6473, "step": 6274 }, { "epoch": 1.79, "grad_norm": 9.868239402770996, "learning_rate": 3.0763729977116704e-06, "loss": 0.6916, "step": 6275 }, { "epoch": 1.8, "grad_norm": 13.37822151184082, "learning_rate": 3.0720823798627004e-06, "loss": 0.7249, "step": 6276 }, { "epoch": 1.8, "grad_norm": 11.691786766052246, "learning_rate": 3.06779176201373e-06, "loss": 0.6795, "step": 6277 }, { "epoch": 1.8, "grad_norm": 12.730964660644531, "learning_rate": 3.06350114416476e-06, "loss": 1.0718, "step": 6278 }, { "epoch": 1.8, "grad_norm": 13.505888938903809, "learning_rate": 3.0592105263157897e-06, "loss": 0.8676, "step": 6279 }, { "epoch": 1.8, "grad_norm": 11.337920188903809, "learning_rate": 3.0549199084668193e-06, "loss": 0.7076, "step": 6280 }, { "epoch": 1.8, "grad_norm": 12.364757537841797, "learning_rate": 3.050629290617849e-06, "loss": 0.9709, "step": 6281 }, { "epoch": 1.8, "grad_norm": 10.987433433532715, "learning_rate": 3.046338672768879e-06, "loss": 0.3746, "step": 6282 }, { "epoch": 1.8, "grad_norm": 10.072982788085938, "learning_rate": 3.0420480549199086e-06, "loss": 0.6641, "step": 6283 }, { "epoch": 1.8, "grad_norm": 12.776458740234375, "learning_rate": 3.037757437070938e-06, "loss": 0.6595, "step": 6284 }, { "epoch": 1.8, "grad_norm": 11.782681465148926, "learning_rate": 3.0334668192219682e-06, "loss": 0.7771, "step": 6285 }, { "epoch": 1.8, "grad_norm": 10.204319953918457, "learning_rate": 3.0291762013729974e-06, "loss": 0.6043, "step": 6286 }, { "epoch": 1.8, "grad_norm": 10.723920822143555, "learning_rate": 3.0248855835240275e-06, "loss": 0.8629, "step": 6287 }, { "epoch": 1.8, "grad_norm": 10.14091682434082, "learning_rate": 3.020594965675057e-06, "loss": 0.5764, "step": 6288 }, { "epoch": 1.8, "grad_norm": 9.678997039794922, "learning_rate": 3.016304347826087e-06, "loss": 0.5826, "step": 6289 }, { "epoch": 1.8, "grad_norm": 8.583478927612305, "learning_rate": 3.0120137299771167e-06, "loss": 0.5579, "step": 6290 }, { "epoch": 1.8, "grad_norm": 9.08668041229248, "learning_rate": 3.0077231121281468e-06, "loss": 0.5925, "step": 6291 }, { "epoch": 1.8, "grad_norm": 9.817574501037598, "learning_rate": 3.0034324942791764e-06, "loss": 0.6855, "step": 6292 }, { "epoch": 1.8, "grad_norm": 13.133573532104492, "learning_rate": 2.999141876430206e-06, "loss": 0.8879, "step": 6293 }, { "epoch": 1.8, "grad_norm": 10.072456359863281, "learning_rate": 2.9948512585812356e-06, "loss": 0.8635, "step": 6294 }, { "epoch": 1.8, "grad_norm": 10.36079216003418, "learning_rate": 2.9905606407322657e-06, "loss": 0.7124, "step": 6295 }, { "epoch": 1.8, "grad_norm": 9.779273986816406, "learning_rate": 2.9862700228832953e-06, "loss": 0.5912, "step": 6296 }, { "epoch": 1.8, "grad_norm": 11.917943954467773, "learning_rate": 2.9819794050343253e-06, "loss": 0.9621, "step": 6297 }, { "epoch": 1.8, "grad_norm": 10.256078720092773, "learning_rate": 2.977688787185355e-06, "loss": 0.6369, "step": 6298 }, { "epoch": 1.8, "grad_norm": 8.891809463500977, "learning_rate": 2.973398169336384e-06, "loss": 0.6131, "step": 6299 }, { "epoch": 1.8, "grad_norm": 10.462176322937012, "learning_rate": 2.969107551487414e-06, "loss": 0.9024, "step": 6300 }, { "epoch": 1.8, "grad_norm": 11.854138374328613, "learning_rate": 2.9648169336384438e-06, "loss": 0.8871, "step": 6301 }, { "epoch": 1.8, "grad_norm": 11.050225257873535, "learning_rate": 2.960526315789474e-06, "loss": 0.6207, "step": 6302 }, { "epoch": 1.8, "grad_norm": 13.017751693725586, "learning_rate": 2.9562356979405034e-06, "loss": 0.5799, "step": 6303 }, { "epoch": 1.8, "grad_norm": 10.159361839294434, "learning_rate": 2.9519450800915335e-06, "loss": 0.6183, "step": 6304 }, { "epoch": 1.8, "grad_norm": 9.185395240783691, "learning_rate": 2.947654462242563e-06, "loss": 0.5525, "step": 6305 }, { "epoch": 1.8, "grad_norm": 10.041814804077148, "learning_rate": 2.9433638443935927e-06, "loss": 0.6716, "step": 6306 }, { "epoch": 1.8, "grad_norm": 11.238035202026367, "learning_rate": 2.9390732265446223e-06, "loss": 0.7286, "step": 6307 }, { "epoch": 1.8, "grad_norm": 7.606348991394043, "learning_rate": 2.9347826086956523e-06, "loss": 0.5237, "step": 6308 }, { "epoch": 1.8, "grad_norm": 11.537421226501465, "learning_rate": 2.930491990846682e-06, "loss": 0.7842, "step": 6309 }, { "epoch": 1.8, "grad_norm": 9.478804588317871, "learning_rate": 2.926201372997712e-06, "loss": 0.6767, "step": 6310 }, { "epoch": 1.81, "grad_norm": 9.909363746643066, "learning_rate": 2.9219107551487416e-06, "loss": 0.7462, "step": 6311 }, { "epoch": 1.81, "grad_norm": 11.550567626953125, "learning_rate": 2.9176201372997712e-06, "loss": 0.6006, "step": 6312 }, { "epoch": 1.81, "grad_norm": 9.454132080078125, "learning_rate": 2.913329519450801e-06, "loss": 0.684, "step": 6313 }, { "epoch": 1.81, "grad_norm": 8.273576736450195, "learning_rate": 2.9090389016018305e-06, "loss": 0.6919, "step": 6314 }, { "epoch": 1.81, "grad_norm": 9.448694229125977, "learning_rate": 2.9047482837528605e-06, "loss": 0.5262, "step": 6315 }, { "epoch": 1.81, "grad_norm": 10.833744049072266, "learning_rate": 2.90045766590389e-06, "loss": 0.7616, "step": 6316 }, { "epoch": 1.81, "grad_norm": 11.065820693969727, "learning_rate": 2.89616704805492e-06, "loss": 0.4999, "step": 6317 }, { "epoch": 1.81, "grad_norm": 9.91887092590332, "learning_rate": 2.8918764302059498e-06, "loss": 0.4809, "step": 6318 }, { "epoch": 1.81, "grad_norm": 11.010311126708984, "learning_rate": 2.8875858123569794e-06, "loss": 0.8754, "step": 6319 }, { "epoch": 1.81, "grad_norm": 12.345414161682129, "learning_rate": 2.883295194508009e-06, "loss": 0.7151, "step": 6320 }, { "epoch": 1.81, "grad_norm": 12.316413879394531, "learning_rate": 2.879004576659039e-06, "loss": 0.6672, "step": 6321 }, { "epoch": 1.81, "grad_norm": 8.642687797546387, "learning_rate": 2.8747139588100686e-06, "loss": 0.4366, "step": 6322 }, { "epoch": 1.81, "grad_norm": 10.080451011657715, "learning_rate": 2.8704233409610987e-06, "loss": 0.6632, "step": 6323 }, { "epoch": 1.81, "grad_norm": 11.197017669677734, "learning_rate": 2.8661327231121283e-06, "loss": 0.9502, "step": 6324 }, { "epoch": 1.81, "grad_norm": 11.442328453063965, "learning_rate": 2.861842105263158e-06, "loss": 0.6707, "step": 6325 }, { "epoch": 1.81, "grad_norm": 11.40806770324707, "learning_rate": 2.8575514874141875e-06, "loss": 0.6401, "step": 6326 }, { "epoch": 1.81, "grad_norm": 10.922065734863281, "learning_rate": 2.8532608695652176e-06, "loss": 0.8657, "step": 6327 }, { "epoch": 1.81, "grad_norm": 9.807641983032227, "learning_rate": 2.848970251716247e-06, "loss": 0.7005, "step": 6328 }, { "epoch": 1.81, "grad_norm": 8.661300659179688, "learning_rate": 2.844679633867277e-06, "loss": 0.5264, "step": 6329 }, { "epoch": 1.81, "grad_norm": 8.730284690856934, "learning_rate": 2.840389016018307e-06, "loss": 0.4594, "step": 6330 }, { "epoch": 1.81, "grad_norm": 9.94897747039795, "learning_rate": 2.8360983981693364e-06, "loss": 0.6994, "step": 6331 }, { "epoch": 1.81, "grad_norm": 10.989338874816895, "learning_rate": 2.831807780320366e-06, "loss": 0.6297, "step": 6332 }, { "epoch": 1.81, "grad_norm": 12.295429229736328, "learning_rate": 2.8275171624713957e-06, "loss": 0.8819, "step": 6333 }, { "epoch": 1.81, "grad_norm": 9.570334434509277, "learning_rate": 2.8232265446224257e-06, "loss": 0.4966, "step": 6334 }, { "epoch": 1.81, "grad_norm": 11.286737442016602, "learning_rate": 2.8189359267734553e-06, "loss": 0.9316, "step": 6335 }, { "epoch": 1.81, "grad_norm": 8.949596405029297, "learning_rate": 2.8146453089244854e-06, "loss": 0.4447, "step": 6336 }, { "epoch": 1.81, "grad_norm": 10.223958015441895, "learning_rate": 2.810354691075515e-06, "loss": 0.6091, "step": 6337 }, { "epoch": 1.81, "grad_norm": 10.857589721679688, "learning_rate": 2.806064073226545e-06, "loss": 0.6198, "step": 6338 }, { "epoch": 1.81, "grad_norm": 11.978499412536621, "learning_rate": 2.8017734553775742e-06, "loss": 0.6739, "step": 6339 }, { "epoch": 1.81, "grad_norm": 8.985401153564453, "learning_rate": 2.7974828375286043e-06, "loss": 0.6556, "step": 6340 }, { "epoch": 1.81, "grad_norm": 9.769513130187988, "learning_rate": 2.793192219679634e-06, "loss": 0.595, "step": 6341 }, { "epoch": 1.81, "grad_norm": 10.520943641662598, "learning_rate": 2.788901601830664e-06, "loss": 0.6485, "step": 6342 }, { "epoch": 1.81, "grad_norm": 13.118606567382812, "learning_rate": 2.7846109839816935e-06, "loss": 0.8612, "step": 6343 }, { "epoch": 1.81, "grad_norm": 10.005794525146484, "learning_rate": 2.7803203661327236e-06, "loss": 0.7682, "step": 6344 }, { "epoch": 1.81, "grad_norm": 8.515554428100586, "learning_rate": 2.7760297482837528e-06, "loss": 0.5026, "step": 6345 }, { "epoch": 1.82, "grad_norm": 14.833863258361816, "learning_rate": 2.7717391304347824e-06, "loss": 0.9351, "step": 6346 }, { "epoch": 1.82, "grad_norm": 11.056303024291992, "learning_rate": 2.7674485125858124e-06, "loss": 0.5222, "step": 6347 }, { "epoch": 1.82, "grad_norm": 9.482291221618652, "learning_rate": 2.763157894736842e-06, "loss": 0.6263, "step": 6348 }, { "epoch": 1.82, "grad_norm": 11.735074043273926, "learning_rate": 2.758867276887872e-06, "loss": 0.7135, "step": 6349 }, { "epoch": 1.82, "grad_norm": 9.922060012817383, "learning_rate": 2.7545766590389017e-06, "loss": 0.6701, "step": 6350 }, { "epoch": 1.82, "grad_norm": 7.189872741699219, "learning_rate": 2.7502860411899317e-06, "loss": 0.3701, "step": 6351 }, { "epoch": 1.82, "grad_norm": 10.476938247680664, "learning_rate": 2.745995423340961e-06, "loss": 0.7437, "step": 6352 }, { "epoch": 1.82, "grad_norm": 10.8806791305542, "learning_rate": 2.741704805491991e-06, "loss": 0.5855, "step": 6353 }, { "epoch": 1.82, "grad_norm": 11.708989143371582, "learning_rate": 2.7374141876430206e-06, "loss": 0.737, "step": 6354 }, { "epoch": 1.82, "grad_norm": 8.859798431396484, "learning_rate": 2.7331235697940506e-06, "loss": 0.467, "step": 6355 }, { "epoch": 1.82, "grad_norm": 9.817736625671387, "learning_rate": 2.72883295194508e-06, "loss": 0.7523, "step": 6356 }, { "epoch": 1.82, "grad_norm": 11.412315368652344, "learning_rate": 2.7245423340961102e-06, "loss": 0.7811, "step": 6357 }, { "epoch": 1.82, "grad_norm": 10.915645599365234, "learning_rate": 2.7202517162471394e-06, "loss": 0.8479, "step": 6358 }, { "epoch": 1.82, "grad_norm": 11.466460227966309, "learning_rate": 2.7159610983981695e-06, "loss": 0.5644, "step": 6359 }, { "epoch": 1.82, "grad_norm": 8.003796577453613, "learning_rate": 2.711670480549199e-06, "loss": 0.5993, "step": 6360 }, { "epoch": 1.82, "grad_norm": 12.29226016998291, "learning_rate": 2.7073798627002287e-06, "loss": 0.6913, "step": 6361 }, { "epoch": 1.82, "grad_norm": 9.387224197387695, "learning_rate": 2.7030892448512587e-06, "loss": 0.5956, "step": 6362 }, { "epoch": 1.82, "grad_norm": 11.623571395874023, "learning_rate": 2.6987986270022884e-06, "loss": 0.7545, "step": 6363 }, { "epoch": 1.82, "grad_norm": 11.96871280670166, "learning_rate": 2.6945080091533184e-06, "loss": 0.553, "step": 6364 }, { "epoch": 1.82, "grad_norm": 12.344478607177734, "learning_rate": 2.6902173913043476e-06, "loss": 0.7015, "step": 6365 }, { "epoch": 1.82, "grad_norm": 10.739533424377441, "learning_rate": 2.6859267734553776e-06, "loss": 0.4474, "step": 6366 }, { "epoch": 1.82, "grad_norm": 8.55423641204834, "learning_rate": 2.6816361556064072e-06, "loss": 0.6801, "step": 6367 }, { "epoch": 1.82, "grad_norm": 8.852259635925293, "learning_rate": 2.6773455377574373e-06, "loss": 0.5669, "step": 6368 }, { "epoch": 1.82, "grad_norm": 9.556403160095215, "learning_rate": 2.673054919908467e-06, "loss": 0.678, "step": 6369 }, { "epoch": 1.82, "grad_norm": 12.659199714660645, "learning_rate": 2.668764302059497e-06, "loss": 0.7636, "step": 6370 }, { "epoch": 1.82, "grad_norm": 10.23427677154541, "learning_rate": 2.6644736842105266e-06, "loss": 0.684, "step": 6371 }, { "epoch": 1.82, "grad_norm": 11.79377269744873, "learning_rate": 2.660183066361556e-06, "loss": 0.6206, "step": 6372 }, { "epoch": 1.82, "grad_norm": 10.17258358001709, "learning_rate": 2.6558924485125858e-06, "loss": 0.712, "step": 6373 }, { "epoch": 1.82, "grad_norm": 11.21706771850586, "learning_rate": 2.651601830663616e-06, "loss": 0.8673, "step": 6374 }, { "epoch": 1.82, "grad_norm": 11.416669845581055, "learning_rate": 2.6473112128146454e-06, "loss": 0.8331, "step": 6375 }, { "epoch": 1.82, "grad_norm": 9.43038558959961, "learning_rate": 2.643020594965675e-06, "loss": 0.5575, "step": 6376 }, { "epoch": 1.82, "grad_norm": 7.555305004119873, "learning_rate": 2.638729977116705e-06, "loss": 0.5514, "step": 6377 }, { "epoch": 1.82, "grad_norm": 8.861245155334473, "learning_rate": 2.6344393592677343e-06, "loss": 0.7121, "step": 6378 }, { "epoch": 1.82, "grad_norm": 14.446701049804688, "learning_rate": 2.6301487414187643e-06, "loss": 0.662, "step": 6379 }, { "epoch": 1.82, "grad_norm": 10.05135726928711, "learning_rate": 2.625858123569794e-06, "loss": 0.7382, "step": 6380 }, { "epoch": 1.83, "grad_norm": 11.255631446838379, "learning_rate": 2.621567505720824e-06, "loss": 0.655, "step": 6381 }, { "epoch": 1.83, "grad_norm": 11.895508766174316, "learning_rate": 2.6172768878718536e-06, "loss": 0.7342, "step": 6382 }, { "epoch": 1.83, "grad_norm": 10.853370666503906, "learning_rate": 2.6129862700228836e-06, "loss": 0.5712, "step": 6383 }, { "epoch": 1.83, "grad_norm": 12.39453411102295, "learning_rate": 2.6086956521739132e-06, "loss": 0.9292, "step": 6384 }, { "epoch": 1.83, "grad_norm": 8.208925247192383, "learning_rate": 2.604405034324943e-06, "loss": 0.578, "step": 6385 }, { "epoch": 1.83, "grad_norm": 11.307624816894531, "learning_rate": 2.6001144164759725e-06, "loss": 0.7251, "step": 6386 }, { "epoch": 1.83, "grad_norm": 8.692691802978516, "learning_rate": 2.5958237986270025e-06, "loss": 0.5645, "step": 6387 }, { "epoch": 1.83, "grad_norm": 11.707426071166992, "learning_rate": 2.591533180778032e-06, "loss": 0.5614, "step": 6388 }, { "epoch": 1.83, "grad_norm": 11.854446411132812, "learning_rate": 2.587242562929062e-06, "loss": 0.801, "step": 6389 }, { "epoch": 1.83, "grad_norm": 7.33786678314209, "learning_rate": 2.5829519450800918e-06, "loss": 0.4136, "step": 6390 }, { "epoch": 1.83, "grad_norm": 11.048632621765137, "learning_rate": 2.578661327231121e-06, "loss": 0.7862, "step": 6391 }, { "epoch": 1.83, "grad_norm": 10.102787017822266, "learning_rate": 2.574370709382151e-06, "loss": 0.6187, "step": 6392 }, { "epoch": 1.83, "grad_norm": 18.396352767944336, "learning_rate": 2.5700800915331806e-06, "loss": 0.682, "step": 6393 }, { "epoch": 1.83, "grad_norm": 13.416193008422852, "learning_rate": 2.5657894736842107e-06, "loss": 0.7955, "step": 6394 }, { "epoch": 1.83, "grad_norm": 7.945309162139893, "learning_rate": 2.5614988558352403e-06, "loss": 0.7932, "step": 6395 }, { "epoch": 1.83, "grad_norm": 10.73609733581543, "learning_rate": 2.5572082379862703e-06, "loss": 0.6372, "step": 6396 }, { "epoch": 1.83, "grad_norm": 9.39498519897461, "learning_rate": 2.5529176201373e-06, "loss": 0.5106, "step": 6397 }, { "epoch": 1.83, "grad_norm": 9.900101661682129, "learning_rate": 2.5486270022883295e-06, "loss": 0.6601, "step": 6398 }, { "epoch": 1.83, "grad_norm": 10.181139945983887, "learning_rate": 2.544336384439359e-06, "loss": 0.7377, "step": 6399 }, { "epoch": 1.83, "grad_norm": 8.726524353027344, "learning_rate": 2.540045766590389e-06, "loss": 0.4742, "step": 6400 }, { "epoch": 1.83, "grad_norm": 10.729772567749023, "learning_rate": 2.535755148741419e-06, "loss": 0.4859, "step": 6401 }, { "epoch": 1.83, "grad_norm": 10.562342643737793, "learning_rate": 2.531464530892449e-06, "loss": 0.7937, "step": 6402 }, { "epoch": 1.83, "grad_norm": 10.791984558105469, "learning_rate": 2.5271739130434785e-06, "loss": 0.6454, "step": 6403 }, { "epoch": 1.83, "grad_norm": 13.546317100524902, "learning_rate": 2.522883295194508e-06, "loss": 0.8172, "step": 6404 }, { "epoch": 1.83, "grad_norm": 10.978838920593262, "learning_rate": 2.5185926773455377e-06, "loss": 0.5623, "step": 6405 }, { "epoch": 1.83, "grad_norm": 10.70091724395752, "learning_rate": 2.5143020594965673e-06, "loss": 0.7327, "step": 6406 }, { "epoch": 1.83, "grad_norm": 9.919500350952148, "learning_rate": 2.5100114416475973e-06, "loss": 0.6551, "step": 6407 }, { "epoch": 1.83, "grad_norm": 10.135588645935059, "learning_rate": 2.505720823798627e-06, "loss": 0.6837, "step": 6408 }, { "epoch": 1.83, "grad_norm": 10.382122039794922, "learning_rate": 2.501430205949657e-06, "loss": 0.6848, "step": 6409 }, { "epoch": 1.83, "grad_norm": 10.038745880126953, "learning_rate": 2.4971395881006866e-06, "loss": 0.5203, "step": 6410 }, { "epoch": 1.83, "grad_norm": 11.14279842376709, "learning_rate": 2.4928489702517162e-06, "loss": 0.5948, "step": 6411 }, { "epoch": 1.83, "grad_norm": 10.874357223510742, "learning_rate": 2.488558352402746e-06, "loss": 0.7269, "step": 6412 }, { "epoch": 1.83, "grad_norm": 10.609356880187988, "learning_rate": 2.484267734553776e-06, "loss": 0.5865, "step": 6413 }, { "epoch": 1.83, "grad_norm": 9.98708724975586, "learning_rate": 2.4799771167048055e-06, "loss": 0.7918, "step": 6414 }, { "epoch": 1.83, "grad_norm": 11.888640403747559, "learning_rate": 2.4756864988558355e-06, "loss": 0.6676, "step": 6415 }, { "epoch": 1.84, "grad_norm": 10.499030113220215, "learning_rate": 2.471395881006865e-06, "loss": 0.5232, "step": 6416 }, { "epoch": 1.84, "grad_norm": 9.32913589477539, "learning_rate": 2.4671052631578948e-06, "loss": 0.5939, "step": 6417 }, { "epoch": 1.84, "grad_norm": 10.895303726196289, "learning_rate": 2.4628146453089244e-06, "loss": 0.7183, "step": 6418 }, { "epoch": 1.84, "grad_norm": 9.079419136047363, "learning_rate": 2.4585240274599544e-06, "loss": 0.7644, "step": 6419 }, { "epoch": 1.84, "grad_norm": 10.472522735595703, "learning_rate": 2.454233409610984e-06, "loss": 0.6486, "step": 6420 }, { "epoch": 1.84, "grad_norm": 8.136340141296387, "learning_rate": 2.4499427917620136e-06, "loss": 0.5381, "step": 6421 }, { "epoch": 1.84, "grad_norm": 10.92373275756836, "learning_rate": 2.4456521739130437e-06, "loss": 0.6147, "step": 6422 }, { "epoch": 1.84, "grad_norm": 12.690459251403809, "learning_rate": 2.4413615560640733e-06, "loss": 0.569, "step": 6423 }, { "epoch": 1.84, "grad_norm": 11.713183403015137, "learning_rate": 2.437070938215103e-06, "loss": 0.7141, "step": 6424 }, { "epoch": 1.84, "grad_norm": 10.668099403381348, "learning_rate": 2.4327803203661325e-06, "loss": 0.7687, "step": 6425 }, { "epoch": 1.84, "grad_norm": 14.403892517089844, "learning_rate": 2.4284897025171626e-06, "loss": 0.7468, "step": 6426 }, { "epoch": 1.84, "grad_norm": 7.870250225067139, "learning_rate": 2.424199084668192e-06, "loss": 0.6532, "step": 6427 }, { "epoch": 1.84, "grad_norm": 12.25476360321045, "learning_rate": 2.4199084668192222e-06, "loss": 0.6964, "step": 6428 }, { "epoch": 1.84, "grad_norm": 12.443902969360352, "learning_rate": 2.415617848970252e-06, "loss": 0.9736, "step": 6429 }, { "epoch": 1.84, "grad_norm": 13.772923469543457, "learning_rate": 2.411327231121282e-06, "loss": 0.7917, "step": 6430 }, { "epoch": 1.84, "grad_norm": 12.89171028137207, "learning_rate": 2.407036613272311e-06, "loss": 0.7432, "step": 6431 }, { "epoch": 1.84, "grad_norm": 10.942740440368652, "learning_rate": 2.402745995423341e-06, "loss": 0.6137, "step": 6432 }, { "epoch": 1.84, "grad_norm": 9.382735252380371, "learning_rate": 2.3984553775743707e-06, "loss": 0.6103, "step": 6433 }, { "epoch": 1.84, "grad_norm": 11.818822860717773, "learning_rate": 2.3941647597254008e-06, "loss": 0.7706, "step": 6434 }, { "epoch": 1.84, "grad_norm": 12.318153381347656, "learning_rate": 2.3898741418764304e-06, "loss": 0.5996, "step": 6435 }, { "epoch": 1.84, "grad_norm": 11.190994262695312, "learning_rate": 2.38558352402746e-06, "loss": 0.7502, "step": 6436 }, { "epoch": 1.84, "grad_norm": 10.103891372680664, "learning_rate": 2.3812929061784896e-06, "loss": 0.6892, "step": 6437 }, { "epoch": 1.84, "grad_norm": 13.064241409301758, "learning_rate": 2.3770022883295192e-06, "loss": 0.7656, "step": 6438 }, { "epoch": 1.84, "grad_norm": 15.39616870880127, "learning_rate": 2.3727116704805493e-06, "loss": 0.8093, "step": 6439 }, { "epoch": 1.84, "grad_norm": 14.783260345458984, "learning_rate": 2.368421052631579e-06, "loss": 0.7323, "step": 6440 }, { "epoch": 1.84, "grad_norm": 9.266852378845215, "learning_rate": 2.364130434782609e-06, "loss": 0.658, "step": 6441 }, { "epoch": 1.84, "grad_norm": 10.6998929977417, "learning_rate": 2.3598398169336385e-06, "loss": 0.7976, "step": 6442 }, { "epoch": 1.84, "grad_norm": 13.172496795654297, "learning_rate": 2.3555491990846686e-06, "loss": 0.6606, "step": 6443 }, { "epoch": 1.84, "grad_norm": 12.803412437438965, "learning_rate": 2.3512585812356978e-06, "loss": 0.45, "step": 6444 }, { "epoch": 1.84, "grad_norm": 7.840516567230225, "learning_rate": 2.346967963386728e-06, "loss": 0.581, "step": 6445 }, { "epoch": 1.84, "grad_norm": 10.895773887634277, "learning_rate": 2.3426773455377574e-06, "loss": 0.7987, "step": 6446 }, { "epoch": 1.84, "grad_norm": 9.917009353637695, "learning_rate": 2.3383867276887874e-06, "loss": 0.583, "step": 6447 }, { "epoch": 1.84, "grad_norm": 9.434247970581055, "learning_rate": 2.334096109839817e-06, "loss": 0.5384, "step": 6448 }, { "epoch": 1.84, "grad_norm": 12.754484176635742, "learning_rate": 2.329805491990847e-06, "loss": 0.715, "step": 6449 }, { "epoch": 1.84, "grad_norm": 10.924982070922852, "learning_rate": 2.3255148741418763e-06, "loss": 0.7895, "step": 6450 }, { "epoch": 1.85, "grad_norm": 8.170269012451172, "learning_rate": 2.321224256292906e-06, "loss": 0.5094, "step": 6451 }, { "epoch": 1.85, "grad_norm": 7.4521331787109375, "learning_rate": 2.316933638443936e-06, "loss": 0.4345, "step": 6452 }, { "epoch": 1.85, "grad_norm": 10.049761772155762, "learning_rate": 2.3126430205949656e-06, "loss": 0.6747, "step": 6453 }, { "epoch": 1.85, "grad_norm": 10.886943817138672, "learning_rate": 2.3083524027459956e-06, "loss": 0.5234, "step": 6454 }, { "epoch": 1.85, "grad_norm": 11.293790817260742, "learning_rate": 2.3040617848970252e-06, "loss": 0.6757, "step": 6455 }, { "epoch": 1.85, "grad_norm": 9.645736694335938, "learning_rate": 2.2997711670480553e-06, "loss": 0.8438, "step": 6456 }, { "epoch": 1.85, "grad_norm": 14.119002342224121, "learning_rate": 2.2954805491990844e-06, "loss": 0.7814, "step": 6457 }, { "epoch": 1.85, "grad_norm": 9.948843955993652, "learning_rate": 2.2911899313501145e-06, "loss": 0.5564, "step": 6458 }, { "epoch": 1.85, "grad_norm": 9.257054328918457, "learning_rate": 2.286899313501144e-06, "loss": 0.7186, "step": 6459 }, { "epoch": 1.85, "grad_norm": 10.969335556030273, "learning_rate": 2.282608695652174e-06, "loss": 0.8478, "step": 6460 }, { "epoch": 1.85, "grad_norm": 9.461034774780273, "learning_rate": 2.2783180778032038e-06, "loss": 0.7713, "step": 6461 }, { "epoch": 1.85, "grad_norm": 13.996525764465332, "learning_rate": 2.274027459954234e-06, "loss": 0.9145, "step": 6462 }, { "epoch": 1.85, "grad_norm": 12.406022071838379, "learning_rate": 2.2697368421052634e-06, "loss": 0.7289, "step": 6463 }, { "epoch": 1.85, "grad_norm": 7.784307956695557, "learning_rate": 2.265446224256293e-06, "loss": 0.605, "step": 6464 }, { "epoch": 1.85, "grad_norm": 10.131551742553711, "learning_rate": 2.2611556064073226e-06, "loss": 0.621, "step": 6465 }, { "epoch": 1.85, "grad_norm": 10.075953483581543, "learning_rate": 2.2568649885583522e-06, "loss": 0.6698, "step": 6466 }, { "epoch": 1.85, "grad_norm": 11.9581880569458, "learning_rate": 2.2525743707093823e-06, "loss": 0.8044, "step": 6467 }, { "epoch": 1.85, "grad_norm": 10.870621681213379, "learning_rate": 2.248283752860412e-06, "loss": 0.581, "step": 6468 }, { "epoch": 1.85, "grad_norm": 9.622464179992676, "learning_rate": 2.243993135011442e-06, "loss": 0.656, "step": 6469 }, { "epoch": 1.85, "grad_norm": 9.188612937927246, "learning_rate": 2.239702517162471e-06, "loss": 0.7667, "step": 6470 }, { "epoch": 1.85, "grad_norm": 11.247570037841797, "learning_rate": 2.235411899313501e-06, "loss": 0.7746, "step": 6471 }, { "epoch": 1.85, "grad_norm": 13.013686180114746, "learning_rate": 2.2311212814645308e-06, "loss": 0.8872, "step": 6472 }, { "epoch": 1.85, "grad_norm": 9.735376358032227, "learning_rate": 2.226830663615561e-06, "loss": 0.754, "step": 6473 }, { "epoch": 1.85, "grad_norm": 10.814042091369629, "learning_rate": 2.2225400457665904e-06, "loss": 0.8551, "step": 6474 }, { "epoch": 1.85, "grad_norm": 9.536465644836426, "learning_rate": 2.2182494279176205e-06, "loss": 0.7245, "step": 6475 }, { "epoch": 1.85, "grad_norm": 9.123568534851074, "learning_rate": 2.21395881006865e-06, "loss": 0.6765, "step": 6476 }, { "epoch": 1.85, "grad_norm": 10.892561912536621, "learning_rate": 2.2096681922196797e-06, "loss": 0.6025, "step": 6477 }, { "epoch": 1.85, "grad_norm": 8.635826110839844, "learning_rate": 2.2053775743707093e-06, "loss": 0.7246, "step": 6478 }, { "epoch": 1.85, "grad_norm": 12.406291961669922, "learning_rate": 2.2010869565217394e-06, "loss": 0.6637, "step": 6479 }, { "epoch": 1.85, "grad_norm": 10.550618171691895, "learning_rate": 2.196796338672769e-06, "loss": 0.6013, "step": 6480 }, { "epoch": 1.85, "grad_norm": 7.916990280151367, "learning_rate": 2.1925057208237986e-06, "loss": 0.5096, "step": 6481 }, { "epoch": 1.85, "grad_norm": 8.452898025512695, "learning_rate": 2.1882151029748286e-06, "loss": 0.6016, "step": 6482 }, { "epoch": 1.85, "grad_norm": 8.159934997558594, "learning_rate": 2.183924485125858e-06, "loss": 0.7398, "step": 6483 }, { "epoch": 1.85, "grad_norm": 8.472947120666504, "learning_rate": 2.179633867276888e-06, "loss": 0.4693, "step": 6484 }, { "epoch": 1.85, "grad_norm": 10.124168395996094, "learning_rate": 2.1753432494279175e-06, "loss": 0.5472, "step": 6485 }, { "epoch": 1.86, "grad_norm": 7.216623783111572, "learning_rate": 2.1710526315789475e-06, "loss": 0.5199, "step": 6486 }, { "epoch": 1.86, "grad_norm": 11.942208290100098, "learning_rate": 2.166762013729977e-06, "loss": 1.0072, "step": 6487 }, { "epoch": 1.86, "grad_norm": 9.318800926208496, "learning_rate": 2.162471395881007e-06, "loss": 0.6784, "step": 6488 }, { "epoch": 1.86, "grad_norm": 10.21501350402832, "learning_rate": 2.1581807780320368e-06, "loss": 0.5391, "step": 6489 }, { "epoch": 1.86, "grad_norm": 12.667245864868164, "learning_rate": 2.1538901601830664e-06, "loss": 0.6718, "step": 6490 }, { "epoch": 1.86, "grad_norm": 9.556787490844727, "learning_rate": 2.149599542334096e-06, "loss": 0.6574, "step": 6491 }, { "epoch": 1.86, "grad_norm": 12.611247062683105, "learning_rate": 2.145308924485126e-06, "loss": 0.821, "step": 6492 }, { "epoch": 1.86, "grad_norm": 8.203670501708984, "learning_rate": 2.1410183066361557e-06, "loss": 0.5282, "step": 6493 }, { "epoch": 1.86, "grad_norm": 11.463093757629395, "learning_rate": 2.1367276887871857e-06, "loss": 0.5527, "step": 6494 }, { "epoch": 1.86, "grad_norm": 8.550508499145508, "learning_rate": 2.1324370709382153e-06, "loss": 0.5446, "step": 6495 }, { "epoch": 1.86, "grad_norm": 9.182392120361328, "learning_rate": 2.1281464530892445e-06, "loss": 0.676, "step": 6496 }, { "epoch": 1.86, "grad_norm": 10.834150314331055, "learning_rate": 2.1238558352402745e-06, "loss": 0.743, "step": 6497 }, { "epoch": 1.86, "grad_norm": 9.196577072143555, "learning_rate": 2.119565217391304e-06, "loss": 0.7373, "step": 6498 }, { "epoch": 1.86, "grad_norm": 7.15338134765625, "learning_rate": 2.115274599542334e-06, "loss": 0.4727, "step": 6499 }, { "epoch": 1.86, "grad_norm": 10.727302551269531, "learning_rate": 2.110983981693364e-06, "loss": 0.7907, "step": 6500 }, { "epoch": 1.86, "grad_norm": 11.069611549377441, "learning_rate": 2.106693363844394e-06, "loss": 0.5606, "step": 6501 }, { "epoch": 1.86, "grad_norm": 12.03882884979248, "learning_rate": 2.1024027459954235e-06, "loss": 0.8557, "step": 6502 }, { "epoch": 1.86, "grad_norm": 9.859286308288574, "learning_rate": 2.098112128146453e-06, "loss": 0.6137, "step": 6503 }, { "epoch": 1.86, "grad_norm": 11.920726776123047, "learning_rate": 2.0938215102974827e-06, "loss": 0.652, "step": 6504 }, { "epoch": 1.86, "grad_norm": 13.011934280395508, "learning_rate": 2.0895308924485127e-06, "loss": 0.7809, "step": 6505 }, { "epoch": 1.86, "grad_norm": 10.034687995910645, "learning_rate": 2.0852402745995424e-06, "loss": 0.6016, "step": 6506 }, { "epoch": 1.86, "grad_norm": 12.651317596435547, "learning_rate": 2.0809496567505724e-06, "loss": 1.0099, "step": 6507 }, { "epoch": 1.86, "grad_norm": 11.004575729370117, "learning_rate": 2.076659038901602e-06, "loss": 0.621, "step": 6508 }, { "epoch": 1.86, "grad_norm": 11.537083625793457, "learning_rate": 2.0723684210526316e-06, "loss": 0.5622, "step": 6509 }, { "epoch": 1.86, "grad_norm": 8.899933815002441, "learning_rate": 2.0680778032036612e-06, "loss": 0.5059, "step": 6510 }, { "epoch": 1.86, "grad_norm": 10.634422302246094, "learning_rate": 2.063787185354691e-06, "loss": 0.9044, "step": 6511 }, { "epoch": 1.86, "grad_norm": 10.67432689666748, "learning_rate": 2.059496567505721e-06, "loss": 0.5265, "step": 6512 }, { "epoch": 1.86, "grad_norm": 9.965004920959473, "learning_rate": 2.0552059496567505e-06, "loss": 0.7052, "step": 6513 }, { "epoch": 1.86, "grad_norm": 8.582775115966797, "learning_rate": 2.0509153318077805e-06, "loss": 0.6821, "step": 6514 }, { "epoch": 1.86, "grad_norm": 10.898089408874512, "learning_rate": 2.04662471395881e-06, "loss": 0.6636, "step": 6515 }, { "epoch": 1.86, "grad_norm": 9.397239685058594, "learning_rate": 2.0423340961098398e-06, "loss": 0.6129, "step": 6516 }, { "epoch": 1.86, "grad_norm": 10.674189567565918, "learning_rate": 2.0380434782608694e-06, "loss": 0.6215, "step": 6517 }, { "epoch": 1.86, "grad_norm": 8.830999374389648, "learning_rate": 2.0337528604118994e-06, "loss": 0.4642, "step": 6518 }, { "epoch": 1.86, "grad_norm": 10.3596773147583, "learning_rate": 2.029462242562929e-06, "loss": 0.8204, "step": 6519 }, { "epoch": 1.86, "grad_norm": 11.077269554138184, "learning_rate": 2.025171624713959e-06, "loss": 0.6943, "step": 6520 }, { "epoch": 1.87, "grad_norm": 8.061532974243164, "learning_rate": 2.0208810068649887e-06, "loss": 0.5054, "step": 6521 }, { "epoch": 1.87, "grad_norm": 10.03957462310791, "learning_rate": 2.0165903890160187e-06, "loss": 0.7679, "step": 6522 }, { "epoch": 1.87, "grad_norm": 7.75835657119751, "learning_rate": 2.012299771167048e-06, "loss": 0.4455, "step": 6523 }, { "epoch": 1.87, "grad_norm": 9.567909240722656, "learning_rate": 2.008009153318078e-06, "loss": 0.6535, "step": 6524 }, { "epoch": 1.87, "grad_norm": 10.833852767944336, "learning_rate": 2.0037185354691076e-06, "loss": 0.7393, "step": 6525 }, { "epoch": 1.87, "grad_norm": 8.219368934631348, "learning_rate": 1.999427917620137e-06, "loss": 0.5447, "step": 6526 }, { "epoch": 1.87, "grad_norm": 11.239940643310547, "learning_rate": 1.9951372997711672e-06, "loss": 0.8011, "step": 6527 }, { "epoch": 1.87, "grad_norm": 8.554726600646973, "learning_rate": 1.990846681922197e-06, "loss": 0.5626, "step": 6528 }, { "epoch": 1.87, "grad_norm": 8.939287185668945, "learning_rate": 1.9865560640732265e-06, "loss": 0.4421, "step": 6529 }, { "epoch": 1.87, "grad_norm": 7.468092441558838, "learning_rate": 1.982265446224256e-06, "loss": 0.523, "step": 6530 }, { "epoch": 1.87, "grad_norm": 12.858736038208008, "learning_rate": 1.977974828375286e-06, "loss": 0.9184, "step": 6531 }, { "epoch": 1.87, "grad_norm": 9.084821701049805, "learning_rate": 1.9736842105263157e-06, "loss": 0.6705, "step": 6532 }, { "epoch": 1.87, "grad_norm": 11.919846534729004, "learning_rate": 1.9693935926773458e-06, "loss": 0.8067, "step": 6533 }, { "epoch": 1.87, "grad_norm": 8.74809741973877, "learning_rate": 1.9651029748283754e-06, "loss": 0.5937, "step": 6534 }, { "epoch": 1.87, "grad_norm": 9.585488319396973, "learning_rate": 1.9608123569794054e-06, "loss": 0.5007, "step": 6535 }, { "epoch": 1.87, "grad_norm": 10.856274604797363, "learning_rate": 1.9565217391304346e-06, "loss": 0.6984, "step": 6536 }, { "epoch": 1.87, "grad_norm": 9.430891036987305, "learning_rate": 1.9522311212814646e-06, "loss": 0.4644, "step": 6537 }, { "epoch": 1.87, "grad_norm": 12.390891075134277, "learning_rate": 1.9479405034324943e-06, "loss": 0.7448, "step": 6538 }, { "epoch": 1.87, "grad_norm": 11.068594932556152, "learning_rate": 1.9436498855835243e-06, "loss": 0.8033, "step": 6539 }, { "epoch": 1.87, "grad_norm": 10.79172134399414, "learning_rate": 1.939359267734554e-06, "loss": 0.7613, "step": 6540 }, { "epoch": 1.87, "grad_norm": 12.126513481140137, "learning_rate": 1.9350686498855835e-06, "loss": 0.9048, "step": 6541 }, { "epoch": 1.87, "grad_norm": 9.405591011047363, "learning_rate": 1.930778032036613e-06, "loss": 0.4953, "step": 6542 }, { "epoch": 1.87, "grad_norm": 12.395812034606934, "learning_rate": 1.9264874141876428e-06, "loss": 0.9121, "step": 6543 }, { "epoch": 1.87, "grad_norm": 12.50573444366455, "learning_rate": 1.922196796338673e-06, "loss": 0.714, "step": 6544 }, { "epoch": 1.87, "grad_norm": 9.9764986038208, "learning_rate": 1.9179061784897024e-06, "loss": 0.5217, "step": 6545 }, { "epoch": 1.87, "grad_norm": 12.422931671142578, "learning_rate": 1.9136155606407325e-06, "loss": 0.7425, "step": 6546 }, { "epoch": 1.87, "grad_norm": 11.33634090423584, "learning_rate": 1.909324942791762e-06, "loss": 0.7039, "step": 6547 }, { "epoch": 1.87, "grad_norm": 8.913440704345703, "learning_rate": 1.905034324942792e-06, "loss": 0.5332, "step": 6548 }, { "epoch": 1.87, "grad_norm": 10.53049373626709, "learning_rate": 1.9007437070938215e-06, "loss": 0.6939, "step": 6549 }, { "epoch": 1.87, "grad_norm": 9.380077362060547, "learning_rate": 1.8964530892448515e-06, "loss": 0.5322, "step": 6550 }, { "epoch": 1.87, "grad_norm": 8.71501350402832, "learning_rate": 1.892162471395881e-06, "loss": 0.6613, "step": 6551 }, { "epoch": 1.87, "grad_norm": 10.52595043182373, "learning_rate": 1.887871853546911e-06, "loss": 0.8388, "step": 6552 }, { "epoch": 1.87, "grad_norm": 18.139190673828125, "learning_rate": 1.8835812356979406e-06, "loss": 0.7646, "step": 6553 }, { "epoch": 1.87, "grad_norm": 14.185227394104004, "learning_rate": 1.8792906178489704e-06, "loss": 0.8541, "step": 6554 }, { "epoch": 1.88, "grad_norm": 11.55553913116455, "learning_rate": 1.875e-06, "loss": 0.7206, "step": 6555 }, { "epoch": 1.88, "grad_norm": 10.506942749023438, "learning_rate": 1.8707093821510299e-06, "loss": 0.5802, "step": 6556 }, { "epoch": 1.88, "grad_norm": 12.027273178100586, "learning_rate": 1.8664187643020595e-06, "loss": 0.6196, "step": 6557 }, { "epoch": 1.88, "grad_norm": 8.904149055480957, "learning_rate": 1.8621281464530893e-06, "loss": 0.6362, "step": 6558 }, { "epoch": 1.88, "grad_norm": 9.9290132522583, "learning_rate": 1.8578375286041191e-06, "loss": 0.6015, "step": 6559 }, { "epoch": 1.88, "grad_norm": 10.648885726928711, "learning_rate": 1.8535469107551488e-06, "loss": 0.6246, "step": 6560 }, { "epoch": 1.88, "grad_norm": 12.099038124084473, "learning_rate": 1.8492562929061786e-06, "loss": 0.6298, "step": 6561 }, { "epoch": 1.88, "grad_norm": 10.756821632385254, "learning_rate": 1.8449656750572084e-06, "loss": 0.67, "step": 6562 }, { "epoch": 1.88, "grad_norm": 10.574438095092773, "learning_rate": 1.840675057208238e-06, "loss": 0.4984, "step": 6563 }, { "epoch": 1.88, "grad_norm": 13.0107421875, "learning_rate": 1.8363844393592676e-06, "loss": 0.7365, "step": 6564 }, { "epoch": 1.88, "grad_norm": 10.021994590759277, "learning_rate": 1.8320938215102975e-06, "loss": 0.7611, "step": 6565 }, { "epoch": 1.88, "grad_norm": 10.102065086364746, "learning_rate": 1.8278032036613273e-06, "loss": 0.5019, "step": 6566 }, { "epoch": 1.88, "grad_norm": 12.003802299499512, "learning_rate": 1.823512585812357e-06, "loss": 0.8283, "step": 6567 }, { "epoch": 1.88, "grad_norm": 11.736248016357422, "learning_rate": 1.8192219679633867e-06, "loss": 0.7633, "step": 6568 }, { "epoch": 1.88, "grad_norm": 9.837981224060059, "learning_rate": 1.8149313501144166e-06, "loss": 0.761, "step": 6569 }, { "epoch": 1.88, "grad_norm": 13.176420211791992, "learning_rate": 1.8106407322654462e-06, "loss": 0.818, "step": 6570 }, { "epoch": 1.88, "grad_norm": 10.442317008972168, "learning_rate": 1.806350114416476e-06, "loss": 0.4333, "step": 6571 }, { "epoch": 1.88, "grad_norm": 12.715324401855469, "learning_rate": 1.8020594965675058e-06, "loss": 0.8797, "step": 6572 }, { "epoch": 1.88, "grad_norm": 12.06436538696289, "learning_rate": 1.7977688787185357e-06, "loss": 0.9243, "step": 6573 }, { "epoch": 1.88, "grad_norm": 11.596464157104492, "learning_rate": 1.7934782608695653e-06, "loss": 0.6724, "step": 6574 }, { "epoch": 1.88, "grad_norm": 9.028141021728516, "learning_rate": 1.789187643020595e-06, "loss": 0.6722, "step": 6575 }, { "epoch": 1.88, "grad_norm": 12.241581916809082, "learning_rate": 1.784897025171625e-06, "loss": 0.4524, "step": 6576 }, { "epoch": 1.88, "grad_norm": 12.716862678527832, "learning_rate": 1.7806064073226545e-06, "loss": 0.9328, "step": 6577 }, { "epoch": 1.88, "grad_norm": 9.879105567932129, "learning_rate": 1.7763157894736842e-06, "loss": 0.5696, "step": 6578 }, { "epoch": 1.88, "grad_norm": 9.3067626953125, "learning_rate": 1.772025171624714e-06, "loss": 0.6508, "step": 6579 }, { "epoch": 1.88, "grad_norm": 11.868824005126953, "learning_rate": 1.7677345537757436e-06, "loss": 0.7221, "step": 6580 }, { "epoch": 1.88, "grad_norm": 11.533590316772461, "learning_rate": 1.7634439359267734e-06, "loss": 0.658, "step": 6581 }, { "epoch": 1.88, "grad_norm": 9.161964416503906, "learning_rate": 1.7591533180778033e-06, "loss": 0.4541, "step": 6582 }, { "epoch": 1.88, "grad_norm": 11.080299377441406, "learning_rate": 1.7548627002288329e-06, "loss": 0.5937, "step": 6583 }, { "epoch": 1.88, "grad_norm": 11.026849746704102, "learning_rate": 1.7505720823798627e-06, "loss": 0.8369, "step": 6584 }, { "epoch": 1.88, "grad_norm": 13.56468391418457, "learning_rate": 1.7462814645308925e-06, "loss": 0.6955, "step": 6585 }, { "epoch": 1.88, "grad_norm": 11.296615600585938, "learning_rate": 1.7419908466819223e-06, "loss": 0.5353, "step": 6586 }, { "epoch": 1.88, "grad_norm": 12.954611778259277, "learning_rate": 1.737700228832952e-06, "loss": 0.961, "step": 6587 }, { "epoch": 1.88, "grad_norm": 14.633906364440918, "learning_rate": 1.7334096109839818e-06, "loss": 0.8205, "step": 6588 }, { "epoch": 1.88, "grad_norm": 8.348833084106445, "learning_rate": 1.7291189931350116e-06, "loss": 0.5662, "step": 6589 }, { "epoch": 1.89, "grad_norm": 11.917252540588379, "learning_rate": 1.7248283752860412e-06, "loss": 0.7883, "step": 6590 }, { "epoch": 1.89, "grad_norm": 10.018460273742676, "learning_rate": 1.720537757437071e-06, "loss": 0.5761, "step": 6591 }, { "epoch": 1.89, "grad_norm": 9.583654403686523, "learning_rate": 1.7162471395881009e-06, "loss": 0.653, "step": 6592 }, { "epoch": 1.89, "grad_norm": 8.523519515991211, "learning_rate": 1.7119565217391303e-06, "loss": 0.599, "step": 6593 }, { "epoch": 1.89, "grad_norm": 10.856172561645508, "learning_rate": 1.7076659038901601e-06, "loss": 0.6536, "step": 6594 }, { "epoch": 1.89, "grad_norm": 13.230097770690918, "learning_rate": 1.70337528604119e-06, "loss": 0.8298, "step": 6595 }, { "epoch": 1.89, "grad_norm": 8.523895263671875, "learning_rate": 1.6990846681922198e-06, "loss": 0.4115, "step": 6596 }, { "epoch": 1.89, "grad_norm": 10.178942680358887, "learning_rate": 1.6947940503432494e-06, "loss": 0.4652, "step": 6597 }, { "epoch": 1.89, "grad_norm": 9.806584358215332, "learning_rate": 1.6905034324942792e-06, "loss": 0.5653, "step": 6598 }, { "epoch": 1.89, "grad_norm": 9.303384780883789, "learning_rate": 1.686212814645309e-06, "loss": 0.515, "step": 6599 }, { "epoch": 1.89, "grad_norm": 8.282113075256348, "learning_rate": 1.6819221967963386e-06, "loss": 0.6797, "step": 6600 }, { "epoch": 1.89, "grad_norm": 11.019519805908203, "learning_rate": 1.6776315789473685e-06, "loss": 0.7665, "step": 6601 }, { "epoch": 1.89, "grad_norm": 9.183219909667969, "learning_rate": 1.6733409610983983e-06, "loss": 0.4708, "step": 6602 }, { "epoch": 1.89, "grad_norm": 10.683391571044922, "learning_rate": 1.669050343249428e-06, "loss": 0.6266, "step": 6603 }, { "epoch": 1.89, "grad_norm": 10.17486572265625, "learning_rate": 1.6647597254004577e-06, "loss": 0.5132, "step": 6604 }, { "epoch": 1.89, "grad_norm": 10.503311157226562, "learning_rate": 1.6604691075514876e-06, "loss": 0.5902, "step": 6605 }, { "epoch": 1.89, "grad_norm": 10.002145767211914, "learning_rate": 1.6561784897025172e-06, "loss": 0.8209, "step": 6606 }, { "epoch": 1.89, "grad_norm": 9.178977012634277, "learning_rate": 1.651887871853547e-06, "loss": 0.5495, "step": 6607 }, { "epoch": 1.89, "grad_norm": 8.63107681274414, "learning_rate": 1.6475972540045766e-06, "loss": 0.5714, "step": 6608 }, { "epoch": 1.89, "grad_norm": 10.814050674438477, "learning_rate": 1.6433066361556065e-06, "loss": 0.6961, "step": 6609 }, { "epoch": 1.89, "grad_norm": 10.427483558654785, "learning_rate": 1.639016018306636e-06, "loss": 0.6196, "step": 6610 }, { "epoch": 1.89, "grad_norm": 9.812458038330078, "learning_rate": 1.634725400457666e-06, "loss": 0.5838, "step": 6611 }, { "epoch": 1.89, "grad_norm": 10.628884315490723, "learning_rate": 1.6304347826086957e-06, "loss": 0.8683, "step": 6612 }, { "epoch": 1.89, "grad_norm": 9.876553535461426, "learning_rate": 1.6261441647597253e-06, "loss": 0.6857, "step": 6613 }, { "epoch": 1.89, "grad_norm": 8.790066719055176, "learning_rate": 1.6218535469107552e-06, "loss": 0.7048, "step": 6614 }, { "epoch": 1.89, "grad_norm": 10.74987506866455, "learning_rate": 1.617562929061785e-06, "loss": 0.6891, "step": 6615 }, { "epoch": 1.89, "grad_norm": 9.740676879882812, "learning_rate": 1.6132723112128146e-06, "loss": 0.6121, "step": 6616 }, { "epoch": 1.89, "grad_norm": 10.547184944152832, "learning_rate": 1.6089816933638444e-06, "loss": 0.7235, "step": 6617 }, { "epoch": 1.89, "grad_norm": 10.935553550720215, "learning_rate": 1.6046910755148743e-06, "loss": 0.6277, "step": 6618 }, { "epoch": 1.89, "grad_norm": 12.542131423950195, "learning_rate": 1.600400457665904e-06, "loss": 0.6723, "step": 6619 }, { "epoch": 1.89, "grad_norm": 12.385489463806152, "learning_rate": 1.5961098398169337e-06, "loss": 0.8717, "step": 6620 }, { "epoch": 1.89, "grad_norm": 10.140175819396973, "learning_rate": 1.5918192219679635e-06, "loss": 0.5769, "step": 6621 }, { "epoch": 1.89, "grad_norm": 9.585509300231934, "learning_rate": 1.5875286041189934e-06, "loss": 0.4811, "step": 6622 }, { "epoch": 1.89, "grad_norm": 9.377813339233398, "learning_rate": 1.5832379862700228e-06, "loss": 0.5785, "step": 6623 }, { "epoch": 1.89, "grad_norm": 10.11171817779541, "learning_rate": 1.5789473684210526e-06, "loss": 0.5008, "step": 6624 }, { "epoch": 1.9, "grad_norm": 9.451621055603027, "learning_rate": 1.5746567505720824e-06, "loss": 0.6607, "step": 6625 }, { "epoch": 1.9, "grad_norm": 9.769519805908203, "learning_rate": 1.570366132723112e-06, "loss": 0.7866, "step": 6626 }, { "epoch": 1.9, "grad_norm": 10.366878509521484, "learning_rate": 1.5660755148741419e-06, "loss": 0.6435, "step": 6627 }, { "epoch": 1.9, "grad_norm": 9.071807861328125, "learning_rate": 1.5617848970251717e-06, "loss": 0.5372, "step": 6628 }, { "epoch": 1.9, "grad_norm": 10.883258819580078, "learning_rate": 1.5574942791762013e-06, "loss": 0.6525, "step": 6629 }, { "epoch": 1.9, "grad_norm": 9.583721160888672, "learning_rate": 1.5532036613272311e-06, "loss": 0.6439, "step": 6630 }, { "epoch": 1.9, "grad_norm": 15.856637954711914, "learning_rate": 1.548913043478261e-06, "loss": 0.9104, "step": 6631 }, { "epoch": 1.9, "grad_norm": 12.541048049926758, "learning_rate": 1.5446224256292908e-06, "loss": 0.8598, "step": 6632 }, { "epoch": 1.9, "grad_norm": 9.865492820739746, "learning_rate": 1.5403318077803204e-06, "loss": 0.6173, "step": 6633 }, { "epoch": 1.9, "grad_norm": 9.361900329589844, "learning_rate": 1.5360411899313502e-06, "loss": 0.6299, "step": 6634 }, { "epoch": 1.9, "grad_norm": 12.05883502960205, "learning_rate": 1.53175057208238e-06, "loss": 0.6691, "step": 6635 }, { "epoch": 1.9, "grad_norm": 12.530620574951172, "learning_rate": 1.5274599542334097e-06, "loss": 0.8825, "step": 6636 }, { "epoch": 1.9, "grad_norm": 10.966458320617676, "learning_rate": 1.5231693363844395e-06, "loss": 0.5808, "step": 6637 }, { "epoch": 1.9, "grad_norm": 10.090479850769043, "learning_rate": 1.518878718535469e-06, "loss": 0.5668, "step": 6638 }, { "epoch": 1.9, "grad_norm": 11.146265029907227, "learning_rate": 1.5145881006864987e-06, "loss": 0.6695, "step": 6639 }, { "epoch": 1.9, "grad_norm": 9.504698753356934, "learning_rate": 1.5102974828375285e-06, "loss": 0.6011, "step": 6640 }, { "epoch": 1.9, "grad_norm": 9.853711128234863, "learning_rate": 1.5060068649885584e-06, "loss": 0.6912, "step": 6641 }, { "epoch": 1.9, "grad_norm": 12.366127967834473, "learning_rate": 1.5017162471395882e-06, "loss": 0.6353, "step": 6642 }, { "epoch": 1.9, "grad_norm": 10.047423362731934, "learning_rate": 1.4974256292906178e-06, "loss": 0.5828, "step": 6643 }, { "epoch": 1.9, "grad_norm": 10.754246711730957, "learning_rate": 1.4931350114416476e-06, "loss": 0.807, "step": 6644 }, { "epoch": 1.9, "grad_norm": 10.070907592773438, "learning_rate": 1.4888443935926775e-06, "loss": 0.5376, "step": 6645 }, { "epoch": 1.9, "grad_norm": 10.664957046508789, "learning_rate": 1.484553775743707e-06, "loss": 0.5351, "step": 6646 }, { "epoch": 1.9, "grad_norm": 9.667845726013184, "learning_rate": 1.480263157894737e-06, "loss": 0.5598, "step": 6647 }, { "epoch": 1.9, "grad_norm": 9.291950225830078, "learning_rate": 1.4759725400457667e-06, "loss": 0.5589, "step": 6648 }, { "epoch": 1.9, "grad_norm": 11.45644760131836, "learning_rate": 1.4716819221967963e-06, "loss": 0.9333, "step": 6649 }, { "epoch": 1.9, "grad_norm": 11.348915100097656, "learning_rate": 1.4673913043478262e-06, "loss": 0.7001, "step": 6650 }, { "epoch": 1.9, "grad_norm": 12.178325653076172, "learning_rate": 1.463100686498856e-06, "loss": 0.9092, "step": 6651 }, { "epoch": 1.9, "grad_norm": 10.490787506103516, "learning_rate": 1.4588100686498856e-06, "loss": 0.5223, "step": 6652 }, { "epoch": 1.9, "grad_norm": 8.094230651855469, "learning_rate": 1.4545194508009152e-06, "loss": 0.543, "step": 6653 }, { "epoch": 1.9, "grad_norm": 10.10330581665039, "learning_rate": 1.450228832951945e-06, "loss": 0.6563, "step": 6654 }, { "epoch": 1.9, "grad_norm": 11.853269577026367, "learning_rate": 1.4459382151029749e-06, "loss": 0.7333, "step": 6655 }, { "epoch": 1.9, "grad_norm": 11.071447372436523, "learning_rate": 1.4416475972540045e-06, "loss": 0.7413, "step": 6656 }, { "epoch": 1.9, "grad_norm": 11.710362434387207, "learning_rate": 1.4373569794050343e-06, "loss": 0.55, "step": 6657 }, { "epoch": 1.9, "grad_norm": 11.374383926391602, "learning_rate": 1.4330663615560641e-06, "loss": 0.7556, "step": 6658 }, { "epoch": 1.9, "grad_norm": 11.201468467712402, "learning_rate": 1.4287757437070938e-06, "loss": 0.6124, "step": 6659 }, { "epoch": 1.91, "grad_norm": 10.033202171325684, "learning_rate": 1.4244851258581236e-06, "loss": 0.5549, "step": 6660 }, { "epoch": 1.91, "grad_norm": 8.656717300415039, "learning_rate": 1.4201945080091534e-06, "loss": 0.4417, "step": 6661 }, { "epoch": 1.91, "grad_norm": 10.967692375183105, "learning_rate": 1.415903890160183e-06, "loss": 0.8864, "step": 6662 }, { "epoch": 1.91, "grad_norm": 10.697224617004395, "learning_rate": 1.4116132723112129e-06, "loss": 0.6277, "step": 6663 }, { "epoch": 1.91, "grad_norm": 11.160284042358398, "learning_rate": 1.4073226544622427e-06, "loss": 0.6296, "step": 6664 }, { "epoch": 1.91, "grad_norm": 10.617480278015137, "learning_rate": 1.4030320366132725e-06, "loss": 0.6303, "step": 6665 }, { "epoch": 1.91, "grad_norm": 11.47805118560791, "learning_rate": 1.3987414187643021e-06, "loss": 0.588, "step": 6666 }, { "epoch": 1.91, "grad_norm": 9.959935188293457, "learning_rate": 1.394450800915332e-06, "loss": 0.759, "step": 6667 }, { "epoch": 1.91, "grad_norm": 9.430790901184082, "learning_rate": 1.3901601830663618e-06, "loss": 0.6728, "step": 6668 }, { "epoch": 1.91, "grad_norm": 10.529243469238281, "learning_rate": 1.3858695652173912e-06, "loss": 0.494, "step": 6669 }, { "epoch": 1.91, "grad_norm": 10.530109405517578, "learning_rate": 1.381578947368421e-06, "loss": 0.8574, "step": 6670 }, { "epoch": 1.91, "grad_norm": 8.783020973205566, "learning_rate": 1.3772883295194508e-06, "loss": 0.6715, "step": 6671 }, { "epoch": 1.91, "grad_norm": 9.161794662475586, "learning_rate": 1.3729977116704805e-06, "loss": 0.6489, "step": 6672 }, { "epoch": 1.91, "grad_norm": 10.282054901123047, "learning_rate": 1.3687070938215103e-06, "loss": 0.5719, "step": 6673 }, { "epoch": 1.91, "grad_norm": 9.306244850158691, "learning_rate": 1.36441647597254e-06, "loss": 0.7034, "step": 6674 }, { "epoch": 1.91, "grad_norm": 12.614920616149902, "learning_rate": 1.3601258581235697e-06, "loss": 0.6356, "step": 6675 }, { "epoch": 1.91, "grad_norm": 10.753173828125, "learning_rate": 1.3558352402745995e-06, "loss": 0.7029, "step": 6676 }, { "epoch": 1.91, "grad_norm": 10.913387298583984, "learning_rate": 1.3515446224256294e-06, "loss": 0.6301, "step": 6677 }, { "epoch": 1.91, "grad_norm": 10.129559516906738, "learning_rate": 1.3472540045766592e-06, "loss": 0.7076, "step": 6678 }, { "epoch": 1.91, "grad_norm": 7.9117608070373535, "learning_rate": 1.3429633867276888e-06, "loss": 0.427, "step": 6679 }, { "epoch": 1.91, "grad_norm": 11.85035228729248, "learning_rate": 1.3386727688787186e-06, "loss": 0.6972, "step": 6680 }, { "epoch": 1.91, "grad_norm": 8.53018569946289, "learning_rate": 1.3343821510297485e-06, "loss": 0.5078, "step": 6681 }, { "epoch": 1.91, "grad_norm": 8.861907958984375, "learning_rate": 1.330091533180778e-06, "loss": 0.6506, "step": 6682 }, { "epoch": 1.91, "grad_norm": 9.757502555847168, "learning_rate": 1.325800915331808e-06, "loss": 0.6035, "step": 6683 }, { "epoch": 1.91, "grad_norm": 9.364039421081543, "learning_rate": 1.3215102974828375e-06, "loss": 0.5463, "step": 6684 }, { "epoch": 1.91, "grad_norm": 9.148898124694824, "learning_rate": 1.3172196796338671e-06, "loss": 0.538, "step": 6685 }, { "epoch": 1.91, "grad_norm": 11.64253044128418, "learning_rate": 1.312929061784897e-06, "loss": 0.6017, "step": 6686 }, { "epoch": 1.91, "grad_norm": 8.908527374267578, "learning_rate": 1.3086384439359268e-06, "loss": 0.4138, "step": 6687 }, { "epoch": 1.91, "grad_norm": 9.46848201751709, "learning_rate": 1.3043478260869566e-06, "loss": 0.7496, "step": 6688 }, { "epoch": 1.91, "grad_norm": 10.075117111206055, "learning_rate": 1.3000572082379862e-06, "loss": 0.8197, "step": 6689 }, { "epoch": 1.91, "grad_norm": 11.487811088562012, "learning_rate": 1.295766590389016e-06, "loss": 0.7324, "step": 6690 }, { "epoch": 1.91, "grad_norm": 8.055896759033203, "learning_rate": 1.2914759725400459e-06, "loss": 0.3772, "step": 6691 }, { "epoch": 1.91, "grad_norm": 9.442093849182129, "learning_rate": 1.2871853546910755e-06, "loss": 0.4689, "step": 6692 }, { "epoch": 1.91, "grad_norm": 10.520005226135254, "learning_rate": 1.2828947368421053e-06, "loss": 0.6256, "step": 6693 }, { "epoch": 1.91, "grad_norm": 11.794526100158691, "learning_rate": 1.2786041189931352e-06, "loss": 0.636, "step": 6694 }, { "epoch": 1.92, "grad_norm": 9.728378295898438, "learning_rate": 1.2743135011441648e-06, "loss": 0.6543, "step": 6695 }, { "epoch": 1.92, "grad_norm": 10.640963554382324, "learning_rate": 1.2700228832951946e-06, "loss": 0.6148, "step": 6696 }, { "epoch": 1.92, "grad_norm": 11.699413299560547, "learning_rate": 1.2657322654462244e-06, "loss": 0.766, "step": 6697 }, { "epoch": 1.92, "grad_norm": 9.714559555053711, "learning_rate": 1.261441647597254e-06, "loss": 0.5721, "step": 6698 }, { "epoch": 1.92, "grad_norm": 8.230494499206543, "learning_rate": 1.2571510297482837e-06, "loss": 0.5532, "step": 6699 }, { "epoch": 1.92, "grad_norm": 9.15118408203125, "learning_rate": 1.2528604118993135e-06, "loss": 0.5442, "step": 6700 }, { "epoch": 1.92, "grad_norm": 10.485254287719727, "learning_rate": 1.2485697940503433e-06, "loss": 0.6268, "step": 6701 }, { "epoch": 1.92, "grad_norm": 10.303457260131836, "learning_rate": 1.244279176201373e-06, "loss": 0.7238, "step": 6702 }, { "epoch": 1.92, "grad_norm": 10.692706108093262, "learning_rate": 1.2399885583524027e-06, "loss": 0.4998, "step": 6703 }, { "epoch": 1.92, "grad_norm": 14.482782363891602, "learning_rate": 1.2356979405034326e-06, "loss": 0.9019, "step": 6704 }, { "epoch": 1.92, "grad_norm": 9.645691871643066, "learning_rate": 1.2314073226544622e-06, "loss": 0.707, "step": 6705 }, { "epoch": 1.92, "grad_norm": 10.623063087463379, "learning_rate": 1.227116704805492e-06, "loss": 0.843, "step": 6706 }, { "epoch": 1.92, "grad_norm": 10.0907621383667, "learning_rate": 1.2228260869565218e-06, "loss": 0.8514, "step": 6707 }, { "epoch": 1.92, "grad_norm": 11.34632396697998, "learning_rate": 1.2185354691075515e-06, "loss": 0.853, "step": 6708 }, { "epoch": 1.92, "grad_norm": 8.570047378540039, "learning_rate": 1.2142448512585813e-06, "loss": 0.4797, "step": 6709 }, { "epoch": 1.92, "grad_norm": 9.260835647583008, "learning_rate": 1.2099542334096111e-06, "loss": 0.6946, "step": 6710 }, { "epoch": 1.92, "grad_norm": 9.483461380004883, "learning_rate": 1.205663615560641e-06, "loss": 0.3813, "step": 6711 }, { "epoch": 1.92, "grad_norm": 11.182538032531738, "learning_rate": 1.2013729977116706e-06, "loss": 0.6669, "step": 6712 }, { "epoch": 1.92, "grad_norm": 8.788228034973145, "learning_rate": 1.1970823798627004e-06, "loss": 0.3281, "step": 6713 }, { "epoch": 1.92, "grad_norm": 11.311713218688965, "learning_rate": 1.19279176201373e-06, "loss": 0.5773, "step": 6714 }, { "epoch": 1.92, "grad_norm": 12.638751029968262, "learning_rate": 1.1885011441647596e-06, "loss": 1.0074, "step": 6715 }, { "epoch": 1.92, "grad_norm": 10.119768142700195, "learning_rate": 1.1842105263157894e-06, "loss": 0.6024, "step": 6716 }, { "epoch": 1.92, "grad_norm": 9.68479061126709, "learning_rate": 1.1799199084668193e-06, "loss": 0.6385, "step": 6717 }, { "epoch": 1.92, "grad_norm": 8.239176750183105, "learning_rate": 1.1756292906178489e-06, "loss": 0.524, "step": 6718 }, { "epoch": 1.92, "grad_norm": 10.428190231323242, "learning_rate": 1.1713386727688787e-06, "loss": 0.6015, "step": 6719 }, { "epoch": 1.92, "grad_norm": 9.850186347961426, "learning_rate": 1.1670480549199085e-06, "loss": 0.7612, "step": 6720 }, { "epoch": 1.92, "grad_norm": 12.244690895080566, "learning_rate": 1.1627574370709381e-06, "loss": 0.7536, "step": 6721 }, { "epoch": 1.92, "grad_norm": 10.301718711853027, "learning_rate": 1.158466819221968e-06, "loss": 0.6864, "step": 6722 }, { "epoch": 1.92, "grad_norm": 11.460555076599121, "learning_rate": 1.1541762013729978e-06, "loss": 0.6202, "step": 6723 }, { "epoch": 1.92, "grad_norm": 9.115154266357422, "learning_rate": 1.1498855835240276e-06, "loss": 0.6418, "step": 6724 }, { "epoch": 1.92, "grad_norm": 11.595759391784668, "learning_rate": 1.1455949656750572e-06, "loss": 0.624, "step": 6725 }, { "epoch": 1.92, "grad_norm": 12.14228343963623, "learning_rate": 1.141304347826087e-06, "loss": 0.7796, "step": 6726 }, { "epoch": 1.92, "grad_norm": 10.284402847290039, "learning_rate": 1.137013729977117e-06, "loss": 0.5669, "step": 6727 }, { "epoch": 1.92, "grad_norm": 9.583091735839844, "learning_rate": 1.1327231121281465e-06, "loss": 0.5784, "step": 6728 }, { "epoch": 1.92, "grad_norm": 11.306744575500488, "learning_rate": 1.1284324942791761e-06, "loss": 0.7993, "step": 6729 }, { "epoch": 1.93, "grad_norm": 19.395160675048828, "learning_rate": 1.124141876430206e-06, "loss": 0.7529, "step": 6730 }, { "epoch": 1.93, "grad_norm": 7.858729362487793, "learning_rate": 1.1198512585812356e-06, "loss": 0.6799, "step": 6731 }, { "epoch": 1.93, "grad_norm": 10.490175247192383, "learning_rate": 1.1155606407322654e-06, "loss": 0.8072, "step": 6732 }, { "epoch": 1.93, "grad_norm": 10.685630798339844, "learning_rate": 1.1112700228832952e-06, "loss": 0.5765, "step": 6733 }, { "epoch": 1.93, "grad_norm": 11.851250648498535, "learning_rate": 1.106979405034325e-06, "loss": 0.5958, "step": 6734 }, { "epoch": 1.93, "grad_norm": 10.589179039001465, "learning_rate": 1.1026887871853547e-06, "loss": 0.7273, "step": 6735 }, { "epoch": 1.93, "grad_norm": 8.347055435180664, "learning_rate": 1.0983981693363845e-06, "loss": 0.6655, "step": 6736 }, { "epoch": 1.93, "grad_norm": 10.162137985229492, "learning_rate": 1.0941075514874143e-06, "loss": 0.7468, "step": 6737 }, { "epoch": 1.93, "grad_norm": 11.438544273376465, "learning_rate": 1.089816933638444e-06, "loss": 0.7288, "step": 6738 }, { "epoch": 1.93, "grad_norm": 8.79272747039795, "learning_rate": 1.0855263157894738e-06, "loss": 0.6498, "step": 6739 }, { "epoch": 1.93, "grad_norm": 10.488850593566895, "learning_rate": 1.0812356979405036e-06, "loss": 0.6367, "step": 6740 }, { "epoch": 1.93, "grad_norm": 8.263250350952148, "learning_rate": 1.0769450800915332e-06, "loss": 0.5896, "step": 6741 }, { "epoch": 1.93, "grad_norm": 11.923199653625488, "learning_rate": 1.072654462242563e-06, "loss": 0.788, "step": 6742 }, { "epoch": 1.93, "grad_norm": 9.139735221862793, "learning_rate": 1.0683638443935929e-06, "loss": 0.5462, "step": 6743 }, { "epoch": 1.93, "grad_norm": 10.757696151733398, "learning_rate": 1.0640732265446223e-06, "loss": 0.6614, "step": 6744 }, { "epoch": 1.93, "grad_norm": 8.452191352844238, "learning_rate": 1.059782608695652e-06, "loss": 0.5517, "step": 6745 }, { "epoch": 1.93, "grad_norm": 11.318562507629395, "learning_rate": 1.055491990846682e-06, "loss": 0.6681, "step": 6746 }, { "epoch": 1.93, "grad_norm": 11.440497398376465, "learning_rate": 1.0512013729977117e-06, "loss": 0.7208, "step": 6747 }, { "epoch": 1.93, "grad_norm": 11.99815845489502, "learning_rate": 1.0469107551487413e-06, "loss": 0.6689, "step": 6748 }, { "epoch": 1.93, "grad_norm": 10.977968215942383, "learning_rate": 1.0426201372997712e-06, "loss": 0.5295, "step": 6749 }, { "epoch": 1.93, "grad_norm": 12.640535354614258, "learning_rate": 1.038329519450801e-06, "loss": 0.8584, "step": 6750 }, { "epoch": 1.93, "grad_norm": 44.93844985961914, "learning_rate": 1.0340389016018306e-06, "loss": 0.5963, "step": 6751 }, { "epoch": 1.93, "grad_norm": 11.811992645263672, "learning_rate": 1.0297482837528604e-06, "loss": 0.6537, "step": 6752 }, { "epoch": 1.93, "grad_norm": 9.109289169311523, "learning_rate": 1.0254576659038903e-06, "loss": 0.6972, "step": 6753 }, { "epoch": 1.93, "grad_norm": 12.60461711883545, "learning_rate": 1.0211670480549199e-06, "loss": 0.8511, "step": 6754 }, { "epoch": 1.93, "grad_norm": 10.249258995056152, "learning_rate": 1.0168764302059497e-06, "loss": 0.9507, "step": 6755 }, { "epoch": 1.93, "grad_norm": 11.223470687866211, "learning_rate": 1.0125858123569795e-06, "loss": 0.7589, "step": 6756 }, { "epoch": 1.93, "grad_norm": 11.984776496887207, "learning_rate": 1.0082951945080094e-06, "loss": 0.6934, "step": 6757 }, { "epoch": 1.93, "grad_norm": 12.150588035583496, "learning_rate": 1.004004576659039e-06, "loss": 0.6201, "step": 6758 }, { "epoch": 1.93, "grad_norm": 10.411704063415527, "learning_rate": 9.997139588100686e-07, "loss": 0.61, "step": 6759 }, { "epoch": 1.93, "grad_norm": 10.817537307739258, "learning_rate": 9.954233409610984e-07, "loss": 0.7173, "step": 6760 }, { "epoch": 1.93, "grad_norm": 10.725568771362305, "learning_rate": 9.91132723112128e-07, "loss": 0.6655, "step": 6761 }, { "epoch": 1.93, "grad_norm": 9.771780014038086, "learning_rate": 9.868421052631579e-07, "loss": 0.7, "step": 6762 }, { "epoch": 1.93, "grad_norm": 9.953685760498047, "learning_rate": 9.825514874141877e-07, "loss": 0.641, "step": 6763 }, { "epoch": 1.93, "grad_norm": 11.242582321166992, "learning_rate": 9.782608695652173e-07, "loss": 0.5187, "step": 6764 }, { "epoch": 1.94, "grad_norm": 11.4192533493042, "learning_rate": 9.739702517162471e-07, "loss": 0.5498, "step": 6765 }, { "epoch": 1.94, "grad_norm": 9.859990119934082, "learning_rate": 9.69679633867277e-07, "loss": 0.7167, "step": 6766 }, { "epoch": 1.94, "grad_norm": 15.310134887695312, "learning_rate": 9.653890160183066e-07, "loss": 1.1055, "step": 6767 }, { "epoch": 1.94, "grad_norm": 9.646717071533203, "learning_rate": 9.610983981693364e-07, "loss": 0.4438, "step": 6768 }, { "epoch": 1.94, "grad_norm": 10.532123565673828, "learning_rate": 9.568077803203662e-07, "loss": 0.6696, "step": 6769 }, { "epoch": 1.94, "grad_norm": 15.500516891479492, "learning_rate": 9.52517162471396e-07, "loss": 0.9405, "step": 6770 }, { "epoch": 1.94, "grad_norm": 11.07739543914795, "learning_rate": 9.482265446224258e-07, "loss": 0.8091, "step": 6771 }, { "epoch": 1.94, "grad_norm": 10.713896751403809, "learning_rate": 9.439359267734555e-07, "loss": 0.6927, "step": 6772 }, { "epoch": 1.94, "grad_norm": 10.74335765838623, "learning_rate": 9.396453089244852e-07, "loss": 0.6765, "step": 6773 }, { "epoch": 1.94, "grad_norm": 10.386445999145508, "learning_rate": 9.353546910755149e-07, "loss": 0.6199, "step": 6774 }, { "epoch": 1.94, "grad_norm": 9.495280265808105, "learning_rate": 9.310640732265447e-07, "loss": 0.4989, "step": 6775 }, { "epoch": 1.94, "grad_norm": 8.543712615966797, "learning_rate": 9.267734553775744e-07, "loss": 0.6049, "step": 6776 }, { "epoch": 1.94, "grad_norm": 12.661754608154297, "learning_rate": 9.224828375286042e-07, "loss": 0.8616, "step": 6777 }, { "epoch": 1.94, "grad_norm": 9.792861938476562, "learning_rate": 9.181922196796338e-07, "loss": 0.592, "step": 6778 }, { "epoch": 1.94, "grad_norm": 10.058728218078613, "learning_rate": 9.139016018306636e-07, "loss": 0.5957, "step": 6779 }, { "epoch": 1.94, "grad_norm": 11.858846664428711, "learning_rate": 9.096109839816934e-07, "loss": 0.8671, "step": 6780 }, { "epoch": 1.94, "grad_norm": 6.873499870300293, "learning_rate": 9.053203661327231e-07, "loss": 0.3545, "step": 6781 }, { "epoch": 1.94, "grad_norm": 10.451911926269531, "learning_rate": 9.010297482837529e-07, "loss": 0.6086, "step": 6782 }, { "epoch": 1.94, "grad_norm": 10.603643417358398, "learning_rate": 8.967391304347826e-07, "loss": 0.6285, "step": 6783 }, { "epoch": 1.94, "grad_norm": 11.095579147338867, "learning_rate": 8.924485125858125e-07, "loss": 0.6823, "step": 6784 }, { "epoch": 1.94, "grad_norm": 9.788519859313965, "learning_rate": 8.881578947368421e-07, "loss": 0.7352, "step": 6785 }, { "epoch": 1.94, "grad_norm": 11.36791706085205, "learning_rate": 8.838672768878718e-07, "loss": 0.7521, "step": 6786 }, { "epoch": 1.94, "grad_norm": 10.876045227050781, "learning_rate": 8.795766590389016e-07, "loss": 0.6124, "step": 6787 }, { "epoch": 1.94, "grad_norm": 14.375767707824707, "learning_rate": 8.752860411899313e-07, "loss": 0.8835, "step": 6788 }, { "epoch": 1.94, "grad_norm": 8.53582763671875, "learning_rate": 8.709954233409612e-07, "loss": 0.673, "step": 6789 }, { "epoch": 1.94, "grad_norm": 8.364534378051758, "learning_rate": 8.667048054919909e-07, "loss": 0.5763, "step": 6790 }, { "epoch": 1.94, "grad_norm": 16.54009246826172, "learning_rate": 8.624141876430206e-07, "loss": 0.6385, "step": 6791 }, { "epoch": 1.94, "grad_norm": 9.698732376098633, "learning_rate": 8.581235697940504e-07, "loss": 0.6774, "step": 6792 }, { "epoch": 1.94, "grad_norm": 10.97348690032959, "learning_rate": 8.538329519450801e-07, "loss": 0.6796, "step": 6793 }, { "epoch": 1.94, "grad_norm": 9.030004501342773, "learning_rate": 8.495423340961099e-07, "loss": 0.6908, "step": 6794 }, { "epoch": 1.94, "grad_norm": 11.053754806518555, "learning_rate": 8.452517162471396e-07, "loss": 0.9486, "step": 6795 }, { "epoch": 1.94, "grad_norm": 14.873150825500488, "learning_rate": 8.409610983981693e-07, "loss": 0.8398, "step": 6796 }, { "epoch": 1.94, "grad_norm": 11.530095100402832, "learning_rate": 8.366704805491992e-07, "loss": 0.6613, "step": 6797 }, { "epoch": 1.94, "grad_norm": 12.763251304626465, "learning_rate": 8.323798627002289e-07, "loss": 0.8406, "step": 6798 }, { "epoch": 1.94, "grad_norm": 11.035303115844727, "learning_rate": 8.280892448512586e-07, "loss": 0.579, "step": 6799 }, { "epoch": 1.95, "grad_norm": 7.569090366363525, "learning_rate": 8.237986270022883e-07, "loss": 0.3526, "step": 6800 }, { "epoch": 1.95, "grad_norm": 9.218209266662598, "learning_rate": 8.19508009153318e-07, "loss": 0.6879, "step": 6801 }, { "epoch": 1.95, "grad_norm": 12.47700023651123, "learning_rate": 8.152173913043479e-07, "loss": 0.8903, "step": 6802 }, { "epoch": 1.95, "grad_norm": 10.173751831054688, "learning_rate": 8.109267734553776e-07, "loss": 0.6248, "step": 6803 }, { "epoch": 1.95, "grad_norm": 11.127490997314453, "learning_rate": 8.066361556064073e-07, "loss": 0.84, "step": 6804 }, { "epoch": 1.95, "grad_norm": 9.903820991516113, "learning_rate": 8.023455377574371e-07, "loss": 0.5848, "step": 6805 }, { "epoch": 1.95, "grad_norm": 11.57451343536377, "learning_rate": 7.980549199084668e-07, "loss": 0.7, "step": 6806 }, { "epoch": 1.95, "grad_norm": 11.393851280212402, "learning_rate": 7.937643020594967e-07, "loss": 0.8057, "step": 6807 }, { "epoch": 1.95, "grad_norm": 8.562248229980469, "learning_rate": 7.894736842105263e-07, "loss": 0.5488, "step": 6808 }, { "epoch": 1.95, "grad_norm": 13.203279495239258, "learning_rate": 7.85183066361556e-07, "loss": 0.9368, "step": 6809 }, { "epoch": 1.95, "grad_norm": 9.013036727905273, "learning_rate": 7.808924485125858e-07, "loss": 0.6234, "step": 6810 }, { "epoch": 1.95, "grad_norm": 10.600117683410645, "learning_rate": 7.766018306636156e-07, "loss": 0.8523, "step": 6811 }, { "epoch": 1.95, "grad_norm": 9.700066566467285, "learning_rate": 7.723112128146454e-07, "loss": 0.6776, "step": 6812 }, { "epoch": 1.95, "grad_norm": 10.107597351074219, "learning_rate": 7.680205949656751e-07, "loss": 0.6422, "step": 6813 }, { "epoch": 1.95, "grad_norm": 9.581838607788086, "learning_rate": 7.637299771167048e-07, "loss": 0.609, "step": 6814 }, { "epoch": 1.95, "grad_norm": 9.373744010925293, "learning_rate": 7.594393592677345e-07, "loss": 0.5848, "step": 6815 }, { "epoch": 1.95, "grad_norm": 8.906309127807617, "learning_rate": 7.551487414187643e-07, "loss": 0.7027, "step": 6816 }, { "epoch": 1.95, "grad_norm": 9.41042709350586, "learning_rate": 7.508581235697941e-07, "loss": 0.5145, "step": 6817 }, { "epoch": 1.95, "grad_norm": 9.03809642791748, "learning_rate": 7.465675057208238e-07, "loss": 0.5115, "step": 6818 }, { "epoch": 1.95, "grad_norm": 10.127779006958008, "learning_rate": 7.422768878718535e-07, "loss": 0.6582, "step": 6819 }, { "epoch": 1.95, "grad_norm": 11.648848533630371, "learning_rate": 7.379862700228834e-07, "loss": 0.7816, "step": 6820 }, { "epoch": 1.95, "grad_norm": 11.141276359558105, "learning_rate": 7.336956521739131e-07, "loss": 0.6873, "step": 6821 }, { "epoch": 1.95, "grad_norm": 12.081658363342285, "learning_rate": 7.294050343249428e-07, "loss": 0.7483, "step": 6822 }, { "epoch": 1.95, "grad_norm": 8.0576810836792, "learning_rate": 7.251144164759725e-07, "loss": 0.69, "step": 6823 }, { "epoch": 1.95, "grad_norm": 12.894620895385742, "learning_rate": 7.208237986270022e-07, "loss": 0.8952, "step": 6824 }, { "epoch": 1.95, "grad_norm": 9.190361976623535, "learning_rate": 7.165331807780321e-07, "loss": 0.5655, "step": 6825 }, { "epoch": 1.95, "grad_norm": 10.179404258728027, "learning_rate": 7.122425629290618e-07, "loss": 0.6606, "step": 6826 }, { "epoch": 1.95, "grad_norm": 9.90598201751709, "learning_rate": 7.079519450800915e-07, "loss": 0.8001, "step": 6827 }, { "epoch": 1.95, "grad_norm": 9.589546203613281, "learning_rate": 7.036613272311213e-07, "loss": 0.5828, "step": 6828 }, { "epoch": 1.95, "grad_norm": 9.023877143859863, "learning_rate": 6.993707093821511e-07, "loss": 0.5469, "step": 6829 }, { "epoch": 1.95, "grad_norm": 10.734456062316895, "learning_rate": 6.950800915331809e-07, "loss": 0.6977, "step": 6830 }, { "epoch": 1.95, "grad_norm": 10.52493667602539, "learning_rate": 6.907894736842105e-07, "loss": 0.7481, "step": 6831 }, { "epoch": 1.95, "grad_norm": 10.465811729431152, "learning_rate": 6.864988558352402e-07, "loss": 0.5465, "step": 6832 }, { "epoch": 1.95, "grad_norm": 9.976741790771484, "learning_rate": 6.8220823798627e-07, "loss": 0.8391, "step": 6833 }, { "epoch": 1.95, "grad_norm": 11.496618270874023, "learning_rate": 6.779176201372998e-07, "loss": 0.6632, "step": 6834 }, { "epoch": 1.96, "grad_norm": 7.427152633666992, "learning_rate": 6.736270022883296e-07, "loss": 0.4168, "step": 6835 }, { "epoch": 1.96, "grad_norm": 9.783970832824707, "learning_rate": 6.693363844393593e-07, "loss": 0.7624, "step": 6836 }, { "epoch": 1.96, "grad_norm": 9.616044044494629, "learning_rate": 6.65045766590389e-07, "loss": 0.5893, "step": 6837 }, { "epoch": 1.96, "grad_norm": 9.780241012573242, "learning_rate": 6.607551487414188e-07, "loss": 0.4578, "step": 6838 }, { "epoch": 1.96, "grad_norm": 9.057365417480469, "learning_rate": 6.564645308924485e-07, "loss": 0.5626, "step": 6839 }, { "epoch": 1.96, "grad_norm": 10.086860656738281, "learning_rate": 6.521739130434783e-07, "loss": 0.6514, "step": 6840 }, { "epoch": 1.96, "grad_norm": 10.18759822845459, "learning_rate": 6.47883295194508e-07, "loss": 0.5814, "step": 6841 }, { "epoch": 1.96, "grad_norm": 11.207710266113281, "learning_rate": 6.435926773455378e-07, "loss": 0.5791, "step": 6842 }, { "epoch": 1.96, "grad_norm": 10.417292594909668, "learning_rate": 6.393020594965676e-07, "loss": 0.6957, "step": 6843 }, { "epoch": 1.96, "grad_norm": 10.373567581176758, "learning_rate": 6.350114416475973e-07, "loss": 0.6666, "step": 6844 }, { "epoch": 1.96, "grad_norm": 9.936010360717773, "learning_rate": 6.30720823798627e-07, "loss": 0.6347, "step": 6845 }, { "epoch": 1.96, "grad_norm": 11.712944984436035, "learning_rate": 6.264302059496567e-07, "loss": 0.8248, "step": 6846 }, { "epoch": 1.96, "grad_norm": 11.301887512207031, "learning_rate": 6.221395881006865e-07, "loss": 0.7441, "step": 6847 }, { "epoch": 1.96, "grad_norm": 11.015470504760742, "learning_rate": 6.178489702517163e-07, "loss": 0.7018, "step": 6848 }, { "epoch": 1.96, "grad_norm": 10.960253715515137, "learning_rate": 6.13558352402746e-07, "loss": 0.5765, "step": 6849 }, { "epoch": 1.96, "grad_norm": 10.221090316772461, "learning_rate": 6.092677345537757e-07, "loss": 0.638, "step": 6850 }, { "epoch": 1.96, "grad_norm": 8.547921180725098, "learning_rate": 6.049771167048056e-07, "loss": 0.536, "step": 6851 }, { "epoch": 1.96, "grad_norm": 11.917266845703125, "learning_rate": 6.006864988558353e-07, "loss": 0.699, "step": 6852 }, { "epoch": 1.96, "grad_norm": 10.198384284973145, "learning_rate": 5.96395881006865e-07, "loss": 0.6332, "step": 6853 }, { "epoch": 1.96, "grad_norm": 8.864692687988281, "learning_rate": 5.921052631578947e-07, "loss": 0.5656, "step": 6854 }, { "epoch": 1.96, "grad_norm": 12.42785358428955, "learning_rate": 5.878146453089244e-07, "loss": 0.6115, "step": 6855 }, { "epoch": 1.96, "grad_norm": 11.457018852233887, "learning_rate": 5.835240274599543e-07, "loss": 0.4691, "step": 6856 }, { "epoch": 1.96, "grad_norm": 11.069910049438477, "learning_rate": 5.79233409610984e-07, "loss": 0.5925, "step": 6857 }, { "epoch": 1.96, "grad_norm": 10.123159408569336, "learning_rate": 5.749427917620138e-07, "loss": 0.6451, "step": 6858 }, { "epoch": 1.96, "grad_norm": 9.3661527633667, "learning_rate": 5.706521739130435e-07, "loss": 0.7237, "step": 6859 }, { "epoch": 1.96, "grad_norm": 8.912443161010742, "learning_rate": 5.663615560640733e-07, "loss": 0.6893, "step": 6860 }, { "epoch": 1.96, "grad_norm": 8.892996788024902, "learning_rate": 5.62070938215103e-07, "loss": 0.6673, "step": 6861 }, { "epoch": 1.96, "grad_norm": 8.603639602661133, "learning_rate": 5.577803203661327e-07, "loss": 0.3919, "step": 6862 }, { "epoch": 1.96, "grad_norm": 9.034506797790527, "learning_rate": 5.534897025171625e-07, "loss": 0.7098, "step": 6863 }, { "epoch": 1.96, "grad_norm": 9.981705665588379, "learning_rate": 5.491990846681922e-07, "loss": 0.7897, "step": 6864 }, { "epoch": 1.96, "grad_norm": 11.018440246582031, "learning_rate": 5.44908466819222e-07, "loss": 0.8389, "step": 6865 }, { "epoch": 1.96, "grad_norm": 11.106205940246582, "learning_rate": 5.406178489702518e-07, "loss": 0.8287, "step": 6866 }, { "epoch": 1.96, "grad_norm": 11.349292755126953, "learning_rate": 5.363272311212815e-07, "loss": 0.7209, "step": 6867 }, { "epoch": 1.96, "grad_norm": 11.872751235961914, "learning_rate": 5.320366132723111e-07, "loss": 0.6976, "step": 6868 }, { "epoch": 1.96, "grad_norm": 12.316161155700684, "learning_rate": 5.27745995423341e-07, "loss": 0.5499, "step": 6869 }, { "epoch": 1.97, "grad_norm": 8.931777000427246, "learning_rate": 5.234553775743707e-07, "loss": 0.6789, "step": 6870 }, { "epoch": 1.97, "grad_norm": 10.08340835571289, "learning_rate": 5.191647597254005e-07, "loss": 0.7906, "step": 6871 }, { "epoch": 1.97, "grad_norm": 12.119122505187988, "learning_rate": 5.148741418764302e-07, "loss": 0.8094, "step": 6872 }, { "epoch": 1.97, "grad_norm": 14.020956039428711, "learning_rate": 5.105835240274599e-07, "loss": 0.746, "step": 6873 }, { "epoch": 1.97, "grad_norm": 8.066704750061035, "learning_rate": 5.062929061784898e-07, "loss": 0.5912, "step": 6874 }, { "epoch": 1.97, "grad_norm": 8.146242141723633, "learning_rate": 5.020022883295195e-07, "loss": 0.399, "step": 6875 }, { "epoch": 1.97, "grad_norm": 9.605037689208984, "learning_rate": 4.977116704805492e-07, "loss": 0.5589, "step": 6876 }, { "epoch": 1.97, "grad_norm": 10.58935260772705, "learning_rate": 4.934210526315789e-07, "loss": 0.6743, "step": 6877 }, { "epoch": 1.97, "grad_norm": 12.243191719055176, "learning_rate": 4.891304347826087e-07, "loss": 0.6987, "step": 6878 }, { "epoch": 1.97, "grad_norm": 10.33033561706543, "learning_rate": 4.848398169336385e-07, "loss": 0.5374, "step": 6879 }, { "epoch": 1.97, "grad_norm": 10.57116413116455, "learning_rate": 4.805491990846682e-07, "loss": 0.8575, "step": 6880 }, { "epoch": 1.97, "grad_norm": 10.175572395324707, "learning_rate": 4.76258581235698e-07, "loss": 0.626, "step": 6881 }, { "epoch": 1.97, "grad_norm": 10.430510520935059, "learning_rate": 4.7196796338672775e-07, "loss": 0.6818, "step": 6882 }, { "epoch": 1.97, "grad_norm": 11.097426414489746, "learning_rate": 4.6767734553775747e-07, "loss": 0.8006, "step": 6883 }, { "epoch": 1.97, "grad_norm": 9.713412284851074, "learning_rate": 4.633867276887872e-07, "loss": 0.6842, "step": 6884 }, { "epoch": 1.97, "grad_norm": 14.023730278015137, "learning_rate": 4.590961098398169e-07, "loss": 0.7831, "step": 6885 }, { "epoch": 1.97, "grad_norm": 10.488276481628418, "learning_rate": 4.548054919908467e-07, "loss": 0.5933, "step": 6886 }, { "epoch": 1.97, "grad_norm": 8.53012466430664, "learning_rate": 4.5051487414187646e-07, "loss": 0.4922, "step": 6887 }, { "epoch": 1.97, "grad_norm": 11.886279106140137, "learning_rate": 4.4622425629290623e-07, "loss": 0.5628, "step": 6888 }, { "epoch": 1.97, "grad_norm": 9.568771362304688, "learning_rate": 4.419336384439359e-07, "loss": 0.5714, "step": 6889 }, { "epoch": 1.97, "grad_norm": 10.269181251525879, "learning_rate": 4.3764302059496567e-07, "loss": 0.586, "step": 6890 }, { "epoch": 1.97, "grad_norm": 11.323017120361328, "learning_rate": 4.3335240274599545e-07, "loss": 0.7212, "step": 6891 }, { "epoch": 1.97, "grad_norm": 11.437891006469727, "learning_rate": 4.290617848970252e-07, "loss": 0.6863, "step": 6892 }, { "epoch": 1.97, "grad_norm": 8.55978012084961, "learning_rate": 4.2477116704805494e-07, "loss": 0.5286, "step": 6893 }, { "epoch": 1.97, "grad_norm": 8.444185256958008, "learning_rate": 4.2048054919908466e-07, "loss": 0.4725, "step": 6894 }, { "epoch": 1.97, "grad_norm": 11.7064847946167, "learning_rate": 4.1618993135011444e-07, "loss": 0.6294, "step": 6895 }, { "epoch": 1.97, "grad_norm": 10.098320007324219, "learning_rate": 4.1189931350114416e-07, "loss": 0.8641, "step": 6896 }, { "epoch": 1.97, "grad_norm": 10.265768051147461, "learning_rate": 4.0760869565217393e-07, "loss": 0.5418, "step": 6897 }, { "epoch": 1.97, "grad_norm": 10.396321296691895, "learning_rate": 4.0331807780320365e-07, "loss": 0.7486, "step": 6898 }, { "epoch": 1.97, "grad_norm": 9.897319793701172, "learning_rate": 3.990274599542334e-07, "loss": 0.6535, "step": 6899 }, { "epoch": 1.97, "grad_norm": 9.139906883239746, "learning_rate": 3.9473684210526315e-07, "loss": 0.5608, "step": 6900 }, { "epoch": 1.97, "grad_norm": 11.586169242858887, "learning_rate": 3.904462242562929e-07, "loss": 0.8048, "step": 6901 }, { "epoch": 1.97, "grad_norm": 10.12833309173584, "learning_rate": 3.861556064073227e-07, "loss": 0.6463, "step": 6902 }, { "epoch": 1.97, "grad_norm": 11.549627304077148, "learning_rate": 3.818649885583524e-07, "loss": 0.5828, "step": 6903 }, { "epoch": 1.97, "grad_norm": 9.387853622436523, "learning_rate": 3.7757437070938213e-07, "loss": 0.5356, "step": 6904 }, { "epoch": 1.98, "grad_norm": 10.681439399719238, "learning_rate": 3.732837528604119e-07, "loss": 0.5921, "step": 6905 }, { "epoch": 1.98, "grad_norm": 10.00284481048584, "learning_rate": 3.689931350114417e-07, "loss": 0.6901, "step": 6906 }, { "epoch": 1.98, "grad_norm": 10.689186096191406, "learning_rate": 3.647025171624714e-07, "loss": 0.6819, "step": 6907 }, { "epoch": 1.98, "grad_norm": 7.789943695068359, "learning_rate": 3.604118993135011e-07, "loss": 0.5717, "step": 6908 }, { "epoch": 1.98, "grad_norm": 12.032934188842773, "learning_rate": 3.561212814645309e-07, "loss": 0.864, "step": 6909 }, { "epoch": 1.98, "grad_norm": 9.302270889282227, "learning_rate": 3.5183066361556067e-07, "loss": 0.5825, "step": 6910 }, { "epoch": 1.98, "grad_norm": 5.9612908363342285, "learning_rate": 3.4754004576659044e-07, "loss": 0.3212, "step": 6911 }, { "epoch": 1.98, "grad_norm": 11.921589851379395, "learning_rate": 3.432494279176201e-07, "loss": 0.7346, "step": 6912 }, { "epoch": 1.98, "grad_norm": 10.079961776733398, "learning_rate": 3.389588100686499e-07, "loss": 0.782, "step": 6913 }, { "epoch": 1.98, "grad_norm": 10.618403434753418, "learning_rate": 3.3466819221967966e-07, "loss": 0.7526, "step": 6914 }, { "epoch": 1.98, "grad_norm": 11.24497127532959, "learning_rate": 3.303775743707094e-07, "loss": 0.7186, "step": 6915 }, { "epoch": 1.98, "grad_norm": 9.965060234069824, "learning_rate": 3.2608695652173915e-07, "loss": 0.6016, "step": 6916 }, { "epoch": 1.98, "grad_norm": 10.062822341918945, "learning_rate": 3.217963386727689e-07, "loss": 0.5654, "step": 6917 }, { "epoch": 1.98, "grad_norm": 11.358135223388672, "learning_rate": 3.1750572082379865e-07, "loss": 0.7088, "step": 6918 }, { "epoch": 1.98, "grad_norm": 9.111773490905762, "learning_rate": 3.1321510297482837e-07, "loss": 0.7644, "step": 6919 }, { "epoch": 1.98, "grad_norm": 9.192373275756836, "learning_rate": 3.0892448512585814e-07, "loss": 0.6394, "step": 6920 }, { "epoch": 1.98, "grad_norm": 12.135351181030273, "learning_rate": 3.0463386727688786e-07, "loss": 0.8185, "step": 6921 }, { "epoch": 1.98, "grad_norm": 11.259346008300781, "learning_rate": 3.0034324942791764e-07, "loss": 0.6089, "step": 6922 }, { "epoch": 1.98, "grad_norm": 7.142961502075195, "learning_rate": 2.9605263157894736e-07, "loss": 0.4448, "step": 6923 }, { "epoch": 1.98, "grad_norm": 8.596514701843262, "learning_rate": 2.9176201372997713e-07, "loss": 0.603, "step": 6924 }, { "epoch": 1.98, "grad_norm": 10.275640487670898, "learning_rate": 2.874713958810069e-07, "loss": 0.9107, "step": 6925 }, { "epoch": 1.98, "grad_norm": 12.552618980407715, "learning_rate": 2.8318077803203663e-07, "loss": 0.7594, "step": 6926 }, { "epoch": 1.98, "grad_norm": 7.591946601867676, "learning_rate": 2.7889016018306635e-07, "loss": 0.5472, "step": 6927 }, { "epoch": 1.98, "grad_norm": 9.330859184265137, "learning_rate": 2.745995423340961e-07, "loss": 0.6062, "step": 6928 }, { "epoch": 1.98, "grad_norm": 11.907567024230957, "learning_rate": 2.703089244851259e-07, "loss": 0.6068, "step": 6929 }, { "epoch": 1.98, "grad_norm": 10.675895690917969, "learning_rate": 2.6601830663615556e-07, "loss": 0.683, "step": 6930 }, { "epoch": 1.98, "grad_norm": 10.48024845123291, "learning_rate": 2.6172768878718534e-07, "loss": 0.4274, "step": 6931 }, { "epoch": 1.98, "grad_norm": 8.749180793762207, "learning_rate": 2.574370709382151e-07, "loss": 0.622, "step": 6932 }, { "epoch": 1.98, "grad_norm": 10.966500282287598, "learning_rate": 2.531464530892449e-07, "loss": 0.6093, "step": 6933 }, { "epoch": 1.98, "grad_norm": 8.568903923034668, "learning_rate": 2.488558352402746e-07, "loss": 0.6865, "step": 6934 }, { "epoch": 1.98, "grad_norm": 12.321759223937988, "learning_rate": 2.445652173913043e-07, "loss": 0.86, "step": 6935 }, { "epoch": 1.98, "grad_norm": 11.141681671142578, "learning_rate": 2.402745995423341e-07, "loss": 0.7193, "step": 6936 }, { "epoch": 1.98, "grad_norm": 8.963074684143066, "learning_rate": 2.3598398169336387e-07, "loss": 0.4704, "step": 6937 }, { "epoch": 1.98, "grad_norm": 10.483250617980957, "learning_rate": 2.316933638443936e-07, "loss": 0.6919, "step": 6938 }, { "epoch": 1.98, "grad_norm": 11.217721939086914, "learning_rate": 2.2740274599542334e-07, "loss": 0.6015, "step": 6939 }, { "epoch": 1.99, "grad_norm": 9.580672264099121, "learning_rate": 2.2311212814645312e-07, "loss": 0.5452, "step": 6940 }, { "epoch": 1.99, "grad_norm": 8.676939964294434, "learning_rate": 2.1882151029748284e-07, "loss": 0.4453, "step": 6941 }, { "epoch": 1.99, "grad_norm": 9.502946853637695, "learning_rate": 2.145308924485126e-07, "loss": 0.5431, "step": 6942 }, { "epoch": 1.99, "grad_norm": 7.5875773429870605, "learning_rate": 2.1024027459954233e-07, "loss": 0.5096, "step": 6943 }, { "epoch": 1.99, "grad_norm": 9.894153594970703, "learning_rate": 2.0594965675057208e-07, "loss": 0.6333, "step": 6944 }, { "epoch": 1.99, "grad_norm": 11.48495101928711, "learning_rate": 2.0165903890160183e-07, "loss": 0.6755, "step": 6945 }, { "epoch": 1.99, "grad_norm": 9.783747673034668, "learning_rate": 1.9736842105263157e-07, "loss": 0.6055, "step": 6946 }, { "epoch": 1.99, "grad_norm": 10.02064323425293, "learning_rate": 1.9307780320366135e-07, "loss": 0.4871, "step": 6947 }, { "epoch": 1.99, "grad_norm": 12.362141609191895, "learning_rate": 1.8878718535469107e-07, "loss": 0.6577, "step": 6948 }, { "epoch": 1.99, "grad_norm": 13.172623634338379, "learning_rate": 1.8449656750572084e-07, "loss": 0.6607, "step": 6949 }, { "epoch": 1.99, "grad_norm": 8.153654098510742, "learning_rate": 1.8020594965675056e-07, "loss": 0.5096, "step": 6950 }, { "epoch": 1.99, "grad_norm": 9.872180938720703, "learning_rate": 1.7591533180778034e-07, "loss": 0.5401, "step": 6951 }, { "epoch": 1.99, "grad_norm": 8.815258026123047, "learning_rate": 1.7162471395881006e-07, "loss": 0.5247, "step": 6952 }, { "epoch": 1.99, "grad_norm": 7.608972549438477, "learning_rate": 1.6733409610983983e-07, "loss": 0.4153, "step": 6953 }, { "epoch": 1.99, "grad_norm": 10.867547035217285, "learning_rate": 1.6304347826086958e-07, "loss": 0.7146, "step": 6954 }, { "epoch": 1.99, "grad_norm": 8.751923561096191, "learning_rate": 1.5875286041189932e-07, "loss": 0.4785, "step": 6955 }, { "epoch": 1.99, "grad_norm": 12.871894836425781, "learning_rate": 1.5446224256292907e-07, "loss": 0.7724, "step": 6956 }, { "epoch": 1.99, "grad_norm": 12.138284683227539, "learning_rate": 1.5017162471395882e-07, "loss": 0.687, "step": 6957 }, { "epoch": 1.99, "grad_norm": 12.697810173034668, "learning_rate": 1.4588100686498857e-07, "loss": 0.7714, "step": 6958 }, { "epoch": 1.99, "grad_norm": 10.738144874572754, "learning_rate": 1.4159038901601831e-07, "loss": 0.7474, "step": 6959 }, { "epoch": 1.99, "grad_norm": 12.630233764648438, "learning_rate": 1.3729977116704806e-07, "loss": 0.7768, "step": 6960 }, { "epoch": 1.99, "grad_norm": 10.851924896240234, "learning_rate": 1.3300915331807778e-07, "loss": 0.6323, "step": 6961 }, { "epoch": 1.99, "grad_norm": 11.448753356933594, "learning_rate": 1.2871853546910756e-07, "loss": 0.5869, "step": 6962 }, { "epoch": 1.99, "grad_norm": 10.769323348999023, "learning_rate": 1.244279176201373e-07, "loss": 0.6573, "step": 6963 }, { "epoch": 1.99, "grad_norm": 12.800044059753418, "learning_rate": 1.2013729977116705e-07, "loss": 0.6271, "step": 6964 }, { "epoch": 1.99, "grad_norm": 11.841652870178223, "learning_rate": 1.158466819221968e-07, "loss": 0.7032, "step": 6965 }, { "epoch": 1.99, "grad_norm": 11.031814575195312, "learning_rate": 1.1155606407322656e-07, "loss": 0.6932, "step": 6966 }, { "epoch": 1.99, "grad_norm": 10.714957237243652, "learning_rate": 1.072654462242563e-07, "loss": 0.6247, "step": 6967 }, { "epoch": 1.99, "grad_norm": 12.333422660827637, "learning_rate": 1.0297482837528604e-07, "loss": 0.8033, "step": 6968 }, { "epoch": 1.99, "grad_norm": 10.396112442016602, "learning_rate": 9.868421052631579e-08, "loss": 0.6173, "step": 6969 }, { "epoch": 1.99, "grad_norm": 10.016500473022461, "learning_rate": 9.439359267734553e-08, "loss": 0.5538, "step": 6970 }, { "epoch": 1.99, "grad_norm": 10.614705085754395, "learning_rate": 9.010297482837528e-08, "loss": 0.717, "step": 6971 }, { "epoch": 1.99, "grad_norm": 7.811626434326172, "learning_rate": 8.581235697940503e-08, "loss": 0.5426, "step": 6972 }, { "epoch": 1.99, "grad_norm": 9.46379566192627, "learning_rate": 8.152173913043479e-08, "loss": 0.57, "step": 6973 }, { "epoch": 1.99, "grad_norm": 10.381284713745117, "learning_rate": 7.723112128146454e-08, "loss": 0.6539, "step": 6974 }, { "epoch": 2.0, "grad_norm": 10.76805591583252, "learning_rate": 7.294050343249428e-08, "loss": 0.7282, "step": 6975 }, { "epoch": 2.0, "grad_norm": 9.52723217010498, "learning_rate": 6.864988558352403e-08, "loss": 0.6539, "step": 6976 }, { "epoch": 2.0, "grad_norm": 10.24339485168457, "learning_rate": 6.435926773455378e-08, "loss": 0.6859, "step": 6977 }, { "epoch": 2.0, "grad_norm": 11.314688682556152, "learning_rate": 6.006864988558353e-08, "loss": 0.6341, "step": 6978 }, { "epoch": 2.0, "grad_norm": 11.507288932800293, "learning_rate": 5.577803203661328e-08, "loss": 0.8784, "step": 6979 }, { "epoch": 2.0, "grad_norm": 9.23391342163086, "learning_rate": 5.148741418764302e-08, "loss": 0.6483, "step": 6980 }, { "epoch": 2.0, "grad_norm": 13.069464683532715, "learning_rate": 4.719679633867277e-08, "loss": 0.7351, "step": 6981 }, { "epoch": 2.0, "grad_norm": 12.40121078491211, "learning_rate": 4.2906178489702514e-08, "loss": 0.7474, "step": 6982 }, { "epoch": 2.0, "grad_norm": 11.74376392364502, "learning_rate": 3.861556064073227e-08, "loss": 0.7727, "step": 6983 }, { "epoch": 2.0, "grad_norm": 10.733787536621094, "learning_rate": 3.4324942791762015e-08, "loss": 0.6788, "step": 6984 }, { "epoch": 2.0, "grad_norm": 10.416940689086914, "learning_rate": 3.003432494279176e-08, "loss": 0.5102, "step": 6985 }, { "epoch": 2.0, "grad_norm": 9.378046989440918, "learning_rate": 2.574370709382151e-08, "loss": 0.5839, "step": 6986 }, { "epoch": 2.0, "grad_norm": 13.186521530151367, "learning_rate": 2.1453089244851257e-08, "loss": 0.5744, "step": 6987 }, { "epoch": 2.0, "grad_norm": 12.878933906555176, "learning_rate": 1.7162471395881008e-08, "loss": 0.8448, "step": 6988 }, { "epoch": 2.0, "grad_norm": 9.82333755493164, "learning_rate": 1.2871853546910755e-08, "loss": 0.451, "step": 6989 }, { "epoch": 2.0, "grad_norm": 10.506118774414062, "learning_rate": 8.581235697940504e-09, "loss": 0.6907, "step": 6990 }, { "epoch": 2.0, "grad_norm": 9.47684383392334, "learning_rate": 4.290617848970252e-09, "loss": 0.6083, "step": 6991 }, { "epoch": 2.0, "grad_norm": 14.73945140838623, "learning_rate": 0.0, "loss": 0.8721, "step": 6992 }, { "epoch": 2.0, "step": 6992, "total_flos": 1.315371792860375e+17, "train_loss": 1.0098489519694578, "train_runtime": 2148.6486, "train_samples_per_second": 312.383, "train_steps_per_second": 3.254 } ], "logging_steps": 1.0, "max_steps": 6992, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.315371792860375e+17, "train_batch_size": 96, "trial_name": null, "trial_params": null }