diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.428289573531644, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00030353619669145547, + "grad_norm": 9.52797794342041, + "learning_rate": 1e-05, + "loss": 5.0165, + "step": 1 + }, + { + "epoch": 0.0006070723933829109, + "grad_norm": 10.161993026733398, + "learning_rate": 2e-05, + "loss": 4.7408, + "step": 2 + }, + { + "epoch": 0.0009106085900743664, + "grad_norm": 7.550526142120361, + "learning_rate": 3e-05, + "loss": 5.0209, + "step": 3 + }, + { + "epoch": 0.0012141447867658219, + "grad_norm": 5.454105377197266, + "learning_rate": 4e-05, + "loss": 4.4045, + "step": 4 + }, + { + "epoch": 0.0015176809834572772, + "grad_norm": 3.715569257736206, + "learning_rate": 5e-05, + "loss": 4.0617, + "step": 5 + }, + { + "epoch": 0.0018212171801487327, + "grad_norm": 3.9456210136413574, + "learning_rate": 6e-05, + "loss": 3.813, + "step": 6 + }, + { + "epoch": 0.002124753376840188, + "grad_norm": 3.6386630535125732, + "learning_rate": 7e-05, + "loss": 3.5396, + "step": 7 + }, + { + "epoch": 0.0024282895735316438, + "grad_norm": 2.5329768657684326, + "learning_rate": 8e-05, + "loss": 3.1611, + "step": 8 + }, + { + "epoch": 0.002731825770223099, + "grad_norm": 2.09954571723938, + "learning_rate": 9e-05, + "loss": 2.8787, + "step": 9 + }, + { + "epoch": 0.0030353619669145544, + "grad_norm": 2.0083999633789062, + "learning_rate": 0.0001, + "loss": 2.6942, + "step": 10 + }, + { + "epoch": 0.00333889816360601, + "grad_norm": 1.419735074043274, + "learning_rate": 9.999392466585664e-05, + "loss": 2.5674, + "step": 11 + }, + { + "epoch": 0.0036424343602974654, + "grad_norm": 3.0809664726257324, + "learning_rate": 9.998784933171324e-05, + "loss": 2.2474, + "step": 12 + }, + { + "epoch": 0.003945970556988921, + "grad_norm": 1.4494595527648926, + "learning_rate": 9.998177399756987e-05, + "loss": 2.1931, + "step": 13 + }, + { + "epoch": 0.004249506753680376, + "grad_norm": 1.4052276611328125, + "learning_rate": 9.99756986634265e-05, + "loss": 2.2179, + "step": 14 + }, + { + "epoch": 0.004553042950371832, + "grad_norm": 1.0900732278823853, + "learning_rate": 9.996962332928312e-05, + "loss": 2.3455, + "step": 15 + }, + { + "epoch": 0.0048565791470632875, + "grad_norm": 1.078604817390442, + "learning_rate": 9.996354799513974e-05, + "loss": 2.1906, + "step": 16 + }, + { + "epoch": 0.005160115343754742, + "grad_norm": 1.0777554512023926, + "learning_rate": 9.995747266099635e-05, + "loss": 2.4069, + "step": 17 + }, + { + "epoch": 0.005463651540446198, + "grad_norm": 1.2703579664230347, + "learning_rate": 9.995139732685298e-05, + "loss": 2.1901, + "step": 18 + }, + { + "epoch": 0.005767187737137654, + "grad_norm": 2.06676983833313, + "learning_rate": 9.99453219927096e-05, + "loss": 2.4616, + "step": 19 + }, + { + "epoch": 0.006070723933829109, + "grad_norm": 1.0544441938400269, + "learning_rate": 9.993924665856622e-05, + "loss": 1.9529, + "step": 20 + }, + { + "epoch": 0.0063742601305205645, + "grad_norm": 1.1237947940826416, + "learning_rate": 9.993317132442285e-05, + "loss": 2.8619, + "step": 21 + }, + { + "epoch": 0.00667779632721202, + "grad_norm": 0.8750623464584351, + "learning_rate": 9.992709599027947e-05, + "loss": 2.0467, + "step": 22 + }, + { + "epoch": 0.006981332523903475, + "grad_norm": 0.8135535717010498, + "learning_rate": 9.992102065613608e-05, + "loss": 2.3566, + "step": 23 + }, + { + "epoch": 0.007284868720594931, + "grad_norm": 0.8838183879852295, + "learning_rate": 9.991494532199271e-05, + "loss": 2.2145, + "step": 24 + }, + { + "epoch": 0.007588404917286387, + "grad_norm": 0.7460266351699829, + "learning_rate": 9.990886998784935e-05, + "loss": 1.7996, + "step": 25 + }, + { + "epoch": 0.007891941113977842, + "grad_norm": 0.7469210028648376, + "learning_rate": 9.990279465370595e-05, + "loss": 1.9147, + "step": 26 + }, + { + "epoch": 0.008195477310669297, + "grad_norm": 0.796752393245697, + "learning_rate": 9.989671931956258e-05, + "loss": 1.6982, + "step": 27 + }, + { + "epoch": 0.008499013507360752, + "grad_norm": 0.9568108916282654, + "learning_rate": 9.989064398541921e-05, + "loss": 2.294, + "step": 28 + }, + { + "epoch": 0.008802549704052209, + "grad_norm": 0.7790305018424988, + "learning_rate": 9.988456865127583e-05, + "loss": 1.7283, + "step": 29 + }, + { + "epoch": 0.009106085900743664, + "grad_norm": 0.5705334544181824, + "learning_rate": 9.987849331713245e-05, + "loss": 2.0917, + "step": 30 + }, + { + "epoch": 0.009409622097435118, + "grad_norm": 0.8099403381347656, + "learning_rate": 9.987241798298906e-05, + "loss": 2.1187, + "step": 31 + }, + { + "epoch": 0.009713158294126575, + "grad_norm": 0.728687584400177, + "learning_rate": 9.98663426488457e-05, + "loss": 2.0019, + "step": 32 + }, + { + "epoch": 0.01001669449081803, + "grad_norm": 0.7341739535331726, + "learning_rate": 9.986026731470231e-05, + "loss": 2.2379, + "step": 33 + }, + { + "epoch": 0.010320230687509485, + "grad_norm": 0.6100563406944275, + "learning_rate": 9.985419198055893e-05, + "loss": 2.2385, + "step": 34 + }, + { + "epoch": 0.010623766884200941, + "grad_norm": 0.57859206199646, + "learning_rate": 9.984811664641556e-05, + "loss": 1.9331, + "step": 35 + }, + { + "epoch": 0.010927303080892396, + "grad_norm": 0.5878285765647888, + "learning_rate": 9.984204131227218e-05, + "loss": 1.8691, + "step": 36 + }, + { + "epoch": 0.011230839277583851, + "grad_norm": 0.5095940232276917, + "learning_rate": 9.98359659781288e-05, + "loss": 2.1699, + "step": 37 + }, + { + "epoch": 0.011534375474275308, + "grad_norm": 0.5028595924377441, + "learning_rate": 9.982989064398542e-05, + "loss": 1.6226, + "step": 38 + }, + { + "epoch": 0.011837911670966763, + "grad_norm": 0.6969617009162903, + "learning_rate": 9.982381530984206e-05, + "loss": 1.7049, + "step": 39 + }, + { + "epoch": 0.012141447867658217, + "grad_norm": 0.6432283520698547, + "learning_rate": 9.981773997569866e-05, + "loss": 1.645, + "step": 40 + }, + { + "epoch": 0.012444984064349674, + "grad_norm": 0.5575637221336365, + "learning_rate": 9.981166464155529e-05, + "loss": 2.122, + "step": 41 + }, + { + "epoch": 0.012748520261041129, + "grad_norm": 0.8630117177963257, + "learning_rate": 9.980558930741192e-05, + "loss": 2.4391, + "step": 42 + }, + { + "epoch": 0.013052056457732584, + "grad_norm": 0.7215672135353088, + "learning_rate": 9.979951397326854e-05, + "loss": 1.6903, + "step": 43 + }, + { + "epoch": 0.01335559265442404, + "grad_norm": 0.6649103164672852, + "learning_rate": 9.979343863912516e-05, + "loss": 2.0192, + "step": 44 + }, + { + "epoch": 0.013659128851115495, + "grad_norm": 0.7561375498771667, + "learning_rate": 9.978736330498177e-05, + "loss": 2.1745, + "step": 45 + }, + { + "epoch": 0.01396266504780695, + "grad_norm": 1.5740697383880615, + "learning_rate": 9.97812879708384e-05, + "loss": 2.4883, + "step": 46 + }, + { + "epoch": 0.014266201244498407, + "grad_norm": 0.49843546748161316, + "learning_rate": 9.977521263669502e-05, + "loss": 2.3359, + "step": 47 + }, + { + "epoch": 0.014569737441189862, + "grad_norm": 0.6524083018302917, + "learning_rate": 9.976913730255164e-05, + "loss": 2.0283, + "step": 48 + }, + { + "epoch": 0.014873273637881317, + "grad_norm": 0.5995165705680847, + "learning_rate": 9.976306196840827e-05, + "loss": 2.2854, + "step": 49 + }, + { + "epoch": 0.015176809834572773, + "grad_norm": 0.533091127872467, + "learning_rate": 9.975698663426489e-05, + "loss": 1.7316, + "step": 50 + }, + { + "epoch": 0.015480346031264228, + "grad_norm": 0.4611203372478485, + "learning_rate": 9.97509113001215e-05, + "loss": 1.9873, + "step": 51 + }, + { + "epoch": 0.015783882227955685, + "grad_norm": 0.5517066121101379, + "learning_rate": 9.974483596597813e-05, + "loss": 2.3221, + "step": 52 + }, + { + "epoch": 0.01608741842464714, + "grad_norm": 1.1481316089630127, + "learning_rate": 9.973876063183477e-05, + "loss": 2.1987, + "step": 53 + }, + { + "epoch": 0.016390954621338594, + "grad_norm": 0.5169709324836731, + "learning_rate": 9.973268529769137e-05, + "loss": 2.0714, + "step": 54 + }, + { + "epoch": 0.01669449081803005, + "grad_norm": 0.5325965881347656, + "learning_rate": 9.9726609963548e-05, + "loss": 2.0495, + "step": 55 + }, + { + "epoch": 0.016998027014721504, + "grad_norm": 0.5272805690765381, + "learning_rate": 9.972053462940463e-05, + "loss": 1.6467, + "step": 56 + }, + { + "epoch": 0.017301563211412962, + "grad_norm": 0.5756974816322327, + "learning_rate": 9.971445929526125e-05, + "loss": 1.8153, + "step": 57 + }, + { + "epoch": 0.017605099408104417, + "grad_norm": 0.49965259432792664, + "learning_rate": 9.970838396111787e-05, + "loss": 1.7549, + "step": 58 + }, + { + "epoch": 0.017908635604795872, + "grad_norm": 0.4551718235015869, + "learning_rate": 9.970230862697448e-05, + "loss": 2.0268, + "step": 59 + }, + { + "epoch": 0.018212171801487327, + "grad_norm": 0.4995061159133911, + "learning_rate": 9.969623329283111e-05, + "loss": 2.1194, + "step": 60 + }, + { + "epoch": 0.018515707998178782, + "grad_norm": 0.6005909442901611, + "learning_rate": 9.969015795868773e-05, + "loss": 2.0995, + "step": 61 + }, + { + "epoch": 0.018819244194870237, + "grad_norm": 0.5313609838485718, + "learning_rate": 9.968408262454435e-05, + "loss": 2.2653, + "step": 62 + }, + { + "epoch": 0.019122780391561695, + "grad_norm": 0.4645906388759613, + "learning_rate": 9.967800729040098e-05, + "loss": 2.0501, + "step": 63 + }, + { + "epoch": 0.01942631658825315, + "grad_norm": 0.4981083869934082, + "learning_rate": 9.96719319562576e-05, + "loss": 1.2802, + "step": 64 + }, + { + "epoch": 0.019729852784944605, + "grad_norm": 0.7034462094306946, + "learning_rate": 9.966585662211421e-05, + "loss": 1.8468, + "step": 65 + }, + { + "epoch": 0.02003338898163606, + "grad_norm": 0.5249907374382019, + "learning_rate": 9.965978128797084e-05, + "loss": 1.7569, + "step": 66 + }, + { + "epoch": 0.020336925178327515, + "grad_norm": 0.7569686770439148, + "learning_rate": 9.965370595382748e-05, + "loss": 2.2157, + "step": 67 + }, + { + "epoch": 0.02064046137501897, + "grad_norm": 0.7423145174980164, + "learning_rate": 9.964763061968408e-05, + "loss": 1.961, + "step": 68 + }, + { + "epoch": 0.020943997571710428, + "grad_norm": 0.6891425251960754, + "learning_rate": 9.964155528554071e-05, + "loss": 2.0419, + "step": 69 + }, + { + "epoch": 0.021247533768401883, + "grad_norm": 0.5633382201194763, + "learning_rate": 9.963547995139734e-05, + "loss": 1.7981, + "step": 70 + }, + { + "epoch": 0.021551069965093338, + "grad_norm": 0.4792400598526001, + "learning_rate": 9.962940461725396e-05, + "loss": 1.7946, + "step": 71 + }, + { + "epoch": 0.021854606161784793, + "grad_norm": 0.43436333537101746, + "learning_rate": 9.962332928311058e-05, + "loss": 1.9947, + "step": 72 + }, + { + "epoch": 0.022158142358476247, + "grad_norm": 0.511132001876831, + "learning_rate": 9.96172539489672e-05, + "loss": 1.8512, + "step": 73 + }, + { + "epoch": 0.022461678555167702, + "grad_norm": 0.5628978610038757, + "learning_rate": 9.961117861482382e-05, + "loss": 2.3258, + "step": 74 + }, + { + "epoch": 0.02276521475185916, + "grad_norm": 0.5179631114006042, + "learning_rate": 9.960510328068044e-05, + "loss": 2.1093, + "step": 75 + }, + { + "epoch": 0.023068750948550616, + "grad_norm": 0.45745086669921875, + "learning_rate": 9.959902794653706e-05, + "loss": 2.0091, + "step": 76 + }, + { + "epoch": 0.02337228714524207, + "grad_norm": 0.49223676323890686, + "learning_rate": 9.959295261239369e-05, + "loss": 1.8603, + "step": 77 + }, + { + "epoch": 0.023675823341933525, + "grad_norm": 0.44269105792045593, + "learning_rate": 9.958687727825031e-05, + "loss": 2.0431, + "step": 78 + }, + { + "epoch": 0.02397935953862498, + "grad_norm": 0.45361781120300293, + "learning_rate": 9.958080194410692e-05, + "loss": 2.1202, + "step": 79 + }, + { + "epoch": 0.024282895735316435, + "grad_norm": 0.4460793137550354, + "learning_rate": 9.957472660996356e-05, + "loss": 2.0395, + "step": 80 + }, + { + "epoch": 0.024586431932007893, + "grad_norm": 0.3576311767101288, + "learning_rate": 9.956865127582019e-05, + "loss": 1.6929, + "step": 81 + }, + { + "epoch": 0.024889968128699348, + "grad_norm": 0.47067755460739136, + "learning_rate": 9.956257594167679e-05, + "loss": 1.7033, + "step": 82 + }, + { + "epoch": 0.025193504325390803, + "grad_norm": 0.40777909755706787, + "learning_rate": 9.955650060753342e-05, + "loss": 1.8594, + "step": 83 + }, + { + "epoch": 0.025497040522082258, + "grad_norm": 0.4231606721878052, + "learning_rate": 9.955042527339005e-05, + "loss": 1.9413, + "step": 84 + }, + { + "epoch": 0.025800576718773713, + "grad_norm": 0.4901526868343353, + "learning_rate": 9.954434993924666e-05, + "loss": 1.5754, + "step": 85 + }, + { + "epoch": 0.026104112915465168, + "grad_norm": 0.4473549723625183, + "learning_rate": 9.953827460510329e-05, + "loss": 2.0099, + "step": 86 + }, + { + "epoch": 0.026407649112156626, + "grad_norm": 0.4234200119972229, + "learning_rate": 9.95321992709599e-05, + "loss": 1.9134, + "step": 87 + }, + { + "epoch": 0.02671118530884808, + "grad_norm": 0.497842937707901, + "learning_rate": 9.952612393681653e-05, + "loss": 1.5014, + "step": 88 + }, + { + "epoch": 0.027014721505539536, + "grad_norm": 0.4480627775192261, + "learning_rate": 9.952004860267315e-05, + "loss": 1.8608, + "step": 89 + }, + { + "epoch": 0.02731825770223099, + "grad_norm": 0.4578416049480438, + "learning_rate": 9.951397326852977e-05, + "loss": 1.9455, + "step": 90 + }, + { + "epoch": 0.027621793898922446, + "grad_norm": 0.4651184380054474, + "learning_rate": 9.95078979343864e-05, + "loss": 1.7394, + "step": 91 + }, + { + "epoch": 0.0279253300956139, + "grad_norm": 0.48281168937683105, + "learning_rate": 9.950182260024302e-05, + "loss": 2.219, + "step": 92 + }, + { + "epoch": 0.02822886629230536, + "grad_norm": 0.3925339877605438, + "learning_rate": 9.949574726609963e-05, + "loss": 1.9998, + "step": 93 + }, + { + "epoch": 0.028532402488996814, + "grad_norm": 0.5093829035758972, + "learning_rate": 9.948967193195627e-05, + "loss": 1.5958, + "step": 94 + }, + { + "epoch": 0.02883593868568827, + "grad_norm": 0.4480256736278534, + "learning_rate": 9.94835965978129e-05, + "loss": 1.7606, + "step": 95 + }, + { + "epoch": 0.029139474882379723, + "grad_norm": 0.41442152857780457, + "learning_rate": 9.94775212636695e-05, + "loss": 1.7481, + "step": 96 + }, + { + "epoch": 0.029443011079071178, + "grad_norm": 0.373604953289032, + "learning_rate": 9.947144592952613e-05, + "loss": 1.91, + "step": 97 + }, + { + "epoch": 0.029746547275762633, + "grad_norm": 0.4274522662162781, + "learning_rate": 9.946537059538275e-05, + "loss": 1.9125, + "step": 98 + }, + { + "epoch": 0.03005008347245409, + "grad_norm": 0.47791674733161926, + "learning_rate": 9.945929526123937e-05, + "loss": 2.0647, + "step": 99 + }, + { + "epoch": 0.030353619669145546, + "grad_norm": 0.456820547580719, + "learning_rate": 9.9453219927096e-05, + "loss": 2.1057, + "step": 100 + }, + { + "epoch": 0.030657155865837, + "grad_norm": 0.41789788007736206, + "learning_rate": 9.944714459295261e-05, + "loss": 2.0159, + "step": 101 + }, + { + "epoch": 0.030960692062528456, + "grad_norm": 0.4459668695926666, + "learning_rate": 9.944106925880924e-05, + "loss": 1.927, + "step": 102 + }, + { + "epoch": 0.031264228259219914, + "grad_norm": 0.372925341129303, + "learning_rate": 9.943499392466586e-05, + "loss": 1.6992, + "step": 103 + }, + { + "epoch": 0.03156776445591137, + "grad_norm": 0.4778668284416199, + "learning_rate": 9.942891859052248e-05, + "loss": 2.1148, + "step": 104 + }, + { + "epoch": 0.031871300652602824, + "grad_norm": 0.4480198323726654, + "learning_rate": 9.942284325637911e-05, + "loss": 1.9734, + "step": 105 + }, + { + "epoch": 0.03217483684929428, + "grad_norm": 0.40198591351509094, + "learning_rate": 9.941676792223573e-05, + "loss": 1.5448, + "step": 106 + }, + { + "epoch": 0.032478373045985734, + "grad_norm": 0.40328651666641235, + "learning_rate": 9.941069258809234e-05, + "loss": 2.1084, + "step": 107 + }, + { + "epoch": 0.03278190924267719, + "grad_norm": 0.43856972455978394, + "learning_rate": 9.940461725394898e-05, + "loss": 2.1748, + "step": 108 + }, + { + "epoch": 0.033085445439368644, + "grad_norm": 0.46910691261291504, + "learning_rate": 9.939854191980559e-05, + "loss": 1.9526, + "step": 109 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.4143713116645813, + "learning_rate": 9.939246658566221e-05, + "loss": 1.9133, + "step": 110 + }, + { + "epoch": 0.03369251783275155, + "grad_norm": 0.45832857489585876, + "learning_rate": 9.938639125151884e-05, + "loss": 1.7964, + "step": 111 + }, + { + "epoch": 0.03399605402944301, + "grad_norm": 0.4263196587562561, + "learning_rate": 9.938031591737546e-05, + "loss": 1.9231, + "step": 112 + }, + { + "epoch": 0.03429959022613446, + "grad_norm": 0.38841062784194946, + "learning_rate": 9.937424058323208e-05, + "loss": 1.9941, + "step": 113 + }, + { + "epoch": 0.034603126422825925, + "grad_norm": 0.39627939462661743, + "learning_rate": 9.93681652490887e-05, + "loss": 1.591, + "step": 114 + }, + { + "epoch": 0.03490666261951738, + "grad_norm": 0.4354992210865021, + "learning_rate": 9.936208991494532e-05, + "loss": 1.8843, + "step": 115 + }, + { + "epoch": 0.035210198816208835, + "grad_norm": 0.4674322009086609, + "learning_rate": 9.935601458080195e-05, + "loss": 1.9687, + "step": 116 + }, + { + "epoch": 0.03551373501290029, + "grad_norm": 0.4263432025909424, + "learning_rate": 9.934993924665857e-05, + "loss": 2.0061, + "step": 117 + }, + { + "epoch": 0.035817271209591744, + "grad_norm": 0.4172697067260742, + "learning_rate": 9.934386391251519e-05, + "loss": 2.0079, + "step": 118 + }, + { + "epoch": 0.0361208074062832, + "grad_norm": 0.35841792821884155, + "learning_rate": 9.933778857837182e-05, + "loss": 2.0048, + "step": 119 + }, + { + "epoch": 0.036424343602974654, + "grad_norm": 0.4118800759315491, + "learning_rate": 9.933171324422844e-05, + "loss": 1.8136, + "step": 120 + }, + { + "epoch": 0.03672787979966611, + "grad_norm": 0.4894438087940216, + "learning_rate": 9.932563791008505e-05, + "loss": 1.9214, + "step": 121 + }, + { + "epoch": 0.037031415996357564, + "grad_norm": 0.4079352915287018, + "learning_rate": 9.931956257594169e-05, + "loss": 2.2445, + "step": 122 + }, + { + "epoch": 0.03733495219304902, + "grad_norm": 0.41293051838874817, + "learning_rate": 9.93134872417983e-05, + "loss": 2.0845, + "step": 123 + }, + { + "epoch": 0.037638488389740474, + "grad_norm": 0.4413944482803345, + "learning_rate": 9.930741190765492e-05, + "loss": 1.8782, + "step": 124 + }, + { + "epoch": 0.03794202458643193, + "grad_norm": 0.4036192297935486, + "learning_rate": 9.930133657351155e-05, + "loss": 1.9393, + "step": 125 + }, + { + "epoch": 0.03824556078312339, + "grad_norm": 0.7759333848953247, + "learning_rate": 9.929526123936817e-05, + "loss": 1.9035, + "step": 126 + }, + { + "epoch": 0.038549096979814845, + "grad_norm": 0.4737033247947693, + "learning_rate": 9.928918590522479e-05, + "loss": 2.0535, + "step": 127 + }, + { + "epoch": 0.0388526331765063, + "grad_norm": 0.5254648923873901, + "learning_rate": 9.928311057108142e-05, + "loss": 1.9854, + "step": 128 + }, + { + "epoch": 0.039156169373197755, + "grad_norm": 0.46957090497016907, + "learning_rate": 9.927703523693803e-05, + "loss": 2.1865, + "step": 129 + }, + { + "epoch": 0.03945970556988921, + "grad_norm": 0.4427931010723114, + "learning_rate": 9.927095990279466e-05, + "loss": 1.9696, + "step": 130 + }, + { + "epoch": 0.039763241766580665, + "grad_norm": 0.42948615550994873, + "learning_rate": 9.926488456865128e-05, + "loss": 1.5367, + "step": 131 + }, + { + "epoch": 0.04006677796327212, + "grad_norm": 0.3952697515487671, + "learning_rate": 9.92588092345079e-05, + "loss": 1.9648, + "step": 132 + }, + { + "epoch": 0.040370314159963575, + "grad_norm": 0.41384372115135193, + "learning_rate": 9.925273390036453e-05, + "loss": 1.9115, + "step": 133 + }, + { + "epoch": 0.04067385035665503, + "grad_norm": 0.44592148065567017, + "learning_rate": 9.924665856622115e-05, + "loss": 2.2336, + "step": 134 + }, + { + "epoch": 0.040977386553346484, + "grad_norm": 0.43720191717147827, + "learning_rate": 9.924058323207776e-05, + "loss": 2.1014, + "step": 135 + }, + { + "epoch": 0.04128092275003794, + "grad_norm": 0.6224471926689148, + "learning_rate": 9.92345078979344e-05, + "loss": 1.9093, + "step": 136 + }, + { + "epoch": 0.041584458946729394, + "grad_norm": 0.40913721919059753, + "learning_rate": 9.922843256379101e-05, + "loss": 2.036, + "step": 137 + }, + { + "epoch": 0.041887995143420856, + "grad_norm": 0.5675486922264099, + "learning_rate": 9.922235722964763e-05, + "loss": 1.7925, + "step": 138 + }, + { + "epoch": 0.04219153134011231, + "grad_norm": 0.4174894690513611, + "learning_rate": 9.921628189550426e-05, + "loss": 1.742, + "step": 139 + }, + { + "epoch": 0.042495067536803766, + "grad_norm": 0.5149232745170593, + "learning_rate": 9.921020656136088e-05, + "loss": 2.0117, + "step": 140 + }, + { + "epoch": 0.04279860373349522, + "grad_norm": 0.4599703252315521, + "learning_rate": 9.92041312272175e-05, + "loss": 1.9401, + "step": 141 + }, + { + "epoch": 0.043102139930186675, + "grad_norm": 0.39801791310310364, + "learning_rate": 9.919805589307413e-05, + "loss": 1.74, + "step": 142 + }, + { + "epoch": 0.04340567612687813, + "grad_norm": 0.4469515085220337, + "learning_rate": 9.919198055893074e-05, + "loss": 1.9919, + "step": 143 + }, + { + "epoch": 0.043709212323569585, + "grad_norm": 0.4179072678089142, + "learning_rate": 9.918590522478737e-05, + "loss": 1.9618, + "step": 144 + }, + { + "epoch": 0.04401274852026104, + "grad_norm": 0.3512915372848511, + "learning_rate": 9.917982989064399e-05, + "loss": 2.0603, + "step": 145 + }, + { + "epoch": 0.044316284716952495, + "grad_norm": 0.6461288928985596, + "learning_rate": 9.917375455650061e-05, + "loss": 1.9461, + "step": 146 + }, + { + "epoch": 0.04461982091364395, + "grad_norm": 0.4113643169403076, + "learning_rate": 9.916767922235724e-05, + "loss": 1.5332, + "step": 147 + }, + { + "epoch": 0.044923357110335405, + "grad_norm": 0.5560798645019531, + "learning_rate": 9.916160388821386e-05, + "loss": 1.678, + "step": 148 + }, + { + "epoch": 0.045226893307026866, + "grad_norm": 0.5448784828186035, + "learning_rate": 9.915552855407047e-05, + "loss": 1.8036, + "step": 149 + }, + { + "epoch": 0.04553042950371832, + "grad_norm": 0.4570043087005615, + "learning_rate": 9.91494532199271e-05, + "loss": 2.0126, + "step": 150 + }, + { + "epoch": 0.045833965700409776, + "grad_norm": 0.4167179465293884, + "learning_rate": 9.914337788578372e-05, + "loss": 1.7567, + "step": 151 + }, + { + "epoch": 0.04613750189710123, + "grad_norm": 1.3264193534851074, + "learning_rate": 9.913730255164034e-05, + "loss": 2.012, + "step": 152 + }, + { + "epoch": 0.046441038093792686, + "grad_norm": 0.45362886786460876, + "learning_rate": 9.913122721749697e-05, + "loss": 1.8789, + "step": 153 + }, + { + "epoch": 0.04674457429048414, + "grad_norm": 2.0713539123535156, + "learning_rate": 9.912515188335359e-05, + "loss": 2.0798, + "step": 154 + }, + { + "epoch": 0.047048110487175596, + "grad_norm": 1.430906891822815, + "learning_rate": 9.91190765492102e-05, + "loss": 1.9401, + "step": 155 + }, + { + "epoch": 0.04735164668386705, + "grad_norm": 0.846182107925415, + "learning_rate": 9.911300121506684e-05, + "loss": 1.9073, + "step": 156 + }, + { + "epoch": 0.047655182880558505, + "grad_norm": 0.5027226805686951, + "learning_rate": 9.910692588092345e-05, + "loss": 2.1521, + "step": 157 + }, + { + "epoch": 0.04795871907724996, + "grad_norm": 0.32647275924682617, + "learning_rate": 9.910085054678007e-05, + "loss": 1.705, + "step": 158 + }, + { + "epoch": 0.048262255273941415, + "grad_norm": 0.4337715804576874, + "learning_rate": 9.90947752126367e-05, + "loss": 1.9844, + "step": 159 + }, + { + "epoch": 0.04856579147063287, + "grad_norm": 0.4408979117870331, + "learning_rate": 9.908869987849332e-05, + "loss": 1.81, + "step": 160 + }, + { + "epoch": 0.04886932766732433, + "grad_norm": 3.5793535709381104, + "learning_rate": 9.908262454434995e-05, + "loss": 1.8569, + "step": 161 + }, + { + "epoch": 0.04917286386401579, + "grad_norm": 0.47893545031547546, + "learning_rate": 9.907654921020657e-05, + "loss": 1.9397, + "step": 162 + }, + { + "epoch": 0.04947640006070724, + "grad_norm": 0.36375802755355835, + "learning_rate": 9.907047387606318e-05, + "loss": 1.8064, + "step": 163 + }, + { + "epoch": 0.049779936257398696, + "grad_norm": 0.3935683071613312, + "learning_rate": 9.906439854191982e-05, + "loss": 2.0968, + "step": 164 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 2.1048178672790527, + "learning_rate": 9.905832320777643e-05, + "loss": 1.8488, + "step": 165 + }, + { + "epoch": 0.050387008650781606, + "grad_norm": 0.35579410195350647, + "learning_rate": 9.905224787363305e-05, + "loss": 1.837, + "step": 166 + }, + { + "epoch": 0.05069054484747306, + "grad_norm": 0.37291133403778076, + "learning_rate": 9.904617253948968e-05, + "loss": 1.5921, + "step": 167 + }, + { + "epoch": 0.050994081044164516, + "grad_norm": 0.37633177638053894, + "learning_rate": 9.90400972053463e-05, + "loss": 1.9613, + "step": 168 + }, + { + "epoch": 0.05129761724085597, + "grad_norm": 0.49120867252349854, + "learning_rate": 9.903402187120292e-05, + "loss": 2.1737, + "step": 169 + }, + { + "epoch": 0.051601153437547426, + "grad_norm": 0.41102972626686096, + "learning_rate": 9.902794653705955e-05, + "loss": 1.8833, + "step": 170 + }, + { + "epoch": 0.05190468963423888, + "grad_norm": 0.3835681676864624, + "learning_rate": 9.902187120291616e-05, + "loss": 2.0283, + "step": 171 + }, + { + "epoch": 0.052208225830930335, + "grad_norm": 0.4194372296333313, + "learning_rate": 9.901579586877278e-05, + "loss": 1.6146, + "step": 172 + }, + { + "epoch": 0.0525117620276218, + "grad_norm": 0.411516934633255, + "learning_rate": 9.900972053462941e-05, + "loss": 2.0012, + "step": 173 + }, + { + "epoch": 0.05281529822431325, + "grad_norm": 0.4565434455871582, + "learning_rate": 9.900364520048603e-05, + "loss": 2.2415, + "step": 174 + }, + { + "epoch": 0.05311883442100471, + "grad_norm": 0.4352016746997833, + "learning_rate": 9.899756986634266e-05, + "loss": 1.6505, + "step": 175 + }, + { + "epoch": 0.05342237061769616, + "grad_norm": 0.5442507266998291, + "learning_rate": 9.899149453219928e-05, + "loss": 1.9692, + "step": 176 + }, + { + "epoch": 0.05372590681438762, + "grad_norm": 0.39451470971107483, + "learning_rate": 9.89854191980559e-05, + "loss": 1.9682, + "step": 177 + }, + { + "epoch": 0.05402944301107907, + "grad_norm": 0.34474217891693115, + "learning_rate": 9.897934386391253e-05, + "loss": 1.6806, + "step": 178 + }, + { + "epoch": 0.054332979207770526, + "grad_norm": 0.45165541768074036, + "learning_rate": 9.897326852976914e-05, + "loss": 1.9329, + "step": 179 + }, + { + "epoch": 0.05463651540446198, + "grad_norm": 0.5402116775512695, + "learning_rate": 9.896719319562576e-05, + "loss": 1.6939, + "step": 180 + }, + { + "epoch": 0.054940051601153436, + "grad_norm": 0.40272051095962524, + "learning_rate": 9.896111786148239e-05, + "loss": 1.985, + "step": 181 + }, + { + "epoch": 0.05524358779784489, + "grad_norm": 0.3833436667919159, + "learning_rate": 9.895504252733901e-05, + "loss": 1.7804, + "step": 182 + }, + { + "epoch": 0.055547123994536346, + "grad_norm": 0.34711307287216187, + "learning_rate": 9.894896719319563e-05, + "loss": 1.865, + "step": 183 + }, + { + "epoch": 0.0558506601912278, + "grad_norm": 0.43081262707710266, + "learning_rate": 9.894289185905226e-05, + "loss": 1.8066, + "step": 184 + }, + { + "epoch": 0.05615419638791926, + "grad_norm": 0.38740819692611694, + "learning_rate": 9.893681652490887e-05, + "loss": 1.738, + "step": 185 + }, + { + "epoch": 0.05645773258461072, + "grad_norm": 0.46878042817115784, + "learning_rate": 9.893074119076549e-05, + "loss": 1.4563, + "step": 186 + }, + { + "epoch": 0.05676126878130217, + "grad_norm": 0.4415140151977539, + "learning_rate": 9.892466585662212e-05, + "loss": 1.565, + "step": 187 + }, + { + "epoch": 0.05706480497799363, + "grad_norm": 0.43196091055870056, + "learning_rate": 9.891859052247874e-05, + "loss": 2.1562, + "step": 188 + }, + { + "epoch": 0.05736834117468508, + "grad_norm": 0.4677179455757141, + "learning_rate": 9.891251518833537e-05, + "loss": 2.0413, + "step": 189 + }, + { + "epoch": 0.05767187737137654, + "grad_norm": 0.4087100327014923, + "learning_rate": 9.890643985419199e-05, + "loss": 1.5434, + "step": 190 + }, + { + "epoch": 0.05797541356806799, + "grad_norm": 0.37906375527381897, + "learning_rate": 9.89003645200486e-05, + "loss": 1.5561, + "step": 191 + }, + { + "epoch": 0.05827894976475945, + "grad_norm": 0.5014649033546448, + "learning_rate": 9.889428918590524e-05, + "loss": 1.6243, + "step": 192 + }, + { + "epoch": 0.0585824859614509, + "grad_norm": 0.6972336769104004, + "learning_rate": 9.888821385176185e-05, + "loss": 2.1616, + "step": 193 + }, + { + "epoch": 0.058886022158142357, + "grad_norm": 0.46012699604034424, + "learning_rate": 9.888213851761847e-05, + "loss": 2.1195, + "step": 194 + }, + { + "epoch": 0.05918955835483381, + "grad_norm": 0.36921924352645874, + "learning_rate": 9.88760631834751e-05, + "loss": 2.1071, + "step": 195 + }, + { + "epoch": 0.059493094551525266, + "grad_norm": 0.36246025562286377, + "learning_rate": 9.886998784933172e-05, + "loss": 1.9948, + "step": 196 + }, + { + "epoch": 0.05979663074821673, + "grad_norm": 0.3935892581939697, + "learning_rate": 9.886391251518834e-05, + "loss": 2.3065, + "step": 197 + }, + { + "epoch": 0.06010016694490818, + "grad_norm": 0.36333411931991577, + "learning_rate": 9.885783718104497e-05, + "loss": 1.8746, + "step": 198 + }, + { + "epoch": 0.06040370314159964, + "grad_norm": 0.4027535617351532, + "learning_rate": 9.885176184690158e-05, + "loss": 2.156, + "step": 199 + }, + { + "epoch": 0.06070723933829109, + "grad_norm": 0.42472004890441895, + "learning_rate": 9.88456865127582e-05, + "loss": 1.8647, + "step": 200 + }, + { + "epoch": 0.06101077553498255, + "grad_norm": 0.38055720925331116, + "learning_rate": 9.883961117861483e-05, + "loss": 1.9185, + "step": 201 + }, + { + "epoch": 0.061314311731674, + "grad_norm": 0.3831098973751068, + "learning_rate": 9.883353584447145e-05, + "loss": 2.2488, + "step": 202 + }, + { + "epoch": 0.06161784792836546, + "grad_norm": 0.35769203305244446, + "learning_rate": 9.882746051032808e-05, + "loss": 1.9281, + "step": 203 + }, + { + "epoch": 0.06192138412505691, + "grad_norm": 0.3576291799545288, + "learning_rate": 9.88213851761847e-05, + "loss": 1.7082, + "step": 204 + }, + { + "epoch": 0.06222492032174837, + "grad_norm": 0.3641425669193268, + "learning_rate": 9.881530984204131e-05, + "loss": 1.9374, + "step": 205 + }, + { + "epoch": 0.06252845651843983, + "grad_norm": 0.4281562268733978, + "learning_rate": 9.880923450789795e-05, + "loss": 2.0797, + "step": 206 + }, + { + "epoch": 0.06283199271513128, + "grad_norm": 0.39947500824928284, + "learning_rate": 9.880315917375455e-05, + "loss": 2.1399, + "step": 207 + }, + { + "epoch": 0.06313552891182274, + "grad_norm": 0.4200506806373596, + "learning_rate": 9.879708383961118e-05, + "loss": 2.0443, + "step": 208 + }, + { + "epoch": 0.06343906510851419, + "grad_norm": 0.35776716470718384, + "learning_rate": 9.879100850546781e-05, + "loss": 1.9637, + "step": 209 + }, + { + "epoch": 0.06374260130520565, + "grad_norm": 0.3676275610923767, + "learning_rate": 9.878493317132443e-05, + "loss": 2.2018, + "step": 210 + }, + { + "epoch": 0.0640461375018971, + "grad_norm": 0.48199740052223206, + "learning_rate": 9.877885783718105e-05, + "loss": 1.8948, + "step": 211 + }, + { + "epoch": 0.06434967369858856, + "grad_norm": 0.40157443284988403, + "learning_rate": 9.877278250303768e-05, + "loss": 1.9011, + "step": 212 + }, + { + "epoch": 0.06465320989528, + "grad_norm": 0.3959876596927643, + "learning_rate": 9.87667071688943e-05, + "loss": 1.8019, + "step": 213 + }, + { + "epoch": 0.06495674609197147, + "grad_norm": 0.4266337752342224, + "learning_rate": 9.876063183475091e-05, + "loss": 1.6282, + "step": 214 + }, + { + "epoch": 0.06526028228866293, + "grad_norm": 0.5142415165901184, + "learning_rate": 9.875455650060754e-05, + "loss": 2.014, + "step": 215 + }, + { + "epoch": 0.06556381848535438, + "grad_norm": 0.3834533095359802, + "learning_rate": 9.874848116646416e-05, + "loss": 2.2733, + "step": 216 + }, + { + "epoch": 0.06586735468204584, + "grad_norm": 0.4485650062561035, + "learning_rate": 9.874240583232079e-05, + "loss": 2.0707, + "step": 217 + }, + { + "epoch": 0.06617089087873729, + "grad_norm": 0.37866663932800293, + "learning_rate": 9.873633049817741e-05, + "loss": 2.2419, + "step": 218 + }, + { + "epoch": 0.06647442707542875, + "grad_norm": 0.5389169454574585, + "learning_rate": 9.873025516403402e-05, + "loss": 1.9736, + "step": 219 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.3923830986022949, + "learning_rate": 9.872417982989066e-05, + "loss": 2.0164, + "step": 220 + }, + { + "epoch": 0.06708149946881166, + "grad_norm": 0.431417316198349, + "learning_rate": 9.871810449574726e-05, + "loss": 1.7535, + "step": 221 + }, + { + "epoch": 0.0673850356655031, + "grad_norm": 0.4980961084365845, + "learning_rate": 9.871202916160389e-05, + "loss": 2.0751, + "step": 222 + }, + { + "epoch": 0.06768857186219457, + "grad_norm": 0.38455912470817566, + "learning_rate": 9.870595382746052e-05, + "loss": 1.9394, + "step": 223 + }, + { + "epoch": 0.06799210805888602, + "grad_norm": 0.3911600708961487, + "learning_rate": 9.869987849331714e-05, + "loss": 1.6384, + "step": 224 + }, + { + "epoch": 0.06829564425557748, + "grad_norm": 0.39567652344703674, + "learning_rate": 9.869380315917376e-05, + "loss": 2.0082, + "step": 225 + }, + { + "epoch": 0.06859918045226893, + "grad_norm": 0.3773573935031891, + "learning_rate": 9.868772782503039e-05, + "loss": 2.0852, + "step": 226 + }, + { + "epoch": 0.06890271664896039, + "grad_norm": 0.4387274384498596, + "learning_rate": 9.8681652490887e-05, + "loss": 2.2043, + "step": 227 + }, + { + "epoch": 0.06920625284565185, + "grad_norm": 0.4070594012737274, + "learning_rate": 9.867557715674362e-05, + "loss": 1.8638, + "step": 228 + }, + { + "epoch": 0.0695097890423433, + "grad_norm": 0.5250163674354553, + "learning_rate": 9.866950182260025e-05, + "loss": 2.0826, + "step": 229 + }, + { + "epoch": 0.06981332523903476, + "grad_norm": 0.47242820262908936, + "learning_rate": 9.866342648845687e-05, + "loss": 1.5517, + "step": 230 + }, + { + "epoch": 0.07011686143572621, + "grad_norm": 0.41242364048957825, + "learning_rate": 9.865735115431349e-05, + "loss": 1.3791, + "step": 231 + }, + { + "epoch": 0.07042039763241767, + "grad_norm": 0.4049898087978363, + "learning_rate": 9.865127582017012e-05, + "loss": 2.0933, + "step": 232 + }, + { + "epoch": 0.07072393382910912, + "grad_norm": 0.5341041684150696, + "learning_rate": 9.864520048602673e-05, + "loss": 1.8293, + "step": 233 + }, + { + "epoch": 0.07102747002580058, + "grad_norm": 0.4930991530418396, + "learning_rate": 9.863912515188337e-05, + "loss": 2.3447, + "step": 234 + }, + { + "epoch": 0.07133100622249203, + "grad_norm": 0.752202033996582, + "learning_rate": 9.863304981773997e-05, + "loss": 1.9733, + "step": 235 + }, + { + "epoch": 0.07163454241918349, + "grad_norm": 0.3552611768245697, + "learning_rate": 9.86269744835966e-05, + "loss": 2.0324, + "step": 236 + }, + { + "epoch": 0.07193807861587494, + "grad_norm": 0.3639819622039795, + "learning_rate": 9.862089914945323e-05, + "loss": 1.9325, + "step": 237 + }, + { + "epoch": 0.0722416148125664, + "grad_norm": 0.4028383195400238, + "learning_rate": 9.861482381530985e-05, + "loss": 1.9652, + "step": 238 + }, + { + "epoch": 0.07254515100925786, + "grad_norm": 0.3904295563697815, + "learning_rate": 9.860874848116647e-05, + "loss": 1.8133, + "step": 239 + }, + { + "epoch": 0.07284868720594931, + "grad_norm": 0.46043211221694946, + "learning_rate": 9.86026731470231e-05, + "loss": 1.6711, + "step": 240 + }, + { + "epoch": 0.07315222340264077, + "grad_norm": 0.41137024760246277, + "learning_rate": 9.859659781287971e-05, + "loss": 2.1129, + "step": 241 + }, + { + "epoch": 0.07345575959933222, + "grad_norm": 0.3776731491088867, + "learning_rate": 9.859052247873633e-05, + "loss": 1.5451, + "step": 242 + }, + { + "epoch": 0.07375929579602368, + "grad_norm": 0.8163847923278809, + "learning_rate": 9.858444714459296e-05, + "loss": 1.8133, + "step": 243 + }, + { + "epoch": 0.07406283199271513, + "grad_norm": 0.7984678149223328, + "learning_rate": 9.857837181044958e-05, + "loss": 1.6879, + "step": 244 + }, + { + "epoch": 0.07436636818940659, + "grad_norm": 0.3759590983390808, + "learning_rate": 9.85722964763062e-05, + "loss": 2.0183, + "step": 245 + }, + { + "epoch": 0.07466990438609804, + "grad_norm": 0.4622940421104431, + "learning_rate": 9.856622114216283e-05, + "loss": 1.9958, + "step": 246 + }, + { + "epoch": 0.0749734405827895, + "grad_norm": 0.4710557758808136, + "learning_rate": 9.856014580801944e-05, + "loss": 1.5483, + "step": 247 + }, + { + "epoch": 0.07527697677948095, + "grad_norm": 0.3766056001186371, + "learning_rate": 9.855407047387608e-05, + "loss": 1.8697, + "step": 248 + }, + { + "epoch": 0.07558051297617241, + "grad_norm": 0.6338986158370972, + "learning_rate": 9.854799513973268e-05, + "loss": 2.2657, + "step": 249 + }, + { + "epoch": 0.07588404917286386, + "grad_norm": 0.4152657687664032, + "learning_rate": 9.854191980558931e-05, + "loss": 1.5967, + "step": 250 + }, + { + "epoch": 0.07618758536955532, + "grad_norm": 0.37085869908332825, + "learning_rate": 9.853584447144594e-05, + "loss": 1.915, + "step": 251 + }, + { + "epoch": 0.07649112156624678, + "grad_norm": 0.40199750661849976, + "learning_rate": 9.852976913730256e-05, + "loss": 2.0662, + "step": 252 + }, + { + "epoch": 0.07679465776293823, + "grad_norm": 0.39193621277809143, + "learning_rate": 9.852369380315918e-05, + "loss": 1.911, + "step": 253 + }, + { + "epoch": 0.07709819395962969, + "grad_norm": 0.39082249999046326, + "learning_rate": 9.85176184690158e-05, + "loss": 1.6207, + "step": 254 + }, + { + "epoch": 0.07740173015632114, + "grad_norm": 0.3943793773651123, + "learning_rate": 9.851154313487242e-05, + "loss": 2.1254, + "step": 255 + }, + { + "epoch": 0.0777052663530126, + "grad_norm": 0.34571030735969543, + "learning_rate": 9.850546780072904e-05, + "loss": 1.9696, + "step": 256 + }, + { + "epoch": 0.07800880254970405, + "grad_norm": 0.4847205579280853, + "learning_rate": 9.849939246658567e-05, + "loss": 2.1382, + "step": 257 + }, + { + "epoch": 0.07831233874639551, + "grad_norm": 0.47491976618766785, + "learning_rate": 9.849331713244229e-05, + "loss": 2.1109, + "step": 258 + }, + { + "epoch": 0.07861587494308696, + "grad_norm": 0.3984815776348114, + "learning_rate": 9.84872417982989e-05, + "loss": 2.0019, + "step": 259 + }, + { + "epoch": 0.07891941113977842, + "grad_norm": 0.578295886516571, + "learning_rate": 9.848116646415554e-05, + "loss": 1.6984, + "step": 260 + }, + { + "epoch": 0.07922294733646987, + "grad_norm": 0.4641231894493103, + "learning_rate": 9.847509113001215e-05, + "loss": 2.0974, + "step": 261 + }, + { + "epoch": 0.07952648353316133, + "grad_norm": 0.3704085052013397, + "learning_rate": 9.846901579586879e-05, + "loss": 1.8907, + "step": 262 + }, + { + "epoch": 0.07983001972985279, + "grad_norm": 0.40248993039131165, + "learning_rate": 9.846294046172539e-05, + "loss": 1.9194, + "step": 263 + }, + { + "epoch": 0.08013355592654424, + "grad_norm": 0.40396660566329956, + "learning_rate": 9.845686512758202e-05, + "loss": 1.7075, + "step": 264 + }, + { + "epoch": 0.0804370921232357, + "grad_norm": 0.44500696659088135, + "learning_rate": 9.845078979343865e-05, + "loss": 1.7463, + "step": 265 + }, + { + "epoch": 0.08074062831992715, + "grad_norm": 0.3681708574295044, + "learning_rate": 9.844471445929527e-05, + "loss": 1.7162, + "step": 266 + }, + { + "epoch": 0.08104416451661861, + "grad_norm": 0.47645455598831177, + "learning_rate": 9.843863912515189e-05, + "loss": 1.7759, + "step": 267 + }, + { + "epoch": 0.08134770071331006, + "grad_norm": 0.3663488030433655, + "learning_rate": 9.843256379100852e-05, + "loss": 1.8687, + "step": 268 + }, + { + "epoch": 0.08165123691000152, + "grad_norm": 0.33710968494415283, + "learning_rate": 9.842648845686513e-05, + "loss": 1.9777, + "step": 269 + }, + { + "epoch": 0.08195477310669297, + "grad_norm": 0.4824034571647644, + "learning_rate": 9.842041312272175e-05, + "loss": 1.4345, + "step": 270 + }, + { + "epoch": 0.08225830930338443, + "grad_norm": 0.3703802824020386, + "learning_rate": 9.841433778857838e-05, + "loss": 1.7835, + "step": 271 + }, + { + "epoch": 0.08256184550007588, + "grad_norm": 0.4279334545135498, + "learning_rate": 9.8408262454435e-05, + "loss": 2.1811, + "step": 272 + }, + { + "epoch": 0.08286538169676734, + "grad_norm": 0.3500446379184723, + "learning_rate": 9.840218712029162e-05, + "loss": 2.0992, + "step": 273 + }, + { + "epoch": 0.08316891789345879, + "grad_norm": 0.4278954863548279, + "learning_rate": 9.839611178614823e-05, + "loss": 1.4691, + "step": 274 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.6769374012947083, + "learning_rate": 9.839003645200486e-05, + "loss": 1.8223, + "step": 275 + }, + { + "epoch": 0.08377599028684171, + "grad_norm": 0.34110525250434875, + "learning_rate": 9.83839611178615e-05, + "loss": 1.9469, + "step": 276 + }, + { + "epoch": 0.08407952648353316, + "grad_norm": 0.37355175614356995, + "learning_rate": 9.83778857837181e-05, + "loss": 1.9281, + "step": 277 + }, + { + "epoch": 0.08438306268022462, + "grad_norm": 0.3968208134174347, + "learning_rate": 9.837181044957473e-05, + "loss": 2.0537, + "step": 278 + }, + { + "epoch": 0.08468659887691607, + "grad_norm": 0.3811870515346527, + "learning_rate": 9.836573511543136e-05, + "loss": 1.9715, + "step": 279 + }, + { + "epoch": 0.08499013507360753, + "grad_norm": 0.3258214294910431, + "learning_rate": 9.835965978128796e-05, + "loss": 1.979, + "step": 280 + }, + { + "epoch": 0.08529367127029898, + "grad_norm": 0.38593369722366333, + "learning_rate": 9.83535844471446e-05, + "loss": 2.135, + "step": 281 + }, + { + "epoch": 0.08559720746699044, + "grad_norm": 0.3811703026294708, + "learning_rate": 9.834750911300123e-05, + "loss": 2.1786, + "step": 282 + }, + { + "epoch": 0.08590074366368189, + "grad_norm": 0.3784421980381012, + "learning_rate": 9.834143377885784e-05, + "loss": 2.0193, + "step": 283 + }, + { + "epoch": 0.08620427986037335, + "grad_norm": 0.3660358190536499, + "learning_rate": 9.833535844471446e-05, + "loss": 1.9975, + "step": 284 + }, + { + "epoch": 0.0865078160570648, + "grad_norm": 0.3747190237045288, + "learning_rate": 9.832928311057109e-05, + "loss": 1.7897, + "step": 285 + }, + { + "epoch": 0.08681135225375626, + "grad_norm": 0.37042975425720215, + "learning_rate": 9.832320777642771e-05, + "loss": 1.9026, + "step": 286 + }, + { + "epoch": 0.08711488845044772, + "grad_norm": 0.3642013669013977, + "learning_rate": 9.831713244228433e-05, + "loss": 1.9611, + "step": 287 + }, + { + "epoch": 0.08741842464713917, + "grad_norm": 0.38183900713920593, + "learning_rate": 9.831105710814094e-05, + "loss": 1.8648, + "step": 288 + }, + { + "epoch": 0.08772196084383063, + "grad_norm": 0.4243112802505493, + "learning_rate": 9.830498177399757e-05, + "loss": 1.734, + "step": 289 + }, + { + "epoch": 0.08802549704052208, + "grad_norm": 0.3763525187969208, + "learning_rate": 9.82989064398542e-05, + "loss": 1.8955, + "step": 290 + }, + { + "epoch": 0.08832903323721354, + "grad_norm": 0.37548086047172546, + "learning_rate": 9.829283110571081e-05, + "loss": 1.9246, + "step": 291 + }, + { + "epoch": 0.08863256943390499, + "grad_norm": 0.5070151090621948, + "learning_rate": 9.828675577156744e-05, + "loss": 1.6474, + "step": 292 + }, + { + "epoch": 0.08893610563059645, + "grad_norm": 0.3903336822986603, + "learning_rate": 9.828068043742407e-05, + "loss": 1.5546, + "step": 293 + }, + { + "epoch": 0.0892396418272879, + "grad_norm": 0.42705482244491577, + "learning_rate": 9.827460510328068e-05, + "loss": 1.5506, + "step": 294 + }, + { + "epoch": 0.08954317802397936, + "grad_norm": 0.4342738687992096, + "learning_rate": 9.82685297691373e-05, + "loss": 1.6173, + "step": 295 + }, + { + "epoch": 0.08984671422067081, + "grad_norm": 0.3975971043109894, + "learning_rate": 9.826245443499394e-05, + "loss": 1.9652, + "step": 296 + }, + { + "epoch": 0.09015025041736227, + "grad_norm": 0.42342832684516907, + "learning_rate": 9.825637910085055e-05, + "loss": 1.9464, + "step": 297 + }, + { + "epoch": 0.09045378661405373, + "grad_norm": 0.381565660238266, + "learning_rate": 9.825030376670717e-05, + "loss": 2.0949, + "step": 298 + }, + { + "epoch": 0.09075732281074518, + "grad_norm": 0.4632069170475006, + "learning_rate": 9.82442284325638e-05, + "loss": 1.4451, + "step": 299 + }, + { + "epoch": 0.09106085900743664, + "grad_norm": 0.36039817333221436, + "learning_rate": 9.823815309842042e-05, + "loss": 1.2199, + "step": 300 + }, + { + "epoch": 0.09136439520412809, + "grad_norm": 0.37576648592948914, + "learning_rate": 9.823207776427704e-05, + "loss": 1.9884, + "step": 301 + }, + { + "epoch": 0.09166793140081955, + "grad_norm": 0.3673763573169708, + "learning_rate": 9.822600243013365e-05, + "loss": 1.7103, + "step": 302 + }, + { + "epoch": 0.091971467597511, + "grad_norm": 0.3729887008666992, + "learning_rate": 9.821992709599029e-05, + "loss": 1.9215, + "step": 303 + }, + { + "epoch": 0.09227500379420246, + "grad_norm": 0.3857046365737915, + "learning_rate": 9.82138517618469e-05, + "loss": 2.1883, + "step": 304 + }, + { + "epoch": 0.09257853999089391, + "grad_norm": 0.4226963520050049, + "learning_rate": 9.820777642770352e-05, + "loss": 1.6413, + "step": 305 + }, + { + "epoch": 0.09288207618758537, + "grad_norm": 0.40093332529067993, + "learning_rate": 9.820170109356015e-05, + "loss": 1.9897, + "step": 306 + }, + { + "epoch": 0.09318561238427682, + "grad_norm": 0.4287321865558624, + "learning_rate": 9.819562575941678e-05, + "loss": 1.8708, + "step": 307 + }, + { + "epoch": 0.09348914858096828, + "grad_norm": 0.3933330178260803, + "learning_rate": 9.818955042527339e-05, + "loss": 2.003, + "step": 308 + }, + { + "epoch": 0.09379268477765973, + "grad_norm": 0.3991425633430481, + "learning_rate": 9.818347509113002e-05, + "loss": 1.7305, + "step": 309 + }, + { + "epoch": 0.09409622097435119, + "grad_norm": 0.37534525990486145, + "learning_rate": 9.817739975698665e-05, + "loss": 1.9767, + "step": 310 + }, + { + "epoch": 0.09439975717104265, + "grad_norm": 0.4293142557144165, + "learning_rate": 9.817132442284326e-05, + "loss": 2.1563, + "step": 311 + }, + { + "epoch": 0.0947032933677341, + "grad_norm": 0.5783388614654541, + "learning_rate": 9.816524908869988e-05, + "loss": 1.4839, + "step": 312 + }, + { + "epoch": 0.09500682956442556, + "grad_norm": 0.3414449989795685, + "learning_rate": 9.815917375455651e-05, + "loss": 1.9974, + "step": 313 + }, + { + "epoch": 0.09531036576111701, + "grad_norm": 0.3927890956401825, + "learning_rate": 9.815309842041313e-05, + "loss": 1.9683, + "step": 314 + }, + { + "epoch": 0.09561390195780847, + "grad_norm": 0.42801201343536377, + "learning_rate": 9.814702308626975e-05, + "loss": 2.1414, + "step": 315 + }, + { + "epoch": 0.09591743815449992, + "grad_norm": 0.4715151786804199, + "learning_rate": 9.814094775212636e-05, + "loss": 2.0055, + "step": 316 + }, + { + "epoch": 0.09622097435119138, + "grad_norm": 0.42110496759414673, + "learning_rate": 9.8134872417983e-05, + "loss": 2.0693, + "step": 317 + }, + { + "epoch": 0.09652451054788283, + "grad_norm": 0.39333397150039673, + "learning_rate": 9.812879708383961e-05, + "loss": 2.1362, + "step": 318 + }, + { + "epoch": 0.09682804674457429, + "grad_norm": 0.42686814069747925, + "learning_rate": 9.812272174969623e-05, + "loss": 1.9485, + "step": 319 + }, + { + "epoch": 0.09713158294126574, + "grad_norm": 0.38239583373069763, + "learning_rate": 9.811664641555286e-05, + "loss": 1.3584, + "step": 320 + }, + { + "epoch": 0.0974351191379572, + "grad_norm": 0.3651975393295288, + "learning_rate": 9.811057108140949e-05, + "loss": 2.2135, + "step": 321 + }, + { + "epoch": 0.09773865533464866, + "grad_norm": 0.34531673789024353, + "learning_rate": 9.81044957472661e-05, + "loss": 1.7106, + "step": 322 + }, + { + "epoch": 0.09804219153134011, + "grad_norm": 0.38727474212646484, + "learning_rate": 9.809842041312273e-05, + "loss": 1.8647, + "step": 323 + }, + { + "epoch": 0.09834572772803157, + "grad_norm": 0.4127596318721771, + "learning_rate": 9.809234507897936e-05, + "loss": 1.5775, + "step": 324 + }, + { + "epoch": 0.09864926392472302, + "grad_norm": 0.337333083152771, + "learning_rate": 9.808626974483597e-05, + "loss": 1.8619, + "step": 325 + }, + { + "epoch": 0.09895280012141448, + "grad_norm": 0.43906038999557495, + "learning_rate": 9.808019441069259e-05, + "loss": 2.1753, + "step": 326 + }, + { + "epoch": 0.09925633631810593, + "grad_norm": 0.4216412901878357, + "learning_rate": 9.807411907654922e-05, + "loss": 1.8322, + "step": 327 + }, + { + "epoch": 0.09955987251479739, + "grad_norm": 0.3964472711086273, + "learning_rate": 9.806804374240584e-05, + "loss": 1.6452, + "step": 328 + }, + { + "epoch": 0.09986340871148884, + "grad_norm": 0.3590555489063263, + "learning_rate": 9.806196840826246e-05, + "loss": 1.6471, + "step": 329 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.3878382742404938, + "learning_rate": 9.805589307411907e-05, + "loss": 1.7192, + "step": 330 + }, + { + "epoch": 0.10047048110487175, + "grad_norm": 0.37866318225860596, + "learning_rate": 9.80498177399757e-05, + "loss": 2.0156, + "step": 331 + }, + { + "epoch": 0.10077401730156321, + "grad_norm": 0.3977656364440918, + "learning_rate": 9.804374240583232e-05, + "loss": 1.3686, + "step": 332 + }, + { + "epoch": 0.10107755349825466, + "grad_norm": 0.39724108576774597, + "learning_rate": 9.803766707168894e-05, + "loss": 1.6206, + "step": 333 + }, + { + "epoch": 0.10138108969494612, + "grad_norm": 0.7311023473739624, + "learning_rate": 9.803159173754557e-05, + "loss": 1.7186, + "step": 334 + }, + { + "epoch": 0.10168462589163758, + "grad_norm": 0.3953106701374054, + "learning_rate": 9.80255164034022e-05, + "loss": 1.9674, + "step": 335 + }, + { + "epoch": 0.10198816208832903, + "grad_norm": 0.4133211076259613, + "learning_rate": 9.80194410692588e-05, + "loss": 1.9536, + "step": 336 + }, + { + "epoch": 0.1022916982850205, + "grad_norm": 0.4300665855407715, + "learning_rate": 9.801336573511544e-05, + "loss": 2.0676, + "step": 337 + }, + { + "epoch": 0.10259523448171194, + "grad_norm": 0.3569762706756592, + "learning_rate": 9.800729040097207e-05, + "loss": 2.138, + "step": 338 + }, + { + "epoch": 0.1028987706784034, + "grad_norm": 0.37851640582084656, + "learning_rate": 9.800121506682868e-05, + "loss": 1.7479, + "step": 339 + }, + { + "epoch": 0.10320230687509485, + "grad_norm": 0.3435342013835907, + "learning_rate": 9.79951397326853e-05, + "loss": 2.0182, + "step": 340 + }, + { + "epoch": 0.10350584307178631, + "grad_norm": 0.487394779920578, + "learning_rate": 9.798906439854193e-05, + "loss": 1.8017, + "step": 341 + }, + { + "epoch": 0.10380937926847776, + "grad_norm": 0.3741822838783264, + "learning_rate": 9.798298906439855e-05, + "loss": 1.9261, + "step": 342 + }, + { + "epoch": 0.10411291546516922, + "grad_norm": 0.4044167101383209, + "learning_rate": 9.797691373025517e-05, + "loss": 1.962, + "step": 343 + }, + { + "epoch": 0.10441645166186067, + "grad_norm": 0.4507991373538971, + "learning_rate": 9.797083839611178e-05, + "loss": 1.6664, + "step": 344 + }, + { + "epoch": 0.10471998785855213, + "grad_norm": 0.41394731402397156, + "learning_rate": 9.796476306196842e-05, + "loss": 1.7994, + "step": 345 + }, + { + "epoch": 0.1050235240552436, + "grad_norm": 0.4100608825683594, + "learning_rate": 9.795868772782503e-05, + "loss": 1.8795, + "step": 346 + }, + { + "epoch": 0.10532706025193504, + "grad_norm": 0.5010010600090027, + "learning_rate": 9.795261239368165e-05, + "loss": 1.7712, + "step": 347 + }, + { + "epoch": 0.1056305964486265, + "grad_norm": 0.3657280206680298, + "learning_rate": 9.794653705953828e-05, + "loss": 2.0675, + "step": 348 + }, + { + "epoch": 0.10593413264531795, + "grad_norm": 0.43551188707351685, + "learning_rate": 9.794046172539491e-05, + "loss": 2.3229, + "step": 349 + }, + { + "epoch": 0.10623766884200941, + "grad_norm": 0.4035640060901642, + "learning_rate": 9.793438639125152e-05, + "loss": 1.5348, + "step": 350 + }, + { + "epoch": 0.10654120503870086, + "grad_norm": 0.38934487104415894, + "learning_rate": 9.792831105710815e-05, + "loss": 2.0634, + "step": 351 + }, + { + "epoch": 0.10684474123539232, + "grad_norm": 0.3808942437171936, + "learning_rate": 9.792223572296478e-05, + "loss": 1.5801, + "step": 352 + }, + { + "epoch": 0.10714827743208377, + "grad_norm": 0.4263344407081604, + "learning_rate": 9.791616038882138e-05, + "loss": 2.1149, + "step": 353 + }, + { + "epoch": 0.10745181362877523, + "grad_norm": 0.40345048904418945, + "learning_rate": 9.791008505467801e-05, + "loss": 1.6522, + "step": 354 + }, + { + "epoch": 0.10775534982546668, + "grad_norm": 0.39628833532333374, + "learning_rate": 9.790400972053464e-05, + "loss": 1.8936, + "step": 355 + }, + { + "epoch": 0.10805888602215814, + "grad_norm": 0.3981876075267792, + "learning_rate": 9.789793438639126e-05, + "loss": 1.8532, + "step": 356 + }, + { + "epoch": 0.10836242221884959, + "grad_norm": 0.41689878702163696, + "learning_rate": 9.789185905224788e-05, + "loss": 1.811, + "step": 357 + }, + { + "epoch": 0.10866595841554105, + "grad_norm": 0.3519899845123291, + "learning_rate": 9.78857837181045e-05, + "loss": 2.1879, + "step": 358 + }, + { + "epoch": 0.10896949461223251, + "grad_norm": 0.4501926004886627, + "learning_rate": 9.787970838396113e-05, + "loss": 1.7545, + "step": 359 + }, + { + "epoch": 0.10927303080892396, + "grad_norm": 0.3503700792789459, + "learning_rate": 9.787363304981774e-05, + "loss": 1.8818, + "step": 360 + }, + { + "epoch": 0.10957656700561542, + "grad_norm": 0.3641771674156189, + "learning_rate": 9.786755771567436e-05, + "loss": 1.9666, + "step": 361 + }, + { + "epoch": 0.10988010320230687, + "grad_norm": 0.41548213362693787, + "learning_rate": 9.786148238153099e-05, + "loss": 1.8781, + "step": 362 + }, + { + "epoch": 0.11018363939899833, + "grad_norm": 0.40850362181663513, + "learning_rate": 9.785540704738762e-05, + "loss": 1.8369, + "step": 363 + }, + { + "epoch": 0.11048717559568978, + "grad_norm": 0.3874415159225464, + "learning_rate": 9.784933171324423e-05, + "loss": 1.6867, + "step": 364 + }, + { + "epoch": 0.11079071179238124, + "grad_norm": 0.40616413950920105, + "learning_rate": 9.784325637910086e-05, + "loss": 1.7234, + "step": 365 + }, + { + "epoch": 0.11109424798907269, + "grad_norm": 0.7947202920913696, + "learning_rate": 9.783718104495749e-05, + "loss": 1.5327, + "step": 366 + }, + { + "epoch": 0.11139778418576415, + "grad_norm": 0.3792203664779663, + "learning_rate": 9.783110571081409e-05, + "loss": 1.5403, + "step": 367 + }, + { + "epoch": 0.1117013203824556, + "grad_norm": 0.4576598107814789, + "learning_rate": 9.782503037667072e-05, + "loss": 2.2472, + "step": 368 + }, + { + "epoch": 0.11200485657914706, + "grad_norm": 0.37935471534729004, + "learning_rate": 9.781895504252734e-05, + "loss": 1.8421, + "step": 369 + }, + { + "epoch": 0.11230839277583853, + "grad_norm": 0.3658997416496277, + "learning_rate": 9.781287970838397e-05, + "loss": 1.6879, + "step": 370 + }, + { + "epoch": 0.11261192897252997, + "grad_norm": 0.3936321437358856, + "learning_rate": 9.780680437424059e-05, + "loss": 2.0365, + "step": 371 + }, + { + "epoch": 0.11291546516922144, + "grad_norm": 0.3935524821281433, + "learning_rate": 9.78007290400972e-05, + "loss": 1.8648, + "step": 372 + }, + { + "epoch": 0.11321900136591288, + "grad_norm": 0.3798617422580719, + "learning_rate": 9.779465370595384e-05, + "loss": 2.0528, + "step": 373 + }, + { + "epoch": 0.11352253756260434, + "grad_norm": 0.38386639952659607, + "learning_rate": 9.778857837181045e-05, + "loss": 1.5629, + "step": 374 + }, + { + "epoch": 0.11382607375929579, + "grad_norm": 0.4665718674659729, + "learning_rate": 9.778250303766707e-05, + "loss": 1.6207, + "step": 375 + }, + { + "epoch": 0.11412960995598725, + "grad_norm": 0.35728296637535095, + "learning_rate": 9.77764277035237e-05, + "loss": 1.9548, + "step": 376 + }, + { + "epoch": 0.1144331461526787, + "grad_norm": 0.3415043354034424, + "learning_rate": 9.777035236938032e-05, + "loss": 2.0376, + "step": 377 + }, + { + "epoch": 0.11473668234937016, + "grad_norm": 0.38225334882736206, + "learning_rate": 9.776427703523694e-05, + "loss": 1.7175, + "step": 378 + }, + { + "epoch": 0.11504021854606161, + "grad_norm": 0.3931468427181244, + "learning_rate": 9.775820170109357e-05, + "loss": 2.1832, + "step": 379 + }, + { + "epoch": 0.11534375474275307, + "grad_norm": 0.3954283595085144, + "learning_rate": 9.77521263669502e-05, + "loss": 2.1863, + "step": 380 + }, + { + "epoch": 0.11564729093944452, + "grad_norm": 0.31073784828186035, + "learning_rate": 9.77460510328068e-05, + "loss": 1.8583, + "step": 381 + }, + { + "epoch": 0.11595082713613598, + "grad_norm": 0.37894561886787415, + "learning_rate": 9.773997569866343e-05, + "loss": 2.0385, + "step": 382 + }, + { + "epoch": 0.11625436333282745, + "grad_norm": 0.3493829369544983, + "learning_rate": 9.773390036452005e-05, + "loss": 1.8854, + "step": 383 + }, + { + "epoch": 0.1165578995295189, + "grad_norm": 0.5518279075622559, + "learning_rate": 9.772782503037668e-05, + "loss": 1.7403, + "step": 384 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.3724190294742584, + "learning_rate": 9.77217496962333e-05, + "loss": 1.7759, + "step": 385 + }, + { + "epoch": 0.1171649719229018, + "grad_norm": 0.4635847508907318, + "learning_rate": 9.771567436208991e-05, + "loss": 1.8441, + "step": 386 + }, + { + "epoch": 0.11746850811959327, + "grad_norm": 0.38281580805778503, + "learning_rate": 9.770959902794655e-05, + "loss": 2.0332, + "step": 387 + }, + { + "epoch": 0.11777204431628471, + "grad_norm": 0.4179950952529907, + "learning_rate": 9.770352369380316e-05, + "loss": 2.3451, + "step": 388 + }, + { + "epoch": 0.11807558051297617, + "grad_norm": 0.34729671478271484, + "learning_rate": 9.769744835965978e-05, + "loss": 1.9186, + "step": 389 + }, + { + "epoch": 0.11837911670966762, + "grad_norm": 0.40492531657218933, + "learning_rate": 9.769137302551641e-05, + "loss": 2.1711, + "step": 390 + }, + { + "epoch": 0.11868265290635908, + "grad_norm": 0.38143807649612427, + "learning_rate": 9.768529769137303e-05, + "loss": 1.7601, + "step": 391 + }, + { + "epoch": 0.11898618910305053, + "grad_norm": 0.35463643074035645, + "learning_rate": 9.767922235722965e-05, + "loss": 1.8068, + "step": 392 + }, + { + "epoch": 0.119289725299742, + "grad_norm": 0.3719339370727539, + "learning_rate": 9.767314702308628e-05, + "loss": 1.6296, + "step": 393 + }, + { + "epoch": 0.11959326149643346, + "grad_norm": 0.8585293292999268, + "learning_rate": 9.766707168894291e-05, + "loss": 1.8841, + "step": 394 + }, + { + "epoch": 0.1198967976931249, + "grad_norm": 0.5286839604377747, + "learning_rate": 9.766099635479951e-05, + "loss": 1.3645, + "step": 395 + }, + { + "epoch": 0.12020033388981637, + "grad_norm": 0.44176310300827026, + "learning_rate": 9.765492102065614e-05, + "loss": 2.1759, + "step": 396 + }, + { + "epoch": 0.12050387008650781, + "grad_norm": 0.39778321981430054, + "learning_rate": 9.764884568651276e-05, + "loss": 1.8344, + "step": 397 + }, + { + "epoch": 0.12080740628319928, + "grad_norm": 0.4364762604236603, + "learning_rate": 9.764277035236939e-05, + "loss": 1.7834, + "step": 398 + }, + { + "epoch": 0.12111094247989072, + "grad_norm": 0.37305301427841187, + "learning_rate": 9.763669501822601e-05, + "loss": 1.7474, + "step": 399 + }, + { + "epoch": 0.12141447867658219, + "grad_norm": 0.38804179430007935, + "learning_rate": 9.763061968408262e-05, + "loss": 1.6241, + "step": 400 + }, + { + "epoch": 0.12171801487327363, + "grad_norm": 0.9124923944473267, + "learning_rate": 9.762454434993926e-05, + "loss": 2.1456, + "step": 401 + }, + { + "epoch": 0.1220215510699651, + "grad_norm": 0.38728946447372437, + "learning_rate": 9.761846901579587e-05, + "loss": 1.9724, + "step": 402 + }, + { + "epoch": 0.12232508726665654, + "grad_norm": 0.4121726155281067, + "learning_rate": 9.761239368165249e-05, + "loss": 2.1119, + "step": 403 + }, + { + "epoch": 0.122628623463348, + "grad_norm": 0.46508121490478516, + "learning_rate": 9.760631834750912e-05, + "loss": 1.4402, + "step": 404 + }, + { + "epoch": 0.12293215966003945, + "grad_norm": 0.4460875391960144, + "learning_rate": 9.760024301336574e-05, + "loss": 2.0572, + "step": 405 + }, + { + "epoch": 0.12323569585673091, + "grad_norm": 0.38444089889526367, + "learning_rate": 9.759416767922236e-05, + "loss": 1.9943, + "step": 406 + }, + { + "epoch": 0.12353923205342238, + "grad_norm": 0.3515356779098511, + "learning_rate": 9.758809234507899e-05, + "loss": 1.5699, + "step": 407 + }, + { + "epoch": 0.12384276825011382, + "grad_norm": 0.4010019302368164, + "learning_rate": 9.758201701093562e-05, + "loss": 1.8674, + "step": 408 + }, + { + "epoch": 0.12414630444680529, + "grad_norm": 0.4250737428665161, + "learning_rate": 9.757594167679222e-05, + "loss": 1.0738, + "step": 409 + }, + { + "epoch": 0.12444984064349673, + "grad_norm": 0.3719541132450104, + "learning_rate": 9.756986634264885e-05, + "loss": 1.7292, + "step": 410 + }, + { + "epoch": 0.1247533768401882, + "grad_norm": 0.385420024394989, + "learning_rate": 9.756379100850547e-05, + "loss": 1.9503, + "step": 411 + }, + { + "epoch": 0.12505691303687966, + "grad_norm": 0.480056494474411, + "learning_rate": 9.75577156743621e-05, + "loss": 2.0093, + "step": 412 + }, + { + "epoch": 0.1253604492335711, + "grad_norm": 0.38757050037384033, + "learning_rate": 9.755164034021872e-05, + "loss": 2.2636, + "step": 413 + }, + { + "epoch": 0.12566398543026255, + "grad_norm": 0.3712436854839325, + "learning_rate": 9.754556500607533e-05, + "loss": 2.0479, + "step": 414 + }, + { + "epoch": 0.12596752162695402, + "grad_norm": 2.4313645362854004, + "learning_rate": 9.753948967193197e-05, + "loss": 2.376, + "step": 415 + }, + { + "epoch": 0.12627105782364548, + "grad_norm": 0.8028842210769653, + "learning_rate": 9.753341433778858e-05, + "loss": 1.7702, + "step": 416 + }, + { + "epoch": 0.1265745940203369, + "grad_norm": 0.48586025834083557, + "learning_rate": 9.75273390036452e-05, + "loss": 1.7623, + "step": 417 + }, + { + "epoch": 0.12687813021702837, + "grad_norm": 0.4017583131790161, + "learning_rate": 9.752126366950183e-05, + "loss": 1.8756, + "step": 418 + }, + { + "epoch": 0.12718166641371983, + "grad_norm": 0.3845275044441223, + "learning_rate": 9.751518833535845e-05, + "loss": 1.95, + "step": 419 + }, + { + "epoch": 0.1274852026104113, + "grad_norm": 0.41064974665641785, + "learning_rate": 9.750911300121507e-05, + "loss": 1.9622, + "step": 420 + }, + { + "epoch": 0.12778873880710276, + "grad_norm": 0.33571726083755493, + "learning_rate": 9.75030376670717e-05, + "loss": 1.7291, + "step": 421 + }, + { + "epoch": 0.1280922750037942, + "grad_norm": 0.3988417387008667, + "learning_rate": 9.749696233292833e-05, + "loss": 1.9349, + "step": 422 + }, + { + "epoch": 0.12839581120048565, + "grad_norm": 0.37586870789527893, + "learning_rate": 9.749088699878493e-05, + "loss": 1.5649, + "step": 423 + }, + { + "epoch": 0.12869934739717712, + "grad_norm": 0.37013643980026245, + "learning_rate": 9.748481166464156e-05, + "loss": 1.9448, + "step": 424 + }, + { + "epoch": 0.12900288359386858, + "grad_norm": 0.31406837701797485, + "learning_rate": 9.747873633049818e-05, + "loss": 1.8496, + "step": 425 + }, + { + "epoch": 0.12930641979056, + "grad_norm": 0.3691607415676117, + "learning_rate": 9.74726609963548e-05, + "loss": 2.0649, + "step": 426 + }, + { + "epoch": 0.12960995598725147, + "grad_norm": 0.39633169770240784, + "learning_rate": 9.746658566221143e-05, + "loss": 1.727, + "step": 427 + }, + { + "epoch": 0.12991349218394294, + "grad_norm": 0.47319236397743225, + "learning_rate": 9.746051032806804e-05, + "loss": 2.4487, + "step": 428 + }, + { + "epoch": 0.1302170283806344, + "grad_norm": 0.35506609082221985, + "learning_rate": 9.745443499392468e-05, + "loss": 2.0857, + "step": 429 + }, + { + "epoch": 0.13052056457732586, + "grad_norm": 0.38134855031967163, + "learning_rate": 9.744835965978129e-05, + "loss": 1.5862, + "step": 430 + }, + { + "epoch": 0.1308241007740173, + "grad_norm": 0.6288440227508545, + "learning_rate": 9.744228432563791e-05, + "loss": 2.2402, + "step": 431 + }, + { + "epoch": 0.13112763697070876, + "grad_norm": 0.42172396183013916, + "learning_rate": 9.743620899149454e-05, + "loss": 1.7245, + "step": 432 + }, + { + "epoch": 0.13143117316740022, + "grad_norm": 0.38452383875846863, + "learning_rate": 9.743013365735116e-05, + "loss": 1.6666, + "step": 433 + }, + { + "epoch": 0.13173470936409168, + "grad_norm": 2.0956268310546875, + "learning_rate": 9.742405832320778e-05, + "loss": 1.8268, + "step": 434 + }, + { + "epoch": 0.1320382455607831, + "grad_norm": 0.4363501965999603, + "learning_rate": 9.74179829890644e-05, + "loss": 2.3149, + "step": 435 + }, + { + "epoch": 0.13234178175747457, + "grad_norm": 0.38243743777275085, + "learning_rate": 9.741190765492104e-05, + "loss": 1.9706, + "step": 436 + }, + { + "epoch": 0.13264531795416604, + "grad_norm": 0.38724249601364136, + "learning_rate": 9.740583232077764e-05, + "loss": 1.779, + "step": 437 + }, + { + "epoch": 0.1329488541508575, + "grad_norm": 0.43606194853782654, + "learning_rate": 9.739975698663427e-05, + "loss": 2.0371, + "step": 438 + }, + { + "epoch": 0.13325239034754893, + "grad_norm": 0.3511301279067993, + "learning_rate": 9.739368165249089e-05, + "loss": 1.8771, + "step": 439 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.3883466124534607, + "learning_rate": 9.73876063183475e-05, + "loss": 2.1165, + "step": 440 + }, + { + "epoch": 0.13385946274093186, + "grad_norm": 0.41711342334747314, + "learning_rate": 9.738153098420414e-05, + "loss": 1.8367, + "step": 441 + }, + { + "epoch": 0.13416299893762332, + "grad_norm": 0.6146459579467773, + "learning_rate": 9.737545565006075e-05, + "loss": 1.9233, + "step": 442 + }, + { + "epoch": 0.13446653513431478, + "grad_norm": 0.37016820907592773, + "learning_rate": 9.736938031591739e-05, + "loss": 1.8804, + "step": 443 + }, + { + "epoch": 0.1347700713310062, + "grad_norm": 0.3620823323726654, + "learning_rate": 9.7363304981774e-05, + "loss": 1.5837, + "step": 444 + }, + { + "epoch": 0.13507360752769768, + "grad_norm": 0.37375590205192566, + "learning_rate": 9.735722964763062e-05, + "loss": 1.8095, + "step": 445 + }, + { + "epoch": 0.13537714372438914, + "grad_norm": 0.6026252508163452, + "learning_rate": 9.735115431348725e-05, + "loss": 2.041, + "step": 446 + }, + { + "epoch": 0.1356806799210806, + "grad_norm": 0.3753829002380371, + "learning_rate": 9.734507897934387e-05, + "loss": 2.0627, + "step": 447 + }, + { + "epoch": 0.13598421611777203, + "grad_norm": 0.3974304795265198, + "learning_rate": 9.733900364520049e-05, + "loss": 2.1671, + "step": 448 + }, + { + "epoch": 0.1362877523144635, + "grad_norm": 0.34336552023887634, + "learning_rate": 9.733292831105712e-05, + "loss": 1.9118, + "step": 449 + }, + { + "epoch": 0.13659128851115496, + "grad_norm": 0.3588969111442566, + "learning_rate": 9.732685297691373e-05, + "loss": 1.9768, + "step": 450 + }, + { + "epoch": 0.13689482470784642, + "grad_norm": 0.38693130016326904, + "learning_rate": 9.732077764277035e-05, + "loss": 2.1524, + "step": 451 + }, + { + "epoch": 0.13719836090453785, + "grad_norm": 0.370612233877182, + "learning_rate": 9.731470230862698e-05, + "loss": 1.8753, + "step": 452 + }, + { + "epoch": 0.13750189710122931, + "grad_norm": 0.4038615822792053, + "learning_rate": 9.73086269744836e-05, + "loss": 2.1024, + "step": 453 + }, + { + "epoch": 0.13780543329792078, + "grad_norm": 0.3728694021701813, + "learning_rate": 9.730255164034022e-05, + "loss": 1.8864, + "step": 454 + }, + { + "epoch": 0.13810896949461224, + "grad_norm": 0.37269484996795654, + "learning_rate": 9.729647630619685e-05, + "loss": 1.8244, + "step": 455 + }, + { + "epoch": 0.1384125056913037, + "grad_norm": 0.39840301871299744, + "learning_rate": 9.729040097205346e-05, + "loss": 1.8215, + "step": 456 + }, + { + "epoch": 0.13871604188799513, + "grad_norm": 0.39856579899787903, + "learning_rate": 9.72843256379101e-05, + "loss": 1.9475, + "step": 457 + }, + { + "epoch": 0.1390195780846866, + "grad_norm": 0.43041157722473145, + "learning_rate": 9.727825030376671e-05, + "loss": 1.87, + "step": 458 + }, + { + "epoch": 0.13932311428137806, + "grad_norm": 0.4047417640686035, + "learning_rate": 9.727217496962333e-05, + "loss": 2.1425, + "step": 459 + }, + { + "epoch": 0.13962665047806952, + "grad_norm": 0.3901901841163635, + "learning_rate": 9.726609963547996e-05, + "loss": 1.917, + "step": 460 + }, + { + "epoch": 0.13993018667476095, + "grad_norm": 0.40706855058670044, + "learning_rate": 9.726002430133658e-05, + "loss": 2.1278, + "step": 461 + }, + { + "epoch": 0.14023372287145242, + "grad_norm": 0.47956356406211853, + "learning_rate": 9.72539489671932e-05, + "loss": 1.2435, + "step": 462 + }, + { + "epoch": 0.14053725906814388, + "grad_norm": 0.35697320103645325, + "learning_rate": 9.724787363304983e-05, + "loss": 1.7965, + "step": 463 + }, + { + "epoch": 0.14084079526483534, + "grad_norm": 0.4102901518344879, + "learning_rate": 9.724179829890644e-05, + "loss": 1.7132, + "step": 464 + }, + { + "epoch": 0.1411443314615268, + "grad_norm": 0.3353058099746704, + "learning_rate": 9.723572296476306e-05, + "loss": 1.674, + "step": 465 + }, + { + "epoch": 0.14144786765821823, + "grad_norm": 0.3946186900138855, + "learning_rate": 9.722964763061969e-05, + "loss": 2.0067, + "step": 466 + }, + { + "epoch": 0.1417514038549097, + "grad_norm": 0.3974400460720062, + "learning_rate": 9.722357229647631e-05, + "loss": 1.7242, + "step": 467 + }, + { + "epoch": 0.14205494005160116, + "grad_norm": 1.3334546089172363, + "learning_rate": 9.721749696233293e-05, + "loss": 2.0141, + "step": 468 + }, + { + "epoch": 0.14235847624829262, + "grad_norm": 0.36386749148368835, + "learning_rate": 9.721142162818956e-05, + "loss": 1.8221, + "step": 469 + }, + { + "epoch": 0.14266201244498405, + "grad_norm": 0.35332655906677246, + "learning_rate": 9.720534629404617e-05, + "loss": 1.4251, + "step": 470 + }, + { + "epoch": 0.14296554864167552, + "grad_norm": 0.563428521156311, + "learning_rate": 9.71992709599028e-05, + "loss": 1.8633, + "step": 471 + }, + { + "epoch": 0.14326908483836698, + "grad_norm": 0.39971691370010376, + "learning_rate": 9.719319562575942e-05, + "loss": 2.0397, + "step": 472 + }, + { + "epoch": 0.14357262103505844, + "grad_norm": 0.39253416657447815, + "learning_rate": 9.718712029161604e-05, + "loss": 1.3104, + "step": 473 + }, + { + "epoch": 0.14387615723174987, + "grad_norm": 0.41245678067207336, + "learning_rate": 9.718104495747267e-05, + "loss": 1.8821, + "step": 474 + }, + { + "epoch": 0.14417969342844134, + "grad_norm": 0.5424125790596008, + "learning_rate": 9.717496962332929e-05, + "loss": 1.7739, + "step": 475 + }, + { + "epoch": 0.1444832296251328, + "grad_norm": 0.425329327583313, + "learning_rate": 9.71688942891859e-05, + "loss": 1.8047, + "step": 476 + }, + { + "epoch": 0.14478676582182426, + "grad_norm": 0.43624332547187805, + "learning_rate": 9.716281895504254e-05, + "loss": 1.5254, + "step": 477 + }, + { + "epoch": 0.14509030201851572, + "grad_norm": 0.4078616201877594, + "learning_rate": 9.715674362089915e-05, + "loss": 1.8286, + "step": 478 + }, + { + "epoch": 0.14539383821520716, + "grad_norm": 0.4144497513771057, + "learning_rate": 9.715066828675577e-05, + "loss": 2.0737, + "step": 479 + }, + { + "epoch": 0.14569737441189862, + "grad_norm": 0.37600383162498474, + "learning_rate": 9.71445929526124e-05, + "loss": 2.2512, + "step": 480 + }, + { + "epoch": 0.14600091060859008, + "grad_norm": 0.376644492149353, + "learning_rate": 9.713851761846902e-05, + "loss": 2.0374, + "step": 481 + }, + { + "epoch": 0.14630444680528154, + "grad_norm": 1.3389711380004883, + "learning_rate": 9.713244228432564e-05, + "loss": 1.7461, + "step": 482 + }, + { + "epoch": 0.14660798300197297, + "grad_norm": 0.32981812953948975, + "learning_rate": 9.712636695018227e-05, + "loss": 1.8383, + "step": 483 + }, + { + "epoch": 0.14691151919866444, + "grad_norm": 0.3440997004508972, + "learning_rate": 9.712029161603888e-05, + "loss": 1.6228, + "step": 484 + }, + { + "epoch": 0.1472150553953559, + "grad_norm": 0.3392031788825989, + "learning_rate": 9.711421628189552e-05, + "loss": 1.9423, + "step": 485 + }, + { + "epoch": 0.14751859159204736, + "grad_norm": 0.37523385882377625, + "learning_rate": 9.710814094775213e-05, + "loss": 2.1037, + "step": 486 + }, + { + "epoch": 0.1478221277887388, + "grad_norm": 0.36372002959251404, + "learning_rate": 9.710206561360875e-05, + "loss": 1.863, + "step": 487 + }, + { + "epoch": 0.14812566398543026, + "grad_norm": 0.3782525360584259, + "learning_rate": 9.709599027946538e-05, + "loss": 2.1418, + "step": 488 + }, + { + "epoch": 0.14842920018212172, + "grad_norm": 0.7462687492370605, + "learning_rate": 9.7089914945322e-05, + "loss": 1.8172, + "step": 489 + }, + { + "epoch": 0.14873273637881318, + "grad_norm": 0.3471963107585907, + "learning_rate": 9.708383961117862e-05, + "loss": 2.1015, + "step": 490 + }, + { + "epoch": 0.14903627257550464, + "grad_norm": 0.5325261950492859, + "learning_rate": 9.707776427703525e-05, + "loss": 2.05, + "step": 491 + }, + { + "epoch": 0.14933980877219608, + "grad_norm": 0.5748963952064514, + "learning_rate": 9.707168894289186e-05, + "loss": 2.1217, + "step": 492 + }, + { + "epoch": 0.14964334496888754, + "grad_norm": 0.3891385495662689, + "learning_rate": 9.706561360874848e-05, + "loss": 1.5301, + "step": 493 + }, + { + "epoch": 0.149946881165579, + "grad_norm": 0.48258477449417114, + "learning_rate": 9.705953827460511e-05, + "loss": 1.839, + "step": 494 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.5301745533943176, + "learning_rate": 9.705346294046173e-05, + "loss": 1.7934, + "step": 495 + }, + { + "epoch": 0.1505539535589619, + "grad_norm": 0.3614468276500702, + "learning_rate": 9.704738760631835e-05, + "loss": 1.7176, + "step": 496 + }, + { + "epoch": 0.15085748975565336, + "grad_norm": 0.31026577949523926, + "learning_rate": 9.704131227217498e-05, + "loss": 1.925, + "step": 497 + }, + { + "epoch": 0.15116102595234482, + "grad_norm": 0.37441205978393555, + "learning_rate": 9.70352369380316e-05, + "loss": 2.1532, + "step": 498 + }, + { + "epoch": 0.15146456214903628, + "grad_norm": 0.37447264790534973, + "learning_rate": 9.702916160388823e-05, + "loss": 2.0351, + "step": 499 + }, + { + "epoch": 0.15176809834572771, + "grad_norm": 0.7793715000152588, + "learning_rate": 9.702308626974484e-05, + "loss": 2.1011, + "step": 500 + }, + { + "epoch": 0.15207163454241918, + "grad_norm": 0.3725285232067108, + "learning_rate": 9.701701093560146e-05, + "loss": 2.1641, + "step": 501 + }, + { + "epoch": 0.15237517073911064, + "grad_norm": 0.35334041714668274, + "learning_rate": 9.701093560145809e-05, + "loss": 2.0773, + "step": 502 + }, + { + "epoch": 0.1526787069358021, + "grad_norm": 0.3819803297519684, + "learning_rate": 9.700486026731471e-05, + "loss": 1.836, + "step": 503 + }, + { + "epoch": 0.15298224313249356, + "grad_norm": 0.403060644865036, + "learning_rate": 9.699878493317133e-05, + "loss": 2.1178, + "step": 504 + }, + { + "epoch": 0.153285779329185, + "grad_norm": 0.2956171929836273, + "learning_rate": 9.699270959902796e-05, + "loss": 1.6397, + "step": 505 + }, + { + "epoch": 0.15358931552587646, + "grad_norm": 0.30349212884902954, + "learning_rate": 9.698663426488457e-05, + "loss": 1.6435, + "step": 506 + }, + { + "epoch": 0.15389285172256792, + "grad_norm": 0.38176605105400085, + "learning_rate": 9.698055893074119e-05, + "loss": 1.8004, + "step": 507 + }, + { + "epoch": 0.15419638791925938, + "grad_norm": 0.5072764158248901, + "learning_rate": 9.697448359659782e-05, + "loss": 1.6926, + "step": 508 + }, + { + "epoch": 0.15449992411595082, + "grad_norm": 0.5380321145057678, + "learning_rate": 9.696840826245444e-05, + "loss": 2.2396, + "step": 509 + }, + { + "epoch": 0.15480346031264228, + "grad_norm": 0.40872499346733093, + "learning_rate": 9.696233292831106e-05, + "loss": 1.8645, + "step": 510 + }, + { + "epoch": 0.15510699650933374, + "grad_norm": 0.9282563924789429, + "learning_rate": 9.695625759416769e-05, + "loss": 1.7143, + "step": 511 + }, + { + "epoch": 0.1554105327060252, + "grad_norm": 0.7355011105537415, + "learning_rate": 9.69501822600243e-05, + "loss": 1.7461, + "step": 512 + }, + { + "epoch": 0.15571406890271666, + "grad_norm": 0.9056992530822754, + "learning_rate": 9.694410692588092e-05, + "loss": 2.1454, + "step": 513 + }, + { + "epoch": 0.1560176050994081, + "grad_norm": 0.38970059156417847, + "learning_rate": 9.693803159173755e-05, + "loss": 2.3313, + "step": 514 + }, + { + "epoch": 0.15632114129609956, + "grad_norm": 0.3651840090751648, + "learning_rate": 9.693195625759417e-05, + "loss": 1.84, + "step": 515 + }, + { + "epoch": 0.15662467749279102, + "grad_norm": 0.38748839497566223, + "learning_rate": 9.69258809234508e-05, + "loss": 1.9666, + "step": 516 + }, + { + "epoch": 0.15692821368948248, + "grad_norm": 0.407427042722702, + "learning_rate": 9.691980558930742e-05, + "loss": 1.9351, + "step": 517 + }, + { + "epoch": 0.15723174988617392, + "grad_norm": 0.31920358538627625, + "learning_rate": 9.691373025516404e-05, + "loss": 1.7928, + "step": 518 + }, + { + "epoch": 0.15753528608286538, + "grad_norm": 0.4002731442451477, + "learning_rate": 9.690765492102067e-05, + "loss": 1.4087, + "step": 519 + }, + { + "epoch": 0.15783882227955684, + "grad_norm": 0.42125266790390015, + "learning_rate": 9.690157958687728e-05, + "loss": 1.6185, + "step": 520 + }, + { + "epoch": 0.1581423584762483, + "grad_norm": 0.3706381022930145, + "learning_rate": 9.68955042527339e-05, + "loss": 2.0546, + "step": 521 + }, + { + "epoch": 0.15844589467293974, + "grad_norm": 0.41669219732284546, + "learning_rate": 9.688942891859053e-05, + "loss": 2.0399, + "step": 522 + }, + { + "epoch": 0.1587494308696312, + "grad_norm": 0.36784589290618896, + "learning_rate": 9.688335358444715e-05, + "loss": 1.8182, + "step": 523 + }, + { + "epoch": 0.15905296706632266, + "grad_norm": 0.3830098807811737, + "learning_rate": 9.687727825030377e-05, + "loss": 2.0728, + "step": 524 + }, + { + "epoch": 0.15935650326301412, + "grad_norm": 0.37658411264419556, + "learning_rate": 9.68712029161604e-05, + "loss": 1.9915, + "step": 525 + }, + { + "epoch": 0.15966003945970558, + "grad_norm": 0.375053733587265, + "learning_rate": 9.686512758201701e-05, + "loss": 1.7589, + "step": 526 + }, + { + "epoch": 0.15996357565639702, + "grad_norm": 0.3810443580150604, + "learning_rate": 9.685905224787363e-05, + "loss": 1.6921, + "step": 527 + }, + { + "epoch": 0.16026711185308848, + "grad_norm": 0.41676437854766846, + "learning_rate": 9.685297691373026e-05, + "loss": 1.1885, + "step": 528 + }, + { + "epoch": 0.16057064804977994, + "grad_norm": 0.40823522210121155, + "learning_rate": 9.684690157958688e-05, + "loss": 1.8335, + "step": 529 + }, + { + "epoch": 0.1608741842464714, + "grad_norm": 0.3795296251773834, + "learning_rate": 9.684082624544351e-05, + "loss": 1.9209, + "step": 530 + }, + { + "epoch": 0.16117772044316284, + "grad_norm": 0.41227850317955017, + "learning_rate": 9.683475091130013e-05, + "loss": 1.7791, + "step": 531 + }, + { + "epoch": 0.1614812566398543, + "grad_norm": 0.436483234167099, + "learning_rate": 9.682867557715675e-05, + "loss": 1.8648, + "step": 532 + }, + { + "epoch": 0.16178479283654576, + "grad_norm": 0.43618106842041016, + "learning_rate": 9.682260024301338e-05, + "loss": 1.7593, + "step": 533 + }, + { + "epoch": 0.16208832903323722, + "grad_norm": 0.37166401743888855, + "learning_rate": 9.681652490887e-05, + "loss": 1.9301, + "step": 534 + }, + { + "epoch": 0.16239186522992866, + "grad_norm": 0.5716313123703003, + "learning_rate": 9.681044957472661e-05, + "loss": 2.1892, + "step": 535 + }, + { + "epoch": 0.16269540142662012, + "grad_norm": 0.4008532166481018, + "learning_rate": 9.680437424058324e-05, + "loss": 1.8818, + "step": 536 + }, + { + "epoch": 0.16299893762331158, + "grad_norm": 0.42276448011398315, + "learning_rate": 9.679829890643986e-05, + "loss": 1.4488, + "step": 537 + }, + { + "epoch": 0.16330247382000304, + "grad_norm": 0.343649685382843, + "learning_rate": 9.679222357229648e-05, + "loss": 1.9549, + "step": 538 + }, + { + "epoch": 0.1636060100166945, + "grad_norm": 0.3381790518760681, + "learning_rate": 9.678614823815311e-05, + "loss": 1.9949, + "step": 539 + }, + { + "epoch": 0.16390954621338594, + "grad_norm": 0.3788328468799591, + "learning_rate": 9.678007290400973e-05, + "loss": 2.0377, + "step": 540 + }, + { + "epoch": 0.1642130824100774, + "grad_norm": 0.39532333612442017, + "learning_rate": 9.677399756986634e-05, + "loss": 1.8837, + "step": 541 + }, + { + "epoch": 0.16451661860676886, + "grad_norm": 0.36701199412345886, + "learning_rate": 9.676792223572297e-05, + "loss": 1.9046, + "step": 542 + }, + { + "epoch": 0.16482015480346032, + "grad_norm": 0.4146950840950012, + "learning_rate": 9.676184690157959e-05, + "loss": 1.8369, + "step": 543 + }, + { + "epoch": 0.16512369100015176, + "grad_norm": 0.34827515482902527, + "learning_rate": 9.675577156743622e-05, + "loss": 2.0196, + "step": 544 + }, + { + "epoch": 0.16542722719684322, + "grad_norm": 0.36529168486595154, + "learning_rate": 9.674969623329283e-05, + "loss": 1.9189, + "step": 545 + }, + { + "epoch": 0.16573076339353468, + "grad_norm": 0.3718273639678955, + "learning_rate": 9.674362089914946e-05, + "loss": 1.933, + "step": 546 + }, + { + "epoch": 0.16603429959022614, + "grad_norm": 0.3853289484977722, + "learning_rate": 9.673754556500609e-05, + "loss": 1.9796, + "step": 547 + }, + { + "epoch": 0.16633783578691758, + "grad_norm": 0.38597023487091064, + "learning_rate": 9.67314702308627e-05, + "loss": 1.5167, + "step": 548 + }, + { + "epoch": 0.16664137198360904, + "grad_norm": 0.3792777955532074, + "learning_rate": 9.672539489671932e-05, + "loss": 1.9285, + "step": 549 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3506297171115875, + "learning_rate": 9.671931956257595e-05, + "loss": 1.8449, + "step": 550 + }, + { + "epoch": 0.16724844437699196, + "grad_norm": 0.3851594030857086, + "learning_rate": 9.671324422843257e-05, + "loss": 1.9774, + "step": 551 + }, + { + "epoch": 0.16755198057368342, + "grad_norm": 0.3438011705875397, + "learning_rate": 9.670716889428919e-05, + "loss": 2.0849, + "step": 552 + }, + { + "epoch": 0.16785551677037486, + "grad_norm": 0.3682856559753418, + "learning_rate": 9.670109356014582e-05, + "loss": 2.09, + "step": 553 + }, + { + "epoch": 0.16815905296706632, + "grad_norm": 0.5098361372947693, + "learning_rate": 9.669501822600244e-05, + "loss": 1.7843, + "step": 554 + }, + { + "epoch": 0.16846258916375778, + "grad_norm": 0.3482840359210968, + "learning_rate": 9.668894289185905e-05, + "loss": 1.5129, + "step": 555 + }, + { + "epoch": 0.16876612536044924, + "grad_norm": 0.3557680547237396, + "learning_rate": 9.668286755771568e-05, + "loss": 2.1033, + "step": 556 + }, + { + "epoch": 0.16906966155714068, + "grad_norm": 0.3622763752937317, + "learning_rate": 9.66767922235723e-05, + "loss": 1.7729, + "step": 557 + }, + { + "epoch": 0.16937319775383214, + "grad_norm": 0.3461545407772064, + "learning_rate": 9.667071688942893e-05, + "loss": 1.7999, + "step": 558 + }, + { + "epoch": 0.1696767339505236, + "grad_norm": 0.39597707986831665, + "learning_rate": 9.666464155528554e-05, + "loss": 1.8805, + "step": 559 + }, + { + "epoch": 0.16998027014721506, + "grad_norm": 0.39985769987106323, + "learning_rate": 9.665856622114217e-05, + "loss": 1.8858, + "step": 560 + }, + { + "epoch": 0.17028380634390652, + "grad_norm": 0.352029025554657, + "learning_rate": 9.66524908869988e-05, + "loss": 1.8077, + "step": 561 + }, + { + "epoch": 0.17058734254059796, + "grad_norm": 0.5778902173042297, + "learning_rate": 9.66464155528554e-05, + "loss": 1.7001, + "step": 562 + }, + { + "epoch": 0.17089087873728942, + "grad_norm": 0.49807438254356384, + "learning_rate": 9.664034021871203e-05, + "loss": 1.7536, + "step": 563 + }, + { + "epoch": 0.17119441493398088, + "grad_norm": 0.6479670405387878, + "learning_rate": 9.663426488456866e-05, + "loss": 1.7827, + "step": 564 + }, + { + "epoch": 0.17149795113067234, + "grad_norm": 0.4150646924972534, + "learning_rate": 9.662818955042528e-05, + "loss": 2.093, + "step": 565 + }, + { + "epoch": 0.17180148732736378, + "grad_norm": 0.4018631875514984, + "learning_rate": 9.66221142162819e-05, + "loss": 1.9505, + "step": 566 + }, + { + "epoch": 0.17210502352405524, + "grad_norm": 0.34705424308776855, + "learning_rate": 9.661603888213853e-05, + "loss": 2.0355, + "step": 567 + }, + { + "epoch": 0.1724085597207467, + "grad_norm": 0.40585950016975403, + "learning_rate": 9.660996354799515e-05, + "loss": 1.5372, + "step": 568 + }, + { + "epoch": 0.17271209591743816, + "grad_norm": 0.31652507185935974, + "learning_rate": 9.660388821385176e-05, + "loss": 1.6156, + "step": 569 + }, + { + "epoch": 0.1730156321141296, + "grad_norm": 0.40016746520996094, + "learning_rate": 9.65978128797084e-05, + "loss": 1.6054, + "step": 570 + }, + { + "epoch": 0.17331916831082106, + "grad_norm": 0.3570103943347931, + "learning_rate": 9.659173754556501e-05, + "loss": 1.863, + "step": 571 + }, + { + "epoch": 0.17362270450751252, + "grad_norm": 0.3687574565410614, + "learning_rate": 9.658566221142164e-05, + "loss": 1.6009, + "step": 572 + }, + { + "epoch": 0.17392624070420398, + "grad_norm": 0.35581347346305847, + "learning_rate": 9.657958687727825e-05, + "loss": 2.2078, + "step": 573 + }, + { + "epoch": 0.17422977690089544, + "grad_norm": 0.44036948680877686, + "learning_rate": 9.657351154313488e-05, + "loss": 1.7993, + "step": 574 + }, + { + "epoch": 0.17453331309758688, + "grad_norm": 0.3790392577648163, + "learning_rate": 9.656743620899151e-05, + "loss": 1.9854, + "step": 575 + }, + { + "epoch": 0.17483684929427834, + "grad_norm": 0.37755638360977173, + "learning_rate": 9.656136087484811e-05, + "loss": 1.8991, + "step": 576 + }, + { + "epoch": 0.1751403854909698, + "grad_norm": 0.37551677227020264, + "learning_rate": 9.655528554070474e-05, + "loss": 1.5446, + "step": 577 + }, + { + "epoch": 0.17544392168766126, + "grad_norm": 0.36646074056625366, + "learning_rate": 9.654921020656137e-05, + "loss": 1.6491, + "step": 578 + }, + { + "epoch": 0.1757474578843527, + "grad_norm": 0.40674564242362976, + "learning_rate": 9.654313487241799e-05, + "loss": 2.0545, + "step": 579 + }, + { + "epoch": 0.17605099408104416, + "grad_norm": 0.40863969922065735, + "learning_rate": 9.653705953827461e-05, + "loss": 1.6848, + "step": 580 + }, + { + "epoch": 0.17635453027773562, + "grad_norm": 0.3962380886077881, + "learning_rate": 9.653098420413124e-05, + "loss": 1.8451, + "step": 581 + }, + { + "epoch": 0.17665806647442708, + "grad_norm": 0.3665854334831238, + "learning_rate": 9.652490886998786e-05, + "loss": 2.0664, + "step": 582 + }, + { + "epoch": 0.17696160267111852, + "grad_norm": 0.3678790032863617, + "learning_rate": 9.651883353584447e-05, + "loss": 1.9612, + "step": 583 + }, + { + "epoch": 0.17726513886780998, + "grad_norm": 0.37832558155059814, + "learning_rate": 9.65127582017011e-05, + "loss": 2.3693, + "step": 584 + }, + { + "epoch": 0.17756867506450144, + "grad_norm": 0.37538209557533264, + "learning_rate": 9.650668286755772e-05, + "loss": 1.9305, + "step": 585 + }, + { + "epoch": 0.1778722112611929, + "grad_norm": 0.4227273166179657, + "learning_rate": 9.650060753341434e-05, + "loss": 2.1853, + "step": 586 + }, + { + "epoch": 0.17817574745788436, + "grad_norm": 0.35160574316978455, + "learning_rate": 9.649453219927096e-05, + "loss": 1.6491, + "step": 587 + }, + { + "epoch": 0.1784792836545758, + "grad_norm": 0.3960542380809784, + "learning_rate": 9.648845686512759e-05, + "loss": 2.052, + "step": 588 + }, + { + "epoch": 0.17878281985126726, + "grad_norm": 0.4215950667858124, + "learning_rate": 9.648238153098422e-05, + "loss": 1.4572, + "step": 589 + }, + { + "epoch": 0.17908635604795872, + "grad_norm": 0.35461676120758057, + "learning_rate": 9.647630619684082e-05, + "loss": 1.8047, + "step": 590 + }, + { + "epoch": 0.17938989224465018, + "grad_norm": 0.3570484220981598, + "learning_rate": 9.647023086269745e-05, + "loss": 1.5481, + "step": 591 + }, + { + "epoch": 0.17969342844134162, + "grad_norm": 0.3583620488643646, + "learning_rate": 9.646415552855408e-05, + "loss": 1.6005, + "step": 592 + }, + { + "epoch": 0.17999696463803308, + "grad_norm": 0.3991422653198242, + "learning_rate": 9.64580801944107e-05, + "loss": 1.5039, + "step": 593 + }, + { + "epoch": 0.18030050083472454, + "grad_norm": 0.4470183551311493, + "learning_rate": 9.645200486026732e-05, + "loss": 1.9181, + "step": 594 + }, + { + "epoch": 0.180604037031416, + "grad_norm": 0.6622103452682495, + "learning_rate": 9.644592952612395e-05, + "loss": 1.9772, + "step": 595 + }, + { + "epoch": 0.18090757322810747, + "grad_norm": 0.33143168687820435, + "learning_rate": 9.643985419198057e-05, + "loss": 1.9468, + "step": 596 + }, + { + "epoch": 0.1812111094247989, + "grad_norm": 0.3072865903377533, + "learning_rate": 9.643377885783718e-05, + "loss": 1.8147, + "step": 597 + }, + { + "epoch": 0.18151464562149036, + "grad_norm": 0.5602253079414368, + "learning_rate": 9.642770352369381e-05, + "loss": 2.069, + "step": 598 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.29752591252326965, + "learning_rate": 9.642162818955043e-05, + "loss": 2.039, + "step": 599 + }, + { + "epoch": 0.18212171801487329, + "grad_norm": 0.37780001759529114, + "learning_rate": 9.641555285540705e-05, + "loss": 1.8751, + "step": 600 + }, + { + "epoch": 0.18242525421156472, + "grad_norm": 0.40171170234680176, + "learning_rate": 9.640947752126367e-05, + "loss": 1.8439, + "step": 601 + }, + { + "epoch": 0.18272879040825618, + "grad_norm": 0.3567606210708618, + "learning_rate": 9.64034021871203e-05, + "loss": 1.7814, + "step": 602 + }, + { + "epoch": 0.18303232660494764, + "grad_norm": 0.3742719888687134, + "learning_rate": 9.639732685297693e-05, + "loss": 2.3184, + "step": 603 + }, + { + "epoch": 0.1833358628016391, + "grad_norm": 0.3715921938419342, + "learning_rate": 9.639125151883353e-05, + "loss": 1.9357, + "step": 604 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.4021666347980499, + "learning_rate": 9.638517618469016e-05, + "loss": 2.0251, + "step": 605 + }, + { + "epoch": 0.183942935195022, + "grad_norm": 0.3618490695953369, + "learning_rate": 9.637910085054679e-05, + "loss": 1.8889, + "step": 606 + }, + { + "epoch": 0.18424647139171346, + "grad_norm": 0.3828302025794983, + "learning_rate": 9.637302551640341e-05, + "loss": 1.8222, + "step": 607 + }, + { + "epoch": 0.18455000758840492, + "grad_norm": 0.3974449336528778, + "learning_rate": 9.636695018226003e-05, + "loss": 2.0484, + "step": 608 + }, + { + "epoch": 0.18485354378509639, + "grad_norm": 0.4195742607116699, + "learning_rate": 9.636087484811666e-05, + "loss": 1.8497, + "step": 609 + }, + { + "epoch": 0.18515707998178782, + "grad_norm": 0.3607097864151001, + "learning_rate": 9.635479951397328e-05, + "loss": 1.7574, + "step": 610 + }, + { + "epoch": 0.18546061617847928, + "grad_norm": 0.39163729548454285, + "learning_rate": 9.634872417982989e-05, + "loss": 1.8299, + "step": 611 + }, + { + "epoch": 0.18576415237517074, + "grad_norm": 0.4060773551464081, + "learning_rate": 9.634264884568652e-05, + "loss": 1.8041, + "step": 612 + }, + { + "epoch": 0.1860676885718622, + "grad_norm": 0.34089556336402893, + "learning_rate": 9.633657351154314e-05, + "loss": 1.7692, + "step": 613 + }, + { + "epoch": 0.18637122476855364, + "grad_norm": 0.33763736486434937, + "learning_rate": 9.633049817739976e-05, + "loss": 2.0368, + "step": 614 + }, + { + "epoch": 0.1866747609652451, + "grad_norm": 0.3397420644760132, + "learning_rate": 9.632442284325638e-05, + "loss": 1.9108, + "step": 615 + }, + { + "epoch": 0.18697829716193656, + "grad_norm": 0.387208491563797, + "learning_rate": 9.6318347509113e-05, + "loss": 1.7698, + "step": 616 + }, + { + "epoch": 0.18728183335862802, + "grad_norm": 0.4464956820011139, + "learning_rate": 9.631227217496964e-05, + "loss": 1.7602, + "step": 617 + }, + { + "epoch": 0.18758536955531946, + "grad_norm": 0.34682369232177734, + "learning_rate": 9.630619684082624e-05, + "loss": 1.6602, + "step": 618 + }, + { + "epoch": 0.18788890575201092, + "grad_norm": 0.8865132331848145, + "learning_rate": 9.630012150668287e-05, + "loss": 2.1785, + "step": 619 + }, + { + "epoch": 0.18819244194870238, + "grad_norm": 0.32631704211235046, + "learning_rate": 9.62940461725395e-05, + "loss": 1.5203, + "step": 620 + }, + { + "epoch": 0.18849597814539384, + "grad_norm": 0.3332744836807251, + "learning_rate": 9.628797083839612e-05, + "loss": 1.8768, + "step": 621 + }, + { + "epoch": 0.1887995143420853, + "grad_norm": 0.43369799852371216, + "learning_rate": 9.628189550425274e-05, + "loss": 2.088, + "step": 622 + }, + { + "epoch": 0.18910305053877674, + "grad_norm": 0.34848517179489136, + "learning_rate": 9.627582017010937e-05, + "loss": 1.7685, + "step": 623 + }, + { + "epoch": 0.1894065867354682, + "grad_norm": 0.40195101499557495, + "learning_rate": 9.626974483596599e-05, + "loss": 2.0937, + "step": 624 + }, + { + "epoch": 0.18971012293215966, + "grad_norm": 0.3992537558078766, + "learning_rate": 9.62636695018226e-05, + "loss": 1.7799, + "step": 625 + }, + { + "epoch": 0.19001365912885113, + "grad_norm": 0.41797420382499695, + "learning_rate": 9.625759416767922e-05, + "loss": 1.92, + "step": 626 + }, + { + "epoch": 0.19031719532554256, + "grad_norm": 0.37648969888687134, + "learning_rate": 9.625151883353585e-05, + "loss": 2.0243, + "step": 627 + }, + { + "epoch": 0.19062073152223402, + "grad_norm": 0.3513944447040558, + "learning_rate": 9.624544349939247e-05, + "loss": 2.0676, + "step": 628 + }, + { + "epoch": 0.19092426771892548, + "grad_norm": 0.3975341022014618, + "learning_rate": 9.623936816524909e-05, + "loss": 1.8837, + "step": 629 + }, + { + "epoch": 0.19122780391561695, + "grad_norm": 0.3954656422138214, + "learning_rate": 9.623329283110572e-05, + "loss": 1.9648, + "step": 630 + }, + { + "epoch": 0.19153134011230838, + "grad_norm": 0.32590335607528687, + "learning_rate": 9.622721749696235e-05, + "loss": 2.0704, + "step": 631 + }, + { + "epoch": 0.19183487630899984, + "grad_norm": 0.7592522501945496, + "learning_rate": 9.622114216281895e-05, + "loss": 1.8721, + "step": 632 + }, + { + "epoch": 0.1921384125056913, + "grad_norm": 0.4411126375198364, + "learning_rate": 9.621506682867558e-05, + "loss": 1.963, + "step": 633 + }, + { + "epoch": 0.19244194870238276, + "grad_norm": 0.37471216917037964, + "learning_rate": 9.620899149453221e-05, + "loss": 2.0679, + "step": 634 + }, + { + "epoch": 0.19274548489907423, + "grad_norm": 0.39219167828559875, + "learning_rate": 9.620291616038882e-05, + "loss": 2.0597, + "step": 635 + }, + { + "epoch": 0.19304902109576566, + "grad_norm": 0.3059561550617218, + "learning_rate": 9.619684082624545e-05, + "loss": 1.7527, + "step": 636 + }, + { + "epoch": 0.19335255729245712, + "grad_norm": 0.3843368589878082, + "learning_rate": 9.619076549210208e-05, + "loss": 1.9737, + "step": 637 + }, + { + "epoch": 0.19365609348914858, + "grad_norm": 0.3923681378364563, + "learning_rate": 9.61846901579587e-05, + "loss": 1.7179, + "step": 638 + }, + { + "epoch": 0.19395962968584005, + "grad_norm": 0.4614477753639221, + "learning_rate": 9.617861482381531e-05, + "loss": 1.2844, + "step": 639 + }, + { + "epoch": 0.19426316588253148, + "grad_norm": 0.3571571409702301, + "learning_rate": 9.617253948967193e-05, + "loss": 2.1014, + "step": 640 + }, + { + "epoch": 0.19456670207922294, + "grad_norm": 0.40552857518196106, + "learning_rate": 9.616646415552856e-05, + "loss": 1.8588, + "step": 641 + }, + { + "epoch": 0.1948702382759144, + "grad_norm": 0.3739052414894104, + "learning_rate": 9.616038882138518e-05, + "loss": 2.2262, + "step": 642 + }, + { + "epoch": 0.19517377447260587, + "grad_norm": 0.33607810735702515, + "learning_rate": 9.61543134872418e-05, + "loss": 1.8196, + "step": 643 + }, + { + "epoch": 0.19547731066929733, + "grad_norm": 0.3751862645149231, + "learning_rate": 9.614823815309843e-05, + "loss": 1.7183, + "step": 644 + }, + { + "epoch": 0.19578084686598876, + "grad_norm": 0.4978049397468567, + "learning_rate": 9.614216281895506e-05, + "loss": 1.6918, + "step": 645 + }, + { + "epoch": 0.19608438306268022, + "grad_norm": 0.3755020499229431, + "learning_rate": 9.613608748481166e-05, + "loss": 2.0704, + "step": 646 + }, + { + "epoch": 0.19638791925937168, + "grad_norm": 0.3641931414604187, + "learning_rate": 9.613001215066829e-05, + "loss": 1.8087, + "step": 647 + }, + { + "epoch": 0.19669145545606315, + "grad_norm": 0.32229694724082947, + "learning_rate": 9.612393681652492e-05, + "loss": 1.9157, + "step": 648 + }, + { + "epoch": 0.19699499165275458, + "grad_norm": 0.4132642149925232, + "learning_rate": 9.611786148238153e-05, + "loss": 1.6715, + "step": 649 + }, + { + "epoch": 0.19729852784944604, + "grad_norm": 0.38652992248535156, + "learning_rate": 9.611178614823816e-05, + "loss": 1.814, + "step": 650 + }, + { + "epoch": 0.1976020640461375, + "grad_norm": 0.432373970746994, + "learning_rate": 9.610571081409479e-05, + "loss": 1.8719, + "step": 651 + }, + { + "epoch": 0.19790560024282897, + "grad_norm": 0.4002588987350464, + "learning_rate": 9.60996354799514e-05, + "loss": 1.9697, + "step": 652 + }, + { + "epoch": 0.1982091364395204, + "grad_norm": 0.3377281427383423, + "learning_rate": 9.609356014580802e-05, + "loss": 2.0114, + "step": 653 + }, + { + "epoch": 0.19851267263621186, + "grad_norm": 0.40434688329696655, + "learning_rate": 9.608748481166464e-05, + "loss": 2.2406, + "step": 654 + }, + { + "epoch": 0.19881620883290332, + "grad_norm": 0.33377552032470703, + "learning_rate": 9.608140947752127e-05, + "loss": 1.8749, + "step": 655 + }, + { + "epoch": 0.19911974502959479, + "grad_norm": 0.36146265268325806, + "learning_rate": 9.607533414337789e-05, + "loss": 1.5821, + "step": 656 + }, + { + "epoch": 0.19942328122628625, + "grad_norm": 1.8864728212356567, + "learning_rate": 9.60692588092345e-05, + "loss": 1.9895, + "step": 657 + }, + { + "epoch": 0.19972681742297768, + "grad_norm": 0.35898399353027344, + "learning_rate": 9.606318347509114e-05, + "loss": 1.6315, + "step": 658 + }, + { + "epoch": 0.20003035361966914, + "grad_norm": 0.44391825795173645, + "learning_rate": 9.605710814094775e-05, + "loss": 1.5046, + "step": 659 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.37957173585891724, + "learning_rate": 9.605103280680437e-05, + "loss": 1.8532, + "step": 660 + }, + { + "epoch": 0.20063742601305207, + "grad_norm": 1.4045872688293457, + "learning_rate": 9.6044957472661e-05, + "loss": 2.1214, + "step": 661 + }, + { + "epoch": 0.2009409622097435, + "grad_norm": 0.4327601194381714, + "learning_rate": 9.603888213851763e-05, + "loss": 2.1803, + "step": 662 + }, + { + "epoch": 0.20124449840643496, + "grad_norm": 3.2122411727905273, + "learning_rate": 9.603280680437424e-05, + "loss": 1.989, + "step": 663 + }, + { + "epoch": 0.20154803460312642, + "grad_norm": 0.4538092017173767, + "learning_rate": 9.602673147023087e-05, + "loss": 1.7016, + "step": 664 + }, + { + "epoch": 0.2018515707998179, + "grad_norm": 0.5968027710914612, + "learning_rate": 9.60206561360875e-05, + "loss": 1.8538, + "step": 665 + }, + { + "epoch": 0.20215510699650932, + "grad_norm": 0.3749493956565857, + "learning_rate": 9.601458080194412e-05, + "loss": 1.8747, + "step": 666 + }, + { + "epoch": 0.20245864319320078, + "grad_norm": 0.3741036355495453, + "learning_rate": 9.600850546780073e-05, + "loss": 1.9896, + "step": 667 + }, + { + "epoch": 0.20276217938989224, + "grad_norm": 0.3708043098449707, + "learning_rate": 9.600243013365735e-05, + "loss": 1.7925, + "step": 668 + }, + { + "epoch": 0.2030657155865837, + "grad_norm": 0.32813695073127747, + "learning_rate": 9.599635479951398e-05, + "loss": 1.8272, + "step": 669 + }, + { + "epoch": 0.20336925178327517, + "grad_norm": 0.6151819229125977, + "learning_rate": 9.59902794653706e-05, + "loss": 1.927, + "step": 670 + }, + { + "epoch": 0.2036727879799666, + "grad_norm": 0.42905279994010925, + "learning_rate": 9.598420413122722e-05, + "loss": 1.5442, + "step": 671 + }, + { + "epoch": 0.20397632417665806, + "grad_norm": 0.4102342426776886, + "learning_rate": 9.597812879708385e-05, + "loss": 1.2097, + "step": 672 + }, + { + "epoch": 0.20427986037334953, + "grad_norm": 0.392560750246048, + "learning_rate": 9.597205346294046e-05, + "loss": 1.8166, + "step": 673 + }, + { + "epoch": 0.204583396570041, + "grad_norm": 0.4056089222431183, + "learning_rate": 9.596597812879708e-05, + "loss": 1.5171, + "step": 674 + }, + { + "epoch": 0.20488693276673242, + "grad_norm": 0.4734075963497162, + "learning_rate": 9.595990279465371e-05, + "loss": 1.4529, + "step": 675 + }, + { + "epoch": 0.20519046896342388, + "grad_norm": 0.41490182280540466, + "learning_rate": 9.595382746051034e-05, + "loss": 1.8316, + "step": 676 + }, + { + "epoch": 0.20549400516011535, + "grad_norm": 0.3590947091579437, + "learning_rate": 9.594775212636695e-05, + "loss": 1.9073, + "step": 677 + }, + { + "epoch": 0.2057975413568068, + "grad_norm": 0.3779642581939697, + "learning_rate": 9.594167679222358e-05, + "loss": 1.9669, + "step": 678 + }, + { + "epoch": 0.20610107755349824, + "grad_norm": 0.39710256457328796, + "learning_rate": 9.593560145808021e-05, + "loss": 2.0278, + "step": 679 + }, + { + "epoch": 0.2064046137501897, + "grad_norm": 0.4168045222759247, + "learning_rate": 9.592952612393683e-05, + "loss": 1.5158, + "step": 680 + }, + { + "epoch": 0.20670814994688116, + "grad_norm": 0.3751262426376343, + "learning_rate": 9.592345078979344e-05, + "loss": 2.1459, + "step": 681 + }, + { + "epoch": 0.20701168614357263, + "grad_norm": 0.49441012740135193, + "learning_rate": 9.591737545565006e-05, + "loss": 1.8968, + "step": 682 + }, + { + "epoch": 0.2073152223402641, + "grad_norm": 0.4807801842689514, + "learning_rate": 9.591130012150669e-05, + "loss": 2.2741, + "step": 683 + }, + { + "epoch": 0.20761875853695552, + "grad_norm": 0.3886473774909973, + "learning_rate": 9.590522478736331e-05, + "loss": 1.4511, + "step": 684 + }, + { + "epoch": 0.20792229473364698, + "grad_norm": 0.6425371170043945, + "learning_rate": 9.589914945321993e-05, + "loss": 2.2643, + "step": 685 + }, + { + "epoch": 0.20822583093033845, + "grad_norm": 0.37720414996147156, + "learning_rate": 9.589307411907656e-05, + "loss": 1.8431, + "step": 686 + }, + { + "epoch": 0.2085293671270299, + "grad_norm": 0.35544151067733765, + "learning_rate": 9.588699878493317e-05, + "loss": 1.9822, + "step": 687 + }, + { + "epoch": 0.20883290332372134, + "grad_norm": 0.561444878578186, + "learning_rate": 9.588092345078979e-05, + "loss": 1.9507, + "step": 688 + }, + { + "epoch": 0.2091364395204128, + "grad_norm": 0.37386366724967957, + "learning_rate": 9.587484811664642e-05, + "loss": 2.0673, + "step": 689 + }, + { + "epoch": 0.20943997571710427, + "grad_norm": 0.3882986307144165, + "learning_rate": 9.586877278250305e-05, + "loss": 2.0166, + "step": 690 + }, + { + "epoch": 0.20974351191379573, + "grad_norm": 0.41471484303474426, + "learning_rate": 9.586269744835966e-05, + "loss": 1.7111, + "step": 691 + }, + { + "epoch": 0.2100470481104872, + "grad_norm": 0.467939555644989, + "learning_rate": 9.585662211421629e-05, + "loss": 1.6116, + "step": 692 + }, + { + "epoch": 0.21035058430717862, + "grad_norm": 0.905303955078125, + "learning_rate": 9.585054678007292e-05, + "loss": 2.0287, + "step": 693 + }, + { + "epoch": 0.21065412050387008, + "grad_norm": 0.3820960819721222, + "learning_rate": 9.584447144592954e-05, + "loss": 1.7834, + "step": 694 + }, + { + "epoch": 0.21095765670056155, + "grad_norm": 0.4265238642692566, + "learning_rate": 9.583839611178615e-05, + "loss": 1.5176, + "step": 695 + }, + { + "epoch": 0.211261192897253, + "grad_norm": 0.30739274621009827, + "learning_rate": 9.583232077764277e-05, + "loss": 1.3758, + "step": 696 + }, + { + "epoch": 0.21156472909394444, + "grad_norm": 0.3890193998813629, + "learning_rate": 9.58262454434994e-05, + "loss": 1.8188, + "step": 697 + }, + { + "epoch": 0.2118682652906359, + "grad_norm": 0.3726442754268646, + "learning_rate": 9.582017010935602e-05, + "loss": 1.8957, + "step": 698 + }, + { + "epoch": 0.21217180148732737, + "grad_norm": 0.43913599848747253, + "learning_rate": 9.581409477521264e-05, + "loss": 1.8882, + "step": 699 + }, + { + "epoch": 0.21247533768401883, + "grad_norm": 0.4009544253349304, + "learning_rate": 9.580801944106927e-05, + "loss": 1.8844, + "step": 700 + }, + { + "epoch": 0.21277887388071026, + "grad_norm": 0.6625222563743591, + "learning_rate": 9.580194410692588e-05, + "loss": 1.8564, + "step": 701 + }, + { + "epoch": 0.21308241007740172, + "grad_norm": 0.3108811676502228, + "learning_rate": 9.57958687727825e-05, + "loss": 1.5883, + "step": 702 + }, + { + "epoch": 0.21338594627409319, + "grad_norm": 0.35348960757255554, + "learning_rate": 9.578979343863913e-05, + "loss": 1.6062, + "step": 703 + }, + { + "epoch": 0.21368948247078465, + "grad_norm": 0.3460123538970947, + "learning_rate": 9.578371810449576e-05, + "loss": 1.3375, + "step": 704 + }, + { + "epoch": 0.2139930186674761, + "grad_norm": 0.3396036922931671, + "learning_rate": 9.577764277035237e-05, + "loss": 2.007, + "step": 705 + }, + { + "epoch": 0.21429655486416754, + "grad_norm": 0.38622626662254333, + "learning_rate": 9.5771567436209e-05, + "loss": 1.7612, + "step": 706 + }, + { + "epoch": 0.214600091060859, + "grad_norm": 0.39317429065704346, + "learning_rate": 9.576549210206563e-05, + "loss": 1.7884, + "step": 707 + }, + { + "epoch": 0.21490362725755047, + "grad_norm": 0.3730657994747162, + "learning_rate": 9.575941676792223e-05, + "loss": 1.9813, + "step": 708 + }, + { + "epoch": 0.21520716345424193, + "grad_norm": 0.3781750202178955, + "learning_rate": 9.575334143377886e-05, + "loss": 1.8843, + "step": 709 + }, + { + "epoch": 0.21551069965093336, + "grad_norm": 0.4760946035385132, + "learning_rate": 9.574726609963548e-05, + "loss": 2.0092, + "step": 710 + }, + { + "epoch": 0.21581423584762482, + "grad_norm": 0.4052627980709076, + "learning_rate": 9.574119076549211e-05, + "loss": 1.9742, + "step": 711 + }, + { + "epoch": 0.2161177720443163, + "grad_norm": 0.34289002418518066, + "learning_rate": 9.573511543134873e-05, + "loss": 2.0408, + "step": 712 + }, + { + "epoch": 0.21642130824100775, + "grad_norm": 0.3720855116844177, + "learning_rate": 9.572904009720535e-05, + "loss": 1.6685, + "step": 713 + }, + { + "epoch": 0.21672484443769918, + "grad_norm": 0.4015984535217285, + "learning_rate": 9.572296476306198e-05, + "loss": 1.9651, + "step": 714 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.45196712017059326, + "learning_rate": 9.57168894289186e-05, + "loss": 2.0588, + "step": 715 + }, + { + "epoch": 0.2173319168310821, + "grad_norm": 0.3434293866157532, + "learning_rate": 9.571081409477521e-05, + "loss": 1.748, + "step": 716 + }, + { + "epoch": 0.21763545302777357, + "grad_norm": 0.29288217425346375, + "learning_rate": 9.570473876063184e-05, + "loss": 2.0274, + "step": 717 + }, + { + "epoch": 0.21793898922446503, + "grad_norm": 0.4192684590816498, + "learning_rate": 9.569866342648847e-05, + "loss": 1.3597, + "step": 718 + }, + { + "epoch": 0.21824252542115646, + "grad_norm": 0.4450276494026184, + "learning_rate": 9.569258809234508e-05, + "loss": 1.4675, + "step": 719 + }, + { + "epoch": 0.21854606161784793, + "grad_norm": 0.33388352394104004, + "learning_rate": 9.568651275820171e-05, + "loss": 1.9747, + "step": 720 + }, + { + "epoch": 0.2188495978145394, + "grad_norm": 0.34411269426345825, + "learning_rate": 9.568043742405832e-05, + "loss": 1.9119, + "step": 721 + }, + { + "epoch": 0.21915313401123085, + "grad_norm": 0.38926756381988525, + "learning_rate": 9.567436208991494e-05, + "loss": 1.8859, + "step": 722 + }, + { + "epoch": 0.21945667020792228, + "grad_norm": 0.3513714671134949, + "learning_rate": 9.566828675577157e-05, + "loss": 1.9125, + "step": 723 + }, + { + "epoch": 0.21976020640461374, + "grad_norm": 0.9200549721717834, + "learning_rate": 9.566221142162819e-05, + "loss": 2.1854, + "step": 724 + }, + { + "epoch": 0.2200637426013052, + "grad_norm": 0.444815456867218, + "learning_rate": 9.565613608748482e-05, + "loss": 1.8998, + "step": 725 + }, + { + "epoch": 0.22036727879799667, + "grad_norm": 0.37483492493629456, + "learning_rate": 9.565006075334144e-05, + "loss": 1.7089, + "step": 726 + }, + { + "epoch": 0.22067081499468813, + "grad_norm": 0.32369089126586914, + "learning_rate": 9.564398541919806e-05, + "loss": 2.1223, + "step": 727 + }, + { + "epoch": 0.22097435119137956, + "grad_norm": 0.3537048399448395, + "learning_rate": 9.563791008505469e-05, + "loss": 1.3256, + "step": 728 + }, + { + "epoch": 0.22127788738807103, + "grad_norm": 0.408723384141922, + "learning_rate": 9.56318347509113e-05, + "loss": 1.8056, + "step": 729 + }, + { + "epoch": 0.2215814235847625, + "grad_norm": 0.37529709935188293, + "learning_rate": 9.562575941676792e-05, + "loss": 1.9281, + "step": 730 + }, + { + "epoch": 0.22188495978145395, + "grad_norm": 0.3406868278980255, + "learning_rate": 9.561968408262455e-05, + "loss": 1.9673, + "step": 731 + }, + { + "epoch": 0.22218849597814538, + "grad_norm": 0.34361201524734497, + "learning_rate": 9.561360874848117e-05, + "loss": 1.7777, + "step": 732 + }, + { + "epoch": 0.22249203217483685, + "grad_norm": 0.3995072543621063, + "learning_rate": 9.560753341433779e-05, + "loss": 1.9959, + "step": 733 + }, + { + "epoch": 0.2227955683715283, + "grad_norm": 0.4618263244628906, + "learning_rate": 9.560145808019442e-05, + "loss": 1.7524, + "step": 734 + }, + { + "epoch": 0.22309910456821977, + "grad_norm": 0.36778688430786133, + "learning_rate": 9.559538274605103e-05, + "loss": 1.6332, + "step": 735 + }, + { + "epoch": 0.2234026407649112, + "grad_norm": 0.47031348943710327, + "learning_rate": 9.558930741190765e-05, + "loss": 2.0364, + "step": 736 + }, + { + "epoch": 0.22370617696160267, + "grad_norm": 0.6130351424217224, + "learning_rate": 9.558323207776428e-05, + "loss": 1.7814, + "step": 737 + }, + { + "epoch": 0.22400971315829413, + "grad_norm": 0.3733448088169098, + "learning_rate": 9.55771567436209e-05, + "loss": 1.8888, + "step": 738 + }, + { + "epoch": 0.2243132493549856, + "grad_norm": 0.368182510137558, + "learning_rate": 9.557108140947753e-05, + "loss": 2.3162, + "step": 739 + }, + { + "epoch": 0.22461678555167705, + "grad_norm": 0.4311901926994324, + "learning_rate": 9.556500607533415e-05, + "loss": 2.1294, + "step": 740 + }, + { + "epoch": 0.22492032174836848, + "grad_norm": 0.38696759939193726, + "learning_rate": 9.555893074119077e-05, + "loss": 1.9524, + "step": 741 + }, + { + "epoch": 0.22522385794505995, + "grad_norm": 0.37136873602867126, + "learning_rate": 9.55528554070474e-05, + "loss": 1.8677, + "step": 742 + }, + { + "epoch": 0.2255273941417514, + "grad_norm": 0.4084314703941345, + "learning_rate": 9.554678007290401e-05, + "loss": 1.8621, + "step": 743 + }, + { + "epoch": 0.22583093033844287, + "grad_norm": 0.4216344952583313, + "learning_rate": 9.554070473876063e-05, + "loss": 2.0455, + "step": 744 + }, + { + "epoch": 0.2261344665351343, + "grad_norm": 0.36579129099845886, + "learning_rate": 9.553462940461726e-05, + "loss": 1.8993, + "step": 745 + }, + { + "epoch": 0.22643800273182577, + "grad_norm": 0.4048181474208832, + "learning_rate": 9.552855407047388e-05, + "loss": 1.2016, + "step": 746 + }, + { + "epoch": 0.22674153892851723, + "grad_norm": 0.4071241617202759, + "learning_rate": 9.55224787363305e-05, + "loss": 1.9272, + "step": 747 + }, + { + "epoch": 0.2270450751252087, + "grad_norm": 0.3970381021499634, + "learning_rate": 9.551640340218713e-05, + "loss": 2.0819, + "step": 748 + }, + { + "epoch": 0.22734861132190012, + "grad_norm": 0.3891443610191345, + "learning_rate": 9.551032806804374e-05, + "loss": 2.2604, + "step": 749 + }, + { + "epoch": 0.22765214751859159, + "grad_norm": 0.40169456601142883, + "learning_rate": 9.550425273390036e-05, + "loss": 1.9514, + "step": 750 + }, + { + "epoch": 0.22795568371528305, + "grad_norm": 0.7191595435142517, + "learning_rate": 9.549817739975699e-05, + "loss": 1.1686, + "step": 751 + }, + { + "epoch": 0.2282592199119745, + "grad_norm": 0.3602886497974396, + "learning_rate": 9.549210206561361e-05, + "loss": 1.8877, + "step": 752 + }, + { + "epoch": 0.22856275610866597, + "grad_norm": 0.34270042181015015, + "learning_rate": 9.548602673147024e-05, + "loss": 1.4296, + "step": 753 + }, + { + "epoch": 0.2288662923053574, + "grad_norm": 0.37293288111686707, + "learning_rate": 9.547995139732686e-05, + "loss": 1.6695, + "step": 754 + }, + { + "epoch": 0.22916982850204887, + "grad_norm": 0.31505951285362244, + "learning_rate": 9.547387606318348e-05, + "loss": 1.9869, + "step": 755 + }, + { + "epoch": 0.22947336469874033, + "grad_norm": 0.38319501280784607, + "learning_rate": 9.54678007290401e-05, + "loss": 1.9769, + "step": 756 + }, + { + "epoch": 0.2297769008954318, + "grad_norm": 0.37378913164138794, + "learning_rate": 9.546172539489672e-05, + "loss": 1.9998, + "step": 757 + }, + { + "epoch": 0.23008043709212322, + "grad_norm": 0.34337082505226135, + "learning_rate": 9.545565006075334e-05, + "loss": 2.048, + "step": 758 + }, + { + "epoch": 0.2303839732888147, + "grad_norm": 0.35315895080566406, + "learning_rate": 9.544957472660997e-05, + "loss": 1.8742, + "step": 759 + }, + { + "epoch": 0.23068750948550615, + "grad_norm": 0.3854929506778717, + "learning_rate": 9.544349939246659e-05, + "loss": 1.4978, + "step": 760 + }, + { + "epoch": 0.2309910456821976, + "grad_norm": 0.35743293166160583, + "learning_rate": 9.543742405832321e-05, + "loss": 2.159, + "step": 761 + }, + { + "epoch": 0.23129458187888904, + "grad_norm": 0.39035484194755554, + "learning_rate": 9.543134872417984e-05, + "loss": 1.9588, + "step": 762 + }, + { + "epoch": 0.2315981180755805, + "grad_norm": 0.35890859365463257, + "learning_rate": 9.542527339003645e-05, + "loss": 1.7466, + "step": 763 + }, + { + "epoch": 0.23190165427227197, + "grad_norm": 0.46986308693885803, + "learning_rate": 9.541919805589307e-05, + "loss": 1.8819, + "step": 764 + }, + { + "epoch": 0.23220519046896343, + "grad_norm": 0.4226287305355072, + "learning_rate": 9.54131227217497e-05, + "loss": 1.9421, + "step": 765 + }, + { + "epoch": 0.2325087266656549, + "grad_norm": 0.4461078345775604, + "learning_rate": 9.540704738760632e-05, + "loss": 1.6888, + "step": 766 + }, + { + "epoch": 0.23281226286234633, + "grad_norm": 0.363406240940094, + "learning_rate": 9.540097205346295e-05, + "loss": 1.9234, + "step": 767 + }, + { + "epoch": 0.2331157990590378, + "grad_norm": 0.3713390529155731, + "learning_rate": 9.539489671931957e-05, + "loss": 1.9741, + "step": 768 + }, + { + "epoch": 0.23341933525572925, + "grad_norm": 0.3300642967224121, + "learning_rate": 9.538882138517619e-05, + "loss": 1.5029, + "step": 769 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.30819302797317505, + "learning_rate": 9.538274605103282e-05, + "loss": 1.6739, + "step": 770 + }, + { + "epoch": 0.23402640764911214, + "grad_norm": 0.3575786054134369, + "learning_rate": 9.537667071688943e-05, + "loss": 1.5768, + "step": 771 + }, + { + "epoch": 0.2343299438458036, + "grad_norm": 0.32882705330848694, + "learning_rate": 9.537059538274605e-05, + "loss": 1.9512, + "step": 772 + }, + { + "epoch": 0.23463348004249507, + "grad_norm": 0.3340393900871277, + "learning_rate": 9.536452004860268e-05, + "loss": 1.927, + "step": 773 + }, + { + "epoch": 0.23493701623918653, + "grad_norm": 0.33640411496162415, + "learning_rate": 9.53584447144593e-05, + "loss": 1.9496, + "step": 774 + }, + { + "epoch": 0.235240552435878, + "grad_norm": 0.3581593930721283, + "learning_rate": 9.535236938031592e-05, + "loss": 1.8766, + "step": 775 + }, + { + "epoch": 0.23554408863256943, + "grad_norm": 0.44084489345550537, + "learning_rate": 9.534629404617255e-05, + "loss": 2.0274, + "step": 776 + }, + { + "epoch": 0.2358476248292609, + "grad_norm": 0.363518089056015, + "learning_rate": 9.534021871202917e-05, + "loss": 1.9956, + "step": 777 + }, + { + "epoch": 0.23615116102595235, + "grad_norm": 0.35967034101486206, + "learning_rate": 9.533414337788578e-05, + "loss": 1.7312, + "step": 778 + }, + { + "epoch": 0.2364546972226438, + "grad_norm": 0.3683255910873413, + "learning_rate": 9.532806804374241e-05, + "loss": 1.9594, + "step": 779 + }, + { + "epoch": 0.23675823341933525, + "grad_norm": 0.3063610792160034, + "learning_rate": 9.532199270959903e-05, + "loss": 1.3972, + "step": 780 + }, + { + "epoch": 0.2370617696160267, + "grad_norm": 0.4217472970485687, + "learning_rate": 9.531591737545565e-05, + "loss": 1.9495, + "step": 781 + }, + { + "epoch": 0.23736530581271817, + "grad_norm": 0.46779391169548035, + "learning_rate": 9.530984204131228e-05, + "loss": 1.9965, + "step": 782 + }, + { + "epoch": 0.23766884200940963, + "grad_norm": 0.351810485124588, + "learning_rate": 9.53037667071689e-05, + "loss": 1.8034, + "step": 783 + }, + { + "epoch": 0.23797237820610107, + "grad_norm": 0.3302007019519806, + "learning_rate": 9.529769137302553e-05, + "loss": 2.0156, + "step": 784 + }, + { + "epoch": 0.23827591440279253, + "grad_norm": 0.3699585497379303, + "learning_rate": 9.529161603888214e-05, + "loss": 1.7217, + "step": 785 + }, + { + "epoch": 0.238579450599484, + "grad_norm": 0.34256428480148315, + "learning_rate": 9.528554070473876e-05, + "loss": 1.924, + "step": 786 + }, + { + "epoch": 0.23888298679617545, + "grad_norm": 0.4008747339248657, + "learning_rate": 9.527946537059539e-05, + "loss": 1.8903, + "step": 787 + }, + { + "epoch": 0.2391865229928669, + "grad_norm": 0.4281119108200073, + "learning_rate": 9.527339003645201e-05, + "loss": 2.1402, + "step": 788 + }, + { + "epoch": 0.23949005918955835, + "grad_norm": 0.4065872132778168, + "learning_rate": 9.526731470230863e-05, + "loss": 1.8399, + "step": 789 + }, + { + "epoch": 0.2397935953862498, + "grad_norm": 0.35334911942481995, + "learning_rate": 9.526123936816526e-05, + "loss": 1.8206, + "step": 790 + }, + { + "epoch": 0.24009713158294127, + "grad_norm": 0.35420283675193787, + "learning_rate": 9.525516403402188e-05, + "loss": 1.956, + "step": 791 + }, + { + "epoch": 0.24040066777963273, + "grad_norm": 0.5720547437667847, + "learning_rate": 9.524908869987849e-05, + "loss": 2.1932, + "step": 792 + }, + { + "epoch": 0.24070420397632417, + "grad_norm": 0.3512174189090729, + "learning_rate": 9.524301336573512e-05, + "loss": 2.1031, + "step": 793 + }, + { + "epoch": 0.24100774017301563, + "grad_norm": 0.3975936770439148, + "learning_rate": 9.523693803159174e-05, + "loss": 1.5962, + "step": 794 + }, + { + "epoch": 0.2413112763697071, + "grad_norm": 0.3723268210887909, + "learning_rate": 9.523086269744836e-05, + "loss": 1.492, + "step": 795 + }, + { + "epoch": 0.24161481256639855, + "grad_norm": 0.5287608504295349, + "learning_rate": 9.522478736330499e-05, + "loss": 1.5488, + "step": 796 + }, + { + "epoch": 0.24191834876308999, + "grad_norm": 0.37749987840652466, + "learning_rate": 9.52187120291616e-05, + "loss": 1.5724, + "step": 797 + }, + { + "epoch": 0.24222188495978145, + "grad_norm": 0.38260164856910706, + "learning_rate": 9.521263669501824e-05, + "loss": 2.1049, + "step": 798 + }, + { + "epoch": 0.2425254211564729, + "grad_norm": 0.3552962839603424, + "learning_rate": 9.520656136087485e-05, + "loss": 1.8853, + "step": 799 + }, + { + "epoch": 0.24282895735316437, + "grad_norm": 0.5752935409545898, + "learning_rate": 9.520048602673147e-05, + "loss": 1.6359, + "step": 800 + }, + { + "epoch": 0.24313249354985583, + "grad_norm": 0.41982319951057434, + "learning_rate": 9.51944106925881e-05, + "loss": 1.485, + "step": 801 + }, + { + "epoch": 0.24343602974654727, + "grad_norm": 0.3913584351539612, + "learning_rate": 9.518833535844472e-05, + "loss": 1.9818, + "step": 802 + }, + { + "epoch": 0.24373956594323873, + "grad_norm": 0.3771272897720337, + "learning_rate": 9.518226002430134e-05, + "loss": 1.7237, + "step": 803 + }, + { + "epoch": 0.2440431021399302, + "grad_norm": 0.3625226318836212, + "learning_rate": 9.517618469015797e-05, + "loss": 1.6776, + "step": 804 + }, + { + "epoch": 0.24434663833662165, + "grad_norm": 0.3253527283668518, + "learning_rate": 9.517010935601459e-05, + "loss": 2.0659, + "step": 805 + }, + { + "epoch": 0.2446501745333131, + "grad_norm": 0.3705154359340668, + "learning_rate": 9.51640340218712e-05, + "loss": 2.116, + "step": 806 + }, + { + "epoch": 0.24495371073000455, + "grad_norm": 0.3321172595024109, + "learning_rate": 9.515795868772783e-05, + "loss": 2.1026, + "step": 807 + }, + { + "epoch": 0.245257246926696, + "grad_norm": 0.41880494356155396, + "learning_rate": 9.515188335358445e-05, + "loss": 1.7541, + "step": 808 + }, + { + "epoch": 0.24556078312338747, + "grad_norm": 0.38695165514945984, + "learning_rate": 9.514580801944107e-05, + "loss": 1.9328, + "step": 809 + }, + { + "epoch": 0.2458643193200789, + "grad_norm": 0.37348538637161255, + "learning_rate": 9.51397326852977e-05, + "loss": 1.9065, + "step": 810 + }, + { + "epoch": 0.24616785551677037, + "grad_norm": 1.5822879076004028, + "learning_rate": 9.513365735115432e-05, + "loss": 1.8297, + "step": 811 + }, + { + "epoch": 0.24647139171346183, + "grad_norm": 0.3697100579738617, + "learning_rate": 9.512758201701095e-05, + "loss": 1.9091, + "step": 812 + }, + { + "epoch": 0.2467749279101533, + "grad_norm": 0.40801766514778137, + "learning_rate": 9.512150668286756e-05, + "loss": 1.8284, + "step": 813 + }, + { + "epoch": 0.24707846410684475, + "grad_norm": 0.4060746133327484, + "learning_rate": 9.511543134872418e-05, + "loss": 2.0244, + "step": 814 + }, + { + "epoch": 0.2473820003035362, + "grad_norm": 0.38555091619491577, + "learning_rate": 9.510935601458081e-05, + "loss": 1.6098, + "step": 815 + }, + { + "epoch": 0.24768553650022765, + "grad_norm": 0.39763063192367554, + "learning_rate": 9.510328068043743e-05, + "loss": 1.836, + "step": 816 + }, + { + "epoch": 0.2479890726969191, + "grad_norm": 0.6933274269104004, + "learning_rate": 9.509720534629405e-05, + "loss": 2.1968, + "step": 817 + }, + { + "epoch": 0.24829260889361057, + "grad_norm": 1.307569146156311, + "learning_rate": 9.509113001215068e-05, + "loss": 2.3396, + "step": 818 + }, + { + "epoch": 0.248596145090302, + "grad_norm": 0.3513609766960144, + "learning_rate": 9.50850546780073e-05, + "loss": 1.7302, + "step": 819 + }, + { + "epoch": 0.24889968128699347, + "grad_norm": 0.36949577927589417, + "learning_rate": 9.507897934386391e-05, + "loss": 1.7796, + "step": 820 + }, + { + "epoch": 0.24920321748368493, + "grad_norm": 0.38934049010276794, + "learning_rate": 9.507290400972054e-05, + "loss": 1.7644, + "step": 821 + }, + { + "epoch": 0.2495067536803764, + "grad_norm": 0.5927665829658508, + "learning_rate": 9.506682867557716e-05, + "loss": 1.9029, + "step": 822 + }, + { + "epoch": 0.24981028987706785, + "grad_norm": 0.5131897926330566, + "learning_rate": 9.506075334143378e-05, + "loss": 2.1282, + "step": 823 + }, + { + "epoch": 0.2501138260737593, + "grad_norm": 0.36232516169548035, + "learning_rate": 9.505467800729041e-05, + "loss": 2.1073, + "step": 824 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.43212029337882996, + "learning_rate": 9.504860267314703e-05, + "loss": 2.0364, + "step": 825 + }, + { + "epoch": 0.2507208984671422, + "grad_norm": 0.36575961112976074, + "learning_rate": 9.504252733900366e-05, + "loss": 1.8609, + "step": 826 + }, + { + "epoch": 0.2510244346638337, + "grad_norm": 0.32013362646102905, + "learning_rate": 9.503645200486027e-05, + "loss": 1.4097, + "step": 827 + }, + { + "epoch": 0.2513279708605251, + "grad_norm": 0.4062201976776123, + "learning_rate": 9.503037667071689e-05, + "loss": 1.9872, + "step": 828 + }, + { + "epoch": 0.2516315070572166, + "grad_norm": 0.3433174192905426, + "learning_rate": 9.502430133657352e-05, + "loss": 2.0662, + "step": 829 + }, + { + "epoch": 0.25193504325390803, + "grad_norm": 0.3925630748271942, + "learning_rate": 9.501822600243013e-05, + "loss": 1.5876, + "step": 830 + }, + { + "epoch": 0.25223857945059946, + "grad_norm": 0.32962149381637573, + "learning_rate": 9.501215066828676e-05, + "loss": 1.7445, + "step": 831 + }, + { + "epoch": 0.25254211564729095, + "grad_norm": 0.35508283972740173, + "learning_rate": 9.500607533414339e-05, + "loss": 1.8836, + "step": 832 + }, + { + "epoch": 0.2528456518439824, + "grad_norm": 0.34893691539764404, + "learning_rate": 9.5e-05, + "loss": 1.6312, + "step": 833 + }, + { + "epoch": 0.2531491880406738, + "grad_norm": 0.4068532884120941, + "learning_rate": 9.499392466585662e-05, + "loss": 1.5567, + "step": 834 + }, + { + "epoch": 0.2534527242373653, + "grad_norm": 0.37818485498428345, + "learning_rate": 9.498784933171325e-05, + "loss": 2.0791, + "step": 835 + }, + { + "epoch": 0.25375626043405675, + "grad_norm": 0.884172797203064, + "learning_rate": 9.498177399756987e-05, + "loss": 1.8642, + "step": 836 + }, + { + "epoch": 0.25405979663074824, + "grad_norm": 0.4108290374279022, + "learning_rate": 9.497569866342649e-05, + "loss": 1.9545, + "step": 837 + }, + { + "epoch": 0.25436333282743967, + "grad_norm": 0.37885358929634094, + "learning_rate": 9.496962332928312e-05, + "loss": 1.3709, + "step": 838 + }, + { + "epoch": 0.2546668690241311, + "grad_norm": 0.3919561505317688, + "learning_rate": 9.496354799513974e-05, + "loss": 1.9548, + "step": 839 + }, + { + "epoch": 0.2549704052208226, + "grad_norm": 0.3945852518081665, + "learning_rate": 9.495747266099637e-05, + "loss": 1.8139, + "step": 840 + }, + { + "epoch": 0.255273941417514, + "grad_norm": 0.3272388279438019, + "learning_rate": 9.495139732685298e-05, + "loss": 2.1711, + "step": 841 + }, + { + "epoch": 0.2555774776142055, + "grad_norm": 0.3214159905910492, + "learning_rate": 9.49453219927096e-05, + "loss": 1.6202, + "step": 842 + }, + { + "epoch": 0.25588101381089695, + "grad_norm": 0.6175217628479004, + "learning_rate": 9.493924665856623e-05, + "loss": 1.9976, + "step": 843 + }, + { + "epoch": 0.2561845500075884, + "grad_norm": 0.36993956565856934, + "learning_rate": 9.493317132442284e-05, + "loss": 1.9981, + "step": 844 + }, + { + "epoch": 0.2564880862042799, + "grad_norm": 0.4294464588165283, + "learning_rate": 9.492709599027947e-05, + "loss": 2.017, + "step": 845 + }, + { + "epoch": 0.2567916224009713, + "grad_norm": 0.4055061638355255, + "learning_rate": 9.49210206561361e-05, + "loss": 1.9383, + "step": 846 + }, + { + "epoch": 0.2570951585976628, + "grad_norm": 0.3574405014514923, + "learning_rate": 9.491494532199272e-05, + "loss": 1.8058, + "step": 847 + }, + { + "epoch": 0.25739869479435423, + "grad_norm": 0.35684704780578613, + "learning_rate": 9.490886998784933e-05, + "loss": 1.5661, + "step": 848 + }, + { + "epoch": 0.25770223099104567, + "grad_norm": 0.35031405091285706, + "learning_rate": 9.490279465370596e-05, + "loss": 1.6422, + "step": 849 + }, + { + "epoch": 0.25800576718773716, + "grad_norm": 0.390667200088501, + "learning_rate": 9.489671931956258e-05, + "loss": 1.9743, + "step": 850 + }, + { + "epoch": 0.2583093033844286, + "grad_norm": 0.33744457364082336, + "learning_rate": 9.48906439854192e-05, + "loss": 1.4922, + "step": 851 + }, + { + "epoch": 0.25861283958112, + "grad_norm": 0.3162226676940918, + "learning_rate": 9.488456865127583e-05, + "loss": 1.7791, + "step": 852 + }, + { + "epoch": 0.2589163757778115, + "grad_norm": 0.49357378482818604, + "learning_rate": 9.487849331713245e-05, + "loss": 1.4353, + "step": 853 + }, + { + "epoch": 0.25921991197450295, + "grad_norm": 0.4280342757701874, + "learning_rate": 9.487241798298906e-05, + "loss": 1.7277, + "step": 854 + }, + { + "epoch": 0.25952344817119444, + "grad_norm": 0.4271382987499237, + "learning_rate": 9.48663426488457e-05, + "loss": 1.3241, + "step": 855 + }, + { + "epoch": 0.25982698436788587, + "grad_norm": 0.3773948550224304, + "learning_rate": 9.486026731470231e-05, + "loss": 2.0892, + "step": 856 + }, + { + "epoch": 0.2601305205645773, + "grad_norm": 0.3343275785446167, + "learning_rate": 9.485419198055894e-05, + "loss": 1.7968, + "step": 857 + }, + { + "epoch": 0.2604340567612688, + "grad_norm": 0.3711187243461609, + "learning_rate": 9.484811664641555e-05, + "loss": 1.8599, + "step": 858 + }, + { + "epoch": 0.26073759295796023, + "grad_norm": 0.6738047003746033, + "learning_rate": 9.484204131227218e-05, + "loss": 2.0192, + "step": 859 + }, + { + "epoch": 0.2610411291546517, + "grad_norm": 0.3094058334827423, + "learning_rate": 9.483596597812881e-05, + "loss": 1.8827, + "step": 860 + }, + { + "epoch": 0.26134466535134315, + "grad_norm": 0.4207117259502411, + "learning_rate": 9.482989064398543e-05, + "loss": 1.8029, + "step": 861 + }, + { + "epoch": 0.2616482015480346, + "grad_norm": 0.3958408832550049, + "learning_rate": 9.482381530984204e-05, + "loss": 1.9911, + "step": 862 + }, + { + "epoch": 0.2619517377447261, + "grad_norm": 0.6868960857391357, + "learning_rate": 9.481773997569867e-05, + "loss": 1.9394, + "step": 863 + }, + { + "epoch": 0.2622552739414175, + "grad_norm": 1.131034016609192, + "learning_rate": 9.481166464155529e-05, + "loss": 1.486, + "step": 864 + }, + { + "epoch": 0.26255881013810894, + "grad_norm": 0.42944055795669556, + "learning_rate": 9.480558930741191e-05, + "loss": 1.9993, + "step": 865 + }, + { + "epoch": 0.26286234633480043, + "grad_norm": 0.3888295292854309, + "learning_rate": 9.479951397326854e-05, + "loss": 2.0604, + "step": 866 + }, + { + "epoch": 0.26316588253149187, + "grad_norm": 0.41875898838043213, + "learning_rate": 9.479343863912516e-05, + "loss": 1.8242, + "step": 867 + }, + { + "epoch": 0.26346941872818336, + "grad_norm": 0.44148901104927063, + "learning_rate": 9.478736330498177e-05, + "loss": 1.7827, + "step": 868 + }, + { + "epoch": 0.2637729549248748, + "grad_norm": 0.41976141929626465, + "learning_rate": 9.47812879708384e-05, + "loss": 2.127, + "step": 869 + }, + { + "epoch": 0.2640764911215662, + "grad_norm": 0.5538145899772644, + "learning_rate": 9.477521263669502e-05, + "loss": 1.7636, + "step": 870 + }, + { + "epoch": 0.2643800273182577, + "grad_norm": 0.36378878355026245, + "learning_rate": 9.476913730255165e-05, + "loss": 1.8159, + "step": 871 + }, + { + "epoch": 0.26468356351494915, + "grad_norm": 0.3874679505825043, + "learning_rate": 9.476306196840826e-05, + "loss": 1.4785, + "step": 872 + }, + { + "epoch": 0.26498709971164064, + "grad_norm": 0.4508163332939148, + "learning_rate": 9.475698663426489e-05, + "loss": 1.9393, + "step": 873 + }, + { + "epoch": 0.2652906359083321, + "grad_norm": 0.38843271136283875, + "learning_rate": 9.475091130012152e-05, + "loss": 1.4532, + "step": 874 + }, + { + "epoch": 0.2655941721050235, + "grad_norm": 0.4603917598724365, + "learning_rate": 9.474483596597814e-05, + "loss": 2.1182, + "step": 875 + }, + { + "epoch": 0.265897708301715, + "grad_norm": 0.39668476581573486, + "learning_rate": 9.473876063183475e-05, + "loss": 1.9447, + "step": 876 + }, + { + "epoch": 0.26620124449840643, + "grad_norm": 4.796502113342285, + "learning_rate": 9.473268529769138e-05, + "loss": 1.9395, + "step": 877 + }, + { + "epoch": 0.26650478069509786, + "grad_norm": 0.38534435629844666, + "learning_rate": 9.4726609963548e-05, + "loss": 2.0862, + "step": 878 + }, + { + "epoch": 0.26680831689178935, + "grad_norm": 0.9022141695022583, + "learning_rate": 9.472053462940462e-05, + "loss": 1.8303, + "step": 879 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.4020310342311859, + "learning_rate": 9.471445929526125e-05, + "loss": 1.5084, + "step": 880 + }, + { + "epoch": 0.2674153892851723, + "grad_norm": 0.30202022194862366, + "learning_rate": 9.470838396111787e-05, + "loss": 1.8474, + "step": 881 + }, + { + "epoch": 0.2677189254818637, + "grad_norm": 0.35603514313697815, + "learning_rate": 9.470230862697448e-05, + "loss": 1.8973, + "step": 882 + }, + { + "epoch": 0.26802246167855515, + "grad_norm": 0.3749227523803711, + "learning_rate": 9.469623329283111e-05, + "loss": 1.9763, + "step": 883 + }, + { + "epoch": 0.26832599787524664, + "grad_norm": 0.45645421743392944, + "learning_rate": 9.469015795868773e-05, + "loss": 1.5944, + "step": 884 + }, + { + "epoch": 0.26862953407193807, + "grad_norm": 0.5855579972267151, + "learning_rate": 9.468408262454436e-05, + "loss": 1.8724, + "step": 885 + }, + { + "epoch": 0.26893307026862956, + "grad_norm": 0.3752727210521698, + "learning_rate": 9.467800729040097e-05, + "loss": 2.1792, + "step": 886 + }, + { + "epoch": 0.269236606465321, + "grad_norm": 0.8951378464698792, + "learning_rate": 9.46719319562576e-05, + "loss": 1.4745, + "step": 887 + }, + { + "epoch": 0.2695401426620124, + "grad_norm": 0.5524512529373169, + "learning_rate": 9.466585662211423e-05, + "loss": 1.9872, + "step": 888 + }, + { + "epoch": 0.2698436788587039, + "grad_norm": 0.3917500078678131, + "learning_rate": 9.465978128797085e-05, + "loss": 1.579, + "step": 889 + }, + { + "epoch": 0.27014721505539535, + "grad_norm": 0.41635704040527344, + "learning_rate": 9.465370595382746e-05, + "loss": 1.5984, + "step": 890 + }, + { + "epoch": 0.2704507512520868, + "grad_norm": 0.3544903099536896, + "learning_rate": 9.46476306196841e-05, + "loss": 1.9112, + "step": 891 + }, + { + "epoch": 0.2707542874487783, + "grad_norm": 0.4568898379802704, + "learning_rate": 9.464155528554071e-05, + "loss": 1.9857, + "step": 892 + }, + { + "epoch": 0.2710578236454697, + "grad_norm": 0.4155702590942383, + "learning_rate": 9.463547995139733e-05, + "loss": 1.7986, + "step": 893 + }, + { + "epoch": 0.2713613598421612, + "grad_norm": 0.37953928112983704, + "learning_rate": 9.462940461725396e-05, + "loss": 1.8383, + "step": 894 + }, + { + "epoch": 0.27166489603885263, + "grad_norm": 0.37993937730789185, + "learning_rate": 9.462332928311058e-05, + "loss": 2.0555, + "step": 895 + }, + { + "epoch": 0.27196843223554407, + "grad_norm": 0.4355872571468353, + "learning_rate": 9.46172539489672e-05, + "loss": 1.9307, + "step": 896 + }, + { + "epoch": 0.27227196843223556, + "grad_norm": 0.38673707842826843, + "learning_rate": 9.461117861482381e-05, + "loss": 1.7155, + "step": 897 + }, + { + "epoch": 0.272575504628927, + "grad_norm": 0.38927558064460754, + "learning_rate": 9.460510328068044e-05, + "loss": 2.1022, + "step": 898 + }, + { + "epoch": 0.2728790408256185, + "grad_norm": 0.40219199657440186, + "learning_rate": 9.459902794653707e-05, + "loss": 1.3915, + "step": 899 + }, + { + "epoch": 0.2731825770223099, + "grad_norm": 0.3896184265613556, + "learning_rate": 9.459295261239368e-05, + "loss": 1.9976, + "step": 900 + }, + { + "epoch": 0.27348611321900135, + "grad_norm": 0.37489351630210876, + "learning_rate": 9.458687727825031e-05, + "loss": 1.8479, + "step": 901 + }, + { + "epoch": 0.27378964941569284, + "grad_norm": 0.39215734601020813, + "learning_rate": 9.458080194410694e-05, + "loss": 1.9005, + "step": 902 + }, + { + "epoch": 0.27409318561238427, + "grad_norm": 0.5054829716682434, + "learning_rate": 9.457472660996356e-05, + "loss": 1.8805, + "step": 903 + }, + { + "epoch": 0.2743967218090757, + "grad_norm": 0.38437893986701965, + "learning_rate": 9.456865127582017e-05, + "loss": 1.5506, + "step": 904 + }, + { + "epoch": 0.2747002580057672, + "grad_norm": 0.38727036118507385, + "learning_rate": 9.45625759416768e-05, + "loss": 1.7189, + "step": 905 + }, + { + "epoch": 0.27500379420245863, + "grad_norm": 0.4260677993297577, + "learning_rate": 9.455650060753342e-05, + "loss": 2.1553, + "step": 906 + }, + { + "epoch": 0.2753073303991501, + "grad_norm": 0.3969596326351166, + "learning_rate": 9.455042527339004e-05, + "loss": 1.73, + "step": 907 + }, + { + "epoch": 0.27561086659584155, + "grad_norm": 0.371412456035614, + "learning_rate": 9.454434993924667e-05, + "loss": 1.618, + "step": 908 + }, + { + "epoch": 0.275914402792533, + "grad_norm": 0.32723626494407654, + "learning_rate": 9.453827460510329e-05, + "loss": 1.897, + "step": 909 + }, + { + "epoch": 0.2762179389892245, + "grad_norm": 0.37436455488204956, + "learning_rate": 9.45321992709599e-05, + "loss": 1.4027, + "step": 910 + }, + { + "epoch": 0.2765214751859159, + "grad_norm": 0.3615550398826599, + "learning_rate": 9.452612393681652e-05, + "loss": 1.8835, + "step": 911 + }, + { + "epoch": 0.2768250113826074, + "grad_norm": 0.37427717447280884, + "learning_rate": 9.452004860267315e-05, + "loss": 1.5918, + "step": 912 + }, + { + "epoch": 0.27712854757929883, + "grad_norm": 0.4030051827430725, + "learning_rate": 9.451397326852978e-05, + "loss": 1.5694, + "step": 913 + }, + { + "epoch": 0.27743208377599027, + "grad_norm": 0.3948831260204315, + "learning_rate": 9.450789793438639e-05, + "loss": 1.7315, + "step": 914 + }, + { + "epoch": 0.27773561997268176, + "grad_norm": 0.4105396866798401, + "learning_rate": 9.450182260024302e-05, + "loss": 2.0528, + "step": 915 + }, + { + "epoch": 0.2780391561693732, + "grad_norm": 0.400312215089798, + "learning_rate": 9.449574726609965e-05, + "loss": 1.6631, + "step": 916 + }, + { + "epoch": 0.2783426923660646, + "grad_norm": 0.40099987387657166, + "learning_rate": 9.448967193195625e-05, + "loss": 1.9922, + "step": 917 + }, + { + "epoch": 0.2786462285627561, + "grad_norm": 0.39861205220222473, + "learning_rate": 9.448359659781288e-05, + "loss": 1.8319, + "step": 918 + }, + { + "epoch": 0.27894976475944755, + "grad_norm": 0.33672603964805603, + "learning_rate": 9.447752126366951e-05, + "loss": 1.8562, + "step": 919 + }, + { + "epoch": 0.27925330095613904, + "grad_norm": 0.3398993909358978, + "learning_rate": 9.447144592952613e-05, + "loss": 1.8801, + "step": 920 + }, + { + "epoch": 0.2795568371528305, + "grad_norm": 0.6748337149620056, + "learning_rate": 9.446537059538275e-05, + "loss": 2.0353, + "step": 921 + }, + { + "epoch": 0.2798603733495219, + "grad_norm": 0.33281663060188293, + "learning_rate": 9.445929526123938e-05, + "loss": 2.0932, + "step": 922 + }, + { + "epoch": 0.2801639095462134, + "grad_norm": 0.37020498514175415, + "learning_rate": 9.4453219927096e-05, + "loss": 1.8438, + "step": 923 + }, + { + "epoch": 0.28046744574290483, + "grad_norm": 0.40763506293296814, + "learning_rate": 9.444714459295261e-05, + "loss": 1.6915, + "step": 924 + }, + { + "epoch": 0.2807709819395963, + "grad_norm": 0.36651310324668884, + "learning_rate": 9.444106925880923e-05, + "loss": 2.0502, + "step": 925 + }, + { + "epoch": 0.28107451813628775, + "grad_norm": 0.6006852388381958, + "learning_rate": 9.443499392466586e-05, + "loss": 1.9601, + "step": 926 + }, + { + "epoch": 0.2813780543329792, + "grad_norm": 0.45634040236473083, + "learning_rate": 9.442891859052248e-05, + "loss": 1.5068, + "step": 927 + }, + { + "epoch": 0.2816815905296707, + "grad_norm": 0.3380034565925598, + "learning_rate": 9.44228432563791e-05, + "loss": 1.8657, + "step": 928 + }, + { + "epoch": 0.2819851267263621, + "grad_norm": 0.39120668172836304, + "learning_rate": 9.441676792223573e-05, + "loss": 1.9683, + "step": 929 + }, + { + "epoch": 0.2822886629230536, + "grad_norm": 0.41591060161590576, + "learning_rate": 9.441069258809236e-05, + "loss": 1.9994, + "step": 930 + }, + { + "epoch": 0.28259219911974504, + "grad_norm": 0.3863435983657837, + "learning_rate": 9.440461725394896e-05, + "loss": 1.8127, + "step": 931 + }, + { + "epoch": 0.28289573531643647, + "grad_norm": 0.3713644742965698, + "learning_rate": 9.439854191980559e-05, + "loss": 1.7514, + "step": 932 + }, + { + "epoch": 0.28319927151312796, + "grad_norm": 0.36419039964675903, + "learning_rate": 9.439246658566222e-05, + "loss": 1.9044, + "step": 933 + }, + { + "epoch": 0.2835028077098194, + "grad_norm": 0.4059010148048401, + "learning_rate": 9.438639125151884e-05, + "loss": 1.783, + "step": 934 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.5016249418258667, + "learning_rate": 9.438031591737546e-05, + "loss": 1.9958, + "step": 935 + }, + { + "epoch": 0.2841098801032023, + "grad_norm": 0.4264843463897705, + "learning_rate": 9.437424058323209e-05, + "loss": 1.5526, + "step": 936 + }, + { + "epoch": 0.28441341629989375, + "grad_norm": 0.5768559575080872, + "learning_rate": 9.43681652490887e-05, + "loss": 1.719, + "step": 937 + }, + { + "epoch": 0.28471695249658524, + "grad_norm": 0.42008429765701294, + "learning_rate": 9.436208991494532e-05, + "loss": 2.057, + "step": 938 + }, + { + "epoch": 0.2850204886932767, + "grad_norm": 0.3530850112438202, + "learning_rate": 9.435601458080194e-05, + "loss": 1.7118, + "step": 939 + }, + { + "epoch": 0.2853240248899681, + "grad_norm": 0.44346508383750916, + "learning_rate": 9.434993924665857e-05, + "loss": 2.1416, + "step": 940 + }, + { + "epoch": 0.2856275610866596, + "grad_norm": 0.645882785320282, + "learning_rate": 9.434386391251519e-05, + "loss": 1.8368, + "step": 941 + }, + { + "epoch": 0.28593109728335103, + "grad_norm": 0.784821093082428, + "learning_rate": 9.43377885783718e-05, + "loss": 1.9541, + "step": 942 + }, + { + "epoch": 0.2862346334800425, + "grad_norm": 0.43880385160446167, + "learning_rate": 9.433171324422844e-05, + "loss": 2.0319, + "step": 943 + }, + { + "epoch": 0.28653816967673396, + "grad_norm": 0.6283034682273865, + "learning_rate": 9.432563791008507e-05, + "loss": 1.5762, + "step": 944 + }, + { + "epoch": 0.2868417058734254, + "grad_norm": 0.3591736853122711, + "learning_rate": 9.431956257594167e-05, + "loss": 2.1589, + "step": 945 + }, + { + "epoch": 0.2871452420701169, + "grad_norm": 0.3970873951911926, + "learning_rate": 9.43134872417983e-05, + "loss": 1.4798, + "step": 946 + }, + { + "epoch": 0.2874487782668083, + "grad_norm": 0.42486631870269775, + "learning_rate": 9.430741190765493e-05, + "loss": 1.8345, + "step": 947 + }, + { + "epoch": 0.28775231446349975, + "grad_norm": 0.37290090322494507, + "learning_rate": 9.430133657351155e-05, + "loss": 1.9313, + "step": 948 + }, + { + "epoch": 0.28805585066019124, + "grad_norm": 0.47855010628700256, + "learning_rate": 9.429526123936817e-05, + "loss": 1.4405, + "step": 949 + }, + { + "epoch": 0.28835938685688267, + "grad_norm": 0.4648813009262085, + "learning_rate": 9.42891859052248e-05, + "loss": 1.6818, + "step": 950 + }, + { + "epoch": 0.28866292305357416, + "grad_norm": 0.40000760555267334, + "learning_rate": 9.428311057108142e-05, + "loss": 1.9014, + "step": 951 + }, + { + "epoch": 0.2889664592502656, + "grad_norm": 0.3846280872821808, + "learning_rate": 9.427703523693803e-05, + "loss": 1.6268, + "step": 952 + }, + { + "epoch": 0.28926999544695703, + "grad_norm": 0.43172597885131836, + "learning_rate": 9.427095990279465e-05, + "loss": 1.6287, + "step": 953 + }, + { + "epoch": 0.2895735316436485, + "grad_norm": 0.42565402388572693, + "learning_rate": 9.426488456865128e-05, + "loss": 1.8678, + "step": 954 + }, + { + "epoch": 0.28987706784033995, + "grad_norm": 1.070906400680542, + "learning_rate": 9.42588092345079e-05, + "loss": 1.7688, + "step": 955 + }, + { + "epoch": 0.29018060403703144, + "grad_norm": 0.4792560935020447, + "learning_rate": 9.425273390036452e-05, + "loss": 1.09, + "step": 956 + }, + { + "epoch": 0.2904841402337229, + "grad_norm": 0.37043797969818115, + "learning_rate": 9.424665856622115e-05, + "loss": 1.6931, + "step": 957 + }, + { + "epoch": 0.2907876764304143, + "grad_norm": 0.37764909863471985, + "learning_rate": 9.424058323207778e-05, + "loss": 1.8621, + "step": 958 + }, + { + "epoch": 0.2910912126271058, + "grad_norm": 0.40328919887542725, + "learning_rate": 9.423450789793438e-05, + "loss": 1.9484, + "step": 959 + }, + { + "epoch": 0.29139474882379723, + "grad_norm": 0.4451077878475189, + "learning_rate": 9.422843256379101e-05, + "loss": 1.7273, + "step": 960 + }, + { + "epoch": 0.29169828502048867, + "grad_norm": 0.5410102009773254, + "learning_rate": 9.422235722964764e-05, + "loss": 2.0116, + "step": 961 + }, + { + "epoch": 0.29200182121718016, + "grad_norm": 0.42526179552078247, + "learning_rate": 9.421628189550426e-05, + "loss": 1.6596, + "step": 962 + }, + { + "epoch": 0.2923053574138716, + "grad_norm": 0.3813883066177368, + "learning_rate": 9.421020656136088e-05, + "loss": 2.0083, + "step": 963 + }, + { + "epoch": 0.2926088936105631, + "grad_norm": 0.3967495858669281, + "learning_rate": 9.420413122721751e-05, + "loss": 1.8665, + "step": 964 + }, + { + "epoch": 0.2929124298072545, + "grad_norm": 0.4672113060951233, + "learning_rate": 9.419805589307413e-05, + "loss": 2.1132, + "step": 965 + }, + { + "epoch": 0.29321596600394595, + "grad_norm": 0.4068308472633362, + "learning_rate": 9.419198055893074e-05, + "loss": 2.0042, + "step": 966 + }, + { + "epoch": 0.29351950220063744, + "grad_norm": 0.8895217180252075, + "learning_rate": 9.418590522478736e-05, + "loss": 1.9725, + "step": 967 + }, + { + "epoch": 0.2938230383973289, + "grad_norm": 0.6839628219604492, + "learning_rate": 9.417982989064399e-05, + "loss": 1.5712, + "step": 968 + }, + { + "epoch": 0.29412657459402036, + "grad_norm": 0.5890039801597595, + "learning_rate": 9.417375455650061e-05, + "loss": 2.1177, + "step": 969 + }, + { + "epoch": 0.2944301107907118, + "grad_norm": 0.33217447996139526, + "learning_rate": 9.416767922235723e-05, + "loss": 1.9683, + "step": 970 + }, + { + "epoch": 0.29473364698740323, + "grad_norm": 0.44125109910964966, + "learning_rate": 9.416160388821386e-05, + "loss": 1.6272, + "step": 971 + }, + { + "epoch": 0.2950371831840947, + "grad_norm": 0.367145836353302, + "learning_rate": 9.415552855407049e-05, + "loss": 2.0624, + "step": 972 + }, + { + "epoch": 0.29534071938078615, + "grad_norm": 0.3220556080341339, + "learning_rate": 9.414945321992709e-05, + "loss": 1.4513, + "step": 973 + }, + { + "epoch": 0.2956442555774776, + "grad_norm": 0.37686339020729065, + "learning_rate": 9.414337788578372e-05, + "loss": 1.6418, + "step": 974 + }, + { + "epoch": 0.2959477917741691, + "grad_norm": 0.46043211221694946, + "learning_rate": 9.413730255164035e-05, + "loss": 2.0433, + "step": 975 + }, + { + "epoch": 0.2962513279708605, + "grad_norm": 0.40463754534721375, + "learning_rate": 9.413122721749697e-05, + "loss": 1.8214, + "step": 976 + }, + { + "epoch": 0.296554864167552, + "grad_norm": 0.406583309173584, + "learning_rate": 9.412515188335359e-05, + "loss": 1.5704, + "step": 977 + }, + { + "epoch": 0.29685840036424344, + "grad_norm": 0.4335365295410156, + "learning_rate": 9.41190765492102e-05, + "loss": 1.8772, + "step": 978 + }, + { + "epoch": 0.29716193656093487, + "grad_norm": 0.43915802240371704, + "learning_rate": 9.411300121506684e-05, + "loss": 1.5376, + "step": 979 + }, + { + "epoch": 0.29746547275762636, + "grad_norm": 0.36118191480636597, + "learning_rate": 9.410692588092345e-05, + "loss": 1.8994, + "step": 980 + }, + { + "epoch": 0.2977690089543178, + "grad_norm": 0.4184354841709137, + "learning_rate": 9.410085054678007e-05, + "loss": 2.0041, + "step": 981 + }, + { + "epoch": 0.2980725451510093, + "grad_norm": 0.3743583559989929, + "learning_rate": 9.40947752126367e-05, + "loss": 1.8986, + "step": 982 + }, + { + "epoch": 0.2983760813477007, + "grad_norm": 0.4110506474971771, + "learning_rate": 9.408869987849332e-05, + "loss": 2.0617, + "step": 983 + }, + { + "epoch": 0.29867961754439215, + "grad_norm": 0.33404871821403503, + "learning_rate": 9.408262454434994e-05, + "loss": 2.0014, + "step": 984 + }, + { + "epoch": 0.29898315374108364, + "grad_norm": 0.3586455285549164, + "learning_rate": 9.407654921020657e-05, + "loss": 1.4912, + "step": 985 + }, + { + "epoch": 0.2992866899377751, + "grad_norm": 0.3859756886959076, + "learning_rate": 9.40704738760632e-05, + "loss": 1.7919, + "step": 986 + }, + { + "epoch": 0.2995902261344665, + "grad_norm": 0.4533100724220276, + "learning_rate": 9.40643985419198e-05, + "loss": 2.1134, + "step": 987 + }, + { + "epoch": 0.299893762331158, + "grad_norm": 0.399854838848114, + "learning_rate": 9.405832320777643e-05, + "loss": 1.8198, + "step": 988 + }, + { + "epoch": 0.30019729852784943, + "grad_norm": 0.3582475781440735, + "learning_rate": 9.405224787363306e-05, + "loss": 1.6337, + "step": 989 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.39537113904953003, + "learning_rate": 9.404617253948967e-05, + "loss": 1.9856, + "step": 990 + }, + { + "epoch": 0.30080437092123236, + "grad_norm": 0.3662082254886627, + "learning_rate": 9.40400972053463e-05, + "loss": 2.0424, + "step": 991 + }, + { + "epoch": 0.3011079071179238, + "grad_norm": 0.38339659571647644, + "learning_rate": 9.403402187120292e-05, + "loss": 2.0293, + "step": 992 + }, + { + "epoch": 0.3014114433146153, + "grad_norm": 0.3134559988975525, + "learning_rate": 9.402794653705955e-05, + "loss": 1.8086, + "step": 993 + }, + { + "epoch": 0.3017149795113067, + "grad_norm": 0.4155486226081848, + "learning_rate": 9.402187120291616e-05, + "loss": 1.8748, + "step": 994 + }, + { + "epoch": 0.3020185157079982, + "grad_norm": 0.41562893986701965, + "learning_rate": 9.401579586877278e-05, + "loss": 1.6104, + "step": 995 + }, + { + "epoch": 0.30232205190468964, + "grad_norm": 0.36112940311431885, + "learning_rate": 9.400972053462941e-05, + "loss": 1.335, + "step": 996 + }, + { + "epoch": 0.30262558810138107, + "grad_norm": 0.4332577586174011, + "learning_rate": 9.400364520048603e-05, + "loss": 1.6098, + "step": 997 + }, + { + "epoch": 0.30292912429807256, + "grad_norm": 0.3520275950431824, + "learning_rate": 9.399756986634265e-05, + "loss": 1.9924, + "step": 998 + }, + { + "epoch": 0.303232660494764, + "grad_norm": 0.44331827759742737, + "learning_rate": 9.399149453219928e-05, + "loss": 1.8021, + "step": 999 + }, + { + "epoch": 0.30353619669145543, + "grad_norm": 0.38627490401268005, + "learning_rate": 9.398541919805591e-05, + "loss": 1.8952, + "step": 1000 + }, + { + "epoch": 0.3038397328881469, + "grad_norm": 0.42670029401779175, + "learning_rate": 9.397934386391251e-05, + "loss": 1.7, + "step": 1001 + }, + { + "epoch": 0.30414326908483835, + "grad_norm": 0.31961289048194885, + "learning_rate": 9.397326852976914e-05, + "loss": 1.4874, + "step": 1002 + }, + { + "epoch": 0.30444680528152984, + "grad_norm": 0.46648967266082764, + "learning_rate": 9.396719319562577e-05, + "loss": 1.962, + "step": 1003 + }, + { + "epoch": 0.3047503414782213, + "grad_norm": 0.39356762170791626, + "learning_rate": 9.396111786148238e-05, + "loss": 2.0206, + "step": 1004 + }, + { + "epoch": 0.3050538776749127, + "grad_norm": 0.37756818532943726, + "learning_rate": 9.395504252733901e-05, + "loss": 1.9863, + "step": 1005 + }, + { + "epoch": 0.3053574138716042, + "grad_norm": 0.3291250765323639, + "learning_rate": 9.394896719319563e-05, + "loss": 1.9422, + "step": 1006 + }, + { + "epoch": 0.30566095006829563, + "grad_norm": 0.397297739982605, + "learning_rate": 9.394289185905226e-05, + "loss": 1.7533, + "step": 1007 + }, + { + "epoch": 0.3059644862649871, + "grad_norm": 0.33320048451423645, + "learning_rate": 9.393681652490887e-05, + "loss": 1.6411, + "step": 1008 + }, + { + "epoch": 0.30626802246167856, + "grad_norm": 0.38921716809272766, + "learning_rate": 9.393074119076549e-05, + "loss": 1.9216, + "step": 1009 + }, + { + "epoch": 0.30657155865837, + "grad_norm": 0.40245047211647034, + "learning_rate": 9.392466585662212e-05, + "loss": 1.9853, + "step": 1010 + }, + { + "epoch": 0.3068750948550615, + "grad_norm": 0.5569208264350891, + "learning_rate": 9.391859052247874e-05, + "loss": 2.157, + "step": 1011 + }, + { + "epoch": 0.3071786310517529, + "grad_norm": 0.4204193949699402, + "learning_rate": 9.391251518833536e-05, + "loss": 1.666, + "step": 1012 + }, + { + "epoch": 0.3074821672484444, + "grad_norm": 0.3458712100982666, + "learning_rate": 9.390643985419199e-05, + "loss": 1.9564, + "step": 1013 + }, + { + "epoch": 0.30778570344513584, + "grad_norm": 0.42556729912757874, + "learning_rate": 9.39003645200486e-05, + "loss": 2.0565, + "step": 1014 + }, + { + "epoch": 0.3080892396418273, + "grad_norm": 0.3334849774837494, + "learning_rate": 9.389428918590522e-05, + "loss": 1.9534, + "step": 1015 + }, + { + "epoch": 0.30839277583851876, + "grad_norm": 0.3297790288925171, + "learning_rate": 9.388821385176185e-05, + "loss": 2.0032, + "step": 1016 + }, + { + "epoch": 0.3086963120352102, + "grad_norm": 0.4108186662197113, + "learning_rate": 9.388213851761848e-05, + "loss": 1.6698, + "step": 1017 + }, + { + "epoch": 0.30899984823190163, + "grad_norm": 0.4515385925769806, + "learning_rate": 9.387606318347509e-05, + "loss": 1.9709, + "step": 1018 + }, + { + "epoch": 0.3093033844285931, + "grad_norm": 0.38401028513908386, + "learning_rate": 9.386998784933172e-05, + "loss": 1.9928, + "step": 1019 + }, + { + "epoch": 0.30960692062528455, + "grad_norm": 0.32774823904037476, + "learning_rate": 9.386391251518834e-05, + "loss": 2.1256, + "step": 1020 + }, + { + "epoch": 0.30991045682197604, + "grad_norm": 0.45378655195236206, + "learning_rate": 9.385783718104497e-05, + "loss": 1.8078, + "step": 1021 + }, + { + "epoch": 0.3102139930186675, + "grad_norm": 0.3340519368648529, + "learning_rate": 9.385176184690158e-05, + "loss": 1.6102, + "step": 1022 + }, + { + "epoch": 0.3105175292153589, + "grad_norm": 0.5457311868667603, + "learning_rate": 9.38456865127582e-05, + "loss": 1.9735, + "step": 1023 + }, + { + "epoch": 0.3108210654120504, + "grad_norm": 0.3604097068309784, + "learning_rate": 9.383961117861483e-05, + "loss": 1.8048, + "step": 1024 + }, + { + "epoch": 0.31112460160874184, + "grad_norm": 0.3677893579006195, + "learning_rate": 9.383353584447145e-05, + "loss": 1.5346, + "step": 1025 + }, + { + "epoch": 0.3114281378054333, + "grad_norm": 0.49554312229156494, + "learning_rate": 9.382746051032807e-05, + "loss": 2.0472, + "step": 1026 + }, + { + "epoch": 0.31173167400212476, + "grad_norm": 0.37693944573402405, + "learning_rate": 9.38213851761847e-05, + "loss": 1.8848, + "step": 1027 + }, + { + "epoch": 0.3120352101988162, + "grad_norm": 0.6364639401435852, + "learning_rate": 9.381530984204132e-05, + "loss": 1.7289, + "step": 1028 + }, + { + "epoch": 0.3123387463955077, + "grad_norm": 0.36025428771972656, + "learning_rate": 9.380923450789793e-05, + "loss": 1.8544, + "step": 1029 + }, + { + "epoch": 0.3126422825921991, + "grad_norm": 0.4033251106739044, + "learning_rate": 9.380315917375456e-05, + "loss": 1.8661, + "step": 1030 + }, + { + "epoch": 0.31294581878889055, + "grad_norm": 0.41504162549972534, + "learning_rate": 9.37970838396112e-05, + "loss": 1.9619, + "step": 1031 + }, + { + "epoch": 0.31324935498558204, + "grad_norm": 0.38639551401138306, + "learning_rate": 9.37910085054678e-05, + "loss": 2.1785, + "step": 1032 + }, + { + "epoch": 0.3135528911822735, + "grad_norm": 0.3487949073314667, + "learning_rate": 9.378493317132443e-05, + "loss": 1.5571, + "step": 1033 + }, + { + "epoch": 0.31385642737896496, + "grad_norm": 0.3317317068576813, + "learning_rate": 9.377885783718105e-05, + "loss": 2.1086, + "step": 1034 + }, + { + "epoch": 0.3141599635756564, + "grad_norm": 0.35874056816101074, + "learning_rate": 9.377278250303768e-05, + "loss": 1.9453, + "step": 1035 + }, + { + "epoch": 0.31446349977234783, + "grad_norm": 0.3823045790195465, + "learning_rate": 9.37667071688943e-05, + "loss": 1.6444, + "step": 1036 + }, + { + "epoch": 0.3147670359690393, + "grad_norm": 0.39954647421836853, + "learning_rate": 9.376063183475091e-05, + "loss": 2.0497, + "step": 1037 + }, + { + "epoch": 0.31507057216573076, + "grad_norm": 0.34357962012290955, + "learning_rate": 9.375455650060754e-05, + "loss": 1.8391, + "step": 1038 + }, + { + "epoch": 0.31537410836242225, + "grad_norm": 0.35260939598083496, + "learning_rate": 9.374848116646416e-05, + "loss": 1.9691, + "step": 1039 + }, + { + "epoch": 0.3156776445591137, + "grad_norm": 0.33483296632766724, + "learning_rate": 9.374240583232078e-05, + "loss": 1.8933, + "step": 1040 + }, + { + "epoch": 0.3159811807558051, + "grad_norm": 0.4771517515182495, + "learning_rate": 9.373633049817741e-05, + "loss": 1.8574, + "step": 1041 + }, + { + "epoch": 0.3162847169524966, + "grad_norm": 0.3025968372821808, + "learning_rate": 9.373025516403403e-05, + "loss": 1.7995, + "step": 1042 + }, + { + "epoch": 0.31658825314918804, + "grad_norm": 0.39535394310951233, + "learning_rate": 9.372417982989064e-05, + "loss": 1.6662, + "step": 1043 + }, + { + "epoch": 0.31689178934587947, + "grad_norm": 0.35718834400177, + "learning_rate": 9.371810449574727e-05, + "loss": 2.3543, + "step": 1044 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.40815529227256775, + "learning_rate": 9.37120291616039e-05, + "loss": 1.8978, + "step": 1045 + }, + { + "epoch": 0.3174988617392624, + "grad_norm": 0.38799992203712463, + "learning_rate": 9.370595382746051e-05, + "loss": 1.9145, + "step": 1046 + }, + { + "epoch": 0.3178023979359539, + "grad_norm": 0.3711848556995392, + "learning_rate": 9.369987849331714e-05, + "loss": 1.9129, + "step": 1047 + }, + { + "epoch": 0.3181059341326453, + "grad_norm": 1.041429877281189, + "learning_rate": 9.369380315917376e-05, + "loss": 1.3535, + "step": 1048 + }, + { + "epoch": 0.31840947032933675, + "grad_norm": 0.4107154309749603, + "learning_rate": 9.368772782503039e-05, + "loss": 1.6886, + "step": 1049 + }, + { + "epoch": 0.31871300652602824, + "grad_norm": 0.35202670097351074, + "learning_rate": 9.3681652490887e-05, + "loss": 1.4685, + "step": 1050 + }, + { + "epoch": 0.3190165427227197, + "grad_norm": 0.39248141646385193, + "learning_rate": 9.367557715674362e-05, + "loss": 1.6177, + "step": 1051 + }, + { + "epoch": 0.31932007891941117, + "grad_norm": 0.3911724388599396, + "learning_rate": 9.366950182260025e-05, + "loss": 1.5015, + "step": 1052 + }, + { + "epoch": 0.3196236151161026, + "grad_norm": 0.8974817991256714, + "learning_rate": 9.366342648845687e-05, + "loss": 1.5422, + "step": 1053 + }, + { + "epoch": 0.31992715131279403, + "grad_norm": 0.4150513708591461, + "learning_rate": 9.365735115431349e-05, + "loss": 2.0201, + "step": 1054 + }, + { + "epoch": 0.3202306875094855, + "grad_norm": 0.4205161929130554, + "learning_rate": 9.365127582017012e-05, + "loss": 1.881, + "step": 1055 + }, + { + "epoch": 0.32053422370617696, + "grad_norm": 0.36916840076446533, + "learning_rate": 9.364520048602674e-05, + "loss": 1.9912, + "step": 1056 + }, + { + "epoch": 0.3208377599028684, + "grad_norm": 0.45616719126701355, + "learning_rate": 9.363912515188335e-05, + "loss": 1.825, + "step": 1057 + }, + { + "epoch": 0.3211412960995599, + "grad_norm": 0.3602239787578583, + "learning_rate": 9.363304981773998e-05, + "loss": 1.5264, + "step": 1058 + }, + { + "epoch": 0.3214448322962513, + "grad_norm": 0.39383935928344727, + "learning_rate": 9.362697448359661e-05, + "loss": 1.9719, + "step": 1059 + }, + { + "epoch": 0.3217483684929428, + "grad_norm": 0.321859210729599, + "learning_rate": 9.362089914945322e-05, + "loss": 1.7703, + "step": 1060 + }, + { + "epoch": 0.32205190468963424, + "grad_norm": 0.40060603618621826, + "learning_rate": 9.361482381530985e-05, + "loss": 1.7199, + "step": 1061 + }, + { + "epoch": 0.3223554408863257, + "grad_norm": 0.4096384644508362, + "learning_rate": 9.360874848116647e-05, + "loss": 1.5069, + "step": 1062 + }, + { + "epoch": 0.32265897708301716, + "grad_norm": 0.35391515493392944, + "learning_rate": 9.360267314702308e-05, + "loss": 1.8928, + "step": 1063 + }, + { + "epoch": 0.3229625132797086, + "grad_norm": 0.3309794068336487, + "learning_rate": 9.359659781287971e-05, + "loss": 1.6238, + "step": 1064 + }, + { + "epoch": 0.3232660494764001, + "grad_norm": 0.37579798698425293, + "learning_rate": 9.359052247873633e-05, + "loss": 2.329, + "step": 1065 + }, + { + "epoch": 0.3235695856730915, + "grad_norm": 0.41262614727020264, + "learning_rate": 9.358444714459296e-05, + "loss": 1.9192, + "step": 1066 + }, + { + "epoch": 0.32387312186978295, + "grad_norm": 0.3737616539001465, + "learning_rate": 9.357837181044958e-05, + "loss": 1.9614, + "step": 1067 + }, + { + "epoch": 0.32417665806647444, + "grad_norm": 0.35716524720191956, + "learning_rate": 9.35722964763062e-05, + "loss": 1.9046, + "step": 1068 + }, + { + "epoch": 0.3244801942631659, + "grad_norm": 1.6110327243804932, + "learning_rate": 9.356622114216283e-05, + "loss": 1.5437, + "step": 1069 + }, + { + "epoch": 0.3247837304598573, + "grad_norm": 0.3114778399467468, + "learning_rate": 9.356014580801945e-05, + "loss": 1.8962, + "step": 1070 + }, + { + "epoch": 0.3250872666565488, + "grad_norm": 0.35084468126296997, + "learning_rate": 9.355407047387606e-05, + "loss": 2.0253, + "step": 1071 + }, + { + "epoch": 0.32539080285324024, + "grad_norm": 0.38513630628585815, + "learning_rate": 9.35479951397327e-05, + "loss": 1.7556, + "step": 1072 + }, + { + "epoch": 0.3256943390499317, + "grad_norm": 0.41520386934280396, + "learning_rate": 9.354191980558931e-05, + "loss": 1.274, + "step": 1073 + }, + { + "epoch": 0.32599787524662316, + "grad_norm": 0.3998602032661438, + "learning_rate": 9.353584447144593e-05, + "loss": 1.9963, + "step": 1074 + }, + { + "epoch": 0.3263014114433146, + "grad_norm": 0.3973468244075775, + "learning_rate": 9.352976913730256e-05, + "loss": 2.2281, + "step": 1075 + }, + { + "epoch": 0.3266049476400061, + "grad_norm": 0.37020763754844666, + "learning_rate": 9.352369380315918e-05, + "loss": 1.6891, + "step": 1076 + }, + { + "epoch": 0.3269084838366975, + "grad_norm": 0.43367013335227966, + "learning_rate": 9.35176184690158e-05, + "loss": 1.5859, + "step": 1077 + }, + { + "epoch": 0.327212020033389, + "grad_norm": 0.3882901072502136, + "learning_rate": 9.351154313487242e-05, + "loss": 1.5294, + "step": 1078 + }, + { + "epoch": 0.32751555623008044, + "grad_norm": 0.38236895203590393, + "learning_rate": 9.350546780072904e-05, + "loss": 2.0035, + "step": 1079 + }, + { + "epoch": 0.3278190924267719, + "grad_norm": 0.42090603709220886, + "learning_rate": 9.349939246658567e-05, + "loss": 1.325, + "step": 1080 + }, + { + "epoch": 0.32812262862346336, + "grad_norm": 0.4210514724254608, + "learning_rate": 9.349331713244229e-05, + "loss": 1.9018, + "step": 1081 + }, + { + "epoch": 0.3284261648201548, + "grad_norm": 0.3695550858974457, + "learning_rate": 9.348724179829891e-05, + "loss": 2.0823, + "step": 1082 + }, + { + "epoch": 0.32872970101684623, + "grad_norm": 0.44178470969200134, + "learning_rate": 9.348116646415554e-05, + "loss": 1.9396, + "step": 1083 + }, + { + "epoch": 0.3290332372135377, + "grad_norm": 2.9311540126800537, + "learning_rate": 9.347509113001216e-05, + "loss": 2.0483, + "step": 1084 + }, + { + "epoch": 0.32933677341022916, + "grad_norm": 0.38238954544067383, + "learning_rate": 9.346901579586877e-05, + "loss": 2.0408, + "step": 1085 + }, + { + "epoch": 0.32964030960692065, + "grad_norm": 0.420622318983078, + "learning_rate": 9.34629404617254e-05, + "loss": 1.9811, + "step": 1086 + }, + { + "epoch": 0.3299438458036121, + "grad_norm": 0.47827744483947754, + "learning_rate": 9.345686512758202e-05, + "loss": 1.7816, + "step": 1087 + }, + { + "epoch": 0.3302473820003035, + "grad_norm": 0.3673538565635681, + "learning_rate": 9.345078979343864e-05, + "loss": 1.9916, + "step": 1088 + }, + { + "epoch": 0.330550918196995, + "grad_norm": 1.2525584697723389, + "learning_rate": 9.344471445929527e-05, + "loss": 2.0323, + "step": 1089 + }, + { + "epoch": 0.33085445439368644, + "grad_norm": 0.3575446605682373, + "learning_rate": 9.343863912515189e-05, + "loss": 2.0254, + "step": 1090 + }, + { + "epoch": 0.3311579905903779, + "grad_norm": 0.4579968750476837, + "learning_rate": 9.34325637910085e-05, + "loss": 1.4365, + "step": 1091 + }, + { + "epoch": 0.33146152678706936, + "grad_norm": 0.5363442301750183, + "learning_rate": 9.342648845686513e-05, + "loss": 2.0635, + "step": 1092 + }, + { + "epoch": 0.3317650629837608, + "grad_norm": 0.4065784215927124, + "learning_rate": 9.342041312272175e-05, + "loss": 1.6133, + "step": 1093 + }, + { + "epoch": 0.3320685991804523, + "grad_norm": 0.4256560504436493, + "learning_rate": 9.341433778857838e-05, + "loss": 1.7574, + "step": 1094 + }, + { + "epoch": 0.3323721353771437, + "grad_norm": 0.3566704988479614, + "learning_rate": 9.3408262454435e-05, + "loss": 1.4446, + "step": 1095 + }, + { + "epoch": 0.33267567157383515, + "grad_norm": 0.39680102467536926, + "learning_rate": 9.340218712029162e-05, + "loss": 2.1008, + "step": 1096 + }, + { + "epoch": 0.33297920777052664, + "grad_norm": 0.39213013648986816, + "learning_rate": 9.339611178614825e-05, + "loss": 1.5924, + "step": 1097 + }, + { + "epoch": 0.3332827439672181, + "grad_norm": 0.39503929018974304, + "learning_rate": 9.339003645200487e-05, + "loss": 1.9126, + "step": 1098 + }, + { + "epoch": 0.33358628016390957, + "grad_norm": 0.34226784110069275, + "learning_rate": 9.338396111786148e-05, + "loss": 1.3475, + "step": 1099 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.3511487543582916, + "learning_rate": 9.337788578371811e-05, + "loss": 1.6679, + "step": 1100 + }, + { + "epoch": 0.33419335255729243, + "grad_norm": 0.6215702295303345, + "learning_rate": 9.337181044957473e-05, + "loss": 1.7599, + "step": 1101 + }, + { + "epoch": 0.3344968887539839, + "grad_norm": 0.34477895498275757, + "learning_rate": 9.336573511543135e-05, + "loss": 1.4905, + "step": 1102 + }, + { + "epoch": 0.33480042495067536, + "grad_norm": 0.37696805596351624, + "learning_rate": 9.335965978128798e-05, + "loss": 1.2661, + "step": 1103 + }, + { + "epoch": 0.33510396114736685, + "grad_norm": 0.3722575306892395, + "learning_rate": 9.33535844471446e-05, + "loss": 1.8483, + "step": 1104 + }, + { + "epoch": 0.3354074973440583, + "grad_norm": 0.5641891360282898, + "learning_rate": 9.334750911300121e-05, + "loss": 2.0121, + "step": 1105 + }, + { + "epoch": 0.3357110335407497, + "grad_norm": 0.37221911549568176, + "learning_rate": 9.334143377885784e-05, + "loss": 1.9564, + "step": 1106 + }, + { + "epoch": 0.3360145697374412, + "grad_norm": 0.3777831494808197, + "learning_rate": 9.333535844471446e-05, + "loss": 1.8625, + "step": 1107 + }, + { + "epoch": 0.33631810593413264, + "grad_norm": 0.4311445951461792, + "learning_rate": 9.332928311057109e-05, + "loss": 2.038, + "step": 1108 + }, + { + "epoch": 0.33662164213082413, + "grad_norm": 0.4373043179512024, + "learning_rate": 9.332320777642771e-05, + "loss": 1.6282, + "step": 1109 + }, + { + "epoch": 0.33692517832751556, + "grad_norm": 0.3800273835659027, + "learning_rate": 9.331713244228433e-05, + "loss": 1.5552, + "step": 1110 + }, + { + "epoch": 0.337228714524207, + "grad_norm": 0.6762371063232422, + "learning_rate": 9.331105710814096e-05, + "loss": 1.7299, + "step": 1111 + }, + { + "epoch": 0.3375322507208985, + "grad_norm": 0.3713971972465515, + "learning_rate": 9.330498177399758e-05, + "loss": 1.9919, + "step": 1112 + }, + { + "epoch": 0.3378357869175899, + "grad_norm": 0.40268123149871826, + "learning_rate": 9.329890643985419e-05, + "loss": 1.9952, + "step": 1113 + }, + { + "epoch": 0.33813932311428135, + "grad_norm": 0.44786572456359863, + "learning_rate": 9.329283110571082e-05, + "loss": 2.1622, + "step": 1114 + }, + { + "epoch": 0.33844285931097284, + "grad_norm": 0.39639097452163696, + "learning_rate": 9.328675577156744e-05, + "loss": 1.7995, + "step": 1115 + }, + { + "epoch": 0.3387463955076643, + "grad_norm": 0.5204857587814331, + "learning_rate": 9.328068043742406e-05, + "loss": 1.9925, + "step": 1116 + }, + { + "epoch": 0.33904993170435577, + "grad_norm": 0.4180005192756653, + "learning_rate": 9.327460510328069e-05, + "loss": 1.8489, + "step": 1117 + }, + { + "epoch": 0.3393534679010472, + "grad_norm": 0.42055949568748474, + "learning_rate": 9.32685297691373e-05, + "loss": 1.7729, + "step": 1118 + }, + { + "epoch": 0.33965700409773864, + "grad_norm": 0.4213305115699768, + "learning_rate": 9.326245443499392e-05, + "loss": 1.9299, + "step": 1119 + }, + { + "epoch": 0.3399605402944301, + "grad_norm": 0.7117316126823425, + "learning_rate": 9.325637910085055e-05, + "loss": 1.7163, + "step": 1120 + }, + { + "epoch": 0.34026407649112156, + "grad_norm": 0.3836345374584198, + "learning_rate": 9.325030376670717e-05, + "loss": 1.8703, + "step": 1121 + }, + { + "epoch": 0.34056761268781305, + "grad_norm": 0.5493946075439453, + "learning_rate": 9.32442284325638e-05, + "loss": 2.1161, + "step": 1122 + }, + { + "epoch": 0.3408711488845045, + "grad_norm": 0.4323013722896576, + "learning_rate": 9.323815309842042e-05, + "loss": 1.9867, + "step": 1123 + }, + { + "epoch": 0.3411746850811959, + "grad_norm": 0.37991368770599365, + "learning_rate": 9.323207776427704e-05, + "loss": 1.935, + "step": 1124 + }, + { + "epoch": 0.3414782212778874, + "grad_norm": 0.40191560983657837, + "learning_rate": 9.322600243013367e-05, + "loss": 1.7617, + "step": 1125 + }, + { + "epoch": 0.34178175747457884, + "grad_norm": 0.42309120297431946, + "learning_rate": 9.321992709599029e-05, + "loss": 2.1438, + "step": 1126 + }, + { + "epoch": 0.3420852936712703, + "grad_norm": 0.3918818235397339, + "learning_rate": 9.32138517618469e-05, + "loss": 1.9434, + "step": 1127 + }, + { + "epoch": 0.34238882986796176, + "grad_norm": 0.9797879457473755, + "learning_rate": 9.320777642770353e-05, + "loss": 1.8893, + "step": 1128 + }, + { + "epoch": 0.3426923660646532, + "grad_norm": 0.38459375500679016, + "learning_rate": 9.320170109356015e-05, + "loss": 1.9626, + "step": 1129 + }, + { + "epoch": 0.3429959022613447, + "grad_norm": 0.35924455523490906, + "learning_rate": 9.319562575941677e-05, + "loss": 1.7108, + "step": 1130 + }, + { + "epoch": 0.3432994384580361, + "grad_norm": 0.3576562702655792, + "learning_rate": 9.31895504252734e-05, + "loss": 1.9426, + "step": 1131 + }, + { + "epoch": 0.34360297465472756, + "grad_norm": 0.3931269943714142, + "learning_rate": 9.318347509113002e-05, + "loss": 1.9598, + "step": 1132 + }, + { + "epoch": 0.34390651085141904, + "grad_norm": 0.41744112968444824, + "learning_rate": 9.317739975698663e-05, + "loss": 1.9522, + "step": 1133 + }, + { + "epoch": 0.3442100470481105, + "grad_norm": 0.3721160590648651, + "learning_rate": 9.317132442284326e-05, + "loss": 1.366, + "step": 1134 + }, + { + "epoch": 0.34451358324480197, + "grad_norm": 0.4886751174926758, + "learning_rate": 9.316524908869988e-05, + "loss": 1.8528, + "step": 1135 + }, + { + "epoch": 0.3448171194414934, + "grad_norm": 0.3837689161300659, + "learning_rate": 9.31591737545565e-05, + "loss": 2.0877, + "step": 1136 + }, + { + "epoch": 0.34512065563818484, + "grad_norm": 0.3721841871738434, + "learning_rate": 9.315309842041313e-05, + "loss": 2.0144, + "step": 1137 + }, + { + "epoch": 0.3454241918348763, + "grad_norm": 0.4325003921985626, + "learning_rate": 9.314702308626975e-05, + "loss": 2.2377, + "step": 1138 + }, + { + "epoch": 0.34572772803156776, + "grad_norm": 0.39936354756355286, + "learning_rate": 9.314094775212638e-05, + "loss": 1.9469, + "step": 1139 + }, + { + "epoch": 0.3460312642282592, + "grad_norm": 0.38498643040657043, + "learning_rate": 9.3134872417983e-05, + "loss": 2.0822, + "step": 1140 + }, + { + "epoch": 0.3463348004249507, + "grad_norm": 0.3657349944114685, + "learning_rate": 9.312879708383961e-05, + "loss": 1.8089, + "step": 1141 + }, + { + "epoch": 0.3466383366216421, + "grad_norm": 2.0269200801849365, + "learning_rate": 9.312272174969624e-05, + "loss": 1.8882, + "step": 1142 + }, + { + "epoch": 0.3469418728183336, + "grad_norm": 0.3976801931858063, + "learning_rate": 9.311664641555286e-05, + "loss": 1.8956, + "step": 1143 + }, + { + "epoch": 0.34724540901502504, + "grad_norm": 0.47736337780952454, + "learning_rate": 9.311057108140948e-05, + "loss": 1.4612, + "step": 1144 + }, + { + "epoch": 0.3475489452117165, + "grad_norm": 0.4764254093170166, + "learning_rate": 9.310449574726611e-05, + "loss": 2.119, + "step": 1145 + }, + { + "epoch": 0.34785248140840797, + "grad_norm": 0.49367082118988037, + "learning_rate": 9.309842041312273e-05, + "loss": 1.1816, + "step": 1146 + }, + { + "epoch": 0.3481560176050994, + "grad_norm": 0.48990949988365173, + "learning_rate": 9.309234507897934e-05, + "loss": 1.6951, + "step": 1147 + }, + { + "epoch": 0.3484595538017909, + "grad_norm": 0.6447961330413818, + "learning_rate": 9.308626974483597e-05, + "loss": 2.0145, + "step": 1148 + }, + { + "epoch": 0.3487630899984823, + "grad_norm": 0.8322371244430542, + "learning_rate": 9.308019441069259e-05, + "loss": 1.8679, + "step": 1149 + }, + { + "epoch": 0.34906662619517376, + "grad_norm": 0.3726497292518616, + "learning_rate": 9.307411907654921e-05, + "loss": 1.8455, + "step": 1150 + }, + { + "epoch": 0.34937016239186525, + "grad_norm": 0.3494066298007965, + "learning_rate": 9.306804374240584e-05, + "loss": 1.7699, + "step": 1151 + }, + { + "epoch": 0.3496736985885567, + "grad_norm": 0.44510725140571594, + "learning_rate": 9.306196840826246e-05, + "loss": 1.6315, + "step": 1152 + }, + { + "epoch": 0.3499772347852481, + "grad_norm": 0.4738346338272095, + "learning_rate": 9.305589307411909e-05, + "loss": 1.9982, + "step": 1153 + }, + { + "epoch": 0.3502807709819396, + "grad_norm": 0.6915324330329895, + "learning_rate": 9.304981773997569e-05, + "loss": 1.6566, + "step": 1154 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.35767850279808044, + "learning_rate": 9.304374240583232e-05, + "loss": 1.6553, + "step": 1155 + }, + { + "epoch": 0.35088784337532253, + "grad_norm": 0.4144536852836609, + "learning_rate": 9.303766707168895e-05, + "loss": 1.4838, + "step": 1156 + }, + { + "epoch": 0.35119137957201396, + "grad_norm": 0.42863425612449646, + "learning_rate": 9.303159173754557e-05, + "loss": 2.0101, + "step": 1157 + }, + { + "epoch": 0.3514949157687054, + "grad_norm": 0.38044658303260803, + "learning_rate": 9.302551640340219e-05, + "loss": 2.0358, + "step": 1158 + }, + { + "epoch": 0.3517984519653969, + "grad_norm": 0.3667512536048889, + "learning_rate": 9.301944106925882e-05, + "loss": 2.0601, + "step": 1159 + }, + { + "epoch": 0.3521019881620883, + "grad_norm": 0.4198186993598938, + "learning_rate": 9.301336573511544e-05, + "loss": 1.8418, + "step": 1160 + }, + { + "epoch": 0.3524055243587798, + "grad_norm": 0.40647754073143005, + "learning_rate": 9.300729040097205e-05, + "loss": 1.945, + "step": 1161 + }, + { + "epoch": 0.35270906055547124, + "grad_norm": 0.4339864253997803, + "learning_rate": 9.300121506682868e-05, + "loss": 1.8991, + "step": 1162 + }, + { + "epoch": 0.3530125967521627, + "grad_norm": 0.43949249386787415, + "learning_rate": 9.29951397326853e-05, + "loss": 1.8012, + "step": 1163 + }, + { + "epoch": 0.35331613294885417, + "grad_norm": 0.3767072558403015, + "learning_rate": 9.298906439854192e-05, + "loss": 1.4843, + "step": 1164 + }, + { + "epoch": 0.3536196691455456, + "grad_norm": 0.34623175859451294, + "learning_rate": 9.298298906439855e-05, + "loss": 1.7296, + "step": 1165 + }, + { + "epoch": 0.35392320534223703, + "grad_norm": 0.4682632088661194, + "learning_rate": 9.297691373025517e-05, + "loss": 1.6166, + "step": 1166 + }, + { + "epoch": 0.3542267415389285, + "grad_norm": 0.43289923667907715, + "learning_rate": 9.29708383961118e-05, + "loss": 1.8117, + "step": 1167 + }, + { + "epoch": 0.35453027773561996, + "grad_norm": 0.40620309114456177, + "learning_rate": 9.29647630619684e-05, + "loss": 2.2582, + "step": 1168 + }, + { + "epoch": 0.35483381393231145, + "grad_norm": 0.4079282879829407, + "learning_rate": 9.295868772782503e-05, + "loss": 1.8266, + "step": 1169 + }, + { + "epoch": 0.3551373501290029, + "grad_norm": 0.4398365020751953, + "learning_rate": 9.295261239368166e-05, + "loss": 1.8592, + "step": 1170 + }, + { + "epoch": 0.3554408863256943, + "grad_norm": 0.41404253244400024, + "learning_rate": 9.294653705953828e-05, + "loss": 1.4286, + "step": 1171 + }, + { + "epoch": 0.3557444225223858, + "grad_norm": 0.3746820390224457, + "learning_rate": 9.29404617253949e-05, + "loss": 1.751, + "step": 1172 + }, + { + "epoch": 0.35604795871907724, + "grad_norm": 0.3549497723579407, + "learning_rate": 9.293438639125153e-05, + "loss": 1.1857, + "step": 1173 + }, + { + "epoch": 0.35635149491576873, + "grad_norm": 0.3803435266017914, + "learning_rate": 9.292831105710815e-05, + "loss": 1.9532, + "step": 1174 + }, + { + "epoch": 0.35665503111246016, + "grad_norm": 0.46608418226242065, + "learning_rate": 9.292223572296476e-05, + "loss": 1.7284, + "step": 1175 + }, + { + "epoch": 0.3569585673091516, + "grad_norm": 0.3718934953212738, + "learning_rate": 9.29161603888214e-05, + "loss": 1.8234, + "step": 1176 + }, + { + "epoch": 0.3572621035058431, + "grad_norm": 0.43626031279563904, + "learning_rate": 9.291008505467801e-05, + "loss": 1.6322, + "step": 1177 + }, + { + "epoch": 0.3575656397025345, + "grad_norm": 0.3897557258605957, + "learning_rate": 9.290400972053463e-05, + "loss": 1.7562, + "step": 1178 + }, + { + "epoch": 0.35786917589922596, + "grad_norm": 0.4063243269920349, + "learning_rate": 9.289793438639126e-05, + "loss": 1.7012, + "step": 1179 + }, + { + "epoch": 0.35817271209591744, + "grad_norm": 0.33185258507728577, + "learning_rate": 9.289185905224788e-05, + "loss": 1.8933, + "step": 1180 + }, + { + "epoch": 0.3584762482926089, + "grad_norm": 0.35498881340026855, + "learning_rate": 9.288578371810451e-05, + "loss": 1.84, + "step": 1181 + }, + { + "epoch": 0.35877978448930037, + "grad_norm": 0.37165796756744385, + "learning_rate": 9.287970838396111e-05, + "loss": 2.0435, + "step": 1182 + }, + { + "epoch": 0.3590833206859918, + "grad_norm": 0.4139983355998993, + "learning_rate": 9.287363304981774e-05, + "loss": 1.5223, + "step": 1183 + }, + { + "epoch": 0.35938685688268324, + "grad_norm": 0.4164494276046753, + "learning_rate": 9.286755771567437e-05, + "loss": 2.125, + "step": 1184 + }, + { + "epoch": 0.3596903930793747, + "grad_norm": 0.35237401723861694, + "learning_rate": 9.286148238153098e-05, + "loss": 1.9644, + "step": 1185 + }, + { + "epoch": 0.35999392927606616, + "grad_norm": 0.4741188883781433, + "learning_rate": 9.285540704738761e-05, + "loss": 1.7027, + "step": 1186 + }, + { + "epoch": 0.36029746547275765, + "grad_norm": 0.6668043732643127, + "learning_rate": 9.284933171324424e-05, + "loss": 1.7109, + "step": 1187 + }, + { + "epoch": 0.3606010016694491, + "grad_norm": 0.39687463641166687, + "learning_rate": 9.284325637910086e-05, + "loss": 1.9012, + "step": 1188 + }, + { + "epoch": 0.3609045378661405, + "grad_norm": 0.39605942368507385, + "learning_rate": 9.283718104495747e-05, + "loss": 2.0095, + "step": 1189 + }, + { + "epoch": 0.361208074062832, + "grad_norm": 0.6824695467948914, + "learning_rate": 9.28311057108141e-05, + "loss": 1.6799, + "step": 1190 + }, + { + "epoch": 0.36151161025952344, + "grad_norm": 0.31606560945510864, + "learning_rate": 9.282503037667072e-05, + "loss": 1.4793, + "step": 1191 + }, + { + "epoch": 0.36181514645621493, + "grad_norm": 0.3778662085533142, + "learning_rate": 9.281895504252734e-05, + "loss": 1.7399, + "step": 1192 + }, + { + "epoch": 0.36211868265290637, + "grad_norm": 0.39530149102211, + "learning_rate": 9.281287970838397e-05, + "loss": 1.7939, + "step": 1193 + }, + { + "epoch": 0.3624222188495978, + "grad_norm": 0.4434921145439148, + "learning_rate": 9.280680437424059e-05, + "loss": 1.5982, + "step": 1194 + }, + { + "epoch": 0.3627257550462893, + "grad_norm": 0.40200692415237427, + "learning_rate": 9.280072904009722e-05, + "loss": 1.6496, + "step": 1195 + }, + { + "epoch": 0.3630292912429807, + "grad_norm": 0.3617413341999054, + "learning_rate": 9.279465370595382e-05, + "loss": 1.9734, + "step": 1196 + }, + { + "epoch": 0.36333282743967216, + "grad_norm": 0.3840635120868683, + "learning_rate": 9.278857837181045e-05, + "loss": 2.0827, + "step": 1197 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.36481353640556335, + "learning_rate": 9.278250303766708e-05, + "loss": 1.781, + "step": 1198 + }, + { + "epoch": 0.3639398998330551, + "grad_norm": 0.3300980031490326, + "learning_rate": 9.277642770352369e-05, + "loss": 1.8124, + "step": 1199 + }, + { + "epoch": 0.36424343602974657, + "grad_norm": 0.3816182315349579, + "learning_rate": 9.277035236938032e-05, + "loss": 2.0853, + "step": 1200 + }, + { + "epoch": 0.364546972226438, + "grad_norm": 0.40531983971595764, + "learning_rate": 9.276427703523695e-05, + "loss": 1.8582, + "step": 1201 + }, + { + "epoch": 0.36485050842312944, + "grad_norm": 0.5847654938697815, + "learning_rate": 9.275820170109357e-05, + "loss": 1.9022, + "step": 1202 + }, + { + "epoch": 0.36515404461982093, + "grad_norm": 0.37587395310401917, + "learning_rate": 9.275212636695018e-05, + "loss": 1.5774, + "step": 1203 + }, + { + "epoch": 0.36545758081651236, + "grad_norm": 0.4057527780532837, + "learning_rate": 9.274605103280681e-05, + "loss": 1.965, + "step": 1204 + }, + { + "epoch": 0.36576111701320385, + "grad_norm": 0.7079107761383057, + "learning_rate": 9.273997569866343e-05, + "loss": 1.721, + "step": 1205 + }, + { + "epoch": 0.3660646532098953, + "grad_norm": 0.828880786895752, + "learning_rate": 9.273390036452005e-05, + "loss": 1.8865, + "step": 1206 + }, + { + "epoch": 0.3663681894065867, + "grad_norm": 0.3704030215740204, + "learning_rate": 9.272782503037668e-05, + "loss": 2.1905, + "step": 1207 + }, + { + "epoch": 0.3666717256032782, + "grad_norm": 0.3877900540828705, + "learning_rate": 9.27217496962333e-05, + "loss": 1.9098, + "step": 1208 + }, + { + "epoch": 0.36697526179996964, + "grad_norm": 0.39982378482818604, + "learning_rate": 9.271567436208991e-05, + "loss": 1.948, + "step": 1209 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.4450254440307617, + "learning_rate": 9.270959902794653e-05, + "loss": 2.0398, + "step": 1210 + }, + { + "epoch": 0.36758233419335257, + "grad_norm": 0.4938177168369293, + "learning_rate": 9.270352369380316e-05, + "loss": 1.9824, + "step": 1211 + }, + { + "epoch": 0.367885870390044, + "grad_norm": 0.44947531819343567, + "learning_rate": 9.26974483596598e-05, + "loss": 2.0937, + "step": 1212 + }, + { + "epoch": 0.3681894065867355, + "grad_norm": 0.5059708952903748, + "learning_rate": 9.26913730255164e-05, + "loss": 1.5887, + "step": 1213 + }, + { + "epoch": 0.3684929427834269, + "grad_norm": 0.38562676310539246, + "learning_rate": 9.268529769137303e-05, + "loss": 1.898, + "step": 1214 + }, + { + "epoch": 0.36879647898011836, + "grad_norm": 0.8131862282752991, + "learning_rate": 9.267922235722966e-05, + "loss": 1.9549, + "step": 1215 + }, + { + "epoch": 0.36910001517680985, + "grad_norm": 0.3856705129146576, + "learning_rate": 9.267314702308628e-05, + "loss": 1.9469, + "step": 1216 + }, + { + "epoch": 0.3694035513735013, + "grad_norm": 0.38688114285469055, + "learning_rate": 9.26670716889429e-05, + "loss": 2.0386, + "step": 1217 + }, + { + "epoch": 0.36970708757019277, + "grad_norm": 0.4043256938457489, + "learning_rate": 9.266099635479952e-05, + "loss": 2.0083, + "step": 1218 + }, + { + "epoch": 0.3700106237668842, + "grad_norm": 2.2425715923309326, + "learning_rate": 9.265492102065614e-05, + "loss": 1.9006, + "step": 1219 + }, + { + "epoch": 0.37031415996357564, + "grad_norm": 0.35413646697998047, + "learning_rate": 9.264884568651276e-05, + "loss": 1.597, + "step": 1220 + }, + { + "epoch": 0.37061769616026713, + "grad_norm": 0.4035986363887787, + "learning_rate": 9.264277035236939e-05, + "loss": 1.9021, + "step": 1221 + }, + { + "epoch": 0.37092123235695856, + "grad_norm": 0.38641074299812317, + "learning_rate": 9.263669501822601e-05, + "loss": 2.1007, + "step": 1222 + }, + { + "epoch": 0.37122476855365, + "grad_norm": 0.3715684711933136, + "learning_rate": 9.263061968408262e-05, + "loss": 1.8929, + "step": 1223 + }, + { + "epoch": 0.3715283047503415, + "grad_norm": 0.3876987397670746, + "learning_rate": 9.262454434993924e-05, + "loss": 1.8983, + "step": 1224 + }, + { + "epoch": 0.3718318409470329, + "grad_norm": 0.4943421483039856, + "learning_rate": 9.261846901579587e-05, + "loss": 1.7334, + "step": 1225 + }, + { + "epoch": 0.3721353771437244, + "grad_norm": 0.41828441619873047, + "learning_rate": 9.26123936816525e-05, + "loss": 2.1366, + "step": 1226 + }, + { + "epoch": 0.37243891334041584, + "grad_norm": 0.4057375490665436, + "learning_rate": 9.260631834750911e-05, + "loss": 1.8695, + "step": 1227 + }, + { + "epoch": 0.3727424495371073, + "grad_norm": 0.37381577491760254, + "learning_rate": 9.260024301336574e-05, + "loss": 1.8757, + "step": 1228 + }, + { + "epoch": 0.37304598573379877, + "grad_norm": 0.31567415595054626, + "learning_rate": 9.259416767922237e-05, + "loss": 1.579, + "step": 1229 + }, + { + "epoch": 0.3733495219304902, + "grad_norm": 0.3704005181789398, + "learning_rate": 9.258809234507899e-05, + "loss": 2.0297, + "step": 1230 + }, + { + "epoch": 0.3736530581271817, + "grad_norm": 0.37612470984458923, + "learning_rate": 9.25820170109356e-05, + "loss": 1.8501, + "step": 1231 + }, + { + "epoch": 0.3739565943238731, + "grad_norm": 0.37165501713752747, + "learning_rate": 9.257594167679223e-05, + "loss": 1.5672, + "step": 1232 + }, + { + "epoch": 0.37426013052056456, + "grad_norm": 0.9847288131713867, + "learning_rate": 9.256986634264885e-05, + "loss": 1.4694, + "step": 1233 + }, + { + "epoch": 0.37456366671725605, + "grad_norm": 0.35515448451042175, + "learning_rate": 9.256379100850547e-05, + "loss": 1.8861, + "step": 1234 + }, + { + "epoch": 0.3748672029139475, + "grad_norm": 0.46874669194221497, + "learning_rate": 9.25577156743621e-05, + "loss": 2.002, + "step": 1235 + }, + { + "epoch": 0.3751707391106389, + "grad_norm": 0.4635021686553955, + "learning_rate": 9.255164034021872e-05, + "loss": 1.3803, + "step": 1236 + }, + { + "epoch": 0.3754742753073304, + "grad_norm": 0.3871179521083832, + "learning_rate": 9.254556500607533e-05, + "loss": 1.9017, + "step": 1237 + }, + { + "epoch": 0.37577781150402184, + "grad_norm": 0.3958319127559662, + "learning_rate": 9.253948967193195e-05, + "loss": 2.0054, + "step": 1238 + }, + { + "epoch": 0.37608134770071333, + "grad_norm": 0.38364940881729126, + "learning_rate": 9.253341433778858e-05, + "loss": 1.7178, + "step": 1239 + }, + { + "epoch": 0.37638488389740477, + "grad_norm": 0.4198092222213745, + "learning_rate": 9.252733900364521e-05, + "loss": 1.9953, + "step": 1240 + }, + { + "epoch": 0.3766884200940962, + "grad_norm": 0.46621835231781006, + "learning_rate": 9.252126366950182e-05, + "loss": 1.7053, + "step": 1241 + }, + { + "epoch": 0.3769919562907877, + "grad_norm": 0.3871505558490753, + "learning_rate": 9.251518833535845e-05, + "loss": 1.7543, + "step": 1242 + }, + { + "epoch": 0.3772954924874791, + "grad_norm": 0.33642569184303284, + "learning_rate": 9.250911300121508e-05, + "loss": 1.875, + "step": 1243 + }, + { + "epoch": 0.3775990286841706, + "grad_norm": 0.37663060426712036, + "learning_rate": 9.25030376670717e-05, + "loss": 2.0942, + "step": 1244 + }, + { + "epoch": 0.37790256488086205, + "grad_norm": 0.5118516087532043, + "learning_rate": 9.249696233292831e-05, + "loss": 1.6102, + "step": 1245 + }, + { + "epoch": 0.3782061010775535, + "grad_norm": 0.37116679549217224, + "learning_rate": 9.249088699878494e-05, + "loss": 1.8115, + "step": 1246 + }, + { + "epoch": 0.37850963727424497, + "grad_norm": 0.3737630248069763, + "learning_rate": 9.248481166464156e-05, + "loss": 1.8607, + "step": 1247 + }, + { + "epoch": 0.3788131734709364, + "grad_norm": 0.4388040006160736, + "learning_rate": 9.247873633049818e-05, + "loss": 1.8458, + "step": 1248 + }, + { + "epoch": 0.37911670966762784, + "grad_norm": 0.401643842458725, + "learning_rate": 9.24726609963548e-05, + "loss": 2.022, + "step": 1249 + }, + { + "epoch": 0.3794202458643193, + "grad_norm": 0.4450658857822418, + "learning_rate": 9.246658566221143e-05, + "loss": 2.082, + "step": 1250 + }, + { + "epoch": 0.37972378206101076, + "grad_norm": 0.4192996025085449, + "learning_rate": 9.246051032806805e-05, + "loss": 1.8848, + "step": 1251 + }, + { + "epoch": 0.38002731825770225, + "grad_norm": 0.4925002157688141, + "learning_rate": 9.245443499392466e-05, + "loss": 1.9252, + "step": 1252 + }, + { + "epoch": 0.3803308544543937, + "grad_norm": 0.43910741806030273, + "learning_rate": 9.244835965978129e-05, + "loss": 1.0955, + "step": 1253 + }, + { + "epoch": 0.3806343906510851, + "grad_norm": 0.3679327070713043, + "learning_rate": 9.244228432563792e-05, + "loss": 1.9442, + "step": 1254 + }, + { + "epoch": 0.3809379268477766, + "grad_norm": 0.353431761264801, + "learning_rate": 9.243620899149453e-05, + "loss": 2.1047, + "step": 1255 + }, + { + "epoch": 0.38124146304446804, + "grad_norm": 0.4353777766227722, + "learning_rate": 9.243013365735116e-05, + "loss": 1.9008, + "step": 1256 + }, + { + "epoch": 0.38154499924115953, + "grad_norm": 0.5220703482627869, + "learning_rate": 9.242405832320779e-05, + "loss": 1.9991, + "step": 1257 + }, + { + "epoch": 0.38184853543785097, + "grad_norm": 0.4233221709728241, + "learning_rate": 9.24179829890644e-05, + "loss": 1.946, + "step": 1258 + }, + { + "epoch": 0.3821520716345424, + "grad_norm": 0.4323975145816803, + "learning_rate": 9.241190765492102e-05, + "loss": 1.2619, + "step": 1259 + }, + { + "epoch": 0.3824556078312339, + "grad_norm": 0.41842687129974365, + "learning_rate": 9.240583232077766e-05, + "loss": 1.7097, + "step": 1260 + }, + { + "epoch": 0.3827591440279253, + "grad_norm": 0.37142685055732727, + "learning_rate": 9.239975698663427e-05, + "loss": 1.771, + "step": 1261 + }, + { + "epoch": 0.38306268022461676, + "grad_norm": 0.3784460127353668, + "learning_rate": 9.239368165249089e-05, + "loss": 1.7196, + "step": 1262 + }, + { + "epoch": 0.38336621642130825, + "grad_norm": 0.4241008460521698, + "learning_rate": 9.238760631834751e-05, + "loss": 1.5656, + "step": 1263 + }, + { + "epoch": 0.3836697526179997, + "grad_norm": 0.4829429090023041, + "learning_rate": 9.238153098420414e-05, + "loss": 2.0334, + "step": 1264 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.3593828082084656, + "learning_rate": 9.237545565006076e-05, + "loss": 1.8076, + "step": 1265 + }, + { + "epoch": 0.3842768250113826, + "grad_norm": 0.4482446014881134, + "learning_rate": 9.236938031591737e-05, + "loss": 1.7199, + "step": 1266 + }, + { + "epoch": 0.38458036120807404, + "grad_norm": 0.3537690043449402, + "learning_rate": 9.2363304981774e-05, + "loss": 2.027, + "step": 1267 + }, + { + "epoch": 0.38488389740476553, + "grad_norm": 0.3928816616535187, + "learning_rate": 9.235722964763063e-05, + "loss": 1.811, + "step": 1268 + }, + { + "epoch": 0.38518743360145696, + "grad_norm": 0.4176971912384033, + "learning_rate": 9.235115431348724e-05, + "loss": 1.9443, + "step": 1269 + }, + { + "epoch": 0.38549096979814845, + "grad_norm": 0.3628327548503876, + "learning_rate": 9.234507897934387e-05, + "loss": 2.1905, + "step": 1270 + }, + { + "epoch": 0.3857945059948399, + "grad_norm": 0.40045323967933655, + "learning_rate": 9.23390036452005e-05, + "loss": 1.8231, + "step": 1271 + }, + { + "epoch": 0.3860980421915313, + "grad_norm": 0.36478134989738464, + "learning_rate": 9.23329283110571e-05, + "loss": 1.9453, + "step": 1272 + }, + { + "epoch": 0.3864015783882228, + "grad_norm": 0.36314669251441956, + "learning_rate": 9.232685297691373e-05, + "loss": 1.7708, + "step": 1273 + }, + { + "epoch": 0.38670511458491424, + "grad_norm": 0.44175347685813904, + "learning_rate": 9.232077764277037e-05, + "loss": 2.1961, + "step": 1274 + }, + { + "epoch": 0.38700865078160573, + "grad_norm": 0.411424845457077, + "learning_rate": 9.231470230862698e-05, + "loss": 1.7878, + "step": 1275 + }, + { + "epoch": 0.38731218697829717, + "grad_norm": 0.4166533648967743, + "learning_rate": 9.23086269744836e-05, + "loss": 1.9688, + "step": 1276 + }, + { + "epoch": 0.3876157231749886, + "grad_norm": 0.3575800061225891, + "learning_rate": 9.230255164034022e-05, + "loss": 1.94, + "step": 1277 + }, + { + "epoch": 0.3879192593716801, + "grad_norm": 0.37767383456230164, + "learning_rate": 9.229647630619685e-05, + "loss": 2.1039, + "step": 1278 + }, + { + "epoch": 0.3882227955683715, + "grad_norm": 0.5323564410209656, + "learning_rate": 9.229040097205347e-05, + "loss": 2.1214, + "step": 1279 + }, + { + "epoch": 0.38852633176506296, + "grad_norm": 0.37731266021728516, + "learning_rate": 9.228432563791008e-05, + "loss": 1.5483, + "step": 1280 + }, + { + "epoch": 0.38882986796175445, + "grad_norm": 0.3776138126850128, + "learning_rate": 9.227825030376671e-05, + "loss": 1.8353, + "step": 1281 + }, + { + "epoch": 0.3891334041584459, + "grad_norm": 0.40437617897987366, + "learning_rate": 9.227217496962333e-05, + "loss": 1.7512, + "step": 1282 + }, + { + "epoch": 0.3894369403551374, + "grad_norm": 0.4422746002674103, + "learning_rate": 9.226609963547995e-05, + "loss": 1.8497, + "step": 1283 + }, + { + "epoch": 0.3897404765518288, + "grad_norm": 0.3358697295188904, + "learning_rate": 9.226002430133658e-05, + "loss": 1.8597, + "step": 1284 + }, + { + "epoch": 0.39004401274852024, + "grad_norm": 0.8504922986030579, + "learning_rate": 9.225394896719321e-05, + "loss": 1.4645, + "step": 1285 + }, + { + "epoch": 0.39034754894521173, + "grad_norm": 0.35627248883247375, + "learning_rate": 9.224787363304981e-05, + "loss": 2.0882, + "step": 1286 + }, + { + "epoch": 0.39065108514190316, + "grad_norm": 0.3549906015396118, + "learning_rate": 9.224179829890644e-05, + "loss": 1.1979, + "step": 1287 + }, + { + "epoch": 0.39095462133859465, + "grad_norm": 0.41105836629867554, + "learning_rate": 9.223572296476308e-05, + "loss": 1.5483, + "step": 1288 + }, + { + "epoch": 0.3912581575352861, + "grad_norm": 0.38563552498817444, + "learning_rate": 9.222964763061969e-05, + "loss": 1.6972, + "step": 1289 + }, + { + "epoch": 0.3915616937319775, + "grad_norm": 0.6308382749557495, + "learning_rate": 9.222357229647631e-05, + "loss": 1.7006, + "step": 1290 + }, + { + "epoch": 0.391865229928669, + "grad_norm": 0.4371561110019684, + "learning_rate": 9.221749696233293e-05, + "loss": 1.838, + "step": 1291 + }, + { + "epoch": 0.39216876612536045, + "grad_norm": 0.421274334192276, + "learning_rate": 9.221142162818956e-05, + "loss": 2.0326, + "step": 1292 + }, + { + "epoch": 0.3924723023220519, + "grad_norm": 0.39431074261665344, + "learning_rate": 9.220534629404618e-05, + "loss": 1.9654, + "step": 1293 + }, + { + "epoch": 0.39277583851874337, + "grad_norm": 0.3802948594093323, + "learning_rate": 9.219927095990279e-05, + "loss": 1.8311, + "step": 1294 + }, + { + "epoch": 0.3930793747154348, + "grad_norm": 0.44941607117652893, + "learning_rate": 9.219319562575942e-05, + "loss": 1.879, + "step": 1295 + }, + { + "epoch": 0.3933829109121263, + "grad_norm": 0.395014226436615, + "learning_rate": 9.218712029161604e-05, + "loss": 1.5158, + "step": 1296 + }, + { + "epoch": 0.3936864471088177, + "grad_norm": 0.3692936599254608, + "learning_rate": 9.218104495747266e-05, + "loss": 2.1428, + "step": 1297 + }, + { + "epoch": 0.39398998330550916, + "grad_norm": 0.41931676864624023, + "learning_rate": 9.217496962332929e-05, + "loss": 1.5854, + "step": 1298 + }, + { + "epoch": 0.39429351950220065, + "grad_norm": 0.4195273816585541, + "learning_rate": 9.216889428918592e-05, + "loss": 1.4504, + "step": 1299 + }, + { + "epoch": 0.3945970556988921, + "grad_norm": 0.4246782958507538, + "learning_rate": 9.216281895504252e-05, + "loss": 1.5243, + "step": 1300 + }, + { + "epoch": 0.3949005918955836, + "grad_norm": 0.3366101086139679, + "learning_rate": 9.215674362089915e-05, + "loss": 1.5563, + "step": 1301 + }, + { + "epoch": 0.395204128092275, + "grad_norm": 0.5027258992195129, + "learning_rate": 9.215066828675579e-05, + "loss": 1.8705, + "step": 1302 + }, + { + "epoch": 0.39550766428896644, + "grad_norm": 0.33939701318740845, + "learning_rate": 9.21445929526124e-05, + "loss": 1.5192, + "step": 1303 + }, + { + "epoch": 0.39581120048565793, + "grad_norm": 0.5187803506851196, + "learning_rate": 9.213851761846902e-05, + "loss": 1.5153, + "step": 1304 + }, + { + "epoch": 0.39611473668234937, + "grad_norm": 0.43332159519195557, + "learning_rate": 9.213244228432564e-05, + "loss": 1.9878, + "step": 1305 + }, + { + "epoch": 0.3964182728790408, + "grad_norm": 0.371183842420578, + "learning_rate": 9.212636695018227e-05, + "loss": 1.9458, + "step": 1306 + }, + { + "epoch": 0.3967218090757323, + "grad_norm": 0.40977227687835693, + "learning_rate": 9.212029161603889e-05, + "loss": 2.17, + "step": 1307 + }, + { + "epoch": 0.3970253452724237, + "grad_norm": 0.37145286798477173, + "learning_rate": 9.21142162818955e-05, + "loss": 1.9565, + "step": 1308 + }, + { + "epoch": 0.3973288814691152, + "grad_norm": 0.45751968026161194, + "learning_rate": 9.210814094775213e-05, + "loss": 1.7469, + "step": 1309 + }, + { + "epoch": 0.39763241766580665, + "grad_norm": 0.39320966601371765, + "learning_rate": 9.210206561360875e-05, + "loss": 2.2532, + "step": 1310 + }, + { + "epoch": 0.3979359538624981, + "grad_norm": 0.44945451617240906, + "learning_rate": 9.209599027946537e-05, + "loss": 1.896, + "step": 1311 + }, + { + "epoch": 0.39823949005918957, + "grad_norm": 0.4179849326610565, + "learning_rate": 9.2089914945322e-05, + "loss": 2.0536, + "step": 1312 + }, + { + "epoch": 0.398543026255881, + "grad_norm": 0.3893973231315613, + "learning_rate": 9.208383961117863e-05, + "loss": 1.5888, + "step": 1313 + }, + { + "epoch": 0.3988465624525725, + "grad_norm": 0.4161340892314911, + "learning_rate": 9.207776427703523e-05, + "loss": 2.071, + "step": 1314 + }, + { + "epoch": 0.39915009864926393, + "grad_norm": 0.37969034910202026, + "learning_rate": 9.207168894289186e-05, + "loss": 1.7566, + "step": 1315 + }, + { + "epoch": 0.39945363484595536, + "grad_norm": 0.4157601296901703, + "learning_rate": 9.20656136087485e-05, + "loss": 2.1174, + "step": 1316 + }, + { + "epoch": 0.39975717104264685, + "grad_norm": 0.3726348876953125, + "learning_rate": 9.205953827460511e-05, + "loss": 1.9701, + "step": 1317 + }, + { + "epoch": 0.4000607072393383, + "grad_norm": 0.39407408237457275, + "learning_rate": 9.205346294046173e-05, + "loss": 1.6288, + "step": 1318 + }, + { + "epoch": 0.4003642434360297, + "grad_norm": 0.37205028533935547, + "learning_rate": 9.204738760631835e-05, + "loss": 1.801, + "step": 1319 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.39794477820396423, + "learning_rate": 9.204131227217498e-05, + "loss": 2.1164, + "step": 1320 + }, + { + "epoch": 0.40097131582941264, + "grad_norm": 0.4078124761581421, + "learning_rate": 9.20352369380316e-05, + "loss": 1.8339, + "step": 1321 + }, + { + "epoch": 0.40127485202610413, + "grad_norm": 0.4183814227581024, + "learning_rate": 9.202916160388821e-05, + "loss": 1.7706, + "step": 1322 + }, + { + "epoch": 0.40157838822279557, + "grad_norm": 0.6123658418655396, + "learning_rate": 9.202308626974484e-05, + "loss": 1.7745, + "step": 1323 + }, + { + "epoch": 0.401881924419487, + "grad_norm": 0.36878085136413574, + "learning_rate": 9.201701093560146e-05, + "loss": 1.2733, + "step": 1324 + }, + { + "epoch": 0.4021854606161785, + "grad_norm": 0.3583606779575348, + "learning_rate": 9.201093560145808e-05, + "loss": 1.3474, + "step": 1325 + }, + { + "epoch": 0.4024889968128699, + "grad_norm": 0.4098053574562073, + "learning_rate": 9.200486026731471e-05, + "loss": 1.845, + "step": 1326 + }, + { + "epoch": 0.4027925330095614, + "grad_norm": 0.5891076326370239, + "learning_rate": 9.199878493317134e-05, + "loss": 2.0312, + "step": 1327 + }, + { + "epoch": 0.40309606920625285, + "grad_norm": 0.5270339250564575, + "learning_rate": 9.199270959902794e-05, + "loss": 1.6679, + "step": 1328 + }, + { + "epoch": 0.4033996054029443, + "grad_norm": 0.4184766411781311, + "learning_rate": 9.198663426488457e-05, + "loss": 2.1587, + "step": 1329 + }, + { + "epoch": 0.4037031415996358, + "grad_norm": 0.3945539593696594, + "learning_rate": 9.198055893074119e-05, + "loss": 1.9374, + "step": 1330 + }, + { + "epoch": 0.4040066777963272, + "grad_norm": 0.3906068205833435, + "learning_rate": 9.197448359659781e-05, + "loss": 1.4602, + "step": 1331 + }, + { + "epoch": 0.40431021399301864, + "grad_norm": 0.4073745906352997, + "learning_rate": 9.196840826245444e-05, + "loss": 2.0568, + "step": 1332 + }, + { + "epoch": 0.40461375018971013, + "grad_norm": 0.35083553194999695, + "learning_rate": 9.196233292831106e-05, + "loss": 1.5915, + "step": 1333 + }, + { + "epoch": 0.40491728638640156, + "grad_norm": 0.38344746828079224, + "learning_rate": 9.195625759416769e-05, + "loss": 1.8988, + "step": 1334 + }, + { + "epoch": 0.40522082258309305, + "grad_norm": 0.8442848920822144, + "learning_rate": 9.19501822600243e-05, + "loss": 1.6229, + "step": 1335 + }, + { + "epoch": 0.4055243587797845, + "grad_norm": 0.41683101654052734, + "learning_rate": 9.194410692588092e-05, + "loss": 1.6984, + "step": 1336 + }, + { + "epoch": 0.4058278949764759, + "grad_norm": 0.43008947372436523, + "learning_rate": 9.193803159173755e-05, + "loss": 1.8312, + "step": 1337 + }, + { + "epoch": 0.4061314311731674, + "grad_norm": 0.43498390913009644, + "learning_rate": 9.193195625759417e-05, + "loss": 1.735, + "step": 1338 + }, + { + "epoch": 0.40643496736985885, + "grad_norm": 0.361969530582428, + "learning_rate": 9.192588092345079e-05, + "loss": 2.015, + "step": 1339 + }, + { + "epoch": 0.40673850356655034, + "grad_norm": 0.4028913378715515, + "learning_rate": 9.191980558930742e-05, + "loss": 1.9139, + "step": 1340 + }, + { + "epoch": 0.40704203976324177, + "grad_norm": 0.46840906143188477, + "learning_rate": 9.191373025516405e-05, + "loss": 1.913, + "step": 1341 + }, + { + "epoch": 0.4073455759599332, + "grad_norm": 0.39075982570648193, + "learning_rate": 9.190765492102065e-05, + "loss": 1.8908, + "step": 1342 + }, + { + "epoch": 0.4076491121566247, + "grad_norm": 0.3519285023212433, + "learning_rate": 9.190157958687728e-05, + "loss": 1.8016, + "step": 1343 + }, + { + "epoch": 0.4079526483533161, + "grad_norm": 0.43734681606292725, + "learning_rate": 9.18955042527339e-05, + "loss": 1.5545, + "step": 1344 + }, + { + "epoch": 0.40825618455000756, + "grad_norm": 0.4044792950153351, + "learning_rate": 9.188942891859052e-05, + "loss": 1.836, + "step": 1345 + }, + { + "epoch": 0.40855972074669905, + "grad_norm": 0.3661639988422394, + "learning_rate": 9.188335358444715e-05, + "loss": 2.0347, + "step": 1346 + }, + { + "epoch": 0.4088632569433905, + "grad_norm": 0.46894827485084534, + "learning_rate": 9.187727825030377e-05, + "loss": 1.5674, + "step": 1347 + }, + { + "epoch": 0.409166793140082, + "grad_norm": 0.4389861226081848, + "learning_rate": 9.18712029161604e-05, + "loss": 2.0472, + "step": 1348 + }, + { + "epoch": 0.4094703293367734, + "grad_norm": 0.3514555096626282, + "learning_rate": 9.186512758201702e-05, + "loss": 1.7453, + "step": 1349 + }, + { + "epoch": 0.40977386553346484, + "grad_norm": 0.35691046714782715, + "learning_rate": 9.185905224787363e-05, + "loss": 2.2705, + "step": 1350 + }, + { + "epoch": 0.41007740173015633, + "grad_norm": 0.3770231604576111, + "learning_rate": 9.185297691373026e-05, + "loss": 1.9172, + "step": 1351 + }, + { + "epoch": 0.41038093792684777, + "grad_norm": 0.40932852029800415, + "learning_rate": 9.184690157958688e-05, + "loss": 1.8489, + "step": 1352 + }, + { + "epoch": 0.41068447412353926, + "grad_norm": 0.39512181282043457, + "learning_rate": 9.18408262454435e-05, + "loss": 1.7082, + "step": 1353 + }, + { + "epoch": 0.4109880103202307, + "grad_norm": 0.33877119421958923, + "learning_rate": 9.183475091130013e-05, + "loss": 1.7629, + "step": 1354 + }, + { + "epoch": 0.4112915465169221, + "grad_norm": 0.4188339412212372, + "learning_rate": 9.182867557715675e-05, + "loss": 1.4726, + "step": 1355 + }, + { + "epoch": 0.4115950827136136, + "grad_norm": 0.3661527931690216, + "learning_rate": 9.182260024301336e-05, + "loss": 1.7752, + "step": 1356 + }, + { + "epoch": 0.41189861891030505, + "grad_norm": 0.3717115819454193, + "learning_rate": 9.181652490887e-05, + "loss": 1.6128, + "step": 1357 + }, + { + "epoch": 0.4122021551069965, + "grad_norm": 0.783671498298645, + "learning_rate": 9.181044957472661e-05, + "loss": 1.3576, + "step": 1358 + }, + { + "epoch": 0.41250569130368797, + "grad_norm": 0.5294111967086792, + "learning_rate": 9.180437424058323e-05, + "loss": 1.6084, + "step": 1359 + }, + { + "epoch": 0.4128092275003794, + "grad_norm": 0.42108646035194397, + "learning_rate": 9.179829890643986e-05, + "loss": 2.0071, + "step": 1360 + }, + { + "epoch": 0.4131127636970709, + "grad_norm": 0.3648010492324829, + "learning_rate": 9.179222357229648e-05, + "loss": 1.9791, + "step": 1361 + }, + { + "epoch": 0.41341629989376233, + "grad_norm": 0.31227168440818787, + "learning_rate": 9.178614823815311e-05, + "loss": 1.811, + "step": 1362 + }, + { + "epoch": 0.41371983609045376, + "grad_norm": 0.34013819694519043, + "learning_rate": 9.178007290400973e-05, + "loss": 1.6035, + "step": 1363 + }, + { + "epoch": 0.41402337228714525, + "grad_norm": 0.3848358392715454, + "learning_rate": 9.177399756986634e-05, + "loss": 1.7878, + "step": 1364 + }, + { + "epoch": 0.4143269084838367, + "grad_norm": 0.33737045526504517, + "learning_rate": 9.176792223572297e-05, + "loss": 1.5855, + "step": 1365 + }, + { + "epoch": 0.4146304446805282, + "grad_norm": 0.3722662329673767, + "learning_rate": 9.176184690157959e-05, + "loss": 1.4857, + "step": 1366 + }, + { + "epoch": 0.4149339808772196, + "grad_norm": 0.38730594515800476, + "learning_rate": 9.175577156743621e-05, + "loss": 1.858, + "step": 1367 + }, + { + "epoch": 0.41523751707391104, + "grad_norm": 0.34036555886268616, + "learning_rate": 9.174969623329284e-05, + "loss": 1.4445, + "step": 1368 + }, + { + "epoch": 0.41554105327060253, + "grad_norm": 0.395327091217041, + "learning_rate": 9.174362089914946e-05, + "loss": 1.5194, + "step": 1369 + }, + { + "epoch": 0.41584458946729397, + "grad_norm": 0.4212843179702759, + "learning_rate": 9.173754556500607e-05, + "loss": 2.3923, + "step": 1370 + }, + { + "epoch": 0.41614812566398546, + "grad_norm": 0.33540114760398865, + "learning_rate": 9.17314702308627e-05, + "loss": 1.9129, + "step": 1371 + }, + { + "epoch": 0.4164516618606769, + "grad_norm": 0.43007227778434753, + "learning_rate": 9.172539489671932e-05, + "loss": 1.7658, + "step": 1372 + }, + { + "epoch": 0.4167551980573683, + "grad_norm": 0.3466784358024597, + "learning_rate": 9.171931956257594e-05, + "loss": 1.8647, + "step": 1373 + }, + { + "epoch": 0.4170587342540598, + "grad_norm": 0.35446929931640625, + "learning_rate": 9.171324422843257e-05, + "loss": 1.5265, + "step": 1374 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.3868924081325531, + "learning_rate": 9.170716889428919e-05, + "loss": 1.6355, + "step": 1375 + }, + { + "epoch": 0.4176658066474427, + "grad_norm": 0.39501097798347473, + "learning_rate": 9.170109356014582e-05, + "loss": 1.7426, + "step": 1376 + }, + { + "epoch": 0.4179693428441342, + "grad_norm": 0.44299614429473877, + "learning_rate": 9.169501822600244e-05, + "loss": 1.8582, + "step": 1377 + }, + { + "epoch": 0.4182728790408256, + "grad_norm": 0.438927561044693, + "learning_rate": 9.168894289185905e-05, + "loss": 1.9043, + "step": 1378 + }, + { + "epoch": 0.4185764152375171, + "grad_norm": 0.3874059319496155, + "learning_rate": 9.168286755771568e-05, + "loss": 1.2591, + "step": 1379 + }, + { + "epoch": 0.41887995143420853, + "grad_norm": 0.40715524554252625, + "learning_rate": 9.16767922235723e-05, + "loss": 1.9779, + "step": 1380 + }, + { + "epoch": 0.41918348763089996, + "grad_norm": 0.3737177848815918, + "learning_rate": 9.167071688942892e-05, + "loss": 1.97, + "step": 1381 + }, + { + "epoch": 0.41948702382759145, + "grad_norm": 0.3690639138221741, + "learning_rate": 9.166464155528555e-05, + "loss": 1.9618, + "step": 1382 + }, + { + "epoch": 0.4197905600242829, + "grad_norm": 1.3496792316436768, + "learning_rate": 9.165856622114217e-05, + "loss": 2.1141, + "step": 1383 + }, + { + "epoch": 0.4200940962209744, + "grad_norm": 0.39503785967826843, + "learning_rate": 9.165249088699878e-05, + "loss": 1.8984, + "step": 1384 + }, + { + "epoch": 0.4203976324176658, + "grad_norm": 0.33576783537864685, + "learning_rate": 9.164641555285541e-05, + "loss": 1.8536, + "step": 1385 + }, + { + "epoch": 0.42070116861435725, + "grad_norm": 0.8600859642028809, + "learning_rate": 9.164034021871203e-05, + "loss": 1.3809, + "step": 1386 + }, + { + "epoch": 0.42100470481104874, + "grad_norm": 0.3842533230781555, + "learning_rate": 9.163426488456865e-05, + "loss": 1.6408, + "step": 1387 + }, + { + "epoch": 0.42130824100774017, + "grad_norm": 0.4019504487514496, + "learning_rate": 9.162818955042528e-05, + "loss": 1.9738, + "step": 1388 + }, + { + "epoch": 0.4216117772044316, + "grad_norm": 0.399406760931015, + "learning_rate": 9.16221142162819e-05, + "loss": 1.9897, + "step": 1389 + }, + { + "epoch": 0.4219153134011231, + "grad_norm": 0.35225290060043335, + "learning_rate": 9.161603888213853e-05, + "loss": 1.8217, + "step": 1390 + }, + { + "epoch": 0.4222188495978145, + "grad_norm": 0.3673458695411682, + "learning_rate": 9.160996354799515e-05, + "loss": 1.9175, + "step": 1391 + }, + { + "epoch": 0.422522385794506, + "grad_norm": 0.37856656312942505, + "learning_rate": 9.160388821385176e-05, + "loss": 1.8, + "step": 1392 + }, + { + "epoch": 0.42282592199119745, + "grad_norm": 0.3543725907802582, + "learning_rate": 9.15978128797084e-05, + "loss": 2.0975, + "step": 1393 + }, + { + "epoch": 0.4231294581878889, + "grad_norm": 0.34620916843414307, + "learning_rate": 9.159173754556501e-05, + "loss": 1.803, + "step": 1394 + }, + { + "epoch": 0.4234329943845804, + "grad_norm": 0.428543359041214, + "learning_rate": 9.158566221142163e-05, + "loss": 1.8852, + "step": 1395 + }, + { + "epoch": 0.4237365305812718, + "grad_norm": 0.41286444664001465, + "learning_rate": 9.157958687727826e-05, + "loss": 1.7661, + "step": 1396 + }, + { + "epoch": 0.4240400667779633, + "grad_norm": 0.42155444622039795, + "learning_rate": 9.157351154313488e-05, + "loss": 1.9728, + "step": 1397 + }, + { + "epoch": 0.42434360297465473, + "grad_norm": 0.4446348547935486, + "learning_rate": 9.15674362089915e-05, + "loss": 1.6411, + "step": 1398 + }, + { + "epoch": 0.42464713917134617, + "grad_norm": 0.38683468103408813, + "learning_rate": 9.156136087484812e-05, + "loss": 1.9311, + "step": 1399 + }, + { + "epoch": 0.42495067536803766, + "grad_norm": 0.398798406124115, + "learning_rate": 9.155528554070474e-05, + "loss": 1.867, + "step": 1400 + }, + { + "epoch": 0.4252542115647291, + "grad_norm": 0.3723427951335907, + "learning_rate": 9.154921020656136e-05, + "loss": 2.1345, + "step": 1401 + }, + { + "epoch": 0.4255577477614205, + "grad_norm": 0.3853520452976227, + "learning_rate": 9.154313487241799e-05, + "loss": 1.4843, + "step": 1402 + }, + { + "epoch": 0.425861283958112, + "grad_norm": 0.4148903489112854, + "learning_rate": 9.153705953827461e-05, + "loss": 1.9043, + "step": 1403 + }, + { + "epoch": 0.42616482015480345, + "grad_norm": 0.4134661555290222, + "learning_rate": 9.153098420413124e-05, + "loss": 2.0662, + "step": 1404 + }, + { + "epoch": 0.42646835635149494, + "grad_norm": 0.4663408100605011, + "learning_rate": 9.152490886998786e-05, + "loss": 1.8455, + "step": 1405 + }, + { + "epoch": 0.42677189254818637, + "grad_norm": 0.3829919695854187, + "learning_rate": 9.151883353584447e-05, + "loss": 1.7822, + "step": 1406 + }, + { + "epoch": 0.4270754287448778, + "grad_norm": 0.3487464487552643, + "learning_rate": 9.15127582017011e-05, + "loss": 1.9024, + "step": 1407 + }, + { + "epoch": 0.4273789649415693, + "grad_norm": 0.4045817255973816, + "learning_rate": 9.150668286755772e-05, + "loss": 1.6833, + "step": 1408 + }, + { + "epoch": 0.42768250113826073, + "grad_norm": 0.4237898588180542, + "learning_rate": 9.150060753341434e-05, + "loss": 2.0711, + "step": 1409 + }, + { + "epoch": 0.4279860373349522, + "grad_norm": 0.3947038948535919, + "learning_rate": 9.149453219927097e-05, + "loss": 1.6692, + "step": 1410 + }, + { + "epoch": 0.42828957353164365, + "grad_norm": 0.373927503824234, + "learning_rate": 9.148845686512759e-05, + "loss": 2.1039, + "step": 1411 + }, + { + "epoch": 0.4285931097283351, + "grad_norm": 0.45322802662849426, + "learning_rate": 9.14823815309842e-05, + "loss": 1.4528, + "step": 1412 + }, + { + "epoch": 0.4288966459250266, + "grad_norm": 0.4237847328186035, + "learning_rate": 9.147630619684083e-05, + "loss": 1.8462, + "step": 1413 + }, + { + "epoch": 0.429200182121718, + "grad_norm": 0.39840593934059143, + "learning_rate": 9.147023086269745e-05, + "loss": 1.8354, + "step": 1414 + }, + { + "epoch": 0.42950371831840944, + "grad_norm": 0.3490378260612488, + "learning_rate": 9.146415552855407e-05, + "loss": 1.9182, + "step": 1415 + }, + { + "epoch": 0.42980725451510093, + "grad_norm": 0.37050196528434753, + "learning_rate": 9.14580801944107e-05, + "loss": 2.1893, + "step": 1416 + }, + { + "epoch": 0.43011079071179237, + "grad_norm": 0.7810099720954895, + "learning_rate": 9.145200486026732e-05, + "loss": 1.7489, + "step": 1417 + }, + { + "epoch": 0.43041432690848386, + "grad_norm": 0.35921812057495117, + "learning_rate": 9.144592952612393e-05, + "loss": 1.9051, + "step": 1418 + }, + { + "epoch": 0.4307178631051753, + "grad_norm": 0.42429161071777344, + "learning_rate": 9.143985419198057e-05, + "loss": 1.4487, + "step": 1419 + }, + { + "epoch": 0.4310213993018667, + "grad_norm": 0.37482765316963196, + "learning_rate": 9.143377885783718e-05, + "loss": 1.7627, + "step": 1420 + }, + { + "epoch": 0.4313249354985582, + "grad_norm": 0.39142462611198425, + "learning_rate": 9.142770352369381e-05, + "loss": 1.7077, + "step": 1421 + }, + { + "epoch": 0.43162847169524965, + "grad_norm": 0.33967357873916626, + "learning_rate": 9.142162818955043e-05, + "loss": 1.7666, + "step": 1422 + }, + { + "epoch": 0.43193200789194114, + "grad_norm": 0.3520660400390625, + "learning_rate": 9.141555285540705e-05, + "loss": 2.1716, + "step": 1423 + }, + { + "epoch": 0.4322355440886326, + "grad_norm": 0.3724939823150635, + "learning_rate": 9.140947752126368e-05, + "loss": 1.8, + "step": 1424 + }, + { + "epoch": 0.432539080285324, + "grad_norm": 0.37572380900382996, + "learning_rate": 9.14034021871203e-05, + "loss": 1.8956, + "step": 1425 + }, + { + "epoch": 0.4328426164820155, + "grad_norm": 0.38770124316215515, + "learning_rate": 9.139732685297691e-05, + "loss": 1.6381, + "step": 1426 + }, + { + "epoch": 0.43314615267870693, + "grad_norm": 0.5836375951766968, + "learning_rate": 9.139125151883354e-05, + "loss": 1.9264, + "step": 1427 + }, + { + "epoch": 0.43344968887539836, + "grad_norm": 0.44695645570755005, + "learning_rate": 9.138517618469016e-05, + "loss": 1.7427, + "step": 1428 + }, + { + "epoch": 0.43375322507208985, + "grad_norm": 0.3857296407222748, + "learning_rate": 9.137910085054678e-05, + "loss": 1.5448, + "step": 1429 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.417868971824646, + "learning_rate": 9.137302551640341e-05, + "loss": 1.9924, + "step": 1430 + }, + { + "epoch": 0.4343602974654728, + "grad_norm": 0.42111891508102417, + "learning_rate": 9.136695018226003e-05, + "loss": 1.9506, + "step": 1431 + }, + { + "epoch": 0.4346638336621642, + "grad_norm": 0.4096441864967346, + "learning_rate": 9.136087484811664e-05, + "loss": 1.6159, + "step": 1432 + }, + { + "epoch": 0.43496736985885565, + "grad_norm": 0.4601602852344513, + "learning_rate": 9.135479951397328e-05, + "loss": 1.5702, + "step": 1433 + }, + { + "epoch": 0.43527090605554714, + "grad_norm": 0.4030752182006836, + "learning_rate": 9.134872417982989e-05, + "loss": 1.9785, + "step": 1434 + }, + { + "epoch": 0.43557444225223857, + "grad_norm": 0.4357512593269348, + "learning_rate": 9.134264884568652e-05, + "loss": 1.8718, + "step": 1435 + }, + { + "epoch": 0.43587797844893006, + "grad_norm": 0.3511190712451935, + "learning_rate": 9.133657351154314e-05, + "loss": 1.9243, + "step": 1436 + }, + { + "epoch": 0.4361815146456215, + "grad_norm": 0.4554003179073334, + "learning_rate": 9.133049817739976e-05, + "loss": 1.5576, + "step": 1437 + }, + { + "epoch": 0.4364850508423129, + "grad_norm": 0.37637701630592346, + "learning_rate": 9.132442284325639e-05, + "loss": 2.3059, + "step": 1438 + }, + { + "epoch": 0.4367885870390044, + "grad_norm": 0.39330780506134033, + "learning_rate": 9.1318347509113e-05, + "loss": 1.4548, + "step": 1439 + }, + { + "epoch": 0.43709212323569585, + "grad_norm": 0.44056230783462524, + "learning_rate": 9.131227217496962e-05, + "loss": 1.7839, + "step": 1440 + }, + { + "epoch": 0.4373956594323873, + "grad_norm": 1.5862314701080322, + "learning_rate": 9.130619684082625e-05, + "loss": 1.6374, + "step": 1441 + }, + { + "epoch": 0.4376991956290788, + "grad_norm": 0.44076550006866455, + "learning_rate": 9.130012150668287e-05, + "loss": 2.0375, + "step": 1442 + }, + { + "epoch": 0.4380027318257702, + "grad_norm": 0.46712005138397217, + "learning_rate": 9.129404617253949e-05, + "loss": 2.0498, + "step": 1443 + }, + { + "epoch": 0.4383062680224617, + "grad_norm": 0.4472239315509796, + "learning_rate": 9.128797083839612e-05, + "loss": 2.1283, + "step": 1444 + }, + { + "epoch": 0.43860980421915313, + "grad_norm": 0.46304264664649963, + "learning_rate": 9.128189550425274e-05, + "loss": 1.9628, + "step": 1445 + }, + { + "epoch": 0.43891334041584457, + "grad_norm": 0.4066753387451172, + "learning_rate": 9.127582017010935e-05, + "loss": 1.4578, + "step": 1446 + }, + { + "epoch": 0.43921687661253606, + "grad_norm": 0.4386885166168213, + "learning_rate": 9.126974483596599e-05, + "loss": 1.8655, + "step": 1447 + }, + { + "epoch": 0.4395204128092275, + "grad_norm": 0.5175670981407166, + "learning_rate": 9.12636695018226e-05, + "loss": 1.9672, + "step": 1448 + }, + { + "epoch": 0.439823949005919, + "grad_norm": 0.39056891202926636, + "learning_rate": 9.125759416767923e-05, + "loss": 2.2645, + "step": 1449 + }, + { + "epoch": 0.4401274852026104, + "grad_norm": 0.3297121822834015, + "learning_rate": 9.125151883353585e-05, + "loss": 1.651, + "step": 1450 + }, + { + "epoch": 0.44043102139930185, + "grad_norm": 0.37801650166511536, + "learning_rate": 9.124544349939247e-05, + "loss": 1.938, + "step": 1451 + }, + { + "epoch": 0.44073455759599334, + "grad_norm": 0.45800700783729553, + "learning_rate": 9.12393681652491e-05, + "loss": 1.8465, + "step": 1452 + }, + { + "epoch": 0.44103809379268477, + "grad_norm": 0.42198505997657776, + "learning_rate": 9.123329283110572e-05, + "loss": 1.9878, + "step": 1453 + }, + { + "epoch": 0.44134162998937626, + "grad_norm": 0.9476953744888306, + "learning_rate": 9.122721749696233e-05, + "loss": 2.127, + "step": 1454 + }, + { + "epoch": 0.4416451661860677, + "grad_norm": 0.6569995880126953, + "learning_rate": 9.122114216281896e-05, + "loss": 2.1351, + "step": 1455 + }, + { + "epoch": 0.44194870238275913, + "grad_norm": 0.7246467471122742, + "learning_rate": 9.121506682867558e-05, + "loss": 1.9843, + "step": 1456 + }, + { + "epoch": 0.4422522385794506, + "grad_norm": 0.3717383146286011, + "learning_rate": 9.12089914945322e-05, + "loss": 1.7456, + "step": 1457 + }, + { + "epoch": 0.44255577477614205, + "grad_norm": 0.39930397272109985, + "learning_rate": 9.120291616038883e-05, + "loss": 2.1122, + "step": 1458 + }, + { + "epoch": 0.4428593109728335, + "grad_norm": 0.4484943747520447, + "learning_rate": 9.119684082624545e-05, + "loss": 1.8622, + "step": 1459 + }, + { + "epoch": 0.443162847169525, + "grad_norm": 0.45804062485694885, + "learning_rate": 9.119076549210206e-05, + "loss": 2.0503, + "step": 1460 + }, + { + "epoch": 0.4434663833662164, + "grad_norm": 0.381073921918869, + "learning_rate": 9.11846901579587e-05, + "loss": 1.8256, + "step": 1461 + }, + { + "epoch": 0.4437699195629079, + "grad_norm": 0.4491977095603943, + "learning_rate": 9.117861482381531e-05, + "loss": 1.6401, + "step": 1462 + }, + { + "epoch": 0.44407345575959933, + "grad_norm": 0.3925999104976654, + "learning_rate": 9.117253948967194e-05, + "loss": 1.9513, + "step": 1463 + }, + { + "epoch": 0.44437699195629077, + "grad_norm": 0.45975080132484436, + "learning_rate": 9.116646415552856e-05, + "loss": 1.7791, + "step": 1464 + }, + { + "epoch": 0.44468052815298226, + "grad_norm": 0.44993898272514343, + "learning_rate": 9.116038882138518e-05, + "loss": 0.6258, + "step": 1465 + }, + { + "epoch": 0.4449840643496737, + "grad_norm": 0.4088694453239441, + "learning_rate": 9.115431348724181e-05, + "loss": 1.768, + "step": 1466 + }, + { + "epoch": 0.4452876005463652, + "grad_norm": 0.5844696760177612, + "learning_rate": 9.114823815309841e-05, + "loss": 1.7229, + "step": 1467 + }, + { + "epoch": 0.4455911367430566, + "grad_norm": 0.5378713607788086, + "learning_rate": 9.114216281895504e-05, + "loss": 1.8761, + "step": 1468 + }, + { + "epoch": 0.44589467293974805, + "grad_norm": 1.5058445930480957, + "learning_rate": 9.113608748481167e-05, + "loss": 2.0215, + "step": 1469 + }, + { + "epoch": 0.44619820913643954, + "grad_norm": 0.44474056363105774, + "learning_rate": 9.113001215066829e-05, + "loss": 1.9797, + "step": 1470 + }, + { + "epoch": 0.446501745333131, + "grad_norm": 0.4373909831047058, + "learning_rate": 9.112393681652491e-05, + "loss": 2.1042, + "step": 1471 + }, + { + "epoch": 0.4468052815298224, + "grad_norm": 0.5322824716567993, + "learning_rate": 9.111786148238154e-05, + "loss": 1.6328, + "step": 1472 + }, + { + "epoch": 0.4471088177265139, + "grad_norm": 0.4264838993549347, + "learning_rate": 9.111178614823816e-05, + "loss": 1.9764, + "step": 1473 + }, + { + "epoch": 0.44741235392320533, + "grad_norm": 0.39688101410865784, + "learning_rate": 9.110571081409478e-05, + "loss": 1.8385, + "step": 1474 + }, + { + "epoch": 0.4477158901198968, + "grad_norm": 0.3781752586364746, + "learning_rate": 9.10996354799514e-05, + "loss": 2.1128, + "step": 1475 + }, + { + "epoch": 0.44801942631658825, + "grad_norm": 0.40686413645744324, + "learning_rate": 9.109356014580802e-05, + "loss": 2.033, + "step": 1476 + }, + { + "epoch": 0.4483229625132797, + "grad_norm": 0.42852646112442017, + "learning_rate": 9.108748481166465e-05, + "loss": 1.714, + "step": 1477 + }, + { + "epoch": 0.4486264987099712, + "grad_norm": 0.3613603413105011, + "learning_rate": 9.108140947752127e-05, + "loss": 1.2022, + "step": 1478 + }, + { + "epoch": 0.4489300349066626, + "grad_norm": 0.4005518853664398, + "learning_rate": 9.107533414337789e-05, + "loss": 1.9914, + "step": 1479 + }, + { + "epoch": 0.4492335711033541, + "grad_norm": 0.3479957580566406, + "learning_rate": 9.106925880923452e-05, + "loss": 1.7423, + "step": 1480 + }, + { + "epoch": 0.44953710730004554, + "grad_norm": 0.43999946117401123, + "learning_rate": 9.106318347509112e-05, + "loss": 2.0009, + "step": 1481 + }, + { + "epoch": 0.44984064349673697, + "grad_norm": 0.36132052540779114, + "learning_rate": 9.105710814094775e-05, + "loss": 1.916, + "step": 1482 + }, + { + "epoch": 0.45014417969342846, + "grad_norm": 0.33822518587112427, + "learning_rate": 9.105103280680438e-05, + "loss": 1.6778, + "step": 1483 + }, + { + "epoch": 0.4504477158901199, + "grad_norm": 0.35278624296188354, + "learning_rate": 9.1044957472661e-05, + "loss": 1.8943, + "step": 1484 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.47397172451019287, + "learning_rate": 9.103888213851762e-05, + "loss": 1.7541, + "step": 1485 + }, + { + "epoch": 0.4510547882835028, + "grad_norm": 0.3714633882045746, + "learning_rate": 9.103280680437425e-05, + "loss": 2.026, + "step": 1486 + }, + { + "epoch": 0.45135832448019425, + "grad_norm": 1.6088794469833374, + "learning_rate": 9.102673147023087e-05, + "loss": 1.8904, + "step": 1487 + }, + { + "epoch": 0.45166186067688574, + "grad_norm": 0.39234501123428345, + "learning_rate": 9.102065613608749e-05, + "loss": 1.9149, + "step": 1488 + }, + { + "epoch": 0.4519653968735772, + "grad_norm": 0.4205072224140167, + "learning_rate": 9.101458080194412e-05, + "loss": 2.0117, + "step": 1489 + }, + { + "epoch": 0.4522689330702686, + "grad_norm": 0.45428764820098877, + "learning_rate": 9.100850546780073e-05, + "loss": 1.9742, + "step": 1490 + }, + { + "epoch": 0.4525724692669601, + "grad_norm": 0.35566025972366333, + "learning_rate": 9.100243013365735e-05, + "loss": 1.8445, + "step": 1491 + }, + { + "epoch": 0.45287600546365153, + "grad_norm": 0.4020955562591553, + "learning_rate": 9.099635479951398e-05, + "loss": 1.9665, + "step": 1492 + }, + { + "epoch": 0.453179541660343, + "grad_norm": 0.6123180985450745, + "learning_rate": 9.09902794653706e-05, + "loss": 1.6361, + "step": 1493 + }, + { + "epoch": 0.45348307785703446, + "grad_norm": 0.44139203429222107, + "learning_rate": 9.098420413122723e-05, + "loss": 1.8156, + "step": 1494 + }, + { + "epoch": 0.4537866140537259, + "grad_norm": 0.4224632680416107, + "learning_rate": 9.097812879708383e-05, + "loss": 1.8907, + "step": 1495 + }, + { + "epoch": 0.4540901502504174, + "grad_norm": 0.40932169556617737, + "learning_rate": 9.097205346294046e-05, + "loss": 1.9179, + "step": 1496 + }, + { + "epoch": 0.4543936864471088, + "grad_norm": 0.41995543241500854, + "learning_rate": 9.09659781287971e-05, + "loss": 1.7627, + "step": 1497 + }, + { + "epoch": 0.45469722264380025, + "grad_norm": 0.33541586995124817, + "learning_rate": 9.095990279465371e-05, + "loss": 2.0334, + "step": 1498 + }, + { + "epoch": 0.45500075884049174, + "grad_norm": 0.426469624042511, + "learning_rate": 9.095382746051033e-05, + "loss": 2.0636, + "step": 1499 + }, + { + "epoch": 0.45530429503718317, + "grad_norm": 0.4037235379219055, + "learning_rate": 9.094775212636696e-05, + "loss": 1.9431, + "step": 1500 + }, + { + "epoch": 0.45560783123387466, + "grad_norm": 0.35326942801475525, + "learning_rate": 9.094167679222358e-05, + "loss": 1.9306, + "step": 1501 + }, + { + "epoch": 0.4559113674305661, + "grad_norm": 0.3722929358482361, + "learning_rate": 9.09356014580802e-05, + "loss": 1.2554, + "step": 1502 + }, + { + "epoch": 0.45621490362725753, + "grad_norm": 0.5637504458427429, + "learning_rate": 9.092952612393683e-05, + "loss": 2.2883, + "step": 1503 + }, + { + "epoch": 0.456518439823949, + "grad_norm": 0.4601937532424927, + "learning_rate": 9.092345078979344e-05, + "loss": 1.8051, + "step": 1504 + }, + { + "epoch": 0.45682197602064045, + "grad_norm": 0.4153023660182953, + "learning_rate": 9.091737545565006e-05, + "loss": 2.045, + "step": 1505 + }, + { + "epoch": 0.45712551221733194, + "grad_norm": 0.34770330786705017, + "learning_rate": 9.091130012150668e-05, + "loss": 1.8672, + "step": 1506 + }, + { + "epoch": 0.4574290484140234, + "grad_norm": 0.3669261932373047, + "learning_rate": 9.090522478736331e-05, + "loss": 1.7184, + "step": 1507 + }, + { + "epoch": 0.4577325846107148, + "grad_norm": 0.4862425923347473, + "learning_rate": 9.089914945321994e-05, + "loss": 1.7152, + "step": 1508 + }, + { + "epoch": 0.4580361208074063, + "grad_norm": 0.39143872261047363, + "learning_rate": 9.089307411907654e-05, + "loss": 1.6634, + "step": 1509 + }, + { + "epoch": 0.45833965700409773, + "grad_norm": 0.48413509130477905, + "learning_rate": 9.088699878493317e-05, + "loss": 1.8655, + "step": 1510 + }, + { + "epoch": 0.45864319320078917, + "grad_norm": 0.438029944896698, + "learning_rate": 9.08809234507898e-05, + "loss": 1.8924, + "step": 1511 + }, + { + "epoch": 0.45894672939748066, + "grad_norm": 0.39872634410858154, + "learning_rate": 9.087484811664642e-05, + "loss": 2.0113, + "step": 1512 + }, + { + "epoch": 0.4592502655941721, + "grad_norm": 0.6361043453216553, + "learning_rate": 9.086877278250304e-05, + "loss": 2.004, + "step": 1513 + }, + { + "epoch": 0.4595538017908636, + "grad_norm": 0.35867545008659363, + "learning_rate": 9.086269744835967e-05, + "loss": 1.6438, + "step": 1514 + }, + { + "epoch": 0.459857337987555, + "grad_norm": 0.375430166721344, + "learning_rate": 9.085662211421629e-05, + "loss": 1.6383, + "step": 1515 + }, + { + "epoch": 0.46016087418424645, + "grad_norm": 1.4054492712020874, + "learning_rate": 9.08505467800729e-05, + "loss": 1.6688, + "step": 1516 + }, + { + "epoch": 0.46046441038093794, + "grad_norm": 0.35748517513275146, + "learning_rate": 9.084447144592954e-05, + "loss": 1.8517, + "step": 1517 + }, + { + "epoch": 0.4607679465776294, + "grad_norm": 0.3136993944644928, + "learning_rate": 9.083839611178615e-05, + "loss": 2.0279, + "step": 1518 + }, + { + "epoch": 0.46107148277432086, + "grad_norm": 0.39442840218544006, + "learning_rate": 9.083232077764277e-05, + "loss": 2.0558, + "step": 1519 + }, + { + "epoch": 0.4613750189710123, + "grad_norm": 0.3278721272945404, + "learning_rate": 9.082624544349939e-05, + "loss": 1.8702, + "step": 1520 + }, + { + "epoch": 0.46167855516770373, + "grad_norm": 0.6478224992752075, + "learning_rate": 9.082017010935602e-05, + "loss": 1.9689, + "step": 1521 + }, + { + "epoch": 0.4619820913643952, + "grad_norm": 0.39185309410095215, + "learning_rate": 9.081409477521265e-05, + "loss": 1.871, + "step": 1522 + }, + { + "epoch": 0.46228562756108665, + "grad_norm": 0.4506731927394867, + "learning_rate": 9.080801944106925e-05, + "loss": 1.3312, + "step": 1523 + }, + { + "epoch": 0.4625891637577781, + "grad_norm": 0.36045706272125244, + "learning_rate": 9.080194410692588e-05, + "loss": 1.4391, + "step": 1524 + }, + { + "epoch": 0.4628926999544696, + "grad_norm": 0.40836915373802185, + "learning_rate": 9.079586877278252e-05, + "loss": 2.0619, + "step": 1525 + }, + { + "epoch": 0.463196236151161, + "grad_norm": 0.39617446064949036, + "learning_rate": 9.078979343863913e-05, + "loss": 1.3619, + "step": 1526 + }, + { + "epoch": 0.4634997723478525, + "grad_norm": 0.41966769099235535, + "learning_rate": 9.078371810449575e-05, + "loss": 1.9004, + "step": 1527 + }, + { + "epoch": 0.46380330854454394, + "grad_norm": 0.39979109168052673, + "learning_rate": 9.077764277035238e-05, + "loss": 1.8401, + "step": 1528 + }, + { + "epoch": 0.46410684474123537, + "grad_norm": 0.3719238042831421, + "learning_rate": 9.0771567436209e-05, + "loss": 1.8935, + "step": 1529 + }, + { + "epoch": 0.46441038093792686, + "grad_norm": 0.4243968427181244, + "learning_rate": 9.076549210206562e-05, + "loss": 1.9741, + "step": 1530 + }, + { + "epoch": 0.4647139171346183, + "grad_norm": 0.46206673979759216, + "learning_rate": 9.075941676792225e-05, + "loss": 1.929, + "step": 1531 + }, + { + "epoch": 0.4650174533313098, + "grad_norm": 0.49223679304122925, + "learning_rate": 9.075334143377886e-05, + "loss": 1.5711, + "step": 1532 + }, + { + "epoch": 0.4653209895280012, + "grad_norm": 0.40891674160957336, + "learning_rate": 9.074726609963548e-05, + "loss": 1.3376, + "step": 1533 + }, + { + "epoch": 0.46562452572469265, + "grad_norm": 0.4146333336830139, + "learning_rate": 9.07411907654921e-05, + "loss": 1.9774, + "step": 1534 + }, + { + "epoch": 0.46592806192138414, + "grad_norm": 0.39834362268447876, + "learning_rate": 9.073511543134873e-05, + "loss": 1.8253, + "step": 1535 + }, + { + "epoch": 0.4662315981180756, + "grad_norm": 0.4415489137172699, + "learning_rate": 9.072904009720536e-05, + "loss": 2.0604, + "step": 1536 + }, + { + "epoch": 0.466535134314767, + "grad_norm": 0.40179288387298584, + "learning_rate": 9.072296476306196e-05, + "loss": 2.0014, + "step": 1537 + }, + { + "epoch": 0.4668386705114585, + "grad_norm": 0.3849865794181824, + "learning_rate": 9.07168894289186e-05, + "loss": 2.0134, + "step": 1538 + }, + { + "epoch": 0.46714220670814993, + "grad_norm": 0.4075673520565033, + "learning_rate": 9.071081409477523e-05, + "loss": 1.7784, + "step": 1539 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.3913114368915558, + "learning_rate": 9.070473876063183e-05, + "loss": 2.0035, + "step": 1540 + }, + { + "epoch": 0.46774927910153286, + "grad_norm": 1.1502317190170288, + "learning_rate": 9.069866342648846e-05, + "loss": 1.9697, + "step": 1541 + }, + { + "epoch": 0.4680528152982243, + "grad_norm": 0.3618208169937134, + "learning_rate": 9.069258809234509e-05, + "loss": 1.8629, + "step": 1542 + }, + { + "epoch": 0.4683563514949158, + "grad_norm": 0.5658997297286987, + "learning_rate": 9.068651275820171e-05, + "loss": 1.9923, + "step": 1543 + }, + { + "epoch": 0.4686598876916072, + "grad_norm": 0.6084100008010864, + "learning_rate": 9.068043742405833e-05, + "loss": 2.2139, + "step": 1544 + }, + { + "epoch": 0.4689634238882987, + "grad_norm": 0.45577460527420044, + "learning_rate": 9.067436208991496e-05, + "loss": 1.602, + "step": 1545 + }, + { + "epoch": 0.46926696008499014, + "grad_norm": 0.38301292061805725, + "learning_rate": 9.066828675577157e-05, + "loss": 1.7644, + "step": 1546 + }, + { + "epoch": 0.46957049628168157, + "grad_norm": 0.42755424976348877, + "learning_rate": 9.066221142162819e-05, + "loss": 1.7429, + "step": 1547 + }, + { + "epoch": 0.46987403247837306, + "grad_norm": 0.3980792760848999, + "learning_rate": 9.065613608748481e-05, + "loss": 1.8362, + "step": 1548 + }, + { + "epoch": 0.4701775686750645, + "grad_norm": 0.41398894786834717, + "learning_rate": 9.065006075334144e-05, + "loss": 1.7865, + "step": 1549 + }, + { + "epoch": 0.470481104871756, + "grad_norm": 0.416704922914505, + "learning_rate": 9.064398541919807e-05, + "loss": 2.1474, + "step": 1550 + }, + { + "epoch": 0.4707846410684474, + "grad_norm": 0.3613940477371216, + "learning_rate": 9.063791008505467e-05, + "loss": 1.9861, + "step": 1551 + }, + { + "epoch": 0.47108817726513885, + "grad_norm": 0.3752197027206421, + "learning_rate": 9.06318347509113e-05, + "loss": 1.5374, + "step": 1552 + }, + { + "epoch": 0.47139171346183034, + "grad_norm": 0.3436816930770874, + "learning_rate": 9.062575941676794e-05, + "loss": 2.0983, + "step": 1553 + }, + { + "epoch": 0.4716952496585218, + "grad_norm": 0.40420001745224, + "learning_rate": 9.061968408262454e-05, + "loss": 1.8447, + "step": 1554 + }, + { + "epoch": 0.4719987858552132, + "grad_norm": 0.5580700039863586, + "learning_rate": 9.061360874848117e-05, + "loss": 1.4499, + "step": 1555 + }, + { + "epoch": 0.4723023220519047, + "grad_norm": 0.42122432589530945, + "learning_rate": 9.06075334143378e-05, + "loss": 1.9475, + "step": 1556 + }, + { + "epoch": 0.47260585824859613, + "grad_norm": 0.37578698992729187, + "learning_rate": 9.060145808019442e-05, + "loss": 2.2064, + "step": 1557 + }, + { + "epoch": 0.4729093944452876, + "grad_norm": 0.3756474554538727, + "learning_rate": 9.059538274605104e-05, + "loss": 1.6932, + "step": 1558 + }, + { + "epoch": 0.47321293064197906, + "grad_norm": 0.45632341504096985, + "learning_rate": 9.058930741190767e-05, + "loss": 2.0437, + "step": 1559 + }, + { + "epoch": 0.4735164668386705, + "grad_norm": 0.41071614623069763, + "learning_rate": 9.058323207776428e-05, + "loss": 1.9358, + "step": 1560 + }, + { + "epoch": 0.473820003035362, + "grad_norm": 0.5713014006614685, + "learning_rate": 9.05771567436209e-05, + "loss": 1.9766, + "step": 1561 + }, + { + "epoch": 0.4741235392320534, + "grad_norm": 0.3685849606990814, + "learning_rate": 9.057108140947752e-05, + "loss": 1.9811, + "step": 1562 + }, + { + "epoch": 0.4744270754287449, + "grad_norm": 0.4106161594390869, + "learning_rate": 9.056500607533415e-05, + "loss": 1.6636, + "step": 1563 + }, + { + "epoch": 0.47473061162543634, + "grad_norm": 0.4227912724018097, + "learning_rate": 9.055893074119077e-05, + "loss": 2.1302, + "step": 1564 + }, + { + "epoch": 0.4750341478221278, + "grad_norm": 0.6117652058601379, + "learning_rate": 9.055285540704738e-05, + "loss": 1.9783, + "step": 1565 + }, + { + "epoch": 0.47533768401881926, + "grad_norm": 0.34352535009384155, + "learning_rate": 9.054678007290401e-05, + "loss": 1.5251, + "step": 1566 + }, + { + "epoch": 0.4756412202155107, + "grad_norm": 0.4252207577228546, + "learning_rate": 9.054070473876065e-05, + "loss": 2.1315, + "step": 1567 + }, + { + "epoch": 0.47594475641220213, + "grad_norm": 0.4429045617580414, + "learning_rate": 9.053462940461725e-05, + "loss": 2.2023, + "step": 1568 + }, + { + "epoch": 0.4762482926088936, + "grad_norm": 0.36126387119293213, + "learning_rate": 9.052855407047388e-05, + "loss": 2.0909, + "step": 1569 + }, + { + "epoch": 0.47655182880558505, + "grad_norm": 0.40380343794822693, + "learning_rate": 9.052247873633051e-05, + "loss": 1.614, + "step": 1570 + }, + { + "epoch": 0.47685536500227654, + "grad_norm": 0.37091997265815735, + "learning_rate": 9.051640340218713e-05, + "loss": 2.0191, + "step": 1571 + }, + { + "epoch": 0.477158901198968, + "grad_norm": 0.3446311354637146, + "learning_rate": 9.051032806804375e-05, + "loss": 1.9633, + "step": 1572 + }, + { + "epoch": 0.4774624373956594, + "grad_norm": 0.37436115741729736, + "learning_rate": 9.050425273390038e-05, + "loss": 1.7945, + "step": 1573 + }, + { + "epoch": 0.4777659735923509, + "grad_norm": 0.36618462204933167, + "learning_rate": 9.0498177399757e-05, + "loss": 1.838, + "step": 1574 + }, + { + "epoch": 0.47806950978904234, + "grad_norm": 0.4327848255634308, + "learning_rate": 9.049210206561361e-05, + "loss": 1.8218, + "step": 1575 + }, + { + "epoch": 0.4783730459857338, + "grad_norm": 0.33957040309906006, + "learning_rate": 9.048602673147023e-05, + "loss": 1.3948, + "step": 1576 + }, + { + "epoch": 0.47867658218242526, + "grad_norm": 0.34288668632507324, + "learning_rate": 9.047995139732686e-05, + "loss": 1.856, + "step": 1577 + }, + { + "epoch": 0.4789801183791167, + "grad_norm": 0.42410871386528015, + "learning_rate": 9.047387606318348e-05, + "loss": 1.6138, + "step": 1578 + }, + { + "epoch": 0.4792836545758082, + "grad_norm": 0.325130432844162, + "learning_rate": 9.04678007290401e-05, + "loss": 1.5631, + "step": 1579 + }, + { + "epoch": 0.4795871907724996, + "grad_norm": 0.46126997470855713, + "learning_rate": 9.046172539489672e-05, + "loss": 1.8045, + "step": 1580 + }, + { + "epoch": 0.47989072696919105, + "grad_norm": 0.4125445783138275, + "learning_rate": 9.045565006075336e-05, + "loss": 1.9054, + "step": 1581 + }, + { + "epoch": 0.48019426316588254, + "grad_norm": 0.3341776430606842, + "learning_rate": 9.044957472660996e-05, + "loss": 1.9269, + "step": 1582 + }, + { + "epoch": 0.480497799362574, + "grad_norm": 0.37623131275177, + "learning_rate": 9.044349939246659e-05, + "loss": 1.8621, + "step": 1583 + }, + { + "epoch": 0.48080133555926546, + "grad_norm": 0.42698100209236145, + "learning_rate": 9.043742405832322e-05, + "loss": 1.9414, + "step": 1584 + }, + { + "epoch": 0.4811048717559569, + "grad_norm": 0.39322131872177124, + "learning_rate": 9.043134872417984e-05, + "loss": 1.6427, + "step": 1585 + }, + { + "epoch": 0.48140840795264833, + "grad_norm": 0.4348810315132141, + "learning_rate": 9.042527339003646e-05, + "loss": 2.0169, + "step": 1586 + }, + { + "epoch": 0.4817119441493398, + "grad_norm": 0.42536425590515137, + "learning_rate": 9.041919805589307e-05, + "loss": 1.79, + "step": 1587 + }, + { + "epoch": 0.48201548034603126, + "grad_norm": 0.35837772488594055, + "learning_rate": 9.04131227217497e-05, + "loss": 2.0152, + "step": 1588 + }, + { + "epoch": 0.48231901654272274, + "grad_norm": 0.4053284525871277, + "learning_rate": 9.040704738760632e-05, + "loss": 1.9912, + "step": 1589 + }, + { + "epoch": 0.4826225527394142, + "grad_norm": 1.0780633687973022, + "learning_rate": 9.040097205346294e-05, + "loss": 2.3151, + "step": 1590 + }, + { + "epoch": 0.4829260889361056, + "grad_norm": 0.3571546673774719, + "learning_rate": 9.039489671931957e-05, + "loss": 2.007, + "step": 1591 + }, + { + "epoch": 0.4832296251327971, + "grad_norm": 1.1343384981155396, + "learning_rate": 9.038882138517619e-05, + "loss": 2.2369, + "step": 1592 + }, + { + "epoch": 0.48353316132948854, + "grad_norm": 0.43974751234054565, + "learning_rate": 9.03827460510328e-05, + "loss": 2.1774, + "step": 1593 + }, + { + "epoch": 0.48383669752617997, + "grad_norm": 0.5721186995506287, + "learning_rate": 9.037667071688943e-05, + "loss": 1.9005, + "step": 1594 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.43478089570999146, + "learning_rate": 9.037059538274607e-05, + "loss": 1.8163, + "step": 1595 + }, + { + "epoch": 0.4844437699195629, + "grad_norm": 0.4186250865459442, + "learning_rate": 9.036452004860267e-05, + "loss": 1.5557, + "step": 1596 + }, + { + "epoch": 0.4847473061162544, + "grad_norm": 0.363033264875412, + "learning_rate": 9.03584447144593e-05, + "loss": 1.9102, + "step": 1597 + }, + { + "epoch": 0.4850508423129458, + "grad_norm": 0.39680740237236023, + "learning_rate": 9.035236938031593e-05, + "loss": 1.8232, + "step": 1598 + }, + { + "epoch": 0.48535437850963725, + "grad_norm": 0.3754984736442566, + "learning_rate": 9.034629404617255e-05, + "loss": 1.7743, + "step": 1599 + }, + { + "epoch": 0.48565791470632874, + "grad_norm": 0.4426131546497345, + "learning_rate": 9.034021871202917e-05, + "loss": 1.8806, + "step": 1600 + }, + { + "epoch": 0.4859614509030202, + "grad_norm": 0.37828418612480164, + "learning_rate": 9.033414337788578e-05, + "loss": 1.7686, + "step": 1601 + }, + { + "epoch": 0.48626498709971167, + "grad_norm": 0.44848862290382385, + "learning_rate": 9.032806804374241e-05, + "loss": 1.6695, + "step": 1602 + }, + { + "epoch": 0.4865685232964031, + "grad_norm": 0.357838898897171, + "learning_rate": 9.032199270959903e-05, + "loss": 1.8404, + "step": 1603 + }, + { + "epoch": 0.48687205949309453, + "grad_norm": 0.6578190326690674, + "learning_rate": 9.031591737545565e-05, + "loss": 1.3599, + "step": 1604 + }, + { + "epoch": 0.487175595689786, + "grad_norm": 0.4240557849407196, + "learning_rate": 9.030984204131228e-05, + "loss": 1.3183, + "step": 1605 + }, + { + "epoch": 0.48747913188647746, + "grad_norm": 0.4170602262020111, + "learning_rate": 9.03037667071689e-05, + "loss": 2.193, + "step": 1606 + }, + { + "epoch": 0.4877826680831689, + "grad_norm": 0.39807751774787903, + "learning_rate": 9.029769137302551e-05, + "loss": 1.5821, + "step": 1607 + }, + { + "epoch": 0.4880862042798604, + "grad_norm": 0.54439777135849, + "learning_rate": 9.029161603888214e-05, + "loss": 1.6293, + "step": 1608 + }, + { + "epoch": 0.4883897404765518, + "grad_norm": 0.39446118474006653, + "learning_rate": 9.028554070473878e-05, + "loss": 1.6712, + "step": 1609 + }, + { + "epoch": 0.4886932766732433, + "grad_norm": 0.42656177282333374, + "learning_rate": 9.027946537059538e-05, + "loss": 1.7296, + "step": 1610 + }, + { + "epoch": 0.48899681286993474, + "grad_norm": 0.4832558333873749, + "learning_rate": 9.027339003645201e-05, + "loss": 1.8888, + "step": 1611 + }, + { + "epoch": 0.4893003490666262, + "grad_norm": 0.44794905185699463, + "learning_rate": 9.026731470230864e-05, + "loss": 1.6074, + "step": 1612 + }, + { + "epoch": 0.48960388526331766, + "grad_norm": 0.344939649105072, + "learning_rate": 9.026123936816524e-05, + "loss": 1.9943, + "step": 1613 + }, + { + "epoch": 0.4899074214600091, + "grad_norm": 0.42949387431144714, + "learning_rate": 9.025516403402188e-05, + "loss": 1.9922, + "step": 1614 + }, + { + "epoch": 0.4902109576567006, + "grad_norm": 0.39325597882270813, + "learning_rate": 9.024908869987849e-05, + "loss": 1.8447, + "step": 1615 + }, + { + "epoch": 0.490514493853392, + "grad_norm": 0.3923071622848511, + "learning_rate": 9.024301336573512e-05, + "loss": 1.7969, + "step": 1616 + }, + { + "epoch": 0.49081803005008345, + "grad_norm": 0.3386680483818054, + "learning_rate": 9.023693803159174e-05, + "loss": 1.9222, + "step": 1617 + }, + { + "epoch": 0.49112156624677494, + "grad_norm": 0.40206924080848694, + "learning_rate": 9.023086269744836e-05, + "loss": 2.1749, + "step": 1618 + }, + { + "epoch": 0.4914251024434664, + "grad_norm": 0.36428967118263245, + "learning_rate": 9.022478736330499e-05, + "loss": 1.4163, + "step": 1619 + }, + { + "epoch": 0.4917286386401578, + "grad_norm": 0.4516347050666809, + "learning_rate": 9.02187120291616e-05, + "loss": 1.8412, + "step": 1620 + }, + { + "epoch": 0.4920321748368493, + "grad_norm": 0.40233004093170166, + "learning_rate": 9.021263669501822e-05, + "loss": 1.6124, + "step": 1621 + }, + { + "epoch": 0.49233571103354073, + "grad_norm": 0.4065000116825104, + "learning_rate": 9.020656136087485e-05, + "loss": 1.7479, + "step": 1622 + }, + { + "epoch": 0.4926392472302322, + "grad_norm": 0.42242977023124695, + "learning_rate": 9.020048602673149e-05, + "loss": 2.3126, + "step": 1623 + }, + { + "epoch": 0.49294278342692366, + "grad_norm": 0.3774438500404358, + "learning_rate": 9.019441069258809e-05, + "loss": 2.0075, + "step": 1624 + }, + { + "epoch": 0.4932463196236151, + "grad_norm": 0.3382234275341034, + "learning_rate": 9.018833535844472e-05, + "loss": 1.8028, + "step": 1625 + }, + { + "epoch": 0.4935498558203066, + "grad_norm": 0.443689227104187, + "learning_rate": 9.018226002430135e-05, + "loss": 1.9506, + "step": 1626 + }, + { + "epoch": 0.493853392016998, + "grad_norm": 0.32814332842826843, + "learning_rate": 9.017618469015795e-05, + "loss": 1.85, + "step": 1627 + }, + { + "epoch": 0.4941569282136895, + "grad_norm": 0.691228449344635, + "learning_rate": 9.017010935601459e-05, + "loss": 1.3521, + "step": 1628 + }, + { + "epoch": 0.49446046441038094, + "grad_norm": 0.43137383460998535, + "learning_rate": 9.01640340218712e-05, + "loss": 1.9107, + "step": 1629 + }, + { + "epoch": 0.4947640006070724, + "grad_norm": 0.3534761965274811, + "learning_rate": 9.015795868772783e-05, + "loss": 1.3167, + "step": 1630 + }, + { + "epoch": 0.49506753680376386, + "grad_norm": 0.5987849831581116, + "learning_rate": 9.015188335358445e-05, + "loss": 2.0258, + "step": 1631 + }, + { + "epoch": 0.4953710730004553, + "grad_norm": 0.38956066966056824, + "learning_rate": 9.014580801944107e-05, + "loss": 1.5787, + "step": 1632 + }, + { + "epoch": 0.4956746091971468, + "grad_norm": 0.43218016624450684, + "learning_rate": 9.01397326852977e-05, + "loss": 1.4582, + "step": 1633 + }, + { + "epoch": 0.4959781453938382, + "grad_norm": 0.8035671710968018, + "learning_rate": 9.013365735115432e-05, + "loss": 2.2415, + "step": 1634 + }, + { + "epoch": 0.49628168159052966, + "grad_norm": 0.41837078332901, + "learning_rate": 9.012758201701093e-05, + "loss": 1.9213, + "step": 1635 + }, + { + "epoch": 0.49658521778722114, + "grad_norm": 0.48308447003364563, + "learning_rate": 9.012150668286756e-05, + "loss": 1.5812, + "step": 1636 + }, + { + "epoch": 0.4968887539839126, + "grad_norm": 0.4080790877342224, + "learning_rate": 9.011543134872418e-05, + "loss": 1.8055, + "step": 1637 + }, + { + "epoch": 0.497192290180604, + "grad_norm": 0.409053772687912, + "learning_rate": 9.01093560145808e-05, + "loss": 2.0126, + "step": 1638 + }, + { + "epoch": 0.4974958263772955, + "grad_norm": 0.41290226578712463, + "learning_rate": 9.010328068043743e-05, + "loss": 1.8663, + "step": 1639 + }, + { + "epoch": 0.49779936257398694, + "grad_norm": 0.36996471881866455, + "learning_rate": 9.009720534629406e-05, + "loss": 1.9887, + "step": 1640 + }, + { + "epoch": 0.4981028987706784, + "grad_norm": 0.4474611282348633, + "learning_rate": 9.009113001215066e-05, + "loss": 1.6636, + "step": 1641 + }, + { + "epoch": 0.49840643496736986, + "grad_norm": 0.3717537224292755, + "learning_rate": 9.00850546780073e-05, + "loss": 1.7292, + "step": 1642 + }, + { + "epoch": 0.4987099711640613, + "grad_norm": 0.6839573979377747, + "learning_rate": 9.007897934386391e-05, + "loss": 2.1037, + "step": 1643 + }, + { + "epoch": 0.4990135073607528, + "grad_norm": 0.3877841532230377, + "learning_rate": 9.007290400972054e-05, + "loss": 2.1324, + "step": 1644 + }, + { + "epoch": 0.4993170435574442, + "grad_norm": 0.42409414052963257, + "learning_rate": 9.006682867557716e-05, + "loss": 2.0867, + "step": 1645 + }, + { + "epoch": 0.4996205797541357, + "grad_norm": 0.38519206643104553, + "learning_rate": 9.006075334143378e-05, + "loss": 2.0589, + "step": 1646 + }, + { + "epoch": 0.49992411595082714, + "grad_norm": 0.3910469710826874, + "learning_rate": 9.005467800729041e-05, + "loss": 1.7952, + "step": 1647 + }, + { + "epoch": 0.5002276521475186, + "grad_norm": 0.3802652359008789, + "learning_rate": 9.004860267314703e-05, + "loss": 1.8394, + "step": 1648 + }, + { + "epoch": 0.5005311883442101, + "grad_norm": 1.9837124347686768, + "learning_rate": 9.004252733900364e-05, + "loss": 1.9703, + "step": 1649 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.40731772780418396, + "learning_rate": 9.003645200486027e-05, + "loss": 1.665, + "step": 1650 + }, + { + "epoch": 0.5011382607375929, + "grad_norm": 0.4358116686344147, + "learning_rate": 9.003037667071689e-05, + "loss": 1.369, + "step": 1651 + }, + { + "epoch": 0.5014417969342844, + "grad_norm": 0.49716782569885254, + "learning_rate": 9.002430133657351e-05, + "loss": 2.2221, + "step": 1652 + }, + { + "epoch": 0.5017453331309759, + "grad_norm": 0.41779419779777527, + "learning_rate": 9.001822600243014e-05, + "loss": 2.0265, + "step": 1653 + }, + { + "epoch": 0.5020488693276673, + "grad_norm": 0.40375036001205444, + "learning_rate": 9.001215066828677e-05, + "loss": 1.9761, + "step": 1654 + }, + { + "epoch": 0.5023524055243588, + "grad_norm": 0.3802977204322815, + "learning_rate": 9.000607533414337e-05, + "loss": 1.76, + "step": 1655 + }, + { + "epoch": 0.5026559417210502, + "grad_norm": 0.33772045373916626, + "learning_rate": 9e-05, + "loss": 1.4531, + "step": 1656 + }, + { + "epoch": 0.5029594779177416, + "grad_norm": 0.4556722640991211, + "learning_rate": 8.999392466585662e-05, + "loss": 1.5091, + "step": 1657 + }, + { + "epoch": 0.5032630141144332, + "grad_norm": 0.37798872590065, + "learning_rate": 8.998784933171325e-05, + "loss": 1.378, + "step": 1658 + }, + { + "epoch": 0.5035665503111246, + "grad_norm": 0.3921298086643219, + "learning_rate": 8.998177399756987e-05, + "loss": 1.901, + "step": 1659 + }, + { + "epoch": 0.5038700865078161, + "grad_norm": 0.39993181824684143, + "learning_rate": 8.997569866342649e-05, + "loss": 1.9796, + "step": 1660 + }, + { + "epoch": 0.5041736227045075, + "grad_norm": 0.41690680384635925, + "learning_rate": 8.996962332928312e-05, + "loss": 1.6252, + "step": 1661 + }, + { + "epoch": 0.5044771589011989, + "grad_norm": 0.4252752363681793, + "learning_rate": 8.996354799513974e-05, + "loss": 1.9233, + "step": 1662 + }, + { + "epoch": 0.5047806950978905, + "grad_norm": 0.43236085772514343, + "learning_rate": 8.995747266099635e-05, + "loss": 1.5527, + "step": 1663 + }, + { + "epoch": 0.5050842312945819, + "grad_norm": 0.32605788111686707, + "learning_rate": 8.995139732685298e-05, + "loss": 1.8349, + "step": 1664 + }, + { + "epoch": 0.5053877674912733, + "grad_norm": 0.8619269728660583, + "learning_rate": 8.99453219927096e-05, + "loss": 1.3305, + "step": 1665 + }, + { + "epoch": 0.5056913036879648, + "grad_norm": 0.429949551820755, + "learning_rate": 8.993924665856622e-05, + "loss": 1.531, + "step": 1666 + }, + { + "epoch": 0.5059948398846562, + "grad_norm": 0.38018864393234253, + "learning_rate": 8.993317132442285e-05, + "loss": 1.4132, + "step": 1667 + }, + { + "epoch": 0.5062983760813476, + "grad_norm": 0.411668986082077, + "learning_rate": 8.992709599027948e-05, + "loss": 2.0736, + "step": 1668 + }, + { + "epoch": 0.5066019122780392, + "grad_norm": 0.41500651836395264, + "learning_rate": 8.992102065613608e-05, + "loss": 2.0055, + "step": 1669 + }, + { + "epoch": 0.5069054484747306, + "grad_norm": 0.3659593164920807, + "learning_rate": 8.991494532199272e-05, + "loss": 1.8854, + "step": 1670 + }, + { + "epoch": 0.5072089846714221, + "grad_norm": 0.4081539809703827, + "learning_rate": 8.990886998784933e-05, + "loss": 1.9027, + "step": 1671 + }, + { + "epoch": 0.5075125208681135, + "grad_norm": 0.4111250340938568, + "learning_rate": 8.990279465370596e-05, + "loss": 1.7838, + "step": 1672 + }, + { + "epoch": 0.5078160570648049, + "grad_norm": 0.37269532680511475, + "learning_rate": 8.989671931956258e-05, + "loss": 1.8353, + "step": 1673 + }, + { + "epoch": 0.5081195932614965, + "grad_norm": 0.4204343259334564, + "learning_rate": 8.98906439854192e-05, + "loss": 1.5011, + "step": 1674 + }, + { + "epoch": 0.5084231294581879, + "grad_norm": 0.4515773355960846, + "learning_rate": 8.988456865127583e-05, + "loss": 1.5303, + "step": 1675 + }, + { + "epoch": 0.5087266656548793, + "grad_norm": 0.44019004702568054, + "learning_rate": 8.987849331713245e-05, + "loss": 2.0108, + "step": 1676 + }, + { + "epoch": 0.5090302018515708, + "grad_norm": 0.47351813316345215, + "learning_rate": 8.987241798298906e-05, + "loss": 2.0518, + "step": 1677 + }, + { + "epoch": 0.5093337380482622, + "grad_norm": 0.40282347798347473, + "learning_rate": 8.98663426488457e-05, + "loss": 1.99, + "step": 1678 + }, + { + "epoch": 0.5096372742449538, + "grad_norm": 0.49869832396507263, + "learning_rate": 8.986026731470231e-05, + "loss": 1.9592, + "step": 1679 + }, + { + "epoch": 0.5099408104416452, + "grad_norm": 0.36178889870643616, + "learning_rate": 8.985419198055893e-05, + "loss": 1.927, + "step": 1680 + }, + { + "epoch": 0.5102443466383366, + "grad_norm": 0.3670339584350586, + "learning_rate": 8.984811664641556e-05, + "loss": 1.9516, + "step": 1681 + }, + { + "epoch": 0.510547882835028, + "grad_norm": 0.3458341658115387, + "learning_rate": 8.984204131227218e-05, + "loss": 1.8203, + "step": 1682 + }, + { + "epoch": 0.5108514190317195, + "grad_norm": 0.4636301100254059, + "learning_rate": 8.98359659781288e-05, + "loss": 1.7146, + "step": 1683 + }, + { + "epoch": 0.511154955228411, + "grad_norm": 0.45436516404151917, + "learning_rate": 8.982989064398543e-05, + "loss": 1.8514, + "step": 1684 + }, + { + "epoch": 0.5114584914251025, + "grad_norm": 0.46940287947654724, + "learning_rate": 8.982381530984204e-05, + "loss": 2.0162, + "step": 1685 + }, + { + "epoch": 0.5117620276217939, + "grad_norm": 0.4405171573162079, + "learning_rate": 8.981773997569866e-05, + "loss": 2.0003, + "step": 1686 + }, + { + "epoch": 0.5120655638184853, + "grad_norm": 0.4306286871433258, + "learning_rate": 8.981166464155529e-05, + "loss": 1.8338, + "step": 1687 + }, + { + "epoch": 0.5123691000151768, + "grad_norm": 0.43476733565330505, + "learning_rate": 8.980558930741191e-05, + "loss": 1.623, + "step": 1688 + }, + { + "epoch": 0.5126726362118683, + "grad_norm": 0.3655628561973572, + "learning_rate": 8.979951397326854e-05, + "loss": 2.098, + "step": 1689 + }, + { + "epoch": 0.5129761724085597, + "grad_norm": 0.36685287952423096, + "learning_rate": 8.979343863912516e-05, + "loss": 1.9533, + "step": 1690 + }, + { + "epoch": 0.5132797086052512, + "grad_norm": 0.4131629765033722, + "learning_rate": 8.978736330498177e-05, + "loss": 1.8093, + "step": 1691 + }, + { + "epoch": 0.5135832448019426, + "grad_norm": 0.36607033014297485, + "learning_rate": 8.97812879708384e-05, + "loss": 1.4293, + "step": 1692 + }, + { + "epoch": 0.513886780998634, + "grad_norm": 0.4478306174278259, + "learning_rate": 8.977521263669502e-05, + "loss": 1.916, + "step": 1693 + }, + { + "epoch": 0.5141903171953256, + "grad_norm": 0.4570290446281433, + "learning_rate": 8.976913730255164e-05, + "loss": 1.7859, + "step": 1694 + }, + { + "epoch": 0.514493853392017, + "grad_norm": 0.46024757623672485, + "learning_rate": 8.976306196840827e-05, + "loss": 1.6032, + "step": 1695 + }, + { + "epoch": 0.5147973895887085, + "grad_norm": 0.40080446004867554, + "learning_rate": 8.975698663426489e-05, + "loss": 1.7693, + "step": 1696 + }, + { + "epoch": 0.5151009257853999, + "grad_norm": 0.3736198842525482, + "learning_rate": 8.97509113001215e-05, + "loss": 1.8185, + "step": 1697 + }, + { + "epoch": 0.5154044619820913, + "grad_norm": 0.7444111704826355, + "learning_rate": 8.974483596597814e-05, + "loss": 1.9927, + "step": 1698 + }, + { + "epoch": 0.5157079981787828, + "grad_norm": 0.42862579226493835, + "learning_rate": 8.973876063183475e-05, + "loss": 1.9946, + "step": 1699 + }, + { + "epoch": 0.5160115343754743, + "grad_norm": 0.5150566101074219, + "learning_rate": 8.973268529769137e-05, + "loss": 1.6675, + "step": 1700 + }, + { + "epoch": 0.5163150705721657, + "grad_norm": 0.4260749816894531, + "learning_rate": 8.9726609963548e-05, + "loss": 2.0212, + "step": 1701 + }, + { + "epoch": 0.5166186067688572, + "grad_norm": 0.3930248022079468, + "learning_rate": 8.972053462940462e-05, + "loss": 1.8982, + "step": 1702 + }, + { + "epoch": 0.5169221429655486, + "grad_norm": 0.40357765555381775, + "learning_rate": 8.971445929526125e-05, + "loss": 1.8368, + "step": 1703 + }, + { + "epoch": 0.51722567916224, + "grad_norm": 0.3957735300064087, + "learning_rate": 8.970838396111787e-05, + "loss": 1.7569, + "step": 1704 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.3867725431919098, + "learning_rate": 8.970230862697448e-05, + "loss": 2.0536, + "step": 1705 + }, + { + "epoch": 0.517832751555623, + "grad_norm": 0.38773855566978455, + "learning_rate": 8.969623329283111e-05, + "loss": 1.9507, + "step": 1706 + }, + { + "epoch": 0.5181362877523145, + "grad_norm": 0.4161403775215149, + "learning_rate": 8.969015795868773e-05, + "loss": 1.1656, + "step": 1707 + }, + { + "epoch": 0.5184398239490059, + "grad_norm": 0.40050750970840454, + "learning_rate": 8.968408262454435e-05, + "loss": 1.9384, + "step": 1708 + }, + { + "epoch": 0.5187433601456973, + "grad_norm": 0.43072274327278137, + "learning_rate": 8.967800729040098e-05, + "loss": 1.9838, + "step": 1709 + }, + { + "epoch": 0.5190468963423889, + "grad_norm": 0.4291669428348541, + "learning_rate": 8.96719319562576e-05, + "loss": 2.0588, + "step": 1710 + }, + { + "epoch": 0.5193504325390803, + "grad_norm": 0.3524603545665741, + "learning_rate": 8.966585662211422e-05, + "loss": 1.9433, + "step": 1711 + }, + { + "epoch": 0.5196539687357717, + "grad_norm": 0.42883431911468506, + "learning_rate": 8.965978128797085e-05, + "loss": 2.0879, + "step": 1712 + }, + { + "epoch": 0.5199575049324632, + "grad_norm": 0.3711095452308655, + "learning_rate": 8.965370595382746e-05, + "loss": 1.8024, + "step": 1713 + }, + { + "epoch": 0.5202610411291546, + "grad_norm": 0.3979575037956238, + "learning_rate": 8.964763061968408e-05, + "loss": 1.889, + "step": 1714 + }, + { + "epoch": 0.5205645773258462, + "grad_norm": 0.3781624436378479, + "learning_rate": 8.964155528554071e-05, + "loss": 1.4032, + "step": 1715 + }, + { + "epoch": 0.5208681135225376, + "grad_norm": 0.4285725951194763, + "learning_rate": 8.963547995139733e-05, + "loss": 1.5933, + "step": 1716 + }, + { + "epoch": 0.521171649719229, + "grad_norm": 0.40880918502807617, + "learning_rate": 8.962940461725396e-05, + "loss": 1.4162, + "step": 1717 + }, + { + "epoch": 0.5214751859159205, + "grad_norm": 0.4186420440673828, + "learning_rate": 8.962332928311058e-05, + "loss": 1.6866, + "step": 1718 + }, + { + "epoch": 0.5217787221126119, + "grad_norm": 0.3772728443145752, + "learning_rate": 8.96172539489672e-05, + "loss": 1.8005, + "step": 1719 + }, + { + "epoch": 0.5220822583093034, + "grad_norm": 0.4102610945701599, + "learning_rate": 8.961117861482382e-05, + "loss": 2.0691, + "step": 1720 + }, + { + "epoch": 0.5223857945059949, + "grad_norm": 0.463878870010376, + "learning_rate": 8.960510328068044e-05, + "loss": 2.2497, + "step": 1721 + }, + { + "epoch": 0.5226893307026863, + "grad_norm": 0.3314138948917389, + "learning_rate": 8.959902794653706e-05, + "loss": 1.6946, + "step": 1722 + }, + { + "epoch": 0.5229928668993777, + "grad_norm": 0.7187567949295044, + "learning_rate": 8.959295261239369e-05, + "loss": 1.7443, + "step": 1723 + }, + { + "epoch": 0.5232964030960692, + "grad_norm": 0.42266663908958435, + "learning_rate": 8.958687727825031e-05, + "loss": 1.9827, + "step": 1724 + }, + { + "epoch": 0.5235999392927606, + "grad_norm": 0.39689430594444275, + "learning_rate": 8.958080194410693e-05, + "loss": 1.8162, + "step": 1725 + }, + { + "epoch": 0.5239034754894522, + "grad_norm": 0.36018458008766174, + "learning_rate": 8.957472660996356e-05, + "loss": 1.9901, + "step": 1726 + }, + { + "epoch": 0.5242070116861436, + "grad_norm": 0.29599374532699585, + "learning_rate": 8.956865127582017e-05, + "loss": 1.5581, + "step": 1727 + }, + { + "epoch": 0.524510547882835, + "grad_norm": 0.3953525424003601, + "learning_rate": 8.956257594167679e-05, + "loss": 1.8398, + "step": 1728 + }, + { + "epoch": 0.5248140840795265, + "grad_norm": 0.5847448110580444, + "learning_rate": 8.955650060753342e-05, + "loss": 1.6539, + "step": 1729 + }, + { + "epoch": 0.5251176202762179, + "grad_norm": 0.37169334292411804, + "learning_rate": 8.955042527339004e-05, + "loss": 1.0603, + "step": 1730 + }, + { + "epoch": 0.5254211564729094, + "grad_norm": 0.3689024746417999, + "learning_rate": 8.954434993924667e-05, + "loss": 1.4661, + "step": 1731 + }, + { + "epoch": 0.5257246926696009, + "grad_norm": 0.39325040578842163, + "learning_rate": 8.953827460510329e-05, + "loss": 1.9562, + "step": 1732 + }, + { + "epoch": 0.5260282288662923, + "grad_norm": 0.5037636756896973, + "learning_rate": 8.95321992709599e-05, + "loss": 1.5998, + "step": 1733 + }, + { + "epoch": 0.5263317650629837, + "grad_norm": 0.38126620650291443, + "learning_rate": 8.952612393681654e-05, + "loss": 1.8444, + "step": 1734 + }, + { + "epoch": 0.5266353012596752, + "grad_norm": 0.4108048379421234, + "learning_rate": 8.952004860267315e-05, + "loss": 1.5128, + "step": 1735 + }, + { + "epoch": 0.5269388374563667, + "grad_norm": 0.3624730408191681, + "learning_rate": 8.951397326852977e-05, + "loss": 1.9051, + "step": 1736 + }, + { + "epoch": 0.5272423736530581, + "grad_norm": 0.374348908662796, + "learning_rate": 8.95078979343864e-05, + "loss": 1.5501, + "step": 1737 + }, + { + "epoch": 0.5275459098497496, + "grad_norm": 0.504650890827179, + "learning_rate": 8.950182260024302e-05, + "loss": 1.9115, + "step": 1738 + }, + { + "epoch": 0.527849446046441, + "grad_norm": 0.31486794352531433, + "learning_rate": 8.949574726609964e-05, + "loss": 1.7507, + "step": 1739 + }, + { + "epoch": 0.5281529822431325, + "grad_norm": 0.38089415431022644, + "learning_rate": 8.948967193195627e-05, + "loss": 1.9424, + "step": 1740 + }, + { + "epoch": 0.528456518439824, + "grad_norm": 0.5939797163009644, + "learning_rate": 8.948359659781288e-05, + "loss": 2.0123, + "step": 1741 + }, + { + "epoch": 0.5287600546365154, + "grad_norm": 0.4175383746623993, + "learning_rate": 8.94775212636695e-05, + "loss": 1.9405, + "step": 1742 + }, + { + "epoch": 0.5290635908332069, + "grad_norm": 0.3071494996547699, + "learning_rate": 8.947144592952613e-05, + "loss": 0.9484, + "step": 1743 + }, + { + "epoch": 0.5293671270298983, + "grad_norm": 0.4822414219379425, + "learning_rate": 8.946537059538275e-05, + "loss": 1.7093, + "step": 1744 + }, + { + "epoch": 0.5296706632265897, + "grad_norm": 0.8036310076713562, + "learning_rate": 8.945929526123938e-05, + "loss": 2.1441, + "step": 1745 + }, + { + "epoch": 0.5299741994232813, + "grad_norm": 0.42779991030693054, + "learning_rate": 8.9453219927096e-05, + "loss": 1.9736, + "step": 1746 + }, + { + "epoch": 0.5302777356199727, + "grad_norm": 0.37124693393707275, + "learning_rate": 8.944714459295261e-05, + "loss": 2.0578, + "step": 1747 + }, + { + "epoch": 0.5305812718166641, + "grad_norm": 0.4504419267177582, + "learning_rate": 8.944106925880925e-05, + "loss": 2.0148, + "step": 1748 + }, + { + "epoch": 0.5308848080133556, + "grad_norm": 0.370437353849411, + "learning_rate": 8.943499392466586e-05, + "loss": 1.7045, + "step": 1749 + }, + { + "epoch": 0.531188344210047, + "grad_norm": 0.4089522063732147, + "learning_rate": 8.942891859052248e-05, + "loss": 1.7649, + "step": 1750 + }, + { + "epoch": 0.5314918804067384, + "grad_norm": 0.3770054280757904, + "learning_rate": 8.942284325637911e-05, + "loss": 1.8252, + "step": 1751 + }, + { + "epoch": 0.53179541660343, + "grad_norm": 0.45180705189704895, + "learning_rate": 8.941676792223573e-05, + "loss": 0.9027, + "step": 1752 + }, + { + "epoch": 0.5320989528001214, + "grad_norm": 0.415444016456604, + "learning_rate": 8.941069258809235e-05, + "loss": 1.8366, + "step": 1753 + }, + { + "epoch": 0.5324024889968129, + "grad_norm": 0.4421723783016205, + "learning_rate": 8.940461725394898e-05, + "loss": 1.6414, + "step": 1754 + }, + { + "epoch": 0.5327060251935043, + "grad_norm": 0.3791792392730713, + "learning_rate": 8.93985419198056e-05, + "loss": 2.0668, + "step": 1755 + }, + { + "epoch": 0.5330095613901957, + "grad_norm": 0.40155166387557983, + "learning_rate": 8.939246658566221e-05, + "loss": 1.8144, + "step": 1756 + }, + { + "epoch": 0.5333130975868873, + "grad_norm": 0.38897809386253357, + "learning_rate": 8.938639125151884e-05, + "loss": 1.4815, + "step": 1757 + }, + { + "epoch": 0.5336166337835787, + "grad_norm": 0.35486680269241333, + "learning_rate": 8.938031591737546e-05, + "loss": 1.8673, + "step": 1758 + }, + { + "epoch": 0.5339201699802701, + "grad_norm": 0.33397093415260315, + "learning_rate": 8.937424058323208e-05, + "loss": 1.7756, + "step": 1759 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.43346378207206726, + "learning_rate": 8.936816524908871e-05, + "loss": 1.8367, + "step": 1760 + }, + { + "epoch": 0.534527242373653, + "grad_norm": 0.37739312648773193, + "learning_rate": 8.936208991494532e-05, + "loss": 2.1916, + "step": 1761 + }, + { + "epoch": 0.5348307785703446, + "grad_norm": 0.32218697667121887, + "learning_rate": 8.935601458080196e-05, + "loss": 1.9885, + "step": 1762 + }, + { + "epoch": 0.535134314767036, + "grad_norm": 0.37920355796813965, + "learning_rate": 8.934993924665856e-05, + "loss": 1.7163, + "step": 1763 + }, + { + "epoch": 0.5354378509637274, + "grad_norm": 0.3895961344242096, + "learning_rate": 8.934386391251519e-05, + "loss": 1.8757, + "step": 1764 + }, + { + "epoch": 0.5357413871604189, + "grad_norm": 0.4898541271686554, + "learning_rate": 8.933778857837182e-05, + "loss": 1.7097, + "step": 1765 + }, + { + "epoch": 0.5360449233571103, + "grad_norm": 0.3851979672908783, + "learning_rate": 8.933171324422844e-05, + "loss": 1.8913, + "step": 1766 + }, + { + "epoch": 0.5363484595538018, + "grad_norm": 0.3567551076412201, + "learning_rate": 8.932563791008506e-05, + "loss": 1.8789, + "step": 1767 + }, + { + "epoch": 0.5366519957504933, + "grad_norm": 0.4687878489494324, + "learning_rate": 8.931956257594169e-05, + "loss": 1.936, + "step": 1768 + }, + { + "epoch": 0.5369555319471847, + "grad_norm": 0.36735373735427856, + "learning_rate": 8.93134872417983e-05, + "loss": 2.0743, + "step": 1769 + }, + { + "epoch": 0.5372590681438761, + "grad_norm": 0.508160412311554, + "learning_rate": 8.930741190765492e-05, + "loss": 1.9822, + "step": 1770 + }, + { + "epoch": 0.5375626043405676, + "grad_norm": 0.40640148520469666, + "learning_rate": 8.930133657351155e-05, + "loss": 1.5255, + "step": 1771 + }, + { + "epoch": 0.5378661405372591, + "grad_norm": 0.7253953218460083, + "learning_rate": 8.929526123936817e-05, + "loss": 1.8898, + "step": 1772 + }, + { + "epoch": 0.5381696767339506, + "grad_norm": 0.4226602017879486, + "learning_rate": 8.928918590522479e-05, + "loss": 1.7197, + "step": 1773 + }, + { + "epoch": 0.538473212930642, + "grad_norm": 0.42332541942596436, + "learning_rate": 8.928311057108142e-05, + "loss": 1.9117, + "step": 1774 + }, + { + "epoch": 0.5387767491273334, + "grad_norm": 0.8125683665275574, + "learning_rate": 8.927703523693803e-05, + "loss": 1.7325, + "step": 1775 + }, + { + "epoch": 0.5390802853240249, + "grad_norm": 0.44765642285346985, + "learning_rate": 8.927095990279467e-05, + "loss": 2.0353, + "step": 1776 + }, + { + "epoch": 0.5393838215207164, + "grad_norm": 0.45518067479133606, + "learning_rate": 8.926488456865127e-05, + "loss": 1.9536, + "step": 1777 + }, + { + "epoch": 0.5396873577174078, + "grad_norm": 0.3856181800365448, + "learning_rate": 8.92588092345079e-05, + "loss": 1.456, + "step": 1778 + }, + { + "epoch": 0.5399908939140993, + "grad_norm": 0.41640815138816833, + "learning_rate": 8.925273390036453e-05, + "loss": 1.7481, + "step": 1779 + }, + { + "epoch": 0.5402944301107907, + "grad_norm": 0.3643503189086914, + "learning_rate": 8.924665856622115e-05, + "loss": 1.8541, + "step": 1780 + }, + { + "epoch": 0.5405979663074821, + "grad_norm": 0.40610817074775696, + "learning_rate": 8.924058323207777e-05, + "loss": 1.8021, + "step": 1781 + }, + { + "epoch": 0.5409015025041736, + "grad_norm": 1.8827602863311768, + "learning_rate": 8.92345078979344e-05, + "loss": 1.4657, + "step": 1782 + }, + { + "epoch": 0.5412050387008651, + "grad_norm": 0.4862421154975891, + "learning_rate": 8.922843256379101e-05, + "loss": 1.6917, + "step": 1783 + }, + { + "epoch": 0.5415085748975565, + "grad_norm": 0.4079034626483917, + "learning_rate": 8.922235722964763e-05, + "loss": 1.4408, + "step": 1784 + }, + { + "epoch": 0.541812111094248, + "grad_norm": 0.37174421548843384, + "learning_rate": 8.921628189550426e-05, + "loss": 1.8314, + "step": 1785 + }, + { + "epoch": 0.5421156472909394, + "grad_norm": 0.4223754107952118, + "learning_rate": 8.921020656136088e-05, + "loss": 1.7007, + "step": 1786 + }, + { + "epoch": 0.5424191834876309, + "grad_norm": 0.371114581823349, + "learning_rate": 8.92041312272175e-05, + "loss": 1.8876, + "step": 1787 + }, + { + "epoch": 0.5427227196843224, + "grad_norm": 0.4263741672039032, + "learning_rate": 8.919805589307413e-05, + "loss": 2.1338, + "step": 1788 + }, + { + "epoch": 0.5430262558810138, + "grad_norm": 0.4573124349117279, + "learning_rate": 8.919198055893074e-05, + "loss": 1.9776, + "step": 1789 + }, + { + "epoch": 0.5433297920777053, + "grad_norm": 0.44550567865371704, + "learning_rate": 8.918590522478738e-05, + "loss": 1.7205, + "step": 1790 + }, + { + "epoch": 0.5436333282743967, + "grad_norm": 0.42521047592163086, + "learning_rate": 8.917982989064398e-05, + "loss": 1.9548, + "step": 1791 + }, + { + "epoch": 0.5439368644710881, + "grad_norm": 0.39518535137176514, + "learning_rate": 8.917375455650061e-05, + "loss": 2.0192, + "step": 1792 + }, + { + "epoch": 0.5442404006677797, + "grad_norm": 0.42280903458595276, + "learning_rate": 8.916767922235724e-05, + "loss": 1.5445, + "step": 1793 + }, + { + "epoch": 0.5445439368644711, + "grad_norm": 0.40115422010421753, + "learning_rate": 8.916160388821386e-05, + "loss": 1.9529, + "step": 1794 + }, + { + "epoch": 0.5448474730611625, + "grad_norm": 0.3923608958721161, + "learning_rate": 8.915552855407048e-05, + "loss": 2.0184, + "step": 1795 + }, + { + "epoch": 0.545151009257854, + "grad_norm": 0.3982231020927429, + "learning_rate": 8.91494532199271e-05, + "loss": 1.9162, + "step": 1796 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.4375683665275574, + "learning_rate": 8.914337788578372e-05, + "loss": 1.8728, + "step": 1797 + }, + { + "epoch": 0.545758081651237, + "grad_norm": 0.4353227913379669, + "learning_rate": 8.913730255164034e-05, + "loss": 1.4043, + "step": 1798 + }, + { + "epoch": 0.5460616178479284, + "grad_norm": 0.4171392619609833, + "learning_rate": 8.913122721749697e-05, + "loss": 1.9417, + "step": 1799 + }, + { + "epoch": 0.5463651540446198, + "grad_norm": 0.33565661311149597, + "learning_rate": 8.912515188335359e-05, + "loss": 1.888, + "step": 1800 + }, + { + "epoch": 0.5466686902413113, + "grad_norm": 0.3857763707637787, + "learning_rate": 8.91190765492102e-05, + "loss": 1.7882, + "step": 1801 + }, + { + "epoch": 0.5469722264380027, + "grad_norm": 0.3976082503795624, + "learning_rate": 8.911300121506684e-05, + "loss": 1.6312, + "step": 1802 + }, + { + "epoch": 0.5472757626346942, + "grad_norm": 0.43773913383483887, + "learning_rate": 8.910692588092345e-05, + "loss": 1.7299, + "step": 1803 + }, + { + "epoch": 0.5475792988313857, + "grad_norm": 0.39484649896621704, + "learning_rate": 8.910085054678009e-05, + "loss": 1.8826, + "step": 1804 + }, + { + "epoch": 0.5478828350280771, + "grad_norm": 0.42913469672203064, + "learning_rate": 8.909477521263669e-05, + "loss": 1.1713, + "step": 1805 + }, + { + "epoch": 0.5481863712247685, + "grad_norm": 0.43996962904930115, + "learning_rate": 8.908869987849332e-05, + "loss": 2.162, + "step": 1806 + }, + { + "epoch": 0.54848990742146, + "grad_norm": 0.7948350310325623, + "learning_rate": 8.908262454434995e-05, + "loss": 1.8808, + "step": 1807 + }, + { + "epoch": 0.5487934436181514, + "grad_norm": 0.43142643570899963, + "learning_rate": 8.907654921020657e-05, + "loss": 2.0966, + "step": 1808 + }, + { + "epoch": 0.549096979814843, + "grad_norm": 0.36545732617378235, + "learning_rate": 8.907047387606319e-05, + "loss": 1.7421, + "step": 1809 + }, + { + "epoch": 0.5494005160115344, + "grad_norm": 0.3977827727794647, + "learning_rate": 8.906439854191982e-05, + "loss": 1.9356, + "step": 1810 + }, + { + "epoch": 0.5497040522082258, + "grad_norm": 0.4487985670566559, + "learning_rate": 8.905832320777643e-05, + "loss": 1.8294, + "step": 1811 + }, + { + "epoch": 0.5500075884049173, + "grad_norm": 0.4151144027709961, + "learning_rate": 8.905224787363305e-05, + "loss": 1.7, + "step": 1812 + }, + { + "epoch": 0.5503111246016087, + "grad_norm": 0.5114679336547852, + "learning_rate": 8.904617253948968e-05, + "loss": 2.2179, + "step": 1813 + }, + { + "epoch": 0.5506146607983002, + "grad_norm": 0.4134223163127899, + "learning_rate": 8.90400972053463e-05, + "loss": 1.9573, + "step": 1814 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.5172004699707031, + "learning_rate": 8.903402187120292e-05, + "loss": 1.7614, + "step": 1815 + }, + { + "epoch": 0.5512217331916831, + "grad_norm": 0.4552132189273834, + "learning_rate": 8.902794653705955e-05, + "loss": 1.6595, + "step": 1816 + }, + { + "epoch": 0.5515252693883745, + "grad_norm": 0.4171915054321289, + "learning_rate": 8.902187120291616e-05, + "loss": 1.7968, + "step": 1817 + }, + { + "epoch": 0.551828805585066, + "grad_norm": 0.4485832452774048, + "learning_rate": 8.90157958687728e-05, + "loss": 2.0529, + "step": 1818 + }, + { + "epoch": 0.5521323417817575, + "grad_norm": 0.3997848331928253, + "learning_rate": 8.90097205346294e-05, + "loss": 1.7053, + "step": 1819 + }, + { + "epoch": 0.552435877978449, + "grad_norm": 0.47565630078315735, + "learning_rate": 8.900364520048603e-05, + "loss": 1.7842, + "step": 1820 + }, + { + "epoch": 0.5527394141751404, + "grad_norm": 0.42128419876098633, + "learning_rate": 8.899756986634266e-05, + "loss": 1.8866, + "step": 1821 + }, + { + "epoch": 0.5530429503718318, + "grad_norm": 0.4098486602306366, + "learning_rate": 8.899149453219926e-05, + "loss": 1.9268, + "step": 1822 + }, + { + "epoch": 0.5533464865685233, + "grad_norm": 0.3754071295261383, + "learning_rate": 8.89854191980559e-05, + "loss": 1.9191, + "step": 1823 + }, + { + "epoch": 0.5536500227652148, + "grad_norm": 0.4278963804244995, + "learning_rate": 8.897934386391253e-05, + "loss": 2.0914, + "step": 1824 + }, + { + "epoch": 0.5539535589619062, + "grad_norm": 0.41121765971183777, + "learning_rate": 8.897326852976914e-05, + "loss": 1.9135, + "step": 1825 + }, + { + "epoch": 0.5542570951585977, + "grad_norm": 0.7463552355766296, + "learning_rate": 8.896719319562576e-05, + "loss": 1.6663, + "step": 1826 + }, + { + "epoch": 0.5545606313552891, + "grad_norm": 0.3886711299419403, + "learning_rate": 8.896111786148239e-05, + "loss": 1.9131, + "step": 1827 + }, + { + "epoch": 0.5548641675519805, + "grad_norm": 0.3520048260688782, + "learning_rate": 8.895504252733901e-05, + "loss": 1.9166, + "step": 1828 + }, + { + "epoch": 0.5551677037486721, + "grad_norm": 0.3484227955341339, + "learning_rate": 8.894896719319563e-05, + "loss": 1.9053, + "step": 1829 + }, + { + "epoch": 0.5554712399453635, + "grad_norm": 0.7534793615341187, + "learning_rate": 8.894289185905226e-05, + "loss": 1.6738, + "step": 1830 + }, + { + "epoch": 0.555774776142055, + "grad_norm": 0.4037635326385498, + "learning_rate": 8.893681652490887e-05, + "loss": 2.1951, + "step": 1831 + }, + { + "epoch": 0.5560783123387464, + "grad_norm": 0.39184069633483887, + "learning_rate": 8.893074119076549e-05, + "loss": 2.0811, + "step": 1832 + }, + { + "epoch": 0.5563818485354378, + "grad_norm": 0.35053008794784546, + "learning_rate": 8.892466585662211e-05, + "loss": 1.8178, + "step": 1833 + }, + { + "epoch": 0.5566853847321293, + "grad_norm": 0.43768683075904846, + "learning_rate": 8.891859052247874e-05, + "loss": 2.0811, + "step": 1834 + }, + { + "epoch": 0.5569889209288208, + "grad_norm": 0.38592809438705444, + "learning_rate": 8.891251518833537e-05, + "loss": 1.8156, + "step": 1835 + }, + { + "epoch": 0.5572924571255122, + "grad_norm": 0.351408988237381, + "learning_rate": 8.890643985419197e-05, + "loss": 1.5505, + "step": 1836 + }, + { + "epoch": 0.5575959933222037, + "grad_norm": 0.4032740592956543, + "learning_rate": 8.89003645200486e-05, + "loss": 1.622, + "step": 1837 + }, + { + "epoch": 0.5578995295188951, + "grad_norm": 0.3902193307876587, + "learning_rate": 8.889428918590524e-05, + "loss": 2.0796, + "step": 1838 + }, + { + "epoch": 0.5582030657155865, + "grad_norm": 2.2613284587860107, + "learning_rate": 8.888821385176185e-05, + "loss": 1.6464, + "step": 1839 + }, + { + "epoch": 0.5585066019122781, + "grad_norm": 0.3818334937095642, + "learning_rate": 8.888213851761847e-05, + "loss": 1.0967, + "step": 1840 + }, + { + "epoch": 0.5588101381089695, + "grad_norm": 0.534939169883728, + "learning_rate": 8.88760631834751e-05, + "loss": 1.9834, + "step": 1841 + }, + { + "epoch": 0.559113674305661, + "grad_norm": 0.41335856914520264, + "learning_rate": 8.886998784933172e-05, + "loss": 1.9458, + "step": 1842 + }, + { + "epoch": 0.5594172105023524, + "grad_norm": 0.4256092309951782, + "learning_rate": 8.886391251518834e-05, + "loss": 1.9967, + "step": 1843 + }, + { + "epoch": 0.5597207466990438, + "grad_norm": 0.40793219208717346, + "learning_rate": 8.885783718104497e-05, + "loss": 1.6972, + "step": 1844 + }, + { + "epoch": 0.5600242828957354, + "grad_norm": 0.4092423915863037, + "learning_rate": 8.885176184690158e-05, + "loss": 1.9676, + "step": 1845 + }, + { + "epoch": 0.5603278190924268, + "grad_norm": 0.35754647850990295, + "learning_rate": 8.88456865127582e-05, + "loss": 1.1498, + "step": 1846 + }, + { + "epoch": 0.5606313552891182, + "grad_norm": 0.41491416096687317, + "learning_rate": 8.883961117861482e-05, + "loss": 2.2078, + "step": 1847 + }, + { + "epoch": 0.5609348914858097, + "grad_norm": 1.1699934005737305, + "learning_rate": 8.883353584447145e-05, + "loss": 2.1936, + "step": 1848 + }, + { + "epoch": 0.5612384276825011, + "grad_norm": 1.9053874015808105, + "learning_rate": 8.882746051032808e-05, + "loss": 1.7189, + "step": 1849 + }, + { + "epoch": 0.5615419638791926, + "grad_norm": 0.41807985305786133, + "learning_rate": 8.882138517618468e-05, + "loss": 1.7771, + "step": 1850 + }, + { + "epoch": 0.5618455000758841, + "grad_norm": 0.41903504729270935, + "learning_rate": 8.881530984204132e-05, + "loss": 2.1023, + "step": 1851 + }, + { + "epoch": 0.5621490362725755, + "grad_norm": 0.3394705653190613, + "learning_rate": 8.880923450789795e-05, + "loss": 0.9969, + "step": 1852 + }, + { + "epoch": 0.5624525724692669, + "grad_norm": 0.347989022731781, + "learning_rate": 8.880315917375456e-05, + "loss": 1.974, + "step": 1853 + }, + { + "epoch": 0.5627561086659584, + "grad_norm": 0.49732285737991333, + "learning_rate": 8.879708383961118e-05, + "loss": 1.7575, + "step": 1854 + }, + { + "epoch": 0.5630596448626499, + "grad_norm": 0.44572606682777405, + "learning_rate": 8.879100850546781e-05, + "loss": 1.8167, + "step": 1855 + }, + { + "epoch": 0.5633631810593414, + "grad_norm": 0.8100895881652832, + "learning_rate": 8.878493317132443e-05, + "loss": 1.9788, + "step": 1856 + }, + { + "epoch": 0.5636667172560328, + "grad_norm": 0.4205772578716278, + "learning_rate": 8.877885783718105e-05, + "loss": 1.9986, + "step": 1857 + }, + { + "epoch": 0.5639702534527242, + "grad_norm": 0.3976004719734192, + "learning_rate": 8.877278250303766e-05, + "loss": 1.9735, + "step": 1858 + }, + { + "epoch": 0.5642737896494157, + "grad_norm": 0.41813865303993225, + "learning_rate": 8.87667071688943e-05, + "loss": 1.8479, + "step": 1859 + }, + { + "epoch": 0.5645773258461072, + "grad_norm": 0.4901811182498932, + "learning_rate": 8.876063183475091e-05, + "loss": 2.3504, + "step": 1860 + }, + { + "epoch": 0.5648808620427986, + "grad_norm": 0.4103149473667145, + "learning_rate": 8.875455650060753e-05, + "loss": 2.0658, + "step": 1861 + }, + { + "epoch": 0.5651843982394901, + "grad_norm": 0.37885773181915283, + "learning_rate": 8.874848116646416e-05, + "loss": 1.8143, + "step": 1862 + }, + { + "epoch": 0.5654879344361815, + "grad_norm": 0.35186877846717834, + "learning_rate": 8.874240583232079e-05, + "loss": 1.9395, + "step": 1863 + }, + { + "epoch": 0.5657914706328729, + "grad_norm": 0.4435397982597351, + "learning_rate": 8.87363304981774e-05, + "loss": 1.7693, + "step": 1864 + }, + { + "epoch": 0.5660950068295644, + "grad_norm": 1.451499342918396, + "learning_rate": 8.873025516403403e-05, + "loss": 1.9007, + "step": 1865 + }, + { + "epoch": 0.5663985430262559, + "grad_norm": 0.41606009006500244, + "learning_rate": 8.872417982989066e-05, + "loss": 2.0327, + "step": 1866 + }, + { + "epoch": 0.5667020792229474, + "grad_norm": 0.38989219069480896, + "learning_rate": 8.871810449574727e-05, + "loss": 1.6791, + "step": 1867 + }, + { + "epoch": 0.5670056154196388, + "grad_norm": 0.3850671052932739, + "learning_rate": 8.871202916160389e-05, + "loss": 1.7889, + "step": 1868 + }, + { + "epoch": 0.5673091516163302, + "grad_norm": 0.43616947531700134, + "learning_rate": 8.870595382746052e-05, + "loss": 1.7769, + "step": 1869 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.39661890268325806, + "learning_rate": 8.869987849331714e-05, + "loss": 1.9286, + "step": 1870 + }, + { + "epoch": 0.5679162240097132, + "grad_norm": 0.43553540110588074, + "learning_rate": 8.869380315917376e-05, + "loss": 1.5863, + "step": 1871 + }, + { + "epoch": 0.5682197602064046, + "grad_norm": 0.3950207829475403, + "learning_rate": 8.868772782503037e-05, + "loss": 1.948, + "step": 1872 + }, + { + "epoch": 0.5685232964030961, + "grad_norm": 0.5240088701248169, + "learning_rate": 8.8681652490887e-05, + "loss": 1.8758, + "step": 1873 + }, + { + "epoch": 0.5688268325997875, + "grad_norm": 0.3744898736476898, + "learning_rate": 8.867557715674362e-05, + "loss": 2.2962, + "step": 1874 + }, + { + "epoch": 0.5691303687964789, + "grad_norm": 0.3877609074115753, + "learning_rate": 8.866950182260024e-05, + "loss": 1.675, + "step": 1875 + }, + { + "epoch": 0.5694339049931705, + "grad_norm": 0.3350330591201782, + "learning_rate": 8.866342648845687e-05, + "loss": 1.8538, + "step": 1876 + }, + { + "epoch": 0.5697374411898619, + "grad_norm": 0.38145220279693604, + "learning_rate": 8.86573511543135e-05, + "loss": 1.6781, + "step": 1877 + }, + { + "epoch": 0.5700409773865533, + "grad_norm": 0.43861034512519836, + "learning_rate": 8.86512758201701e-05, + "loss": 1.8, + "step": 1878 + }, + { + "epoch": 0.5703445135832448, + "grad_norm": 0.4304041266441345, + "learning_rate": 8.864520048602674e-05, + "loss": 1.6788, + "step": 1879 + }, + { + "epoch": 0.5706480497799362, + "grad_norm": 0.4199315309524536, + "learning_rate": 8.863912515188337e-05, + "loss": 1.8819, + "step": 1880 + }, + { + "epoch": 0.5709515859766278, + "grad_norm": 0.4044843912124634, + "learning_rate": 8.863304981773998e-05, + "loss": 1.8221, + "step": 1881 + }, + { + "epoch": 0.5712551221733192, + "grad_norm": 0.5554643273353577, + "learning_rate": 8.86269744835966e-05, + "loss": 1.6682, + "step": 1882 + }, + { + "epoch": 0.5715586583700106, + "grad_norm": 0.45517250895500183, + "learning_rate": 8.862089914945323e-05, + "loss": 1.8345, + "step": 1883 + }, + { + "epoch": 0.5718621945667021, + "grad_norm": 0.4475466310977936, + "learning_rate": 8.861482381530985e-05, + "loss": 1.9378, + "step": 1884 + }, + { + "epoch": 0.5721657307633935, + "grad_norm": 0.5567486882209778, + "learning_rate": 8.860874848116647e-05, + "loss": 1.6444, + "step": 1885 + }, + { + "epoch": 0.572469266960085, + "grad_norm": 0.3710486590862274, + "learning_rate": 8.860267314702308e-05, + "loss": 1.6935, + "step": 1886 + }, + { + "epoch": 0.5727728031567765, + "grad_norm": 0.4086054861545563, + "learning_rate": 8.859659781287971e-05, + "loss": 1.7854, + "step": 1887 + }, + { + "epoch": 0.5730763393534679, + "grad_norm": 0.46489015221595764, + "learning_rate": 8.859052247873633e-05, + "loss": 1.8197, + "step": 1888 + }, + { + "epoch": 0.5733798755501593, + "grad_norm": 0.7444620132446289, + "learning_rate": 8.858444714459295e-05, + "loss": 1.53, + "step": 1889 + }, + { + "epoch": 0.5736834117468508, + "grad_norm": 0.4494125545024872, + "learning_rate": 8.857837181044958e-05, + "loss": 1.6418, + "step": 1890 + }, + { + "epoch": 0.5739869479435422, + "grad_norm": 0.6012828946113586, + "learning_rate": 8.857229647630621e-05, + "loss": 2.3039, + "step": 1891 + }, + { + "epoch": 0.5742904841402338, + "grad_norm": 0.44922634959220886, + "learning_rate": 8.856622114216281e-05, + "loss": 1.8881, + "step": 1892 + }, + { + "epoch": 0.5745940203369252, + "grad_norm": 0.34000277519226074, + "learning_rate": 8.856014580801945e-05, + "loss": 1.5964, + "step": 1893 + }, + { + "epoch": 0.5748975565336166, + "grad_norm": 0.4107670485973358, + "learning_rate": 8.855407047387608e-05, + "loss": 1.9057, + "step": 1894 + }, + { + "epoch": 0.5752010927303081, + "grad_norm": 0.3938602805137634, + "learning_rate": 8.854799513973268e-05, + "loss": 2.0193, + "step": 1895 + }, + { + "epoch": 0.5755046289269995, + "grad_norm": 0.3723643720149994, + "learning_rate": 8.854191980558931e-05, + "loss": 2.0371, + "step": 1896 + }, + { + "epoch": 0.575808165123691, + "grad_norm": 0.7747316956520081, + "learning_rate": 8.853584447144594e-05, + "loss": 1.5783, + "step": 1897 + }, + { + "epoch": 0.5761117013203825, + "grad_norm": 0.40745773911476135, + "learning_rate": 8.852976913730256e-05, + "loss": 1.6217, + "step": 1898 + }, + { + "epoch": 0.5764152375170739, + "grad_norm": 0.363471120595932, + "learning_rate": 8.852369380315918e-05, + "loss": 1.948, + "step": 1899 + }, + { + "epoch": 0.5767187737137653, + "grad_norm": 0.3844568133354187, + "learning_rate": 8.85176184690158e-05, + "loss": 2.0447, + "step": 1900 + }, + { + "epoch": 0.5770223099104568, + "grad_norm": 0.42804035544395447, + "learning_rate": 8.851154313487242e-05, + "loss": 1.9196, + "step": 1901 + }, + { + "epoch": 0.5773258461071483, + "grad_norm": 0.36453336477279663, + "learning_rate": 8.850546780072904e-05, + "loss": 2.2236, + "step": 1902 + }, + { + "epoch": 0.5776293823038398, + "grad_norm": 0.41334068775177, + "learning_rate": 8.849939246658566e-05, + "loss": 1.8657, + "step": 1903 + }, + { + "epoch": 0.5779329185005312, + "grad_norm": 0.3925778567790985, + "learning_rate": 8.849331713244229e-05, + "loss": 1.9695, + "step": 1904 + }, + { + "epoch": 0.5782364546972226, + "grad_norm": 0.39274585247039795, + "learning_rate": 8.848724179829892e-05, + "loss": 1.664, + "step": 1905 + }, + { + "epoch": 0.5785399908939141, + "grad_norm": 0.37139561772346497, + "learning_rate": 8.848116646415552e-05, + "loss": 1.0956, + "step": 1906 + }, + { + "epoch": 0.5788435270906056, + "grad_norm": 0.4112982451915741, + "learning_rate": 8.847509113001216e-05, + "loss": 2.0189, + "step": 1907 + }, + { + "epoch": 0.579147063287297, + "grad_norm": 0.34007617831230164, + "learning_rate": 8.846901579586879e-05, + "loss": 1.537, + "step": 1908 + }, + { + "epoch": 0.5794505994839885, + "grad_norm": 0.43591251969337463, + "learning_rate": 8.846294046172539e-05, + "loss": 1.8668, + "step": 1909 + }, + { + "epoch": 0.5797541356806799, + "grad_norm": 0.4715147316455841, + "learning_rate": 8.845686512758202e-05, + "loss": 1.9474, + "step": 1910 + }, + { + "epoch": 0.5800576718773713, + "grad_norm": 0.5986727476119995, + "learning_rate": 8.845078979343865e-05, + "loss": 1.9555, + "step": 1911 + }, + { + "epoch": 0.5803612080740629, + "grad_norm": 0.43499329686164856, + "learning_rate": 8.844471445929527e-05, + "loss": 1.8719, + "step": 1912 + }, + { + "epoch": 0.5806647442707543, + "grad_norm": 0.4152344763278961, + "learning_rate": 8.843863912515189e-05, + "loss": 2.0322, + "step": 1913 + }, + { + "epoch": 0.5809682804674458, + "grad_norm": 0.4037158787250519, + "learning_rate": 8.84325637910085e-05, + "loss": 1.9687, + "step": 1914 + }, + { + "epoch": 0.5812718166641372, + "grad_norm": 0.4261537492275238, + "learning_rate": 8.842648845686513e-05, + "loss": 1.9674, + "step": 1915 + }, + { + "epoch": 0.5815753528608286, + "grad_norm": 0.3880082070827484, + "learning_rate": 8.842041312272175e-05, + "loss": 1.7925, + "step": 1916 + }, + { + "epoch": 0.58187888905752, + "grad_norm": 0.7090932130813599, + "learning_rate": 8.841433778857837e-05, + "loss": 1.8392, + "step": 1917 + }, + { + "epoch": 0.5821824252542116, + "grad_norm": 0.4407334625720978, + "learning_rate": 8.8408262454435e-05, + "loss": 1.9389, + "step": 1918 + }, + { + "epoch": 0.582485961450903, + "grad_norm": 0.40139150619506836, + "learning_rate": 8.840218712029162e-05, + "loss": 2.0225, + "step": 1919 + }, + { + "epoch": 0.5827894976475945, + "grad_norm": 0.7051631212234497, + "learning_rate": 8.839611178614823e-05, + "loss": 1.9287, + "step": 1920 + }, + { + "epoch": 0.5830930338442859, + "grad_norm": 0.4037090241909027, + "learning_rate": 8.839003645200487e-05, + "loss": 1.7706, + "step": 1921 + }, + { + "epoch": 0.5833965700409773, + "grad_norm": 0.4044518768787384, + "learning_rate": 8.83839611178615e-05, + "loss": 1.9106, + "step": 1922 + }, + { + "epoch": 0.5837001062376689, + "grad_norm": 0.5114139914512634, + "learning_rate": 8.83778857837181e-05, + "loss": 2.0017, + "step": 1923 + }, + { + "epoch": 0.5840036424343603, + "grad_norm": 0.39643585681915283, + "learning_rate": 8.837181044957473e-05, + "loss": 2.0096, + "step": 1924 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.4566240608692169, + "learning_rate": 8.836573511543136e-05, + "loss": 2.0962, + "step": 1925 + }, + { + "epoch": 0.5846107148277432, + "grad_norm": 0.37759748101234436, + "learning_rate": 8.835965978128798e-05, + "loss": 1.6786, + "step": 1926 + }, + { + "epoch": 0.5849142510244346, + "grad_norm": 0.37798550724983215, + "learning_rate": 8.83535844471446e-05, + "loss": 2.1075, + "step": 1927 + }, + { + "epoch": 0.5852177872211262, + "grad_norm": 0.40494439005851746, + "learning_rate": 8.834750911300121e-05, + "loss": 1.7167, + "step": 1928 + }, + { + "epoch": 0.5855213234178176, + "grad_norm": 0.3333325684070587, + "learning_rate": 8.834143377885784e-05, + "loss": 1.9133, + "step": 1929 + }, + { + "epoch": 0.585824859614509, + "grad_norm": 0.3827350437641144, + "learning_rate": 8.833535844471446e-05, + "loss": 1.8537, + "step": 1930 + }, + { + "epoch": 0.5861283958112005, + "grad_norm": 0.4088849127292633, + "learning_rate": 8.832928311057108e-05, + "loss": 1.9625, + "step": 1931 + }, + { + "epoch": 0.5864319320078919, + "grad_norm": 0.3575502932071686, + "learning_rate": 8.832320777642771e-05, + "loss": 1.851, + "step": 1932 + }, + { + "epoch": 0.5867354682045834, + "grad_norm": 0.38579368591308594, + "learning_rate": 8.831713244228433e-05, + "loss": 1.8121, + "step": 1933 + }, + { + "epoch": 0.5870390044012749, + "grad_norm": 0.37787890434265137, + "learning_rate": 8.831105710814094e-05, + "loss": 1.9509, + "step": 1934 + }, + { + "epoch": 0.5873425405979663, + "grad_norm": 0.4074660837650299, + "learning_rate": 8.830498177399758e-05, + "loss": 1.9987, + "step": 1935 + }, + { + "epoch": 0.5876460767946577, + "grad_norm": 0.7902248501777649, + "learning_rate": 8.82989064398542e-05, + "loss": 1.6704, + "step": 1936 + }, + { + "epoch": 0.5879496129913492, + "grad_norm": 0.3240687847137451, + "learning_rate": 8.829283110571081e-05, + "loss": 1.1956, + "step": 1937 + }, + { + "epoch": 0.5882531491880407, + "grad_norm": 0.410543829202652, + "learning_rate": 8.828675577156744e-05, + "loss": 1.9533, + "step": 1938 + }, + { + "epoch": 0.5885566853847322, + "grad_norm": 0.4559386670589447, + "learning_rate": 8.828068043742406e-05, + "loss": 1.566, + "step": 1939 + }, + { + "epoch": 0.5888602215814236, + "grad_norm": 0.44418251514434814, + "learning_rate": 8.827460510328069e-05, + "loss": 1.814, + "step": 1940 + }, + { + "epoch": 0.589163757778115, + "grad_norm": 0.42374011874198914, + "learning_rate": 8.82685297691373e-05, + "loss": 1.8946, + "step": 1941 + }, + { + "epoch": 0.5894672939748065, + "grad_norm": 0.44734686613082886, + "learning_rate": 8.826245443499392e-05, + "loss": 1.9893, + "step": 1942 + }, + { + "epoch": 0.589770830171498, + "grad_norm": 0.42960959672927856, + "learning_rate": 8.825637910085055e-05, + "loss": 1.5659, + "step": 1943 + }, + { + "epoch": 0.5900743663681894, + "grad_norm": 0.44513779878616333, + "learning_rate": 8.825030376670717e-05, + "loss": 1.969, + "step": 1944 + }, + { + "epoch": 0.5903779025648809, + "grad_norm": 0.39732202887535095, + "learning_rate": 8.824422843256379e-05, + "loss": 1.9469, + "step": 1945 + }, + { + "epoch": 0.5906814387615723, + "grad_norm": 0.490384042263031, + "learning_rate": 8.823815309842042e-05, + "loss": 1.7434, + "step": 1946 + }, + { + "epoch": 0.5909849749582637, + "grad_norm": 0.5644544959068298, + "learning_rate": 8.823207776427704e-05, + "loss": 2.1488, + "step": 1947 + }, + { + "epoch": 0.5912885111549552, + "grad_norm": 0.43499046564102173, + "learning_rate": 8.822600243013366e-05, + "loss": 1.6023, + "step": 1948 + }, + { + "epoch": 0.5915920473516467, + "grad_norm": 0.3970509469509125, + "learning_rate": 8.821992709599029e-05, + "loss": 1.5955, + "step": 1949 + }, + { + "epoch": 0.5918955835483382, + "grad_norm": 0.39471563696861267, + "learning_rate": 8.821385176184692e-05, + "loss": 1.9294, + "step": 1950 + }, + { + "epoch": 0.5921991197450296, + "grad_norm": 0.42955949902534485, + "learning_rate": 8.820777642770352e-05, + "loss": 1.9628, + "step": 1951 + }, + { + "epoch": 0.592502655941721, + "grad_norm": 0.3734053373336792, + "learning_rate": 8.820170109356015e-05, + "loss": 1.9927, + "step": 1952 + }, + { + "epoch": 0.5928061921384125, + "grad_norm": 0.40868285298347473, + "learning_rate": 8.819562575941677e-05, + "loss": 2.0704, + "step": 1953 + }, + { + "epoch": 0.593109728335104, + "grad_norm": 0.4374091625213623, + "learning_rate": 8.81895504252734e-05, + "loss": 1.7447, + "step": 1954 + }, + { + "epoch": 0.5934132645317954, + "grad_norm": 0.408299058675766, + "learning_rate": 8.818347509113002e-05, + "loss": 2.076, + "step": 1955 + }, + { + "epoch": 0.5937168007284869, + "grad_norm": 0.4676043391227722, + "learning_rate": 8.817739975698663e-05, + "loss": 1.9319, + "step": 1956 + }, + { + "epoch": 0.5940203369251783, + "grad_norm": 1.3173327445983887, + "learning_rate": 8.817132442284326e-05, + "loss": 1.6364, + "step": 1957 + }, + { + "epoch": 0.5943238731218697, + "grad_norm": 0.39462506771087646, + "learning_rate": 8.816524908869988e-05, + "loss": 2.1088, + "step": 1958 + }, + { + "epoch": 0.5946274093185613, + "grad_norm": 0.37660276889801025, + "learning_rate": 8.81591737545565e-05, + "loss": 1.641, + "step": 1959 + }, + { + "epoch": 0.5949309455152527, + "grad_norm": 0.3797924518585205, + "learning_rate": 8.815309842041313e-05, + "loss": 1.4328, + "step": 1960 + }, + { + "epoch": 0.5952344817119442, + "grad_norm": 0.33881229162216187, + "learning_rate": 8.814702308626975e-05, + "loss": 1.6713, + "step": 1961 + }, + { + "epoch": 0.5955380179086356, + "grad_norm": 0.43969300389289856, + "learning_rate": 8.814094775212637e-05, + "loss": 1.8251, + "step": 1962 + }, + { + "epoch": 0.595841554105327, + "grad_norm": 0.39608824253082275, + "learning_rate": 8.8134872417983e-05, + "loss": 2.0762, + "step": 1963 + }, + { + "epoch": 0.5961450903020186, + "grad_norm": 0.3688305914402008, + "learning_rate": 8.812879708383963e-05, + "loss": 1.886, + "step": 1964 + }, + { + "epoch": 0.59644862649871, + "grad_norm": 0.3397257328033447, + "learning_rate": 8.812272174969623e-05, + "loss": 1.8668, + "step": 1965 + }, + { + "epoch": 0.5967521626954014, + "grad_norm": 0.39257940649986267, + "learning_rate": 8.811664641555286e-05, + "loss": 1.879, + "step": 1966 + }, + { + "epoch": 0.5970556988920929, + "grad_norm": 0.41007375717163086, + "learning_rate": 8.811057108140948e-05, + "loss": 1.7411, + "step": 1967 + }, + { + "epoch": 0.5973592350887843, + "grad_norm": 0.3694823682308197, + "learning_rate": 8.81044957472661e-05, + "loss": 1.7675, + "step": 1968 + }, + { + "epoch": 0.5976627712854758, + "grad_norm": 0.9148819446563721, + "learning_rate": 8.809842041312273e-05, + "loss": 2.0984, + "step": 1969 + }, + { + "epoch": 0.5979663074821673, + "grad_norm": 0.379384309053421, + "learning_rate": 8.809234507897934e-05, + "loss": 2.1933, + "step": 1970 + }, + { + "epoch": 0.5982698436788587, + "grad_norm": 0.5637233257293701, + "learning_rate": 8.808626974483598e-05, + "loss": 1.2066, + "step": 1971 + }, + { + "epoch": 0.5985733798755501, + "grad_norm": 0.42961108684539795, + "learning_rate": 8.808019441069259e-05, + "loss": 1.6595, + "step": 1972 + }, + { + "epoch": 0.5988769160722416, + "grad_norm": 0.41248828172683716, + "learning_rate": 8.807411907654921e-05, + "loss": 2.0266, + "step": 1973 + }, + { + "epoch": 0.599180452268933, + "grad_norm": 0.41730985045433044, + "learning_rate": 8.806804374240584e-05, + "loss": 2.0968, + "step": 1974 + }, + { + "epoch": 0.5994839884656246, + "grad_norm": 0.4452510178089142, + "learning_rate": 8.806196840826246e-05, + "loss": 1.6038, + "step": 1975 + }, + { + "epoch": 0.599787524662316, + "grad_norm": 0.457256942987442, + "learning_rate": 8.805589307411908e-05, + "loss": 2.0862, + "step": 1976 + }, + { + "epoch": 0.6000910608590074, + "grad_norm": 0.38506418466567993, + "learning_rate": 8.80498177399757e-05, + "loss": 1.6764, + "step": 1977 + }, + { + "epoch": 0.6003945970556989, + "grad_norm": 0.4200589060783386, + "learning_rate": 8.804374240583234e-05, + "loss": 2.0962, + "step": 1978 + }, + { + "epoch": 0.6006981332523903, + "grad_norm": 0.41140785813331604, + "learning_rate": 8.803766707168894e-05, + "loss": 1.7345, + "step": 1979 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3584011495113373, + "learning_rate": 8.803159173754557e-05, + "loss": 2.1839, + "step": 1980 + }, + { + "epoch": 0.6013052056457733, + "grad_norm": 0.40637287497520447, + "learning_rate": 8.802551640340219e-05, + "loss": 2.0997, + "step": 1981 + }, + { + "epoch": 0.6016087418424647, + "grad_norm": 0.42887794971466064, + "learning_rate": 8.80194410692588e-05, + "loss": 1.967, + "step": 1982 + }, + { + "epoch": 0.6019122780391561, + "grad_norm": 0.42879635095596313, + "learning_rate": 8.801336573511544e-05, + "loss": 2.0618, + "step": 1983 + }, + { + "epoch": 0.6022158142358476, + "grad_norm": 0.5477713346481323, + "learning_rate": 8.800729040097205e-05, + "loss": 1.3743, + "step": 1984 + }, + { + "epoch": 0.6025193504325391, + "grad_norm": 0.3772994875907898, + "learning_rate": 8.800121506682869e-05, + "loss": 1.5254, + "step": 1985 + }, + { + "epoch": 0.6028228866292306, + "grad_norm": 0.4140057861804962, + "learning_rate": 8.79951397326853e-05, + "loss": 2.164, + "step": 1986 + }, + { + "epoch": 0.603126422825922, + "grad_norm": 0.44529426097869873, + "learning_rate": 8.798906439854192e-05, + "loss": 1.8042, + "step": 1987 + }, + { + "epoch": 0.6034299590226134, + "grad_norm": 0.39523541927337646, + "learning_rate": 8.798298906439855e-05, + "loss": 1.8134, + "step": 1988 + }, + { + "epoch": 0.6037334952193049, + "grad_norm": 0.38513484597206116, + "learning_rate": 8.797691373025517e-05, + "loss": 1.8259, + "step": 1989 + }, + { + "epoch": 0.6040370314159964, + "grad_norm": 0.4686470329761505, + "learning_rate": 8.797083839611179e-05, + "loss": 2.2113, + "step": 1990 + }, + { + "epoch": 0.6043405676126878, + "grad_norm": 0.4119713008403778, + "learning_rate": 8.796476306196842e-05, + "loss": 1.9614, + "step": 1991 + }, + { + "epoch": 0.6046441038093793, + "grad_norm": 0.40786707401275635, + "learning_rate": 8.795868772782503e-05, + "loss": 1.9369, + "step": 1992 + }, + { + "epoch": 0.6049476400060707, + "grad_norm": 0.3869648873806, + "learning_rate": 8.795261239368165e-05, + "loss": 2.1204, + "step": 1993 + }, + { + "epoch": 0.6052511762027621, + "grad_norm": 0.3826451301574707, + "learning_rate": 8.794653705953828e-05, + "loss": 2.0715, + "step": 1994 + }, + { + "epoch": 0.6055547123994537, + "grad_norm": 0.38412514328956604, + "learning_rate": 8.79404617253949e-05, + "loss": 1.9482, + "step": 1995 + }, + { + "epoch": 0.6058582485961451, + "grad_norm": 0.4388350248336792, + "learning_rate": 8.793438639125152e-05, + "loss": 1.9746, + "step": 1996 + }, + { + "epoch": 0.6061617847928366, + "grad_norm": 0.3750387132167816, + "learning_rate": 8.792831105710815e-05, + "loss": 1.7831, + "step": 1997 + }, + { + "epoch": 0.606465320989528, + "grad_norm": 0.42686113715171814, + "learning_rate": 8.792223572296476e-05, + "loss": 1.6419, + "step": 1998 + }, + { + "epoch": 0.6067688571862194, + "grad_norm": 0.39653515815734863, + "learning_rate": 8.79161603888214e-05, + "loss": 1.7617, + "step": 1999 + }, + { + "epoch": 0.6070723933829109, + "grad_norm": 0.4662545621395111, + "learning_rate": 8.791008505467801e-05, + "loss": 1.878, + "step": 2000 + }, + { + "epoch": 0.6073759295796024, + "grad_norm": 0.4733245074748993, + "learning_rate": 8.790400972053463e-05, + "loss": 1.874, + "step": 2001 + }, + { + "epoch": 0.6076794657762938, + "grad_norm": 0.4228340983390808, + "learning_rate": 8.789793438639126e-05, + "loss": 1.9415, + "step": 2002 + }, + { + "epoch": 0.6079830019729853, + "grad_norm": 0.4229651391506195, + "learning_rate": 8.789185905224788e-05, + "loss": 1.5762, + "step": 2003 + }, + { + "epoch": 0.6082865381696767, + "grad_norm": 0.4287284016609192, + "learning_rate": 8.78857837181045e-05, + "loss": 2.1414, + "step": 2004 + }, + { + "epoch": 0.6085900743663681, + "grad_norm": 0.555934488773346, + "learning_rate": 8.787970838396113e-05, + "loss": 1.9591, + "step": 2005 + }, + { + "epoch": 0.6088936105630597, + "grad_norm": 0.42303675413131714, + "learning_rate": 8.787363304981774e-05, + "loss": 1.5513, + "step": 2006 + }, + { + "epoch": 0.6091971467597511, + "grad_norm": 0.37572699785232544, + "learning_rate": 8.786755771567436e-05, + "loss": 1.9508, + "step": 2007 + }, + { + "epoch": 0.6095006829564426, + "grad_norm": 0.3933078944683075, + "learning_rate": 8.786148238153099e-05, + "loss": 1.9507, + "step": 2008 + }, + { + "epoch": 0.609804219153134, + "grad_norm": 0.46419456601142883, + "learning_rate": 8.785540704738761e-05, + "loss": 2.0019, + "step": 2009 + }, + { + "epoch": 0.6101077553498254, + "grad_norm": 0.38383206725120544, + "learning_rate": 8.784933171324423e-05, + "loss": 1.9533, + "step": 2010 + }, + { + "epoch": 0.610411291546517, + "grad_norm": 0.37486881017684937, + "learning_rate": 8.784325637910086e-05, + "loss": 1.9508, + "step": 2011 + }, + { + "epoch": 0.6107148277432084, + "grad_norm": 0.34909558296203613, + "learning_rate": 8.783718104495747e-05, + "loss": 1.8833, + "step": 2012 + }, + { + "epoch": 0.6110183639398998, + "grad_norm": 0.6226269006729126, + "learning_rate": 8.78311057108141e-05, + "loss": 2.1135, + "step": 2013 + }, + { + "epoch": 0.6113219001365913, + "grad_norm": 0.45638999342918396, + "learning_rate": 8.782503037667072e-05, + "loss": 1.8989, + "step": 2014 + }, + { + "epoch": 0.6116254363332827, + "grad_norm": 0.41857293248176575, + "learning_rate": 8.781895504252734e-05, + "loss": 1.7609, + "step": 2015 + }, + { + "epoch": 0.6119289725299742, + "grad_norm": 0.4325519800186157, + "learning_rate": 8.781287970838397e-05, + "loss": 1.6411, + "step": 2016 + }, + { + "epoch": 0.6122325087266657, + "grad_norm": 0.3558877110481262, + "learning_rate": 8.780680437424059e-05, + "loss": 1.7634, + "step": 2017 + }, + { + "epoch": 0.6125360449233571, + "grad_norm": 0.42849549651145935, + "learning_rate": 8.78007290400972e-05, + "loss": 1.8625, + "step": 2018 + }, + { + "epoch": 0.6128395811200485, + "grad_norm": 0.7057125568389893, + "learning_rate": 8.779465370595384e-05, + "loss": 2.0542, + "step": 2019 + }, + { + "epoch": 0.61314311731674, + "grad_norm": 0.3607623279094696, + "learning_rate": 8.778857837181045e-05, + "loss": 2.0613, + "step": 2020 + }, + { + "epoch": 0.6134466535134315, + "grad_norm": 0.35904109477996826, + "learning_rate": 8.778250303766707e-05, + "loss": 2.1633, + "step": 2021 + }, + { + "epoch": 0.613750189710123, + "grad_norm": 0.38341954350471497, + "learning_rate": 8.77764277035237e-05, + "loss": 1.925, + "step": 2022 + }, + { + "epoch": 0.6140537259068144, + "grad_norm": 0.8183413743972778, + "learning_rate": 8.777035236938032e-05, + "loss": 1.5858, + "step": 2023 + }, + { + "epoch": 0.6143572621035058, + "grad_norm": 0.4051649272441864, + "learning_rate": 8.776427703523694e-05, + "loss": 1.9788, + "step": 2024 + }, + { + "epoch": 0.6146607983001973, + "grad_norm": 0.40388303995132446, + "learning_rate": 8.775820170109357e-05, + "loss": 1.9113, + "step": 2025 + }, + { + "epoch": 0.6149643344968888, + "grad_norm": 0.38880276679992676, + "learning_rate": 8.775212636695018e-05, + "loss": 1.7535, + "step": 2026 + }, + { + "epoch": 0.6152678706935802, + "grad_norm": 0.41596999764442444, + "learning_rate": 8.774605103280682e-05, + "loss": 1.9532, + "step": 2027 + }, + { + "epoch": 0.6155714068902717, + "grad_norm": 0.3971737325191498, + "learning_rate": 8.773997569866343e-05, + "loss": 1.6123, + "step": 2028 + }, + { + "epoch": 0.6158749430869631, + "grad_norm": 0.610409140586853, + "learning_rate": 8.773390036452005e-05, + "loss": 1.969, + "step": 2029 + }, + { + "epoch": 0.6161784792836545, + "grad_norm": 0.4366918206214905, + "learning_rate": 8.772782503037668e-05, + "loss": 1.7944, + "step": 2030 + }, + { + "epoch": 0.616482015480346, + "grad_norm": 0.3931274712085724, + "learning_rate": 8.77217496962333e-05, + "loss": 2.0559, + "step": 2031 + }, + { + "epoch": 0.6167855516770375, + "grad_norm": 0.556197464466095, + "learning_rate": 8.771567436208992e-05, + "loss": 1.6191, + "step": 2032 + }, + { + "epoch": 0.617089087873729, + "grad_norm": 0.4099692404270172, + "learning_rate": 8.770959902794655e-05, + "loss": 1.6357, + "step": 2033 + }, + { + "epoch": 0.6173926240704204, + "grad_norm": 0.6582362055778503, + "learning_rate": 8.770352369380316e-05, + "loss": 2.1112, + "step": 2034 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.43522998690605164, + "learning_rate": 8.769744835965978e-05, + "loss": 2.078, + "step": 2035 + }, + { + "epoch": 0.6179996964638033, + "grad_norm": 0.3984440565109253, + "learning_rate": 8.769137302551641e-05, + "loss": 1.9548, + "step": 2036 + }, + { + "epoch": 0.6183032326604948, + "grad_norm": 0.4203691780567169, + "learning_rate": 8.768529769137303e-05, + "loss": 2.0214, + "step": 2037 + }, + { + "epoch": 0.6186067688571862, + "grad_norm": 0.4662054181098938, + "learning_rate": 8.767922235722965e-05, + "loss": 1.8141, + "step": 2038 + }, + { + "epoch": 0.6189103050538777, + "grad_norm": 0.440121591091156, + "learning_rate": 8.767314702308628e-05, + "loss": 1.8577, + "step": 2039 + }, + { + "epoch": 0.6192138412505691, + "grad_norm": 0.4438299536705017, + "learning_rate": 8.76670716889429e-05, + "loss": 1.5552, + "step": 2040 + }, + { + "epoch": 0.6195173774472605, + "grad_norm": 0.3925747871398926, + "learning_rate": 8.766099635479951e-05, + "loss": 1.9399, + "step": 2041 + }, + { + "epoch": 0.6198209136439521, + "grad_norm": 0.4043785333633423, + "learning_rate": 8.765492102065614e-05, + "loss": 1.9208, + "step": 2042 + }, + { + "epoch": 0.6201244498406435, + "grad_norm": 0.4448244273662567, + "learning_rate": 8.764884568651276e-05, + "loss": 1.6187, + "step": 2043 + }, + { + "epoch": 0.620427986037335, + "grad_norm": 0.5388829112052917, + "learning_rate": 8.764277035236939e-05, + "loss": 1.4629, + "step": 2044 + }, + { + "epoch": 0.6207315222340264, + "grad_norm": 0.3737129867076874, + "learning_rate": 8.763669501822601e-05, + "loss": 1.9238, + "step": 2045 + }, + { + "epoch": 0.6210350584307178, + "grad_norm": 0.4435792863368988, + "learning_rate": 8.763061968408263e-05, + "loss": 1.746, + "step": 2046 + }, + { + "epoch": 0.6213385946274094, + "grad_norm": 0.3660859167575836, + "learning_rate": 8.762454434993926e-05, + "loss": 1.9025, + "step": 2047 + }, + { + "epoch": 0.6216421308241008, + "grad_norm": 0.6945536136627197, + "learning_rate": 8.761846901579587e-05, + "loss": 2.0719, + "step": 2048 + }, + { + "epoch": 0.6219456670207922, + "grad_norm": 0.5578482151031494, + "learning_rate": 8.761239368165249e-05, + "loss": 2.0642, + "step": 2049 + }, + { + "epoch": 0.6222492032174837, + "grad_norm": 0.38080549240112305, + "learning_rate": 8.760631834750912e-05, + "loss": 1.9905, + "step": 2050 + }, + { + "epoch": 0.6225527394141751, + "grad_norm": 0.39509710669517517, + "learning_rate": 8.760024301336574e-05, + "loss": 1.8086, + "step": 2051 + }, + { + "epoch": 0.6228562756108666, + "grad_norm": 0.39778873324394226, + "learning_rate": 8.759416767922236e-05, + "loss": 1.8397, + "step": 2052 + }, + { + "epoch": 0.6231598118075581, + "grad_norm": 0.4001278877258301, + "learning_rate": 8.758809234507899e-05, + "loss": 2.1541, + "step": 2053 + }, + { + "epoch": 0.6234633480042495, + "grad_norm": 0.41478973627090454, + "learning_rate": 8.75820170109356e-05, + "loss": 1.8661, + "step": 2054 + }, + { + "epoch": 0.623766884200941, + "grad_norm": 0.44780445098876953, + "learning_rate": 8.757594167679222e-05, + "loss": 2.06, + "step": 2055 + }, + { + "epoch": 0.6240704203976324, + "grad_norm": 0.4024375081062317, + "learning_rate": 8.756986634264885e-05, + "loss": 1.8951, + "step": 2056 + }, + { + "epoch": 0.6243739565943238, + "grad_norm": 0.48133009672164917, + "learning_rate": 8.756379100850547e-05, + "loss": 1.8264, + "step": 2057 + }, + { + "epoch": 0.6246774927910154, + "grad_norm": 0.4362419843673706, + "learning_rate": 8.75577156743621e-05, + "loss": 1.584, + "step": 2058 + }, + { + "epoch": 0.6249810289877068, + "grad_norm": 0.39468279480934143, + "learning_rate": 8.755164034021872e-05, + "loss": 2.1541, + "step": 2059 + }, + { + "epoch": 0.6252845651843982, + "grad_norm": 0.3956018388271332, + "learning_rate": 8.754556500607534e-05, + "loss": 2.0688, + "step": 2060 + }, + { + "epoch": 0.6255881013810897, + "grad_norm": 0.3778972327709198, + "learning_rate": 8.753948967193197e-05, + "loss": 1.2158, + "step": 2061 + }, + { + "epoch": 0.6258916375777811, + "grad_norm": 0.6592405438423157, + "learning_rate": 8.753341433778858e-05, + "loss": 2.1388, + "step": 2062 + }, + { + "epoch": 0.6261951737744726, + "grad_norm": 0.44248607754707336, + "learning_rate": 8.75273390036452e-05, + "loss": 1.7007, + "step": 2063 + }, + { + "epoch": 0.6264987099711641, + "grad_norm": 0.40454086661338806, + "learning_rate": 8.752126366950183e-05, + "loss": 1.9904, + "step": 2064 + }, + { + "epoch": 0.6268022461678555, + "grad_norm": 0.4150254428386688, + "learning_rate": 8.751518833535845e-05, + "loss": 1.825, + "step": 2065 + }, + { + "epoch": 0.627105782364547, + "grad_norm": 0.39456769824028015, + "learning_rate": 8.750911300121507e-05, + "loss": 1.6171, + "step": 2066 + }, + { + "epoch": 0.6274093185612384, + "grad_norm": 0.42913463711738586, + "learning_rate": 8.75030376670717e-05, + "loss": 1.9738, + "step": 2067 + }, + { + "epoch": 0.6277128547579299, + "grad_norm": 0.6062834858894348, + "learning_rate": 8.749696233292831e-05, + "loss": 2.3641, + "step": 2068 + }, + { + "epoch": 0.6280163909546214, + "grad_norm": 0.4486273229122162, + "learning_rate": 8.749088699878493e-05, + "loss": 1.895, + "step": 2069 + }, + { + "epoch": 0.6283199271513128, + "grad_norm": 0.6650506854057312, + "learning_rate": 8.748481166464156e-05, + "loss": 1.9425, + "step": 2070 + }, + { + "epoch": 0.6286234633480042, + "grad_norm": 0.4337095618247986, + "learning_rate": 8.747873633049818e-05, + "loss": 1.6244, + "step": 2071 + }, + { + "epoch": 0.6289269995446957, + "grad_norm": 0.39554956555366516, + "learning_rate": 8.747266099635481e-05, + "loss": 1.9773, + "step": 2072 + }, + { + "epoch": 0.6292305357413872, + "grad_norm": 0.6905329823493958, + "learning_rate": 8.746658566221143e-05, + "loss": 1.4572, + "step": 2073 + }, + { + "epoch": 0.6295340719380786, + "grad_norm": 0.4814346730709076, + "learning_rate": 8.746051032806805e-05, + "loss": 1.5925, + "step": 2074 + }, + { + "epoch": 0.6298376081347701, + "grad_norm": 0.5194016695022583, + "learning_rate": 8.745443499392468e-05, + "loss": 1.9103, + "step": 2075 + }, + { + "epoch": 0.6301411443314615, + "grad_norm": 0.38328269124031067, + "learning_rate": 8.74483596597813e-05, + "loss": 2.0036, + "step": 2076 + }, + { + "epoch": 0.630444680528153, + "grad_norm": 0.3967950642108917, + "learning_rate": 8.744228432563791e-05, + "loss": 2.0632, + "step": 2077 + }, + { + "epoch": 0.6307482167248445, + "grad_norm": 0.41844338178634644, + "learning_rate": 8.743620899149454e-05, + "loss": 1.7965, + "step": 2078 + }, + { + "epoch": 0.6310517529215359, + "grad_norm": 0.4322264790534973, + "learning_rate": 8.743013365735116e-05, + "loss": 1.6439, + "step": 2079 + }, + { + "epoch": 0.6313552891182274, + "grad_norm": 1.2367935180664062, + "learning_rate": 8.742405832320778e-05, + "loss": 1.4027, + "step": 2080 + }, + { + "epoch": 0.6316588253149188, + "grad_norm": 0.40163764357566833, + "learning_rate": 8.741798298906441e-05, + "loss": 1.6465, + "step": 2081 + }, + { + "epoch": 0.6319623615116102, + "grad_norm": 0.4429662823677063, + "learning_rate": 8.741190765492102e-05, + "loss": 2.0381, + "step": 2082 + }, + { + "epoch": 0.6322658977083017, + "grad_norm": 0.4150178134441376, + "learning_rate": 8.740583232077764e-05, + "loss": 1.9406, + "step": 2083 + }, + { + "epoch": 0.6325694339049932, + "grad_norm": 1.1689107418060303, + "learning_rate": 8.739975698663427e-05, + "loss": 2.1465, + "step": 2084 + }, + { + "epoch": 0.6328729701016846, + "grad_norm": 0.39959049224853516, + "learning_rate": 8.739368165249089e-05, + "loss": 1.7467, + "step": 2085 + }, + { + "epoch": 0.6331765062983761, + "grad_norm": 0.4441443979740143, + "learning_rate": 8.738760631834752e-05, + "loss": 1.3684, + "step": 2086 + }, + { + "epoch": 0.6334800424950675, + "grad_norm": 0.42959195375442505, + "learning_rate": 8.738153098420414e-05, + "loss": 2.0832, + "step": 2087 + }, + { + "epoch": 0.6337835786917589, + "grad_norm": 0.5253334045410156, + "learning_rate": 8.737545565006076e-05, + "loss": 1.83, + "step": 2088 + }, + { + "epoch": 0.6340871148884505, + "grad_norm": 0.4475717842578888, + "learning_rate": 8.736938031591739e-05, + "loss": 1.9088, + "step": 2089 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.4162061810493469, + "learning_rate": 8.736330498177399e-05, + "loss": 1.6869, + "step": 2090 + }, + { + "epoch": 0.6346941872818334, + "grad_norm": 0.41907912492752075, + "learning_rate": 8.735722964763062e-05, + "loss": 1.6695, + "step": 2091 + }, + { + "epoch": 0.6349977234785248, + "grad_norm": 0.4472843110561371, + "learning_rate": 8.735115431348725e-05, + "loss": 2.213, + "step": 2092 + }, + { + "epoch": 0.6353012596752162, + "grad_norm": 0.4260854125022888, + "learning_rate": 8.734507897934387e-05, + "loss": 2.2187, + "step": 2093 + }, + { + "epoch": 0.6356047958719078, + "grad_norm": 0.5154047608375549, + "learning_rate": 8.733900364520049e-05, + "loss": 1.669, + "step": 2094 + }, + { + "epoch": 0.6359083320685992, + "grad_norm": 0.42840951681137085, + "learning_rate": 8.733292831105712e-05, + "loss": 1.5714, + "step": 2095 + }, + { + "epoch": 0.6362118682652906, + "grad_norm": 0.3721560537815094, + "learning_rate": 8.732685297691373e-05, + "loss": 1.8571, + "step": 2096 + }, + { + "epoch": 0.6365154044619821, + "grad_norm": 0.38668882846832275, + "learning_rate": 8.732077764277035e-05, + "loss": 1.9735, + "step": 2097 + }, + { + "epoch": 0.6368189406586735, + "grad_norm": 0.44400742650032043, + "learning_rate": 8.731470230862698e-05, + "loss": 1.8012, + "step": 2098 + }, + { + "epoch": 0.637122476855365, + "grad_norm": 0.4170168936252594, + "learning_rate": 8.73086269744836e-05, + "loss": 2.0064, + "step": 2099 + }, + { + "epoch": 0.6374260130520565, + "grad_norm": 0.4339911937713623, + "learning_rate": 8.730255164034023e-05, + "loss": 1.8095, + "step": 2100 + }, + { + "epoch": 0.6377295492487479, + "grad_norm": 0.4953417479991913, + "learning_rate": 8.729647630619685e-05, + "loss": 1.6443, + "step": 2101 + }, + { + "epoch": 0.6380330854454394, + "grad_norm": 0.43667685985565186, + "learning_rate": 8.729040097205347e-05, + "loss": 1.5005, + "step": 2102 + }, + { + "epoch": 0.6383366216421308, + "grad_norm": 0.42101868987083435, + "learning_rate": 8.72843256379101e-05, + "loss": 1.9, + "step": 2103 + }, + { + "epoch": 0.6386401578388223, + "grad_norm": 0.4094242751598358, + "learning_rate": 8.72782503037667e-05, + "loss": 1.4354, + "step": 2104 + }, + { + "epoch": 0.6389436940355138, + "grad_norm": 0.36078140139579773, + "learning_rate": 8.727217496962333e-05, + "loss": 1.9761, + "step": 2105 + }, + { + "epoch": 0.6392472302322052, + "grad_norm": 0.40915626287460327, + "learning_rate": 8.726609963547996e-05, + "loss": 1.7398, + "step": 2106 + }, + { + "epoch": 0.6395507664288966, + "grad_norm": 0.4518681466579437, + "learning_rate": 8.726002430133658e-05, + "loss": 1.6962, + "step": 2107 + }, + { + "epoch": 0.6398543026255881, + "grad_norm": 0.41864755749702454, + "learning_rate": 8.72539489671932e-05, + "loss": 1.7477, + "step": 2108 + }, + { + "epoch": 0.6401578388222796, + "grad_norm": 0.37776780128479004, + "learning_rate": 8.724787363304983e-05, + "loss": 1.4601, + "step": 2109 + }, + { + "epoch": 0.640461375018971, + "grad_norm": 0.4602903723716736, + "learning_rate": 8.724179829890644e-05, + "loss": 1.7791, + "step": 2110 + }, + { + "epoch": 0.6407649112156625, + "grad_norm": 0.3697658181190491, + "learning_rate": 8.723572296476306e-05, + "loss": 1.8286, + "step": 2111 + }, + { + "epoch": 0.6410684474123539, + "grad_norm": 0.3810010254383087, + "learning_rate": 8.722964763061969e-05, + "loss": 1.6823, + "step": 2112 + }, + { + "epoch": 0.6413719836090453, + "grad_norm": 0.3020067811012268, + "learning_rate": 8.722357229647631e-05, + "loss": 1.4566, + "step": 2113 + }, + { + "epoch": 0.6416755198057368, + "grad_norm": 0.35782814025878906, + "learning_rate": 8.721749696233293e-05, + "loss": 1.986, + "step": 2114 + }, + { + "epoch": 0.6419790560024283, + "grad_norm": 0.4075436294078827, + "learning_rate": 8.721142162818954e-05, + "loss": 2.0318, + "step": 2115 + }, + { + "epoch": 0.6422825921991198, + "grad_norm": 0.3835841715335846, + "learning_rate": 8.720534629404618e-05, + "loss": 1.7135, + "step": 2116 + }, + { + "epoch": 0.6425861283958112, + "grad_norm": 0.45285987854003906, + "learning_rate": 8.71992709599028e-05, + "loss": 1.9568, + "step": 2117 + }, + { + "epoch": 0.6428896645925026, + "grad_norm": 0.36824312806129456, + "learning_rate": 8.719319562575941e-05, + "loss": 1.6443, + "step": 2118 + }, + { + "epoch": 0.6431932007891941, + "grad_norm": 0.4950961172580719, + "learning_rate": 8.718712029161604e-05, + "loss": 2.1261, + "step": 2119 + }, + { + "epoch": 0.6434967369858856, + "grad_norm": 0.36859118938446045, + "learning_rate": 8.718104495747267e-05, + "loss": 2.0044, + "step": 2120 + }, + { + "epoch": 0.643800273182577, + "grad_norm": 0.43870800733566284, + "learning_rate": 8.717496962332929e-05, + "loss": 1.9407, + "step": 2121 + }, + { + "epoch": 0.6441038093792685, + "grad_norm": 0.37381303310394287, + "learning_rate": 8.71688942891859e-05, + "loss": 2.1765, + "step": 2122 + }, + { + "epoch": 0.6444073455759599, + "grad_norm": 0.39354661107063293, + "learning_rate": 8.716281895504254e-05, + "loss": 1.7037, + "step": 2123 + }, + { + "epoch": 0.6447108817726513, + "grad_norm": 0.3997972011566162, + "learning_rate": 8.715674362089915e-05, + "loss": 1.9634, + "step": 2124 + }, + { + "epoch": 0.6450144179693429, + "grad_norm": 0.4059608280658722, + "learning_rate": 8.715066828675577e-05, + "loss": 1.6695, + "step": 2125 + }, + { + "epoch": 0.6453179541660343, + "grad_norm": 0.5082445740699768, + "learning_rate": 8.71445929526124e-05, + "loss": 1.8471, + "step": 2126 + }, + { + "epoch": 0.6456214903627258, + "grad_norm": 0.3610053062438965, + "learning_rate": 8.713851761846902e-05, + "loss": 1.1613, + "step": 2127 + }, + { + "epoch": 0.6459250265594172, + "grad_norm": 0.3617028295993805, + "learning_rate": 8.713244228432564e-05, + "loss": 2.1392, + "step": 2128 + }, + { + "epoch": 0.6462285627561086, + "grad_norm": 0.366720587015152, + "learning_rate": 8.712636695018225e-05, + "loss": 2.0723, + "step": 2129 + }, + { + "epoch": 0.6465320989528002, + "grad_norm": 0.6331523656845093, + "learning_rate": 8.712029161603889e-05, + "loss": 2.167, + "step": 2130 + }, + { + "epoch": 0.6468356351494916, + "grad_norm": 0.3837411403656006, + "learning_rate": 8.711421628189552e-05, + "loss": 1.9423, + "step": 2131 + }, + { + "epoch": 0.647139171346183, + "grad_norm": 0.49465852975845337, + "learning_rate": 8.710814094775212e-05, + "loss": 1.1904, + "step": 2132 + }, + { + "epoch": 0.6474427075428745, + "grad_norm": 0.37504327297210693, + "learning_rate": 8.710206561360875e-05, + "loss": 1.3686, + "step": 2133 + }, + { + "epoch": 0.6477462437395659, + "grad_norm": 0.7189307808876038, + "learning_rate": 8.709599027946538e-05, + "loss": 2.2359, + "step": 2134 + }, + { + "epoch": 0.6480497799362575, + "grad_norm": 0.40414321422576904, + "learning_rate": 8.7089914945322e-05, + "loss": 1.9962, + "step": 2135 + }, + { + "epoch": 0.6483533161329489, + "grad_norm": 1.6091177463531494, + "learning_rate": 8.708383961117862e-05, + "loss": 2.1011, + "step": 2136 + }, + { + "epoch": 0.6486568523296403, + "grad_norm": 0.38812699913978577, + "learning_rate": 8.707776427703525e-05, + "loss": 1.8092, + "step": 2137 + }, + { + "epoch": 0.6489603885263318, + "grad_norm": 0.42820391058921814, + "learning_rate": 8.707168894289186e-05, + "loss": 1.6027, + "step": 2138 + }, + { + "epoch": 0.6492639247230232, + "grad_norm": 0.9884753823280334, + "learning_rate": 8.706561360874848e-05, + "loss": 1.4781, + "step": 2139 + }, + { + "epoch": 0.6495674609197146, + "grad_norm": 0.477003276348114, + "learning_rate": 8.705953827460511e-05, + "loss": 1.5069, + "step": 2140 + }, + { + "epoch": 0.6498709971164062, + "grad_norm": 0.4502262473106384, + "learning_rate": 8.705346294046173e-05, + "loss": 2.0444, + "step": 2141 + }, + { + "epoch": 0.6501745333130976, + "grad_norm": 0.36842817068099976, + "learning_rate": 8.704738760631835e-05, + "loss": 1.8169, + "step": 2142 + }, + { + "epoch": 0.650478069509789, + "grad_norm": 0.4413151741027832, + "learning_rate": 8.704131227217496e-05, + "loss": 2.0494, + "step": 2143 + }, + { + "epoch": 0.6507816057064805, + "grad_norm": 0.35122597217559814, + "learning_rate": 8.70352369380316e-05, + "loss": 1.6942, + "step": 2144 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.48351892828941345, + "learning_rate": 8.702916160388823e-05, + "loss": 2.3677, + "step": 2145 + }, + { + "epoch": 0.6513886780998634, + "grad_norm": 0.43341419100761414, + "learning_rate": 8.702308626974483e-05, + "loss": 1.9239, + "step": 2146 + }, + { + "epoch": 0.6516922142965549, + "grad_norm": 0.36051031947135925, + "learning_rate": 8.701701093560146e-05, + "loss": 1.7721, + "step": 2147 + }, + { + "epoch": 0.6519957504932463, + "grad_norm": 0.37466931343078613, + "learning_rate": 8.701093560145809e-05, + "loss": 2.0199, + "step": 2148 + }, + { + "epoch": 0.6522992866899378, + "grad_norm": 0.4176545739173889, + "learning_rate": 8.700486026731471e-05, + "loss": 1.8806, + "step": 2149 + }, + { + "epoch": 0.6526028228866292, + "grad_norm": 0.4158160984516144, + "learning_rate": 8.699878493317133e-05, + "loss": 1.4032, + "step": 2150 + }, + { + "epoch": 0.6529063590833207, + "grad_norm": 0.3781472444534302, + "learning_rate": 8.699270959902796e-05, + "loss": 1.6158, + "step": 2151 + }, + { + "epoch": 0.6532098952800122, + "grad_norm": 0.4139382243156433, + "learning_rate": 8.698663426488457e-05, + "loss": 1.6677, + "step": 2152 + }, + { + "epoch": 0.6535134314767036, + "grad_norm": 0.5988966226577759, + "learning_rate": 8.698055893074119e-05, + "loss": 2.0821, + "step": 2153 + }, + { + "epoch": 0.653816967673395, + "grad_norm": 0.3822804391384125, + "learning_rate": 8.697448359659782e-05, + "loss": 2.2819, + "step": 2154 + }, + { + "epoch": 0.6541205038700865, + "grad_norm": 0.42142486572265625, + "learning_rate": 8.696840826245444e-05, + "loss": 2.1515, + "step": 2155 + }, + { + "epoch": 0.654424040066778, + "grad_norm": 0.3964162766933441, + "learning_rate": 8.696233292831106e-05, + "loss": 1.6801, + "step": 2156 + }, + { + "epoch": 0.6547275762634694, + "grad_norm": 0.6642559170722961, + "learning_rate": 8.695625759416767e-05, + "loss": 1.9085, + "step": 2157 + }, + { + "epoch": 0.6550311124601609, + "grad_norm": 0.4267200827598572, + "learning_rate": 8.69501822600243e-05, + "loss": 1.8194, + "step": 2158 + }, + { + "epoch": 0.6553346486568523, + "grad_norm": 0.4862426221370697, + "learning_rate": 8.694410692588094e-05, + "loss": 2.0361, + "step": 2159 + }, + { + "epoch": 0.6556381848535437, + "grad_norm": 0.45392298698425293, + "learning_rate": 8.693803159173754e-05, + "loss": 2.034, + "step": 2160 + }, + { + "epoch": 0.6559417210502353, + "grad_norm": 0.4699818193912506, + "learning_rate": 8.693195625759417e-05, + "loss": 1.0096, + "step": 2161 + }, + { + "epoch": 0.6562452572469267, + "grad_norm": 0.4601641595363617, + "learning_rate": 8.69258809234508e-05, + "loss": 1.9042, + "step": 2162 + }, + { + "epoch": 0.6565487934436182, + "grad_norm": 0.3832731544971466, + "learning_rate": 8.69198055893074e-05, + "loss": 1.7728, + "step": 2163 + }, + { + "epoch": 0.6568523296403096, + "grad_norm": 0.41405048966407776, + "learning_rate": 8.691373025516404e-05, + "loss": 1.8591, + "step": 2164 + }, + { + "epoch": 0.657155865837001, + "grad_norm": 0.4332970380783081, + "learning_rate": 8.690765492102067e-05, + "loss": 1.4433, + "step": 2165 + }, + { + "epoch": 0.6574594020336925, + "grad_norm": 0.38901615142822266, + "learning_rate": 8.690157958687728e-05, + "loss": 1.9933, + "step": 2166 + }, + { + "epoch": 0.657762938230384, + "grad_norm": 0.5068726539611816, + "learning_rate": 8.68955042527339e-05, + "loss": 1.954, + "step": 2167 + }, + { + "epoch": 0.6580664744270754, + "grad_norm": 0.4076615571975708, + "learning_rate": 8.688942891859053e-05, + "loss": 1.699, + "step": 2168 + }, + { + "epoch": 0.6583700106237669, + "grad_norm": 0.38633993268013, + "learning_rate": 8.688335358444715e-05, + "loss": 1.8453, + "step": 2169 + }, + { + "epoch": 0.6586735468204583, + "grad_norm": 0.3873181641101837, + "learning_rate": 8.687727825030377e-05, + "loss": 1.8966, + "step": 2170 + }, + { + "epoch": 0.6589770830171497, + "grad_norm": 0.4472099840641022, + "learning_rate": 8.687120291616038e-05, + "loss": 2.1017, + "step": 2171 + }, + { + "epoch": 0.6592806192138413, + "grad_norm": 0.34563758969306946, + "learning_rate": 8.686512758201702e-05, + "loss": 1.6238, + "step": 2172 + }, + { + "epoch": 0.6595841554105327, + "grad_norm": 0.4515549838542938, + "learning_rate": 8.685905224787365e-05, + "loss": 1.7878, + "step": 2173 + }, + { + "epoch": 0.6598876916072242, + "grad_norm": 0.6528467535972595, + "learning_rate": 8.685297691373025e-05, + "loss": 1.7581, + "step": 2174 + }, + { + "epoch": 0.6601912278039156, + "grad_norm": 0.345264732837677, + "learning_rate": 8.684690157958688e-05, + "loss": 1.2984, + "step": 2175 + }, + { + "epoch": 0.660494764000607, + "grad_norm": 0.3934096395969391, + "learning_rate": 8.684082624544351e-05, + "loss": 2.084, + "step": 2176 + }, + { + "epoch": 0.6607983001972986, + "grad_norm": 0.3595477044582367, + "learning_rate": 8.683475091130012e-05, + "loss": 1.9587, + "step": 2177 + }, + { + "epoch": 0.66110183639399, + "grad_norm": 0.4324481189250946, + "learning_rate": 8.682867557715675e-05, + "loss": 1.8877, + "step": 2178 + }, + { + "epoch": 0.6614053725906814, + "grad_norm": 0.4493394196033478, + "learning_rate": 8.682260024301338e-05, + "loss": 1.8291, + "step": 2179 + }, + { + "epoch": 0.6617089087873729, + "grad_norm": 0.4085356891155243, + "learning_rate": 8.681652490887e-05, + "loss": 2.1646, + "step": 2180 + }, + { + "epoch": 0.6620124449840643, + "grad_norm": 0.4380393624305725, + "learning_rate": 8.681044957472661e-05, + "loss": 1.3779, + "step": 2181 + }, + { + "epoch": 0.6623159811807559, + "grad_norm": 0.3621211349964142, + "learning_rate": 8.680437424058324e-05, + "loss": 1.7786, + "step": 2182 + }, + { + "epoch": 0.6626195173774473, + "grad_norm": 0.49654054641723633, + "learning_rate": 8.679829890643986e-05, + "loss": 1.2754, + "step": 2183 + }, + { + "epoch": 0.6629230535741387, + "grad_norm": 0.49035829305648804, + "learning_rate": 8.679222357229648e-05, + "loss": 1.4453, + "step": 2184 + }, + { + "epoch": 0.6632265897708302, + "grad_norm": 0.5359811782836914, + "learning_rate": 8.67861482381531e-05, + "loss": 1.4067, + "step": 2185 + }, + { + "epoch": 0.6635301259675216, + "grad_norm": 0.4120253622531891, + "learning_rate": 8.678007290400973e-05, + "loss": 2.235, + "step": 2186 + }, + { + "epoch": 0.6638336621642131, + "grad_norm": 0.3773285448551178, + "learning_rate": 8.677399756986634e-05, + "loss": 2.1406, + "step": 2187 + }, + { + "epoch": 0.6641371983609046, + "grad_norm": 0.3956649899482727, + "learning_rate": 8.676792223572296e-05, + "loss": 1.5831, + "step": 2188 + }, + { + "epoch": 0.664440734557596, + "grad_norm": 0.3894088864326477, + "learning_rate": 8.676184690157959e-05, + "loss": 1.9276, + "step": 2189 + }, + { + "epoch": 0.6647442707542874, + "grad_norm": 0.4932451546192169, + "learning_rate": 8.675577156743622e-05, + "loss": 2.1134, + "step": 2190 + }, + { + "epoch": 0.6650478069509789, + "grad_norm": 0.41271400451660156, + "learning_rate": 8.674969623329283e-05, + "loss": 2.1299, + "step": 2191 + }, + { + "epoch": 0.6653513431476703, + "grad_norm": 0.596319317817688, + "learning_rate": 8.674362089914946e-05, + "loss": 1.4789, + "step": 2192 + }, + { + "epoch": 0.6656548793443618, + "grad_norm": 0.4255685806274414, + "learning_rate": 8.673754556500609e-05, + "loss": 1.7751, + "step": 2193 + }, + { + "epoch": 0.6659584155410533, + "grad_norm": 0.371003657579422, + "learning_rate": 8.67314702308627e-05, + "loss": 1.9355, + "step": 2194 + }, + { + "epoch": 0.6662619517377447, + "grad_norm": 0.43426600098609924, + "learning_rate": 8.672539489671932e-05, + "loss": 2.123, + "step": 2195 + }, + { + "epoch": 0.6665654879344362, + "grad_norm": 0.40644875168800354, + "learning_rate": 8.671931956257595e-05, + "loss": 2.1907, + "step": 2196 + }, + { + "epoch": 0.6668690241311276, + "grad_norm": 0.4468652904033661, + "learning_rate": 8.671324422843257e-05, + "loss": 1.8754, + "step": 2197 + }, + { + "epoch": 0.6671725603278191, + "grad_norm": 0.34468400478363037, + "learning_rate": 8.670716889428919e-05, + "loss": 1.989, + "step": 2198 + }, + { + "epoch": 0.6674760965245106, + "grad_norm": 0.430462121963501, + "learning_rate": 8.67010935601458e-05, + "loss": 1.6021, + "step": 2199 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.42845168709754944, + "learning_rate": 8.669501822600244e-05, + "loss": 2.1036, + "step": 2200 + }, + { + "epoch": 0.6680831689178934, + "grad_norm": 0.4242333769798279, + "learning_rate": 8.668894289185905e-05, + "loss": 1.8518, + "step": 2201 + }, + { + "epoch": 0.6683867051145849, + "grad_norm": 0.3754311800003052, + "learning_rate": 8.668286755771567e-05, + "loss": 1.8616, + "step": 2202 + }, + { + "epoch": 0.6686902413112764, + "grad_norm": 0.39913347363471985, + "learning_rate": 8.66767922235723e-05, + "loss": 2.2196, + "step": 2203 + }, + { + "epoch": 0.6689937775079678, + "grad_norm": 0.3791050910949707, + "learning_rate": 8.667071688942893e-05, + "loss": 1.8651, + "step": 2204 + }, + { + "epoch": 0.6692973137046593, + "grad_norm": 0.46585163474082947, + "learning_rate": 8.666464155528554e-05, + "loss": 2.0577, + "step": 2205 + }, + { + "epoch": 0.6696008499013507, + "grad_norm": 0.4098934233188629, + "learning_rate": 8.665856622114217e-05, + "loss": 2.0225, + "step": 2206 + }, + { + "epoch": 0.6699043860980421, + "grad_norm": 0.3545132577419281, + "learning_rate": 8.66524908869988e-05, + "loss": 1.8955, + "step": 2207 + }, + { + "epoch": 0.6702079222947337, + "grad_norm": 0.4183339774608612, + "learning_rate": 8.664641555285542e-05, + "loss": 2.1093, + "step": 2208 + }, + { + "epoch": 0.6705114584914251, + "grad_norm": 0.378439724445343, + "learning_rate": 8.664034021871203e-05, + "loss": 1.8865, + "step": 2209 + }, + { + "epoch": 0.6708149946881166, + "grad_norm": 0.45795947313308716, + "learning_rate": 8.663426488456865e-05, + "loss": 1.8891, + "step": 2210 + }, + { + "epoch": 0.671118530884808, + "grad_norm": 0.3554634153842926, + "learning_rate": 8.662818955042528e-05, + "loss": 2.1196, + "step": 2211 + }, + { + "epoch": 0.6714220670814994, + "grad_norm": 0.4456568956375122, + "learning_rate": 8.66221142162819e-05, + "loss": 1.6737, + "step": 2212 + }, + { + "epoch": 0.671725603278191, + "grad_norm": 0.39127472043037415, + "learning_rate": 8.661603888213852e-05, + "loss": 1.989, + "step": 2213 + }, + { + "epoch": 0.6720291394748824, + "grad_norm": 0.4240843653678894, + "learning_rate": 8.660996354799515e-05, + "loss": 2.0285, + "step": 2214 + }, + { + "epoch": 0.6723326756715738, + "grad_norm": 0.40605032444000244, + "learning_rate": 8.660388821385176e-05, + "loss": 1.6815, + "step": 2215 + }, + { + "epoch": 0.6726362118682653, + "grad_norm": 0.4075249433517456, + "learning_rate": 8.659781287970838e-05, + "loss": 1.8847, + "step": 2216 + }, + { + "epoch": 0.6729397480649567, + "grad_norm": 0.38832414150238037, + "learning_rate": 8.659173754556501e-05, + "loss": 1.7957, + "step": 2217 + }, + { + "epoch": 0.6732432842616483, + "grad_norm": 0.40097537636756897, + "learning_rate": 8.658566221142164e-05, + "loss": 1.7386, + "step": 2218 + }, + { + "epoch": 0.6735468204583397, + "grad_norm": 0.41220805048942566, + "learning_rate": 8.657958687727825e-05, + "loss": 2.1473, + "step": 2219 + }, + { + "epoch": 0.6738503566550311, + "grad_norm": 0.4157550036907196, + "learning_rate": 8.657351154313488e-05, + "loss": 1.7435, + "step": 2220 + }, + { + "epoch": 0.6741538928517226, + "grad_norm": 0.9074850082397461, + "learning_rate": 8.656743620899151e-05, + "loss": 1.7336, + "step": 2221 + }, + { + "epoch": 0.674457429048414, + "grad_norm": 0.4011635482311249, + "learning_rate": 8.656136087484813e-05, + "loss": 1.9814, + "step": 2222 + }, + { + "epoch": 0.6747609652451054, + "grad_norm": 0.4295683801174164, + "learning_rate": 8.655528554070474e-05, + "loss": 1.6332, + "step": 2223 + }, + { + "epoch": 0.675064501441797, + "grad_norm": 0.424452006816864, + "learning_rate": 8.654921020656136e-05, + "loss": 1.6975, + "step": 2224 + }, + { + "epoch": 0.6753680376384884, + "grad_norm": 0.3975834846496582, + "learning_rate": 8.654313487241799e-05, + "loss": 1.6783, + "step": 2225 + }, + { + "epoch": 0.6756715738351798, + "grad_norm": 0.49879249930381775, + "learning_rate": 8.653705953827461e-05, + "loss": 2.0694, + "step": 2226 + }, + { + "epoch": 0.6759751100318713, + "grad_norm": 0.424622505903244, + "learning_rate": 8.653098420413123e-05, + "loss": 1.7089, + "step": 2227 + }, + { + "epoch": 0.6762786462285627, + "grad_norm": 0.497159868478775, + "learning_rate": 8.652490886998786e-05, + "loss": 1.8934, + "step": 2228 + }, + { + "epoch": 0.6765821824252543, + "grad_norm": 0.40200483798980713, + "learning_rate": 8.651883353584447e-05, + "loss": 2.0158, + "step": 2229 + }, + { + "epoch": 0.6768857186219457, + "grad_norm": 0.4294535219669342, + "learning_rate": 8.651275820170109e-05, + "loss": 1.815, + "step": 2230 + }, + { + "epoch": 0.6771892548186371, + "grad_norm": 0.5176182389259338, + "learning_rate": 8.650668286755772e-05, + "loss": 2.0518, + "step": 2231 + }, + { + "epoch": 0.6774927910153286, + "grad_norm": 0.44558650255203247, + "learning_rate": 8.650060753341435e-05, + "loss": 1.7201, + "step": 2232 + }, + { + "epoch": 0.67779632721202, + "grad_norm": 0.38811054825782776, + "learning_rate": 8.649453219927096e-05, + "loss": 1.5197, + "step": 2233 + }, + { + "epoch": 0.6780998634087115, + "grad_norm": 0.3874174952507019, + "learning_rate": 8.648845686512759e-05, + "loss": 1.8105, + "step": 2234 + }, + { + "epoch": 0.678403399605403, + "grad_norm": 0.6453530788421631, + "learning_rate": 8.648238153098422e-05, + "loss": 1.585, + "step": 2235 + }, + { + "epoch": 0.6787069358020944, + "grad_norm": 0.4314938485622406, + "learning_rate": 8.647630619684082e-05, + "loss": 1.7869, + "step": 2236 + }, + { + "epoch": 0.6790104719987858, + "grad_norm": 0.37230706214904785, + "learning_rate": 8.647023086269745e-05, + "loss": 1.5571, + "step": 2237 + }, + { + "epoch": 0.6793140081954773, + "grad_norm": 0.47215935587882996, + "learning_rate": 8.646415552855407e-05, + "loss": 2.0845, + "step": 2238 + }, + { + "epoch": 0.6796175443921688, + "grad_norm": 0.4179088771343231, + "learning_rate": 8.64580801944107e-05, + "loss": 2.0181, + "step": 2239 + }, + { + "epoch": 0.6799210805888602, + "grad_norm": 0.7629103660583496, + "learning_rate": 8.645200486026732e-05, + "loss": 2.0156, + "step": 2240 + }, + { + "epoch": 0.6802246167855517, + "grad_norm": 0.3792973756790161, + "learning_rate": 8.644592952612394e-05, + "loss": 1.9137, + "step": 2241 + }, + { + "epoch": 0.6805281529822431, + "grad_norm": 0.38583695888519287, + "learning_rate": 8.643985419198057e-05, + "loss": 1.8321, + "step": 2242 + }, + { + "epoch": 0.6808316891789346, + "grad_norm": 0.4620136320590973, + "learning_rate": 8.643377885783718e-05, + "loss": 1.4017, + "step": 2243 + }, + { + "epoch": 0.6811352253756261, + "grad_norm": 0.47091394662857056, + "learning_rate": 8.64277035236938e-05, + "loss": 2.2232, + "step": 2244 + }, + { + "epoch": 0.6814387615723175, + "grad_norm": 0.3809249699115753, + "learning_rate": 8.642162818955043e-05, + "loss": 1.7188, + "step": 2245 + }, + { + "epoch": 0.681742297769009, + "grad_norm": 0.4558849334716797, + "learning_rate": 8.641555285540706e-05, + "loss": 1.7074, + "step": 2246 + }, + { + "epoch": 0.6820458339657004, + "grad_norm": 0.39358267188072205, + "learning_rate": 8.640947752126367e-05, + "loss": 1.6826, + "step": 2247 + }, + { + "epoch": 0.6823493701623918, + "grad_norm": 0.5007576942443848, + "learning_rate": 8.64034021871203e-05, + "loss": 1.683, + "step": 2248 + }, + { + "epoch": 0.6826529063590833, + "grad_norm": 0.43831315636634827, + "learning_rate": 8.639732685297693e-05, + "loss": 1.4773, + "step": 2249 + }, + { + "epoch": 0.6829564425557748, + "grad_norm": 0.41979843378067017, + "learning_rate": 8.639125151883353e-05, + "loss": 1.8137, + "step": 2250 + }, + { + "epoch": 0.6832599787524662, + "grad_norm": 0.4662984311580658, + "learning_rate": 8.638517618469016e-05, + "loss": 1.6978, + "step": 2251 + }, + { + "epoch": 0.6835635149491577, + "grad_norm": 0.4381478428840637, + "learning_rate": 8.637910085054678e-05, + "loss": 2.0157, + "step": 2252 + }, + { + "epoch": 0.6838670511458491, + "grad_norm": 0.7363194823265076, + "learning_rate": 8.637302551640341e-05, + "loss": 1.5466, + "step": 2253 + }, + { + "epoch": 0.6841705873425405, + "grad_norm": 0.5327618718147278, + "learning_rate": 8.636695018226003e-05, + "loss": 1.8425, + "step": 2254 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.4380737245082855, + "learning_rate": 8.636087484811665e-05, + "loss": 1.7879, + "step": 2255 + }, + { + "epoch": 0.6847776597359235, + "grad_norm": 0.8782259821891785, + "learning_rate": 8.635479951397328e-05, + "loss": 1.8806, + "step": 2256 + }, + { + "epoch": 0.685081195932615, + "grad_norm": 0.3841392397880554, + "learning_rate": 8.63487241798299e-05, + "loss": 2.014, + "step": 2257 + }, + { + "epoch": 0.6853847321293064, + "grad_norm": 0.39896446466445923, + "learning_rate": 8.634264884568651e-05, + "loss": 1.9356, + "step": 2258 + }, + { + "epoch": 0.6856882683259978, + "grad_norm": 0.41541773080825806, + "learning_rate": 8.633657351154314e-05, + "loss": 2.1365, + "step": 2259 + }, + { + "epoch": 0.6859918045226894, + "grad_norm": 0.453948438167572, + "learning_rate": 8.633049817739976e-05, + "loss": 1.9459, + "step": 2260 + }, + { + "epoch": 0.6862953407193808, + "grad_norm": 0.6398829221725464, + "learning_rate": 8.632442284325638e-05, + "loss": 1.3556, + "step": 2261 + }, + { + "epoch": 0.6865988769160722, + "grad_norm": 0.43574538826942444, + "learning_rate": 8.631834750911301e-05, + "loss": 1.9142, + "step": 2262 + }, + { + "epoch": 0.6869024131127637, + "grad_norm": 0.39180728793144226, + "learning_rate": 8.631227217496964e-05, + "loss": 1.8198, + "step": 2263 + }, + { + "epoch": 0.6872059493094551, + "grad_norm": 0.4146488904953003, + "learning_rate": 8.630619684082624e-05, + "loss": 1.7776, + "step": 2264 + }, + { + "epoch": 0.6875094855061467, + "grad_norm": 0.3681737184524536, + "learning_rate": 8.630012150668287e-05, + "loss": 1.4926, + "step": 2265 + }, + { + "epoch": 0.6878130217028381, + "grad_norm": 0.44278883934020996, + "learning_rate": 8.629404617253949e-05, + "loss": 1.8021, + "step": 2266 + }, + { + "epoch": 0.6881165578995295, + "grad_norm": 0.4687512218952179, + "learning_rate": 8.628797083839612e-05, + "loss": 1.75, + "step": 2267 + }, + { + "epoch": 0.688420094096221, + "grad_norm": 0.4102340042591095, + "learning_rate": 8.628189550425274e-05, + "loss": 1.9249, + "step": 2268 + }, + { + "epoch": 0.6887236302929124, + "grad_norm": 0.44898685812950134, + "learning_rate": 8.627582017010936e-05, + "loss": 1.9914, + "step": 2269 + }, + { + "epoch": 0.6890271664896039, + "grad_norm": 0.451225221157074, + "learning_rate": 8.626974483596599e-05, + "loss": 1.71, + "step": 2270 + }, + { + "epoch": 0.6893307026862954, + "grad_norm": 0.7062796950340271, + "learning_rate": 8.62636695018226e-05, + "loss": 1.2907, + "step": 2271 + }, + { + "epoch": 0.6896342388829868, + "grad_norm": 0.39842337369918823, + "learning_rate": 8.625759416767922e-05, + "loss": 1.6578, + "step": 2272 + }, + { + "epoch": 0.6899377750796782, + "grad_norm": 0.33577829599380493, + "learning_rate": 8.625151883353585e-05, + "loss": 1.7332, + "step": 2273 + }, + { + "epoch": 0.6902413112763697, + "grad_norm": 0.43298929929733276, + "learning_rate": 8.624544349939247e-05, + "loss": 2.0028, + "step": 2274 + }, + { + "epoch": 0.6905448474730611, + "grad_norm": 0.4451911449432373, + "learning_rate": 8.623936816524909e-05, + "loss": 1.4601, + "step": 2275 + }, + { + "epoch": 0.6908483836697527, + "grad_norm": 0.4683527946472168, + "learning_rate": 8.623329283110572e-05, + "loss": 1.5981, + "step": 2276 + }, + { + "epoch": 0.6911519198664441, + "grad_norm": 0.4420105814933777, + "learning_rate": 8.622721749696235e-05, + "loss": 1.6975, + "step": 2277 + }, + { + "epoch": 0.6914554560631355, + "grad_norm": 0.3732719421386719, + "learning_rate": 8.622114216281895e-05, + "loss": 2.1493, + "step": 2278 + }, + { + "epoch": 0.691758992259827, + "grad_norm": 0.4039726257324219, + "learning_rate": 8.621506682867558e-05, + "loss": 2.0208, + "step": 2279 + }, + { + "epoch": 0.6920625284565184, + "grad_norm": 0.35387054085731506, + "learning_rate": 8.62089914945322e-05, + "loss": 2.0324, + "step": 2280 + }, + { + "epoch": 0.6923660646532099, + "grad_norm": 0.4533388912677765, + "learning_rate": 8.620291616038883e-05, + "loss": 1.7281, + "step": 2281 + }, + { + "epoch": 0.6926696008499014, + "grad_norm": 0.37299293279647827, + "learning_rate": 8.619684082624545e-05, + "loss": 1.9662, + "step": 2282 + }, + { + "epoch": 0.6929731370465928, + "grad_norm": 0.41872239112854004, + "learning_rate": 8.619076549210207e-05, + "loss": 1.9887, + "step": 2283 + }, + { + "epoch": 0.6932766732432842, + "grad_norm": 0.8140760064125061, + "learning_rate": 8.61846901579587e-05, + "loss": 2.1649, + "step": 2284 + }, + { + "epoch": 0.6935802094399757, + "grad_norm": 0.3966423571109772, + "learning_rate": 8.617861482381531e-05, + "loss": 1.421, + "step": 2285 + }, + { + "epoch": 0.6938837456366672, + "grad_norm": 0.36617428064346313, + "learning_rate": 8.617253948967193e-05, + "loss": 1.8174, + "step": 2286 + }, + { + "epoch": 0.6941872818333586, + "grad_norm": 0.41297128796577454, + "learning_rate": 8.616646415552856e-05, + "loss": 1.9215, + "step": 2287 + }, + { + "epoch": 0.6944908180300501, + "grad_norm": 0.48277321457862854, + "learning_rate": 8.616038882138518e-05, + "loss": 1.3551, + "step": 2288 + }, + { + "epoch": 0.6947943542267415, + "grad_norm": 0.41190510988235474, + "learning_rate": 8.61543134872418e-05, + "loss": 1.8701, + "step": 2289 + }, + { + "epoch": 0.695097890423433, + "grad_norm": 0.34471115469932556, + "learning_rate": 8.614823815309843e-05, + "loss": 1.5073, + "step": 2290 + }, + { + "epoch": 0.6954014266201245, + "grad_norm": 0.4469250738620758, + "learning_rate": 8.614216281895504e-05, + "loss": 1.8257, + "step": 2291 + }, + { + "epoch": 0.6957049628168159, + "grad_norm": 0.38356101512908936, + "learning_rate": 8.613608748481166e-05, + "loss": 1.63, + "step": 2292 + }, + { + "epoch": 0.6960084990135074, + "grad_norm": 0.3836432099342346, + "learning_rate": 8.613001215066829e-05, + "loss": 1.6294, + "step": 2293 + }, + { + "epoch": 0.6963120352101988, + "grad_norm": 1.1250473260879517, + "learning_rate": 8.612393681652491e-05, + "loss": 1.0634, + "step": 2294 + }, + { + "epoch": 0.6966155714068902, + "grad_norm": 0.39849042892456055, + "learning_rate": 8.611786148238154e-05, + "loss": 1.4908, + "step": 2295 + }, + { + "epoch": 0.6969191076035818, + "grad_norm": 1.0617260932922363, + "learning_rate": 8.611178614823816e-05, + "loss": 2.0693, + "step": 2296 + }, + { + "epoch": 0.6972226438002732, + "grad_norm": 0.44789618253707886, + "learning_rate": 8.610571081409478e-05, + "loss": 1.7397, + "step": 2297 + }, + { + "epoch": 0.6975261799969646, + "grad_norm": 0.7480859756469727, + "learning_rate": 8.60996354799514e-05, + "loss": 1.8385, + "step": 2298 + }, + { + "epoch": 0.6978297161936561, + "grad_norm": 0.3201582133769989, + "learning_rate": 8.609356014580802e-05, + "loss": 1.4323, + "step": 2299 + }, + { + "epoch": 0.6981332523903475, + "grad_norm": 0.4212173521518707, + "learning_rate": 8.608748481166464e-05, + "loss": 1.6153, + "step": 2300 + }, + { + "epoch": 0.6984367885870391, + "grad_norm": 0.39297157526016235, + "learning_rate": 8.608140947752127e-05, + "loss": 1.9207, + "step": 2301 + }, + { + "epoch": 0.6987403247837305, + "grad_norm": 0.4868420660495758, + "learning_rate": 8.607533414337789e-05, + "loss": 1.8056, + "step": 2302 + }, + { + "epoch": 0.6990438609804219, + "grad_norm": 0.518147885799408, + "learning_rate": 8.60692588092345e-05, + "loss": 1.4521, + "step": 2303 + }, + { + "epoch": 0.6993473971771134, + "grad_norm": 0.4484739899635315, + "learning_rate": 8.606318347509114e-05, + "loss": 1.8651, + "step": 2304 + }, + { + "epoch": 0.6996509333738048, + "grad_norm": 0.4859076738357544, + "learning_rate": 8.605710814094775e-05, + "loss": 1.6752, + "step": 2305 + }, + { + "epoch": 0.6999544695704962, + "grad_norm": 0.4186297655105591, + "learning_rate": 8.605103280680437e-05, + "loss": 2.0165, + "step": 2306 + }, + { + "epoch": 0.7002580057671878, + "grad_norm": 0.34496191143989563, + "learning_rate": 8.6044957472661e-05, + "loss": 2.1395, + "step": 2307 + }, + { + "epoch": 0.7005615419638792, + "grad_norm": 0.3636651933193207, + "learning_rate": 8.603888213851762e-05, + "loss": 1.2115, + "step": 2308 + }, + { + "epoch": 0.7008650781605706, + "grad_norm": 0.38789573311805725, + "learning_rate": 8.603280680437425e-05, + "loss": 1.7992, + "step": 2309 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.41874828934669495, + "learning_rate": 8.602673147023087e-05, + "loss": 1.9813, + "step": 2310 + }, + { + "epoch": 0.7014721505539535, + "grad_norm": 0.6681198477745056, + "learning_rate": 8.602065613608749e-05, + "loss": 2.0765, + "step": 2311 + }, + { + "epoch": 0.7017756867506451, + "grad_norm": 0.4358363151550293, + "learning_rate": 8.601458080194412e-05, + "loss": 2.0836, + "step": 2312 + }, + { + "epoch": 0.7020792229473365, + "grad_norm": 0.4268842339515686, + "learning_rate": 8.600850546780073e-05, + "loss": 1.963, + "step": 2313 + }, + { + "epoch": 0.7023827591440279, + "grad_norm": 0.43456903100013733, + "learning_rate": 8.600243013365735e-05, + "loss": 2.031, + "step": 2314 + }, + { + "epoch": 0.7026862953407194, + "grad_norm": 0.9157736301422119, + "learning_rate": 8.599635479951398e-05, + "loss": 1.6625, + "step": 2315 + }, + { + "epoch": 0.7029898315374108, + "grad_norm": 0.41116464138031006, + "learning_rate": 8.59902794653706e-05, + "loss": 1.3184, + "step": 2316 + }, + { + "epoch": 0.7032933677341023, + "grad_norm": 0.38889098167419434, + "learning_rate": 8.598420413122722e-05, + "loss": 1.5656, + "step": 2317 + }, + { + "epoch": 0.7035969039307938, + "grad_norm": 0.4620545208454132, + "learning_rate": 8.597812879708385e-05, + "loss": 1.9967, + "step": 2318 + }, + { + "epoch": 0.7039004401274852, + "grad_norm": 0.44721749424934387, + "learning_rate": 8.597205346294046e-05, + "loss": 1.9404, + "step": 2319 + }, + { + "epoch": 0.7042039763241766, + "grad_norm": 0.46273544430732727, + "learning_rate": 8.596597812879708e-05, + "loss": 1.9765, + "step": 2320 + }, + { + "epoch": 0.7045075125208681, + "grad_norm": 0.3636545240879059, + "learning_rate": 8.595990279465371e-05, + "loss": 1.8925, + "step": 2321 + }, + { + "epoch": 0.7048110487175596, + "grad_norm": 0.49978089332580566, + "learning_rate": 8.595382746051033e-05, + "loss": 1.9087, + "step": 2322 + }, + { + "epoch": 0.705114584914251, + "grad_norm": 0.3676183819770813, + "learning_rate": 8.594775212636695e-05, + "loss": 1.9449, + "step": 2323 + }, + { + "epoch": 0.7054181211109425, + "grad_norm": 0.3930191397666931, + "learning_rate": 8.594167679222358e-05, + "loss": 2.1377, + "step": 2324 + }, + { + "epoch": 0.7057216573076339, + "grad_norm": 0.4476909935474396, + "learning_rate": 8.59356014580802e-05, + "loss": 1.886, + "step": 2325 + }, + { + "epoch": 0.7060251935043254, + "grad_norm": 0.4343526363372803, + "learning_rate": 8.592952612393683e-05, + "loss": 1.5936, + "step": 2326 + }, + { + "epoch": 0.7063287297010169, + "grad_norm": 0.42617321014404297, + "learning_rate": 8.592345078979344e-05, + "loss": 1.8253, + "step": 2327 + }, + { + "epoch": 0.7066322658977083, + "grad_norm": 0.4090782105922699, + "learning_rate": 8.591737545565006e-05, + "loss": 1.8596, + "step": 2328 + }, + { + "epoch": 0.7069358020943998, + "grad_norm": 0.4233112633228302, + "learning_rate": 8.591130012150669e-05, + "loss": 1.8529, + "step": 2329 + }, + { + "epoch": 0.7072393382910912, + "grad_norm": 0.4159391224384308, + "learning_rate": 8.590522478736331e-05, + "loss": 2.202, + "step": 2330 + }, + { + "epoch": 0.7075428744877826, + "grad_norm": 0.4303951859474182, + "learning_rate": 8.589914945321993e-05, + "loss": 2.2339, + "step": 2331 + }, + { + "epoch": 0.7078464106844741, + "grad_norm": 0.431086927652359, + "learning_rate": 8.589307411907656e-05, + "loss": 1.7753, + "step": 2332 + }, + { + "epoch": 0.7081499468811656, + "grad_norm": 0.4268263280391693, + "learning_rate": 8.588699878493317e-05, + "loss": 2.1529, + "step": 2333 + }, + { + "epoch": 0.708453483077857, + "grad_norm": 0.35274428129196167, + "learning_rate": 8.588092345078979e-05, + "loss": 1.4355, + "step": 2334 + }, + { + "epoch": 0.7087570192745485, + "grad_norm": 0.3985956311225891, + "learning_rate": 8.587484811664642e-05, + "loss": 1.7141, + "step": 2335 + }, + { + "epoch": 0.7090605554712399, + "grad_norm": 0.44768375158309937, + "learning_rate": 8.586877278250304e-05, + "loss": 1.6654, + "step": 2336 + }, + { + "epoch": 0.7093640916679314, + "grad_norm": 0.38372135162353516, + "learning_rate": 8.586269744835966e-05, + "loss": 1.8577, + "step": 2337 + }, + { + "epoch": 0.7096676278646229, + "grad_norm": 0.459806889295578, + "learning_rate": 8.585662211421629e-05, + "loss": 1.8502, + "step": 2338 + }, + { + "epoch": 0.7099711640613143, + "grad_norm": 0.36689698696136475, + "learning_rate": 8.58505467800729e-05, + "loss": 1.9931, + "step": 2339 + }, + { + "epoch": 0.7102747002580058, + "grad_norm": 0.5424461960792542, + "learning_rate": 8.584447144592954e-05, + "loss": 1.6979, + "step": 2340 + }, + { + "epoch": 0.7105782364546972, + "grad_norm": 0.663773238658905, + "learning_rate": 8.583839611178615e-05, + "loss": 1.9633, + "step": 2341 + }, + { + "epoch": 0.7108817726513886, + "grad_norm": 2.337242603302002, + "learning_rate": 8.583232077764277e-05, + "loss": 1.2587, + "step": 2342 + }, + { + "epoch": 0.7111853088480802, + "grad_norm": 0.4255028963088989, + "learning_rate": 8.58262454434994e-05, + "loss": 1.935, + "step": 2343 + }, + { + "epoch": 0.7114888450447716, + "grad_norm": 0.796564519405365, + "learning_rate": 8.582017010935602e-05, + "loss": 2.1544, + "step": 2344 + }, + { + "epoch": 0.711792381241463, + "grad_norm": 0.42163416743278503, + "learning_rate": 8.581409477521264e-05, + "loss": 1.9419, + "step": 2345 + }, + { + "epoch": 0.7120959174381545, + "grad_norm": 0.49495795369148254, + "learning_rate": 8.580801944106927e-05, + "loss": 1.6143, + "step": 2346 + }, + { + "epoch": 0.7123994536348459, + "grad_norm": 0.5532099008560181, + "learning_rate": 8.580194410692588e-05, + "loss": 1.9407, + "step": 2347 + }, + { + "epoch": 0.7127029898315375, + "grad_norm": 0.434341162443161, + "learning_rate": 8.57958687727825e-05, + "loss": 1.9626, + "step": 2348 + }, + { + "epoch": 0.7130065260282289, + "grad_norm": 0.5338404774665833, + "learning_rate": 8.578979343863913e-05, + "loss": 1.801, + "step": 2349 + }, + { + "epoch": 0.7133100622249203, + "grad_norm": 0.48087722063064575, + "learning_rate": 8.578371810449575e-05, + "loss": 2.2286, + "step": 2350 + }, + { + "epoch": 0.7136135984216118, + "grad_norm": 0.43688857555389404, + "learning_rate": 8.577764277035237e-05, + "loss": 1.7534, + "step": 2351 + }, + { + "epoch": 0.7139171346183032, + "grad_norm": 1.3858163356781006, + "learning_rate": 8.5771567436209e-05, + "loss": 1.4466, + "step": 2352 + }, + { + "epoch": 0.7142206708149947, + "grad_norm": 0.6149253249168396, + "learning_rate": 8.576549210206562e-05, + "loss": 2.0683, + "step": 2353 + }, + { + "epoch": 0.7145242070116862, + "grad_norm": 0.49920403957366943, + "learning_rate": 8.575941676792225e-05, + "loss": 1.9266, + "step": 2354 + }, + { + "epoch": 0.7148277432083776, + "grad_norm": 0.41959667205810547, + "learning_rate": 8.575334143377886e-05, + "loss": 1.7429, + "step": 2355 + }, + { + "epoch": 0.715131279405069, + "grad_norm": 0.5163973569869995, + "learning_rate": 8.574726609963548e-05, + "loss": 1.2735, + "step": 2356 + }, + { + "epoch": 0.7154348156017605, + "grad_norm": 0.37799614667892456, + "learning_rate": 8.574119076549211e-05, + "loss": 2.2448, + "step": 2357 + }, + { + "epoch": 0.7157383517984519, + "grad_norm": 0.43541470170021057, + "learning_rate": 8.573511543134873e-05, + "loss": 2.2739, + "step": 2358 + }, + { + "epoch": 0.7160418879951435, + "grad_norm": 1.3038394451141357, + "learning_rate": 8.572904009720535e-05, + "loss": 1.9907, + "step": 2359 + }, + { + "epoch": 0.7163454241918349, + "grad_norm": 0.6111695766448975, + "learning_rate": 8.572296476306198e-05, + "loss": 1.5575, + "step": 2360 + }, + { + "epoch": 0.7166489603885263, + "grad_norm": 1.2944895029067993, + "learning_rate": 8.57168894289186e-05, + "loss": 1.8409, + "step": 2361 + }, + { + "epoch": 0.7169524965852178, + "grad_norm": 0.42008545994758606, + "learning_rate": 8.571081409477521e-05, + "loss": 1.8825, + "step": 2362 + }, + { + "epoch": 0.7172560327819092, + "grad_norm": 0.48183196783065796, + "learning_rate": 8.570473876063184e-05, + "loss": 1.9233, + "step": 2363 + }, + { + "epoch": 0.7175595689786007, + "grad_norm": 0.41434016823768616, + "learning_rate": 8.569866342648846e-05, + "loss": 2.0636, + "step": 2364 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.3774077296257019, + "learning_rate": 8.569258809234508e-05, + "loss": 1.9155, + "step": 2365 + }, + { + "epoch": 0.7181666413719836, + "grad_norm": 0.350824236869812, + "learning_rate": 8.568651275820171e-05, + "loss": 1.4894, + "step": 2366 + }, + { + "epoch": 0.718470177568675, + "grad_norm": 1.5183087587356567, + "learning_rate": 8.568043742405833e-05, + "loss": 2.0812, + "step": 2367 + }, + { + "epoch": 0.7187737137653665, + "grad_norm": 0.3757447600364685, + "learning_rate": 8.567436208991496e-05, + "loss": 1.4652, + "step": 2368 + }, + { + "epoch": 0.719077249962058, + "grad_norm": 0.4151865839958191, + "learning_rate": 8.566828675577157e-05, + "loss": 1.677, + "step": 2369 + }, + { + "epoch": 0.7193807861587495, + "grad_norm": 0.4992164075374603, + "learning_rate": 8.566221142162819e-05, + "loss": 2.1459, + "step": 2370 + }, + { + "epoch": 0.7196843223554409, + "grad_norm": 0.3945586085319519, + "learning_rate": 8.565613608748482e-05, + "loss": 1.9362, + "step": 2371 + }, + { + "epoch": 0.7199878585521323, + "grad_norm": 0.4325678050518036, + "learning_rate": 8.565006075334144e-05, + "loss": 2.0223, + "step": 2372 + }, + { + "epoch": 0.7202913947488238, + "grad_norm": 0.39915481209754944, + "learning_rate": 8.564398541919806e-05, + "loss": 1.9572, + "step": 2373 + }, + { + "epoch": 0.7205949309455153, + "grad_norm": 0.45898914337158203, + "learning_rate": 8.563791008505469e-05, + "loss": 1.5554, + "step": 2374 + }, + { + "epoch": 0.7208984671422067, + "grad_norm": 0.4385409951210022, + "learning_rate": 8.56318347509113e-05, + "loss": 1.7864, + "step": 2375 + }, + { + "epoch": 0.7212020033388982, + "grad_norm": 0.40655046701431274, + "learning_rate": 8.562575941676792e-05, + "loss": 1.9718, + "step": 2376 + }, + { + "epoch": 0.7215055395355896, + "grad_norm": 0.42865580320358276, + "learning_rate": 8.561968408262455e-05, + "loss": 1.6585, + "step": 2377 + }, + { + "epoch": 0.721809075732281, + "grad_norm": 1.0274362564086914, + "learning_rate": 8.561360874848117e-05, + "loss": 1.9557, + "step": 2378 + }, + { + "epoch": 0.7221126119289726, + "grad_norm": 0.8454954028129578, + "learning_rate": 8.560753341433779e-05, + "loss": 2.0079, + "step": 2379 + }, + { + "epoch": 0.722416148125664, + "grad_norm": 0.3799399733543396, + "learning_rate": 8.560145808019442e-05, + "loss": 1.9851, + "step": 2380 + }, + { + "epoch": 0.7227196843223554, + "grad_norm": 0.5621289610862732, + "learning_rate": 8.559538274605104e-05, + "loss": 1.6631, + "step": 2381 + }, + { + "epoch": 0.7230232205190469, + "grad_norm": 0.42442479729652405, + "learning_rate": 8.558930741190767e-05, + "loss": 1.9328, + "step": 2382 + }, + { + "epoch": 0.7233267567157383, + "grad_norm": 0.4831121265888214, + "learning_rate": 8.558323207776428e-05, + "loss": 1.6994, + "step": 2383 + }, + { + "epoch": 0.7236302929124299, + "grad_norm": 0.4605385363101959, + "learning_rate": 8.55771567436209e-05, + "loss": 2.0735, + "step": 2384 + }, + { + "epoch": 0.7239338291091213, + "grad_norm": 0.4393116235733032, + "learning_rate": 8.557108140947753e-05, + "loss": 1.6179, + "step": 2385 + }, + { + "epoch": 0.7242373653058127, + "grad_norm": 0.3323841392993927, + "learning_rate": 8.556500607533414e-05, + "loss": 1.7223, + "step": 2386 + }, + { + "epoch": 0.7245409015025042, + "grad_norm": 1.153462290763855, + "learning_rate": 8.555893074119077e-05, + "loss": 2.2449, + "step": 2387 + }, + { + "epoch": 0.7248444376991956, + "grad_norm": 0.4617941677570343, + "learning_rate": 8.55528554070474e-05, + "loss": 1.4432, + "step": 2388 + }, + { + "epoch": 0.725147973895887, + "grad_norm": 0.38924935460090637, + "learning_rate": 8.554678007290401e-05, + "loss": 2.1933, + "step": 2389 + }, + { + "epoch": 0.7254515100925786, + "grad_norm": 0.37328121066093445, + "learning_rate": 8.554070473876063e-05, + "loss": 1.5654, + "step": 2390 + }, + { + "epoch": 0.72575504628927, + "grad_norm": 0.46307137608528137, + "learning_rate": 8.553462940461726e-05, + "loss": 1.4233, + "step": 2391 + }, + { + "epoch": 0.7260585824859614, + "grad_norm": 0.39463040232658386, + "learning_rate": 8.552855407047388e-05, + "loss": 1.8745, + "step": 2392 + }, + { + "epoch": 0.7263621186826529, + "grad_norm": 0.6351356506347656, + "learning_rate": 8.55224787363305e-05, + "loss": 2.0999, + "step": 2393 + }, + { + "epoch": 0.7266656548793443, + "grad_norm": 0.446508526802063, + "learning_rate": 8.551640340218713e-05, + "loss": 2.0818, + "step": 2394 + }, + { + "epoch": 0.7269691910760359, + "grad_norm": 0.3539383113384247, + "learning_rate": 8.551032806804375e-05, + "loss": 1.3604, + "step": 2395 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.4133947789669037, + "learning_rate": 8.550425273390036e-05, + "loss": 1.5174, + "step": 2396 + }, + { + "epoch": 0.7275762634694187, + "grad_norm": 0.3807066082954407, + "learning_rate": 8.5498177399757e-05, + "loss": 1.8238, + "step": 2397 + }, + { + "epoch": 0.7278797996661102, + "grad_norm": 0.41087058186531067, + "learning_rate": 8.549210206561361e-05, + "loss": 1.9404, + "step": 2398 + }, + { + "epoch": 0.7281833358628016, + "grad_norm": 0.36707812547683716, + "learning_rate": 8.548602673147024e-05, + "loss": 1.8164, + "step": 2399 + }, + { + "epoch": 0.7284868720594931, + "grad_norm": 0.38733971118927, + "learning_rate": 8.547995139732685e-05, + "loss": 1.3954, + "step": 2400 + }, + { + "epoch": 0.7287904082561846, + "grad_norm": 0.41041603684425354, + "learning_rate": 8.547387606318348e-05, + "loss": 1.5569, + "step": 2401 + }, + { + "epoch": 0.729093944452876, + "grad_norm": 0.42836543917655945, + "learning_rate": 8.546780072904011e-05, + "loss": 1.7968, + "step": 2402 + }, + { + "epoch": 0.7293974806495674, + "grad_norm": 0.4246993660926819, + "learning_rate": 8.546172539489672e-05, + "loss": 2.1727, + "step": 2403 + }, + { + "epoch": 0.7297010168462589, + "grad_norm": 0.43355593085289, + "learning_rate": 8.545565006075334e-05, + "loss": 1.5563, + "step": 2404 + }, + { + "epoch": 0.7300045530429504, + "grad_norm": 0.39305025339126587, + "learning_rate": 8.544957472660997e-05, + "loss": 1.8375, + "step": 2405 + }, + { + "epoch": 0.7303080892396419, + "grad_norm": 0.44923126697540283, + "learning_rate": 8.544349939246659e-05, + "loss": 1.6066, + "step": 2406 + }, + { + "epoch": 0.7306116254363333, + "grad_norm": 0.41019386053085327, + "learning_rate": 8.543742405832321e-05, + "loss": 2.0204, + "step": 2407 + }, + { + "epoch": 0.7309151616330247, + "grad_norm": 0.4895036220550537, + "learning_rate": 8.543134872417984e-05, + "loss": 2.1129, + "step": 2408 + }, + { + "epoch": 0.7312186978297162, + "grad_norm": 0.4031083583831787, + "learning_rate": 8.542527339003646e-05, + "loss": 1.9971, + "step": 2409 + }, + { + "epoch": 0.7315222340264077, + "grad_norm": 0.40298768877983093, + "learning_rate": 8.541919805589307e-05, + "loss": 1.9922, + "step": 2410 + }, + { + "epoch": 0.7318257702230991, + "grad_norm": 0.41940340399742126, + "learning_rate": 8.54131227217497e-05, + "loss": 1.8683, + "step": 2411 + }, + { + "epoch": 0.7321293064197906, + "grad_norm": 0.4068038761615753, + "learning_rate": 8.540704738760632e-05, + "loss": 1.8514, + "step": 2412 + }, + { + "epoch": 0.732432842616482, + "grad_norm": 0.3992190361022949, + "learning_rate": 8.540097205346295e-05, + "loss": 1.734, + "step": 2413 + }, + { + "epoch": 0.7327363788131734, + "grad_norm": 0.35920289158821106, + "learning_rate": 8.539489671931956e-05, + "loss": 2.0869, + "step": 2414 + }, + { + "epoch": 0.7330399150098649, + "grad_norm": 0.41339996457099915, + "learning_rate": 8.538882138517619e-05, + "loss": 1.862, + "step": 2415 + }, + { + "epoch": 0.7333434512065564, + "grad_norm": 0.35875940322875977, + "learning_rate": 8.538274605103282e-05, + "loss": 1.9953, + "step": 2416 + }, + { + "epoch": 0.7336469874032479, + "grad_norm": 0.39455875754356384, + "learning_rate": 8.537667071688943e-05, + "loss": 1.4772, + "step": 2417 + }, + { + "epoch": 0.7339505235999393, + "grad_norm": 0.4024868905544281, + "learning_rate": 8.537059538274605e-05, + "loss": 1.9192, + "step": 2418 + }, + { + "epoch": 0.7342540597966307, + "grad_norm": 0.43624451756477356, + "learning_rate": 8.536452004860268e-05, + "loss": 1.7586, + "step": 2419 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.4356803596019745, + "learning_rate": 8.53584447144593e-05, + "loss": 1.3589, + "step": 2420 + }, + { + "epoch": 0.7348611321900137, + "grad_norm": 0.3844490051269531, + "learning_rate": 8.535236938031592e-05, + "loss": 1.7586, + "step": 2421 + }, + { + "epoch": 0.7351646683867051, + "grad_norm": 0.36453956365585327, + "learning_rate": 8.534629404617255e-05, + "loss": 1.9643, + "step": 2422 + }, + { + "epoch": 0.7354682045833966, + "grad_norm": 0.42973411083221436, + "learning_rate": 8.534021871202917e-05, + "loss": 2.0022, + "step": 2423 + }, + { + "epoch": 0.735771740780088, + "grad_norm": 0.4491013288497925, + "learning_rate": 8.533414337788578e-05, + "loss": 1.9658, + "step": 2424 + }, + { + "epoch": 0.7360752769767794, + "grad_norm": 0.3806130886077881, + "learning_rate": 8.532806804374241e-05, + "loss": 1.7092, + "step": 2425 + }, + { + "epoch": 0.736378813173471, + "grad_norm": 0.37530237436294556, + "learning_rate": 8.532199270959903e-05, + "loss": 2.1067, + "step": 2426 + }, + { + "epoch": 0.7366823493701624, + "grad_norm": 0.36951944231987, + "learning_rate": 8.531591737545566e-05, + "loss": 2.1429, + "step": 2427 + }, + { + "epoch": 0.7369858855668538, + "grad_norm": 0.8292801380157471, + "learning_rate": 8.530984204131227e-05, + "loss": 1.9387, + "step": 2428 + }, + { + "epoch": 0.7372894217635453, + "grad_norm": 0.3690939247608185, + "learning_rate": 8.53037667071689e-05, + "loss": 1.96, + "step": 2429 + }, + { + "epoch": 0.7375929579602367, + "grad_norm": 0.3507663905620575, + "learning_rate": 8.529769137302553e-05, + "loss": 1.8147, + "step": 2430 + }, + { + "epoch": 0.7378964941569283, + "grad_norm": 0.4241466820240021, + "learning_rate": 8.529161603888215e-05, + "loss": 1.1116, + "step": 2431 + }, + { + "epoch": 0.7382000303536197, + "grad_norm": 0.40038058161735535, + "learning_rate": 8.528554070473876e-05, + "loss": 1.9528, + "step": 2432 + }, + { + "epoch": 0.7385035665503111, + "grad_norm": 0.41025862097740173, + "learning_rate": 8.527946537059539e-05, + "loss": 1.8288, + "step": 2433 + }, + { + "epoch": 0.7388071027470026, + "grad_norm": 0.43207821249961853, + "learning_rate": 8.527339003645201e-05, + "loss": 1.986, + "step": 2434 + }, + { + "epoch": 0.739110638943694, + "grad_norm": 0.4291042983531952, + "learning_rate": 8.526731470230863e-05, + "loss": 2.0695, + "step": 2435 + }, + { + "epoch": 0.7394141751403855, + "grad_norm": 0.39197900891304016, + "learning_rate": 8.526123936816526e-05, + "loss": 1.5059, + "step": 2436 + }, + { + "epoch": 0.739717711337077, + "grad_norm": 0.5944773554801941, + "learning_rate": 8.525516403402188e-05, + "loss": 1.6354, + "step": 2437 + }, + { + "epoch": 0.7400212475337684, + "grad_norm": 0.42565345764160156, + "learning_rate": 8.52490886998785e-05, + "loss": 1.5772, + "step": 2438 + }, + { + "epoch": 0.7403247837304598, + "grad_norm": 0.4184707999229431, + "learning_rate": 8.524301336573512e-05, + "loss": 1.8781, + "step": 2439 + }, + { + "epoch": 0.7406283199271513, + "grad_norm": 0.36030882596969604, + "learning_rate": 8.523693803159174e-05, + "loss": 1.8788, + "step": 2440 + }, + { + "epoch": 0.7409318561238427, + "grad_norm": 0.4323141872882843, + "learning_rate": 8.523086269744837e-05, + "loss": 2.0321, + "step": 2441 + }, + { + "epoch": 0.7412353923205343, + "grad_norm": 0.4332966208457947, + "learning_rate": 8.522478736330498e-05, + "loss": 1.7093, + "step": 2442 + }, + { + "epoch": 0.7415389285172257, + "grad_norm": 0.4085337221622467, + "learning_rate": 8.521871202916161e-05, + "loss": 1.7086, + "step": 2443 + }, + { + "epoch": 0.7418424647139171, + "grad_norm": 0.4357088506221771, + "learning_rate": 8.521263669501824e-05, + "loss": 1.5345, + "step": 2444 + }, + { + "epoch": 0.7421460009106086, + "grad_norm": 0.40508776903152466, + "learning_rate": 8.520656136087484e-05, + "loss": 2.0369, + "step": 2445 + }, + { + "epoch": 0.7424495371073, + "grad_norm": 0.36506882309913635, + "learning_rate": 8.520048602673147e-05, + "loss": 1.4258, + "step": 2446 + }, + { + "epoch": 0.7427530733039915, + "grad_norm": 0.3771931827068329, + "learning_rate": 8.51944106925881e-05, + "loss": 1.1239, + "step": 2447 + }, + { + "epoch": 0.743056609500683, + "grad_norm": 0.4152052700519562, + "learning_rate": 8.518833535844472e-05, + "loss": 2.093, + "step": 2448 + }, + { + "epoch": 0.7433601456973744, + "grad_norm": 0.4168509244918823, + "learning_rate": 8.518226002430134e-05, + "loss": 1.9488, + "step": 2449 + }, + { + "epoch": 0.7436636818940658, + "grad_norm": 0.44399914145469666, + "learning_rate": 8.517618469015797e-05, + "loss": 1.8483, + "step": 2450 + }, + { + "epoch": 0.7439672180907573, + "grad_norm": 0.3898546099662781, + "learning_rate": 8.517010935601459e-05, + "loss": 1.7384, + "step": 2451 + }, + { + "epoch": 0.7442707542874488, + "grad_norm": 0.3657229542732239, + "learning_rate": 8.51640340218712e-05, + "loss": 1.9651, + "step": 2452 + }, + { + "epoch": 0.7445742904841403, + "grad_norm": 0.5163128972053528, + "learning_rate": 8.515795868772783e-05, + "loss": 1.679, + "step": 2453 + }, + { + "epoch": 0.7448778266808317, + "grad_norm": 0.8351776599884033, + "learning_rate": 8.515188335358445e-05, + "loss": 1.4447, + "step": 2454 + }, + { + "epoch": 0.7451813628775231, + "grad_norm": 0.4343299865722656, + "learning_rate": 8.514580801944108e-05, + "loss": 1.8703, + "step": 2455 + }, + { + "epoch": 0.7454848990742146, + "grad_norm": 0.3905276954174042, + "learning_rate": 8.513973268529769e-05, + "loss": 1.8941, + "step": 2456 + }, + { + "epoch": 0.7457884352709061, + "grad_norm": 0.475789338350296, + "learning_rate": 8.513365735115432e-05, + "loss": 1.629, + "step": 2457 + }, + { + "epoch": 0.7460919714675975, + "grad_norm": 0.3969273567199707, + "learning_rate": 8.512758201701095e-05, + "loss": 1.8827, + "step": 2458 + }, + { + "epoch": 0.746395507664289, + "grad_norm": 0.4705328643321991, + "learning_rate": 8.512150668286755e-05, + "loss": 1.4177, + "step": 2459 + }, + { + "epoch": 0.7466990438609804, + "grad_norm": 0.4193515181541443, + "learning_rate": 8.511543134872418e-05, + "loss": 2.0139, + "step": 2460 + }, + { + "epoch": 0.7470025800576718, + "grad_norm": 0.38317739963531494, + "learning_rate": 8.510935601458081e-05, + "loss": 2.119, + "step": 2461 + }, + { + "epoch": 0.7473061162543634, + "grad_norm": 0.39867040514945984, + "learning_rate": 8.510328068043743e-05, + "loss": 1.9218, + "step": 2462 + }, + { + "epoch": 0.7476096524510548, + "grad_norm": 0.5308038592338562, + "learning_rate": 8.509720534629405e-05, + "loss": 1.5898, + "step": 2463 + }, + { + "epoch": 0.7479131886477463, + "grad_norm": 0.45667675137519836, + "learning_rate": 8.509113001215068e-05, + "loss": 1.7705, + "step": 2464 + }, + { + "epoch": 0.7482167248444377, + "grad_norm": 1.480726718902588, + "learning_rate": 8.50850546780073e-05, + "loss": 2.1438, + "step": 2465 + }, + { + "epoch": 0.7485202610411291, + "grad_norm": 0.46620655059814453, + "learning_rate": 8.507897934386391e-05, + "loss": 1.8973, + "step": 2466 + }, + { + "epoch": 0.7488237972378207, + "grad_norm": 0.34710168838500977, + "learning_rate": 8.507290400972053e-05, + "loss": 1.3338, + "step": 2467 + }, + { + "epoch": 0.7491273334345121, + "grad_norm": 0.43097588419914246, + "learning_rate": 8.506682867557716e-05, + "loss": 1.7958, + "step": 2468 + }, + { + "epoch": 0.7494308696312035, + "grad_norm": 0.3998434245586395, + "learning_rate": 8.506075334143378e-05, + "loss": 1.7615, + "step": 2469 + }, + { + "epoch": 0.749734405827895, + "grad_norm": 0.39192789793014526, + "learning_rate": 8.50546780072904e-05, + "loss": 1.7179, + "step": 2470 + }, + { + "epoch": 0.7500379420245864, + "grad_norm": 0.4148361086845398, + "learning_rate": 8.504860267314703e-05, + "loss": 1.6477, + "step": 2471 + }, + { + "epoch": 0.7503414782212778, + "grad_norm": 0.5068510174751282, + "learning_rate": 8.504252733900366e-05, + "loss": 2.045, + "step": 2472 + }, + { + "epoch": 0.7506450144179694, + "grad_norm": 0.4798752963542938, + "learning_rate": 8.503645200486026e-05, + "loss": 1.7506, + "step": 2473 + }, + { + "epoch": 0.7509485506146608, + "grad_norm": 0.4444788992404938, + "learning_rate": 8.503037667071689e-05, + "loss": 1.894, + "step": 2474 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.39380598068237305, + "learning_rate": 8.502430133657352e-05, + "loss": 2.0365, + "step": 2475 + }, + { + "epoch": 0.7515556230080437, + "grad_norm": 0.38357478380203247, + "learning_rate": 8.501822600243014e-05, + "loss": 1.8282, + "step": 2476 + }, + { + "epoch": 0.7518591592047351, + "grad_norm": 0.47529253363609314, + "learning_rate": 8.501215066828676e-05, + "loss": 1.7783, + "step": 2477 + }, + { + "epoch": 0.7521626954014267, + "grad_norm": 0.3402441740036011, + "learning_rate": 8.500607533414339e-05, + "loss": 1.8927, + "step": 2478 + }, + { + "epoch": 0.7524662315981181, + "grad_norm": 0.36784660816192627, + "learning_rate": 8.5e-05, + "loss": 1.4411, + "step": 2479 + }, + { + "epoch": 0.7527697677948095, + "grad_norm": 0.42149028182029724, + "learning_rate": 8.499392466585662e-05, + "loss": 2.0137, + "step": 2480 + }, + { + "epoch": 0.753073303991501, + "grad_norm": 0.40183788537979126, + "learning_rate": 8.498784933171324e-05, + "loss": 1.6207, + "step": 2481 + }, + { + "epoch": 0.7533768401881924, + "grad_norm": 0.45237985253334045, + "learning_rate": 8.498177399756987e-05, + "loss": 2.2596, + "step": 2482 + }, + { + "epoch": 0.7536803763848839, + "grad_norm": 0.4847509562969208, + "learning_rate": 8.497569866342649e-05, + "loss": 1.6941, + "step": 2483 + }, + { + "epoch": 0.7539839125815754, + "grad_norm": 0.4311809837818146, + "learning_rate": 8.49696233292831e-05, + "loss": 2.0248, + "step": 2484 + }, + { + "epoch": 0.7542874487782668, + "grad_norm": 0.6543784141540527, + "learning_rate": 8.496354799513974e-05, + "loss": 1.9065, + "step": 2485 + }, + { + "epoch": 0.7545909849749582, + "grad_norm": 0.3486241102218628, + "learning_rate": 8.495747266099637e-05, + "loss": 1.5518, + "step": 2486 + }, + { + "epoch": 0.7548945211716497, + "grad_norm": 0.44317248463630676, + "learning_rate": 8.495139732685297e-05, + "loss": 1.9064, + "step": 2487 + }, + { + "epoch": 0.7551980573683412, + "grad_norm": 0.44157078862190247, + "learning_rate": 8.49453219927096e-05, + "loss": 1.7866, + "step": 2488 + }, + { + "epoch": 0.7555015935650327, + "grad_norm": 0.4338137209415436, + "learning_rate": 8.493924665856623e-05, + "loss": 1.9897, + "step": 2489 + }, + { + "epoch": 0.7558051297617241, + "grad_norm": 0.45171141624450684, + "learning_rate": 8.493317132442285e-05, + "loss": 1.9706, + "step": 2490 + }, + { + "epoch": 0.7561086659584155, + "grad_norm": 0.9964777231216431, + "learning_rate": 8.492709599027947e-05, + "loss": 2.1163, + "step": 2491 + }, + { + "epoch": 0.756412202155107, + "grad_norm": 0.39545395970344543, + "learning_rate": 8.49210206561361e-05, + "loss": 1.6514, + "step": 2492 + }, + { + "epoch": 0.7567157383517985, + "grad_norm": 0.4575003683567047, + "learning_rate": 8.491494532199272e-05, + "loss": 1.9118, + "step": 2493 + }, + { + "epoch": 0.7570192745484899, + "grad_norm": 0.4249429702758789, + "learning_rate": 8.490886998784933e-05, + "loss": 1.9342, + "step": 2494 + }, + { + "epoch": 0.7573228107451814, + "grad_norm": 0.4887460768222809, + "learning_rate": 8.490279465370595e-05, + "loss": 1.9421, + "step": 2495 + }, + { + "epoch": 0.7576263469418728, + "grad_norm": 0.41777387261390686, + "learning_rate": 8.489671931956258e-05, + "loss": 1.5985, + "step": 2496 + }, + { + "epoch": 0.7579298831385642, + "grad_norm": 1.7083243131637573, + "learning_rate": 8.48906439854192e-05, + "loss": 1.8387, + "step": 2497 + }, + { + "epoch": 0.7582334193352557, + "grad_norm": 0.39955195784568787, + "learning_rate": 8.488456865127582e-05, + "loss": 2.0833, + "step": 2498 + }, + { + "epoch": 0.7585369555319472, + "grad_norm": 1.4131972789764404, + "learning_rate": 8.487849331713245e-05, + "loss": 1.5318, + "step": 2499 + }, + { + "epoch": 0.7588404917286387, + "grad_norm": 0.7458001375198364, + "learning_rate": 8.487241798298908e-05, + "loss": 1.9582, + "step": 2500 + }, + { + "epoch": 0.7591440279253301, + "grad_norm": 0.4677983820438385, + "learning_rate": 8.486634264884568e-05, + "loss": 1.8781, + "step": 2501 + }, + { + "epoch": 0.7594475641220215, + "grad_norm": 0.4976421594619751, + "learning_rate": 8.486026731470231e-05, + "loss": 2.1458, + "step": 2502 + }, + { + "epoch": 0.759751100318713, + "grad_norm": 0.3829711675643921, + "learning_rate": 8.485419198055894e-05, + "loss": 1.9114, + "step": 2503 + }, + { + "epoch": 0.7600546365154045, + "grad_norm": 0.5295559167861938, + "learning_rate": 8.484811664641556e-05, + "loss": 2.0679, + "step": 2504 + }, + { + "epoch": 0.7603581727120959, + "grad_norm": 0.7929876446723938, + "learning_rate": 8.484204131227218e-05, + "loss": 2.2658, + "step": 2505 + }, + { + "epoch": 0.7606617089087874, + "grad_norm": 0.35055285692214966, + "learning_rate": 8.483596597812881e-05, + "loss": 1.9443, + "step": 2506 + }, + { + "epoch": 0.7609652451054788, + "grad_norm": 0.39741021394729614, + "learning_rate": 8.482989064398543e-05, + "loss": 1.6983, + "step": 2507 + }, + { + "epoch": 0.7612687813021702, + "grad_norm": 0.4206577241420746, + "learning_rate": 8.482381530984204e-05, + "loss": 1.8886, + "step": 2508 + }, + { + "epoch": 0.7615723174988618, + "grad_norm": 0.7343670129776001, + "learning_rate": 8.481773997569866e-05, + "loss": 1.9581, + "step": 2509 + }, + { + "epoch": 0.7618758536955532, + "grad_norm": 0.3836110532283783, + "learning_rate": 8.481166464155529e-05, + "loss": 1.8658, + "step": 2510 + }, + { + "epoch": 0.7621793898922447, + "grad_norm": 0.44783517718315125, + "learning_rate": 8.480558930741191e-05, + "loss": 1.8772, + "step": 2511 + }, + { + "epoch": 0.7624829260889361, + "grad_norm": 0.44204702973365784, + "learning_rate": 8.479951397326853e-05, + "loss": 1.8684, + "step": 2512 + }, + { + "epoch": 0.7627864622856275, + "grad_norm": 0.45162737369537354, + "learning_rate": 8.479343863912516e-05, + "loss": 1.817, + "step": 2513 + }, + { + "epoch": 0.7630899984823191, + "grad_norm": 0.36719024181365967, + "learning_rate": 8.478736330498179e-05, + "loss": 2.0087, + "step": 2514 + }, + { + "epoch": 0.7633935346790105, + "grad_norm": 0.3979268968105316, + "learning_rate": 8.478128797083839e-05, + "loss": 2.091, + "step": 2515 + }, + { + "epoch": 0.7636970708757019, + "grad_norm": 0.45267635583877563, + "learning_rate": 8.477521263669502e-05, + "loss": 1.2638, + "step": 2516 + }, + { + "epoch": 0.7640006070723934, + "grad_norm": 0.43147364258766174, + "learning_rate": 8.476913730255165e-05, + "loss": 2.1202, + "step": 2517 + }, + { + "epoch": 0.7643041432690848, + "grad_norm": 0.46071958541870117, + "learning_rate": 8.476306196840826e-05, + "loss": 1.9875, + "step": 2518 + }, + { + "epoch": 0.7646076794657763, + "grad_norm": 0.3662787973880768, + "learning_rate": 8.475698663426489e-05, + "loss": 2.0632, + "step": 2519 + }, + { + "epoch": 0.7649112156624678, + "grad_norm": 1.479319453239441, + "learning_rate": 8.475091130012152e-05, + "loss": 2.1759, + "step": 2520 + }, + { + "epoch": 0.7652147518591592, + "grad_norm": 0.38541749119758606, + "learning_rate": 8.474483596597814e-05, + "loss": 1.6204, + "step": 2521 + }, + { + "epoch": 0.7655182880558506, + "grad_norm": 0.39057788252830505, + "learning_rate": 8.473876063183475e-05, + "loss": 1.5414, + "step": 2522 + }, + { + "epoch": 0.7658218242525421, + "grad_norm": 0.37674304842948914, + "learning_rate": 8.473268529769137e-05, + "loss": 1.8496, + "step": 2523 + }, + { + "epoch": 0.7661253604492335, + "grad_norm": 0.4432341158390045, + "learning_rate": 8.4726609963548e-05, + "loss": 1.8541, + "step": 2524 + }, + { + "epoch": 0.7664288966459251, + "grad_norm": 0.3844713866710663, + "learning_rate": 8.472053462940462e-05, + "loss": 1.9793, + "step": 2525 + }, + { + "epoch": 0.7667324328426165, + "grad_norm": 0.37515100836753845, + "learning_rate": 8.471445929526124e-05, + "loss": 1.9751, + "step": 2526 + }, + { + "epoch": 0.7670359690393079, + "grad_norm": 0.36097854375839233, + "learning_rate": 8.470838396111787e-05, + "loss": 1.2523, + "step": 2527 + }, + { + "epoch": 0.7673395052359994, + "grad_norm": 0.3747158646583557, + "learning_rate": 8.47023086269745e-05, + "loss": 1.3921, + "step": 2528 + }, + { + "epoch": 0.7676430414326908, + "grad_norm": 0.4365270435810089, + "learning_rate": 8.46962332928311e-05, + "loss": 2.1177, + "step": 2529 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.39251551032066345, + "learning_rate": 8.469015795868773e-05, + "loss": 1.2146, + "step": 2530 + }, + { + "epoch": 0.7682501138260738, + "grad_norm": 0.4519220292568207, + "learning_rate": 8.468408262454436e-05, + "loss": 1.7527, + "step": 2531 + }, + { + "epoch": 0.7685536500227652, + "grad_norm": 0.436301589012146, + "learning_rate": 8.467800729040097e-05, + "loss": 1.8437, + "step": 2532 + }, + { + "epoch": 0.7688571862194566, + "grad_norm": 0.4134485125541687, + "learning_rate": 8.46719319562576e-05, + "loss": 1.9497, + "step": 2533 + }, + { + "epoch": 0.7691607224161481, + "grad_norm": 1.2619364261627197, + "learning_rate": 8.466585662211423e-05, + "loss": 2.0033, + "step": 2534 + }, + { + "epoch": 0.7694642586128396, + "grad_norm": 0.3872610926628113, + "learning_rate": 8.465978128797085e-05, + "loss": 1.5046, + "step": 2535 + }, + { + "epoch": 0.7697677948095311, + "grad_norm": 0.4039923846721649, + "learning_rate": 8.465370595382746e-05, + "loss": 1.8782, + "step": 2536 + }, + { + "epoch": 0.7700713310062225, + "grad_norm": 0.3826749622821808, + "learning_rate": 8.464763061968408e-05, + "loss": 1.9288, + "step": 2537 + }, + { + "epoch": 0.7703748672029139, + "grad_norm": 0.4400002062320709, + "learning_rate": 8.464155528554071e-05, + "loss": 2.2257, + "step": 2538 + }, + { + "epoch": 0.7706784033996054, + "grad_norm": 0.6801483631134033, + "learning_rate": 8.463547995139733e-05, + "loss": 1.5758, + "step": 2539 + }, + { + "epoch": 0.7709819395962969, + "grad_norm": 0.4618719816207886, + "learning_rate": 8.462940461725395e-05, + "loss": 1.5398, + "step": 2540 + }, + { + "epoch": 0.7712854757929883, + "grad_norm": 0.3891013562679291, + "learning_rate": 8.462332928311058e-05, + "loss": 1.7876, + "step": 2541 + }, + { + "epoch": 0.7715890119896798, + "grad_norm": 0.36486607789993286, + "learning_rate": 8.46172539489672e-05, + "loss": 1.4358, + "step": 2542 + }, + { + "epoch": 0.7718925481863712, + "grad_norm": 0.4325253665447235, + "learning_rate": 8.461117861482381e-05, + "loss": 1.8423, + "step": 2543 + }, + { + "epoch": 0.7721960843830626, + "grad_norm": 0.3679717481136322, + "learning_rate": 8.460510328068044e-05, + "loss": 1.2354, + "step": 2544 + }, + { + "epoch": 0.7724996205797542, + "grad_norm": 0.3954870402812958, + "learning_rate": 8.459902794653707e-05, + "loss": 1.915, + "step": 2545 + }, + { + "epoch": 0.7728031567764456, + "grad_norm": 0.3950810432434082, + "learning_rate": 8.459295261239368e-05, + "loss": 1.9933, + "step": 2546 + }, + { + "epoch": 0.773106692973137, + "grad_norm": 0.5841623544692993, + "learning_rate": 8.458687727825031e-05, + "loss": 1.5726, + "step": 2547 + }, + { + "epoch": 0.7734102291698285, + "grad_norm": 0.39942488074302673, + "learning_rate": 8.458080194410694e-05, + "loss": 1.8425, + "step": 2548 + }, + { + "epoch": 0.7737137653665199, + "grad_norm": 0.4704907238483429, + "learning_rate": 8.457472660996356e-05, + "loss": 2.2747, + "step": 2549 + }, + { + "epoch": 0.7740173015632115, + "grad_norm": 0.46205440163612366, + "learning_rate": 8.456865127582017e-05, + "loss": 1.8099, + "step": 2550 + }, + { + "epoch": 0.7743208377599029, + "grad_norm": 0.4484197497367859, + "learning_rate": 8.456257594167679e-05, + "loss": 1.3406, + "step": 2551 + }, + { + "epoch": 0.7746243739565943, + "grad_norm": 0.412507027387619, + "learning_rate": 8.455650060753342e-05, + "loss": 2.0005, + "step": 2552 + }, + { + "epoch": 0.7749279101532858, + "grad_norm": 0.4087753891944885, + "learning_rate": 8.455042527339004e-05, + "loss": 2.0239, + "step": 2553 + }, + { + "epoch": 0.7752314463499772, + "grad_norm": 0.5045974850654602, + "learning_rate": 8.454434993924666e-05, + "loss": 1.6854, + "step": 2554 + }, + { + "epoch": 0.7755349825466686, + "grad_norm": 0.40595903992652893, + "learning_rate": 8.453827460510329e-05, + "loss": 1.8671, + "step": 2555 + }, + { + "epoch": 0.7758385187433602, + "grad_norm": 0.3777897357940674, + "learning_rate": 8.45321992709599e-05, + "loss": 1.5473, + "step": 2556 + }, + { + "epoch": 0.7761420549400516, + "grad_norm": 0.8116908073425293, + "learning_rate": 8.452612393681652e-05, + "loss": 2.0338, + "step": 2557 + }, + { + "epoch": 0.776445591136743, + "grad_norm": 0.4156283736228943, + "learning_rate": 8.452004860267315e-05, + "loss": 1.9501, + "step": 2558 + }, + { + "epoch": 0.7767491273334345, + "grad_norm": 0.3934132158756256, + "learning_rate": 8.451397326852978e-05, + "loss": 1.8579, + "step": 2559 + }, + { + "epoch": 0.7770526635301259, + "grad_norm": 0.39838075637817383, + "learning_rate": 8.450789793438639e-05, + "loss": 1.5156, + "step": 2560 + }, + { + "epoch": 0.7773561997268175, + "grad_norm": 0.38777777552604675, + "learning_rate": 8.450182260024302e-05, + "loss": 1.7479, + "step": 2561 + }, + { + "epoch": 0.7776597359235089, + "grad_norm": 0.43510836362838745, + "learning_rate": 8.449574726609964e-05, + "loss": 1.6293, + "step": 2562 + }, + { + "epoch": 0.7779632721202003, + "grad_norm": 0.42916715145111084, + "learning_rate": 8.448967193195627e-05, + "loss": 1.8737, + "step": 2563 + }, + { + "epoch": 0.7782668083168918, + "grad_norm": 0.4157852232456207, + "learning_rate": 8.448359659781288e-05, + "loss": 1.5549, + "step": 2564 + }, + { + "epoch": 0.7785703445135832, + "grad_norm": 0.39083394408226013, + "learning_rate": 8.44775212636695e-05, + "loss": 1.7937, + "step": 2565 + }, + { + "epoch": 0.7788738807102747, + "grad_norm": 0.4398769438266754, + "learning_rate": 8.447144592952613e-05, + "loss": 1.8285, + "step": 2566 + }, + { + "epoch": 0.7791774169069662, + "grad_norm": 0.4085114598274231, + "learning_rate": 8.446537059538275e-05, + "loss": 1.835, + "step": 2567 + }, + { + "epoch": 0.7794809531036576, + "grad_norm": 0.44986245036125183, + "learning_rate": 8.445929526123937e-05, + "loss": 1.6762, + "step": 2568 + }, + { + "epoch": 0.779784489300349, + "grad_norm": 0.3613260090351105, + "learning_rate": 8.4453219927096e-05, + "loss": 1.5119, + "step": 2569 + }, + { + "epoch": 0.7800880254970405, + "grad_norm": 1.3417142629623413, + "learning_rate": 8.444714459295261e-05, + "loss": 1.7255, + "step": 2570 + }, + { + "epoch": 0.780391561693732, + "grad_norm": 0.3687633275985718, + "learning_rate": 8.444106925880923e-05, + "loss": 1.846, + "step": 2571 + }, + { + "epoch": 0.7806950978904235, + "grad_norm": 0.5285593867301941, + "learning_rate": 8.443499392466586e-05, + "loss": 1.8364, + "step": 2572 + }, + { + "epoch": 0.7809986340871149, + "grad_norm": 0.6644722819328308, + "learning_rate": 8.44289185905225e-05, + "loss": 2.0563, + "step": 2573 + }, + { + "epoch": 0.7813021702838063, + "grad_norm": 0.4382190704345703, + "learning_rate": 8.44228432563791e-05, + "loss": 1.8737, + "step": 2574 + }, + { + "epoch": 0.7816057064804978, + "grad_norm": 0.43860745429992676, + "learning_rate": 8.441676792223573e-05, + "loss": 1.7924, + "step": 2575 + }, + { + "epoch": 0.7819092426771893, + "grad_norm": 0.39125654101371765, + "learning_rate": 8.441069258809235e-05, + "loss": 1.159, + "step": 2576 + }, + { + "epoch": 0.7822127788738807, + "grad_norm": 0.4389365017414093, + "learning_rate": 8.440461725394898e-05, + "loss": 1.9478, + "step": 2577 + }, + { + "epoch": 0.7825163150705722, + "grad_norm": 0.46001267433166504, + "learning_rate": 8.43985419198056e-05, + "loss": 1.3252, + "step": 2578 + }, + { + "epoch": 0.7828198512672636, + "grad_norm": 0.36286357045173645, + "learning_rate": 8.439246658566221e-05, + "loss": 1.9519, + "step": 2579 + }, + { + "epoch": 0.783123387463955, + "grad_norm": 0.6641744375228882, + "learning_rate": 8.438639125151884e-05, + "loss": 1.9772, + "step": 2580 + }, + { + "epoch": 0.7834269236606465, + "grad_norm": 0.43126794695854187, + "learning_rate": 8.438031591737546e-05, + "loss": 1.7583, + "step": 2581 + }, + { + "epoch": 0.783730459857338, + "grad_norm": 0.4094223380088806, + "learning_rate": 8.437424058323208e-05, + "loss": 1.7251, + "step": 2582 + }, + { + "epoch": 0.7840339960540295, + "grad_norm": 0.34994634985923767, + "learning_rate": 8.436816524908871e-05, + "loss": 1.8317, + "step": 2583 + }, + { + "epoch": 0.7843375322507209, + "grad_norm": 0.3431306481361389, + "learning_rate": 8.436208991494532e-05, + "loss": 1.9223, + "step": 2584 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.43418046832084656, + "learning_rate": 8.435601458080194e-05, + "loss": 2.0232, + "step": 2585 + }, + { + "epoch": 0.7849446046441038, + "grad_norm": 0.3941698670387268, + "learning_rate": 8.434993924665857e-05, + "loss": 2.0736, + "step": 2586 + }, + { + "epoch": 0.7852481408407953, + "grad_norm": 0.9890521764755249, + "learning_rate": 8.43438639125152e-05, + "loss": 1.6363, + "step": 2587 + }, + { + "epoch": 0.7855516770374867, + "grad_norm": 0.3400576114654541, + "learning_rate": 8.433778857837181e-05, + "loss": 1.8934, + "step": 2588 + }, + { + "epoch": 0.7858552132341782, + "grad_norm": 0.4412512183189392, + "learning_rate": 8.433171324422844e-05, + "loss": 1.6729, + "step": 2589 + }, + { + "epoch": 0.7861587494308696, + "grad_norm": 0.7804542183876038, + "learning_rate": 8.432563791008506e-05, + "loss": 2.0693, + "step": 2590 + }, + { + "epoch": 0.786462285627561, + "grad_norm": 0.48216378688812256, + "learning_rate": 8.431956257594167e-05, + "loss": 1.622, + "step": 2591 + }, + { + "epoch": 0.7867658218242526, + "grad_norm": 0.4186027944087982, + "learning_rate": 8.43134872417983e-05, + "loss": 2.0402, + "step": 2592 + }, + { + "epoch": 0.787069358020944, + "grad_norm": 0.35439014434814453, + "learning_rate": 8.430741190765492e-05, + "loss": 2.0655, + "step": 2593 + }, + { + "epoch": 0.7873728942176355, + "grad_norm": 0.4577588140964508, + "learning_rate": 8.430133657351155e-05, + "loss": 1.203, + "step": 2594 + }, + { + "epoch": 0.7876764304143269, + "grad_norm": 0.42012783885002136, + "learning_rate": 8.429526123936817e-05, + "loss": 1.8616, + "step": 2595 + }, + { + "epoch": 0.7879799666110183, + "grad_norm": 0.3673686683177948, + "learning_rate": 8.428918590522479e-05, + "loss": 2.0879, + "step": 2596 + }, + { + "epoch": 0.7882835028077099, + "grad_norm": 0.4281460642814636, + "learning_rate": 8.428311057108142e-05, + "loss": 1.8722, + "step": 2597 + }, + { + "epoch": 0.7885870390044013, + "grad_norm": 0.432608962059021, + "learning_rate": 8.427703523693803e-05, + "loss": 1.7152, + "step": 2598 + }, + { + "epoch": 0.7888905752010927, + "grad_norm": 0.3567800521850586, + "learning_rate": 8.427095990279465e-05, + "loss": 1.534, + "step": 2599 + }, + { + "epoch": 0.7891941113977842, + "grad_norm": 0.383045494556427, + "learning_rate": 8.426488456865128e-05, + "loss": 1.7063, + "step": 2600 + }, + { + "epoch": 0.7894976475944756, + "grad_norm": 0.3540615141391754, + "learning_rate": 8.425880923450791e-05, + "loss": 1.6996, + "step": 2601 + }, + { + "epoch": 0.7898011837911671, + "grad_norm": 0.44690582156181335, + "learning_rate": 8.425273390036452e-05, + "loss": 1.9774, + "step": 2602 + }, + { + "epoch": 0.7901047199878586, + "grad_norm": 0.49040475487709045, + "learning_rate": 8.424665856622115e-05, + "loss": 2.1205, + "step": 2603 + }, + { + "epoch": 0.79040825618455, + "grad_norm": 0.4510320723056793, + "learning_rate": 8.424058323207777e-05, + "loss": 2.0409, + "step": 2604 + }, + { + "epoch": 0.7907117923812415, + "grad_norm": 0.41996338963508606, + "learning_rate": 8.423450789793438e-05, + "loss": 1.7738, + "step": 2605 + }, + { + "epoch": 0.7910153285779329, + "grad_norm": 0.4076237976551056, + "learning_rate": 8.422843256379101e-05, + "loss": 2.1186, + "step": 2606 + }, + { + "epoch": 0.7913188647746243, + "grad_norm": 0.4231566786766052, + "learning_rate": 8.422235722964763e-05, + "loss": 1.8912, + "step": 2607 + }, + { + "epoch": 0.7916224009713159, + "grad_norm": 0.6864861249923706, + "learning_rate": 8.421628189550426e-05, + "loss": 1.9234, + "step": 2608 + }, + { + "epoch": 0.7919259371680073, + "grad_norm": 0.3856845796108246, + "learning_rate": 8.421020656136088e-05, + "loss": 2.0214, + "step": 2609 + }, + { + "epoch": 0.7922294733646987, + "grad_norm": 0.39030376076698303, + "learning_rate": 8.42041312272175e-05, + "loss": 1.9272, + "step": 2610 + }, + { + "epoch": 0.7925330095613902, + "grad_norm": 0.407306432723999, + "learning_rate": 8.419805589307413e-05, + "loss": 1.9412, + "step": 2611 + }, + { + "epoch": 0.7928365457580816, + "grad_norm": 0.41559773683547974, + "learning_rate": 8.419198055893074e-05, + "loss": 1.7524, + "step": 2612 + }, + { + "epoch": 0.7931400819547731, + "grad_norm": 0.382524311542511, + "learning_rate": 8.418590522478736e-05, + "loss": 2.0343, + "step": 2613 + }, + { + "epoch": 0.7934436181514646, + "grad_norm": 0.42608487606048584, + "learning_rate": 8.417982989064399e-05, + "loss": 1.9188, + "step": 2614 + }, + { + "epoch": 0.793747154348156, + "grad_norm": 0.3967500329017639, + "learning_rate": 8.417375455650061e-05, + "loss": 1.8737, + "step": 2615 + }, + { + "epoch": 0.7940506905448474, + "grad_norm": 0.493898868560791, + "learning_rate": 8.416767922235723e-05, + "loss": 1.8199, + "step": 2616 + }, + { + "epoch": 0.7943542267415389, + "grad_norm": 0.5007172226905823, + "learning_rate": 8.416160388821386e-05, + "loss": 2.0033, + "step": 2617 + }, + { + "epoch": 0.7946577629382304, + "grad_norm": 0.4337385594844818, + "learning_rate": 8.415552855407048e-05, + "loss": 2.1693, + "step": 2618 + }, + { + "epoch": 0.7949612991349219, + "grad_norm": 0.40338581800460815, + "learning_rate": 8.414945321992709e-05, + "loss": 1.9985, + "step": 2619 + }, + { + "epoch": 0.7952648353316133, + "grad_norm": 0.3842269778251648, + "learning_rate": 8.414337788578372e-05, + "loss": 1.5293, + "step": 2620 + }, + { + "epoch": 0.7955683715283047, + "grad_norm": 0.35648632049560547, + "learning_rate": 8.413730255164034e-05, + "loss": 1.9728, + "step": 2621 + }, + { + "epoch": 0.7958719077249962, + "grad_norm": 0.4350222945213318, + "learning_rate": 8.413122721749697e-05, + "loss": 1.3873, + "step": 2622 + }, + { + "epoch": 0.7961754439216877, + "grad_norm": 0.605980634689331, + "learning_rate": 8.412515188335359e-05, + "loss": 2.052, + "step": 2623 + }, + { + "epoch": 0.7964789801183791, + "grad_norm": 0.6555821299552917, + "learning_rate": 8.41190765492102e-05, + "loss": 2.146, + "step": 2624 + }, + { + "epoch": 0.7967825163150706, + "grad_norm": 0.42681270837783813, + "learning_rate": 8.411300121506684e-05, + "loss": 1.0012, + "step": 2625 + }, + { + "epoch": 0.797086052511762, + "grad_norm": 0.43222132325172424, + "learning_rate": 8.410692588092345e-05, + "loss": 2.1722, + "step": 2626 + }, + { + "epoch": 0.7973895887084534, + "grad_norm": 0.40917056798934937, + "learning_rate": 8.410085054678007e-05, + "loss": 2.1489, + "step": 2627 + }, + { + "epoch": 0.797693124905145, + "grad_norm": 0.4139658212661743, + "learning_rate": 8.40947752126367e-05, + "loss": 2.0852, + "step": 2628 + }, + { + "epoch": 0.7979966611018364, + "grad_norm": 1.7534079551696777, + "learning_rate": 8.408869987849332e-05, + "loss": 1.7122, + "step": 2629 + }, + { + "epoch": 0.7983001972985279, + "grad_norm": 0.37330594658851624, + "learning_rate": 8.408262454434994e-05, + "loss": 1.6402, + "step": 2630 + }, + { + "epoch": 0.7986037334952193, + "grad_norm": 0.7637292742729187, + "learning_rate": 8.407654921020657e-05, + "loss": 1.6067, + "step": 2631 + }, + { + "epoch": 0.7989072696919107, + "grad_norm": 0.48156240582466125, + "learning_rate": 8.407047387606319e-05, + "loss": 1.8114, + "step": 2632 + }, + { + "epoch": 0.7992108058886022, + "grad_norm": 0.3753802180290222, + "learning_rate": 8.40643985419198e-05, + "loss": 1.6239, + "step": 2633 + }, + { + "epoch": 0.7995143420852937, + "grad_norm": 0.4283507764339447, + "learning_rate": 8.405832320777643e-05, + "loss": 2.0778, + "step": 2634 + }, + { + "epoch": 0.7998178782819851, + "grad_norm": 0.3911525309085846, + "learning_rate": 8.405224787363305e-05, + "loss": 1.9279, + "step": 2635 + }, + { + "epoch": 0.8001214144786766, + "grad_norm": 0.4033350646495819, + "learning_rate": 8.404617253948968e-05, + "loss": 1.3199, + "step": 2636 + }, + { + "epoch": 0.800424950675368, + "grad_norm": 0.398269921541214, + "learning_rate": 8.40400972053463e-05, + "loss": 2.0073, + "step": 2637 + }, + { + "epoch": 0.8007284868720594, + "grad_norm": 0.5763013362884521, + "learning_rate": 8.403402187120292e-05, + "loss": 1.517, + "step": 2638 + }, + { + "epoch": 0.801032023068751, + "grad_norm": 0.4008599817752838, + "learning_rate": 8.402794653705955e-05, + "loss": 1.6136, + "step": 2639 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.39726853370666504, + "learning_rate": 8.402187120291616e-05, + "loss": 1.8454, + "step": 2640 + }, + { + "epoch": 0.8016390954621339, + "grad_norm": 0.4768484830856323, + "learning_rate": 8.401579586877278e-05, + "loss": 1.4961, + "step": 2641 + }, + { + "epoch": 0.8019426316588253, + "grad_norm": 0.4185827374458313, + "learning_rate": 8.400972053462941e-05, + "loss": 1.8617, + "step": 2642 + }, + { + "epoch": 0.8022461678555167, + "grad_norm": 0.43734210729599, + "learning_rate": 8.400364520048603e-05, + "loss": 1.4771, + "step": 2643 + }, + { + "epoch": 0.8025497040522083, + "grad_norm": 0.4270019233226776, + "learning_rate": 8.399756986634265e-05, + "loss": 1.8692, + "step": 2644 + }, + { + "epoch": 0.8028532402488997, + "grad_norm": 0.35486480593681335, + "learning_rate": 8.399149453219928e-05, + "loss": 1.8821, + "step": 2645 + }, + { + "epoch": 0.8031567764455911, + "grad_norm": 0.9134595394134521, + "learning_rate": 8.39854191980559e-05, + "loss": 1.8291, + "step": 2646 + }, + { + "epoch": 0.8034603126422826, + "grad_norm": 0.43372470140457153, + "learning_rate": 8.397934386391251e-05, + "loss": 2.0605, + "step": 2647 + }, + { + "epoch": 0.803763848838974, + "grad_norm": 0.39876699447631836, + "learning_rate": 8.397326852976914e-05, + "loss": 2.2645, + "step": 2648 + }, + { + "epoch": 0.8040673850356655, + "grad_norm": 0.39235416054725647, + "learning_rate": 8.396719319562576e-05, + "loss": 1.9287, + "step": 2649 + }, + { + "epoch": 0.804370921232357, + "grad_norm": 0.3264532685279846, + "learning_rate": 8.396111786148239e-05, + "loss": 1.856, + "step": 2650 + }, + { + "epoch": 0.8046744574290484, + "grad_norm": 0.4189594089984894, + "learning_rate": 8.395504252733901e-05, + "loss": 2.0628, + "step": 2651 + }, + { + "epoch": 0.8049779936257399, + "grad_norm": 0.3941250145435333, + "learning_rate": 8.394896719319563e-05, + "loss": 1.7251, + "step": 2652 + }, + { + "epoch": 0.8052815298224313, + "grad_norm": 0.42837756872177124, + "learning_rate": 8.394289185905226e-05, + "loss": 1.2304, + "step": 2653 + }, + { + "epoch": 0.8055850660191228, + "grad_norm": 0.8526172637939453, + "learning_rate": 8.393681652490887e-05, + "loss": 2.0248, + "step": 2654 + }, + { + "epoch": 0.8058886022158143, + "grad_norm": 0.36125120520591736, + "learning_rate": 8.393074119076549e-05, + "loss": 1.522, + "step": 2655 + }, + { + "epoch": 0.8061921384125057, + "grad_norm": 0.34955886006355286, + "learning_rate": 8.392466585662212e-05, + "loss": 1.5399, + "step": 2656 + }, + { + "epoch": 0.8064956746091971, + "grad_norm": 0.42194581031799316, + "learning_rate": 8.391859052247874e-05, + "loss": 1.5088, + "step": 2657 + }, + { + "epoch": 0.8067992108058886, + "grad_norm": 0.41130530834198, + "learning_rate": 8.391251518833536e-05, + "loss": 1.9864, + "step": 2658 + }, + { + "epoch": 0.8071027470025801, + "grad_norm": 0.3659766614437103, + "learning_rate": 8.390643985419199e-05, + "loss": 1.3164, + "step": 2659 + }, + { + "epoch": 0.8074062831992715, + "grad_norm": 0.4178526997566223, + "learning_rate": 8.39003645200486e-05, + "loss": 2.096, + "step": 2660 + }, + { + "epoch": 0.807709819395963, + "grad_norm": 0.44985684752464294, + "learning_rate": 8.389428918590522e-05, + "loss": 1.5053, + "step": 2661 + }, + { + "epoch": 0.8080133555926544, + "grad_norm": 0.5702995657920837, + "learning_rate": 8.388821385176185e-05, + "loss": 1.745, + "step": 2662 + }, + { + "epoch": 0.8083168917893458, + "grad_norm": 0.5479261875152588, + "learning_rate": 8.388213851761847e-05, + "loss": 1.6453, + "step": 2663 + }, + { + "epoch": 0.8086204279860373, + "grad_norm": 0.5145617723464966, + "learning_rate": 8.387606318347509e-05, + "loss": 1.3749, + "step": 2664 + }, + { + "epoch": 0.8089239641827288, + "grad_norm": 0.3433884084224701, + "learning_rate": 8.386998784933172e-05, + "loss": 1.8217, + "step": 2665 + }, + { + "epoch": 0.8092275003794203, + "grad_norm": 0.4309893548488617, + "learning_rate": 8.386391251518834e-05, + "loss": 1.8368, + "step": 2666 + }, + { + "epoch": 0.8095310365761117, + "grad_norm": 0.42593100666999817, + "learning_rate": 8.385783718104497e-05, + "loss": 1.897, + "step": 2667 + }, + { + "epoch": 0.8098345727728031, + "grad_norm": 0.3921912908554077, + "learning_rate": 8.385176184690159e-05, + "loss": 1.8504, + "step": 2668 + }, + { + "epoch": 0.8101381089694946, + "grad_norm": 0.4481246769428253, + "learning_rate": 8.38456865127582e-05, + "loss": 2.2592, + "step": 2669 + }, + { + "epoch": 0.8104416451661861, + "grad_norm": 0.3490237891674042, + "learning_rate": 8.383961117861483e-05, + "loss": 1.792, + "step": 2670 + }, + { + "epoch": 0.8107451813628775, + "grad_norm": 0.4693361818790436, + "learning_rate": 8.383353584447145e-05, + "loss": 2.0438, + "step": 2671 + }, + { + "epoch": 0.811048717559569, + "grad_norm": 0.3441024720668793, + "learning_rate": 8.382746051032807e-05, + "loss": 1.7983, + "step": 2672 + }, + { + "epoch": 0.8113522537562604, + "grad_norm": 0.4398588240146637, + "learning_rate": 8.38213851761847e-05, + "loss": 1.9503, + "step": 2673 + }, + { + "epoch": 0.8116557899529518, + "grad_norm": 0.36766424775123596, + "learning_rate": 8.381530984204132e-05, + "loss": 1.9745, + "step": 2674 + }, + { + "epoch": 0.8119593261496434, + "grad_norm": 0.4529463052749634, + "learning_rate": 8.380923450789793e-05, + "loss": 1.9025, + "step": 2675 + }, + { + "epoch": 0.8122628623463348, + "grad_norm": 0.4247633218765259, + "learning_rate": 8.380315917375456e-05, + "loss": 1.9669, + "step": 2676 + }, + { + "epoch": 0.8125663985430263, + "grad_norm": 0.39208900928497314, + "learning_rate": 8.379708383961118e-05, + "loss": 1.8593, + "step": 2677 + }, + { + "epoch": 0.8128699347397177, + "grad_norm": 0.46601489186286926, + "learning_rate": 8.37910085054678e-05, + "loss": 1.8102, + "step": 2678 + }, + { + "epoch": 0.8131734709364091, + "grad_norm": 0.4215412139892578, + "learning_rate": 8.378493317132443e-05, + "loss": 2.1686, + "step": 2679 + }, + { + "epoch": 0.8134770071331007, + "grad_norm": 0.4089846909046173, + "learning_rate": 8.377885783718105e-05, + "loss": 1.8722, + "step": 2680 + }, + { + "epoch": 0.8137805433297921, + "grad_norm": 0.43888887763023376, + "learning_rate": 8.377278250303768e-05, + "loss": 1.7578, + "step": 2681 + }, + { + "epoch": 0.8140840795264835, + "grad_norm": 0.6995130777359009, + "learning_rate": 8.37667071688943e-05, + "loss": 1.7296, + "step": 2682 + }, + { + "epoch": 0.814387615723175, + "grad_norm": 0.4700230360031128, + "learning_rate": 8.376063183475091e-05, + "loss": 1.3055, + "step": 2683 + }, + { + "epoch": 0.8146911519198664, + "grad_norm": 0.4099135398864746, + "learning_rate": 8.375455650060754e-05, + "loss": 1.9858, + "step": 2684 + }, + { + "epoch": 0.814994688116558, + "grad_norm": 0.44735094904899597, + "learning_rate": 8.374848116646416e-05, + "loss": 1.9615, + "step": 2685 + }, + { + "epoch": 0.8152982243132494, + "grad_norm": 0.6808655858039856, + "learning_rate": 8.374240583232078e-05, + "loss": 1.7893, + "step": 2686 + }, + { + "epoch": 0.8156017605099408, + "grad_norm": 0.4560304880142212, + "learning_rate": 8.373633049817741e-05, + "loss": 1.7048, + "step": 2687 + }, + { + "epoch": 0.8159052967066323, + "grad_norm": 0.4043562114238739, + "learning_rate": 8.373025516403403e-05, + "loss": 2.0911, + "step": 2688 + }, + { + "epoch": 0.8162088329033237, + "grad_norm": 0.40530329942703247, + "learning_rate": 8.372417982989064e-05, + "loss": 1.8597, + "step": 2689 + }, + { + "epoch": 0.8165123691000151, + "grad_norm": 0.42696669697761536, + "learning_rate": 8.371810449574727e-05, + "loss": 1.354, + "step": 2690 + }, + { + "epoch": 0.8168159052967067, + "grad_norm": 0.411856472492218, + "learning_rate": 8.371202916160389e-05, + "loss": 1.8913, + "step": 2691 + }, + { + "epoch": 0.8171194414933981, + "grad_norm": 1.3200637102127075, + "learning_rate": 8.370595382746051e-05, + "loss": 1.9903, + "step": 2692 + }, + { + "epoch": 0.8174229776900895, + "grad_norm": 0.4155752956867218, + "learning_rate": 8.369987849331714e-05, + "loss": 2.036, + "step": 2693 + }, + { + "epoch": 0.817726513886781, + "grad_norm": 0.4618370234966278, + "learning_rate": 8.369380315917376e-05, + "loss": 1.8101, + "step": 2694 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.338123619556427, + "learning_rate": 8.368772782503039e-05, + "loss": 1.7934, + "step": 2695 + }, + { + "epoch": 0.818333586280164, + "grad_norm": 0.4331413507461548, + "learning_rate": 8.3681652490887e-05, + "loss": 1.7394, + "step": 2696 + }, + { + "epoch": 0.8186371224768554, + "grad_norm": 0.3667849004268646, + "learning_rate": 8.367557715674362e-05, + "loss": 1.9162, + "step": 2697 + }, + { + "epoch": 0.8189406586735468, + "grad_norm": 0.4584942162036896, + "learning_rate": 8.366950182260025e-05, + "loss": 1.801, + "step": 2698 + }, + { + "epoch": 0.8192441948702383, + "grad_norm": 0.4310884475708008, + "learning_rate": 8.366342648845687e-05, + "loss": 1.952, + "step": 2699 + }, + { + "epoch": 0.8195477310669297, + "grad_norm": 0.35577401518821716, + "learning_rate": 8.365735115431349e-05, + "loss": 2.0338, + "step": 2700 + }, + { + "epoch": 0.8198512672636212, + "grad_norm": 0.4453931152820587, + "learning_rate": 8.365127582017012e-05, + "loss": 1.777, + "step": 2701 + }, + { + "epoch": 0.8201548034603127, + "grad_norm": 0.4156850576400757, + "learning_rate": 8.364520048602674e-05, + "loss": 1.8725, + "step": 2702 + }, + { + "epoch": 0.8204583396570041, + "grad_norm": 0.3999830186367035, + "learning_rate": 8.363912515188335e-05, + "loss": 2.0136, + "step": 2703 + }, + { + "epoch": 0.8207618758536955, + "grad_norm": 0.4082907736301422, + "learning_rate": 8.363304981773998e-05, + "loss": 1.9336, + "step": 2704 + }, + { + "epoch": 0.821065412050387, + "grad_norm": 0.41365379095077515, + "learning_rate": 8.36269744835966e-05, + "loss": 1.8598, + "step": 2705 + }, + { + "epoch": 0.8213689482470785, + "grad_norm": 0.4504840075969696, + "learning_rate": 8.362089914945322e-05, + "loss": 2.0346, + "step": 2706 + }, + { + "epoch": 0.82167248444377, + "grad_norm": 0.5199129581451416, + "learning_rate": 8.361482381530985e-05, + "loss": 1.3809, + "step": 2707 + }, + { + "epoch": 0.8219760206404614, + "grad_norm": 0.316165030002594, + "learning_rate": 8.360874848116647e-05, + "loss": 1.6682, + "step": 2708 + }, + { + "epoch": 0.8222795568371528, + "grad_norm": 0.434994637966156, + "learning_rate": 8.36026731470231e-05, + "loss": 1.9569, + "step": 2709 + }, + { + "epoch": 0.8225830930338442, + "grad_norm": 0.4767877757549286, + "learning_rate": 8.359659781287972e-05, + "loss": 2.0574, + "step": 2710 + }, + { + "epoch": 0.8228866292305358, + "grad_norm": 0.36715012788772583, + "learning_rate": 8.359052247873633e-05, + "loss": 1.5894, + "step": 2711 + }, + { + "epoch": 0.8231901654272272, + "grad_norm": 0.47204360365867615, + "learning_rate": 8.358444714459296e-05, + "loss": 2.0371, + "step": 2712 + }, + { + "epoch": 0.8234937016239187, + "grad_norm": 0.4339917302131653, + "learning_rate": 8.357837181044958e-05, + "loss": 1.6804, + "step": 2713 + }, + { + "epoch": 0.8237972378206101, + "grad_norm": 0.40211397409439087, + "learning_rate": 8.35722964763062e-05, + "loss": 1.8478, + "step": 2714 + }, + { + "epoch": 0.8241007740173015, + "grad_norm": 0.4626907408237457, + "learning_rate": 8.356622114216283e-05, + "loss": 1.0077, + "step": 2715 + }, + { + "epoch": 0.824404310213993, + "grad_norm": 0.5032857060432434, + "learning_rate": 8.356014580801945e-05, + "loss": 1.4585, + "step": 2716 + }, + { + "epoch": 0.8247078464106845, + "grad_norm": 0.7893280982971191, + "learning_rate": 8.355407047387606e-05, + "loss": 1.7855, + "step": 2717 + }, + { + "epoch": 0.8250113826073759, + "grad_norm": 0.4199140965938568, + "learning_rate": 8.35479951397327e-05, + "loss": 2.036, + "step": 2718 + }, + { + "epoch": 0.8253149188040674, + "grad_norm": 0.4148293137550354, + "learning_rate": 8.354191980558931e-05, + "loss": 1.6445, + "step": 2719 + }, + { + "epoch": 0.8256184550007588, + "grad_norm": 0.40081986784935, + "learning_rate": 8.353584447144593e-05, + "loss": 1.8132, + "step": 2720 + }, + { + "epoch": 0.8259219911974502, + "grad_norm": 0.38736727833747864, + "learning_rate": 8.352976913730256e-05, + "loss": 2.1591, + "step": 2721 + }, + { + "epoch": 0.8262255273941418, + "grad_norm": 0.39787808060646057, + "learning_rate": 8.352369380315918e-05, + "loss": 1.986, + "step": 2722 + }, + { + "epoch": 0.8265290635908332, + "grad_norm": 0.4428958594799042, + "learning_rate": 8.351761846901581e-05, + "loss": 2.0022, + "step": 2723 + }, + { + "epoch": 0.8268325997875247, + "grad_norm": 0.4137169420719147, + "learning_rate": 8.351154313487243e-05, + "loss": 1.647, + "step": 2724 + }, + { + "epoch": 0.8271361359842161, + "grad_norm": 0.3762650489807129, + "learning_rate": 8.350546780072904e-05, + "loss": 1.6598, + "step": 2725 + }, + { + "epoch": 0.8274396721809075, + "grad_norm": 0.4346376955509186, + "learning_rate": 8.349939246658567e-05, + "loss": 1.9286, + "step": 2726 + }, + { + "epoch": 0.8277432083775991, + "grad_norm": 0.4809822142124176, + "learning_rate": 8.349331713244228e-05, + "loss": 1.7861, + "step": 2727 + }, + { + "epoch": 0.8280467445742905, + "grad_norm": 0.4233179986476898, + "learning_rate": 8.348724179829891e-05, + "loss": 1.4572, + "step": 2728 + }, + { + "epoch": 0.8283502807709819, + "grad_norm": 0.4098990261554718, + "learning_rate": 8.348116646415554e-05, + "loss": 1.5889, + "step": 2729 + }, + { + "epoch": 0.8286538169676734, + "grad_norm": 0.5033220052719116, + "learning_rate": 8.347509113001216e-05, + "loss": 2.0879, + "step": 2730 + }, + { + "epoch": 0.8289573531643648, + "grad_norm": 0.4248674809932709, + "learning_rate": 8.346901579586877e-05, + "loss": 1.113, + "step": 2731 + }, + { + "epoch": 0.8292608893610564, + "grad_norm": 0.4501001834869385, + "learning_rate": 8.34629404617254e-05, + "loss": 1.984, + "step": 2732 + }, + { + "epoch": 0.8295644255577478, + "grad_norm": 0.4608478844165802, + "learning_rate": 8.345686512758202e-05, + "loss": 1.692, + "step": 2733 + }, + { + "epoch": 0.8298679617544392, + "grad_norm": 0.4299629330635071, + "learning_rate": 8.345078979343864e-05, + "loss": 2.0634, + "step": 2734 + }, + { + "epoch": 0.8301714979511307, + "grad_norm": 0.4118325412273407, + "learning_rate": 8.344471445929527e-05, + "loss": 1.9081, + "step": 2735 + }, + { + "epoch": 0.8304750341478221, + "grad_norm": 0.5083432793617249, + "learning_rate": 8.343863912515189e-05, + "loss": 2.1141, + "step": 2736 + }, + { + "epoch": 0.8307785703445136, + "grad_norm": 0.7300907373428345, + "learning_rate": 8.34325637910085e-05, + "loss": 1.3025, + "step": 2737 + }, + { + "epoch": 0.8310821065412051, + "grad_norm": 0.4016704261302948, + "learning_rate": 8.342648845686512e-05, + "loss": 1.8973, + "step": 2738 + }, + { + "epoch": 0.8313856427378965, + "grad_norm": 0.4292236268520355, + "learning_rate": 8.342041312272175e-05, + "loss": 1.9909, + "step": 2739 + }, + { + "epoch": 0.8316891789345879, + "grad_norm": 0.4190838634967804, + "learning_rate": 8.341433778857838e-05, + "loss": 1.8034, + "step": 2740 + }, + { + "epoch": 0.8319927151312794, + "grad_norm": 0.4143367111682892, + "learning_rate": 8.340826245443499e-05, + "loss": 2.0459, + "step": 2741 + }, + { + "epoch": 0.8322962513279709, + "grad_norm": 0.46704939007759094, + "learning_rate": 8.340218712029162e-05, + "loss": 1.6579, + "step": 2742 + }, + { + "epoch": 0.8325997875246623, + "grad_norm": 0.48142144083976746, + "learning_rate": 8.339611178614825e-05, + "loss": 1.8507, + "step": 2743 + }, + { + "epoch": 0.8329033237213538, + "grad_norm": 0.42653772234916687, + "learning_rate": 8.339003645200487e-05, + "loss": 1.9661, + "step": 2744 + }, + { + "epoch": 0.8332068599180452, + "grad_norm": 0.42195385694503784, + "learning_rate": 8.338396111786148e-05, + "loss": 1.583, + "step": 2745 + }, + { + "epoch": 0.8335103961147367, + "grad_norm": 0.5214222073554993, + "learning_rate": 8.337788578371811e-05, + "loss": 0.8356, + "step": 2746 + }, + { + "epoch": 0.8338139323114281, + "grad_norm": 0.4736870229244232, + "learning_rate": 8.337181044957473e-05, + "loss": 2.1115, + "step": 2747 + }, + { + "epoch": 0.8341174685081196, + "grad_norm": 0.4879785180091858, + "learning_rate": 8.336573511543135e-05, + "loss": 2.1468, + "step": 2748 + }, + { + "epoch": 0.8344210047048111, + "grad_norm": 0.33596518635749817, + "learning_rate": 8.335965978128798e-05, + "loss": 1.9451, + "step": 2749 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.3724137246608734, + "learning_rate": 8.33535844471446e-05, + "loss": 1.4332, + "step": 2750 + }, + { + "epoch": 0.8350280770981939, + "grad_norm": 0.41488635540008545, + "learning_rate": 8.334750911300121e-05, + "loss": 2.245, + "step": 2751 + }, + { + "epoch": 0.8353316132948854, + "grad_norm": 0.41388005018234253, + "learning_rate": 8.334143377885783e-05, + "loss": 2.0323, + "step": 2752 + }, + { + "epoch": 0.8356351494915769, + "grad_norm": 0.8086270093917847, + "learning_rate": 8.333535844471446e-05, + "loss": 1.9422, + "step": 2753 + }, + { + "epoch": 0.8359386856882683, + "grad_norm": 0.3645714223384857, + "learning_rate": 8.33292831105711e-05, + "loss": 1.7161, + "step": 2754 + }, + { + "epoch": 0.8362422218849598, + "grad_norm": 0.36916327476501465, + "learning_rate": 8.33232077764277e-05, + "loss": 1.7339, + "step": 2755 + }, + { + "epoch": 0.8365457580816512, + "grad_norm": 0.3351556956768036, + "learning_rate": 8.331713244228433e-05, + "loss": 1.5955, + "step": 2756 + }, + { + "epoch": 0.8368492942783426, + "grad_norm": 0.4345923364162445, + "learning_rate": 8.331105710814096e-05, + "loss": 1.871, + "step": 2757 + }, + { + "epoch": 0.8371528304750342, + "grad_norm": 0.4099547266960144, + "learning_rate": 8.330498177399758e-05, + "loss": 1.8057, + "step": 2758 + }, + { + "epoch": 0.8374563666717256, + "grad_norm": 0.45009273290634155, + "learning_rate": 8.32989064398542e-05, + "loss": 1.3956, + "step": 2759 + }, + { + "epoch": 0.8377599028684171, + "grad_norm": 0.3890456557273865, + "learning_rate": 8.329283110571082e-05, + "loss": 1.837, + "step": 2760 + }, + { + "epoch": 0.8380634390651085, + "grad_norm": 0.4065060615539551, + "learning_rate": 8.328675577156744e-05, + "loss": 2.1527, + "step": 2761 + }, + { + "epoch": 0.8383669752617999, + "grad_norm": 0.4432562589645386, + "learning_rate": 8.328068043742406e-05, + "loss": 1.5033, + "step": 2762 + }, + { + "epoch": 0.8386705114584915, + "grad_norm": 0.4977710247039795, + "learning_rate": 8.327460510328069e-05, + "loss": 2.1406, + "step": 2763 + }, + { + "epoch": 0.8389740476551829, + "grad_norm": 1.0339199304580688, + "learning_rate": 8.326852976913731e-05, + "loss": 1.9732, + "step": 2764 + }, + { + "epoch": 0.8392775838518743, + "grad_norm": 1.5824745893478394, + "learning_rate": 8.326245443499392e-05, + "loss": 1.7956, + "step": 2765 + }, + { + "epoch": 0.8395811200485658, + "grad_norm": 0.4485887587070465, + "learning_rate": 8.325637910085054e-05, + "loss": 1.71, + "step": 2766 + }, + { + "epoch": 0.8398846562452572, + "grad_norm": 0.3982546329498291, + "learning_rate": 8.325030376670717e-05, + "loss": 2.0978, + "step": 2767 + }, + { + "epoch": 0.8401881924419488, + "grad_norm": 0.5837999582290649, + "learning_rate": 8.32442284325638e-05, + "loss": 2.113, + "step": 2768 + }, + { + "epoch": 0.8404917286386402, + "grad_norm": 0.5739153623580933, + "learning_rate": 8.323815309842041e-05, + "loss": 2.1892, + "step": 2769 + }, + { + "epoch": 0.8407952648353316, + "grad_norm": 0.3813978135585785, + "learning_rate": 8.323207776427704e-05, + "loss": 1.2944, + "step": 2770 + }, + { + "epoch": 0.8410988010320231, + "grad_norm": 0.4146029055118561, + "learning_rate": 8.322600243013367e-05, + "loss": 2.0011, + "step": 2771 + }, + { + "epoch": 0.8414023372287145, + "grad_norm": 0.38315144181251526, + "learning_rate": 8.321992709599029e-05, + "loss": 2.1513, + "step": 2772 + }, + { + "epoch": 0.8417058734254059, + "grad_norm": 0.4339327812194824, + "learning_rate": 8.32138517618469e-05, + "loss": 1.882, + "step": 2773 + }, + { + "epoch": 0.8420094096220975, + "grad_norm": 0.40696778893470764, + "learning_rate": 8.320777642770353e-05, + "loss": 1.3785, + "step": 2774 + }, + { + "epoch": 0.8423129458187889, + "grad_norm": 0.401257187128067, + "learning_rate": 8.320170109356015e-05, + "loss": 1.8048, + "step": 2775 + }, + { + "epoch": 0.8426164820154803, + "grad_norm": 0.419649213552475, + "learning_rate": 8.319562575941677e-05, + "loss": 1.869, + "step": 2776 + }, + { + "epoch": 0.8429200182121718, + "grad_norm": 0.45188263058662415, + "learning_rate": 8.31895504252734e-05, + "loss": 1.9754, + "step": 2777 + }, + { + "epoch": 0.8432235544088632, + "grad_norm": 0.42580482363700867, + "learning_rate": 8.318347509113002e-05, + "loss": 2.0827, + "step": 2778 + }, + { + "epoch": 0.8435270906055548, + "grad_norm": 0.3485068678855896, + "learning_rate": 8.317739975698663e-05, + "loss": 1.1861, + "step": 2779 + }, + { + "epoch": 0.8438306268022462, + "grad_norm": 0.38991910219192505, + "learning_rate": 8.317132442284325e-05, + "loss": 1.9857, + "step": 2780 + }, + { + "epoch": 0.8441341629989376, + "grad_norm": 0.4066307842731476, + "learning_rate": 8.316524908869988e-05, + "loss": 1.6987, + "step": 2781 + }, + { + "epoch": 0.844437699195629, + "grad_norm": 0.40589094161987305, + "learning_rate": 8.315917375455651e-05, + "loss": 2.099, + "step": 2782 + }, + { + "epoch": 0.8447412353923205, + "grad_norm": 0.42218223214149475, + "learning_rate": 8.315309842041312e-05, + "loss": 1.7493, + "step": 2783 + }, + { + "epoch": 0.845044771589012, + "grad_norm": 0.33325353264808655, + "learning_rate": 8.314702308626975e-05, + "loss": 1.6937, + "step": 2784 + }, + { + "epoch": 0.8453483077857035, + "grad_norm": 0.4162006676197052, + "learning_rate": 8.314094775212638e-05, + "loss": 1.6099, + "step": 2785 + }, + { + "epoch": 0.8456518439823949, + "grad_norm": 1.9040342569351196, + "learning_rate": 8.3134872417983e-05, + "loss": 1.9213, + "step": 2786 + }, + { + "epoch": 0.8459553801790863, + "grad_norm": 0.4040900468826294, + "learning_rate": 8.312879708383961e-05, + "loss": 1.9817, + "step": 2787 + }, + { + "epoch": 0.8462589163757778, + "grad_norm": 0.4395250082015991, + "learning_rate": 8.312272174969624e-05, + "loss": 1.8821, + "step": 2788 + }, + { + "epoch": 0.8465624525724693, + "grad_norm": 0.40407246351242065, + "learning_rate": 8.311664641555286e-05, + "loss": 1.7777, + "step": 2789 + }, + { + "epoch": 0.8468659887691607, + "grad_norm": 0.39172056317329407, + "learning_rate": 8.311057108140948e-05, + "loss": 1.9446, + "step": 2790 + }, + { + "epoch": 0.8471695249658522, + "grad_norm": 0.4654727876186371, + "learning_rate": 8.310449574726611e-05, + "loss": 1.5926, + "step": 2791 + }, + { + "epoch": 0.8474730611625436, + "grad_norm": 0.41954633593559265, + "learning_rate": 8.309842041312273e-05, + "loss": 2.0347, + "step": 2792 + }, + { + "epoch": 0.847776597359235, + "grad_norm": 0.39012208580970764, + "learning_rate": 8.309234507897934e-05, + "loss": 1.8302, + "step": 2793 + }, + { + "epoch": 0.8480801335559266, + "grad_norm": 0.3932954967021942, + "learning_rate": 8.308626974483596e-05, + "loss": 2.0534, + "step": 2794 + }, + { + "epoch": 0.848383669752618, + "grad_norm": 0.40115275979042053, + "learning_rate": 8.308019441069259e-05, + "loss": 2.1026, + "step": 2795 + }, + { + "epoch": 0.8486872059493095, + "grad_norm": 0.6058691143989563, + "learning_rate": 8.307411907654922e-05, + "loss": 1.6423, + "step": 2796 + }, + { + "epoch": 0.8489907421460009, + "grad_norm": 0.3684822916984558, + "learning_rate": 8.306804374240583e-05, + "loss": 1.9901, + "step": 2797 + }, + { + "epoch": 0.8492942783426923, + "grad_norm": 0.3942423164844513, + "learning_rate": 8.306196840826246e-05, + "loss": 1.9238, + "step": 2798 + }, + { + "epoch": 0.8495978145393838, + "grad_norm": 0.3520863354206085, + "learning_rate": 8.305589307411909e-05, + "loss": 1.7857, + "step": 2799 + }, + { + "epoch": 0.8499013507360753, + "grad_norm": 0.7609321475028992, + "learning_rate": 8.304981773997569e-05, + "loss": 1.9176, + "step": 2800 + }, + { + "epoch": 0.8502048869327667, + "grad_norm": 0.45220932364463806, + "learning_rate": 8.304374240583232e-05, + "loss": 1.8932, + "step": 2801 + }, + { + "epoch": 0.8505084231294582, + "grad_norm": 0.33773747086524963, + "learning_rate": 8.303766707168895e-05, + "loss": 1.8779, + "step": 2802 + }, + { + "epoch": 0.8508119593261496, + "grad_norm": 0.4092886745929718, + "learning_rate": 8.303159173754557e-05, + "loss": 2.3972, + "step": 2803 + }, + { + "epoch": 0.851115495522841, + "grad_norm": 0.4083962142467499, + "learning_rate": 8.302551640340219e-05, + "loss": 1.5651, + "step": 2804 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.41298726201057434, + "learning_rate": 8.301944106925882e-05, + "loss": 1.9808, + "step": 2805 + }, + { + "epoch": 0.851722567916224, + "grad_norm": 0.3522525131702423, + "learning_rate": 8.301336573511544e-05, + "loss": 2.0116, + "step": 2806 + }, + { + "epoch": 0.8520261041129155, + "grad_norm": 0.3948490619659424, + "learning_rate": 8.300729040097205e-05, + "loss": 1.9998, + "step": 2807 + }, + { + "epoch": 0.8523296403096069, + "grad_norm": 0.40480837225914, + "learning_rate": 8.300121506682867e-05, + "loss": 1.9787, + "step": 2808 + }, + { + "epoch": 0.8526331765062983, + "grad_norm": 0.3458811044692993, + "learning_rate": 8.29951397326853e-05, + "loss": 1.9506, + "step": 2809 + }, + { + "epoch": 0.8529367127029899, + "grad_norm": 0.4472740888595581, + "learning_rate": 8.298906439854193e-05, + "loss": 1.7003, + "step": 2810 + }, + { + "epoch": 0.8532402488996813, + "grad_norm": 0.3910341262817383, + "learning_rate": 8.298298906439854e-05, + "loss": 1.6672, + "step": 2811 + }, + { + "epoch": 0.8535437850963727, + "grad_norm": 0.4467204213142395, + "learning_rate": 8.297691373025517e-05, + "loss": 1.8857, + "step": 2812 + }, + { + "epoch": 0.8538473212930642, + "grad_norm": 0.42083072662353516, + "learning_rate": 8.29708383961118e-05, + "loss": 2.0453, + "step": 2813 + }, + { + "epoch": 0.8541508574897556, + "grad_norm": 0.4398275315761566, + "learning_rate": 8.29647630619684e-05, + "loss": 1.8129, + "step": 2814 + }, + { + "epoch": 0.8544543936864472, + "grad_norm": 0.8038653135299683, + "learning_rate": 8.295868772782503e-05, + "loss": 1.9836, + "step": 2815 + }, + { + "epoch": 0.8547579298831386, + "grad_norm": 0.41887366771698, + "learning_rate": 8.295261239368166e-05, + "loss": 2.0524, + "step": 2816 + }, + { + "epoch": 0.85506146607983, + "grad_norm": 0.5513349175453186, + "learning_rate": 8.294653705953828e-05, + "loss": 1.681, + "step": 2817 + }, + { + "epoch": 0.8553650022765215, + "grad_norm": 0.4004881680011749, + "learning_rate": 8.29404617253949e-05, + "loss": 1.5912, + "step": 2818 + }, + { + "epoch": 0.8556685384732129, + "grad_norm": 0.3472290635108948, + "learning_rate": 8.293438639125152e-05, + "loss": 1.7615, + "step": 2819 + }, + { + "epoch": 0.8559720746699044, + "grad_norm": 0.4187697470188141, + "learning_rate": 8.292831105710815e-05, + "loss": 1.6436, + "step": 2820 + }, + { + "epoch": 0.8562756108665959, + "grad_norm": 0.418883353471756, + "learning_rate": 8.292223572296476e-05, + "loss": 1.8053, + "step": 2821 + }, + { + "epoch": 0.8565791470632873, + "grad_norm": 0.41798603534698486, + "learning_rate": 8.291616038882138e-05, + "loss": 1.568, + "step": 2822 + }, + { + "epoch": 0.8568826832599787, + "grad_norm": 0.3748184144496918, + "learning_rate": 8.291008505467801e-05, + "loss": 1.4218, + "step": 2823 + }, + { + "epoch": 0.8571862194566702, + "grad_norm": 0.42556214332580566, + "learning_rate": 8.290400972053463e-05, + "loss": 1.5612, + "step": 2824 + }, + { + "epoch": 0.8574897556533617, + "grad_norm": 0.46294355392456055, + "learning_rate": 8.289793438639125e-05, + "loss": 1.6574, + "step": 2825 + }, + { + "epoch": 0.8577932918500532, + "grad_norm": 0.40295061469078064, + "learning_rate": 8.289185905224788e-05, + "loss": 1.3524, + "step": 2826 + }, + { + "epoch": 0.8580968280467446, + "grad_norm": 0.465472549200058, + "learning_rate": 8.288578371810451e-05, + "loss": 2.017, + "step": 2827 + }, + { + "epoch": 0.858400364243436, + "grad_norm": 0.4338732957839966, + "learning_rate": 8.287970838396111e-05, + "loss": 1.7856, + "step": 2828 + }, + { + "epoch": 0.8587039004401275, + "grad_norm": 0.4338977336883545, + "learning_rate": 8.287363304981774e-05, + "loss": 1.8401, + "step": 2829 + }, + { + "epoch": 0.8590074366368189, + "grad_norm": 0.43514832854270935, + "learning_rate": 8.286755771567437e-05, + "loss": 1.7652, + "step": 2830 + }, + { + "epoch": 0.8593109728335104, + "grad_norm": 0.3935963213443756, + "learning_rate": 8.286148238153099e-05, + "loss": 1.9616, + "step": 2831 + }, + { + "epoch": 0.8596145090302019, + "grad_norm": 0.4481986463069916, + "learning_rate": 8.285540704738761e-05, + "loss": 1.8241, + "step": 2832 + }, + { + "epoch": 0.8599180452268933, + "grad_norm": 0.3898305594921112, + "learning_rate": 8.284933171324423e-05, + "loss": 2.0083, + "step": 2833 + }, + { + "epoch": 0.8602215814235847, + "grad_norm": 0.40316537022590637, + "learning_rate": 8.284325637910086e-05, + "loss": 1.5316, + "step": 2834 + }, + { + "epoch": 0.8605251176202762, + "grad_norm": 0.407939612865448, + "learning_rate": 8.283718104495747e-05, + "loss": 2.0299, + "step": 2835 + }, + { + "epoch": 0.8608286538169677, + "grad_norm": 1.6520899534225464, + "learning_rate": 8.283110571081409e-05, + "loss": 1.8243, + "step": 2836 + }, + { + "epoch": 0.8611321900136591, + "grad_norm": 0.4098230004310608, + "learning_rate": 8.282503037667072e-05, + "loss": 1.9032, + "step": 2837 + }, + { + "epoch": 0.8614357262103506, + "grad_norm": 0.3847675025463104, + "learning_rate": 8.281895504252734e-05, + "loss": 1.7605, + "step": 2838 + }, + { + "epoch": 0.861739262407042, + "grad_norm": 0.42115259170532227, + "learning_rate": 8.281287970838396e-05, + "loss": 1.4382, + "step": 2839 + }, + { + "epoch": 0.8620427986037335, + "grad_norm": 0.4232335090637207, + "learning_rate": 8.280680437424059e-05, + "loss": 1.9637, + "step": 2840 + }, + { + "epoch": 0.862346334800425, + "grad_norm": 0.3830999732017517, + "learning_rate": 8.280072904009722e-05, + "loss": 2.0951, + "step": 2841 + }, + { + "epoch": 0.8626498709971164, + "grad_norm": 0.4446307122707367, + "learning_rate": 8.279465370595382e-05, + "loss": 1.8081, + "step": 2842 + }, + { + "epoch": 0.8629534071938079, + "grad_norm": 0.40466341376304626, + "learning_rate": 8.278857837181045e-05, + "loss": 1.9391, + "step": 2843 + }, + { + "epoch": 0.8632569433904993, + "grad_norm": 0.4302142560482025, + "learning_rate": 8.278250303766708e-05, + "loss": 2.0331, + "step": 2844 + }, + { + "epoch": 0.8635604795871907, + "grad_norm": 0.40295708179473877, + "learning_rate": 8.27764277035237e-05, + "loss": 1.7359, + "step": 2845 + }, + { + "epoch": 0.8638640157838823, + "grad_norm": 0.5045837163925171, + "learning_rate": 8.277035236938032e-05, + "loss": 1.9034, + "step": 2846 + }, + { + "epoch": 0.8641675519805737, + "grad_norm": 0.4259899854660034, + "learning_rate": 8.276427703523694e-05, + "loss": 2.0042, + "step": 2847 + }, + { + "epoch": 0.8644710881772651, + "grad_norm": 0.4341868460178375, + "learning_rate": 8.275820170109357e-05, + "loss": 1.8425, + "step": 2848 + }, + { + "epoch": 0.8647746243739566, + "grad_norm": 0.46809810400009155, + "learning_rate": 8.275212636695018e-05, + "loss": 1.9288, + "step": 2849 + }, + { + "epoch": 0.865078160570648, + "grad_norm": 0.4060373306274414, + "learning_rate": 8.27460510328068e-05, + "loss": 1.7835, + "step": 2850 + }, + { + "epoch": 0.8653816967673396, + "grad_norm": 0.39879024028778076, + "learning_rate": 8.273997569866343e-05, + "loss": 1.7374, + "step": 2851 + }, + { + "epoch": 0.865685232964031, + "grad_norm": 0.4948522746562958, + "learning_rate": 8.273390036452005e-05, + "loss": 1.9127, + "step": 2852 + }, + { + "epoch": 0.8659887691607224, + "grad_norm": 0.40187695622444153, + "learning_rate": 8.272782503037667e-05, + "loss": 1.7527, + "step": 2853 + }, + { + "epoch": 0.8662923053574139, + "grad_norm": 0.4162091910839081, + "learning_rate": 8.27217496962333e-05, + "loss": 2.0477, + "step": 2854 + }, + { + "epoch": 0.8665958415541053, + "grad_norm": 0.4181444048881531, + "learning_rate": 8.271567436208993e-05, + "loss": 1.9956, + "step": 2855 + }, + { + "epoch": 0.8668993777507967, + "grad_norm": 0.44338878989219666, + "learning_rate": 8.270959902794653e-05, + "loss": 1.7806, + "step": 2856 + }, + { + "epoch": 0.8672029139474883, + "grad_norm": 0.4224783778190613, + "learning_rate": 8.270352369380316e-05, + "loss": 1.8572, + "step": 2857 + }, + { + "epoch": 0.8675064501441797, + "grad_norm": 0.4111135005950928, + "learning_rate": 8.26974483596598e-05, + "loss": 1.8744, + "step": 2858 + }, + { + "epoch": 0.8678099863408711, + "grad_norm": 0.40660667419433594, + "learning_rate": 8.269137302551641e-05, + "loss": 1.7401, + "step": 2859 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.430890291929245, + "learning_rate": 8.268529769137303e-05, + "loss": 1.5967, + "step": 2860 + }, + { + "epoch": 0.868417058734254, + "grad_norm": 0.45299017429351807, + "learning_rate": 8.267922235722965e-05, + "loss": 1.8742, + "step": 2861 + }, + { + "epoch": 0.8687205949309456, + "grad_norm": 0.3461768329143524, + "learning_rate": 8.267314702308628e-05, + "loss": 1.6326, + "step": 2862 + }, + { + "epoch": 0.869024131127637, + "grad_norm": 0.386844664812088, + "learning_rate": 8.26670716889429e-05, + "loss": 1.7126, + "step": 2863 + }, + { + "epoch": 0.8693276673243284, + "grad_norm": 0.6148979067802429, + "learning_rate": 8.266099635479951e-05, + "loss": 1.3573, + "step": 2864 + }, + { + "epoch": 0.8696312035210199, + "grad_norm": 0.4048292934894562, + "learning_rate": 8.265492102065614e-05, + "loss": 1.1731, + "step": 2865 + }, + { + "epoch": 0.8699347397177113, + "grad_norm": 0.3976982831954956, + "learning_rate": 8.264884568651276e-05, + "loss": 1.8089, + "step": 2866 + }, + { + "epoch": 0.8702382759144028, + "grad_norm": 0.39783963561058044, + "learning_rate": 8.264277035236938e-05, + "loss": 1.5432, + "step": 2867 + }, + { + "epoch": 0.8705418121110943, + "grad_norm": 0.3972279131412506, + "learning_rate": 8.263669501822601e-05, + "loss": 1.7548, + "step": 2868 + }, + { + "epoch": 0.8708453483077857, + "grad_norm": 0.43422597646713257, + "learning_rate": 8.263061968408264e-05, + "loss": 1.9792, + "step": 2869 + }, + { + "epoch": 0.8711488845044771, + "grad_norm": 0.3682768940925598, + "learning_rate": 8.262454434993924e-05, + "loss": 2.1472, + "step": 2870 + }, + { + "epoch": 0.8714524207011686, + "grad_norm": 0.37669479846954346, + "learning_rate": 8.261846901579587e-05, + "loss": 1.9962, + "step": 2871 + }, + { + "epoch": 0.8717559568978601, + "grad_norm": 0.36915603280067444, + "learning_rate": 8.26123936816525e-05, + "loss": 1.5438, + "step": 2872 + }, + { + "epoch": 0.8720594930945516, + "grad_norm": 0.4083096981048584, + "learning_rate": 8.260631834750911e-05, + "loss": 1.7711, + "step": 2873 + }, + { + "epoch": 0.872363029291243, + "grad_norm": 0.3865950107574463, + "learning_rate": 8.260024301336574e-05, + "loss": 1.8624, + "step": 2874 + }, + { + "epoch": 0.8726665654879344, + "grad_norm": 0.5207681655883789, + "learning_rate": 8.259416767922236e-05, + "loss": 2.0506, + "step": 2875 + }, + { + "epoch": 0.8729701016846259, + "grad_norm": 0.4441354274749756, + "learning_rate": 8.258809234507899e-05, + "loss": 1.822, + "step": 2876 + }, + { + "epoch": 0.8732736378813174, + "grad_norm": 0.3258417844772339, + "learning_rate": 8.25820170109356e-05, + "loss": 1.2733, + "step": 2877 + }, + { + "epoch": 0.8735771740780088, + "grad_norm": 0.37115880846977234, + "learning_rate": 8.257594167679222e-05, + "loss": 1.9161, + "step": 2878 + }, + { + "epoch": 0.8738807102747003, + "grad_norm": 0.47799551486968994, + "learning_rate": 8.256986634264885e-05, + "loss": 2.0963, + "step": 2879 + }, + { + "epoch": 0.8741842464713917, + "grad_norm": 0.4438342750072479, + "learning_rate": 8.256379100850547e-05, + "loss": 1.3525, + "step": 2880 + }, + { + "epoch": 0.8744877826680831, + "grad_norm": 0.3878926932811737, + "learning_rate": 8.255771567436209e-05, + "loss": 1.4205, + "step": 2881 + }, + { + "epoch": 0.8747913188647746, + "grad_norm": 0.4843897819519043, + "learning_rate": 8.255164034021872e-05, + "loss": 2.0388, + "step": 2882 + }, + { + "epoch": 0.8750948550614661, + "grad_norm": 0.5297700762748718, + "learning_rate": 8.254556500607535e-05, + "loss": 1.3358, + "step": 2883 + }, + { + "epoch": 0.8753983912581575, + "grad_norm": 0.34962332248687744, + "learning_rate": 8.253948967193195e-05, + "loss": 1.6778, + "step": 2884 + }, + { + "epoch": 0.875701927454849, + "grad_norm": 0.4917025864124298, + "learning_rate": 8.253341433778858e-05, + "loss": 1.7919, + "step": 2885 + }, + { + "epoch": 0.8760054636515404, + "grad_norm": 0.43004027009010315, + "learning_rate": 8.252733900364521e-05, + "loss": 1.8666, + "step": 2886 + }, + { + "epoch": 0.8763089998482319, + "grad_norm": 0.5672779679298401, + "learning_rate": 8.252126366950182e-05, + "loss": 1.5285, + "step": 2887 + }, + { + "epoch": 0.8766125360449234, + "grad_norm": 0.45307332277297974, + "learning_rate": 8.251518833535845e-05, + "loss": 1.9865, + "step": 2888 + }, + { + "epoch": 0.8769160722416148, + "grad_norm": 0.4099940359592438, + "learning_rate": 8.250911300121507e-05, + "loss": 1.8511, + "step": 2889 + }, + { + "epoch": 0.8772196084383063, + "grad_norm": 0.4223155081272125, + "learning_rate": 8.25030376670717e-05, + "loss": 1.5077, + "step": 2890 + }, + { + "epoch": 0.8775231446349977, + "grad_norm": 0.4101323187351227, + "learning_rate": 8.249696233292831e-05, + "loss": 1.7534, + "step": 2891 + }, + { + "epoch": 0.8778266808316891, + "grad_norm": 0.4333887994289398, + "learning_rate": 8.249088699878493e-05, + "loss": 1.9611, + "step": 2892 + }, + { + "epoch": 0.8781302170283807, + "grad_norm": 0.44225746393203735, + "learning_rate": 8.248481166464156e-05, + "loss": 1.9739, + "step": 2893 + }, + { + "epoch": 0.8784337532250721, + "grad_norm": 0.419316828250885, + "learning_rate": 8.247873633049818e-05, + "loss": 1.9278, + "step": 2894 + }, + { + "epoch": 0.8787372894217635, + "grad_norm": 0.39314213395118713, + "learning_rate": 8.24726609963548e-05, + "loss": 1.8768, + "step": 2895 + }, + { + "epoch": 0.879040825618455, + "grad_norm": 0.45280733704566956, + "learning_rate": 8.246658566221143e-05, + "loss": 1.4584, + "step": 2896 + }, + { + "epoch": 0.8793443618151464, + "grad_norm": 0.4470527768135071, + "learning_rate": 8.246051032806805e-05, + "loss": 1.992, + "step": 2897 + }, + { + "epoch": 0.879647898011838, + "grad_norm": 0.4180387854576111, + "learning_rate": 8.245443499392466e-05, + "loss": 1.7643, + "step": 2898 + }, + { + "epoch": 0.8799514342085294, + "grad_norm": 0.42866212129592896, + "learning_rate": 8.24483596597813e-05, + "loss": 1.4524, + "step": 2899 + }, + { + "epoch": 0.8802549704052208, + "grad_norm": 0.3700104057788849, + "learning_rate": 8.244228432563791e-05, + "loss": 1.7607, + "step": 2900 + }, + { + "epoch": 0.8805585066019123, + "grad_norm": 0.4378833770751953, + "learning_rate": 8.243620899149453e-05, + "loss": 1.7648, + "step": 2901 + }, + { + "epoch": 0.8808620427986037, + "grad_norm": 0.4206582307815552, + "learning_rate": 8.243013365735116e-05, + "loss": 1.159, + "step": 2902 + }, + { + "epoch": 0.8811655789952952, + "grad_norm": 0.4247249960899353, + "learning_rate": 8.242405832320778e-05, + "loss": 2.0676, + "step": 2903 + }, + { + "epoch": 0.8814691151919867, + "grad_norm": 0.43796390295028687, + "learning_rate": 8.241798298906441e-05, + "loss": 1.9104, + "step": 2904 + }, + { + "epoch": 0.8817726513886781, + "grad_norm": 0.4268593192100525, + "learning_rate": 8.241190765492103e-05, + "loss": 1.8041, + "step": 2905 + }, + { + "epoch": 0.8820761875853695, + "grad_norm": 0.5760425925254822, + "learning_rate": 8.240583232077764e-05, + "loss": 1.7562, + "step": 2906 + }, + { + "epoch": 0.882379723782061, + "grad_norm": 0.328421950340271, + "learning_rate": 8.239975698663427e-05, + "loss": 2.0184, + "step": 2907 + }, + { + "epoch": 0.8826832599787525, + "grad_norm": 0.4264001250267029, + "learning_rate": 8.239368165249089e-05, + "loss": 1.8521, + "step": 2908 + }, + { + "epoch": 0.882986796175444, + "grad_norm": 0.7516580820083618, + "learning_rate": 8.238760631834751e-05, + "loss": 1.9573, + "step": 2909 + }, + { + "epoch": 0.8832903323721354, + "grad_norm": 0.43976011872291565, + "learning_rate": 8.238153098420414e-05, + "loss": 1.9756, + "step": 2910 + }, + { + "epoch": 0.8835938685688268, + "grad_norm": 0.420858234167099, + "learning_rate": 8.237545565006076e-05, + "loss": 1.8513, + "step": 2911 + }, + { + "epoch": 0.8838974047655183, + "grad_norm": 0.45598578453063965, + "learning_rate": 8.236938031591737e-05, + "loss": 1.8986, + "step": 2912 + }, + { + "epoch": 0.8842009409622097, + "grad_norm": 0.3829743266105652, + "learning_rate": 8.2363304981774e-05, + "loss": 1.7241, + "step": 2913 + }, + { + "epoch": 0.8845044771589012, + "grad_norm": 1.6669212579727173, + "learning_rate": 8.235722964763062e-05, + "loss": 1.875, + "step": 2914 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.711898684501648, + "learning_rate": 8.235115431348724e-05, + "loss": 1.9559, + "step": 2915 + }, + { + "epoch": 0.8851115495522841, + "grad_norm": 0.46978119015693665, + "learning_rate": 8.234507897934387e-05, + "loss": 1.4868, + "step": 2916 + }, + { + "epoch": 0.8854150857489755, + "grad_norm": 0.4142061173915863, + "learning_rate": 8.233900364520049e-05, + "loss": 1.9654, + "step": 2917 + }, + { + "epoch": 0.885718621945667, + "grad_norm": 0.4385989010334015, + "learning_rate": 8.233292831105712e-05, + "loss": 1.7259, + "step": 2918 + }, + { + "epoch": 0.8860221581423585, + "grad_norm": 0.4307645261287689, + "learning_rate": 8.232685297691374e-05, + "loss": 2.033, + "step": 2919 + }, + { + "epoch": 0.88632569433905, + "grad_norm": 0.5880458950996399, + "learning_rate": 8.232077764277035e-05, + "loss": 1.5008, + "step": 2920 + }, + { + "epoch": 0.8866292305357414, + "grad_norm": 0.4887501299381256, + "learning_rate": 8.231470230862698e-05, + "loss": 1.9574, + "step": 2921 + }, + { + "epoch": 0.8869327667324328, + "grad_norm": 0.42289820313453674, + "learning_rate": 8.23086269744836e-05, + "loss": 1.8607, + "step": 2922 + }, + { + "epoch": 0.8872363029291243, + "grad_norm": 0.4192774295806885, + "learning_rate": 8.230255164034022e-05, + "loss": 2.0718, + "step": 2923 + }, + { + "epoch": 0.8875398391258158, + "grad_norm": 0.5114601850509644, + "learning_rate": 8.229647630619685e-05, + "loss": 1.9832, + "step": 2924 + }, + { + "epoch": 0.8878433753225072, + "grad_norm": 0.4116429388523102, + "learning_rate": 8.229040097205347e-05, + "loss": 1.7623, + "step": 2925 + }, + { + "epoch": 0.8881469115191987, + "grad_norm": 0.44943469762802124, + "learning_rate": 8.228432563791008e-05, + "loss": 1.8241, + "step": 2926 + }, + { + "epoch": 0.8884504477158901, + "grad_norm": 1.1577938795089722, + "learning_rate": 8.227825030376671e-05, + "loss": 1.9125, + "step": 2927 + }, + { + "epoch": 0.8887539839125815, + "grad_norm": 1.1404715776443481, + "learning_rate": 8.227217496962333e-05, + "loss": 1.4977, + "step": 2928 + }, + { + "epoch": 0.8890575201092731, + "grad_norm": 0.7202188968658447, + "learning_rate": 8.226609963547995e-05, + "loss": 2.0293, + "step": 2929 + }, + { + "epoch": 0.8893610563059645, + "grad_norm": 0.8101162910461426, + "learning_rate": 8.226002430133658e-05, + "loss": 1.888, + "step": 2930 + }, + { + "epoch": 0.889664592502656, + "grad_norm": 0.41163596510887146, + "learning_rate": 8.22539489671932e-05, + "loss": 1.849, + "step": 2931 + }, + { + "epoch": 0.8899681286993474, + "grad_norm": 0.42284974455833435, + "learning_rate": 8.224787363304983e-05, + "loss": 1.9611, + "step": 2932 + }, + { + "epoch": 0.8902716648960388, + "grad_norm": 0.6039950847625732, + "learning_rate": 8.224179829890645e-05, + "loss": 1.9822, + "step": 2933 + }, + { + "epoch": 0.8905752010927304, + "grad_norm": 0.3433489203453064, + "learning_rate": 8.223572296476306e-05, + "loss": 1.7947, + "step": 2934 + }, + { + "epoch": 0.8908787372894218, + "grad_norm": 0.3537866473197937, + "learning_rate": 8.22296476306197e-05, + "loss": 1.8749, + "step": 2935 + }, + { + "epoch": 0.8911822734861132, + "grad_norm": 0.3994251787662506, + "learning_rate": 8.222357229647631e-05, + "loss": 1.7805, + "step": 2936 + }, + { + "epoch": 0.8914858096828047, + "grad_norm": 0.3776698708534241, + "learning_rate": 8.221749696233293e-05, + "loss": 1.8582, + "step": 2937 + }, + { + "epoch": 0.8917893458794961, + "grad_norm": 0.42231059074401855, + "learning_rate": 8.221142162818956e-05, + "loss": 1.9542, + "step": 2938 + }, + { + "epoch": 0.8920928820761875, + "grad_norm": 0.470005065202713, + "learning_rate": 8.220534629404618e-05, + "loss": 1.1926, + "step": 2939 + }, + { + "epoch": 0.8923964182728791, + "grad_norm": 0.43730974197387695, + "learning_rate": 8.21992709599028e-05, + "loss": 1.549, + "step": 2940 + }, + { + "epoch": 0.8926999544695705, + "grad_norm": 0.4016040563583374, + "learning_rate": 8.219319562575942e-05, + "loss": 1.8797, + "step": 2941 + }, + { + "epoch": 0.893003490666262, + "grad_norm": 0.4425860345363617, + "learning_rate": 8.218712029161604e-05, + "loss": 1.4267, + "step": 2942 + }, + { + "epoch": 0.8933070268629534, + "grad_norm": 0.8383780717849731, + "learning_rate": 8.218104495747266e-05, + "loss": 1.8884, + "step": 2943 + }, + { + "epoch": 0.8936105630596448, + "grad_norm": 0.4015752077102661, + "learning_rate": 8.217496962332929e-05, + "loss": 2.0479, + "step": 2944 + }, + { + "epoch": 0.8939140992563364, + "grad_norm": 0.39999493956565857, + "learning_rate": 8.216889428918591e-05, + "loss": 1.8845, + "step": 2945 + }, + { + "epoch": 0.8942176354530278, + "grad_norm": 0.800762414932251, + "learning_rate": 8.216281895504252e-05, + "loss": 2.1039, + "step": 2946 + }, + { + "epoch": 0.8945211716497192, + "grad_norm": 0.38609185814857483, + "learning_rate": 8.215674362089916e-05, + "loss": 1.8951, + "step": 2947 + }, + { + "epoch": 0.8948247078464107, + "grad_norm": 0.37557461857795715, + "learning_rate": 8.215066828675577e-05, + "loss": 1.8418, + "step": 2948 + }, + { + "epoch": 0.8951282440431021, + "grad_norm": 0.4221288561820984, + "learning_rate": 8.21445929526124e-05, + "loss": 2.0154, + "step": 2949 + }, + { + "epoch": 0.8954317802397936, + "grad_norm": 0.3798159658908844, + "learning_rate": 8.213851761846902e-05, + "loss": 1.852, + "step": 2950 + }, + { + "epoch": 0.8957353164364851, + "grad_norm": 0.4777775704860687, + "learning_rate": 8.213244228432564e-05, + "loss": 1.9272, + "step": 2951 + }, + { + "epoch": 0.8960388526331765, + "grad_norm": 0.45156142115592957, + "learning_rate": 8.212636695018227e-05, + "loss": 1.7512, + "step": 2952 + }, + { + "epoch": 0.8963423888298679, + "grad_norm": 0.43190255761146545, + "learning_rate": 8.212029161603889e-05, + "loss": 2.0517, + "step": 2953 + }, + { + "epoch": 0.8966459250265594, + "grad_norm": 0.40969786047935486, + "learning_rate": 8.21142162818955e-05, + "loss": 1.843, + "step": 2954 + }, + { + "epoch": 0.8969494612232509, + "grad_norm": 0.3868393003940582, + "learning_rate": 8.210814094775213e-05, + "loss": 1.9854, + "step": 2955 + }, + { + "epoch": 0.8972529974199424, + "grad_norm": 0.39843276143074036, + "learning_rate": 8.210206561360875e-05, + "loss": 1.9419, + "step": 2956 + }, + { + "epoch": 0.8975565336166338, + "grad_norm": 0.3709312379360199, + "learning_rate": 8.209599027946537e-05, + "loss": 1.9075, + "step": 2957 + }, + { + "epoch": 0.8978600698133252, + "grad_norm": 0.3753807246685028, + "learning_rate": 8.2089914945322e-05, + "loss": 1.8474, + "step": 2958 + }, + { + "epoch": 0.8981636060100167, + "grad_norm": 0.47521868348121643, + "learning_rate": 8.208383961117862e-05, + "loss": 2.0679, + "step": 2959 + }, + { + "epoch": 0.8984671422067082, + "grad_norm": 0.3866266906261444, + "learning_rate": 8.207776427703523e-05, + "loss": 1.6691, + "step": 2960 + }, + { + "epoch": 0.8987706784033996, + "grad_norm": 0.417644739151001, + "learning_rate": 8.207168894289187e-05, + "loss": 1.7893, + "step": 2961 + }, + { + "epoch": 0.8990742146000911, + "grad_norm": 0.427492618560791, + "learning_rate": 8.206561360874848e-05, + "loss": 1.9072, + "step": 2962 + }, + { + "epoch": 0.8993777507967825, + "grad_norm": 0.4407294988632202, + "learning_rate": 8.205953827460511e-05, + "loss": 1.9718, + "step": 2963 + }, + { + "epoch": 0.8996812869934739, + "grad_norm": 0.4453076720237732, + "learning_rate": 8.205346294046173e-05, + "loss": 1.6475, + "step": 2964 + }, + { + "epoch": 0.8999848231901654, + "grad_norm": 0.43250027298927307, + "learning_rate": 8.204738760631835e-05, + "loss": 1.6695, + "step": 2965 + }, + { + "epoch": 0.9002883593868569, + "grad_norm": 0.4513264298439026, + "learning_rate": 8.204131227217498e-05, + "loss": 1.7753, + "step": 2966 + }, + { + "epoch": 0.9005918955835484, + "grad_norm": 0.3830716609954834, + "learning_rate": 8.20352369380316e-05, + "loss": 2.0716, + "step": 2967 + }, + { + "epoch": 0.9008954317802398, + "grad_norm": 0.4067733585834503, + "learning_rate": 8.202916160388821e-05, + "loss": 1.9266, + "step": 2968 + }, + { + "epoch": 0.9011989679769312, + "grad_norm": 0.39445656538009644, + "learning_rate": 8.202308626974484e-05, + "loss": 1.9995, + "step": 2969 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.7493338584899902, + "learning_rate": 8.201701093560146e-05, + "loss": 2.0499, + "step": 2970 + }, + { + "epoch": 0.9018060403703142, + "grad_norm": 0.4843970537185669, + "learning_rate": 8.201093560145808e-05, + "loss": 1.7133, + "step": 2971 + }, + { + "epoch": 0.9021095765670056, + "grad_norm": 0.4203130602836609, + "learning_rate": 8.200486026731471e-05, + "loss": 2.057, + "step": 2972 + }, + { + "epoch": 0.9024131127636971, + "grad_norm": 0.47080641984939575, + "learning_rate": 8.199878493317133e-05, + "loss": 1.9157, + "step": 2973 + }, + { + "epoch": 0.9027166489603885, + "grad_norm": 0.3420778512954712, + "learning_rate": 8.199270959902794e-05, + "loss": 1.7641, + "step": 2974 + }, + { + "epoch": 0.9030201851570799, + "grad_norm": 0.4011532962322235, + "learning_rate": 8.198663426488458e-05, + "loss": 2.0489, + "step": 2975 + }, + { + "epoch": 0.9033237213537715, + "grad_norm": 0.457653284072876, + "learning_rate": 8.198055893074119e-05, + "loss": 1.5258, + "step": 2976 + }, + { + "epoch": 0.9036272575504629, + "grad_norm": 0.45125746726989746, + "learning_rate": 8.197448359659782e-05, + "loss": 1.8996, + "step": 2977 + }, + { + "epoch": 0.9039307937471543, + "grad_norm": 0.44737517833709717, + "learning_rate": 8.196840826245444e-05, + "loss": 1.6851, + "step": 2978 + }, + { + "epoch": 0.9042343299438458, + "grad_norm": 0.4220506250858307, + "learning_rate": 8.196233292831106e-05, + "loss": 1.0195, + "step": 2979 + }, + { + "epoch": 0.9045378661405372, + "grad_norm": 0.40028899908065796, + "learning_rate": 8.195625759416769e-05, + "loss": 1.9251, + "step": 2980 + }, + { + "epoch": 0.9048414023372288, + "grad_norm": 0.3769090175628662, + "learning_rate": 8.19501822600243e-05, + "loss": 1.6839, + "step": 2981 + }, + { + "epoch": 0.9051449385339202, + "grad_norm": 0.41733232140541077, + "learning_rate": 8.194410692588092e-05, + "loss": 1.1995, + "step": 2982 + }, + { + "epoch": 0.9054484747306116, + "grad_norm": 0.4010336697101593, + "learning_rate": 8.193803159173755e-05, + "loss": 1.5531, + "step": 2983 + }, + { + "epoch": 0.9057520109273031, + "grad_norm": 0.38843366503715515, + "learning_rate": 8.193195625759417e-05, + "loss": 1.8149, + "step": 2984 + }, + { + "epoch": 0.9060555471239945, + "grad_norm": 0.3807307481765747, + "learning_rate": 8.192588092345079e-05, + "loss": 1.8379, + "step": 2985 + }, + { + "epoch": 0.906359083320686, + "grad_norm": 0.44730183482170105, + "learning_rate": 8.191980558930742e-05, + "loss": 1.6764, + "step": 2986 + }, + { + "epoch": 0.9066626195173775, + "grad_norm": 0.4236774146556854, + "learning_rate": 8.191373025516404e-05, + "loss": 1.926, + "step": 2987 + }, + { + "epoch": 0.9069661557140689, + "grad_norm": 0.35578781366348267, + "learning_rate": 8.190765492102065e-05, + "loss": 2.1609, + "step": 2988 + }, + { + "epoch": 0.9072696919107603, + "grad_norm": 0.41288191080093384, + "learning_rate": 8.190157958687729e-05, + "loss": 2.1007, + "step": 2989 + }, + { + "epoch": 0.9075732281074518, + "grad_norm": 0.43154072761535645, + "learning_rate": 8.18955042527339e-05, + "loss": 1.7687, + "step": 2990 + }, + { + "epoch": 0.9078767643041433, + "grad_norm": 0.41048216819763184, + "learning_rate": 8.188942891859053e-05, + "loss": 2.2021, + "step": 2991 + }, + { + "epoch": 0.9081803005008348, + "grad_norm": 0.4213089942932129, + "learning_rate": 8.188335358444715e-05, + "loss": 1.7208, + "step": 2992 + }, + { + "epoch": 0.9084838366975262, + "grad_norm": 0.9679743647575378, + "learning_rate": 8.187727825030377e-05, + "loss": 1.731, + "step": 2993 + }, + { + "epoch": 0.9087873728942176, + "grad_norm": 0.4650149643421173, + "learning_rate": 8.18712029161604e-05, + "loss": 1.9327, + "step": 2994 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.3545879125595093, + "learning_rate": 8.1865127582017e-05, + "loss": 1.3728, + "step": 2995 + }, + { + "epoch": 0.9093944452876005, + "grad_norm": 0.4107753336429596, + "learning_rate": 8.185905224787363e-05, + "loss": 2.0919, + "step": 2996 + }, + { + "epoch": 0.909697981484292, + "grad_norm": 0.44587281346321106, + "learning_rate": 8.185297691373026e-05, + "loss": 1.5915, + "step": 2997 + }, + { + "epoch": 0.9100015176809835, + "grad_norm": 0.4520403742790222, + "learning_rate": 8.184690157958688e-05, + "loss": 1.9165, + "step": 2998 + }, + { + "epoch": 0.9103050538776749, + "grad_norm": 0.39504629373550415, + "learning_rate": 8.18408262454435e-05, + "loss": 1.7551, + "step": 2999 + }, + { + "epoch": 0.9106085900743663, + "grad_norm": 0.3621729612350464, + "learning_rate": 8.183475091130013e-05, + "loss": 1.4211, + "step": 3000 + }, + { + "epoch": 0.9109121262710578, + "grad_norm": 0.6058910489082336, + "learning_rate": 8.182867557715675e-05, + "loss": 1.7533, + "step": 3001 + }, + { + "epoch": 0.9112156624677493, + "grad_norm": 0.49985721707344055, + "learning_rate": 8.182260024301336e-05, + "loss": 2.0437, + "step": 3002 + }, + { + "epoch": 0.9115191986644408, + "grad_norm": 14.125785827636719, + "learning_rate": 8.181652490887e-05, + "loss": 1.3568, + "step": 3003 + }, + { + "epoch": 0.9118227348611322, + "grad_norm": 0.42591822147369385, + "learning_rate": 8.181044957472661e-05, + "loss": 1.9824, + "step": 3004 + }, + { + "epoch": 0.9121262710578236, + "grad_norm": 0.4781965911388397, + "learning_rate": 8.180437424058324e-05, + "loss": 2.0458, + "step": 3005 + }, + { + "epoch": 0.9124298072545151, + "grad_norm": 0.40637922286987305, + "learning_rate": 8.179829890643986e-05, + "loss": 1.9987, + "step": 3006 + }, + { + "epoch": 0.9127333434512066, + "grad_norm": 0.43722665309906006, + "learning_rate": 8.179222357229648e-05, + "loss": 2.0207, + "step": 3007 + }, + { + "epoch": 0.913036879647898, + "grad_norm": 2.785123348236084, + "learning_rate": 8.178614823815311e-05, + "loss": 2.0649, + "step": 3008 + }, + { + "epoch": 0.9133404158445895, + "grad_norm": 0.4118681848049164, + "learning_rate": 8.178007290400971e-05, + "loss": 1.8433, + "step": 3009 + }, + { + "epoch": 0.9136439520412809, + "grad_norm": 0.38384199142456055, + "learning_rate": 8.177399756986634e-05, + "loss": 1.686, + "step": 3010 + }, + { + "epoch": 0.9139474882379723, + "grad_norm": 0.41295409202575684, + "learning_rate": 8.176792223572297e-05, + "loss": 1.8854, + "step": 3011 + }, + { + "epoch": 0.9142510244346639, + "grad_norm": 0.40270209312438965, + "learning_rate": 8.176184690157959e-05, + "loss": 1.8403, + "step": 3012 + }, + { + "epoch": 0.9145545606313553, + "grad_norm": 0.4634084701538086, + "learning_rate": 8.175577156743621e-05, + "loss": 1.2805, + "step": 3013 + }, + { + "epoch": 0.9148580968280468, + "grad_norm": 0.37608620524406433, + "learning_rate": 8.174969623329284e-05, + "loss": 1.3985, + "step": 3014 + }, + { + "epoch": 0.9151616330247382, + "grad_norm": 0.47492894530296326, + "learning_rate": 8.174362089914946e-05, + "loss": 1.6559, + "step": 3015 + }, + { + "epoch": 0.9154651692214296, + "grad_norm": 0.3841186463832855, + "learning_rate": 8.173754556500607e-05, + "loss": 2.0221, + "step": 3016 + }, + { + "epoch": 0.9157687054181212, + "grad_norm": 0.40183159708976746, + "learning_rate": 8.17314702308627e-05, + "loss": 1.7669, + "step": 3017 + }, + { + "epoch": 0.9160722416148126, + "grad_norm": 0.4649689197540283, + "learning_rate": 8.172539489671932e-05, + "loss": 1.9634, + "step": 3018 + }, + { + "epoch": 0.916375777811504, + "grad_norm": 0.5210034847259521, + "learning_rate": 8.171931956257594e-05, + "loss": 1.5208, + "step": 3019 + }, + { + "epoch": 0.9166793140081955, + "grad_norm": 0.41098159551620483, + "learning_rate": 8.171324422843257e-05, + "loss": 2.1456, + "step": 3020 + }, + { + "epoch": 0.9169828502048869, + "grad_norm": 0.4477085769176483, + "learning_rate": 8.170716889428919e-05, + "loss": 2.1252, + "step": 3021 + }, + { + "epoch": 0.9172863864015783, + "grad_norm": 0.6705775856971741, + "learning_rate": 8.170109356014582e-05, + "loss": 1.7007, + "step": 3022 + }, + { + "epoch": 0.9175899225982699, + "grad_norm": 0.3919045925140381, + "learning_rate": 8.169501822600242e-05, + "loss": 1.9774, + "step": 3023 + }, + { + "epoch": 0.9178934587949613, + "grad_norm": 0.41216278076171875, + "learning_rate": 8.168894289185905e-05, + "loss": 1.845, + "step": 3024 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.4093484580516815, + "learning_rate": 8.168286755771568e-05, + "loss": 1.7517, + "step": 3025 + }, + { + "epoch": 0.9185005311883442, + "grad_norm": 0.4002762734889984, + "learning_rate": 8.16767922235723e-05, + "loss": 1.9234, + "step": 3026 + }, + { + "epoch": 0.9188040673850356, + "grad_norm": 0.3966367542743683, + "learning_rate": 8.167071688942892e-05, + "loss": 1.9905, + "step": 3027 + }, + { + "epoch": 0.9191076035817272, + "grad_norm": 0.4566415250301361, + "learning_rate": 8.166464155528555e-05, + "loss": 1.6606, + "step": 3028 + }, + { + "epoch": 0.9194111397784186, + "grad_norm": 0.5808939933776855, + "learning_rate": 8.165856622114217e-05, + "loss": 1.6156, + "step": 3029 + }, + { + "epoch": 0.91971467597511, + "grad_norm": 0.4700441062450409, + "learning_rate": 8.165249088699878e-05, + "loss": 1.9213, + "step": 3030 + }, + { + "epoch": 0.9200182121718015, + "grad_norm": 0.4296051263809204, + "learning_rate": 8.164641555285542e-05, + "loss": 2.0518, + "step": 3031 + }, + { + "epoch": 0.9203217483684929, + "grad_norm": 0.469310462474823, + "learning_rate": 8.164034021871203e-05, + "loss": 2.1185, + "step": 3032 + }, + { + "epoch": 0.9206252845651844, + "grad_norm": 0.45901113748550415, + "learning_rate": 8.163426488456865e-05, + "loss": 1.8144, + "step": 3033 + }, + { + "epoch": 0.9209288207618759, + "grad_norm": 0.40197721123695374, + "learning_rate": 8.162818955042528e-05, + "loss": 1.7781, + "step": 3034 + }, + { + "epoch": 0.9212323569585673, + "grad_norm": 0.41534188389778137, + "learning_rate": 8.16221142162819e-05, + "loss": 1.9972, + "step": 3035 + }, + { + "epoch": 0.9215358931552587, + "grad_norm": 0.4121875762939453, + "learning_rate": 8.161603888213853e-05, + "loss": 1.4284, + "step": 3036 + }, + { + "epoch": 0.9218394293519502, + "grad_norm": 0.4393114149570465, + "learning_rate": 8.160996354799513e-05, + "loss": 1.785, + "step": 3037 + }, + { + "epoch": 0.9221429655486417, + "grad_norm": 0.3844849467277527, + "learning_rate": 8.160388821385176e-05, + "loss": 1.8865, + "step": 3038 + }, + { + "epoch": 0.9224465017453332, + "grad_norm": 0.4221876859664917, + "learning_rate": 8.15978128797084e-05, + "loss": 1.8166, + "step": 3039 + }, + { + "epoch": 0.9227500379420246, + "grad_norm": 0.4294770359992981, + "learning_rate": 8.159173754556501e-05, + "loss": 1.8273, + "step": 3040 + }, + { + "epoch": 0.923053574138716, + "grad_norm": 0.41192346811294556, + "learning_rate": 8.158566221142163e-05, + "loss": 1.9982, + "step": 3041 + }, + { + "epoch": 0.9233571103354075, + "grad_norm": 0.4439050555229187, + "learning_rate": 8.157958687727826e-05, + "loss": 2.0525, + "step": 3042 + }, + { + "epoch": 0.923660646532099, + "grad_norm": 0.4909377694129944, + "learning_rate": 8.157351154313488e-05, + "loss": 1.7385, + "step": 3043 + }, + { + "epoch": 0.9239641827287904, + "grad_norm": 0.3646416664123535, + "learning_rate": 8.15674362089915e-05, + "loss": 1.7543, + "step": 3044 + }, + { + "epoch": 0.9242677189254819, + "grad_norm": 0.46845096349716187, + "learning_rate": 8.156136087484813e-05, + "loss": 1.5907, + "step": 3045 + }, + { + "epoch": 0.9245712551221733, + "grad_norm": 0.3998015224933624, + "learning_rate": 8.155528554070474e-05, + "loss": 1.9652, + "step": 3046 + }, + { + "epoch": 0.9248747913188647, + "grad_norm": 0.43011385202407837, + "learning_rate": 8.154921020656136e-05, + "loss": 1.9294, + "step": 3047 + }, + { + "epoch": 0.9251783275155562, + "grad_norm": 1.033368706703186, + "learning_rate": 8.154313487241799e-05, + "loss": 1.9883, + "step": 3048 + }, + { + "epoch": 0.9254818637122477, + "grad_norm": 0.6372964382171631, + "learning_rate": 8.153705953827461e-05, + "loss": 1.6381, + "step": 3049 + }, + { + "epoch": 0.9257853999089392, + "grad_norm": 0.4168377220630646, + "learning_rate": 8.153098420413124e-05, + "loss": 1.5776, + "step": 3050 + }, + { + "epoch": 0.9260889361056306, + "grad_norm": 0.4470007121562958, + "learning_rate": 8.152490886998784e-05, + "loss": 1.7431, + "step": 3051 + }, + { + "epoch": 0.926392472302322, + "grad_norm": 0.4876750111579895, + "learning_rate": 8.151883353584447e-05, + "loss": 2.0082, + "step": 3052 + }, + { + "epoch": 0.9266960084990135, + "grad_norm": 0.4005252718925476, + "learning_rate": 8.15127582017011e-05, + "loss": 1.8192, + "step": 3053 + }, + { + "epoch": 0.926999544695705, + "grad_norm": 0.4852685332298279, + "learning_rate": 8.150668286755772e-05, + "loss": 1.5553, + "step": 3054 + }, + { + "epoch": 0.9273030808923964, + "grad_norm": 0.4594980776309967, + "learning_rate": 8.150060753341434e-05, + "loss": 1.784, + "step": 3055 + }, + { + "epoch": 0.9276066170890879, + "grad_norm": 0.34720897674560547, + "learning_rate": 8.149453219927097e-05, + "loss": 1.8604, + "step": 3056 + }, + { + "epoch": 0.9279101532857793, + "grad_norm": 0.423211932182312, + "learning_rate": 8.148845686512759e-05, + "loss": 1.9792, + "step": 3057 + }, + { + "epoch": 0.9282136894824707, + "grad_norm": 0.42972126603126526, + "learning_rate": 8.14823815309842e-05, + "loss": 1.4225, + "step": 3058 + }, + { + "epoch": 0.9285172256791623, + "grad_norm": 0.38373371958732605, + "learning_rate": 8.147630619684084e-05, + "loss": 1.6397, + "step": 3059 + }, + { + "epoch": 0.9288207618758537, + "grad_norm": 0.4351721405982971, + "learning_rate": 8.147023086269745e-05, + "loss": 1.6237, + "step": 3060 + }, + { + "epoch": 0.9291242980725452, + "grad_norm": 0.41888755559921265, + "learning_rate": 8.146415552855407e-05, + "loss": 1.7037, + "step": 3061 + }, + { + "epoch": 0.9294278342692366, + "grad_norm": 0.43660473823547363, + "learning_rate": 8.14580801944107e-05, + "loss": 1.9307, + "step": 3062 + }, + { + "epoch": 0.929731370465928, + "grad_norm": 0.4016878008842468, + "learning_rate": 8.145200486026732e-05, + "loss": 1.7994, + "step": 3063 + }, + { + "epoch": 0.9300349066626196, + "grad_norm": 0.5155421495437622, + "learning_rate": 8.144592952612395e-05, + "loss": 1.7505, + "step": 3064 + }, + { + "epoch": 0.930338442859311, + "grad_norm": 0.4258996844291687, + "learning_rate": 8.143985419198055e-05, + "loss": 1.1935, + "step": 3065 + }, + { + "epoch": 0.9306419790560024, + "grad_norm": 0.5270261168479919, + "learning_rate": 8.143377885783718e-05, + "loss": 2.095, + "step": 3066 + }, + { + "epoch": 0.9309455152526939, + "grad_norm": 0.382199764251709, + "learning_rate": 8.142770352369381e-05, + "loss": 1.9493, + "step": 3067 + }, + { + "epoch": 0.9312490514493853, + "grad_norm": 0.6669699549674988, + "learning_rate": 8.142162818955042e-05, + "loss": 2.07, + "step": 3068 + }, + { + "epoch": 0.9315525876460768, + "grad_norm": 0.3749605417251587, + "learning_rate": 8.141555285540705e-05, + "loss": 1.5008, + "step": 3069 + }, + { + "epoch": 0.9318561238427683, + "grad_norm": 0.4507908523082733, + "learning_rate": 8.140947752126368e-05, + "loss": 2.2879, + "step": 3070 + }, + { + "epoch": 0.9321596600394597, + "grad_norm": 0.42423611879348755, + "learning_rate": 8.14034021871203e-05, + "loss": 2.0068, + "step": 3071 + }, + { + "epoch": 0.9324631962361511, + "grad_norm": 0.4780293405056, + "learning_rate": 8.139732685297691e-05, + "loss": 1.9496, + "step": 3072 + }, + { + "epoch": 0.9327667324328426, + "grad_norm": 0.4152267873287201, + "learning_rate": 8.139125151883355e-05, + "loss": 1.7815, + "step": 3073 + }, + { + "epoch": 0.933070268629534, + "grad_norm": 0.40453848242759705, + "learning_rate": 8.138517618469016e-05, + "loss": 1.825, + "step": 3074 + }, + { + "epoch": 0.9333738048262256, + "grad_norm": 0.48477646708488464, + "learning_rate": 8.137910085054678e-05, + "loss": 1.7566, + "step": 3075 + }, + { + "epoch": 0.933677341022917, + "grad_norm": 0.49090731143951416, + "learning_rate": 8.13730255164034e-05, + "loss": 1.5743, + "step": 3076 + }, + { + "epoch": 0.9339808772196084, + "grad_norm": 0.44307780265808105, + "learning_rate": 8.136695018226003e-05, + "loss": 1.9686, + "step": 3077 + }, + { + "epoch": 0.9342844134162999, + "grad_norm": 0.6790413856506348, + "learning_rate": 8.136087484811666e-05, + "loss": 2.0449, + "step": 3078 + }, + { + "epoch": 0.9345879496129913, + "grad_norm": 0.46484366059303284, + "learning_rate": 8.135479951397326e-05, + "loss": 1.6846, + "step": 3079 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.5237354636192322, + "learning_rate": 8.13487241798299e-05, + "loss": 1.9181, + "step": 3080 + }, + { + "epoch": 0.9351950220063743, + "grad_norm": 0.3992574214935303, + "learning_rate": 8.134264884568652e-05, + "loss": 1.8549, + "step": 3081 + }, + { + "epoch": 0.9354985582030657, + "grad_norm": 0.37925541400909424, + "learning_rate": 8.133657351154313e-05, + "loss": 1.5961, + "step": 3082 + }, + { + "epoch": 0.9358020943997571, + "grad_norm": 2.249074935913086, + "learning_rate": 8.133049817739976e-05, + "loss": 1.7813, + "step": 3083 + }, + { + "epoch": 0.9361056305964486, + "grad_norm": 0.42430388927459717, + "learning_rate": 8.132442284325639e-05, + "loss": 1.5935, + "step": 3084 + }, + { + "epoch": 0.9364091667931401, + "grad_norm": 0.42997804284095764, + "learning_rate": 8.131834750911301e-05, + "loss": 1.7546, + "step": 3085 + }, + { + "epoch": 0.9367127029898316, + "grad_norm": 0.3883001506328583, + "learning_rate": 8.131227217496962e-05, + "loss": 1.9123, + "step": 3086 + }, + { + "epoch": 0.937016239186523, + "grad_norm": 0.39124423265457153, + "learning_rate": 8.130619684082626e-05, + "loss": 1.4895, + "step": 3087 + }, + { + "epoch": 0.9373197753832144, + "grad_norm": 0.41227900981903076, + "learning_rate": 8.130012150668287e-05, + "loss": 1.8467, + "step": 3088 + }, + { + "epoch": 0.9376233115799059, + "grad_norm": 0.40440481901168823, + "learning_rate": 8.129404617253949e-05, + "loss": 1.678, + "step": 3089 + }, + { + "epoch": 0.9379268477765974, + "grad_norm": 0.406684011220932, + "learning_rate": 8.128797083839611e-05, + "loss": 1.7328, + "step": 3090 + }, + { + "epoch": 0.9382303839732888, + "grad_norm": 0.409196138381958, + "learning_rate": 8.128189550425274e-05, + "loss": 1.645, + "step": 3091 + }, + { + "epoch": 0.9385339201699803, + "grad_norm": 0.46844586730003357, + "learning_rate": 8.127582017010936e-05, + "loss": 1.9826, + "step": 3092 + }, + { + "epoch": 0.9388374563666717, + "grad_norm": 0.4813016355037689, + "learning_rate": 8.126974483596597e-05, + "loss": 1.5903, + "step": 3093 + }, + { + "epoch": 0.9391409925633631, + "grad_norm": 0.4739225208759308, + "learning_rate": 8.12636695018226e-05, + "loss": 1.7212, + "step": 3094 + }, + { + "epoch": 0.9394445287600547, + "grad_norm": 0.39909741282463074, + "learning_rate": 8.125759416767923e-05, + "loss": 2.0123, + "step": 3095 + }, + { + "epoch": 0.9397480649567461, + "grad_norm": 0.4105834662914276, + "learning_rate": 8.125151883353584e-05, + "loss": 1.9729, + "step": 3096 + }, + { + "epoch": 0.9400516011534376, + "grad_norm": 0.41497233510017395, + "learning_rate": 8.124544349939247e-05, + "loss": 2.0673, + "step": 3097 + }, + { + "epoch": 0.940355137350129, + "grad_norm": 0.443263441324234, + "learning_rate": 8.12393681652491e-05, + "loss": 1.6216, + "step": 3098 + }, + { + "epoch": 0.9406586735468204, + "grad_norm": 0.47175195813179016, + "learning_rate": 8.123329283110572e-05, + "loss": 1.594, + "step": 3099 + }, + { + "epoch": 0.940962209743512, + "grad_norm": 0.434952974319458, + "learning_rate": 8.122721749696233e-05, + "loss": 1.8733, + "step": 3100 + }, + { + "epoch": 0.9412657459402034, + "grad_norm": 0.5037057399749756, + "learning_rate": 8.122114216281897e-05, + "loss": 1.5384, + "step": 3101 + }, + { + "epoch": 0.9415692821368948, + "grad_norm": 0.39780277013778687, + "learning_rate": 8.121506682867558e-05, + "loss": 2.0412, + "step": 3102 + }, + { + "epoch": 0.9418728183335863, + "grad_norm": 0.4376054108142853, + "learning_rate": 8.12089914945322e-05, + "loss": 1.9531, + "step": 3103 + }, + { + "epoch": 0.9421763545302777, + "grad_norm": 0.40796467661857605, + "learning_rate": 8.120291616038882e-05, + "loss": 2.062, + "step": 3104 + }, + { + "epoch": 0.9424798907269691, + "grad_norm": 0.47094616293907166, + "learning_rate": 8.119684082624545e-05, + "loss": 2.16, + "step": 3105 + }, + { + "epoch": 0.9427834269236607, + "grad_norm": 0.4758855104446411, + "learning_rate": 8.119076549210207e-05, + "loss": 1.9761, + "step": 3106 + }, + { + "epoch": 0.9430869631203521, + "grad_norm": 0.3994719088077545, + "learning_rate": 8.118469015795868e-05, + "loss": 1.8639, + "step": 3107 + }, + { + "epoch": 0.9433904993170436, + "grad_norm": 0.39443784952163696, + "learning_rate": 8.117861482381531e-05, + "loss": 1.8374, + "step": 3108 + }, + { + "epoch": 0.943694035513735, + "grad_norm": 0.3997192978858948, + "learning_rate": 8.117253948967194e-05, + "loss": 1.8491, + "step": 3109 + }, + { + "epoch": 0.9439975717104264, + "grad_norm": 0.4563603401184082, + "learning_rate": 8.116646415552855e-05, + "loss": 1.4991, + "step": 3110 + }, + { + "epoch": 0.944301107907118, + "grad_norm": 0.4601759612560272, + "learning_rate": 8.116038882138518e-05, + "loss": 1.6056, + "step": 3111 + }, + { + "epoch": 0.9446046441038094, + "grad_norm": 0.39985764026641846, + "learning_rate": 8.115431348724181e-05, + "loss": 1.391, + "step": 3112 + }, + { + "epoch": 0.9449081803005008, + "grad_norm": 0.5546020269393921, + "learning_rate": 8.114823815309843e-05, + "loss": 1.9887, + "step": 3113 + }, + { + "epoch": 0.9452117164971923, + "grad_norm": 0.4334501624107361, + "learning_rate": 8.114216281895504e-05, + "loss": 2.1764, + "step": 3114 + }, + { + "epoch": 0.9455152526938837, + "grad_norm": 0.47174403071403503, + "learning_rate": 8.113608748481168e-05, + "loss": 1.8397, + "step": 3115 + }, + { + "epoch": 0.9458187888905752, + "grad_norm": 0.4174114465713501, + "learning_rate": 8.113001215066829e-05, + "loss": 2.2144, + "step": 3116 + }, + { + "epoch": 0.9461223250872667, + "grad_norm": 0.7976917028427124, + "learning_rate": 8.112393681652491e-05, + "loss": 1.3549, + "step": 3117 + }, + { + "epoch": 0.9464258612839581, + "grad_norm": 0.3866395950317383, + "learning_rate": 8.111786148238153e-05, + "loss": 2.0, + "step": 3118 + }, + { + "epoch": 0.9467293974806495, + "grad_norm": 0.43397247791290283, + "learning_rate": 8.111178614823816e-05, + "loss": 1.7408, + "step": 3119 + }, + { + "epoch": 0.947032933677341, + "grad_norm": 0.4277322292327881, + "learning_rate": 8.110571081409478e-05, + "loss": 1.8638, + "step": 3120 + }, + { + "epoch": 0.9473364698740325, + "grad_norm": 0.38876983523368835, + "learning_rate": 8.109963547995139e-05, + "loss": 1.7645, + "step": 3121 + }, + { + "epoch": 0.947640006070724, + "grad_norm": 0.37743645906448364, + "learning_rate": 8.109356014580802e-05, + "loss": 1.9963, + "step": 3122 + }, + { + "epoch": 0.9479435422674154, + "grad_norm": 0.43921002745628357, + "learning_rate": 8.108748481166465e-05, + "loss": 2.0021, + "step": 3123 + }, + { + "epoch": 0.9482470784641068, + "grad_norm": 0.4989663362503052, + "learning_rate": 8.108140947752126e-05, + "loss": 1.6826, + "step": 3124 + }, + { + "epoch": 0.9485506146607983, + "grad_norm": 0.40931862592697144, + "learning_rate": 8.107533414337789e-05, + "loss": 1.681, + "step": 3125 + }, + { + "epoch": 0.9488541508574898, + "grad_norm": 0.44620081782341003, + "learning_rate": 8.106925880923452e-05, + "loss": 2.0738, + "step": 3126 + }, + { + "epoch": 0.9491576870541812, + "grad_norm": 0.42712563276290894, + "learning_rate": 8.106318347509114e-05, + "loss": 2.078, + "step": 3127 + }, + { + "epoch": 0.9494612232508727, + "grad_norm": 2.7110748291015625, + "learning_rate": 8.105710814094775e-05, + "loss": 1.8328, + "step": 3128 + }, + { + "epoch": 0.9497647594475641, + "grad_norm": 0.40240269899368286, + "learning_rate": 8.105103280680439e-05, + "loss": 1.8519, + "step": 3129 + }, + { + "epoch": 0.9500682956442555, + "grad_norm": 0.43927666544914246, + "learning_rate": 8.1044957472661e-05, + "loss": 1.7309, + "step": 3130 + }, + { + "epoch": 0.950371831840947, + "grad_norm": 0.4225032925605774, + "learning_rate": 8.103888213851762e-05, + "loss": 1.9518, + "step": 3131 + }, + { + "epoch": 0.9506753680376385, + "grad_norm": 0.4135547876358032, + "learning_rate": 8.103280680437424e-05, + "loss": 1.791, + "step": 3132 + }, + { + "epoch": 0.95097890423433, + "grad_norm": 0.5137977004051208, + "learning_rate": 8.102673147023087e-05, + "loss": 1.8914, + "step": 3133 + }, + { + "epoch": 0.9512824404310214, + "grad_norm": 0.44080087542533875, + "learning_rate": 8.102065613608749e-05, + "loss": 1.6005, + "step": 3134 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.4912469983100891, + "learning_rate": 8.10145808019441e-05, + "loss": 1.7609, + "step": 3135 + }, + { + "epoch": 0.9518895128244043, + "grad_norm": 0.6660062074661255, + "learning_rate": 8.100850546780073e-05, + "loss": 2.0835, + "step": 3136 + }, + { + "epoch": 0.9521930490210958, + "grad_norm": 0.39112183451652527, + "learning_rate": 8.100243013365736e-05, + "loss": 1.9975, + "step": 3137 + }, + { + "epoch": 0.9524965852177872, + "grad_norm": 0.41470736265182495, + "learning_rate": 8.099635479951397e-05, + "loss": 1.8165, + "step": 3138 + }, + { + "epoch": 0.9528001214144787, + "grad_norm": 0.6125030517578125, + "learning_rate": 8.09902794653706e-05, + "loss": 1.4878, + "step": 3139 + }, + { + "epoch": 0.9531036576111701, + "grad_norm": 0.3625620901584625, + "learning_rate": 8.098420413122723e-05, + "loss": 1.7098, + "step": 3140 + }, + { + "epoch": 0.9534071938078615, + "grad_norm": 0.3737241327762604, + "learning_rate": 8.097812879708383e-05, + "loss": 1.8225, + "step": 3141 + }, + { + "epoch": 0.9537107300045531, + "grad_norm": 0.4835364520549774, + "learning_rate": 8.097205346294047e-05, + "loss": 2.2708, + "step": 3142 + }, + { + "epoch": 0.9540142662012445, + "grad_norm": 0.3605796694755554, + "learning_rate": 8.09659781287971e-05, + "loss": 2.1616, + "step": 3143 + }, + { + "epoch": 0.954317802397936, + "grad_norm": 0.42037534713745117, + "learning_rate": 8.095990279465371e-05, + "loss": 1.8847, + "step": 3144 + }, + { + "epoch": 0.9546213385946274, + "grad_norm": 0.4341660141944885, + "learning_rate": 8.095382746051033e-05, + "loss": 1.579, + "step": 3145 + }, + { + "epoch": 0.9549248747913188, + "grad_norm": 0.39239785075187683, + "learning_rate": 8.094775212636695e-05, + "loss": 1.9024, + "step": 3146 + }, + { + "epoch": 0.9552284109880104, + "grad_norm": 0.4219903349876404, + "learning_rate": 8.094167679222358e-05, + "loss": 1.7518, + "step": 3147 + }, + { + "epoch": 0.9555319471847018, + "grad_norm": 0.36863937973976135, + "learning_rate": 8.09356014580802e-05, + "loss": 2.0326, + "step": 3148 + }, + { + "epoch": 0.9558354833813932, + "grad_norm": 0.4089399576187134, + "learning_rate": 8.092952612393681e-05, + "loss": 1.9478, + "step": 3149 + }, + { + "epoch": 0.9561390195780847, + "grad_norm": 0.3865533769130707, + "learning_rate": 8.092345078979344e-05, + "loss": 1.883, + "step": 3150 + }, + { + "epoch": 0.9564425557747761, + "grad_norm": 0.3673511743545532, + "learning_rate": 8.091737545565008e-05, + "loss": 1.8116, + "step": 3151 + }, + { + "epoch": 0.9567460919714676, + "grad_norm": 0.4296679198741913, + "learning_rate": 8.091130012150668e-05, + "loss": 1.891, + "step": 3152 + }, + { + "epoch": 0.9570496281681591, + "grad_norm": 0.3618902266025543, + "learning_rate": 8.090522478736331e-05, + "loss": 1.8951, + "step": 3153 + }, + { + "epoch": 0.9573531643648505, + "grad_norm": 0.3620889484882355, + "learning_rate": 8.089914945321994e-05, + "loss": 1.5316, + "step": 3154 + }, + { + "epoch": 0.957656700561542, + "grad_norm": 0.4978037178516388, + "learning_rate": 8.089307411907654e-05, + "loss": 1.6175, + "step": 3155 + }, + { + "epoch": 0.9579602367582334, + "grad_norm": 0.4385554790496826, + "learning_rate": 8.088699878493318e-05, + "loss": 1.8811, + "step": 3156 + }, + { + "epoch": 0.9582637729549248, + "grad_norm": 0.42445600032806396, + "learning_rate": 8.08809234507898e-05, + "loss": 1.9388, + "step": 3157 + }, + { + "epoch": 0.9585673091516164, + "grad_norm": 0.4952315092086792, + "learning_rate": 8.087484811664642e-05, + "loss": 2.0404, + "step": 3158 + }, + { + "epoch": 0.9588708453483078, + "grad_norm": 0.3969573676586151, + "learning_rate": 8.086877278250304e-05, + "loss": 1.818, + "step": 3159 + }, + { + "epoch": 0.9591743815449992, + "grad_norm": 0.41406628489494324, + "learning_rate": 8.086269744835966e-05, + "loss": 2.0014, + "step": 3160 + }, + { + "epoch": 0.9594779177416907, + "grad_norm": 0.40631070733070374, + "learning_rate": 8.085662211421629e-05, + "loss": 1.7627, + "step": 3161 + }, + { + "epoch": 0.9597814539383821, + "grad_norm": 0.41568198800086975, + "learning_rate": 8.08505467800729e-05, + "loss": 2.0605, + "step": 3162 + }, + { + "epoch": 0.9600849901350736, + "grad_norm": 0.39019855856895447, + "learning_rate": 8.084447144592952e-05, + "loss": 2.3052, + "step": 3163 + }, + { + "epoch": 0.9603885263317651, + "grad_norm": 0.42019182443618774, + "learning_rate": 8.083839611178615e-05, + "loss": 1.1618, + "step": 3164 + }, + { + "epoch": 0.9606920625284565, + "grad_norm": 0.3448597192764282, + "learning_rate": 8.083232077764277e-05, + "loss": 1.1122, + "step": 3165 + }, + { + "epoch": 0.960995598725148, + "grad_norm": 0.3484005928039551, + "learning_rate": 8.082624544349939e-05, + "loss": 1.3973, + "step": 3166 + }, + { + "epoch": 0.9612991349218394, + "grad_norm": 0.3877616822719574, + "learning_rate": 8.082017010935602e-05, + "loss": 1.7281, + "step": 3167 + }, + { + "epoch": 0.9616026711185309, + "grad_norm": 0.7124067544937134, + "learning_rate": 8.081409477521265e-05, + "loss": 1.7848, + "step": 3168 + }, + { + "epoch": 0.9619062073152224, + "grad_norm": 0.4344068467617035, + "learning_rate": 8.080801944106925e-05, + "loss": 1.1598, + "step": 3169 + }, + { + "epoch": 0.9622097435119138, + "grad_norm": 0.8230828046798706, + "learning_rate": 8.080194410692589e-05, + "loss": 1.7979, + "step": 3170 + }, + { + "epoch": 0.9625132797086052, + "grad_norm": 0.45202380418777466, + "learning_rate": 8.07958687727825e-05, + "loss": 1.9827, + "step": 3171 + }, + { + "epoch": 0.9628168159052967, + "grad_norm": 0.37519025802612305, + "learning_rate": 8.078979343863913e-05, + "loss": 1.9966, + "step": 3172 + }, + { + "epoch": 0.9631203521019882, + "grad_norm": 0.42726776003837585, + "learning_rate": 8.078371810449575e-05, + "loss": 1.9923, + "step": 3173 + }, + { + "epoch": 0.9634238882986796, + "grad_norm": 0.5753629207611084, + "learning_rate": 8.077764277035237e-05, + "loss": 1.695, + "step": 3174 + }, + { + "epoch": 0.9637274244953711, + "grad_norm": 0.44009268283843994, + "learning_rate": 8.0771567436209e-05, + "loss": 1.5039, + "step": 3175 + }, + { + "epoch": 0.9640309606920625, + "grad_norm": 0.42067059874534607, + "learning_rate": 8.076549210206562e-05, + "loss": 1.8273, + "step": 3176 + }, + { + "epoch": 0.964334496888754, + "grad_norm": 0.44108089804649353, + "learning_rate": 8.075941676792223e-05, + "loss": 1.6372, + "step": 3177 + }, + { + "epoch": 0.9646380330854455, + "grad_norm": 0.39648228883743286, + "learning_rate": 8.075334143377886e-05, + "loss": 2.1179, + "step": 3178 + }, + { + "epoch": 0.9649415692821369, + "grad_norm": 0.45205631852149963, + "learning_rate": 8.074726609963548e-05, + "loss": 1.9005, + "step": 3179 + }, + { + "epoch": 0.9652451054788284, + "grad_norm": 0.45935380458831787, + "learning_rate": 8.07411907654921e-05, + "loss": 1.7634, + "step": 3180 + }, + { + "epoch": 0.9655486416755198, + "grad_norm": 0.3873693645000458, + "learning_rate": 8.073511543134873e-05, + "loss": 1.9629, + "step": 3181 + }, + { + "epoch": 0.9658521778722112, + "grad_norm": 0.3731973469257355, + "learning_rate": 8.072904009720536e-05, + "loss": 2.0085, + "step": 3182 + }, + { + "epoch": 0.9661557140689028, + "grad_norm": 0.45661619305610657, + "learning_rate": 8.072296476306196e-05, + "loss": 1.3693, + "step": 3183 + }, + { + "epoch": 0.9664592502655942, + "grad_norm": 0.47569990158081055, + "learning_rate": 8.07168894289186e-05, + "loss": 1.6855, + "step": 3184 + }, + { + "epoch": 0.9667627864622856, + "grad_norm": 0.4035504460334778, + "learning_rate": 8.071081409477521e-05, + "loss": 2.1902, + "step": 3185 + }, + { + "epoch": 0.9670663226589771, + "grad_norm": 0.6134029030799866, + "learning_rate": 8.070473876063184e-05, + "loss": 2.0059, + "step": 3186 + }, + { + "epoch": 0.9673698588556685, + "grad_norm": 0.5165479779243469, + "learning_rate": 8.069866342648846e-05, + "loss": 1.6179, + "step": 3187 + }, + { + "epoch": 0.9676733950523599, + "grad_norm": 0.48403364419937134, + "learning_rate": 8.069258809234508e-05, + "loss": 2.1823, + "step": 3188 + }, + { + "epoch": 0.9679769312490515, + "grad_norm": 0.4141898453235626, + "learning_rate": 8.068651275820171e-05, + "loss": 1.7823, + "step": 3189 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.4937414228916168, + "learning_rate": 8.068043742405833e-05, + "loss": 1.5423, + "step": 3190 + }, + { + "epoch": 0.9685840036424344, + "grad_norm": 0.39996278285980225, + "learning_rate": 8.067436208991494e-05, + "loss": 1.8201, + "step": 3191 + }, + { + "epoch": 0.9688875398391258, + "grad_norm": 0.4205084443092346, + "learning_rate": 8.066828675577157e-05, + "loss": 2.1392, + "step": 3192 + }, + { + "epoch": 0.9691910760358172, + "grad_norm": 0.43701469898223877, + "learning_rate": 8.066221142162819e-05, + "loss": 1.8178, + "step": 3193 + }, + { + "epoch": 0.9694946122325088, + "grad_norm": 0.45265719294548035, + "learning_rate": 8.065613608748481e-05, + "loss": 1.6506, + "step": 3194 + }, + { + "epoch": 0.9697981484292002, + "grad_norm": 0.43316105008125305, + "learning_rate": 8.065006075334144e-05, + "loss": 1.6709, + "step": 3195 + }, + { + "epoch": 0.9701016846258916, + "grad_norm": 0.37833353877067566, + "learning_rate": 8.064398541919807e-05, + "loss": 1.4646, + "step": 3196 + }, + { + "epoch": 0.9704052208225831, + "grad_norm": 0.444698691368103, + "learning_rate": 8.063791008505467e-05, + "loss": 1.6426, + "step": 3197 + }, + { + "epoch": 0.9707087570192745, + "grad_norm": 0.43853360414505005, + "learning_rate": 8.06318347509113e-05, + "loss": 2.1562, + "step": 3198 + }, + { + "epoch": 0.971012293215966, + "grad_norm": 0.6573916673660278, + "learning_rate": 8.062575941676792e-05, + "loss": 2.1327, + "step": 3199 + }, + { + "epoch": 0.9713158294126575, + "grad_norm": 0.41661205887794495, + "learning_rate": 8.061968408262455e-05, + "loss": 1.68, + "step": 3200 + }, + { + "epoch": 0.9716193656093489, + "grad_norm": 0.7264708876609802, + "learning_rate": 8.061360874848117e-05, + "loss": 1.8907, + "step": 3201 + }, + { + "epoch": 0.9719229018060404, + "grad_norm": 0.3977676331996918, + "learning_rate": 8.060753341433779e-05, + "loss": 1.9205, + "step": 3202 + }, + { + "epoch": 0.9722264380027318, + "grad_norm": 0.7518191337585449, + "learning_rate": 8.060145808019442e-05, + "loss": 1.4833, + "step": 3203 + }, + { + "epoch": 0.9725299741994233, + "grad_norm": 0.4262489080429077, + "learning_rate": 8.059538274605104e-05, + "loss": 1.8307, + "step": 3204 + }, + { + "epoch": 0.9728335103961148, + "grad_norm": 0.40081748366355896, + "learning_rate": 8.058930741190765e-05, + "loss": 0.9998, + "step": 3205 + }, + { + "epoch": 0.9731370465928062, + "grad_norm": 0.46323978900909424, + "learning_rate": 8.058323207776428e-05, + "loss": 1.9063, + "step": 3206 + }, + { + "epoch": 0.9734405827894976, + "grad_norm": 0.40302255749702454, + "learning_rate": 8.05771567436209e-05, + "loss": 1.8153, + "step": 3207 + }, + { + "epoch": 0.9737441189861891, + "grad_norm": 0.4447222948074341, + "learning_rate": 8.057108140947752e-05, + "loss": 2.0744, + "step": 3208 + }, + { + "epoch": 0.9740476551828806, + "grad_norm": 0.33706068992614746, + "learning_rate": 8.056500607533415e-05, + "loss": 1.7981, + "step": 3209 + }, + { + "epoch": 0.974351191379572, + "grad_norm": 0.44239774346351624, + "learning_rate": 8.055893074119078e-05, + "loss": 1.7168, + "step": 3210 + }, + { + "epoch": 0.9746547275762635, + "grad_norm": 0.41518473625183105, + "learning_rate": 8.055285540704738e-05, + "loss": 1.9278, + "step": 3211 + }, + { + "epoch": 0.9749582637729549, + "grad_norm": 0.4727713167667389, + "learning_rate": 8.054678007290402e-05, + "loss": 1.6719, + "step": 3212 + }, + { + "epoch": 0.9752617999696463, + "grad_norm": 0.4056665599346161, + "learning_rate": 8.054070473876063e-05, + "loss": 1.7211, + "step": 3213 + }, + { + "epoch": 0.9755653361663378, + "grad_norm": 0.47795236110687256, + "learning_rate": 8.053462940461726e-05, + "loss": 1.5074, + "step": 3214 + }, + { + "epoch": 0.9758688723630293, + "grad_norm": 0.5859802961349487, + "learning_rate": 8.052855407047388e-05, + "loss": 1.7038, + "step": 3215 + }, + { + "epoch": 0.9761724085597208, + "grad_norm": 0.398113876581192, + "learning_rate": 8.05224787363305e-05, + "loss": 1.7802, + "step": 3216 + }, + { + "epoch": 0.9764759447564122, + "grad_norm": 0.3661412000656128, + "learning_rate": 8.051640340218713e-05, + "loss": 1.4388, + "step": 3217 + }, + { + "epoch": 0.9767794809531036, + "grad_norm": 0.40348801016807556, + "learning_rate": 8.051032806804375e-05, + "loss": 1.8187, + "step": 3218 + }, + { + "epoch": 0.9770830171497951, + "grad_norm": 0.3885161876678467, + "learning_rate": 8.050425273390036e-05, + "loss": 1.7975, + "step": 3219 + }, + { + "epoch": 0.9773865533464866, + "grad_norm": 0.3739737868309021, + "learning_rate": 8.0498177399757e-05, + "loss": 1.7525, + "step": 3220 + }, + { + "epoch": 0.977690089543178, + "grad_norm": 0.48323333263397217, + "learning_rate": 8.049210206561361e-05, + "loss": 1.3203, + "step": 3221 + }, + { + "epoch": 0.9779936257398695, + "grad_norm": 0.3983123004436493, + "learning_rate": 8.048602673147023e-05, + "loss": 1.994, + "step": 3222 + }, + { + "epoch": 0.9782971619365609, + "grad_norm": 0.4193548560142517, + "learning_rate": 8.047995139732686e-05, + "loss": 1.811, + "step": 3223 + }, + { + "epoch": 0.9786006981332523, + "grad_norm": 0.353444367647171, + "learning_rate": 8.047387606318349e-05, + "loss": 1.7789, + "step": 3224 + }, + { + "epoch": 0.9789042343299439, + "grad_norm": 0.5068827867507935, + "learning_rate": 8.04678007290401e-05, + "loss": 1.9425, + "step": 3225 + }, + { + "epoch": 0.9792077705266353, + "grad_norm": 0.42118749022483826, + "learning_rate": 8.046172539489673e-05, + "loss": 1.8055, + "step": 3226 + }, + { + "epoch": 0.9795113067233268, + "grad_norm": 0.4077788293361664, + "learning_rate": 8.045565006075334e-05, + "loss": 1.3257, + "step": 3227 + }, + { + "epoch": 0.9798148429200182, + "grad_norm": 0.45757341384887695, + "learning_rate": 8.044957472660996e-05, + "loss": 1.998, + "step": 3228 + }, + { + "epoch": 0.9801183791167096, + "grad_norm": 0.4154861271381378, + "learning_rate": 8.044349939246659e-05, + "loss": 1.8151, + "step": 3229 + }, + { + "epoch": 0.9804219153134012, + "grad_norm": 0.48109593987464905, + "learning_rate": 8.043742405832321e-05, + "loss": 1.7802, + "step": 3230 + }, + { + "epoch": 0.9807254515100926, + "grad_norm": 0.4576222002506256, + "learning_rate": 8.043134872417984e-05, + "loss": 1.7928, + "step": 3231 + }, + { + "epoch": 0.981028987706784, + "grad_norm": 0.38525086641311646, + "learning_rate": 8.042527339003646e-05, + "loss": 1.4135, + "step": 3232 + }, + { + "epoch": 0.9813325239034755, + "grad_norm": 0.3866974115371704, + "learning_rate": 8.041919805589307e-05, + "loss": 1.9068, + "step": 3233 + }, + { + "epoch": 0.9816360601001669, + "grad_norm": 0.421790212392807, + "learning_rate": 8.04131227217497e-05, + "loss": 1.8243, + "step": 3234 + }, + { + "epoch": 0.9819395962968585, + "grad_norm": 0.5579865574836731, + "learning_rate": 8.040704738760632e-05, + "loss": 1.8662, + "step": 3235 + }, + { + "epoch": 0.9822431324935499, + "grad_norm": 0.5178837180137634, + "learning_rate": 8.040097205346294e-05, + "loss": 1.815, + "step": 3236 + }, + { + "epoch": 0.9825466686902413, + "grad_norm": 0.3817935883998871, + "learning_rate": 8.039489671931957e-05, + "loss": 2.2422, + "step": 3237 + }, + { + "epoch": 0.9828502048869328, + "grad_norm": 0.4080420136451721, + "learning_rate": 8.038882138517619e-05, + "loss": 2.0504, + "step": 3238 + }, + { + "epoch": 0.9831537410836242, + "grad_norm": 0.3719751536846161, + "learning_rate": 8.03827460510328e-05, + "loss": 1.9778, + "step": 3239 + }, + { + "epoch": 0.9834572772803156, + "grad_norm": 0.36413270235061646, + "learning_rate": 8.037667071688944e-05, + "loss": 1.3326, + "step": 3240 + }, + { + "epoch": 0.9837608134770072, + "grad_norm": 0.3482026755809784, + "learning_rate": 8.037059538274605e-05, + "loss": 1.7243, + "step": 3241 + }, + { + "epoch": 0.9840643496736986, + "grad_norm": 0.3891375958919525, + "learning_rate": 8.036452004860267e-05, + "loss": 1.8408, + "step": 3242 + }, + { + "epoch": 0.98436788587039, + "grad_norm": 0.4400385618209839, + "learning_rate": 8.03584447144593e-05, + "loss": 1.967, + "step": 3243 + }, + { + "epoch": 0.9846714220670815, + "grad_norm": 0.3769470751285553, + "learning_rate": 8.035236938031592e-05, + "loss": 1.3612, + "step": 3244 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.39424487948417664, + "learning_rate": 8.034629404617255e-05, + "loss": 1.8445, + "step": 3245 + }, + { + "epoch": 0.9852784944604644, + "grad_norm": 0.4074876308441162, + "learning_rate": 8.034021871202917e-05, + "loss": 1.9461, + "step": 3246 + }, + { + "epoch": 0.9855820306571559, + "grad_norm": 0.4052838683128357, + "learning_rate": 8.033414337788578e-05, + "loss": 1.6049, + "step": 3247 + }, + { + "epoch": 0.9858855668538473, + "grad_norm": 0.4411472678184509, + "learning_rate": 8.032806804374241e-05, + "loss": 2.0511, + "step": 3248 + }, + { + "epoch": 0.9861891030505388, + "grad_norm": 0.37311851978302, + "learning_rate": 8.032199270959903e-05, + "loss": 1.9176, + "step": 3249 + }, + { + "epoch": 0.9864926392472302, + "grad_norm": 0.3146267235279083, + "learning_rate": 8.031591737545565e-05, + "loss": 1.813, + "step": 3250 + }, + { + "epoch": 0.9867961754439217, + "grad_norm": 0.5194718241691589, + "learning_rate": 8.030984204131228e-05, + "loss": 2.0425, + "step": 3251 + }, + { + "epoch": 0.9870997116406132, + "grad_norm": 0.37070557475090027, + "learning_rate": 8.03037667071689e-05, + "loss": 1.8255, + "step": 3252 + }, + { + "epoch": 0.9874032478373046, + "grad_norm": 0.37021851539611816, + "learning_rate": 8.029769137302551e-05, + "loss": 2.2163, + "step": 3253 + }, + { + "epoch": 0.987706784033996, + "grad_norm": 0.5118260979652405, + "learning_rate": 8.029161603888215e-05, + "loss": 1.4771, + "step": 3254 + }, + { + "epoch": 0.9880103202306875, + "grad_norm": 0.4722789227962494, + "learning_rate": 8.028554070473876e-05, + "loss": 1.9019, + "step": 3255 + }, + { + "epoch": 0.988313856427379, + "grad_norm": 0.3468252420425415, + "learning_rate": 8.027946537059538e-05, + "loss": 1.7159, + "step": 3256 + }, + { + "epoch": 0.9886173926240704, + "grad_norm": 0.4422720968723297, + "learning_rate": 8.027339003645201e-05, + "loss": 1.8671, + "step": 3257 + }, + { + "epoch": 0.9889209288207619, + "grad_norm": 0.46859246492385864, + "learning_rate": 8.026731470230863e-05, + "loss": 2.0586, + "step": 3258 + }, + { + "epoch": 0.9892244650174533, + "grad_norm": 0.46339279413223267, + "learning_rate": 8.026123936816526e-05, + "loss": 1.7544, + "step": 3259 + }, + { + "epoch": 0.9895280012141447, + "grad_norm": 0.3819115161895752, + "learning_rate": 8.025516403402188e-05, + "loss": 1.9807, + "step": 3260 + }, + { + "epoch": 0.9898315374108363, + "grad_norm": 0.8981953263282776, + "learning_rate": 8.02490886998785e-05, + "loss": 1.5445, + "step": 3261 + }, + { + "epoch": 0.9901350736075277, + "grad_norm": 0.4964045584201813, + "learning_rate": 8.024301336573512e-05, + "loss": 1.6069, + "step": 3262 + }, + { + "epoch": 0.9904386098042192, + "grad_norm": 0.4120222330093384, + "learning_rate": 8.023693803159174e-05, + "loss": 1.8865, + "step": 3263 + }, + { + "epoch": 0.9907421460009106, + "grad_norm": 0.44508838653564453, + "learning_rate": 8.023086269744836e-05, + "loss": 1.4333, + "step": 3264 + }, + { + "epoch": 0.991045682197602, + "grad_norm": 0.3859883248806, + "learning_rate": 8.022478736330499e-05, + "loss": 1.9533, + "step": 3265 + }, + { + "epoch": 0.9913492183942936, + "grad_norm": 0.4727214276790619, + "learning_rate": 8.021871202916161e-05, + "loss": 1.4972, + "step": 3266 + }, + { + "epoch": 0.991652754590985, + "grad_norm": 0.8692718148231506, + "learning_rate": 8.021263669501822e-05, + "loss": 1.5174, + "step": 3267 + }, + { + "epoch": 0.9919562907876764, + "grad_norm": 0.4142051637172699, + "learning_rate": 8.020656136087486e-05, + "loss": 1.8747, + "step": 3268 + }, + { + "epoch": 0.9922598269843679, + "grad_norm": 0.4075202941894531, + "learning_rate": 8.020048602673147e-05, + "loss": 1.9666, + "step": 3269 + }, + { + "epoch": 0.9925633631810593, + "grad_norm": 0.7073702216148376, + "learning_rate": 8.019441069258809e-05, + "loss": 1.9629, + "step": 3270 + }, + { + "epoch": 0.9928668993777507, + "grad_norm": 0.4240557849407196, + "learning_rate": 8.018833535844472e-05, + "loss": 1.6251, + "step": 3271 + }, + { + "epoch": 0.9931704355744423, + "grad_norm": 0.4226653277873993, + "learning_rate": 8.018226002430134e-05, + "loss": 1.9254, + "step": 3272 + }, + { + "epoch": 0.9934739717711337, + "grad_norm": 0.40740150213241577, + "learning_rate": 8.017618469015797e-05, + "loss": 1.8494, + "step": 3273 + }, + { + "epoch": 0.9937775079678252, + "grad_norm": 0.4575270712375641, + "learning_rate": 8.017010935601459e-05, + "loss": 1.5237, + "step": 3274 + }, + { + "epoch": 0.9940810441645166, + "grad_norm": 0.48337459564208984, + "learning_rate": 8.01640340218712e-05, + "loss": 1.5319, + "step": 3275 + }, + { + "epoch": 0.994384580361208, + "grad_norm": 0.3628256916999817, + "learning_rate": 8.015795868772783e-05, + "loss": 2.0454, + "step": 3276 + }, + { + "epoch": 0.9946881165578996, + "grad_norm": 0.3945721387863159, + "learning_rate": 8.015188335358445e-05, + "loss": 2.0646, + "step": 3277 + }, + { + "epoch": 0.994991652754591, + "grad_norm": 0.38345763087272644, + "learning_rate": 8.014580801944107e-05, + "loss": 1.4601, + "step": 3278 + }, + { + "epoch": 0.9952951889512824, + "grad_norm": 0.47283461689949036, + "learning_rate": 8.01397326852977e-05, + "loss": 1.6951, + "step": 3279 + }, + { + "epoch": 0.9955987251479739, + "grad_norm": 0.47353407740592957, + "learning_rate": 8.013365735115432e-05, + "loss": 2.0192, + "step": 3280 + }, + { + "epoch": 0.9959022613446653, + "grad_norm": 0.41565829515457153, + "learning_rate": 8.012758201701093e-05, + "loss": 1.8168, + "step": 3281 + }, + { + "epoch": 0.9962057975413569, + "grad_norm": 0.43817979097366333, + "learning_rate": 8.012150668286757e-05, + "loss": 2.1453, + "step": 3282 + }, + { + "epoch": 0.9965093337380483, + "grad_norm": 0.5480432510375977, + "learning_rate": 8.011543134872418e-05, + "loss": 1.6872, + "step": 3283 + }, + { + "epoch": 0.9968128699347397, + "grad_norm": 0.344694584608078, + "learning_rate": 8.01093560145808e-05, + "loss": 1.0506, + "step": 3284 + }, + { + "epoch": 0.9971164061314312, + "grad_norm": 0.39683830738067627, + "learning_rate": 8.010328068043743e-05, + "loss": 1.9114, + "step": 3285 + }, + { + "epoch": 0.9974199423281226, + "grad_norm": 0.865807294845581, + "learning_rate": 8.009720534629405e-05, + "loss": 2.0548, + "step": 3286 + }, + { + "epoch": 0.9977234785248141, + "grad_norm": 0.4596058130264282, + "learning_rate": 8.009113001215068e-05, + "loss": 1.5162, + "step": 3287 + }, + { + "epoch": 0.9980270147215056, + "grad_norm": 0.45966169238090515, + "learning_rate": 8.00850546780073e-05, + "loss": 1.9112, + "step": 3288 + }, + { + "epoch": 0.998330550918197, + "grad_norm": 0.44408029317855835, + "learning_rate": 8.007897934386391e-05, + "loss": 1.7549, + "step": 3289 + }, + { + "epoch": 0.9986340871148884, + "grad_norm": 0.4286332428455353, + "learning_rate": 8.007290400972054e-05, + "loss": 1.9614, + "step": 3290 + }, + { + "epoch": 0.9989376233115799, + "grad_norm": 0.40551066398620605, + "learning_rate": 8.006682867557716e-05, + "loss": 1.9918, + "step": 3291 + }, + { + "epoch": 0.9992411595082714, + "grad_norm": 0.41468697786331177, + "learning_rate": 8.006075334143378e-05, + "loss": 1.9249, + "step": 3292 + }, + { + "epoch": 0.9995446957049628, + "grad_norm": 0.506384551525116, + "learning_rate": 8.005467800729041e-05, + "loss": 1.9334, + "step": 3293 + }, + { + "epoch": 0.9998482319016543, + "grad_norm": 0.4209151268005371, + "learning_rate": 8.004860267314703e-05, + "loss": 1.728, + "step": 3294 + }, + { + "epoch": 1.0001517680983458, + "grad_norm": 17.907875061035156, + "learning_rate": 8.004252733900364e-05, + "loss": 2.4794, + "step": 3295 + }, + { + "epoch": 1.0004553042950373, + "grad_norm": 0.46547284722328186, + "learning_rate": 8.003645200486028e-05, + "loss": 1.4745, + "step": 3296 + }, + { + "epoch": 1.0007588404917287, + "grad_norm": 0.3899800777435303, + "learning_rate": 8.003037667071689e-05, + "loss": 1.5406, + "step": 3297 + }, + { + "epoch": 1.0010623766884201, + "grad_norm": 0.48273205757141113, + "learning_rate": 8.002430133657351e-05, + "loss": 1.4638, + "step": 3298 + }, + { + "epoch": 1.0013659128851116, + "grad_norm": 0.36288753151893616, + "learning_rate": 8.001822600243014e-05, + "loss": 1.8582, + "step": 3299 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.4598756432533264, + "learning_rate": 8.001215066828676e-05, + "loss": 1.1343, + "step": 3300 + }, + { + "epoch": 1.0019729852784944, + "grad_norm": 0.4313514530658722, + "learning_rate": 8.000607533414338e-05, + "loss": 1.6411, + "step": 3301 + }, + { + "epoch": 1.0022765214751859, + "grad_norm": 0.5020793676376343, + "learning_rate": 8e-05, + "loss": 1.8333, + "step": 3302 + }, + { + "epoch": 1.0025800576718773, + "grad_norm": 0.7939902544021606, + "learning_rate": 7.999392466585662e-05, + "loss": 1.0223, + "step": 3303 + }, + { + "epoch": 1.0028835938685687, + "grad_norm": 0.42683956027030945, + "learning_rate": 7.998784933171325e-05, + "loss": 1.6555, + "step": 3304 + }, + { + "epoch": 1.0031871300652604, + "grad_norm": 1.1804200410842896, + "learning_rate": 7.998177399756987e-05, + "loss": 1.2333, + "step": 3305 + }, + { + "epoch": 1.0034906662619518, + "grad_norm": 0.4585864543914795, + "learning_rate": 7.997569866342649e-05, + "loss": 1.7396, + "step": 3306 + }, + { + "epoch": 1.0037942024586433, + "grad_norm": 0.6374893188476562, + "learning_rate": 7.996962332928312e-05, + "loss": 1.4289, + "step": 3307 + }, + { + "epoch": 1.0040977386553347, + "grad_norm": 0.7758880853652954, + "learning_rate": 7.996354799513974e-05, + "loss": 1.2918, + "step": 3308 + }, + { + "epoch": 1.0044012748520261, + "grad_norm": 0.7899906039237976, + "learning_rate": 7.995747266099635e-05, + "loss": 0.9088, + "step": 3309 + }, + { + "epoch": 1.0047048110487176, + "grad_norm": 0.5590714812278748, + "learning_rate": 7.995139732685299e-05, + "loss": 1.2513, + "step": 3310 + }, + { + "epoch": 1.005008347245409, + "grad_norm": 0.49430859088897705, + "learning_rate": 7.99453219927096e-05, + "loss": 1.5882, + "step": 3311 + }, + { + "epoch": 1.0053118834421004, + "grad_norm": 0.4428652822971344, + "learning_rate": 7.993924665856622e-05, + "loss": 1.6225, + "step": 3312 + }, + { + "epoch": 1.0056154196387919, + "grad_norm": 1.8553460836410522, + "learning_rate": 7.993317132442285e-05, + "loss": 1.3601, + "step": 3313 + }, + { + "epoch": 1.0059189558354833, + "grad_norm": 0.5211709141731262, + "learning_rate": 7.992709599027947e-05, + "loss": 1.8398, + "step": 3314 + }, + { + "epoch": 1.0062224920321747, + "grad_norm": 0.7685166001319885, + "learning_rate": 7.992102065613609e-05, + "loss": 1.5013, + "step": 3315 + }, + { + "epoch": 1.0065260282288664, + "grad_norm": 0.4375928044319153, + "learning_rate": 7.991494532199272e-05, + "loss": 1.8564, + "step": 3316 + }, + { + "epoch": 1.0068295644255578, + "grad_norm": 0.44753187894821167, + "learning_rate": 7.990886998784933e-05, + "loss": 1.3722, + "step": 3317 + }, + { + "epoch": 1.0071331006222493, + "grad_norm": 0.48083680868148804, + "learning_rate": 7.990279465370596e-05, + "loss": 1.4704, + "step": 3318 + }, + { + "epoch": 1.0074366368189407, + "grad_norm": 0.3680810332298279, + "learning_rate": 7.989671931956258e-05, + "loss": 1.2053, + "step": 3319 + }, + { + "epoch": 1.0077401730156321, + "grad_norm": 0.37688201665878296, + "learning_rate": 7.98906439854192e-05, + "loss": 1.9585, + "step": 3320 + }, + { + "epoch": 1.0080437092123236, + "grad_norm": 0.4439717233181, + "learning_rate": 7.988456865127583e-05, + "loss": 1.6347, + "step": 3321 + }, + { + "epoch": 1.008347245409015, + "grad_norm": 0.44323423504829407, + "learning_rate": 7.987849331713245e-05, + "loss": 1.7071, + "step": 3322 + }, + { + "epoch": 1.0086507816057064, + "grad_norm": 0.44141215085983276, + "learning_rate": 7.987241798298906e-05, + "loss": 1.5686, + "step": 3323 + }, + { + "epoch": 1.0089543178023979, + "grad_norm": 0.3377261459827423, + "learning_rate": 7.98663426488457e-05, + "loss": 1.0637, + "step": 3324 + }, + { + "epoch": 1.0092578539990893, + "grad_norm": 0.679061770439148, + "learning_rate": 7.986026731470231e-05, + "loss": 1.6332, + "step": 3325 + }, + { + "epoch": 1.009561390195781, + "grad_norm": 0.40934574604034424, + "learning_rate": 7.985419198055893e-05, + "loss": 1.591, + "step": 3326 + }, + { + "epoch": 1.0098649263924724, + "grad_norm": 0.4708541929721832, + "learning_rate": 7.984811664641556e-05, + "loss": 1.4567, + "step": 3327 + }, + { + "epoch": 1.0101684625891638, + "grad_norm": 0.4251214861869812, + "learning_rate": 7.984204131227218e-05, + "loss": 1.7905, + "step": 3328 + }, + { + "epoch": 1.0104719987858553, + "grad_norm": 0.48691290616989136, + "learning_rate": 7.98359659781288e-05, + "loss": 1.8187, + "step": 3329 + }, + { + "epoch": 1.0107755349825467, + "grad_norm": 0.4369681179523468, + "learning_rate": 7.982989064398543e-05, + "loss": 1.5784, + "step": 3330 + }, + { + "epoch": 1.0110790711792381, + "grad_norm": 0.47362881898880005, + "learning_rate": 7.982381530984204e-05, + "loss": 1.2642, + "step": 3331 + }, + { + "epoch": 1.0113826073759296, + "grad_norm": 0.4974597096443176, + "learning_rate": 7.981773997569867e-05, + "loss": 1.6925, + "step": 3332 + }, + { + "epoch": 1.011686143572621, + "grad_norm": 0.46564406156539917, + "learning_rate": 7.981166464155529e-05, + "loss": 1.6209, + "step": 3333 + }, + { + "epoch": 1.0119896797693124, + "grad_norm": 0.4477474093437195, + "learning_rate": 7.980558930741191e-05, + "loss": 1.8006, + "step": 3334 + }, + { + "epoch": 1.0122932159660039, + "grad_norm": 0.4635123312473297, + "learning_rate": 7.979951397326854e-05, + "loss": 1.818, + "step": 3335 + }, + { + "epoch": 1.0125967521626955, + "grad_norm": 0.43166083097457886, + "learning_rate": 7.979343863912516e-05, + "loss": 1.3599, + "step": 3336 + }, + { + "epoch": 1.012900288359387, + "grad_norm": 0.39611899852752686, + "learning_rate": 7.978736330498177e-05, + "loss": 1.4963, + "step": 3337 + }, + { + "epoch": 1.0132038245560784, + "grad_norm": 0.917677104473114, + "learning_rate": 7.97812879708384e-05, + "loss": 1.3816, + "step": 3338 + }, + { + "epoch": 1.0135073607527698, + "grad_norm": 0.4772632420063019, + "learning_rate": 7.977521263669502e-05, + "loss": 1.699, + "step": 3339 + }, + { + "epoch": 1.0138108969494612, + "grad_norm": 0.5998721122741699, + "learning_rate": 7.976913730255164e-05, + "loss": 1.5147, + "step": 3340 + }, + { + "epoch": 1.0141144331461527, + "grad_norm": 0.47984611988067627, + "learning_rate": 7.976306196840827e-05, + "loss": 1.4805, + "step": 3341 + }, + { + "epoch": 1.0144179693428441, + "grad_norm": 0.4247418940067291, + "learning_rate": 7.975698663426489e-05, + "loss": 1.3944, + "step": 3342 + }, + { + "epoch": 1.0147215055395356, + "grad_norm": 0.5164505839347839, + "learning_rate": 7.97509113001215e-05, + "loss": 1.4708, + "step": 3343 + }, + { + "epoch": 1.015025041736227, + "grad_norm": 0.43840450048446655, + "learning_rate": 7.974483596597814e-05, + "loss": 1.5382, + "step": 3344 + }, + { + "epoch": 1.0153285779329184, + "grad_norm": 0.49245715141296387, + "learning_rate": 7.973876063183475e-05, + "loss": 1.6582, + "step": 3345 + }, + { + "epoch": 1.0156321141296099, + "grad_norm": 0.6701889634132385, + "learning_rate": 7.973268529769138e-05, + "loss": 1.8984, + "step": 3346 + }, + { + "epoch": 1.0159356503263015, + "grad_norm": 0.4831668734550476, + "learning_rate": 7.972660996354799e-05, + "loss": 1.4743, + "step": 3347 + }, + { + "epoch": 1.016239186522993, + "grad_norm": 0.4388216733932495, + "learning_rate": 7.972053462940462e-05, + "loss": 1.5717, + "step": 3348 + }, + { + "epoch": 1.0165427227196844, + "grad_norm": 0.3998357951641083, + "learning_rate": 7.971445929526125e-05, + "loss": 1.6379, + "step": 3349 + }, + { + "epoch": 1.0168462589163758, + "grad_norm": 0.4894062876701355, + "learning_rate": 7.970838396111785e-05, + "loss": 1.3522, + "step": 3350 + }, + { + "epoch": 1.0171497951130672, + "grad_norm": 0.6286391019821167, + "learning_rate": 7.970230862697448e-05, + "loss": 1.5174, + "step": 3351 + }, + { + "epoch": 1.0174533313097587, + "grad_norm": 0.688150942325592, + "learning_rate": 7.969623329283112e-05, + "loss": 1.8916, + "step": 3352 + }, + { + "epoch": 1.0177568675064501, + "grad_norm": 0.39143821597099304, + "learning_rate": 7.969015795868773e-05, + "loss": 1.7765, + "step": 3353 + }, + { + "epoch": 1.0180604037031415, + "grad_norm": 0.5299899578094482, + "learning_rate": 7.968408262454435e-05, + "loss": 1.5828, + "step": 3354 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.43358203768730164, + "learning_rate": 7.967800729040098e-05, + "loss": 1.4084, + "step": 3355 + }, + { + "epoch": 1.0186674760965244, + "grad_norm": 0.43103456497192383, + "learning_rate": 7.96719319562576e-05, + "loss": 1.5633, + "step": 3356 + }, + { + "epoch": 1.018971012293216, + "grad_norm": 0.4097878634929657, + "learning_rate": 7.966585662211422e-05, + "loss": 1.3976, + "step": 3357 + }, + { + "epoch": 1.0192745484899075, + "grad_norm": 0.47395214438438416, + "learning_rate": 7.965978128797085e-05, + "loss": 1.6707, + "step": 3358 + }, + { + "epoch": 1.019578084686599, + "grad_norm": 0.6641651391983032, + "learning_rate": 7.965370595382746e-05, + "loss": 1.8728, + "step": 3359 + }, + { + "epoch": 1.0198816208832904, + "grad_norm": 0.48009195923805237, + "learning_rate": 7.96476306196841e-05, + "loss": 1.675, + "step": 3360 + }, + { + "epoch": 1.0201851570799818, + "grad_norm": 0.430106520652771, + "learning_rate": 7.96415552855407e-05, + "loss": 1.7039, + "step": 3361 + }, + { + "epoch": 1.0204886932766732, + "grad_norm": 0.42592278122901917, + "learning_rate": 7.963547995139733e-05, + "loss": 1.5168, + "step": 3362 + }, + { + "epoch": 1.0207922294733647, + "grad_norm": 0.5778846144676208, + "learning_rate": 7.962940461725396e-05, + "loss": 1.0481, + "step": 3363 + }, + { + "epoch": 1.021095765670056, + "grad_norm": 0.4378105103969574, + "learning_rate": 7.962332928311056e-05, + "loss": 1.9381, + "step": 3364 + }, + { + "epoch": 1.0213993018667475, + "grad_norm": 0.4664958715438843, + "learning_rate": 7.96172539489672e-05, + "loss": 1.6863, + "step": 3365 + }, + { + "epoch": 1.021702838063439, + "grad_norm": 0.455496609210968, + "learning_rate": 7.961117861482383e-05, + "loss": 1.7554, + "step": 3366 + }, + { + "epoch": 1.0220063742601304, + "grad_norm": 0.5868107676506042, + "learning_rate": 7.960510328068044e-05, + "loss": 1.9657, + "step": 3367 + }, + { + "epoch": 1.022309910456822, + "grad_norm": 0.5736465454101562, + "learning_rate": 7.959902794653706e-05, + "loss": 1.2157, + "step": 3368 + }, + { + "epoch": 1.0226134466535135, + "grad_norm": 0.3856525421142578, + "learning_rate": 7.959295261239369e-05, + "loss": 1.7684, + "step": 3369 + }, + { + "epoch": 1.022916982850205, + "grad_norm": 0.5012997388839722, + "learning_rate": 7.958687727825031e-05, + "loss": 1.5705, + "step": 3370 + }, + { + "epoch": 1.0232205190468964, + "grad_norm": 0.4648292362689972, + "learning_rate": 7.958080194410693e-05, + "loss": 1.9879, + "step": 3371 + }, + { + "epoch": 1.0235240552435878, + "grad_norm": 0.39332127571105957, + "learning_rate": 7.957472660996356e-05, + "loss": 1.741, + "step": 3372 + }, + { + "epoch": 1.0238275914402792, + "grad_norm": 0.4548643231391907, + "learning_rate": 7.956865127582017e-05, + "loss": 1.6544, + "step": 3373 + }, + { + "epoch": 1.0241311276369707, + "grad_norm": 0.36641523241996765, + "learning_rate": 7.956257594167679e-05, + "loss": 1.2098, + "step": 3374 + }, + { + "epoch": 1.024434663833662, + "grad_norm": 0.46462637186050415, + "learning_rate": 7.955650060753341e-05, + "loss": 1.3876, + "step": 3375 + }, + { + "epoch": 1.0247382000303535, + "grad_norm": 0.9742159247398376, + "learning_rate": 7.955042527339004e-05, + "loss": 1.8654, + "step": 3376 + }, + { + "epoch": 1.025041736227045, + "grad_norm": 0.5226752758026123, + "learning_rate": 7.954434993924667e-05, + "loss": 1.1396, + "step": 3377 + }, + { + "epoch": 1.0253452724237366, + "grad_norm": 0.43976494669914246, + "learning_rate": 7.953827460510327e-05, + "loss": 1.7537, + "step": 3378 + }, + { + "epoch": 1.025648808620428, + "grad_norm": 0.4897270202636719, + "learning_rate": 7.95321992709599e-05, + "loss": 1.382, + "step": 3379 + }, + { + "epoch": 1.0259523448171195, + "grad_norm": 0.42977437376976013, + "learning_rate": 7.952612393681654e-05, + "loss": 1.69, + "step": 3380 + }, + { + "epoch": 1.026255881013811, + "grad_norm": 0.4650570750236511, + "learning_rate": 7.952004860267315e-05, + "loss": 1.5666, + "step": 3381 + }, + { + "epoch": 1.0265594172105024, + "grad_norm": 0.5345761179924011, + "learning_rate": 7.951397326852977e-05, + "loss": 1.5281, + "step": 3382 + }, + { + "epoch": 1.0268629534071938, + "grad_norm": 0.43827125430107117, + "learning_rate": 7.95078979343864e-05, + "loss": 1.8579, + "step": 3383 + }, + { + "epoch": 1.0271664896038852, + "grad_norm": 0.4599241614341736, + "learning_rate": 7.950182260024302e-05, + "loss": 1.5928, + "step": 3384 + }, + { + "epoch": 1.0274700258005767, + "grad_norm": 1.1530771255493164, + "learning_rate": 7.949574726609964e-05, + "loss": 1.5092, + "step": 3385 + }, + { + "epoch": 1.027773561997268, + "grad_norm": 0.48699623346328735, + "learning_rate": 7.948967193195627e-05, + "loss": 1.7638, + "step": 3386 + }, + { + "epoch": 1.0280770981939595, + "grad_norm": 0.5288783311843872, + "learning_rate": 7.948359659781288e-05, + "loss": 1.7314, + "step": 3387 + }, + { + "epoch": 1.0283806343906512, + "grad_norm": 0.4574908912181854, + "learning_rate": 7.94775212636695e-05, + "loss": 1.9344, + "step": 3388 + }, + { + "epoch": 1.0286841705873426, + "grad_norm": 0.47413721680641174, + "learning_rate": 7.947144592952612e-05, + "loss": 1.3177, + "step": 3389 + }, + { + "epoch": 1.028987706784034, + "grad_norm": 0.46366703510284424, + "learning_rate": 7.946537059538275e-05, + "loss": 1.8695, + "step": 3390 + }, + { + "epoch": 1.0292912429807255, + "grad_norm": 0.4010477662086487, + "learning_rate": 7.945929526123938e-05, + "loss": 1.623, + "step": 3391 + }, + { + "epoch": 1.029594779177417, + "grad_norm": 0.501057505607605, + "learning_rate": 7.945321992709598e-05, + "loss": 1.6748, + "step": 3392 + }, + { + "epoch": 1.0298983153741084, + "grad_norm": 0.4147251546382904, + "learning_rate": 7.944714459295262e-05, + "loss": 1.76, + "step": 3393 + }, + { + "epoch": 1.0302018515707998, + "grad_norm": 0.5023919939994812, + "learning_rate": 7.944106925880925e-05, + "loss": 1.7666, + "step": 3394 + }, + { + "epoch": 1.0305053877674912, + "grad_norm": 0.4336966574192047, + "learning_rate": 7.943499392466586e-05, + "loss": 1.8498, + "step": 3395 + }, + { + "epoch": 1.0308089239641827, + "grad_norm": 0.50406813621521, + "learning_rate": 7.942891859052248e-05, + "loss": 1.0032, + "step": 3396 + }, + { + "epoch": 1.031112460160874, + "grad_norm": 0.5218415856361389, + "learning_rate": 7.942284325637911e-05, + "loss": 1.7252, + "step": 3397 + }, + { + "epoch": 1.0314159963575655, + "grad_norm": 0.5142799019813538, + "learning_rate": 7.941676792223573e-05, + "loss": 1.9791, + "step": 3398 + }, + { + "epoch": 1.0317195325542572, + "grad_norm": 0.5369110107421875, + "learning_rate": 7.941069258809235e-05, + "loss": 1.4356, + "step": 3399 + }, + { + "epoch": 1.0320230687509486, + "grad_norm": 0.4954996109008789, + "learning_rate": 7.940461725394898e-05, + "loss": 1.665, + "step": 3400 + }, + { + "epoch": 1.03232660494764, + "grad_norm": 0.5331052541732788, + "learning_rate": 7.93985419198056e-05, + "loss": 1.5129, + "step": 3401 + }, + { + "epoch": 1.0326301411443315, + "grad_norm": 0.4011031985282898, + "learning_rate": 7.939246658566221e-05, + "loss": 1.5203, + "step": 3402 + }, + { + "epoch": 1.032933677341023, + "grad_norm": 0.8139665722846985, + "learning_rate": 7.938639125151883e-05, + "loss": 1.5036, + "step": 3403 + }, + { + "epoch": 1.0332372135377144, + "grad_norm": 0.4838857650756836, + "learning_rate": 7.938031591737546e-05, + "loss": 1.7915, + "step": 3404 + }, + { + "epoch": 1.0335407497344058, + "grad_norm": 0.5446197390556335, + "learning_rate": 7.937424058323209e-05, + "loss": 1.7374, + "step": 3405 + }, + { + "epoch": 1.0338442859310972, + "grad_norm": 0.7249342799186707, + "learning_rate": 7.93681652490887e-05, + "loss": 1.5549, + "step": 3406 + }, + { + "epoch": 1.0341478221277887, + "grad_norm": 0.4857841730117798, + "learning_rate": 7.936208991494533e-05, + "loss": 1.6947, + "step": 3407 + }, + { + "epoch": 1.03445135832448, + "grad_norm": 0.4289863705635071, + "learning_rate": 7.935601458080196e-05, + "loss": 1.8783, + "step": 3408 + }, + { + "epoch": 1.0347548945211718, + "grad_norm": 0.49779224395751953, + "learning_rate": 7.934993924665857e-05, + "loss": 1.7971, + "step": 3409 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.5169624090194702, + "learning_rate": 7.934386391251519e-05, + "loss": 1.4507, + "step": 3410 + }, + { + "epoch": 1.0353619669145546, + "grad_norm": 0.4716205894947052, + "learning_rate": 7.933778857837182e-05, + "loss": 1.8483, + "step": 3411 + }, + { + "epoch": 1.035665503111246, + "grad_norm": 0.5545279383659363, + "learning_rate": 7.933171324422844e-05, + "loss": 1.5672, + "step": 3412 + }, + { + "epoch": 1.0359690393079375, + "grad_norm": 0.4328896105289459, + "learning_rate": 7.932563791008506e-05, + "loss": 1.1695, + "step": 3413 + }, + { + "epoch": 1.036272575504629, + "grad_norm": 0.4805368185043335, + "learning_rate": 7.931956257594169e-05, + "loss": 1.7504, + "step": 3414 + }, + { + "epoch": 1.0365761117013204, + "grad_norm": 0.5162798166275024, + "learning_rate": 7.93134872417983e-05, + "loss": 1.4748, + "step": 3415 + }, + { + "epoch": 1.0368796478980118, + "grad_norm": 0.5200609564781189, + "learning_rate": 7.930741190765492e-05, + "loss": 2.0036, + "step": 3416 + }, + { + "epoch": 1.0371831840947032, + "grad_norm": 0.4653424620628357, + "learning_rate": 7.930133657351154e-05, + "loss": 1.9499, + "step": 3417 + }, + { + "epoch": 1.0374867202913947, + "grad_norm": 0.4723150432109833, + "learning_rate": 7.929526123936817e-05, + "loss": 1.4042, + "step": 3418 + }, + { + "epoch": 1.037790256488086, + "grad_norm": 0.5302563905715942, + "learning_rate": 7.92891859052248e-05, + "loss": 1.4871, + "step": 3419 + }, + { + "epoch": 1.0380937926847777, + "grad_norm": 0.47659730911254883, + "learning_rate": 7.92831105710814e-05, + "loss": 1.4755, + "step": 3420 + }, + { + "epoch": 1.0383973288814692, + "grad_norm": 0.5367438197135925, + "learning_rate": 7.927703523693804e-05, + "loss": 1.8863, + "step": 3421 + }, + { + "epoch": 1.0387008650781606, + "grad_norm": 0.5086414217948914, + "learning_rate": 7.927095990279467e-05, + "loss": 1.5784, + "step": 3422 + }, + { + "epoch": 1.039004401274852, + "grad_norm": 0.4822576642036438, + "learning_rate": 7.926488456865127e-05, + "loss": 1.8435, + "step": 3423 + }, + { + "epoch": 1.0393079374715435, + "grad_norm": 0.5086636543273926, + "learning_rate": 7.92588092345079e-05, + "loss": 1.732, + "step": 3424 + }, + { + "epoch": 1.039611473668235, + "grad_norm": 0.49060937762260437, + "learning_rate": 7.925273390036453e-05, + "loss": 1.7369, + "step": 3425 + }, + { + "epoch": 1.0399150098649264, + "grad_norm": 0.4944159984588623, + "learning_rate": 7.924665856622115e-05, + "loss": 1.2844, + "step": 3426 + }, + { + "epoch": 1.0402185460616178, + "grad_norm": 0.4141417443752289, + "learning_rate": 7.924058323207777e-05, + "loss": 1.3346, + "step": 3427 + }, + { + "epoch": 1.0405220822583092, + "grad_norm": 0.4598718285560608, + "learning_rate": 7.923450789793438e-05, + "loss": 1.5991, + "step": 3428 + }, + { + "epoch": 1.0408256184550007, + "grad_norm": 0.5402548313140869, + "learning_rate": 7.922843256379101e-05, + "loss": 1.78, + "step": 3429 + }, + { + "epoch": 1.0411291546516923, + "grad_norm": 0.4793176054954529, + "learning_rate": 7.922235722964763e-05, + "loss": 1.5652, + "step": 3430 + }, + { + "epoch": 1.0414326908483837, + "grad_norm": 0.4527183175086975, + "learning_rate": 7.921628189550425e-05, + "loss": 1.82, + "step": 3431 + }, + { + "epoch": 1.0417362270450752, + "grad_norm": 0.4884622395038605, + "learning_rate": 7.921020656136088e-05, + "loss": 1.6954, + "step": 3432 + }, + { + "epoch": 1.0420397632417666, + "grad_norm": 0.46866077184677124, + "learning_rate": 7.920413122721751e-05, + "loss": 1.712, + "step": 3433 + }, + { + "epoch": 1.042343299438458, + "grad_norm": 0.40482431650161743, + "learning_rate": 7.919805589307411e-05, + "loss": 1.8968, + "step": 3434 + }, + { + "epoch": 1.0426468356351495, + "grad_norm": 0.5239852666854858, + "learning_rate": 7.919198055893075e-05, + "loss": 1.7953, + "step": 3435 + }, + { + "epoch": 1.042950371831841, + "grad_norm": 0.484953373670578, + "learning_rate": 7.918590522478738e-05, + "loss": 1.8354, + "step": 3436 + }, + { + "epoch": 1.0432539080285324, + "grad_norm": 0.47952115535736084, + "learning_rate": 7.917982989064398e-05, + "loss": 1.2171, + "step": 3437 + }, + { + "epoch": 1.0435574442252238, + "grad_norm": 0.47822096943855286, + "learning_rate": 7.917375455650061e-05, + "loss": 1.7159, + "step": 3438 + }, + { + "epoch": 1.0438609804219152, + "grad_norm": 0.44206422567367554, + "learning_rate": 7.916767922235724e-05, + "loss": 1.8257, + "step": 3439 + }, + { + "epoch": 1.0441645166186069, + "grad_norm": 0.5453143119812012, + "learning_rate": 7.916160388821386e-05, + "loss": 1.7791, + "step": 3440 + }, + { + "epoch": 1.0444680528152983, + "grad_norm": 0.4759043753147125, + "learning_rate": 7.915552855407048e-05, + "loss": 1.692, + "step": 3441 + }, + { + "epoch": 1.0447715890119897, + "grad_norm": 0.484531044960022, + "learning_rate": 7.91494532199271e-05, + "loss": 1.806, + "step": 3442 + }, + { + "epoch": 1.0450751252086812, + "grad_norm": 0.5373866558074951, + "learning_rate": 7.914337788578372e-05, + "loss": 1.5913, + "step": 3443 + }, + { + "epoch": 1.0453786614053726, + "grad_norm": 0.5190970301628113, + "learning_rate": 7.913730255164034e-05, + "loss": 1.5233, + "step": 3444 + }, + { + "epoch": 1.045682197602064, + "grad_norm": 0.500152051448822, + "learning_rate": 7.913122721749696e-05, + "loss": 1.7589, + "step": 3445 + }, + { + "epoch": 1.0459857337987555, + "grad_norm": 0.3860965073108673, + "learning_rate": 7.912515188335359e-05, + "loss": 1.2231, + "step": 3446 + }, + { + "epoch": 1.046289269995447, + "grad_norm": 0.44290807843208313, + "learning_rate": 7.911907654921021e-05, + "loss": 1.5446, + "step": 3447 + }, + { + "epoch": 1.0465928061921383, + "grad_norm": 0.46361368894577026, + "learning_rate": 7.911300121506682e-05, + "loss": 1.4829, + "step": 3448 + }, + { + "epoch": 1.0468963423888298, + "grad_norm": 0.40358835458755493, + "learning_rate": 7.910692588092346e-05, + "loss": 1.6495, + "step": 3449 + }, + { + "epoch": 1.0471998785855212, + "grad_norm": 0.5722264051437378, + "learning_rate": 7.910085054678009e-05, + "loss": 1.7335, + "step": 3450 + }, + { + "epoch": 1.0475034147822129, + "grad_norm": 0.49722689390182495, + "learning_rate": 7.909477521263669e-05, + "loss": 1.6493, + "step": 3451 + }, + { + "epoch": 1.0478069509789043, + "grad_norm": 0.5183900594711304, + "learning_rate": 7.908869987849332e-05, + "loss": 1.7254, + "step": 3452 + }, + { + "epoch": 1.0481104871755957, + "grad_norm": 0.5188613533973694, + "learning_rate": 7.908262454434995e-05, + "loss": 2.0111, + "step": 3453 + }, + { + "epoch": 1.0484140233722872, + "grad_norm": 0.5030909180641174, + "learning_rate": 7.907654921020657e-05, + "loss": 1.395, + "step": 3454 + }, + { + "epoch": 1.0487175595689786, + "grad_norm": 0.4069419205188751, + "learning_rate": 7.907047387606319e-05, + "loss": 1.9025, + "step": 3455 + }, + { + "epoch": 1.04902109576567, + "grad_norm": 0.5355219841003418, + "learning_rate": 7.90643985419198e-05, + "loss": 1.732, + "step": 3456 + }, + { + "epoch": 1.0493246319623615, + "grad_norm": 0.43117785453796387, + "learning_rate": 7.905832320777643e-05, + "loss": 1.4534, + "step": 3457 + }, + { + "epoch": 1.049628168159053, + "grad_norm": 0.4561751186847687, + "learning_rate": 7.905224787363305e-05, + "loss": 1.5682, + "step": 3458 + }, + { + "epoch": 1.0499317043557443, + "grad_norm": 0.4510141611099243, + "learning_rate": 7.904617253948967e-05, + "loss": 1.6789, + "step": 3459 + }, + { + "epoch": 1.0502352405524358, + "grad_norm": 0.5011105537414551, + "learning_rate": 7.90400972053463e-05, + "loss": 1.4711, + "step": 3460 + }, + { + "epoch": 1.0505387767491274, + "grad_norm": 0.5226435661315918, + "learning_rate": 7.903402187120292e-05, + "loss": 1.4425, + "step": 3461 + }, + { + "epoch": 1.0508423129458189, + "grad_norm": 0.46023955941200256, + "learning_rate": 7.902794653705953e-05, + "loss": 1.0283, + "step": 3462 + }, + { + "epoch": 1.0511458491425103, + "grad_norm": 0.5048952698707581, + "learning_rate": 7.902187120291617e-05, + "loss": 1.6029, + "step": 3463 + }, + { + "epoch": 1.0514493853392017, + "grad_norm": 0.6409230828285217, + "learning_rate": 7.90157958687728e-05, + "loss": 1.1697, + "step": 3464 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.5188806653022766, + "learning_rate": 7.90097205346294e-05, + "loss": 1.8136, + "step": 3465 + }, + { + "epoch": 1.0520564577325846, + "grad_norm": 0.5835402011871338, + "learning_rate": 7.900364520048603e-05, + "loss": 1.511, + "step": 3466 + }, + { + "epoch": 1.052359993929276, + "grad_norm": 0.6449020504951477, + "learning_rate": 7.899756986634266e-05, + "loss": 1.98, + "step": 3467 + }, + { + "epoch": 1.0526635301259675, + "grad_norm": 0.47613629698753357, + "learning_rate": 7.899149453219928e-05, + "loss": 1.6631, + "step": 3468 + }, + { + "epoch": 1.052967066322659, + "grad_norm": 0.4372462332248688, + "learning_rate": 7.89854191980559e-05, + "loss": 1.7773, + "step": 3469 + }, + { + "epoch": 1.0532706025193503, + "grad_norm": 0.48001718521118164, + "learning_rate": 7.897934386391251e-05, + "loss": 1.703, + "step": 3470 + }, + { + "epoch": 1.053574138716042, + "grad_norm": 0.5756060481071472, + "learning_rate": 7.897326852976914e-05, + "loss": 1.4202, + "step": 3471 + }, + { + "epoch": 1.0538776749127334, + "grad_norm": 0.48645758628845215, + "learning_rate": 7.896719319562576e-05, + "loss": 1.7291, + "step": 3472 + }, + { + "epoch": 1.0541812111094249, + "grad_norm": 0.4413807988166809, + "learning_rate": 7.896111786148238e-05, + "loss": 1.505, + "step": 3473 + }, + { + "epoch": 1.0544847473061163, + "grad_norm": 0.43039625883102417, + "learning_rate": 7.895504252733901e-05, + "loss": 1.5668, + "step": 3474 + }, + { + "epoch": 1.0547882835028077, + "grad_norm": 0.5196880102157593, + "learning_rate": 7.894896719319563e-05, + "loss": 1.9353, + "step": 3475 + }, + { + "epoch": 1.0550918196994992, + "grad_norm": 0.6965150833129883, + "learning_rate": 7.894289185905224e-05, + "loss": 1.1513, + "step": 3476 + }, + { + "epoch": 1.0553953558961906, + "grad_norm": 0.4723784625530243, + "learning_rate": 7.893681652490888e-05, + "loss": 1.5391, + "step": 3477 + }, + { + "epoch": 1.055698892092882, + "grad_norm": 0.47085341811180115, + "learning_rate": 7.89307411907655e-05, + "loss": 1.452, + "step": 3478 + }, + { + "epoch": 1.0560024282895735, + "grad_norm": 0.515957772731781, + "learning_rate": 7.892466585662211e-05, + "loss": 1.7636, + "step": 3479 + }, + { + "epoch": 1.056305964486265, + "grad_norm": 0.6064741611480713, + "learning_rate": 7.891859052247874e-05, + "loss": 1.7575, + "step": 3480 + }, + { + "epoch": 1.0566095006829563, + "grad_norm": 0.567486047744751, + "learning_rate": 7.891251518833537e-05, + "loss": 1.0875, + "step": 3481 + }, + { + "epoch": 1.056913036879648, + "grad_norm": 0.4897995591163635, + "learning_rate": 7.890643985419199e-05, + "loss": 1.5105, + "step": 3482 + }, + { + "epoch": 1.0572165730763394, + "grad_norm": 0.47024548053741455, + "learning_rate": 7.89003645200486e-05, + "loss": 1.77, + "step": 3483 + }, + { + "epoch": 1.0575201092730309, + "grad_norm": 0.7996636033058167, + "learning_rate": 7.889428918590522e-05, + "loss": 1.3641, + "step": 3484 + }, + { + "epoch": 1.0578236454697223, + "grad_norm": 0.813572347164154, + "learning_rate": 7.888821385176185e-05, + "loss": 1.2667, + "step": 3485 + }, + { + "epoch": 1.0581271816664137, + "grad_norm": 0.38006821274757385, + "learning_rate": 7.888213851761847e-05, + "loss": 2.011, + "step": 3486 + }, + { + "epoch": 1.0584307178631052, + "grad_norm": 0.5023001432418823, + "learning_rate": 7.887606318347509e-05, + "loss": 1.4452, + "step": 3487 + }, + { + "epoch": 1.0587342540597966, + "grad_norm": 0.6475557088851929, + "learning_rate": 7.886998784933172e-05, + "loss": 1.1677, + "step": 3488 + }, + { + "epoch": 1.059037790256488, + "grad_norm": 0.5049715042114258, + "learning_rate": 7.886391251518834e-05, + "loss": 1.0539, + "step": 3489 + }, + { + "epoch": 1.0593413264531795, + "grad_norm": 0.5918720960617065, + "learning_rate": 7.885783718104495e-05, + "loss": 1.1472, + "step": 3490 + }, + { + "epoch": 1.059644862649871, + "grad_norm": 0.4469449520111084, + "learning_rate": 7.885176184690159e-05, + "loss": 1.6658, + "step": 3491 + }, + { + "epoch": 1.0599483988465626, + "grad_norm": 0.4816749393939972, + "learning_rate": 7.884568651275822e-05, + "loss": 1.9424, + "step": 3492 + }, + { + "epoch": 1.060251935043254, + "grad_norm": 0.44042688608169556, + "learning_rate": 7.883961117861482e-05, + "loss": 1.559, + "step": 3493 + }, + { + "epoch": 1.0605554712399454, + "grad_norm": 0.4422488212585449, + "learning_rate": 7.883353584447145e-05, + "loss": 0.8213, + "step": 3494 + }, + { + "epoch": 1.0608590074366369, + "grad_norm": 0.44115856289863586, + "learning_rate": 7.882746051032808e-05, + "loss": 1.3014, + "step": 3495 + }, + { + "epoch": 1.0611625436333283, + "grad_norm": 0.5100114345550537, + "learning_rate": 7.882138517618469e-05, + "loss": 1.7285, + "step": 3496 + }, + { + "epoch": 1.0614660798300197, + "grad_norm": 0.4293980002403259, + "learning_rate": 7.881530984204132e-05, + "loss": 1.6291, + "step": 3497 + }, + { + "epoch": 1.0617696160267112, + "grad_norm": 0.4303349256515503, + "learning_rate": 7.880923450789793e-05, + "loss": 1.677, + "step": 3498 + }, + { + "epoch": 1.0620731522234026, + "grad_norm": 1.325836420059204, + "learning_rate": 7.880315917375456e-05, + "loss": 1.4604, + "step": 3499 + }, + { + "epoch": 1.062376688420094, + "grad_norm": 0.5173283815383911, + "learning_rate": 7.879708383961118e-05, + "loss": 1.8935, + "step": 3500 + }, + { + "epoch": 1.0626802246167855, + "grad_norm": 0.5339661240577698, + "learning_rate": 7.87910085054678e-05, + "loss": 1.4497, + "step": 3501 + }, + { + "epoch": 1.0629837608134771, + "grad_norm": 0.5330355763435364, + "learning_rate": 7.878493317132443e-05, + "loss": 1.7404, + "step": 3502 + }, + { + "epoch": 1.0632872970101686, + "grad_norm": 0.5396672487258911, + "learning_rate": 7.877885783718105e-05, + "loss": 1.3268, + "step": 3503 + }, + { + "epoch": 1.06359083320686, + "grad_norm": 0.44838812947273254, + "learning_rate": 7.877278250303766e-05, + "loss": 1.6525, + "step": 3504 + }, + { + "epoch": 1.0638943694035514, + "grad_norm": 0.50009685754776, + "learning_rate": 7.87667071688943e-05, + "loss": 1.6385, + "step": 3505 + }, + { + "epoch": 1.0641979056002429, + "grad_norm": 0.6882514953613281, + "learning_rate": 7.876063183475093e-05, + "loss": 1.3527, + "step": 3506 + }, + { + "epoch": 1.0645014417969343, + "grad_norm": 0.5103173851966858, + "learning_rate": 7.875455650060753e-05, + "loss": 1.3589, + "step": 3507 + }, + { + "epoch": 1.0648049779936257, + "grad_norm": 0.5229162573814392, + "learning_rate": 7.874848116646416e-05, + "loss": 1.5474, + "step": 3508 + }, + { + "epoch": 1.0651085141903172, + "grad_norm": 0.5207902193069458, + "learning_rate": 7.874240583232079e-05, + "loss": 1.807, + "step": 3509 + }, + { + "epoch": 1.0654120503870086, + "grad_norm": 0.46750408411026, + "learning_rate": 7.87363304981774e-05, + "loss": 1.4907, + "step": 3510 + }, + { + "epoch": 1.0657155865837, + "grad_norm": 1.095977544784546, + "learning_rate": 7.873025516403403e-05, + "loss": 1.6401, + "step": 3511 + }, + { + "epoch": 1.0660191227803915, + "grad_norm": 0.4337494373321533, + "learning_rate": 7.872417982989064e-05, + "loss": 1.8245, + "step": 3512 + }, + { + "epoch": 1.0663226589770831, + "grad_norm": 0.3852023184299469, + "learning_rate": 7.871810449574727e-05, + "loss": 1.506, + "step": 3513 + }, + { + "epoch": 1.0666261951737745, + "grad_norm": 0.45099326968193054, + "learning_rate": 7.871202916160389e-05, + "loss": 1.8121, + "step": 3514 + }, + { + "epoch": 1.066929731370466, + "grad_norm": 0.6037120223045349, + "learning_rate": 7.870595382746051e-05, + "loss": 1.7321, + "step": 3515 + }, + { + "epoch": 1.0672332675671574, + "grad_norm": 0.4074588716030121, + "learning_rate": 7.869987849331714e-05, + "loss": 1.6336, + "step": 3516 + }, + { + "epoch": 1.0675368037638489, + "grad_norm": 0.40023350715637207, + "learning_rate": 7.869380315917376e-05, + "loss": 0.9946, + "step": 3517 + }, + { + "epoch": 1.0678403399605403, + "grad_norm": 0.8330900073051453, + "learning_rate": 7.868772782503037e-05, + "loss": 1.4609, + "step": 3518 + }, + { + "epoch": 1.0681438761572317, + "grad_norm": 0.5196427702903748, + "learning_rate": 7.8681652490887e-05, + "loss": 1.7803, + "step": 3519 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.575749933719635, + "learning_rate": 7.867557715674362e-05, + "loss": 1.6784, + "step": 3520 + }, + { + "epoch": 1.0687509485506146, + "grad_norm": 0.5069593787193298, + "learning_rate": 7.866950182260024e-05, + "loss": 1.9097, + "step": 3521 + }, + { + "epoch": 1.069054484747306, + "grad_norm": 0.5487728714942932, + "learning_rate": 7.866342648845687e-05, + "loss": 1.8385, + "step": 3522 + }, + { + "epoch": 1.0693580209439975, + "grad_norm": 0.5695396065711975, + "learning_rate": 7.865735115431349e-05, + "loss": 1.8143, + "step": 3523 + }, + { + "epoch": 1.0696615571406891, + "grad_norm": 0.47702378034591675, + "learning_rate": 7.86512758201701e-05, + "loss": 1.2498, + "step": 3524 + }, + { + "epoch": 1.0699650933373805, + "grad_norm": 0.4768955707550049, + "learning_rate": 7.864520048602674e-05, + "loss": 1.7542, + "step": 3525 + }, + { + "epoch": 1.070268629534072, + "grad_norm": 0.4759134352207184, + "learning_rate": 7.863912515188335e-05, + "loss": 0.9774, + "step": 3526 + }, + { + "epoch": 1.0705721657307634, + "grad_norm": 0.6089837551116943, + "learning_rate": 7.863304981773998e-05, + "loss": 0.7761, + "step": 3527 + }, + { + "epoch": 1.0708757019274548, + "grad_norm": 0.5931398272514343, + "learning_rate": 7.86269744835966e-05, + "loss": 1.244, + "step": 3528 + }, + { + "epoch": 1.0711792381241463, + "grad_norm": 0.4667022228240967, + "learning_rate": 7.862089914945322e-05, + "loss": 1.3669, + "step": 3529 + }, + { + "epoch": 1.0714827743208377, + "grad_norm": 0.4586002230644226, + "learning_rate": 7.861482381530985e-05, + "loss": 1.6041, + "step": 3530 + }, + { + "epoch": 1.0717863105175292, + "grad_norm": 0.5036244988441467, + "learning_rate": 7.860874848116647e-05, + "loss": 1.3989, + "step": 3531 + }, + { + "epoch": 1.0720898467142206, + "grad_norm": 0.40763425827026367, + "learning_rate": 7.860267314702308e-05, + "loss": 1.1835, + "step": 3532 + }, + { + "epoch": 1.0723933829109122, + "grad_norm": 0.44515642523765564, + "learning_rate": 7.859659781287972e-05, + "loss": 1.4935, + "step": 3533 + }, + { + "epoch": 1.0726969191076037, + "grad_norm": 0.5427178740501404, + "learning_rate": 7.859052247873633e-05, + "loss": 1.9076, + "step": 3534 + }, + { + "epoch": 1.073000455304295, + "grad_norm": 0.4585944712162018, + "learning_rate": 7.858444714459295e-05, + "loss": 1.7363, + "step": 3535 + }, + { + "epoch": 1.0733039915009865, + "grad_norm": 0.46946725249290466, + "learning_rate": 7.857837181044958e-05, + "loss": 1.6894, + "step": 3536 + }, + { + "epoch": 1.073607527697678, + "grad_norm": 0.5090848803520203, + "learning_rate": 7.85722964763062e-05, + "loss": 1.8616, + "step": 3537 + }, + { + "epoch": 1.0739110638943694, + "grad_norm": 0.5192902684211731, + "learning_rate": 7.856622114216282e-05, + "loss": 1.5899, + "step": 3538 + }, + { + "epoch": 1.0742146000910608, + "grad_norm": 0.4348808228969574, + "learning_rate": 7.856014580801945e-05, + "loss": 1.7192, + "step": 3539 + }, + { + "epoch": 1.0745181362877523, + "grad_norm": 0.5693963170051575, + "learning_rate": 7.855407047387606e-05, + "loss": 1.3645, + "step": 3540 + }, + { + "epoch": 1.0748216724844437, + "grad_norm": 0.4064824879169464, + "learning_rate": 7.85479951397327e-05, + "loss": 1.5659, + "step": 3541 + }, + { + "epoch": 1.0751252086811351, + "grad_norm": 0.4797777235507965, + "learning_rate": 7.854191980558931e-05, + "loss": 1.8501, + "step": 3542 + }, + { + "epoch": 1.0754287448778266, + "grad_norm": 0.5156259536743164, + "learning_rate": 7.853584447144593e-05, + "loss": 1.5016, + "step": 3543 + }, + { + "epoch": 1.0757322810745182, + "grad_norm": 0.9919567108154297, + "learning_rate": 7.852976913730256e-05, + "loss": 1.3734, + "step": 3544 + }, + { + "epoch": 1.0760358172712097, + "grad_norm": 0.5478760600090027, + "learning_rate": 7.852369380315918e-05, + "loss": 1.8446, + "step": 3545 + }, + { + "epoch": 1.076339353467901, + "grad_norm": 0.6765535473823547, + "learning_rate": 7.85176184690158e-05, + "loss": 1.5215, + "step": 3546 + }, + { + "epoch": 1.0766428896645925, + "grad_norm": 0.4867497384548187, + "learning_rate": 7.851154313487243e-05, + "loss": 1.3427, + "step": 3547 + }, + { + "epoch": 1.076946425861284, + "grad_norm": 0.4023679792881012, + "learning_rate": 7.850546780072904e-05, + "loss": 1.7548, + "step": 3548 + }, + { + "epoch": 1.0772499620579754, + "grad_norm": 0.42770206928253174, + "learning_rate": 7.849939246658566e-05, + "loss": 1.4375, + "step": 3549 + }, + { + "epoch": 1.0775534982546668, + "grad_norm": 0.5628126859664917, + "learning_rate": 7.849331713244229e-05, + "loss": 1.6336, + "step": 3550 + }, + { + "epoch": 1.0778570344513583, + "grad_norm": 0.5270586013793945, + "learning_rate": 7.848724179829891e-05, + "loss": 1.8782, + "step": 3551 + }, + { + "epoch": 1.0781605706480497, + "grad_norm": 0.7205768823623657, + "learning_rate": 7.848116646415553e-05, + "loss": 1.7201, + "step": 3552 + }, + { + "epoch": 1.0784641068447411, + "grad_norm": 0.5051723122596741, + "learning_rate": 7.847509113001216e-05, + "loss": 1.6346, + "step": 3553 + }, + { + "epoch": 1.0787676430414326, + "grad_norm": 0.41711702942848206, + "learning_rate": 7.846901579586877e-05, + "loss": 1.765, + "step": 3554 + }, + { + "epoch": 1.0790711792381242, + "grad_norm": 0.4348052442073822, + "learning_rate": 7.84629404617254e-05, + "loss": 1.6667, + "step": 3555 + }, + { + "epoch": 1.0793747154348157, + "grad_norm": 0.5323374271392822, + "learning_rate": 7.845686512758202e-05, + "loss": 1.8005, + "step": 3556 + }, + { + "epoch": 1.079678251631507, + "grad_norm": 0.47705498337745667, + "learning_rate": 7.845078979343864e-05, + "loss": 1.8168, + "step": 3557 + }, + { + "epoch": 1.0799817878281985, + "grad_norm": 0.535015344619751, + "learning_rate": 7.844471445929527e-05, + "loss": 1.7441, + "step": 3558 + }, + { + "epoch": 1.08028532402489, + "grad_norm": 0.4847927391529083, + "learning_rate": 7.843863912515189e-05, + "loss": 1.5963, + "step": 3559 + }, + { + "epoch": 1.0805888602215814, + "grad_norm": 0.5845076441764832, + "learning_rate": 7.84325637910085e-05, + "loss": 1.116, + "step": 3560 + }, + { + "epoch": 1.0808923964182728, + "grad_norm": 0.5248334407806396, + "learning_rate": 7.842648845686514e-05, + "loss": 2.0422, + "step": 3561 + }, + { + "epoch": 1.0811959326149643, + "grad_norm": 0.5417022705078125, + "learning_rate": 7.842041312272175e-05, + "loss": 1.5701, + "step": 3562 + }, + { + "epoch": 1.0814994688116557, + "grad_norm": 0.4764825701713562, + "learning_rate": 7.841433778857837e-05, + "loss": 1.4946, + "step": 3563 + }, + { + "epoch": 1.0818030050083474, + "grad_norm": 0.4735731780529022, + "learning_rate": 7.8408262454435e-05, + "loss": 2.0424, + "step": 3564 + }, + { + "epoch": 1.0821065412050388, + "grad_norm": 0.4083727300167084, + "learning_rate": 7.840218712029162e-05, + "loss": 1.2251, + "step": 3565 + }, + { + "epoch": 1.0824100774017302, + "grad_norm": 0.5175759792327881, + "learning_rate": 7.839611178614824e-05, + "loss": 1.3758, + "step": 3566 + }, + { + "epoch": 1.0827136135984217, + "grad_norm": 0.4588059186935425, + "learning_rate": 7.839003645200487e-05, + "loss": 1.4034, + "step": 3567 + }, + { + "epoch": 1.083017149795113, + "grad_norm": 0.5879805088043213, + "learning_rate": 7.838396111786148e-05, + "loss": 1.6349, + "step": 3568 + }, + { + "epoch": 1.0833206859918045, + "grad_norm": 0.48351842164993286, + "learning_rate": 7.83778857837181e-05, + "loss": 1.8329, + "step": 3569 + }, + { + "epoch": 1.083624222188496, + "grad_norm": 0.5158828496932983, + "learning_rate": 7.837181044957473e-05, + "loss": 1.6281, + "step": 3570 + }, + { + "epoch": 1.0839277583851874, + "grad_norm": 0.7203484177589417, + "learning_rate": 7.836573511543135e-05, + "loss": 0.9363, + "step": 3571 + }, + { + "epoch": 1.0842312945818788, + "grad_norm": 0.5488767623901367, + "learning_rate": 7.835965978128798e-05, + "loss": 1.7487, + "step": 3572 + }, + { + "epoch": 1.0845348307785703, + "grad_norm": 0.5658820867538452, + "learning_rate": 7.83535844471446e-05, + "loss": 1.8557, + "step": 3573 + }, + { + "epoch": 1.0848383669752617, + "grad_norm": 0.5227528214454651, + "learning_rate": 7.834750911300121e-05, + "loss": 1.4973, + "step": 3574 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.5170645713806152, + "learning_rate": 7.834143377885785e-05, + "loss": 1.7767, + "step": 3575 + }, + { + "epoch": 1.0854454393686448, + "grad_norm": 0.48934417963027954, + "learning_rate": 7.833535844471446e-05, + "loss": 1.7613, + "step": 3576 + }, + { + "epoch": 1.0857489755653362, + "grad_norm": 0.4472818374633789, + "learning_rate": 7.832928311057108e-05, + "loss": 1.6749, + "step": 3577 + }, + { + "epoch": 1.0860525117620277, + "grad_norm": 0.4309948682785034, + "learning_rate": 7.832320777642771e-05, + "loss": 1.8213, + "step": 3578 + }, + { + "epoch": 1.086356047958719, + "grad_norm": 0.5259717702865601, + "learning_rate": 7.831713244228433e-05, + "loss": 1.5028, + "step": 3579 + }, + { + "epoch": 1.0866595841554105, + "grad_norm": 0.4831100106239319, + "learning_rate": 7.831105710814095e-05, + "loss": 1.5417, + "step": 3580 + }, + { + "epoch": 1.086963120352102, + "grad_norm": 0.5474818348884583, + "learning_rate": 7.830498177399758e-05, + "loss": 1.3114, + "step": 3581 + }, + { + "epoch": 1.0872666565487934, + "grad_norm": 0.6029711961746216, + "learning_rate": 7.82989064398542e-05, + "loss": 1.6251, + "step": 3582 + }, + { + "epoch": 1.0875701927454848, + "grad_norm": 0.6199969053268433, + "learning_rate": 7.829283110571081e-05, + "loss": 1.7712, + "step": 3583 + }, + { + "epoch": 1.0878737289421763, + "grad_norm": 0.571630597114563, + "learning_rate": 7.828675577156744e-05, + "loss": 1.8169, + "step": 3584 + }, + { + "epoch": 1.0881772651388677, + "grad_norm": 0.476755827665329, + "learning_rate": 7.828068043742406e-05, + "loss": 1.4152, + "step": 3585 + }, + { + "epoch": 1.0884808013355594, + "grad_norm": 0.5712706446647644, + "learning_rate": 7.827460510328069e-05, + "loss": 1.624, + "step": 3586 + }, + { + "epoch": 1.0887843375322508, + "grad_norm": 0.4737652540206909, + "learning_rate": 7.826852976913731e-05, + "loss": 1.812, + "step": 3587 + }, + { + "epoch": 1.0890878737289422, + "grad_norm": 0.7318893671035767, + "learning_rate": 7.826245443499392e-05, + "loss": 1.7989, + "step": 3588 + }, + { + "epoch": 1.0893914099256337, + "grad_norm": 0.5808560848236084, + "learning_rate": 7.825637910085056e-05, + "loss": 1.7167, + "step": 3589 + }, + { + "epoch": 1.089694946122325, + "grad_norm": 0.49355220794677734, + "learning_rate": 7.825030376670717e-05, + "loss": 1.5009, + "step": 3590 + }, + { + "epoch": 1.0899984823190165, + "grad_norm": 0.4661107361316681, + "learning_rate": 7.824422843256379e-05, + "loss": 1.2807, + "step": 3591 + }, + { + "epoch": 1.090302018515708, + "grad_norm": 0.3863863945007324, + "learning_rate": 7.823815309842042e-05, + "loss": 1.6173, + "step": 3592 + }, + { + "epoch": 1.0906055547123994, + "grad_norm": 0.48039016127586365, + "learning_rate": 7.823207776427704e-05, + "loss": 1.7246, + "step": 3593 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.4487806558609009, + "learning_rate": 7.822600243013366e-05, + "loss": 1.408, + "step": 3594 + }, + { + "epoch": 1.0912126271057823, + "grad_norm": 0.6070311069488525, + "learning_rate": 7.821992709599029e-05, + "loss": 1.5847, + "step": 3595 + }, + { + "epoch": 1.091516163302474, + "grad_norm": 0.5059540271759033, + "learning_rate": 7.82138517618469e-05, + "loss": 1.9817, + "step": 3596 + }, + { + "epoch": 1.0918196994991654, + "grad_norm": 0.5185919404029846, + "learning_rate": 7.820777642770352e-05, + "loss": 1.761, + "step": 3597 + }, + { + "epoch": 1.0921232356958568, + "grad_norm": 0.5528532266616821, + "learning_rate": 7.820170109356015e-05, + "loss": 1.5324, + "step": 3598 + }, + { + "epoch": 1.0924267718925482, + "grad_norm": 0.46091577410697937, + "learning_rate": 7.819562575941677e-05, + "loss": 1.5934, + "step": 3599 + }, + { + "epoch": 1.0927303080892397, + "grad_norm": 0.5525655746459961, + "learning_rate": 7.81895504252734e-05, + "loss": 1.742, + "step": 3600 + }, + { + "epoch": 1.093033844285931, + "grad_norm": 0.8037269115447998, + "learning_rate": 7.818347509113002e-05, + "loss": 1.2865, + "step": 3601 + }, + { + "epoch": 1.0933373804826225, + "grad_norm": 0.500320315361023, + "learning_rate": 7.817739975698664e-05, + "loss": 1.7335, + "step": 3602 + }, + { + "epoch": 1.093640916679314, + "grad_norm": 0.4381422698497772, + "learning_rate": 7.817132442284327e-05, + "loss": 1.7203, + "step": 3603 + }, + { + "epoch": 1.0939444528760054, + "grad_norm": 0.5011032819747925, + "learning_rate": 7.816524908869988e-05, + "loss": 1.903, + "step": 3604 + }, + { + "epoch": 1.0942479890726968, + "grad_norm": 0.4082486033439636, + "learning_rate": 7.81591737545565e-05, + "loss": 1.7394, + "step": 3605 + }, + { + "epoch": 1.0945515252693885, + "grad_norm": 0.3757207691669464, + "learning_rate": 7.815309842041313e-05, + "loss": 1.9376, + "step": 3606 + }, + { + "epoch": 1.09485506146608, + "grad_norm": 0.5873184204101562, + "learning_rate": 7.814702308626975e-05, + "loss": 1.8047, + "step": 3607 + }, + { + "epoch": 1.0951585976627713, + "grad_norm": 0.4013366401195526, + "learning_rate": 7.814094775212637e-05, + "loss": 1.7548, + "step": 3608 + }, + { + "epoch": 1.0954621338594628, + "grad_norm": 0.4884313642978668, + "learning_rate": 7.8134872417983e-05, + "loss": 1.7476, + "step": 3609 + }, + { + "epoch": 1.0957656700561542, + "grad_norm": 0.5261145234107971, + "learning_rate": 7.812879708383961e-05, + "loss": 1.3334, + "step": 3610 + }, + { + "epoch": 1.0960692062528457, + "grad_norm": 0.4099176824092865, + "learning_rate": 7.812272174969623e-05, + "loss": 1.1143, + "step": 3611 + }, + { + "epoch": 1.096372742449537, + "grad_norm": 0.571192741394043, + "learning_rate": 7.811664641555286e-05, + "loss": 1.7579, + "step": 3612 + }, + { + "epoch": 1.0966762786462285, + "grad_norm": 0.48647162318229675, + "learning_rate": 7.811057108140948e-05, + "loss": 1.8592, + "step": 3613 + }, + { + "epoch": 1.09697981484292, + "grad_norm": 0.4793272316455841, + "learning_rate": 7.810449574726611e-05, + "loss": 1.7882, + "step": 3614 + }, + { + "epoch": 1.0972833510396114, + "grad_norm": 0.49005764722824097, + "learning_rate": 7.809842041312273e-05, + "loss": 1.52, + "step": 3615 + }, + { + "epoch": 1.0975868872363028, + "grad_norm": 0.5489885210990906, + "learning_rate": 7.809234507897935e-05, + "loss": 1.7768, + "step": 3616 + }, + { + "epoch": 1.0978904234329945, + "grad_norm": 0.5832868814468384, + "learning_rate": 7.808626974483598e-05, + "loss": 1.2381, + "step": 3617 + }, + { + "epoch": 1.098193959629686, + "grad_norm": 1.2282688617706299, + "learning_rate": 7.80801944106926e-05, + "loss": 1.6286, + "step": 3618 + }, + { + "epoch": 1.0984974958263773, + "grad_norm": 0.5094382166862488, + "learning_rate": 7.807411907654921e-05, + "loss": 1.9736, + "step": 3619 + }, + { + "epoch": 1.0988010320230688, + "grad_norm": 0.4855671525001526, + "learning_rate": 7.806804374240584e-05, + "loss": 1.9063, + "step": 3620 + }, + { + "epoch": 1.0991045682197602, + "grad_norm": 0.5706669092178345, + "learning_rate": 7.806196840826246e-05, + "loss": 1.9509, + "step": 3621 + }, + { + "epoch": 1.0994081044164516, + "grad_norm": 0.5625984072685242, + "learning_rate": 7.805589307411908e-05, + "loss": 1.5796, + "step": 3622 + }, + { + "epoch": 1.099711640613143, + "grad_norm": 0.5737254023551941, + "learning_rate": 7.804981773997571e-05, + "loss": 1.6272, + "step": 3623 + }, + { + "epoch": 1.1000151768098345, + "grad_norm": 0.3637593388557434, + "learning_rate": 7.804374240583232e-05, + "loss": 1.2409, + "step": 3624 + }, + { + "epoch": 1.100318713006526, + "grad_norm": 0.4899303615093231, + "learning_rate": 7.803766707168894e-05, + "loss": 1.3884, + "step": 3625 + }, + { + "epoch": 1.1006222492032174, + "grad_norm": 0.524005115032196, + "learning_rate": 7.803159173754557e-05, + "loss": 1.9453, + "step": 3626 + }, + { + "epoch": 1.100925785399909, + "grad_norm": 0.5083621740341187, + "learning_rate": 7.802551640340219e-05, + "loss": 1.5158, + "step": 3627 + }, + { + "epoch": 1.1012293215966005, + "grad_norm": 0.46930131316185, + "learning_rate": 7.801944106925882e-05, + "loss": 1.7324, + "step": 3628 + }, + { + "epoch": 1.101532857793292, + "grad_norm": 0.5122260451316833, + "learning_rate": 7.801336573511544e-05, + "loss": 1.4132, + "step": 3629 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.5043088793754578, + "learning_rate": 7.800729040097206e-05, + "loss": 1.6932, + "step": 3630 + }, + { + "epoch": 1.1021399301866748, + "grad_norm": 0.5585395097732544, + "learning_rate": 7.800121506682869e-05, + "loss": 1.6254, + "step": 3631 + }, + { + "epoch": 1.1024434663833662, + "grad_norm": 0.4394286572933197, + "learning_rate": 7.799513973268529e-05, + "loss": 1.6988, + "step": 3632 + }, + { + "epoch": 1.1027470025800576, + "grad_norm": 0.5732413530349731, + "learning_rate": 7.798906439854192e-05, + "loss": 1.0782, + "step": 3633 + }, + { + "epoch": 1.103050538776749, + "grad_norm": 0.49380823969841003, + "learning_rate": 7.798298906439855e-05, + "loss": 1.3272, + "step": 3634 + }, + { + "epoch": 1.1033540749734405, + "grad_norm": 0.5082643628120422, + "learning_rate": 7.797691373025517e-05, + "loss": 1.6587, + "step": 3635 + }, + { + "epoch": 1.103657611170132, + "grad_norm": 0.6942585706710815, + "learning_rate": 7.797083839611179e-05, + "loss": 1.565, + "step": 3636 + }, + { + "epoch": 1.1039611473668236, + "grad_norm": 0.5077084302902222, + "learning_rate": 7.796476306196842e-05, + "loss": 1.8823, + "step": 3637 + }, + { + "epoch": 1.104264683563515, + "grad_norm": 0.5313974022865295, + "learning_rate": 7.795868772782503e-05, + "loss": 1.5792, + "step": 3638 + }, + { + "epoch": 1.1045682197602065, + "grad_norm": 0.5107327699661255, + "learning_rate": 7.795261239368165e-05, + "loss": 1.4091, + "step": 3639 + }, + { + "epoch": 1.104871755956898, + "grad_norm": 0.530952513217926, + "learning_rate": 7.794653705953828e-05, + "loss": 1.918, + "step": 3640 + }, + { + "epoch": 1.1051752921535893, + "grad_norm": 0.6229440569877625, + "learning_rate": 7.79404617253949e-05, + "loss": 1.7242, + "step": 3641 + }, + { + "epoch": 1.1054788283502808, + "grad_norm": 0.5772741436958313, + "learning_rate": 7.793438639125152e-05, + "loss": 1.4634, + "step": 3642 + }, + { + "epoch": 1.1057823645469722, + "grad_norm": 0.6212802529335022, + "learning_rate": 7.792831105710815e-05, + "loss": 1.0477, + "step": 3643 + }, + { + "epoch": 1.1060859007436636, + "grad_norm": 0.51209557056427, + "learning_rate": 7.792223572296477e-05, + "loss": 1.6945, + "step": 3644 + }, + { + "epoch": 1.106389436940355, + "grad_norm": 0.45700934529304504, + "learning_rate": 7.79161603888214e-05, + "loss": 1.6441, + "step": 3645 + }, + { + "epoch": 1.1066929731370465, + "grad_norm": 0.5479162931442261, + "learning_rate": 7.7910085054678e-05, + "loss": 1.2612, + "step": 3646 + }, + { + "epoch": 1.106996509333738, + "grad_norm": 0.5235689878463745, + "learning_rate": 7.790400972053463e-05, + "loss": 1.5583, + "step": 3647 + }, + { + "epoch": 1.1073000455304296, + "grad_norm": 0.40267738699913025, + "learning_rate": 7.789793438639126e-05, + "loss": 1.5987, + "step": 3648 + }, + { + "epoch": 1.107603581727121, + "grad_norm": 0.4579909145832062, + "learning_rate": 7.789185905224788e-05, + "loss": 2.0193, + "step": 3649 + }, + { + "epoch": 1.1079071179238125, + "grad_norm": 0.7406178712844849, + "learning_rate": 7.78857837181045e-05, + "loss": 1.3012, + "step": 3650 + }, + { + "epoch": 1.108210654120504, + "grad_norm": 0.5075519680976868, + "learning_rate": 7.787970838396113e-05, + "loss": 1.8573, + "step": 3651 + }, + { + "epoch": 1.1085141903171953, + "grad_norm": 0.5122193098068237, + "learning_rate": 7.787363304981774e-05, + "loss": 1.4937, + "step": 3652 + }, + { + "epoch": 1.1088177265138868, + "grad_norm": 0.5174267292022705, + "learning_rate": 7.786755771567436e-05, + "loss": 1.3752, + "step": 3653 + }, + { + "epoch": 1.1091212627105782, + "grad_norm": 0.47906357049942017, + "learning_rate": 7.786148238153099e-05, + "loss": 1.4443, + "step": 3654 + }, + { + "epoch": 1.1094247989072696, + "grad_norm": 0.5059614777565002, + "learning_rate": 7.785540704738761e-05, + "loss": 1.5518, + "step": 3655 + }, + { + "epoch": 1.109728335103961, + "grad_norm": 0.49383166432380676, + "learning_rate": 7.784933171324423e-05, + "loss": 1.6696, + "step": 3656 + }, + { + "epoch": 1.1100318713006525, + "grad_norm": 0.45457521080970764, + "learning_rate": 7.784325637910086e-05, + "loss": 1.4528, + "step": 3657 + }, + { + "epoch": 1.1103354074973442, + "grad_norm": 0.4575364291667938, + "learning_rate": 7.783718104495748e-05, + "loss": 1.7467, + "step": 3658 + }, + { + "epoch": 1.1106389436940356, + "grad_norm": 0.4990423619747162, + "learning_rate": 7.78311057108141e-05, + "loss": 1.6548, + "step": 3659 + }, + { + "epoch": 1.110942479890727, + "grad_norm": 0.5598446726799011, + "learning_rate": 7.782503037667071e-05, + "loss": 1.4941, + "step": 3660 + }, + { + "epoch": 1.1112460160874185, + "grad_norm": 0.466371089220047, + "learning_rate": 7.781895504252734e-05, + "loss": 1.7623, + "step": 3661 + }, + { + "epoch": 1.11154955228411, + "grad_norm": 0.4354589879512787, + "learning_rate": 7.781287970838397e-05, + "loss": 1.9063, + "step": 3662 + }, + { + "epoch": 1.1118530884808013, + "grad_norm": 0.7934980988502502, + "learning_rate": 7.780680437424059e-05, + "loss": 1.8391, + "step": 3663 + }, + { + "epoch": 1.1121566246774928, + "grad_norm": 0.4731541872024536, + "learning_rate": 7.78007290400972e-05, + "loss": 1.9535, + "step": 3664 + }, + { + "epoch": 1.1124601608741842, + "grad_norm": 0.47908467054367065, + "learning_rate": 7.779465370595384e-05, + "loss": 1.8113, + "step": 3665 + }, + { + "epoch": 1.1127636970708756, + "grad_norm": 0.4935145676136017, + "learning_rate": 7.778857837181045e-05, + "loss": 1.7136, + "step": 3666 + }, + { + "epoch": 1.113067233267567, + "grad_norm": 0.4779261350631714, + "learning_rate": 7.778250303766707e-05, + "loss": 1.5593, + "step": 3667 + }, + { + "epoch": 1.1133707694642587, + "grad_norm": 0.3891371786594391, + "learning_rate": 7.77764277035237e-05, + "loss": 1.2089, + "step": 3668 + }, + { + "epoch": 1.1136743056609502, + "grad_norm": 0.5638146996498108, + "learning_rate": 7.777035236938032e-05, + "loss": 1.345, + "step": 3669 + }, + { + "epoch": 1.1139778418576416, + "grad_norm": 0.46427562832832336, + "learning_rate": 7.776427703523694e-05, + "loss": 1.8294, + "step": 3670 + }, + { + "epoch": 1.114281378054333, + "grad_norm": 1.02618408203125, + "learning_rate": 7.775820170109357e-05, + "loss": 1.608, + "step": 3671 + }, + { + "epoch": 1.1145849142510245, + "grad_norm": 0.5669841766357422, + "learning_rate": 7.775212636695019e-05, + "loss": 1.5572, + "step": 3672 + }, + { + "epoch": 1.114888450447716, + "grad_norm": 0.5150823593139648, + "learning_rate": 7.774605103280682e-05, + "loss": 1.6798, + "step": 3673 + }, + { + "epoch": 1.1151919866444073, + "grad_norm": 0.6217275857925415, + "learning_rate": 7.773997569866342e-05, + "loss": 1.4402, + "step": 3674 + }, + { + "epoch": 1.1154955228410988, + "grad_norm": 0.508321225643158, + "learning_rate": 7.773390036452005e-05, + "loss": 1.6684, + "step": 3675 + }, + { + "epoch": 1.1157990590377902, + "grad_norm": 0.44217655062675476, + "learning_rate": 7.772782503037668e-05, + "loss": 1.5984, + "step": 3676 + }, + { + "epoch": 1.1161025952344816, + "grad_norm": 0.4717262089252472, + "learning_rate": 7.77217496962333e-05, + "loss": 1.4775, + "step": 3677 + }, + { + "epoch": 1.116406131431173, + "grad_norm": 0.4989759922027588, + "learning_rate": 7.771567436208992e-05, + "loss": 1.4132, + "step": 3678 + }, + { + "epoch": 1.1167096676278647, + "grad_norm": 0.44810184836387634, + "learning_rate": 7.770959902794655e-05, + "loss": 1.7462, + "step": 3679 + }, + { + "epoch": 1.1170132038245562, + "grad_norm": 0.4343874156475067, + "learning_rate": 7.770352369380316e-05, + "loss": 1.6128, + "step": 3680 + }, + { + "epoch": 1.1173167400212476, + "grad_norm": 0.4640476107597351, + "learning_rate": 7.769744835965978e-05, + "loss": 1.6007, + "step": 3681 + }, + { + "epoch": 1.117620276217939, + "grad_norm": 0.4636215567588806, + "learning_rate": 7.769137302551641e-05, + "loss": 1.4001, + "step": 3682 + }, + { + "epoch": 1.1179238124146305, + "grad_norm": 0.5073500871658325, + "learning_rate": 7.768529769137303e-05, + "loss": 1.8682, + "step": 3683 + }, + { + "epoch": 1.118227348611322, + "grad_norm": 0.5101370811462402, + "learning_rate": 7.767922235722965e-05, + "loss": 1.8096, + "step": 3684 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.42578715085983276, + "learning_rate": 7.767314702308628e-05, + "loss": 1.393, + "step": 3685 + }, + { + "epoch": 1.1188344210047048, + "grad_norm": 0.4419322907924652, + "learning_rate": 7.76670716889429e-05, + "loss": 1.7337, + "step": 3686 + }, + { + "epoch": 1.1191379572013962, + "grad_norm": 0.48602306842803955, + "learning_rate": 7.766099635479953e-05, + "loss": 1.6385, + "step": 3687 + }, + { + "epoch": 1.1194414933980876, + "grad_norm": 0.46349820494651794, + "learning_rate": 7.765492102065613e-05, + "loss": 1.8216, + "step": 3688 + }, + { + "epoch": 1.119745029594779, + "grad_norm": 0.5057324767112732, + "learning_rate": 7.764884568651276e-05, + "loss": 1.4216, + "step": 3689 + }, + { + "epoch": 1.1200485657914707, + "grad_norm": 0.45940324664115906, + "learning_rate": 7.764277035236939e-05, + "loss": 1.7301, + "step": 3690 + }, + { + "epoch": 1.1203521019881622, + "grad_norm": 0.47218936681747437, + "learning_rate": 7.763669501822601e-05, + "loss": 1.6976, + "step": 3691 + }, + { + "epoch": 1.1206556381848536, + "grad_norm": 0.4720531105995178, + "learning_rate": 7.763061968408263e-05, + "loss": 1.4658, + "step": 3692 + }, + { + "epoch": 1.120959174381545, + "grad_norm": 0.48740169405937195, + "learning_rate": 7.762454434993926e-05, + "loss": 1.325, + "step": 3693 + }, + { + "epoch": 1.1212627105782365, + "grad_norm": 0.4727463722229004, + "learning_rate": 7.761846901579587e-05, + "loss": 1.7131, + "step": 3694 + }, + { + "epoch": 1.1215662467749279, + "grad_norm": 0.563522458076477, + "learning_rate": 7.761239368165249e-05, + "loss": 1.7699, + "step": 3695 + }, + { + "epoch": 1.1218697829716193, + "grad_norm": 0.492064893245697, + "learning_rate": 7.760631834750912e-05, + "loss": 1.9043, + "step": 3696 + }, + { + "epoch": 1.1221733191683108, + "grad_norm": 1.3425596952438354, + "learning_rate": 7.760024301336574e-05, + "loss": 1.6455, + "step": 3697 + }, + { + "epoch": 1.1224768553650022, + "grad_norm": 0.4847075641155243, + "learning_rate": 7.759416767922236e-05, + "loss": 1.7395, + "step": 3698 + }, + { + "epoch": 1.1227803915616938, + "grad_norm": 0.7759005427360535, + "learning_rate": 7.758809234507897e-05, + "loss": 1.6917, + "step": 3699 + }, + { + "epoch": 1.1230839277583853, + "grad_norm": 0.6782048344612122, + "learning_rate": 7.75820170109356e-05, + "loss": 1.443, + "step": 3700 + }, + { + "epoch": 1.1233874639550767, + "grad_norm": 0.4265955686569214, + "learning_rate": 7.757594167679224e-05, + "loss": 1.2195, + "step": 3701 + }, + { + "epoch": 1.1236910001517681, + "grad_norm": 0.8154575824737549, + "learning_rate": 7.756986634264884e-05, + "loss": 1.4988, + "step": 3702 + }, + { + "epoch": 1.1239945363484596, + "grad_norm": 0.4791830778121948, + "learning_rate": 7.756379100850547e-05, + "loss": 1.2148, + "step": 3703 + }, + { + "epoch": 1.124298072545151, + "grad_norm": 0.4666757583618164, + "learning_rate": 7.75577156743621e-05, + "loss": 1.5228, + "step": 3704 + }, + { + "epoch": 1.1246016087418425, + "grad_norm": 0.4709447920322418, + "learning_rate": 7.75516403402187e-05, + "loss": 1.6371, + "step": 3705 + }, + { + "epoch": 1.1249051449385339, + "grad_norm": 0.47154557704925537, + "learning_rate": 7.754556500607534e-05, + "loss": 1.1386, + "step": 3706 + }, + { + "epoch": 1.1252086811352253, + "grad_norm": 0.384400337934494, + "learning_rate": 7.753948967193197e-05, + "loss": 1.6998, + "step": 3707 + }, + { + "epoch": 1.1255122173319168, + "grad_norm": 0.5026933550834656, + "learning_rate": 7.753341433778858e-05, + "loss": 1.8225, + "step": 3708 + }, + { + "epoch": 1.1258157535286082, + "grad_norm": 0.5157676339149475, + "learning_rate": 7.75273390036452e-05, + "loss": 1.4864, + "step": 3709 + }, + { + "epoch": 1.1261192897252998, + "grad_norm": 1.0850920677185059, + "learning_rate": 7.752126366950183e-05, + "loss": 1.8376, + "step": 3710 + }, + { + "epoch": 1.1264228259219913, + "grad_norm": 0.481871634721756, + "learning_rate": 7.751518833535845e-05, + "loss": 1.6568, + "step": 3711 + }, + { + "epoch": 1.1267263621186827, + "grad_norm": 0.5413371920585632, + "learning_rate": 7.750911300121507e-05, + "loss": 1.3793, + "step": 3712 + }, + { + "epoch": 1.1270298983153741, + "grad_norm": 0.44527462124824524, + "learning_rate": 7.750303766707168e-05, + "loss": 1.773, + "step": 3713 + }, + { + "epoch": 1.1273334345120656, + "grad_norm": 0.49938610196113586, + "learning_rate": 7.749696233292832e-05, + "loss": 1.5552, + "step": 3714 + }, + { + "epoch": 1.127636970708757, + "grad_norm": 0.5346314311027527, + "learning_rate": 7.749088699878495e-05, + "loss": 1.5582, + "step": 3715 + }, + { + "epoch": 1.1279405069054484, + "grad_norm": 0.9484091401100159, + "learning_rate": 7.748481166464155e-05, + "loss": 1.5542, + "step": 3716 + }, + { + "epoch": 1.1282440431021399, + "grad_norm": 0.47302237153053284, + "learning_rate": 7.747873633049818e-05, + "loss": 1.9569, + "step": 3717 + }, + { + "epoch": 1.1285475792988313, + "grad_norm": 0.5025098323822021, + "learning_rate": 7.747266099635481e-05, + "loss": 1.4258, + "step": 3718 + }, + { + "epoch": 1.1288511154955228, + "grad_norm": 0.42215389013290405, + "learning_rate": 7.746658566221142e-05, + "loss": 1.6635, + "step": 3719 + }, + { + "epoch": 1.1291546516922142, + "grad_norm": 0.49948540329933167, + "learning_rate": 7.746051032806805e-05, + "loss": 1.6155, + "step": 3720 + }, + { + "epoch": 1.1294581878889058, + "grad_norm": 0.6525446176528931, + "learning_rate": 7.745443499392468e-05, + "loss": 1.6404, + "step": 3721 + }, + { + "epoch": 1.1297617240855973, + "grad_norm": 0.5007261633872986, + "learning_rate": 7.74483596597813e-05, + "loss": 1.2159, + "step": 3722 + }, + { + "epoch": 1.1300652602822887, + "grad_norm": 0.6284709572792053, + "learning_rate": 7.744228432563791e-05, + "loss": 1.8035, + "step": 3723 + }, + { + "epoch": 1.1303687964789801, + "grad_norm": 0.705549418926239, + "learning_rate": 7.743620899149454e-05, + "loss": 1.2551, + "step": 3724 + }, + { + "epoch": 1.1306723326756716, + "grad_norm": 1.1247000694274902, + "learning_rate": 7.743013365735116e-05, + "loss": 1.652, + "step": 3725 + }, + { + "epoch": 1.130975868872363, + "grad_norm": 0.4521750211715698, + "learning_rate": 7.742405832320778e-05, + "loss": 1.6, + "step": 3726 + }, + { + "epoch": 1.1312794050690544, + "grad_norm": 0.4077288806438446, + "learning_rate": 7.74179829890644e-05, + "loss": 1.8314, + "step": 3727 + }, + { + "epoch": 1.1315829412657459, + "grad_norm": 0.4165663421154022, + "learning_rate": 7.741190765492103e-05, + "loss": 1.9458, + "step": 3728 + }, + { + "epoch": 1.1318864774624373, + "grad_norm": 0.55204176902771, + "learning_rate": 7.740583232077764e-05, + "loss": 1.661, + "step": 3729 + }, + { + "epoch": 1.132190013659129, + "grad_norm": 0.5003572106361389, + "learning_rate": 7.739975698663426e-05, + "loss": 1.6445, + "step": 3730 + }, + { + "epoch": 1.1324935498558204, + "grad_norm": 0.5279999375343323, + "learning_rate": 7.739368165249089e-05, + "loss": 1.5472, + "step": 3731 + }, + { + "epoch": 1.1327970860525118, + "grad_norm": 0.5142978429794312, + "learning_rate": 7.738760631834752e-05, + "loss": 1.6682, + "step": 3732 + }, + { + "epoch": 1.1331006222492033, + "grad_norm": 0.5571588277816772, + "learning_rate": 7.738153098420413e-05, + "loss": 1.5633, + "step": 3733 + }, + { + "epoch": 1.1334041584458947, + "grad_norm": 0.6555919647216797, + "learning_rate": 7.737545565006076e-05, + "loss": 1.2002, + "step": 3734 + }, + { + "epoch": 1.1337076946425861, + "grad_norm": 0.482624888420105, + "learning_rate": 7.736938031591739e-05, + "loss": 1.7551, + "step": 3735 + }, + { + "epoch": 1.1340112308392776, + "grad_norm": 0.4221251904964447, + "learning_rate": 7.7363304981774e-05, + "loss": 1.1514, + "step": 3736 + }, + { + "epoch": 1.134314767035969, + "grad_norm": 0.482462614774704, + "learning_rate": 7.735722964763062e-05, + "loss": 1.2174, + "step": 3737 + }, + { + "epoch": 1.1346183032326604, + "grad_norm": 0.5528337955474854, + "learning_rate": 7.735115431348725e-05, + "loss": 1.7362, + "step": 3738 + }, + { + "epoch": 1.1349218394293519, + "grad_norm": 0.4969632625579834, + "learning_rate": 7.734507897934387e-05, + "loss": 1.991, + "step": 3739 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.5240209698677063, + "learning_rate": 7.733900364520049e-05, + "loss": 1.5519, + "step": 3740 + }, + { + "epoch": 1.135528911822735, + "grad_norm": 0.41266825795173645, + "learning_rate": 7.73329283110571e-05, + "loss": 1.3405, + "step": 3741 + }, + { + "epoch": 1.1358324480194264, + "grad_norm": 0.5517387986183167, + "learning_rate": 7.732685297691374e-05, + "loss": 1.1253, + "step": 3742 + }, + { + "epoch": 1.1361359842161178, + "grad_norm": 0.49983713030815125, + "learning_rate": 7.732077764277035e-05, + "loss": 1.3818, + "step": 3743 + }, + { + "epoch": 1.1364395204128093, + "grad_norm": 0.5212677717208862, + "learning_rate": 7.731470230862697e-05, + "loss": 1.8162, + "step": 3744 + }, + { + "epoch": 1.1367430566095007, + "grad_norm": 0.5420991778373718, + "learning_rate": 7.73086269744836e-05, + "loss": 1.5643, + "step": 3745 + }, + { + "epoch": 1.1370465928061921, + "grad_norm": 0.5572590827941895, + "learning_rate": 7.730255164034023e-05, + "loss": 1.6639, + "step": 3746 + }, + { + "epoch": 1.1373501290028836, + "grad_norm": 0.5790618658065796, + "learning_rate": 7.729647630619684e-05, + "loss": 1.7084, + "step": 3747 + }, + { + "epoch": 1.137653665199575, + "grad_norm": 0.6989924311637878, + "learning_rate": 7.729040097205347e-05, + "loss": 1.7443, + "step": 3748 + }, + { + "epoch": 1.1379572013962664, + "grad_norm": 0.4986421763896942, + "learning_rate": 7.72843256379101e-05, + "loss": 1.5655, + "step": 3749 + }, + { + "epoch": 1.1382607375929579, + "grad_norm": 0.6421902179718018, + "learning_rate": 7.727825030376671e-05, + "loss": 1.4474, + "step": 3750 + }, + { + "epoch": 1.1385642737896493, + "grad_norm": 0.4390571117401123, + "learning_rate": 7.727217496962333e-05, + "loss": 1.9309, + "step": 3751 + }, + { + "epoch": 1.138867809986341, + "grad_norm": 0.6357335448265076, + "learning_rate": 7.726609963547996e-05, + "loss": 1.4736, + "step": 3752 + }, + { + "epoch": 1.1391713461830324, + "grad_norm": 0.5134897828102112, + "learning_rate": 7.726002430133658e-05, + "loss": 1.623, + "step": 3753 + }, + { + "epoch": 1.1394748823797238, + "grad_norm": 0.5518725514411926, + "learning_rate": 7.72539489671932e-05, + "loss": 1.7415, + "step": 3754 + }, + { + "epoch": 1.1397784185764153, + "grad_norm": 0.49003735184669495, + "learning_rate": 7.724787363304981e-05, + "loss": 1.8879, + "step": 3755 + }, + { + "epoch": 1.1400819547731067, + "grad_norm": 0.5795713067054749, + "learning_rate": 7.724179829890645e-05, + "loss": 1.4763, + "step": 3756 + }, + { + "epoch": 1.1403854909697981, + "grad_norm": 0.4583165645599365, + "learning_rate": 7.723572296476306e-05, + "loss": 1.7021, + "step": 3757 + }, + { + "epoch": 1.1406890271664896, + "grad_norm": 0.5582238435745239, + "learning_rate": 7.722964763061968e-05, + "loss": 1.2102, + "step": 3758 + }, + { + "epoch": 1.140992563363181, + "grad_norm": 0.8933469653129578, + "learning_rate": 7.722357229647631e-05, + "loss": 1.5831, + "step": 3759 + }, + { + "epoch": 1.1412960995598724, + "grad_norm": 0.5090795755386353, + "learning_rate": 7.721749696233294e-05, + "loss": 1.7801, + "step": 3760 + }, + { + "epoch": 1.141599635756564, + "grad_norm": 0.4822721481323242, + "learning_rate": 7.721142162818955e-05, + "loss": 1.5706, + "step": 3761 + }, + { + "epoch": 1.1419031719532553, + "grad_norm": 0.5089300870895386, + "learning_rate": 7.720534629404618e-05, + "loss": 1.7128, + "step": 3762 + }, + { + "epoch": 1.142206708149947, + "grad_norm": 0.5474359393119812, + "learning_rate": 7.719927095990281e-05, + "loss": 1.5872, + "step": 3763 + }, + { + "epoch": 1.1425102443466384, + "grad_norm": 0.582497775554657, + "learning_rate": 7.719319562575942e-05, + "loss": 1.5936, + "step": 3764 + }, + { + "epoch": 1.1428137805433298, + "grad_norm": 0.49585655331611633, + "learning_rate": 7.718712029161604e-05, + "loss": 1.1567, + "step": 3765 + }, + { + "epoch": 1.1431173167400213, + "grad_norm": 0.5703508257865906, + "learning_rate": 7.718104495747267e-05, + "loss": 1.5919, + "step": 3766 + }, + { + "epoch": 1.1434208529367127, + "grad_norm": 0.5075818300247192, + "learning_rate": 7.717496962332929e-05, + "loss": 1.683, + "step": 3767 + }, + { + "epoch": 1.1437243891334041, + "grad_norm": 0.5167961120605469, + "learning_rate": 7.716889428918591e-05, + "loss": 1.425, + "step": 3768 + }, + { + "epoch": 1.1440279253300956, + "grad_norm": 0.5668395161628723, + "learning_rate": 7.716281895504252e-05, + "loss": 1.4884, + "step": 3769 + }, + { + "epoch": 1.144331461526787, + "grad_norm": 0.4944659173488617, + "learning_rate": 7.715674362089916e-05, + "loss": 1.1739, + "step": 3770 + }, + { + "epoch": 1.1446349977234784, + "grad_norm": 0.792289137840271, + "learning_rate": 7.715066828675577e-05, + "loss": 1.7746, + "step": 3771 + }, + { + "epoch": 1.14493853392017, + "grad_norm": 0.48457542061805725, + "learning_rate": 7.714459295261239e-05, + "loss": 1.7788, + "step": 3772 + }, + { + "epoch": 1.1452420701168615, + "grad_norm": 0.5210672616958618, + "learning_rate": 7.713851761846902e-05, + "loss": 1.7434, + "step": 3773 + }, + { + "epoch": 1.145545606313553, + "grad_norm": 0.41982871294021606, + "learning_rate": 7.713244228432565e-05, + "loss": 1.6847, + "step": 3774 + }, + { + "epoch": 1.1458491425102444, + "grad_norm": 0.5397769212722778, + "learning_rate": 7.712636695018226e-05, + "loss": 1.7824, + "step": 3775 + }, + { + "epoch": 1.1461526787069358, + "grad_norm": 0.9942769408226013, + "learning_rate": 7.712029161603889e-05, + "loss": 1.2639, + "step": 3776 + }, + { + "epoch": 1.1464562149036273, + "grad_norm": 0.5531973838806152, + "learning_rate": 7.711421628189552e-05, + "loss": 1.6249, + "step": 3777 + }, + { + "epoch": 1.1467597511003187, + "grad_norm": 0.4524674415588379, + "learning_rate": 7.710814094775212e-05, + "loss": 1.1845, + "step": 3778 + }, + { + "epoch": 1.1470632872970101, + "grad_norm": 0.4991176426410675, + "learning_rate": 7.710206561360875e-05, + "loss": 1.7489, + "step": 3779 + }, + { + "epoch": 1.1473668234937016, + "grad_norm": 0.46409863233566284, + "learning_rate": 7.709599027946537e-05, + "loss": 1.9125, + "step": 3780 + }, + { + "epoch": 1.147670359690393, + "grad_norm": 0.5320178866386414, + "learning_rate": 7.7089914945322e-05, + "loss": 1.5643, + "step": 3781 + }, + { + "epoch": 1.1479738958870844, + "grad_norm": 0.5475717782974243, + "learning_rate": 7.708383961117862e-05, + "loss": 1.6597, + "step": 3782 + }, + { + "epoch": 1.148277432083776, + "grad_norm": 0.6391961574554443, + "learning_rate": 7.707776427703523e-05, + "loss": 1.7717, + "step": 3783 + }, + { + "epoch": 1.1485809682804675, + "grad_norm": 0.583355724811554, + "learning_rate": 7.707168894289187e-05, + "loss": 1.8934, + "step": 3784 + }, + { + "epoch": 1.148884504477159, + "grad_norm": 0.891815721988678, + "learning_rate": 7.706561360874848e-05, + "loss": 0.836, + "step": 3785 + }, + { + "epoch": 1.1491880406738504, + "grad_norm": 0.5519468188285828, + "learning_rate": 7.70595382746051e-05, + "loss": 1.785, + "step": 3786 + }, + { + "epoch": 1.1494915768705418, + "grad_norm": 0.5709235668182373, + "learning_rate": 7.705346294046173e-05, + "loss": 1.3448, + "step": 3787 + }, + { + "epoch": 1.1497951130672333, + "grad_norm": 0.453735888004303, + "learning_rate": 7.704738760631836e-05, + "loss": 1.7276, + "step": 3788 + }, + { + "epoch": 1.1500986492639247, + "grad_norm": 0.4751914441585541, + "learning_rate": 7.704131227217497e-05, + "loss": 1.7094, + "step": 3789 + }, + { + "epoch": 1.1504021854606161, + "grad_norm": 0.4283442199230194, + "learning_rate": 7.70352369380316e-05, + "loss": 1.8033, + "step": 3790 + }, + { + "epoch": 1.1507057216573076, + "grad_norm": 0.4466361701488495, + "learning_rate": 7.702916160388823e-05, + "loss": 0.9553, + "step": 3791 + }, + { + "epoch": 1.1510092578539992, + "grad_norm": 0.4145885407924652, + "learning_rate": 7.702308626974483e-05, + "loss": 1.3865, + "step": 3792 + }, + { + "epoch": 1.1513127940506904, + "grad_norm": 0.5452487468719482, + "learning_rate": 7.701701093560146e-05, + "loss": 1.7645, + "step": 3793 + }, + { + "epoch": 1.151616330247382, + "grad_norm": 0.5216187834739685, + "learning_rate": 7.701093560145808e-05, + "loss": 1.7205, + "step": 3794 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.4803890287876129, + "learning_rate": 7.700486026731471e-05, + "loss": 1.8845, + "step": 3795 + }, + { + "epoch": 1.152223402640765, + "grad_norm": 0.42299672961235046, + "learning_rate": 7.699878493317133e-05, + "loss": 1.5909, + "step": 3796 + }, + { + "epoch": 1.1525269388374564, + "grad_norm": 0.6640902161598206, + "learning_rate": 7.699270959902794e-05, + "loss": 1.7704, + "step": 3797 + }, + { + "epoch": 1.1528304750341478, + "grad_norm": 0.4829888939857483, + "learning_rate": 7.698663426488458e-05, + "loss": 1.4564, + "step": 3798 + }, + { + "epoch": 1.1531340112308393, + "grad_norm": 0.49383172392845154, + "learning_rate": 7.698055893074119e-05, + "loss": 1.5888, + "step": 3799 + }, + { + "epoch": 1.1534375474275307, + "grad_norm": 0.4922170042991638, + "learning_rate": 7.697448359659781e-05, + "loss": 1.6228, + "step": 3800 + }, + { + "epoch": 1.1537410836242221, + "grad_norm": 0.5983831286430359, + "learning_rate": 7.696840826245444e-05, + "loss": 1.3906, + "step": 3801 + }, + { + "epoch": 1.1540446198209136, + "grad_norm": 0.4924396574497223, + "learning_rate": 7.696233292831106e-05, + "loss": 1.3455, + "step": 3802 + }, + { + "epoch": 1.1543481560176052, + "grad_norm": 0.39909911155700684, + "learning_rate": 7.695625759416768e-05, + "loss": 1.1296, + "step": 3803 + }, + { + "epoch": 1.1546516922142966, + "grad_norm": 0.5369203686714172, + "learning_rate": 7.69501822600243e-05, + "loss": 1.7645, + "step": 3804 + }, + { + "epoch": 1.154955228410988, + "grad_norm": 0.5576856732368469, + "learning_rate": 7.694410692588094e-05, + "loss": 1.6252, + "step": 3805 + }, + { + "epoch": 1.1552587646076795, + "grad_norm": 0.6163928508758545, + "learning_rate": 7.693803159173754e-05, + "loss": 1.1073, + "step": 3806 + }, + { + "epoch": 1.155562300804371, + "grad_norm": 0.47761568427085876, + "learning_rate": 7.693195625759417e-05, + "loss": 1.725, + "step": 3807 + }, + { + "epoch": 1.1558658370010624, + "grad_norm": 0.4886780083179474, + "learning_rate": 7.692588092345079e-05, + "loss": 2.0903, + "step": 3808 + }, + { + "epoch": 1.1561693731977538, + "grad_norm": 0.5503537058830261, + "learning_rate": 7.691980558930742e-05, + "loss": 1.6063, + "step": 3809 + }, + { + "epoch": 1.1564729093944452, + "grad_norm": 0.647091805934906, + "learning_rate": 7.691373025516404e-05, + "loss": 1.2219, + "step": 3810 + }, + { + "epoch": 1.1567764455911367, + "grad_norm": 0.7522347569465637, + "learning_rate": 7.690765492102065e-05, + "loss": 1.5603, + "step": 3811 + }, + { + "epoch": 1.1570799817878281, + "grad_norm": 0.5273557305335999, + "learning_rate": 7.690157958687729e-05, + "loss": 1.6698, + "step": 3812 + }, + { + "epoch": 1.1573835179845195, + "grad_norm": 0.5218062996864319, + "learning_rate": 7.68955042527339e-05, + "loss": 1.1702, + "step": 3813 + }, + { + "epoch": 1.1576870541812112, + "grad_norm": 0.4625975489616394, + "learning_rate": 7.688942891859052e-05, + "loss": 1.8227, + "step": 3814 + }, + { + "epoch": 1.1579905903779026, + "grad_norm": 0.49970725178718567, + "learning_rate": 7.688335358444715e-05, + "loss": 1.6382, + "step": 3815 + }, + { + "epoch": 1.158294126574594, + "grad_norm": 0.6002604365348816, + "learning_rate": 7.687727825030377e-05, + "loss": 1.7239, + "step": 3816 + }, + { + "epoch": 1.1585976627712855, + "grad_norm": 0.6793041825294495, + "learning_rate": 7.687120291616039e-05, + "loss": 2.0427, + "step": 3817 + }, + { + "epoch": 1.158901198967977, + "grad_norm": 0.5442394018173218, + "learning_rate": 7.686512758201702e-05, + "loss": 1.6008, + "step": 3818 + }, + { + "epoch": 1.1592047351646684, + "grad_norm": 0.4671969413757324, + "learning_rate": 7.685905224787365e-05, + "loss": 1.401, + "step": 3819 + }, + { + "epoch": 1.1595082713613598, + "grad_norm": 0.4723747968673706, + "learning_rate": 7.685297691373025e-05, + "loss": 1.5624, + "step": 3820 + }, + { + "epoch": 1.1598118075580512, + "grad_norm": 0.5985869765281677, + "learning_rate": 7.684690157958688e-05, + "loss": 1.5069, + "step": 3821 + }, + { + "epoch": 1.1601153437547427, + "grad_norm": 0.47640082240104675, + "learning_rate": 7.68408262454435e-05, + "loss": 1.0615, + "step": 3822 + }, + { + "epoch": 1.1604188799514341, + "grad_norm": 0.4906187057495117, + "learning_rate": 7.683475091130013e-05, + "loss": 1.4341, + "step": 3823 + }, + { + "epoch": 1.1607224161481255, + "grad_norm": 0.6372618675231934, + "learning_rate": 7.682867557715675e-05, + "loss": 1.2915, + "step": 3824 + }, + { + "epoch": 1.1610259523448172, + "grad_norm": 0.42580631375312805, + "learning_rate": 7.682260024301336e-05, + "loss": 1.2624, + "step": 3825 + }, + { + "epoch": 1.1613294885415086, + "grad_norm": 0.41982603073120117, + "learning_rate": 7.681652490887e-05, + "loss": 1.7227, + "step": 3826 + }, + { + "epoch": 1.1616330247382, + "grad_norm": 0.44015559554100037, + "learning_rate": 7.681044957472661e-05, + "loss": 1.4198, + "step": 3827 + }, + { + "epoch": 1.1619365609348915, + "grad_norm": 0.5147770047187805, + "learning_rate": 7.680437424058323e-05, + "loss": 1.3839, + "step": 3828 + }, + { + "epoch": 1.162240097131583, + "grad_norm": 0.5992082953453064, + "learning_rate": 7.679829890643986e-05, + "loss": 1.6994, + "step": 3829 + }, + { + "epoch": 1.1625436333282744, + "grad_norm": 0.5068255662918091, + "learning_rate": 7.679222357229648e-05, + "loss": 1.6389, + "step": 3830 + }, + { + "epoch": 1.1628471695249658, + "grad_norm": 0.5025370717048645, + "learning_rate": 7.67861482381531e-05, + "loss": 1.4422, + "step": 3831 + }, + { + "epoch": 1.1631507057216572, + "grad_norm": 0.5961645841598511, + "learning_rate": 7.678007290400973e-05, + "loss": 1.3941, + "step": 3832 + }, + { + "epoch": 1.1634542419183487, + "grad_norm": 0.5184653997421265, + "learning_rate": 7.677399756986636e-05, + "loss": 1.7886, + "step": 3833 + }, + { + "epoch": 1.1637577781150403, + "grad_norm": 0.4389922022819519, + "learning_rate": 7.676792223572296e-05, + "loss": 1.7861, + "step": 3834 + }, + { + "epoch": 1.1640613143117318, + "grad_norm": 0.5118327140808105, + "learning_rate": 7.676184690157959e-05, + "loss": 1.7161, + "step": 3835 + }, + { + "epoch": 1.1643648505084232, + "grad_norm": 0.5762491226196289, + "learning_rate": 7.675577156743621e-05, + "loss": 1.7012, + "step": 3836 + }, + { + "epoch": 1.1646683867051146, + "grad_norm": 0.480589359998703, + "learning_rate": 7.674969623329284e-05, + "loss": 1.6764, + "step": 3837 + }, + { + "epoch": 1.164971922901806, + "grad_norm": 0.45624813437461853, + "learning_rate": 7.674362089914946e-05, + "loss": 1.5741, + "step": 3838 + }, + { + "epoch": 1.1652754590984975, + "grad_norm": 0.5073031783103943, + "learning_rate": 7.673754556500608e-05, + "loss": 1.263, + "step": 3839 + }, + { + "epoch": 1.165578995295189, + "grad_norm": 0.44414857029914856, + "learning_rate": 7.67314702308627e-05, + "loss": 1.0236, + "step": 3840 + }, + { + "epoch": 1.1658825314918804, + "grad_norm": 0.5479186177253723, + "learning_rate": 7.672539489671932e-05, + "loss": 1.9612, + "step": 3841 + }, + { + "epoch": 1.1661860676885718, + "grad_norm": 0.4903987944126129, + "learning_rate": 7.671931956257594e-05, + "loss": 1.6913, + "step": 3842 + }, + { + "epoch": 1.1664896038852632, + "grad_norm": 0.5010794401168823, + "learning_rate": 7.671324422843257e-05, + "loss": 1.8156, + "step": 3843 + }, + { + "epoch": 1.1667931400819547, + "grad_norm": 0.4327058792114258, + "learning_rate": 7.670716889428919e-05, + "loss": 1.557, + "step": 3844 + }, + { + "epoch": 1.1670966762786463, + "grad_norm": 0.6099236011505127, + "learning_rate": 7.67010935601458e-05, + "loss": 1.7191, + "step": 3845 + }, + { + "epoch": 1.1674002124753378, + "grad_norm": 0.7435611486434937, + "learning_rate": 7.669501822600244e-05, + "loss": 1.8566, + "step": 3846 + }, + { + "epoch": 1.1677037486720292, + "grad_norm": 0.6030800938606262, + "learning_rate": 7.668894289185907e-05, + "loss": 1.3435, + "step": 3847 + }, + { + "epoch": 1.1680072848687206, + "grad_norm": 0.4840324819087982, + "learning_rate": 7.668286755771567e-05, + "loss": 1.5476, + "step": 3848 + }, + { + "epoch": 1.168310821065412, + "grad_norm": 0.686964213848114, + "learning_rate": 7.66767922235723e-05, + "loss": 1.5845, + "step": 3849 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.4797843396663666, + "learning_rate": 7.667071688942892e-05, + "loss": 1.7808, + "step": 3850 + }, + { + "epoch": 1.168917893458795, + "grad_norm": 0.5187574028968811, + "learning_rate": 7.666464155528554e-05, + "loss": 1.4564, + "step": 3851 + }, + { + "epoch": 1.1692214296554864, + "grad_norm": 0.42190396785736084, + "learning_rate": 7.665856622114217e-05, + "loss": 1.4857, + "step": 3852 + }, + { + "epoch": 1.1695249658521778, + "grad_norm": 0.4939133822917938, + "learning_rate": 7.665249088699879e-05, + "loss": 1.4132, + "step": 3853 + }, + { + "epoch": 1.1698285020488692, + "grad_norm": 0.4695587456226349, + "learning_rate": 7.664641555285542e-05, + "loss": 1.577, + "step": 3854 + }, + { + "epoch": 1.1701320382455607, + "grad_norm": 0.5055351257324219, + "learning_rate": 7.664034021871203e-05, + "loss": 1.6084, + "step": 3855 + }, + { + "epoch": 1.1704355744422523, + "grad_norm": 0.4340987503528595, + "learning_rate": 7.663426488456865e-05, + "loss": 1.8373, + "step": 3856 + }, + { + "epoch": 1.1707391106389438, + "grad_norm": 0.5082830190658569, + "learning_rate": 7.662818955042528e-05, + "loss": 1.9309, + "step": 3857 + }, + { + "epoch": 1.1710426468356352, + "grad_norm": 0.5326313972473145, + "learning_rate": 7.66221142162819e-05, + "loss": 1.5347, + "step": 3858 + }, + { + "epoch": 1.1713461830323266, + "grad_norm": 0.6817587018013, + "learning_rate": 7.661603888213852e-05, + "loss": 1.9486, + "step": 3859 + }, + { + "epoch": 1.171649719229018, + "grad_norm": 0.5530791282653809, + "learning_rate": 7.660996354799515e-05, + "loss": 1.5328, + "step": 3860 + }, + { + "epoch": 1.1719532554257095, + "grad_norm": 0.4731312692165375, + "learning_rate": 7.660388821385178e-05, + "loss": 1.7807, + "step": 3861 + }, + { + "epoch": 1.172256791622401, + "grad_norm": 0.46114182472229004, + "learning_rate": 7.659781287970838e-05, + "loss": 1.5321, + "step": 3862 + }, + { + "epoch": 1.1725603278190924, + "grad_norm": 0.4836636185646057, + "learning_rate": 7.659173754556501e-05, + "loss": 1.8332, + "step": 3863 + }, + { + "epoch": 1.1728638640157838, + "grad_norm": 0.44714653491973877, + "learning_rate": 7.658566221142163e-05, + "loss": 1.6486, + "step": 3864 + }, + { + "epoch": 1.1731674002124755, + "grad_norm": 0.5285139083862305, + "learning_rate": 7.657958687727825e-05, + "loss": 1.6531, + "step": 3865 + }, + { + "epoch": 1.1734709364091669, + "grad_norm": 0.44782644510269165, + "learning_rate": 7.657351154313488e-05, + "loss": 1.8439, + "step": 3866 + }, + { + "epoch": 1.1737744726058583, + "grad_norm": 0.4893675148487091, + "learning_rate": 7.65674362089915e-05, + "loss": 1.1381, + "step": 3867 + }, + { + "epoch": 1.1740780088025498, + "grad_norm": 0.619848370552063, + "learning_rate": 7.656136087484813e-05, + "loss": 1.4307, + "step": 3868 + }, + { + "epoch": 1.1743815449992412, + "grad_norm": 0.5027971267700195, + "learning_rate": 7.655528554070474e-05, + "loss": 1.5816, + "step": 3869 + }, + { + "epoch": 1.1746850811959326, + "grad_norm": 0.5814145803451538, + "learning_rate": 7.654921020656136e-05, + "loss": 2.0414, + "step": 3870 + }, + { + "epoch": 1.174988617392624, + "grad_norm": 0.5027217268943787, + "learning_rate": 7.654313487241799e-05, + "loss": 1.6058, + "step": 3871 + }, + { + "epoch": 1.1752921535893155, + "grad_norm": 0.5492193102836609, + "learning_rate": 7.653705953827461e-05, + "loss": 1.6025, + "step": 3872 + }, + { + "epoch": 1.175595689786007, + "grad_norm": 0.5875594615936279, + "learning_rate": 7.653098420413123e-05, + "loss": 1.6729, + "step": 3873 + }, + { + "epoch": 1.1758992259826984, + "grad_norm": 0.46128249168395996, + "learning_rate": 7.652490886998786e-05, + "loss": 1.7184, + "step": 3874 + }, + { + "epoch": 1.1762027621793898, + "grad_norm": 0.4899282157421112, + "learning_rate": 7.651883353584447e-05, + "loss": 1.7772, + "step": 3875 + }, + { + "epoch": 1.1765062983760814, + "grad_norm": 0.4934176206588745, + "learning_rate": 7.651275820170109e-05, + "loss": 1.5329, + "step": 3876 + }, + { + "epoch": 1.1768098345727729, + "grad_norm": 0.41739147901535034, + "learning_rate": 7.650668286755772e-05, + "loss": 1.0405, + "step": 3877 + }, + { + "epoch": 1.1771133707694643, + "grad_norm": 0.5608689785003662, + "learning_rate": 7.650060753341434e-05, + "loss": 1.459, + "step": 3878 + }, + { + "epoch": 1.1774169069661558, + "grad_norm": 5.719343185424805, + "learning_rate": 7.649453219927096e-05, + "loss": 1.4504, + "step": 3879 + }, + { + "epoch": 1.1777204431628472, + "grad_norm": 0.5679183006286621, + "learning_rate": 7.648845686512759e-05, + "loss": 1.699, + "step": 3880 + }, + { + "epoch": 1.1780239793595386, + "grad_norm": 0.5237777233123779, + "learning_rate": 7.64823815309842e-05, + "loss": 1.7255, + "step": 3881 + }, + { + "epoch": 1.17832751555623, + "grad_norm": 0.5510279536247253, + "learning_rate": 7.647630619684084e-05, + "loss": 1.8984, + "step": 3882 + }, + { + "epoch": 1.1786310517529215, + "grad_norm": 0.4513683021068573, + "learning_rate": 7.647023086269745e-05, + "loss": 1.8629, + "step": 3883 + }, + { + "epoch": 1.178934587949613, + "grad_norm": 0.5571762919425964, + "learning_rate": 7.646415552855407e-05, + "loss": 1.4258, + "step": 3884 + }, + { + "epoch": 1.1792381241463044, + "grad_norm": 0.5506730675697327, + "learning_rate": 7.64580801944107e-05, + "loss": 1.5378, + "step": 3885 + }, + { + "epoch": 1.1795416603429958, + "grad_norm": 0.4214894771575928, + "learning_rate": 7.645200486026732e-05, + "loss": 1.4376, + "step": 3886 + }, + { + "epoch": 1.1798451965396874, + "grad_norm": 0.5280786752700806, + "learning_rate": 7.644592952612394e-05, + "loss": 1.6226, + "step": 3887 + }, + { + "epoch": 1.1801487327363789, + "grad_norm": 2.3207452297210693, + "learning_rate": 7.643985419198057e-05, + "loss": 1.3407, + "step": 3888 + }, + { + "epoch": 1.1804522689330703, + "grad_norm": 0.5287031531333923, + "learning_rate": 7.643377885783718e-05, + "loss": 1.7205, + "step": 3889 + }, + { + "epoch": 1.1807558051297617, + "grad_norm": 0.5691362619400024, + "learning_rate": 7.64277035236938e-05, + "loss": 1.707, + "step": 3890 + }, + { + "epoch": 1.1810593413264532, + "grad_norm": 0.5688780546188354, + "learning_rate": 7.642162818955043e-05, + "loss": 1.6914, + "step": 3891 + }, + { + "epoch": 1.1813628775231446, + "grad_norm": 0.6007869839668274, + "learning_rate": 7.641555285540705e-05, + "loss": 1.6556, + "step": 3892 + }, + { + "epoch": 1.181666413719836, + "grad_norm": 0.8336607217788696, + "learning_rate": 7.640947752126367e-05, + "loss": 1.6392, + "step": 3893 + }, + { + "epoch": 1.1819699499165275, + "grad_norm": 0.5636674761772156, + "learning_rate": 7.64034021871203e-05, + "loss": 1.602, + "step": 3894 + }, + { + "epoch": 1.182273486113219, + "grad_norm": 0.47849076986312866, + "learning_rate": 7.639732685297692e-05, + "loss": 1.6095, + "step": 3895 + }, + { + "epoch": 1.1825770223099106, + "grad_norm": 0.4776079058647156, + "learning_rate": 7.639125151883355e-05, + "loss": 1.5303, + "step": 3896 + }, + { + "epoch": 1.182880558506602, + "grad_norm": 0.5701802968978882, + "learning_rate": 7.638517618469016e-05, + "loss": 1.4568, + "step": 3897 + }, + { + "epoch": 1.1831840947032934, + "grad_norm": 0.4271094799041748, + "learning_rate": 7.637910085054678e-05, + "loss": 1.7324, + "step": 3898 + }, + { + "epoch": 1.1834876308999849, + "grad_norm": 0.5306187272071838, + "learning_rate": 7.637302551640341e-05, + "loss": 1.9042, + "step": 3899 + }, + { + "epoch": 1.1837911670966763, + "grad_norm": 0.5607674717903137, + "learning_rate": 7.636695018226003e-05, + "loss": 1.5753, + "step": 3900 + }, + { + "epoch": 1.1840947032933677, + "grad_norm": 0.520979106426239, + "learning_rate": 7.636087484811665e-05, + "loss": 1.773, + "step": 3901 + }, + { + "epoch": 1.1843982394900592, + "grad_norm": 0.5560202598571777, + "learning_rate": 7.635479951397328e-05, + "loss": 1.685, + "step": 3902 + }, + { + "epoch": 1.1847017756867506, + "grad_norm": 0.4960952401161194, + "learning_rate": 7.63487241798299e-05, + "loss": 1.4656, + "step": 3903 + }, + { + "epoch": 1.185005311883442, + "grad_norm": 0.5220305323600769, + "learning_rate": 7.634264884568651e-05, + "loss": 1.7571, + "step": 3904 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.5441679954528809, + "learning_rate": 7.633657351154314e-05, + "loss": 1.7369, + "step": 3905 + }, + { + "epoch": 1.185612384276825, + "grad_norm": 0.524355411529541, + "learning_rate": 7.633049817739976e-05, + "loss": 1.554, + "step": 3906 + }, + { + "epoch": 1.1859159204735166, + "grad_norm": 0.580812931060791, + "learning_rate": 7.632442284325638e-05, + "loss": 1.1878, + "step": 3907 + }, + { + "epoch": 1.186219456670208, + "grad_norm": 0.4383397400379181, + "learning_rate": 7.631834750911301e-05, + "loss": 1.3957, + "step": 3908 + }, + { + "epoch": 1.1865229928668994, + "grad_norm": 0.5575391054153442, + "learning_rate": 7.631227217496963e-05, + "loss": 1.7958, + "step": 3909 + }, + { + "epoch": 1.1868265290635909, + "grad_norm": 0.6178303956985474, + "learning_rate": 7.630619684082626e-05, + "loss": 1.8002, + "step": 3910 + }, + { + "epoch": 1.1871300652602823, + "grad_norm": 0.7053147554397583, + "learning_rate": 7.630012150668287e-05, + "loss": 1.0536, + "step": 3911 + }, + { + "epoch": 1.1874336014569737, + "grad_norm": 1.381752848625183, + "learning_rate": 7.629404617253949e-05, + "loss": 1.3821, + "step": 3912 + }, + { + "epoch": 1.1877371376536652, + "grad_norm": 1.020168662071228, + "learning_rate": 7.628797083839612e-05, + "loss": 1.3616, + "step": 3913 + }, + { + "epoch": 1.1880406738503566, + "grad_norm": 0.5046608448028564, + "learning_rate": 7.628189550425274e-05, + "loss": 1.7283, + "step": 3914 + }, + { + "epoch": 1.188344210047048, + "grad_norm": 0.4386448562145233, + "learning_rate": 7.627582017010936e-05, + "loss": 1.7318, + "step": 3915 + }, + { + "epoch": 1.1886477462437395, + "grad_norm": 0.46474358439445496, + "learning_rate": 7.626974483596599e-05, + "loss": 1.8927, + "step": 3916 + }, + { + "epoch": 1.188951282440431, + "grad_norm": 0.5807692408561707, + "learning_rate": 7.62636695018226e-05, + "loss": 1.6145, + "step": 3917 + }, + { + "epoch": 1.1892548186371226, + "grad_norm": 0.4750295579433441, + "learning_rate": 7.625759416767922e-05, + "loss": 1.4997, + "step": 3918 + }, + { + "epoch": 1.189558354833814, + "grad_norm": 0.5042990446090698, + "learning_rate": 7.625151883353585e-05, + "loss": 1.8107, + "step": 3919 + }, + { + "epoch": 1.1898618910305054, + "grad_norm": 0.48407718539237976, + "learning_rate": 7.624544349939247e-05, + "loss": 1.4194, + "step": 3920 + }, + { + "epoch": 1.1901654272271969, + "grad_norm": 0.5130017399787903, + "learning_rate": 7.623936816524909e-05, + "loss": 1.4494, + "step": 3921 + }, + { + "epoch": 1.1904689634238883, + "grad_norm": 0.46143487095832825, + "learning_rate": 7.623329283110572e-05, + "loss": 1.5981, + "step": 3922 + }, + { + "epoch": 1.1907724996205797, + "grad_norm": 0.8400600552558899, + "learning_rate": 7.622721749696234e-05, + "loss": 1.2068, + "step": 3923 + }, + { + "epoch": 1.1910760358172712, + "grad_norm": 0.5218877792358398, + "learning_rate": 7.622114216281895e-05, + "loss": 1.6137, + "step": 3924 + }, + { + "epoch": 1.1913795720139626, + "grad_norm": 0.4495093822479248, + "learning_rate": 7.621506682867558e-05, + "loss": 1.6691, + "step": 3925 + }, + { + "epoch": 1.191683108210654, + "grad_norm": 0.49023687839508057, + "learning_rate": 7.62089914945322e-05, + "loss": 1.6207, + "step": 3926 + }, + { + "epoch": 1.1919866444073457, + "grad_norm": 0.5561721324920654, + "learning_rate": 7.620291616038883e-05, + "loss": 1.4456, + "step": 3927 + }, + { + "epoch": 1.192290180604037, + "grad_norm": 0.5107851028442383, + "learning_rate": 7.619684082624545e-05, + "loss": 1.78, + "step": 3928 + }, + { + "epoch": 1.1925937168007286, + "grad_norm": 0.5281449556350708, + "learning_rate": 7.619076549210207e-05, + "loss": 1.9445, + "step": 3929 + }, + { + "epoch": 1.19289725299742, + "grad_norm": 0.5192548036575317, + "learning_rate": 7.61846901579587e-05, + "loss": 1.6706, + "step": 3930 + }, + { + "epoch": 1.1932007891941114, + "grad_norm": 0.5205463171005249, + "learning_rate": 7.617861482381531e-05, + "loss": 1.6415, + "step": 3931 + }, + { + "epoch": 1.1935043253908029, + "grad_norm": 0.4953666925430298, + "learning_rate": 7.617253948967193e-05, + "loss": 1.6263, + "step": 3932 + }, + { + "epoch": 1.1938078615874943, + "grad_norm": 0.49807044863700867, + "learning_rate": 7.616646415552856e-05, + "loss": 1.6871, + "step": 3933 + }, + { + "epoch": 1.1941113977841857, + "grad_norm": 0.8351933360099792, + "learning_rate": 7.616038882138518e-05, + "loss": 1.0512, + "step": 3934 + }, + { + "epoch": 1.1944149339808772, + "grad_norm": 0.4444892406463623, + "learning_rate": 7.61543134872418e-05, + "loss": 1.6981, + "step": 3935 + }, + { + "epoch": 1.1947184701775686, + "grad_norm": 0.46068355441093445, + "learning_rate": 7.614823815309843e-05, + "loss": 1.8101, + "step": 3936 + }, + { + "epoch": 1.19502200637426, + "grad_norm": 0.6468572616577148, + "learning_rate": 7.614216281895505e-05, + "loss": 1.411, + "step": 3937 + }, + { + "epoch": 1.1953255425709517, + "grad_norm": 0.5605432391166687, + "learning_rate": 7.613608748481166e-05, + "loss": 1.1574, + "step": 3938 + }, + { + "epoch": 1.1956290787676431, + "grad_norm": 0.4770459532737732, + "learning_rate": 7.61300121506683e-05, + "loss": 1.7571, + "step": 3939 + }, + { + "epoch": 1.1959326149643346, + "grad_norm": 0.5230698585510254, + "learning_rate": 7.612393681652491e-05, + "loss": 1.6551, + "step": 3940 + }, + { + "epoch": 1.196236151161026, + "grad_norm": 0.7350290417671204, + "learning_rate": 7.611786148238154e-05, + "loss": 1.6698, + "step": 3941 + }, + { + "epoch": 1.1965396873577174, + "grad_norm": 0.5905072689056396, + "learning_rate": 7.611178614823816e-05, + "loss": 1.7354, + "step": 3942 + }, + { + "epoch": 1.1968432235544089, + "grad_norm": 0.5296047329902649, + "learning_rate": 7.610571081409478e-05, + "loss": 1.7228, + "step": 3943 + }, + { + "epoch": 1.1971467597511003, + "grad_norm": 0.49780750274658203, + "learning_rate": 7.609963547995141e-05, + "loss": 1.2224, + "step": 3944 + }, + { + "epoch": 1.1974502959477917, + "grad_norm": 0.4543820917606354, + "learning_rate": 7.609356014580802e-05, + "loss": 1.6028, + "step": 3945 + }, + { + "epoch": 1.1977538321444832, + "grad_norm": 0.4909208118915558, + "learning_rate": 7.608748481166464e-05, + "loss": 1.9409, + "step": 3946 + }, + { + "epoch": 1.1980573683411746, + "grad_norm": 0.47982802987098694, + "learning_rate": 7.608140947752127e-05, + "loss": 1.8759, + "step": 3947 + }, + { + "epoch": 1.198360904537866, + "grad_norm": 1.0359922647476196, + "learning_rate": 7.607533414337789e-05, + "loss": 1.2115, + "step": 3948 + }, + { + "epoch": 1.1986644407345577, + "grad_norm": 0.5492017865180969, + "learning_rate": 7.606925880923451e-05, + "loss": 1.5465, + "step": 3949 + }, + { + "epoch": 1.1989679769312491, + "grad_norm": 0.3987594544887543, + "learning_rate": 7.606318347509114e-05, + "loss": 1.6226, + "step": 3950 + }, + { + "epoch": 1.1992715131279406, + "grad_norm": 0.4642569422721863, + "learning_rate": 7.605710814094776e-05, + "loss": 1.9385, + "step": 3951 + }, + { + "epoch": 1.199575049324632, + "grad_norm": 0.4473128914833069, + "learning_rate": 7.605103280680437e-05, + "loss": 1.7057, + "step": 3952 + }, + { + "epoch": 1.1998785855213234, + "grad_norm": 0.4777715504169464, + "learning_rate": 7.6044957472661e-05, + "loss": 1.3958, + "step": 3953 + }, + { + "epoch": 1.2001821217180149, + "grad_norm": 0.553566575050354, + "learning_rate": 7.603888213851762e-05, + "loss": 1.649, + "step": 3954 + }, + { + "epoch": 1.2004856579147063, + "grad_norm": 0.4801444411277771, + "learning_rate": 7.603280680437425e-05, + "loss": 1.7801, + "step": 3955 + }, + { + "epoch": 1.2007891941113977, + "grad_norm": 0.4610240161418915, + "learning_rate": 7.602673147023086e-05, + "loss": 1.9021, + "step": 3956 + }, + { + "epoch": 1.2010927303080892, + "grad_norm": 0.48058998584747314, + "learning_rate": 7.602065613608749e-05, + "loss": 1.0347, + "step": 3957 + }, + { + "epoch": 1.2013962665047808, + "grad_norm": 0.5109126567840576, + "learning_rate": 7.601458080194412e-05, + "loss": 1.6481, + "step": 3958 + }, + { + "epoch": 1.201699802701472, + "grad_norm": 0.5463404655456543, + "learning_rate": 7.600850546780073e-05, + "loss": 1.4141, + "step": 3959 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.46814149618148804, + "learning_rate": 7.600243013365735e-05, + "loss": 1.7893, + "step": 3960 + }, + { + "epoch": 1.2023068750948551, + "grad_norm": 0.5103051066398621, + "learning_rate": 7.599635479951398e-05, + "loss": 1.533, + "step": 3961 + }, + { + "epoch": 1.2026104112915466, + "grad_norm": 0.4707978665828705, + "learning_rate": 7.59902794653706e-05, + "loss": 1.3515, + "step": 3962 + }, + { + "epoch": 1.202913947488238, + "grad_norm": 1.0241955518722534, + "learning_rate": 7.598420413122722e-05, + "loss": 1.4342, + "step": 3963 + }, + { + "epoch": 1.2032174836849294, + "grad_norm": 0.4893020689487457, + "learning_rate": 7.597812879708385e-05, + "loss": 1.8932, + "step": 3964 + }, + { + "epoch": 1.2035210198816209, + "grad_norm": 0.47995486855506897, + "learning_rate": 7.597205346294047e-05, + "loss": 1.5502, + "step": 3965 + }, + { + "epoch": 1.2038245560783123, + "grad_norm": 0.5193171501159668, + "learning_rate": 7.596597812879708e-05, + "loss": 1.6352, + "step": 3966 + }, + { + "epoch": 1.2041280922750037, + "grad_norm": 0.5245213508605957, + "learning_rate": 7.595990279465371e-05, + "loss": 1.4066, + "step": 3967 + }, + { + "epoch": 1.2044316284716952, + "grad_norm": 0.578769326210022, + "learning_rate": 7.595382746051033e-05, + "loss": 1.7556, + "step": 3968 + }, + { + "epoch": 1.2047351646683868, + "grad_norm": 0.4418366253376007, + "learning_rate": 7.594775212636696e-05, + "loss": 0.9755, + "step": 3969 + }, + { + "epoch": 1.2050387008650782, + "grad_norm": 0.5069161653518677, + "learning_rate": 7.594167679222357e-05, + "loss": 1.7738, + "step": 3970 + }, + { + "epoch": 1.2053422370617697, + "grad_norm": 0.6108199954032898, + "learning_rate": 7.59356014580802e-05, + "loss": 1.6229, + "step": 3971 + }, + { + "epoch": 1.2056457732584611, + "grad_norm": 0.4657975137233734, + "learning_rate": 7.592952612393683e-05, + "loss": 1.7213, + "step": 3972 + }, + { + "epoch": 1.2059493094551526, + "grad_norm": 0.4727039337158203, + "learning_rate": 7.592345078979343e-05, + "loss": 1.6633, + "step": 3973 + }, + { + "epoch": 1.206252845651844, + "grad_norm": 0.48204702138900757, + "learning_rate": 7.591737545565006e-05, + "loss": 1.7443, + "step": 3974 + }, + { + "epoch": 1.2065563818485354, + "grad_norm": 0.4971252977848053, + "learning_rate": 7.591130012150669e-05, + "loss": 1.8952, + "step": 3975 + }, + { + "epoch": 1.2068599180452269, + "grad_norm": 0.5344823002815247, + "learning_rate": 7.590522478736331e-05, + "loss": 1.402, + "step": 3976 + }, + { + "epoch": 1.2071634542419183, + "grad_norm": 0.5859917402267456, + "learning_rate": 7.589914945321993e-05, + "loss": 1.8069, + "step": 3977 + }, + { + "epoch": 1.2074669904386097, + "grad_norm": 0.5768531560897827, + "learning_rate": 7.589307411907656e-05, + "loss": 1.4101, + "step": 3978 + }, + { + "epoch": 1.2077705266353012, + "grad_norm": 0.4760257303714752, + "learning_rate": 7.588699878493318e-05, + "loss": 1.6148, + "step": 3979 + }, + { + "epoch": 1.2080740628319928, + "grad_norm": 0.5474233031272888, + "learning_rate": 7.588092345078979e-05, + "loss": 1.4744, + "step": 3980 + }, + { + "epoch": 1.2083775990286842, + "grad_norm": 1.7555142641067505, + "learning_rate": 7.587484811664642e-05, + "loss": 1.6162, + "step": 3981 + }, + { + "epoch": 1.2086811352253757, + "grad_norm": 0.4019928574562073, + "learning_rate": 7.586877278250304e-05, + "loss": 1.1217, + "step": 3982 + }, + { + "epoch": 1.2089846714220671, + "grad_norm": 0.5030451416969299, + "learning_rate": 7.586269744835967e-05, + "loss": 1.7434, + "step": 3983 + }, + { + "epoch": 1.2092882076187585, + "grad_norm": 0.4046245813369751, + "learning_rate": 7.585662211421628e-05, + "loss": 1.3424, + "step": 3984 + }, + { + "epoch": 1.20959174381545, + "grad_norm": 0.5747511386871338, + "learning_rate": 7.58505467800729e-05, + "loss": 1.7236, + "step": 3985 + }, + { + "epoch": 1.2098952800121414, + "grad_norm": 0.505330502986908, + "learning_rate": 7.584447144592954e-05, + "loss": 1.1954, + "step": 3986 + }, + { + "epoch": 1.2101988162088329, + "grad_norm": 0.4753364026546478, + "learning_rate": 7.583839611178614e-05, + "loss": 1.6601, + "step": 3987 + }, + { + "epoch": 1.2105023524055243, + "grad_norm": 0.45532703399658203, + "learning_rate": 7.583232077764277e-05, + "loss": 1.6707, + "step": 3988 + }, + { + "epoch": 1.2108058886022157, + "grad_norm": 0.5530490875244141, + "learning_rate": 7.58262454434994e-05, + "loss": 1.5897, + "step": 3989 + }, + { + "epoch": 1.2111094247989072, + "grad_norm": 0.47323623299598694, + "learning_rate": 7.582017010935602e-05, + "loss": 1.7492, + "step": 3990 + }, + { + "epoch": 1.2114129609955988, + "grad_norm": 0.5553590655326843, + "learning_rate": 7.581409477521264e-05, + "loss": 1.4958, + "step": 3991 + }, + { + "epoch": 1.2117164971922902, + "grad_norm": 0.4471113979816437, + "learning_rate": 7.580801944106927e-05, + "loss": 1.8111, + "step": 3992 + }, + { + "epoch": 1.2120200333889817, + "grad_norm": 0.5062560439109802, + "learning_rate": 7.580194410692589e-05, + "loss": 1.6767, + "step": 3993 + }, + { + "epoch": 1.212323569585673, + "grad_norm": 0.5048001408576965, + "learning_rate": 7.57958687727825e-05, + "loss": 1.9827, + "step": 3994 + }, + { + "epoch": 1.2126271057823645, + "grad_norm": 0.42375367879867554, + "learning_rate": 7.578979343863913e-05, + "loss": 1.2515, + "step": 3995 + }, + { + "epoch": 1.212930641979056, + "grad_norm": 0.5349414348602295, + "learning_rate": 7.578371810449575e-05, + "loss": 1.7492, + "step": 3996 + }, + { + "epoch": 1.2132341781757474, + "grad_norm": 0.45927226543426514, + "learning_rate": 7.577764277035237e-05, + "loss": 1.5423, + "step": 3997 + }, + { + "epoch": 1.2135377143724388, + "grad_norm": 0.6156039237976074, + "learning_rate": 7.577156743620899e-05, + "loss": 1.3404, + "step": 3998 + }, + { + "epoch": 1.2138412505691303, + "grad_norm": 1.3208363056182861, + "learning_rate": 7.576549210206562e-05, + "loss": 1.2658, + "step": 3999 + }, + { + "epoch": 1.214144786765822, + "grad_norm": 0.5266988277435303, + "learning_rate": 7.575941676792225e-05, + "loss": 1.6878, + "step": 4000 + }, + { + "epoch": 1.2144483229625134, + "grad_norm": 0.463829904794693, + "learning_rate": 7.575334143377885e-05, + "loss": 1.6529, + "step": 4001 + }, + { + "epoch": 1.2147518591592048, + "grad_norm": 0.4354858994483948, + "learning_rate": 7.574726609963548e-05, + "loss": 1.9436, + "step": 4002 + }, + { + "epoch": 1.2150553953558962, + "grad_norm": 0.889329731464386, + "learning_rate": 7.574119076549211e-05, + "loss": 0.7387, + "step": 4003 + }, + { + "epoch": 1.2153589315525877, + "grad_norm": 0.9312804341316223, + "learning_rate": 7.573511543134873e-05, + "loss": 1.9534, + "step": 4004 + }, + { + "epoch": 1.215662467749279, + "grad_norm": 0.5896217823028564, + "learning_rate": 7.572904009720535e-05, + "loss": 1.8036, + "step": 4005 + }, + { + "epoch": 1.2159660039459705, + "grad_norm": 0.5803027153015137, + "learning_rate": 7.572296476306198e-05, + "loss": 1.7561, + "step": 4006 + }, + { + "epoch": 1.216269540142662, + "grad_norm": 0.4633532166481018, + "learning_rate": 7.57168894289186e-05, + "loss": 2.0534, + "step": 4007 + }, + { + "epoch": 1.2165730763393534, + "grad_norm": 0.4791993200778961, + "learning_rate": 7.571081409477521e-05, + "loss": 1.8331, + "step": 4008 + }, + { + "epoch": 1.2168766125360448, + "grad_norm": 0.48696058988571167, + "learning_rate": 7.570473876063184e-05, + "loss": 1.6295, + "step": 4009 + }, + { + "epoch": 1.2171801487327363, + "grad_norm": 0.433896005153656, + "learning_rate": 7.569866342648846e-05, + "loss": 1.2841, + "step": 4010 + }, + { + "epoch": 1.217483684929428, + "grad_norm": 0.5070579648017883, + "learning_rate": 7.569258809234508e-05, + "loss": 1.7859, + "step": 4011 + }, + { + "epoch": 1.2177872211261194, + "grad_norm": 0.5580217838287354, + "learning_rate": 7.56865127582017e-05, + "loss": 1.5646, + "step": 4012 + }, + { + "epoch": 1.2180907573228108, + "grad_norm": 0.594814121723175, + "learning_rate": 7.568043742405833e-05, + "loss": 1.8118, + "step": 4013 + }, + { + "epoch": 1.2183942935195022, + "grad_norm": 0.5270577669143677, + "learning_rate": 7.567436208991496e-05, + "loss": 1.8065, + "step": 4014 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.6173904538154602, + "learning_rate": 7.566828675577156e-05, + "loss": 1.7159, + "step": 4015 + }, + { + "epoch": 1.219001365912885, + "grad_norm": 0.6258283257484436, + "learning_rate": 7.566221142162819e-05, + "loss": 1.1892, + "step": 4016 + }, + { + "epoch": 1.2193049021095765, + "grad_norm": 0.42736998200416565, + "learning_rate": 7.565613608748482e-05, + "loss": 1.3273, + "step": 4017 + }, + { + "epoch": 1.219608438306268, + "grad_norm": 0.5543321371078491, + "learning_rate": 7.565006075334144e-05, + "loss": 1.4035, + "step": 4018 + }, + { + "epoch": 1.2199119745029594, + "grad_norm": 0.5516905188560486, + "learning_rate": 7.564398541919806e-05, + "loss": 1.5628, + "step": 4019 + }, + { + "epoch": 1.2202155106996508, + "grad_norm": 0.4710666537284851, + "learning_rate": 7.563791008505469e-05, + "loss": 1.664, + "step": 4020 + }, + { + "epoch": 1.2205190468963423, + "grad_norm": 0.5181185007095337, + "learning_rate": 7.56318347509113e-05, + "loss": 1.7942, + "step": 4021 + }, + { + "epoch": 1.220822583093034, + "grad_norm": 0.4957810044288635, + "learning_rate": 7.562575941676792e-05, + "loss": 1.899, + "step": 4022 + }, + { + "epoch": 1.2211261192897254, + "grad_norm": 0.5114620923995972, + "learning_rate": 7.561968408262455e-05, + "loss": 1.5602, + "step": 4023 + }, + { + "epoch": 1.2214296554864168, + "grad_norm": 0.4717017710208893, + "learning_rate": 7.561360874848117e-05, + "loss": 1.4065, + "step": 4024 + }, + { + "epoch": 1.2217331916831082, + "grad_norm": 0.5395920872688293, + "learning_rate": 7.560753341433779e-05, + "loss": 1.8311, + "step": 4025 + }, + { + "epoch": 1.2220367278797997, + "grad_norm": 0.47249385714530945, + "learning_rate": 7.56014580801944e-05, + "loss": 1.9296, + "step": 4026 + }, + { + "epoch": 1.222340264076491, + "grad_norm": 0.4514206051826477, + "learning_rate": 7.559538274605104e-05, + "loss": 1.2713, + "step": 4027 + }, + { + "epoch": 1.2226438002731825, + "grad_norm": 0.9952641129493713, + "learning_rate": 7.558930741190767e-05, + "loss": 1.5778, + "step": 4028 + }, + { + "epoch": 1.222947336469874, + "grad_norm": 0.6609991192817688, + "learning_rate": 7.558323207776427e-05, + "loss": 1.9552, + "step": 4029 + }, + { + "epoch": 1.2232508726665654, + "grad_norm": 0.5197901129722595, + "learning_rate": 7.55771567436209e-05, + "loss": 1.7676, + "step": 4030 + }, + { + "epoch": 1.223554408863257, + "grad_norm": 0.5448186993598938, + "learning_rate": 7.557108140947753e-05, + "loss": 1.4093, + "step": 4031 + }, + { + "epoch": 1.2238579450599485, + "grad_norm": 0.4948391318321228, + "learning_rate": 7.556500607533415e-05, + "loss": 1.8091, + "step": 4032 + }, + { + "epoch": 1.22416148125664, + "grad_norm": 0.6797294020652771, + "learning_rate": 7.555893074119077e-05, + "loss": 1.8645, + "step": 4033 + }, + { + "epoch": 1.2244650174533314, + "grad_norm": 0.3642887771129608, + "learning_rate": 7.55528554070474e-05, + "loss": 1.4542, + "step": 4034 + }, + { + "epoch": 1.2247685536500228, + "grad_norm": 0.5181020498275757, + "learning_rate": 7.554678007290402e-05, + "loss": 1.791, + "step": 4035 + }, + { + "epoch": 1.2250720898467142, + "grad_norm": 0.5388804078102112, + "learning_rate": 7.554070473876063e-05, + "loss": 1.9918, + "step": 4036 + }, + { + "epoch": 1.2253756260434057, + "grad_norm": 0.4962565302848816, + "learning_rate": 7.553462940461726e-05, + "loss": 1.6094, + "step": 4037 + }, + { + "epoch": 1.225679162240097, + "grad_norm": 0.5955974459648132, + "learning_rate": 7.552855407047388e-05, + "loss": 1.401, + "step": 4038 + }, + { + "epoch": 1.2259826984367885, + "grad_norm": 0.5586566925048828, + "learning_rate": 7.55224787363305e-05, + "loss": 1.7028, + "step": 4039 + }, + { + "epoch": 1.22628623463348, + "grad_norm": 0.5239779949188232, + "learning_rate": 7.551640340218712e-05, + "loss": 2.0994, + "step": 4040 + }, + { + "epoch": 1.2265897708301714, + "grad_norm": 0.5144902467727661, + "learning_rate": 7.551032806804375e-05, + "loss": 1.5493, + "step": 4041 + }, + { + "epoch": 1.226893307026863, + "grad_norm": 0.4962595999240875, + "learning_rate": 7.550425273390038e-05, + "loss": 1.7629, + "step": 4042 + }, + { + "epoch": 1.2271968432235545, + "grad_norm": 0.5348252058029175, + "learning_rate": 7.549817739975698e-05, + "loss": 1.7287, + "step": 4043 + }, + { + "epoch": 1.227500379420246, + "grad_norm": 0.485503613948822, + "learning_rate": 7.549210206561361e-05, + "loss": 1.7518, + "step": 4044 + }, + { + "epoch": 1.2278039156169374, + "grad_norm": 0.46359485387802124, + "learning_rate": 7.548602673147024e-05, + "loss": 1.7105, + "step": 4045 + }, + { + "epoch": 1.2281074518136288, + "grad_norm": 0.5181253552436829, + "learning_rate": 7.547995139732685e-05, + "loss": 1.4408, + "step": 4046 + }, + { + "epoch": 1.2284109880103202, + "grad_norm": 0.5859808921813965, + "learning_rate": 7.547387606318348e-05, + "loss": 1.8802, + "step": 4047 + }, + { + "epoch": 1.2287145242070117, + "grad_norm": 0.7627731561660767, + "learning_rate": 7.546780072904011e-05, + "loss": 1.6918, + "step": 4048 + }, + { + "epoch": 1.229018060403703, + "grad_norm": 0.4201076626777649, + "learning_rate": 7.546172539489673e-05, + "loss": 1.6491, + "step": 4049 + }, + { + "epoch": 1.2293215966003945, + "grad_norm": 0.6305765509605408, + "learning_rate": 7.545565006075334e-05, + "loss": 0.5473, + "step": 4050 + }, + { + "epoch": 1.229625132797086, + "grad_norm": 0.519670307636261, + "learning_rate": 7.544957472660996e-05, + "loss": 1.6919, + "step": 4051 + }, + { + "epoch": 1.2299286689937774, + "grad_norm": 0.5410406589508057, + "learning_rate": 7.544349939246659e-05, + "loss": 0.9771, + "step": 4052 + }, + { + "epoch": 1.230232205190469, + "grad_norm": 0.489894837141037, + "learning_rate": 7.543742405832321e-05, + "loss": 1.5552, + "step": 4053 + }, + { + "epoch": 1.2305357413871605, + "grad_norm": 0.8147485256195068, + "learning_rate": 7.543134872417983e-05, + "loss": 1.1342, + "step": 4054 + }, + { + "epoch": 1.230839277583852, + "grad_norm": 0.5854855179786682, + "learning_rate": 7.542527339003646e-05, + "loss": 1.2235, + "step": 4055 + }, + { + "epoch": 1.2311428137805434, + "grad_norm": 0.597017228603363, + "learning_rate": 7.541919805589309e-05, + "loss": 1.7028, + "step": 4056 + }, + { + "epoch": 1.2314463499772348, + "grad_norm": 0.5282664895057678, + "learning_rate": 7.541312272174969e-05, + "loss": 1.1727, + "step": 4057 + }, + { + "epoch": 1.2317498861739262, + "grad_norm": 0.5463590621948242, + "learning_rate": 7.540704738760632e-05, + "loss": 1.7269, + "step": 4058 + }, + { + "epoch": 1.2320534223706177, + "grad_norm": 0.5592811107635498, + "learning_rate": 7.540097205346295e-05, + "loss": 1.2984, + "step": 4059 + }, + { + "epoch": 1.232356958567309, + "grad_norm": 0.58669513463974, + "learning_rate": 7.539489671931956e-05, + "loss": 1.585, + "step": 4060 + }, + { + "epoch": 1.2326604947640005, + "grad_norm": 0.5241060853004456, + "learning_rate": 7.538882138517619e-05, + "loss": 1.2848, + "step": 4061 + }, + { + "epoch": 1.2329640309606922, + "grad_norm": 0.45810186862945557, + "learning_rate": 7.538274605103282e-05, + "loss": 1.4243, + "step": 4062 + }, + { + "epoch": 1.2332675671573834, + "grad_norm": 0.468780517578125, + "learning_rate": 7.537667071688944e-05, + "loss": 1.3984, + "step": 4063 + }, + { + "epoch": 1.233571103354075, + "grad_norm": 0.46137523651123047, + "learning_rate": 7.537059538274605e-05, + "loss": 1.9646, + "step": 4064 + }, + { + "epoch": 1.2338746395507665, + "grad_norm": 0.6160327196121216, + "learning_rate": 7.536452004860267e-05, + "loss": 1.7623, + "step": 4065 + }, + { + "epoch": 1.234178175747458, + "grad_norm": 0.5266595482826233, + "learning_rate": 7.53584447144593e-05, + "loss": 1.7096, + "step": 4066 + }, + { + "epoch": 1.2344817119441494, + "grad_norm": 0.5507217645645142, + "learning_rate": 7.535236938031592e-05, + "loss": 1.1104, + "step": 4067 + }, + { + "epoch": 1.2347852481408408, + "grad_norm": 0.5340211987495422, + "learning_rate": 7.534629404617254e-05, + "loss": 1.6101, + "step": 4068 + }, + { + "epoch": 1.2350887843375322, + "grad_norm": 0.6032447218894958, + "learning_rate": 7.534021871202917e-05, + "loss": 1.4905, + "step": 4069 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.5784475207328796, + "learning_rate": 7.533414337788578e-05, + "loss": 1.2617, + "step": 4070 + }, + { + "epoch": 1.235695856730915, + "grad_norm": 0.6673290133476257, + "learning_rate": 7.53280680437424e-05, + "loss": 1.3954, + "step": 4071 + }, + { + "epoch": 1.2359993929276065, + "grad_norm": 0.5767869353294373, + "learning_rate": 7.532199270959903e-05, + "loss": 1.6808, + "step": 4072 + }, + { + "epoch": 1.2363029291242982, + "grad_norm": 0.5009193420410156, + "learning_rate": 7.531591737545566e-05, + "loss": 1.7008, + "step": 4073 + }, + { + "epoch": 1.2366064653209896, + "grad_norm": 0.4754045903682709, + "learning_rate": 7.530984204131227e-05, + "loss": 1.7596, + "step": 4074 + }, + { + "epoch": 1.236910001517681, + "grad_norm": 0.620161235332489, + "learning_rate": 7.53037667071689e-05, + "loss": 1.4089, + "step": 4075 + }, + { + "epoch": 1.2372135377143725, + "grad_norm": 0.34108883142471313, + "learning_rate": 7.529769137302553e-05, + "loss": 1.2953, + "step": 4076 + }, + { + "epoch": 1.237517073911064, + "grad_norm": 0.9694557189941406, + "learning_rate": 7.529161603888215e-05, + "loss": 1.3095, + "step": 4077 + }, + { + "epoch": 1.2378206101077553, + "grad_norm": 0.5211403965950012, + "learning_rate": 7.528554070473876e-05, + "loss": 1.7275, + "step": 4078 + }, + { + "epoch": 1.2381241463044468, + "grad_norm": 0.4836254119873047, + "learning_rate": 7.527946537059538e-05, + "loss": 1.8843, + "step": 4079 + }, + { + "epoch": 1.2384276825011382, + "grad_norm": 0.4383199214935303, + "learning_rate": 7.527339003645201e-05, + "loss": 1.7991, + "step": 4080 + }, + { + "epoch": 1.2387312186978297, + "grad_norm": 0.5982313752174377, + "learning_rate": 7.526731470230863e-05, + "loss": 1.4946, + "step": 4081 + }, + { + "epoch": 1.239034754894521, + "grad_norm": 0.5452768802642822, + "learning_rate": 7.526123936816525e-05, + "loss": 1.5868, + "step": 4082 + }, + { + "epoch": 1.2393382910912125, + "grad_norm": 0.5064995288848877, + "learning_rate": 7.525516403402188e-05, + "loss": 1.7856, + "step": 4083 + }, + { + "epoch": 1.2396418272879042, + "grad_norm": 0.5409367084503174, + "learning_rate": 7.52490886998785e-05, + "loss": 1.7328, + "step": 4084 + }, + { + "epoch": 1.2399453634845956, + "grad_norm": 0.5907915830612183, + "learning_rate": 7.524301336573511e-05, + "loss": 1.2613, + "step": 4085 + }, + { + "epoch": 1.240248899681287, + "grad_norm": 0.42374351620674133, + "learning_rate": 7.523693803159174e-05, + "loss": 1.952, + "step": 4086 + }, + { + "epoch": 1.2405524358779785, + "grad_norm": 0.5498590469360352, + "learning_rate": 7.523086269744837e-05, + "loss": 1.7781, + "step": 4087 + }, + { + "epoch": 1.24085597207467, + "grad_norm": 0.5030960440635681, + "learning_rate": 7.522478736330498e-05, + "loss": 1.3881, + "step": 4088 + }, + { + "epoch": 1.2411595082713613, + "grad_norm": 0.5195364356040955, + "learning_rate": 7.521871202916161e-05, + "loss": 1.7551, + "step": 4089 + }, + { + "epoch": 1.2414630444680528, + "grad_norm": 0.5500257611274719, + "learning_rate": 7.521263669501824e-05, + "loss": 1.316, + "step": 4090 + }, + { + "epoch": 1.2417665806647442, + "grad_norm": 0.5093240737915039, + "learning_rate": 7.520656136087486e-05, + "loss": 1.3301, + "step": 4091 + }, + { + "epoch": 1.2420701168614356, + "grad_norm": 0.6616035103797913, + "learning_rate": 7.520048602673147e-05, + "loss": 1.9355, + "step": 4092 + }, + { + "epoch": 1.2423736530581273, + "grad_norm": 0.5518209338188171, + "learning_rate": 7.519441069258809e-05, + "loss": 1.7574, + "step": 4093 + }, + { + "epoch": 1.2426771892548185, + "grad_norm": 0.4014967978000641, + "learning_rate": 7.518833535844472e-05, + "loss": 1.8058, + "step": 4094 + }, + { + "epoch": 1.2429807254515102, + "grad_norm": 0.4472224712371826, + "learning_rate": 7.518226002430134e-05, + "loss": 1.6429, + "step": 4095 + }, + { + "epoch": 1.2432842616482016, + "grad_norm": 0.5114421248435974, + "learning_rate": 7.517618469015796e-05, + "loss": 1.6209, + "step": 4096 + }, + { + "epoch": 1.243587797844893, + "grad_norm": 0.5454509854316711, + "learning_rate": 7.517010935601459e-05, + "loss": 1.7061, + "step": 4097 + }, + { + "epoch": 1.2438913340415845, + "grad_norm": 0.41268596053123474, + "learning_rate": 7.51640340218712e-05, + "loss": 1.9439, + "step": 4098 + }, + { + "epoch": 1.244194870238276, + "grad_norm": 0.5628203749656677, + "learning_rate": 7.515795868772782e-05, + "loss": 1.8348, + "step": 4099 + }, + { + "epoch": 1.2444984064349673, + "grad_norm": 0.7100119590759277, + "learning_rate": 7.515188335358445e-05, + "loss": 1.764, + "step": 4100 + }, + { + "epoch": 1.2448019426316588, + "grad_norm": 0.4779648184776306, + "learning_rate": 7.514580801944108e-05, + "loss": 1.7384, + "step": 4101 + }, + { + "epoch": 1.2451054788283502, + "grad_norm": 0.42757317423820496, + "learning_rate": 7.513973268529769e-05, + "loss": 1.3117, + "step": 4102 + }, + { + "epoch": 1.2454090150250416, + "grad_norm": 0.6299741268157959, + "learning_rate": 7.513365735115432e-05, + "loss": 1.5541, + "step": 4103 + }, + { + "epoch": 1.2457125512217333, + "grad_norm": 0.5417861938476562, + "learning_rate": 7.512758201701095e-05, + "loss": 1.7433, + "step": 4104 + }, + { + "epoch": 1.2460160874184247, + "grad_norm": 0.504732072353363, + "learning_rate": 7.512150668286757e-05, + "loss": 1.6971, + "step": 4105 + }, + { + "epoch": 1.2463196236151162, + "grad_norm": 1.007019281387329, + "learning_rate": 7.511543134872418e-05, + "loss": 1.4254, + "step": 4106 + }, + { + "epoch": 1.2466231598118076, + "grad_norm": 0.7208684086799622, + "learning_rate": 7.51093560145808e-05, + "loss": 1.8213, + "step": 4107 + }, + { + "epoch": 1.246926696008499, + "grad_norm": 0.5569952726364136, + "learning_rate": 7.510328068043743e-05, + "loss": 1.8765, + "step": 4108 + }, + { + "epoch": 1.2472302322051905, + "grad_norm": 0.5467456579208374, + "learning_rate": 7.509720534629405e-05, + "loss": 1.6964, + "step": 4109 + }, + { + "epoch": 1.247533768401882, + "grad_norm": 0.6151578426361084, + "learning_rate": 7.509113001215067e-05, + "loss": 1.8341, + "step": 4110 + }, + { + "epoch": 1.2478373045985733, + "grad_norm": 0.5277178883552551, + "learning_rate": 7.50850546780073e-05, + "loss": 1.6909, + "step": 4111 + }, + { + "epoch": 1.2481408407952648, + "grad_norm": 0.5101326107978821, + "learning_rate": 7.507897934386391e-05, + "loss": 1.6674, + "step": 4112 + }, + { + "epoch": 1.2484443769919562, + "grad_norm": 0.49837616086006165, + "learning_rate": 7.507290400972053e-05, + "loss": 1.6809, + "step": 4113 + }, + { + "epoch": 1.2487479131886476, + "grad_norm": 0.46518057584762573, + "learning_rate": 7.506682867557716e-05, + "loss": 1.3971, + "step": 4114 + }, + { + "epoch": 1.2490514493853393, + "grad_norm": 0.6974937915802002, + "learning_rate": 7.50607533414338e-05, + "loss": 1.4269, + "step": 4115 + }, + { + "epoch": 1.2493549855820307, + "grad_norm": 0.5411034822463989, + "learning_rate": 7.50546780072904e-05, + "loss": 1.5861, + "step": 4116 + }, + { + "epoch": 1.2496585217787222, + "grad_norm": 0.5407208204269409, + "learning_rate": 7.504860267314703e-05, + "loss": 1.6395, + "step": 4117 + }, + { + "epoch": 1.2499620579754136, + "grad_norm": 0.4818359911441803, + "learning_rate": 7.504252733900366e-05, + "loss": 1.2755, + "step": 4118 + }, + { + "epoch": 1.250265594172105, + "grad_norm": 0.5112556219100952, + "learning_rate": 7.503645200486028e-05, + "loss": 1.4617, + "step": 4119 + }, + { + "epoch": 1.2505691303687965, + "grad_norm": 0.48120614886283875, + "learning_rate": 7.50303766707169e-05, + "loss": 1.8576, + "step": 4120 + }, + { + "epoch": 1.250872666565488, + "grad_norm": 0.5144551992416382, + "learning_rate": 7.502430133657351e-05, + "loss": 1.4099, + "step": 4121 + }, + { + "epoch": 1.2511762027621793, + "grad_norm": 0.5238993763923645, + "learning_rate": 7.501822600243014e-05, + "loss": 0.8086, + "step": 4122 + }, + { + "epoch": 1.2514797389588708, + "grad_norm": 0.5506192445755005, + "learning_rate": 7.501215066828676e-05, + "loss": 1.7507, + "step": 4123 + }, + { + "epoch": 1.2517832751555624, + "grad_norm": 0.5206360816955566, + "learning_rate": 7.500607533414338e-05, + "loss": 1.3207, + "step": 4124 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.5082906484603882, + "learning_rate": 7.500000000000001e-05, + "loss": 1.6108, + "step": 4125 + }, + { + "epoch": 1.2523903475489453, + "grad_norm": 0.8926411271095276, + "learning_rate": 7.499392466585662e-05, + "loss": 1.0148, + "step": 4126 + }, + { + "epoch": 1.2526938837456367, + "grad_norm": 0.4711840748786926, + "learning_rate": 7.498784933171324e-05, + "loss": 1.6824, + "step": 4127 + }, + { + "epoch": 1.2529974199423282, + "grad_norm": 0.5053786039352417, + "learning_rate": 7.498177399756987e-05, + "loss": 1.7507, + "step": 4128 + }, + { + "epoch": 1.2533009561390196, + "grad_norm": 0.4618197977542877, + "learning_rate": 7.49756986634265e-05, + "loss": 1.1271, + "step": 4129 + }, + { + "epoch": 1.253604492335711, + "grad_norm": 0.42752858996391296, + "learning_rate": 7.496962332928311e-05, + "loss": 1.8872, + "step": 4130 + }, + { + "epoch": 1.2539080285324025, + "grad_norm": 0.4735817313194275, + "learning_rate": 7.496354799513974e-05, + "loss": 1.6949, + "step": 4131 + }, + { + "epoch": 1.254211564729094, + "grad_norm": 0.5096668004989624, + "learning_rate": 7.495747266099636e-05, + "loss": 0.8228, + "step": 4132 + }, + { + "epoch": 1.2545151009257853, + "grad_norm": 0.6059778332710266, + "learning_rate": 7.495139732685297e-05, + "loss": 1.4917, + "step": 4133 + }, + { + "epoch": 1.2548186371224768, + "grad_norm": 0.6292750835418701, + "learning_rate": 7.49453219927096e-05, + "loss": 1.2934, + "step": 4134 + }, + { + "epoch": 1.2551221733191684, + "grad_norm": 0.5451275706291199, + "learning_rate": 7.493924665856622e-05, + "loss": 2.0059, + "step": 4135 + }, + { + "epoch": 1.2554257095158596, + "grad_norm": 0.5799239277839661, + "learning_rate": 7.493317132442285e-05, + "loss": 1.4994, + "step": 4136 + }, + { + "epoch": 1.2557292457125513, + "grad_norm": 0.4744884967803955, + "learning_rate": 7.492709599027947e-05, + "loss": 1.5334, + "step": 4137 + }, + { + "epoch": 1.2560327819092427, + "grad_norm": 0.4778893291950226, + "learning_rate": 7.492102065613609e-05, + "loss": 1.7536, + "step": 4138 + }, + { + "epoch": 1.2563363181059342, + "grad_norm": 0.5556294918060303, + "learning_rate": 7.491494532199272e-05, + "loss": 1.9507, + "step": 4139 + }, + { + "epoch": 1.2566398543026256, + "grad_norm": 0.8392012119293213, + "learning_rate": 7.490886998784933e-05, + "loss": 1.1061, + "step": 4140 + }, + { + "epoch": 1.256943390499317, + "grad_norm": 0.42154213786125183, + "learning_rate": 7.490279465370595e-05, + "loss": 1.1891, + "step": 4141 + }, + { + "epoch": 1.2572469266960085, + "grad_norm": 0.46638983488082886, + "learning_rate": 7.489671931956258e-05, + "loss": 1.7741, + "step": 4142 + }, + { + "epoch": 1.2575504628927, + "grad_norm": 0.5562369227409363, + "learning_rate": 7.48906439854192e-05, + "loss": 1.6542, + "step": 4143 + }, + { + "epoch": 1.2578539990893913, + "grad_norm": 0.5079572796821594, + "learning_rate": 7.488456865127582e-05, + "loss": 1.9253, + "step": 4144 + }, + { + "epoch": 1.2581575352860828, + "grad_norm": 0.8773594498634338, + "learning_rate": 7.487849331713245e-05, + "loss": 1.0602, + "step": 4145 + }, + { + "epoch": 1.2584610714827744, + "grad_norm": 0.6297957301139832, + "learning_rate": 7.487241798298907e-05, + "loss": 1.4078, + "step": 4146 + }, + { + "epoch": 1.2587646076794659, + "grad_norm": 0.6245468854904175, + "learning_rate": 7.486634264884568e-05, + "loss": 1.5408, + "step": 4147 + }, + { + "epoch": 1.2590681438761573, + "grad_norm": 0.6770250201225281, + "learning_rate": 7.486026731470231e-05, + "loss": 1.5883, + "step": 4148 + }, + { + "epoch": 1.2593716800728487, + "grad_norm": 0.571240246295929, + "learning_rate": 7.485419198055893e-05, + "loss": 1.851, + "step": 4149 + }, + { + "epoch": 1.2596752162695402, + "grad_norm": 0.6159947514533997, + "learning_rate": 7.484811664641556e-05, + "loss": 1.7828, + "step": 4150 + }, + { + "epoch": 1.2599787524662316, + "grad_norm": 0.4492059051990509, + "learning_rate": 7.484204131227218e-05, + "loss": 1.7235, + "step": 4151 + }, + { + "epoch": 1.260282288662923, + "grad_norm": 1.0111632347106934, + "learning_rate": 7.48359659781288e-05, + "loss": 1.5976, + "step": 4152 + }, + { + "epoch": 1.2605858248596145, + "grad_norm": 0.5812898278236389, + "learning_rate": 7.482989064398543e-05, + "loss": 1.9861, + "step": 4153 + }, + { + "epoch": 1.260889361056306, + "grad_norm": 0.4427884519100189, + "learning_rate": 7.482381530984204e-05, + "loss": 1.7063, + "step": 4154 + }, + { + "epoch": 1.2611928972529975, + "grad_norm": 0.4709545373916626, + "learning_rate": 7.481773997569866e-05, + "loss": 1.8049, + "step": 4155 + }, + { + "epoch": 1.2614964334496888, + "grad_norm": 0.5175103545188904, + "learning_rate": 7.481166464155529e-05, + "loss": 1.8579, + "step": 4156 + }, + { + "epoch": 1.2617999696463804, + "grad_norm": 0.6060042977333069, + "learning_rate": 7.480558930741191e-05, + "loss": 1.1567, + "step": 4157 + }, + { + "epoch": 1.2621035058430718, + "grad_norm": 0.6093948483467102, + "learning_rate": 7.479951397326853e-05, + "loss": 1.2373, + "step": 4158 + }, + { + "epoch": 1.2624070420397633, + "grad_norm": 0.5557296872138977, + "learning_rate": 7.479343863912516e-05, + "loss": 1.7976, + "step": 4159 + }, + { + "epoch": 1.2627105782364547, + "grad_norm": 0.5482549667358398, + "learning_rate": 7.478736330498178e-05, + "loss": 1.4302, + "step": 4160 + }, + { + "epoch": 1.2630141144331462, + "grad_norm": 0.5282002091407776, + "learning_rate": 7.478128797083839e-05, + "loss": 1.7744, + "step": 4161 + }, + { + "epoch": 1.2633176506298376, + "grad_norm": 0.49145814776420593, + "learning_rate": 7.477521263669502e-05, + "loss": 1.2955, + "step": 4162 + }, + { + "epoch": 1.263621186826529, + "grad_norm": 0.838631272315979, + "learning_rate": 7.476913730255164e-05, + "loss": 1.8272, + "step": 4163 + }, + { + "epoch": 1.2639247230232205, + "grad_norm": 0.5460755825042725, + "learning_rate": 7.476306196840827e-05, + "loss": 1.6878, + "step": 4164 + }, + { + "epoch": 1.2642282592199119, + "grad_norm": 0.5344128608703613, + "learning_rate": 7.475698663426489e-05, + "loss": 1.9606, + "step": 4165 + }, + { + "epoch": 1.2645317954166035, + "grad_norm": 0.5265620350837708, + "learning_rate": 7.47509113001215e-05, + "loss": 1.8173, + "step": 4166 + }, + { + "epoch": 1.2648353316132948, + "grad_norm": 0.5582383275032043, + "learning_rate": 7.474483596597814e-05, + "loss": 1.4313, + "step": 4167 + }, + { + "epoch": 1.2651388678099864, + "grad_norm": 0.5039302110671997, + "learning_rate": 7.473876063183475e-05, + "loss": 1.6151, + "step": 4168 + }, + { + "epoch": 1.2654424040066778, + "grad_norm": 0.48106664419174194, + "learning_rate": 7.473268529769137e-05, + "loss": 1.376, + "step": 4169 + }, + { + "epoch": 1.2657459402033693, + "grad_norm": 0.7529036998748779, + "learning_rate": 7.4726609963548e-05, + "loss": 1.2728, + "step": 4170 + }, + { + "epoch": 1.2660494764000607, + "grad_norm": 0.4341298043727875, + "learning_rate": 7.472053462940462e-05, + "loss": 1.1662, + "step": 4171 + }, + { + "epoch": 1.2663530125967521, + "grad_norm": 0.5080009698867798, + "learning_rate": 7.471445929526124e-05, + "loss": 1.8927, + "step": 4172 + }, + { + "epoch": 1.2666565487934436, + "grad_norm": 0.5409614443778992, + "learning_rate": 7.470838396111787e-05, + "loss": 1.548, + "step": 4173 + }, + { + "epoch": 1.266960084990135, + "grad_norm": 0.6036258339881897, + "learning_rate": 7.470230862697449e-05, + "loss": 1.5646, + "step": 4174 + }, + { + "epoch": 1.2672636211868264, + "grad_norm": 0.496652752161026, + "learning_rate": 7.46962332928311e-05, + "loss": 1.7042, + "step": 4175 + }, + { + "epoch": 1.2675671573835179, + "grad_norm": 0.5276821255683899, + "learning_rate": 7.469015795868773e-05, + "loss": 1.4722, + "step": 4176 + }, + { + "epoch": 1.2678706935802095, + "grad_norm": 0.4797695577144623, + "learning_rate": 7.468408262454435e-05, + "loss": 1.6569, + "step": 4177 + }, + { + "epoch": 1.268174229776901, + "grad_norm": 0.5196139812469482, + "learning_rate": 7.467800729040098e-05, + "loss": 1.7454, + "step": 4178 + }, + { + "epoch": 1.2684777659735924, + "grad_norm": 0.5565782785415649, + "learning_rate": 7.46719319562576e-05, + "loss": 1.6089, + "step": 4179 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.49715206027030945, + "learning_rate": 7.466585662211422e-05, + "loss": 1.6336, + "step": 4180 + }, + { + "epoch": 1.2690848383669753, + "grad_norm": 0.5397220849990845, + "learning_rate": 7.465978128797085e-05, + "loss": 1.452, + "step": 4181 + }, + { + "epoch": 1.2693883745636667, + "grad_norm": 0.7954735159873962, + "learning_rate": 7.465370595382746e-05, + "loss": 0.973, + "step": 4182 + }, + { + "epoch": 1.2696919107603581, + "grad_norm": 0.6067697405815125, + "learning_rate": 7.464763061968408e-05, + "loss": 1.5954, + "step": 4183 + }, + { + "epoch": 1.2699954469570496, + "grad_norm": 0.5883306860923767, + "learning_rate": 7.464155528554071e-05, + "loss": 2.072, + "step": 4184 + }, + { + "epoch": 1.270298983153741, + "grad_norm": 0.503072202205658, + "learning_rate": 7.463547995139733e-05, + "loss": 1.6075, + "step": 4185 + }, + { + "epoch": 1.2706025193504327, + "grad_norm": 0.4737991392612457, + "learning_rate": 7.462940461725395e-05, + "loss": 1.8853, + "step": 4186 + }, + { + "epoch": 1.2709060555471239, + "grad_norm": 0.6354210376739502, + "learning_rate": 7.462332928311058e-05, + "loss": 1.901, + "step": 4187 + }, + { + "epoch": 1.2712095917438155, + "grad_norm": 0.5082765817642212, + "learning_rate": 7.46172539489672e-05, + "loss": 1.3334, + "step": 4188 + }, + { + "epoch": 1.271513127940507, + "grad_norm": 0.5836446285247803, + "learning_rate": 7.461117861482381e-05, + "loss": 1.4225, + "step": 4189 + }, + { + "epoch": 1.2718166641371984, + "grad_norm": 0.6026635766029358, + "learning_rate": 7.460510328068044e-05, + "loss": 1.693, + "step": 4190 + }, + { + "epoch": 1.2721202003338898, + "grad_norm": 0.4391961395740509, + "learning_rate": 7.459902794653706e-05, + "loss": 1.5203, + "step": 4191 + }, + { + "epoch": 1.2724237365305813, + "grad_norm": 0.4942375123500824, + "learning_rate": 7.459295261239369e-05, + "loss": 1.8957, + "step": 4192 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.5597366094589233, + "learning_rate": 7.458687727825031e-05, + "loss": 1.6267, + "step": 4193 + }, + { + "epoch": 1.2730308089239641, + "grad_norm": 0.49395179748535156, + "learning_rate": 7.458080194410693e-05, + "loss": 1.6232, + "step": 4194 + }, + { + "epoch": 1.2733343451206556, + "grad_norm": 0.6147474646568298, + "learning_rate": 7.457472660996356e-05, + "loss": 1.1225, + "step": 4195 + }, + { + "epoch": 1.273637881317347, + "grad_norm": 0.5345576405525208, + "learning_rate": 7.456865127582017e-05, + "loss": 1.8809, + "step": 4196 + }, + { + "epoch": 1.2739414175140387, + "grad_norm": 0.5789499878883362, + "learning_rate": 7.456257594167679e-05, + "loss": 1.67, + "step": 4197 + }, + { + "epoch": 1.2742449537107299, + "grad_norm": 0.49217602610588074, + "learning_rate": 7.455650060753342e-05, + "loss": 1.8027, + "step": 4198 + }, + { + "epoch": 1.2745484899074215, + "grad_norm": 0.4023227095603943, + "learning_rate": 7.455042527339004e-05, + "loss": 1.1221, + "step": 4199 + }, + { + "epoch": 1.274852026104113, + "grad_norm": 0.5066853761672974, + "learning_rate": 7.454434993924666e-05, + "loss": 1.3252, + "step": 4200 + }, + { + "epoch": 1.2751555623008044, + "grad_norm": 0.46731558442115784, + "learning_rate": 7.453827460510329e-05, + "loss": 1.3056, + "step": 4201 + }, + { + "epoch": 1.2754590984974958, + "grad_norm": 0.594733715057373, + "learning_rate": 7.45321992709599e-05, + "loss": 1.8543, + "step": 4202 + }, + { + "epoch": 1.2757626346941873, + "grad_norm": 0.4806402027606964, + "learning_rate": 7.452612393681652e-05, + "loss": 1.8651, + "step": 4203 + }, + { + "epoch": 1.2760661708908787, + "grad_norm": 0.476089745759964, + "learning_rate": 7.452004860267315e-05, + "loss": 1.8084, + "step": 4204 + }, + { + "epoch": 1.2763697070875701, + "grad_norm": 0.4468570947647095, + "learning_rate": 7.451397326852977e-05, + "loss": 1.8509, + "step": 4205 + }, + { + "epoch": 1.2766732432842616, + "grad_norm": 0.608421802520752, + "learning_rate": 7.450789793438639e-05, + "loss": 1.3824, + "step": 4206 + }, + { + "epoch": 1.276976779480953, + "grad_norm": 0.517336905002594, + "learning_rate": 7.450182260024302e-05, + "loss": 1.7146, + "step": 4207 + }, + { + "epoch": 1.2772803156776447, + "grad_norm": 0.4810909032821655, + "learning_rate": 7.449574726609964e-05, + "loss": 1.6262, + "step": 4208 + }, + { + "epoch": 1.277583851874336, + "grad_norm": 0.5203115344047546, + "learning_rate": 7.448967193195627e-05, + "loss": 1.5095, + "step": 4209 + }, + { + "epoch": 1.2778873880710275, + "grad_norm": 0.48836538195610046, + "learning_rate": 7.448359659781288e-05, + "loss": 1.076, + "step": 4210 + }, + { + "epoch": 1.278190924267719, + "grad_norm": 0.49540603160858154, + "learning_rate": 7.44775212636695e-05, + "loss": 1.3751, + "step": 4211 + }, + { + "epoch": 1.2784944604644104, + "grad_norm": 0.4693083167076111, + "learning_rate": 7.447144592952613e-05, + "loss": 2.007, + "step": 4212 + }, + { + "epoch": 1.2787979966611018, + "grad_norm": 0.5337846875190735, + "learning_rate": 7.446537059538275e-05, + "loss": 1.7861, + "step": 4213 + }, + { + "epoch": 1.2791015328577933, + "grad_norm": 0.5780406594276428, + "learning_rate": 7.445929526123937e-05, + "loss": 1.6548, + "step": 4214 + }, + { + "epoch": 1.2794050690544847, + "grad_norm": 0.4388402998447418, + "learning_rate": 7.4453219927096e-05, + "loss": 1.8911, + "step": 4215 + }, + { + "epoch": 1.2797086052511761, + "grad_norm": 0.5437458753585815, + "learning_rate": 7.444714459295262e-05, + "loss": 2.0187, + "step": 4216 + }, + { + "epoch": 1.2800121414478678, + "grad_norm": 2.0740556716918945, + "learning_rate": 7.444106925880923e-05, + "loss": 1.926, + "step": 4217 + }, + { + "epoch": 1.280315677644559, + "grad_norm": 0.5022751688957214, + "learning_rate": 7.443499392466586e-05, + "loss": 1.6786, + "step": 4218 + }, + { + "epoch": 1.2806192138412507, + "grad_norm": 0.5324148535728455, + "learning_rate": 7.442891859052248e-05, + "loss": 1.6156, + "step": 4219 + }, + { + "epoch": 1.280922750037942, + "grad_norm": 0.5735637545585632, + "learning_rate": 7.44228432563791e-05, + "loss": 1.4541, + "step": 4220 + }, + { + "epoch": 1.2812262862346335, + "grad_norm": 0.4589940905570984, + "learning_rate": 7.441676792223573e-05, + "loss": 1.8461, + "step": 4221 + }, + { + "epoch": 1.281529822431325, + "grad_norm": 0.44274652004241943, + "learning_rate": 7.441069258809235e-05, + "loss": 1.992, + "step": 4222 + }, + { + "epoch": 1.2818333586280164, + "grad_norm": 0.5244073867797852, + "learning_rate": 7.440461725394898e-05, + "loss": 1.5408, + "step": 4223 + }, + { + "epoch": 1.2821368948247078, + "grad_norm": 0.513891339302063, + "learning_rate": 7.43985419198056e-05, + "loss": 1.6297, + "step": 4224 + }, + { + "epoch": 1.2824404310213993, + "grad_norm": 0.416079580783844, + "learning_rate": 7.439246658566221e-05, + "loss": 1.6186, + "step": 4225 + }, + { + "epoch": 1.2827439672180907, + "grad_norm": 0.4873940944671631, + "learning_rate": 7.438639125151884e-05, + "loss": 1.83, + "step": 4226 + }, + { + "epoch": 1.2830475034147821, + "grad_norm": 0.5091072916984558, + "learning_rate": 7.438031591737546e-05, + "loss": 1.4658, + "step": 4227 + }, + { + "epoch": 1.2833510396114738, + "grad_norm": 0.6976317167282104, + "learning_rate": 7.437424058323208e-05, + "loss": 1.8133, + "step": 4228 + }, + { + "epoch": 1.283654575808165, + "grad_norm": 0.5269485116004944, + "learning_rate": 7.436816524908871e-05, + "loss": 1.8103, + "step": 4229 + }, + { + "epoch": 1.2839581120048567, + "grad_norm": 0.44276031851768494, + "learning_rate": 7.436208991494533e-05, + "loss": 1.8184, + "step": 4230 + }, + { + "epoch": 1.284261648201548, + "grad_norm": 0.4946112632751465, + "learning_rate": 7.435601458080194e-05, + "loss": 1.7158, + "step": 4231 + }, + { + "epoch": 1.2845651843982395, + "grad_norm": 1.0636193752288818, + "learning_rate": 7.434993924665857e-05, + "loss": 1.621, + "step": 4232 + }, + { + "epoch": 1.284868720594931, + "grad_norm": 0.42557114362716675, + "learning_rate": 7.434386391251519e-05, + "loss": 1.4983, + "step": 4233 + }, + { + "epoch": 1.2851722567916224, + "grad_norm": 0.6611326932907104, + "learning_rate": 7.433778857837181e-05, + "loss": 1.8904, + "step": 4234 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.47779926657676697, + "learning_rate": 7.433171324422844e-05, + "loss": 1.8208, + "step": 4235 + }, + { + "epoch": 1.2857793291850053, + "grad_norm": 0.5227665901184082, + "learning_rate": 7.432563791008506e-05, + "loss": 1.713, + "step": 4236 + }, + { + "epoch": 1.2860828653816967, + "grad_norm": 0.5510228872299194, + "learning_rate": 7.431956257594169e-05, + "loss": 1.632, + "step": 4237 + }, + { + "epoch": 1.2863864015783881, + "grad_norm": 0.44311997294425964, + "learning_rate": 7.43134872417983e-05, + "loss": 1.5965, + "step": 4238 + }, + { + "epoch": 1.2866899377750798, + "grad_norm": 0.4647054672241211, + "learning_rate": 7.430741190765492e-05, + "loss": 1.7261, + "step": 4239 + }, + { + "epoch": 1.286993473971771, + "grad_norm": 0.564996063709259, + "learning_rate": 7.430133657351155e-05, + "loss": 1.1348, + "step": 4240 + }, + { + "epoch": 1.2872970101684627, + "grad_norm": 0.4639973044395447, + "learning_rate": 7.429526123936817e-05, + "loss": 1.4364, + "step": 4241 + }, + { + "epoch": 1.287600546365154, + "grad_norm": 0.5808007121086121, + "learning_rate": 7.428918590522479e-05, + "loss": 1.5214, + "step": 4242 + }, + { + "epoch": 1.2879040825618455, + "grad_norm": 0.5553866624832153, + "learning_rate": 7.428311057108142e-05, + "loss": 1.5734, + "step": 4243 + }, + { + "epoch": 1.288207618758537, + "grad_norm": 0.4662241041660309, + "learning_rate": 7.427703523693804e-05, + "loss": 2.1107, + "step": 4244 + }, + { + "epoch": 1.2885111549552284, + "grad_norm": 0.542239785194397, + "learning_rate": 7.427095990279465e-05, + "loss": 1.1648, + "step": 4245 + }, + { + "epoch": 1.2888146911519198, + "grad_norm": 0.531491756439209, + "learning_rate": 7.426488456865128e-05, + "loss": 1.8248, + "step": 4246 + }, + { + "epoch": 1.2891182273486113, + "grad_norm": 0.595971941947937, + "learning_rate": 7.42588092345079e-05, + "loss": 1.4951, + "step": 4247 + }, + { + "epoch": 1.289421763545303, + "grad_norm": 1.127319574356079, + "learning_rate": 7.425273390036452e-05, + "loss": 1.4612, + "step": 4248 + }, + { + "epoch": 1.2897252997419941, + "grad_norm": 0.5719674825668335, + "learning_rate": 7.424665856622115e-05, + "loss": 1.4476, + "step": 4249 + }, + { + "epoch": 1.2900288359386858, + "grad_norm": 0.552703320980072, + "learning_rate": 7.424058323207777e-05, + "loss": 2.0541, + "step": 4250 + }, + { + "epoch": 1.2903323721353772, + "grad_norm": 0.48743313550949097, + "learning_rate": 7.42345078979344e-05, + "loss": 1.3482, + "step": 4251 + }, + { + "epoch": 1.2906359083320686, + "grad_norm": 0.49920594692230225, + "learning_rate": 7.422843256379101e-05, + "loss": 1.2493, + "step": 4252 + }, + { + "epoch": 1.29093944452876, + "grad_norm": 2.3087148666381836, + "learning_rate": 7.422235722964763e-05, + "loss": 1.861, + "step": 4253 + }, + { + "epoch": 1.2912429807254515, + "grad_norm": 0.5297853946685791, + "learning_rate": 7.421628189550426e-05, + "loss": 1.4438, + "step": 4254 + }, + { + "epoch": 1.291546516922143, + "grad_norm": 0.5467548966407776, + "learning_rate": 7.421020656136087e-05, + "loss": 1.4626, + "step": 4255 + }, + { + "epoch": 1.2918500531188344, + "grad_norm": 0.5381457805633545, + "learning_rate": 7.42041312272175e-05, + "loss": 1.6073, + "step": 4256 + }, + { + "epoch": 1.2921535893155258, + "grad_norm": 0.5748366713523865, + "learning_rate": 7.419805589307413e-05, + "loss": 1.5323, + "step": 4257 + }, + { + "epoch": 1.2924571255122173, + "grad_norm": 0.4963832497596741, + "learning_rate": 7.419198055893075e-05, + "loss": 1.7466, + "step": 4258 + }, + { + "epoch": 1.292760661708909, + "grad_norm": 0.5308859348297119, + "learning_rate": 7.418590522478736e-05, + "loss": 1.613, + "step": 4259 + }, + { + "epoch": 1.2930641979056001, + "grad_norm": 0.5278881192207336, + "learning_rate": 7.4179829890644e-05, + "loss": 1.5005, + "step": 4260 + }, + { + "epoch": 1.2933677341022918, + "grad_norm": 0.5616053938865662, + "learning_rate": 7.417375455650061e-05, + "loss": 1.4673, + "step": 4261 + }, + { + "epoch": 1.2936712702989832, + "grad_norm": 0.4754112660884857, + "learning_rate": 7.416767922235723e-05, + "loss": 1.1533, + "step": 4262 + }, + { + "epoch": 1.2939748064956746, + "grad_norm": 0.4585922062397003, + "learning_rate": 7.416160388821386e-05, + "loss": 1.3095, + "step": 4263 + }, + { + "epoch": 1.294278342692366, + "grad_norm": 0.5392476916313171, + "learning_rate": 7.415552855407048e-05, + "loss": 1.8251, + "step": 4264 + }, + { + "epoch": 1.2945818788890575, + "grad_norm": 0.5653027296066284, + "learning_rate": 7.414945321992711e-05, + "loss": 1.4677, + "step": 4265 + }, + { + "epoch": 1.294885415085749, + "grad_norm": 0.4403519034385681, + "learning_rate": 7.414337788578372e-05, + "loss": 1.2953, + "step": 4266 + }, + { + "epoch": 1.2951889512824404, + "grad_norm": 0.5099658370018005, + "learning_rate": 7.413730255164034e-05, + "loss": 1.6105, + "step": 4267 + }, + { + "epoch": 1.2954924874791318, + "grad_norm": 0.48418527841567993, + "learning_rate": 7.413122721749697e-05, + "loss": 1.7371, + "step": 4268 + }, + { + "epoch": 1.2957960236758232, + "grad_norm": 0.4962136447429657, + "learning_rate": 7.412515188335358e-05, + "loss": 1.8332, + "step": 4269 + }, + { + "epoch": 1.296099559872515, + "grad_norm": 0.5248817801475525, + "learning_rate": 7.411907654921021e-05, + "loss": 1.0489, + "step": 4270 + }, + { + "epoch": 1.2964030960692061, + "grad_norm": 0.5496529936790466, + "learning_rate": 7.411300121506684e-05, + "loss": 1.5079, + "step": 4271 + }, + { + "epoch": 1.2967066322658978, + "grad_norm": 0.49035531282424927, + "learning_rate": 7.410692588092346e-05, + "loss": 1.7401, + "step": 4272 + }, + { + "epoch": 1.2970101684625892, + "grad_norm": 0.5801602602005005, + "learning_rate": 7.410085054678007e-05, + "loss": 2.0675, + "step": 4273 + }, + { + "epoch": 1.2973137046592806, + "grad_norm": 0.5056676268577576, + "learning_rate": 7.40947752126367e-05, + "loss": 1.7762, + "step": 4274 + }, + { + "epoch": 1.297617240855972, + "grad_norm": 0.7785037159919739, + "learning_rate": 7.408869987849332e-05, + "loss": 1.4058, + "step": 4275 + }, + { + "epoch": 1.2979207770526635, + "grad_norm": 1.1016004085540771, + "learning_rate": 7.408262454434994e-05, + "loss": 1.099, + "step": 4276 + }, + { + "epoch": 1.298224313249355, + "grad_norm": 0.4835539162158966, + "learning_rate": 7.407654921020657e-05, + "loss": 1.5021, + "step": 4277 + }, + { + "epoch": 1.2985278494460464, + "grad_norm": 0.4456567168235779, + "learning_rate": 7.407047387606319e-05, + "loss": 1.3587, + "step": 4278 + }, + { + "epoch": 1.298831385642738, + "grad_norm": 0.5003551840782166, + "learning_rate": 7.40643985419198e-05, + "loss": 1.6672, + "step": 4279 + }, + { + "epoch": 1.2991349218394292, + "grad_norm": 0.5069398880004883, + "learning_rate": 7.405832320777643e-05, + "loss": 1.4526, + "step": 4280 + }, + { + "epoch": 1.299438458036121, + "grad_norm": 0.5351117253303528, + "learning_rate": 7.405224787363305e-05, + "loss": 1.7542, + "step": 4281 + }, + { + "epoch": 1.2997419942328123, + "grad_norm": 0.5914597511291504, + "learning_rate": 7.404617253948968e-05, + "loss": 1.4507, + "step": 4282 + }, + { + "epoch": 1.3000455304295038, + "grad_norm": 0.6194307804107666, + "learning_rate": 7.404009720534629e-05, + "loss": 1.7291, + "step": 4283 + }, + { + "epoch": 1.3003490666261952, + "grad_norm": 0.37127256393432617, + "learning_rate": 7.403402187120292e-05, + "loss": 1.2599, + "step": 4284 + }, + { + "epoch": 1.3006526028228866, + "grad_norm": 0.49058303236961365, + "learning_rate": 7.402794653705955e-05, + "loss": 1.8608, + "step": 4285 + }, + { + "epoch": 1.300956139019578, + "grad_norm": 0.5458847880363464, + "learning_rate": 7.402187120291617e-05, + "loss": 1.7041, + "step": 4286 + }, + { + "epoch": 1.3012596752162695, + "grad_norm": 0.5570408701896667, + "learning_rate": 7.401579586877278e-05, + "loss": 1.8484, + "step": 4287 + }, + { + "epoch": 1.301563211412961, + "grad_norm": 0.5224193930625916, + "learning_rate": 7.400972053462941e-05, + "loss": 1.5517, + "step": 4288 + }, + { + "epoch": 1.3018667476096524, + "grad_norm": 0.6026464700698853, + "learning_rate": 7.400364520048603e-05, + "loss": 1.0592, + "step": 4289 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.5078088641166687, + "learning_rate": 7.399756986634265e-05, + "loss": 1.559, + "step": 4290 + }, + { + "epoch": 1.3024738200030352, + "grad_norm": 0.5993932485580444, + "learning_rate": 7.399149453219928e-05, + "loss": 1.608, + "step": 4291 + }, + { + "epoch": 1.302777356199727, + "grad_norm": 0.5589376091957092, + "learning_rate": 7.39854191980559e-05, + "loss": 1.5105, + "step": 4292 + }, + { + "epoch": 1.3030808923964183, + "grad_norm": 0.5910342931747437, + "learning_rate": 7.397934386391251e-05, + "loss": 1.6214, + "step": 4293 + }, + { + "epoch": 1.3033844285931098, + "grad_norm": 0.5900040864944458, + "learning_rate": 7.397326852976914e-05, + "loss": 1.6379, + "step": 4294 + }, + { + "epoch": 1.3036879647898012, + "grad_norm": 0.561262845993042, + "learning_rate": 7.396719319562576e-05, + "loss": 1.7905, + "step": 4295 + }, + { + "epoch": 1.3039915009864926, + "grad_norm": 0.4261028468608856, + "learning_rate": 7.396111786148239e-05, + "loss": 1.5822, + "step": 4296 + }, + { + "epoch": 1.304295037183184, + "grad_norm": 0.5175686478614807, + "learning_rate": 7.3955042527339e-05, + "loss": 1.6183, + "step": 4297 + }, + { + "epoch": 1.3045985733798755, + "grad_norm": 0.46133172512054443, + "learning_rate": 7.394896719319563e-05, + "loss": 1.7778, + "step": 4298 + }, + { + "epoch": 1.304902109576567, + "grad_norm": 1.0579636096954346, + "learning_rate": 7.394289185905226e-05, + "loss": 1.7789, + "step": 4299 + }, + { + "epoch": 1.3052056457732584, + "grad_norm": 0.5155093669891357, + "learning_rate": 7.393681652490888e-05, + "loss": 1.6568, + "step": 4300 + }, + { + "epoch": 1.30550918196995, + "grad_norm": 0.4386572241783142, + "learning_rate": 7.393074119076549e-05, + "loss": 1.8987, + "step": 4301 + }, + { + "epoch": 1.3058127181666412, + "grad_norm": 0.5592470169067383, + "learning_rate": 7.392466585662212e-05, + "loss": 1.764, + "step": 4302 + }, + { + "epoch": 1.306116254363333, + "grad_norm": 0.5633784532546997, + "learning_rate": 7.391859052247874e-05, + "loss": 1.6752, + "step": 4303 + }, + { + "epoch": 1.3064197905600243, + "grad_norm": 0.860517680644989, + "learning_rate": 7.391251518833536e-05, + "loss": 0.976, + "step": 4304 + }, + { + "epoch": 1.3067233267567158, + "grad_norm": 0.44970962405204773, + "learning_rate": 7.390643985419199e-05, + "loss": 1.7791, + "step": 4305 + }, + { + "epoch": 1.3070268629534072, + "grad_norm": 0.5651503205299377, + "learning_rate": 7.390036452004861e-05, + "loss": 1.2142, + "step": 4306 + }, + { + "epoch": 1.3073303991500986, + "grad_norm": 0.5687608122825623, + "learning_rate": 7.389428918590522e-05, + "loss": 1.91, + "step": 4307 + }, + { + "epoch": 1.30763393534679, + "grad_norm": 0.5673285126686096, + "learning_rate": 7.388821385176184e-05, + "loss": 1.6851, + "step": 4308 + }, + { + "epoch": 1.3079374715434815, + "grad_norm": 0.5170315504074097, + "learning_rate": 7.388213851761847e-05, + "loss": 1.8565, + "step": 4309 + }, + { + "epoch": 1.308241007740173, + "grad_norm": 0.45069336891174316, + "learning_rate": 7.38760631834751e-05, + "loss": 1.9078, + "step": 4310 + }, + { + "epoch": 1.3085445439368644, + "grad_norm": 0.5088868141174316, + "learning_rate": 7.386998784933171e-05, + "loss": 1.5292, + "step": 4311 + }, + { + "epoch": 1.308848080133556, + "grad_norm": 0.5309886932373047, + "learning_rate": 7.386391251518834e-05, + "loss": 1.8614, + "step": 4312 + }, + { + "epoch": 1.3091516163302475, + "grad_norm": 0.5747066140174866, + "learning_rate": 7.385783718104497e-05, + "loss": 1.4692, + "step": 4313 + }, + { + "epoch": 1.309455152526939, + "grad_norm": 0.5247424840927124, + "learning_rate": 7.385176184690159e-05, + "loss": 1.9508, + "step": 4314 + }, + { + "epoch": 1.3097586887236303, + "grad_norm": 0.7920752167701721, + "learning_rate": 7.38456865127582e-05, + "loss": 1.3849, + "step": 4315 + }, + { + "epoch": 1.3100622249203218, + "grad_norm": 0.5336897373199463, + "learning_rate": 7.383961117861483e-05, + "loss": 1.8924, + "step": 4316 + }, + { + "epoch": 1.3103657611170132, + "grad_norm": 0.5634369850158691, + "learning_rate": 7.383353584447145e-05, + "loss": 1.6056, + "step": 4317 + }, + { + "epoch": 1.3106692973137046, + "grad_norm": 0.5356997847557068, + "learning_rate": 7.382746051032807e-05, + "loss": 1.7636, + "step": 4318 + }, + { + "epoch": 1.310972833510396, + "grad_norm": 0.552720308303833, + "learning_rate": 7.38213851761847e-05, + "loss": 1.5682, + "step": 4319 + }, + { + "epoch": 1.3112763697070875, + "grad_norm": 0.5768778324127197, + "learning_rate": 7.381530984204132e-05, + "loss": 1.2844, + "step": 4320 + }, + { + "epoch": 1.3115799059037792, + "grad_norm": 0.45900124311447144, + "learning_rate": 7.380923450789793e-05, + "loss": 1.6181, + "step": 4321 + }, + { + "epoch": 1.3118834421004704, + "grad_norm": 0.5029597878456116, + "learning_rate": 7.380315917375455e-05, + "loss": 1.4201, + "step": 4322 + }, + { + "epoch": 1.312186978297162, + "grad_norm": 0.55179363489151, + "learning_rate": 7.379708383961118e-05, + "loss": 1.4521, + "step": 4323 + }, + { + "epoch": 1.3124905144938535, + "grad_norm": 0.49221259355545044, + "learning_rate": 7.379100850546781e-05, + "loss": 1.5294, + "step": 4324 + }, + { + "epoch": 1.312794050690545, + "grad_norm": 0.5700541734695435, + "learning_rate": 7.378493317132442e-05, + "loss": 1.5504, + "step": 4325 + }, + { + "epoch": 1.3130975868872363, + "grad_norm": 0.3949977159500122, + "learning_rate": 7.377885783718105e-05, + "loss": 1.4024, + "step": 4326 + }, + { + "epoch": 1.3134011230839278, + "grad_norm": 0.4766468107700348, + "learning_rate": 7.377278250303768e-05, + "loss": 1.9027, + "step": 4327 + }, + { + "epoch": 1.3137046592806192, + "grad_norm": 0.6489190459251404, + "learning_rate": 7.376670716889428e-05, + "loss": 1.6306, + "step": 4328 + }, + { + "epoch": 1.3140081954773106, + "grad_norm": 0.5466142892837524, + "learning_rate": 7.376063183475091e-05, + "loss": 1.619, + "step": 4329 + }, + { + "epoch": 1.314311731674002, + "grad_norm": 0.5554240345954895, + "learning_rate": 7.375455650060754e-05, + "loss": 1.8692, + "step": 4330 + }, + { + "epoch": 1.3146152678706935, + "grad_norm": 0.473034143447876, + "learning_rate": 7.374848116646416e-05, + "loss": 1.8351, + "step": 4331 + }, + { + "epoch": 1.3149188040673851, + "grad_norm": 0.5100724697113037, + "learning_rate": 7.374240583232078e-05, + "loss": 1.6781, + "step": 4332 + }, + { + "epoch": 1.3152223402640764, + "grad_norm": 0.48680275678634644, + "learning_rate": 7.373633049817741e-05, + "loss": 1.6358, + "step": 4333 + }, + { + "epoch": 1.315525876460768, + "grad_norm": 0.5869154334068298, + "learning_rate": 7.373025516403403e-05, + "loss": 1.5926, + "step": 4334 + }, + { + "epoch": 1.3158294126574595, + "grad_norm": 0.4726203382015228, + "learning_rate": 7.372417982989064e-05, + "loss": 1.6651, + "step": 4335 + }, + { + "epoch": 1.3161329488541509, + "grad_norm": 0.9809671640396118, + "learning_rate": 7.371810449574726e-05, + "loss": 1.1371, + "step": 4336 + }, + { + "epoch": 1.3164364850508423, + "grad_norm": 0.5169830322265625, + "learning_rate": 7.371202916160389e-05, + "loss": 1.4736, + "step": 4337 + }, + { + "epoch": 1.3167400212475338, + "grad_norm": 0.48734250664711, + "learning_rate": 7.370595382746052e-05, + "loss": 1.6371, + "step": 4338 + }, + { + "epoch": 1.3170435574442252, + "grad_norm": 0.5264248847961426, + "learning_rate": 7.369987849331713e-05, + "loss": 1.7498, + "step": 4339 + }, + { + "epoch": 1.3173470936409166, + "grad_norm": 0.5730972290039062, + "learning_rate": 7.369380315917376e-05, + "loss": 1.3057, + "step": 4340 + }, + { + "epoch": 1.317650629837608, + "grad_norm": 0.776110827922821, + "learning_rate": 7.368772782503039e-05, + "loss": 1.4484, + "step": 4341 + }, + { + "epoch": 1.3179541660342995, + "grad_norm": 0.5012614130973816, + "learning_rate": 7.368165249088699e-05, + "loss": 1.041, + "step": 4342 + }, + { + "epoch": 1.3182577022309911, + "grad_norm": 0.5454205870628357, + "learning_rate": 7.367557715674362e-05, + "loss": 1.3911, + "step": 4343 + }, + { + "epoch": 1.3185612384276826, + "grad_norm": 0.4966050386428833, + "learning_rate": 7.366950182260025e-05, + "loss": 1.8287, + "step": 4344 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.5605126619338989, + "learning_rate": 7.366342648845687e-05, + "loss": 2.0233, + "step": 4345 + }, + { + "epoch": 1.3191683108210654, + "grad_norm": 0.5626348853111267, + "learning_rate": 7.365735115431349e-05, + "loss": 1.4652, + "step": 4346 + }, + { + "epoch": 1.3194718470177569, + "grad_norm": 0.5361192226409912, + "learning_rate": 7.365127582017012e-05, + "loss": 1.8023, + "step": 4347 + }, + { + "epoch": 1.3197753832144483, + "grad_norm": 0.6468534469604492, + "learning_rate": 7.364520048602674e-05, + "loss": 1.7418, + "step": 4348 + }, + { + "epoch": 1.3200789194111398, + "grad_norm": 0.5218302607536316, + "learning_rate": 7.363912515188335e-05, + "loss": 1.7832, + "step": 4349 + }, + { + "epoch": 1.3203824556078312, + "grad_norm": 0.4624869227409363, + "learning_rate": 7.363304981773997e-05, + "loss": 1.8654, + "step": 4350 + }, + { + "epoch": 1.3206859918045226, + "grad_norm": 1.073112964630127, + "learning_rate": 7.36269744835966e-05, + "loss": 1.6806, + "step": 4351 + }, + { + "epoch": 1.3209895280012143, + "grad_norm": 0.5314664244651794, + "learning_rate": 7.362089914945322e-05, + "loss": 1.7686, + "step": 4352 + }, + { + "epoch": 1.3212930641979055, + "grad_norm": 0.49280011653900146, + "learning_rate": 7.361482381530984e-05, + "loss": 1.8869, + "step": 4353 + }, + { + "epoch": 1.3215966003945971, + "grad_norm": 0.5610837340354919, + "learning_rate": 7.360874848116647e-05, + "loss": 1.6387, + "step": 4354 + }, + { + "epoch": 1.3219001365912886, + "grad_norm": 0.47585153579711914, + "learning_rate": 7.36026731470231e-05, + "loss": 1.5936, + "step": 4355 + }, + { + "epoch": 1.32220367278798, + "grad_norm": 0.6694095134735107, + "learning_rate": 7.35965978128797e-05, + "loss": 1.3855, + "step": 4356 + }, + { + "epoch": 1.3225072089846714, + "grad_norm": 0.5073167085647583, + "learning_rate": 7.359052247873633e-05, + "loss": 1.609, + "step": 4357 + }, + { + "epoch": 1.3228107451813629, + "grad_norm": 0.567101240158081, + "learning_rate": 7.358444714459296e-05, + "loss": 1.7975, + "step": 4358 + }, + { + "epoch": 1.3231142813780543, + "grad_norm": 0.5210965275764465, + "learning_rate": 7.357837181044958e-05, + "loss": 1.6289, + "step": 4359 + }, + { + "epoch": 1.3234178175747457, + "grad_norm": 0.463466078042984, + "learning_rate": 7.35722964763062e-05, + "loss": 1.4126, + "step": 4360 + }, + { + "epoch": 1.3237213537714372, + "grad_norm": 0.523644208908081, + "learning_rate": 7.356622114216283e-05, + "loss": 1.6897, + "step": 4361 + }, + { + "epoch": 1.3240248899681286, + "grad_norm": 0.5138276815414429, + "learning_rate": 7.356014580801945e-05, + "loss": 1.6961, + "step": 4362 + }, + { + "epoch": 1.3243284261648203, + "grad_norm": 0.4663618505001068, + "learning_rate": 7.355407047387606e-05, + "loss": 1.2274, + "step": 4363 + }, + { + "epoch": 1.3246319623615115, + "grad_norm": 0.5608078837394714, + "learning_rate": 7.354799513973268e-05, + "loss": 1.3882, + "step": 4364 + }, + { + "epoch": 1.3249354985582031, + "grad_norm": 0.4431968629360199, + "learning_rate": 7.354191980558931e-05, + "loss": 1.8665, + "step": 4365 + }, + { + "epoch": 1.3252390347548946, + "grad_norm": 0.5137575268745422, + "learning_rate": 7.353584447144593e-05, + "loss": 1.8504, + "step": 4366 + }, + { + "epoch": 1.325542570951586, + "grad_norm": 0.5060153603553772, + "learning_rate": 7.352976913730255e-05, + "loss": 1.981, + "step": 4367 + }, + { + "epoch": 1.3258461071482774, + "grad_norm": 0.4878283441066742, + "learning_rate": 7.352369380315918e-05, + "loss": 0.8917, + "step": 4368 + }, + { + "epoch": 1.3261496433449689, + "grad_norm": 0.5126661658287048, + "learning_rate": 7.351761846901581e-05, + "loss": 1.7912, + "step": 4369 + }, + { + "epoch": 1.3264531795416603, + "grad_norm": 0.742200493812561, + "learning_rate": 7.351154313487241e-05, + "loss": 1.3088, + "step": 4370 + }, + { + "epoch": 1.3267567157383517, + "grad_norm": 0.5806966423988342, + "learning_rate": 7.350546780072904e-05, + "loss": 1.1248, + "step": 4371 + }, + { + "epoch": 1.3270602519350432, + "grad_norm": 0.46205493807792664, + "learning_rate": 7.349939246658567e-05, + "loss": 1.6596, + "step": 4372 + }, + { + "epoch": 1.3273637881317346, + "grad_norm": 0.5538312196731567, + "learning_rate": 7.349331713244229e-05, + "loss": 1.3168, + "step": 4373 + }, + { + "epoch": 1.3276673243284263, + "grad_norm": 0.49692967534065247, + "learning_rate": 7.348724179829891e-05, + "loss": 1.9917, + "step": 4374 + }, + { + "epoch": 1.3279708605251177, + "grad_norm": 0.5603296160697937, + "learning_rate": 7.348116646415554e-05, + "loss": 1.5504, + "step": 4375 + }, + { + "epoch": 1.3282743967218091, + "grad_norm": 0.5459002256393433, + "learning_rate": 7.347509113001216e-05, + "loss": 1.5883, + "step": 4376 + }, + { + "epoch": 1.3285779329185006, + "grad_norm": 0.5703235864639282, + "learning_rate": 7.346901579586877e-05, + "loss": 1.5153, + "step": 4377 + }, + { + "epoch": 1.328881469115192, + "grad_norm": 0.5803839564323425, + "learning_rate": 7.346294046172539e-05, + "loss": 1.6111, + "step": 4378 + }, + { + "epoch": 1.3291850053118834, + "grad_norm": 0.514336347579956, + "learning_rate": 7.345686512758202e-05, + "loss": 1.4001, + "step": 4379 + }, + { + "epoch": 1.3294885415085749, + "grad_norm": 0.5741514563560486, + "learning_rate": 7.345078979343864e-05, + "loss": 1.8245, + "step": 4380 + }, + { + "epoch": 1.3297920777052663, + "grad_norm": 0.5243557691574097, + "learning_rate": 7.344471445929526e-05, + "loss": 1.6894, + "step": 4381 + }, + { + "epoch": 1.3300956139019577, + "grad_norm": 0.529828667640686, + "learning_rate": 7.343863912515189e-05, + "loss": 1.9486, + "step": 4382 + }, + { + "epoch": 1.3303991500986494, + "grad_norm": 0.5640398859977722, + "learning_rate": 7.343256379100852e-05, + "loss": 1.3146, + "step": 4383 + }, + { + "epoch": 1.3307026862953406, + "grad_norm": 0.537376344203949, + "learning_rate": 7.342648845686512e-05, + "loss": 1.6864, + "step": 4384 + }, + { + "epoch": 1.3310062224920323, + "grad_norm": 0.5403789281845093, + "learning_rate": 7.342041312272175e-05, + "loss": 1.2394, + "step": 4385 + }, + { + "epoch": 1.3313097586887237, + "grad_norm": 0.5978288054466248, + "learning_rate": 7.341433778857838e-05, + "loss": 1.6157, + "step": 4386 + }, + { + "epoch": 1.3316132948854151, + "grad_norm": 0.5005367398262024, + "learning_rate": 7.3408262454435e-05, + "loss": 1.8174, + "step": 4387 + }, + { + "epoch": 1.3319168310821066, + "grad_norm": 0.4958413541316986, + "learning_rate": 7.340218712029162e-05, + "loss": 1.0845, + "step": 4388 + }, + { + "epoch": 1.332220367278798, + "grad_norm": 0.48872581124305725, + "learning_rate": 7.339611178614824e-05, + "loss": 1.7209, + "step": 4389 + }, + { + "epoch": 1.3325239034754894, + "grad_norm": 0.4161425232887268, + "learning_rate": 7.339003645200487e-05, + "loss": 1.4084, + "step": 4390 + }, + { + "epoch": 1.3328274396721809, + "grad_norm": 0.5561462044715881, + "learning_rate": 7.338396111786148e-05, + "loss": 1.7643, + "step": 4391 + }, + { + "epoch": 1.3331309758688723, + "grad_norm": 0.5434063673019409, + "learning_rate": 7.33778857837181e-05, + "loss": 1.4755, + "step": 4392 + }, + { + "epoch": 1.3334345120655637, + "grad_norm": 0.516948938369751, + "learning_rate": 7.337181044957473e-05, + "loss": 1.6645, + "step": 4393 + }, + { + "epoch": 1.3337380482622554, + "grad_norm": 0.420340359210968, + "learning_rate": 7.336573511543135e-05, + "loss": 1.1486, + "step": 4394 + }, + { + "epoch": 1.3340415844589466, + "grad_norm": 0.49778157472610474, + "learning_rate": 7.335965978128797e-05, + "loss": 1.8017, + "step": 4395 + }, + { + "epoch": 1.3343451206556383, + "grad_norm": 0.5771467685699463, + "learning_rate": 7.33535844471446e-05, + "loss": 1.7241, + "step": 4396 + }, + { + "epoch": 1.3346486568523297, + "grad_norm": 0.5317919254302979, + "learning_rate": 7.334750911300123e-05, + "loss": 1.7381, + "step": 4397 + }, + { + "epoch": 1.3349521930490211, + "grad_norm": 0.37729716300964355, + "learning_rate": 7.334143377885783e-05, + "loss": 1.4022, + "step": 4398 + }, + { + "epoch": 1.3352557292457126, + "grad_norm": 0.4068670868873596, + "learning_rate": 7.333535844471446e-05, + "loss": 1.8955, + "step": 4399 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.5250211358070374, + "learning_rate": 7.33292831105711e-05, + "loss": 1.9555, + "step": 4400 + }, + { + "epoch": 1.3358628016390954, + "grad_norm": 0.45118898153305054, + "learning_rate": 7.33232077764277e-05, + "loss": 1.9174, + "step": 4401 + }, + { + "epoch": 1.3361663378357869, + "grad_norm": 0.5139362812042236, + "learning_rate": 7.331713244228433e-05, + "loss": 1.4453, + "step": 4402 + }, + { + "epoch": 1.3364698740324783, + "grad_norm": 0.5674066543579102, + "learning_rate": 7.331105710814095e-05, + "loss": 1.7853, + "step": 4403 + }, + { + "epoch": 1.3367734102291697, + "grad_norm": 0.5428693294525146, + "learning_rate": 7.330498177399758e-05, + "loss": 1.7516, + "step": 4404 + }, + { + "epoch": 1.3370769464258614, + "grad_norm": 0.4749910533428192, + "learning_rate": 7.32989064398542e-05, + "loss": 1.1845, + "step": 4405 + }, + { + "epoch": 1.3373804826225526, + "grad_norm": 0.5248123407363892, + "learning_rate": 7.329283110571081e-05, + "loss": 1.7238, + "step": 4406 + }, + { + "epoch": 1.3376840188192443, + "grad_norm": 0.5120264887809753, + "learning_rate": 7.328675577156744e-05, + "loss": 1.5947, + "step": 4407 + }, + { + "epoch": 1.3379875550159357, + "grad_norm": 0.4882364869117737, + "learning_rate": 7.328068043742406e-05, + "loss": 1.7606, + "step": 4408 + }, + { + "epoch": 1.3382910912126271, + "grad_norm": 0.5949615240097046, + "learning_rate": 7.327460510328068e-05, + "loss": 1.6298, + "step": 4409 + }, + { + "epoch": 1.3385946274093186, + "grad_norm": 0.5893341898918152, + "learning_rate": 7.326852976913731e-05, + "loss": 1.6613, + "step": 4410 + }, + { + "epoch": 1.33889816360601, + "grad_norm": 0.6306172609329224, + "learning_rate": 7.326245443499394e-05, + "loss": 1.3808, + "step": 4411 + }, + { + "epoch": 1.3392016998027014, + "grad_norm": 0.5589039325714111, + "learning_rate": 7.325637910085054e-05, + "loss": 1.5282, + "step": 4412 + }, + { + "epoch": 1.3395052359993929, + "grad_norm": 0.4946483373641968, + "learning_rate": 7.325030376670717e-05, + "loss": 1.5924, + "step": 4413 + }, + { + "epoch": 1.3398087721960845, + "grad_norm": 0.5384162068367004, + "learning_rate": 7.32442284325638e-05, + "loss": 1.7231, + "step": 4414 + }, + { + "epoch": 1.3401123083927757, + "grad_norm": 0.5065357685089111, + "learning_rate": 7.323815309842041e-05, + "loss": 1.795, + "step": 4415 + }, + { + "epoch": 1.3404158445894674, + "grad_norm": 0.5467350482940674, + "learning_rate": 7.323207776427704e-05, + "loss": 2.1203, + "step": 4416 + }, + { + "epoch": 1.3407193807861588, + "grad_norm": 0.9943485260009766, + "learning_rate": 7.322600243013366e-05, + "loss": 1.6615, + "step": 4417 + }, + { + "epoch": 1.3410229169828503, + "grad_norm": 0.48699676990509033, + "learning_rate": 7.321992709599029e-05, + "loss": 1.6614, + "step": 4418 + }, + { + "epoch": 1.3413264531795417, + "grad_norm": 0.5375555753707886, + "learning_rate": 7.32138517618469e-05, + "loss": 1.8225, + "step": 4419 + }, + { + "epoch": 1.3416299893762331, + "grad_norm": 0.43219107389450073, + "learning_rate": 7.320777642770352e-05, + "loss": 1.8322, + "step": 4420 + }, + { + "epoch": 1.3419335255729246, + "grad_norm": 0.8610438108444214, + "learning_rate": 7.320170109356015e-05, + "loss": 1.5714, + "step": 4421 + }, + { + "epoch": 1.342237061769616, + "grad_norm": 0.4116555154323578, + "learning_rate": 7.319562575941677e-05, + "loss": 1.527, + "step": 4422 + }, + { + "epoch": 1.3425405979663074, + "grad_norm": 0.7473874688148499, + "learning_rate": 7.318955042527339e-05, + "loss": 1.7131, + "step": 4423 + }, + { + "epoch": 1.3428441341629989, + "grad_norm": 0.6852518320083618, + "learning_rate": 7.318347509113002e-05, + "loss": 1.9008, + "step": 4424 + }, + { + "epoch": 1.3431476703596905, + "grad_norm": 0.4978015124797821, + "learning_rate": 7.317739975698664e-05, + "loss": 1.6112, + "step": 4425 + }, + { + "epoch": 1.3434512065563817, + "grad_norm": 0.4848748445510864, + "learning_rate": 7.317132442284325e-05, + "loss": 2.0183, + "step": 4426 + }, + { + "epoch": 1.3437547427530734, + "grad_norm": 0.5683912634849548, + "learning_rate": 7.316524908869988e-05, + "loss": 1.4524, + "step": 4427 + }, + { + "epoch": 1.3440582789497648, + "grad_norm": 0.5863691568374634, + "learning_rate": 7.315917375455651e-05, + "loss": 1.5636, + "step": 4428 + }, + { + "epoch": 1.3443618151464563, + "grad_norm": 0.5731346011161804, + "learning_rate": 7.315309842041312e-05, + "loss": 1.7794, + "step": 4429 + }, + { + "epoch": 1.3446653513431477, + "grad_norm": 0.46514976024627686, + "learning_rate": 7.314702308626975e-05, + "loss": 1.6708, + "step": 4430 + }, + { + "epoch": 1.3449688875398391, + "grad_norm": 0.47035089135169983, + "learning_rate": 7.314094775212637e-05, + "loss": 1.4858, + "step": 4431 + }, + { + "epoch": 1.3452724237365306, + "grad_norm": 0.6432058215141296, + "learning_rate": 7.3134872417983e-05, + "loss": 1.9622, + "step": 4432 + }, + { + "epoch": 1.345575959933222, + "grad_norm": 0.6775307655334473, + "learning_rate": 7.312879708383961e-05, + "loss": 1.874, + "step": 4433 + }, + { + "epoch": 1.3458794961299134, + "grad_norm": 0.4946788251399994, + "learning_rate": 7.312272174969623e-05, + "loss": 1.6998, + "step": 4434 + }, + { + "epoch": 1.3461830323266049, + "grad_norm": 0.5256187319755554, + "learning_rate": 7.311664641555286e-05, + "loss": 1.6884, + "step": 4435 + }, + { + "epoch": 1.3464865685232965, + "grad_norm": 0.526914119720459, + "learning_rate": 7.311057108140948e-05, + "loss": 1.5151, + "step": 4436 + }, + { + "epoch": 1.3467901047199877, + "grad_norm": 0.5384836196899414, + "learning_rate": 7.31044957472661e-05, + "loss": 1.5689, + "step": 4437 + }, + { + "epoch": 1.3470936409166794, + "grad_norm": 0.47421467304229736, + "learning_rate": 7.309842041312273e-05, + "loss": 1.6315, + "step": 4438 + }, + { + "epoch": 1.3473971771133708, + "grad_norm": 0.5070328712463379, + "learning_rate": 7.309234507897935e-05, + "loss": 1.7336, + "step": 4439 + }, + { + "epoch": 1.3477007133100622, + "grad_norm": 0.5571221113204956, + "learning_rate": 7.308626974483596e-05, + "loss": 1.4672, + "step": 4440 + }, + { + "epoch": 1.3480042495067537, + "grad_norm": 0.5508000254631042, + "learning_rate": 7.30801944106926e-05, + "loss": 1.7045, + "step": 4441 + }, + { + "epoch": 1.3483077857034451, + "grad_norm": 0.46867290139198303, + "learning_rate": 7.307411907654922e-05, + "loss": 1.7144, + "step": 4442 + }, + { + "epoch": 1.3486113219001366, + "grad_norm": 0.486806720495224, + "learning_rate": 7.306804374240583e-05, + "loss": 1.2906, + "step": 4443 + }, + { + "epoch": 1.348914858096828, + "grad_norm": 0.5570629835128784, + "learning_rate": 7.306196840826246e-05, + "loss": 1.2759, + "step": 4444 + }, + { + "epoch": 1.3492183942935196, + "grad_norm": 0.4644160270690918, + "learning_rate": 7.305589307411908e-05, + "loss": 1.7632, + "step": 4445 + }, + { + "epoch": 1.3495219304902109, + "grad_norm": 0.6232413053512573, + "learning_rate": 7.304981773997571e-05, + "loss": 1.6626, + "step": 4446 + }, + { + "epoch": 1.3498254666869025, + "grad_norm": 0.61170893907547, + "learning_rate": 7.304374240583232e-05, + "loss": 1.5876, + "step": 4447 + }, + { + "epoch": 1.350129002883594, + "grad_norm": 0.5944005250930786, + "learning_rate": 7.303766707168894e-05, + "loss": 1.8504, + "step": 4448 + }, + { + "epoch": 1.3504325390802854, + "grad_norm": 0.39353135228157043, + "learning_rate": 7.303159173754557e-05, + "loss": 0.5195, + "step": 4449 + }, + { + "epoch": 1.3507360752769768, + "grad_norm": 0.5182314515113831, + "learning_rate": 7.302551640340219e-05, + "loss": 1.7379, + "step": 4450 + }, + { + "epoch": 1.3510396114736682, + "grad_norm": 0.43427690863609314, + "learning_rate": 7.301944106925881e-05, + "loss": 1.8747, + "step": 4451 + }, + { + "epoch": 1.3513431476703597, + "grad_norm": 0.5645577907562256, + "learning_rate": 7.301336573511544e-05, + "loss": 1.6844, + "step": 4452 + }, + { + "epoch": 1.3516466838670511, + "grad_norm": 0.4782490134239197, + "learning_rate": 7.300729040097206e-05, + "loss": 1.242, + "step": 4453 + }, + { + "epoch": 1.3519502200637425, + "grad_norm": 0.3485776484012604, + "learning_rate": 7.300121506682867e-05, + "loss": 1.4581, + "step": 4454 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.5047571659088135, + "learning_rate": 7.29951397326853e-05, + "loss": 1.7181, + "step": 4455 + }, + { + "epoch": 1.3525572924571256, + "grad_norm": 0.48931753635406494, + "learning_rate": 7.298906439854193e-05, + "loss": 1.8102, + "step": 4456 + }, + { + "epoch": 1.3528608286538168, + "grad_norm": 0.585381031036377, + "learning_rate": 7.298298906439854e-05, + "loss": 1.4796, + "step": 4457 + }, + { + "epoch": 1.3531643648505085, + "grad_norm": 0.5762146711349487, + "learning_rate": 7.297691373025517e-05, + "loss": 1.7404, + "step": 4458 + }, + { + "epoch": 1.3534679010472, + "grad_norm": 0.37271368503570557, + "learning_rate": 7.297083839611179e-05, + "loss": 1.2543, + "step": 4459 + }, + { + "epoch": 1.3537714372438914, + "grad_norm": 0.4939133822917938, + "learning_rate": 7.296476306196842e-05, + "loss": 1.6854, + "step": 4460 + }, + { + "epoch": 1.3540749734405828, + "grad_norm": 0.5658159255981445, + "learning_rate": 7.295868772782503e-05, + "loss": 1.9223, + "step": 4461 + }, + { + "epoch": 1.3543785096372742, + "grad_norm": 0.5128167271614075, + "learning_rate": 7.295261239368165e-05, + "loss": 1.8189, + "step": 4462 + }, + { + "epoch": 1.3546820458339657, + "grad_norm": 0.6183301210403442, + "learning_rate": 7.294653705953828e-05, + "loss": 1.3022, + "step": 4463 + }, + { + "epoch": 1.354985582030657, + "grad_norm": 0.5234330892562866, + "learning_rate": 7.29404617253949e-05, + "loss": 1.7716, + "step": 4464 + }, + { + "epoch": 1.3552891182273485, + "grad_norm": 0.5098745226860046, + "learning_rate": 7.293438639125152e-05, + "loss": 1.8543, + "step": 4465 + }, + { + "epoch": 1.35559265442404, + "grad_norm": 0.5108742117881775, + "learning_rate": 7.292831105710815e-05, + "loss": 1.3576, + "step": 4466 + }, + { + "epoch": 1.3558961906207316, + "grad_norm": 0.4484347701072693, + "learning_rate": 7.292223572296477e-05, + "loss": 1.7337, + "step": 4467 + }, + { + "epoch": 1.3561997268174228, + "grad_norm": 0.6011006236076355, + "learning_rate": 7.291616038882138e-05, + "loss": 1.1075, + "step": 4468 + }, + { + "epoch": 1.3565032630141145, + "grad_norm": 0.530351459980011, + "learning_rate": 7.291008505467801e-05, + "loss": 1.6846, + "step": 4469 + }, + { + "epoch": 1.356806799210806, + "grad_norm": 0.5644029974937439, + "learning_rate": 7.290400972053464e-05, + "loss": 1.6254, + "step": 4470 + }, + { + "epoch": 1.3571103354074974, + "grad_norm": 0.5434947609901428, + "learning_rate": 7.289793438639125e-05, + "loss": 1.7514, + "step": 4471 + }, + { + "epoch": 1.3574138716041888, + "grad_norm": 0.4561974108219147, + "learning_rate": 7.289185905224788e-05, + "loss": 1.7515, + "step": 4472 + }, + { + "epoch": 1.3577174078008802, + "grad_norm": 0.5263285636901855, + "learning_rate": 7.28857837181045e-05, + "loss": 1.549, + "step": 4473 + }, + { + "epoch": 1.3580209439975717, + "grad_norm": 0.4440256953239441, + "learning_rate": 7.287970838396111e-05, + "loss": 1.4366, + "step": 4474 + }, + { + "epoch": 1.358324480194263, + "grad_norm": 0.4798101782798767, + "learning_rate": 7.287363304981774e-05, + "loss": 1.9394, + "step": 4475 + }, + { + "epoch": 1.3586280163909545, + "grad_norm": 0.5924159288406372, + "learning_rate": 7.286755771567436e-05, + "loss": 1.1942, + "step": 4476 + }, + { + "epoch": 1.358931552587646, + "grad_norm": 0.5440402030944824, + "learning_rate": 7.286148238153099e-05, + "loss": 1.889, + "step": 4477 + }, + { + "epoch": 1.3592350887843376, + "grad_norm": 0.5075298547744751, + "learning_rate": 7.285540704738761e-05, + "loss": 1.8719, + "step": 4478 + }, + { + "epoch": 1.359538624981029, + "grad_norm": 0.5377494692802429, + "learning_rate": 7.284933171324423e-05, + "loss": 1.67, + "step": 4479 + }, + { + "epoch": 1.3598421611777205, + "grad_norm": 0.5329782366752625, + "learning_rate": 7.284325637910086e-05, + "loss": 1.7687, + "step": 4480 + }, + { + "epoch": 1.360145697374412, + "grad_norm": 0.5144550204277039, + "learning_rate": 7.283718104495748e-05, + "loss": 1.6271, + "step": 4481 + }, + { + "epoch": 1.3604492335711034, + "grad_norm": 0.4970364570617676, + "learning_rate": 7.283110571081409e-05, + "loss": 1.7589, + "step": 4482 + }, + { + "epoch": 1.3607527697677948, + "grad_norm": 1.1922450065612793, + "learning_rate": 7.282503037667072e-05, + "loss": 1.6477, + "step": 4483 + }, + { + "epoch": 1.3610563059644862, + "grad_norm": 0.5640878677368164, + "learning_rate": 7.281895504252734e-05, + "loss": 1.6746, + "step": 4484 + }, + { + "epoch": 1.3613598421611777, + "grad_norm": 0.496259868144989, + "learning_rate": 7.281287970838396e-05, + "loss": 1.7005, + "step": 4485 + }, + { + "epoch": 1.361663378357869, + "grad_norm": 0.5209731459617615, + "learning_rate": 7.280680437424059e-05, + "loss": 1.7817, + "step": 4486 + }, + { + "epoch": 1.3619669145545608, + "grad_norm": 0.5042216777801514, + "learning_rate": 7.28007290400972e-05, + "loss": 1.8656, + "step": 4487 + }, + { + "epoch": 1.362270450751252, + "grad_norm": 0.693298876285553, + "learning_rate": 7.279465370595382e-05, + "loss": 1.9507, + "step": 4488 + }, + { + "epoch": 1.3625739869479436, + "grad_norm": 0.5207253694534302, + "learning_rate": 7.278857837181045e-05, + "loss": 1.886, + "step": 4489 + }, + { + "epoch": 1.362877523144635, + "grad_norm": 0.5372057557106018, + "learning_rate": 7.278250303766707e-05, + "loss": 1.1538, + "step": 4490 + }, + { + "epoch": 1.3631810593413265, + "grad_norm": 0.43779316544532776, + "learning_rate": 7.27764277035237e-05, + "loss": 1.0436, + "step": 4491 + }, + { + "epoch": 1.363484595538018, + "grad_norm": 0.5032690763473511, + "learning_rate": 7.277035236938032e-05, + "loss": 1.5441, + "step": 4492 + }, + { + "epoch": 1.3637881317347094, + "grad_norm": 0.43449172377586365, + "learning_rate": 7.276427703523694e-05, + "loss": 1.787, + "step": 4493 + }, + { + "epoch": 1.3640916679314008, + "grad_norm": 0.5264309644699097, + "learning_rate": 7.275820170109357e-05, + "loss": 1.8857, + "step": 4494 + }, + { + "epoch": 1.3643952041280922, + "grad_norm": 0.5467169284820557, + "learning_rate": 7.275212636695019e-05, + "loss": 1.4043, + "step": 4495 + }, + { + "epoch": 1.3646987403247837, + "grad_norm": 0.4802314043045044, + "learning_rate": 7.27460510328068e-05, + "loss": 1.6738, + "step": 4496 + }, + { + "epoch": 1.365002276521475, + "grad_norm": 0.50968998670578, + "learning_rate": 7.273997569866343e-05, + "loss": 1.7424, + "step": 4497 + }, + { + "epoch": 1.3653058127181668, + "grad_norm": 0.5726447701454163, + "learning_rate": 7.273390036452005e-05, + "loss": 1.8105, + "step": 4498 + }, + { + "epoch": 1.365609348914858, + "grad_norm": 0.6234380006790161, + "learning_rate": 7.272782503037667e-05, + "loss": 1.688, + "step": 4499 + }, + { + "epoch": 1.3659128851115496, + "grad_norm": 0.4061424136161804, + "learning_rate": 7.27217496962333e-05, + "loss": 1.8207, + "step": 4500 + }, + { + "epoch": 1.366216421308241, + "grad_norm": 0.5762432813644409, + "learning_rate": 7.271567436208992e-05, + "loss": 1.6701, + "step": 4501 + }, + { + "epoch": 1.3665199575049325, + "grad_norm": 0.63739013671875, + "learning_rate": 7.270959902794653e-05, + "loss": 1.0446, + "step": 4502 + }, + { + "epoch": 1.366823493701624, + "grad_norm": 0.47659537196159363, + "learning_rate": 7.270352369380316e-05, + "loss": 1.6165, + "step": 4503 + }, + { + "epoch": 1.3671270298983154, + "grad_norm": 0.4327382445335388, + "learning_rate": 7.269744835965978e-05, + "loss": 1.7651, + "step": 4504 + }, + { + "epoch": 1.3674305660950068, + "grad_norm": 0.4395967423915863, + "learning_rate": 7.269137302551641e-05, + "loss": 1.7153, + "step": 4505 + }, + { + "epoch": 1.3677341022916982, + "grad_norm": 0.4870195686817169, + "learning_rate": 7.268529769137303e-05, + "loss": 1.6897, + "step": 4506 + }, + { + "epoch": 1.3680376384883897, + "grad_norm": 0.46602505445480347, + "learning_rate": 7.267922235722965e-05, + "loss": 1.8504, + "step": 4507 + }, + { + "epoch": 1.368341174685081, + "grad_norm": 0.7441166639328003, + "learning_rate": 7.267314702308628e-05, + "loss": 1.4581, + "step": 4508 + }, + { + "epoch": 1.3686447108817728, + "grad_norm": 0.6213736534118652, + "learning_rate": 7.26670716889429e-05, + "loss": 1.4608, + "step": 4509 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.6059181690216064, + "learning_rate": 7.266099635479951e-05, + "loss": 1.8415, + "step": 4510 + }, + { + "epoch": 1.3692517832751556, + "grad_norm": 0.9757100343704224, + "learning_rate": 7.265492102065614e-05, + "loss": 1.2277, + "step": 4511 + }, + { + "epoch": 1.369555319471847, + "grad_norm": 0.46525654196739197, + "learning_rate": 7.264884568651276e-05, + "loss": 1.5944, + "step": 4512 + }, + { + "epoch": 1.3698588556685385, + "grad_norm": 0.5008564591407776, + "learning_rate": 7.264277035236938e-05, + "loss": 1.8998, + "step": 4513 + }, + { + "epoch": 1.37016239186523, + "grad_norm": 0.44502413272857666, + "learning_rate": 7.263669501822601e-05, + "loss": 1.6404, + "step": 4514 + }, + { + "epoch": 1.3704659280619214, + "grad_norm": 0.5428724884986877, + "learning_rate": 7.263061968408263e-05, + "loss": 1.6454, + "step": 4515 + }, + { + "epoch": 1.3707694642586128, + "grad_norm": 0.47314050793647766, + "learning_rate": 7.262454434993924e-05, + "loss": 1.6914, + "step": 4516 + }, + { + "epoch": 1.3710730004553042, + "grad_norm": 0.574695885181427, + "learning_rate": 7.261846901579587e-05, + "loss": 1.7187, + "step": 4517 + }, + { + "epoch": 1.3713765366519959, + "grad_norm": 0.5672833919525146, + "learning_rate": 7.261239368165249e-05, + "loss": 1.7231, + "step": 4518 + }, + { + "epoch": 1.371680072848687, + "grad_norm": 0.46130290627479553, + "learning_rate": 7.260631834750912e-05, + "loss": 1.2445, + "step": 4519 + }, + { + "epoch": 1.3719836090453787, + "grad_norm": 0.5272838473320007, + "learning_rate": 7.260024301336574e-05, + "loss": 1.8076, + "step": 4520 + }, + { + "epoch": 1.3722871452420702, + "grad_norm": 0.47636985778808594, + "learning_rate": 7.259416767922236e-05, + "loss": 1.8179, + "step": 4521 + }, + { + "epoch": 1.3725906814387616, + "grad_norm": 0.4305800199508667, + "learning_rate": 7.258809234507899e-05, + "loss": 1.6511, + "step": 4522 + }, + { + "epoch": 1.372894217635453, + "grad_norm": 1.3684027194976807, + "learning_rate": 7.25820170109356e-05, + "loss": 1.4146, + "step": 4523 + }, + { + "epoch": 1.3731977538321445, + "grad_norm": 0.5367915630340576, + "learning_rate": 7.257594167679222e-05, + "loss": 1.2383, + "step": 4524 + }, + { + "epoch": 1.373501290028836, + "grad_norm": 1.0694987773895264, + "learning_rate": 7.256986634264885e-05, + "loss": 1.1652, + "step": 4525 + }, + { + "epoch": 1.3738048262255274, + "grad_norm": 0.5531676411628723, + "learning_rate": 7.256379100850547e-05, + "loss": 1.3935, + "step": 4526 + }, + { + "epoch": 1.3741083624222188, + "grad_norm": 0.5630438327789307, + "learning_rate": 7.255771567436209e-05, + "loss": 1.4998, + "step": 4527 + }, + { + "epoch": 1.3744118986189102, + "grad_norm": 0.5596137642860413, + "learning_rate": 7.255164034021872e-05, + "loss": 1.3709, + "step": 4528 + }, + { + "epoch": 1.3747154348156019, + "grad_norm": 0.6031253337860107, + "learning_rate": 7.254556500607534e-05, + "loss": 1.2487, + "step": 4529 + }, + { + "epoch": 1.375018971012293, + "grad_norm": 0.662558913230896, + "learning_rate": 7.253948967193195e-05, + "loss": 1.7042, + "step": 4530 + }, + { + "epoch": 1.3753225072089847, + "grad_norm": 0.5421009659767151, + "learning_rate": 7.253341433778858e-05, + "loss": 1.7412, + "step": 4531 + }, + { + "epoch": 1.3756260434056762, + "grad_norm": 0.5114768743515015, + "learning_rate": 7.25273390036452e-05, + "loss": 1.837, + "step": 4532 + }, + { + "epoch": 1.3759295796023676, + "grad_norm": 0.5638769268989563, + "learning_rate": 7.252126366950183e-05, + "loss": 1.8487, + "step": 4533 + }, + { + "epoch": 1.376233115799059, + "grad_norm": 0.5670020580291748, + "learning_rate": 7.251518833535845e-05, + "loss": 1.5702, + "step": 4534 + }, + { + "epoch": 1.3765366519957505, + "grad_norm": 0.5094712972640991, + "learning_rate": 7.250911300121507e-05, + "loss": 1.8152, + "step": 4535 + }, + { + "epoch": 1.376840188192442, + "grad_norm": 0.5454041361808777, + "learning_rate": 7.25030376670717e-05, + "loss": 1.3741, + "step": 4536 + }, + { + "epoch": 1.3771437243891334, + "grad_norm": 0.5326266884803772, + "learning_rate": 7.249696233292832e-05, + "loss": 1.6344, + "step": 4537 + }, + { + "epoch": 1.3774472605858248, + "grad_norm": 0.4124714732170105, + "learning_rate": 7.249088699878493e-05, + "loss": 1.4858, + "step": 4538 + }, + { + "epoch": 1.3777507967825162, + "grad_norm": 0.5569986701011658, + "learning_rate": 7.248481166464156e-05, + "loss": 1.8529, + "step": 4539 + }, + { + "epoch": 1.3780543329792079, + "grad_norm": 0.5609976053237915, + "learning_rate": 7.247873633049818e-05, + "loss": 1.8264, + "step": 4540 + }, + { + "epoch": 1.3783578691758993, + "grad_norm": 0.4581679403781891, + "learning_rate": 7.24726609963548e-05, + "loss": 1.7181, + "step": 4541 + }, + { + "epoch": 1.3786614053725907, + "grad_norm": 0.551991879940033, + "learning_rate": 7.246658566221143e-05, + "loss": 0.8978, + "step": 4542 + }, + { + "epoch": 1.3789649415692822, + "grad_norm": 0.5136808156967163, + "learning_rate": 7.246051032806805e-05, + "loss": 1.3164, + "step": 4543 + }, + { + "epoch": 1.3792684777659736, + "grad_norm": 0.5180895328521729, + "learning_rate": 7.245443499392466e-05, + "loss": 1.7857, + "step": 4544 + }, + { + "epoch": 1.379572013962665, + "grad_norm": 0.6461337208747864, + "learning_rate": 7.24483596597813e-05, + "loss": 1.1507, + "step": 4545 + }, + { + "epoch": 1.3798755501593565, + "grad_norm": 0.5065274834632874, + "learning_rate": 7.244228432563791e-05, + "loss": 2.0678, + "step": 4546 + }, + { + "epoch": 1.380179086356048, + "grad_norm": 0.5826600790023804, + "learning_rate": 7.243620899149453e-05, + "loss": 1.8561, + "step": 4547 + }, + { + "epoch": 1.3804826225527393, + "grad_norm": 0.8745676279067993, + "learning_rate": 7.243013365735116e-05, + "loss": 1.3648, + "step": 4548 + }, + { + "epoch": 1.380786158749431, + "grad_norm": 0.5953565239906311, + "learning_rate": 7.242405832320778e-05, + "loss": 1.7645, + "step": 4549 + }, + { + "epoch": 1.3810896949461222, + "grad_norm": 0.5085800290107727, + "learning_rate": 7.241798298906441e-05, + "loss": 1.5934, + "step": 4550 + }, + { + "epoch": 1.3813932311428139, + "grad_norm": 0.5552278161048889, + "learning_rate": 7.241190765492103e-05, + "loss": 1.8317, + "step": 4551 + }, + { + "epoch": 1.3816967673395053, + "grad_norm": 0.7417198419570923, + "learning_rate": 7.240583232077764e-05, + "loss": 1.1408, + "step": 4552 + }, + { + "epoch": 1.3820003035361967, + "grad_norm": 0.5697821378707886, + "learning_rate": 7.239975698663427e-05, + "loss": 1.7124, + "step": 4553 + }, + { + "epoch": 1.3823038397328882, + "grad_norm": 0.49888259172439575, + "learning_rate": 7.239368165249089e-05, + "loss": 1.3308, + "step": 4554 + }, + { + "epoch": 1.3826073759295796, + "grad_norm": 0.7289189100265503, + "learning_rate": 7.238760631834751e-05, + "loss": 1.4048, + "step": 4555 + }, + { + "epoch": 1.382910912126271, + "grad_norm": 0.49783867597579956, + "learning_rate": 7.238153098420414e-05, + "loss": 1.7252, + "step": 4556 + }, + { + "epoch": 1.3832144483229625, + "grad_norm": 0.5540681481361389, + "learning_rate": 7.237545565006076e-05, + "loss": 1.456, + "step": 4557 + }, + { + "epoch": 1.383517984519654, + "grad_norm": 0.5310572385787964, + "learning_rate": 7.236938031591737e-05, + "loss": 2.0569, + "step": 4558 + }, + { + "epoch": 1.3838215207163453, + "grad_norm": 0.6051456332206726, + "learning_rate": 7.2363304981774e-05, + "loss": 1.6324, + "step": 4559 + }, + { + "epoch": 1.384125056913037, + "grad_norm": 0.39043620228767395, + "learning_rate": 7.235722964763062e-05, + "loss": 0.9041, + "step": 4560 + }, + { + "epoch": 1.3844285931097282, + "grad_norm": 0.518295168876648, + "learning_rate": 7.235115431348724e-05, + "loss": 1.509, + "step": 4561 + }, + { + "epoch": 1.3847321293064199, + "grad_norm": 0.4797629415988922, + "learning_rate": 7.234507897934387e-05, + "loss": 1.8273, + "step": 4562 + }, + { + "epoch": 1.3850356655031113, + "grad_norm": 0.5168799757957458, + "learning_rate": 7.233900364520049e-05, + "loss": 1.7992, + "step": 4563 + }, + { + "epoch": 1.3853392016998027, + "grad_norm": 0.4868592619895935, + "learning_rate": 7.233292831105712e-05, + "loss": 2.075, + "step": 4564 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.52605140209198, + "learning_rate": 7.232685297691372e-05, + "loss": 1.7701, + "step": 4565 + }, + { + "epoch": 1.3859462740931856, + "grad_norm": 0.5345576405525208, + "learning_rate": 7.232077764277035e-05, + "loss": 1.3641, + "step": 4566 + }, + { + "epoch": 1.386249810289877, + "grad_norm": 0.5572211742401123, + "learning_rate": 7.231470230862698e-05, + "loss": 1.5284, + "step": 4567 + }, + { + "epoch": 1.3865533464865685, + "grad_norm": 0.5272506475448608, + "learning_rate": 7.23086269744836e-05, + "loss": 1.4096, + "step": 4568 + }, + { + "epoch": 1.38685688268326, + "grad_norm": 17.82926368713379, + "learning_rate": 7.230255164034022e-05, + "loss": 1.8241, + "step": 4569 + }, + { + "epoch": 1.3871604188799513, + "grad_norm": 2.107409715652466, + "learning_rate": 7.229647630619685e-05, + "loss": 2.2869, + "step": 4570 + }, + { + "epoch": 1.387463955076643, + "grad_norm": 0.4298652708530426, + "learning_rate": 7.229040097205347e-05, + "loss": 1.2312, + "step": 4571 + }, + { + "epoch": 1.3877674912733342, + "grad_norm": 0.5333446860313416, + "learning_rate": 7.228432563791008e-05, + "loss": 1.3574, + "step": 4572 + }, + { + "epoch": 1.3880710274700259, + "grad_norm": 0.465961217880249, + "learning_rate": 7.227825030376672e-05, + "loss": 1.8196, + "step": 4573 + }, + { + "epoch": 1.3883745636667173, + "grad_norm": 0.5217798948287964, + "learning_rate": 7.227217496962333e-05, + "loss": 1.8073, + "step": 4574 + }, + { + "epoch": 1.3886780998634087, + "grad_norm": 1.037131428718567, + "learning_rate": 7.226609963547995e-05, + "loss": 1.6744, + "step": 4575 + }, + { + "epoch": 1.3889816360601002, + "grad_norm": 0.8267009854316711, + "learning_rate": 7.226002430133658e-05, + "loss": 1.8694, + "step": 4576 + }, + { + "epoch": 1.3892851722567916, + "grad_norm": 0.6034876108169556, + "learning_rate": 7.22539489671932e-05, + "loss": 1.6975, + "step": 4577 + }, + { + "epoch": 1.389588708453483, + "grad_norm": 0.4525824189186096, + "learning_rate": 7.224787363304983e-05, + "loss": 1.994, + "step": 4578 + }, + { + "epoch": 1.3898922446501745, + "grad_norm": 0.6136592626571655, + "learning_rate": 7.224179829890643e-05, + "loss": 1.7241, + "step": 4579 + }, + { + "epoch": 1.3901957808468661, + "grad_norm": 0.6351958513259888, + "learning_rate": 7.223572296476306e-05, + "loss": 1.722, + "step": 4580 + }, + { + "epoch": 1.3904993170435573, + "grad_norm": 0.4822506010532379, + "learning_rate": 7.22296476306197e-05, + "loss": 1.5643, + "step": 4581 + }, + { + "epoch": 1.390802853240249, + "grad_norm": 0.5029870271682739, + "learning_rate": 7.222357229647631e-05, + "loss": 1.6093, + "step": 4582 + }, + { + "epoch": 1.3911063894369404, + "grad_norm": 0.45101577043533325, + "learning_rate": 7.221749696233293e-05, + "loss": 1.6051, + "step": 4583 + }, + { + "epoch": 1.3914099256336319, + "grad_norm": 0.8177331686019897, + "learning_rate": 7.221142162818956e-05, + "loss": 1.5745, + "step": 4584 + }, + { + "epoch": 1.3917134618303233, + "grad_norm": 0.3580580949783325, + "learning_rate": 7.220534629404618e-05, + "loss": 1.3952, + "step": 4585 + }, + { + "epoch": 1.3920169980270147, + "grad_norm": 0.48784148693084717, + "learning_rate": 7.21992709599028e-05, + "loss": 1.3858, + "step": 4586 + }, + { + "epoch": 1.3923205342237062, + "grad_norm": 0.4869060516357422, + "learning_rate": 7.219319562575943e-05, + "loss": 1.477, + "step": 4587 + }, + { + "epoch": 1.3926240704203976, + "grad_norm": 0.5329310297966003, + "learning_rate": 7.218712029161604e-05, + "loss": 1.8107, + "step": 4588 + }, + { + "epoch": 1.392927606617089, + "grad_norm": 0.5233326554298401, + "learning_rate": 7.218104495747266e-05, + "loss": 1.6642, + "step": 4589 + }, + { + "epoch": 1.3932311428137805, + "grad_norm": 0.6266956925392151, + "learning_rate": 7.217496962332929e-05, + "loss": 1.7805, + "step": 4590 + }, + { + "epoch": 1.3935346790104721, + "grad_norm": 0.5174371004104614, + "learning_rate": 7.216889428918591e-05, + "loss": 1.4675, + "step": 4591 + }, + { + "epoch": 1.3938382152071633, + "grad_norm": 0.5754081606864929, + "learning_rate": 7.216281895504254e-05, + "loss": 2.0404, + "step": 4592 + }, + { + "epoch": 1.394141751403855, + "grad_norm": 0.4270972013473511, + "learning_rate": 7.215674362089914e-05, + "loss": 1.4554, + "step": 4593 + }, + { + "epoch": 1.3944452876005464, + "grad_norm": 0.604895830154419, + "learning_rate": 7.215066828675577e-05, + "loss": 1.4743, + "step": 4594 + }, + { + "epoch": 1.3947488237972379, + "grad_norm": 0.5406295657157898, + "learning_rate": 7.21445929526124e-05, + "loss": 1.5569, + "step": 4595 + }, + { + "epoch": 1.3950523599939293, + "grad_norm": 0.5292085409164429, + "learning_rate": 7.213851761846902e-05, + "loss": 1.4687, + "step": 4596 + }, + { + "epoch": 1.3953558961906207, + "grad_norm": 0.5510256886482239, + "learning_rate": 7.213244228432564e-05, + "loss": 1.4113, + "step": 4597 + }, + { + "epoch": 1.3956594323873122, + "grad_norm": 0.5336708426475525, + "learning_rate": 7.212636695018227e-05, + "loss": 1.6209, + "step": 4598 + }, + { + "epoch": 1.3959629685840036, + "grad_norm": 0.5480448603630066, + "learning_rate": 7.212029161603889e-05, + "loss": 1.613, + "step": 4599 + }, + { + "epoch": 1.396266504780695, + "grad_norm": 0.49696651101112366, + "learning_rate": 7.21142162818955e-05, + "loss": 1.5235, + "step": 4600 + }, + { + "epoch": 1.3965700409773865, + "grad_norm": 0.5844339728355408, + "learning_rate": 7.210814094775214e-05, + "loss": 1.6704, + "step": 4601 + }, + { + "epoch": 1.3968735771740781, + "grad_norm": 0.5771584510803223, + "learning_rate": 7.210206561360875e-05, + "loss": 1.3323, + "step": 4602 + }, + { + "epoch": 1.3971771133707693, + "grad_norm": 0.5130957365036011, + "learning_rate": 7.209599027946537e-05, + "loss": 1.4892, + "step": 4603 + }, + { + "epoch": 1.397480649567461, + "grad_norm": 0.47314324975013733, + "learning_rate": 7.2089914945322e-05, + "loss": 1.7793, + "step": 4604 + }, + { + "epoch": 1.3977841857641524, + "grad_norm": 0.4471718966960907, + "learning_rate": 7.208383961117862e-05, + "loss": 1.7329, + "step": 4605 + }, + { + "epoch": 1.3980877219608439, + "grad_norm": 0.5555965900421143, + "learning_rate": 7.207776427703525e-05, + "loss": 1.574, + "step": 4606 + }, + { + "epoch": 1.3983912581575353, + "grad_norm": 0.5500729084014893, + "learning_rate": 7.207168894289185e-05, + "loss": 1.5674, + "step": 4607 + }, + { + "epoch": 1.3986947943542267, + "grad_norm": 0.5009059906005859, + "learning_rate": 7.206561360874848e-05, + "loss": 1.7574, + "step": 4608 + }, + { + "epoch": 1.3989983305509182, + "grad_norm": 0.5441694259643555, + "learning_rate": 7.205953827460511e-05, + "loss": 1.4509, + "step": 4609 + }, + { + "epoch": 1.3993018667476096, + "grad_norm": 0.4883042573928833, + "learning_rate": 7.205346294046172e-05, + "loss": 1.4991, + "step": 4610 + }, + { + "epoch": 1.399605402944301, + "grad_norm": 0.5141351222991943, + "learning_rate": 7.204738760631835e-05, + "loss": 1.4975, + "step": 4611 + }, + { + "epoch": 1.3999089391409925, + "grad_norm": 0.5896480083465576, + "learning_rate": 7.204131227217498e-05, + "loss": 1.4992, + "step": 4612 + }, + { + "epoch": 1.4002124753376841, + "grad_norm": 0.7314833402633667, + "learning_rate": 7.20352369380316e-05, + "loss": 0.8182, + "step": 4613 + }, + { + "epoch": 1.4005160115343755, + "grad_norm": 0.5311540961265564, + "learning_rate": 7.202916160388821e-05, + "loss": 1.5784, + "step": 4614 + }, + { + "epoch": 1.400819547731067, + "grad_norm": 0.5259482264518738, + "learning_rate": 7.202308626974485e-05, + "loss": 1.6322, + "step": 4615 + }, + { + "epoch": 1.4011230839277584, + "grad_norm": 0.5433558821678162, + "learning_rate": 7.201701093560146e-05, + "loss": 1.3337, + "step": 4616 + }, + { + "epoch": 1.4014266201244499, + "grad_norm": 0.6455905437469482, + "learning_rate": 7.201093560145808e-05, + "loss": 1.9787, + "step": 4617 + }, + { + "epoch": 1.4017301563211413, + "grad_norm": 0.5605013966560364, + "learning_rate": 7.200486026731471e-05, + "loss": 1.4333, + "step": 4618 + }, + { + "epoch": 1.4020336925178327, + "grad_norm": 0.5594799518585205, + "learning_rate": 7.199878493317133e-05, + "loss": 1.7032, + "step": 4619 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.49050381779670715, + "learning_rate": 7.199270959902796e-05, + "loss": 1.8006, + "step": 4620 + }, + { + "epoch": 1.4026407649112156, + "grad_norm": 0.601580798625946, + "learning_rate": 7.198663426488456e-05, + "loss": 1.5474, + "step": 4621 + }, + { + "epoch": 1.4029443011079072, + "grad_norm": 0.6719093918800354, + "learning_rate": 7.19805589307412e-05, + "loss": 1.6041, + "step": 4622 + }, + { + "epoch": 1.4032478373045985, + "grad_norm": 0.5384756922721863, + "learning_rate": 7.197448359659782e-05, + "loss": 1.5229, + "step": 4623 + }, + { + "epoch": 1.4035513735012901, + "grad_norm": 0.52190101146698, + "learning_rate": 7.196840826245443e-05, + "loss": 1.7249, + "step": 4624 + }, + { + "epoch": 1.4038549096979815, + "grad_norm": 0.47463634610176086, + "learning_rate": 7.196233292831106e-05, + "loss": 1.6262, + "step": 4625 + }, + { + "epoch": 1.404158445894673, + "grad_norm": 0.48097196221351624, + "learning_rate": 7.195625759416769e-05, + "loss": 1.7884, + "step": 4626 + }, + { + "epoch": 1.4044619820913644, + "grad_norm": 0.42945027351379395, + "learning_rate": 7.195018226002431e-05, + "loss": 1.2333, + "step": 4627 + }, + { + "epoch": 1.4047655182880558, + "grad_norm": 0.49355918169021606, + "learning_rate": 7.194410692588092e-05, + "loss": 1.4478, + "step": 4628 + }, + { + "epoch": 1.4050690544847473, + "grad_norm": 0.5966702699661255, + "learning_rate": 7.193803159173756e-05, + "loss": 1.0146, + "step": 4629 + }, + { + "epoch": 1.4053725906814387, + "grad_norm": 0.524314820766449, + "learning_rate": 7.193195625759417e-05, + "loss": 1.6238, + "step": 4630 + }, + { + "epoch": 1.4056761268781301, + "grad_norm": 0.4699951708316803, + "learning_rate": 7.192588092345079e-05, + "loss": 1.3799, + "step": 4631 + }, + { + "epoch": 1.4059796630748216, + "grad_norm": 0.531152069568634, + "learning_rate": 7.191980558930742e-05, + "loss": 2.0555, + "step": 4632 + }, + { + "epoch": 1.4062831992715132, + "grad_norm": 0.5721887946128845, + "learning_rate": 7.191373025516404e-05, + "loss": 1.339, + "step": 4633 + }, + { + "epoch": 1.4065867354682045, + "grad_norm": 0.39430439472198486, + "learning_rate": 7.190765492102066e-05, + "loss": 1.4839, + "step": 4634 + }, + { + "epoch": 1.406890271664896, + "grad_norm": 0.6990336775779724, + "learning_rate": 7.190157958687727e-05, + "loss": 1.6197, + "step": 4635 + }, + { + "epoch": 1.4071938078615875, + "grad_norm": 0.4952123463153839, + "learning_rate": 7.18955042527339e-05, + "loss": 1.6727, + "step": 4636 + }, + { + "epoch": 1.407497344058279, + "grad_norm": 0.5764554738998413, + "learning_rate": 7.188942891859053e-05, + "loss": 2.0462, + "step": 4637 + }, + { + "epoch": 1.4078008802549704, + "grad_norm": 0.6098769903182983, + "learning_rate": 7.188335358444714e-05, + "loss": 1.8601, + "step": 4638 + }, + { + "epoch": 1.4081044164516618, + "grad_norm": 0.6008402109146118, + "learning_rate": 7.187727825030377e-05, + "loss": 1.6561, + "step": 4639 + }, + { + "epoch": 1.4084079526483533, + "grad_norm": 0.5050938129425049, + "learning_rate": 7.18712029161604e-05, + "loss": 1.8548, + "step": 4640 + }, + { + "epoch": 1.4087114888450447, + "grad_norm": 0.5040962100028992, + "learning_rate": 7.186512758201702e-05, + "loss": 1.8083, + "step": 4641 + }, + { + "epoch": 1.4090150250417361, + "grad_norm": 0.5517044067382812, + "learning_rate": 7.185905224787363e-05, + "loss": 1.1551, + "step": 4642 + }, + { + "epoch": 1.4093185612384276, + "grad_norm": 0.562538743019104, + "learning_rate": 7.185297691373027e-05, + "loss": 1.4542, + "step": 4643 + }, + { + "epoch": 1.4096220974351192, + "grad_norm": 0.9561034440994263, + "learning_rate": 7.184690157958688e-05, + "loss": 1.39, + "step": 4644 + }, + { + "epoch": 1.4099256336318107, + "grad_norm": 0.5557636618614197, + "learning_rate": 7.18408262454435e-05, + "loss": 1.9666, + "step": 4645 + }, + { + "epoch": 1.410229169828502, + "grad_norm": 0.5418885350227356, + "learning_rate": 7.183475091130013e-05, + "loss": 1.4215, + "step": 4646 + }, + { + "epoch": 1.4105327060251935, + "grad_norm": 0.4758126139640808, + "learning_rate": 7.182867557715675e-05, + "loss": 1.6912, + "step": 4647 + }, + { + "epoch": 1.410836242221885, + "grad_norm": 0.5334361791610718, + "learning_rate": 7.182260024301337e-05, + "loss": 1.645, + "step": 4648 + }, + { + "epoch": 1.4111397784185764, + "grad_norm": 0.5273361802101135, + "learning_rate": 7.181652490886998e-05, + "loss": 1.5353, + "step": 4649 + }, + { + "epoch": 1.4114433146152678, + "grad_norm": 0.5110300183296204, + "learning_rate": 7.181044957472661e-05, + "loss": 1.952, + "step": 4650 + }, + { + "epoch": 1.4117468508119593, + "grad_norm": 0.49933740496635437, + "learning_rate": 7.180437424058324e-05, + "loss": 1.8396, + "step": 4651 + }, + { + "epoch": 1.4120503870086507, + "grad_norm": 0.6788985133171082, + "learning_rate": 7.179829890643985e-05, + "loss": 1.8449, + "step": 4652 + }, + { + "epoch": 1.4123539232053424, + "grad_norm": 0.5057569146156311, + "learning_rate": 7.179222357229648e-05, + "loss": 1.832, + "step": 4653 + }, + { + "epoch": 1.4126574594020336, + "grad_norm": 0.44517782330513, + "learning_rate": 7.178614823815311e-05, + "loss": 1.6484, + "step": 4654 + }, + { + "epoch": 1.4129609955987252, + "grad_norm": 0.557269275188446, + "learning_rate": 7.178007290400973e-05, + "loss": 1.3732, + "step": 4655 + }, + { + "epoch": 1.4132645317954167, + "grad_norm": 0.47099003195762634, + "learning_rate": 7.177399756986634e-05, + "loss": 1.8153, + "step": 4656 + }, + { + "epoch": 1.413568067992108, + "grad_norm": 0.5688049793243408, + "learning_rate": 7.176792223572298e-05, + "loss": 1.5837, + "step": 4657 + }, + { + "epoch": 1.4138716041887995, + "grad_norm": 0.4637562930583954, + "learning_rate": 7.176184690157959e-05, + "loss": 1.0759, + "step": 4658 + }, + { + "epoch": 1.414175140385491, + "grad_norm": 0.5533589720726013, + "learning_rate": 7.175577156743621e-05, + "loss": 1.8164, + "step": 4659 + }, + { + "epoch": 1.4144786765821824, + "grad_norm": 0.5231189727783203, + "learning_rate": 7.174969623329283e-05, + "loss": 1.6957, + "step": 4660 + }, + { + "epoch": 1.4147822127788738, + "grad_norm": 0.49182331562042236, + "learning_rate": 7.174362089914946e-05, + "loss": 1.776, + "step": 4661 + }, + { + "epoch": 1.4150857489755653, + "grad_norm": 0.5737461447715759, + "learning_rate": 7.173754556500608e-05, + "loss": 1.2501, + "step": 4662 + }, + { + "epoch": 1.4153892851722567, + "grad_norm": 0.4491937756538391, + "learning_rate": 7.173147023086269e-05, + "loss": 1.1878, + "step": 4663 + }, + { + "epoch": 1.4156928213689484, + "grad_norm": 0.6204633712768555, + "learning_rate": 7.172539489671932e-05, + "loss": 1.2013, + "step": 4664 + }, + { + "epoch": 1.4159963575656396, + "grad_norm": 0.4740472435951233, + "learning_rate": 7.171931956257595e-05, + "loss": 2.0967, + "step": 4665 + }, + { + "epoch": 1.4162998937623312, + "grad_norm": 0.42531633377075195, + "learning_rate": 7.171324422843256e-05, + "loss": 1.247, + "step": 4666 + }, + { + "epoch": 1.4166034299590227, + "grad_norm": 0.7472957968711853, + "learning_rate": 7.170716889428919e-05, + "loss": 1.5081, + "step": 4667 + }, + { + "epoch": 1.416906966155714, + "grad_norm": 0.5398672223091125, + "learning_rate": 7.170109356014582e-05, + "loss": 2.0357, + "step": 4668 + }, + { + "epoch": 1.4172105023524055, + "grad_norm": 0.4912889301776886, + "learning_rate": 7.169501822600244e-05, + "loss": 1.4911, + "step": 4669 + }, + { + "epoch": 1.417514038549097, + "grad_norm": 0.5321101546287537, + "learning_rate": 7.168894289185905e-05, + "loss": 1.8128, + "step": 4670 + }, + { + "epoch": 1.4178175747457884, + "grad_norm": 0.5869212746620178, + "learning_rate": 7.168286755771569e-05, + "loss": 1.5963, + "step": 4671 + }, + { + "epoch": 1.4181211109424798, + "grad_norm": 0.5631369948387146, + "learning_rate": 7.16767922235723e-05, + "loss": 1.6133, + "step": 4672 + }, + { + "epoch": 1.4184246471391713, + "grad_norm": 0.6509801149368286, + "learning_rate": 7.167071688942892e-05, + "loss": 1.5701, + "step": 4673 + }, + { + "epoch": 1.4187281833358627, + "grad_norm": 0.5732050538063049, + "learning_rate": 7.166464155528554e-05, + "loss": 1.4822, + "step": 4674 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.5432000756263733, + "learning_rate": 7.165856622114217e-05, + "loss": 1.6423, + "step": 4675 + }, + { + "epoch": 1.4193352557292458, + "grad_norm": 0.4711626172065735, + "learning_rate": 7.165249088699879e-05, + "loss": 1.8018, + "step": 4676 + }, + { + "epoch": 1.4196387919259372, + "grad_norm": 0.6901485323905945, + "learning_rate": 7.16464155528554e-05, + "loss": 1.3306, + "step": 4677 + }, + { + "epoch": 1.4199423281226287, + "grad_norm": 0.4604353606700897, + "learning_rate": 7.164034021871203e-05, + "loss": 1.6195, + "step": 4678 + }, + { + "epoch": 1.42024586431932, + "grad_norm": 0.5561215281486511, + "learning_rate": 7.163426488456866e-05, + "loss": 1.4401, + "step": 4679 + }, + { + "epoch": 1.4205494005160115, + "grad_norm": 0.566388726234436, + "learning_rate": 7.162818955042527e-05, + "loss": 1.5088, + "step": 4680 + }, + { + "epoch": 1.420852936712703, + "grad_norm": 0.4941454827785492, + "learning_rate": 7.16221142162819e-05, + "loss": 1.7022, + "step": 4681 + }, + { + "epoch": 1.4211564729093944, + "grad_norm": 0.5353823900222778, + "learning_rate": 7.161603888213853e-05, + "loss": 1.4053, + "step": 4682 + }, + { + "epoch": 1.4214600091060858, + "grad_norm": 0.5283215045928955, + "learning_rate": 7.160996354799513e-05, + "loss": 1.9388, + "step": 4683 + }, + { + "epoch": 1.4217635453027775, + "grad_norm": 0.46900656819343567, + "learning_rate": 7.160388821385176e-05, + "loss": 1.5299, + "step": 4684 + }, + { + "epoch": 1.4220670814994687, + "grad_norm": 0.5333936810493469, + "learning_rate": 7.15978128797084e-05, + "loss": 1.5468, + "step": 4685 + }, + { + "epoch": 1.4223706176961604, + "grad_norm": 0.4555998742580414, + "learning_rate": 7.159173754556501e-05, + "loss": 1.677, + "step": 4686 + }, + { + "epoch": 1.4226741538928518, + "grad_norm": 0.5287268161773682, + "learning_rate": 7.158566221142163e-05, + "loss": 1.5348, + "step": 4687 + }, + { + "epoch": 1.4229776900895432, + "grad_norm": 0.489156037569046, + "learning_rate": 7.157958687727825e-05, + "loss": 1.8835, + "step": 4688 + }, + { + "epoch": 1.4232812262862347, + "grad_norm": 0.5219280123710632, + "learning_rate": 7.157351154313488e-05, + "loss": 1.8132, + "step": 4689 + }, + { + "epoch": 1.423584762482926, + "grad_norm": 0.5453715920448303, + "learning_rate": 7.15674362089915e-05, + "loss": 1.4375, + "step": 4690 + }, + { + "epoch": 1.4238882986796175, + "grad_norm": 0.5377395749092102, + "learning_rate": 7.156136087484811e-05, + "loss": 1.4851, + "step": 4691 + }, + { + "epoch": 1.424191834876309, + "grad_norm": 0.5290038585662842, + "learning_rate": 7.155528554070474e-05, + "loss": 1.9343, + "step": 4692 + }, + { + "epoch": 1.4244953710730004, + "grad_norm": 0.5442641973495483, + "learning_rate": 7.154921020656137e-05, + "loss": 1.3488, + "step": 4693 + }, + { + "epoch": 1.4247989072696918, + "grad_norm": 0.6024147272109985, + "learning_rate": 7.154313487241798e-05, + "loss": 1.6288, + "step": 4694 + }, + { + "epoch": 1.4251024434663835, + "grad_norm": 0.5711460113525391, + "learning_rate": 7.153705953827461e-05, + "loss": 1.3639, + "step": 4695 + }, + { + "epoch": 1.4254059796630747, + "grad_norm": 0.5421019196510315, + "learning_rate": 7.153098420413124e-05, + "loss": 1.923, + "step": 4696 + }, + { + "epoch": 1.4257095158597664, + "grad_norm": 0.5579157471656799, + "learning_rate": 7.152490886998784e-05, + "loss": 1.5599, + "step": 4697 + }, + { + "epoch": 1.4260130520564578, + "grad_norm": 0.515901505947113, + "learning_rate": 7.151883353584447e-05, + "loss": 1.6635, + "step": 4698 + }, + { + "epoch": 1.4263165882531492, + "grad_norm": 0.5151125192642212, + "learning_rate": 7.15127582017011e-05, + "loss": 1.7712, + "step": 4699 + }, + { + "epoch": 1.4266201244498407, + "grad_norm": 0.4949184060096741, + "learning_rate": 7.150668286755772e-05, + "loss": 1.9386, + "step": 4700 + }, + { + "epoch": 1.426923660646532, + "grad_norm": 0.5237774848937988, + "learning_rate": 7.150060753341434e-05, + "loss": 1.6551, + "step": 4701 + }, + { + "epoch": 1.4272271968432235, + "grad_norm": 0.45499083399772644, + "learning_rate": 7.149453219927096e-05, + "loss": 1.5892, + "step": 4702 + }, + { + "epoch": 1.427530733039915, + "grad_norm": 0.5620893239974976, + "learning_rate": 7.148845686512759e-05, + "loss": 1.5941, + "step": 4703 + }, + { + "epoch": 1.4278342692366064, + "grad_norm": 0.5575366616249084, + "learning_rate": 7.14823815309842e-05, + "loss": 1.7663, + "step": 4704 + }, + { + "epoch": 1.4281378054332978, + "grad_norm": 0.5817383527755737, + "learning_rate": 7.147630619684082e-05, + "loss": 1.0529, + "step": 4705 + }, + { + "epoch": 1.4284413416299895, + "grad_norm": 0.4917854964733124, + "learning_rate": 7.147023086269745e-05, + "loss": 1.8866, + "step": 4706 + }, + { + "epoch": 1.428744877826681, + "grad_norm": 0.5297862887382507, + "learning_rate": 7.146415552855407e-05, + "loss": 1.9065, + "step": 4707 + }, + { + "epoch": 1.4290484140233723, + "grad_norm": 0.4822104275226593, + "learning_rate": 7.145808019441069e-05, + "loss": 2.1447, + "step": 4708 + }, + { + "epoch": 1.4293519502200638, + "grad_norm": 0.5464652180671692, + "learning_rate": 7.145200486026732e-05, + "loss": 1.5848, + "step": 4709 + }, + { + "epoch": 1.4296554864167552, + "grad_norm": 0.541857123374939, + "learning_rate": 7.144592952612395e-05, + "loss": 1.8056, + "step": 4710 + }, + { + "epoch": 1.4299590226134467, + "grad_norm": 0.5261391401290894, + "learning_rate": 7.143985419198055e-05, + "loss": 1.6894, + "step": 4711 + }, + { + "epoch": 1.430262558810138, + "grad_norm": 0.6953042149543762, + "learning_rate": 7.143377885783718e-05, + "loss": 1.5562, + "step": 4712 + }, + { + "epoch": 1.4305660950068295, + "grad_norm": 1.253004789352417, + "learning_rate": 7.142770352369382e-05, + "loss": 1.6115, + "step": 4713 + }, + { + "epoch": 1.430869631203521, + "grad_norm": 0.46701014041900635, + "learning_rate": 7.142162818955043e-05, + "loss": 1.8057, + "step": 4714 + }, + { + "epoch": 1.4311731674002126, + "grad_norm": 0.4235196113586426, + "learning_rate": 7.141555285540705e-05, + "loss": 1.6542, + "step": 4715 + }, + { + "epoch": 1.4314767035969038, + "grad_norm": 0.6919435858726501, + "learning_rate": 7.140947752126367e-05, + "loss": 1.8055, + "step": 4716 + }, + { + "epoch": 1.4317802397935955, + "grad_norm": 0.5037835836410522, + "learning_rate": 7.14034021871203e-05, + "loss": 1.7444, + "step": 4717 + }, + { + "epoch": 1.432083775990287, + "grad_norm": 0.5179030299186707, + "learning_rate": 7.139732685297692e-05, + "loss": 0.9057, + "step": 4718 + }, + { + "epoch": 1.4323873121869783, + "grad_norm": 0.4623833894729614, + "learning_rate": 7.139125151883353e-05, + "loss": 1.8318, + "step": 4719 + }, + { + "epoch": 1.4326908483836698, + "grad_norm": 0.6244356632232666, + "learning_rate": 7.138517618469016e-05, + "loss": 1.7593, + "step": 4720 + }, + { + "epoch": 1.4329943845803612, + "grad_norm": 0.41617628931999207, + "learning_rate": 7.137910085054678e-05, + "loss": 1.3169, + "step": 4721 + }, + { + "epoch": 1.4332979207770526, + "grad_norm": 0.5794664025306702, + "learning_rate": 7.13730255164034e-05, + "loss": 1.4539, + "step": 4722 + }, + { + "epoch": 1.433601456973744, + "grad_norm": 0.5115159749984741, + "learning_rate": 7.136695018226003e-05, + "loss": 1.5652, + "step": 4723 + }, + { + "epoch": 1.4339049931704355, + "grad_norm": 0.5601288080215454, + "learning_rate": 7.136087484811666e-05, + "loss": 1.3252, + "step": 4724 + }, + { + "epoch": 1.434208529367127, + "grad_norm": 0.5465983152389526, + "learning_rate": 7.135479951397326e-05, + "loss": 1.8357, + "step": 4725 + }, + { + "epoch": 1.4345120655638186, + "grad_norm": 0.5939611792564392, + "learning_rate": 7.13487241798299e-05, + "loss": 1.1053, + "step": 4726 + }, + { + "epoch": 1.4348156017605098, + "grad_norm": 0.563441276550293, + "learning_rate": 7.134264884568653e-05, + "loss": 1.9001, + "step": 4727 + }, + { + "epoch": 1.4351191379572015, + "grad_norm": 0.5361672043800354, + "learning_rate": 7.133657351154314e-05, + "loss": 1.4157, + "step": 4728 + }, + { + "epoch": 1.435422674153893, + "grad_norm": 0.4924670159816742, + "learning_rate": 7.133049817739976e-05, + "loss": 1.7192, + "step": 4729 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.5592833757400513, + "learning_rate": 7.132442284325638e-05, + "loss": 1.4881, + "step": 4730 + }, + { + "epoch": 1.4360297465472758, + "grad_norm": 0.47412756085395813, + "learning_rate": 7.131834750911301e-05, + "loss": 2.1238, + "step": 4731 + }, + { + "epoch": 1.4363332827439672, + "grad_norm": 0.5320108532905579, + "learning_rate": 7.131227217496963e-05, + "loss": 1.4834, + "step": 4732 + }, + { + "epoch": 1.4366368189406586, + "grad_norm": 0.4866894483566284, + "learning_rate": 7.130619684082624e-05, + "loss": 1.6534, + "step": 4733 + }, + { + "epoch": 1.43694035513735, + "grad_norm": 0.4984651207923889, + "learning_rate": 7.130012150668287e-05, + "loss": 1.576, + "step": 4734 + }, + { + "epoch": 1.4372438913340415, + "grad_norm": 0.4538702368736267, + "learning_rate": 7.129404617253949e-05, + "loss": 1.8083, + "step": 4735 + }, + { + "epoch": 1.437547427530733, + "grad_norm": 0.5471993684768677, + "learning_rate": 7.128797083839611e-05, + "loss": 1.5235, + "step": 4736 + }, + { + "epoch": 1.4378509637274246, + "grad_norm": 0.4857175350189209, + "learning_rate": 7.128189550425274e-05, + "loss": 1.8666, + "step": 4737 + }, + { + "epoch": 1.4381544999241158, + "grad_norm": 0.4341141879558563, + "learning_rate": 7.127582017010937e-05, + "loss": 0.8605, + "step": 4738 + }, + { + "epoch": 1.4384580361208075, + "grad_norm": 0.5297701954841614, + "learning_rate": 7.126974483596597e-05, + "loss": 2.0106, + "step": 4739 + }, + { + "epoch": 1.438761572317499, + "grad_norm": 0.44098925590515137, + "learning_rate": 7.12636695018226e-05, + "loss": 1.3884, + "step": 4740 + }, + { + "epoch": 1.4390651085141903, + "grad_norm": 0.6269996762275696, + "learning_rate": 7.125759416767922e-05, + "loss": 1.7329, + "step": 4741 + }, + { + "epoch": 1.4393686447108818, + "grad_norm": 0.4201589524745941, + "learning_rate": 7.125151883353585e-05, + "loss": 1.908, + "step": 4742 + }, + { + "epoch": 1.4396721809075732, + "grad_norm": 0.49259108304977417, + "learning_rate": 7.124544349939247e-05, + "loss": 1.688, + "step": 4743 + }, + { + "epoch": 1.4399757171042646, + "grad_norm": 0.511873722076416, + "learning_rate": 7.123936816524909e-05, + "loss": 1.7732, + "step": 4744 + }, + { + "epoch": 1.440279253300956, + "grad_norm": 0.4922768771648407, + "learning_rate": 7.123329283110572e-05, + "loss": 1.721, + "step": 4745 + }, + { + "epoch": 1.4405827894976477, + "grad_norm": 0.4309976398944855, + "learning_rate": 7.122721749696234e-05, + "loss": 1.722, + "step": 4746 + }, + { + "epoch": 1.440886325694339, + "grad_norm": 0.5664550065994263, + "learning_rate": 7.122114216281895e-05, + "loss": 1.7154, + "step": 4747 + }, + { + "epoch": 1.4411898618910306, + "grad_norm": 0.5660502910614014, + "learning_rate": 7.121506682867558e-05, + "loss": 1.6217, + "step": 4748 + }, + { + "epoch": 1.441493398087722, + "grad_norm": 0.48245811462402344, + "learning_rate": 7.12089914945322e-05, + "loss": 1.7066, + "step": 4749 + }, + { + "epoch": 1.4417969342844135, + "grad_norm": 0.5408527255058289, + "learning_rate": 7.120291616038882e-05, + "loss": 1.4672, + "step": 4750 + }, + { + "epoch": 1.442100470481105, + "grad_norm": 0.542116105556488, + "learning_rate": 7.119684082624545e-05, + "loss": 1.842, + "step": 4751 + }, + { + "epoch": 1.4424040066777963, + "grad_norm": 0.6647868752479553, + "learning_rate": 7.119076549210208e-05, + "loss": 1.6458, + "step": 4752 + }, + { + "epoch": 1.4427075428744878, + "grad_norm": 0.5036889910697937, + "learning_rate": 7.118469015795868e-05, + "loss": 1.3188, + "step": 4753 + }, + { + "epoch": 1.4430110790711792, + "grad_norm": 0.5360887050628662, + "learning_rate": 7.117861482381531e-05, + "loss": 1.5876, + "step": 4754 + }, + { + "epoch": 1.4433146152678706, + "grad_norm": 0.45022231340408325, + "learning_rate": 7.117253948967193e-05, + "loss": 1.5266, + "step": 4755 + }, + { + "epoch": 1.443618151464562, + "grad_norm": 0.5475982427597046, + "learning_rate": 7.116646415552855e-05, + "loss": 1.476, + "step": 4756 + }, + { + "epoch": 1.4439216876612537, + "grad_norm": 0.615437924861908, + "learning_rate": 7.116038882138518e-05, + "loss": 1.6777, + "step": 4757 + }, + { + "epoch": 1.444225223857945, + "grad_norm": 0.5551777482032776, + "learning_rate": 7.11543134872418e-05, + "loss": 1.4589, + "step": 4758 + }, + { + "epoch": 1.4445287600546366, + "grad_norm": 0.636407196521759, + "learning_rate": 7.114823815309843e-05, + "loss": 2.1606, + "step": 4759 + }, + { + "epoch": 1.444832296251328, + "grad_norm": 0.5781602263450623, + "learning_rate": 7.114216281895505e-05, + "loss": 1.3962, + "step": 4760 + }, + { + "epoch": 1.4451358324480195, + "grad_norm": 0.612797737121582, + "learning_rate": 7.113608748481166e-05, + "loss": 1.5954, + "step": 4761 + }, + { + "epoch": 1.445439368644711, + "grad_norm": 0.5090053677558899, + "learning_rate": 7.11300121506683e-05, + "loss": 1.8134, + "step": 4762 + }, + { + "epoch": 1.4457429048414023, + "grad_norm": 0.49501726031303406, + "learning_rate": 7.112393681652491e-05, + "loss": 1.8646, + "step": 4763 + }, + { + "epoch": 1.4460464410380938, + "grad_norm": 0.44436565041542053, + "learning_rate": 7.111786148238153e-05, + "loss": 1.431, + "step": 4764 + }, + { + "epoch": 1.4463499772347852, + "grad_norm": 0.5765573978424072, + "learning_rate": 7.111178614823816e-05, + "loss": 1.6304, + "step": 4765 + }, + { + "epoch": 1.4466535134314766, + "grad_norm": 0.5061953663825989, + "learning_rate": 7.110571081409479e-05, + "loss": 1.905, + "step": 4766 + }, + { + "epoch": 1.446957049628168, + "grad_norm": 0.5362197756767273, + "learning_rate": 7.10996354799514e-05, + "loss": 1.8781, + "step": 4767 + }, + { + "epoch": 1.4472605858248597, + "grad_norm": 0.5062169432640076, + "learning_rate": 7.109356014580802e-05, + "loss": 1.7436, + "step": 4768 + }, + { + "epoch": 1.447564122021551, + "grad_norm": 0.4389554560184479, + "learning_rate": 7.108748481166464e-05, + "loss": 1.1959, + "step": 4769 + }, + { + "epoch": 1.4478676582182426, + "grad_norm": 0.5079648494720459, + "learning_rate": 7.108140947752126e-05, + "loss": 1.8988, + "step": 4770 + }, + { + "epoch": 1.448171194414934, + "grad_norm": 0.5056853294372559, + "learning_rate": 7.107533414337789e-05, + "loss": 1.3921, + "step": 4771 + }, + { + "epoch": 1.4484747306116255, + "grad_norm": 0.4805181622505188, + "learning_rate": 7.106925880923451e-05, + "loss": 1.6137, + "step": 4772 + }, + { + "epoch": 1.448778266808317, + "grad_norm": 0.43535923957824707, + "learning_rate": 7.106318347509114e-05, + "loss": 1.619, + "step": 4773 + }, + { + "epoch": 1.4490818030050083, + "grad_norm": 0.5391502976417542, + "learning_rate": 7.105710814094776e-05, + "loss": 1.637, + "step": 4774 + }, + { + "epoch": 1.4493853392016998, + "grad_norm": 0.569017767906189, + "learning_rate": 7.105103280680437e-05, + "loss": 1.6042, + "step": 4775 + }, + { + "epoch": 1.4496888753983912, + "grad_norm": 0.5475939512252808, + "learning_rate": 7.1044957472661e-05, + "loss": 1.9324, + "step": 4776 + }, + { + "epoch": 1.4499924115950826, + "grad_norm": 0.5549785494804382, + "learning_rate": 7.103888213851762e-05, + "loss": 1.7943, + "step": 4777 + }, + { + "epoch": 1.450295947791774, + "grad_norm": 0.5545724034309387, + "learning_rate": 7.103280680437424e-05, + "loss": 1.4165, + "step": 4778 + }, + { + "epoch": 1.4505994839884657, + "grad_norm": 0.4446735978126526, + "learning_rate": 7.102673147023087e-05, + "loss": 0.9686, + "step": 4779 + }, + { + "epoch": 1.4509030201851572, + "grad_norm": 0.5210545063018799, + "learning_rate": 7.102065613608749e-05, + "loss": 1.802, + "step": 4780 + }, + { + "epoch": 1.4512065563818486, + "grad_norm": 0.5372775793075562, + "learning_rate": 7.10145808019441e-05, + "loss": 1.8198, + "step": 4781 + }, + { + "epoch": 1.45151009257854, + "grad_norm": 0.513157069683075, + "learning_rate": 7.100850546780073e-05, + "loss": 1.8384, + "step": 4782 + }, + { + "epoch": 1.4518136287752315, + "grad_norm": 0.5085527896881104, + "learning_rate": 7.100243013365735e-05, + "loss": 1.4529, + "step": 4783 + }, + { + "epoch": 1.452117164971923, + "grad_norm": 0.5514604449272156, + "learning_rate": 7.099635479951397e-05, + "loss": 1.4275, + "step": 4784 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.6645312905311584, + "learning_rate": 7.09902794653706e-05, + "loss": 1.3341, + "step": 4785 + }, + { + "epoch": 1.4527242373653058, + "grad_norm": 0.4445338249206543, + "learning_rate": 7.098420413122722e-05, + "loss": 1.8186, + "step": 4786 + }, + { + "epoch": 1.4530277735619972, + "grad_norm": 0.528165340423584, + "learning_rate": 7.097812879708385e-05, + "loss": 1.4959, + "step": 4787 + }, + { + "epoch": 1.4533313097586888, + "grad_norm": 0.5748735666275024, + "learning_rate": 7.097205346294047e-05, + "loss": 1.8661, + "step": 4788 + }, + { + "epoch": 1.45363484595538, + "grad_norm": 0.4958101212978363, + "learning_rate": 7.096597812879708e-05, + "loss": 1.7738, + "step": 4789 + }, + { + "epoch": 1.4539383821520717, + "grad_norm": 0.622242271900177, + "learning_rate": 7.095990279465371e-05, + "loss": 1.8896, + "step": 4790 + }, + { + "epoch": 1.4542419183487632, + "grad_norm": 0.861436665058136, + "learning_rate": 7.095382746051033e-05, + "loss": 1.5506, + "step": 4791 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.5317303538322449, + "learning_rate": 7.094775212636695e-05, + "loss": 1.9661, + "step": 4792 + }, + { + "epoch": 1.454848990742146, + "grad_norm": 0.42405256628990173, + "learning_rate": 7.094167679222358e-05, + "loss": 1.8856, + "step": 4793 + }, + { + "epoch": 1.4551525269388375, + "grad_norm": 0.5652716755867004, + "learning_rate": 7.09356014580802e-05, + "loss": 1.5874, + "step": 4794 + }, + { + "epoch": 1.4554560631355289, + "grad_norm": 0.46071988344192505, + "learning_rate": 7.092952612393681e-05, + "loss": 1.7498, + "step": 4795 + }, + { + "epoch": 1.4557595993322203, + "grad_norm": 0.5258306264877319, + "learning_rate": 7.092345078979345e-05, + "loss": 1.5423, + "step": 4796 + }, + { + "epoch": 1.4560631355289118, + "grad_norm": 0.5660673975944519, + "learning_rate": 7.091737545565006e-05, + "loss": 1.4623, + "step": 4797 + }, + { + "epoch": 1.4563666717256032, + "grad_norm": 0.5659142732620239, + "learning_rate": 7.091130012150668e-05, + "loss": 1.9051, + "step": 4798 + }, + { + "epoch": 1.4566702079222948, + "grad_norm": 0.537135899066925, + "learning_rate": 7.090522478736331e-05, + "loss": 1.4598, + "step": 4799 + }, + { + "epoch": 1.456973744118986, + "grad_norm": 0.5714205503463745, + "learning_rate": 7.089914945321993e-05, + "loss": 1.9371, + "step": 4800 + }, + { + "epoch": 1.4572772803156777, + "grad_norm": 0.47099271416664124, + "learning_rate": 7.089307411907656e-05, + "loss": 2.0266, + "step": 4801 + }, + { + "epoch": 1.4575808165123691, + "grad_norm": 0.46086302399635315, + "learning_rate": 7.088699878493318e-05, + "loss": 1.0404, + "step": 4802 + }, + { + "epoch": 1.4578843527090606, + "grad_norm": 0.5183219313621521, + "learning_rate": 7.08809234507898e-05, + "loss": 1.4641, + "step": 4803 + }, + { + "epoch": 1.458187888905752, + "grad_norm": 0.3814326226711273, + "learning_rate": 7.087484811664642e-05, + "loss": 1.1975, + "step": 4804 + }, + { + "epoch": 1.4584914251024435, + "grad_norm": 0.5242198705673218, + "learning_rate": 7.086877278250304e-05, + "loss": 1.5483, + "step": 4805 + }, + { + "epoch": 1.4587949612991349, + "grad_norm": 0.4313776195049286, + "learning_rate": 7.086269744835966e-05, + "loss": 1.7125, + "step": 4806 + }, + { + "epoch": 1.4590984974958263, + "grad_norm": 0.5048007965087891, + "learning_rate": 7.085662211421629e-05, + "loss": 1.7111, + "step": 4807 + }, + { + "epoch": 1.4594020336925178, + "grad_norm": 0.5413311123847961, + "learning_rate": 7.085054678007291e-05, + "loss": 1.3843, + "step": 4808 + }, + { + "epoch": 1.4597055698892092, + "grad_norm": 0.4879262447357178, + "learning_rate": 7.084447144592952e-05, + "loss": 1.4675, + "step": 4809 + }, + { + "epoch": 1.4600091060859008, + "grad_norm": 0.4626248776912689, + "learning_rate": 7.083839611178616e-05, + "loss": 1.8313, + "step": 4810 + }, + { + "epoch": 1.4603126422825923, + "grad_norm": 0.4955083727836609, + "learning_rate": 7.083232077764277e-05, + "loss": 1.5718, + "step": 4811 + }, + { + "epoch": 1.4606161784792837, + "grad_norm": 0.5106918215751648, + "learning_rate": 7.082624544349939e-05, + "loss": 1.8617, + "step": 4812 + }, + { + "epoch": 1.4609197146759751, + "grad_norm": 0.5505479574203491, + "learning_rate": 7.082017010935602e-05, + "loss": 1.2744, + "step": 4813 + }, + { + "epoch": 1.4612232508726666, + "grad_norm": 0.5480737686157227, + "learning_rate": 7.081409477521264e-05, + "loss": 1.6127, + "step": 4814 + }, + { + "epoch": 1.461526787069358, + "grad_norm": 0.5669567584991455, + "learning_rate": 7.080801944106927e-05, + "loss": 1.4748, + "step": 4815 + }, + { + "epoch": 1.4618303232660494, + "grad_norm": 0.47782737016677856, + "learning_rate": 7.080194410692589e-05, + "loss": 1.9775, + "step": 4816 + }, + { + "epoch": 1.4621338594627409, + "grad_norm": 0.5649453401565552, + "learning_rate": 7.07958687727825e-05, + "loss": 1.7222, + "step": 4817 + }, + { + "epoch": 1.4624373956594323, + "grad_norm": 0.5078287720680237, + "learning_rate": 7.078979343863913e-05, + "loss": 1.5271, + "step": 4818 + }, + { + "epoch": 1.462740931856124, + "grad_norm": 0.5010936856269836, + "learning_rate": 7.078371810449575e-05, + "loss": 1.6845, + "step": 4819 + }, + { + "epoch": 1.4630444680528152, + "grad_norm": 0.4189418852329254, + "learning_rate": 7.077764277035237e-05, + "loss": 1.6715, + "step": 4820 + }, + { + "epoch": 1.4633480042495068, + "grad_norm": 0.46101582050323486, + "learning_rate": 7.0771567436209e-05, + "loss": 1.9044, + "step": 4821 + }, + { + "epoch": 1.4636515404461983, + "grad_norm": 0.45041772723197937, + "learning_rate": 7.076549210206562e-05, + "loss": 1.666, + "step": 4822 + }, + { + "epoch": 1.4639550766428897, + "grad_norm": 0.49406835436820984, + "learning_rate": 7.075941676792223e-05, + "loss": 1.8148, + "step": 4823 + }, + { + "epoch": 1.4642586128395811, + "grad_norm": 0.5963855981826782, + "learning_rate": 7.075334143377887e-05, + "loss": 1.389, + "step": 4824 + }, + { + "epoch": 1.4645621490362726, + "grad_norm": 0.6109672784805298, + "learning_rate": 7.074726609963548e-05, + "loss": 1.5109, + "step": 4825 + }, + { + "epoch": 1.464865685232964, + "grad_norm": 0.627612829208374, + "learning_rate": 7.07411907654921e-05, + "loss": 1.721, + "step": 4826 + }, + { + "epoch": 1.4651692214296554, + "grad_norm": 0.5766866207122803, + "learning_rate": 7.073511543134873e-05, + "loss": 1.6085, + "step": 4827 + }, + { + "epoch": 1.4654727576263469, + "grad_norm": 0.5267140865325928, + "learning_rate": 7.072904009720535e-05, + "loss": 1.4195, + "step": 4828 + }, + { + "epoch": 1.4657762938230383, + "grad_norm": 0.7349816560745239, + "learning_rate": 7.072296476306197e-05, + "loss": 1.2837, + "step": 4829 + }, + { + "epoch": 1.46607983001973, + "grad_norm": 0.47294366359710693, + "learning_rate": 7.07168894289186e-05, + "loss": 1.6431, + "step": 4830 + }, + { + "epoch": 1.4663833662164212, + "grad_norm": 0.7959756255149841, + "learning_rate": 7.071081409477521e-05, + "loss": 1.7113, + "step": 4831 + }, + { + "epoch": 1.4666869024131128, + "grad_norm": 0.5271220803260803, + "learning_rate": 7.070473876063184e-05, + "loss": 1.7415, + "step": 4832 + }, + { + "epoch": 1.4669904386098043, + "grad_norm": 0.5845800042152405, + "learning_rate": 7.069866342648846e-05, + "loss": 1.6222, + "step": 4833 + }, + { + "epoch": 1.4672939748064957, + "grad_norm": 0.5617759227752686, + "learning_rate": 7.069258809234508e-05, + "loss": 1.3666, + "step": 4834 + }, + { + "epoch": 1.4675975110031871, + "grad_norm": 0.489835262298584, + "learning_rate": 7.068651275820171e-05, + "loss": 1.6019, + "step": 4835 + }, + { + "epoch": 1.4679010471998786, + "grad_norm": 0.6882174611091614, + "learning_rate": 7.068043742405833e-05, + "loss": 1.8398, + "step": 4836 + }, + { + "epoch": 1.46820458339657, + "grad_norm": 0.5395167469978333, + "learning_rate": 7.067436208991494e-05, + "loss": 1.7218, + "step": 4837 + }, + { + "epoch": 1.4685081195932614, + "grad_norm": 0.5546643137931824, + "learning_rate": 7.066828675577158e-05, + "loss": 1.6591, + "step": 4838 + }, + { + "epoch": 1.4688116557899529, + "grad_norm": 0.4737721085548401, + "learning_rate": 7.066221142162819e-05, + "loss": 1.8764, + "step": 4839 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.5098763108253479, + "learning_rate": 7.065613608748481e-05, + "loss": 1.4366, + "step": 4840 + }, + { + "epoch": 1.469418728183336, + "grad_norm": 0.49989405274391174, + "learning_rate": 7.065006075334144e-05, + "loss": 1.5866, + "step": 4841 + }, + { + "epoch": 1.4697222643800274, + "grad_norm": 0.4978291690349579, + "learning_rate": 7.064398541919806e-05, + "loss": 1.7242, + "step": 4842 + }, + { + "epoch": 1.4700258005767188, + "grad_norm": 0.5850281715393066, + "learning_rate": 7.063791008505468e-05, + "loss": 1.5802, + "step": 4843 + }, + { + "epoch": 1.4703293367734103, + "grad_norm": 0.806866466999054, + "learning_rate": 7.06318347509113e-05, + "loss": 1.3167, + "step": 4844 + }, + { + "epoch": 1.4706328729701017, + "grad_norm": 0.7724255323410034, + "learning_rate": 7.062575941676792e-05, + "loss": 1.7092, + "step": 4845 + }, + { + "epoch": 1.4709364091667931, + "grad_norm": 0.5280294418334961, + "learning_rate": 7.061968408262455e-05, + "loss": 1.4041, + "step": 4846 + }, + { + "epoch": 1.4712399453634846, + "grad_norm": 0.5118871927261353, + "learning_rate": 7.061360874848117e-05, + "loss": 1.6343, + "step": 4847 + }, + { + "epoch": 1.471543481560176, + "grad_norm": 0.6602824330329895, + "learning_rate": 7.060753341433779e-05, + "loss": 1.4313, + "step": 4848 + }, + { + "epoch": 1.4718470177568674, + "grad_norm": 0.5540229082107544, + "learning_rate": 7.060145808019442e-05, + "loss": 1.2642, + "step": 4849 + }, + { + "epoch": 1.472150553953559, + "grad_norm": 0.5670307278633118, + "learning_rate": 7.059538274605104e-05, + "loss": 1.7549, + "step": 4850 + }, + { + "epoch": 1.4724540901502503, + "grad_norm": 0.5450428128242493, + "learning_rate": 7.058930741190765e-05, + "loss": 1.5367, + "step": 4851 + }, + { + "epoch": 1.472757626346942, + "grad_norm": 0.5904386043548584, + "learning_rate": 7.058323207776429e-05, + "loss": 1.815, + "step": 4852 + }, + { + "epoch": 1.4730611625436334, + "grad_norm": 0.5121384263038635, + "learning_rate": 7.05771567436209e-05, + "loss": 1.443, + "step": 4853 + }, + { + "epoch": 1.4733646987403248, + "grad_norm": 0.49516379833221436, + "learning_rate": 7.057108140947752e-05, + "loss": 1.2342, + "step": 4854 + }, + { + "epoch": 1.4736682349370163, + "grad_norm": 0.5796108245849609, + "learning_rate": 7.056500607533415e-05, + "loss": 1.4407, + "step": 4855 + }, + { + "epoch": 1.4739717711337077, + "grad_norm": 0.5850707292556763, + "learning_rate": 7.055893074119077e-05, + "loss": 1.9238, + "step": 4856 + }, + { + "epoch": 1.4742753073303991, + "grad_norm": 0.6012457013130188, + "learning_rate": 7.055285540704739e-05, + "loss": 1.9955, + "step": 4857 + }, + { + "epoch": 1.4745788435270906, + "grad_norm": 0.5519123077392578, + "learning_rate": 7.054678007290402e-05, + "loss": 1.6369, + "step": 4858 + }, + { + "epoch": 1.474882379723782, + "grad_norm": 0.7996104955673218, + "learning_rate": 7.054070473876063e-05, + "loss": 1.6808, + "step": 4859 + }, + { + "epoch": 1.4751859159204734, + "grad_norm": 0.6222035884857178, + "learning_rate": 7.053462940461726e-05, + "loss": 1.5358, + "step": 4860 + }, + { + "epoch": 1.475489452117165, + "grad_norm": 0.6133623719215393, + "learning_rate": 7.052855407047388e-05, + "loss": 1.1753, + "step": 4861 + }, + { + "epoch": 1.4757929883138563, + "grad_norm": 0.5412503480911255, + "learning_rate": 7.05224787363305e-05, + "loss": 1.344, + "step": 4862 + }, + { + "epoch": 1.476096524510548, + "grad_norm": 0.490725576877594, + "learning_rate": 7.051640340218713e-05, + "loss": 1.3589, + "step": 4863 + }, + { + "epoch": 1.4764000607072394, + "grad_norm": 0.5937108397483826, + "learning_rate": 7.051032806804375e-05, + "loss": 1.7992, + "step": 4864 + }, + { + "epoch": 1.4767035969039308, + "grad_norm": 0.4251902401447296, + "learning_rate": 7.050425273390036e-05, + "loss": 1.221, + "step": 4865 + }, + { + "epoch": 1.4770071331006223, + "grad_norm": 0.5891363620758057, + "learning_rate": 7.0498177399757e-05, + "loss": 1.5272, + "step": 4866 + }, + { + "epoch": 1.4773106692973137, + "grad_norm": 0.47068464756011963, + "learning_rate": 7.049210206561361e-05, + "loss": 1.2203, + "step": 4867 + }, + { + "epoch": 1.4776142054940051, + "grad_norm": 0.5927923917770386, + "learning_rate": 7.048602673147023e-05, + "loss": 1.8061, + "step": 4868 + }, + { + "epoch": 1.4779177416906966, + "grad_norm": 0.5405677556991577, + "learning_rate": 7.047995139732686e-05, + "loss": 1.0712, + "step": 4869 + }, + { + "epoch": 1.478221277887388, + "grad_norm": 0.5127203464508057, + "learning_rate": 7.047387606318348e-05, + "loss": 1.9416, + "step": 4870 + }, + { + "epoch": 1.4785248140840794, + "grad_norm": 0.7339115738868713, + "learning_rate": 7.04678007290401e-05, + "loss": 1.679, + "step": 4871 + }, + { + "epoch": 1.478828350280771, + "grad_norm": 0.3953307271003723, + "learning_rate": 7.046172539489673e-05, + "loss": 1.4667, + "step": 4872 + }, + { + "epoch": 1.4791318864774623, + "grad_norm": 0.5734744668006897, + "learning_rate": 7.045565006075334e-05, + "loss": 1.3896, + "step": 4873 + }, + { + "epoch": 1.479435422674154, + "grad_norm": 0.47078248858451843, + "learning_rate": 7.044957472660997e-05, + "loss": 1.3958, + "step": 4874 + }, + { + "epoch": 1.4797389588708454, + "grad_norm": 0.698595404624939, + "learning_rate": 7.044349939246659e-05, + "loss": 1.5145, + "step": 4875 + }, + { + "epoch": 1.4800424950675368, + "grad_norm": 0.45018211007118225, + "learning_rate": 7.043742405832321e-05, + "loss": 1.8364, + "step": 4876 + }, + { + "epoch": 1.4803460312642283, + "grad_norm": 0.5878861546516418, + "learning_rate": 7.043134872417984e-05, + "loss": 2.0177, + "step": 4877 + }, + { + "epoch": 1.4806495674609197, + "grad_norm": 0.5218092799186707, + "learning_rate": 7.042527339003644e-05, + "loss": 1.6696, + "step": 4878 + }, + { + "epoch": 1.4809531036576111, + "grad_norm": 0.5571547150611877, + "learning_rate": 7.041919805589307e-05, + "loss": 1.9252, + "step": 4879 + }, + { + "epoch": 1.4812566398543026, + "grad_norm": 0.9365406036376953, + "learning_rate": 7.04131227217497e-05, + "loss": 1.6427, + "step": 4880 + }, + { + "epoch": 1.4815601760509942, + "grad_norm": 0.43765008449554443, + "learning_rate": 7.040704738760632e-05, + "loss": 1.7834, + "step": 4881 + }, + { + "epoch": 1.4818637122476854, + "grad_norm": 0.5410556197166443, + "learning_rate": 7.040097205346294e-05, + "loss": 1.4718, + "step": 4882 + }, + { + "epoch": 1.482167248444377, + "grad_norm": 0.5025068521499634, + "learning_rate": 7.039489671931957e-05, + "loss": 2.0082, + "step": 4883 + }, + { + "epoch": 1.4824707846410685, + "grad_norm": 0.46235191822052, + "learning_rate": 7.038882138517619e-05, + "loss": 1.4398, + "step": 4884 + }, + { + "epoch": 1.48277432083776, + "grad_norm": 0.45842286944389343, + "learning_rate": 7.03827460510328e-05, + "loss": 1.4679, + "step": 4885 + }, + { + "epoch": 1.4830778570344514, + "grad_norm": 0.5397442579269409, + "learning_rate": 7.037667071688944e-05, + "loss": 1.7636, + "step": 4886 + }, + { + "epoch": 1.4833813932311428, + "grad_norm": 0.8042169213294983, + "learning_rate": 7.037059538274605e-05, + "loss": 1.5971, + "step": 4887 + }, + { + "epoch": 1.4836849294278343, + "grad_norm": 0.4883686602115631, + "learning_rate": 7.036452004860268e-05, + "loss": 1.7809, + "step": 4888 + }, + { + "epoch": 1.4839884656245257, + "grad_norm": 0.5571742653846741, + "learning_rate": 7.03584447144593e-05, + "loss": 1.652, + "step": 4889 + }, + { + "epoch": 1.4842920018212171, + "grad_norm": 0.5409842729568481, + "learning_rate": 7.035236938031592e-05, + "loss": 1.7244, + "step": 4890 + }, + { + "epoch": 1.4845955380179086, + "grad_norm": 0.6065815091133118, + "learning_rate": 7.034629404617255e-05, + "loss": 1.2745, + "step": 4891 + }, + { + "epoch": 1.4848990742146002, + "grad_norm": 0.499057412147522, + "learning_rate": 7.034021871202915e-05, + "loss": 1.6754, + "step": 4892 + }, + { + "epoch": 1.4852026104112914, + "grad_norm": 0.5934293866157532, + "learning_rate": 7.033414337788578e-05, + "loss": 1.6668, + "step": 4893 + }, + { + "epoch": 1.485506146607983, + "grad_norm": 0.5210697650909424, + "learning_rate": 7.032806804374242e-05, + "loss": 1.768, + "step": 4894 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.5933223962783813, + "learning_rate": 7.032199270959903e-05, + "loss": 0.8766, + "step": 4895 + }, + { + "epoch": 1.486113219001366, + "grad_norm": 0.5411461591720581, + "learning_rate": 7.031591737545565e-05, + "loss": 1.7448, + "step": 4896 + }, + { + "epoch": 1.4864167551980574, + "grad_norm": 0.6732122302055359, + "learning_rate": 7.030984204131228e-05, + "loss": 1.9058, + "step": 4897 + }, + { + "epoch": 1.4867202913947488, + "grad_norm": 0.4998563528060913, + "learning_rate": 7.03037667071689e-05, + "loss": 1.6999, + "step": 4898 + }, + { + "epoch": 1.4870238275914403, + "grad_norm": 0.9698503613471985, + "learning_rate": 7.029769137302552e-05, + "loss": 1.5221, + "step": 4899 + }, + { + "epoch": 1.4873273637881317, + "grad_norm": 0.4289691746234894, + "learning_rate": 7.029161603888215e-05, + "loss": 1.7837, + "step": 4900 + }, + { + "epoch": 1.4876308999848231, + "grad_norm": 0.5373130440711975, + "learning_rate": 7.028554070473876e-05, + "loss": 1.2654, + "step": 4901 + }, + { + "epoch": 1.4879344361815146, + "grad_norm": 0.4715062975883484, + "learning_rate": 7.027946537059538e-05, + "loss": 1.7417, + "step": 4902 + }, + { + "epoch": 1.4882379723782062, + "grad_norm": 0.494567334651947, + "learning_rate": 7.027339003645201e-05, + "loss": 1.6927, + "step": 4903 + }, + { + "epoch": 1.4885415085748974, + "grad_norm": 0.5440135598182678, + "learning_rate": 7.026731470230863e-05, + "loss": 1.7141, + "step": 4904 + }, + { + "epoch": 1.488845044771589, + "grad_norm": 0.5883985161781311, + "learning_rate": 7.026123936816526e-05, + "loss": 1.3748, + "step": 4905 + }, + { + "epoch": 1.4891485809682805, + "grad_norm": 0.5393871068954468, + "learning_rate": 7.025516403402186e-05, + "loss": 1.6513, + "step": 4906 + }, + { + "epoch": 1.489452117164972, + "grad_norm": 0.49532851576805115, + "learning_rate": 7.02490886998785e-05, + "loss": 1.6941, + "step": 4907 + }, + { + "epoch": 1.4897556533616634, + "grad_norm": 0.5608948469161987, + "learning_rate": 7.024301336573513e-05, + "loss": 1.698, + "step": 4908 + }, + { + "epoch": 1.4900591895583548, + "grad_norm": 0.45833712816238403, + "learning_rate": 7.023693803159174e-05, + "loss": 0.855, + "step": 4909 + }, + { + "epoch": 1.4903627257550462, + "grad_norm": 0.7820297479629517, + "learning_rate": 7.023086269744836e-05, + "loss": 0.8055, + "step": 4910 + }, + { + "epoch": 1.4906662619517377, + "grad_norm": 0.531315803527832, + "learning_rate": 7.022478736330499e-05, + "loss": 1.7807, + "step": 4911 + }, + { + "epoch": 1.4909697981484293, + "grad_norm": 0.5644878149032593, + "learning_rate": 7.021871202916161e-05, + "loss": 1.6072, + "step": 4912 + }, + { + "epoch": 1.4912733343451205, + "grad_norm": 0.5558152198791504, + "learning_rate": 7.021263669501823e-05, + "loss": 1.7253, + "step": 4913 + }, + { + "epoch": 1.4915768705418122, + "grad_norm": 0.6361897587776184, + "learning_rate": 7.020656136087486e-05, + "loss": 2.008, + "step": 4914 + }, + { + "epoch": 1.4918804067385036, + "grad_norm": 0.5509002208709717, + "learning_rate": 7.020048602673147e-05, + "loss": 1.5245, + "step": 4915 + }, + { + "epoch": 1.492183942935195, + "grad_norm": 0.6185054183006287, + "learning_rate": 7.019441069258809e-05, + "loss": 1.5209, + "step": 4916 + }, + { + "epoch": 1.4924874791318865, + "grad_norm": 0.6301572918891907, + "learning_rate": 7.018833535844471e-05, + "loss": 1.6199, + "step": 4917 + }, + { + "epoch": 1.492791015328578, + "grad_norm": 0.6556615233421326, + "learning_rate": 7.018226002430134e-05, + "loss": 1.1805, + "step": 4918 + }, + { + "epoch": 1.4930945515252694, + "grad_norm": 0.6239224672317505, + "learning_rate": 7.017618469015797e-05, + "loss": 1.7463, + "step": 4919 + }, + { + "epoch": 1.4933980877219608, + "grad_norm": 0.5668457746505737, + "learning_rate": 7.017010935601457e-05, + "loss": 1.5714, + "step": 4920 + }, + { + "epoch": 1.4937016239186522, + "grad_norm": 0.4906651973724365, + "learning_rate": 7.01640340218712e-05, + "loss": 1.7403, + "step": 4921 + }, + { + "epoch": 1.4940051601153437, + "grad_norm": 0.35869914293289185, + "learning_rate": 7.015795868772784e-05, + "loss": 0.96, + "step": 4922 + }, + { + "epoch": 1.4943086963120353, + "grad_norm": 0.5741409063339233, + "learning_rate": 7.015188335358445e-05, + "loss": 1.4338, + "step": 4923 + }, + { + "epoch": 1.4946122325087265, + "grad_norm": 0.4738176763057709, + "learning_rate": 7.014580801944107e-05, + "loss": 1.511, + "step": 4924 + }, + { + "epoch": 1.4949157687054182, + "grad_norm": 0.5779879093170166, + "learning_rate": 7.01397326852977e-05, + "loss": 1.8035, + "step": 4925 + }, + { + "epoch": 1.4952193049021096, + "grad_norm": 11.4434232711792, + "learning_rate": 7.013365735115432e-05, + "loss": 1.404, + "step": 4926 + }, + { + "epoch": 1.495522841098801, + "grad_norm": 0.5170081257820129, + "learning_rate": 7.012758201701094e-05, + "loss": 1.6944, + "step": 4927 + }, + { + "epoch": 1.4958263772954925, + "grad_norm": 0.6129285097122192, + "learning_rate": 7.012150668286757e-05, + "loss": 1.411, + "step": 4928 + }, + { + "epoch": 1.496129913492184, + "grad_norm": 0.4695473313331604, + "learning_rate": 7.011543134872418e-05, + "loss": 1.9634, + "step": 4929 + }, + { + "epoch": 1.4964334496888754, + "grad_norm": 0.4601220190525055, + "learning_rate": 7.01093560145808e-05, + "loss": 1.9807, + "step": 4930 + }, + { + "epoch": 1.4967369858855668, + "grad_norm": 0.6048194766044617, + "learning_rate": 7.010328068043742e-05, + "loss": 1.666, + "step": 4931 + }, + { + "epoch": 1.4970405220822582, + "grad_norm": 0.5324653387069702, + "learning_rate": 7.009720534629405e-05, + "loss": 1.7026, + "step": 4932 + }, + { + "epoch": 1.4973440582789497, + "grad_norm": 0.5313439965248108, + "learning_rate": 7.009113001215068e-05, + "loss": 1.7723, + "step": 4933 + }, + { + "epoch": 1.4976475944756413, + "grad_norm": 0.5671830177307129, + "learning_rate": 7.008505467800728e-05, + "loss": 1.3399, + "step": 4934 + }, + { + "epoch": 1.4979511306723325, + "grad_norm": 0.5517529249191284, + "learning_rate": 7.007897934386391e-05, + "loss": 1.3371, + "step": 4935 + }, + { + "epoch": 1.4982546668690242, + "grad_norm": 0.47462624311447144, + "learning_rate": 7.007290400972055e-05, + "loss": 1.77, + "step": 4936 + }, + { + "epoch": 1.4985582030657156, + "grad_norm": 0.5272954702377319, + "learning_rate": 7.006682867557716e-05, + "loss": 1.738, + "step": 4937 + }, + { + "epoch": 1.498861739262407, + "grad_norm": 0.48636749386787415, + "learning_rate": 7.006075334143378e-05, + "loss": 1.807, + "step": 4938 + }, + { + "epoch": 1.4991652754590985, + "grad_norm": 0.4686090350151062, + "learning_rate": 7.005467800729041e-05, + "loss": 1.9367, + "step": 4939 + }, + { + "epoch": 1.49946881165579, + "grad_norm": 0.48979878425598145, + "learning_rate": 7.004860267314703e-05, + "loss": 1.3131, + "step": 4940 + }, + { + "epoch": 1.4997723478524814, + "grad_norm": 0.571522057056427, + "learning_rate": 7.004252733900365e-05, + "loss": 1.7552, + "step": 4941 + }, + { + "epoch": 1.5000758840491728, + "grad_norm": 0.4702144265174866, + "learning_rate": 7.003645200486028e-05, + "loss": 1.6307, + "step": 4942 + }, + { + "epoch": 1.5003794202458645, + "grad_norm": 0.5681264400482178, + "learning_rate": 7.00303766707169e-05, + "loss": 1.8803, + "step": 4943 + }, + { + "epoch": 1.5006829564425557, + "grad_norm": 0.5513084530830383, + "learning_rate": 7.002430133657351e-05, + "loss": 1.8452, + "step": 4944 + }, + { + "epoch": 1.5009864926392473, + "grad_norm": 0.5764619708061218, + "learning_rate": 7.001822600243013e-05, + "loss": 1.0705, + "step": 4945 + }, + { + "epoch": 1.5012900288359385, + "grad_norm": 0.5168572664260864, + "learning_rate": 7.001215066828676e-05, + "loss": 1.8198, + "step": 4946 + }, + { + "epoch": 1.5015935650326302, + "grad_norm": 1.6397596597671509, + "learning_rate": 7.000607533414339e-05, + "loss": 1.6889, + "step": 4947 + }, + { + "epoch": 1.5018971012293216, + "grad_norm": 1.1885254383087158, + "learning_rate": 7e-05, + "loss": 1.4736, + "step": 4948 + }, + { + "epoch": 1.502200637426013, + "grad_norm": 0.6035342216491699, + "learning_rate": 6.999392466585662e-05, + "loss": 1.2236, + "step": 4949 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.5453857183456421, + "learning_rate": 6.998784933171326e-05, + "loss": 1.4884, + "step": 4950 + }, + { + "epoch": 1.502807709819396, + "grad_norm": 0.6001697778701782, + "learning_rate": 6.998177399756986e-05, + "loss": 1.6897, + "step": 4951 + }, + { + "epoch": 1.5031112460160874, + "grad_norm": 0.5285768508911133, + "learning_rate": 6.997569866342649e-05, + "loss": 1.5918, + "step": 4952 + }, + { + "epoch": 1.5034147822127788, + "grad_norm": 0.5310909748077393, + "learning_rate": 6.996962332928312e-05, + "loss": 0.9908, + "step": 4953 + }, + { + "epoch": 1.5037183184094705, + "grad_norm": 0.5659394264221191, + "learning_rate": 6.996354799513974e-05, + "loss": 1.6639, + "step": 4954 + }, + { + "epoch": 1.5040218546061617, + "grad_norm": 0.49565961956977844, + "learning_rate": 6.995747266099636e-05, + "loss": 1.8041, + "step": 4955 + }, + { + "epoch": 1.5043253908028533, + "grad_norm": 0.6244103312492371, + "learning_rate": 6.995139732685299e-05, + "loss": 1.7278, + "step": 4956 + }, + { + "epoch": 1.5046289269995445, + "grad_norm": 0.5681789517402649, + "learning_rate": 6.99453219927096e-05, + "loss": 1.7712, + "step": 4957 + }, + { + "epoch": 1.5049324631962362, + "grad_norm": 0.6059290170669556, + "learning_rate": 6.993924665856622e-05, + "loss": 1.6451, + "step": 4958 + }, + { + "epoch": 1.5052359993929276, + "grad_norm": 0.5958153605461121, + "learning_rate": 6.993317132442284e-05, + "loss": 1.5677, + "step": 4959 + }, + { + "epoch": 1.505539535589619, + "grad_norm": 0.4673093855381012, + "learning_rate": 6.992709599027947e-05, + "loss": 0.9809, + "step": 4960 + }, + { + "epoch": 1.5058430717863105, + "grad_norm": 0.5468495488166809, + "learning_rate": 6.99210206561361e-05, + "loss": 1.5763, + "step": 4961 + }, + { + "epoch": 1.506146607983002, + "grad_norm": 0.48700201511383057, + "learning_rate": 6.99149453219927e-05, + "loss": 1.5329, + "step": 4962 + }, + { + "epoch": 1.5064501441796936, + "grad_norm": 0.5750671625137329, + "learning_rate": 6.990886998784933e-05, + "loss": 1.904, + "step": 4963 + }, + { + "epoch": 1.5067536803763848, + "grad_norm": 0.6705580353736877, + "learning_rate": 6.990279465370597e-05, + "loss": 1.6202, + "step": 4964 + }, + { + "epoch": 1.5070572165730765, + "grad_norm": 0.7310038208961487, + "learning_rate": 6.989671931956257e-05, + "loss": 1.9462, + "step": 4965 + }, + { + "epoch": 1.5073607527697677, + "grad_norm": 0.5241039991378784, + "learning_rate": 6.98906439854192e-05, + "loss": 0.926, + "step": 4966 + }, + { + "epoch": 1.5076642889664593, + "grad_norm": 0.5601332187652588, + "learning_rate": 6.988456865127583e-05, + "loss": 1.6858, + "step": 4967 + }, + { + "epoch": 1.5079678251631508, + "grad_norm": 0.5411149859428406, + "learning_rate": 6.987849331713245e-05, + "loss": 1.4975, + "step": 4968 + }, + { + "epoch": 1.5082713613598422, + "grad_norm": 0.512823224067688, + "learning_rate": 6.987241798298907e-05, + "loss": 1.9705, + "step": 4969 + }, + { + "epoch": 1.5085748975565336, + "grad_norm": 0.41236433386802673, + "learning_rate": 6.98663426488457e-05, + "loss": 1.7138, + "step": 4970 + }, + { + "epoch": 1.508878433753225, + "grad_norm": 1.0118792057037354, + "learning_rate": 6.986026731470231e-05, + "loss": 1.6026, + "step": 4971 + }, + { + "epoch": 1.5091819699499165, + "grad_norm": 0.4692041575908661, + "learning_rate": 6.985419198055893e-05, + "loss": 1.6974, + "step": 4972 + }, + { + "epoch": 1.509485506146608, + "grad_norm": 0.5381711721420288, + "learning_rate": 6.984811664641555e-05, + "loss": 1.5082, + "step": 4973 + }, + { + "epoch": 1.5097890423432996, + "grad_norm": 0.5390987992286682, + "learning_rate": 6.984204131227218e-05, + "loss": 1.852, + "step": 4974 + }, + { + "epoch": 1.5100925785399908, + "grad_norm": 0.5014956593513489, + "learning_rate": 6.98359659781288e-05, + "loss": 1.7055, + "step": 4975 + }, + { + "epoch": 1.5103961147366824, + "grad_norm": 0.6018857359886169, + "learning_rate": 6.982989064398541e-05, + "loss": 1.4449, + "step": 4976 + }, + { + "epoch": 1.5106996509333737, + "grad_norm": 1.7235846519470215, + "learning_rate": 6.982381530984204e-05, + "loss": 1.6223, + "step": 4977 + }, + { + "epoch": 1.5110031871300653, + "grad_norm": 0.4839053153991699, + "learning_rate": 6.981773997569868e-05, + "loss": 1.7088, + "step": 4978 + }, + { + "epoch": 1.5113067233267568, + "grad_norm": 0.5103007555007935, + "learning_rate": 6.981166464155528e-05, + "loss": 1.743, + "step": 4979 + }, + { + "epoch": 1.5116102595234482, + "grad_norm": 0.5718008279800415, + "learning_rate": 6.980558930741191e-05, + "loss": 1.5478, + "step": 4980 + }, + { + "epoch": 1.5119137957201396, + "grad_norm": 0.48067647218704224, + "learning_rate": 6.979951397326854e-05, + "loss": 1.6347, + "step": 4981 + }, + { + "epoch": 1.512217331916831, + "grad_norm": 0.5288820266723633, + "learning_rate": 6.979343863912516e-05, + "loss": 1.5188, + "step": 4982 + }, + { + "epoch": 1.5125208681135225, + "grad_norm": 0.5474231839179993, + "learning_rate": 6.978736330498178e-05, + "loss": 1.8939, + "step": 4983 + }, + { + "epoch": 1.512824404310214, + "grad_norm": 0.4533451795578003, + "learning_rate": 6.97812879708384e-05, + "loss": 1.8278, + "step": 4984 + }, + { + "epoch": 1.5131279405069056, + "grad_norm": 0.5542921423912048, + "learning_rate": 6.977521263669502e-05, + "loss": 1.8469, + "step": 4985 + }, + { + "epoch": 1.5134314767035968, + "grad_norm": 0.48920702934265137, + "learning_rate": 6.976913730255164e-05, + "loss": 1.747, + "step": 4986 + }, + { + "epoch": 1.5137350129002884, + "grad_norm": 0.5489048361778259, + "learning_rate": 6.976306196840826e-05, + "loss": 1.882, + "step": 4987 + }, + { + "epoch": 1.5140385490969797, + "grad_norm": 0.5060453414916992, + "learning_rate": 6.975698663426489e-05, + "loss": 1.9188, + "step": 4988 + }, + { + "epoch": 1.5143420852936713, + "grad_norm": 0.48653027415275574, + "learning_rate": 6.97509113001215e-05, + "loss": 1.8193, + "step": 4989 + }, + { + "epoch": 1.5146456214903627, + "grad_norm": 0.5047556757926941, + "learning_rate": 6.974483596597812e-05, + "loss": 1.7924, + "step": 4990 + }, + { + "epoch": 1.5149491576870542, + "grad_norm": 0.5091820955276489, + "learning_rate": 6.973876063183475e-05, + "loss": 1.7446, + "step": 4991 + }, + { + "epoch": 1.5152526938837456, + "grad_norm": 0.5132384300231934, + "learning_rate": 6.973268529769139e-05, + "loss": 1.6975, + "step": 4992 + }, + { + "epoch": 1.515556230080437, + "grad_norm": 0.5605453848838806, + "learning_rate": 6.972660996354799e-05, + "loss": 1.747, + "step": 4993 + }, + { + "epoch": 1.5158597662771287, + "grad_norm": 0.607867419719696, + "learning_rate": 6.972053462940462e-05, + "loss": 1.7074, + "step": 4994 + }, + { + "epoch": 1.51616330247382, + "grad_norm": 0.8664141297340393, + "learning_rate": 6.971445929526125e-05, + "loss": 1.4887, + "step": 4995 + }, + { + "epoch": 1.5164668386705116, + "grad_norm": 0.5505083799362183, + "learning_rate": 6.970838396111787e-05, + "loss": 2.0955, + "step": 4996 + }, + { + "epoch": 1.5167703748672028, + "grad_norm": 0.49877801537513733, + "learning_rate": 6.970230862697449e-05, + "loss": 1.8985, + "step": 4997 + }, + { + "epoch": 1.5170739110638944, + "grad_norm": 0.5322781801223755, + "learning_rate": 6.969623329283112e-05, + "loss": 1.7619, + "step": 4998 + }, + { + "epoch": 1.5173774472605859, + "grad_norm": 0.5772939920425415, + "learning_rate": 6.969015795868773e-05, + "loss": 1.5791, + "step": 4999 + }, + { + "epoch": 1.5176809834572773, + "grad_norm": 0.5946705341339111, + "learning_rate": 6.968408262454435e-05, + "loss": 1.4594, + "step": 5000 + }, + { + "epoch": 1.5179845196539687, + "grad_norm": 0.4682813584804535, + "learning_rate": 6.967800729040097e-05, + "loss": 1.3301, + "step": 5001 + }, + { + "epoch": 1.5182880558506602, + "grad_norm": 0.542500376701355, + "learning_rate": 6.96719319562576e-05, + "loss": 1.7164, + "step": 5002 + }, + { + "epoch": 1.5185915920473516, + "grad_norm": 0.5071382522583008, + "learning_rate": 6.966585662211422e-05, + "loss": 1.3703, + "step": 5003 + }, + { + "epoch": 1.518895128244043, + "grad_norm": 0.5770359039306641, + "learning_rate": 6.965978128797083e-05, + "loss": 1.4195, + "step": 5004 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.5445078015327454, + "learning_rate": 6.965370595382746e-05, + "loss": 1.6649, + "step": 5005 + }, + { + "epoch": 1.519502200637426, + "grad_norm": 0.5967651009559631, + "learning_rate": 6.96476306196841e-05, + "loss": 1.3306, + "step": 5006 + }, + { + "epoch": 1.5198057368341176, + "grad_norm": 0.7904636859893799, + "learning_rate": 6.96415552855407e-05, + "loss": 1.6451, + "step": 5007 + }, + { + "epoch": 1.5201092730308088, + "grad_norm": 0.5439865589141846, + "learning_rate": 6.963547995139733e-05, + "loss": 1.9616, + "step": 5008 + }, + { + "epoch": 1.5204128092275004, + "grad_norm": 0.5701196789741516, + "learning_rate": 6.962940461725396e-05, + "loss": 1.2662, + "step": 5009 + }, + { + "epoch": 1.5207163454241919, + "grad_norm": 0.5003536343574524, + "learning_rate": 6.962332928311058e-05, + "loss": 1.7916, + "step": 5010 + }, + { + "epoch": 1.5210198816208833, + "grad_norm": 0.6876890659332275, + "learning_rate": 6.96172539489672e-05, + "loss": 1.8057, + "step": 5011 + }, + { + "epoch": 1.5213234178175747, + "grad_norm": 0.48604485392570496, + "learning_rate": 6.961117861482381e-05, + "loss": 1.8073, + "step": 5012 + }, + { + "epoch": 1.5216269540142662, + "grad_norm": 0.4319641888141632, + "learning_rate": 6.960510328068044e-05, + "loss": 1.7092, + "step": 5013 + }, + { + "epoch": 1.5219304902109576, + "grad_norm": 0.49503380060195923, + "learning_rate": 6.959902794653706e-05, + "loss": 1.7313, + "step": 5014 + }, + { + "epoch": 1.522234026407649, + "grad_norm": 0.46390098333358765, + "learning_rate": 6.959295261239368e-05, + "loss": 1.7932, + "step": 5015 + }, + { + "epoch": 1.5225375626043407, + "grad_norm": 0.5236671566963196, + "learning_rate": 6.958687727825031e-05, + "loss": 1.527, + "step": 5016 + }, + { + "epoch": 1.522841098801032, + "grad_norm": 0.5421310663223267, + "learning_rate": 6.958080194410693e-05, + "loss": 1.7505, + "step": 5017 + }, + { + "epoch": 1.5231446349977236, + "grad_norm": 0.43941354751586914, + "learning_rate": 6.957472660996354e-05, + "loss": 1.1991, + "step": 5018 + }, + { + "epoch": 1.5234481711944148, + "grad_norm": 0.8650984764099121, + "learning_rate": 6.956865127582017e-05, + "loss": 1.349, + "step": 5019 + }, + { + "epoch": 1.5237517073911064, + "grad_norm": 0.6118148565292358, + "learning_rate": 6.95625759416768e-05, + "loss": 1.7518, + "step": 5020 + }, + { + "epoch": 1.5240552435877979, + "grad_norm": 0.5155607461929321, + "learning_rate": 6.955650060753341e-05, + "loss": 1.743, + "step": 5021 + }, + { + "epoch": 1.5243587797844893, + "grad_norm": 0.5593464374542236, + "learning_rate": 6.955042527339004e-05, + "loss": 2.0366, + "step": 5022 + }, + { + "epoch": 1.5246623159811807, + "grad_norm": 0.45686206221580505, + "learning_rate": 6.954434993924667e-05, + "loss": 1.3583, + "step": 5023 + }, + { + "epoch": 1.5249658521778722, + "grad_norm": 0.5127416253089905, + "learning_rate": 6.953827460510329e-05, + "loss": 1.8798, + "step": 5024 + }, + { + "epoch": 1.5252693883745638, + "grad_norm": 0.49719300866127014, + "learning_rate": 6.95321992709599e-05, + "loss": 1.8293, + "step": 5025 + }, + { + "epoch": 1.525572924571255, + "grad_norm": 0.5015152096748352, + "learning_rate": 6.952612393681652e-05, + "loss": 1.5741, + "step": 5026 + }, + { + "epoch": 1.5258764607679467, + "grad_norm": 0.7065471410751343, + "learning_rate": 6.952004860267315e-05, + "loss": 1.8697, + "step": 5027 + }, + { + "epoch": 1.526179996964638, + "grad_norm": 0.5936737656593323, + "learning_rate": 6.951397326852977e-05, + "loss": 1.6771, + "step": 5028 + }, + { + "epoch": 1.5264835331613296, + "grad_norm": 0.40442827343940735, + "learning_rate": 6.950789793438639e-05, + "loss": 1.876, + "step": 5029 + }, + { + "epoch": 1.526787069358021, + "grad_norm": 0.3835110366344452, + "learning_rate": 6.950182260024302e-05, + "loss": 1.5589, + "step": 5030 + }, + { + "epoch": 1.5270906055547124, + "grad_norm": 0.49415749311447144, + "learning_rate": 6.949574726609964e-05, + "loss": 1.2888, + "step": 5031 + }, + { + "epoch": 1.5273941417514039, + "grad_norm": 0.677581787109375, + "learning_rate": 6.948967193195625e-05, + "loss": 1.8734, + "step": 5032 + }, + { + "epoch": 1.5276976779480953, + "grad_norm": 0.573188066482544, + "learning_rate": 6.948359659781289e-05, + "loss": 1.4829, + "step": 5033 + }, + { + "epoch": 1.5280012141447867, + "grad_norm": 0.5042519569396973, + "learning_rate": 6.947752126366952e-05, + "loss": 1.7926, + "step": 5034 + }, + { + "epoch": 1.5283047503414782, + "grad_norm": 0.4527590572834015, + "learning_rate": 6.947144592952612e-05, + "loss": 1.8739, + "step": 5035 + }, + { + "epoch": 1.5286082865381698, + "grad_norm": 0.6127368211746216, + "learning_rate": 6.946537059538275e-05, + "loss": 1.6489, + "step": 5036 + }, + { + "epoch": 1.528911822734861, + "grad_norm": 0.6344648599624634, + "learning_rate": 6.945929526123938e-05, + "loss": 1.303, + "step": 5037 + }, + { + "epoch": 1.5292153589315527, + "grad_norm": 0.4736098349094391, + "learning_rate": 6.945321992709599e-05, + "loss": 1.8006, + "step": 5038 + }, + { + "epoch": 1.529518895128244, + "grad_norm": 0.5820353031158447, + "learning_rate": 6.944714459295262e-05, + "loss": 1.1874, + "step": 5039 + }, + { + "epoch": 1.5298224313249356, + "grad_norm": 0.7533172369003296, + "learning_rate": 6.944106925880923e-05, + "loss": 1.8722, + "step": 5040 + }, + { + "epoch": 1.530125967521627, + "grad_norm": 0.5187054872512817, + "learning_rate": 6.943499392466586e-05, + "loss": 1.4976, + "step": 5041 + }, + { + "epoch": 1.5304295037183184, + "grad_norm": 1.9647060632705688, + "learning_rate": 6.942891859052248e-05, + "loss": 1.6822, + "step": 5042 + }, + { + "epoch": 1.5307330399150099, + "grad_norm": 0.49814414978027344, + "learning_rate": 6.94228432563791e-05, + "loss": 1.7004, + "step": 5043 + }, + { + "epoch": 1.5310365761117013, + "grad_norm": 0.47123029828071594, + "learning_rate": 6.941676792223573e-05, + "loss": 1.4828, + "step": 5044 + }, + { + "epoch": 1.5313401123083927, + "grad_norm": 0.5213639140129089, + "learning_rate": 6.941069258809235e-05, + "loss": 1.2126, + "step": 5045 + }, + { + "epoch": 1.5316436485050842, + "grad_norm": 1.4494082927703857, + "learning_rate": 6.940461725394896e-05, + "loss": 1.6, + "step": 5046 + }, + { + "epoch": 1.5319471847017758, + "grad_norm": 0.497048556804657, + "learning_rate": 6.93985419198056e-05, + "loss": 1.7893, + "step": 5047 + }, + { + "epoch": 1.532250720898467, + "grad_norm": 0.6306423544883728, + "learning_rate": 6.939246658566221e-05, + "loss": 1.8145, + "step": 5048 + }, + { + "epoch": 1.5325542570951587, + "grad_norm": 0.5947224497795105, + "learning_rate": 6.938639125151883e-05, + "loss": 2.114, + "step": 5049 + }, + { + "epoch": 1.53285779329185, + "grad_norm": 0.5670956969261169, + "learning_rate": 6.938031591737546e-05, + "loss": 1.8438, + "step": 5050 + }, + { + "epoch": 1.5331613294885416, + "grad_norm": 0.5403228402137756, + "learning_rate": 6.937424058323209e-05, + "loss": 1.4862, + "step": 5051 + }, + { + "epoch": 1.533464865685233, + "grad_norm": 0.5355117321014404, + "learning_rate": 6.93681652490887e-05, + "loss": 1.6828, + "step": 5052 + }, + { + "epoch": 1.5337684018819244, + "grad_norm": 0.5913071036338806, + "learning_rate": 6.936208991494533e-05, + "loss": 1.0808, + "step": 5053 + }, + { + "epoch": 1.5340719380786159, + "grad_norm": 0.590332567691803, + "learning_rate": 6.935601458080194e-05, + "loss": 1.4234, + "step": 5054 + }, + { + "epoch": 1.5343754742753073, + "grad_norm": 0.5567223429679871, + "learning_rate": 6.934993924665857e-05, + "loss": 1.8105, + "step": 5055 + }, + { + "epoch": 1.534679010471999, + "grad_norm": 0.5712899565696716, + "learning_rate": 6.934386391251519e-05, + "loss": 1.8365, + "step": 5056 + }, + { + "epoch": 1.5349825466686902, + "grad_norm": 0.46898317337036133, + "learning_rate": 6.933778857837181e-05, + "loss": 1.7624, + "step": 5057 + }, + { + "epoch": 1.5352860828653818, + "grad_norm": 0.5434770584106445, + "learning_rate": 6.933171324422844e-05, + "loss": 1.8745, + "step": 5058 + }, + { + "epoch": 1.535589619062073, + "grad_norm": 0.5103024840354919, + "learning_rate": 6.932563791008506e-05, + "loss": 1.314, + "step": 5059 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.5385938882827759, + "learning_rate": 6.931956257594167e-05, + "loss": 1.4106, + "step": 5060 + }, + { + "epoch": 1.536196691455456, + "grad_norm": 0.52946537733078, + "learning_rate": 6.93134872417983e-05, + "loss": 1.7167, + "step": 5061 + }, + { + "epoch": 1.5365002276521476, + "grad_norm": 0.499805212020874, + "learning_rate": 6.930741190765492e-05, + "loss": 1.8297, + "step": 5062 + }, + { + "epoch": 1.536803763848839, + "grad_norm": 0.48632124066352844, + "learning_rate": 6.930133657351154e-05, + "loss": 1.6382, + "step": 5063 + }, + { + "epoch": 1.5371073000455304, + "grad_norm": 0.5471576452255249, + "learning_rate": 6.929526123936817e-05, + "loss": 1.9328, + "step": 5064 + }, + { + "epoch": 1.5374108362422219, + "grad_norm": 0.5234617590904236, + "learning_rate": 6.92891859052248e-05, + "loss": 1.7133, + "step": 5065 + }, + { + "epoch": 1.5377143724389133, + "grad_norm": 0.7335671186447144, + "learning_rate": 6.92831105710814e-05, + "loss": 1.5148, + "step": 5066 + }, + { + "epoch": 1.538017908635605, + "grad_norm": 0.5422836542129517, + "learning_rate": 6.927703523693804e-05, + "loss": 1.5883, + "step": 5067 + }, + { + "epoch": 1.5383214448322962, + "grad_norm": 0.5048597455024719, + "learning_rate": 6.927095990279465e-05, + "loss": 1.7012, + "step": 5068 + }, + { + "epoch": 1.5386249810289878, + "grad_norm": 0.49904757738113403, + "learning_rate": 6.926488456865128e-05, + "loss": 1.8099, + "step": 5069 + }, + { + "epoch": 1.538928517225679, + "grad_norm": 0.527485191822052, + "learning_rate": 6.92588092345079e-05, + "loss": 1.7986, + "step": 5070 + }, + { + "epoch": 1.5392320534223707, + "grad_norm": 0.5265811681747437, + "learning_rate": 6.925273390036452e-05, + "loss": 1.4898, + "step": 5071 + }, + { + "epoch": 1.5395355896190621, + "grad_norm": 0.5025148391723633, + "learning_rate": 6.924665856622115e-05, + "loss": 1.201, + "step": 5072 + }, + { + "epoch": 1.5398391258157536, + "grad_norm": 0.561477780342102, + "learning_rate": 6.924058323207777e-05, + "loss": 1.7444, + "step": 5073 + }, + { + "epoch": 1.540142662012445, + "grad_norm": 0.4805125892162323, + "learning_rate": 6.923450789793438e-05, + "loss": 1.749, + "step": 5074 + }, + { + "epoch": 1.5404461982091364, + "grad_norm": 0.5457605123519897, + "learning_rate": 6.922843256379102e-05, + "loss": 1.5691, + "step": 5075 + }, + { + "epoch": 1.5407497344058279, + "grad_norm": 0.4902309775352478, + "learning_rate": 6.922235722964763e-05, + "loss": 1.7727, + "step": 5076 + }, + { + "epoch": 1.5410532706025193, + "grad_norm": 0.505024790763855, + "learning_rate": 6.921628189550425e-05, + "loss": 1.809, + "step": 5077 + }, + { + "epoch": 1.541356806799211, + "grad_norm": 0.6486569046974182, + "learning_rate": 6.921020656136088e-05, + "loss": 1.544, + "step": 5078 + }, + { + "epoch": 1.5416603429959022, + "grad_norm": 0.6943787336349487, + "learning_rate": 6.920413122721751e-05, + "loss": 1.5817, + "step": 5079 + }, + { + "epoch": 1.5419638791925938, + "grad_norm": 0.49823305010795593, + "learning_rate": 6.919805589307412e-05, + "loss": 1.5363, + "step": 5080 + }, + { + "epoch": 1.542267415389285, + "grad_norm": 0.49246180057525635, + "learning_rate": 6.919198055893075e-05, + "loss": 1.5483, + "step": 5081 + }, + { + "epoch": 1.5425709515859767, + "grad_norm": 0.5904031991958618, + "learning_rate": 6.918590522478736e-05, + "loss": 1.3076, + "step": 5082 + }, + { + "epoch": 1.5428744877826681, + "grad_norm": 0.5842667818069458, + "learning_rate": 6.9179829890644e-05, + "loss": 1.7947, + "step": 5083 + }, + { + "epoch": 1.5431780239793595, + "grad_norm": 0.5764684677124023, + "learning_rate": 6.917375455650061e-05, + "loss": 1.6732, + "step": 5084 + }, + { + "epoch": 1.543481560176051, + "grad_norm": 0.4869197607040405, + "learning_rate": 6.916767922235723e-05, + "loss": 1.2825, + "step": 5085 + }, + { + "epoch": 1.5437850963727424, + "grad_norm": 0.4678569436073303, + "learning_rate": 6.916160388821386e-05, + "loss": 1.8523, + "step": 5086 + }, + { + "epoch": 1.5440886325694338, + "grad_norm": 0.6023167967796326, + "learning_rate": 6.915552855407048e-05, + "loss": 1.1368, + "step": 5087 + }, + { + "epoch": 1.5443921687661253, + "grad_norm": 0.6071855425834656, + "learning_rate": 6.91494532199271e-05, + "loss": 1.6182, + "step": 5088 + }, + { + "epoch": 1.544695704962817, + "grad_norm": 0.8810903429985046, + "learning_rate": 6.914337788578373e-05, + "loss": 1.9165, + "step": 5089 + }, + { + "epoch": 1.5449992411595082, + "grad_norm": 0.6049922108650208, + "learning_rate": 6.913730255164034e-05, + "loss": 1.7061, + "step": 5090 + }, + { + "epoch": 1.5453027773561998, + "grad_norm": 0.5413761138916016, + "learning_rate": 6.913122721749696e-05, + "loss": 1.8952, + "step": 5091 + }, + { + "epoch": 1.545606313552891, + "grad_norm": 0.519356906414032, + "learning_rate": 6.912515188335359e-05, + "loss": 1.7272, + "step": 5092 + }, + { + "epoch": 1.5459098497495827, + "grad_norm": 0.4805813729763031, + "learning_rate": 6.911907654921021e-05, + "loss": 1.7761, + "step": 5093 + }, + { + "epoch": 1.546213385946274, + "grad_norm": 0.5507004857063293, + "learning_rate": 6.911300121506683e-05, + "loss": 1.7849, + "step": 5094 + }, + { + "epoch": 1.5465169221429655, + "grad_norm": 0.48082712292671204, + "learning_rate": 6.910692588092346e-05, + "loss": 1.7475, + "step": 5095 + }, + { + "epoch": 1.546820458339657, + "grad_norm": 0.5184032320976257, + "learning_rate": 6.910085054678007e-05, + "loss": 1.4425, + "step": 5096 + }, + { + "epoch": 1.5471239945363484, + "grad_norm": 0.6835402250289917, + "learning_rate": 6.90947752126367e-05, + "loss": 1.7552, + "step": 5097 + }, + { + "epoch": 1.54742753073304, + "grad_norm": 0.5267919898033142, + "learning_rate": 6.908869987849332e-05, + "loss": 1.2211, + "step": 5098 + }, + { + "epoch": 1.5477310669297313, + "grad_norm": 0.7247775197029114, + "learning_rate": 6.908262454434994e-05, + "loss": 1.4436, + "step": 5099 + }, + { + "epoch": 1.548034603126423, + "grad_norm": 0.5839969515800476, + "learning_rate": 6.907654921020657e-05, + "loss": 1.7545, + "step": 5100 + }, + { + "epoch": 1.5483381393231141, + "grad_norm": 0.5333119034767151, + "learning_rate": 6.907047387606319e-05, + "loss": 1.6516, + "step": 5101 + }, + { + "epoch": 1.5486416755198058, + "grad_norm": 0.5837914943695068, + "learning_rate": 6.90643985419198e-05, + "loss": 1.7908, + "step": 5102 + }, + { + "epoch": 1.5489452117164972, + "grad_norm": 0.5100420713424683, + "learning_rate": 6.905832320777644e-05, + "loss": 1.4083, + "step": 5103 + }, + { + "epoch": 1.5492487479131887, + "grad_norm": 0.5830096006393433, + "learning_rate": 6.905224787363305e-05, + "loss": 1.7333, + "step": 5104 + }, + { + "epoch": 1.54955228410988, + "grad_norm": 1.1144078969955444, + "learning_rate": 6.904617253948967e-05, + "loss": 1.5282, + "step": 5105 + }, + { + "epoch": 1.5498558203065715, + "grad_norm": 0.41218632459640503, + "learning_rate": 6.90400972053463e-05, + "loss": 0.9582, + "step": 5106 + }, + { + "epoch": 1.550159356503263, + "grad_norm": 0.5433921813964844, + "learning_rate": 6.903402187120292e-05, + "loss": 1.8383, + "step": 5107 + }, + { + "epoch": 1.5504628926999544, + "grad_norm": 0.5715109705924988, + "learning_rate": 6.902794653705954e-05, + "loss": 1.7578, + "step": 5108 + }, + { + "epoch": 1.550766428896646, + "grad_norm": 0.5353966355323792, + "learning_rate": 6.902187120291617e-05, + "loss": 1.4237, + "step": 5109 + }, + { + "epoch": 1.5510699650933373, + "grad_norm": 0.6180744171142578, + "learning_rate": 6.901579586877278e-05, + "loss": 1.5549, + "step": 5110 + }, + { + "epoch": 1.551373501290029, + "grad_norm": 0.48517996072769165, + "learning_rate": 6.90097205346294e-05, + "loss": 1.7625, + "step": 5111 + }, + { + "epoch": 1.5516770374867201, + "grad_norm": 0.5804780721664429, + "learning_rate": 6.900364520048603e-05, + "loss": 1.7948, + "step": 5112 + }, + { + "epoch": 1.5519805736834118, + "grad_norm": 0.47300025820732117, + "learning_rate": 6.899756986634265e-05, + "loss": 1.6219, + "step": 5113 + }, + { + "epoch": 1.5522841098801032, + "grad_norm": 0.5687447190284729, + "learning_rate": 6.899149453219928e-05, + "loss": 1.7505, + "step": 5114 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.5722203254699707, + "learning_rate": 6.89854191980559e-05, + "loss": 1.7819, + "step": 5115 + }, + { + "epoch": 1.552891182273486, + "grad_norm": 0.5327666401863098, + "learning_rate": 6.897934386391251e-05, + "loss": 1.7752, + "step": 5116 + }, + { + "epoch": 1.5531947184701775, + "grad_norm": 0.5604074001312256, + "learning_rate": 6.897326852976915e-05, + "loss": 1.5922, + "step": 5117 + }, + { + "epoch": 1.553498254666869, + "grad_norm": 0.4844719469547272, + "learning_rate": 6.896719319562576e-05, + "loss": 1.3195, + "step": 5118 + }, + { + "epoch": 1.5538017908635604, + "grad_norm": 0.6321808099746704, + "learning_rate": 6.896111786148238e-05, + "loss": 1.8087, + "step": 5119 + }, + { + "epoch": 1.554105327060252, + "grad_norm": 0.49531543254852295, + "learning_rate": 6.895504252733901e-05, + "loss": 1.7925, + "step": 5120 + }, + { + "epoch": 1.5544088632569433, + "grad_norm": 0.5053551197052002, + "learning_rate": 6.894896719319563e-05, + "loss": 1.6427, + "step": 5121 + }, + { + "epoch": 1.554712399453635, + "grad_norm": 0.5420728325843811, + "learning_rate": 6.894289185905225e-05, + "loss": 1.8029, + "step": 5122 + }, + { + "epoch": 1.5550159356503261, + "grad_norm": 0.6182068586349487, + "learning_rate": 6.893681652490888e-05, + "loss": 2.2098, + "step": 5123 + }, + { + "epoch": 1.5553194718470178, + "grad_norm": 0.5669702887535095, + "learning_rate": 6.89307411907655e-05, + "loss": 1.5848, + "step": 5124 + }, + { + "epoch": 1.5556230080437092, + "grad_norm": 0.5447514653205872, + "learning_rate": 6.892466585662211e-05, + "loss": 1.3853, + "step": 5125 + }, + { + "epoch": 1.5559265442404007, + "grad_norm": 0.38322126865386963, + "learning_rate": 6.891859052247874e-05, + "loss": 1.2577, + "step": 5126 + }, + { + "epoch": 1.556230080437092, + "grad_norm": 0.5988068580627441, + "learning_rate": 6.891251518833536e-05, + "loss": 1.4318, + "step": 5127 + }, + { + "epoch": 1.5565336166337835, + "grad_norm": 0.43476438522338867, + "learning_rate": 6.890643985419199e-05, + "loss": 1.6554, + "step": 5128 + }, + { + "epoch": 1.5568371528304752, + "grad_norm": 0.5268298983573914, + "learning_rate": 6.890036452004861e-05, + "loss": 1.562, + "step": 5129 + }, + { + "epoch": 1.5571406890271664, + "grad_norm": 0.5895389914512634, + "learning_rate": 6.889428918590522e-05, + "loss": 1.5002, + "step": 5130 + }, + { + "epoch": 1.557444225223858, + "grad_norm": 0.5553779006004333, + "learning_rate": 6.888821385176186e-05, + "loss": 1.7396, + "step": 5131 + }, + { + "epoch": 1.5577477614205493, + "grad_norm": 0.5850470066070557, + "learning_rate": 6.888213851761847e-05, + "loss": 1.578, + "step": 5132 + }, + { + "epoch": 1.558051297617241, + "grad_norm": 0.514907717704773, + "learning_rate": 6.887606318347509e-05, + "loss": 1.7704, + "step": 5133 + }, + { + "epoch": 1.5583548338139324, + "grad_norm": 1.6753712892532349, + "learning_rate": 6.886998784933172e-05, + "loss": 1.4806, + "step": 5134 + }, + { + "epoch": 1.5586583700106238, + "grad_norm": 0.5311963558197021, + "learning_rate": 6.886391251518834e-05, + "loss": 1.5369, + "step": 5135 + }, + { + "epoch": 1.5589619062073152, + "grad_norm": 0.547295093536377, + "learning_rate": 6.885783718104496e-05, + "loss": 1.8403, + "step": 5136 + }, + { + "epoch": 1.5592654424040067, + "grad_norm": 0.5358843207359314, + "learning_rate": 6.885176184690159e-05, + "loss": 2.058, + "step": 5137 + }, + { + "epoch": 1.559568978600698, + "grad_norm": 0.5402520895004272, + "learning_rate": 6.88456865127582e-05, + "loss": 1.8126, + "step": 5138 + }, + { + "epoch": 1.5598725147973895, + "grad_norm": 0.4622855484485626, + "learning_rate": 6.883961117861482e-05, + "loss": 1.2372, + "step": 5139 + }, + { + "epoch": 1.5601760509940812, + "grad_norm": 0.5184701681137085, + "learning_rate": 6.883353584447145e-05, + "loss": 1.9226, + "step": 5140 + }, + { + "epoch": 1.5604795871907724, + "grad_norm": 0.4912964701652527, + "learning_rate": 6.882746051032807e-05, + "loss": 1.7784, + "step": 5141 + }, + { + "epoch": 1.560783123387464, + "grad_norm": 0.5692001581192017, + "learning_rate": 6.88213851761847e-05, + "loss": 1.7847, + "step": 5142 + }, + { + "epoch": 1.5610866595841553, + "grad_norm": 1.0396461486816406, + "learning_rate": 6.881530984204132e-05, + "loss": 1.7171, + "step": 5143 + }, + { + "epoch": 1.561390195780847, + "grad_norm": 0.5134807825088501, + "learning_rate": 6.880923450789793e-05, + "loss": 1.5056, + "step": 5144 + }, + { + "epoch": 1.5616937319775384, + "grad_norm": 0.539484977722168, + "learning_rate": 6.880315917375457e-05, + "loss": 1.686, + "step": 5145 + }, + { + "epoch": 1.5619972681742298, + "grad_norm": 0.5286623239517212, + "learning_rate": 6.879708383961118e-05, + "loss": 1.827, + "step": 5146 + }, + { + "epoch": 1.5623008043709212, + "grad_norm": 0.5663209557533264, + "learning_rate": 6.87910085054678e-05, + "loss": 1.7342, + "step": 5147 + }, + { + "epoch": 1.5626043405676127, + "grad_norm": 0.5498020052909851, + "learning_rate": 6.878493317132443e-05, + "loss": 1.8626, + "step": 5148 + }, + { + "epoch": 1.562907876764304, + "grad_norm": 0.5602872371673584, + "learning_rate": 6.877885783718105e-05, + "loss": 1.7361, + "step": 5149 + }, + { + "epoch": 1.5632114129609955, + "grad_norm": 0.4635816514492035, + "learning_rate": 6.877278250303767e-05, + "loss": 1.6465, + "step": 5150 + }, + { + "epoch": 1.5635149491576872, + "grad_norm": 0.600685715675354, + "learning_rate": 6.87667071688943e-05, + "loss": 1.6324, + "step": 5151 + }, + { + "epoch": 1.5638184853543784, + "grad_norm": 0.525507926940918, + "learning_rate": 6.876063183475091e-05, + "loss": 1.7158, + "step": 5152 + }, + { + "epoch": 1.56412202155107, + "grad_norm": 1.154478669166565, + "learning_rate": 6.875455650060753e-05, + "loss": 2.0856, + "step": 5153 + }, + { + "epoch": 1.5644255577477613, + "grad_norm": 0.43475109338760376, + "learning_rate": 6.874848116646416e-05, + "loss": 1.3634, + "step": 5154 + }, + { + "epoch": 1.564729093944453, + "grad_norm": 1.1901203393936157, + "learning_rate": 6.874240583232078e-05, + "loss": 1.1507, + "step": 5155 + }, + { + "epoch": 1.5650326301411444, + "grad_norm": 0.48481816053390503, + "learning_rate": 6.873633049817741e-05, + "loss": 1.6388, + "step": 5156 + }, + { + "epoch": 1.5653361663378358, + "grad_norm": 0.5131592154502869, + "learning_rate": 6.873025516403403e-05, + "loss": 1.4084, + "step": 5157 + }, + { + "epoch": 1.5656397025345272, + "grad_norm": 0.5295942425727844, + "learning_rate": 6.872417982989064e-05, + "loss": 1.2543, + "step": 5158 + }, + { + "epoch": 1.5659432387312187, + "grad_norm": 0.6020081043243408, + "learning_rate": 6.871810449574728e-05, + "loss": 1.6094, + "step": 5159 + }, + { + "epoch": 1.5662467749279103, + "grad_norm": 0.5166780352592468, + "learning_rate": 6.871202916160389e-05, + "loss": 1.7124, + "step": 5160 + }, + { + "epoch": 1.5665503111246015, + "grad_norm": 0.5535523891448975, + "learning_rate": 6.870595382746051e-05, + "loss": 1.7073, + "step": 5161 + }, + { + "epoch": 1.5668538473212932, + "grad_norm": 0.6225024461746216, + "learning_rate": 6.869987849331714e-05, + "loss": 1.2864, + "step": 5162 + }, + { + "epoch": 1.5671573835179844, + "grad_norm": 1.0813506841659546, + "learning_rate": 6.869380315917376e-05, + "loss": 1.8424, + "step": 5163 + }, + { + "epoch": 1.567460919714676, + "grad_norm": 0.5852144360542297, + "learning_rate": 6.868772782503038e-05, + "loss": 1.5746, + "step": 5164 + }, + { + "epoch": 1.5677644559113675, + "grad_norm": 0.6405759453773499, + "learning_rate": 6.8681652490887e-05, + "loss": 1.8351, + "step": 5165 + }, + { + "epoch": 1.568067992108059, + "grad_norm": 0.5141618847846985, + "learning_rate": 6.867557715674362e-05, + "loss": 1.7889, + "step": 5166 + }, + { + "epoch": 1.5683715283047504, + "grad_norm": 1.1576392650604248, + "learning_rate": 6.866950182260024e-05, + "loss": 1.2938, + "step": 5167 + }, + { + "epoch": 1.5686750645014418, + "grad_norm": 0.5479152202606201, + "learning_rate": 6.866342648845687e-05, + "loss": 1.808, + "step": 5168 + }, + { + "epoch": 1.5689786006981332, + "grad_norm": 0.5595178604125977, + "learning_rate": 6.865735115431349e-05, + "loss": 1.6656, + "step": 5169 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.4428215026855469, + "learning_rate": 6.865127582017012e-05, + "loss": 1.7519, + "step": 5170 + }, + { + "epoch": 1.5695856730915163, + "grad_norm": 0.5423892736434937, + "learning_rate": 6.864520048602674e-05, + "loss": 1.703, + "step": 5171 + }, + { + "epoch": 1.5698892092882075, + "grad_norm": 0.4570651650428772, + "learning_rate": 6.863912515188335e-05, + "loss": 1.7215, + "step": 5172 + }, + { + "epoch": 1.5701927454848992, + "grad_norm": 0.5852006673812866, + "learning_rate": 6.863304981773999e-05, + "loss": 1.7473, + "step": 5173 + }, + { + "epoch": 1.5704962816815904, + "grad_norm": 0.6939977407455444, + "learning_rate": 6.86269744835966e-05, + "loss": 1.4824, + "step": 5174 + }, + { + "epoch": 1.570799817878282, + "grad_norm": 0.4711341857910156, + "learning_rate": 6.862089914945322e-05, + "loss": 1.706, + "step": 5175 + }, + { + "epoch": 1.5711033540749735, + "grad_norm": 0.5366457104682922, + "learning_rate": 6.861482381530985e-05, + "loss": 1.9728, + "step": 5176 + }, + { + "epoch": 1.571406890271665, + "grad_norm": 0.5411386489868164, + "learning_rate": 6.860874848116647e-05, + "loss": 1.7552, + "step": 5177 + }, + { + "epoch": 1.5717104264683563, + "grad_norm": 0.5751674771308899, + "learning_rate": 6.860267314702309e-05, + "loss": 1.9909, + "step": 5178 + }, + { + "epoch": 1.5720139626650478, + "grad_norm": 0.4560869038105011, + "learning_rate": 6.859659781287972e-05, + "loss": 1.4371, + "step": 5179 + }, + { + "epoch": 1.5723174988617392, + "grad_norm": 0.53570955991745, + "learning_rate": 6.859052247873633e-05, + "loss": 1.6533, + "step": 5180 + }, + { + "epoch": 1.5726210350584306, + "grad_norm": 0.4982752799987793, + "learning_rate": 6.858444714459295e-05, + "loss": 1.2318, + "step": 5181 + }, + { + "epoch": 1.5729245712551223, + "grad_norm": 0.5627778768539429, + "learning_rate": 6.857837181044958e-05, + "loss": 1.7169, + "step": 5182 + }, + { + "epoch": 1.5732281074518135, + "grad_norm": 0.4794982373714447, + "learning_rate": 6.85722964763062e-05, + "loss": 1.2168, + "step": 5183 + }, + { + "epoch": 1.5735316436485052, + "grad_norm": 0.5762937664985657, + "learning_rate": 6.856622114216282e-05, + "loss": 1.4961, + "step": 5184 + }, + { + "epoch": 1.5738351798451964, + "grad_norm": 0.5434368252754211, + "learning_rate": 6.856014580801945e-05, + "loss": 1.6214, + "step": 5185 + }, + { + "epoch": 1.574138716041888, + "grad_norm": 0.7843817472457886, + "learning_rate": 6.855407047387606e-05, + "loss": 1.7494, + "step": 5186 + }, + { + "epoch": 1.5744422522385795, + "grad_norm": 0.5866186618804932, + "learning_rate": 6.85479951397327e-05, + "loss": 1.575, + "step": 5187 + }, + { + "epoch": 1.574745788435271, + "grad_norm": 0.5419967174530029, + "learning_rate": 6.85419198055893e-05, + "loss": 1.7683, + "step": 5188 + }, + { + "epoch": 1.5750493246319623, + "grad_norm": 0.4712877571582794, + "learning_rate": 6.853584447144593e-05, + "loss": 1.6409, + "step": 5189 + }, + { + "epoch": 1.5753528608286538, + "grad_norm": 0.5708344578742981, + "learning_rate": 6.852976913730256e-05, + "loss": 1.6154, + "step": 5190 + }, + { + "epoch": 1.5756563970253454, + "grad_norm": 0.6477283835411072, + "learning_rate": 6.852369380315918e-05, + "loss": 1.3777, + "step": 5191 + }, + { + "epoch": 1.5759599332220366, + "grad_norm": 0.5167595744132996, + "learning_rate": 6.85176184690158e-05, + "loss": 1.7067, + "step": 5192 + }, + { + "epoch": 1.5762634694187283, + "grad_norm": 0.9224995374679565, + "learning_rate": 6.851154313487243e-05, + "loss": 1.8738, + "step": 5193 + }, + { + "epoch": 1.5765670056154195, + "grad_norm": 0.528824508190155, + "learning_rate": 6.850546780072904e-05, + "loss": 1.703, + "step": 5194 + }, + { + "epoch": 1.5768705418121112, + "grad_norm": 0.5642966032028198, + "learning_rate": 6.849939246658566e-05, + "loss": 1.7035, + "step": 5195 + }, + { + "epoch": 1.5771740780088026, + "grad_norm": 0.5331950783729553, + "learning_rate": 6.849331713244229e-05, + "loss": 1.7863, + "step": 5196 + }, + { + "epoch": 1.577477614205494, + "grad_norm": 0.6543232798576355, + "learning_rate": 6.848724179829891e-05, + "loss": 1.6948, + "step": 5197 + }, + { + "epoch": 1.5777811504021855, + "grad_norm": 0.5717130899429321, + "learning_rate": 6.848116646415553e-05, + "loss": 1.79, + "step": 5198 + }, + { + "epoch": 1.578084686598877, + "grad_norm": 0.6440120935440063, + "learning_rate": 6.847509113001216e-05, + "loss": 1.4557, + "step": 5199 + }, + { + "epoch": 1.5783882227955683, + "grad_norm": 0.5330291390419006, + "learning_rate": 6.846901579586877e-05, + "loss": 1.6472, + "step": 5200 + }, + { + "epoch": 1.5786917589922598, + "grad_norm": 0.5164377689361572, + "learning_rate": 6.84629404617254e-05, + "loss": 1.8938, + "step": 5201 + }, + { + "epoch": 1.5789952951889514, + "grad_norm": 0.5387392044067383, + "learning_rate": 6.845686512758201e-05, + "loss": 1.8797, + "step": 5202 + }, + { + "epoch": 1.5792988313856426, + "grad_norm": 0.6447203159332275, + "learning_rate": 6.845078979343864e-05, + "loss": 1.3384, + "step": 5203 + }, + { + "epoch": 1.5796023675823343, + "grad_norm": 0.5613716244697571, + "learning_rate": 6.844471445929527e-05, + "loss": 1.2776, + "step": 5204 + }, + { + "epoch": 1.5799059037790255, + "grad_norm": 0.711937665939331, + "learning_rate": 6.843863912515189e-05, + "loss": 1.4993, + "step": 5205 + }, + { + "epoch": 1.5802094399757172, + "grad_norm": 0.5429368019104004, + "learning_rate": 6.84325637910085e-05, + "loss": 1.8176, + "step": 5206 + }, + { + "epoch": 1.5805129761724086, + "grad_norm": 0.48608335852622986, + "learning_rate": 6.842648845686514e-05, + "loss": 1.1093, + "step": 5207 + }, + { + "epoch": 1.5808165123691, + "grad_norm": 0.5331500172615051, + "learning_rate": 6.842041312272175e-05, + "loss": 1.5347, + "step": 5208 + }, + { + "epoch": 1.5811200485657915, + "grad_norm": 0.4486270844936371, + "learning_rate": 6.841433778857837e-05, + "loss": 1.8155, + "step": 5209 + }, + { + "epoch": 1.581423584762483, + "grad_norm": 0.5292797088623047, + "learning_rate": 6.8408262454435e-05, + "loss": 1.778, + "step": 5210 + }, + { + "epoch": 1.5817271209591743, + "grad_norm": 0.5913912057876587, + "learning_rate": 6.840218712029162e-05, + "loss": 1.6109, + "step": 5211 + }, + { + "epoch": 1.5820306571558658, + "grad_norm": 0.5955752730369568, + "learning_rate": 6.839611178614824e-05, + "loss": 1.3188, + "step": 5212 + }, + { + "epoch": 1.5823341933525574, + "grad_norm": 0.9389486908912659, + "learning_rate": 6.839003645200487e-05, + "loss": 1.757, + "step": 5213 + }, + { + "epoch": 1.5826377295492486, + "grad_norm": 0.5492468476295471, + "learning_rate": 6.838396111786148e-05, + "loss": 1.9314, + "step": 5214 + }, + { + "epoch": 1.5829412657459403, + "grad_norm": 0.5454115271568298, + "learning_rate": 6.837788578371812e-05, + "loss": 1.7117, + "step": 5215 + }, + { + "epoch": 1.5832448019426315, + "grad_norm": 0.5753993391990662, + "learning_rate": 6.837181044957472e-05, + "loss": 1.2838, + "step": 5216 + }, + { + "epoch": 1.5835483381393232, + "grad_norm": 0.494477778673172, + "learning_rate": 6.836573511543135e-05, + "loss": 1.3306, + "step": 5217 + }, + { + "epoch": 1.5838518743360146, + "grad_norm": 0.5156183242797852, + "learning_rate": 6.835965978128798e-05, + "loss": 1.2108, + "step": 5218 + }, + { + "epoch": 1.584155410532706, + "grad_norm": 0.46987971663475037, + "learning_rate": 6.83535844471446e-05, + "loss": 1.8227, + "step": 5219 + }, + { + "epoch": 1.5844589467293975, + "grad_norm": 0.720950186252594, + "learning_rate": 6.834750911300122e-05, + "loss": 1.7258, + "step": 5220 + }, + { + "epoch": 1.584762482926089, + "grad_norm": 0.5426019430160522, + "learning_rate": 6.834143377885785e-05, + "loss": 1.6847, + "step": 5221 + }, + { + "epoch": 1.5850660191227806, + "grad_norm": 0.5420331954956055, + "learning_rate": 6.833535844471446e-05, + "loss": 1.4088, + "step": 5222 + }, + { + "epoch": 1.5853695553194718, + "grad_norm": 0.4799429476261139, + "learning_rate": 6.832928311057108e-05, + "loss": 1.3208, + "step": 5223 + }, + { + "epoch": 1.5856730915161634, + "grad_norm": 0.3376842737197876, + "learning_rate": 6.832320777642771e-05, + "loss": 0.8923, + "step": 5224 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.5960623621940613, + "learning_rate": 6.831713244228433e-05, + "loss": 1.213, + "step": 5225 + }, + { + "epoch": 1.5862801639095463, + "grad_norm": 0.415962815284729, + "learning_rate": 6.831105710814095e-05, + "loss": 1.8835, + "step": 5226 + }, + { + "epoch": 1.5865837001062375, + "grad_norm": 0.59657222032547, + "learning_rate": 6.830498177399758e-05, + "loss": 1.3524, + "step": 5227 + }, + { + "epoch": 1.5868872363029292, + "grad_norm": 0.508263111114502, + "learning_rate": 6.82989064398542e-05, + "loss": 1.5945, + "step": 5228 + }, + { + "epoch": 1.5871907724996206, + "grad_norm": 0.44551774859428406, + "learning_rate": 6.829283110571083e-05, + "loss": 1.1683, + "step": 5229 + }, + { + "epoch": 1.587494308696312, + "grad_norm": 0.5371022820472717, + "learning_rate": 6.828675577156743e-05, + "loss": 1.6683, + "step": 5230 + }, + { + "epoch": 1.5877978448930035, + "grad_norm": 0.5673996806144714, + "learning_rate": 6.828068043742406e-05, + "loss": 1.6156, + "step": 5231 + }, + { + "epoch": 1.588101381089695, + "grad_norm": 0.5424557328224182, + "learning_rate": 6.827460510328069e-05, + "loss": 2.1004, + "step": 5232 + }, + { + "epoch": 1.5884049172863866, + "grad_norm": 0.6087379455566406, + "learning_rate": 6.82685297691373e-05, + "loss": 1.685, + "step": 5233 + }, + { + "epoch": 1.5887084534830778, + "grad_norm": 0.48891234397888184, + "learning_rate": 6.826245443499393e-05, + "loss": 1.8414, + "step": 5234 + }, + { + "epoch": 1.5890119896797694, + "grad_norm": 0.5249536633491516, + "learning_rate": 6.825637910085056e-05, + "loss": 1.8926, + "step": 5235 + }, + { + "epoch": 1.5893155258764606, + "grad_norm": 0.5451249480247498, + "learning_rate": 6.825030376670717e-05, + "loss": 1.3554, + "step": 5236 + }, + { + "epoch": 1.5896190620731523, + "grad_norm": 0.5022341012954712, + "learning_rate": 6.824422843256379e-05, + "loss": 2.001, + "step": 5237 + }, + { + "epoch": 1.5899225982698437, + "grad_norm": 0.5455743074417114, + "learning_rate": 6.823815309842042e-05, + "loss": 1.6482, + "step": 5238 + }, + { + "epoch": 1.5902261344665352, + "grad_norm": 0.48343515396118164, + "learning_rate": 6.823207776427704e-05, + "loss": 1.7445, + "step": 5239 + }, + { + "epoch": 1.5905296706632266, + "grad_norm": 0.5498737096786499, + "learning_rate": 6.822600243013366e-05, + "loss": 1.4105, + "step": 5240 + }, + { + "epoch": 1.590833206859918, + "grad_norm": 0.6236885190010071, + "learning_rate": 6.821992709599029e-05, + "loss": 1.5102, + "step": 5241 + }, + { + "epoch": 1.5911367430566095, + "grad_norm": 0.5544707179069519, + "learning_rate": 6.82138517618469e-05, + "loss": 1.5343, + "step": 5242 + }, + { + "epoch": 1.591440279253301, + "grad_norm": 0.6272419095039368, + "learning_rate": 6.820777642770354e-05, + "loss": 1.9041, + "step": 5243 + }, + { + "epoch": 1.5917438154499925, + "grad_norm": 0.5442690849304199, + "learning_rate": 6.820170109356014e-05, + "loss": 1.7401, + "step": 5244 + }, + { + "epoch": 1.5920473516466838, + "grad_norm": 0.5792570114135742, + "learning_rate": 6.819562575941677e-05, + "loss": 1.3998, + "step": 5245 + }, + { + "epoch": 1.5923508878433754, + "grad_norm": 0.5700094103813171, + "learning_rate": 6.81895504252734e-05, + "loss": 1.9555, + "step": 5246 + }, + { + "epoch": 1.5926544240400666, + "grad_norm": 0.6132831573486328, + "learning_rate": 6.818347509113e-05, + "loss": 1.5922, + "step": 5247 + }, + { + "epoch": 1.5929579602367583, + "grad_norm": 0.46173346042633057, + "learning_rate": 6.817739975698664e-05, + "loss": 1.3685, + "step": 5248 + }, + { + "epoch": 1.5932614964334497, + "grad_norm": 0.4786145091056824, + "learning_rate": 6.817132442284327e-05, + "loss": 1.9428, + "step": 5249 + }, + { + "epoch": 1.5935650326301412, + "grad_norm": 1.006852388381958, + "learning_rate": 6.816524908869988e-05, + "loss": 1.1743, + "step": 5250 + }, + { + "epoch": 1.5938685688268326, + "grad_norm": 0.6216186881065369, + "learning_rate": 6.81591737545565e-05, + "loss": 1.7946, + "step": 5251 + }, + { + "epoch": 1.594172105023524, + "grad_norm": 0.561336874961853, + "learning_rate": 6.815309842041313e-05, + "loss": 1.7092, + "step": 5252 + }, + { + "epoch": 1.5944756412202155, + "grad_norm": 0.6000168919563293, + "learning_rate": 6.814702308626975e-05, + "loss": 1.5225, + "step": 5253 + }, + { + "epoch": 1.594779177416907, + "grad_norm": 0.5488656163215637, + "learning_rate": 6.814094775212637e-05, + "loss": 1.7632, + "step": 5254 + }, + { + "epoch": 1.5950827136135985, + "grad_norm": 0.52173912525177, + "learning_rate": 6.8134872417983e-05, + "loss": 1.7053, + "step": 5255 + }, + { + "epoch": 1.5953862498102898, + "grad_norm": 0.927346408367157, + "learning_rate": 6.812879708383961e-05, + "loss": 1.5436, + "step": 5256 + }, + { + "epoch": 1.5956897860069814, + "grad_norm": 0.42421141266822815, + "learning_rate": 6.812272174969623e-05, + "loss": 1.7197, + "step": 5257 + }, + { + "epoch": 1.5959933222036726, + "grad_norm": 0.5720416903495789, + "learning_rate": 6.811664641555285e-05, + "loss": 1.8079, + "step": 5258 + }, + { + "epoch": 1.5962968584003643, + "grad_norm": 0.6596317291259766, + "learning_rate": 6.811057108140948e-05, + "loss": 1.7341, + "step": 5259 + }, + { + "epoch": 1.5966003945970557, + "grad_norm": 0.5089192390441895, + "learning_rate": 6.810449574726611e-05, + "loss": 1.5992, + "step": 5260 + }, + { + "epoch": 1.5969039307937472, + "grad_norm": 0.5197166204452515, + "learning_rate": 6.809842041312272e-05, + "loss": 1.6093, + "step": 5261 + }, + { + "epoch": 1.5972074669904386, + "grad_norm": 0.552408754825592, + "learning_rate": 6.809234507897935e-05, + "loss": 1.5114, + "step": 5262 + }, + { + "epoch": 1.59751100318713, + "grad_norm": 0.5977384448051453, + "learning_rate": 6.808626974483598e-05, + "loss": 1.7183, + "step": 5263 + }, + { + "epoch": 1.5978145393838217, + "grad_norm": 0.547103762626648, + "learning_rate": 6.80801944106926e-05, + "loss": 1.7443, + "step": 5264 + }, + { + "epoch": 1.5981180755805129, + "grad_norm": 0.5751621723175049, + "learning_rate": 6.807411907654921e-05, + "loss": 1.7588, + "step": 5265 + }, + { + "epoch": 1.5984216117772045, + "grad_norm": 0.47714513540267944, + "learning_rate": 6.806804374240584e-05, + "loss": 1.7564, + "step": 5266 + }, + { + "epoch": 1.5987251479738958, + "grad_norm": 0.5768815279006958, + "learning_rate": 6.806196840826246e-05, + "loss": 1.7652, + "step": 5267 + }, + { + "epoch": 1.5990286841705874, + "grad_norm": 0.5746716260910034, + "learning_rate": 6.805589307411908e-05, + "loss": 1.5079, + "step": 5268 + }, + { + "epoch": 1.5993322203672788, + "grad_norm": 0.5528024435043335, + "learning_rate": 6.80498177399757e-05, + "loss": 1.5186, + "step": 5269 + }, + { + "epoch": 1.5996357565639703, + "grad_norm": 0.5926470756530762, + "learning_rate": 6.804374240583233e-05, + "loss": 1.5568, + "step": 5270 + }, + { + "epoch": 1.5999392927606617, + "grad_norm": 0.5206483006477356, + "learning_rate": 6.803766707168894e-05, + "loss": 1.0035, + "step": 5271 + }, + { + "epoch": 1.6002428289573531, + "grad_norm": 0.4643096923828125, + "learning_rate": 6.803159173754556e-05, + "loss": 0.8317, + "step": 5272 + }, + { + "epoch": 1.6005463651540446, + "grad_norm": 0.48165419697761536, + "learning_rate": 6.802551640340219e-05, + "loss": 1.8613, + "step": 5273 + }, + { + "epoch": 1.600849901350736, + "grad_norm": 0.5018272995948792, + "learning_rate": 6.801944106925882e-05, + "loss": 1.7777, + "step": 5274 + }, + { + "epoch": 1.6011534375474277, + "grad_norm": 0.7083479762077332, + "learning_rate": 6.801336573511543e-05, + "loss": 1.8053, + "step": 5275 + }, + { + "epoch": 1.6014569737441189, + "grad_norm": 0.5450612902641296, + "learning_rate": 6.800729040097206e-05, + "loss": 1.5861, + "step": 5276 + }, + { + "epoch": 1.6017605099408105, + "grad_norm": 0.599209189414978, + "learning_rate": 6.800121506682869e-05, + "loss": 1.2736, + "step": 5277 + }, + { + "epoch": 1.6020640461375018, + "grad_norm": 0.5083154439926147, + "learning_rate": 6.79951397326853e-05, + "loss": 1.7954, + "step": 5278 + }, + { + "epoch": 1.6023675823341934, + "grad_norm": 0.5951920747756958, + "learning_rate": 6.798906439854192e-05, + "loss": 1.322, + "step": 5279 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.5329943299293518, + "learning_rate": 6.798298906439855e-05, + "loss": 1.8546, + "step": 5280 + }, + { + "epoch": 1.6029746547275763, + "grad_norm": 0.45665401220321655, + "learning_rate": 6.797691373025517e-05, + "loss": 1.9274, + "step": 5281 + }, + { + "epoch": 1.6032781909242677, + "grad_norm": 0.5830223560333252, + "learning_rate": 6.797083839611179e-05, + "loss": 1.7661, + "step": 5282 + }, + { + "epoch": 1.6035817271209591, + "grad_norm": 0.5497011542320251, + "learning_rate": 6.79647630619684e-05, + "loss": 1.29, + "step": 5283 + }, + { + "epoch": 1.6038852633176506, + "grad_norm": 0.47509273886680603, + "learning_rate": 6.795868772782504e-05, + "loss": 1.8748, + "step": 5284 + }, + { + "epoch": 1.604188799514342, + "grad_norm": 0.5527020692825317, + "learning_rate": 6.795261239368165e-05, + "loss": 1.186, + "step": 5285 + }, + { + "epoch": 1.6044923357110337, + "grad_norm": 0.5587241053581238, + "learning_rate": 6.794653705953827e-05, + "loss": 1.3485, + "step": 5286 + }, + { + "epoch": 1.6047958719077249, + "grad_norm": 0.7357602119445801, + "learning_rate": 6.79404617253949e-05, + "loss": 1.3425, + "step": 5287 + }, + { + "epoch": 1.6050994081044165, + "grad_norm": 0.600986659526825, + "learning_rate": 6.793438639125153e-05, + "loss": 1.3589, + "step": 5288 + }, + { + "epoch": 1.6054029443011077, + "grad_norm": 0.8718162178993225, + "learning_rate": 6.792831105710814e-05, + "loss": 1.4548, + "step": 5289 + }, + { + "epoch": 1.6057064804977994, + "grad_norm": 0.5700815320014954, + "learning_rate": 6.792223572296477e-05, + "loss": 1.5316, + "step": 5290 + }, + { + "epoch": 1.6060100166944908, + "grad_norm": 0.4497663080692291, + "learning_rate": 6.79161603888214e-05, + "loss": 1.6142, + "step": 5291 + }, + { + "epoch": 1.6063135528911823, + "grad_norm": 0.4942954182624817, + "learning_rate": 6.791008505467801e-05, + "loss": 2.073, + "step": 5292 + }, + { + "epoch": 1.6066170890878737, + "grad_norm": 0.578714907169342, + "learning_rate": 6.790400972053463e-05, + "loss": 1.6847, + "step": 5293 + }, + { + "epoch": 1.6069206252845651, + "grad_norm": 0.5462824106216431, + "learning_rate": 6.789793438639126e-05, + "loss": 1.8571, + "step": 5294 + }, + { + "epoch": 1.6072241614812568, + "grad_norm": 0.47509217262268066, + "learning_rate": 6.789185905224788e-05, + "loss": 1.6175, + "step": 5295 + }, + { + "epoch": 1.607527697677948, + "grad_norm": 0.5601421594619751, + "learning_rate": 6.78857837181045e-05, + "loss": 1.1761, + "step": 5296 + }, + { + "epoch": 1.6078312338746397, + "grad_norm": 0.5942484140396118, + "learning_rate": 6.787970838396111e-05, + "loss": 1.2908, + "step": 5297 + }, + { + "epoch": 1.6081347700713309, + "grad_norm": 0.5373112559318542, + "learning_rate": 6.787363304981775e-05, + "loss": 1.7585, + "step": 5298 + }, + { + "epoch": 1.6084383062680225, + "grad_norm": 0.5723139643669128, + "learning_rate": 6.786755771567436e-05, + "loss": 1.7596, + "step": 5299 + }, + { + "epoch": 1.608741842464714, + "grad_norm": 0.5496042370796204, + "learning_rate": 6.786148238153098e-05, + "loss": 1.7498, + "step": 5300 + }, + { + "epoch": 1.6090453786614054, + "grad_norm": 0.5220587253570557, + "learning_rate": 6.785540704738761e-05, + "loss": 1.6976, + "step": 5301 + }, + { + "epoch": 1.6093489148580968, + "grad_norm": 0.5628178715705872, + "learning_rate": 6.784933171324424e-05, + "loss": 1.6136, + "step": 5302 + }, + { + "epoch": 1.6096524510547883, + "grad_norm": 0.48423969745635986, + "learning_rate": 6.784325637910085e-05, + "loss": 1.1443, + "step": 5303 + }, + { + "epoch": 1.6099559872514797, + "grad_norm": 0.8351702094078064, + "learning_rate": 6.783718104495748e-05, + "loss": 1.7297, + "step": 5304 + }, + { + "epoch": 1.6102595234481711, + "grad_norm": 0.5108897089958191, + "learning_rate": 6.783110571081411e-05, + "loss": 1.706, + "step": 5305 + }, + { + "epoch": 1.6105630596448628, + "grad_norm": 0.5052831768989563, + "learning_rate": 6.782503037667071e-05, + "loss": 1.801, + "step": 5306 + }, + { + "epoch": 1.610866595841554, + "grad_norm": 0.8400011658668518, + "learning_rate": 6.781895504252734e-05, + "loss": 1.423, + "step": 5307 + }, + { + "epoch": 1.6111701320382457, + "grad_norm": 0.5768548250198364, + "learning_rate": 6.781287970838397e-05, + "loss": 1.8448, + "step": 5308 + }, + { + "epoch": 1.6114736682349369, + "grad_norm": 0.4723953306674957, + "learning_rate": 6.780680437424059e-05, + "loss": 1.564, + "step": 5309 + }, + { + "epoch": 1.6117772044316285, + "grad_norm": 0.5078704357147217, + "learning_rate": 6.780072904009721e-05, + "loss": 1.7979, + "step": 5310 + }, + { + "epoch": 1.61208074062832, + "grad_norm": 0.5367867350578308, + "learning_rate": 6.779465370595382e-05, + "loss": 1.3874, + "step": 5311 + }, + { + "epoch": 1.6123842768250114, + "grad_norm": 0.5529655814170837, + "learning_rate": 6.778857837181046e-05, + "loss": 1.8505, + "step": 5312 + }, + { + "epoch": 1.6126878130217028, + "grad_norm": 0.5230738520622253, + "learning_rate": 6.778250303766707e-05, + "loss": 1.5063, + "step": 5313 + }, + { + "epoch": 1.6129913492183943, + "grad_norm": 0.5180802345275879, + "learning_rate": 6.777642770352369e-05, + "loss": 1.9768, + "step": 5314 + }, + { + "epoch": 1.6132948854150857, + "grad_norm": 0.47264987230300903, + "learning_rate": 6.777035236938032e-05, + "loss": 1.58, + "step": 5315 + }, + { + "epoch": 1.6135984216117771, + "grad_norm": 0.5119348764419556, + "learning_rate": 6.776427703523695e-05, + "loss": 1.5589, + "step": 5316 + }, + { + "epoch": 1.6139019578084688, + "grad_norm": 0.4975685477256775, + "learning_rate": 6.775820170109356e-05, + "loss": 1.6955, + "step": 5317 + }, + { + "epoch": 1.61420549400516, + "grad_norm": 0.4829816520214081, + "learning_rate": 6.775212636695019e-05, + "loss": 1.7274, + "step": 5318 + }, + { + "epoch": 1.6145090302018517, + "grad_norm": 0.5632861256599426, + "learning_rate": 6.774605103280682e-05, + "loss": 1.9359, + "step": 5319 + }, + { + "epoch": 1.6148125663985429, + "grad_norm": 0.46568477153778076, + "learning_rate": 6.773997569866342e-05, + "loss": 1.9913, + "step": 5320 + }, + { + "epoch": 1.6151161025952345, + "grad_norm": 0.581303060054779, + "learning_rate": 6.773390036452005e-05, + "loss": 1.7004, + "step": 5321 + }, + { + "epoch": 1.615419638791926, + "grad_norm": 0.5229167342185974, + "learning_rate": 6.772782503037668e-05, + "loss": 1.7712, + "step": 5322 + }, + { + "epoch": 1.6157231749886174, + "grad_norm": 0.5725805759429932, + "learning_rate": 6.77217496962333e-05, + "loss": 1.8174, + "step": 5323 + }, + { + "epoch": 1.6160267111853088, + "grad_norm": 0.8762467503547668, + "learning_rate": 6.771567436208992e-05, + "loss": 1.6576, + "step": 5324 + }, + { + "epoch": 1.6163302473820003, + "grad_norm": 0.598650336265564, + "learning_rate": 6.770959902794653e-05, + "loss": 1.7338, + "step": 5325 + }, + { + "epoch": 1.616633783578692, + "grad_norm": 0.6000232696533203, + "learning_rate": 6.770352369380317e-05, + "loss": 1.1433, + "step": 5326 + }, + { + "epoch": 1.6169373197753831, + "grad_norm": 0.5699084401130676, + "learning_rate": 6.769744835965978e-05, + "loss": 1.4685, + "step": 5327 + }, + { + "epoch": 1.6172408559720748, + "grad_norm": 0.5472105741500854, + "learning_rate": 6.76913730255164e-05, + "loss": 1.6136, + "step": 5328 + }, + { + "epoch": 1.617544392168766, + "grad_norm": 0.5260597467422485, + "learning_rate": 6.768529769137303e-05, + "loss": 1.8754, + "step": 5329 + }, + { + "epoch": 1.6178479283654577, + "grad_norm": 0.5261357426643372, + "learning_rate": 6.767922235722965e-05, + "loss": 1.641, + "step": 5330 + }, + { + "epoch": 1.618151464562149, + "grad_norm": 0.4434361755847931, + "learning_rate": 6.767314702308627e-05, + "loss": 1.3906, + "step": 5331 + }, + { + "epoch": 1.6184550007588405, + "grad_norm": 0.4359007477760315, + "learning_rate": 6.76670716889429e-05, + "loss": 1.3848, + "step": 5332 + }, + { + "epoch": 1.618758536955532, + "grad_norm": 0.4851773679256439, + "learning_rate": 6.766099635479953e-05, + "loss": 1.6975, + "step": 5333 + }, + { + "epoch": 1.6190620731522234, + "grad_norm": 0.505078136920929, + "learning_rate": 6.765492102065613e-05, + "loss": 1.593, + "step": 5334 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.470007985830307, + "learning_rate": 6.764884568651276e-05, + "loss": 1.7861, + "step": 5335 + }, + { + "epoch": 1.6196691455456063, + "grad_norm": 0.480383962392807, + "learning_rate": 6.764277035236939e-05, + "loss": 1.6374, + "step": 5336 + }, + { + "epoch": 1.619972681742298, + "grad_norm": 0.8806627988815308, + "learning_rate": 6.763669501822601e-05, + "loss": 1.5483, + "step": 5337 + }, + { + "epoch": 1.6202762179389891, + "grad_norm": 0.4468989968299866, + "learning_rate": 6.763061968408263e-05, + "loss": 1.7802, + "step": 5338 + }, + { + "epoch": 1.6205797541356808, + "grad_norm": 0.4716281294822693, + "learning_rate": 6.762454434993924e-05, + "loss": 2.0222, + "step": 5339 + }, + { + "epoch": 1.620883290332372, + "grad_norm": 0.5289998054504395, + "learning_rate": 6.761846901579588e-05, + "loss": 1.7585, + "step": 5340 + }, + { + "epoch": 1.6211868265290637, + "grad_norm": 0.5218448042869568, + "learning_rate": 6.761239368165249e-05, + "loss": 1.7652, + "step": 5341 + }, + { + "epoch": 1.621490362725755, + "grad_norm": 0.5116521120071411, + "learning_rate": 6.760631834750911e-05, + "loss": 1.8309, + "step": 5342 + }, + { + "epoch": 1.6217938989224465, + "grad_norm": 0.5118533372879028, + "learning_rate": 6.760024301336574e-05, + "loss": 1.1779, + "step": 5343 + }, + { + "epoch": 1.622097435119138, + "grad_norm": 0.46334773302078247, + "learning_rate": 6.759416767922236e-05, + "loss": 1.3687, + "step": 5344 + }, + { + "epoch": 1.6224009713158294, + "grad_norm": 0.39834198355674744, + "learning_rate": 6.758809234507898e-05, + "loss": 0.8092, + "step": 5345 + }, + { + "epoch": 1.6227045075125208, + "grad_norm": 0.6149663925170898, + "learning_rate": 6.75820170109356e-05, + "loss": 1.6454, + "step": 5346 + }, + { + "epoch": 1.6230080437092123, + "grad_norm": 0.47009339928627014, + "learning_rate": 6.757594167679224e-05, + "loss": 1.7656, + "step": 5347 + }, + { + "epoch": 1.623311579905904, + "grad_norm": 0.5676798820495605, + "learning_rate": 6.756986634264884e-05, + "loss": 1.7765, + "step": 5348 + }, + { + "epoch": 1.6236151161025951, + "grad_norm": 0.6189036965370178, + "learning_rate": 6.756379100850547e-05, + "loss": 1.5765, + "step": 5349 + }, + { + "epoch": 1.6239186522992868, + "grad_norm": 0.5358760356903076, + "learning_rate": 6.75577156743621e-05, + "loss": 1.528, + "step": 5350 + }, + { + "epoch": 1.624222188495978, + "grad_norm": 0.701900064945221, + "learning_rate": 6.755164034021872e-05, + "loss": 1.4507, + "step": 5351 + }, + { + "epoch": 1.6245257246926696, + "grad_norm": 0.552905797958374, + "learning_rate": 6.754556500607534e-05, + "loss": 1.304, + "step": 5352 + }, + { + "epoch": 1.624829260889361, + "grad_norm": 0.6372193098068237, + "learning_rate": 6.753948967193195e-05, + "loss": 1.9799, + "step": 5353 + }, + { + "epoch": 1.6251327970860525, + "grad_norm": 0.5515216588973999, + "learning_rate": 6.753341433778859e-05, + "loss": 1.8413, + "step": 5354 + }, + { + "epoch": 1.625436333282744, + "grad_norm": 0.42681434750556946, + "learning_rate": 6.75273390036452e-05, + "loss": 0.8476, + "step": 5355 + }, + { + "epoch": 1.6257398694794354, + "grad_norm": 0.520078182220459, + "learning_rate": 6.752126366950182e-05, + "loss": 1.3612, + "step": 5356 + }, + { + "epoch": 1.626043405676127, + "grad_norm": 0.5389737486839294, + "learning_rate": 6.751518833535845e-05, + "loss": 1.7407, + "step": 5357 + }, + { + "epoch": 1.6263469418728183, + "grad_norm": 0.5108140110969543, + "learning_rate": 6.750911300121507e-05, + "loss": 1.6941, + "step": 5358 + }, + { + "epoch": 1.62665047806951, + "grad_norm": 0.8992660641670227, + "learning_rate": 6.750303766707169e-05, + "loss": 1.7602, + "step": 5359 + }, + { + "epoch": 1.6269540142662011, + "grad_norm": 0.5920006632804871, + "learning_rate": 6.749696233292832e-05, + "loss": 1.6317, + "step": 5360 + }, + { + "epoch": 1.6272575504628928, + "grad_norm": 0.5391355752944946, + "learning_rate": 6.749088699878495e-05, + "loss": 0.8721, + "step": 5361 + }, + { + "epoch": 1.6275610866595842, + "grad_norm": 0.5665925741195679, + "learning_rate": 6.748481166464155e-05, + "loss": 1.3489, + "step": 5362 + }, + { + "epoch": 1.6278646228562756, + "grad_norm": 0.7543338537216187, + "learning_rate": 6.747873633049818e-05, + "loss": 1.6003, + "step": 5363 + }, + { + "epoch": 1.628168159052967, + "grad_norm": 0.5854896903038025, + "learning_rate": 6.74726609963548e-05, + "loss": 1.7037, + "step": 5364 + }, + { + "epoch": 1.6284716952496585, + "grad_norm": 0.541285514831543, + "learning_rate": 6.746658566221143e-05, + "loss": 1.5401, + "step": 5365 + }, + { + "epoch": 1.62877523144635, + "grad_norm": 0.5873636603355408, + "learning_rate": 6.746051032806805e-05, + "loss": 1.6896, + "step": 5366 + }, + { + "epoch": 1.6290787676430414, + "grad_norm": 0.6596795320510864, + "learning_rate": 6.745443499392466e-05, + "loss": 1.783, + "step": 5367 + }, + { + "epoch": 1.629382303839733, + "grad_norm": 0.6165364384651184, + "learning_rate": 6.74483596597813e-05, + "loss": 1.8629, + "step": 5368 + }, + { + "epoch": 1.6296858400364242, + "grad_norm": 0.5157227516174316, + "learning_rate": 6.744228432563791e-05, + "loss": 1.743, + "step": 5369 + }, + { + "epoch": 1.629989376233116, + "grad_norm": 0.5414877533912659, + "learning_rate": 6.743620899149453e-05, + "loss": 1.1882, + "step": 5370 + }, + { + "epoch": 1.6302929124298071, + "grad_norm": 0.5076656341552734, + "learning_rate": 6.743013365735116e-05, + "loss": 1.8108, + "step": 5371 + }, + { + "epoch": 1.6305964486264988, + "grad_norm": 0.48527640104293823, + "learning_rate": 6.742405832320778e-05, + "loss": 1.8499, + "step": 5372 + }, + { + "epoch": 1.6308999848231902, + "grad_norm": 0.5806058049201965, + "learning_rate": 6.74179829890644e-05, + "loss": 1.919, + "step": 5373 + }, + { + "epoch": 1.6312035210198816, + "grad_norm": 0.5198022127151489, + "learning_rate": 6.741190765492103e-05, + "loss": 1.7945, + "step": 5374 + }, + { + "epoch": 1.631507057216573, + "grad_norm": 0.5610677599906921, + "learning_rate": 6.740583232077766e-05, + "loss": 1.7286, + "step": 5375 + }, + { + "epoch": 1.6318105934132645, + "grad_norm": 0.5591566562652588, + "learning_rate": 6.739975698663426e-05, + "loss": 1.8059, + "step": 5376 + }, + { + "epoch": 1.632114129609956, + "grad_norm": 0.5589379072189331, + "learning_rate": 6.739368165249089e-05, + "loss": 1.5548, + "step": 5377 + }, + { + "epoch": 1.6324176658066474, + "grad_norm": 0.5973590016365051, + "learning_rate": 6.738760631834751e-05, + "loss": 1.6746, + "step": 5378 + }, + { + "epoch": 1.632721202003339, + "grad_norm": 0.5872961282730103, + "learning_rate": 6.738153098420413e-05, + "loss": 1.6873, + "step": 5379 + }, + { + "epoch": 1.6330247382000302, + "grad_norm": 1.088761329650879, + "learning_rate": 6.737545565006076e-05, + "loss": 1.2821, + "step": 5380 + }, + { + "epoch": 1.633328274396722, + "grad_norm": 0.497652143239975, + "learning_rate": 6.736938031591737e-05, + "loss": 1.7009, + "step": 5381 + }, + { + "epoch": 1.6336318105934131, + "grad_norm": 0.4582177698612213, + "learning_rate": 6.7363304981774e-05, + "loss": 1.4134, + "step": 5382 + }, + { + "epoch": 1.6339353467901048, + "grad_norm": 0.5756871104240417, + "learning_rate": 6.735722964763062e-05, + "loss": 1.5523, + "step": 5383 + }, + { + "epoch": 1.6342388829867962, + "grad_norm": 0.4630436301231384, + "learning_rate": 6.735115431348724e-05, + "loss": 1.6713, + "step": 5384 + }, + { + "epoch": 1.6345424191834876, + "grad_norm": 0.5446672439575195, + "learning_rate": 6.734507897934387e-05, + "loss": 1.7208, + "step": 5385 + }, + { + "epoch": 1.634845955380179, + "grad_norm": 0.5166425108909607, + "learning_rate": 6.733900364520049e-05, + "loss": 1.6424, + "step": 5386 + }, + { + "epoch": 1.6351494915768705, + "grad_norm": 0.6008763313293457, + "learning_rate": 6.73329283110571e-05, + "loss": 1.5234, + "step": 5387 + }, + { + "epoch": 1.6354530277735622, + "grad_norm": 0.5188408493995667, + "learning_rate": 6.732685297691374e-05, + "loss": 1.868, + "step": 5388 + }, + { + "epoch": 1.6357565639702534, + "grad_norm": 1.0882227420806885, + "learning_rate": 6.732077764277037e-05, + "loss": 1.6903, + "step": 5389 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.5503698587417603, + "learning_rate": 6.731470230862697e-05, + "loss": 1.8615, + "step": 5390 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.5375723838806152, + "learning_rate": 6.73086269744836e-05, + "loss": 1.6473, + "step": 5391 + }, + { + "epoch": 1.636667172560328, + "grad_norm": 0.5821309685707092, + "learning_rate": 6.730255164034022e-05, + "loss": 1.8633, + "step": 5392 + }, + { + "epoch": 1.636970708757019, + "grad_norm": 0.6550545692443848, + "learning_rate": 6.729647630619684e-05, + "loss": 1.6438, + "step": 5393 + }, + { + "epoch": 1.6372742449537108, + "grad_norm": 0.5280491709709167, + "learning_rate": 6.729040097205347e-05, + "loss": 1.4762, + "step": 5394 + }, + { + "epoch": 1.6375777811504022, + "grad_norm": 0.5491107106208801, + "learning_rate": 6.728432563791008e-05, + "loss": 1.7162, + "step": 5395 + }, + { + "epoch": 1.6378813173470936, + "grad_norm": 0.4663378596305847, + "learning_rate": 6.727825030376672e-05, + "loss": 1.7681, + "step": 5396 + }, + { + "epoch": 1.638184853543785, + "grad_norm": 0.8106181025505066, + "learning_rate": 6.727217496962333e-05, + "loss": 1.2499, + "step": 5397 + }, + { + "epoch": 1.6384883897404765, + "grad_norm": 0.52627032995224, + "learning_rate": 6.726609963547995e-05, + "loss": 2.023, + "step": 5398 + }, + { + "epoch": 1.6387919259371682, + "grad_norm": 0.48420122265815735, + "learning_rate": 6.726002430133658e-05, + "loss": 1.7587, + "step": 5399 + }, + { + "epoch": 1.6390954621338594, + "grad_norm": 0.513408362865448, + "learning_rate": 6.72539489671932e-05, + "loss": 1.4867, + "step": 5400 + }, + { + "epoch": 1.639398998330551, + "grad_norm": 0.48651421070098877, + "learning_rate": 6.724787363304982e-05, + "loss": 1.217, + "step": 5401 + }, + { + "epoch": 1.6397025345272422, + "grad_norm": 0.5510370135307312, + "learning_rate": 6.724179829890645e-05, + "loss": 1.6771, + "step": 5402 + }, + { + "epoch": 1.640006070723934, + "grad_norm": 0.6256886124610901, + "learning_rate": 6.723572296476306e-05, + "loss": 1.7332, + "step": 5403 + }, + { + "epoch": 1.6403096069206253, + "grad_norm": 0.5471596717834473, + "learning_rate": 6.722964763061968e-05, + "loss": 1.8147, + "step": 5404 + }, + { + "epoch": 1.6406131431173168, + "grad_norm": 0.5959319472312927, + "learning_rate": 6.722357229647631e-05, + "loss": 1.6621, + "step": 5405 + }, + { + "epoch": 1.6409166793140082, + "grad_norm": 0.5674776434898376, + "learning_rate": 6.721749696233293e-05, + "loss": 1.6725, + "step": 5406 + }, + { + "epoch": 1.6412202155106996, + "grad_norm": 0.6195299625396729, + "learning_rate": 6.721142162818955e-05, + "loss": 1.75, + "step": 5407 + }, + { + "epoch": 1.641523751707391, + "grad_norm": 0.5325772762298584, + "learning_rate": 6.720534629404618e-05, + "loss": 1.4533, + "step": 5408 + }, + { + "epoch": 1.6418272879040825, + "grad_norm": 0.5618863105773926, + "learning_rate": 6.71992709599028e-05, + "loss": 1.8481, + "step": 5409 + }, + { + "epoch": 1.6421308241007742, + "grad_norm": 0.5095944404602051, + "learning_rate": 6.719319562575943e-05, + "loss": 1.7641, + "step": 5410 + }, + { + "epoch": 1.6424343602974654, + "grad_norm": 0.508712649345398, + "learning_rate": 6.718712029161604e-05, + "loss": 1.825, + "step": 5411 + }, + { + "epoch": 1.642737896494157, + "grad_norm": 0.5344487428665161, + "learning_rate": 6.718104495747266e-05, + "loss": 1.7304, + "step": 5412 + }, + { + "epoch": 1.6430414326908482, + "grad_norm": 0.7639570832252502, + "learning_rate": 6.717496962332929e-05, + "loss": 1.4032, + "step": 5413 + }, + { + "epoch": 1.64334496888754, + "grad_norm": 0.5500749349594116, + "learning_rate": 6.716889428918591e-05, + "loss": 1.719, + "step": 5414 + }, + { + "epoch": 1.6436485050842313, + "grad_norm": 0.5417535901069641, + "learning_rate": 6.716281895504253e-05, + "loss": 1.8949, + "step": 5415 + }, + { + "epoch": 1.6439520412809228, + "grad_norm": 0.8672186136245728, + "learning_rate": 6.715674362089916e-05, + "loss": 1.5031, + "step": 5416 + }, + { + "epoch": 1.6442555774776142, + "grad_norm": 0.6225829124450684, + "learning_rate": 6.715066828675577e-05, + "loss": 1.6316, + "step": 5417 + }, + { + "epoch": 1.6445591136743056, + "grad_norm": 0.6430771350860596, + "learning_rate": 6.714459295261239e-05, + "loss": 1.9351, + "step": 5418 + }, + { + "epoch": 1.644862649870997, + "grad_norm": 0.5645813941955566, + "learning_rate": 6.713851761846902e-05, + "loss": 1.6483, + "step": 5419 + }, + { + "epoch": 1.6451661860676885, + "grad_norm": 0.5713335871696472, + "learning_rate": 6.713244228432564e-05, + "loss": 1.7766, + "step": 5420 + }, + { + "epoch": 1.6454697222643802, + "grad_norm": 0.4520507752895355, + "learning_rate": 6.712636695018226e-05, + "loss": 1.2049, + "step": 5421 + }, + { + "epoch": 1.6457732584610714, + "grad_norm": 0.5464449524879456, + "learning_rate": 6.712029161603889e-05, + "loss": 1.6615, + "step": 5422 + }, + { + "epoch": 1.646076794657763, + "grad_norm": 0.5757522583007812, + "learning_rate": 6.71142162818955e-05, + "loss": 1.9467, + "step": 5423 + }, + { + "epoch": 1.6463803308544542, + "grad_norm": 0.43012985587120056, + "learning_rate": 6.710814094775214e-05, + "loss": 1.713, + "step": 5424 + }, + { + "epoch": 1.6466838670511459, + "grad_norm": 0.5255352854728699, + "learning_rate": 6.710206561360875e-05, + "loss": 1.3849, + "step": 5425 + }, + { + "epoch": 1.6469874032478373, + "grad_norm": 0.5202666521072388, + "learning_rate": 6.709599027946537e-05, + "loss": 1.7943, + "step": 5426 + }, + { + "epoch": 1.6472909394445288, + "grad_norm": 0.4663451313972473, + "learning_rate": 6.7089914945322e-05, + "loss": 1.6036, + "step": 5427 + }, + { + "epoch": 1.6475944756412202, + "grad_norm": 0.4620378315448761, + "learning_rate": 6.708383961117862e-05, + "loss": 1.2386, + "step": 5428 + }, + { + "epoch": 1.6478980118379116, + "grad_norm": 0.4991806447505951, + "learning_rate": 6.707776427703524e-05, + "loss": 1.5412, + "step": 5429 + }, + { + "epoch": 1.6482015480346033, + "grad_norm": 0.5099241733551025, + "learning_rate": 6.707168894289187e-05, + "loss": 1.2018, + "step": 5430 + }, + { + "epoch": 1.6485050842312945, + "grad_norm": 0.48727279901504517, + "learning_rate": 6.706561360874848e-05, + "loss": 1.7994, + "step": 5431 + }, + { + "epoch": 1.6488086204279861, + "grad_norm": 0.5382767915725708, + "learning_rate": 6.70595382746051e-05, + "loss": 1.4265, + "step": 5432 + }, + { + "epoch": 1.6491121566246774, + "grad_norm": 0.611785888671875, + "learning_rate": 6.705346294046173e-05, + "loss": 1.1372, + "step": 5433 + }, + { + "epoch": 1.649415692821369, + "grad_norm": 0.47946327924728394, + "learning_rate": 6.704738760631835e-05, + "loss": 1.6548, + "step": 5434 + }, + { + "epoch": 1.6497192290180605, + "grad_norm": 0.5408942699432373, + "learning_rate": 6.704131227217497e-05, + "loss": 1.7279, + "step": 5435 + }, + { + "epoch": 1.6500227652147519, + "grad_norm": 0.5369447469711304, + "learning_rate": 6.70352369380316e-05, + "loss": 1.7837, + "step": 5436 + }, + { + "epoch": 1.6503263014114433, + "grad_norm": 0.5782811641693115, + "learning_rate": 6.702916160388821e-05, + "loss": 1.4272, + "step": 5437 + }, + { + "epoch": 1.6506298376081348, + "grad_norm": 0.6250472068786621, + "learning_rate": 6.702308626974485e-05, + "loss": 1.6532, + "step": 5438 + }, + { + "epoch": 1.6509333738048262, + "grad_norm": 0.5835246443748474, + "learning_rate": 6.701701093560146e-05, + "loss": 1.6724, + "step": 5439 + }, + { + "epoch": 1.6512369100015176, + "grad_norm": 0.7966321110725403, + "learning_rate": 6.701093560145808e-05, + "loss": 1.4718, + "step": 5440 + }, + { + "epoch": 1.6515404461982093, + "grad_norm": 0.5973135232925415, + "learning_rate": 6.700486026731471e-05, + "loss": 1.7702, + "step": 5441 + }, + { + "epoch": 1.6518439823949005, + "grad_norm": 0.5492159724235535, + "learning_rate": 6.699878493317133e-05, + "loss": 1.775, + "step": 5442 + }, + { + "epoch": 1.6521475185915921, + "grad_norm": 0.5329028367996216, + "learning_rate": 6.699270959902795e-05, + "loss": 1.4301, + "step": 5443 + }, + { + "epoch": 1.6524510547882834, + "grad_norm": 0.5012845993041992, + "learning_rate": 6.698663426488458e-05, + "loss": 1.9806, + "step": 5444 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.5947259068489075, + "learning_rate": 6.69805589307412e-05, + "loss": 1.844, + "step": 5445 + }, + { + "epoch": 1.6530581271816664, + "grad_norm": 0.7111178040504456, + "learning_rate": 6.697448359659781e-05, + "loss": 1.3966, + "step": 5446 + }, + { + "epoch": 1.6533616633783579, + "grad_norm": 0.5320685505867004, + "learning_rate": 6.696840826245444e-05, + "loss": 2.0219, + "step": 5447 + }, + { + "epoch": 1.6536651995750493, + "grad_norm": 0.5629007816314697, + "learning_rate": 6.696233292831106e-05, + "loss": 1.4143, + "step": 5448 + }, + { + "epoch": 1.6539687357717408, + "grad_norm": 0.616475522518158, + "learning_rate": 6.695625759416768e-05, + "loss": 1.4303, + "step": 5449 + }, + { + "epoch": 1.6542722719684322, + "grad_norm": 0.61955326795578, + "learning_rate": 6.695018226002431e-05, + "loss": 1.4353, + "step": 5450 + }, + { + "epoch": 1.6545758081651236, + "grad_norm": 0.5788480639457703, + "learning_rate": 6.694410692588092e-05, + "loss": 1.1104, + "step": 5451 + }, + { + "epoch": 1.6548793443618153, + "grad_norm": 0.6140144467353821, + "learning_rate": 6.693803159173754e-05, + "loss": 1.245, + "step": 5452 + }, + { + "epoch": 1.6551828805585065, + "grad_norm": 0.5252561569213867, + "learning_rate": 6.693195625759417e-05, + "loss": 1.6649, + "step": 5453 + }, + { + "epoch": 1.6554864167551981, + "grad_norm": 0.6164594888687134, + "learning_rate": 6.692588092345079e-05, + "loss": 1.8658, + "step": 5454 + }, + { + "epoch": 1.6557899529518894, + "grad_norm": 0.6214210987091064, + "learning_rate": 6.691980558930742e-05, + "loss": 1.7975, + "step": 5455 + }, + { + "epoch": 1.656093489148581, + "grad_norm": 0.6027306914329529, + "learning_rate": 6.691373025516404e-05, + "loss": 1.733, + "step": 5456 + }, + { + "epoch": 1.6563970253452724, + "grad_norm": 0.5661727786064148, + "learning_rate": 6.690765492102066e-05, + "loss": 1.1875, + "step": 5457 + }, + { + "epoch": 1.6567005615419639, + "grad_norm": 0.506899356842041, + "learning_rate": 6.690157958687729e-05, + "loss": 1.8503, + "step": 5458 + }, + { + "epoch": 1.6570040977386553, + "grad_norm": 0.3860659897327423, + "learning_rate": 6.68955042527339e-05, + "loss": 1.5618, + "step": 5459 + }, + { + "epoch": 1.6573076339353467, + "grad_norm": 0.36981797218322754, + "learning_rate": 6.688942891859052e-05, + "loss": 1.8145, + "step": 5460 + }, + { + "epoch": 1.6576111701320384, + "grad_norm": 0.5027870535850525, + "learning_rate": 6.688335358444715e-05, + "loss": 1.7278, + "step": 5461 + }, + { + "epoch": 1.6579147063287296, + "grad_norm": 0.4753192663192749, + "learning_rate": 6.687727825030377e-05, + "loss": 1.9114, + "step": 5462 + }, + { + "epoch": 1.6582182425254213, + "grad_norm": 0.481952041387558, + "learning_rate": 6.687120291616039e-05, + "loss": 1.8791, + "step": 5463 + }, + { + "epoch": 1.6585217787221125, + "grad_norm": 0.549480140209198, + "learning_rate": 6.686512758201702e-05, + "loss": 1.2981, + "step": 5464 + }, + { + "epoch": 1.6588253149188041, + "grad_norm": 0.49252748489379883, + "learning_rate": 6.685905224787363e-05, + "loss": 1.9547, + "step": 5465 + }, + { + "epoch": 1.6591288511154956, + "grad_norm": 0.5749268531799316, + "learning_rate": 6.685297691373025e-05, + "loss": 2.0041, + "step": 5466 + }, + { + "epoch": 1.659432387312187, + "grad_norm": 0.6280375123023987, + "learning_rate": 6.684690157958688e-05, + "loss": 1.4371, + "step": 5467 + }, + { + "epoch": 1.6597359235088784, + "grad_norm": 0.6023044586181641, + "learning_rate": 6.68408262454435e-05, + "loss": 1.5359, + "step": 5468 + }, + { + "epoch": 1.6600394597055699, + "grad_norm": 0.5130484700202942, + "learning_rate": 6.683475091130013e-05, + "loss": 1.7051, + "step": 5469 + }, + { + "epoch": 1.6603429959022613, + "grad_norm": 0.9089249968528748, + "learning_rate": 6.682867557715675e-05, + "loss": 1.5448, + "step": 5470 + }, + { + "epoch": 1.6606465320989527, + "grad_norm": 0.5765674114227295, + "learning_rate": 6.682260024301337e-05, + "loss": 1.5053, + "step": 5471 + }, + { + "epoch": 1.6609500682956444, + "grad_norm": 0.9535808563232422, + "learning_rate": 6.681652490887e-05, + "loss": 1.4057, + "step": 5472 + }, + { + "epoch": 1.6612536044923356, + "grad_norm": 0.6133729219436646, + "learning_rate": 6.681044957472661e-05, + "loss": 1.1964, + "step": 5473 + }, + { + "epoch": 1.6615571406890273, + "grad_norm": 0.5631382465362549, + "learning_rate": 6.680437424058323e-05, + "loss": 1.6014, + "step": 5474 + }, + { + "epoch": 1.6618606768857185, + "grad_norm": 0.5070339441299438, + "learning_rate": 6.679829890643986e-05, + "loss": 1.2667, + "step": 5475 + }, + { + "epoch": 1.6621642130824101, + "grad_norm": 0.564780592918396, + "learning_rate": 6.679222357229648e-05, + "loss": 1.6802, + "step": 5476 + }, + { + "epoch": 1.6624677492791016, + "grad_norm": 0.5945841073989868, + "learning_rate": 6.67861482381531e-05, + "loss": 1.5995, + "step": 5477 + }, + { + "epoch": 1.662771285475793, + "grad_norm": 0.5960447788238525, + "learning_rate": 6.678007290400973e-05, + "loss": 1.4147, + "step": 5478 + }, + { + "epoch": 1.6630748216724844, + "grad_norm": 0.9950317144393921, + "learning_rate": 6.677399756986634e-05, + "loss": 1.2725, + "step": 5479 + }, + { + "epoch": 1.6633783578691759, + "grad_norm": 0.5889421105384827, + "learning_rate": 6.676792223572296e-05, + "loss": 1.5387, + "step": 5480 + }, + { + "epoch": 1.6636818940658673, + "grad_norm": 0.5153008103370667, + "learning_rate": 6.676184690157959e-05, + "loss": 1.7412, + "step": 5481 + }, + { + "epoch": 1.6639854302625587, + "grad_norm": 0.4582356810569763, + "learning_rate": 6.675577156743621e-05, + "loss": 1.8757, + "step": 5482 + }, + { + "epoch": 1.6642889664592504, + "grad_norm": 0.4646925926208496, + "learning_rate": 6.674969623329284e-05, + "loss": 1.8008, + "step": 5483 + }, + { + "epoch": 1.6645925026559416, + "grad_norm": 0.5988579988479614, + "learning_rate": 6.674362089914946e-05, + "loss": 1.2828, + "step": 5484 + }, + { + "epoch": 1.6648960388526333, + "grad_norm": 0.49637970328330994, + "learning_rate": 6.673754556500608e-05, + "loss": 1.6402, + "step": 5485 + }, + { + "epoch": 1.6651995750493245, + "grad_norm": 0.5080837607383728, + "learning_rate": 6.67314702308627e-05, + "loss": 1.7121, + "step": 5486 + }, + { + "epoch": 1.6655031112460161, + "grad_norm": 0.40366098284721375, + "learning_rate": 6.672539489671932e-05, + "loss": 1.6499, + "step": 5487 + }, + { + "epoch": 1.6658066474427076, + "grad_norm": 0.4660029411315918, + "learning_rate": 6.671931956257594e-05, + "loss": 1.7143, + "step": 5488 + }, + { + "epoch": 1.666110183639399, + "grad_norm": 0.34281787276268005, + "learning_rate": 6.671324422843257e-05, + "loss": 1.2508, + "step": 5489 + }, + { + "epoch": 1.6664137198360904, + "grad_norm": 0.4135943055152893, + "learning_rate": 6.670716889428919e-05, + "loss": 1.0782, + "step": 5490 + }, + { + "epoch": 1.6667172560327819, + "grad_norm": 0.5196151733398438, + "learning_rate": 6.670109356014581e-05, + "loss": 1.3375, + "step": 5491 + }, + { + "epoch": 1.6670207922294735, + "grad_norm": 0.4847051799297333, + "learning_rate": 6.669501822600244e-05, + "loss": 1.6875, + "step": 5492 + }, + { + "epoch": 1.6673243284261647, + "grad_norm": 0.46132588386535645, + "learning_rate": 6.668894289185906e-05, + "loss": 1.6544, + "step": 5493 + }, + { + "epoch": 1.6676278646228564, + "grad_norm": 0.5738229751586914, + "learning_rate": 6.668286755771567e-05, + "loss": 2.0053, + "step": 5494 + }, + { + "epoch": 1.6679314008195476, + "grad_norm": 0.5141200423240662, + "learning_rate": 6.66767922235723e-05, + "loss": 1.4922, + "step": 5495 + }, + { + "epoch": 1.6682349370162393, + "grad_norm": 0.48200786113739014, + "learning_rate": 6.667071688942892e-05, + "loss": 1.9125, + "step": 5496 + }, + { + "epoch": 1.6685384732129307, + "grad_norm": 0.5733198523521423, + "learning_rate": 6.666464155528555e-05, + "loss": 2.0049, + "step": 5497 + }, + { + "epoch": 1.6688420094096221, + "grad_norm": 0.512097954750061, + "learning_rate": 6.665856622114217e-05, + "loss": 1.7827, + "step": 5498 + }, + { + "epoch": 1.6691455456063136, + "grad_norm": 0.5178970098495483, + "learning_rate": 6.665249088699879e-05, + "loss": 1.8091, + "step": 5499 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 2.328645944595337, + "learning_rate": 6.664641555285542e-05, + "loss": 1.8017, + "step": 5500 + }, + { + "epoch": 1.6697526179996964, + "grad_norm": 0.5665689706802368, + "learning_rate": 6.664034021871203e-05, + "loss": 1.843, + "step": 5501 + }, + { + "epoch": 1.6700561541963879, + "grad_norm": 0.5300931930541992, + "learning_rate": 6.663426488456865e-05, + "loss": 1.3377, + "step": 5502 + }, + { + "epoch": 1.6703596903930795, + "grad_norm": 0.6273548603057861, + "learning_rate": 6.662818955042528e-05, + "loss": 1.9491, + "step": 5503 + }, + { + "epoch": 1.6706632265897707, + "grad_norm": 0.5122102499008179, + "learning_rate": 6.66221142162819e-05, + "loss": 1.8161, + "step": 5504 + }, + { + "epoch": 1.6709667627864624, + "grad_norm": 0.45519566535949707, + "learning_rate": 6.661603888213852e-05, + "loss": 1.7292, + "step": 5505 + }, + { + "epoch": 1.6712702989831536, + "grad_norm": 0.6773088574409485, + "learning_rate": 6.660996354799515e-05, + "loss": 1.2953, + "step": 5506 + }, + { + "epoch": 1.6715738351798453, + "grad_norm": 1.0610431432724, + "learning_rate": 6.660388821385177e-05, + "loss": 1.3204, + "step": 5507 + }, + { + "epoch": 1.6718773713765367, + "grad_norm": 0.6220400929450989, + "learning_rate": 6.659781287970838e-05, + "loss": 1.7026, + "step": 5508 + }, + { + "epoch": 1.6721809075732281, + "grad_norm": 0.5171409249305725, + "learning_rate": 6.659173754556501e-05, + "loss": 1.5276, + "step": 5509 + }, + { + "epoch": 1.6724844437699196, + "grad_norm": 1.0584473609924316, + "learning_rate": 6.658566221142163e-05, + "loss": 1.2664, + "step": 5510 + }, + { + "epoch": 1.672787979966611, + "grad_norm": 0.5404136776924133, + "learning_rate": 6.657958687727826e-05, + "loss": 1.3878, + "step": 5511 + }, + { + "epoch": 1.6730915161633024, + "grad_norm": 0.5437831282615662, + "learning_rate": 6.657351154313488e-05, + "loss": 1.6651, + "step": 5512 + }, + { + "epoch": 1.6733950523599939, + "grad_norm": 0.5370603799819946, + "learning_rate": 6.65674362089915e-05, + "loss": 1.7805, + "step": 5513 + }, + { + "epoch": 1.6736985885566855, + "grad_norm": 0.5555904507637024, + "learning_rate": 6.656136087484813e-05, + "loss": 1.6038, + "step": 5514 + }, + { + "epoch": 1.6740021247533767, + "grad_norm": 0.5612810850143433, + "learning_rate": 6.655528554070473e-05, + "loss": 1.3638, + "step": 5515 + }, + { + "epoch": 1.6743056609500684, + "grad_norm": 0.4679469168186188, + "learning_rate": 6.654921020656136e-05, + "loss": 1.201, + "step": 5516 + }, + { + "epoch": 1.6746091971467596, + "grad_norm": 0.5470007061958313, + "learning_rate": 6.654313487241799e-05, + "loss": 1.8458, + "step": 5517 + }, + { + "epoch": 1.6749127333434513, + "grad_norm": 0.5731515288352966, + "learning_rate": 6.653705953827461e-05, + "loss": 1.5961, + "step": 5518 + }, + { + "epoch": 1.6752162695401427, + "grad_norm": 0.44922706484794617, + "learning_rate": 6.653098420413123e-05, + "loss": 2.1234, + "step": 5519 + }, + { + "epoch": 1.6755198057368341, + "grad_norm": 0.4658230245113373, + "learning_rate": 6.652490886998786e-05, + "loss": 1.5829, + "step": 5520 + }, + { + "epoch": 1.6758233419335256, + "grad_norm": 0.4976028501987457, + "learning_rate": 6.651883353584448e-05, + "loss": 1.5513, + "step": 5521 + }, + { + "epoch": 1.676126878130217, + "grad_norm": 0.5739427804946899, + "learning_rate": 6.651275820170109e-05, + "loss": 1.514, + "step": 5522 + }, + { + "epoch": 1.6764304143269086, + "grad_norm": 0.5371562242507935, + "learning_rate": 6.650668286755772e-05, + "loss": 1.6313, + "step": 5523 + }, + { + "epoch": 1.6767339505235999, + "grad_norm": 0.8080946207046509, + "learning_rate": 6.650060753341434e-05, + "loss": 1.4731, + "step": 5524 + }, + { + "epoch": 1.6770374867202915, + "grad_norm": 0.5355061888694763, + "learning_rate": 6.649453219927097e-05, + "loss": 1.7795, + "step": 5525 + }, + { + "epoch": 1.6773410229169827, + "grad_norm": 0.529988706111908, + "learning_rate": 6.648845686512758e-05, + "loss": 1.7496, + "step": 5526 + }, + { + "epoch": 1.6776445591136744, + "grad_norm": 0.46063679456710815, + "learning_rate": 6.64823815309842e-05, + "loss": 1.9183, + "step": 5527 + }, + { + "epoch": 1.6779480953103656, + "grad_norm": 0.4897077977657318, + "learning_rate": 6.647630619684084e-05, + "loss": 1.4162, + "step": 5528 + }, + { + "epoch": 1.6782516315070573, + "grad_norm": 0.47567248344421387, + "learning_rate": 6.647023086269744e-05, + "loss": 1.2476, + "step": 5529 + }, + { + "epoch": 1.6785551677037487, + "grad_norm": 0.5894106030464172, + "learning_rate": 6.646415552855407e-05, + "loss": 1.8962, + "step": 5530 + }, + { + "epoch": 1.6788587039004401, + "grad_norm": 0.5598317980766296, + "learning_rate": 6.64580801944107e-05, + "loss": 1.9378, + "step": 5531 + }, + { + "epoch": 1.6791622400971316, + "grad_norm": 0.46344462037086487, + "learning_rate": 6.645200486026732e-05, + "loss": 1.3396, + "step": 5532 + }, + { + "epoch": 1.679465776293823, + "grad_norm": 0.5595990419387817, + "learning_rate": 6.644592952612394e-05, + "loss": 1.435, + "step": 5533 + }, + { + "epoch": 1.6797693124905146, + "grad_norm": 0.570959746837616, + "learning_rate": 6.643985419198057e-05, + "loss": 1.853, + "step": 5534 + }, + { + "epoch": 1.6800728486872059, + "grad_norm": 0.5187576413154602, + "learning_rate": 6.643377885783719e-05, + "loss": 1.7441, + "step": 5535 + }, + { + "epoch": 1.6803763848838975, + "grad_norm": 0.5761917233467102, + "learning_rate": 6.64277035236938e-05, + "loss": 1.7951, + "step": 5536 + }, + { + "epoch": 1.6806799210805887, + "grad_norm": 0.7706677913665771, + "learning_rate": 6.642162818955043e-05, + "loss": 1.5578, + "step": 5537 + }, + { + "epoch": 1.6809834572772804, + "grad_norm": 0.49452951550483704, + "learning_rate": 6.641555285540705e-05, + "loss": 1.8111, + "step": 5538 + }, + { + "epoch": 1.6812869934739718, + "grad_norm": 0.8533628582954407, + "learning_rate": 6.640947752126367e-05, + "loss": 1.9594, + "step": 5539 + }, + { + "epoch": 1.6815905296706632, + "grad_norm": 0.5128864049911499, + "learning_rate": 6.640340218712029e-05, + "loss": 1.8131, + "step": 5540 + }, + { + "epoch": 1.6818940658673547, + "grad_norm": 0.5614345669746399, + "learning_rate": 6.639732685297692e-05, + "loss": 1.6136, + "step": 5541 + }, + { + "epoch": 1.6821976020640461, + "grad_norm": 0.5893770456314087, + "learning_rate": 6.639125151883355e-05, + "loss": 1.8654, + "step": 5542 + }, + { + "epoch": 1.6825011382607375, + "grad_norm": 0.49633264541625977, + "learning_rate": 6.638517618469015e-05, + "loss": 1.6421, + "step": 5543 + }, + { + "epoch": 1.682804674457429, + "grad_norm": 0.5814844965934753, + "learning_rate": 6.637910085054678e-05, + "loss": 1.5601, + "step": 5544 + }, + { + "epoch": 1.6831082106541206, + "grad_norm": 0.544435441493988, + "learning_rate": 6.637302551640341e-05, + "loss": 1.7607, + "step": 5545 + }, + { + "epoch": 1.6834117468508119, + "grad_norm": 0.5191029906272888, + "learning_rate": 6.636695018226003e-05, + "loss": 1.7743, + "step": 5546 + }, + { + "epoch": 1.6837152830475035, + "grad_norm": 0.588700532913208, + "learning_rate": 6.636087484811665e-05, + "loss": 1.4228, + "step": 5547 + }, + { + "epoch": 1.6840188192441947, + "grad_norm": 0.5407133102416992, + "learning_rate": 6.635479951397328e-05, + "loss": 1.6116, + "step": 5548 + }, + { + "epoch": 1.6843223554408864, + "grad_norm": 0.5809290409088135, + "learning_rate": 6.63487241798299e-05, + "loss": 1.4475, + "step": 5549 + }, + { + "epoch": 1.6846258916375778, + "grad_norm": 0.5454392433166504, + "learning_rate": 6.634264884568651e-05, + "loss": 1.6486, + "step": 5550 + }, + { + "epoch": 1.6849294278342692, + "grad_norm": 0.5466883182525635, + "learning_rate": 6.633657351154314e-05, + "loss": 1.7363, + "step": 5551 + }, + { + "epoch": 1.6852329640309607, + "grad_norm": 0.6080778241157532, + "learning_rate": 6.633049817739976e-05, + "loss": 1.9441, + "step": 5552 + }, + { + "epoch": 1.6855365002276521, + "grad_norm": 0.5893242955207825, + "learning_rate": 6.632442284325638e-05, + "loss": 1.7365, + "step": 5553 + }, + { + "epoch": 1.6858400364243438, + "grad_norm": 0.5920213460922241, + "learning_rate": 6.6318347509113e-05, + "loss": 1.0138, + "step": 5554 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.5826712846755981, + "learning_rate": 6.631227217496963e-05, + "loss": 1.2456, + "step": 5555 + }, + { + "epoch": 1.6864471088177266, + "grad_norm": 0.6475664377212524, + "learning_rate": 6.630619684082626e-05, + "loss": 1.8795, + "step": 5556 + }, + { + "epoch": 1.6867506450144178, + "grad_norm": 0.4567904770374298, + "learning_rate": 6.630012150668286e-05, + "loss": 1.8851, + "step": 5557 + }, + { + "epoch": 1.6870541812111095, + "grad_norm": 0.5504480004310608, + "learning_rate": 6.629404617253949e-05, + "loss": 1.093, + "step": 5558 + }, + { + "epoch": 1.6873577174078007, + "grad_norm": 0.44067057967185974, + "learning_rate": 6.628797083839612e-05, + "loss": 0.9901, + "step": 5559 + }, + { + "epoch": 1.6876612536044924, + "grad_norm": 0.5783705711364746, + "learning_rate": 6.628189550425274e-05, + "loss": 1.417, + "step": 5560 + }, + { + "epoch": 1.6879647898011838, + "grad_norm": 0.5694515109062195, + "learning_rate": 6.627582017010936e-05, + "loss": 1.7743, + "step": 5561 + }, + { + "epoch": 1.6882683259978752, + "grad_norm": 0.7065865993499756, + "learning_rate": 6.626974483596599e-05, + "loss": 1.779, + "step": 5562 + }, + { + "epoch": 1.6885718621945667, + "grad_norm": 1.2169636487960815, + "learning_rate": 6.62636695018226e-05, + "loss": 1.4069, + "step": 5563 + }, + { + "epoch": 1.688875398391258, + "grad_norm": 0.565172553062439, + "learning_rate": 6.625759416767922e-05, + "loss": 2.1258, + "step": 5564 + }, + { + "epoch": 1.6891789345879498, + "grad_norm": 0.479299396276474, + "learning_rate": 6.625151883353585e-05, + "loss": 1.4836, + "step": 5565 + }, + { + "epoch": 1.689482470784641, + "grad_norm": 0.4096144735813141, + "learning_rate": 6.624544349939247e-05, + "loss": 1.3714, + "step": 5566 + }, + { + "epoch": 1.6897860069813326, + "grad_norm": 0.4696303606033325, + "learning_rate": 6.623936816524909e-05, + "loss": 2.2721, + "step": 5567 + }, + { + "epoch": 1.6900895431780238, + "grad_norm": 0.6039106249809265, + "learning_rate": 6.62332928311057e-05, + "loss": 1.7076, + "step": 5568 + }, + { + "epoch": 1.6903930793747155, + "grad_norm": 0.5275202393531799, + "learning_rate": 6.622721749696234e-05, + "loss": 1.3444, + "step": 5569 + }, + { + "epoch": 1.690696615571407, + "grad_norm": 0.49647027254104614, + "learning_rate": 6.622114216281897e-05, + "loss": 1.1222, + "step": 5570 + }, + { + "epoch": 1.6910001517680984, + "grad_norm": 0.5454918742179871, + "learning_rate": 6.621506682867557e-05, + "loss": 1.3901, + "step": 5571 + }, + { + "epoch": 1.6913036879647898, + "grad_norm": 0.5135229825973511, + "learning_rate": 6.62089914945322e-05, + "loss": 1.561, + "step": 5572 + }, + { + "epoch": 1.6916072241614812, + "grad_norm": 0.4712585210800171, + "learning_rate": 6.620291616038883e-05, + "loss": 1.7875, + "step": 5573 + }, + { + "epoch": 1.6919107603581727, + "grad_norm": 0.5299578905105591, + "learning_rate": 6.619684082624545e-05, + "loss": 1.6143, + "step": 5574 + }, + { + "epoch": 1.692214296554864, + "grad_norm": 0.4971330463886261, + "learning_rate": 6.619076549210207e-05, + "loss": 1.8084, + "step": 5575 + }, + { + "epoch": 1.6925178327515558, + "grad_norm": 0.5195701122283936, + "learning_rate": 6.61846901579587e-05, + "loss": 1.8311, + "step": 5576 + }, + { + "epoch": 1.692821368948247, + "grad_norm": 0.434212327003479, + "learning_rate": 6.617861482381532e-05, + "loss": 1.0112, + "step": 5577 + }, + { + "epoch": 1.6931249051449386, + "grad_norm": 0.5618522763252258, + "learning_rate": 6.617253948967193e-05, + "loss": 1.6371, + "step": 5578 + }, + { + "epoch": 1.6934284413416298, + "grad_norm": 0.5451750159263611, + "learning_rate": 6.616646415552856e-05, + "loss": 1.8356, + "step": 5579 + }, + { + "epoch": 1.6937319775383215, + "grad_norm": 0.5345383286476135, + "learning_rate": 6.616038882138518e-05, + "loss": 1.5302, + "step": 5580 + }, + { + "epoch": 1.694035513735013, + "grad_norm": 0.5338907241821289, + "learning_rate": 6.61543134872418e-05, + "loss": 1.2822, + "step": 5581 + }, + { + "epoch": 1.6943390499317044, + "grad_norm": 0.47024282813072205, + "learning_rate": 6.614823815309842e-05, + "loss": 1.6541, + "step": 5582 + }, + { + "epoch": 1.6946425861283958, + "grad_norm": 0.5072962641716003, + "learning_rate": 6.614216281895505e-05, + "loss": 1.7995, + "step": 5583 + }, + { + "epoch": 1.6949461223250872, + "grad_norm": 0.5675525665283203, + "learning_rate": 6.613608748481168e-05, + "loss": 1.6175, + "step": 5584 + }, + { + "epoch": 1.6952496585217787, + "grad_norm": 0.5576388239860535, + "learning_rate": 6.613001215066828e-05, + "loss": 1.6772, + "step": 5585 + }, + { + "epoch": 1.69555319471847, + "grad_norm": 0.5666404366493225, + "learning_rate": 6.612393681652491e-05, + "loss": 1.4908, + "step": 5586 + }, + { + "epoch": 1.6958567309151618, + "grad_norm": 0.6059871315956116, + "learning_rate": 6.611786148238154e-05, + "loss": 1.5443, + "step": 5587 + }, + { + "epoch": 1.696160267111853, + "grad_norm": 0.4992029368877411, + "learning_rate": 6.611178614823815e-05, + "loss": 1.7427, + "step": 5588 + }, + { + "epoch": 1.6964638033085446, + "grad_norm": 0.4931040406227112, + "learning_rate": 6.610571081409478e-05, + "loss": 1.5386, + "step": 5589 + }, + { + "epoch": 1.6967673395052358, + "grad_norm": 0.6507347226142883, + "learning_rate": 6.609963547995141e-05, + "loss": 1.3519, + "step": 5590 + }, + { + "epoch": 1.6970708757019275, + "grad_norm": 0.5607397556304932, + "learning_rate": 6.609356014580803e-05, + "loss": 1.6292, + "step": 5591 + }, + { + "epoch": 1.697374411898619, + "grad_norm": 0.5708796977996826, + "learning_rate": 6.608748481166464e-05, + "loss": 1.671, + "step": 5592 + }, + { + "epoch": 1.6976779480953104, + "grad_norm": 0.4833160936832428, + "learning_rate": 6.608140947752127e-05, + "loss": 1.8766, + "step": 5593 + }, + { + "epoch": 1.6979814842920018, + "grad_norm": 0.6059253811836243, + "learning_rate": 6.607533414337789e-05, + "loss": 1.4497, + "step": 5594 + }, + { + "epoch": 1.6982850204886932, + "grad_norm": 0.5514014959335327, + "learning_rate": 6.606925880923451e-05, + "loss": 1.268, + "step": 5595 + }, + { + "epoch": 1.6985885566853849, + "grad_norm": 0.7988373041152954, + "learning_rate": 6.606318347509113e-05, + "loss": 1.4431, + "step": 5596 + }, + { + "epoch": 1.698892092882076, + "grad_norm": 0.5660038590431213, + "learning_rate": 6.605710814094776e-05, + "loss": 1.8063, + "step": 5597 + }, + { + "epoch": 1.6991956290787678, + "grad_norm": 0.5148012042045593, + "learning_rate": 6.605103280680439e-05, + "loss": 1.969, + "step": 5598 + }, + { + "epoch": 1.699499165275459, + "grad_norm": 0.5059932470321655, + "learning_rate": 6.604495747266099e-05, + "loss": 2.0285, + "step": 5599 + }, + { + "epoch": 1.6998027014721506, + "grad_norm": 0.6167613863945007, + "learning_rate": 6.603888213851762e-05, + "loss": 1.5008, + "step": 5600 + }, + { + "epoch": 1.700106237668842, + "grad_norm": 0.6057916283607483, + "learning_rate": 6.603280680437425e-05, + "loss": 1.8194, + "step": 5601 + }, + { + "epoch": 1.7004097738655335, + "grad_norm": 0.590444803237915, + "learning_rate": 6.602673147023086e-05, + "loss": 1.5952, + "step": 5602 + }, + { + "epoch": 1.700713310062225, + "grad_norm": 0.5104463696479797, + "learning_rate": 6.602065613608749e-05, + "loss": 1.9437, + "step": 5603 + }, + { + "epoch": 1.7010168462589164, + "grad_norm": 0.6614755392074585, + "learning_rate": 6.601458080194412e-05, + "loss": 1.7985, + "step": 5604 + }, + { + "epoch": 1.7013203824556078, + "grad_norm": 0.5348273515701294, + "learning_rate": 6.600850546780074e-05, + "loss": 1.81, + "step": 5605 + }, + { + "epoch": 1.7016239186522992, + "grad_norm": 0.7011120915412903, + "learning_rate": 6.600243013365735e-05, + "loss": 1.3921, + "step": 5606 + }, + { + "epoch": 1.7019274548489909, + "grad_norm": 0.5143483281135559, + "learning_rate": 6.599635479951398e-05, + "loss": 1.3965, + "step": 5607 + }, + { + "epoch": 1.702230991045682, + "grad_norm": 0.6444684267044067, + "learning_rate": 6.59902794653706e-05, + "loss": 1.6044, + "step": 5608 + }, + { + "epoch": 1.7025345272423738, + "grad_norm": 0.5477455854415894, + "learning_rate": 6.598420413122722e-05, + "loss": 1.4496, + "step": 5609 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.4528508484363556, + "learning_rate": 6.597812879708384e-05, + "loss": 1.5698, + "step": 5610 + }, + { + "epoch": 1.7031415996357566, + "grad_norm": 0.8523666262626648, + "learning_rate": 6.597205346294047e-05, + "loss": 1.7668, + "step": 5611 + }, + { + "epoch": 1.703445135832448, + "grad_norm": 0.45186835527420044, + "learning_rate": 6.596597812879708e-05, + "loss": 1.3897, + "step": 5612 + }, + { + "epoch": 1.7037486720291395, + "grad_norm": 0.47669535875320435, + "learning_rate": 6.59599027946537e-05, + "loss": 1.8281, + "step": 5613 + }, + { + "epoch": 1.704052208225831, + "grad_norm": 0.4351363778114319, + "learning_rate": 6.595382746051033e-05, + "loss": 1.3241, + "step": 5614 + }, + { + "epoch": 1.7043557444225224, + "grad_norm": 0.508640468120575, + "learning_rate": 6.594775212636696e-05, + "loss": 2.0058, + "step": 5615 + }, + { + "epoch": 1.7046592806192138, + "grad_norm": 0.615515947341919, + "learning_rate": 6.594167679222357e-05, + "loss": 1.352, + "step": 5616 + }, + { + "epoch": 1.7049628168159052, + "grad_norm": 0.6008703708648682, + "learning_rate": 6.59356014580802e-05, + "loss": 1.5652, + "step": 5617 + }, + { + "epoch": 1.7052663530125969, + "grad_norm": 0.5095522403717041, + "learning_rate": 6.592952612393683e-05, + "loss": 1.7274, + "step": 5618 + }, + { + "epoch": 1.705569889209288, + "grad_norm": 0.5482503771781921, + "learning_rate": 6.592345078979345e-05, + "loss": 1.8572, + "step": 5619 + }, + { + "epoch": 1.7058734254059797, + "grad_norm": 0.601202666759491, + "learning_rate": 6.591737545565006e-05, + "loss": 1.5425, + "step": 5620 + }, + { + "epoch": 1.706176961602671, + "grad_norm": 0.5597031116485596, + "learning_rate": 6.591130012150668e-05, + "loss": 1.7216, + "step": 5621 + }, + { + "epoch": 1.7064804977993626, + "grad_norm": 0.6926930546760559, + "learning_rate": 6.590522478736331e-05, + "loss": 1.7835, + "step": 5622 + }, + { + "epoch": 1.706784033996054, + "grad_norm": 0.5762909054756165, + "learning_rate": 6.589914945321993e-05, + "loss": 1.7096, + "step": 5623 + }, + { + "epoch": 1.7070875701927455, + "grad_norm": 0.43438559770584106, + "learning_rate": 6.589307411907655e-05, + "loss": 1.6576, + "step": 5624 + }, + { + "epoch": 1.707391106389437, + "grad_norm": 0.577350378036499, + "learning_rate": 6.588699878493318e-05, + "loss": 1.3354, + "step": 5625 + }, + { + "epoch": 1.7076946425861284, + "grad_norm": 0.7164928913116455, + "learning_rate": 6.58809234507898e-05, + "loss": 1.7567, + "step": 5626 + }, + { + "epoch": 1.70799817878282, + "grad_norm": 0.9256330132484436, + "learning_rate": 6.587484811664641e-05, + "loss": 1.5759, + "step": 5627 + }, + { + "epoch": 1.7083017149795112, + "grad_norm": 0.45018166303634644, + "learning_rate": 6.586877278250304e-05, + "loss": 1.7099, + "step": 5628 + }, + { + "epoch": 1.7086052511762029, + "grad_norm": 0.5842235088348389, + "learning_rate": 6.586269744835967e-05, + "loss": 1.927, + "step": 5629 + }, + { + "epoch": 1.708908787372894, + "grad_norm": 0.5370432138442993, + "learning_rate": 6.585662211421628e-05, + "loss": 1.4116, + "step": 5630 + }, + { + "epoch": 1.7092123235695857, + "grad_norm": 0.5643729567527771, + "learning_rate": 6.585054678007291e-05, + "loss": 1.5234, + "step": 5631 + }, + { + "epoch": 1.7095158597662772, + "grad_norm": 0.5104454755783081, + "learning_rate": 6.584447144592954e-05, + "loss": 1.6687, + "step": 5632 + }, + { + "epoch": 1.7098193959629686, + "grad_norm": 0.5528333783149719, + "learning_rate": 6.583839611178616e-05, + "loss": 1.7628, + "step": 5633 + }, + { + "epoch": 1.71012293215966, + "grad_norm": 0.4312419891357422, + "learning_rate": 6.583232077764277e-05, + "loss": 1.3351, + "step": 5634 + }, + { + "epoch": 1.7104264683563515, + "grad_norm": 0.5308565497398376, + "learning_rate": 6.582624544349939e-05, + "loss": 1.3432, + "step": 5635 + }, + { + "epoch": 1.710730004553043, + "grad_norm": 0.5155957937240601, + "learning_rate": 6.582017010935602e-05, + "loss": 1.7007, + "step": 5636 + }, + { + "epoch": 1.7110335407497343, + "grad_norm": 0.48766398429870605, + "learning_rate": 6.581409477521264e-05, + "loss": 1.8807, + "step": 5637 + }, + { + "epoch": 1.711337076946426, + "grad_norm": 0.6066737771034241, + "learning_rate": 6.580801944106926e-05, + "loss": 1.705, + "step": 5638 + }, + { + "epoch": 1.7116406131431172, + "grad_norm": 0.38555800914764404, + "learning_rate": 6.580194410692589e-05, + "loss": 1.6603, + "step": 5639 + }, + { + "epoch": 1.7119441493398089, + "grad_norm": 0.5054239630699158, + "learning_rate": 6.57958687727825e-05, + "loss": 1.8099, + "step": 5640 + }, + { + "epoch": 1.7122476855365, + "grad_norm": 2.3474161624908447, + "learning_rate": 6.578979343863912e-05, + "loss": 1.8, + "step": 5641 + }, + { + "epoch": 1.7125512217331917, + "grad_norm": 0.5971972942352295, + "learning_rate": 6.578371810449575e-05, + "loss": 1.7147, + "step": 5642 + }, + { + "epoch": 1.7128547579298832, + "grad_norm": 0.5372626781463623, + "learning_rate": 6.577764277035238e-05, + "loss": 1.7786, + "step": 5643 + }, + { + "epoch": 1.7131582941265746, + "grad_norm": 0.4543991684913635, + "learning_rate": 6.577156743620899e-05, + "loss": 1.4787, + "step": 5644 + }, + { + "epoch": 1.713461830323266, + "grad_norm": 0.5099576115608215, + "learning_rate": 6.576549210206562e-05, + "loss": 1.7248, + "step": 5645 + }, + { + "epoch": 1.7137653665199575, + "grad_norm": 0.5634260177612305, + "learning_rate": 6.575941676792225e-05, + "loss": 1.6313, + "step": 5646 + }, + { + "epoch": 1.714068902716649, + "grad_norm": 0.6225250363349915, + "learning_rate": 6.575334143377887e-05, + "loss": 1.4485, + "step": 5647 + }, + { + "epoch": 1.7143724389133403, + "grad_norm": 0.6028598546981812, + "learning_rate": 6.574726609963548e-05, + "loss": 1.4313, + "step": 5648 + }, + { + "epoch": 1.714675975110032, + "grad_norm": 0.5556081533432007, + "learning_rate": 6.57411907654921e-05, + "loss": 1.7669, + "step": 5649 + }, + { + "epoch": 1.7149795113067232, + "grad_norm": 0.4711391031742096, + "learning_rate": 6.573511543134873e-05, + "loss": 1.1927, + "step": 5650 + }, + { + "epoch": 1.7152830475034149, + "grad_norm": 0.62098228931427, + "learning_rate": 6.572904009720535e-05, + "loss": 1.9925, + "step": 5651 + }, + { + "epoch": 1.715586583700106, + "grad_norm": 0.402541846036911, + "learning_rate": 6.572296476306197e-05, + "loss": 0.7512, + "step": 5652 + }, + { + "epoch": 1.7158901198967977, + "grad_norm": 0.45836901664733887, + "learning_rate": 6.57168894289186e-05, + "loss": 1.435, + "step": 5653 + }, + { + "epoch": 1.7161936560934892, + "grad_norm": 0.48346778750419617, + "learning_rate": 6.571081409477521e-05, + "loss": 1.4181, + "step": 5654 + }, + { + "epoch": 1.7164971922901806, + "grad_norm": 0.6208150386810303, + "learning_rate": 6.570473876063183e-05, + "loss": 1.6864, + "step": 5655 + }, + { + "epoch": 1.716800728486872, + "grad_norm": 0.5359233617782593, + "learning_rate": 6.569866342648846e-05, + "loss": 1.5866, + "step": 5656 + }, + { + "epoch": 1.7171042646835635, + "grad_norm": 0.4424448311328888, + "learning_rate": 6.569258809234509e-05, + "loss": 1.2408, + "step": 5657 + }, + { + "epoch": 1.7174078008802551, + "grad_norm": 0.5332330465316772, + "learning_rate": 6.56865127582017e-05, + "loss": 1.4862, + "step": 5658 + }, + { + "epoch": 1.7177113370769463, + "grad_norm": 0.4810725748538971, + "learning_rate": 6.568043742405833e-05, + "loss": 1.714, + "step": 5659 + }, + { + "epoch": 1.718014873273638, + "grad_norm": 0.5168288946151733, + "learning_rate": 6.567436208991496e-05, + "loss": 1.5932, + "step": 5660 + }, + { + "epoch": 1.7183184094703292, + "grad_norm": 0.49804630875587463, + "learning_rate": 6.566828675577156e-05, + "loss": 1.6901, + "step": 5661 + }, + { + "epoch": 1.7186219456670209, + "grad_norm": 0.5672121047973633, + "learning_rate": 6.566221142162819e-05, + "loss": 1.993, + "step": 5662 + }, + { + "epoch": 1.7189254818637123, + "grad_norm": 0.5449413061141968, + "learning_rate": 6.565613608748481e-05, + "loss": 1.7013, + "step": 5663 + }, + { + "epoch": 1.7192290180604037, + "grad_norm": 0.5823309421539307, + "learning_rate": 6.565006075334144e-05, + "loss": 1.5827, + "step": 5664 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.4752315580844879, + "learning_rate": 6.564398541919806e-05, + "loss": 1.3413, + "step": 5665 + }, + { + "epoch": 1.7198360904537866, + "grad_norm": 0.6719446182250977, + "learning_rate": 6.563791008505468e-05, + "loss": 1.3361, + "step": 5666 + }, + { + "epoch": 1.720139626650478, + "grad_norm": 0.650968074798584, + "learning_rate": 6.56318347509113e-05, + "loss": 1.3424, + "step": 5667 + }, + { + "epoch": 1.7204431628471695, + "grad_norm": 0.668044924736023, + "learning_rate": 6.562575941676792e-05, + "loss": 1.1552, + "step": 5668 + }, + { + "epoch": 1.7207466990438611, + "grad_norm": 0.7418035864830017, + "learning_rate": 6.561968408262454e-05, + "loss": 1.7164, + "step": 5669 + }, + { + "epoch": 1.7210502352405523, + "grad_norm": 0.5805455446243286, + "learning_rate": 6.561360874848117e-05, + "loss": 1.8215, + "step": 5670 + }, + { + "epoch": 1.721353771437244, + "grad_norm": 0.5062075853347778, + "learning_rate": 6.56075334143378e-05, + "loss": 1.818, + "step": 5671 + }, + { + "epoch": 1.7216573076339352, + "grad_norm": 0.5593624114990234, + "learning_rate": 6.56014580801944e-05, + "loss": 1.492, + "step": 5672 + }, + { + "epoch": 1.7219608438306269, + "grad_norm": 0.6025186777114868, + "learning_rate": 6.559538274605104e-05, + "loss": 1.6984, + "step": 5673 + }, + { + "epoch": 1.7222643800273183, + "grad_norm": 0.5048421025276184, + "learning_rate": 6.558930741190767e-05, + "loss": 0.9581, + "step": 5674 + }, + { + "epoch": 1.7225679162240097, + "grad_norm": 0.5279005765914917, + "learning_rate": 6.558323207776427e-05, + "loss": 1.7759, + "step": 5675 + }, + { + "epoch": 1.7228714524207012, + "grad_norm": 0.39026376605033875, + "learning_rate": 6.55771567436209e-05, + "loss": 1.631, + "step": 5676 + }, + { + "epoch": 1.7231749886173926, + "grad_norm": 0.5611656308174133, + "learning_rate": 6.557108140947752e-05, + "loss": 1.7361, + "step": 5677 + }, + { + "epoch": 1.723478524814084, + "grad_norm": 0.5910829305648804, + "learning_rate": 6.556500607533415e-05, + "loss": 1.5679, + "step": 5678 + }, + { + "epoch": 1.7237820610107755, + "grad_norm": 0.6744527220726013, + "learning_rate": 6.555893074119077e-05, + "loss": 1.6144, + "step": 5679 + }, + { + "epoch": 1.7240855972074671, + "grad_norm": 0.5710015892982483, + "learning_rate": 6.555285540704739e-05, + "loss": 1.6817, + "step": 5680 + }, + { + "epoch": 1.7243891334041583, + "grad_norm": 0.5634138584136963, + "learning_rate": 6.554678007290402e-05, + "loss": 1.5614, + "step": 5681 + }, + { + "epoch": 1.72469266960085, + "grad_norm": 0.538596510887146, + "learning_rate": 6.554070473876063e-05, + "loss": 1.6871, + "step": 5682 + }, + { + "epoch": 1.7249962057975412, + "grad_norm": 0.456910103559494, + "learning_rate": 6.553462940461725e-05, + "loss": 1.9021, + "step": 5683 + }, + { + "epoch": 1.7252997419942329, + "grad_norm": 0.5184640288352966, + "learning_rate": 6.552855407047388e-05, + "loss": 1.4493, + "step": 5684 + }, + { + "epoch": 1.7256032781909243, + "grad_norm": 0.4227922558784485, + "learning_rate": 6.55224787363305e-05, + "loss": 1.1346, + "step": 5685 + }, + { + "epoch": 1.7259068143876157, + "grad_norm": 0.5384974479675293, + "learning_rate": 6.551640340218712e-05, + "loss": 1.4499, + "step": 5686 + }, + { + "epoch": 1.7262103505843072, + "grad_norm": 0.5459827184677124, + "learning_rate": 6.551032806804375e-05, + "loss": 1.6721, + "step": 5687 + }, + { + "epoch": 1.7265138867809986, + "grad_norm": 0.48895028233528137, + "learning_rate": 6.550425273390038e-05, + "loss": 1.6447, + "step": 5688 + }, + { + "epoch": 1.7268174229776903, + "grad_norm": 0.4322478175163269, + "learning_rate": 6.549817739975698e-05, + "loss": 1.2844, + "step": 5689 + }, + { + "epoch": 1.7271209591743815, + "grad_norm": 0.6133584380149841, + "learning_rate": 6.549210206561361e-05, + "loss": 1.0293, + "step": 5690 + }, + { + "epoch": 1.7274244953710731, + "grad_norm": 0.4673958420753479, + "learning_rate": 6.548602673147023e-05, + "loss": 2.098, + "step": 5691 + }, + { + "epoch": 1.7277280315677643, + "grad_norm": 0.8196238875389099, + "learning_rate": 6.547995139732686e-05, + "loss": 1.1086, + "step": 5692 + }, + { + "epoch": 1.728031567764456, + "grad_norm": 0.6799973249435425, + "learning_rate": 6.547387606318348e-05, + "loss": 1.8953, + "step": 5693 + }, + { + "epoch": 1.7283351039611472, + "grad_norm": 0.5416783690452576, + "learning_rate": 6.54678007290401e-05, + "loss": 1.633, + "step": 5694 + }, + { + "epoch": 1.7286386401578389, + "grad_norm": 0.574783444404602, + "learning_rate": 6.546172539489673e-05, + "loss": 1.7365, + "step": 5695 + }, + { + "epoch": 1.7289421763545303, + "grad_norm": 0.7082532644271851, + "learning_rate": 6.545565006075334e-05, + "loss": 1.5609, + "step": 5696 + }, + { + "epoch": 1.7292457125512217, + "grad_norm": 0.4839562177658081, + "learning_rate": 6.544957472660996e-05, + "loss": 1.6787, + "step": 5697 + }, + { + "epoch": 1.7295492487479132, + "grad_norm": 0.5359554290771484, + "learning_rate": 6.544349939246659e-05, + "loss": 1.3859, + "step": 5698 + }, + { + "epoch": 1.7298527849446046, + "grad_norm": 0.4709901511669159, + "learning_rate": 6.543742405832321e-05, + "loss": 1.3647, + "step": 5699 + }, + { + "epoch": 1.7301563211412962, + "grad_norm": 0.5647301077842712, + "learning_rate": 6.543134872417983e-05, + "loss": 1.7588, + "step": 5700 + }, + { + "epoch": 1.7304598573379875, + "grad_norm": 0.5082557201385498, + "learning_rate": 6.542527339003646e-05, + "loss": 1.6857, + "step": 5701 + }, + { + "epoch": 1.7307633935346791, + "grad_norm": 0.5972119569778442, + "learning_rate": 6.541919805589307e-05, + "loss": 1.5892, + "step": 5702 + }, + { + "epoch": 1.7310669297313703, + "grad_norm": 0.5038626790046692, + "learning_rate": 6.541312272174969e-05, + "loss": 1.7169, + "step": 5703 + }, + { + "epoch": 1.731370465928062, + "grad_norm": 0.5989472270011902, + "learning_rate": 6.540704738760632e-05, + "loss": 1.8422, + "step": 5704 + }, + { + "epoch": 1.7316740021247534, + "grad_norm": 0.6729888916015625, + "learning_rate": 6.540097205346294e-05, + "loss": 1.3468, + "step": 5705 + }, + { + "epoch": 1.7319775383214449, + "grad_norm": 0.4645465910434723, + "learning_rate": 6.539489671931957e-05, + "loss": 2.0354, + "step": 5706 + }, + { + "epoch": 1.7322810745181363, + "grad_norm": 0.49751991033554077, + "learning_rate": 6.538882138517619e-05, + "loss": 1.4206, + "step": 5707 + }, + { + "epoch": 1.7325846107148277, + "grad_norm": 0.49642395973205566, + "learning_rate": 6.53827460510328e-05, + "loss": 1.5807, + "step": 5708 + }, + { + "epoch": 1.7328881469115192, + "grad_norm": 0.6195915937423706, + "learning_rate": 6.537667071688944e-05, + "loss": 1.0846, + "step": 5709 + }, + { + "epoch": 1.7331916831082106, + "grad_norm": 0.5105364322662354, + "learning_rate": 6.537059538274605e-05, + "loss": 1.3858, + "step": 5710 + }, + { + "epoch": 1.7334952193049022, + "grad_norm": 0.5473313331604004, + "learning_rate": 6.536452004860267e-05, + "loss": 1.6553, + "step": 5711 + }, + { + "epoch": 1.7337987555015935, + "grad_norm": 0.5856526494026184, + "learning_rate": 6.53584447144593e-05, + "loss": 1.3455, + "step": 5712 + }, + { + "epoch": 1.7341022916982851, + "grad_norm": 0.5909231305122375, + "learning_rate": 6.535236938031592e-05, + "loss": 1.9817, + "step": 5713 + }, + { + "epoch": 1.7344058278949763, + "grad_norm": 0.5128015279769897, + "learning_rate": 6.534629404617254e-05, + "loss": 1.7289, + "step": 5714 + }, + { + "epoch": 1.734709364091668, + "grad_norm": 0.55885249376297, + "learning_rate": 6.534021871202917e-05, + "loss": 1.7433, + "step": 5715 + }, + { + "epoch": 1.7350129002883594, + "grad_norm": 1.039367437362671, + "learning_rate": 6.533414337788578e-05, + "loss": 1.3287, + "step": 5716 + }, + { + "epoch": 1.7353164364850509, + "grad_norm": 0.6337462067604065, + "learning_rate": 6.53280680437424e-05, + "loss": 1.5387, + "step": 5717 + }, + { + "epoch": 1.7356199726817423, + "grad_norm": 0.5924265384674072, + "learning_rate": 6.532199270959903e-05, + "loss": 1.5792, + "step": 5718 + }, + { + "epoch": 1.7359235088784337, + "grad_norm": 0.48296046257019043, + "learning_rate": 6.531591737545565e-05, + "loss": 1.328, + "step": 5719 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.5941514372825623, + "learning_rate": 6.530984204131228e-05, + "loss": 1.6456, + "step": 5720 + }, + { + "epoch": 1.7365305812718166, + "grad_norm": 0.5522921085357666, + "learning_rate": 6.53037667071689e-05, + "loss": 1.4514, + "step": 5721 + }, + { + "epoch": 1.7368341174685082, + "grad_norm": 0.5471270680427551, + "learning_rate": 6.529769137302552e-05, + "loss": 1.5356, + "step": 5722 + }, + { + "epoch": 1.7371376536651995, + "grad_norm": 1.03130304813385, + "learning_rate": 6.529161603888215e-05, + "loss": 1.6965, + "step": 5723 + }, + { + "epoch": 1.737441189861891, + "grad_norm": 0.6405633687973022, + "learning_rate": 6.528554070473876e-05, + "loss": 1.2192, + "step": 5724 + }, + { + "epoch": 1.7377447260585823, + "grad_norm": 0.5106350779533386, + "learning_rate": 6.527946537059538e-05, + "loss": 1.4618, + "step": 5725 + }, + { + "epoch": 1.738048262255274, + "grad_norm": 0.5871725082397461, + "learning_rate": 6.527339003645201e-05, + "loss": 1.8213, + "step": 5726 + }, + { + "epoch": 1.7383517984519654, + "grad_norm": 0.5859651565551758, + "learning_rate": 6.526731470230863e-05, + "loss": 1.3484, + "step": 5727 + }, + { + "epoch": 1.7386553346486568, + "grad_norm": 0.5065982937812805, + "learning_rate": 6.526123936816525e-05, + "loss": 1.6662, + "step": 5728 + }, + { + "epoch": 1.7389588708453483, + "grad_norm": 0.5518446564674377, + "learning_rate": 6.525516403402188e-05, + "loss": 1.5213, + "step": 5729 + }, + { + "epoch": 1.7392624070420397, + "grad_norm": 0.5022513270378113, + "learning_rate": 6.52490886998785e-05, + "loss": 1.2575, + "step": 5730 + }, + { + "epoch": 1.7395659432387314, + "grad_norm": 0.3669544756412506, + "learning_rate": 6.524301336573511e-05, + "loss": 1.8608, + "step": 5731 + }, + { + "epoch": 1.7398694794354226, + "grad_norm": 0.9789682626724243, + "learning_rate": 6.523693803159174e-05, + "loss": 1.2424, + "step": 5732 + }, + { + "epoch": 1.7401730156321142, + "grad_norm": 0.5330730080604553, + "learning_rate": 6.523086269744836e-05, + "loss": 1.641, + "step": 5733 + }, + { + "epoch": 1.7404765518288055, + "grad_norm": 0.4809507727622986, + "learning_rate": 6.522478736330498e-05, + "loss": 1.1522, + "step": 5734 + }, + { + "epoch": 1.740780088025497, + "grad_norm": 0.6297109723091125, + "learning_rate": 6.521871202916161e-05, + "loss": 1.8506, + "step": 5735 + }, + { + "epoch": 1.7410836242221885, + "grad_norm": 0.5746430158615112, + "learning_rate": 6.521263669501823e-05, + "loss": 1.3589, + "step": 5736 + }, + { + "epoch": 1.74138716041888, + "grad_norm": 0.5811137557029724, + "learning_rate": 6.520656136087486e-05, + "loss": 1.8811, + "step": 5737 + }, + { + "epoch": 1.7416906966155714, + "grad_norm": 0.5107501149177551, + "learning_rate": 6.520048602673147e-05, + "loss": 1.6527, + "step": 5738 + }, + { + "epoch": 1.7419942328122628, + "grad_norm": 0.6415001749992371, + "learning_rate": 6.519441069258809e-05, + "loss": 1.0182, + "step": 5739 + }, + { + "epoch": 1.7422977690089543, + "grad_norm": 0.6572467684745789, + "learning_rate": 6.518833535844472e-05, + "loss": 1.5543, + "step": 5740 + }, + { + "epoch": 1.7426013052056457, + "grad_norm": 0.5816811323165894, + "learning_rate": 6.518226002430134e-05, + "loss": 2.0466, + "step": 5741 + }, + { + "epoch": 1.7429048414023374, + "grad_norm": 0.5232481360435486, + "learning_rate": 6.517618469015796e-05, + "loss": 1.4519, + "step": 5742 + }, + { + "epoch": 1.7432083775990286, + "grad_norm": 0.720382809638977, + "learning_rate": 6.517010935601459e-05, + "loss": 1.2823, + "step": 5743 + }, + { + "epoch": 1.7435119137957202, + "grad_norm": 0.6193926930427551, + "learning_rate": 6.51640340218712e-05, + "loss": 1.4534, + "step": 5744 + }, + { + "epoch": 1.7438154499924114, + "grad_norm": 0.629203736782074, + "learning_rate": 6.515795868772782e-05, + "loss": 1.606, + "step": 5745 + }, + { + "epoch": 1.744118986189103, + "grad_norm": 0.7067033648490906, + "learning_rate": 6.515188335358445e-05, + "loss": 1.8257, + "step": 5746 + }, + { + "epoch": 1.7444225223857945, + "grad_norm": 0.596479594707489, + "learning_rate": 6.514580801944107e-05, + "loss": 1.5257, + "step": 5747 + }, + { + "epoch": 1.744726058582486, + "grad_norm": 0.5363408923149109, + "learning_rate": 6.513973268529769e-05, + "loss": 1.7282, + "step": 5748 + }, + { + "epoch": 1.7450295947791774, + "grad_norm": 0.5793718695640564, + "learning_rate": 6.513365735115432e-05, + "loss": 2.0703, + "step": 5749 + }, + { + "epoch": 1.7453331309758688, + "grad_norm": 0.5103743672370911, + "learning_rate": 6.512758201701094e-05, + "loss": 1.6265, + "step": 5750 + }, + { + "epoch": 1.7456366671725603, + "grad_norm": 0.5116788148880005, + "learning_rate": 6.512150668286757e-05, + "loss": 1.8109, + "step": 5751 + }, + { + "epoch": 1.7459402033692517, + "grad_norm": 0.6101179718971252, + "learning_rate": 6.511543134872418e-05, + "loss": 1.5663, + "step": 5752 + }, + { + "epoch": 1.7462437395659434, + "grad_norm": 0.766834557056427, + "learning_rate": 6.51093560145808e-05, + "loss": 1.9858, + "step": 5753 + }, + { + "epoch": 1.7465472757626346, + "grad_norm": 0.5078772306442261, + "learning_rate": 6.510328068043743e-05, + "loss": 1.3673, + "step": 5754 + }, + { + "epoch": 1.7468508119593262, + "grad_norm": 0.47746485471725464, + "learning_rate": 6.509720534629405e-05, + "loss": 1.316, + "step": 5755 + }, + { + "epoch": 1.7471543481560174, + "grad_norm": 0.5668113827705383, + "learning_rate": 6.509113001215067e-05, + "loss": 1.6966, + "step": 5756 + }, + { + "epoch": 1.747457884352709, + "grad_norm": 0.5345942974090576, + "learning_rate": 6.50850546780073e-05, + "loss": 1.195, + "step": 5757 + }, + { + "epoch": 1.7477614205494005, + "grad_norm": 0.5363343358039856, + "learning_rate": 6.507897934386392e-05, + "loss": 1.709, + "step": 5758 + }, + { + "epoch": 1.748064956746092, + "grad_norm": 0.8389655351638794, + "learning_rate": 6.507290400972053e-05, + "loss": 1.7834, + "step": 5759 + }, + { + "epoch": 1.7483684929427834, + "grad_norm": 0.4319862723350525, + "learning_rate": 6.506682867557716e-05, + "loss": 1.9358, + "step": 5760 + }, + { + "epoch": 1.7486720291394748, + "grad_norm": 0.5649601221084595, + "learning_rate": 6.506075334143378e-05, + "loss": 1.2926, + "step": 5761 + }, + { + "epoch": 1.7489755653361665, + "grad_norm": 0.5124643445014954, + "learning_rate": 6.50546780072904e-05, + "loss": 2.1127, + "step": 5762 + }, + { + "epoch": 1.7492791015328577, + "grad_norm": 0.5134212374687195, + "learning_rate": 6.504860267314703e-05, + "loss": 1.4703, + "step": 5763 + }, + { + "epoch": 1.7495826377295494, + "grad_norm": 0.7799593210220337, + "learning_rate": 6.504252733900365e-05, + "loss": 1.1072, + "step": 5764 + }, + { + "epoch": 1.7498861739262406, + "grad_norm": 0.6452533602714539, + "learning_rate": 6.503645200486028e-05, + "loss": 1.6831, + "step": 5765 + }, + { + "epoch": 1.7501897101229322, + "grad_norm": 0.5632435083389282, + "learning_rate": 6.50303766707169e-05, + "loss": 1.7022, + "step": 5766 + }, + { + "epoch": 1.7504932463196237, + "grad_norm": 0.49832579493522644, + "learning_rate": 6.502430133657351e-05, + "loss": 1.3726, + "step": 5767 + }, + { + "epoch": 1.750796782516315, + "grad_norm": 0.5050408840179443, + "learning_rate": 6.501822600243014e-05, + "loss": 1.2312, + "step": 5768 + }, + { + "epoch": 1.7511003187130065, + "grad_norm": 0.5031700134277344, + "learning_rate": 6.501215066828676e-05, + "loss": 1.1839, + "step": 5769 + }, + { + "epoch": 1.751403854909698, + "grad_norm": 0.5715921521186829, + "learning_rate": 6.500607533414338e-05, + "loss": 1.7294, + "step": 5770 + }, + { + "epoch": 1.7517073911063894, + "grad_norm": 0.6526091694831848, + "learning_rate": 6.500000000000001e-05, + "loss": 1.3739, + "step": 5771 + }, + { + "epoch": 1.7520109273030808, + "grad_norm": 0.5826200246810913, + "learning_rate": 6.499392466585663e-05, + "loss": 1.5172, + "step": 5772 + }, + { + "epoch": 1.7523144634997725, + "grad_norm": 0.6188724040985107, + "learning_rate": 6.498784933171324e-05, + "loss": 1.7267, + "step": 5773 + }, + { + "epoch": 1.7526179996964637, + "grad_norm": 0.5869378447532654, + "learning_rate": 6.498177399756987e-05, + "loss": 1.6295, + "step": 5774 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.5730337500572205, + "learning_rate": 6.497569866342649e-05, + "loss": 1.4468, + "step": 5775 + }, + { + "epoch": 1.7532250720898466, + "grad_norm": 0.47720491886138916, + "learning_rate": 6.496962332928311e-05, + "loss": 1.4877, + "step": 5776 + }, + { + "epoch": 1.7535286082865382, + "grad_norm": 0.5709559917449951, + "learning_rate": 6.496354799513974e-05, + "loss": 1.9345, + "step": 5777 + }, + { + "epoch": 1.7538321444832297, + "grad_norm": 0.5799897909164429, + "learning_rate": 6.495747266099636e-05, + "loss": 0.7032, + "step": 5778 + }, + { + "epoch": 1.754135680679921, + "grad_norm": 0.6293965578079224, + "learning_rate": 6.495139732685299e-05, + "loss": 1.7294, + "step": 5779 + }, + { + "epoch": 1.7544392168766125, + "grad_norm": 0.567211925983429, + "learning_rate": 6.49453219927096e-05, + "loss": 1.8237, + "step": 5780 + }, + { + "epoch": 1.754742753073304, + "grad_norm": 0.5054707527160645, + "learning_rate": 6.493924665856622e-05, + "loss": 1.2977, + "step": 5781 + }, + { + "epoch": 1.7550462892699954, + "grad_norm": 0.6522750854492188, + "learning_rate": 6.493317132442285e-05, + "loss": 1.278, + "step": 5782 + }, + { + "epoch": 1.7553498254666868, + "grad_norm": 0.5055107474327087, + "learning_rate": 6.492709599027947e-05, + "loss": 1.6936, + "step": 5783 + }, + { + "epoch": 1.7556533616633785, + "grad_norm": 0.7780368328094482, + "learning_rate": 6.492102065613609e-05, + "loss": 1.8676, + "step": 5784 + }, + { + "epoch": 1.7559568978600697, + "grad_norm": 0.5879009366035461, + "learning_rate": 6.491494532199272e-05, + "loss": 1.8683, + "step": 5785 + }, + { + "epoch": 1.7562604340567614, + "grad_norm": 0.5584444999694824, + "learning_rate": 6.490886998784934e-05, + "loss": 1.9057, + "step": 5786 + }, + { + "epoch": 1.7565639702534526, + "grad_norm": 0.6271816492080688, + "learning_rate": 6.490279465370595e-05, + "loss": 1.7766, + "step": 5787 + }, + { + "epoch": 1.7568675064501442, + "grad_norm": 0.6667237281799316, + "learning_rate": 6.489671931956258e-05, + "loss": 1.6506, + "step": 5788 + }, + { + "epoch": 1.7571710426468357, + "grad_norm": 0.5379130840301514, + "learning_rate": 6.48906439854192e-05, + "loss": 1.6144, + "step": 5789 + }, + { + "epoch": 1.757474578843527, + "grad_norm": 0.8868510723114014, + "learning_rate": 6.488456865127582e-05, + "loss": 1.7302, + "step": 5790 + }, + { + "epoch": 1.7577781150402185, + "grad_norm": 0.5535308718681335, + "learning_rate": 6.487849331713245e-05, + "loss": 1.7013, + "step": 5791 + }, + { + "epoch": 1.75808165123691, + "grad_norm": 0.5302169919013977, + "learning_rate": 6.487241798298907e-05, + "loss": 1.9597, + "step": 5792 + }, + { + "epoch": 1.7583851874336016, + "grad_norm": 0.5474647879600525, + "learning_rate": 6.48663426488457e-05, + "loss": 2.1173, + "step": 5793 + }, + { + "epoch": 1.7586887236302928, + "grad_norm": 0.8962429165840149, + "learning_rate": 6.486026731470231e-05, + "loss": 1.2716, + "step": 5794 + }, + { + "epoch": 1.7589922598269845, + "grad_norm": 0.6951600909233093, + "learning_rate": 6.485419198055893e-05, + "loss": 1.4244, + "step": 5795 + }, + { + "epoch": 1.7592957960236757, + "grad_norm": 0.5327600836753845, + "learning_rate": 6.484811664641556e-05, + "loss": 1.5641, + "step": 5796 + }, + { + "epoch": 1.7595993322203674, + "grad_norm": 0.471091091632843, + "learning_rate": 6.484204131227217e-05, + "loss": 1.7891, + "step": 5797 + }, + { + "epoch": 1.7599028684170588, + "grad_norm": 0.5820968747138977, + "learning_rate": 6.48359659781288e-05, + "loss": 1.59, + "step": 5798 + }, + { + "epoch": 1.7602064046137502, + "grad_norm": 1.5941643714904785, + "learning_rate": 6.482989064398543e-05, + "loss": 1.131, + "step": 5799 + }, + { + "epoch": 1.7605099408104417, + "grad_norm": 0.6009232997894287, + "learning_rate": 6.482381530984205e-05, + "loss": 1.8783, + "step": 5800 + }, + { + "epoch": 1.760813477007133, + "grad_norm": 0.5255837440490723, + "learning_rate": 6.481773997569866e-05, + "loss": 1.9547, + "step": 5801 + }, + { + "epoch": 1.7611170132038245, + "grad_norm": 0.6272095441818237, + "learning_rate": 6.48116646415553e-05, + "loss": 1.7901, + "step": 5802 + }, + { + "epoch": 1.761420549400516, + "grad_norm": 0.47905588150024414, + "learning_rate": 6.480558930741191e-05, + "loss": 1.9139, + "step": 5803 + }, + { + "epoch": 1.7617240855972076, + "grad_norm": 0.5940137505531311, + "learning_rate": 6.479951397326853e-05, + "loss": 1.7599, + "step": 5804 + }, + { + "epoch": 1.7620276217938988, + "grad_norm": 0.52225661277771, + "learning_rate": 6.479343863912516e-05, + "loss": 1.8058, + "step": 5805 + }, + { + "epoch": 1.7623311579905905, + "grad_norm": 0.529253363609314, + "learning_rate": 6.478736330498178e-05, + "loss": 1.7519, + "step": 5806 + }, + { + "epoch": 1.7626346941872817, + "grad_norm": 0.5793299078941345, + "learning_rate": 6.47812879708384e-05, + "loss": 1.848, + "step": 5807 + }, + { + "epoch": 1.7629382303839733, + "grad_norm": 0.5690224170684814, + "learning_rate": 6.477521263669502e-05, + "loss": 1.5886, + "step": 5808 + }, + { + "epoch": 1.7632417665806648, + "grad_norm": 0.6605258584022522, + "learning_rate": 6.476913730255164e-05, + "loss": 1.642, + "step": 5809 + }, + { + "epoch": 1.7635453027773562, + "grad_norm": 0.5015376210212708, + "learning_rate": 6.476306196840827e-05, + "loss": 1.5451, + "step": 5810 + }, + { + "epoch": 1.7638488389740477, + "grad_norm": 0.491745263338089, + "learning_rate": 6.475698663426488e-05, + "loss": 1.7558, + "step": 5811 + }, + { + "epoch": 1.764152375170739, + "grad_norm": 0.5883635878562927, + "learning_rate": 6.475091130012151e-05, + "loss": 1.4678, + "step": 5812 + }, + { + "epoch": 1.7644559113674305, + "grad_norm": 0.5973695516586304, + "learning_rate": 6.474483596597814e-05, + "loss": 1.5873, + "step": 5813 + }, + { + "epoch": 1.764759447564122, + "grad_norm": 0.4223356246948242, + "learning_rate": 6.473876063183476e-05, + "loss": 1.9833, + "step": 5814 + }, + { + "epoch": 1.7650629837608136, + "grad_norm": 0.5074962973594666, + "learning_rate": 6.473268529769137e-05, + "loss": 1.8624, + "step": 5815 + }, + { + "epoch": 1.7653665199575048, + "grad_norm": 0.5878717303276062, + "learning_rate": 6.4726609963548e-05, + "loss": 1.1902, + "step": 5816 + }, + { + "epoch": 1.7656700561541965, + "grad_norm": 0.4983219504356384, + "learning_rate": 6.472053462940462e-05, + "loss": 2.2586, + "step": 5817 + }, + { + "epoch": 1.7659735923508877, + "grad_norm": 0.5959672927856445, + "learning_rate": 6.471445929526124e-05, + "loss": 1.6374, + "step": 5818 + }, + { + "epoch": 1.7662771285475793, + "grad_norm": 0.5485727190971375, + "learning_rate": 6.470838396111787e-05, + "loss": 1.3097, + "step": 5819 + }, + { + "epoch": 1.7665806647442708, + "grad_norm": 0.578179121017456, + "learning_rate": 6.470230862697449e-05, + "loss": 1.5444, + "step": 5820 + }, + { + "epoch": 1.7668842009409622, + "grad_norm": 0.5944360494613647, + "learning_rate": 6.46962332928311e-05, + "loss": 1.1695, + "step": 5821 + }, + { + "epoch": 1.7671877371376536, + "grad_norm": 0.747759997844696, + "learning_rate": 6.469015795868773e-05, + "loss": 1.5559, + "step": 5822 + }, + { + "epoch": 1.767491273334345, + "grad_norm": 0.40747299790382385, + "learning_rate": 6.468408262454435e-05, + "loss": 1.3606, + "step": 5823 + }, + { + "epoch": 1.7677948095310367, + "grad_norm": 0.5746089220046997, + "learning_rate": 6.467800729040098e-05, + "loss": 1.5747, + "step": 5824 + }, + { + "epoch": 1.768098345727728, + "grad_norm": 0.5214455127716064, + "learning_rate": 6.467193195625759e-05, + "loss": 1.875, + "step": 5825 + }, + { + "epoch": 1.7684018819244196, + "grad_norm": 0.5907041430473328, + "learning_rate": 6.466585662211422e-05, + "loss": 1.8091, + "step": 5826 + }, + { + "epoch": 1.7687054181211108, + "grad_norm": 0.5252591967582703, + "learning_rate": 6.465978128797085e-05, + "loss": 1.3878, + "step": 5827 + }, + { + "epoch": 1.7690089543178025, + "grad_norm": 0.4580781161785126, + "learning_rate": 6.465370595382747e-05, + "loss": 2.0493, + "step": 5828 + }, + { + "epoch": 1.769312490514494, + "grad_norm": 1.1108380556106567, + "learning_rate": 6.464763061968408e-05, + "loss": 1.4833, + "step": 5829 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.5863052010536194, + "learning_rate": 6.464155528554071e-05, + "loss": 1.4405, + "step": 5830 + }, + { + "epoch": 1.7699195629078768, + "grad_norm": 0.6235532760620117, + "learning_rate": 6.463547995139733e-05, + "loss": 1.1955, + "step": 5831 + }, + { + "epoch": 1.7702230991045682, + "grad_norm": 0.8234938383102417, + "learning_rate": 6.462940461725395e-05, + "loss": 1.6902, + "step": 5832 + }, + { + "epoch": 1.7705266353012596, + "grad_norm": 0.4940117597579956, + "learning_rate": 6.462332928311058e-05, + "loss": 1.7612, + "step": 5833 + }, + { + "epoch": 1.770830171497951, + "grad_norm": 0.5862531661987305, + "learning_rate": 6.46172539489672e-05, + "loss": 1.7272, + "step": 5834 + }, + { + "epoch": 1.7711337076946427, + "grad_norm": 0.5498414635658264, + "learning_rate": 6.461117861482381e-05, + "loss": 1.9877, + "step": 5835 + }, + { + "epoch": 1.771437243891334, + "grad_norm": 0.6384761333465576, + "learning_rate": 6.460510328068044e-05, + "loss": 1.5546, + "step": 5836 + }, + { + "epoch": 1.7717407800880256, + "grad_norm": 0.5216341018676758, + "learning_rate": 6.459902794653706e-05, + "loss": 1.7439, + "step": 5837 + }, + { + "epoch": 1.7720443162847168, + "grad_norm": 0.749945342540741, + "learning_rate": 6.459295261239369e-05, + "loss": 1.5544, + "step": 5838 + }, + { + "epoch": 1.7723478524814085, + "grad_norm": 0.47345829010009766, + "learning_rate": 6.45868772782503e-05, + "loss": 1.3169, + "step": 5839 + }, + { + "epoch": 1.7726513886781, + "grad_norm": 0.5755912661552429, + "learning_rate": 6.458080194410693e-05, + "loss": 1.6923, + "step": 5840 + }, + { + "epoch": 1.7729549248747913, + "grad_norm": 0.5522906184196472, + "learning_rate": 6.457472660996356e-05, + "loss": 1.5496, + "step": 5841 + }, + { + "epoch": 1.7732584610714828, + "grad_norm": 0.6490026116371155, + "learning_rate": 6.456865127582018e-05, + "loss": 1.6102, + "step": 5842 + }, + { + "epoch": 1.7735619972681742, + "grad_norm": 0.5617067813873291, + "learning_rate": 6.456257594167679e-05, + "loss": 1.6246, + "step": 5843 + }, + { + "epoch": 1.7738655334648656, + "grad_norm": 0.6315189599990845, + "learning_rate": 6.455650060753342e-05, + "loss": 1.6435, + "step": 5844 + }, + { + "epoch": 1.774169069661557, + "grad_norm": 0.5512092709541321, + "learning_rate": 6.455042527339004e-05, + "loss": 1.5024, + "step": 5845 + }, + { + "epoch": 1.7744726058582487, + "grad_norm": 0.4029645025730133, + "learning_rate": 6.454434993924666e-05, + "loss": 1.0551, + "step": 5846 + }, + { + "epoch": 1.77477614205494, + "grad_norm": 0.5837946534156799, + "learning_rate": 6.453827460510329e-05, + "loss": 1.546, + "step": 5847 + }, + { + "epoch": 1.7750796782516316, + "grad_norm": 0.5171747207641602, + "learning_rate": 6.45321992709599e-05, + "loss": 1.7922, + "step": 5848 + }, + { + "epoch": 1.7753832144483228, + "grad_norm": 0.8446613550186157, + "learning_rate": 6.452612393681652e-05, + "loss": 1.7363, + "step": 5849 + }, + { + "epoch": 1.7756867506450145, + "grad_norm": 0.5637487769126892, + "learning_rate": 6.452004860267315e-05, + "loss": 1.5976, + "step": 5850 + }, + { + "epoch": 1.775990286841706, + "grad_norm": 0.5242686867713928, + "learning_rate": 6.451397326852977e-05, + "loss": 1.692, + "step": 5851 + }, + { + "epoch": 1.7762938230383973, + "grad_norm": 0.6187634468078613, + "learning_rate": 6.45078979343864e-05, + "loss": 1.5291, + "step": 5852 + }, + { + "epoch": 1.7765973592350888, + "grad_norm": 1.0247151851654053, + "learning_rate": 6.4501822600243e-05, + "loss": 1.5718, + "step": 5853 + }, + { + "epoch": 1.7769008954317802, + "grad_norm": 0.6299808025360107, + "learning_rate": 6.449574726609964e-05, + "loss": 1.5077, + "step": 5854 + }, + { + "epoch": 1.7772044316284719, + "grad_norm": 0.6078516840934753, + "learning_rate": 6.448967193195627e-05, + "loss": 1.0061, + "step": 5855 + }, + { + "epoch": 1.777507967825163, + "grad_norm": 0.4225638210773468, + "learning_rate": 6.448359659781287e-05, + "loss": 1.7302, + "step": 5856 + }, + { + "epoch": 1.7778115040218547, + "grad_norm": 0.5707997679710388, + "learning_rate": 6.44775212636695e-05, + "loss": 1.3869, + "step": 5857 + }, + { + "epoch": 1.778115040218546, + "grad_norm": 0.5926483273506165, + "learning_rate": 6.447144592952613e-05, + "loss": 1.6026, + "step": 5858 + }, + { + "epoch": 1.7784185764152376, + "grad_norm": 0.5582571625709534, + "learning_rate": 6.446537059538275e-05, + "loss": 1.6688, + "step": 5859 + }, + { + "epoch": 1.7787221126119288, + "grad_norm": 0.48093557357788086, + "learning_rate": 6.445929526123937e-05, + "loss": 1.9678, + "step": 5860 + }, + { + "epoch": 1.7790256488086205, + "grad_norm": 0.5301848649978638, + "learning_rate": 6.4453219927096e-05, + "loss": 1.4983, + "step": 5861 + }, + { + "epoch": 1.779329185005312, + "grad_norm": 0.600426435470581, + "learning_rate": 6.444714459295262e-05, + "loss": 2.1841, + "step": 5862 + }, + { + "epoch": 1.7796327212020033, + "grad_norm": 0.5542836785316467, + "learning_rate": 6.444106925880923e-05, + "loss": 1.9038, + "step": 5863 + }, + { + "epoch": 1.7799362573986948, + "grad_norm": 0.6525148749351501, + "learning_rate": 6.443499392466586e-05, + "loss": 1.094, + "step": 5864 + }, + { + "epoch": 1.7802397935953862, + "grad_norm": 0.5191807150840759, + "learning_rate": 6.442891859052248e-05, + "loss": 2.0945, + "step": 5865 + }, + { + "epoch": 1.7805433297920779, + "grad_norm": 0.6895755529403687, + "learning_rate": 6.442284325637911e-05, + "loss": 1.2236, + "step": 5866 + }, + { + "epoch": 1.780846865988769, + "grad_norm": 0.5294615626335144, + "learning_rate": 6.441676792223572e-05, + "loss": 1.5897, + "step": 5867 + }, + { + "epoch": 1.7811504021854607, + "grad_norm": 0.46759140491485596, + "learning_rate": 6.441069258809235e-05, + "loss": 1.7159, + "step": 5868 + }, + { + "epoch": 1.781453938382152, + "grad_norm": 0.5720352530479431, + "learning_rate": 6.440461725394898e-05, + "loss": 1.4336, + "step": 5869 + }, + { + "epoch": 1.7817574745788436, + "grad_norm": 0.5489372611045837, + "learning_rate": 6.439854191980558e-05, + "loss": 1.7358, + "step": 5870 + }, + { + "epoch": 1.782061010775535, + "grad_norm": 0.4542980492115021, + "learning_rate": 6.439246658566221e-05, + "loss": 1.5082, + "step": 5871 + }, + { + "epoch": 1.7823645469722265, + "grad_norm": 0.4929881691932678, + "learning_rate": 6.438639125151884e-05, + "loss": 1.5841, + "step": 5872 + }, + { + "epoch": 1.782668083168918, + "grad_norm": 0.5201451182365417, + "learning_rate": 6.438031591737546e-05, + "loss": 1.9786, + "step": 5873 + }, + { + "epoch": 1.7829716193656093, + "grad_norm": 0.5401360988616943, + "learning_rate": 6.437424058323208e-05, + "loss": 1.6557, + "step": 5874 + }, + { + "epoch": 1.7832751555623008, + "grad_norm": 0.5674952864646912, + "learning_rate": 6.436816524908871e-05, + "loss": 1.2626, + "step": 5875 + }, + { + "epoch": 1.7835786917589922, + "grad_norm": 0.561320960521698, + "learning_rate": 6.436208991494533e-05, + "loss": 1.8853, + "step": 5876 + }, + { + "epoch": 1.7838822279556839, + "grad_norm": 0.5434008240699768, + "learning_rate": 6.435601458080194e-05, + "loss": 1.1288, + "step": 5877 + }, + { + "epoch": 1.784185764152375, + "grad_norm": 0.4528067409992218, + "learning_rate": 6.434993924665856e-05, + "loss": 1.8259, + "step": 5878 + }, + { + "epoch": 1.7844893003490667, + "grad_norm": 0.48274731636047363, + "learning_rate": 6.434386391251519e-05, + "loss": 1.248, + "step": 5879 + }, + { + "epoch": 1.784792836545758, + "grad_norm": 0.844624936580658, + "learning_rate": 6.433778857837181e-05, + "loss": 1.2987, + "step": 5880 + }, + { + "epoch": 1.7850963727424496, + "grad_norm": 0.8764456510543823, + "learning_rate": 6.433171324422843e-05, + "loss": 1.0901, + "step": 5881 + }, + { + "epoch": 1.785399908939141, + "grad_norm": 0.5158057808876038, + "learning_rate": 6.432563791008506e-05, + "loss": 1.7408, + "step": 5882 + }, + { + "epoch": 1.7857034451358325, + "grad_norm": 0.616571843624115, + "learning_rate": 6.431956257594169e-05, + "loss": 1.4418, + "step": 5883 + }, + { + "epoch": 1.786006981332524, + "grad_norm": 0.5972772240638733, + "learning_rate": 6.431348724179829e-05, + "loss": 1.3225, + "step": 5884 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.547287106513977, + "learning_rate": 6.430741190765492e-05, + "loss": 1.6017, + "step": 5885 + }, + { + "epoch": 1.7866140537259068, + "grad_norm": 0.8032040596008301, + "learning_rate": 6.430133657351155e-05, + "loss": 1.4679, + "step": 5886 + }, + { + "epoch": 1.7869175899225982, + "grad_norm": 0.47403547167778015, + "learning_rate": 6.429526123936817e-05, + "loss": 1.6739, + "step": 5887 + }, + { + "epoch": 1.7872211261192898, + "grad_norm": 0.5629516243934631, + "learning_rate": 6.428918590522479e-05, + "loss": 1.752, + "step": 5888 + }, + { + "epoch": 1.787524662315981, + "grad_norm": 0.47188514471054077, + "learning_rate": 6.428311057108142e-05, + "loss": 1.3455, + "step": 5889 + }, + { + "epoch": 1.7878281985126727, + "grad_norm": 0.6849742531776428, + "learning_rate": 6.427703523693804e-05, + "loss": 1.7675, + "step": 5890 + }, + { + "epoch": 1.788131734709364, + "grad_norm": 0.5418858528137207, + "learning_rate": 6.427095990279465e-05, + "loss": 1.4488, + "step": 5891 + }, + { + "epoch": 1.7884352709060556, + "grad_norm": 0.5874664187431335, + "learning_rate": 6.426488456865127e-05, + "loss": 1.8787, + "step": 5892 + }, + { + "epoch": 1.788738807102747, + "grad_norm": 0.4815152585506439, + "learning_rate": 6.42588092345079e-05, + "loss": 1.2008, + "step": 5893 + }, + { + "epoch": 1.7890423432994385, + "grad_norm": 0.5272401571273804, + "learning_rate": 6.425273390036452e-05, + "loss": 1.9368, + "step": 5894 + }, + { + "epoch": 1.7893458794961299, + "grad_norm": 0.5799587368965149, + "learning_rate": 6.424665856622114e-05, + "loss": 1.8509, + "step": 5895 + }, + { + "epoch": 1.7896494156928213, + "grad_norm": 1.0671026706695557, + "learning_rate": 6.424058323207777e-05, + "loss": 1.6619, + "step": 5896 + }, + { + "epoch": 1.789952951889513, + "grad_norm": 0.7798335552215576, + "learning_rate": 6.42345078979344e-05, + "loss": 1.7029, + "step": 5897 + }, + { + "epoch": 1.7902564880862042, + "grad_norm": 0.5832023024559021, + "learning_rate": 6.4228432563791e-05, + "loss": 1.9654, + "step": 5898 + }, + { + "epoch": 1.7905600242828958, + "grad_norm": 0.4774154722690582, + "learning_rate": 6.422235722964763e-05, + "loss": 1.7518, + "step": 5899 + }, + { + "epoch": 1.790863560479587, + "grad_norm": 0.4843001663684845, + "learning_rate": 6.421628189550426e-05, + "loss": 1.6774, + "step": 5900 + }, + { + "epoch": 1.7911670966762787, + "grad_norm": 0.5146987438201904, + "learning_rate": 6.421020656136088e-05, + "loss": 1.7348, + "step": 5901 + }, + { + "epoch": 1.7914706328729701, + "grad_norm": 0.566594123840332, + "learning_rate": 6.42041312272175e-05, + "loss": 1.5495, + "step": 5902 + }, + { + "epoch": 1.7917741690696616, + "grad_norm": 0.7687141299247742, + "learning_rate": 6.419805589307413e-05, + "loss": 1.9389, + "step": 5903 + }, + { + "epoch": 1.792077705266353, + "grad_norm": 0.6663510799407959, + "learning_rate": 6.419198055893075e-05, + "loss": 1.3843, + "step": 5904 + }, + { + "epoch": 1.7923812414630445, + "grad_norm": 0.5539908409118652, + "learning_rate": 6.418590522478736e-05, + "loss": 1.8642, + "step": 5905 + }, + { + "epoch": 1.7926847776597359, + "grad_norm": 0.4932482838630676, + "learning_rate": 6.417982989064398e-05, + "loss": 1.6206, + "step": 5906 + }, + { + "epoch": 1.7929883138564273, + "grad_norm": 0.617225170135498, + "learning_rate": 6.417375455650061e-05, + "loss": 1.674, + "step": 5907 + }, + { + "epoch": 1.793291850053119, + "grad_norm": 0.5194925665855408, + "learning_rate": 6.416767922235723e-05, + "loss": 1.5878, + "step": 5908 + }, + { + "epoch": 1.7935953862498102, + "grad_norm": 0.8317811489105225, + "learning_rate": 6.416160388821385e-05, + "loss": 1.4024, + "step": 5909 + }, + { + "epoch": 1.7938989224465018, + "grad_norm": 0.6334354281425476, + "learning_rate": 6.415552855407048e-05, + "loss": 1.4149, + "step": 5910 + }, + { + "epoch": 1.794202458643193, + "grad_norm": 0.5162734389305115, + "learning_rate": 6.414945321992711e-05, + "loss": 1.6833, + "step": 5911 + }, + { + "epoch": 1.7945059948398847, + "grad_norm": 0.6123242378234863, + "learning_rate": 6.414337788578371e-05, + "loss": 1.3959, + "step": 5912 + }, + { + "epoch": 1.7948095310365761, + "grad_norm": 0.5130218863487244, + "learning_rate": 6.413730255164034e-05, + "loss": 1.9731, + "step": 5913 + }, + { + "epoch": 1.7951130672332676, + "grad_norm": 0.6168215274810791, + "learning_rate": 6.413122721749697e-05, + "loss": 1.7271, + "step": 5914 + }, + { + "epoch": 1.795416603429959, + "grad_norm": 0.6758812665939331, + "learning_rate": 6.412515188335359e-05, + "loss": 2.0077, + "step": 5915 + }, + { + "epoch": 1.7957201396266504, + "grad_norm": 1.066888451576233, + "learning_rate": 6.411907654921021e-05, + "loss": 1.0796, + "step": 5916 + }, + { + "epoch": 1.7960236758233419, + "grad_norm": 0.6181840896606445, + "learning_rate": 6.411300121506684e-05, + "loss": 1.0032, + "step": 5917 + }, + { + "epoch": 1.7963272120200333, + "grad_norm": 0.49878695607185364, + "learning_rate": 6.410692588092346e-05, + "loss": 1.7083, + "step": 5918 + }, + { + "epoch": 1.796630748216725, + "grad_norm": 0.5682203769683838, + "learning_rate": 6.410085054678007e-05, + "loss": 1.7901, + "step": 5919 + }, + { + "epoch": 1.7969342844134162, + "grad_norm": 0.46962597966194153, + "learning_rate": 6.409477521263669e-05, + "loss": 1.8081, + "step": 5920 + }, + { + "epoch": 1.7972378206101078, + "grad_norm": 0.4845025837421417, + "learning_rate": 6.408869987849332e-05, + "loss": 1.1512, + "step": 5921 + }, + { + "epoch": 1.797541356806799, + "grad_norm": 0.552946150302887, + "learning_rate": 6.408262454434994e-05, + "loss": 1.8408, + "step": 5922 + }, + { + "epoch": 1.7978448930034907, + "grad_norm": 0.6529552936553955, + "learning_rate": 6.407654921020656e-05, + "loss": 1.2169, + "step": 5923 + }, + { + "epoch": 1.7981484292001821, + "grad_norm": 0.5143837332725525, + "learning_rate": 6.407047387606319e-05, + "loss": 1.2939, + "step": 5924 + }, + { + "epoch": 1.7984519653968736, + "grad_norm": 0.5021446347236633, + "learning_rate": 6.406439854191982e-05, + "loss": 1.771, + "step": 5925 + }, + { + "epoch": 1.798755501593565, + "grad_norm": 0.5703146457672119, + "learning_rate": 6.405832320777642e-05, + "loss": 1.0796, + "step": 5926 + }, + { + "epoch": 1.7990590377902564, + "grad_norm": 0.5527166724205017, + "learning_rate": 6.405224787363305e-05, + "loss": 1.7345, + "step": 5927 + }, + { + "epoch": 1.799362573986948, + "grad_norm": 0.5868936777114868, + "learning_rate": 6.404617253948968e-05, + "loss": 1.9107, + "step": 5928 + }, + { + "epoch": 1.7996661101836393, + "grad_norm": 0.5727707147598267, + "learning_rate": 6.40400972053463e-05, + "loss": 1.6983, + "step": 5929 + }, + { + "epoch": 1.799969646380331, + "grad_norm": 0.4878747761249542, + "learning_rate": 6.403402187120292e-05, + "loss": 1.6873, + "step": 5930 + }, + { + "epoch": 1.8002731825770222, + "grad_norm": 0.46012574434280396, + "learning_rate": 6.402794653705955e-05, + "loss": 1.6663, + "step": 5931 + }, + { + "epoch": 1.8005767187737138, + "grad_norm": 0.5912777781486511, + "learning_rate": 6.402187120291617e-05, + "loss": 1.8697, + "step": 5932 + }, + { + "epoch": 1.8008802549704053, + "grad_norm": 0.5376688241958618, + "learning_rate": 6.401579586877278e-05, + "loss": 1.7366, + "step": 5933 + }, + { + "epoch": 1.8011837911670967, + "grad_norm": 0.5066462755203247, + "learning_rate": 6.40097205346294e-05, + "loss": 1.6613, + "step": 5934 + }, + { + "epoch": 1.8014873273637881, + "grad_norm": 0.551946759223938, + "learning_rate": 6.400364520048603e-05, + "loss": 1.663, + "step": 5935 + }, + { + "epoch": 1.8017908635604796, + "grad_norm": 0.5621793866157532, + "learning_rate": 6.399756986634265e-05, + "loss": 1.4712, + "step": 5936 + }, + { + "epoch": 1.802094399757171, + "grad_norm": 0.5620403289794922, + "learning_rate": 6.399149453219927e-05, + "loss": 1.3263, + "step": 5937 + }, + { + "epoch": 1.8023979359538624, + "grad_norm": 0.5411643385887146, + "learning_rate": 6.39854191980559e-05, + "loss": 1.5606, + "step": 5938 + }, + { + "epoch": 1.802701472150554, + "grad_norm": 0.5093483924865723, + "learning_rate": 6.397934386391253e-05, + "loss": 1.8361, + "step": 5939 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.9590352773666382, + "learning_rate": 6.397326852976913e-05, + "loss": 1.8535, + "step": 5940 + }, + { + "epoch": 1.803308544543937, + "grad_norm": 0.5259732604026794, + "learning_rate": 6.396719319562576e-05, + "loss": 1.9654, + "step": 5941 + }, + { + "epoch": 1.8036120807406282, + "grad_norm": 0.427685022354126, + "learning_rate": 6.39611178614824e-05, + "loss": 1.5456, + "step": 5942 + }, + { + "epoch": 1.8039156169373198, + "grad_norm": 0.554628849029541, + "learning_rate": 6.3955042527339e-05, + "loss": 1.5662, + "step": 5943 + }, + { + "epoch": 1.8042191531340113, + "grad_norm": 0.5747518539428711, + "learning_rate": 6.394896719319563e-05, + "loss": 1.7766, + "step": 5944 + }, + { + "epoch": 1.8045226893307027, + "grad_norm": 0.5522488951683044, + "learning_rate": 6.394289185905226e-05, + "loss": 1.7876, + "step": 5945 + }, + { + "epoch": 1.8048262255273941, + "grad_norm": 0.6148546934127808, + "learning_rate": 6.393681652490888e-05, + "loss": 2.2045, + "step": 5946 + }, + { + "epoch": 1.8051297617240856, + "grad_norm": 0.5216572284698486, + "learning_rate": 6.39307411907655e-05, + "loss": 1.5652, + "step": 5947 + }, + { + "epoch": 1.805433297920777, + "grad_norm": 0.4756337106227875, + "learning_rate": 6.392466585662211e-05, + "loss": 1.7144, + "step": 5948 + }, + { + "epoch": 1.8057368341174684, + "grad_norm": 0.552494466304779, + "learning_rate": 6.391859052247874e-05, + "loss": 1.4098, + "step": 5949 + }, + { + "epoch": 1.80604037031416, + "grad_norm": 0.4858109951019287, + "learning_rate": 6.391251518833536e-05, + "loss": 1.8139, + "step": 5950 + }, + { + "epoch": 1.8063439065108513, + "grad_norm": 0.43136751651763916, + "learning_rate": 6.390643985419198e-05, + "loss": 1.3983, + "step": 5951 + }, + { + "epoch": 1.806647442707543, + "grad_norm": 0.5166333317756653, + "learning_rate": 6.390036452004861e-05, + "loss": 1.7784, + "step": 5952 + }, + { + "epoch": 1.8069509789042342, + "grad_norm": 0.5090475678443909, + "learning_rate": 6.389428918590522e-05, + "loss": 1.6639, + "step": 5953 + }, + { + "epoch": 1.8072545151009258, + "grad_norm": 0.492970734834671, + "learning_rate": 6.388821385176184e-05, + "loss": 1.6899, + "step": 5954 + }, + { + "epoch": 1.8075580512976173, + "grad_norm": 0.5264026522636414, + "learning_rate": 6.388213851761847e-05, + "loss": 1.5677, + "step": 5955 + }, + { + "epoch": 1.8078615874943087, + "grad_norm": 0.5379698872566223, + "learning_rate": 6.38760631834751e-05, + "loss": 1.8217, + "step": 5956 + }, + { + "epoch": 1.8081651236910001, + "grad_norm": 0.5782027244567871, + "learning_rate": 6.386998784933171e-05, + "loss": 1.7714, + "step": 5957 + }, + { + "epoch": 1.8084686598876916, + "grad_norm": 0.45545729994773865, + "learning_rate": 6.386391251518834e-05, + "loss": 1.712, + "step": 5958 + }, + { + "epoch": 1.8087721960843832, + "grad_norm": 0.6243247985839844, + "learning_rate": 6.385783718104497e-05, + "loss": 1.749, + "step": 5959 + }, + { + "epoch": 1.8090757322810744, + "grad_norm": 0.5548247694969177, + "learning_rate": 6.385176184690159e-05, + "loss": 1.6275, + "step": 5960 + }, + { + "epoch": 1.809379268477766, + "grad_norm": 0.5740573406219482, + "learning_rate": 6.38456865127582e-05, + "loss": 1.3607, + "step": 5961 + }, + { + "epoch": 1.8096828046744573, + "grad_norm": 0.4793718159198761, + "learning_rate": 6.383961117861482e-05, + "loss": 1.6703, + "step": 5962 + }, + { + "epoch": 1.809986340871149, + "grad_norm": 0.5316084623336792, + "learning_rate": 6.383353584447145e-05, + "loss": 1.7506, + "step": 5963 + }, + { + "epoch": 1.8102898770678404, + "grad_norm": 0.4940855801105499, + "learning_rate": 6.382746051032807e-05, + "loss": 1.8435, + "step": 5964 + }, + { + "epoch": 1.8105934132645318, + "grad_norm": 0.583676815032959, + "learning_rate": 6.382138517618469e-05, + "loss": 1.8004, + "step": 5965 + }, + { + "epoch": 1.8108969494612233, + "grad_norm": 0.4545220136642456, + "learning_rate": 6.381530984204132e-05, + "loss": 1.6069, + "step": 5966 + }, + { + "epoch": 1.8112004856579147, + "grad_norm": 0.49649152159690857, + "learning_rate": 6.380923450789794e-05, + "loss": 1.5472, + "step": 5967 + }, + { + "epoch": 1.8115040218546061, + "grad_norm": 0.5210832357406616, + "learning_rate": 6.380315917375455e-05, + "loss": 1.4535, + "step": 5968 + }, + { + "epoch": 1.8118075580512976, + "grad_norm": 0.5498248934745789, + "learning_rate": 6.379708383961118e-05, + "loss": 2.0708, + "step": 5969 + }, + { + "epoch": 1.8121110942479892, + "grad_norm": 0.46531158685684204, + "learning_rate": 6.379100850546781e-05, + "loss": 1.5227, + "step": 5970 + }, + { + "epoch": 1.8124146304446804, + "grad_norm": 0.6665170192718506, + "learning_rate": 6.378493317132442e-05, + "loss": 1.6048, + "step": 5971 + }, + { + "epoch": 1.812718166641372, + "grad_norm": 0.46771401166915894, + "learning_rate": 6.377885783718105e-05, + "loss": 1.9367, + "step": 5972 + }, + { + "epoch": 1.8130217028380633, + "grad_norm": 0.4593519866466522, + "learning_rate": 6.377278250303767e-05, + "loss": 1.8305, + "step": 5973 + }, + { + "epoch": 1.813325239034755, + "grad_norm": 0.5220834016799927, + "learning_rate": 6.37667071688943e-05, + "loss": 1.8261, + "step": 5974 + }, + { + "epoch": 1.8136287752314464, + "grad_norm": 0.5303083062171936, + "learning_rate": 6.376063183475091e-05, + "loss": 1.7589, + "step": 5975 + }, + { + "epoch": 1.8139323114281378, + "grad_norm": 0.6920210123062134, + "learning_rate": 6.375455650060753e-05, + "loss": 1.0832, + "step": 5976 + }, + { + "epoch": 1.8142358476248293, + "grad_norm": 0.49981755018234253, + "learning_rate": 6.374848116646416e-05, + "loss": 1.9543, + "step": 5977 + }, + { + "epoch": 1.8145393838215207, + "grad_norm": 0.49285998940467834, + "learning_rate": 6.374240583232078e-05, + "loss": 1.9847, + "step": 5978 + }, + { + "epoch": 1.8148429200182121, + "grad_norm": 0.571113109588623, + "learning_rate": 6.37363304981774e-05, + "loss": 0.9056, + "step": 5979 + }, + { + "epoch": 1.8151464562149036, + "grad_norm": 0.5736732482910156, + "learning_rate": 6.373025516403403e-05, + "loss": 1.554, + "step": 5980 + }, + { + "epoch": 1.8154499924115952, + "grad_norm": 0.5702198147773743, + "learning_rate": 6.372417982989065e-05, + "loss": 1.4019, + "step": 5981 + }, + { + "epoch": 1.8157535286082864, + "grad_norm": 0.5917282104492188, + "learning_rate": 6.371810449574726e-05, + "loss": 1.7169, + "step": 5982 + }, + { + "epoch": 1.816057064804978, + "grad_norm": 0.4668388366699219, + "learning_rate": 6.37120291616039e-05, + "loss": 1.3697, + "step": 5983 + }, + { + "epoch": 1.8163606010016693, + "grad_norm": 0.48866280913352966, + "learning_rate": 6.370595382746052e-05, + "loss": 1.8745, + "step": 5984 + }, + { + "epoch": 1.816664137198361, + "grad_norm": 0.5371363759040833, + "learning_rate": 6.369987849331713e-05, + "loss": 1.3244, + "step": 5985 + }, + { + "epoch": 1.8169676733950524, + "grad_norm": 0.7522103190422058, + "learning_rate": 6.369380315917376e-05, + "loss": 1.1633, + "step": 5986 + }, + { + "epoch": 1.8172712095917438, + "grad_norm": 0.5259113311767578, + "learning_rate": 6.368772782503038e-05, + "loss": 1.5496, + "step": 5987 + }, + { + "epoch": 1.8175747457884353, + "grad_norm": 0.5885976552963257, + "learning_rate": 6.368165249088701e-05, + "loss": 1.4305, + "step": 5988 + }, + { + "epoch": 1.8178782819851267, + "grad_norm": 0.5653246641159058, + "learning_rate": 6.367557715674362e-05, + "loss": 1.8765, + "step": 5989 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.5662322044372559, + "learning_rate": 6.366950182260024e-05, + "loss": 1.8685, + "step": 5990 + }, + { + "epoch": 1.8184853543785096, + "grad_norm": 0.4957374632358551, + "learning_rate": 6.366342648845687e-05, + "loss": 1.7966, + "step": 5991 + }, + { + "epoch": 1.8187888905752012, + "grad_norm": 0.575343906879425, + "learning_rate": 6.365735115431349e-05, + "loss": 1.6969, + "step": 5992 + }, + { + "epoch": 1.8190924267718924, + "grad_norm": 0.5825738906860352, + "learning_rate": 6.365127582017011e-05, + "loss": 1.6724, + "step": 5993 + }, + { + "epoch": 1.819395962968584, + "grad_norm": 0.5546246767044067, + "learning_rate": 6.364520048602674e-05, + "loss": 1.769, + "step": 5994 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.6090353727340698, + "learning_rate": 6.363912515188336e-05, + "loss": 1.5532, + "step": 5995 + }, + { + "epoch": 1.820003035361967, + "grad_norm": 0.6108836531639099, + "learning_rate": 6.363304981773997e-05, + "loss": 1.7435, + "step": 5996 + }, + { + "epoch": 1.8203065715586584, + "grad_norm": 0.5270663499832153, + "learning_rate": 6.36269744835966e-05, + "loss": 1.4083, + "step": 5997 + }, + { + "epoch": 1.8206101077553498, + "grad_norm": 0.5462440848350525, + "learning_rate": 6.362089914945323e-05, + "loss": 1.3642, + "step": 5998 + }, + { + "epoch": 1.8209136439520412, + "grad_norm": 0.6120818257331848, + "learning_rate": 6.361482381530984e-05, + "loss": 1.3654, + "step": 5999 + }, + { + "epoch": 1.8212171801487327, + "grad_norm": 0.5536101460456848, + "learning_rate": 6.360874848116647e-05, + "loss": 1.6613, + "step": 6000 + }, + { + "epoch": 1.8215207163454243, + "grad_norm": 0.5645522475242615, + "learning_rate": 6.360267314702309e-05, + "loss": 1.585, + "step": 6001 + }, + { + "epoch": 1.8218242525421156, + "grad_norm": 0.7577182054519653, + "learning_rate": 6.359659781287972e-05, + "loss": 1.596, + "step": 6002 + }, + { + "epoch": 1.8221277887388072, + "grad_norm": 0.5090943574905396, + "learning_rate": 6.359052247873633e-05, + "loss": 1.9588, + "step": 6003 + }, + { + "epoch": 1.8224313249354984, + "grad_norm": 0.6380836963653564, + "learning_rate": 6.358444714459295e-05, + "loss": 1.8904, + "step": 6004 + }, + { + "epoch": 1.82273486113219, + "grad_norm": 0.5438816547393799, + "learning_rate": 6.357837181044958e-05, + "loss": 1.4335, + "step": 6005 + }, + { + "epoch": 1.8230383973288815, + "grad_norm": 0.5515280961990356, + "learning_rate": 6.35722964763062e-05, + "loss": 1.7108, + "step": 6006 + }, + { + "epoch": 1.823341933525573, + "grad_norm": 0.5127303600311279, + "learning_rate": 6.356622114216282e-05, + "loss": 1.8656, + "step": 6007 + }, + { + "epoch": 1.8236454697222644, + "grad_norm": 0.574600338935852, + "learning_rate": 6.356014580801945e-05, + "loss": 1.8687, + "step": 6008 + }, + { + "epoch": 1.8239490059189558, + "grad_norm": 0.5011196136474609, + "learning_rate": 6.355407047387607e-05, + "loss": 1.0319, + "step": 6009 + }, + { + "epoch": 1.8242525421156472, + "grad_norm": 0.48129895329475403, + "learning_rate": 6.354799513973268e-05, + "loss": 1.9097, + "step": 6010 + }, + { + "epoch": 1.8245560783123387, + "grad_norm": 0.5280462503433228, + "learning_rate": 6.354191980558931e-05, + "loss": 1.7439, + "step": 6011 + }, + { + "epoch": 1.8248596145090303, + "grad_norm": 0.558509886264801, + "learning_rate": 6.353584447144594e-05, + "loss": 1.6233, + "step": 6012 + }, + { + "epoch": 1.8251631507057215, + "grad_norm": 0.5026416182518005, + "learning_rate": 6.352976913730255e-05, + "loss": 1.9798, + "step": 6013 + }, + { + "epoch": 1.8254666869024132, + "grad_norm": 0.7570869326591492, + "learning_rate": 6.352369380315918e-05, + "loss": 1.509, + "step": 6014 + }, + { + "epoch": 1.8257702230991044, + "grad_norm": 0.5660725235939026, + "learning_rate": 6.35176184690158e-05, + "loss": 1.6066, + "step": 6015 + }, + { + "epoch": 1.826073759295796, + "grad_norm": 0.5537878274917603, + "learning_rate": 6.351154313487241e-05, + "loss": 1.5985, + "step": 6016 + }, + { + "epoch": 1.8263772954924875, + "grad_norm": 0.8709487318992615, + "learning_rate": 6.350546780072904e-05, + "loss": 1.1894, + "step": 6017 + }, + { + "epoch": 1.826680831689179, + "grad_norm": 0.5389176607131958, + "learning_rate": 6.349939246658566e-05, + "loss": 1.7235, + "step": 6018 + }, + { + "epoch": 1.8269843678858704, + "grad_norm": 0.5328723788261414, + "learning_rate": 6.349331713244229e-05, + "loss": 1.6992, + "step": 6019 + }, + { + "epoch": 1.8272879040825618, + "grad_norm": 0.5136182308197021, + "learning_rate": 6.348724179829891e-05, + "loss": 1.5087, + "step": 6020 + }, + { + "epoch": 1.8275914402792535, + "grad_norm": 0.5750499367713928, + "learning_rate": 6.348116646415553e-05, + "loss": 1.7912, + "step": 6021 + }, + { + "epoch": 1.8278949764759447, + "grad_norm": 0.5460924506187439, + "learning_rate": 6.347509113001216e-05, + "loss": 1.7777, + "step": 6022 + }, + { + "epoch": 1.8281985126726363, + "grad_norm": 0.5502283573150635, + "learning_rate": 6.346901579586878e-05, + "loss": 1.5873, + "step": 6023 + }, + { + "epoch": 1.8285020488693275, + "grad_norm": 0.551810622215271, + "learning_rate": 6.346294046172539e-05, + "loss": 1.9483, + "step": 6024 + }, + { + "epoch": 1.8288055850660192, + "grad_norm": 0.4859268367290497, + "learning_rate": 6.345686512758202e-05, + "loss": 2.0799, + "step": 6025 + }, + { + "epoch": 1.8291091212627104, + "grad_norm": 0.48601651191711426, + "learning_rate": 6.345078979343865e-05, + "loss": 1.5712, + "step": 6026 + }, + { + "epoch": 1.829412657459402, + "grad_norm": 0.5264910459518433, + "learning_rate": 6.344471445929526e-05, + "loss": 1.8402, + "step": 6027 + }, + { + "epoch": 1.8297161936560935, + "grad_norm": 0.5978955030441284, + "learning_rate": 6.343863912515189e-05, + "loss": 1.7215, + "step": 6028 + }, + { + "epoch": 1.830019729852785, + "grad_norm": 0.5713289976119995, + "learning_rate": 6.34325637910085e-05, + "loss": 1.9287, + "step": 6029 + }, + { + "epoch": 1.8303232660494764, + "grad_norm": 0.5466620922088623, + "learning_rate": 6.342648845686512e-05, + "loss": 1.5087, + "step": 6030 + }, + { + "epoch": 1.8306268022461678, + "grad_norm": 0.5094414949417114, + "learning_rate": 6.342041312272175e-05, + "loss": 1.8889, + "step": 6031 + }, + { + "epoch": 1.8309303384428595, + "grad_norm": 0.536065936088562, + "learning_rate": 6.341433778857837e-05, + "loss": 1.7133, + "step": 6032 + }, + { + "epoch": 1.8312338746395507, + "grad_norm": 0.5807048678398132, + "learning_rate": 6.3408262454435e-05, + "loss": 1.4843, + "step": 6033 + }, + { + "epoch": 1.8315374108362423, + "grad_norm": 0.4928985834121704, + "learning_rate": 6.340218712029162e-05, + "loss": 0.9791, + "step": 6034 + }, + { + "epoch": 1.8318409470329335, + "grad_norm": 0.5296457409858704, + "learning_rate": 6.339611178614824e-05, + "loss": 1.8372, + "step": 6035 + }, + { + "epoch": 1.8321444832296252, + "grad_norm": 0.6519824862480164, + "learning_rate": 6.339003645200487e-05, + "loss": 1.1811, + "step": 6036 + }, + { + "epoch": 1.8324480194263166, + "grad_norm": 0.5059677362442017, + "learning_rate": 6.338396111786149e-05, + "loss": 2.0749, + "step": 6037 + }, + { + "epoch": 1.832751555623008, + "grad_norm": 0.5943655371665955, + "learning_rate": 6.33778857837181e-05, + "loss": 1.2195, + "step": 6038 + }, + { + "epoch": 1.8330550918196995, + "grad_norm": 0.5527958273887634, + "learning_rate": 6.337181044957473e-05, + "loss": 1.4819, + "step": 6039 + }, + { + "epoch": 1.833358628016391, + "grad_norm": 0.5767375826835632, + "learning_rate": 6.336573511543135e-05, + "loss": 1.77, + "step": 6040 + }, + { + "epoch": 1.8336621642130824, + "grad_norm": 0.46123191714286804, + "learning_rate": 6.335965978128797e-05, + "loss": 1.2523, + "step": 6041 + }, + { + "epoch": 1.8339657004097738, + "grad_norm": 0.6473827958106995, + "learning_rate": 6.33535844471446e-05, + "loss": 1.3337, + "step": 6042 + }, + { + "epoch": 1.8342692366064655, + "grad_norm": 0.5438799858093262, + "learning_rate": 6.334750911300122e-05, + "loss": 1.3156, + "step": 6043 + }, + { + "epoch": 1.8345727728031567, + "grad_norm": 0.658911943435669, + "learning_rate": 6.334143377885783e-05, + "loss": 1.7473, + "step": 6044 + }, + { + "epoch": 1.8348763089998483, + "grad_norm": 0.5106703042984009, + "learning_rate": 6.333535844471446e-05, + "loss": 1.7683, + "step": 6045 + }, + { + "epoch": 1.8351798451965395, + "grad_norm": 0.5918740630149841, + "learning_rate": 6.332928311057108e-05, + "loss": 1.2451, + "step": 6046 + }, + { + "epoch": 1.8354833813932312, + "grad_norm": 0.49266964197158813, + "learning_rate": 6.332320777642771e-05, + "loss": 1.255, + "step": 6047 + }, + { + "epoch": 1.8357869175899226, + "grad_norm": 0.48280391097068787, + "learning_rate": 6.331713244228433e-05, + "loss": 1.7687, + "step": 6048 + }, + { + "epoch": 1.836090453786614, + "grad_norm": 0.51821368932724, + "learning_rate": 6.331105710814095e-05, + "loss": 1.8662, + "step": 6049 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.5473127365112305, + "learning_rate": 6.330498177399758e-05, + "loss": 1.3156, + "step": 6050 + }, + { + "epoch": 1.836697526179997, + "grad_norm": 0.5145398378372192, + "learning_rate": 6.32989064398542e-05, + "loss": 1.748, + "step": 6051 + }, + { + "epoch": 1.8370010623766884, + "grad_norm": 0.4762469530105591, + "learning_rate": 6.329283110571081e-05, + "loss": 1.9218, + "step": 6052 + }, + { + "epoch": 1.8373045985733798, + "grad_norm": 0.4372353255748749, + "learning_rate": 6.328675577156744e-05, + "loss": 1.958, + "step": 6053 + }, + { + "epoch": 1.8376081347700715, + "grad_norm": 0.47850942611694336, + "learning_rate": 6.328068043742406e-05, + "loss": 1.9153, + "step": 6054 + }, + { + "epoch": 1.8379116709667627, + "grad_norm": 0.5003155469894409, + "learning_rate": 6.327460510328068e-05, + "loss": 1.8781, + "step": 6055 + }, + { + "epoch": 1.8382152071634543, + "grad_norm": 0.5914559960365295, + "learning_rate": 6.326852976913731e-05, + "loss": 1.6648, + "step": 6056 + }, + { + "epoch": 1.8385187433601455, + "grad_norm": 0.5313575267791748, + "learning_rate": 6.326245443499393e-05, + "loss": 1.1451, + "step": 6057 + }, + { + "epoch": 1.8388222795568372, + "grad_norm": 0.4557872414588928, + "learning_rate": 6.325637910085054e-05, + "loss": 1.6781, + "step": 6058 + }, + { + "epoch": 1.8391258157535286, + "grad_norm": 0.537188708782196, + "learning_rate": 6.325030376670717e-05, + "loss": 1.4251, + "step": 6059 + }, + { + "epoch": 1.83942935195022, + "grad_norm": 0.5582257509231567, + "learning_rate": 6.324422843256379e-05, + "loss": 2.0075, + "step": 6060 + }, + { + "epoch": 1.8397328881469115, + "grad_norm": 0.5774347186088562, + "learning_rate": 6.323815309842042e-05, + "loss": 2.0587, + "step": 6061 + }, + { + "epoch": 1.840036424343603, + "grad_norm": 0.5662617683410645, + "learning_rate": 6.323207776427704e-05, + "loss": 1.7198, + "step": 6062 + }, + { + "epoch": 1.8403399605402946, + "grad_norm": 0.5646880269050598, + "learning_rate": 6.322600243013366e-05, + "loss": 1.8586, + "step": 6063 + }, + { + "epoch": 1.8406434967369858, + "grad_norm": 0.53641676902771, + "learning_rate": 6.321992709599029e-05, + "loss": 0.9103, + "step": 6064 + }, + { + "epoch": 1.8409470329336775, + "grad_norm": 0.5239225625991821, + "learning_rate": 6.32138517618469e-05, + "loss": 1.5655, + "step": 6065 + }, + { + "epoch": 1.8412505691303687, + "grad_norm": 0.4748789668083191, + "learning_rate": 6.320777642770352e-05, + "loss": 1.2857, + "step": 6066 + }, + { + "epoch": 1.8415541053270603, + "grad_norm": 0.5578672289848328, + "learning_rate": 6.320170109356015e-05, + "loss": 1.6388, + "step": 6067 + }, + { + "epoch": 1.8418576415237518, + "grad_norm": 0.48566946387290955, + "learning_rate": 6.319562575941677e-05, + "loss": 1.7807, + "step": 6068 + }, + { + "epoch": 1.8421611777204432, + "grad_norm": 0.49624478816986084, + "learning_rate": 6.318955042527339e-05, + "loss": 1.5199, + "step": 6069 + }, + { + "epoch": 1.8424647139171346, + "grad_norm": 0.5602688789367676, + "learning_rate": 6.318347509113002e-05, + "loss": 1.1253, + "step": 6070 + }, + { + "epoch": 1.842768250113826, + "grad_norm": 0.5979968309402466, + "learning_rate": 6.317739975698664e-05, + "loss": 1.4047, + "step": 6071 + }, + { + "epoch": 1.8430717863105175, + "grad_norm": 0.5504110455513, + "learning_rate": 6.317132442284325e-05, + "loss": 1.8578, + "step": 6072 + }, + { + "epoch": 1.843375322507209, + "grad_norm": 0.9628989696502686, + "learning_rate": 6.316524908869988e-05, + "loss": 1.363, + "step": 6073 + }, + { + "epoch": 1.8436788587039006, + "grad_norm": 0.5489274859428406, + "learning_rate": 6.31591737545565e-05, + "loss": 1.351, + "step": 6074 + }, + { + "epoch": 1.8439823949005918, + "grad_norm": 0.6439406275749207, + "learning_rate": 6.315309842041313e-05, + "loss": 1.7498, + "step": 6075 + }, + { + "epoch": 1.8442859310972834, + "grad_norm": 0.5306099653244019, + "learning_rate": 6.314702308626975e-05, + "loss": 1.6709, + "step": 6076 + }, + { + "epoch": 1.8445894672939747, + "grad_norm": 0.5945127606391907, + "learning_rate": 6.314094775212637e-05, + "loss": 1.4394, + "step": 6077 + }, + { + "epoch": 1.8448930034906663, + "grad_norm": 0.600626528263092, + "learning_rate": 6.3134872417983e-05, + "loss": 1.9918, + "step": 6078 + }, + { + "epoch": 1.8451965396873578, + "grad_norm": 0.5113967061042786, + "learning_rate": 6.312879708383962e-05, + "loss": 1.9253, + "step": 6079 + }, + { + "epoch": 1.8455000758840492, + "grad_norm": 0.5761411786079407, + "learning_rate": 6.312272174969623e-05, + "loss": 1.512, + "step": 6080 + }, + { + "epoch": 1.8458036120807406, + "grad_norm": 0.5874478816986084, + "learning_rate": 6.311664641555286e-05, + "loss": 1.5656, + "step": 6081 + }, + { + "epoch": 1.846107148277432, + "grad_norm": 0.611596941947937, + "learning_rate": 6.311057108140948e-05, + "loss": 1.5055, + "step": 6082 + }, + { + "epoch": 1.8464106844741235, + "grad_norm": 0.4646596908569336, + "learning_rate": 6.31044957472661e-05, + "loss": 1.8092, + "step": 6083 + }, + { + "epoch": 1.846714220670815, + "grad_norm": 0.6680156588554382, + "learning_rate": 6.309842041312273e-05, + "loss": 1.3079, + "step": 6084 + }, + { + "epoch": 1.8470177568675066, + "grad_norm": 0.5887166261672974, + "learning_rate": 6.309234507897935e-05, + "loss": 1.6493, + "step": 6085 + }, + { + "epoch": 1.8473212930641978, + "grad_norm": 0.6366755962371826, + "learning_rate": 6.308626974483596e-05, + "loss": 1.4721, + "step": 6086 + }, + { + "epoch": 1.8476248292608894, + "grad_norm": 0.5246219635009766, + "learning_rate": 6.30801944106926e-05, + "loss": 1.877, + "step": 6087 + }, + { + "epoch": 1.8479283654575807, + "grad_norm": 0.6656885743141174, + "learning_rate": 6.307411907654921e-05, + "loss": 1.9393, + "step": 6088 + }, + { + "epoch": 1.8482319016542723, + "grad_norm": 0.5601686835289001, + "learning_rate": 6.306804374240583e-05, + "loss": 1.0694, + "step": 6089 + }, + { + "epoch": 1.8485354378509637, + "grad_norm": 0.8006256222724915, + "learning_rate": 6.306196840826246e-05, + "loss": 1.4974, + "step": 6090 + }, + { + "epoch": 1.8488389740476552, + "grad_norm": 0.49230626225471497, + "learning_rate": 6.305589307411908e-05, + "loss": 1.6028, + "step": 6091 + }, + { + "epoch": 1.8491425102443466, + "grad_norm": 0.6775741577148438, + "learning_rate": 6.304981773997571e-05, + "loss": 1.7014, + "step": 6092 + }, + { + "epoch": 1.849446046441038, + "grad_norm": 0.55632483959198, + "learning_rate": 6.304374240583233e-05, + "loss": 1.2937, + "step": 6093 + }, + { + "epoch": 1.8497495826377297, + "grad_norm": 0.494138240814209, + "learning_rate": 6.303766707168894e-05, + "loss": 1.4426, + "step": 6094 + }, + { + "epoch": 1.850053118834421, + "grad_norm": 0.5821829438209534, + "learning_rate": 6.303159173754557e-05, + "loss": 1.4578, + "step": 6095 + }, + { + "epoch": 1.8503566550311126, + "grad_norm": 0.4682660400867462, + "learning_rate": 6.302551640340219e-05, + "loss": 1.7926, + "step": 6096 + }, + { + "epoch": 1.8506601912278038, + "grad_norm": 0.5352194309234619, + "learning_rate": 6.301944106925881e-05, + "loss": 1.7727, + "step": 6097 + }, + { + "epoch": 1.8509637274244954, + "grad_norm": 0.6106446385383606, + "learning_rate": 6.301336573511544e-05, + "loss": 1.3591, + "step": 6098 + }, + { + "epoch": 1.8512672636211869, + "grad_norm": 0.575634777545929, + "learning_rate": 6.300729040097206e-05, + "loss": 1.1851, + "step": 6099 + }, + { + "epoch": 1.8515707998178783, + "grad_norm": 0.976506233215332, + "learning_rate": 6.300121506682867e-05, + "loss": 1.481, + "step": 6100 + }, + { + "epoch": 1.8518743360145697, + "grad_norm": 1.6843774318695068, + "learning_rate": 6.29951397326853e-05, + "loss": 1.8745, + "step": 6101 + }, + { + "epoch": 1.8521778722112612, + "grad_norm": 0.5201796293258667, + "learning_rate": 6.298906439854192e-05, + "loss": 1.5802, + "step": 6102 + }, + { + "epoch": 1.8524814084079526, + "grad_norm": 0.4609794616699219, + "learning_rate": 6.298298906439854e-05, + "loss": 1.57, + "step": 6103 + }, + { + "epoch": 1.852784944604644, + "grad_norm": 0.42774495482444763, + "learning_rate": 6.297691373025517e-05, + "loss": 1.5364, + "step": 6104 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.5276924967765808, + "learning_rate": 6.297083839611179e-05, + "loss": 1.5617, + "step": 6105 + }, + { + "epoch": 1.853392016998027, + "grad_norm": 0.5043931007385254, + "learning_rate": 6.296476306196842e-05, + "loss": 1.8358, + "step": 6106 + }, + { + "epoch": 1.8536955531947186, + "grad_norm": 0.5033897161483765, + "learning_rate": 6.295868772782504e-05, + "loss": 1.7926, + "step": 6107 + }, + { + "epoch": 1.8539990893914098, + "grad_norm": 0.5580025315284729, + "learning_rate": 6.295261239368165e-05, + "loss": 1.4676, + "step": 6108 + }, + { + "epoch": 1.8543026255881014, + "grad_norm": 1.387598991394043, + "learning_rate": 6.294653705953828e-05, + "loss": 1.3955, + "step": 6109 + }, + { + "epoch": 1.8546061617847929, + "grad_norm": 0.576951265335083, + "learning_rate": 6.29404617253949e-05, + "loss": 1.638, + "step": 6110 + }, + { + "epoch": 1.8549096979814843, + "grad_norm": 0.5881240963935852, + "learning_rate": 6.293438639125152e-05, + "loss": 1.5124, + "step": 6111 + }, + { + "epoch": 1.8552132341781757, + "grad_norm": 0.4950685501098633, + "learning_rate": 6.292831105710815e-05, + "loss": 1.3096, + "step": 6112 + }, + { + "epoch": 1.8555167703748672, + "grad_norm": 0.6170344948768616, + "learning_rate": 6.292223572296477e-05, + "loss": 1.4772, + "step": 6113 + }, + { + "epoch": 1.8558203065715586, + "grad_norm": 0.6243839263916016, + "learning_rate": 6.291616038882138e-05, + "loss": 1.681, + "step": 6114 + }, + { + "epoch": 1.85612384276825, + "grad_norm": 0.5996612906455994, + "learning_rate": 6.291008505467801e-05, + "loss": 1.5997, + "step": 6115 + }, + { + "epoch": 1.8564273789649417, + "grad_norm": 0.5222072005271912, + "learning_rate": 6.290400972053463e-05, + "loss": 1.435, + "step": 6116 + }, + { + "epoch": 1.856730915161633, + "grad_norm": 0.5111011266708374, + "learning_rate": 6.289793438639125e-05, + "loss": 1.8543, + "step": 6117 + }, + { + "epoch": 1.8570344513583246, + "grad_norm": 0.5086125135421753, + "learning_rate": 6.289185905224788e-05, + "loss": 1.8475, + "step": 6118 + }, + { + "epoch": 1.8573379875550158, + "grad_norm": 0.445705771446228, + "learning_rate": 6.28857837181045e-05, + "loss": 1.7086, + "step": 6119 + }, + { + "epoch": 1.8576415237517074, + "grad_norm": 0.5179358720779419, + "learning_rate": 6.287970838396113e-05, + "loss": 1.6929, + "step": 6120 + }, + { + "epoch": 1.8579450599483989, + "grad_norm": 0.5571700930595398, + "learning_rate": 6.287363304981775e-05, + "loss": 1.6619, + "step": 6121 + }, + { + "epoch": 1.8582485961450903, + "grad_norm": 0.6373270153999329, + "learning_rate": 6.286755771567436e-05, + "loss": 1.7158, + "step": 6122 + }, + { + "epoch": 1.8585521323417817, + "grad_norm": 0.7662277817726135, + "learning_rate": 6.2861482381531e-05, + "loss": 1.506, + "step": 6123 + }, + { + "epoch": 1.8588556685384732, + "grad_norm": 0.6126819252967834, + "learning_rate": 6.285540704738761e-05, + "loss": 1.4747, + "step": 6124 + }, + { + "epoch": 1.8591592047351648, + "grad_norm": 0.48531535267829895, + "learning_rate": 6.284933171324423e-05, + "loss": 1.3161, + "step": 6125 + }, + { + "epoch": 1.859462740931856, + "grad_norm": 0.6532206535339355, + "learning_rate": 6.284325637910086e-05, + "loss": 1.5314, + "step": 6126 + }, + { + "epoch": 1.8597662771285477, + "grad_norm": 0.5442935228347778, + "learning_rate": 6.283718104495748e-05, + "loss": 1.8921, + "step": 6127 + }, + { + "epoch": 1.860069813325239, + "grad_norm": 0.890833854675293, + "learning_rate": 6.28311057108141e-05, + "loss": 1.509, + "step": 6128 + }, + { + "epoch": 1.8603733495219306, + "grad_norm": 0.5734865069389343, + "learning_rate": 6.282503037667072e-05, + "loss": 1.5688, + "step": 6129 + }, + { + "epoch": 1.860676885718622, + "grad_norm": 0.48190489411354065, + "learning_rate": 6.281895504252734e-05, + "loss": 1.1954, + "step": 6130 + }, + { + "epoch": 1.8609804219153134, + "grad_norm": 0.6809987425804138, + "learning_rate": 6.281287970838396e-05, + "loss": 1.897, + "step": 6131 + }, + { + "epoch": 1.8612839581120049, + "grad_norm": 0.6227823495864868, + "learning_rate": 6.280680437424059e-05, + "loss": 1.9167, + "step": 6132 + }, + { + "epoch": 1.8615874943086963, + "grad_norm": 0.5331142544746399, + "learning_rate": 6.280072904009721e-05, + "loss": 1.6832, + "step": 6133 + }, + { + "epoch": 1.8618910305053877, + "grad_norm": 0.4640684425830841, + "learning_rate": 6.279465370595384e-05, + "loss": 1.5393, + "step": 6134 + }, + { + "epoch": 1.8621945667020792, + "grad_norm": 0.4620974659919739, + "learning_rate": 6.278857837181046e-05, + "loss": 1.9326, + "step": 6135 + }, + { + "epoch": 1.8624981028987708, + "grad_norm": 0.6997519135475159, + "learning_rate": 6.278250303766707e-05, + "loss": 1.3935, + "step": 6136 + }, + { + "epoch": 1.862801639095462, + "grad_norm": 0.5011730790138245, + "learning_rate": 6.27764277035237e-05, + "loss": 1.7596, + "step": 6137 + }, + { + "epoch": 1.8631051752921537, + "grad_norm": 0.4842537045478821, + "learning_rate": 6.277035236938031e-05, + "loss": 0.8998, + "step": 6138 + }, + { + "epoch": 1.863408711488845, + "grad_norm": 0.5254935622215271, + "learning_rate": 6.276427703523694e-05, + "loss": 1.775, + "step": 6139 + }, + { + "epoch": 1.8637122476855366, + "grad_norm": 0.5326409339904785, + "learning_rate": 6.275820170109357e-05, + "loss": 1.7003, + "step": 6140 + }, + { + "epoch": 1.864015783882228, + "grad_norm": 0.5449077486991882, + "learning_rate": 6.275212636695019e-05, + "loss": 1.6728, + "step": 6141 + }, + { + "epoch": 1.8643193200789194, + "grad_norm": 0.4609127938747406, + "learning_rate": 6.27460510328068e-05, + "loss": 1.7144, + "step": 6142 + }, + { + "epoch": 1.8646228562756109, + "grad_norm": 0.891974151134491, + "learning_rate": 6.273997569866343e-05, + "loss": 1.693, + "step": 6143 + }, + { + "epoch": 1.8649263924723023, + "grad_norm": 0.5309075713157654, + "learning_rate": 6.273390036452005e-05, + "loss": 1.3689, + "step": 6144 + }, + { + "epoch": 1.8652299286689937, + "grad_norm": 0.5689653754234314, + "learning_rate": 6.272782503037667e-05, + "loss": 1.7364, + "step": 6145 + }, + { + "epoch": 1.8655334648656852, + "grad_norm": 0.49552464485168457, + "learning_rate": 6.27217496962333e-05, + "loss": 1.9449, + "step": 6146 + }, + { + "epoch": 1.8658370010623768, + "grad_norm": 0.5504531264305115, + "learning_rate": 6.271567436208992e-05, + "loss": 1.7631, + "step": 6147 + }, + { + "epoch": 1.866140537259068, + "grad_norm": 1.0592516660690308, + "learning_rate": 6.270959902794655e-05, + "loss": 1.5151, + "step": 6148 + }, + { + "epoch": 1.8664440734557597, + "grad_norm": 0.5770890712738037, + "learning_rate": 6.270352369380315e-05, + "loss": 1.5414, + "step": 6149 + }, + { + "epoch": 1.866747609652451, + "grad_norm": 0.611977756023407, + "learning_rate": 6.269744835965978e-05, + "loss": 1.3377, + "step": 6150 + }, + { + "epoch": 1.8670511458491426, + "grad_norm": 0.5175086855888367, + "learning_rate": 6.269137302551641e-05, + "loss": 1.8129, + "step": 6151 + }, + { + "epoch": 1.867354682045834, + "grad_norm": 0.4967375099658966, + "learning_rate": 6.268529769137302e-05, + "loss": 1.6175, + "step": 6152 + }, + { + "epoch": 1.8676582182425254, + "grad_norm": 0.5550147891044617, + "learning_rate": 6.267922235722965e-05, + "loss": 1.9646, + "step": 6153 + }, + { + "epoch": 1.8679617544392169, + "grad_norm": 0.5940298438072205, + "learning_rate": 6.267314702308628e-05, + "loss": 1.8358, + "step": 6154 + }, + { + "epoch": 1.8682652906359083, + "grad_norm": 0.4860624074935913, + "learning_rate": 6.26670716889429e-05, + "loss": 1.7674, + "step": 6155 + }, + { + "epoch": 1.8685688268326, + "grad_norm": 0.5743589401245117, + "learning_rate": 6.266099635479951e-05, + "loss": 1.6537, + "step": 6156 + }, + { + "epoch": 1.8688723630292912, + "grad_norm": 0.5607566237449646, + "learning_rate": 6.265492102065614e-05, + "loss": 1.4747, + "step": 6157 + }, + { + "epoch": 1.8691758992259828, + "grad_norm": 0.5863677263259888, + "learning_rate": 6.264884568651276e-05, + "loss": 1.6027, + "step": 6158 + }, + { + "epoch": 1.869479435422674, + "grad_norm": 0.437791645526886, + "learning_rate": 6.264277035236938e-05, + "loss": 1.1876, + "step": 6159 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.6220367550849915, + "learning_rate": 6.263669501822601e-05, + "loss": 1.5344, + "step": 6160 + }, + { + "epoch": 1.8700865078160571, + "grad_norm": 0.5647712349891663, + "learning_rate": 6.263061968408263e-05, + "loss": 1.598, + "step": 6161 + }, + { + "epoch": 1.8703900440127486, + "grad_norm": 0.46893787384033203, + "learning_rate": 6.262454434993924e-05, + "loss": 1.3591, + "step": 6162 + }, + { + "epoch": 1.87069358020944, + "grad_norm": 0.6015875935554504, + "learning_rate": 6.261846901579586e-05, + "loss": 1.895, + "step": 6163 + }, + { + "epoch": 1.8709971164061314, + "grad_norm": 0.5706945061683655, + "learning_rate": 6.261239368165249e-05, + "loss": 1.8152, + "step": 6164 + }, + { + "epoch": 1.8713006526028229, + "grad_norm": 0.6036109328269958, + "learning_rate": 6.260631834750912e-05, + "loss": 1.7984, + "step": 6165 + }, + { + "epoch": 1.8716041887995143, + "grad_norm": 0.521976888179779, + "learning_rate": 6.260024301336573e-05, + "loss": 1.7626, + "step": 6166 + }, + { + "epoch": 1.871907724996206, + "grad_norm": 0.5366547107696533, + "learning_rate": 6.259416767922236e-05, + "loss": 1.8297, + "step": 6167 + }, + { + "epoch": 1.8722112611928972, + "grad_norm": 0.4634724259376526, + "learning_rate": 6.258809234507899e-05, + "loss": 1.7023, + "step": 6168 + }, + { + "epoch": 1.8725147973895888, + "grad_norm": 0.5139271020889282, + "learning_rate": 6.25820170109356e-05, + "loss": 1.2409, + "step": 6169 + }, + { + "epoch": 1.87281833358628, + "grad_norm": 0.42354339361190796, + "learning_rate": 6.257594167679222e-05, + "loss": 1.735, + "step": 6170 + }, + { + "epoch": 1.8731218697829717, + "grad_norm": 0.48973771929740906, + "learning_rate": 6.256986634264885e-05, + "loss": 1.8408, + "step": 6171 + }, + { + "epoch": 1.8734254059796631, + "grad_norm": 0.4284050166606903, + "learning_rate": 6.256379100850547e-05, + "loss": 0.5575, + "step": 6172 + }, + { + "epoch": 1.8737289421763546, + "grad_norm": 0.6064922213554382, + "learning_rate": 6.255771567436209e-05, + "loss": 1.293, + "step": 6173 + }, + { + "epoch": 1.874032478373046, + "grad_norm": 0.6467086672782898, + "learning_rate": 6.255164034021872e-05, + "loss": 1.1203, + "step": 6174 + }, + { + "epoch": 1.8743360145697374, + "grad_norm": 0.5496531128883362, + "learning_rate": 6.254556500607534e-05, + "loss": 1.5393, + "step": 6175 + }, + { + "epoch": 1.8746395507664289, + "grad_norm": 0.4759608507156372, + "learning_rate": 6.253948967193195e-05, + "loss": 1.7542, + "step": 6176 + }, + { + "epoch": 1.8749430869631203, + "grad_norm": 0.5617760419845581, + "learning_rate": 6.253341433778857e-05, + "loss": 1.7047, + "step": 6177 + }, + { + "epoch": 1.875246623159812, + "grad_norm": 0.5658890604972839, + "learning_rate": 6.25273390036452e-05, + "loss": 1.4287, + "step": 6178 + }, + { + "epoch": 1.8755501593565032, + "grad_norm": 0.357480525970459, + "learning_rate": 6.252126366950183e-05, + "loss": 1.1164, + "step": 6179 + }, + { + "epoch": 1.8758536955531948, + "grad_norm": 0.64265376329422, + "learning_rate": 6.251518833535844e-05, + "loss": 1.0389, + "step": 6180 + }, + { + "epoch": 1.876157231749886, + "grad_norm": 0.5272260904312134, + "learning_rate": 6.250911300121507e-05, + "loss": 1.7431, + "step": 6181 + }, + { + "epoch": 1.8764607679465777, + "grad_norm": 0.6049331426620483, + "learning_rate": 6.25030376670717e-05, + "loss": 1.43, + "step": 6182 + }, + { + "epoch": 1.8767643041432691, + "grad_norm": 0.5652697682380676, + "learning_rate": 6.249696233292832e-05, + "loss": 1.091, + "step": 6183 + }, + { + "epoch": 1.8770678403399605, + "grad_norm": 0.6228081583976746, + "learning_rate": 6.249088699878493e-05, + "loss": 1.5427, + "step": 6184 + }, + { + "epoch": 1.877371376536652, + "grad_norm": 0.6637395620346069, + "learning_rate": 6.248481166464156e-05, + "loss": 1.3756, + "step": 6185 + }, + { + "epoch": 1.8776749127333434, + "grad_norm": 0.5850158333778381, + "learning_rate": 6.247873633049818e-05, + "loss": 1.7537, + "step": 6186 + }, + { + "epoch": 1.877978448930035, + "grad_norm": 0.5947254300117493, + "learning_rate": 6.24726609963548e-05, + "loss": 1.4273, + "step": 6187 + }, + { + "epoch": 1.8782819851267263, + "grad_norm": 0.6103019714355469, + "learning_rate": 6.246658566221143e-05, + "loss": 1.9379, + "step": 6188 + }, + { + "epoch": 1.878585521323418, + "grad_norm": 0.6417216062545776, + "learning_rate": 6.246051032806805e-05, + "loss": 1.4602, + "step": 6189 + }, + { + "epoch": 1.8788890575201092, + "grad_norm": 0.5326799154281616, + "learning_rate": 6.245443499392466e-05, + "loss": 1.6638, + "step": 6190 + }, + { + "epoch": 1.8791925937168008, + "grad_norm": 0.6213211417198181, + "learning_rate": 6.244835965978128e-05, + "loss": 1.5516, + "step": 6191 + }, + { + "epoch": 1.879496129913492, + "grad_norm": 0.48354288935661316, + "learning_rate": 6.244228432563791e-05, + "loss": 1.3408, + "step": 6192 + }, + { + "epoch": 1.8797996661101837, + "grad_norm": 0.43827447295188904, + "learning_rate": 6.243620899149454e-05, + "loss": 1.7035, + "step": 6193 + }, + { + "epoch": 1.880103202306875, + "grad_norm": 0.5599086880683899, + "learning_rate": 6.243013365735115e-05, + "loss": 1.7855, + "step": 6194 + }, + { + "epoch": 1.8804067385035665, + "grad_norm": 0.6431657671928406, + "learning_rate": 6.242405832320778e-05, + "loss": 1.4125, + "step": 6195 + }, + { + "epoch": 1.880710274700258, + "grad_norm": 0.5782099366188049, + "learning_rate": 6.241798298906441e-05, + "loss": 1.9184, + "step": 6196 + }, + { + "epoch": 1.8810138108969494, + "grad_norm": 0.46235474944114685, + "learning_rate": 6.241190765492103e-05, + "loss": 1.7373, + "step": 6197 + }, + { + "epoch": 1.881317347093641, + "grad_norm": 0.5792590975761414, + "learning_rate": 6.240583232077764e-05, + "loss": 1.6566, + "step": 6198 + }, + { + "epoch": 1.8816208832903323, + "grad_norm": 0.48585742712020874, + "learning_rate": 6.239975698663427e-05, + "loss": 1.5372, + "step": 6199 + }, + { + "epoch": 1.881924419487024, + "grad_norm": 0.595456063747406, + "learning_rate": 6.239368165249089e-05, + "loss": 1.7805, + "step": 6200 + }, + { + "epoch": 1.8822279556837151, + "grad_norm": 0.5959881544113159, + "learning_rate": 6.238760631834751e-05, + "loss": 1.4049, + "step": 6201 + }, + { + "epoch": 1.8825314918804068, + "grad_norm": 0.6109514236450195, + "learning_rate": 6.238153098420414e-05, + "loss": 1.9479, + "step": 6202 + }, + { + "epoch": 1.8828350280770982, + "grad_norm": 0.4679960608482361, + "learning_rate": 6.237545565006076e-05, + "loss": 1.801, + "step": 6203 + }, + { + "epoch": 1.8831385642737897, + "grad_norm": 0.5735065937042236, + "learning_rate": 6.236938031591738e-05, + "loss": 1.6866, + "step": 6204 + }, + { + "epoch": 1.883442100470481, + "grad_norm": 0.8023161292076111, + "learning_rate": 6.236330498177399e-05, + "loss": 1.833, + "step": 6205 + }, + { + "epoch": 1.8837456366671725, + "grad_norm": 0.5725283026695251, + "learning_rate": 6.235722964763062e-05, + "loss": 1.3659, + "step": 6206 + }, + { + "epoch": 1.884049172863864, + "grad_norm": 0.6874731779098511, + "learning_rate": 6.235115431348725e-05, + "loss": 1.8193, + "step": 6207 + }, + { + "epoch": 1.8843527090605554, + "grad_norm": 0.5876416563987732, + "learning_rate": 6.234507897934386e-05, + "loss": 2.0175, + "step": 6208 + }, + { + "epoch": 1.884656245257247, + "grad_norm": 0.5721455812454224, + "learning_rate": 6.233900364520049e-05, + "loss": 1.4163, + "step": 6209 + }, + { + "epoch": 1.8849597814539383, + "grad_norm": 0.44347506761550903, + "learning_rate": 6.233292831105712e-05, + "loss": 1.3832, + "step": 6210 + }, + { + "epoch": 1.88526331765063, + "grad_norm": 0.5441204905509949, + "learning_rate": 6.232685297691372e-05, + "loss": 1.7932, + "step": 6211 + }, + { + "epoch": 1.8855668538473211, + "grad_norm": 0.49128007888793945, + "learning_rate": 6.232077764277035e-05, + "loss": 1.5375, + "step": 6212 + }, + { + "epoch": 1.8858703900440128, + "grad_norm": 0.42426589131355286, + "learning_rate": 6.231470230862698e-05, + "loss": 1.5113, + "step": 6213 + }, + { + "epoch": 1.8861739262407042, + "grad_norm": 0.6463568806648254, + "learning_rate": 6.23086269744836e-05, + "loss": 1.7184, + "step": 6214 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.5162055492401123, + "learning_rate": 6.230255164034022e-05, + "loss": 1.3341, + "step": 6215 + }, + { + "epoch": 1.886780998634087, + "grad_norm": 0.5229998826980591, + "learning_rate": 6.229647630619685e-05, + "loss": 1.8356, + "step": 6216 + }, + { + "epoch": 1.8870845348307785, + "grad_norm": 0.5528349876403809, + "learning_rate": 6.229040097205347e-05, + "loss": 1.8502, + "step": 6217 + }, + { + "epoch": 1.88738807102747, + "grad_norm": 0.6068422794342041, + "learning_rate": 6.228432563791009e-05, + "loss": 1.7407, + "step": 6218 + }, + { + "epoch": 1.8876916072241614, + "grad_norm": 0.5407156348228455, + "learning_rate": 6.22782503037667e-05, + "loss": 1.4199, + "step": 6219 + }, + { + "epoch": 1.887995143420853, + "grad_norm": 0.8403943777084351, + "learning_rate": 6.227217496962333e-05, + "loss": 1.5019, + "step": 6220 + }, + { + "epoch": 1.8882986796175443, + "grad_norm": 0.5002139210700989, + "learning_rate": 6.226609963547996e-05, + "loss": 1.7308, + "step": 6221 + }, + { + "epoch": 1.888602215814236, + "grad_norm": 0.5648574829101562, + "learning_rate": 6.226002430133657e-05, + "loss": 1.6151, + "step": 6222 + }, + { + "epoch": 1.8889057520109271, + "grad_norm": 0.59864342212677, + "learning_rate": 6.22539489671932e-05, + "loss": 1.6052, + "step": 6223 + }, + { + "epoch": 1.8892092882076188, + "grad_norm": 0.531084418296814, + "learning_rate": 6.224787363304983e-05, + "loss": 1.6574, + "step": 6224 + }, + { + "epoch": 1.8895128244043102, + "grad_norm": 0.554513692855835, + "learning_rate": 6.224179829890643e-05, + "loss": 1.4749, + "step": 6225 + }, + { + "epoch": 1.8898163606010017, + "grad_norm": 0.5515928864479065, + "learning_rate": 6.223572296476306e-05, + "loss": 1.8412, + "step": 6226 + }, + { + "epoch": 1.890119896797693, + "grad_norm": 0.4584737718105316, + "learning_rate": 6.22296476306197e-05, + "loss": 1.8693, + "step": 6227 + }, + { + "epoch": 1.8904234329943845, + "grad_norm": 0.5091758370399475, + "learning_rate": 6.222357229647631e-05, + "loss": 1.9317, + "step": 6228 + }, + { + "epoch": 1.8907269691910762, + "grad_norm": 0.5648382902145386, + "learning_rate": 6.221749696233293e-05, + "loss": 1.662, + "step": 6229 + }, + { + "epoch": 1.8910305053877674, + "grad_norm": 0.5485778450965881, + "learning_rate": 6.221142162818955e-05, + "loss": 1.6501, + "step": 6230 + }, + { + "epoch": 1.891334041584459, + "grad_norm": 0.5410951375961304, + "learning_rate": 6.220534629404618e-05, + "loss": 1.7698, + "step": 6231 + }, + { + "epoch": 1.8916375777811503, + "grad_norm": 0.6109384298324585, + "learning_rate": 6.21992709599028e-05, + "loss": 1.6953, + "step": 6232 + }, + { + "epoch": 1.891941113977842, + "grad_norm": 0.5590034127235413, + "learning_rate": 6.219319562575941e-05, + "loss": 1.4273, + "step": 6233 + }, + { + "epoch": 1.8922446501745334, + "grad_norm": 0.5122209191322327, + "learning_rate": 6.218712029161604e-05, + "loss": 1.7359, + "step": 6234 + }, + { + "epoch": 1.8925481863712248, + "grad_norm": 0.6979067325592041, + "learning_rate": 6.218104495747266e-05, + "loss": 1.4347, + "step": 6235 + }, + { + "epoch": 1.8928517225679162, + "grad_norm": 0.6123654246330261, + "learning_rate": 6.217496962332928e-05, + "loss": 1.6434, + "step": 6236 + }, + { + "epoch": 1.8931552587646077, + "grad_norm": 0.5874941945075989, + "learning_rate": 6.216889428918591e-05, + "loss": 1.7587, + "step": 6237 + }, + { + "epoch": 1.893458794961299, + "grad_norm": 0.6103301048278809, + "learning_rate": 6.216281895504254e-05, + "loss": 1.725, + "step": 6238 + }, + { + "epoch": 1.8937623311579905, + "grad_norm": 0.5363330841064453, + "learning_rate": 6.215674362089914e-05, + "loss": 1.7498, + "step": 6239 + }, + { + "epoch": 1.8940658673546822, + "grad_norm": 0.5891059041023254, + "learning_rate": 6.215066828675577e-05, + "loss": 1.8509, + "step": 6240 + }, + { + "epoch": 1.8943694035513734, + "grad_norm": 0.694438099861145, + "learning_rate": 6.21445929526124e-05, + "loss": 1.7223, + "step": 6241 + }, + { + "epoch": 1.894672939748065, + "grad_norm": 0.49839189648628235, + "learning_rate": 6.213851761846902e-05, + "loss": 1.8137, + "step": 6242 + }, + { + "epoch": 1.8949764759447563, + "grad_norm": 0.4575786590576172, + "learning_rate": 6.213244228432564e-05, + "loss": 1.8167, + "step": 6243 + }, + { + "epoch": 1.895280012141448, + "grad_norm": 0.9389039278030396, + "learning_rate": 6.212636695018226e-05, + "loss": 1.7694, + "step": 6244 + }, + { + "epoch": 1.8955835483381394, + "grad_norm": 0.5133308172225952, + "learning_rate": 6.212029161603889e-05, + "loss": 1.5743, + "step": 6245 + }, + { + "epoch": 1.8958870845348308, + "grad_norm": 0.5339624881744385, + "learning_rate": 6.21142162818955e-05, + "loss": 1.4944, + "step": 6246 + }, + { + "epoch": 1.8961906207315222, + "grad_norm": 0.5333174467086792, + "learning_rate": 6.210814094775212e-05, + "loss": 1.7742, + "step": 6247 + }, + { + "epoch": 1.8964941569282137, + "grad_norm": 0.45883241295814514, + "learning_rate": 6.210206561360875e-05, + "loss": 1.2235, + "step": 6248 + }, + { + "epoch": 1.896797693124905, + "grad_norm": 0.5622841715812683, + "learning_rate": 6.209599027946537e-05, + "loss": 1.7732, + "step": 6249 + }, + { + "epoch": 1.8971012293215965, + "grad_norm": 0.5983763337135315, + "learning_rate": 6.208991494532199e-05, + "loss": 1.7278, + "step": 6250 + }, + { + "epoch": 1.8974047655182882, + "grad_norm": 0.47994494438171387, + "learning_rate": 6.208383961117862e-05, + "loss": 1.6245, + "step": 6251 + }, + { + "epoch": 1.8977083017149794, + "grad_norm": 0.4889879822731018, + "learning_rate": 6.207776427703525e-05, + "loss": 1.7647, + "step": 6252 + }, + { + "epoch": 1.898011837911671, + "grad_norm": 0.40788859128952026, + "learning_rate": 6.207168894289185e-05, + "loss": 1.2151, + "step": 6253 + }, + { + "epoch": 1.8983153741083623, + "grad_norm": 0.5352895855903625, + "learning_rate": 6.206561360874848e-05, + "loss": 1.3245, + "step": 6254 + }, + { + "epoch": 1.898618910305054, + "grad_norm": 0.42484623193740845, + "learning_rate": 6.205953827460512e-05, + "loss": 2.3155, + "step": 6255 + }, + { + "epoch": 1.8989224465017454, + "grad_norm": 0.38781338930130005, + "learning_rate": 6.205346294046173e-05, + "loss": 1.1976, + "step": 6256 + }, + { + "epoch": 1.8992259826984368, + "grad_norm": 0.4916774034500122, + "learning_rate": 6.204738760631835e-05, + "loss": 1.407, + "step": 6257 + }, + { + "epoch": 1.8995295188951282, + "grad_norm": 0.5175955891609192, + "learning_rate": 6.204131227217497e-05, + "loss": 1.8932, + "step": 6258 + }, + { + "epoch": 1.8998330550918197, + "grad_norm": 0.5470436811447144, + "learning_rate": 6.20352369380316e-05, + "loss": 1.671, + "step": 6259 + }, + { + "epoch": 1.9001365912885113, + "grad_norm": 0.44192200899124146, + "learning_rate": 6.202916160388822e-05, + "loss": 1.3732, + "step": 6260 + }, + { + "epoch": 1.9004401274852025, + "grad_norm": 0.46629634499549866, + "learning_rate": 6.202308626974483e-05, + "loss": 1.8605, + "step": 6261 + }, + { + "epoch": 1.9007436636818942, + "grad_norm": 0.5211533904075623, + "learning_rate": 6.201701093560146e-05, + "loss": 1.6723, + "step": 6262 + }, + { + "epoch": 1.9010471998785854, + "grad_norm": 0.5735886096954346, + "learning_rate": 6.201093560145808e-05, + "loss": 1.5299, + "step": 6263 + }, + { + "epoch": 1.901350736075277, + "grad_norm": 0.5859761238098145, + "learning_rate": 6.20048602673147e-05, + "loss": 1.5531, + "step": 6264 + }, + { + "epoch": 1.9016542722719685, + "grad_norm": 0.4476751685142517, + "learning_rate": 6.199878493317133e-05, + "loss": 1.6964, + "step": 6265 + }, + { + "epoch": 1.90195780846866, + "grad_norm": 0.7125301957130432, + "learning_rate": 6.199270959902796e-05, + "loss": 1.3845, + "step": 6266 + }, + { + "epoch": 1.9022613446653514, + "grad_norm": 0.6609548330307007, + "learning_rate": 6.198663426488456e-05, + "loss": 2.045, + "step": 6267 + }, + { + "epoch": 1.9025648808620428, + "grad_norm": 0.4831355810165405, + "learning_rate": 6.19805589307412e-05, + "loss": 1.2296, + "step": 6268 + }, + { + "epoch": 1.9028684170587342, + "grad_norm": 0.501849353313446, + "learning_rate": 6.197448359659783e-05, + "loss": 1.7507, + "step": 6269 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.6798990964889526, + "learning_rate": 6.196840826245444e-05, + "loss": 1.8987, + "step": 6270 + }, + { + "epoch": 1.9034754894521173, + "grad_norm": 0.5525609850883484, + "learning_rate": 6.196233292831106e-05, + "loss": 1.6601, + "step": 6271 + }, + { + "epoch": 1.9037790256488085, + "grad_norm": 0.533176839351654, + "learning_rate": 6.195625759416768e-05, + "loss": 1.7864, + "step": 6272 + }, + { + "epoch": 1.9040825618455002, + "grad_norm": 0.49768903851509094, + "learning_rate": 6.195018226002431e-05, + "loss": 1.1226, + "step": 6273 + }, + { + "epoch": 1.9043860980421914, + "grad_norm": 0.5778190493583679, + "learning_rate": 6.194410692588093e-05, + "loss": 1.782, + "step": 6274 + }, + { + "epoch": 1.904689634238883, + "grad_norm": 0.5707374811172485, + "learning_rate": 6.193803159173754e-05, + "loss": 1.4747, + "step": 6275 + }, + { + "epoch": 1.9049931704355745, + "grad_norm": 0.5454164147377014, + "learning_rate": 6.193195625759417e-05, + "loss": 1.2567, + "step": 6276 + }, + { + "epoch": 1.905296706632266, + "grad_norm": 0.5996286869049072, + "learning_rate": 6.192588092345079e-05, + "loss": 1.8435, + "step": 6277 + }, + { + "epoch": 1.9056002428289573, + "grad_norm": 0.5817993879318237, + "learning_rate": 6.191980558930741e-05, + "loss": 1.6641, + "step": 6278 + }, + { + "epoch": 1.9059037790256488, + "grad_norm": 0.5574187636375427, + "learning_rate": 6.191373025516404e-05, + "loss": 1.6168, + "step": 6279 + }, + { + "epoch": 1.9062073152223402, + "grad_norm": 0.7683195471763611, + "learning_rate": 6.190765492102067e-05, + "loss": 1.8639, + "step": 6280 + }, + { + "epoch": 1.9065108514190316, + "grad_norm": 0.42818158864974976, + "learning_rate": 6.190157958687727e-05, + "loss": 1.2528, + "step": 6281 + }, + { + "epoch": 1.9068143876157233, + "grad_norm": 0.5653691291809082, + "learning_rate": 6.18955042527339e-05, + "loss": 1.7523, + "step": 6282 + }, + { + "epoch": 1.9071179238124145, + "grad_norm": 0.5621758699417114, + "learning_rate": 6.188942891859054e-05, + "loss": 1.825, + "step": 6283 + }, + { + "epoch": 1.9074214600091062, + "grad_norm": 0.6503968238830566, + "learning_rate": 6.188335358444714e-05, + "loss": 1.6188, + "step": 6284 + }, + { + "epoch": 1.9077249962057974, + "grad_norm": 0.5360357761383057, + "learning_rate": 6.187727825030377e-05, + "loss": 1.9039, + "step": 6285 + }, + { + "epoch": 1.908028532402489, + "grad_norm": 0.4974033534526825, + "learning_rate": 6.187120291616039e-05, + "loss": 1.8735, + "step": 6286 + }, + { + "epoch": 1.9083320685991805, + "grad_norm": 0.3803102970123291, + "learning_rate": 6.186512758201702e-05, + "loss": 1.6851, + "step": 6287 + }, + { + "epoch": 1.908635604795872, + "grad_norm": 0.5085203051567078, + "learning_rate": 6.185905224787364e-05, + "loss": 1.7257, + "step": 6288 + }, + { + "epoch": 1.9089391409925633, + "grad_norm": 0.49796509742736816, + "learning_rate": 6.185297691373025e-05, + "loss": 1.5958, + "step": 6289 + }, + { + "epoch": 1.9092426771892548, + "grad_norm": 0.5202645063400269, + "learning_rate": 6.184690157958688e-05, + "loss": 1.4409, + "step": 6290 + }, + { + "epoch": 1.9095462133859464, + "grad_norm": 0.5298627018928528, + "learning_rate": 6.18408262454435e-05, + "loss": 1.7299, + "step": 6291 + }, + { + "epoch": 1.9098497495826376, + "grad_norm": 0.5481365919113159, + "learning_rate": 6.183475091130012e-05, + "loss": 1.5826, + "step": 6292 + }, + { + "epoch": 1.9101532857793293, + "grad_norm": 0.5484110116958618, + "learning_rate": 6.182867557715675e-05, + "loss": 1.2558, + "step": 6293 + }, + { + "epoch": 1.9104568219760205, + "grad_norm": 0.5717966556549072, + "learning_rate": 6.182260024301338e-05, + "loss": 1.7264, + "step": 6294 + }, + { + "epoch": 1.9107603581727122, + "grad_norm": 0.5692905187606812, + "learning_rate": 6.181652490886998e-05, + "loss": 1.6922, + "step": 6295 + }, + { + "epoch": 1.9110638943694036, + "grad_norm": 0.45173266530036926, + "learning_rate": 6.181044957472661e-05, + "loss": 1.0979, + "step": 6296 + }, + { + "epoch": 1.911367430566095, + "grad_norm": 0.5674440860748291, + "learning_rate": 6.180437424058325e-05, + "loss": 1.7522, + "step": 6297 + }, + { + "epoch": 1.9116709667627865, + "grad_norm": 0.634099543094635, + "learning_rate": 6.179829890643985e-05, + "loss": 1.5519, + "step": 6298 + }, + { + "epoch": 1.911974502959478, + "grad_norm": 0.6131418943405151, + "learning_rate": 6.179222357229648e-05, + "loss": 1.5224, + "step": 6299 + }, + { + "epoch": 1.9122780391561693, + "grad_norm": 0.4736331105232239, + "learning_rate": 6.17861482381531e-05, + "loss": 1.2786, + "step": 6300 + }, + { + "epoch": 1.9125815753528608, + "grad_norm": 0.827942967414856, + "learning_rate": 6.178007290400973e-05, + "loss": 1.7119, + "step": 6301 + }, + { + "epoch": 1.9128851115495524, + "grad_norm": 0.572033166885376, + "learning_rate": 6.177399756986635e-05, + "loss": 1.6908, + "step": 6302 + }, + { + "epoch": 1.9131886477462436, + "grad_norm": 0.6228052377700806, + "learning_rate": 6.176792223572296e-05, + "loss": 1.7296, + "step": 6303 + }, + { + "epoch": 1.9134921839429353, + "grad_norm": 0.48696666955947876, + "learning_rate": 6.17618469015796e-05, + "loss": 1.5238, + "step": 6304 + }, + { + "epoch": 1.9137957201396265, + "grad_norm": 0.5116902589797974, + "learning_rate": 6.175577156743621e-05, + "loss": 1.8871, + "step": 6305 + }, + { + "epoch": 1.9140992563363182, + "grad_norm": 0.5537254810333252, + "learning_rate": 6.174969623329283e-05, + "loss": 1.712, + "step": 6306 + }, + { + "epoch": 1.9144027925330096, + "grad_norm": 0.43108007311820984, + "learning_rate": 6.174362089914946e-05, + "loss": 1.5564, + "step": 6307 + }, + { + "epoch": 1.914706328729701, + "grad_norm": 0.4873630404472351, + "learning_rate": 6.173754556500608e-05, + "loss": 1.8391, + "step": 6308 + }, + { + "epoch": 1.9150098649263925, + "grad_norm": 0.4794020354747772, + "learning_rate": 6.17314702308627e-05, + "loss": 1.9983, + "step": 6309 + }, + { + "epoch": 1.915313401123084, + "grad_norm": 0.4727613031864166, + "learning_rate": 6.172539489671932e-05, + "loss": 1.8201, + "step": 6310 + }, + { + "epoch": 1.9156169373197753, + "grad_norm": 0.5152159929275513, + "learning_rate": 6.171931956257596e-05, + "loss": 1.2213, + "step": 6311 + }, + { + "epoch": 1.9159204735164668, + "grad_norm": 0.5150118470191956, + "learning_rate": 6.171324422843256e-05, + "loss": 1.7555, + "step": 6312 + }, + { + "epoch": 1.9162240097131584, + "grad_norm": 0.5367141366004944, + "learning_rate": 6.170716889428919e-05, + "loss": 1.6911, + "step": 6313 + }, + { + "epoch": 1.9165275459098496, + "grad_norm": 0.6431503891944885, + "learning_rate": 6.170109356014581e-05, + "loss": 1.6756, + "step": 6314 + }, + { + "epoch": 1.9168310821065413, + "grad_norm": 1.273424506187439, + "learning_rate": 6.169501822600244e-05, + "loss": 1.3612, + "step": 6315 + }, + { + "epoch": 1.9171346183032325, + "grad_norm": 0.5230395793914795, + "learning_rate": 6.168894289185906e-05, + "loss": 1.7962, + "step": 6316 + }, + { + "epoch": 1.9174381544999242, + "grad_norm": 0.5366686582565308, + "learning_rate": 6.168286755771567e-05, + "loss": 1.8589, + "step": 6317 + }, + { + "epoch": 1.9177416906966156, + "grad_norm": 0.5949418544769287, + "learning_rate": 6.16767922235723e-05, + "loss": 1.7689, + "step": 6318 + }, + { + "epoch": 1.918045226893307, + "grad_norm": 0.800364077091217, + "learning_rate": 6.167071688942892e-05, + "loss": 1.7323, + "step": 6319 + }, + { + "epoch": 1.9183487630899985, + "grad_norm": 0.5248730778694153, + "learning_rate": 6.166464155528554e-05, + "loss": 1.6069, + "step": 6320 + }, + { + "epoch": 1.91865229928669, + "grad_norm": 0.5831948518753052, + "learning_rate": 6.165856622114217e-05, + "loss": 1.6719, + "step": 6321 + }, + { + "epoch": 1.9189558354833816, + "grad_norm": 0.6087794303894043, + "learning_rate": 6.165249088699879e-05, + "loss": 1.3223, + "step": 6322 + }, + { + "epoch": 1.9192593716800728, + "grad_norm": 0.5439948439598083, + "learning_rate": 6.16464155528554e-05, + "loss": 1.7126, + "step": 6323 + }, + { + "epoch": 1.9195629078767644, + "grad_norm": 0.7024933099746704, + "learning_rate": 6.164034021871203e-05, + "loss": 1.324, + "step": 6324 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.5878713130950928, + "learning_rate": 6.163426488456865e-05, + "loss": 1.7572, + "step": 6325 + }, + { + "epoch": 1.9201699802701473, + "grad_norm": 0.3653714656829834, + "learning_rate": 6.162818955042527e-05, + "loss": 1.2048, + "step": 6326 + }, + { + "epoch": 1.9204735164668387, + "grad_norm": 0.5881572365760803, + "learning_rate": 6.16221142162819e-05, + "loss": 1.4599, + "step": 6327 + }, + { + "epoch": 1.9207770526635302, + "grad_norm": 0.5380381941795349, + "learning_rate": 6.161603888213852e-05, + "loss": 1.618, + "step": 6328 + }, + { + "epoch": 1.9210805888602216, + "grad_norm": 0.6120547652244568, + "learning_rate": 6.160996354799515e-05, + "loss": 1.338, + "step": 6329 + }, + { + "epoch": 1.921384125056913, + "grad_norm": 0.6269816160202026, + "learning_rate": 6.160388821385177e-05, + "loss": 1.8559, + "step": 6330 + }, + { + "epoch": 1.9216876612536045, + "grad_norm": 0.6310059428215027, + "learning_rate": 6.159781287970838e-05, + "loss": 1.642, + "step": 6331 + }, + { + "epoch": 1.921991197450296, + "grad_norm": 0.6771363615989685, + "learning_rate": 6.159173754556501e-05, + "loss": 1.4403, + "step": 6332 + }, + { + "epoch": 1.9222947336469876, + "grad_norm": 1.0170625448226929, + "learning_rate": 6.158566221142163e-05, + "loss": 1.5708, + "step": 6333 + }, + { + "epoch": 1.9225982698436788, + "grad_norm": 0.7576006054878235, + "learning_rate": 6.157958687727825e-05, + "loss": 2.0975, + "step": 6334 + }, + { + "epoch": 1.9229018060403704, + "grad_norm": 0.5351456999778748, + "learning_rate": 6.157351154313488e-05, + "loss": 1.9641, + "step": 6335 + }, + { + "epoch": 1.9232053422370616, + "grad_norm": 0.4267328679561615, + "learning_rate": 6.15674362089915e-05, + "loss": 1.2032, + "step": 6336 + }, + { + "epoch": 1.9235088784337533, + "grad_norm": 0.5409083366394043, + "learning_rate": 6.156136087484811e-05, + "loss": 1.8177, + "step": 6337 + }, + { + "epoch": 1.9238124146304447, + "grad_norm": 0.5315467119216919, + "learning_rate": 6.155528554070474e-05, + "loss": 1.8095, + "step": 6338 + }, + { + "epoch": 1.9241159508271362, + "grad_norm": 0.5460966229438782, + "learning_rate": 6.154921020656136e-05, + "loss": 1.4244, + "step": 6339 + }, + { + "epoch": 1.9244194870238276, + "grad_norm": 0.4572533369064331, + "learning_rate": 6.154313487241798e-05, + "loss": 1.5232, + "step": 6340 + }, + { + "epoch": 1.924723023220519, + "grad_norm": 0.6189396381378174, + "learning_rate": 6.153705953827461e-05, + "loss": 1.6011, + "step": 6341 + }, + { + "epoch": 1.9250265594172105, + "grad_norm": 0.513096272945404, + "learning_rate": 6.153098420413123e-05, + "loss": 1.4146, + "step": 6342 + }, + { + "epoch": 1.925330095613902, + "grad_norm": 0.5706098675727844, + "learning_rate": 6.152490886998786e-05, + "loss": 1.2415, + "step": 6343 + }, + { + "epoch": 1.9256336318105935, + "grad_norm": 0.6112621426582336, + "learning_rate": 6.151883353584448e-05, + "loss": 1.3726, + "step": 6344 + }, + { + "epoch": 1.9259371680072848, + "grad_norm": 0.5262756943702698, + "learning_rate": 6.151275820170109e-05, + "loss": 1.9781, + "step": 6345 + }, + { + "epoch": 1.9262407042039764, + "grad_norm": 0.5784090161323547, + "learning_rate": 6.150668286755772e-05, + "loss": 1.714, + "step": 6346 + }, + { + "epoch": 1.9265442404006676, + "grad_norm": 0.6644015908241272, + "learning_rate": 6.150060753341434e-05, + "loss": 1.6659, + "step": 6347 + }, + { + "epoch": 1.9268477765973593, + "grad_norm": 0.5297151207923889, + "learning_rate": 6.149453219927096e-05, + "loss": 1.5757, + "step": 6348 + }, + { + "epoch": 1.9271513127940507, + "grad_norm": 0.6419957876205444, + "learning_rate": 6.148845686512759e-05, + "loss": 1.7383, + "step": 6349 + }, + { + "epoch": 1.9274548489907422, + "grad_norm": 0.5073803663253784, + "learning_rate": 6.14823815309842e-05, + "loss": 1.9598, + "step": 6350 + }, + { + "epoch": 1.9277583851874336, + "grad_norm": 0.6124430298805237, + "learning_rate": 6.147630619684082e-05, + "loss": 1.6829, + "step": 6351 + }, + { + "epoch": 1.928061921384125, + "grad_norm": 0.5520927309989929, + "learning_rate": 6.147023086269745e-05, + "loss": 1.7079, + "step": 6352 + }, + { + "epoch": 1.9283654575808167, + "grad_norm": 0.5771626830101013, + "learning_rate": 6.146415552855407e-05, + "loss": 1.3217, + "step": 6353 + }, + { + "epoch": 1.928668993777508, + "grad_norm": 0.6157678961753845, + "learning_rate": 6.145808019441069e-05, + "loss": 1.6344, + "step": 6354 + }, + { + "epoch": 1.9289725299741995, + "grad_norm": 0.5151667594909668, + "learning_rate": 6.145200486026732e-05, + "loss": 1.8214, + "step": 6355 + }, + { + "epoch": 1.9292760661708908, + "grad_norm": 0.48279091715812683, + "learning_rate": 6.144592952612394e-05, + "loss": 1.6539, + "step": 6356 + }, + { + "epoch": 1.9295796023675824, + "grad_norm": 0.8940995335578918, + "learning_rate": 6.143985419198055e-05, + "loss": 1.7098, + "step": 6357 + }, + { + "epoch": 1.9298831385642736, + "grad_norm": 0.510097861289978, + "learning_rate": 6.143377885783719e-05, + "loss": 1.2187, + "step": 6358 + }, + { + "epoch": 1.9301866747609653, + "grad_norm": 0.6991041898727417, + "learning_rate": 6.14277035236938e-05, + "loss": 1.6297, + "step": 6359 + }, + { + "epoch": 1.9304902109576567, + "grad_norm": 0.7418796420097351, + "learning_rate": 6.142162818955043e-05, + "loss": 1.8175, + "step": 6360 + }, + { + "epoch": 1.9307937471543482, + "grad_norm": 0.46536850929260254, + "learning_rate": 6.141555285540705e-05, + "loss": 1.3162, + "step": 6361 + }, + { + "epoch": 1.9310972833510396, + "grad_norm": 0.5243115425109863, + "learning_rate": 6.140947752126367e-05, + "loss": 2.014, + "step": 6362 + }, + { + "epoch": 1.931400819547731, + "grad_norm": 0.6091117262840271, + "learning_rate": 6.14034021871203e-05, + "loss": 1.3685, + "step": 6363 + }, + { + "epoch": 1.9317043557444227, + "grad_norm": 0.5803942680358887, + "learning_rate": 6.139732685297692e-05, + "loss": 1.3536, + "step": 6364 + }, + { + "epoch": 1.9320078919411139, + "grad_norm": 0.5577396750450134, + "learning_rate": 6.139125151883353e-05, + "loss": 2.0119, + "step": 6365 + }, + { + "epoch": 1.9323114281378055, + "grad_norm": 0.6743834018707275, + "learning_rate": 6.138517618469016e-05, + "loss": 1.8706, + "step": 6366 + }, + { + "epoch": 1.9326149643344968, + "grad_norm": 0.5312842130661011, + "learning_rate": 6.137910085054678e-05, + "loss": 1.9281, + "step": 6367 + }, + { + "epoch": 1.9329185005311884, + "grad_norm": 0.569107711315155, + "learning_rate": 6.13730255164034e-05, + "loss": 1.532, + "step": 6368 + }, + { + "epoch": 1.9332220367278798, + "grad_norm": 0.4576084315776825, + "learning_rate": 6.136695018226003e-05, + "loss": 1.1511, + "step": 6369 + }, + { + "epoch": 1.9335255729245713, + "grad_norm": 0.5539278984069824, + "learning_rate": 6.136087484811665e-05, + "loss": 1.4286, + "step": 6370 + }, + { + "epoch": 1.9338291091212627, + "grad_norm": 0.544119119644165, + "learning_rate": 6.135479951397326e-05, + "loss": 1.2395, + "step": 6371 + }, + { + "epoch": 1.9341326453179541, + "grad_norm": 0.4071330428123474, + "learning_rate": 6.13487241798299e-05, + "loss": 1.6382, + "step": 6372 + }, + { + "epoch": 1.9344361815146456, + "grad_norm": 0.5704235434532166, + "learning_rate": 6.134264884568651e-05, + "loss": 1.4867, + "step": 6373 + }, + { + "epoch": 1.934739717711337, + "grad_norm": 0.5418073534965515, + "learning_rate": 6.133657351154314e-05, + "loss": 1.6909, + "step": 6374 + }, + { + "epoch": 1.9350432539080287, + "grad_norm": 0.579681932926178, + "learning_rate": 6.133049817739976e-05, + "loss": 1.5281, + "step": 6375 + }, + { + "epoch": 1.9353467901047199, + "grad_norm": 0.6796471476554871, + "learning_rate": 6.132442284325638e-05, + "loss": 1.2198, + "step": 6376 + }, + { + "epoch": 1.9356503263014115, + "grad_norm": 0.535188615322113, + "learning_rate": 6.131834750911301e-05, + "loss": 1.4344, + "step": 6377 + }, + { + "epoch": 1.9359538624981028, + "grad_norm": 0.5495712757110596, + "learning_rate": 6.131227217496963e-05, + "loss": 1.3002, + "step": 6378 + }, + { + "epoch": 1.9362573986947944, + "grad_norm": 0.5547313690185547, + "learning_rate": 6.130619684082624e-05, + "loss": 1.9909, + "step": 6379 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.5010794997215271, + "learning_rate": 6.130012150668287e-05, + "loss": 1.778, + "step": 6380 + }, + { + "epoch": 1.9368644710881773, + "grad_norm": 0.47357454895973206, + "learning_rate": 6.129404617253949e-05, + "loss": 2.0388, + "step": 6381 + }, + { + "epoch": 1.9371680072848687, + "grad_norm": 0.6383755207061768, + "learning_rate": 6.128797083839611e-05, + "loss": 1.2668, + "step": 6382 + }, + { + "epoch": 1.9374715434815601, + "grad_norm": 0.49293264746665955, + "learning_rate": 6.128189550425274e-05, + "loss": 1.759, + "step": 6383 + }, + { + "epoch": 1.9377750796782516, + "grad_norm": 0.5476403832435608, + "learning_rate": 6.127582017010936e-05, + "loss": 1.63, + "step": 6384 + }, + { + "epoch": 1.938078615874943, + "grad_norm": 0.5941591858863831, + "learning_rate": 6.126974483596597e-05, + "loss": 1.8002, + "step": 6385 + }, + { + "epoch": 1.9383821520716347, + "grad_norm": 0.5813649892807007, + "learning_rate": 6.12636695018226e-05, + "loss": 1.5548, + "step": 6386 + }, + { + "epoch": 1.9386856882683259, + "grad_norm": 0.5048520565032959, + "learning_rate": 6.125759416767922e-05, + "loss": 1.8952, + "step": 6387 + }, + { + "epoch": 1.9389892244650175, + "grad_norm": 0.5147222876548767, + "learning_rate": 6.125151883353585e-05, + "loss": 1.7712, + "step": 6388 + }, + { + "epoch": 1.9392927606617087, + "grad_norm": 0.5488929748535156, + "learning_rate": 6.124544349939247e-05, + "loss": 1.5482, + "step": 6389 + }, + { + "epoch": 1.9395962968584004, + "grad_norm": 0.5807020664215088, + "learning_rate": 6.123936816524909e-05, + "loss": 1.8429, + "step": 6390 + }, + { + "epoch": 1.9398998330550918, + "grad_norm": 0.4687190651893616, + "learning_rate": 6.123329283110572e-05, + "loss": 1.4001, + "step": 6391 + }, + { + "epoch": 1.9402033692517833, + "grad_norm": 0.5817119479179382, + "learning_rate": 6.122721749696234e-05, + "loss": 1.6638, + "step": 6392 + }, + { + "epoch": 1.9405069054484747, + "grad_norm": 0.9106517434120178, + "learning_rate": 6.122114216281895e-05, + "loss": 0.6738, + "step": 6393 + }, + { + "epoch": 1.9408104416451661, + "grad_norm": 0.5468938946723938, + "learning_rate": 6.121506682867558e-05, + "loss": 1.8881, + "step": 6394 + }, + { + "epoch": 1.9411139778418578, + "grad_norm": 0.6180863380432129, + "learning_rate": 6.12089914945322e-05, + "loss": 1.8285, + "step": 6395 + }, + { + "epoch": 1.941417514038549, + "grad_norm": 0.5428357124328613, + "learning_rate": 6.120291616038882e-05, + "loss": 1.613, + "step": 6396 + }, + { + "epoch": 1.9417210502352407, + "grad_norm": 0.5250274538993835, + "learning_rate": 6.119684082624545e-05, + "loss": 1.3086, + "step": 6397 + }, + { + "epoch": 1.9420245864319319, + "grad_norm": 0.5188660621643066, + "learning_rate": 6.119076549210207e-05, + "loss": 1.8626, + "step": 6398 + }, + { + "epoch": 1.9423281226286235, + "grad_norm": 0.5318378806114197, + "learning_rate": 6.118469015795868e-05, + "loss": 1.6041, + "step": 6399 + }, + { + "epoch": 1.942631658825315, + "grad_norm": 0.5830715894699097, + "learning_rate": 6.117861482381532e-05, + "loss": 1.7264, + "step": 6400 + }, + { + "epoch": 1.9429351950220064, + "grad_norm": 0.5456033945083618, + "learning_rate": 6.117253948967193e-05, + "loss": 1.8035, + "step": 6401 + }, + { + "epoch": 1.9432387312186978, + "grad_norm": 0.4758380949497223, + "learning_rate": 6.116646415552856e-05, + "loss": 1.6618, + "step": 6402 + }, + { + "epoch": 1.9435422674153893, + "grad_norm": 0.6281861662864685, + "learning_rate": 6.116038882138518e-05, + "loss": 1.4869, + "step": 6403 + }, + { + "epoch": 1.9438458036120807, + "grad_norm": 0.6155133247375488, + "learning_rate": 6.11543134872418e-05, + "loss": 1.6041, + "step": 6404 + }, + { + "epoch": 1.9441493398087721, + "grad_norm": 0.549446702003479, + "learning_rate": 6.114823815309843e-05, + "loss": 1.6261, + "step": 6405 + }, + { + "epoch": 1.9444528760054638, + "grad_norm": 0.5333273410797119, + "learning_rate": 6.114216281895505e-05, + "loss": 1.7439, + "step": 6406 + }, + { + "epoch": 1.944756412202155, + "grad_norm": 0.5138979554176331, + "learning_rate": 6.113608748481166e-05, + "loss": 1.3236, + "step": 6407 + }, + { + "epoch": 1.9450599483988467, + "grad_norm": 0.5613072514533997, + "learning_rate": 6.11300121506683e-05, + "loss": 1.6681, + "step": 6408 + }, + { + "epoch": 1.9453634845955379, + "grad_norm": 0.611084520816803, + "learning_rate": 6.112393681652491e-05, + "loss": 1.5973, + "step": 6409 + }, + { + "epoch": 1.9456670207922295, + "grad_norm": 0.6191888451576233, + "learning_rate": 6.111786148238153e-05, + "loss": 1.3854, + "step": 6410 + }, + { + "epoch": 1.945970556988921, + "grad_norm": 0.6448600888252258, + "learning_rate": 6.111178614823816e-05, + "loss": 1.8407, + "step": 6411 + }, + { + "epoch": 1.9462740931856124, + "grad_norm": 0.5122054815292358, + "learning_rate": 6.110571081409478e-05, + "loss": 1.7577, + "step": 6412 + }, + { + "epoch": 1.9465776293823038, + "grad_norm": 0.5150049328804016, + "learning_rate": 6.10996354799514e-05, + "loss": 1.3837, + "step": 6413 + }, + { + "epoch": 1.9468811655789953, + "grad_norm": 0.5011094808578491, + "learning_rate": 6.109356014580803e-05, + "loss": 1.7184, + "step": 6414 + }, + { + "epoch": 1.9471847017756867, + "grad_norm": 0.44695425033569336, + "learning_rate": 6.108748481166464e-05, + "loss": 1.7555, + "step": 6415 + }, + { + "epoch": 1.9474882379723781, + "grad_norm": 0.5184510946273804, + "learning_rate": 6.108140947752127e-05, + "loss": 1.7578, + "step": 6416 + }, + { + "epoch": 1.9477917741690698, + "grad_norm": 0.6175386309623718, + "learning_rate": 6.107533414337789e-05, + "loss": 1.4583, + "step": 6417 + }, + { + "epoch": 1.948095310365761, + "grad_norm": 0.5669700503349304, + "learning_rate": 6.106925880923451e-05, + "loss": 1.6413, + "step": 6418 + }, + { + "epoch": 1.9483988465624527, + "grad_norm": 0.5198760032653809, + "learning_rate": 6.106318347509114e-05, + "loss": 1.8265, + "step": 6419 + }, + { + "epoch": 1.9487023827591439, + "grad_norm": 0.5493960976600647, + "learning_rate": 6.105710814094774e-05, + "loss": 1.0121, + "step": 6420 + }, + { + "epoch": 1.9490059189558355, + "grad_norm": 0.8704328536987305, + "learning_rate": 6.105103280680437e-05, + "loss": 1.4603, + "step": 6421 + }, + { + "epoch": 1.949309455152527, + "grad_norm": 0.5389131307601929, + "learning_rate": 6.1044957472661e-05, + "loss": 1.6166, + "step": 6422 + }, + { + "epoch": 1.9496129913492184, + "grad_norm": 0.5700472593307495, + "learning_rate": 6.103888213851762e-05, + "loss": 1.7217, + "step": 6423 + }, + { + "epoch": 1.9499165275459098, + "grad_norm": 0.6041691899299622, + "learning_rate": 6.1032806804374246e-05, + "loss": 1.5693, + "step": 6424 + }, + { + "epoch": 1.9502200637426013, + "grad_norm": 0.5294330716133118, + "learning_rate": 6.102673147023087e-05, + "loss": 1.644, + "step": 6425 + }, + { + "epoch": 1.950523599939293, + "grad_norm": 0.5551626682281494, + "learning_rate": 6.102065613608748e-05, + "loss": 1.6099, + "step": 6426 + }, + { + "epoch": 1.9508271361359841, + "grad_norm": 0.5445873737335205, + "learning_rate": 6.101458080194411e-05, + "loss": 1.4334, + "step": 6427 + }, + { + "epoch": 1.9511306723326758, + "grad_norm": 0.5853805541992188, + "learning_rate": 6.1008505467800736e-05, + "loss": 1.6584, + "step": 6428 + }, + { + "epoch": 1.951434208529367, + "grad_norm": 0.5774348974227905, + "learning_rate": 6.100243013365735e-05, + "loss": 1.6662, + "step": 6429 + }, + { + "epoch": 1.9517377447260587, + "grad_norm": 0.5022454261779785, + "learning_rate": 6.099635479951398e-05, + "loss": 1.9131, + "step": 6430 + }, + { + "epoch": 1.95204128092275, + "grad_norm": 0.44924312829971313, + "learning_rate": 6.09902794653706e-05, + "loss": 1.4716, + "step": 6431 + }, + { + "epoch": 1.9523448171194415, + "grad_norm": 0.5021010041236877, + "learning_rate": 6.098420413122722e-05, + "loss": 0.8655, + "step": 6432 + }, + { + "epoch": 1.952648353316133, + "grad_norm": 0.621820867061615, + "learning_rate": 6.097812879708384e-05, + "loss": 1.396, + "step": 6433 + }, + { + "epoch": 1.9529518895128244, + "grad_norm": 0.6849603056907654, + "learning_rate": 6.097205346294046e-05, + "loss": 1.2204, + "step": 6434 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.9602012038230896, + "learning_rate": 6.0965978128797084e-05, + "loss": 1.5787, + "step": 6435 + }, + { + "epoch": 1.9535589619062073, + "grad_norm": 0.5582414269447327, + "learning_rate": 6.095990279465371e-05, + "loss": 1.8998, + "step": 6436 + }, + { + "epoch": 1.953862498102899, + "grad_norm": 0.4775597155094147, + "learning_rate": 6.0953827460510325e-05, + "loss": 2.094, + "step": 6437 + }, + { + "epoch": 1.9541660342995901, + "grad_norm": 0.5407741665840149, + "learning_rate": 6.094775212636695e-05, + "loss": 1.6444, + "step": 6438 + }, + { + "epoch": 1.9544695704962818, + "grad_norm": 0.5293698906898499, + "learning_rate": 6.094167679222358e-05, + "loss": 1.7003, + "step": 6439 + }, + { + "epoch": 1.954773106692973, + "grad_norm": 0.5839650630950928, + "learning_rate": 6.093560145808019e-05, + "loss": 1.489, + "step": 6440 + }, + { + "epoch": 1.9550766428896647, + "grad_norm": 0.6273818612098694, + "learning_rate": 6.092952612393682e-05, + "loss": 1.5088, + "step": 6441 + }, + { + "epoch": 1.955380179086356, + "grad_norm": 0.5513050556182861, + "learning_rate": 6.0923450789793446e-05, + "loss": 1.652, + "step": 6442 + }, + { + "epoch": 1.9556837152830475, + "grad_norm": 0.48457103967666626, + "learning_rate": 6.091737545565006e-05, + "loss": 1.7693, + "step": 6443 + }, + { + "epoch": 1.955987251479739, + "grad_norm": 0.5889832973480225, + "learning_rate": 6.091130012150669e-05, + "loss": 1.9562, + "step": 6444 + }, + { + "epoch": 1.9562907876764304, + "grad_norm": 0.5198668241500854, + "learning_rate": 6.090522478736331e-05, + "loss": 1.592, + "step": 6445 + }, + { + "epoch": 1.9565943238731218, + "grad_norm": 0.462070107460022, + "learning_rate": 6.089914945321993e-05, + "loss": 2.0129, + "step": 6446 + }, + { + "epoch": 1.9568978600698133, + "grad_norm": 0.6464818120002747, + "learning_rate": 6.089307411907655e-05, + "loss": 1.4838, + "step": 6447 + }, + { + "epoch": 1.957201396266505, + "grad_norm": 0.510562539100647, + "learning_rate": 6.088699878493317e-05, + "loss": 1.6365, + "step": 6448 + }, + { + "epoch": 1.9575049324631961, + "grad_norm": 0.4521179497241974, + "learning_rate": 6.0880923450789794e-05, + "loss": 1.6811, + "step": 6449 + }, + { + "epoch": 1.9578084686598878, + "grad_norm": 0.45304521918296814, + "learning_rate": 6.087484811664642e-05, + "loss": 1.2221, + "step": 6450 + }, + { + "epoch": 1.958112004856579, + "grad_norm": 0.5869529843330383, + "learning_rate": 6.0868772782503036e-05, + "loss": 1.6444, + "step": 6451 + }, + { + "epoch": 1.9584155410532706, + "grad_norm": 0.6218820214271545, + "learning_rate": 6.086269744835966e-05, + "loss": 1.6352, + "step": 6452 + }, + { + "epoch": 1.958719077249962, + "grad_norm": 0.6100924015045166, + "learning_rate": 6.085662211421629e-05, + "loss": 1.4695, + "step": 6453 + }, + { + "epoch": 1.9590226134466535, + "grad_norm": 0.5317431688308716, + "learning_rate": 6.08505467800729e-05, + "loss": 1.6922, + "step": 6454 + }, + { + "epoch": 1.959326149643345, + "grad_norm": 0.4049581289291382, + "learning_rate": 6.084447144592953e-05, + "loss": 1.8279, + "step": 6455 + }, + { + "epoch": 1.9596296858400364, + "grad_norm": 0.6045603156089783, + "learning_rate": 6.0838396111786156e-05, + "loss": 1.86, + "step": 6456 + }, + { + "epoch": 1.959933222036728, + "grad_norm": 0.5300851464271545, + "learning_rate": 6.083232077764277e-05, + "loss": 1.5076, + "step": 6457 + }, + { + "epoch": 1.9602367582334193, + "grad_norm": 0.6136688590049744, + "learning_rate": 6.08262454434994e-05, + "loss": 1.1669, + "step": 6458 + }, + { + "epoch": 1.960540294430111, + "grad_norm": 0.5564177632331848, + "learning_rate": 6.082017010935602e-05, + "loss": 1.4164, + "step": 6459 + }, + { + "epoch": 1.9608438306268021, + "grad_norm": 0.5613592267036438, + "learning_rate": 6.081409477521264e-05, + "loss": 1.7866, + "step": 6460 + }, + { + "epoch": 1.9611473668234938, + "grad_norm": 0.6643790602684021, + "learning_rate": 6.080801944106926e-05, + "loss": 1.6314, + "step": 6461 + }, + { + "epoch": 1.9614509030201852, + "grad_norm": 0.9621322751045227, + "learning_rate": 6.080194410692588e-05, + "loss": 1.6243, + "step": 6462 + }, + { + "epoch": 1.9617544392168766, + "grad_norm": 0.5654889345169067, + "learning_rate": 6.0795868772782504e-05, + "loss": 1.6922, + "step": 6463 + }, + { + "epoch": 1.962057975413568, + "grad_norm": 0.5692543387413025, + "learning_rate": 6.078979343863913e-05, + "loss": 1.6057, + "step": 6464 + }, + { + "epoch": 1.9623615116102595, + "grad_norm": 0.6160483360290527, + "learning_rate": 6.0783718104495746e-05, + "loss": 1.6588, + "step": 6465 + }, + { + "epoch": 1.962665047806951, + "grad_norm": 0.5041912794113159, + "learning_rate": 6.077764277035237e-05, + "loss": 1.5149, + "step": 6466 + }, + { + "epoch": 1.9629685840036424, + "grad_norm": 0.6104092597961426, + "learning_rate": 6.0771567436209e-05, + "loss": 1.1343, + "step": 6467 + }, + { + "epoch": 1.963272120200334, + "grad_norm": 0.5036048293113708, + "learning_rate": 6.076549210206561e-05, + "loss": 1.5201, + "step": 6468 + }, + { + "epoch": 1.9635756563970252, + "grad_norm": 0.5256472229957581, + "learning_rate": 6.075941676792224e-05, + "loss": 1.7575, + "step": 6469 + }, + { + "epoch": 1.963879192593717, + "grad_norm": 0.5224828720092773, + "learning_rate": 6.0753341433778866e-05, + "loss": 1.7467, + "step": 6470 + }, + { + "epoch": 1.9641827287904081, + "grad_norm": 0.5759482979774475, + "learning_rate": 6.074726609963548e-05, + "loss": 1.7598, + "step": 6471 + }, + { + "epoch": 1.9644862649870998, + "grad_norm": 0.40608924627304077, + "learning_rate": 6.074119076549211e-05, + "loss": 1.4643, + "step": 6472 + }, + { + "epoch": 1.9647898011837912, + "grad_norm": 0.5966246724128723, + "learning_rate": 6.073511543134873e-05, + "loss": 1.8583, + "step": 6473 + }, + { + "epoch": 1.9650933373804826, + "grad_norm": 0.5416772961616516, + "learning_rate": 6.072904009720535e-05, + "loss": 1.6128, + "step": 6474 + }, + { + "epoch": 1.965396873577174, + "grad_norm": 0.5953087210655212, + "learning_rate": 6.072296476306197e-05, + "loss": 1.885, + "step": 6475 + }, + { + "epoch": 1.9657004097738655, + "grad_norm": 0.5860414505004883, + "learning_rate": 6.071688942891859e-05, + "loss": 1.8107, + "step": 6476 + }, + { + "epoch": 1.966003945970557, + "grad_norm": 1.0410816669464111, + "learning_rate": 6.0710814094775214e-05, + "loss": 1.4357, + "step": 6477 + }, + { + "epoch": 1.9663074821672484, + "grad_norm": 0.5218018293380737, + "learning_rate": 6.070473876063184e-05, + "loss": 1.5651, + "step": 6478 + }, + { + "epoch": 1.96661101836394, + "grad_norm": 0.5561720132827759, + "learning_rate": 6.0698663426488456e-05, + "loss": 1.7292, + "step": 6479 + }, + { + "epoch": 1.9669145545606312, + "grad_norm": 0.5160397887229919, + "learning_rate": 6.069258809234508e-05, + "loss": 1.7071, + "step": 6480 + }, + { + "epoch": 1.967218090757323, + "grad_norm": 0.673804521560669, + "learning_rate": 6.068651275820171e-05, + "loss": 1.7731, + "step": 6481 + }, + { + "epoch": 1.9675216269540141, + "grad_norm": 0.46362537145614624, + "learning_rate": 6.068043742405832e-05, + "loss": 1.3744, + "step": 6482 + }, + { + "epoch": 1.9678251631507058, + "grad_norm": 0.5752343535423279, + "learning_rate": 6.067436208991495e-05, + "loss": 1.6567, + "step": 6483 + }, + { + "epoch": 1.9681286993473972, + "grad_norm": 0.6016415357589722, + "learning_rate": 6.0668286755771576e-05, + "loss": 1.4306, + "step": 6484 + }, + { + "epoch": 1.9684322355440886, + "grad_norm": 0.5972555875778198, + "learning_rate": 6.066221142162819e-05, + "loss": 1.7145, + "step": 6485 + }, + { + "epoch": 1.96873577174078, + "grad_norm": 0.6853018403053284, + "learning_rate": 6.065613608748482e-05, + "loss": 1.8169, + "step": 6486 + }, + { + "epoch": 1.9690393079374715, + "grad_norm": 0.5290831923484802, + "learning_rate": 6.065006075334144e-05, + "loss": 1.914, + "step": 6487 + }, + { + "epoch": 1.9693428441341632, + "grad_norm": 0.5616101622581482, + "learning_rate": 6.064398541919806e-05, + "loss": 1.5203, + "step": 6488 + }, + { + "epoch": 1.9696463803308544, + "grad_norm": 0.5446542501449585, + "learning_rate": 6.063791008505468e-05, + "loss": 1.6184, + "step": 6489 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.46483322978019714, + "learning_rate": 6.06318347509113e-05, + "loss": 1.0398, + "step": 6490 + }, + { + "epoch": 1.9702534527242372, + "grad_norm": 0.5556966066360474, + "learning_rate": 6.0625759416767924e-05, + "loss": 1.6125, + "step": 6491 + }, + { + "epoch": 1.970556988920929, + "grad_norm": 0.526470422744751, + "learning_rate": 6.061968408262455e-05, + "loss": 1.7479, + "step": 6492 + }, + { + "epoch": 1.97086052511762, + "grad_norm": 0.4931873679161072, + "learning_rate": 6.0613608748481166e-05, + "loss": 1.9352, + "step": 6493 + }, + { + "epoch": 1.9711640613143118, + "grad_norm": 0.665084958076477, + "learning_rate": 6.060753341433779e-05, + "loss": 1.7474, + "step": 6494 + }, + { + "epoch": 1.9714675975110032, + "grad_norm": 0.5243136286735535, + "learning_rate": 6.060145808019442e-05, + "loss": 1.7402, + "step": 6495 + }, + { + "epoch": 1.9717711337076946, + "grad_norm": 0.651799201965332, + "learning_rate": 6.059538274605103e-05, + "loss": 1.0746, + "step": 6496 + }, + { + "epoch": 1.972074669904386, + "grad_norm": 0.5092620849609375, + "learning_rate": 6.058930741190766e-05, + "loss": 1.7713, + "step": 6497 + }, + { + "epoch": 1.9723782061010775, + "grad_norm": 0.5405744314193726, + "learning_rate": 6.0583232077764286e-05, + "loss": 1.1775, + "step": 6498 + }, + { + "epoch": 1.9726817422977692, + "grad_norm": 0.4236612617969513, + "learning_rate": 6.05771567436209e-05, + "loss": 0.8284, + "step": 6499 + }, + { + "epoch": 1.9729852784944604, + "grad_norm": 0.5803433656692505, + "learning_rate": 6.057108140947753e-05, + "loss": 1.8339, + "step": 6500 + }, + { + "epoch": 1.973288814691152, + "grad_norm": 0.5176873803138733, + "learning_rate": 6.056500607533414e-05, + "loss": 1.4704, + "step": 6501 + }, + { + "epoch": 1.9735923508878432, + "grad_norm": 0.5557324886322021, + "learning_rate": 6.055893074119077e-05, + "loss": 1.8978, + "step": 6502 + }, + { + "epoch": 1.973895887084535, + "grad_norm": 0.5535895228385925, + "learning_rate": 6.055285540704739e-05, + "loss": 1.2969, + "step": 6503 + }, + { + "epoch": 1.9741994232812263, + "grad_norm": 0.5652197003364563, + "learning_rate": 6.054678007290401e-05, + "loss": 1.541, + "step": 6504 + }, + { + "epoch": 1.9745029594779178, + "grad_norm": 0.639728307723999, + "learning_rate": 6.0540704738760634e-05, + "loss": 1.4226, + "step": 6505 + }, + { + "epoch": 1.9748064956746092, + "grad_norm": 0.5786595344543457, + "learning_rate": 6.053462940461726e-05, + "loss": 1.3508, + "step": 6506 + }, + { + "epoch": 1.9751100318713006, + "grad_norm": 0.5643638968467712, + "learning_rate": 6.0528554070473876e-05, + "loss": 1.802, + "step": 6507 + }, + { + "epoch": 1.975413568067992, + "grad_norm": 0.5532044768333435, + "learning_rate": 6.05224787363305e-05, + "loss": 1.4272, + "step": 6508 + }, + { + "epoch": 1.9757171042646835, + "grad_norm": 0.44745269417762756, + "learning_rate": 6.051640340218713e-05, + "loss": 1.7448, + "step": 6509 + }, + { + "epoch": 1.9760206404613752, + "grad_norm": 0.5335531234741211, + "learning_rate": 6.051032806804374e-05, + "loss": 1.791, + "step": 6510 + }, + { + "epoch": 1.9763241766580664, + "grad_norm": 0.6150341629981995, + "learning_rate": 6.0504252733900365e-05, + "loss": 1.7764, + "step": 6511 + }, + { + "epoch": 1.976627712854758, + "grad_norm": 0.48714327812194824, + "learning_rate": 6.0498177399756996e-05, + "loss": 1.9457, + "step": 6512 + }, + { + "epoch": 1.9769312490514492, + "grad_norm": 0.4661531448364258, + "learning_rate": 6.049210206561361e-05, + "loss": 1.6888, + "step": 6513 + }, + { + "epoch": 1.977234785248141, + "grad_norm": 0.5356633067131042, + "learning_rate": 6.048602673147024e-05, + "loss": 1.4209, + "step": 6514 + }, + { + "epoch": 1.9775383214448323, + "grad_norm": 0.5116220116615295, + "learning_rate": 6.047995139732685e-05, + "loss": 1.2706, + "step": 6515 + }, + { + "epoch": 1.9778418576415238, + "grad_norm": 0.42522934079170227, + "learning_rate": 6.047387606318348e-05, + "loss": 1.1104, + "step": 6516 + }, + { + "epoch": 1.9781453938382152, + "grad_norm": 0.5198089480400085, + "learning_rate": 6.04678007290401e-05, + "loss": 1.9558, + "step": 6517 + }, + { + "epoch": 1.9784489300349066, + "grad_norm": 0.533348798751831, + "learning_rate": 6.046172539489672e-05, + "loss": 1.5748, + "step": 6518 + }, + { + "epoch": 1.9787524662315983, + "grad_norm": 0.5725319981575012, + "learning_rate": 6.0455650060753344e-05, + "loss": 1.849, + "step": 6519 + }, + { + "epoch": 1.9790560024282895, + "grad_norm": 0.5064899325370789, + "learning_rate": 6.044957472660997e-05, + "loss": 1.9204, + "step": 6520 + }, + { + "epoch": 1.9793595386249812, + "grad_norm": 0.6601528525352478, + "learning_rate": 6.0443499392466586e-05, + "loss": 1.841, + "step": 6521 + }, + { + "epoch": 1.9796630748216724, + "grad_norm": 0.5768593549728394, + "learning_rate": 6.043742405832321e-05, + "loss": 1.633, + "step": 6522 + }, + { + "epoch": 1.979966611018364, + "grad_norm": 0.6356444358825684, + "learning_rate": 6.0431348724179834e-05, + "loss": 1.9165, + "step": 6523 + }, + { + "epoch": 1.9802701472150552, + "grad_norm": 0.5365557670593262, + "learning_rate": 6.042527339003645e-05, + "loss": 1.4321, + "step": 6524 + }, + { + "epoch": 1.9805736834117469, + "grad_norm": 0.47943785786628723, + "learning_rate": 6.0419198055893075e-05, + "loss": 1.668, + "step": 6525 + }, + { + "epoch": 1.9808772196084383, + "grad_norm": 0.604640781879425, + "learning_rate": 6.0413122721749706e-05, + "loss": 1.4776, + "step": 6526 + }, + { + "epoch": 1.9811807558051298, + "grad_norm": 0.6262120604515076, + "learning_rate": 6.040704738760632e-05, + "loss": 1.5482, + "step": 6527 + }, + { + "epoch": 1.9814842920018212, + "grad_norm": 0.5773532390594482, + "learning_rate": 6.040097205346295e-05, + "loss": 1.7658, + "step": 6528 + }, + { + "epoch": 1.9817878281985126, + "grad_norm": 0.5646322965621948, + "learning_rate": 6.039489671931956e-05, + "loss": 1.3962, + "step": 6529 + }, + { + "epoch": 1.9820913643952043, + "grad_norm": 0.6088689565658569, + "learning_rate": 6.038882138517619e-05, + "loss": 1.7198, + "step": 6530 + }, + { + "epoch": 1.9823949005918955, + "grad_norm": 0.6004126071929932, + "learning_rate": 6.038274605103281e-05, + "loss": 1.9443, + "step": 6531 + }, + { + "epoch": 1.9826984367885871, + "grad_norm": 0.48227787017822266, + "learning_rate": 6.037667071688943e-05, + "loss": 1.631, + "step": 6532 + }, + { + "epoch": 1.9830019729852784, + "grad_norm": 0.546604335308075, + "learning_rate": 6.0370595382746054e-05, + "loss": 1.2204, + "step": 6533 + }, + { + "epoch": 1.98330550918197, + "grad_norm": 0.6554203629493713, + "learning_rate": 6.036452004860268e-05, + "loss": 1.6814, + "step": 6534 + }, + { + "epoch": 1.9836090453786615, + "grad_norm": 0.4436679482460022, + "learning_rate": 6.0358444714459296e-05, + "loss": 1.7958, + "step": 6535 + }, + { + "epoch": 1.9839125815753529, + "grad_norm": 0.48438987135887146, + "learning_rate": 6.035236938031592e-05, + "loss": 1.8244, + "step": 6536 + }, + { + "epoch": 1.9842161177720443, + "grad_norm": 0.4136127233505249, + "learning_rate": 6.0346294046172544e-05, + "loss": 1.7304, + "step": 6537 + }, + { + "epoch": 1.9845196539687358, + "grad_norm": 0.4987366795539856, + "learning_rate": 6.034021871202916e-05, + "loss": 1.8897, + "step": 6538 + }, + { + "epoch": 1.9848231901654272, + "grad_norm": 0.5552710294723511, + "learning_rate": 6.0334143377885785e-05, + "loss": 1.5099, + "step": 6539 + }, + { + "epoch": 1.9851267263621186, + "grad_norm": 0.6219823956489563, + "learning_rate": 6.0328068043742416e-05, + "loss": 1.4882, + "step": 6540 + }, + { + "epoch": 1.9854302625588103, + "grad_norm": 0.5261727571487427, + "learning_rate": 6.032199270959903e-05, + "loss": 1.2227, + "step": 6541 + }, + { + "epoch": 1.9857337987555015, + "grad_norm": 0.5668848156929016, + "learning_rate": 6.031591737545566e-05, + "loss": 1.7845, + "step": 6542 + }, + { + "epoch": 1.9860373349521931, + "grad_norm": 0.47545093297958374, + "learning_rate": 6.030984204131227e-05, + "loss": 1.2132, + "step": 6543 + }, + { + "epoch": 1.9863408711488844, + "grad_norm": 0.5103720426559448, + "learning_rate": 6.03037667071689e-05, + "loss": 1.6584, + "step": 6544 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.5664292573928833, + "learning_rate": 6.029769137302552e-05, + "loss": 1.8483, + "step": 6545 + }, + { + "epoch": 1.9869479435422674, + "grad_norm": 0.4930339753627777, + "learning_rate": 6.029161603888214e-05, + "loss": 1.071, + "step": 6546 + }, + { + "epoch": 1.9872514797389589, + "grad_norm": 0.4483380615711212, + "learning_rate": 6.0285540704738765e-05, + "loss": 1.5424, + "step": 6547 + }, + { + "epoch": 1.9875550159356503, + "grad_norm": 0.5853394269943237, + "learning_rate": 6.027946537059539e-05, + "loss": 1.151, + "step": 6548 + }, + { + "epoch": 1.9878585521323417, + "grad_norm": 0.46078211069107056, + "learning_rate": 6.0273390036452006e-05, + "loss": 1.7595, + "step": 6549 + }, + { + "epoch": 1.9881620883290332, + "grad_norm": 0.4655674397945404, + "learning_rate": 6.026731470230863e-05, + "loss": 1.2309, + "step": 6550 + }, + { + "epoch": 1.9884656245257246, + "grad_norm": 0.5861518383026123, + "learning_rate": 6.0261239368165254e-05, + "loss": 1.813, + "step": 6551 + }, + { + "epoch": 1.9887691607224163, + "grad_norm": 0.7233760356903076, + "learning_rate": 6.025516403402187e-05, + "loss": 1.6074, + "step": 6552 + }, + { + "epoch": 1.9890726969191075, + "grad_norm": 0.8948001861572266, + "learning_rate": 6.0249088699878495e-05, + "loss": 1.2804, + "step": 6553 + }, + { + "epoch": 1.9893762331157991, + "grad_norm": 0.6039700508117676, + "learning_rate": 6.0243013365735126e-05, + "loss": 1.3581, + "step": 6554 + }, + { + "epoch": 1.9896797693124904, + "grad_norm": 0.6029567122459412, + "learning_rate": 6.023693803159174e-05, + "loss": 1.9989, + "step": 6555 + }, + { + "epoch": 1.989983305509182, + "grad_norm": 0.4397352635860443, + "learning_rate": 6.023086269744837e-05, + "loss": 1.2332, + "step": 6556 + }, + { + "epoch": 1.9902868417058734, + "grad_norm": 0.6175510883331299, + "learning_rate": 6.022478736330498e-05, + "loss": 1.8423, + "step": 6557 + }, + { + "epoch": 1.9905903779025649, + "grad_norm": 0.5127015113830566, + "learning_rate": 6.021871202916161e-05, + "loss": 0.9714, + "step": 6558 + }, + { + "epoch": 1.9908939140992563, + "grad_norm": 0.5095683336257935, + "learning_rate": 6.021263669501823e-05, + "loss": 1.8097, + "step": 6559 + }, + { + "epoch": 1.9911974502959477, + "grad_norm": 0.5457046031951904, + "learning_rate": 6.0206561360874844e-05, + "loss": 1.4398, + "step": 6560 + }, + { + "epoch": 1.9915009864926394, + "grad_norm": 0.9285824298858643, + "learning_rate": 6.0200486026731475e-05, + "loss": 1.4319, + "step": 6561 + }, + { + "epoch": 1.9918045226893306, + "grad_norm": 0.5472911596298218, + "learning_rate": 6.01944106925881e-05, + "loss": 1.9349, + "step": 6562 + }, + { + "epoch": 1.9921080588860223, + "grad_norm": 0.4925740957260132, + "learning_rate": 6.0188335358444716e-05, + "loss": 1.5178, + "step": 6563 + }, + { + "epoch": 1.9924115950827135, + "grad_norm": 0.5921374559402466, + "learning_rate": 6.018226002430134e-05, + "loss": 1.5178, + "step": 6564 + }, + { + "epoch": 1.9927151312794051, + "grad_norm": 0.555499792098999, + "learning_rate": 6.0176184690157964e-05, + "loss": 1.6957, + "step": 6565 + }, + { + "epoch": 1.9930186674760966, + "grad_norm": 0.550757110118866, + "learning_rate": 6.017010935601458e-05, + "loss": 1.6667, + "step": 6566 + }, + { + "epoch": 1.993322203672788, + "grad_norm": 0.544740617275238, + "learning_rate": 6.0164034021871206e-05, + "loss": 1.9381, + "step": 6567 + }, + { + "epoch": 1.9936257398694794, + "grad_norm": 0.4217390716075897, + "learning_rate": 6.0157958687727836e-05, + "loss": 1.3707, + "step": 6568 + }, + { + "epoch": 1.9939292760661709, + "grad_norm": 0.5475983619689941, + "learning_rate": 6.015188335358445e-05, + "loss": 1.7717, + "step": 6569 + }, + { + "epoch": 1.9942328122628623, + "grad_norm": 0.5119839906692505, + "learning_rate": 6.014580801944108e-05, + "loss": 1.4352, + "step": 6570 + }, + { + "epoch": 1.9945363484595537, + "grad_norm": 0.5068908333778381, + "learning_rate": 6.013973268529769e-05, + "loss": 1.3449, + "step": 6571 + }, + { + "epoch": 1.9948398846562454, + "grad_norm": 0.8712594509124756, + "learning_rate": 6.013365735115431e-05, + "loss": 1.2025, + "step": 6572 + }, + { + "epoch": 1.9951434208529366, + "grad_norm": 0.5083956122398376, + "learning_rate": 6.012758201701094e-05, + "loss": 1.8382, + "step": 6573 + }, + { + "epoch": 1.9954469570496283, + "grad_norm": 0.558754563331604, + "learning_rate": 6.0121506682867554e-05, + "loss": 1.5454, + "step": 6574 + }, + { + "epoch": 1.9957504932463195, + "grad_norm": 0.56935054063797, + "learning_rate": 6.0115431348724185e-05, + "loss": 1.5451, + "step": 6575 + }, + { + "epoch": 1.9960540294430111, + "grad_norm": 0.4553762674331665, + "learning_rate": 6.010935601458081e-05, + "loss": 2.3248, + "step": 6576 + }, + { + "epoch": 1.9963575656397026, + "grad_norm": 0.5736023187637329, + "learning_rate": 6.0103280680437426e-05, + "loss": 1.5156, + "step": 6577 + }, + { + "epoch": 1.996661101836394, + "grad_norm": 0.4840937554836273, + "learning_rate": 6.009720534629405e-05, + "loss": 1.3645, + "step": 6578 + }, + { + "epoch": 1.9969646380330854, + "grad_norm": 0.6681798100471497, + "learning_rate": 6.0091130012150674e-05, + "loss": 1.2697, + "step": 6579 + }, + { + "epoch": 1.9972681742297769, + "grad_norm": 0.6583466529846191, + "learning_rate": 6.008505467800729e-05, + "loss": 1.589, + "step": 6580 + }, + { + "epoch": 1.9975717104264683, + "grad_norm": 0.3614431619644165, + "learning_rate": 6.0078979343863916e-05, + "loss": 1.3172, + "step": 6581 + }, + { + "epoch": 1.9978752466231597, + "grad_norm": 0.4758198857307434, + "learning_rate": 6.007290400972053e-05, + "loss": 1.8047, + "step": 6582 + }, + { + "epoch": 1.9981787828198514, + "grad_norm": 0.5516407489776611, + "learning_rate": 6.006682867557716e-05, + "loss": 2.0063, + "step": 6583 + }, + { + "epoch": 1.9984823190165426, + "grad_norm": 0.5885918736457825, + "learning_rate": 6.006075334143378e-05, + "loss": 1.8758, + "step": 6584 + }, + { + "epoch": 1.9987858552132343, + "grad_norm": 0.559417724609375, + "learning_rate": 6.00546780072904e-05, + "loss": 1.8631, + "step": 6585 + }, + { + "epoch": 1.9990893914099255, + "grad_norm": 0.5297870635986328, + "learning_rate": 6.004860267314702e-05, + "loss": 1.2478, + "step": 6586 + }, + { + "epoch": 1.9993929276066171, + "grad_norm": 0.583615243434906, + "learning_rate": 6.004252733900365e-05, + "loss": 1.4074, + "step": 6587 + }, + { + "epoch": 1.9996964638033086, + "grad_norm": 0.6096095442771912, + "learning_rate": 6.0036452004860264e-05, + "loss": 1.8452, + "step": 6588 + }, + { + "epoch": 2.0, + "grad_norm": 0.48540517687797546, + "learning_rate": 6.0030376670716895e-05, + "loss": 1.4051, + "step": 6589 + }, + { + "epoch": 2.0003035361966917, + "grad_norm": 0.4750955104827881, + "learning_rate": 6.002430133657352e-05, + "loss": 1.5649, + "step": 6590 + }, + { + "epoch": 2.000607072393383, + "grad_norm": 0.559846818447113, + "learning_rate": 6.0018226002430136e-05, + "loss": 1.4351, + "step": 6591 + }, + { + "epoch": 2.0009106085900745, + "grad_norm": 0.6046615242958069, + "learning_rate": 6.001215066828676e-05, + "loss": 1.3853, + "step": 6592 + }, + { + "epoch": 2.0012141447867657, + "grad_norm": 0.5519492626190186, + "learning_rate": 6.0006075334143384e-05, + "loss": 1.3244, + "step": 6593 + }, + { + "epoch": 2.0015176809834574, + "grad_norm": 0.6226881742477417, + "learning_rate": 6e-05, + "loss": 1.1149, + "step": 6594 + }, + { + "epoch": 2.0018212171801486, + "grad_norm": 0.6426854729652405, + "learning_rate": 5.9993924665856626e-05, + "loss": 1.4761, + "step": 6595 + }, + { + "epoch": 2.0021247533768403, + "grad_norm": 0.7619990110397339, + "learning_rate": 5.998784933171324e-05, + "loss": 1.1087, + "step": 6596 + }, + { + "epoch": 2.0024282895735315, + "grad_norm": 1.1839655637741089, + "learning_rate": 5.998177399756987e-05, + "loss": 1.4132, + "step": 6597 + }, + { + "epoch": 2.002731825770223, + "grad_norm": 0.736304759979248, + "learning_rate": 5.997569866342649e-05, + "loss": 1.0338, + "step": 6598 + }, + { + "epoch": 2.0030353619669143, + "grad_norm": 0.7201933264732361, + "learning_rate": 5.996962332928311e-05, + "loss": 1.3339, + "step": 6599 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.5403030514717102, + "learning_rate": 5.996354799513973e-05, + "loss": 1.288, + "step": 6600 + }, + { + "epoch": 2.0036424343602977, + "grad_norm": 0.5611597895622253, + "learning_rate": 5.995747266099636e-05, + "loss": 0.7736, + "step": 6601 + }, + { + "epoch": 2.003945970556989, + "grad_norm": 0.6462942361831665, + "learning_rate": 5.9951397326852974e-05, + "loss": 1.2936, + "step": 6602 + }, + { + "epoch": 2.0042495067536805, + "grad_norm": 0.6441487073898315, + "learning_rate": 5.9945321992709605e-05, + "loss": 1.3673, + "step": 6603 + }, + { + "epoch": 2.0045530429503717, + "grad_norm": 0.6502240300178528, + "learning_rate": 5.993924665856623e-05, + "loss": 1.4637, + "step": 6604 + }, + { + "epoch": 2.0048565791470634, + "grad_norm": 0.8739331364631653, + "learning_rate": 5.9933171324422846e-05, + "loss": 0.9957, + "step": 6605 + }, + { + "epoch": 2.0051601153437546, + "grad_norm": 0.7587090134620667, + "learning_rate": 5.992709599027947e-05, + "loss": 1.4321, + "step": 6606 + }, + { + "epoch": 2.0054636515404463, + "grad_norm": 0.7050350904464722, + "learning_rate": 5.9921020656136094e-05, + "loss": 0.8794, + "step": 6607 + }, + { + "epoch": 2.0057671877371375, + "grad_norm": 0.6558219790458679, + "learning_rate": 5.991494532199271e-05, + "loss": 1.1112, + "step": 6608 + }, + { + "epoch": 2.006070723933829, + "grad_norm": 0.8796700835227966, + "learning_rate": 5.9908869987849336e-05, + "loss": 1.2927, + "step": 6609 + }, + { + "epoch": 2.006374260130521, + "grad_norm": 0.5149579644203186, + "learning_rate": 5.990279465370595e-05, + "loss": 0.8447, + "step": 6610 + }, + { + "epoch": 2.006677796327212, + "grad_norm": 0.8468472361564636, + "learning_rate": 5.989671931956258e-05, + "loss": 1.3351, + "step": 6611 + }, + { + "epoch": 2.0069813325239036, + "grad_norm": 0.7754473686218262, + "learning_rate": 5.98906439854192e-05, + "loss": 1.6105, + "step": 6612 + }, + { + "epoch": 2.007284868720595, + "grad_norm": 0.7056512832641602, + "learning_rate": 5.988456865127582e-05, + "loss": 1.0233, + "step": 6613 + }, + { + "epoch": 2.0075884049172865, + "grad_norm": 0.7129377126693726, + "learning_rate": 5.987849331713244e-05, + "loss": 1.2205, + "step": 6614 + }, + { + "epoch": 2.0078919411139777, + "grad_norm": 0.6188176274299622, + "learning_rate": 5.987241798298907e-05, + "loss": 1.501, + "step": 6615 + }, + { + "epoch": 2.0081954773106694, + "grad_norm": 0.6133326888084412, + "learning_rate": 5.9866342648845684e-05, + "loss": 0.9485, + "step": 6616 + }, + { + "epoch": 2.0084990135073606, + "grad_norm": 0.8083095550537109, + "learning_rate": 5.9860267314702315e-05, + "loss": 1.0196, + "step": 6617 + }, + { + "epoch": 2.0088025497040523, + "grad_norm": 0.7114616632461548, + "learning_rate": 5.985419198055894e-05, + "loss": 1.4478, + "step": 6618 + }, + { + "epoch": 2.0091060859007435, + "grad_norm": 0.681473970413208, + "learning_rate": 5.9848116646415556e-05, + "loss": 1.2194, + "step": 6619 + }, + { + "epoch": 2.009409622097435, + "grad_norm": 0.6493435502052307, + "learning_rate": 5.984204131227218e-05, + "loss": 1.0063, + "step": 6620 + }, + { + "epoch": 2.009713158294127, + "grad_norm": 0.5193372368812561, + "learning_rate": 5.9835965978128804e-05, + "loss": 0.99, + "step": 6621 + }, + { + "epoch": 2.010016694490818, + "grad_norm": 0.8599382638931274, + "learning_rate": 5.982989064398542e-05, + "loss": 1.4596, + "step": 6622 + }, + { + "epoch": 2.0103202306875096, + "grad_norm": 0.7509252429008484, + "learning_rate": 5.9823815309842046e-05, + "loss": 1.1321, + "step": 6623 + }, + { + "epoch": 2.010623766884201, + "grad_norm": 0.8264543414115906, + "learning_rate": 5.981773997569866e-05, + "loss": 1.1802, + "step": 6624 + }, + { + "epoch": 2.0109273030808925, + "grad_norm": 0.7668397426605225, + "learning_rate": 5.981166464155529e-05, + "loss": 1.2522, + "step": 6625 + }, + { + "epoch": 2.0112308392775837, + "grad_norm": 0.5355592370033264, + "learning_rate": 5.980558930741191e-05, + "loss": 1.2359, + "step": 6626 + }, + { + "epoch": 2.0115343754742754, + "grad_norm": 0.6542816758155823, + "learning_rate": 5.979951397326853e-05, + "loss": 1.4511, + "step": 6627 + }, + { + "epoch": 2.0118379116709666, + "grad_norm": 0.6245687007904053, + "learning_rate": 5.979343863912515e-05, + "loss": 0.9108, + "step": 6628 + }, + { + "epoch": 2.0121414478676583, + "grad_norm": 0.702171802520752, + "learning_rate": 5.9787363304981783e-05, + "loss": 1.4561, + "step": 6629 + }, + { + "epoch": 2.0124449840643495, + "grad_norm": 0.7270470261573792, + "learning_rate": 5.9781287970838394e-05, + "loss": 1.2363, + "step": 6630 + }, + { + "epoch": 2.012748520261041, + "grad_norm": 0.7082234025001526, + "learning_rate": 5.9775212636695025e-05, + "loss": 1.5224, + "step": 6631 + }, + { + "epoch": 2.0130520564577328, + "grad_norm": 0.7704906463623047, + "learning_rate": 5.976913730255165e-05, + "loss": 1.0233, + "step": 6632 + }, + { + "epoch": 2.013355592654424, + "grad_norm": 0.8580783605575562, + "learning_rate": 5.976306196840826e-05, + "loss": 1.0413, + "step": 6633 + }, + { + "epoch": 2.0136591288511156, + "grad_norm": 0.7564939260482788, + "learning_rate": 5.975698663426489e-05, + "loss": 1.2357, + "step": 6634 + }, + { + "epoch": 2.013962665047807, + "grad_norm": 0.8341938853263855, + "learning_rate": 5.9750911300121514e-05, + "loss": 1.5273, + "step": 6635 + }, + { + "epoch": 2.0142662012444985, + "grad_norm": 0.7217328548431396, + "learning_rate": 5.974483596597813e-05, + "loss": 1.5105, + "step": 6636 + }, + { + "epoch": 2.0145697374411897, + "grad_norm": 0.6179178357124329, + "learning_rate": 5.9738760631834756e-05, + "loss": 1.3321, + "step": 6637 + }, + { + "epoch": 2.0148732736378814, + "grad_norm": 0.8074550032615662, + "learning_rate": 5.973268529769137e-05, + "loss": 1.4196, + "step": 6638 + }, + { + "epoch": 2.0151768098345726, + "grad_norm": 0.7848823666572571, + "learning_rate": 5.9726609963548e-05, + "loss": 1.2375, + "step": 6639 + }, + { + "epoch": 2.0154803460312642, + "grad_norm": 0.597193717956543, + "learning_rate": 5.972053462940462e-05, + "loss": 0.9173, + "step": 6640 + }, + { + "epoch": 2.015783882227956, + "grad_norm": 0.7859654426574707, + "learning_rate": 5.971445929526124e-05, + "loss": 1.3904, + "step": 6641 + }, + { + "epoch": 2.016087418424647, + "grad_norm": 0.8810588121414185, + "learning_rate": 5.970838396111786e-05, + "loss": 1.2244, + "step": 6642 + }, + { + "epoch": 2.0163909546213388, + "grad_norm": 0.6478848457336426, + "learning_rate": 5.9702308626974493e-05, + "loss": 1.2265, + "step": 6643 + }, + { + "epoch": 2.01669449081803, + "grad_norm": 0.7811263203620911, + "learning_rate": 5.9696233292831104e-05, + "loss": 1.283, + "step": 6644 + }, + { + "epoch": 2.0169980270147216, + "grad_norm": 0.8496779203414917, + "learning_rate": 5.969015795868773e-05, + "loss": 1.208, + "step": 6645 + }, + { + "epoch": 2.017301563211413, + "grad_norm": 0.6808563470840454, + "learning_rate": 5.968408262454436e-05, + "loss": 1.5266, + "step": 6646 + }, + { + "epoch": 2.0176050994081045, + "grad_norm": 0.6841781139373779, + "learning_rate": 5.967800729040097e-05, + "loss": 1.3965, + "step": 6647 + }, + { + "epoch": 2.0179086356047957, + "grad_norm": 0.7297331094741821, + "learning_rate": 5.96719319562576e-05, + "loss": 0.6081, + "step": 6648 + }, + { + "epoch": 2.0182121718014874, + "grad_norm": 0.6482033133506775, + "learning_rate": 5.9665856622114224e-05, + "loss": 1.6838, + "step": 6649 + }, + { + "epoch": 2.0185157079981786, + "grad_norm": 0.7368562817573547, + "learning_rate": 5.965978128797084e-05, + "loss": 1.193, + "step": 6650 + }, + { + "epoch": 2.0188192441948702, + "grad_norm": 0.804199755191803, + "learning_rate": 5.9653705953827466e-05, + "loss": 1.2387, + "step": 6651 + }, + { + "epoch": 2.019122780391562, + "grad_norm": 0.7707446217536926, + "learning_rate": 5.964763061968408e-05, + "loss": 0.9575, + "step": 6652 + }, + { + "epoch": 2.019426316588253, + "grad_norm": 0.704628586769104, + "learning_rate": 5.964155528554071e-05, + "loss": 1.1401, + "step": 6653 + }, + { + "epoch": 2.0197298527849448, + "grad_norm": 0.6447728276252747, + "learning_rate": 5.963547995139733e-05, + "loss": 0.6612, + "step": 6654 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.6423813104629517, + "learning_rate": 5.962940461725395e-05, + "loss": 0.9907, + "step": 6655 + }, + { + "epoch": 2.0203369251783276, + "grad_norm": 0.6763865947723389, + "learning_rate": 5.962332928311057e-05, + "loss": 1.2592, + "step": 6656 + }, + { + "epoch": 2.020640461375019, + "grad_norm": 0.6126373410224915, + "learning_rate": 5.96172539489672e-05, + "loss": 1.3896, + "step": 6657 + }, + { + "epoch": 2.0209439975717105, + "grad_norm": 0.6113005876541138, + "learning_rate": 5.9611178614823814e-05, + "loss": 1.0545, + "step": 6658 + }, + { + "epoch": 2.0212475337684017, + "grad_norm": 0.5360875725746155, + "learning_rate": 5.960510328068044e-05, + "loss": 1.5621, + "step": 6659 + }, + { + "epoch": 2.0215510699650934, + "grad_norm": 0.7377156615257263, + "learning_rate": 5.959902794653707e-05, + "loss": 0.9596, + "step": 6660 + }, + { + "epoch": 2.0218546061617846, + "grad_norm": 0.7450338006019592, + "learning_rate": 5.959295261239368e-05, + "loss": 0.7481, + "step": 6661 + }, + { + "epoch": 2.0221581423584762, + "grad_norm": 0.8769157528877258, + "learning_rate": 5.958687727825031e-05, + "loss": 1.3477, + "step": 6662 + }, + { + "epoch": 2.022461678555168, + "grad_norm": 0.45230183005332947, + "learning_rate": 5.9580801944106934e-05, + "loss": 1.3026, + "step": 6663 + }, + { + "epoch": 2.022765214751859, + "grad_norm": 0.7798328995704651, + "learning_rate": 5.957472660996355e-05, + "loss": 1.3114, + "step": 6664 + }, + { + "epoch": 2.0230687509485508, + "grad_norm": 0.7356457114219666, + "learning_rate": 5.9568651275820176e-05, + "loss": 1.195, + "step": 6665 + }, + { + "epoch": 2.023372287145242, + "grad_norm": 0.587321937084198, + "learning_rate": 5.956257594167679e-05, + "loss": 1.7201, + "step": 6666 + }, + { + "epoch": 2.0236758233419336, + "grad_norm": 0.6452397704124451, + "learning_rate": 5.955650060753342e-05, + "loss": 1.3267, + "step": 6667 + }, + { + "epoch": 2.023979359538625, + "grad_norm": 0.7600719332695007, + "learning_rate": 5.955042527339004e-05, + "loss": 1.3694, + "step": 6668 + }, + { + "epoch": 2.0242828957353165, + "grad_norm": 0.8960398435592651, + "learning_rate": 5.954434993924666e-05, + "loss": 1.202, + "step": 6669 + }, + { + "epoch": 2.0245864319320077, + "grad_norm": 0.7933389544487, + "learning_rate": 5.953827460510328e-05, + "loss": 0.9678, + "step": 6670 + }, + { + "epoch": 2.0248899681286994, + "grad_norm": 0.773642897605896, + "learning_rate": 5.953219927095991e-05, + "loss": 1.3841, + "step": 6671 + }, + { + "epoch": 2.025193504325391, + "grad_norm": 0.6029089689254761, + "learning_rate": 5.9526123936816524e-05, + "loss": 1.1724, + "step": 6672 + }, + { + "epoch": 2.0254970405220822, + "grad_norm": 0.7390233278274536, + "learning_rate": 5.952004860267315e-05, + "loss": 0.7247, + "step": 6673 + }, + { + "epoch": 2.025800576718774, + "grad_norm": 0.7552881836891174, + "learning_rate": 5.951397326852978e-05, + "loss": 1.234, + "step": 6674 + }, + { + "epoch": 2.026104112915465, + "grad_norm": 1.0866544246673584, + "learning_rate": 5.950789793438639e-05, + "loss": 0.5423, + "step": 6675 + }, + { + "epoch": 2.0264076491121568, + "grad_norm": 0.6920125484466553, + "learning_rate": 5.950182260024302e-05, + "loss": 1.6804, + "step": 6676 + }, + { + "epoch": 2.026711185308848, + "grad_norm": 0.6557866334915161, + "learning_rate": 5.949574726609963e-05, + "loss": 1.4507, + "step": 6677 + }, + { + "epoch": 2.0270147215055396, + "grad_norm": 0.7183884382247925, + "learning_rate": 5.948967193195626e-05, + "loss": 0.7779, + "step": 6678 + }, + { + "epoch": 2.027318257702231, + "grad_norm": 0.6660280823707581, + "learning_rate": 5.9483596597812886e-05, + "loss": 1.5683, + "step": 6679 + }, + { + "epoch": 2.0276217938989225, + "grad_norm": 0.7593160271644592, + "learning_rate": 5.94775212636695e-05, + "loss": 1.4431, + "step": 6680 + }, + { + "epoch": 2.0279253300956137, + "grad_norm": 0.6688575744628906, + "learning_rate": 5.947144592952613e-05, + "loss": 1.4504, + "step": 6681 + }, + { + "epoch": 2.0282288662923054, + "grad_norm": 0.662132740020752, + "learning_rate": 5.946537059538275e-05, + "loss": 1.001, + "step": 6682 + }, + { + "epoch": 2.028532402488997, + "grad_norm": 0.6534841060638428, + "learning_rate": 5.945929526123937e-05, + "loss": 1.6814, + "step": 6683 + }, + { + "epoch": 2.0288359386856882, + "grad_norm": 0.7396661043167114, + "learning_rate": 5.945321992709599e-05, + "loss": 1.5876, + "step": 6684 + }, + { + "epoch": 2.02913947488238, + "grad_norm": 0.657581090927124, + "learning_rate": 5.944714459295262e-05, + "loss": 1.365, + "step": 6685 + }, + { + "epoch": 2.029443011079071, + "grad_norm": 0.6300247311592102, + "learning_rate": 5.9441069258809234e-05, + "loss": 1.3679, + "step": 6686 + }, + { + "epoch": 2.0297465472757628, + "grad_norm": 0.7843654751777649, + "learning_rate": 5.943499392466586e-05, + "loss": 1.4725, + "step": 6687 + }, + { + "epoch": 2.030050083472454, + "grad_norm": 0.7142737507820129, + "learning_rate": 5.942891859052249e-05, + "loss": 1.2956, + "step": 6688 + }, + { + "epoch": 2.0303536196691456, + "grad_norm": 0.5903081893920898, + "learning_rate": 5.94228432563791e-05, + "loss": 0.841, + "step": 6689 + }, + { + "epoch": 2.030657155865837, + "grad_norm": 0.6715419888496399, + "learning_rate": 5.941676792223573e-05, + "loss": 0.9159, + "step": 6690 + }, + { + "epoch": 2.0309606920625285, + "grad_norm": 0.5854306817054749, + "learning_rate": 5.941069258809234e-05, + "loss": 0.7889, + "step": 6691 + }, + { + "epoch": 2.0312642282592197, + "grad_norm": 0.8587661981582642, + "learning_rate": 5.940461725394897e-05, + "loss": 1.3361, + "step": 6692 + }, + { + "epoch": 2.0315677644559114, + "grad_norm": 0.7016955018043518, + "learning_rate": 5.9398541919805596e-05, + "loss": 0.9073, + "step": 6693 + }, + { + "epoch": 2.031871300652603, + "grad_norm": 0.6640695333480835, + "learning_rate": 5.9392466585662206e-05, + "loss": 1.3431, + "step": 6694 + }, + { + "epoch": 2.0321748368492942, + "grad_norm": 0.6428939700126648, + "learning_rate": 5.938639125151884e-05, + "loss": 1.5478, + "step": 6695 + }, + { + "epoch": 2.032478373045986, + "grad_norm": 0.7507612109184265, + "learning_rate": 5.938031591737546e-05, + "loss": 0.4712, + "step": 6696 + }, + { + "epoch": 2.032781909242677, + "grad_norm": 0.7252958416938782, + "learning_rate": 5.937424058323208e-05, + "loss": 1.3709, + "step": 6697 + }, + { + "epoch": 2.0330854454393688, + "grad_norm": 0.6997389793395996, + "learning_rate": 5.93681652490887e-05, + "loss": 1.1456, + "step": 6698 + }, + { + "epoch": 2.03338898163606, + "grad_norm": 0.7252766489982605, + "learning_rate": 5.936208991494533e-05, + "loss": 1.3329, + "step": 6699 + }, + { + "epoch": 2.0336925178327516, + "grad_norm": 1.0754157304763794, + "learning_rate": 5.9356014580801944e-05, + "loss": 0.7818, + "step": 6700 + }, + { + "epoch": 2.033996054029443, + "grad_norm": 0.7663254141807556, + "learning_rate": 5.934993924665857e-05, + "loss": 1.3483, + "step": 6701 + }, + { + "epoch": 2.0342995902261345, + "grad_norm": 0.7525759339332581, + "learning_rate": 5.93438639125152e-05, + "loss": 1.3253, + "step": 6702 + }, + { + "epoch": 2.0346031264228257, + "grad_norm": 1.0139453411102295, + "learning_rate": 5.933778857837181e-05, + "loss": 1.2358, + "step": 6703 + }, + { + "epoch": 2.0349066626195174, + "grad_norm": 0.7129602432250977, + "learning_rate": 5.933171324422844e-05, + "loss": 1.2781, + "step": 6704 + }, + { + "epoch": 2.035210198816209, + "grad_norm": 0.6935693621635437, + "learning_rate": 5.932563791008505e-05, + "loss": 0.946, + "step": 6705 + }, + { + "epoch": 2.0355137350129002, + "grad_norm": 0.5569899678230286, + "learning_rate": 5.9319562575941675e-05, + "loss": 1.6153, + "step": 6706 + }, + { + "epoch": 2.035817271209592, + "grad_norm": 0.6837217211723328, + "learning_rate": 5.9313487241798306e-05, + "loss": 1.6131, + "step": 6707 + }, + { + "epoch": 2.036120807406283, + "grad_norm": 0.6533646583557129, + "learning_rate": 5.9307411907654917e-05, + "loss": 1.2929, + "step": 6708 + }, + { + "epoch": 2.0364243436029748, + "grad_norm": 0.7194601893424988, + "learning_rate": 5.930133657351155e-05, + "loss": 1.0747, + "step": 6709 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.7047435641288757, + "learning_rate": 5.929526123936817e-05, + "loss": 1.1719, + "step": 6710 + }, + { + "epoch": 2.0370314159963576, + "grad_norm": 0.650740921497345, + "learning_rate": 5.928918590522479e-05, + "loss": 1.4469, + "step": 6711 + }, + { + "epoch": 2.037334952193049, + "grad_norm": 0.7121087908744812, + "learning_rate": 5.928311057108141e-05, + "loss": 1.1483, + "step": 6712 + }, + { + "epoch": 2.0376384883897405, + "grad_norm": 0.6568427681922913, + "learning_rate": 5.927703523693804e-05, + "loss": 0.7621, + "step": 6713 + }, + { + "epoch": 2.037942024586432, + "grad_norm": 0.7922488451004028, + "learning_rate": 5.9270959902794654e-05, + "loss": 1.528, + "step": 6714 + }, + { + "epoch": 2.0382455607831234, + "grad_norm": 0.5935818552970886, + "learning_rate": 5.926488456865128e-05, + "loss": 1.0031, + "step": 6715 + }, + { + "epoch": 2.038549096979815, + "grad_norm": 0.9066329002380371, + "learning_rate": 5.925880923450791e-05, + "loss": 0.8471, + "step": 6716 + }, + { + "epoch": 2.0388526331765062, + "grad_norm": 0.7111946940422058, + "learning_rate": 5.925273390036452e-05, + "loss": 1.2482, + "step": 6717 + }, + { + "epoch": 2.039156169373198, + "grad_norm": 0.7186754941940308, + "learning_rate": 5.9246658566221144e-05, + "loss": 1.1287, + "step": 6718 + }, + { + "epoch": 2.039459705569889, + "grad_norm": 0.7100489139556885, + "learning_rate": 5.924058323207776e-05, + "loss": 1.4158, + "step": 6719 + }, + { + "epoch": 2.0397632417665807, + "grad_norm": 1.1844197511672974, + "learning_rate": 5.9234507897934385e-05, + "loss": 0.5527, + "step": 6720 + }, + { + "epoch": 2.040066777963272, + "grad_norm": 0.45704275369644165, + "learning_rate": 5.9228432563791016e-05, + "loss": 0.8737, + "step": 6721 + }, + { + "epoch": 2.0403703141599636, + "grad_norm": 0.5917233228683472, + "learning_rate": 5.9222357229647627e-05, + "loss": 0.9702, + "step": 6722 + }, + { + "epoch": 2.040673850356655, + "grad_norm": 0.7638018131256104, + "learning_rate": 5.921628189550426e-05, + "loss": 1.378, + "step": 6723 + }, + { + "epoch": 2.0409773865533465, + "grad_norm": 0.8409146666526794, + "learning_rate": 5.921020656136088e-05, + "loss": 1.0934, + "step": 6724 + }, + { + "epoch": 2.041280922750038, + "grad_norm": 0.7116609811782837, + "learning_rate": 5.92041312272175e-05, + "loss": 1.0107, + "step": 6725 + }, + { + "epoch": 2.0415844589467294, + "grad_norm": 0.7379602789878845, + "learning_rate": 5.919805589307412e-05, + "loss": 1.4649, + "step": 6726 + }, + { + "epoch": 2.041887995143421, + "grad_norm": 0.7317615747451782, + "learning_rate": 5.919198055893075e-05, + "loss": 1.0289, + "step": 6727 + }, + { + "epoch": 2.042191531340112, + "grad_norm": 0.7722091674804688, + "learning_rate": 5.9185905224787364e-05, + "loss": 1.352, + "step": 6728 + }, + { + "epoch": 2.042495067536804, + "grad_norm": 0.8695550560951233, + "learning_rate": 5.917982989064399e-05, + "loss": 1.2503, + "step": 6729 + }, + { + "epoch": 2.042798603733495, + "grad_norm": 0.7947261333465576, + "learning_rate": 5.917375455650061e-05, + "loss": 1.4089, + "step": 6730 + }, + { + "epoch": 2.0431021399301867, + "grad_norm": 0.6829918622970581, + "learning_rate": 5.916767922235723e-05, + "loss": 1.3916, + "step": 6731 + }, + { + "epoch": 2.043405676126878, + "grad_norm": 0.8099695444107056, + "learning_rate": 5.9161603888213854e-05, + "loss": 1.0505, + "step": 6732 + }, + { + "epoch": 2.0437092123235696, + "grad_norm": 0.5343392491340637, + "learning_rate": 5.915552855407047e-05, + "loss": 1.172, + "step": 6733 + }, + { + "epoch": 2.044012748520261, + "grad_norm": 0.740508496761322, + "learning_rate": 5.9149453219927095e-05, + "loss": 1.3653, + "step": 6734 + }, + { + "epoch": 2.0443162847169525, + "grad_norm": 0.565724790096283, + "learning_rate": 5.9143377885783726e-05, + "loss": 1.5483, + "step": 6735 + }, + { + "epoch": 2.044619820913644, + "grad_norm": 0.8615663647651672, + "learning_rate": 5.913730255164034e-05, + "loss": 1.203, + "step": 6736 + }, + { + "epoch": 2.0449233571103353, + "grad_norm": 0.7551422119140625, + "learning_rate": 5.913122721749697e-05, + "loss": 1.4751, + "step": 6737 + }, + { + "epoch": 2.045226893307027, + "grad_norm": 0.6796860694885254, + "learning_rate": 5.912515188335359e-05, + "loss": 1.6234, + "step": 6738 + }, + { + "epoch": 2.045530429503718, + "grad_norm": 0.8114526271820068, + "learning_rate": 5.911907654921021e-05, + "loss": 1.1534, + "step": 6739 + }, + { + "epoch": 2.04583396570041, + "grad_norm": 0.7070174217224121, + "learning_rate": 5.911300121506683e-05, + "loss": 1.5523, + "step": 6740 + }, + { + "epoch": 2.046137501897101, + "grad_norm": 0.8905296325683594, + "learning_rate": 5.910692588092346e-05, + "loss": 0.5081, + "step": 6741 + }, + { + "epoch": 2.0464410380937927, + "grad_norm": 0.9360664486885071, + "learning_rate": 5.9100850546780074e-05, + "loss": 0.6751, + "step": 6742 + }, + { + "epoch": 2.046744574290484, + "grad_norm": 0.7047433853149414, + "learning_rate": 5.90947752126367e-05, + "loss": 1.3511, + "step": 6743 + }, + { + "epoch": 2.0470481104871756, + "grad_norm": 0.655782163143158, + "learning_rate": 5.908869987849332e-05, + "loss": 1.3505, + "step": 6744 + }, + { + "epoch": 2.0473516466838673, + "grad_norm": 0.5383403301239014, + "learning_rate": 5.908262454434994e-05, + "loss": 1.0058, + "step": 6745 + }, + { + "epoch": 2.0476551828805585, + "grad_norm": 0.8472936153411865, + "learning_rate": 5.9076549210206564e-05, + "loss": 1.4217, + "step": 6746 + }, + { + "epoch": 2.04795871907725, + "grad_norm": 0.71051424741745, + "learning_rate": 5.907047387606318e-05, + "loss": 1.2775, + "step": 6747 + }, + { + "epoch": 2.0482622552739413, + "grad_norm": 0.8670709133148193, + "learning_rate": 5.9064398541919805e-05, + "loss": 1.0466, + "step": 6748 + }, + { + "epoch": 2.048565791470633, + "grad_norm": 0.7335761785507202, + "learning_rate": 5.9058323207776436e-05, + "loss": 1.176, + "step": 6749 + }, + { + "epoch": 2.048869327667324, + "grad_norm": 0.6880519390106201, + "learning_rate": 5.905224787363305e-05, + "loss": 1.5351, + "step": 6750 + }, + { + "epoch": 2.049172863864016, + "grad_norm": 0.662702202796936, + "learning_rate": 5.904617253948968e-05, + "loss": 0.9803, + "step": 6751 + }, + { + "epoch": 2.049476400060707, + "grad_norm": 0.5580474734306335, + "learning_rate": 5.90400972053463e-05, + "loss": 1.8734, + "step": 6752 + }, + { + "epoch": 2.0497799362573987, + "grad_norm": 0.6297309398651123, + "learning_rate": 5.903402187120292e-05, + "loss": 1.5143, + "step": 6753 + }, + { + "epoch": 2.05008347245409, + "grad_norm": 0.744547426700592, + "learning_rate": 5.902794653705954e-05, + "loss": 1.1973, + "step": 6754 + }, + { + "epoch": 2.0503870086507816, + "grad_norm": 0.8597540855407715, + "learning_rate": 5.902187120291617e-05, + "loss": 1.3367, + "step": 6755 + }, + { + "epoch": 2.0506905448474733, + "grad_norm": 0.6745612621307373, + "learning_rate": 5.9015795868772784e-05, + "loss": 1.0339, + "step": 6756 + }, + { + "epoch": 2.0509940810441645, + "grad_norm": 0.6249368786811829, + "learning_rate": 5.900972053462941e-05, + "loss": 1.5042, + "step": 6757 + }, + { + "epoch": 2.051297617240856, + "grad_norm": 0.6534069776535034, + "learning_rate": 5.9003645200486026e-05, + "loss": 1.486, + "step": 6758 + }, + { + "epoch": 2.0516011534375473, + "grad_norm": 0.6959986686706543, + "learning_rate": 5.899756986634265e-05, + "loss": 1.5417, + "step": 6759 + }, + { + "epoch": 2.051904689634239, + "grad_norm": 0.8996198773384094, + "learning_rate": 5.8991494532199274e-05, + "loss": 0.9192, + "step": 6760 + }, + { + "epoch": 2.05220822583093, + "grad_norm": 0.7029753923416138, + "learning_rate": 5.898541919805589e-05, + "loss": 1.5122, + "step": 6761 + }, + { + "epoch": 2.052511762027622, + "grad_norm": 0.8004100322723389, + "learning_rate": 5.8979343863912515e-05, + "loss": 1.1594, + "step": 6762 + }, + { + "epoch": 2.052815298224313, + "grad_norm": 0.9627721905708313, + "learning_rate": 5.8973268529769146e-05, + "loss": 0.8332, + "step": 6763 + }, + { + "epoch": 2.0531188344210047, + "grad_norm": 1.0539624691009521, + "learning_rate": 5.896719319562576e-05, + "loss": 1.4044, + "step": 6764 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.48121675848960876, + "learning_rate": 5.896111786148239e-05, + "loss": 1.3278, + "step": 6765 + }, + { + "epoch": 2.0537259068143876, + "grad_norm": 0.6761457920074463, + "learning_rate": 5.895504252733901e-05, + "loss": 0.9294, + "step": 6766 + }, + { + "epoch": 2.0540294430110793, + "grad_norm": 0.6362090110778809, + "learning_rate": 5.894896719319562e-05, + "loss": 1.4175, + "step": 6767 + }, + { + "epoch": 2.0543329792077705, + "grad_norm": 0.7415809631347656, + "learning_rate": 5.894289185905225e-05, + "loss": 1.2596, + "step": 6768 + }, + { + "epoch": 2.054636515404462, + "grad_norm": 0.5656198263168335, + "learning_rate": 5.893681652490888e-05, + "loss": 1.1221, + "step": 6769 + }, + { + "epoch": 2.0549400516011533, + "grad_norm": 0.7808176875114441, + "learning_rate": 5.8930741190765494e-05, + "loss": 1.3354, + "step": 6770 + }, + { + "epoch": 2.055243587797845, + "grad_norm": 0.7137062549591064, + "learning_rate": 5.892466585662212e-05, + "loss": 1.291, + "step": 6771 + }, + { + "epoch": 2.055547123994536, + "grad_norm": 0.6661720871925354, + "learning_rate": 5.8918590522478736e-05, + "loss": 1.3515, + "step": 6772 + }, + { + "epoch": 2.055850660191228, + "grad_norm": 0.8269581198692322, + "learning_rate": 5.891251518833536e-05, + "loss": 1.2311, + "step": 6773 + }, + { + "epoch": 2.056154196387919, + "grad_norm": 0.629157543182373, + "learning_rate": 5.8906439854191984e-05, + "loss": 1.1386, + "step": 6774 + }, + { + "epoch": 2.0564577325846107, + "grad_norm": 0.7245308756828308, + "learning_rate": 5.89003645200486e-05, + "loss": 1.1915, + "step": 6775 + }, + { + "epoch": 2.0567612687813024, + "grad_norm": 0.8003824353218079, + "learning_rate": 5.8894289185905225e-05, + "loss": 1.1763, + "step": 6776 + }, + { + "epoch": 2.0570648049779936, + "grad_norm": 0.8073933720588684, + "learning_rate": 5.8888213851761856e-05, + "loss": 0.9701, + "step": 6777 + }, + { + "epoch": 2.0573683411746853, + "grad_norm": 0.7716240286827087, + "learning_rate": 5.888213851761847e-05, + "loss": 1.2758, + "step": 6778 + }, + { + "epoch": 2.0576718773713765, + "grad_norm": 0.7790003418922424, + "learning_rate": 5.887606318347509e-05, + "loss": 1.5667, + "step": 6779 + }, + { + "epoch": 2.057975413568068, + "grad_norm": 0.6642122268676758, + "learning_rate": 5.886998784933172e-05, + "loss": 1.4832, + "step": 6780 + }, + { + "epoch": 2.0582789497647593, + "grad_norm": 0.8268905878067017, + "learning_rate": 5.886391251518833e-05, + "loss": 1.0759, + "step": 6781 + }, + { + "epoch": 2.058582485961451, + "grad_norm": 0.629315972328186, + "learning_rate": 5.885783718104496e-05, + "loss": 0.765, + "step": 6782 + }, + { + "epoch": 2.058886022158142, + "grad_norm": 0.7830377221107483, + "learning_rate": 5.885176184690159e-05, + "loss": 1.4531, + "step": 6783 + }, + { + "epoch": 2.059189558354834, + "grad_norm": 0.6775173544883728, + "learning_rate": 5.8845686512758205e-05, + "loss": 1.3346, + "step": 6784 + }, + { + "epoch": 2.059493094551525, + "grad_norm": 0.7450172901153564, + "learning_rate": 5.883961117861483e-05, + "loss": 1.2739, + "step": 6785 + }, + { + "epoch": 2.0597966307482167, + "grad_norm": 0.6644347310066223, + "learning_rate": 5.8833535844471446e-05, + "loss": 1.6158, + "step": 6786 + }, + { + "epoch": 2.0601001669449084, + "grad_norm": 0.9597615003585815, + "learning_rate": 5.882746051032807e-05, + "loss": 1.3137, + "step": 6787 + }, + { + "epoch": 2.0604037031415996, + "grad_norm": 0.7239344120025635, + "learning_rate": 5.8821385176184694e-05, + "loss": 1.5057, + "step": 6788 + }, + { + "epoch": 2.0607072393382913, + "grad_norm": 0.7747911810874939, + "learning_rate": 5.881530984204131e-05, + "loss": 1.1718, + "step": 6789 + }, + { + "epoch": 2.0610107755349825, + "grad_norm": 0.5871680974960327, + "learning_rate": 5.8809234507897935e-05, + "loss": 1.3308, + "step": 6790 + }, + { + "epoch": 2.061314311731674, + "grad_norm": 0.824733316898346, + "learning_rate": 5.880315917375456e-05, + "loss": 1.2952, + "step": 6791 + }, + { + "epoch": 2.0616178479283653, + "grad_norm": 0.7985734939575195, + "learning_rate": 5.879708383961118e-05, + "loss": 1.5404, + "step": 6792 + }, + { + "epoch": 2.061921384125057, + "grad_norm": 0.6020705103874207, + "learning_rate": 5.87910085054678e-05, + "loss": 1.07, + "step": 6793 + }, + { + "epoch": 2.062224920321748, + "grad_norm": 0.8818385601043701, + "learning_rate": 5.878493317132443e-05, + "loss": 1.3636, + "step": 6794 + }, + { + "epoch": 2.06252845651844, + "grad_norm": 0.5820996165275574, + "learning_rate": 5.877885783718104e-05, + "loss": 1.5162, + "step": 6795 + }, + { + "epoch": 2.062831992715131, + "grad_norm": 0.7149410247802734, + "learning_rate": 5.877278250303767e-05, + "loss": 1.3473, + "step": 6796 + }, + { + "epoch": 2.0631355289118227, + "grad_norm": 0.8226627707481384, + "learning_rate": 5.87667071688943e-05, + "loss": 1.0854, + "step": 6797 + }, + { + "epoch": 2.0634390651085144, + "grad_norm": 0.5526022911071777, + "learning_rate": 5.8760631834750915e-05, + "loss": 1.4572, + "step": 6798 + }, + { + "epoch": 2.0637426013052056, + "grad_norm": 0.8518609404563904, + "learning_rate": 5.875455650060754e-05, + "loss": 1.0517, + "step": 6799 + }, + { + "epoch": 2.0640461375018972, + "grad_norm": 0.6736918091773987, + "learning_rate": 5.8748481166464156e-05, + "loss": 1.3883, + "step": 6800 + }, + { + "epoch": 2.0643496736985885, + "grad_norm": 0.6669840812683105, + "learning_rate": 5.874240583232078e-05, + "loss": 1.3735, + "step": 6801 + }, + { + "epoch": 2.06465320989528, + "grad_norm": 0.6384963989257812, + "learning_rate": 5.8736330498177404e-05, + "loss": 1.6629, + "step": 6802 + }, + { + "epoch": 2.0649567460919713, + "grad_norm": 0.7732337713241577, + "learning_rate": 5.873025516403402e-05, + "loss": 1.3029, + "step": 6803 + }, + { + "epoch": 2.065260282288663, + "grad_norm": 1.0315752029418945, + "learning_rate": 5.8724179829890646e-05, + "loss": 1.0835, + "step": 6804 + }, + { + "epoch": 2.065563818485354, + "grad_norm": 0.8240241408348083, + "learning_rate": 5.871810449574727e-05, + "loss": 1.1011, + "step": 6805 + }, + { + "epoch": 2.065867354682046, + "grad_norm": 0.7314655184745789, + "learning_rate": 5.871202916160389e-05, + "loss": 1.4013, + "step": 6806 + }, + { + "epoch": 2.0661708908787375, + "grad_norm": 0.8280476331710815, + "learning_rate": 5.870595382746051e-05, + "loss": 1.2687, + "step": 6807 + }, + { + "epoch": 2.0664744270754287, + "grad_norm": 0.6227006316184998, + "learning_rate": 5.869987849331714e-05, + "loss": 1.2817, + "step": 6808 + }, + { + "epoch": 2.0667779632721204, + "grad_norm": 0.6892315745353699, + "learning_rate": 5.869380315917375e-05, + "loss": 1.4893, + "step": 6809 + }, + { + "epoch": 2.0670814994688116, + "grad_norm": 0.6203703880310059, + "learning_rate": 5.868772782503038e-05, + "loss": 1.2614, + "step": 6810 + }, + { + "epoch": 2.0673850356655032, + "grad_norm": 0.814782977104187, + "learning_rate": 5.868165249088701e-05, + "loss": 1.0062, + "step": 6811 + }, + { + "epoch": 2.0676885718621945, + "grad_norm": 0.4778033494949341, + "learning_rate": 5.8675577156743625e-05, + "loss": 1.5858, + "step": 6812 + }, + { + "epoch": 2.067992108058886, + "grad_norm": 0.3774012625217438, + "learning_rate": 5.866950182260025e-05, + "loss": 1.3399, + "step": 6813 + }, + { + "epoch": 2.0682956442555773, + "grad_norm": 0.7372041344642639, + "learning_rate": 5.8663426488456866e-05, + "loss": 1.4809, + "step": 6814 + }, + { + "epoch": 2.068599180452269, + "grad_norm": 0.5884072780609131, + "learning_rate": 5.865735115431349e-05, + "loss": 1.2738, + "step": 6815 + }, + { + "epoch": 2.06890271664896, + "grad_norm": 0.7832264304161072, + "learning_rate": 5.8651275820170114e-05, + "loss": 1.5305, + "step": 6816 + }, + { + "epoch": 2.069206252845652, + "grad_norm": 0.6761386394500732, + "learning_rate": 5.864520048602673e-05, + "loss": 1.4384, + "step": 6817 + }, + { + "epoch": 2.0695097890423435, + "grad_norm": 0.9826549291610718, + "learning_rate": 5.8639125151883356e-05, + "loss": 1.1518, + "step": 6818 + }, + { + "epoch": 2.0698133252390347, + "grad_norm": 0.7240217924118042, + "learning_rate": 5.863304981773998e-05, + "loss": 1.4914, + "step": 6819 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.6312853693962097, + "learning_rate": 5.86269744835966e-05, + "loss": 1.3774, + "step": 6820 + }, + { + "epoch": 2.0704203976324176, + "grad_norm": 0.8521531224250793, + "learning_rate": 5.862089914945322e-05, + "loss": 1.3638, + "step": 6821 + }, + { + "epoch": 2.0707239338291092, + "grad_norm": 0.7164038419723511, + "learning_rate": 5.861482381530985e-05, + "loss": 1.3947, + "step": 6822 + }, + { + "epoch": 2.0710274700258005, + "grad_norm": 0.7422941327095032, + "learning_rate": 5.860874848116646e-05, + "loss": 1.3239, + "step": 6823 + }, + { + "epoch": 2.071331006222492, + "grad_norm": 0.5846385955810547, + "learning_rate": 5.860267314702309e-05, + "loss": 1.539, + "step": 6824 + }, + { + "epoch": 2.0716345424191833, + "grad_norm": 0.7266324758529663, + "learning_rate": 5.859659781287972e-05, + "loss": 1.2199, + "step": 6825 + }, + { + "epoch": 2.071938078615875, + "grad_norm": 0.6877673268318176, + "learning_rate": 5.8590522478736335e-05, + "loss": 1.3777, + "step": 6826 + }, + { + "epoch": 2.072241614812566, + "grad_norm": 0.6746101379394531, + "learning_rate": 5.858444714459296e-05, + "loss": 1.7111, + "step": 6827 + }, + { + "epoch": 2.072545151009258, + "grad_norm": 0.6098113059997559, + "learning_rate": 5.8578371810449576e-05, + "loss": 1.1953, + "step": 6828 + }, + { + "epoch": 2.0728486872059495, + "grad_norm": 0.8218063116073608, + "learning_rate": 5.85722964763062e-05, + "loss": 1.067, + "step": 6829 + }, + { + "epoch": 2.0731522234026407, + "grad_norm": 0.6264614462852478, + "learning_rate": 5.8566221142162824e-05, + "loss": 0.8346, + "step": 6830 + }, + { + "epoch": 2.0734557595993324, + "grad_norm": 0.6531704068183899, + "learning_rate": 5.856014580801944e-05, + "loss": 1.1214, + "step": 6831 + }, + { + "epoch": 2.0737592957960236, + "grad_norm": 0.8438115119934082, + "learning_rate": 5.8554070473876066e-05, + "loss": 1.1834, + "step": 6832 + }, + { + "epoch": 2.0740628319927152, + "grad_norm": 0.7190865874290466, + "learning_rate": 5.854799513973269e-05, + "loss": 1.4565, + "step": 6833 + }, + { + "epoch": 2.0743663681894065, + "grad_norm": 0.7674055099487305, + "learning_rate": 5.854191980558931e-05, + "loss": 1.3615, + "step": 6834 + }, + { + "epoch": 2.074669904386098, + "grad_norm": 1.1060407161712646, + "learning_rate": 5.853584447144593e-05, + "loss": 1.2854, + "step": 6835 + }, + { + "epoch": 2.0749734405827893, + "grad_norm": 0.5805683732032776, + "learning_rate": 5.852976913730256e-05, + "loss": 1.1622, + "step": 6836 + }, + { + "epoch": 2.075276976779481, + "grad_norm": 0.53741455078125, + "learning_rate": 5.852369380315917e-05, + "loss": 1.4795, + "step": 6837 + }, + { + "epoch": 2.075580512976172, + "grad_norm": 0.8981469869613647, + "learning_rate": 5.85176184690158e-05, + "loss": 1.3667, + "step": 6838 + }, + { + "epoch": 2.075884049172864, + "grad_norm": 0.734659731388092, + "learning_rate": 5.851154313487243e-05, + "loss": 1.1876, + "step": 6839 + }, + { + "epoch": 2.0761875853695555, + "grad_norm": 0.8583666086196899, + "learning_rate": 5.850546780072904e-05, + "loss": 1.0072, + "step": 6840 + }, + { + "epoch": 2.0764911215662467, + "grad_norm": 0.7365713119506836, + "learning_rate": 5.849939246658567e-05, + "loss": 0.9634, + "step": 6841 + }, + { + "epoch": 2.0767946577629384, + "grad_norm": 0.7284424901008606, + "learning_rate": 5.849331713244228e-05, + "loss": 1.28, + "step": 6842 + }, + { + "epoch": 2.0770981939596296, + "grad_norm": 0.8206003308296204, + "learning_rate": 5.848724179829891e-05, + "loss": 1.178, + "step": 6843 + }, + { + "epoch": 2.0774017301563212, + "grad_norm": 0.6586450338363647, + "learning_rate": 5.8481166464155534e-05, + "loss": 1.6239, + "step": 6844 + }, + { + "epoch": 2.0777052663530124, + "grad_norm": 0.8370587229728699, + "learning_rate": 5.847509113001215e-05, + "loss": 1.5696, + "step": 6845 + }, + { + "epoch": 2.078008802549704, + "grad_norm": 0.715146005153656, + "learning_rate": 5.8469015795868776e-05, + "loss": 0.9131, + "step": 6846 + }, + { + "epoch": 2.0783123387463953, + "grad_norm": 0.7461423277854919, + "learning_rate": 5.84629404617254e-05, + "loss": 1.6913, + "step": 6847 + }, + { + "epoch": 2.078615874943087, + "grad_norm": 0.613091230392456, + "learning_rate": 5.845686512758202e-05, + "loss": 1.2838, + "step": 6848 + }, + { + "epoch": 2.0789194111397786, + "grad_norm": 0.8387308716773987, + "learning_rate": 5.845078979343864e-05, + "loss": 1.502, + "step": 6849 + }, + { + "epoch": 2.07922294733647, + "grad_norm": 0.804394006729126, + "learning_rate": 5.844471445929527e-05, + "loss": 1.114, + "step": 6850 + }, + { + "epoch": 2.0795264835331615, + "grad_norm": 0.5923547148704529, + "learning_rate": 5.843863912515188e-05, + "loss": 1.2775, + "step": 6851 + }, + { + "epoch": 2.0798300197298527, + "grad_norm": 0.7429361939430237, + "learning_rate": 5.8432563791008507e-05, + "loss": 1.2353, + "step": 6852 + }, + { + "epoch": 2.0801335559265444, + "grad_norm": 0.7161668539047241, + "learning_rate": 5.8426488456865124e-05, + "loss": 1.4675, + "step": 6853 + }, + { + "epoch": 2.0804370921232356, + "grad_norm": 0.7612913250923157, + "learning_rate": 5.842041312272175e-05, + "loss": 1.1781, + "step": 6854 + }, + { + "epoch": 2.0807406283199272, + "grad_norm": 0.4716074466705322, + "learning_rate": 5.841433778857838e-05, + "loss": 1.1833, + "step": 6855 + }, + { + "epoch": 2.0810441645166184, + "grad_norm": 0.673354983329773, + "learning_rate": 5.840826245443499e-05, + "loss": 1.416, + "step": 6856 + }, + { + "epoch": 2.08134770071331, + "grad_norm": 0.7451295852661133, + "learning_rate": 5.840218712029162e-05, + "loss": 1.6398, + "step": 6857 + }, + { + "epoch": 2.0816512369100013, + "grad_norm": 0.8739036321640015, + "learning_rate": 5.8396111786148244e-05, + "loss": 1.4303, + "step": 6858 + }, + { + "epoch": 2.081954773106693, + "grad_norm": 0.761298418045044, + "learning_rate": 5.839003645200486e-05, + "loss": 1.3656, + "step": 6859 + }, + { + "epoch": 2.0822583093033846, + "grad_norm": 0.822521984577179, + "learning_rate": 5.8383961117861486e-05, + "loss": 1.42, + "step": 6860 + }, + { + "epoch": 2.082561845500076, + "grad_norm": 0.8375853300094604, + "learning_rate": 5.837788578371811e-05, + "loss": 1.0434, + "step": 6861 + }, + { + "epoch": 2.0828653816967675, + "grad_norm": 0.8537359833717346, + "learning_rate": 5.837181044957473e-05, + "loss": 1.2341, + "step": 6862 + }, + { + "epoch": 2.0831689178934587, + "grad_norm": 0.7931867241859436, + "learning_rate": 5.836573511543135e-05, + "loss": 0.8769, + "step": 6863 + }, + { + "epoch": 2.0834724540901504, + "grad_norm": 0.7443602681159973, + "learning_rate": 5.8359659781287975e-05, + "loss": 1.3125, + "step": 6864 + }, + { + "epoch": 2.0837759902868416, + "grad_norm": 0.7556854486465454, + "learning_rate": 5.835358444714459e-05, + "loss": 0.9453, + "step": 6865 + }, + { + "epoch": 2.0840795264835332, + "grad_norm": 0.5914473533630371, + "learning_rate": 5.834750911300122e-05, + "loss": 1.4683, + "step": 6866 + }, + { + "epoch": 2.0843830626802244, + "grad_norm": 0.5810937285423279, + "learning_rate": 5.8341433778857834e-05, + "loss": 0.9183, + "step": 6867 + }, + { + "epoch": 2.084686598876916, + "grad_norm": 0.7326523661613464, + "learning_rate": 5.833535844471446e-05, + "loss": 1.4119, + "step": 6868 + }, + { + "epoch": 2.0849901350736078, + "grad_norm": 0.7578046321868896, + "learning_rate": 5.832928311057109e-05, + "loss": 1.354, + "step": 6869 + }, + { + "epoch": 2.085293671270299, + "grad_norm": 0.6388635635375977, + "learning_rate": 5.83232077764277e-05, + "loss": 1.2435, + "step": 6870 + }, + { + "epoch": 2.0855972074669906, + "grad_norm": 0.8143454790115356, + "learning_rate": 5.831713244228433e-05, + "loss": 1.1209, + "step": 6871 + }, + { + "epoch": 2.085900743663682, + "grad_norm": 0.7743310332298279, + "learning_rate": 5.8311057108140954e-05, + "loss": 1.0791, + "step": 6872 + }, + { + "epoch": 2.0862042798603735, + "grad_norm": 0.6745989322662354, + "learning_rate": 5.830498177399757e-05, + "loss": 1.3102, + "step": 6873 + }, + { + "epoch": 2.0865078160570647, + "grad_norm": 0.8118981122970581, + "learning_rate": 5.8298906439854196e-05, + "loss": 0.9206, + "step": 6874 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.815087080001831, + "learning_rate": 5.829283110571082e-05, + "loss": 1.0726, + "step": 6875 + }, + { + "epoch": 2.0871148884504476, + "grad_norm": 0.7686581611633301, + "learning_rate": 5.828675577156744e-05, + "loss": 1.3252, + "step": 6876 + }, + { + "epoch": 2.0874184246471392, + "grad_norm": 0.8357821106910706, + "learning_rate": 5.828068043742406e-05, + "loss": 1.3634, + "step": 6877 + }, + { + "epoch": 2.0877219608438304, + "grad_norm": 0.6431812047958374, + "learning_rate": 5.8274605103280685e-05, + "loss": 1.6529, + "step": 6878 + }, + { + "epoch": 2.088025497040522, + "grad_norm": 0.7796066403388977, + "learning_rate": 5.82685297691373e-05, + "loss": 1.3433, + "step": 6879 + }, + { + "epoch": 2.0883290332372137, + "grad_norm": 0.7999757528305054, + "learning_rate": 5.826245443499393e-05, + "loss": 1.4674, + "step": 6880 + }, + { + "epoch": 2.088632569433905, + "grad_norm": 0.7284780740737915, + "learning_rate": 5.8256379100850544e-05, + "loss": 1.6366, + "step": 6881 + }, + { + "epoch": 2.0889361056305966, + "grad_norm": 0.6166238188743591, + "learning_rate": 5.825030376670717e-05, + "loss": 1.3335, + "step": 6882 + }, + { + "epoch": 2.089239641827288, + "grad_norm": 0.6601129174232483, + "learning_rate": 5.82442284325638e-05, + "loss": 1.0488, + "step": 6883 + }, + { + "epoch": 2.0895431780239795, + "grad_norm": 0.6007164120674133, + "learning_rate": 5.823815309842041e-05, + "loss": 1.3009, + "step": 6884 + }, + { + "epoch": 2.0898467142206707, + "grad_norm": 0.6937150359153748, + "learning_rate": 5.823207776427704e-05, + "loss": 1.3778, + "step": 6885 + }, + { + "epoch": 2.0901502504173624, + "grad_norm": 0.7025666236877441, + "learning_rate": 5.8226002430133664e-05, + "loss": 1.5254, + "step": 6886 + }, + { + "epoch": 2.0904537866140536, + "grad_norm": 0.6330369114875793, + "learning_rate": 5.821992709599028e-05, + "loss": 1.0681, + "step": 6887 + }, + { + "epoch": 2.0907573228107452, + "grad_norm": 0.9064787030220032, + "learning_rate": 5.8213851761846906e-05, + "loss": 1.2772, + "step": 6888 + }, + { + "epoch": 2.0910608590074364, + "grad_norm": 0.7519260048866272, + "learning_rate": 5.820777642770353e-05, + "loss": 1.2378, + "step": 6889 + }, + { + "epoch": 2.091364395204128, + "grad_norm": 0.8459030985832214, + "learning_rate": 5.820170109356015e-05, + "loss": 1.338, + "step": 6890 + }, + { + "epoch": 2.0916679314008197, + "grad_norm": 0.6403706073760986, + "learning_rate": 5.819562575941677e-05, + "loss": 1.5272, + "step": 6891 + }, + { + "epoch": 2.091971467597511, + "grad_norm": 1.1030857563018799, + "learning_rate": 5.8189550425273395e-05, + "loss": 1.1563, + "step": 6892 + }, + { + "epoch": 2.0922750037942026, + "grad_norm": 0.5904721021652222, + "learning_rate": 5.818347509113001e-05, + "loss": 0.9227, + "step": 6893 + }, + { + "epoch": 2.092578539990894, + "grad_norm": 0.5721134543418884, + "learning_rate": 5.817739975698664e-05, + "loss": 1.4074, + "step": 6894 + }, + { + "epoch": 2.0928820761875855, + "grad_norm": 0.6387042999267578, + "learning_rate": 5.8171324422843254e-05, + "loss": 1.3688, + "step": 6895 + }, + { + "epoch": 2.0931856123842767, + "grad_norm": 0.5705280900001526, + "learning_rate": 5.816524908869988e-05, + "loss": 1.3471, + "step": 6896 + }, + { + "epoch": 2.0934891485809684, + "grad_norm": 1.1113790273666382, + "learning_rate": 5.815917375455651e-05, + "loss": 1.1356, + "step": 6897 + }, + { + "epoch": 2.0937926847776596, + "grad_norm": 0.49781277775764465, + "learning_rate": 5.815309842041312e-05, + "loss": 1.6787, + "step": 6898 + }, + { + "epoch": 2.094096220974351, + "grad_norm": 0.6792327165603638, + "learning_rate": 5.814702308626975e-05, + "loss": 1.4144, + "step": 6899 + }, + { + "epoch": 2.0943997571710424, + "grad_norm": 0.7339219450950623, + "learning_rate": 5.8140947752126374e-05, + "loss": 1.087, + "step": 6900 + }, + { + "epoch": 2.094703293367734, + "grad_norm": 0.8383113145828247, + "learning_rate": 5.813487241798299e-05, + "loss": 1.585, + "step": 6901 + }, + { + "epoch": 2.0950068295644257, + "grad_norm": 0.6781391501426697, + "learning_rate": 5.8128797083839616e-05, + "loss": 1.4833, + "step": 6902 + }, + { + "epoch": 2.095310365761117, + "grad_norm": 0.8215876817703247, + "learning_rate": 5.812272174969624e-05, + "loss": 1.2299, + "step": 6903 + }, + { + "epoch": 2.0956139019578086, + "grad_norm": 0.6117851138114929, + "learning_rate": 5.811664641555286e-05, + "loss": 0.7209, + "step": 6904 + }, + { + "epoch": 2.0959174381545, + "grad_norm": 0.7788382172584534, + "learning_rate": 5.811057108140948e-05, + "loss": 1.4464, + "step": 6905 + }, + { + "epoch": 2.0962209743511915, + "grad_norm": 0.4706348180770874, + "learning_rate": 5.8104495747266105e-05, + "loss": 0.6949, + "step": 6906 + }, + { + "epoch": 2.0965245105478827, + "grad_norm": 0.9112883806228638, + "learning_rate": 5.809842041312272e-05, + "loss": 0.9623, + "step": 6907 + }, + { + "epoch": 2.0968280467445743, + "grad_norm": 0.7183716893196106, + "learning_rate": 5.809234507897935e-05, + "loss": 1.297, + "step": 6908 + }, + { + "epoch": 2.0971315829412656, + "grad_norm": 1.03567373752594, + "learning_rate": 5.8086269744835964e-05, + "loss": 1.3522, + "step": 6909 + }, + { + "epoch": 2.097435119137957, + "grad_norm": 0.8804583549499512, + "learning_rate": 5.808019441069259e-05, + "loss": 1.5869, + "step": 6910 + }, + { + "epoch": 2.097738655334649, + "grad_norm": 0.7099947333335876, + "learning_rate": 5.807411907654922e-05, + "loss": 1.1776, + "step": 6911 + }, + { + "epoch": 2.09804219153134, + "grad_norm": 0.7495343685150146, + "learning_rate": 5.806804374240583e-05, + "loss": 1.5691, + "step": 6912 + }, + { + "epoch": 2.0983457277280317, + "grad_norm": 0.7520137429237366, + "learning_rate": 5.806196840826246e-05, + "loss": 1.1412, + "step": 6913 + }, + { + "epoch": 2.098649263924723, + "grad_norm": 0.6386212706565857, + "learning_rate": 5.8055893074119085e-05, + "loss": 1.0, + "step": 6914 + }, + { + "epoch": 2.0989528001214146, + "grad_norm": 0.5278308987617493, + "learning_rate": 5.8049817739975695e-05, + "loss": 1.3963, + "step": 6915 + }, + { + "epoch": 2.099256336318106, + "grad_norm": 0.8359299898147583, + "learning_rate": 5.8043742405832326e-05, + "loss": 1.1387, + "step": 6916 + }, + { + "epoch": 2.0995598725147975, + "grad_norm": 0.6538137197494507, + "learning_rate": 5.803766707168895e-05, + "loss": 1.0053, + "step": 6917 + }, + { + "epoch": 2.0998634087114887, + "grad_norm": 0.7379575967788696, + "learning_rate": 5.803159173754557e-05, + "loss": 1.3158, + "step": 6918 + }, + { + "epoch": 2.1001669449081803, + "grad_norm": 0.5702930092811584, + "learning_rate": 5.802551640340219e-05, + "loss": 1.3555, + "step": 6919 + }, + { + "epoch": 2.1004704811048716, + "grad_norm": 0.8485485911369324, + "learning_rate": 5.8019441069258815e-05, + "loss": 1.2317, + "step": 6920 + }, + { + "epoch": 2.100774017301563, + "grad_norm": 0.7139195203781128, + "learning_rate": 5.801336573511543e-05, + "loss": 1.2626, + "step": 6921 + }, + { + "epoch": 2.101077553498255, + "grad_norm": 0.7576490044593811, + "learning_rate": 5.800729040097206e-05, + "loss": 0.9976, + "step": 6922 + }, + { + "epoch": 2.101381089694946, + "grad_norm": 0.7416251301765442, + "learning_rate": 5.8001215066828674e-05, + "loss": 1.2989, + "step": 6923 + }, + { + "epoch": 2.1016846258916377, + "grad_norm": 0.7933403849601746, + "learning_rate": 5.79951397326853e-05, + "loss": 1.6501, + "step": 6924 + }, + { + "epoch": 2.101988162088329, + "grad_norm": 0.6887480020523071, + "learning_rate": 5.798906439854193e-05, + "loss": 1.3108, + "step": 6925 + }, + { + "epoch": 2.1022916982850206, + "grad_norm": 0.6056838631629944, + "learning_rate": 5.798298906439854e-05, + "loss": 1.5473, + "step": 6926 + }, + { + "epoch": 2.102595234481712, + "grad_norm": 0.6874396800994873, + "learning_rate": 5.7976913730255164e-05, + "loss": 1.4108, + "step": 6927 + }, + { + "epoch": 2.1028987706784035, + "grad_norm": 0.7933046817779541, + "learning_rate": 5.7970838396111795e-05, + "loss": 1.4558, + "step": 6928 + }, + { + "epoch": 2.1032023068750947, + "grad_norm": 0.6857606172561646, + "learning_rate": 5.7964763061968405e-05, + "loss": 1.0801, + "step": 6929 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.8258737325668335, + "learning_rate": 5.7958687727825036e-05, + "loss": 0.9648, + "step": 6930 + }, + { + "epoch": 2.1038093792684776, + "grad_norm": 0.9222277998924255, + "learning_rate": 5.795261239368166e-05, + "loss": 1.2347, + "step": 6931 + }, + { + "epoch": 2.104112915465169, + "grad_norm": 0.5885825157165527, + "learning_rate": 5.794653705953828e-05, + "loss": 1.5702, + "step": 6932 + }, + { + "epoch": 2.104416451661861, + "grad_norm": 0.6979884505271912, + "learning_rate": 5.79404617253949e-05, + "loss": 0.6671, + "step": 6933 + }, + { + "epoch": 2.104719987858552, + "grad_norm": 0.7170132994651794, + "learning_rate": 5.793438639125152e-05, + "loss": 1.4453, + "step": 6934 + }, + { + "epoch": 2.1050235240552437, + "grad_norm": 0.6434389352798462, + "learning_rate": 5.792831105710814e-05, + "loss": 0.9343, + "step": 6935 + }, + { + "epoch": 2.105327060251935, + "grad_norm": 0.8252448439598083, + "learning_rate": 5.792223572296477e-05, + "loss": 1.335, + "step": 6936 + }, + { + "epoch": 2.1056305964486266, + "grad_norm": 0.7473122477531433, + "learning_rate": 5.7916160388821384e-05, + "loss": 1.5318, + "step": 6937 + }, + { + "epoch": 2.105934132645318, + "grad_norm": 0.7320314049720764, + "learning_rate": 5.791008505467801e-05, + "loss": 1.2928, + "step": 6938 + }, + { + "epoch": 2.1062376688420095, + "grad_norm": 0.5336587429046631, + "learning_rate": 5.790400972053463e-05, + "loss": 0.9455, + "step": 6939 + }, + { + "epoch": 2.1065412050387007, + "grad_norm": 0.7179783582687378, + "learning_rate": 5.789793438639125e-05, + "loss": 1.2575, + "step": 6940 + }, + { + "epoch": 2.1068447412353923, + "grad_norm": 0.8911033868789673, + "learning_rate": 5.7891859052247874e-05, + "loss": 1.3474, + "step": 6941 + }, + { + "epoch": 2.107148277432084, + "grad_norm": 0.8055987358093262, + "learning_rate": 5.7885783718104505e-05, + "loss": 1.1506, + "step": 6942 + }, + { + "epoch": 2.107451813628775, + "grad_norm": 0.8984194397926331, + "learning_rate": 5.7879708383961115e-05, + "loss": 1.0726, + "step": 6943 + }, + { + "epoch": 2.107755349825467, + "grad_norm": 0.6842384338378906, + "learning_rate": 5.7873633049817746e-05, + "loss": 1.002, + "step": 6944 + }, + { + "epoch": 2.108058886022158, + "grad_norm": 0.5936232805252075, + "learning_rate": 5.786755771567437e-05, + "loss": 1.3044, + "step": 6945 + }, + { + "epoch": 2.1083624222188497, + "grad_norm": 0.6418696641921997, + "learning_rate": 5.786148238153099e-05, + "loss": 1.3182, + "step": 6946 + }, + { + "epoch": 2.108665958415541, + "grad_norm": 0.5525402426719666, + "learning_rate": 5.785540704738761e-05, + "loss": 0.6605, + "step": 6947 + }, + { + "epoch": 2.1089694946122326, + "grad_norm": 0.9133402705192566, + "learning_rate": 5.784933171324423e-05, + "loss": 1.1272, + "step": 6948 + }, + { + "epoch": 2.109273030808924, + "grad_norm": 0.8316843509674072, + "learning_rate": 5.784325637910085e-05, + "loss": 1.0813, + "step": 6949 + }, + { + "epoch": 2.1095765670056155, + "grad_norm": 0.6806902289390564, + "learning_rate": 5.783718104495748e-05, + "loss": 1.1012, + "step": 6950 + }, + { + "epoch": 2.1098801032023067, + "grad_norm": 0.6396098136901855, + "learning_rate": 5.7831105710814094e-05, + "loss": 1.5173, + "step": 6951 + }, + { + "epoch": 2.1101836393989983, + "grad_norm": 0.5806583762168884, + "learning_rate": 5.782503037667072e-05, + "loss": 1.0212, + "step": 6952 + }, + { + "epoch": 2.11048717559569, + "grad_norm": 0.6874980330467224, + "learning_rate": 5.781895504252734e-05, + "loss": 1.5176, + "step": 6953 + }, + { + "epoch": 2.110790711792381, + "grad_norm": 0.7226924300193787, + "learning_rate": 5.781287970838396e-05, + "loss": 1.476, + "step": 6954 + }, + { + "epoch": 2.111094247989073, + "grad_norm": 0.7012338638305664, + "learning_rate": 5.7806804374240584e-05, + "loss": 1.2147, + "step": 6955 + }, + { + "epoch": 2.111397784185764, + "grad_norm": 1.2091950178146362, + "learning_rate": 5.7800729040097215e-05, + "loss": 0.9889, + "step": 6956 + }, + { + "epoch": 2.1117013203824557, + "grad_norm": 0.8179687261581421, + "learning_rate": 5.7794653705953825e-05, + "loss": 1.3606, + "step": 6957 + }, + { + "epoch": 2.112004856579147, + "grad_norm": 0.7715298533439636, + "learning_rate": 5.7788578371810456e-05, + "loss": 1.1802, + "step": 6958 + }, + { + "epoch": 2.1123083927758386, + "grad_norm": 0.7295283675193787, + "learning_rate": 5.778250303766708e-05, + "loss": 1.2638, + "step": 6959 + }, + { + "epoch": 2.11261192897253, + "grad_norm": 0.7757214307785034, + "learning_rate": 5.77764277035237e-05, + "loss": 1.3599, + "step": 6960 + }, + { + "epoch": 2.1129154651692215, + "grad_norm": 0.6359571814537048, + "learning_rate": 5.777035236938032e-05, + "loss": 1.2531, + "step": 6961 + }, + { + "epoch": 2.1132190013659127, + "grad_norm": 0.8131042122840881, + "learning_rate": 5.776427703523694e-05, + "loss": 1.5738, + "step": 6962 + }, + { + "epoch": 2.1135225375626043, + "grad_norm": 0.6301864981651306, + "learning_rate": 5.775820170109356e-05, + "loss": 1.4534, + "step": 6963 + }, + { + "epoch": 2.113826073759296, + "grad_norm": 0.569684624671936, + "learning_rate": 5.775212636695019e-05, + "loss": 1.0992, + "step": 6964 + }, + { + "epoch": 2.114129609955987, + "grad_norm": 0.5382822155952454, + "learning_rate": 5.7746051032806804e-05, + "loss": 1.0529, + "step": 6965 + }, + { + "epoch": 2.114433146152679, + "grad_norm": 0.6871811747550964, + "learning_rate": 5.773997569866343e-05, + "loss": 1.4066, + "step": 6966 + }, + { + "epoch": 2.11473668234937, + "grad_norm": 0.7224509119987488, + "learning_rate": 5.773390036452005e-05, + "loss": 1.3342, + "step": 6967 + }, + { + "epoch": 2.1150402185460617, + "grad_norm": 0.5536501407623291, + "learning_rate": 5.772782503037667e-05, + "loss": 1.6142, + "step": 6968 + }, + { + "epoch": 2.115343754742753, + "grad_norm": 0.7293581962585449, + "learning_rate": 5.7721749696233294e-05, + "loss": 1.2867, + "step": 6969 + }, + { + "epoch": 2.1156472909394446, + "grad_norm": 0.7771735787391663, + "learning_rate": 5.7715674362089925e-05, + "loss": 1.3612, + "step": 6970 + }, + { + "epoch": 2.115950827136136, + "grad_norm": 0.7306496500968933, + "learning_rate": 5.7709599027946535e-05, + "loss": 1.4965, + "step": 6971 + }, + { + "epoch": 2.1162543633328275, + "grad_norm": 0.7789996266365051, + "learning_rate": 5.7703523693803166e-05, + "loss": 0.8977, + "step": 6972 + }, + { + "epoch": 2.1165578995295187, + "grad_norm": 0.7751117944717407, + "learning_rate": 5.769744835965979e-05, + "loss": 1.3932, + "step": 6973 + }, + { + "epoch": 2.1168614357262103, + "grad_norm": 0.7575637102127075, + "learning_rate": 5.769137302551641e-05, + "loss": 1.5727, + "step": 6974 + }, + { + "epoch": 2.117164971922902, + "grad_norm": 0.7688522338867188, + "learning_rate": 5.768529769137303e-05, + "loss": 1.458, + "step": 6975 + }, + { + "epoch": 2.117468508119593, + "grad_norm": 0.6894252896308899, + "learning_rate": 5.767922235722964e-05, + "loss": 1.4373, + "step": 6976 + }, + { + "epoch": 2.117772044316285, + "grad_norm": 0.8942421674728394, + "learning_rate": 5.767314702308627e-05, + "loss": 1.3635, + "step": 6977 + }, + { + "epoch": 2.118075580512976, + "grad_norm": 0.8553876280784607, + "learning_rate": 5.76670716889429e-05, + "loss": 1.121, + "step": 6978 + }, + { + "epoch": 2.1183791167096677, + "grad_norm": 0.7508211135864258, + "learning_rate": 5.7660996354799514e-05, + "loss": 1.0187, + "step": 6979 + }, + { + "epoch": 2.118682652906359, + "grad_norm": 1.0324530601501465, + "learning_rate": 5.765492102065614e-05, + "loss": 1.1299, + "step": 6980 + }, + { + "epoch": 2.1189861891030506, + "grad_norm": 0.5394060611724854, + "learning_rate": 5.764884568651276e-05, + "loss": 0.9301, + "step": 6981 + }, + { + "epoch": 2.119289725299742, + "grad_norm": 0.6748804450035095, + "learning_rate": 5.764277035236938e-05, + "loss": 1.3764, + "step": 6982 + }, + { + "epoch": 2.1195932614964335, + "grad_norm": 0.90252286195755, + "learning_rate": 5.7636695018226004e-05, + "loss": 1.3735, + "step": 6983 + }, + { + "epoch": 2.119896797693125, + "grad_norm": 0.8094146847724915, + "learning_rate": 5.7630619684082635e-05, + "loss": 0.8956, + "step": 6984 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.6555180549621582, + "learning_rate": 5.7624544349939245e-05, + "loss": 0.8515, + "step": 6985 + }, + { + "epoch": 2.120503870086508, + "grad_norm": 0.7240637540817261, + "learning_rate": 5.7618469015795876e-05, + "loss": 1.4879, + "step": 6986 + }, + { + "epoch": 2.120807406283199, + "grad_norm": 0.7705114483833313, + "learning_rate": 5.76123936816525e-05, + "loss": 1.2672, + "step": 6987 + }, + { + "epoch": 2.121110942479891, + "grad_norm": 0.8182518482208252, + "learning_rate": 5.760631834750911e-05, + "loss": 1.4521, + "step": 6988 + }, + { + "epoch": 2.121414478676582, + "grad_norm": 0.6841187477111816, + "learning_rate": 5.760024301336574e-05, + "loss": 0.9986, + "step": 6989 + }, + { + "epoch": 2.1217180148732737, + "grad_norm": 0.7378225326538086, + "learning_rate": 5.759416767922235e-05, + "loss": 0.316, + "step": 6990 + }, + { + "epoch": 2.122021551069965, + "grad_norm": 0.5745599865913391, + "learning_rate": 5.758809234507898e-05, + "loss": 1.6092, + "step": 6991 + }, + { + "epoch": 2.1223250872666566, + "grad_norm": 0.8312602043151855, + "learning_rate": 5.758201701093561e-05, + "loss": 1.1245, + "step": 6992 + }, + { + "epoch": 2.122628623463348, + "grad_norm": 0.7721714377403259, + "learning_rate": 5.7575941676792224e-05, + "loss": 1.0455, + "step": 6993 + }, + { + "epoch": 2.1229321596600395, + "grad_norm": 0.7318625450134277, + "learning_rate": 5.756986634264885e-05, + "loss": 1.5849, + "step": 6994 + }, + { + "epoch": 2.123235695856731, + "grad_norm": 1.0568405389785767, + "learning_rate": 5.756379100850547e-05, + "loss": 1.1597, + "step": 6995 + }, + { + "epoch": 2.1235392320534223, + "grad_norm": 0.5453389883041382, + "learning_rate": 5.755771567436209e-05, + "loss": 1.7677, + "step": 6996 + }, + { + "epoch": 2.123842768250114, + "grad_norm": 0.8192586302757263, + "learning_rate": 5.7551640340218714e-05, + "loss": 1.3186, + "step": 6997 + }, + { + "epoch": 2.124146304446805, + "grad_norm": 0.836601197719574, + "learning_rate": 5.7545565006075345e-05, + "loss": 1.0741, + "step": 6998 + }, + { + "epoch": 2.124449840643497, + "grad_norm": 0.6476053595542908, + "learning_rate": 5.7539489671931955e-05, + "loss": 1.2964, + "step": 6999 + }, + { + "epoch": 2.124753376840188, + "grad_norm": 1.121575117111206, + "learning_rate": 5.753341433778858e-05, + "loss": 0.9233, + "step": 7000 + }, + { + "epoch": 2.1250569130368797, + "grad_norm": 0.8215871453285217, + "learning_rate": 5.752733900364521e-05, + "loss": 1.1041, + "step": 7001 + }, + { + "epoch": 2.125360449233571, + "grad_norm": 0.8727826476097107, + "learning_rate": 5.752126366950182e-05, + "loss": 1.1298, + "step": 7002 + }, + { + "epoch": 2.1256639854302626, + "grad_norm": 0.8024986386299133, + "learning_rate": 5.751518833535845e-05, + "loss": 1.078, + "step": 7003 + }, + { + "epoch": 2.1259675216269542, + "grad_norm": 0.6803773641586304, + "learning_rate": 5.750911300121506e-05, + "loss": 1.2013, + "step": 7004 + }, + { + "epoch": 2.1262710578236454, + "grad_norm": 0.7451168298721313, + "learning_rate": 5.750303766707169e-05, + "loss": 1.1364, + "step": 7005 + }, + { + "epoch": 2.126574594020337, + "grad_norm": 0.5670786499977112, + "learning_rate": 5.749696233292832e-05, + "loss": 1.0867, + "step": 7006 + }, + { + "epoch": 2.1268781302170283, + "grad_norm": 0.7828416228294373, + "learning_rate": 5.7490886998784934e-05, + "loss": 1.0044, + "step": 7007 + }, + { + "epoch": 2.12718166641372, + "grad_norm": 0.6469855308532715, + "learning_rate": 5.748481166464156e-05, + "loss": 1.5497, + "step": 7008 + }, + { + "epoch": 2.127485202610411, + "grad_norm": 0.6535457372665405, + "learning_rate": 5.747873633049818e-05, + "loss": 1.5059, + "step": 7009 + }, + { + "epoch": 2.127788738807103, + "grad_norm": 0.5588821768760681, + "learning_rate": 5.74726609963548e-05, + "loss": 1.2033, + "step": 7010 + }, + { + "epoch": 2.128092275003794, + "grad_norm": 0.8629704713821411, + "learning_rate": 5.7466585662211424e-05, + "loss": 1.2453, + "step": 7011 + }, + { + "epoch": 2.1283958112004857, + "grad_norm": 0.6546429991722107, + "learning_rate": 5.746051032806805e-05, + "loss": 1.6456, + "step": 7012 + }, + { + "epoch": 2.128699347397177, + "grad_norm": 0.7082663178443909, + "learning_rate": 5.7454434993924665e-05, + "loss": 1.4852, + "step": 7013 + }, + { + "epoch": 2.1290028835938686, + "grad_norm": 0.8340616226196289, + "learning_rate": 5.744835965978129e-05, + "loss": 1.3837, + "step": 7014 + }, + { + "epoch": 2.1293064197905602, + "grad_norm": 0.6598572731018066, + "learning_rate": 5.744228432563791e-05, + "loss": 1.2429, + "step": 7015 + }, + { + "epoch": 2.1296099559872514, + "grad_norm": 0.91293865442276, + "learning_rate": 5.743620899149453e-05, + "loss": 1.1037, + "step": 7016 + }, + { + "epoch": 2.129913492183943, + "grad_norm": 0.83259117603302, + "learning_rate": 5.743013365735116e-05, + "loss": 1.2325, + "step": 7017 + }, + { + "epoch": 2.1302170283806343, + "grad_norm": 0.8073476552963257, + "learning_rate": 5.742405832320777e-05, + "loss": 1.3386, + "step": 7018 + }, + { + "epoch": 2.130520564577326, + "grad_norm": 0.6302822232246399, + "learning_rate": 5.74179829890644e-05, + "loss": 0.7663, + "step": 7019 + }, + { + "epoch": 2.130824100774017, + "grad_norm": 0.5466523766517639, + "learning_rate": 5.741190765492103e-05, + "loss": 1.1305, + "step": 7020 + }, + { + "epoch": 2.131127636970709, + "grad_norm": 0.6615848541259766, + "learning_rate": 5.7405832320777645e-05, + "loss": 1.6637, + "step": 7021 + }, + { + "epoch": 2.1314311731674, + "grad_norm": 0.7762163877487183, + "learning_rate": 5.739975698663427e-05, + "loss": 1.4986, + "step": 7022 + }, + { + "epoch": 2.1317347093640917, + "grad_norm": 0.7263661026954651, + "learning_rate": 5.739368165249089e-05, + "loss": 1.1411, + "step": 7023 + }, + { + "epoch": 2.132038245560783, + "grad_norm": 0.6416277885437012, + "learning_rate": 5.738760631834751e-05, + "loss": 0.8715, + "step": 7024 + }, + { + "epoch": 2.1323417817574746, + "grad_norm": 0.5931896567344666, + "learning_rate": 5.7381530984204134e-05, + "loss": 1.0077, + "step": 7025 + }, + { + "epoch": 2.1326453179541662, + "grad_norm": 0.715061604976654, + "learning_rate": 5.737545565006076e-05, + "loss": 1.2576, + "step": 7026 + }, + { + "epoch": 2.1329488541508574, + "grad_norm": 0.7923848628997803, + "learning_rate": 5.7369380315917375e-05, + "loss": 1.4421, + "step": 7027 + }, + { + "epoch": 2.133252390347549, + "grad_norm": 0.7267158627510071, + "learning_rate": 5.7363304981774e-05, + "loss": 1.1591, + "step": 7028 + }, + { + "epoch": 2.1335559265442403, + "grad_norm": 0.7481387257575989, + "learning_rate": 5.735722964763062e-05, + "loss": 1.2607, + "step": 7029 + }, + { + "epoch": 2.133859462740932, + "grad_norm": 0.7214675545692444, + "learning_rate": 5.735115431348724e-05, + "loss": 0.7526, + "step": 7030 + }, + { + "epoch": 2.134162998937623, + "grad_norm": 0.6436885595321655, + "learning_rate": 5.734507897934387e-05, + "loss": 1.3025, + "step": 7031 + }, + { + "epoch": 2.134466535134315, + "grad_norm": 0.7761930227279663, + "learning_rate": 5.733900364520048e-05, + "loss": 1.4619, + "step": 7032 + }, + { + "epoch": 2.134770071331006, + "grad_norm": 0.6738321781158447, + "learning_rate": 5.733292831105711e-05, + "loss": 1.5857, + "step": 7033 + }, + { + "epoch": 2.1350736075276977, + "grad_norm": 0.8512619137763977, + "learning_rate": 5.732685297691374e-05, + "loss": 0.805, + "step": 7034 + }, + { + "epoch": 2.135377143724389, + "grad_norm": 0.6589305996894836, + "learning_rate": 5.7320777642770355e-05, + "loss": 1.4291, + "step": 7035 + }, + { + "epoch": 2.1356806799210806, + "grad_norm": 0.8158634901046753, + "learning_rate": 5.731470230862698e-05, + "loss": 1.1064, + "step": 7036 + }, + { + "epoch": 2.1359842161177722, + "grad_norm": 0.6389902830123901, + "learning_rate": 5.73086269744836e-05, + "loss": 1.0643, + "step": 7037 + }, + { + "epoch": 2.1362877523144634, + "grad_norm": 0.6268038749694824, + "learning_rate": 5.730255164034022e-05, + "loss": 1.4805, + "step": 7038 + }, + { + "epoch": 2.136591288511155, + "grad_norm": 0.8576473593711853, + "learning_rate": 5.7296476306196844e-05, + "loss": 1.165, + "step": 7039 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.7211112976074219, + "learning_rate": 5.729040097205347e-05, + "loss": 1.1561, + "step": 7040 + }, + { + "epoch": 2.137198360904538, + "grad_norm": 0.6866903901100159, + "learning_rate": 5.7284325637910086e-05, + "loss": 1.4048, + "step": 7041 + }, + { + "epoch": 2.137501897101229, + "grad_norm": 0.6264458894729614, + "learning_rate": 5.727825030376671e-05, + "loss": 0.8988, + "step": 7042 + }, + { + "epoch": 2.137805433297921, + "grad_norm": 0.7150564193725586, + "learning_rate": 5.727217496962333e-05, + "loss": 1.4186, + "step": 7043 + }, + { + "epoch": 2.138108969494612, + "grad_norm": 0.82100510597229, + "learning_rate": 5.726609963547995e-05, + "loss": 1.1084, + "step": 7044 + }, + { + "epoch": 2.1384125056913037, + "grad_norm": 0.5969480276107788, + "learning_rate": 5.726002430133658e-05, + "loss": 1.1065, + "step": 7045 + }, + { + "epoch": 2.138716041887995, + "grad_norm": 0.7427048087120056, + "learning_rate": 5.725394896719319e-05, + "loss": 1.0831, + "step": 7046 + }, + { + "epoch": 2.1390195780846866, + "grad_norm": 0.6148872375488281, + "learning_rate": 5.724787363304982e-05, + "loss": 1.2953, + "step": 7047 + }, + { + "epoch": 2.1393231142813782, + "grad_norm": 0.7812492251396179, + "learning_rate": 5.724179829890645e-05, + "loss": 1.3676, + "step": 7048 + }, + { + "epoch": 2.1396266504780694, + "grad_norm": 0.7223173379898071, + "learning_rate": 5.723572296476306e-05, + "loss": 0.7977, + "step": 7049 + }, + { + "epoch": 2.139930186674761, + "grad_norm": 0.6859977841377258, + "learning_rate": 5.722964763061969e-05, + "loss": 1.053, + "step": 7050 + }, + { + "epoch": 2.1402337228714523, + "grad_norm": 0.7460084557533264, + "learning_rate": 5.722357229647631e-05, + "loss": 1.2215, + "step": 7051 + }, + { + "epoch": 2.140537259068144, + "grad_norm": 0.7167086601257324, + "learning_rate": 5.721749696233293e-05, + "loss": 1.622, + "step": 7052 + }, + { + "epoch": 2.140840795264835, + "grad_norm": 0.7320702075958252, + "learning_rate": 5.7211421628189554e-05, + "loss": 1.0462, + "step": 7053 + }, + { + "epoch": 2.141144331461527, + "grad_norm": 0.6613295078277588, + "learning_rate": 5.720534629404618e-05, + "loss": 1.0691, + "step": 7054 + }, + { + "epoch": 2.141447867658218, + "grad_norm": 0.843561589717865, + "learning_rate": 5.7199270959902796e-05, + "loss": 1.3478, + "step": 7055 + }, + { + "epoch": 2.1417514038549097, + "grad_norm": 0.5995355248451233, + "learning_rate": 5.719319562575942e-05, + "loss": 1.2636, + "step": 7056 + }, + { + "epoch": 2.1420549400516014, + "grad_norm": 0.8956936597824097, + "learning_rate": 5.718712029161604e-05, + "loss": 0.8077, + "step": 7057 + }, + { + "epoch": 2.1423584762482926, + "grad_norm": 0.8197022676467896, + "learning_rate": 5.718104495747266e-05, + "loss": 1.258, + "step": 7058 + }, + { + "epoch": 2.142662012444984, + "grad_norm": 0.7216955423355103, + "learning_rate": 5.717496962332929e-05, + "loss": 1.5475, + "step": 7059 + }, + { + "epoch": 2.1429655486416754, + "grad_norm": 0.7288816571235657, + "learning_rate": 5.71688942891859e-05, + "loss": 1.1879, + "step": 7060 + }, + { + "epoch": 2.143269084838367, + "grad_norm": 0.7779216766357422, + "learning_rate": 5.7162818955042527e-05, + "loss": 1.3337, + "step": 7061 + }, + { + "epoch": 2.1435726210350583, + "grad_norm": 0.5709745287895203, + "learning_rate": 5.715674362089916e-05, + "loss": 1.1431, + "step": 7062 + }, + { + "epoch": 2.14387615723175, + "grad_norm": 0.7075303792953491, + "learning_rate": 5.715066828675577e-05, + "loss": 1.4308, + "step": 7063 + }, + { + "epoch": 2.144179693428441, + "grad_norm": 0.7079590559005737, + "learning_rate": 5.71445929526124e-05, + "loss": 1.4299, + "step": 7064 + }, + { + "epoch": 2.144483229625133, + "grad_norm": 0.9902227520942688, + "learning_rate": 5.713851761846902e-05, + "loss": 1.2042, + "step": 7065 + }, + { + "epoch": 2.1447867658218245, + "grad_norm": 1.1577403545379639, + "learning_rate": 5.713244228432564e-05, + "loss": 1.3816, + "step": 7066 + }, + { + "epoch": 2.1450903020185157, + "grad_norm": 0.8484241962432861, + "learning_rate": 5.7126366950182264e-05, + "loss": 1.3148, + "step": 7067 + }, + { + "epoch": 2.1453938382152073, + "grad_norm": 0.8071272969245911, + "learning_rate": 5.712029161603889e-05, + "loss": 1.4303, + "step": 7068 + }, + { + "epoch": 2.1456973744118986, + "grad_norm": 0.7060108184814453, + "learning_rate": 5.7114216281895506e-05, + "loss": 1.4284, + "step": 7069 + }, + { + "epoch": 2.14600091060859, + "grad_norm": 0.6524072289466858, + "learning_rate": 5.710814094775213e-05, + "loss": 1.6973, + "step": 7070 + }, + { + "epoch": 2.1463044468052814, + "grad_norm": 0.8758344650268555, + "learning_rate": 5.710206561360875e-05, + "loss": 1.2572, + "step": 7071 + }, + { + "epoch": 2.146607983001973, + "grad_norm": 1.1003342866897583, + "learning_rate": 5.709599027946537e-05, + "loss": 0.5954, + "step": 7072 + }, + { + "epoch": 2.1469115191986643, + "grad_norm": 0.6540477871894836, + "learning_rate": 5.7089914945321995e-05, + "loss": 1.3184, + "step": 7073 + }, + { + "epoch": 2.147215055395356, + "grad_norm": 0.5550004243850708, + "learning_rate": 5.708383961117861e-05, + "loss": 1.5281, + "step": 7074 + }, + { + "epoch": 2.147518591592047, + "grad_norm": 0.6814255714416504, + "learning_rate": 5.7077764277035237e-05, + "loss": 1.3497, + "step": 7075 + }, + { + "epoch": 2.147822127788739, + "grad_norm": 0.6634551882743835, + "learning_rate": 5.707168894289187e-05, + "loss": 0.8038, + "step": 7076 + }, + { + "epoch": 2.1481256639854305, + "grad_norm": 0.6326273083686829, + "learning_rate": 5.706561360874848e-05, + "loss": 1.3639, + "step": 7077 + }, + { + "epoch": 2.1484292001821217, + "grad_norm": 0.4899923503398895, + "learning_rate": 5.705953827460511e-05, + "loss": 1.2843, + "step": 7078 + }, + { + "epoch": 2.1487327363788133, + "grad_norm": 0.6824162602424622, + "learning_rate": 5.705346294046173e-05, + "loss": 1.3331, + "step": 7079 + }, + { + "epoch": 2.1490362725755046, + "grad_norm": 0.6425466537475586, + "learning_rate": 5.704738760631835e-05, + "loss": 1.1552, + "step": 7080 + }, + { + "epoch": 2.149339808772196, + "grad_norm": 1.1080219745635986, + "learning_rate": 5.7041312272174974e-05, + "loss": 1.1281, + "step": 7081 + }, + { + "epoch": 2.1496433449688874, + "grad_norm": 0.61561518907547, + "learning_rate": 5.70352369380316e-05, + "loss": 1.5421, + "step": 7082 + }, + { + "epoch": 2.149946881165579, + "grad_norm": 0.6175718903541565, + "learning_rate": 5.7029161603888216e-05, + "loss": 1.0581, + "step": 7083 + }, + { + "epoch": 2.1502504173622703, + "grad_norm": 0.5754169821739197, + "learning_rate": 5.702308626974484e-05, + "loss": 1.3451, + "step": 7084 + }, + { + "epoch": 2.150553953558962, + "grad_norm": 0.6784567832946777, + "learning_rate": 5.701701093560146e-05, + "loss": 1.6735, + "step": 7085 + }, + { + "epoch": 2.150857489755653, + "grad_norm": 0.7900217771530151, + "learning_rate": 5.701093560145808e-05, + "loss": 1.2737, + "step": 7086 + }, + { + "epoch": 2.151161025952345, + "grad_norm": 0.64167720079422, + "learning_rate": 5.7004860267314705e-05, + "loss": 1.2182, + "step": 7087 + }, + { + "epoch": 2.1514645621490365, + "grad_norm": 0.7034931778907776, + "learning_rate": 5.699878493317132e-05, + "loss": 1.2093, + "step": 7088 + }, + { + "epoch": 2.1517680983457277, + "grad_norm": 0.6175836324691772, + "learning_rate": 5.699270959902795e-05, + "loss": 1.5437, + "step": 7089 + }, + { + "epoch": 2.1520716345424193, + "grad_norm": 0.6727813482284546, + "learning_rate": 5.698663426488458e-05, + "loss": 1.3068, + "step": 7090 + }, + { + "epoch": 2.1523751707391106, + "grad_norm": 0.754546582698822, + "learning_rate": 5.698055893074119e-05, + "loss": 1.4687, + "step": 7091 + }, + { + "epoch": 2.152678706935802, + "grad_norm": 0.7307665348052979, + "learning_rate": 5.697448359659782e-05, + "loss": 1.0387, + "step": 7092 + }, + { + "epoch": 2.1529822431324934, + "grad_norm": 0.7365608215332031, + "learning_rate": 5.696840826245444e-05, + "loss": 0.857, + "step": 7093 + }, + { + "epoch": 2.153285779329185, + "grad_norm": 0.6369282007217407, + "learning_rate": 5.696233292831106e-05, + "loss": 1.3908, + "step": 7094 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 1.0504459142684937, + "learning_rate": 5.6956257594167684e-05, + "loss": 1.503, + "step": 7095 + }, + { + "epoch": 2.153892851722568, + "grad_norm": 0.5821953415870667, + "learning_rate": 5.695018226002431e-05, + "loss": 1.1134, + "step": 7096 + }, + { + "epoch": 2.154196387919259, + "grad_norm": 0.7673921585083008, + "learning_rate": 5.6944106925880926e-05, + "loss": 1.1303, + "step": 7097 + }, + { + "epoch": 2.154499924115951, + "grad_norm": 0.7406718730926514, + "learning_rate": 5.693803159173755e-05, + "loss": 1.6383, + "step": 7098 + }, + { + "epoch": 2.1548034603126425, + "grad_norm": 0.6577976942062378, + "learning_rate": 5.693195625759417e-05, + "loss": 1.2918, + "step": 7099 + }, + { + "epoch": 2.1551069965093337, + "grad_norm": 0.7887044548988342, + "learning_rate": 5.692588092345079e-05, + "loss": 1.0772, + "step": 7100 + }, + { + "epoch": 2.1554105327060253, + "grad_norm": 0.7450350522994995, + "learning_rate": 5.6919805589307415e-05, + "loss": 1.4461, + "step": 7101 + }, + { + "epoch": 2.1557140689027166, + "grad_norm": 0.5293591022491455, + "learning_rate": 5.691373025516403e-05, + "loss": 1.7682, + "step": 7102 + }, + { + "epoch": 2.156017605099408, + "grad_norm": 0.9171348214149475, + "learning_rate": 5.690765492102066e-05, + "loss": 1.2295, + "step": 7103 + }, + { + "epoch": 2.1563211412960994, + "grad_norm": 0.6998059153556824, + "learning_rate": 5.690157958687729e-05, + "loss": 1.2531, + "step": 7104 + }, + { + "epoch": 2.156624677492791, + "grad_norm": 0.7437159419059753, + "learning_rate": 5.68955042527339e-05, + "loss": 1.3937, + "step": 7105 + }, + { + "epoch": 2.1569282136894823, + "grad_norm": 0.8054535388946533, + "learning_rate": 5.688942891859053e-05, + "loss": 1.0773, + "step": 7106 + }, + { + "epoch": 2.157231749886174, + "grad_norm": 0.7918900847434998, + "learning_rate": 5.688335358444715e-05, + "loss": 1.3796, + "step": 7107 + }, + { + "epoch": 2.157535286082865, + "grad_norm": 0.64754319190979, + "learning_rate": 5.687727825030377e-05, + "loss": 1.0733, + "step": 7108 + }, + { + "epoch": 2.157838822279557, + "grad_norm": 0.6509382128715515, + "learning_rate": 5.6871202916160394e-05, + "loss": 0.5026, + "step": 7109 + }, + { + "epoch": 2.1581423584762485, + "grad_norm": 0.5929359793663025, + "learning_rate": 5.6865127582017005e-05, + "loss": 0.8078, + "step": 7110 + }, + { + "epoch": 2.1584458946729397, + "grad_norm": 0.9011654853820801, + "learning_rate": 5.6859052247873636e-05, + "loss": 1.1668, + "step": 7111 + }, + { + "epoch": 2.1587494308696313, + "grad_norm": 1.113480567932129, + "learning_rate": 5.685297691373026e-05, + "loss": 0.9573, + "step": 7112 + }, + { + "epoch": 2.1590529670663225, + "grad_norm": 0.7953112125396729, + "learning_rate": 5.684690157958688e-05, + "loss": 1.0673, + "step": 7113 + }, + { + "epoch": 2.159356503263014, + "grad_norm": 0.8165501952171326, + "learning_rate": 5.68408262454435e-05, + "loss": 1.4202, + "step": 7114 + }, + { + "epoch": 2.1596600394597054, + "grad_norm": 0.8039828538894653, + "learning_rate": 5.6834750911300125e-05, + "loss": 1.5351, + "step": 7115 + }, + { + "epoch": 2.159963575656397, + "grad_norm": 0.6484586596488953, + "learning_rate": 5.682867557715674e-05, + "loss": 1.495, + "step": 7116 + }, + { + "epoch": 2.1602671118530883, + "grad_norm": 0.859879195690155, + "learning_rate": 5.682260024301337e-05, + "loss": 1.3554, + "step": 7117 + }, + { + "epoch": 2.16057064804978, + "grad_norm": 0.6827611923217773, + "learning_rate": 5.681652490887e-05, + "loss": 1.3924, + "step": 7118 + }, + { + "epoch": 2.1608741842464716, + "grad_norm": 0.6194184422492981, + "learning_rate": 5.681044957472661e-05, + "loss": 1.3563, + "step": 7119 + }, + { + "epoch": 2.161177720443163, + "grad_norm": 0.884806215763092, + "learning_rate": 5.680437424058324e-05, + "loss": 1.2661, + "step": 7120 + }, + { + "epoch": 2.1614812566398545, + "grad_norm": 0.8571776151657104, + "learning_rate": 5.679829890643986e-05, + "loss": 1.3268, + "step": 7121 + }, + { + "epoch": 2.1617847928365457, + "grad_norm": 0.8481334447860718, + "learning_rate": 5.6792223572296474e-05, + "loss": 1.5147, + "step": 7122 + }, + { + "epoch": 2.1620883290332373, + "grad_norm": 0.6749445796012878, + "learning_rate": 5.6786148238153104e-05, + "loss": 0.5943, + "step": 7123 + }, + { + "epoch": 2.1623918652299285, + "grad_norm": 0.7724722623825073, + "learning_rate": 5.6780072904009715e-05, + "loss": 1.6409, + "step": 7124 + }, + { + "epoch": 2.16269540142662, + "grad_norm": 0.6369667053222656, + "learning_rate": 5.6773997569866346e-05, + "loss": 1.72, + "step": 7125 + }, + { + "epoch": 2.1629989376233114, + "grad_norm": 0.7980279326438904, + "learning_rate": 5.676792223572297e-05, + "loss": 1.3303, + "step": 7126 + }, + { + "epoch": 2.163302473820003, + "grad_norm": 0.6585915684700012, + "learning_rate": 5.676184690157959e-05, + "loss": 1.5053, + "step": 7127 + }, + { + "epoch": 2.1636060100166947, + "grad_norm": 0.7717660069465637, + "learning_rate": 5.675577156743621e-05, + "loss": 1.1307, + "step": 7128 + }, + { + "epoch": 2.163909546213386, + "grad_norm": 0.6564520001411438, + "learning_rate": 5.6749696233292835e-05, + "loss": 1.2847, + "step": 7129 + }, + { + "epoch": 2.1642130824100776, + "grad_norm": 0.7321675419807434, + "learning_rate": 5.674362089914945e-05, + "loss": 1.3272, + "step": 7130 + }, + { + "epoch": 2.164516618606769, + "grad_norm": 0.7516568303108215, + "learning_rate": 5.673754556500608e-05, + "loss": 1.3297, + "step": 7131 + }, + { + "epoch": 2.1648201548034605, + "grad_norm": 0.7043347954750061, + "learning_rate": 5.673147023086271e-05, + "loss": 1.6839, + "step": 7132 + }, + { + "epoch": 2.1651236910001517, + "grad_norm": 0.6395680904388428, + "learning_rate": 5.672539489671932e-05, + "loss": 1.0484, + "step": 7133 + }, + { + "epoch": 2.1654272271968433, + "grad_norm": 0.6057548522949219, + "learning_rate": 5.671931956257594e-05, + "loss": 1.2702, + "step": 7134 + }, + { + "epoch": 2.1657307633935345, + "grad_norm": 0.8291522860527039, + "learning_rate": 5.671324422843257e-05, + "loss": 1.0313, + "step": 7135 + }, + { + "epoch": 2.166034299590226, + "grad_norm": 0.7034058570861816, + "learning_rate": 5.6707168894289184e-05, + "loss": 1.4155, + "step": 7136 + }, + { + "epoch": 2.1663378357869174, + "grad_norm": 0.6599259972572327, + "learning_rate": 5.6701093560145814e-05, + "loss": 1.3979, + "step": 7137 + }, + { + "epoch": 2.166641371983609, + "grad_norm": 0.7508943676948547, + "learning_rate": 5.6695018226002425e-05, + "loss": 1.3171, + "step": 7138 + }, + { + "epoch": 2.1669449081803007, + "grad_norm": 0.544033408164978, + "learning_rate": 5.6688942891859056e-05, + "loss": 0.8611, + "step": 7139 + }, + { + "epoch": 2.167248444376992, + "grad_norm": 0.7824786305427551, + "learning_rate": 5.668286755771568e-05, + "loss": 1.5292, + "step": 7140 + }, + { + "epoch": 2.1675519805736836, + "grad_norm": 0.8762481212615967, + "learning_rate": 5.66767922235723e-05, + "loss": 1.2404, + "step": 7141 + }, + { + "epoch": 2.167855516770375, + "grad_norm": 0.7257769703865051, + "learning_rate": 5.667071688942892e-05, + "loss": 0.7228, + "step": 7142 + }, + { + "epoch": 2.1681590529670665, + "grad_norm": 0.9814961552619934, + "learning_rate": 5.6664641555285545e-05, + "loss": 0.9963, + "step": 7143 + }, + { + "epoch": 2.1684625891637577, + "grad_norm": 0.851203441619873, + "learning_rate": 5.665856622114216e-05, + "loss": 1.1862, + "step": 7144 + }, + { + "epoch": 2.1687661253604493, + "grad_norm": 0.6610409021377563, + "learning_rate": 5.665249088699879e-05, + "loss": 1.2907, + "step": 7145 + }, + { + "epoch": 2.1690696615571405, + "grad_norm": 0.6841842532157898, + "learning_rate": 5.664641555285541e-05, + "loss": 1.7986, + "step": 7146 + }, + { + "epoch": 2.169373197753832, + "grad_norm": 0.7544435262680054, + "learning_rate": 5.664034021871203e-05, + "loss": 1.4018, + "step": 7147 + }, + { + "epoch": 2.1696767339505234, + "grad_norm": 0.7136738300323486, + "learning_rate": 5.663426488456865e-05, + "loss": 1.3775, + "step": 7148 + }, + { + "epoch": 2.169980270147215, + "grad_norm": 1.1160085201263428, + "learning_rate": 5.662818955042528e-05, + "loss": 0.9446, + "step": 7149 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.7214122414588928, + "learning_rate": 5.6622114216281894e-05, + "loss": 1.4815, + "step": 7150 + }, + { + "epoch": 2.170587342540598, + "grad_norm": 0.6770926117897034, + "learning_rate": 5.6616038882138525e-05, + "loss": 1.6, + "step": 7151 + }, + { + "epoch": 2.1708908787372896, + "grad_norm": 0.6858698129653931, + "learning_rate": 5.6609963547995135e-05, + "loss": 1.3258, + "step": 7152 + }, + { + "epoch": 2.171194414933981, + "grad_norm": 0.755990743637085, + "learning_rate": 5.6603888213851766e-05, + "loss": 1.0352, + "step": 7153 + }, + { + "epoch": 2.1714979511306725, + "grad_norm": 0.628933310508728, + "learning_rate": 5.659781287970839e-05, + "loss": 0.8605, + "step": 7154 + }, + { + "epoch": 2.1718014873273637, + "grad_norm": 0.631551206111908, + "learning_rate": 5.659173754556501e-05, + "loss": 1.3337, + "step": 7155 + }, + { + "epoch": 2.1721050235240553, + "grad_norm": 0.8183661103248596, + "learning_rate": 5.658566221142163e-05, + "loss": 1.3968, + "step": 7156 + }, + { + "epoch": 2.1724085597207465, + "grad_norm": 0.5968457460403442, + "learning_rate": 5.6579586877278255e-05, + "loss": 1.587, + "step": 7157 + }, + { + "epoch": 2.172712095917438, + "grad_norm": 0.7125000953674316, + "learning_rate": 5.657351154313487e-05, + "loss": 1.5769, + "step": 7158 + }, + { + "epoch": 2.1730156321141294, + "grad_norm": 0.8173585534095764, + "learning_rate": 5.65674362089915e-05, + "loss": 1.4033, + "step": 7159 + }, + { + "epoch": 2.173319168310821, + "grad_norm": 0.7674162983894348, + "learning_rate": 5.656136087484812e-05, + "loss": 0.765, + "step": 7160 + }, + { + "epoch": 2.1736227045075127, + "grad_norm": 0.6227114200592041, + "learning_rate": 5.655528554070474e-05, + "loss": 1.4455, + "step": 7161 + }, + { + "epoch": 2.173926240704204, + "grad_norm": 0.7266408205032349, + "learning_rate": 5.654921020656136e-05, + "loss": 1.3391, + "step": 7162 + }, + { + "epoch": 2.1742297769008956, + "grad_norm": 0.6072399020195007, + "learning_rate": 5.654313487241799e-05, + "loss": 1.7996, + "step": 7163 + }, + { + "epoch": 2.174533313097587, + "grad_norm": 0.7643356323242188, + "learning_rate": 5.6537059538274604e-05, + "loss": 1.3845, + "step": 7164 + }, + { + "epoch": 2.1748368492942785, + "grad_norm": 0.8032098412513733, + "learning_rate": 5.6530984204131235e-05, + "loss": 1.3607, + "step": 7165 + }, + { + "epoch": 2.1751403854909697, + "grad_norm": 0.6203994750976562, + "learning_rate": 5.6524908869987845e-05, + "loss": 1.1317, + "step": 7166 + }, + { + "epoch": 2.1754439216876613, + "grad_norm": 0.7892910838127136, + "learning_rate": 5.6518833535844476e-05, + "loss": 1.2879, + "step": 7167 + }, + { + "epoch": 2.1757474578843525, + "grad_norm": 0.5828709006309509, + "learning_rate": 5.65127582017011e-05, + "loss": 1.5039, + "step": 7168 + }, + { + "epoch": 2.176050994081044, + "grad_norm": 0.6579250693321228, + "learning_rate": 5.650668286755772e-05, + "loss": 1.483, + "step": 7169 + }, + { + "epoch": 2.1763545302777354, + "grad_norm": 0.7929391860961914, + "learning_rate": 5.650060753341434e-05, + "loss": 1.4478, + "step": 7170 + }, + { + "epoch": 2.176658066474427, + "grad_norm": 0.7163347601890564, + "learning_rate": 5.6494532199270966e-05, + "loss": 0.7903, + "step": 7171 + }, + { + "epoch": 2.1769616026711187, + "grad_norm": 0.6493278741836548, + "learning_rate": 5.648845686512758e-05, + "loss": 1.1142, + "step": 7172 + }, + { + "epoch": 2.17726513886781, + "grad_norm": 0.659180760383606, + "learning_rate": 5.648238153098421e-05, + "loss": 1.2759, + "step": 7173 + }, + { + "epoch": 2.1775686750645016, + "grad_norm": 0.6536158323287964, + "learning_rate": 5.647630619684083e-05, + "loss": 1.51, + "step": 7174 + }, + { + "epoch": 2.177872211261193, + "grad_norm": 0.8379760980606079, + "learning_rate": 5.647023086269745e-05, + "loss": 0.8592, + "step": 7175 + }, + { + "epoch": 2.1781757474578844, + "grad_norm": 0.6624657511711121, + "learning_rate": 5.646415552855407e-05, + "loss": 1.5387, + "step": 7176 + }, + { + "epoch": 2.1784792836545757, + "grad_norm": 0.8176611065864563, + "learning_rate": 5.64580801944107e-05, + "loss": 0.8141, + "step": 7177 + }, + { + "epoch": 2.1787828198512673, + "grad_norm": 0.8894045948982239, + "learning_rate": 5.6452004860267314e-05, + "loss": 1.3116, + "step": 7178 + }, + { + "epoch": 2.1790863560479585, + "grad_norm": 0.8644475936889648, + "learning_rate": 5.6445929526123945e-05, + "loss": 0.9116, + "step": 7179 + }, + { + "epoch": 2.17938989224465, + "grad_norm": 0.8027042746543884, + "learning_rate": 5.6439854191980555e-05, + "loss": 1.3788, + "step": 7180 + }, + { + "epoch": 2.1796934284413414, + "grad_norm": 0.6205067038536072, + "learning_rate": 5.6433778857837186e-05, + "loss": 0.9615, + "step": 7181 + }, + { + "epoch": 2.179996964638033, + "grad_norm": 0.9785541296005249, + "learning_rate": 5.642770352369381e-05, + "loss": 1.2723, + "step": 7182 + }, + { + "epoch": 2.1803005008347247, + "grad_norm": 0.541207492351532, + "learning_rate": 5.642162818955042e-05, + "loss": 0.9423, + "step": 7183 + }, + { + "epoch": 2.180604037031416, + "grad_norm": 0.7655600905418396, + "learning_rate": 5.641555285540705e-05, + "loss": 1.088, + "step": 7184 + }, + { + "epoch": 2.1809075732281076, + "grad_norm": 0.575629711151123, + "learning_rate": 5.6409477521263676e-05, + "loss": 1.0602, + "step": 7185 + }, + { + "epoch": 2.181211109424799, + "grad_norm": 0.6803514957427979, + "learning_rate": 5.640340218712029e-05, + "loss": 1.141, + "step": 7186 + }, + { + "epoch": 2.1815146456214904, + "grad_norm": 0.681253969669342, + "learning_rate": 5.639732685297692e-05, + "loss": 1.3516, + "step": 7187 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.6234375238418579, + "learning_rate": 5.639125151883354e-05, + "loss": 1.3573, + "step": 7188 + }, + { + "epoch": 2.1821217180148733, + "grad_norm": 0.809392511844635, + "learning_rate": 5.638517618469016e-05, + "loss": 1.3348, + "step": 7189 + }, + { + "epoch": 2.1824252542115645, + "grad_norm": 0.6876418590545654, + "learning_rate": 5.637910085054678e-05, + "loss": 0.8274, + "step": 7190 + }, + { + "epoch": 2.182728790408256, + "grad_norm": 0.7089405059814453, + "learning_rate": 5.63730255164034e-05, + "loss": 1.1393, + "step": 7191 + }, + { + "epoch": 2.183032326604948, + "grad_norm": 0.6622164249420166, + "learning_rate": 5.6366950182260024e-05, + "loss": 1.2418, + "step": 7192 + }, + { + "epoch": 2.183335862801639, + "grad_norm": 0.7589064836502075, + "learning_rate": 5.6360874848116655e-05, + "loss": 1.2326, + "step": 7193 + }, + { + "epoch": 2.1836393989983307, + "grad_norm": 0.8332014679908752, + "learning_rate": 5.6354799513973265e-05, + "loss": 1.2957, + "step": 7194 + }, + { + "epoch": 2.183942935195022, + "grad_norm": 0.7106983661651611, + "learning_rate": 5.634872417982989e-05, + "loss": 1.3794, + "step": 7195 + }, + { + "epoch": 2.1842464713917136, + "grad_norm": 0.7226594090461731, + "learning_rate": 5.634264884568652e-05, + "loss": 1.674, + "step": 7196 + }, + { + "epoch": 2.184550007588405, + "grad_norm": 0.7651084065437317, + "learning_rate": 5.633657351154313e-05, + "loss": 1.101, + "step": 7197 + }, + { + "epoch": 2.1848535437850964, + "grad_norm": 0.9110549688339233, + "learning_rate": 5.633049817739976e-05, + "loss": 0.7792, + "step": 7198 + }, + { + "epoch": 2.1851570799817877, + "grad_norm": 0.7132554650306702, + "learning_rate": 5.6324422843256386e-05, + "loss": 1.5932, + "step": 7199 + }, + { + "epoch": 2.1854606161784793, + "grad_norm": 0.7886115908622742, + "learning_rate": 5.6318347509113e-05, + "loss": 1.1547, + "step": 7200 + }, + { + "epoch": 2.185764152375171, + "grad_norm": 0.6051173806190491, + "learning_rate": 5.631227217496963e-05, + "loss": 1.2831, + "step": 7201 + }, + { + "epoch": 2.186067688571862, + "grad_norm": 0.815495491027832, + "learning_rate": 5.630619684082625e-05, + "loss": 0.4056, + "step": 7202 + }, + { + "epoch": 2.186371224768554, + "grad_norm": 1.083448886871338, + "learning_rate": 5.630012150668287e-05, + "loss": 1.3255, + "step": 7203 + }, + { + "epoch": 2.186674760965245, + "grad_norm": 0.8830431699752808, + "learning_rate": 5.629404617253949e-05, + "loss": 0.4751, + "step": 7204 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.7775143384933472, + "learning_rate": 5.628797083839611e-05, + "loss": 1.4157, + "step": 7205 + }, + { + "epoch": 2.187281833358628, + "grad_norm": 0.6572606563568115, + "learning_rate": 5.6281895504252734e-05, + "loss": 1.7001, + "step": 7206 + }, + { + "epoch": 2.1875853695553196, + "grad_norm": 0.7444910407066345, + "learning_rate": 5.627582017010936e-05, + "loss": 1.4273, + "step": 7207 + }, + { + "epoch": 2.187888905752011, + "grad_norm": 0.8146635293960571, + "learning_rate": 5.6269744835965975e-05, + "loss": 1.5225, + "step": 7208 + }, + { + "epoch": 2.1881924419487024, + "grad_norm": 0.7989526391029358, + "learning_rate": 5.62636695018226e-05, + "loss": 1.3059, + "step": 7209 + }, + { + "epoch": 2.1884959781453937, + "grad_norm": 0.7003793716430664, + "learning_rate": 5.625759416767923e-05, + "loss": 1.2017, + "step": 7210 + }, + { + "epoch": 2.1887995143420853, + "grad_norm": 0.7632039785385132, + "learning_rate": 5.625151883353584e-05, + "loss": 1.5865, + "step": 7211 + }, + { + "epoch": 2.189103050538777, + "grad_norm": 0.8134432435035706, + "learning_rate": 5.624544349939247e-05, + "loss": 1.3355, + "step": 7212 + }, + { + "epoch": 2.189406586735468, + "grad_norm": 0.7677670121192932, + "learning_rate": 5.6239368165249096e-05, + "loss": 1.4682, + "step": 7213 + }, + { + "epoch": 2.18971012293216, + "grad_norm": 0.6580960750579834, + "learning_rate": 5.623329283110571e-05, + "loss": 1.4944, + "step": 7214 + }, + { + "epoch": 2.190013659128851, + "grad_norm": 0.5932350158691406, + "learning_rate": 5.622721749696234e-05, + "loss": 1.3135, + "step": 7215 + }, + { + "epoch": 2.1903171953255427, + "grad_norm": 0.6935281157493591, + "learning_rate": 5.622114216281896e-05, + "loss": 1.4697, + "step": 7216 + }, + { + "epoch": 2.190620731522234, + "grad_norm": 0.6523483395576477, + "learning_rate": 5.621506682867558e-05, + "loss": 1.4928, + "step": 7217 + }, + { + "epoch": 2.1909242677189256, + "grad_norm": 0.6458442211151123, + "learning_rate": 5.62089914945322e-05, + "loss": 1.5989, + "step": 7218 + }, + { + "epoch": 2.191227803915617, + "grad_norm": 0.8362550139427185, + "learning_rate": 5.620291616038882e-05, + "loss": 1.3478, + "step": 7219 + }, + { + "epoch": 2.1915313401123084, + "grad_norm": 0.6641613245010376, + "learning_rate": 5.6196840826245444e-05, + "loss": 1.3929, + "step": 7220 + }, + { + "epoch": 2.1918348763089996, + "grad_norm": 0.8658133149147034, + "learning_rate": 5.619076549210207e-05, + "loss": 1.3856, + "step": 7221 + }, + { + "epoch": 2.1921384125056913, + "grad_norm": 0.8276031017303467, + "learning_rate": 5.6184690157958685e-05, + "loss": 1.1622, + "step": 7222 + }, + { + "epoch": 2.192441948702383, + "grad_norm": 0.7149022221565247, + "learning_rate": 5.617861482381531e-05, + "loss": 1.1412, + "step": 7223 + }, + { + "epoch": 2.192745484899074, + "grad_norm": 0.6524426937103271, + "learning_rate": 5.617253948967194e-05, + "loss": 1.482, + "step": 7224 + }, + { + "epoch": 2.193049021095766, + "grad_norm": 0.6739472150802612, + "learning_rate": 5.616646415552855e-05, + "loss": 1.585, + "step": 7225 + }, + { + "epoch": 2.193352557292457, + "grad_norm": 0.7816546559333801, + "learning_rate": 5.616038882138518e-05, + "loss": 0.8333, + "step": 7226 + }, + { + "epoch": 2.1936560934891487, + "grad_norm": 0.639371395111084, + "learning_rate": 5.6154313487241806e-05, + "loss": 1.5572, + "step": 7227 + }, + { + "epoch": 2.19395962968584, + "grad_norm": 0.860651969909668, + "learning_rate": 5.614823815309842e-05, + "loss": 1.3584, + "step": 7228 + }, + { + "epoch": 2.1942631658825316, + "grad_norm": 0.8631687760353088, + "learning_rate": 5.614216281895505e-05, + "loss": 1.2089, + "step": 7229 + }, + { + "epoch": 2.1945667020792228, + "grad_norm": 0.7528846263885498, + "learning_rate": 5.613608748481167e-05, + "loss": 1.1223, + "step": 7230 + }, + { + "epoch": 2.1948702382759144, + "grad_norm": 0.6682392954826355, + "learning_rate": 5.613001215066829e-05, + "loss": 0.9837, + "step": 7231 + }, + { + "epoch": 2.1951737744726056, + "grad_norm": 0.8789331912994385, + "learning_rate": 5.612393681652491e-05, + "loss": 0.9464, + "step": 7232 + }, + { + "epoch": 2.1954773106692973, + "grad_norm": 0.8156276941299438, + "learning_rate": 5.611786148238153e-05, + "loss": 1.3571, + "step": 7233 + }, + { + "epoch": 2.195780846865989, + "grad_norm": 0.9034867882728577, + "learning_rate": 5.6111786148238154e-05, + "loss": 1.4531, + "step": 7234 + }, + { + "epoch": 2.19608438306268, + "grad_norm": 0.773789644241333, + "learning_rate": 5.610571081409478e-05, + "loss": 1.2396, + "step": 7235 + }, + { + "epoch": 2.196387919259372, + "grad_norm": 0.753548800945282, + "learning_rate": 5.6099635479951395e-05, + "loss": 1.3218, + "step": 7236 + }, + { + "epoch": 2.196691455456063, + "grad_norm": 0.8552380800247192, + "learning_rate": 5.609356014580802e-05, + "loss": 1.1841, + "step": 7237 + }, + { + "epoch": 2.1969949916527547, + "grad_norm": 0.7362204790115356, + "learning_rate": 5.608748481166465e-05, + "loss": 1.7553, + "step": 7238 + }, + { + "epoch": 2.197298527849446, + "grad_norm": 0.7145305275917053, + "learning_rate": 5.608140947752126e-05, + "loss": 0.7013, + "step": 7239 + }, + { + "epoch": 2.1976020640461376, + "grad_norm": 0.8223626613616943, + "learning_rate": 5.607533414337789e-05, + "loss": 0.9573, + "step": 7240 + }, + { + "epoch": 2.1979056002428288, + "grad_norm": 0.6958496570587158, + "learning_rate": 5.6069258809234516e-05, + "loss": 1.1011, + "step": 7241 + }, + { + "epoch": 2.1982091364395204, + "grad_norm": 0.7131595611572266, + "learning_rate": 5.606318347509113e-05, + "loss": 0.9728, + "step": 7242 + }, + { + "epoch": 2.1985126726362116, + "grad_norm": 0.8090917468070984, + "learning_rate": 5.605710814094776e-05, + "loss": 1.2979, + "step": 7243 + }, + { + "epoch": 2.1988162088329033, + "grad_norm": 0.7696943879127502, + "learning_rate": 5.605103280680438e-05, + "loss": 1.4695, + "step": 7244 + }, + { + "epoch": 2.199119745029595, + "grad_norm": 0.759185791015625, + "learning_rate": 5.6044957472661e-05, + "loss": 1.2634, + "step": 7245 + }, + { + "epoch": 2.199423281226286, + "grad_norm": 0.7122607827186584, + "learning_rate": 5.603888213851762e-05, + "loss": 1.5165, + "step": 7246 + }, + { + "epoch": 2.199726817422978, + "grad_norm": 0.8475228548049927, + "learning_rate": 5.603280680437424e-05, + "loss": 0.8859, + "step": 7247 + }, + { + "epoch": 2.200030353619669, + "grad_norm": 0.8220751285552979, + "learning_rate": 5.6026731470230864e-05, + "loss": 1.274, + "step": 7248 + }, + { + "epoch": 2.2003338898163607, + "grad_norm": 0.5795495510101318, + "learning_rate": 5.602065613608749e-05, + "loss": 1.6139, + "step": 7249 + }, + { + "epoch": 2.200637426013052, + "grad_norm": 0.7077553272247314, + "learning_rate": 5.6014580801944105e-05, + "loss": 1.5815, + "step": 7250 + }, + { + "epoch": 2.2009409622097436, + "grad_norm": 0.7636668682098389, + "learning_rate": 5.600850546780073e-05, + "loss": 1.2543, + "step": 7251 + }, + { + "epoch": 2.2012444984064348, + "grad_norm": 0.8725208640098572, + "learning_rate": 5.600243013365736e-05, + "loss": 0.8778, + "step": 7252 + }, + { + "epoch": 2.2015480346031264, + "grad_norm": 0.8233357071876526, + "learning_rate": 5.599635479951397e-05, + "loss": 1.4274, + "step": 7253 + }, + { + "epoch": 2.201851570799818, + "grad_norm": 0.737484872341156, + "learning_rate": 5.59902794653706e-05, + "loss": 0.9487, + "step": 7254 + }, + { + "epoch": 2.2021551069965093, + "grad_norm": 0.7442474961280823, + "learning_rate": 5.5984204131227226e-05, + "loss": 1.5195, + "step": 7255 + }, + { + "epoch": 2.202458643193201, + "grad_norm": 0.6351808309555054, + "learning_rate": 5.5978128797083836e-05, + "loss": 1.7236, + "step": 7256 + }, + { + "epoch": 2.202762179389892, + "grad_norm": 1.272067666053772, + "learning_rate": 5.597205346294047e-05, + "loss": 1.1462, + "step": 7257 + }, + { + "epoch": 2.203065715586584, + "grad_norm": 0.8174318075180054, + "learning_rate": 5.596597812879709e-05, + "loss": 1.4007, + "step": 7258 + }, + { + "epoch": 2.203369251783275, + "grad_norm": 0.7911064028739929, + "learning_rate": 5.595990279465371e-05, + "loss": 1.4344, + "step": 7259 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.5833910703659058, + "learning_rate": 5.595382746051033e-05, + "loss": 1.2447, + "step": 7260 + }, + { + "epoch": 2.203976324176658, + "grad_norm": 0.793631374835968, + "learning_rate": 5.594775212636695e-05, + "loss": 0.8962, + "step": 7261 + }, + { + "epoch": 2.2042798603733496, + "grad_norm": 0.8431137204170227, + "learning_rate": 5.5941676792223574e-05, + "loss": 1.0257, + "step": 7262 + }, + { + "epoch": 2.204583396570041, + "grad_norm": 0.9787790775299072, + "learning_rate": 5.59356014580802e-05, + "loss": 1.5493, + "step": 7263 + }, + { + "epoch": 2.2048869327667324, + "grad_norm": 0.7261353135108948, + "learning_rate": 5.5929526123936815e-05, + "loss": 0.7474, + "step": 7264 + }, + { + "epoch": 2.205190468963424, + "grad_norm": 0.9692999124526978, + "learning_rate": 5.592345078979344e-05, + "loss": 1.408, + "step": 7265 + }, + { + "epoch": 2.2054940051601153, + "grad_norm": 0.5326679348945618, + "learning_rate": 5.591737545565007e-05, + "loss": 1.5479, + "step": 7266 + }, + { + "epoch": 2.205797541356807, + "grad_norm": 0.8038820028305054, + "learning_rate": 5.591130012150668e-05, + "loss": 1.2581, + "step": 7267 + }, + { + "epoch": 2.206101077553498, + "grad_norm": 0.6147705912590027, + "learning_rate": 5.5905224787363305e-05, + "loss": 1.0921, + "step": 7268 + }, + { + "epoch": 2.20640461375019, + "grad_norm": 0.8090271949768066, + "learning_rate": 5.5899149453219936e-05, + "loss": 0.9203, + "step": 7269 + }, + { + "epoch": 2.206708149946881, + "grad_norm": 0.6505485773086548, + "learning_rate": 5.5893074119076546e-05, + "loss": 1.2222, + "step": 7270 + }, + { + "epoch": 2.2070116861435727, + "grad_norm": 0.6681842803955078, + "learning_rate": 5.588699878493318e-05, + "loss": 1.3745, + "step": 7271 + }, + { + "epoch": 2.207315222340264, + "grad_norm": 0.6756628751754761, + "learning_rate": 5.58809234507898e-05, + "loss": 1.2509, + "step": 7272 + }, + { + "epoch": 2.2076187585369555, + "grad_norm": 1.3730113506317139, + "learning_rate": 5.587484811664642e-05, + "loss": 0.9545, + "step": 7273 + }, + { + "epoch": 2.207922294733647, + "grad_norm": 0.7486765384674072, + "learning_rate": 5.586877278250304e-05, + "loss": 1.2173, + "step": 7274 + }, + { + "epoch": 2.2082258309303384, + "grad_norm": 0.7601925730705261, + "learning_rate": 5.586269744835966e-05, + "loss": 0.8298, + "step": 7275 + }, + { + "epoch": 2.20852936712703, + "grad_norm": 1.0399149656295776, + "learning_rate": 5.5856622114216284e-05, + "loss": 1.4147, + "step": 7276 + }, + { + "epoch": 2.2088329033237213, + "grad_norm": 0.5739341974258423, + "learning_rate": 5.585054678007291e-05, + "loss": 1.2772, + "step": 7277 + }, + { + "epoch": 2.209136439520413, + "grad_norm": 0.6940932273864746, + "learning_rate": 5.5844471445929526e-05, + "loss": 1.1425, + "step": 7278 + }, + { + "epoch": 2.209439975717104, + "grad_norm": 0.7629711627960205, + "learning_rate": 5.583839611178615e-05, + "loss": 1.2434, + "step": 7279 + }, + { + "epoch": 2.209743511913796, + "grad_norm": 0.7137802839279175, + "learning_rate": 5.5832320777642774e-05, + "loss": 1.4219, + "step": 7280 + }, + { + "epoch": 2.210047048110487, + "grad_norm": 0.7725697159767151, + "learning_rate": 5.582624544349939e-05, + "loss": 1.6238, + "step": 7281 + }, + { + "epoch": 2.2103505843071787, + "grad_norm": 0.7074224352836609, + "learning_rate": 5.5820170109356015e-05, + "loss": 1.2055, + "step": 7282 + }, + { + "epoch": 2.21065412050387, + "grad_norm": 0.7771949172019958, + "learning_rate": 5.5814094775212646e-05, + "loss": 1.2759, + "step": 7283 + }, + { + "epoch": 2.2109576567005615, + "grad_norm": 0.9702246189117432, + "learning_rate": 5.5808019441069256e-05, + "loss": 1.16, + "step": 7284 + }, + { + "epoch": 2.211261192897253, + "grad_norm": 1.0390164852142334, + "learning_rate": 5.580194410692589e-05, + "loss": 1.6189, + "step": 7285 + }, + { + "epoch": 2.2115647290939444, + "grad_norm": 0.7810035943984985, + "learning_rate": 5.57958687727825e-05, + "loss": 1.3802, + "step": 7286 + }, + { + "epoch": 2.211868265290636, + "grad_norm": 0.6858797669410706, + "learning_rate": 5.578979343863913e-05, + "loss": 1.0308, + "step": 7287 + }, + { + "epoch": 2.2121718014873273, + "grad_norm": 0.8140575289726257, + "learning_rate": 5.578371810449575e-05, + "loss": 1.175, + "step": 7288 + }, + { + "epoch": 2.212475337684019, + "grad_norm": 0.7919504642486572, + "learning_rate": 5.577764277035237e-05, + "loss": 1.1394, + "step": 7289 + }, + { + "epoch": 2.21277887388071, + "grad_norm": 0.6557730436325073, + "learning_rate": 5.5771567436208994e-05, + "loss": 1.4748, + "step": 7290 + }, + { + "epoch": 2.213082410077402, + "grad_norm": 0.8063220977783203, + "learning_rate": 5.576549210206562e-05, + "loss": 1.4965, + "step": 7291 + }, + { + "epoch": 2.213385946274093, + "grad_norm": 0.8114713430404663, + "learning_rate": 5.5759416767922236e-05, + "loss": 1.2698, + "step": 7292 + }, + { + "epoch": 2.2136894824707847, + "grad_norm": 0.7607464790344238, + "learning_rate": 5.575334143377886e-05, + "loss": 1.6277, + "step": 7293 + }, + { + "epoch": 2.213993018667476, + "grad_norm": 0.8086090683937073, + "learning_rate": 5.5747266099635484e-05, + "loss": 1.1822, + "step": 7294 + }, + { + "epoch": 2.2142965548641675, + "grad_norm": 0.6083495616912842, + "learning_rate": 5.57411907654921e-05, + "loss": 1.6789, + "step": 7295 + }, + { + "epoch": 2.214600091060859, + "grad_norm": 0.6319407820701599, + "learning_rate": 5.5735115431348725e-05, + "loss": 1.0977, + "step": 7296 + }, + { + "epoch": 2.2149036272575504, + "grad_norm": 0.7058820724487305, + "learning_rate": 5.5729040097205356e-05, + "loss": 1.5291, + "step": 7297 + }, + { + "epoch": 2.215207163454242, + "grad_norm": 0.7968228459358215, + "learning_rate": 5.5722964763061967e-05, + "loss": 1.2518, + "step": 7298 + }, + { + "epoch": 2.2155106996509333, + "grad_norm": 0.8061215281486511, + "learning_rate": 5.57168894289186e-05, + "loss": 1.2453, + "step": 7299 + }, + { + "epoch": 2.215814235847625, + "grad_norm": 0.8020003437995911, + "learning_rate": 5.571081409477521e-05, + "loss": 1.4674, + "step": 7300 + }, + { + "epoch": 2.216117772044316, + "grad_norm": 0.8562387824058533, + "learning_rate": 5.570473876063184e-05, + "loss": 0.918, + "step": 7301 + }, + { + "epoch": 2.216421308241008, + "grad_norm": 0.5621716976165771, + "learning_rate": 5.569866342648846e-05, + "loss": 1.2054, + "step": 7302 + }, + { + "epoch": 2.216724844437699, + "grad_norm": 0.5586440563201904, + "learning_rate": 5.569258809234508e-05, + "loss": 1.2159, + "step": 7303 + }, + { + "epoch": 2.2170283806343907, + "grad_norm": 0.6557493805885315, + "learning_rate": 5.5686512758201704e-05, + "loss": 1.1882, + "step": 7304 + }, + { + "epoch": 2.217331916831082, + "grad_norm": 0.71869957447052, + "learning_rate": 5.568043742405833e-05, + "loss": 1.2973, + "step": 7305 + }, + { + "epoch": 2.2176354530277735, + "grad_norm": 0.5254454016685486, + "learning_rate": 5.5674362089914946e-05, + "loss": 1.6473, + "step": 7306 + }, + { + "epoch": 2.217938989224465, + "grad_norm": 0.626685380935669, + "learning_rate": 5.566828675577157e-05, + "loss": 1.5708, + "step": 7307 + }, + { + "epoch": 2.2182425254211564, + "grad_norm": 0.7227777242660522, + "learning_rate": 5.5662211421628194e-05, + "loss": 1.3069, + "step": 7308 + }, + { + "epoch": 2.218546061617848, + "grad_norm": 0.7912486791610718, + "learning_rate": 5.565613608748481e-05, + "loss": 1.3601, + "step": 7309 + }, + { + "epoch": 2.2188495978145393, + "grad_norm": 0.9218525886535645, + "learning_rate": 5.5650060753341435e-05, + "loss": 0.9959, + "step": 7310 + }, + { + "epoch": 2.219153134011231, + "grad_norm": 0.5446337461471558, + "learning_rate": 5.5643985419198066e-05, + "loss": 1.0407, + "step": 7311 + }, + { + "epoch": 2.219456670207922, + "grad_norm": 0.7623136639595032, + "learning_rate": 5.5637910085054677e-05, + "loss": 1.6247, + "step": 7312 + }, + { + "epoch": 2.219760206404614, + "grad_norm": 0.4917345643043518, + "learning_rate": 5.563183475091131e-05, + "loss": 1.0657, + "step": 7313 + }, + { + "epoch": 2.220063742601305, + "grad_norm": 0.7134093046188354, + "learning_rate": 5.562575941676792e-05, + "loss": 1.1832, + "step": 7314 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.9278779029846191, + "learning_rate": 5.561968408262455e-05, + "loss": 1.0206, + "step": 7315 + }, + { + "epoch": 2.2206708149946883, + "grad_norm": 0.6090062260627747, + "learning_rate": 5.561360874848117e-05, + "loss": 1.5232, + "step": 7316 + }, + { + "epoch": 2.2209743511913795, + "grad_norm": 0.7449427247047424, + "learning_rate": 5.560753341433779e-05, + "loss": 1.4115, + "step": 7317 + }, + { + "epoch": 2.221277887388071, + "grad_norm": 0.735029399394989, + "learning_rate": 5.5601458080194414e-05, + "loss": 1.7686, + "step": 7318 + }, + { + "epoch": 2.2215814235847624, + "grad_norm": 0.7010538578033447, + "learning_rate": 5.559538274605104e-05, + "loss": 1.6045, + "step": 7319 + }, + { + "epoch": 2.221884959781454, + "grad_norm": 0.6299903988838196, + "learning_rate": 5.5589307411907656e-05, + "loss": 1.4633, + "step": 7320 + }, + { + "epoch": 2.2221884959781453, + "grad_norm": 0.81271892786026, + "learning_rate": 5.558323207776428e-05, + "loss": 1.116, + "step": 7321 + }, + { + "epoch": 2.222492032174837, + "grad_norm": 0.6342976689338684, + "learning_rate": 5.5577156743620904e-05, + "loss": 1.1627, + "step": 7322 + }, + { + "epoch": 2.222795568371528, + "grad_norm": 0.6368468999862671, + "learning_rate": 5.557108140947752e-05, + "loss": 1.6084, + "step": 7323 + }, + { + "epoch": 2.22309910456822, + "grad_norm": 0.9048125743865967, + "learning_rate": 5.5565006075334145e-05, + "loss": 1.2591, + "step": 7324 + }, + { + "epoch": 2.223402640764911, + "grad_norm": 0.684788167476654, + "learning_rate": 5.5558930741190776e-05, + "loss": 0.9835, + "step": 7325 + }, + { + "epoch": 2.2237061769616027, + "grad_norm": 0.7801753878593445, + "learning_rate": 5.555285540704739e-05, + "loss": 1.4924, + "step": 7326 + }, + { + "epoch": 2.2240097131582943, + "grad_norm": 0.7492843866348267, + "learning_rate": 5.554678007290402e-05, + "loss": 1.2899, + "step": 7327 + }, + { + "epoch": 2.2243132493549855, + "grad_norm": 0.587412416934967, + "learning_rate": 5.554070473876063e-05, + "loss": 1.386, + "step": 7328 + }, + { + "epoch": 2.224616785551677, + "grad_norm": 0.8866457939147949, + "learning_rate": 5.553462940461726e-05, + "loss": 1.4909, + "step": 7329 + }, + { + "epoch": 2.2249203217483684, + "grad_norm": 0.7808331847190857, + "learning_rate": 5.552855407047388e-05, + "loss": 1.3854, + "step": 7330 + }, + { + "epoch": 2.22522385794506, + "grad_norm": 0.635968029499054, + "learning_rate": 5.5522478736330493e-05, + "loss": 1.3924, + "step": 7331 + }, + { + "epoch": 2.2255273941417513, + "grad_norm": 0.7001415491104126, + "learning_rate": 5.5516403402187124e-05, + "loss": 0.9052, + "step": 7332 + }, + { + "epoch": 2.225830930338443, + "grad_norm": 0.7402334213256836, + "learning_rate": 5.551032806804375e-05, + "loss": 1.4398, + "step": 7333 + }, + { + "epoch": 2.226134466535134, + "grad_norm": 0.5318384766578674, + "learning_rate": 5.5504252733900366e-05, + "loss": 1.1877, + "step": 7334 + }, + { + "epoch": 2.226438002731826, + "grad_norm": 0.7070531845092773, + "learning_rate": 5.549817739975699e-05, + "loss": 1.4545, + "step": 7335 + }, + { + "epoch": 2.2267415389285174, + "grad_norm": 0.7260397672653198, + "learning_rate": 5.5492102065613614e-05, + "loss": 1.4061, + "step": 7336 + }, + { + "epoch": 2.2270450751252087, + "grad_norm": 0.6360136270523071, + "learning_rate": 5.548602673147023e-05, + "loss": 1.7614, + "step": 7337 + }, + { + "epoch": 2.2273486113219003, + "grad_norm": 0.6618418097496033, + "learning_rate": 5.5479951397326855e-05, + "loss": 1.0696, + "step": 7338 + }, + { + "epoch": 2.2276521475185915, + "grad_norm": 0.8529426455497742, + "learning_rate": 5.5473876063183486e-05, + "loss": 1.4114, + "step": 7339 + }, + { + "epoch": 2.227955683715283, + "grad_norm": 0.7641217112541199, + "learning_rate": 5.54678007290401e-05, + "loss": 1.3159, + "step": 7340 + }, + { + "epoch": 2.2282592199119744, + "grad_norm": 0.7829504609107971, + "learning_rate": 5.546172539489672e-05, + "loss": 1.352, + "step": 7341 + }, + { + "epoch": 2.228562756108666, + "grad_norm": 0.7978317737579346, + "learning_rate": 5.545565006075334e-05, + "loss": 1.3526, + "step": 7342 + }, + { + "epoch": 2.2288662923053573, + "grad_norm": 0.8331364393234253, + "learning_rate": 5.544957472660996e-05, + "loss": 1.2954, + "step": 7343 + }, + { + "epoch": 2.229169828502049, + "grad_norm": 0.7829442620277405, + "learning_rate": 5.544349939246659e-05, + "loss": 1.3813, + "step": 7344 + }, + { + "epoch": 2.22947336469874, + "grad_norm": 0.6786047220230103, + "learning_rate": 5.5437424058323204e-05, + "loss": 1.159, + "step": 7345 + }, + { + "epoch": 2.229776900895432, + "grad_norm": 0.8328139781951904, + "learning_rate": 5.5431348724179834e-05, + "loss": 1.5405, + "step": 7346 + }, + { + "epoch": 2.2300804370921234, + "grad_norm": 0.49961405992507935, + "learning_rate": 5.542527339003646e-05, + "loss": 1.5272, + "step": 7347 + }, + { + "epoch": 2.2303839732888147, + "grad_norm": 0.8572553992271423, + "learning_rate": 5.5419198055893076e-05, + "loss": 1.3987, + "step": 7348 + }, + { + "epoch": 2.2306875094855063, + "grad_norm": 0.6319640278816223, + "learning_rate": 5.54131227217497e-05, + "loss": 1.242, + "step": 7349 + }, + { + "epoch": 2.2309910456821975, + "grad_norm": 0.617652416229248, + "learning_rate": 5.5407047387606324e-05, + "loss": 1.6851, + "step": 7350 + }, + { + "epoch": 2.231294581878889, + "grad_norm": 0.6495313048362732, + "learning_rate": 5.540097205346294e-05, + "loss": 1.487, + "step": 7351 + }, + { + "epoch": 2.2315981180755804, + "grad_norm": 0.9778041839599609, + "learning_rate": 5.5394896719319565e-05, + "loss": 1.4783, + "step": 7352 + }, + { + "epoch": 2.231901654272272, + "grad_norm": 0.9432831406593323, + "learning_rate": 5.538882138517619e-05, + "loss": 0.8277, + "step": 7353 + }, + { + "epoch": 2.2322051904689633, + "grad_norm": 0.8032881021499634, + "learning_rate": 5.538274605103281e-05, + "loss": 1.3029, + "step": 7354 + }, + { + "epoch": 2.232508726665655, + "grad_norm": 0.5441564917564392, + "learning_rate": 5.537667071688943e-05, + "loss": 1.4384, + "step": 7355 + }, + { + "epoch": 2.232812262862346, + "grad_norm": 0.8036965727806091, + "learning_rate": 5.537059538274605e-05, + "loss": 1.4117, + "step": 7356 + }, + { + "epoch": 2.233115799059038, + "grad_norm": 1.3503347635269165, + "learning_rate": 5.536452004860267e-05, + "loss": 1.5275, + "step": 7357 + }, + { + "epoch": 2.2334193352557294, + "grad_norm": 0.7045248746871948, + "learning_rate": 5.53584447144593e-05, + "loss": 1.3456, + "step": 7358 + }, + { + "epoch": 2.2337228714524207, + "grad_norm": 0.708449125289917, + "learning_rate": 5.5352369380315914e-05, + "loss": 1.4649, + "step": 7359 + }, + { + "epoch": 2.2340264076491123, + "grad_norm": 0.5780391693115234, + "learning_rate": 5.5346294046172544e-05, + "loss": 1.6705, + "step": 7360 + }, + { + "epoch": 2.2343299438458035, + "grad_norm": 0.5943533778190613, + "learning_rate": 5.534021871202917e-05, + "loss": 1.2699, + "step": 7361 + }, + { + "epoch": 2.234633480042495, + "grad_norm": 0.6117348074913025, + "learning_rate": 5.5334143377885786e-05, + "loss": 1.1232, + "step": 7362 + }, + { + "epoch": 2.2349370162391864, + "grad_norm": 0.6989806890487671, + "learning_rate": 5.532806804374241e-05, + "loss": 1.1067, + "step": 7363 + }, + { + "epoch": 2.235240552435878, + "grad_norm": 0.6868540644645691, + "learning_rate": 5.5321992709599034e-05, + "loss": 1.7178, + "step": 7364 + }, + { + "epoch": 2.2355440886325693, + "grad_norm": 0.6501347422599792, + "learning_rate": 5.531591737545565e-05, + "loss": 0.9142, + "step": 7365 + }, + { + "epoch": 2.235847624829261, + "grad_norm": 0.7098049521446228, + "learning_rate": 5.5309842041312275e-05, + "loss": 1.5727, + "step": 7366 + }, + { + "epoch": 2.236151161025952, + "grad_norm": 0.8211485743522644, + "learning_rate": 5.530376670716889e-05, + "loss": 1.2265, + "step": 7367 + }, + { + "epoch": 2.236454697222644, + "grad_norm": 0.636452853679657, + "learning_rate": 5.529769137302552e-05, + "loss": 1.0186, + "step": 7368 + }, + { + "epoch": 2.2367582334193354, + "grad_norm": 0.8176175355911255, + "learning_rate": 5.529161603888214e-05, + "loss": 1.3212, + "step": 7369 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5893675088882446, + "learning_rate": 5.528554070473876e-05, + "loss": 1.6353, + "step": 7370 + }, + { + "epoch": 2.2373653058127183, + "grad_norm": 0.7504252791404724, + "learning_rate": 5.527946537059538e-05, + "loss": 1.2427, + "step": 7371 + }, + { + "epoch": 2.2376688420094095, + "grad_norm": 0.8744253516197205, + "learning_rate": 5.527339003645201e-05, + "loss": 1.1929, + "step": 7372 + }, + { + "epoch": 2.237972378206101, + "grad_norm": 0.6894361972808838, + "learning_rate": 5.5267314702308624e-05, + "loss": 1.34, + "step": 7373 + }, + { + "epoch": 2.2382759144027924, + "grad_norm": 0.6582505106925964, + "learning_rate": 5.5261239368165254e-05, + "loss": 1.4561, + "step": 7374 + }, + { + "epoch": 2.238579450599484, + "grad_norm": 0.7623777985572815, + "learning_rate": 5.525516403402188e-05, + "loss": 1.295, + "step": 7375 + }, + { + "epoch": 2.2388829867961753, + "grad_norm": 0.9451694488525391, + "learning_rate": 5.5249088699878496e-05, + "loss": 1.1092, + "step": 7376 + }, + { + "epoch": 2.239186522992867, + "grad_norm": 0.8134379386901855, + "learning_rate": 5.524301336573512e-05, + "loss": 1.1195, + "step": 7377 + }, + { + "epoch": 2.239490059189558, + "grad_norm": 0.6125006079673767, + "learning_rate": 5.5236938031591744e-05, + "loss": 1.724, + "step": 7378 + }, + { + "epoch": 2.23979359538625, + "grad_norm": 0.8471882939338684, + "learning_rate": 5.523086269744836e-05, + "loss": 1.4037, + "step": 7379 + }, + { + "epoch": 2.2400971315829414, + "grad_norm": 0.7992955446243286, + "learning_rate": 5.5224787363304985e-05, + "loss": 1.3698, + "step": 7380 + }, + { + "epoch": 2.2404006677796326, + "grad_norm": 0.6339590549468994, + "learning_rate": 5.52187120291616e-05, + "loss": 1.6158, + "step": 7381 + }, + { + "epoch": 2.2407042039763243, + "grad_norm": 0.7954568862915039, + "learning_rate": 5.521263669501823e-05, + "loss": 1.2678, + "step": 7382 + }, + { + "epoch": 2.2410077401730155, + "grad_norm": 0.7957431674003601, + "learning_rate": 5.520656136087485e-05, + "loss": 0.9818, + "step": 7383 + }, + { + "epoch": 2.241311276369707, + "grad_norm": 0.6429740786552429, + "learning_rate": 5.520048602673147e-05, + "loss": 1.192, + "step": 7384 + }, + { + "epoch": 2.2416148125663984, + "grad_norm": 0.7835260629653931, + "learning_rate": 5.519441069258809e-05, + "loss": 1.344, + "step": 7385 + }, + { + "epoch": 2.24191834876309, + "grad_norm": 0.8486263751983643, + "learning_rate": 5.518833535844472e-05, + "loss": 1.1573, + "step": 7386 + }, + { + "epoch": 2.2422218849597813, + "grad_norm": 0.809238612651825, + "learning_rate": 5.5182260024301334e-05, + "loss": 0.7624, + "step": 7387 + }, + { + "epoch": 2.242525421156473, + "grad_norm": 0.8039452433586121, + "learning_rate": 5.5176184690157965e-05, + "loss": 0.9601, + "step": 7388 + }, + { + "epoch": 2.2428289573531646, + "grad_norm": 0.9128976464271545, + "learning_rate": 5.517010935601459e-05, + "loss": 0.9799, + "step": 7389 + }, + { + "epoch": 2.2431324935498558, + "grad_norm": 0.6718438863754272, + "learning_rate": 5.5164034021871206e-05, + "loss": 1.7084, + "step": 7390 + }, + { + "epoch": 2.2434360297465474, + "grad_norm": 0.7414968609809875, + "learning_rate": 5.515795868772783e-05, + "loss": 1.3195, + "step": 7391 + }, + { + "epoch": 2.2437395659432386, + "grad_norm": 0.7844395041465759, + "learning_rate": 5.5151883353584454e-05, + "loss": 0.8307, + "step": 7392 + }, + { + "epoch": 2.2440431021399303, + "grad_norm": 0.6547634601593018, + "learning_rate": 5.514580801944107e-05, + "loss": 1.5043, + "step": 7393 + }, + { + "epoch": 2.2443466383366215, + "grad_norm": 1.0427731275558472, + "learning_rate": 5.5139732685297695e-05, + "loss": 1.2751, + "step": 7394 + }, + { + "epoch": 2.244650174533313, + "grad_norm": 0.7984392642974854, + "learning_rate": 5.513365735115431e-05, + "loss": 0.9469, + "step": 7395 + }, + { + "epoch": 2.2449537107300044, + "grad_norm": 0.6773079037666321, + "learning_rate": 5.512758201701094e-05, + "loss": 1.1815, + "step": 7396 + }, + { + "epoch": 2.245257246926696, + "grad_norm": 0.7443834543228149, + "learning_rate": 5.512150668286756e-05, + "loss": 1.3827, + "step": 7397 + }, + { + "epoch": 2.2455607831233877, + "grad_norm": 0.6663649678230286, + "learning_rate": 5.511543134872418e-05, + "loss": 1.4079, + "step": 7398 + }, + { + "epoch": 2.245864319320079, + "grad_norm": 0.6260837912559509, + "learning_rate": 5.51093560145808e-05, + "loss": 1.5346, + "step": 7399 + }, + { + "epoch": 2.2461678555167706, + "grad_norm": 0.542671263217926, + "learning_rate": 5.510328068043743e-05, + "loss": 1.1236, + "step": 7400 + }, + { + "epoch": 2.2464713917134618, + "grad_norm": 0.7045307755470276, + "learning_rate": 5.5097205346294044e-05, + "loss": 1.5626, + "step": 7401 + }, + { + "epoch": 2.2467749279101534, + "grad_norm": 0.6996301412582397, + "learning_rate": 5.5091130012150675e-05, + "loss": 0.8182, + "step": 7402 + }, + { + "epoch": 2.2470784641068446, + "grad_norm": 0.7701748013496399, + "learning_rate": 5.50850546780073e-05, + "loss": 1.3315, + "step": 7403 + }, + { + "epoch": 2.2473820003035363, + "grad_norm": 0.5709070563316345, + "learning_rate": 5.507897934386391e-05, + "loss": 1.5871, + "step": 7404 + }, + { + "epoch": 2.2476855365002275, + "grad_norm": 0.7984616160392761, + "learning_rate": 5.507290400972054e-05, + "loss": 1.217, + "step": 7405 + }, + { + "epoch": 2.247989072696919, + "grad_norm": 0.7854733467102051, + "learning_rate": 5.5066828675577164e-05, + "loss": 1.2765, + "step": 7406 + }, + { + "epoch": 2.2482926088936104, + "grad_norm": 0.7947596907615662, + "learning_rate": 5.506075334143378e-05, + "loss": 1.3284, + "step": 7407 + }, + { + "epoch": 2.248596145090302, + "grad_norm": 0.5904085636138916, + "learning_rate": 5.5054678007290406e-05, + "loss": 1.8131, + "step": 7408 + }, + { + "epoch": 2.2488996812869937, + "grad_norm": 0.6968871355056763, + "learning_rate": 5.504860267314702e-05, + "loss": 1.4713, + "step": 7409 + }, + { + "epoch": 2.249203217483685, + "grad_norm": 0.8094422817230225, + "learning_rate": 5.504252733900365e-05, + "loss": 1.2643, + "step": 7410 + }, + { + "epoch": 2.2495067536803766, + "grad_norm": 0.675478458404541, + "learning_rate": 5.503645200486027e-05, + "loss": 1.2157, + "step": 7411 + }, + { + "epoch": 2.2498102898770678, + "grad_norm": 0.6240454912185669, + "learning_rate": 5.503037667071689e-05, + "loss": 1.5961, + "step": 7412 + }, + { + "epoch": 2.2501138260737594, + "grad_norm": 0.8300707936286926, + "learning_rate": 5.502430133657351e-05, + "loss": 1.1565, + "step": 7413 + }, + { + "epoch": 2.2504173622704506, + "grad_norm": 0.803261935710907, + "learning_rate": 5.501822600243014e-05, + "loss": 1.3636, + "step": 7414 + }, + { + "epoch": 2.2507208984671423, + "grad_norm": 0.5902475714683533, + "learning_rate": 5.5012150668286754e-05, + "loss": 1.3799, + "step": 7415 + }, + { + "epoch": 2.2510244346638335, + "grad_norm": 0.7087440490722656, + "learning_rate": 5.500607533414338e-05, + "loss": 1.2938, + "step": 7416 + }, + { + "epoch": 2.251327970860525, + "grad_norm": 0.6584330201148987, + "learning_rate": 5.500000000000001e-05, + "loss": 1.5664, + "step": 7417 + }, + { + "epoch": 2.2516315070572164, + "grad_norm": 0.7798967957496643, + "learning_rate": 5.499392466585662e-05, + "loss": 1.4939, + "step": 7418 + }, + { + "epoch": 2.251935043253908, + "grad_norm": 0.8013840913772583, + "learning_rate": 5.498784933171325e-05, + "loss": 1.4353, + "step": 7419 + }, + { + "epoch": 2.2522385794505997, + "grad_norm": 0.8025866150856018, + "learning_rate": 5.4981773997569874e-05, + "loss": 1.3093, + "step": 7420 + }, + { + "epoch": 2.252542115647291, + "grad_norm": 0.826526939868927, + "learning_rate": 5.497569866342649e-05, + "loss": 1.3398, + "step": 7421 + }, + { + "epoch": 2.2528456518439826, + "grad_norm": 1.1023015975952148, + "learning_rate": 5.4969623329283116e-05, + "loss": 1.4203, + "step": 7422 + }, + { + "epoch": 2.2531491880406738, + "grad_norm": 0.8900132775306702, + "learning_rate": 5.496354799513973e-05, + "loss": 1.551, + "step": 7423 + }, + { + "epoch": 2.2534527242373654, + "grad_norm": 0.7980222105979919, + "learning_rate": 5.495747266099636e-05, + "loss": 0.9455, + "step": 7424 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.88653165102005, + "learning_rate": 5.495139732685298e-05, + "loss": 1.3634, + "step": 7425 + }, + { + "epoch": 2.2540597966307483, + "grad_norm": 0.7236202955245972, + "learning_rate": 5.49453219927096e-05, + "loss": 0.6963, + "step": 7426 + }, + { + "epoch": 2.2543633328274395, + "grad_norm": 0.6392729878425598, + "learning_rate": 5.493924665856622e-05, + "loss": 1.1514, + "step": 7427 + }, + { + "epoch": 2.254666869024131, + "grad_norm": 0.738734781742096, + "learning_rate": 5.4933171324422847e-05, + "loss": 1.4334, + "step": 7428 + }, + { + "epoch": 2.2549704052208224, + "grad_norm": 0.7367070317268372, + "learning_rate": 5.4927095990279464e-05, + "loss": 1.343, + "step": 7429 + }, + { + "epoch": 2.255273941417514, + "grad_norm": 0.7800174355506897, + "learning_rate": 5.492102065613609e-05, + "loss": 1.2698, + "step": 7430 + }, + { + "epoch": 2.2555774776142057, + "grad_norm": 0.6558547616004944, + "learning_rate": 5.491494532199272e-05, + "loss": 1.4547, + "step": 7431 + }, + { + "epoch": 2.255881013810897, + "grad_norm": 0.6702367067337036, + "learning_rate": 5.490886998784933e-05, + "loss": 1.3955, + "step": 7432 + }, + { + "epoch": 2.2561845500075886, + "grad_norm": 0.6517998576164246, + "learning_rate": 5.490279465370596e-05, + "loss": 1.3565, + "step": 7433 + }, + { + "epoch": 2.2564880862042798, + "grad_norm": 0.7218438386917114, + "learning_rate": 5.4896719319562584e-05, + "loss": 0.7599, + "step": 7434 + }, + { + "epoch": 2.2567916224009714, + "grad_norm": 0.7125388979911804, + "learning_rate": 5.48906439854192e-05, + "loss": 1.5283, + "step": 7435 + }, + { + "epoch": 2.2570951585976626, + "grad_norm": 0.5857595801353455, + "learning_rate": 5.4884568651275826e-05, + "loss": 0.6817, + "step": 7436 + }, + { + "epoch": 2.2573986947943543, + "grad_norm": 0.7023595571517944, + "learning_rate": 5.487849331713244e-05, + "loss": 1.5239, + "step": 7437 + }, + { + "epoch": 2.2577022309910455, + "grad_norm": 0.8885743618011475, + "learning_rate": 5.487241798298907e-05, + "loss": 1.4502, + "step": 7438 + }, + { + "epoch": 2.258005767187737, + "grad_norm": 0.8196373581886292, + "learning_rate": 5.486634264884569e-05, + "loss": 1.2389, + "step": 7439 + }, + { + "epoch": 2.2583093033844284, + "grad_norm": 1.0717699527740479, + "learning_rate": 5.486026731470231e-05, + "loss": 1.2988, + "step": 7440 + }, + { + "epoch": 2.25861283958112, + "grad_norm": 0.5756183862686157, + "learning_rate": 5.485419198055893e-05, + "loss": 1.4591, + "step": 7441 + }, + { + "epoch": 2.2589163757778117, + "grad_norm": 0.8124812245368958, + "learning_rate": 5.4848116646415557e-05, + "loss": 1.1953, + "step": 7442 + }, + { + "epoch": 2.259219911974503, + "grad_norm": 0.7616799473762512, + "learning_rate": 5.4842041312272174e-05, + "loss": 1.0254, + "step": 7443 + }, + { + "epoch": 2.2595234481711945, + "grad_norm": 0.6586336493492126, + "learning_rate": 5.48359659781288e-05, + "loss": 0.9613, + "step": 7444 + }, + { + "epoch": 2.2598269843678858, + "grad_norm": 0.7181385159492493, + "learning_rate": 5.482989064398543e-05, + "loss": 1.4333, + "step": 7445 + }, + { + "epoch": 2.2601305205645774, + "grad_norm": 0.5932475328445435, + "learning_rate": 5.482381530984204e-05, + "loss": 1.1878, + "step": 7446 + }, + { + "epoch": 2.2604340567612686, + "grad_norm": 0.6791607737541199, + "learning_rate": 5.481773997569867e-05, + "loss": 1.1296, + "step": 7447 + }, + { + "epoch": 2.2607375929579603, + "grad_norm": 0.4580678641796112, + "learning_rate": 5.4811664641555294e-05, + "loss": 0.6515, + "step": 7448 + }, + { + "epoch": 2.261041129154652, + "grad_norm": 0.8818070292472839, + "learning_rate": 5.480558930741191e-05, + "loss": 1.2604, + "step": 7449 + }, + { + "epoch": 2.261344665351343, + "grad_norm": 0.8981488347053528, + "learning_rate": 5.4799513973268536e-05, + "loss": 1.2392, + "step": 7450 + }, + { + "epoch": 2.2616482015480344, + "grad_norm": 0.7887245416641235, + "learning_rate": 5.479343863912515e-05, + "loss": 0.9956, + "step": 7451 + }, + { + "epoch": 2.261951737744726, + "grad_norm": 0.6639705896377563, + "learning_rate": 5.478736330498178e-05, + "loss": 1.3412, + "step": 7452 + }, + { + "epoch": 2.2622552739414177, + "grad_norm": 0.7820801138877869, + "learning_rate": 5.47812879708384e-05, + "loss": 1.6015, + "step": 7453 + }, + { + "epoch": 2.262558810138109, + "grad_norm": 0.567541778087616, + "learning_rate": 5.477521263669502e-05, + "loss": 1.3132, + "step": 7454 + }, + { + "epoch": 2.2628623463348005, + "grad_norm": 0.7936879396438599, + "learning_rate": 5.476913730255164e-05, + "loss": 0.9349, + "step": 7455 + }, + { + "epoch": 2.2631658825314918, + "grad_norm": 0.6322331428527832, + "learning_rate": 5.476306196840827e-05, + "loss": 1.3739, + "step": 7456 + }, + { + "epoch": 2.2634694187281834, + "grad_norm": 0.7208223938941956, + "learning_rate": 5.4756986634264884e-05, + "loss": 1.504, + "step": 7457 + }, + { + "epoch": 2.2637729549248746, + "grad_norm": 0.6773326396942139, + "learning_rate": 5.475091130012151e-05, + "loss": 1.22, + "step": 7458 + }, + { + "epoch": 2.2640764911215663, + "grad_norm": 0.6207496523857117, + "learning_rate": 5.474483596597814e-05, + "loss": 1.4626, + "step": 7459 + }, + { + "epoch": 2.264380027318258, + "grad_norm": 0.7778047323226929, + "learning_rate": 5.473876063183475e-05, + "loss": 0.9966, + "step": 7460 + }, + { + "epoch": 2.264683563514949, + "grad_norm": 0.762438952922821, + "learning_rate": 5.473268529769138e-05, + "loss": 1.2628, + "step": 7461 + }, + { + "epoch": 2.264987099711641, + "grad_norm": 0.8586466908454895, + "learning_rate": 5.472660996354799e-05, + "loss": 0.9212, + "step": 7462 + }, + { + "epoch": 2.265290635908332, + "grad_norm": 0.6765137314796448, + "learning_rate": 5.472053462940462e-05, + "loss": 1.1541, + "step": 7463 + }, + { + "epoch": 2.2655941721050237, + "grad_norm": 0.6344521641731262, + "learning_rate": 5.4714459295261246e-05, + "loss": 1.8874, + "step": 7464 + }, + { + "epoch": 2.265897708301715, + "grad_norm": 0.6776248812675476, + "learning_rate": 5.4708383961117856e-05, + "loss": 1.6253, + "step": 7465 + }, + { + "epoch": 2.2662012444984065, + "grad_norm": 0.8977546691894531, + "learning_rate": 5.470230862697449e-05, + "loss": 0.8644, + "step": 7466 + }, + { + "epoch": 2.2665047806950978, + "grad_norm": 0.5835941433906555, + "learning_rate": 5.469623329283111e-05, + "loss": 1.3991, + "step": 7467 + }, + { + "epoch": 2.2668083168917894, + "grad_norm": 0.6627216935157776, + "learning_rate": 5.469015795868773e-05, + "loss": 1.4281, + "step": 7468 + }, + { + "epoch": 2.2671118530884806, + "grad_norm": 0.7152976989746094, + "learning_rate": 5.468408262454435e-05, + "loss": 1.295, + "step": 7469 + }, + { + "epoch": 2.2674153892851723, + "grad_norm": 0.7702943086624146, + "learning_rate": 5.467800729040098e-05, + "loss": 1.4717, + "step": 7470 + }, + { + "epoch": 2.267718925481864, + "grad_norm": 0.5449094772338867, + "learning_rate": 5.4671931956257594e-05, + "loss": 1.11, + "step": 7471 + }, + { + "epoch": 2.268022461678555, + "grad_norm": 0.6027811765670776, + "learning_rate": 5.466585662211422e-05, + "loss": 1.0541, + "step": 7472 + }, + { + "epoch": 2.268325997875247, + "grad_norm": 0.7328689098358154, + "learning_rate": 5.465978128797085e-05, + "loss": 1.4477, + "step": 7473 + }, + { + "epoch": 2.268629534071938, + "grad_norm": 0.6849613189697266, + "learning_rate": 5.465370595382746e-05, + "loss": 0.9747, + "step": 7474 + }, + { + "epoch": 2.2689330702686297, + "grad_norm": 0.8007455468177795, + "learning_rate": 5.464763061968409e-05, + "loss": 1.4307, + "step": 7475 + }, + { + "epoch": 2.269236606465321, + "grad_norm": 0.7696539759635925, + "learning_rate": 5.46415552855407e-05, + "loss": 1.3663, + "step": 7476 + }, + { + "epoch": 2.2695401426620125, + "grad_norm": 0.9261662364006042, + "learning_rate": 5.4635479951397325e-05, + "loss": 1.4261, + "step": 7477 + }, + { + "epoch": 2.2698436788587038, + "grad_norm": 0.6188323497772217, + "learning_rate": 5.4629404617253956e-05, + "loss": 1.0909, + "step": 7478 + }, + { + "epoch": 2.2701472150553954, + "grad_norm": 0.8149519562721252, + "learning_rate": 5.4623329283110566e-05, + "loss": 0.647, + "step": 7479 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.8626388907432556, + "learning_rate": 5.46172539489672e-05, + "loss": 1.2524, + "step": 7480 + }, + { + "epoch": 2.2707542874487783, + "grad_norm": 0.7279552817344666, + "learning_rate": 5.461117861482382e-05, + "loss": 1.2493, + "step": 7481 + }, + { + "epoch": 2.27105782364547, + "grad_norm": 0.7417263984680176, + "learning_rate": 5.460510328068044e-05, + "loss": 1.0718, + "step": 7482 + }, + { + "epoch": 2.271361359842161, + "grad_norm": 0.7569875121116638, + "learning_rate": 5.459902794653706e-05, + "loss": 1.4112, + "step": 7483 + }, + { + "epoch": 2.271664896038853, + "grad_norm": 0.6595586538314819, + "learning_rate": 5.459295261239369e-05, + "loss": 1.4326, + "step": 7484 + }, + { + "epoch": 2.271968432235544, + "grad_norm": 0.5473843216896057, + "learning_rate": 5.4586877278250304e-05, + "loss": 1.6897, + "step": 7485 + }, + { + "epoch": 2.2722719684322357, + "grad_norm": 0.7562367916107178, + "learning_rate": 5.458080194410693e-05, + "loss": 1.5851, + "step": 7486 + }, + { + "epoch": 2.272575504628927, + "grad_norm": 0.6894574761390686, + "learning_rate": 5.457472660996356e-05, + "loss": 1.1538, + "step": 7487 + }, + { + "epoch": 2.2728790408256185, + "grad_norm": 0.9435144662857056, + "learning_rate": 5.456865127582017e-05, + "loss": 1.3727, + "step": 7488 + }, + { + "epoch": 2.2731825770223097, + "grad_norm": 0.766242504119873, + "learning_rate": 5.4562575941676794e-05, + "loss": 1.064, + "step": 7489 + }, + { + "epoch": 2.2734861132190014, + "grad_norm": 0.6115533113479614, + "learning_rate": 5.455650060753341e-05, + "loss": 0.8242, + "step": 7490 + }, + { + "epoch": 2.2737896494156926, + "grad_norm": 0.5504389405250549, + "learning_rate": 5.4550425273390035e-05, + "loss": 0.9371, + "step": 7491 + }, + { + "epoch": 2.2740931856123843, + "grad_norm": 1.0381739139556885, + "learning_rate": 5.4544349939246666e-05, + "loss": 1.1618, + "step": 7492 + }, + { + "epoch": 2.274396721809076, + "grad_norm": 0.9171277284622192, + "learning_rate": 5.4538274605103276e-05, + "loss": 1.2575, + "step": 7493 + }, + { + "epoch": 2.274700258005767, + "grad_norm": 0.9959700107574463, + "learning_rate": 5.453219927095991e-05, + "loss": 1.3317, + "step": 7494 + }, + { + "epoch": 2.275003794202459, + "grad_norm": 0.6824067831039429, + "learning_rate": 5.452612393681653e-05, + "loss": 1.2123, + "step": 7495 + }, + { + "epoch": 2.27530733039915, + "grad_norm": 0.6326625347137451, + "learning_rate": 5.452004860267315e-05, + "loss": 1.6067, + "step": 7496 + }, + { + "epoch": 2.2756108665958417, + "grad_norm": 0.7540733814239502, + "learning_rate": 5.451397326852977e-05, + "loss": 1.3346, + "step": 7497 + }, + { + "epoch": 2.275914402792533, + "grad_norm": 0.643101692199707, + "learning_rate": 5.45078979343864e-05, + "loss": 0.8042, + "step": 7498 + }, + { + "epoch": 2.2762179389892245, + "grad_norm": 0.892805278301239, + "learning_rate": 5.4501822600243014e-05, + "loss": 0.8651, + "step": 7499 + }, + { + "epoch": 2.2765214751859157, + "grad_norm": 0.8604877591133118, + "learning_rate": 5.449574726609964e-05, + "loss": 1.4444, + "step": 7500 + }, + { + "epoch": 2.2768250113826074, + "grad_norm": 0.6573825478553772, + "learning_rate": 5.448967193195626e-05, + "loss": 1.2915, + "step": 7501 + }, + { + "epoch": 2.2771285475792986, + "grad_norm": 0.7212510108947754, + "learning_rate": 5.448359659781288e-05, + "loss": 0.9891, + "step": 7502 + }, + { + "epoch": 2.2774320837759903, + "grad_norm": 0.7919306755065918, + "learning_rate": 5.4477521263669504e-05, + "loss": 1.2099, + "step": 7503 + }, + { + "epoch": 2.277735619972682, + "grad_norm": 0.6194910407066345, + "learning_rate": 5.447144592952612e-05, + "loss": 1.5992, + "step": 7504 + }, + { + "epoch": 2.278039156169373, + "grad_norm": 0.8497462868690491, + "learning_rate": 5.4465370595382745e-05, + "loss": 1.2713, + "step": 7505 + }, + { + "epoch": 2.278342692366065, + "grad_norm": 0.747949481010437, + "learning_rate": 5.4459295261239376e-05, + "loss": 1.0268, + "step": 7506 + }, + { + "epoch": 2.278646228562756, + "grad_norm": 0.705798864364624, + "learning_rate": 5.4453219927095986e-05, + "loss": 1.3496, + "step": 7507 + }, + { + "epoch": 2.2789497647594477, + "grad_norm": 0.7338413000106812, + "learning_rate": 5.444714459295262e-05, + "loss": 1.2911, + "step": 7508 + }, + { + "epoch": 2.279253300956139, + "grad_norm": 0.8935163021087646, + "learning_rate": 5.444106925880924e-05, + "loss": 1.2852, + "step": 7509 + }, + { + "epoch": 2.2795568371528305, + "grad_norm": 0.7204432487487793, + "learning_rate": 5.443499392466586e-05, + "loss": 1.2556, + "step": 7510 + }, + { + "epoch": 2.2798603733495217, + "grad_norm": 0.8553929924964905, + "learning_rate": 5.442891859052248e-05, + "loss": 1.056, + "step": 7511 + }, + { + "epoch": 2.2801639095462134, + "grad_norm": 0.9489281177520752, + "learning_rate": 5.442284325637911e-05, + "loss": 1.4451, + "step": 7512 + }, + { + "epoch": 2.2804674457429046, + "grad_norm": 0.8464581966400146, + "learning_rate": 5.4416767922235724e-05, + "loss": 1.0983, + "step": 7513 + }, + { + "epoch": 2.2807709819395963, + "grad_norm": 0.8345524668693542, + "learning_rate": 5.441069258809235e-05, + "loss": 1.5898, + "step": 7514 + }, + { + "epoch": 2.281074518136288, + "grad_norm": 0.9463378190994263, + "learning_rate": 5.440461725394897e-05, + "loss": 1.1437, + "step": 7515 + }, + { + "epoch": 2.281378054332979, + "grad_norm": 0.799299955368042, + "learning_rate": 5.439854191980559e-05, + "loss": 1.3918, + "step": 7516 + }, + { + "epoch": 2.281681590529671, + "grad_norm": 0.8265707492828369, + "learning_rate": 5.4392466585662214e-05, + "loss": 1.3374, + "step": 7517 + }, + { + "epoch": 2.281985126726362, + "grad_norm": 0.6352429986000061, + "learning_rate": 5.438639125151883e-05, + "loss": 0.7019, + "step": 7518 + }, + { + "epoch": 2.2822886629230537, + "grad_norm": 0.8135592937469482, + "learning_rate": 5.4380315917375455e-05, + "loss": 1.278, + "step": 7519 + }, + { + "epoch": 2.282592199119745, + "grad_norm": 0.7900860905647278, + "learning_rate": 5.4374240583232086e-05, + "loss": 1.128, + "step": 7520 + }, + { + "epoch": 2.2828957353164365, + "grad_norm": 0.8089602589607239, + "learning_rate": 5.4368165249088696e-05, + "loss": 1.3828, + "step": 7521 + }, + { + "epoch": 2.283199271513128, + "grad_norm": 0.8169485330581665, + "learning_rate": 5.436208991494533e-05, + "loss": 1.5373, + "step": 7522 + }, + { + "epoch": 2.2835028077098194, + "grad_norm": 0.7280807495117188, + "learning_rate": 5.435601458080195e-05, + "loss": 1.485, + "step": 7523 + }, + { + "epoch": 2.2838063439065106, + "grad_norm": 0.7815548181533813, + "learning_rate": 5.434993924665857e-05, + "loss": 1.4671, + "step": 7524 + }, + { + "epoch": 2.2841098801032023, + "grad_norm": 0.6063195466995239, + "learning_rate": 5.434386391251519e-05, + "loss": 1.779, + "step": 7525 + }, + { + "epoch": 2.284413416299894, + "grad_norm": 0.8827876448631287, + "learning_rate": 5.433778857837182e-05, + "loss": 1.4556, + "step": 7526 + }, + { + "epoch": 2.284716952496585, + "grad_norm": 0.7507184743881226, + "learning_rate": 5.4331713244228434e-05, + "loss": 1.0536, + "step": 7527 + }, + { + "epoch": 2.285020488693277, + "grad_norm": 0.820326566696167, + "learning_rate": 5.432563791008506e-05, + "loss": 1.5834, + "step": 7528 + }, + { + "epoch": 2.285324024889968, + "grad_norm": 0.7697584629058838, + "learning_rate": 5.431956257594168e-05, + "loss": 0.9109, + "step": 7529 + }, + { + "epoch": 2.2856275610866597, + "grad_norm": 0.5971083045005798, + "learning_rate": 5.43134872417983e-05, + "loss": 1.093, + "step": 7530 + }, + { + "epoch": 2.285931097283351, + "grad_norm": 0.7807541489601135, + "learning_rate": 5.4307411907654924e-05, + "loss": 1.1812, + "step": 7531 + }, + { + "epoch": 2.2862346334800425, + "grad_norm": 0.6648563146591187, + "learning_rate": 5.430133657351154e-05, + "loss": 1.3252, + "step": 7532 + }, + { + "epoch": 2.286538169676734, + "grad_norm": 0.842267632484436, + "learning_rate": 5.4295261239368165e-05, + "loss": 1.0992, + "step": 7533 + }, + { + "epoch": 2.2868417058734254, + "grad_norm": 0.7376732230186462, + "learning_rate": 5.4289185905224796e-05, + "loss": 1.4368, + "step": 7534 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.6069490313529968, + "learning_rate": 5.4283110571081407e-05, + "loss": 1.8016, + "step": 7535 + }, + { + "epoch": 2.2874487782668083, + "grad_norm": 0.6976711750030518, + "learning_rate": 5.427703523693804e-05, + "loss": 1.0021, + "step": 7536 + }, + { + "epoch": 2.2877523144635, + "grad_norm": 0.6691964864730835, + "learning_rate": 5.427095990279466e-05, + "loss": 1.3759, + "step": 7537 + }, + { + "epoch": 2.288055850660191, + "grad_norm": 0.7515538930892944, + "learning_rate": 5.426488456865127e-05, + "loss": 1.3081, + "step": 7538 + }, + { + "epoch": 2.288359386856883, + "grad_norm": 0.8624359965324402, + "learning_rate": 5.42588092345079e-05, + "loss": 1.0082, + "step": 7539 + }, + { + "epoch": 2.288662923053574, + "grad_norm": 0.7907547950744629, + "learning_rate": 5.425273390036453e-05, + "loss": 0.8, + "step": 7540 + }, + { + "epoch": 2.2889664592502657, + "grad_norm": 0.7774684429168701, + "learning_rate": 5.4246658566221144e-05, + "loss": 1.2111, + "step": 7541 + }, + { + "epoch": 2.289269995446957, + "grad_norm": 0.8284247517585754, + "learning_rate": 5.424058323207777e-05, + "loss": 1.5555, + "step": 7542 + }, + { + "epoch": 2.2895735316436485, + "grad_norm": 0.7162729501724243, + "learning_rate": 5.4234507897934386e-05, + "loss": 1.1438, + "step": 7543 + }, + { + "epoch": 2.28987706784034, + "grad_norm": 0.7665915489196777, + "learning_rate": 5.422843256379101e-05, + "loss": 1.4582, + "step": 7544 + }, + { + "epoch": 2.2901806040370314, + "grad_norm": 0.734765350818634, + "learning_rate": 5.4222357229647634e-05, + "loss": 1.2054, + "step": 7545 + }, + { + "epoch": 2.290484140233723, + "grad_norm": 0.8265764117240906, + "learning_rate": 5.421628189550425e-05, + "loss": 1.712, + "step": 7546 + }, + { + "epoch": 2.2907876764304143, + "grad_norm": 0.7417080998420715, + "learning_rate": 5.4210206561360875e-05, + "loss": 1.2468, + "step": 7547 + }, + { + "epoch": 2.291091212627106, + "grad_norm": 0.7113915681838989, + "learning_rate": 5.4204131227217506e-05, + "loss": 1.3884, + "step": 7548 + }, + { + "epoch": 2.291394748823797, + "grad_norm": 0.7582618594169617, + "learning_rate": 5.4198055893074117e-05, + "loss": 1.454, + "step": 7549 + }, + { + "epoch": 2.291698285020489, + "grad_norm": 0.8136988282203674, + "learning_rate": 5.419198055893074e-05, + "loss": 1.3694, + "step": 7550 + }, + { + "epoch": 2.29200182121718, + "grad_norm": 0.7433492541313171, + "learning_rate": 5.418590522478737e-05, + "loss": 1.5307, + "step": 7551 + }, + { + "epoch": 2.2923053574138716, + "grad_norm": 0.7587953209877014, + "learning_rate": 5.417982989064398e-05, + "loss": 1.3316, + "step": 7552 + }, + { + "epoch": 2.292608893610563, + "grad_norm": 0.7869864106178284, + "learning_rate": 5.417375455650061e-05, + "loss": 1.2841, + "step": 7553 + }, + { + "epoch": 2.2929124298072545, + "grad_norm": 1.0088813304901123, + "learning_rate": 5.416767922235724e-05, + "loss": 1.1967, + "step": 7554 + }, + { + "epoch": 2.293215966003946, + "grad_norm": 0.7056594491004944, + "learning_rate": 5.4161603888213854e-05, + "loss": 1.372, + "step": 7555 + }, + { + "epoch": 2.2935195022006374, + "grad_norm": 0.7692909836769104, + "learning_rate": 5.415552855407048e-05, + "loss": 0.7488, + "step": 7556 + }, + { + "epoch": 2.293823038397329, + "grad_norm": 0.8228776454925537, + "learning_rate": 5.4149453219927096e-05, + "loss": 1.4965, + "step": 7557 + }, + { + "epoch": 2.2941265745940203, + "grad_norm": 0.869111955165863, + "learning_rate": 5.414337788578372e-05, + "loss": 1.4706, + "step": 7558 + }, + { + "epoch": 2.294430110790712, + "grad_norm": 0.806982696056366, + "learning_rate": 5.4137302551640344e-05, + "loss": 1.4263, + "step": 7559 + }, + { + "epoch": 2.294733646987403, + "grad_norm": 0.6825501918792725, + "learning_rate": 5.413122721749696e-05, + "loss": 1.2769, + "step": 7560 + }, + { + "epoch": 2.2950371831840948, + "grad_norm": 0.7721306085586548, + "learning_rate": 5.4125151883353585e-05, + "loss": 1.2624, + "step": 7561 + }, + { + "epoch": 2.295340719380786, + "grad_norm": 0.654353678226471, + "learning_rate": 5.411907654921021e-05, + "loss": 0.9266, + "step": 7562 + }, + { + "epoch": 2.2956442555774776, + "grad_norm": 0.7125826478004456, + "learning_rate": 5.411300121506683e-05, + "loss": 1.1077, + "step": 7563 + }, + { + "epoch": 2.295947791774169, + "grad_norm": 0.655035138130188, + "learning_rate": 5.410692588092345e-05, + "loss": 1.1465, + "step": 7564 + }, + { + "epoch": 2.2962513279708605, + "grad_norm": 0.4681672751903534, + "learning_rate": 5.410085054678008e-05, + "loss": 1.6199, + "step": 7565 + }, + { + "epoch": 2.296554864167552, + "grad_norm": 0.6791568994522095, + "learning_rate": 5.409477521263669e-05, + "loss": 1.1105, + "step": 7566 + }, + { + "epoch": 2.2968584003642434, + "grad_norm": 0.7070233821868896, + "learning_rate": 5.408869987849332e-05, + "loss": 1.3733, + "step": 7567 + }, + { + "epoch": 2.297161936560935, + "grad_norm": 0.6516934633255005, + "learning_rate": 5.408262454434995e-05, + "loss": 1.4664, + "step": 7568 + }, + { + "epoch": 2.2974654727576262, + "grad_norm": 0.8514277935028076, + "learning_rate": 5.4076549210206564e-05, + "loss": 1.3357, + "step": 7569 + }, + { + "epoch": 2.297769008954318, + "grad_norm": 0.6793623566627502, + "learning_rate": 5.407047387606319e-05, + "loss": 1.6255, + "step": 7570 + }, + { + "epoch": 2.298072545151009, + "grad_norm": 0.5945488810539246, + "learning_rate": 5.4064398541919806e-05, + "loss": 1.3195, + "step": 7571 + }, + { + "epoch": 2.2983760813477008, + "grad_norm": 0.7190368175506592, + "learning_rate": 5.405832320777643e-05, + "loss": 1.3829, + "step": 7572 + }, + { + "epoch": 2.298679617544392, + "grad_norm": 0.8405774831771851, + "learning_rate": 5.4052247873633054e-05, + "loss": 1.5689, + "step": 7573 + }, + { + "epoch": 2.2989831537410836, + "grad_norm": 0.725680410861969, + "learning_rate": 5.404617253948967e-05, + "loss": 0.8484, + "step": 7574 + }, + { + "epoch": 2.299286689937775, + "grad_norm": 0.8415581583976746, + "learning_rate": 5.4040097205346295e-05, + "loss": 1.2999, + "step": 7575 + }, + { + "epoch": 2.2995902261344665, + "grad_norm": 0.580029308795929, + "learning_rate": 5.403402187120292e-05, + "loss": 1.7616, + "step": 7576 + }, + { + "epoch": 2.299893762331158, + "grad_norm": 0.7359707355499268, + "learning_rate": 5.402794653705954e-05, + "loss": 1.1516, + "step": 7577 + }, + { + "epoch": 2.3001972985278494, + "grad_norm": 0.7400234937667847, + "learning_rate": 5.402187120291616e-05, + "loss": 1.1951, + "step": 7578 + }, + { + "epoch": 2.300500834724541, + "grad_norm": 0.7638850212097168, + "learning_rate": 5.401579586877279e-05, + "loss": 1.6116, + "step": 7579 + }, + { + "epoch": 2.3008043709212322, + "grad_norm": 0.8924289345741272, + "learning_rate": 5.40097205346294e-05, + "loss": 1.4147, + "step": 7580 + }, + { + "epoch": 2.301107907117924, + "grad_norm": 0.8381494283676147, + "learning_rate": 5.400364520048603e-05, + "loss": 1.1766, + "step": 7581 + }, + { + "epoch": 2.301411443314615, + "grad_norm": 0.8965625762939453, + "learning_rate": 5.399756986634266e-05, + "loss": 1.4292, + "step": 7582 + }, + { + "epoch": 2.3017149795113068, + "grad_norm": 0.8396190404891968, + "learning_rate": 5.3991494532199274e-05, + "loss": 1.2678, + "step": 7583 + }, + { + "epoch": 2.3020185157079984, + "grad_norm": 0.6765458583831787, + "learning_rate": 5.39854191980559e-05, + "loss": 1.4473, + "step": 7584 + }, + { + "epoch": 2.3023220519046896, + "grad_norm": 0.8363358974456787, + "learning_rate": 5.3979343863912516e-05, + "loss": 1.3189, + "step": 7585 + }, + { + "epoch": 2.302625588101381, + "grad_norm": 0.6315335631370544, + "learning_rate": 5.397326852976914e-05, + "loss": 1.3433, + "step": 7586 + }, + { + "epoch": 2.3029291242980725, + "grad_norm": 1.3354105949401855, + "learning_rate": 5.3967193195625764e-05, + "loss": 0.9348, + "step": 7587 + }, + { + "epoch": 2.303232660494764, + "grad_norm": 0.7214506268501282, + "learning_rate": 5.396111786148238e-05, + "loss": 1.1078, + "step": 7588 + }, + { + "epoch": 2.3035361966914554, + "grad_norm": 0.9364904761314392, + "learning_rate": 5.3955042527339005e-05, + "loss": 1.117, + "step": 7589 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.8593325018882751, + "learning_rate": 5.394896719319563e-05, + "loss": 1.1249, + "step": 7590 + }, + { + "epoch": 2.3041432690848382, + "grad_norm": 0.7999909520149231, + "learning_rate": 5.394289185905225e-05, + "loss": 1.3141, + "step": 7591 + }, + { + "epoch": 2.30444680528153, + "grad_norm": 0.6782107949256897, + "learning_rate": 5.393681652490887e-05, + "loss": 1.2862, + "step": 7592 + }, + { + "epoch": 2.304750341478221, + "grad_norm": 0.703163743019104, + "learning_rate": 5.39307411907655e-05, + "loss": 1.4451, + "step": 7593 + }, + { + "epoch": 2.3050538776749128, + "grad_norm": 0.7271427512168884, + "learning_rate": 5.392466585662211e-05, + "loss": 1.2476, + "step": 7594 + }, + { + "epoch": 2.3053574138716044, + "grad_norm": 0.7937582731246948, + "learning_rate": 5.391859052247874e-05, + "loss": 1.4786, + "step": 7595 + }, + { + "epoch": 2.3056609500682956, + "grad_norm": 0.6775838136672974, + "learning_rate": 5.391251518833537e-05, + "loss": 1.7379, + "step": 7596 + }, + { + "epoch": 2.3059644862649873, + "grad_norm": 0.7014533281326294, + "learning_rate": 5.3906439854191984e-05, + "loss": 1.5345, + "step": 7597 + }, + { + "epoch": 2.3062680224616785, + "grad_norm": 0.6490854620933533, + "learning_rate": 5.390036452004861e-05, + "loss": 1.338, + "step": 7598 + }, + { + "epoch": 2.30657155865837, + "grad_norm": 1.0077564716339111, + "learning_rate": 5.389428918590522e-05, + "loss": 1.2281, + "step": 7599 + }, + { + "epoch": 2.3068750948550614, + "grad_norm": 0.850725531578064, + "learning_rate": 5.388821385176185e-05, + "loss": 1.3029, + "step": 7600 + }, + { + "epoch": 2.307178631051753, + "grad_norm": 0.7772053480148315, + "learning_rate": 5.3882138517618474e-05, + "loss": 1.4725, + "step": 7601 + }, + { + "epoch": 2.3074821672484442, + "grad_norm": 0.681745707988739, + "learning_rate": 5.387606318347509e-05, + "loss": 0.8412, + "step": 7602 + }, + { + "epoch": 2.307785703445136, + "grad_norm": 0.8142476081848145, + "learning_rate": 5.3869987849331715e-05, + "loss": 1.5885, + "step": 7603 + }, + { + "epoch": 2.308089239641827, + "grad_norm": 0.8766453862190247, + "learning_rate": 5.386391251518834e-05, + "loss": 1.2459, + "step": 7604 + }, + { + "epoch": 2.3083927758385188, + "grad_norm": 0.9796348810195923, + "learning_rate": 5.385783718104496e-05, + "loss": 1.5479, + "step": 7605 + }, + { + "epoch": 2.3086963120352104, + "grad_norm": 0.6790328025817871, + "learning_rate": 5.385176184690158e-05, + "loss": 1.4504, + "step": 7606 + }, + { + "epoch": 2.3089998482319016, + "grad_norm": 0.7625194191932678, + "learning_rate": 5.384568651275821e-05, + "loss": 1.3028, + "step": 7607 + }, + { + "epoch": 2.3093033844285933, + "grad_norm": 0.7844623923301697, + "learning_rate": 5.383961117861482e-05, + "loss": 1.0466, + "step": 7608 + }, + { + "epoch": 2.3096069206252845, + "grad_norm": 0.8220598697662354, + "learning_rate": 5.383353584447145e-05, + "loss": 1.3591, + "step": 7609 + }, + { + "epoch": 2.309910456821976, + "grad_norm": 0.7971135377883911, + "learning_rate": 5.382746051032808e-05, + "loss": 1.3741, + "step": 7610 + }, + { + "epoch": 2.3102139930186674, + "grad_norm": 0.743306040763855, + "learning_rate": 5.382138517618469e-05, + "loss": 1.0722, + "step": 7611 + }, + { + "epoch": 2.310517529215359, + "grad_norm": 0.7106120586395264, + "learning_rate": 5.381530984204132e-05, + "loss": 1.632, + "step": 7612 + }, + { + "epoch": 2.3108210654120502, + "grad_norm": 0.7360802888870239, + "learning_rate": 5.380923450789793e-05, + "loss": 1.3646, + "step": 7613 + }, + { + "epoch": 2.311124601608742, + "grad_norm": 0.7011423707008362, + "learning_rate": 5.380315917375456e-05, + "loss": 1.4146, + "step": 7614 + }, + { + "epoch": 2.311428137805433, + "grad_norm": 0.7464067339897156, + "learning_rate": 5.3797083839611184e-05, + "loss": 1.4509, + "step": 7615 + }, + { + "epoch": 2.3117316740021248, + "grad_norm": 0.7687321901321411, + "learning_rate": 5.37910085054678e-05, + "loss": 1.3009, + "step": 7616 + }, + { + "epoch": 2.3120352101988164, + "grad_norm": 0.7935065627098083, + "learning_rate": 5.3784933171324425e-05, + "loss": 0.9285, + "step": 7617 + }, + { + "epoch": 2.3123387463955076, + "grad_norm": 0.7177073955535889, + "learning_rate": 5.377885783718105e-05, + "loss": 1.4441, + "step": 7618 + }, + { + "epoch": 2.3126422825921993, + "grad_norm": 0.7047598361968994, + "learning_rate": 5.377278250303767e-05, + "loss": 1.5208, + "step": 7619 + }, + { + "epoch": 2.3129458187888905, + "grad_norm": 0.6508432030677795, + "learning_rate": 5.376670716889429e-05, + "loss": 1.5971, + "step": 7620 + }, + { + "epoch": 2.313249354985582, + "grad_norm": 0.5820059776306152, + "learning_rate": 5.376063183475092e-05, + "loss": 1.0914, + "step": 7621 + }, + { + "epoch": 2.3135528911822734, + "grad_norm": 0.7413522005081177, + "learning_rate": 5.375455650060753e-05, + "loss": 1.324, + "step": 7622 + }, + { + "epoch": 2.313856427378965, + "grad_norm": 0.8028507828712463, + "learning_rate": 5.3748481166464156e-05, + "loss": 1.3074, + "step": 7623 + }, + { + "epoch": 2.3141599635756562, + "grad_norm": 0.839253842830658, + "learning_rate": 5.374240583232079e-05, + "loss": 1.3944, + "step": 7624 + }, + { + "epoch": 2.314463499772348, + "grad_norm": 0.9501205682754517, + "learning_rate": 5.37363304981774e-05, + "loss": 1.4148, + "step": 7625 + }, + { + "epoch": 2.314767035969039, + "grad_norm": 0.5154752135276794, + "learning_rate": 5.373025516403403e-05, + "loss": 1.1781, + "step": 7626 + }, + { + "epoch": 2.3150705721657308, + "grad_norm": 0.7667773365974426, + "learning_rate": 5.372417982989064e-05, + "loss": 1.3034, + "step": 7627 + }, + { + "epoch": 2.3153741083624224, + "grad_norm": 0.676240086555481, + "learning_rate": 5.371810449574727e-05, + "loss": 1.3928, + "step": 7628 + }, + { + "epoch": 2.3156776445591136, + "grad_norm": 0.8117524981498718, + "learning_rate": 5.3712029161603894e-05, + "loss": 1.2232, + "step": 7629 + }, + { + "epoch": 2.3159811807558053, + "grad_norm": 0.8300802111625671, + "learning_rate": 5.370595382746051e-05, + "loss": 1.0777, + "step": 7630 + }, + { + "epoch": 2.3162847169524965, + "grad_norm": 0.6476275324821472, + "learning_rate": 5.3699878493317136e-05, + "loss": 1.7488, + "step": 7631 + }, + { + "epoch": 2.316588253149188, + "grad_norm": 0.7915083765983582, + "learning_rate": 5.369380315917376e-05, + "loss": 1.3514, + "step": 7632 + }, + { + "epoch": 2.3168917893458794, + "grad_norm": 0.6225711703300476, + "learning_rate": 5.368772782503038e-05, + "loss": 1.6291, + "step": 7633 + }, + { + "epoch": 2.317195325542571, + "grad_norm": 0.7601152062416077, + "learning_rate": 5.3681652490887e-05, + "loss": 1.6199, + "step": 7634 + }, + { + "epoch": 2.3174988617392622, + "grad_norm": 0.7888222336769104, + "learning_rate": 5.3675577156743625e-05, + "loss": 1.566, + "step": 7635 + }, + { + "epoch": 2.317802397935954, + "grad_norm": 0.7114104628562927, + "learning_rate": 5.366950182260024e-05, + "loss": 0.8866, + "step": 7636 + }, + { + "epoch": 2.318105934132645, + "grad_norm": 0.7306579947471619, + "learning_rate": 5.3663426488456866e-05, + "loss": 1.5352, + "step": 7637 + }, + { + "epoch": 2.3184094703293368, + "grad_norm": 0.8664054274559021, + "learning_rate": 5.3657351154313484e-05, + "loss": 1.3496, + "step": 7638 + }, + { + "epoch": 2.3187130065260284, + "grad_norm": 0.6200141906738281, + "learning_rate": 5.365127582017011e-05, + "loss": 1.0732, + "step": 7639 + }, + { + "epoch": 2.3190165427227196, + "grad_norm": 0.7229865789413452, + "learning_rate": 5.364520048602674e-05, + "loss": 1.1097, + "step": 7640 + }, + { + "epoch": 2.3193200789194113, + "grad_norm": 0.6948032975196838, + "learning_rate": 5.363912515188335e-05, + "loss": 1.6897, + "step": 7641 + }, + { + "epoch": 2.3196236151161025, + "grad_norm": 0.606377363204956, + "learning_rate": 5.363304981773998e-05, + "loss": 0.9409, + "step": 7642 + }, + { + "epoch": 2.319927151312794, + "grad_norm": 0.7344121932983398, + "learning_rate": 5.3626974483596604e-05, + "loss": 1.0816, + "step": 7643 + }, + { + "epoch": 2.3202306875094854, + "grad_norm": 0.7421247363090515, + "learning_rate": 5.362089914945322e-05, + "loss": 1.3433, + "step": 7644 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.7186826467514038, + "learning_rate": 5.3614823815309846e-05, + "loss": 1.5924, + "step": 7645 + }, + { + "epoch": 2.3208377599028682, + "grad_norm": 0.7481942176818848, + "learning_rate": 5.360874848116647e-05, + "loss": 1.2185, + "step": 7646 + }, + { + "epoch": 2.32114129609956, + "grad_norm": 0.7925760746002197, + "learning_rate": 5.360267314702309e-05, + "loss": 1.5318, + "step": 7647 + }, + { + "epoch": 2.321444832296251, + "grad_norm": 0.8228962421417236, + "learning_rate": 5.359659781287971e-05, + "loss": 0.8963, + "step": 7648 + }, + { + "epoch": 2.3217483684929427, + "grad_norm": 0.8172785043716431, + "learning_rate": 5.3590522478736335e-05, + "loss": 1.1493, + "step": 7649 + }, + { + "epoch": 2.3220519046896344, + "grad_norm": 0.8325759172439575, + "learning_rate": 5.358444714459295e-05, + "loss": 1.4674, + "step": 7650 + }, + { + "epoch": 2.3223554408863256, + "grad_norm": 0.8055946826934814, + "learning_rate": 5.3578371810449576e-05, + "loss": 1.3843, + "step": 7651 + }, + { + "epoch": 2.3226589770830173, + "grad_norm": 0.7123836874961853, + "learning_rate": 5.3572296476306194e-05, + "loss": 1.1121, + "step": 7652 + }, + { + "epoch": 2.3229625132797085, + "grad_norm": 0.7075245380401611, + "learning_rate": 5.356622114216282e-05, + "loss": 1.6399, + "step": 7653 + }, + { + "epoch": 2.3232660494764, + "grad_norm": 0.8346471190452576, + "learning_rate": 5.356014580801945e-05, + "loss": 0.8726, + "step": 7654 + }, + { + "epoch": 2.3235695856730914, + "grad_norm": 0.8881227374076843, + "learning_rate": 5.355407047387606e-05, + "loss": 0.9226, + "step": 7655 + }, + { + "epoch": 2.323873121869783, + "grad_norm": 0.8708653450012207, + "learning_rate": 5.354799513973269e-05, + "loss": 1.2796, + "step": 7656 + }, + { + "epoch": 2.3241766580664747, + "grad_norm": 0.7582796216011047, + "learning_rate": 5.3541919805589314e-05, + "loss": 1.3877, + "step": 7657 + }, + { + "epoch": 2.324480194263166, + "grad_norm": 0.6816633343696594, + "learning_rate": 5.353584447144593e-05, + "loss": 0.6834, + "step": 7658 + }, + { + "epoch": 2.324783730459857, + "grad_norm": 0.4883376359939575, + "learning_rate": 5.3529769137302556e-05, + "loss": 1.2223, + "step": 7659 + }, + { + "epoch": 2.3250872666565487, + "grad_norm": 0.6481773853302002, + "learning_rate": 5.352369380315918e-05, + "loss": 1.0884, + "step": 7660 + }, + { + "epoch": 2.3253908028532404, + "grad_norm": 0.7207589149475098, + "learning_rate": 5.35176184690158e-05, + "loss": 1.7161, + "step": 7661 + }, + { + "epoch": 2.3256943390499316, + "grad_norm": 0.8019577860832214, + "learning_rate": 5.351154313487242e-05, + "loss": 1.5573, + "step": 7662 + }, + { + "epoch": 2.3259978752466233, + "grad_norm": 0.7258016467094421, + "learning_rate": 5.3505467800729045e-05, + "loss": 0.9274, + "step": 7663 + }, + { + "epoch": 2.3263014114433145, + "grad_norm": 0.7334061861038208, + "learning_rate": 5.349939246658566e-05, + "loss": 1.0776, + "step": 7664 + }, + { + "epoch": 2.326604947640006, + "grad_norm": 0.78205806016922, + "learning_rate": 5.3493317132442287e-05, + "loss": 1.2147, + "step": 7665 + }, + { + "epoch": 2.3269084838366974, + "grad_norm": 0.8690736889839172, + "learning_rate": 5.3487241798298904e-05, + "loss": 1.0358, + "step": 7666 + }, + { + "epoch": 2.327212020033389, + "grad_norm": 0.5215451121330261, + "learning_rate": 5.348116646415553e-05, + "loss": 0.5091, + "step": 7667 + }, + { + "epoch": 2.3275155562300807, + "grad_norm": 0.7433094382286072, + "learning_rate": 5.347509113001216e-05, + "loss": 1.1312, + "step": 7668 + }, + { + "epoch": 2.327819092426772, + "grad_norm": 0.8542289137840271, + "learning_rate": 5.346901579586877e-05, + "loss": 1.4203, + "step": 7669 + }, + { + "epoch": 2.3281226286234635, + "grad_norm": 0.7717257738113403, + "learning_rate": 5.34629404617254e-05, + "loss": 0.7082, + "step": 7670 + }, + { + "epoch": 2.3284261648201547, + "grad_norm": 0.8508163690567017, + "learning_rate": 5.3456865127582024e-05, + "loss": 1.1675, + "step": 7671 + }, + { + "epoch": 2.3287297010168464, + "grad_norm": 0.7925937175750732, + "learning_rate": 5.3450789793438635e-05, + "loss": 1.2884, + "step": 7672 + }, + { + "epoch": 2.3290332372135376, + "grad_norm": 0.8074536919593811, + "learning_rate": 5.3444714459295266e-05, + "loss": 1.417, + "step": 7673 + }, + { + "epoch": 2.3293367734102293, + "grad_norm": 0.718932569026947, + "learning_rate": 5.343863912515189e-05, + "loss": 1.4604, + "step": 7674 + }, + { + "epoch": 2.3296403096069205, + "grad_norm": 0.8349537253379822, + "learning_rate": 5.343256379100851e-05, + "loss": 1.5235, + "step": 7675 + }, + { + "epoch": 2.329943845803612, + "grad_norm": 0.8295914530754089, + "learning_rate": 5.342648845686513e-05, + "loss": 1.4084, + "step": 7676 + }, + { + "epoch": 2.3302473820003033, + "grad_norm": 0.6685676574707031, + "learning_rate": 5.3420413122721755e-05, + "loss": 1.0359, + "step": 7677 + }, + { + "epoch": 2.330550918196995, + "grad_norm": 0.8455519080162048, + "learning_rate": 5.341433778857837e-05, + "loss": 1.1332, + "step": 7678 + }, + { + "epoch": 2.3308544543936867, + "grad_norm": 0.8389970064163208, + "learning_rate": 5.3408262454434997e-05, + "loss": 1.6788, + "step": 7679 + }, + { + "epoch": 2.331157990590378, + "grad_norm": 0.7237008213996887, + "learning_rate": 5.3402187120291614e-05, + "loss": 0.9352, + "step": 7680 + }, + { + "epoch": 2.3314615267870695, + "grad_norm": 0.7687880992889404, + "learning_rate": 5.339611178614824e-05, + "loss": 1.2931, + "step": 7681 + }, + { + "epoch": 2.3317650629837607, + "grad_norm": 0.7523514628410339, + "learning_rate": 5.339003645200487e-05, + "loss": 1.5411, + "step": 7682 + }, + { + "epoch": 2.3320685991804524, + "grad_norm": 0.7142686247825623, + "learning_rate": 5.338396111786148e-05, + "loss": 0.8408, + "step": 7683 + }, + { + "epoch": 2.3323721353771436, + "grad_norm": 0.6777017712593079, + "learning_rate": 5.3377885783718103e-05, + "loss": 0.9626, + "step": 7684 + }, + { + "epoch": 2.3326756715738353, + "grad_norm": 0.5809240937232971, + "learning_rate": 5.3371810449574734e-05, + "loss": 1.1718, + "step": 7685 + }, + { + "epoch": 2.3329792077705265, + "grad_norm": 0.5568855404853821, + "learning_rate": 5.3365735115431345e-05, + "loss": 1.4329, + "step": 7686 + }, + { + "epoch": 2.333282743967218, + "grad_norm": 0.7619733214378357, + "learning_rate": 5.3359659781287976e-05, + "loss": 1.5341, + "step": 7687 + }, + { + "epoch": 2.3335862801639093, + "grad_norm": 0.8271357417106628, + "learning_rate": 5.33535844471446e-05, + "loss": 1.1607, + "step": 7688 + }, + { + "epoch": 2.333889816360601, + "grad_norm": 0.7103882431983948, + "learning_rate": 5.334750911300122e-05, + "loss": 1.465, + "step": 7689 + }, + { + "epoch": 2.3341933525572927, + "grad_norm": 0.9172919988632202, + "learning_rate": 5.334143377885784e-05, + "loss": 1.228, + "step": 7690 + }, + { + "epoch": 2.334496888753984, + "grad_norm": 0.8360595107078552, + "learning_rate": 5.3335358444714465e-05, + "loss": 1.4144, + "step": 7691 + }, + { + "epoch": 2.3348004249506755, + "grad_norm": 0.9780839085578918, + "learning_rate": 5.332928311057108e-05, + "loss": 1.2262, + "step": 7692 + }, + { + "epoch": 2.3351039611473667, + "grad_norm": 0.6523328423500061, + "learning_rate": 5.332320777642771e-05, + "loss": 1.3723, + "step": 7693 + }, + { + "epoch": 2.3354074973440584, + "grad_norm": 0.7507632374763489, + "learning_rate": 5.3317132442284324e-05, + "loss": 1.7642, + "step": 7694 + }, + { + "epoch": 2.3357110335407496, + "grad_norm": 0.689064085483551, + "learning_rate": 5.331105710814095e-05, + "loss": 1.052, + "step": 7695 + }, + { + "epoch": 2.3360145697374413, + "grad_norm": 0.7555691003799438, + "learning_rate": 5.330498177399757e-05, + "loss": 1.2013, + "step": 7696 + }, + { + "epoch": 2.3363181059341325, + "grad_norm": 0.7709565758705139, + "learning_rate": 5.329890643985419e-05, + "loss": 1.2231, + "step": 7697 + }, + { + "epoch": 2.336621642130824, + "grad_norm": 0.6575965285301208, + "learning_rate": 5.3292831105710814e-05, + "loss": 1.5299, + "step": 7698 + }, + { + "epoch": 2.3369251783275153, + "grad_norm": 0.8596627116203308, + "learning_rate": 5.3286755771567444e-05, + "loss": 1.3291, + "step": 7699 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.9520817399024963, + "learning_rate": 5.3280680437424055e-05, + "loss": 0.8895, + "step": 7700 + }, + { + "epoch": 2.3375322507208987, + "grad_norm": 0.7376377582550049, + "learning_rate": 5.3274605103280686e-05, + "loss": 0.7775, + "step": 7701 + }, + { + "epoch": 2.33783578691759, + "grad_norm": 0.8260226845741272, + "learning_rate": 5.326852976913731e-05, + "loss": 1.1357, + "step": 7702 + }, + { + "epoch": 2.3381393231142815, + "grad_norm": 0.9278427958488464, + "learning_rate": 5.326245443499393e-05, + "loss": 1.65, + "step": 7703 + }, + { + "epoch": 2.3384428593109727, + "grad_norm": 0.8064718246459961, + "learning_rate": 5.325637910085055e-05, + "loss": 0.7711, + "step": 7704 + }, + { + "epoch": 2.3387463955076644, + "grad_norm": 0.7880445122718811, + "learning_rate": 5.3250303766707175e-05, + "loss": 1.4331, + "step": 7705 + }, + { + "epoch": 2.3390499317043556, + "grad_norm": 0.6582452058792114, + "learning_rate": 5.324422843256379e-05, + "loss": 1.7935, + "step": 7706 + }, + { + "epoch": 2.3393534679010473, + "grad_norm": 1.155536413192749, + "learning_rate": 5.323815309842042e-05, + "loss": 0.5777, + "step": 7707 + }, + { + "epoch": 2.3396570040977385, + "grad_norm": 0.7595350742340088, + "learning_rate": 5.3232077764277034e-05, + "loss": 1.1607, + "step": 7708 + }, + { + "epoch": 2.33996054029443, + "grad_norm": 0.6159242391586304, + "learning_rate": 5.322600243013366e-05, + "loss": 1.3568, + "step": 7709 + }, + { + "epoch": 2.3402640764911213, + "grad_norm": 0.7928234934806824, + "learning_rate": 5.321992709599028e-05, + "loss": 1.3948, + "step": 7710 + }, + { + "epoch": 2.340567612687813, + "grad_norm": 0.691699206829071, + "learning_rate": 5.32138517618469e-05, + "loss": 1.4697, + "step": 7711 + }, + { + "epoch": 2.3408711488845046, + "grad_norm": 0.5798041224479675, + "learning_rate": 5.3207776427703524e-05, + "loss": 1.0516, + "step": 7712 + }, + { + "epoch": 2.341174685081196, + "grad_norm": 0.7047245502471924, + "learning_rate": 5.3201701093560154e-05, + "loss": 1.2814, + "step": 7713 + }, + { + "epoch": 2.3414782212778875, + "grad_norm": 0.7146697044372559, + "learning_rate": 5.3195625759416765e-05, + "loss": 1.3333, + "step": 7714 + }, + { + "epoch": 2.3417817574745787, + "grad_norm": 0.8074240684509277, + "learning_rate": 5.3189550425273396e-05, + "loss": 1.0494, + "step": 7715 + }, + { + "epoch": 2.3420852936712704, + "grad_norm": 0.8097215294837952, + "learning_rate": 5.318347509113002e-05, + "loss": 1.032, + "step": 7716 + }, + { + "epoch": 2.3423888298679616, + "grad_norm": 0.6857717633247375, + "learning_rate": 5.317739975698664e-05, + "loss": 1.7386, + "step": 7717 + }, + { + "epoch": 2.3426923660646533, + "grad_norm": 0.9278475642204285, + "learning_rate": 5.317132442284326e-05, + "loss": 1.0966, + "step": 7718 + }, + { + "epoch": 2.342995902261345, + "grad_norm": 0.6738936305046082, + "learning_rate": 5.316524908869988e-05, + "loss": 1.1826, + "step": 7719 + }, + { + "epoch": 2.343299438458036, + "grad_norm": 0.839474618434906, + "learning_rate": 5.31591737545565e-05, + "loss": 1.6056, + "step": 7720 + }, + { + "epoch": 2.3436029746547273, + "grad_norm": 1.0106525421142578, + "learning_rate": 5.315309842041313e-05, + "loss": 1.1876, + "step": 7721 + }, + { + "epoch": 2.343906510851419, + "grad_norm": 1.049715518951416, + "learning_rate": 5.3147023086269744e-05, + "loss": 1.3087, + "step": 7722 + }, + { + "epoch": 2.3442100470481106, + "grad_norm": 0.8846147060394287, + "learning_rate": 5.314094775212637e-05, + "loss": 1.2788, + "step": 7723 + }, + { + "epoch": 2.344513583244802, + "grad_norm": 0.6935809850692749, + "learning_rate": 5.313487241798299e-05, + "loss": 1.0473, + "step": 7724 + }, + { + "epoch": 2.3448171194414935, + "grad_norm": 0.5639526844024658, + "learning_rate": 5.312879708383961e-05, + "loss": 1.1472, + "step": 7725 + }, + { + "epoch": 2.3451206556381847, + "grad_norm": 0.64291912317276, + "learning_rate": 5.3122721749696234e-05, + "loss": 1.5788, + "step": 7726 + }, + { + "epoch": 2.3454241918348764, + "grad_norm": 0.7888056039810181, + "learning_rate": 5.3116646415552864e-05, + "loss": 0.9166, + "step": 7727 + }, + { + "epoch": 2.3457277280315676, + "grad_norm": 0.731212317943573, + "learning_rate": 5.3110571081409475e-05, + "loss": 1.5907, + "step": 7728 + }, + { + "epoch": 2.3460312642282592, + "grad_norm": 0.662165105342865, + "learning_rate": 5.3104495747266106e-05, + "loss": 1.39, + "step": 7729 + }, + { + "epoch": 2.346334800424951, + "grad_norm": 0.999442458152771, + "learning_rate": 5.309842041312273e-05, + "loss": 1.0353, + "step": 7730 + }, + { + "epoch": 2.346638336621642, + "grad_norm": 0.5597583651542664, + "learning_rate": 5.309234507897935e-05, + "loss": 1.1838, + "step": 7731 + }, + { + "epoch": 2.3469418728183338, + "grad_norm": 0.6151300668716431, + "learning_rate": 5.308626974483597e-05, + "loss": 1.5594, + "step": 7732 + }, + { + "epoch": 2.347245409015025, + "grad_norm": 0.6515387892723083, + "learning_rate": 5.308019441069259e-05, + "loss": 1.1982, + "step": 7733 + }, + { + "epoch": 2.3475489452117166, + "grad_norm": 0.7352016568183899, + "learning_rate": 5.307411907654921e-05, + "loss": 1.3948, + "step": 7734 + }, + { + "epoch": 2.347852481408408, + "grad_norm": 0.7883623838424683, + "learning_rate": 5.306804374240584e-05, + "loss": 1.3335, + "step": 7735 + }, + { + "epoch": 2.3481560176050995, + "grad_norm": 0.5472824573516846, + "learning_rate": 5.3061968408262454e-05, + "loss": 0.9117, + "step": 7736 + }, + { + "epoch": 2.3484595538017907, + "grad_norm": 0.8789010047912598, + "learning_rate": 5.305589307411908e-05, + "loss": 0.6223, + "step": 7737 + }, + { + "epoch": 2.3487630899984824, + "grad_norm": 0.8505709171295166, + "learning_rate": 5.30498177399757e-05, + "loss": 1.0333, + "step": 7738 + }, + { + "epoch": 2.3490666261951736, + "grad_norm": 0.7751732468605042, + "learning_rate": 5.304374240583232e-05, + "loss": 1.3989, + "step": 7739 + }, + { + "epoch": 2.3493701623918652, + "grad_norm": 0.8960305452346802, + "learning_rate": 5.3037667071688944e-05, + "loss": 1.1284, + "step": 7740 + }, + { + "epoch": 2.349673698588557, + "grad_norm": 0.7729844450950623, + "learning_rate": 5.3031591737545575e-05, + "loss": 1.4161, + "step": 7741 + }, + { + "epoch": 2.349977234785248, + "grad_norm": 0.7679091095924377, + "learning_rate": 5.3025516403402185e-05, + "loss": 1.2617, + "step": 7742 + }, + { + "epoch": 2.3502807709819398, + "grad_norm": 0.7796841263771057, + "learning_rate": 5.3019441069258816e-05, + "loss": 1.5505, + "step": 7743 + }, + { + "epoch": 2.350584307178631, + "grad_norm": 0.8468260765075684, + "learning_rate": 5.301336573511544e-05, + "loss": 1.483, + "step": 7744 + }, + { + "epoch": 2.3508878433753226, + "grad_norm": 0.6661270260810852, + "learning_rate": 5.300729040097205e-05, + "loss": 1.6524, + "step": 7745 + }, + { + "epoch": 2.351191379572014, + "grad_norm": 0.9717702865600586, + "learning_rate": 5.300121506682868e-05, + "loss": 0.8439, + "step": 7746 + }, + { + "epoch": 2.3514949157687055, + "grad_norm": 0.7468115091323853, + "learning_rate": 5.299513973268529e-05, + "loss": 1.3385, + "step": 7747 + }, + { + "epoch": 2.3517984519653967, + "grad_norm": 0.8282458186149597, + "learning_rate": 5.298906439854192e-05, + "loss": 1.4013, + "step": 7748 + }, + { + "epoch": 2.3521019881620884, + "grad_norm": 0.7290198802947998, + "learning_rate": 5.298298906439855e-05, + "loss": 0.9782, + "step": 7749 + }, + { + "epoch": 2.3524055243587796, + "grad_norm": 0.834172785282135, + "learning_rate": 5.2976913730255164e-05, + "loss": 1.2091, + "step": 7750 + }, + { + "epoch": 2.3527090605554712, + "grad_norm": 0.5838407278060913, + "learning_rate": 5.297083839611179e-05, + "loss": 1.0893, + "step": 7751 + }, + { + "epoch": 2.353012596752163, + "grad_norm": 0.6311367750167847, + "learning_rate": 5.296476306196841e-05, + "loss": 1.4858, + "step": 7752 + }, + { + "epoch": 2.353316132948854, + "grad_norm": 0.7519064545631409, + "learning_rate": 5.295868772782503e-05, + "loss": 1.3179, + "step": 7753 + }, + { + "epoch": 2.3536196691455458, + "grad_norm": 0.6211289763450623, + "learning_rate": 5.2952612393681654e-05, + "loss": 1.4659, + "step": 7754 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.6141878962516785, + "learning_rate": 5.2946537059538285e-05, + "loss": 1.5953, + "step": 7755 + }, + { + "epoch": 2.3542267415389286, + "grad_norm": 0.6442515850067139, + "learning_rate": 5.2940461725394895e-05, + "loss": 1.5598, + "step": 7756 + }, + { + "epoch": 2.35453027773562, + "grad_norm": 0.7627711296081543, + "learning_rate": 5.293438639125152e-05, + "loss": 1.2289, + "step": 7757 + }, + { + "epoch": 2.3548338139323115, + "grad_norm": 0.7751161456108093, + "learning_rate": 5.292831105710815e-05, + "loss": 1.0708, + "step": 7758 + }, + { + "epoch": 2.3551373501290027, + "grad_norm": 0.8048432469367981, + "learning_rate": 5.292223572296476e-05, + "loss": 0.9133, + "step": 7759 + }, + { + "epoch": 2.3554408863256944, + "grad_norm": 0.6209495067596436, + "learning_rate": 5.291616038882139e-05, + "loss": 1.5231, + "step": 7760 + }, + { + "epoch": 2.3557444225223856, + "grad_norm": 0.7496862411499023, + "learning_rate": 5.2910085054678e-05, + "loss": 1.0688, + "step": 7761 + }, + { + "epoch": 2.3560479587190772, + "grad_norm": 0.8569952845573425, + "learning_rate": 5.290400972053463e-05, + "loss": 1.1458, + "step": 7762 + }, + { + "epoch": 2.356351494915769, + "grad_norm": 0.7132030725479126, + "learning_rate": 5.289793438639126e-05, + "loss": 1.3457, + "step": 7763 + }, + { + "epoch": 2.35665503111246, + "grad_norm": 0.690146267414093, + "learning_rate": 5.2891859052247874e-05, + "loss": 1.5885, + "step": 7764 + }, + { + "epoch": 2.3569585673091518, + "grad_norm": 0.8199977278709412, + "learning_rate": 5.28857837181045e-05, + "loss": 1.5518, + "step": 7765 + }, + { + "epoch": 2.357262103505843, + "grad_norm": 0.9800877571105957, + "learning_rate": 5.287970838396112e-05, + "loss": 1.1599, + "step": 7766 + }, + { + "epoch": 2.3575656397025346, + "grad_norm": 0.6656652688980103, + "learning_rate": 5.287363304981774e-05, + "loss": 1.2197, + "step": 7767 + }, + { + "epoch": 2.357869175899226, + "grad_norm": 0.7522599101066589, + "learning_rate": 5.2867557715674364e-05, + "loss": 1.3696, + "step": 7768 + }, + { + "epoch": 2.3581727120959175, + "grad_norm": 0.8206160068511963, + "learning_rate": 5.286148238153099e-05, + "loss": 1.4364, + "step": 7769 + }, + { + "epoch": 2.3584762482926087, + "grad_norm": 0.713617742061615, + "learning_rate": 5.2855407047387605e-05, + "loss": 1.08, + "step": 7770 + }, + { + "epoch": 2.3587797844893004, + "grad_norm": 0.9411084055900574, + "learning_rate": 5.284933171324423e-05, + "loss": 1.2305, + "step": 7771 + }, + { + "epoch": 2.3590833206859916, + "grad_norm": 0.5938416123390198, + "learning_rate": 5.284325637910086e-05, + "loss": 0.7511, + "step": 7772 + }, + { + "epoch": 2.3593868568826832, + "grad_norm": 0.6943349242210388, + "learning_rate": 5.283718104495747e-05, + "loss": 1.6483, + "step": 7773 + }, + { + "epoch": 2.359690393079375, + "grad_norm": 0.7166835069656372, + "learning_rate": 5.28311057108141e-05, + "loss": 1.456, + "step": 7774 + }, + { + "epoch": 2.359993929276066, + "grad_norm": 1.1558526754379272, + "learning_rate": 5.282503037667071e-05, + "loss": 0.8777, + "step": 7775 + }, + { + "epoch": 2.3602974654727578, + "grad_norm": 0.7373621463775635, + "learning_rate": 5.281895504252734e-05, + "loss": 1.5, + "step": 7776 + }, + { + "epoch": 2.360601001669449, + "grad_norm": 0.5992624163627625, + "learning_rate": 5.281287970838397e-05, + "loss": 1.6233, + "step": 7777 + }, + { + "epoch": 2.3609045378661406, + "grad_norm": 0.8751627802848816, + "learning_rate": 5.2806804374240584e-05, + "loss": 1.3434, + "step": 7778 + }, + { + "epoch": 2.361208074062832, + "grad_norm": 0.6332023739814758, + "learning_rate": 5.280072904009721e-05, + "loss": 1.2038, + "step": 7779 + }, + { + "epoch": 2.3615116102595235, + "grad_norm": 0.6409838199615479, + "learning_rate": 5.279465370595383e-05, + "loss": 1.7465, + "step": 7780 + }, + { + "epoch": 2.361815146456215, + "grad_norm": 0.821772575378418, + "learning_rate": 5.278857837181045e-05, + "loss": 1.5604, + "step": 7781 + }, + { + "epoch": 2.3621186826529064, + "grad_norm": 0.9229840040206909, + "learning_rate": 5.2782503037667074e-05, + "loss": 1.2961, + "step": 7782 + }, + { + "epoch": 2.3624222188495976, + "grad_norm": 0.6327805519104004, + "learning_rate": 5.27764277035237e-05, + "loss": 1.3647, + "step": 7783 + }, + { + "epoch": 2.3627257550462892, + "grad_norm": 0.6412098407745361, + "learning_rate": 5.2770352369380315e-05, + "loss": 1.5738, + "step": 7784 + }, + { + "epoch": 2.363029291242981, + "grad_norm": 0.6698938012123108, + "learning_rate": 5.276427703523694e-05, + "loss": 0.809, + "step": 7785 + }, + { + "epoch": 2.363332827439672, + "grad_norm": 0.6202579140663147, + "learning_rate": 5.275820170109357e-05, + "loss": 1.4661, + "step": 7786 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.7623347043991089, + "learning_rate": 5.275212636695018e-05, + "loss": 1.4255, + "step": 7787 + }, + { + "epoch": 2.363939899833055, + "grad_norm": 0.7024716138839722, + "learning_rate": 5.274605103280681e-05, + "loss": 1.6767, + "step": 7788 + }, + { + "epoch": 2.3642434360297466, + "grad_norm": 0.7999787330627441, + "learning_rate": 5.273997569866342e-05, + "loss": 1.2606, + "step": 7789 + }, + { + "epoch": 2.364546972226438, + "grad_norm": 0.7593993544578552, + "learning_rate": 5.273390036452005e-05, + "loss": 1.3622, + "step": 7790 + }, + { + "epoch": 2.3648505084231295, + "grad_norm": 0.7108789682388306, + "learning_rate": 5.272782503037668e-05, + "loss": 1.351, + "step": 7791 + }, + { + "epoch": 2.365154044619821, + "grad_norm": 0.8540698885917664, + "learning_rate": 5.2721749696233294e-05, + "loss": 1.7725, + "step": 7792 + }, + { + "epoch": 2.3654575808165124, + "grad_norm": 0.8625686168670654, + "learning_rate": 5.271567436208992e-05, + "loss": 1.1768, + "step": 7793 + }, + { + "epoch": 2.365761117013204, + "grad_norm": 0.891380250453949, + "learning_rate": 5.270959902794654e-05, + "loss": 1.2084, + "step": 7794 + }, + { + "epoch": 2.3660646532098952, + "grad_norm": 0.6972417831420898, + "learning_rate": 5.270352369380316e-05, + "loss": 1.1735, + "step": 7795 + }, + { + "epoch": 2.366368189406587, + "grad_norm": 0.9224116802215576, + "learning_rate": 5.2697448359659784e-05, + "loss": 1.3848, + "step": 7796 + }, + { + "epoch": 2.366671725603278, + "grad_norm": 0.7985848188400269, + "learning_rate": 5.269137302551641e-05, + "loss": 0.8748, + "step": 7797 + }, + { + "epoch": 2.3669752617999698, + "grad_norm": 0.7604532837867737, + "learning_rate": 5.2685297691373025e-05, + "loss": 1.2969, + "step": 7798 + }, + { + "epoch": 2.367278797996661, + "grad_norm": 0.8171728253364563, + "learning_rate": 5.267922235722965e-05, + "loss": 1.3355, + "step": 7799 + }, + { + "epoch": 2.3675823341933526, + "grad_norm": 0.6600670218467712, + "learning_rate": 5.267314702308628e-05, + "loss": 1.115, + "step": 7800 + }, + { + "epoch": 2.367885870390044, + "grad_norm": 0.7643181085586548, + "learning_rate": 5.266707168894289e-05, + "loss": 1.0573, + "step": 7801 + }, + { + "epoch": 2.3681894065867355, + "grad_norm": 0.6425821781158447, + "learning_rate": 5.266099635479952e-05, + "loss": 1.649, + "step": 7802 + }, + { + "epoch": 2.368492942783427, + "grad_norm": 0.7698783278465271, + "learning_rate": 5.265492102065613e-05, + "loss": 1.1364, + "step": 7803 + }, + { + "epoch": 2.3687964789801184, + "grad_norm": 0.8478207588195801, + "learning_rate": 5.264884568651276e-05, + "loss": 1.272, + "step": 7804 + }, + { + "epoch": 2.36910001517681, + "grad_norm": 0.6805600523948669, + "learning_rate": 5.264277035236939e-05, + "loss": 0.9952, + "step": 7805 + }, + { + "epoch": 2.3694035513735012, + "grad_norm": 0.6795867681503296, + "learning_rate": 5.2636695018226004e-05, + "loss": 1.4623, + "step": 7806 + }, + { + "epoch": 2.369707087570193, + "grad_norm": 0.6063035726547241, + "learning_rate": 5.263061968408263e-05, + "loss": 1.2589, + "step": 7807 + }, + { + "epoch": 2.370010623766884, + "grad_norm": 0.7077288627624512, + "learning_rate": 5.262454434993925e-05, + "loss": 1.4941, + "step": 7808 + }, + { + "epoch": 2.3703141599635758, + "grad_norm": 0.7622582912445068, + "learning_rate": 5.261846901579587e-05, + "loss": 1.2967, + "step": 7809 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.8287667036056519, + "learning_rate": 5.2612393681652494e-05, + "loss": 1.537, + "step": 7810 + }, + { + "epoch": 2.3709212323569586, + "grad_norm": 0.6999541521072388, + "learning_rate": 5.260631834750912e-05, + "loss": 1.3958, + "step": 7811 + }, + { + "epoch": 2.37122476855365, + "grad_norm": 1.1540329456329346, + "learning_rate": 5.2600243013365735e-05, + "loss": 0.9358, + "step": 7812 + }, + { + "epoch": 2.3715283047503415, + "grad_norm": 0.9087563157081604, + "learning_rate": 5.259416767922236e-05, + "loss": 1.2583, + "step": 7813 + }, + { + "epoch": 2.371831840947033, + "grad_norm": 0.825188159942627, + "learning_rate": 5.258809234507898e-05, + "loss": 1.351, + "step": 7814 + }, + { + "epoch": 2.3721353771437244, + "grad_norm": 0.7714551687240601, + "learning_rate": 5.25820170109356e-05, + "loss": 1.3726, + "step": 7815 + }, + { + "epoch": 2.372438913340416, + "grad_norm": 0.782400906085968, + "learning_rate": 5.257594167679223e-05, + "loss": 0.9385, + "step": 7816 + }, + { + "epoch": 2.3727424495371072, + "grad_norm": 0.6738986968994141, + "learning_rate": 5.256986634264884e-05, + "loss": 1.0588, + "step": 7817 + }, + { + "epoch": 2.373045985733799, + "grad_norm": 0.7600053548812866, + "learning_rate": 5.256379100850547e-05, + "loss": 1.3494, + "step": 7818 + }, + { + "epoch": 2.37334952193049, + "grad_norm": 0.6999025344848633, + "learning_rate": 5.25577156743621e-05, + "loss": 1.6085, + "step": 7819 + }, + { + "epoch": 2.3736530581271817, + "grad_norm": 0.8892236351966858, + "learning_rate": 5.255164034021871e-05, + "loss": 1.4086, + "step": 7820 + }, + { + "epoch": 2.373956594323873, + "grad_norm": 0.794494092464447, + "learning_rate": 5.254556500607534e-05, + "loss": 1.6947, + "step": 7821 + }, + { + "epoch": 2.3742601305205646, + "grad_norm": 0.630824625492096, + "learning_rate": 5.253948967193196e-05, + "loss": 1.7344, + "step": 7822 + }, + { + "epoch": 2.374563666717256, + "grad_norm": 0.7141704559326172, + "learning_rate": 5.253341433778858e-05, + "loss": 1.5292, + "step": 7823 + }, + { + "epoch": 2.3748672029139475, + "grad_norm": 0.7559216022491455, + "learning_rate": 5.2527339003645204e-05, + "loss": 1.3356, + "step": 7824 + }, + { + "epoch": 2.375170739110639, + "grad_norm": 0.6533464789390564, + "learning_rate": 5.252126366950183e-05, + "loss": 1.5769, + "step": 7825 + }, + { + "epoch": 2.3754742753073304, + "grad_norm": 0.7361800670623779, + "learning_rate": 5.2515188335358445e-05, + "loss": 0.7596, + "step": 7826 + }, + { + "epoch": 2.375777811504022, + "grad_norm": 0.5638136863708496, + "learning_rate": 5.250911300121507e-05, + "loss": 1.6617, + "step": 7827 + }, + { + "epoch": 2.376081347700713, + "grad_norm": 0.5785853266716003, + "learning_rate": 5.250303766707169e-05, + "loss": 0.6196, + "step": 7828 + }, + { + "epoch": 2.376384883897405, + "grad_norm": 0.8371119499206543, + "learning_rate": 5.249696233292831e-05, + "loss": 1.2451, + "step": 7829 + }, + { + "epoch": 2.376688420094096, + "grad_norm": 0.8846685886383057, + "learning_rate": 5.249088699878494e-05, + "loss": 1.3992, + "step": 7830 + }, + { + "epoch": 2.3769919562907877, + "grad_norm": 0.8627526164054871, + "learning_rate": 5.248481166464155e-05, + "loss": 0.8391, + "step": 7831 + }, + { + "epoch": 2.377295492487479, + "grad_norm": 0.7415776252746582, + "learning_rate": 5.2478736330498176e-05, + "loss": 1.3816, + "step": 7832 + }, + { + "epoch": 2.3775990286841706, + "grad_norm": 0.6754639148712158, + "learning_rate": 5.247266099635481e-05, + "loss": 1.4647, + "step": 7833 + }, + { + "epoch": 2.377902564880862, + "grad_norm": 1.0893198251724243, + "learning_rate": 5.246658566221142e-05, + "loss": 1.351, + "step": 7834 + }, + { + "epoch": 2.3782061010775535, + "grad_norm": 0.6966081857681274, + "learning_rate": 5.246051032806805e-05, + "loss": 1.4548, + "step": 7835 + }, + { + "epoch": 2.378509637274245, + "grad_norm": 0.7975082993507385, + "learning_rate": 5.245443499392467e-05, + "loss": 1.7406, + "step": 7836 + }, + { + "epoch": 2.3788131734709363, + "grad_norm": 0.8312361240386963, + "learning_rate": 5.244835965978129e-05, + "loss": 1.3375, + "step": 7837 + }, + { + "epoch": 2.379116709667628, + "grad_norm": 0.6477303504943848, + "learning_rate": 5.2442284325637914e-05, + "loss": 1.2427, + "step": 7838 + }, + { + "epoch": 2.379420245864319, + "grad_norm": 0.7435750961303711, + "learning_rate": 5.243620899149454e-05, + "loss": 1.0545, + "step": 7839 + }, + { + "epoch": 2.379723782061011, + "grad_norm": 0.6959526538848877, + "learning_rate": 5.2430133657351155e-05, + "loss": 0.8377, + "step": 7840 + }, + { + "epoch": 2.380027318257702, + "grad_norm": 0.7352555990219116, + "learning_rate": 5.242405832320778e-05, + "loss": 1.2703, + "step": 7841 + }, + { + "epoch": 2.3803308544543937, + "grad_norm": 0.8414437174797058, + "learning_rate": 5.24179829890644e-05, + "loss": 1.1084, + "step": 7842 + }, + { + "epoch": 2.380634390651085, + "grad_norm": 0.7620357275009155, + "learning_rate": 5.241190765492102e-05, + "loss": 1.5271, + "step": 7843 + }, + { + "epoch": 2.3809379268477766, + "grad_norm": 0.6760739088058472, + "learning_rate": 5.2405832320777645e-05, + "loss": 1.5965, + "step": 7844 + }, + { + "epoch": 2.381241463044468, + "grad_norm": 0.7874226570129395, + "learning_rate": 5.239975698663426e-05, + "loss": 1.0306, + "step": 7845 + }, + { + "epoch": 2.3815449992411595, + "grad_norm": 0.8035131096839905, + "learning_rate": 5.2393681652490886e-05, + "loss": 1.1358, + "step": 7846 + }, + { + "epoch": 2.381848535437851, + "grad_norm": 1.040859341621399, + "learning_rate": 5.238760631834752e-05, + "loss": 1.0935, + "step": 7847 + }, + { + "epoch": 2.3821520716345423, + "grad_norm": 0.6593133807182312, + "learning_rate": 5.238153098420413e-05, + "loss": 1.3713, + "step": 7848 + }, + { + "epoch": 2.382455607831234, + "grad_norm": 0.8799865245819092, + "learning_rate": 5.237545565006076e-05, + "loss": 1.4511, + "step": 7849 + }, + { + "epoch": 2.382759144027925, + "grad_norm": 0.8076903820037842, + "learning_rate": 5.236938031591738e-05, + "loss": 1.5262, + "step": 7850 + }, + { + "epoch": 2.383062680224617, + "grad_norm": 0.7340336441993713, + "learning_rate": 5.2363304981774e-05, + "loss": 1.147, + "step": 7851 + }, + { + "epoch": 2.383366216421308, + "grad_norm": 0.4335605800151825, + "learning_rate": 5.2357229647630624e-05, + "loss": 0.7803, + "step": 7852 + }, + { + "epoch": 2.3836697526179997, + "grad_norm": 0.7532500624656677, + "learning_rate": 5.235115431348725e-05, + "loss": 1.2353, + "step": 7853 + }, + { + "epoch": 2.3839732888146914, + "grad_norm": 0.8696469068527222, + "learning_rate": 5.2345078979343865e-05, + "loss": 1.3929, + "step": 7854 + }, + { + "epoch": 2.3842768250113826, + "grad_norm": 0.8726606369018555, + "learning_rate": 5.233900364520049e-05, + "loss": 0.9976, + "step": 7855 + }, + { + "epoch": 2.384580361208074, + "grad_norm": 0.7265575528144836, + "learning_rate": 5.233292831105711e-05, + "loss": 1.5227, + "step": 7856 + }, + { + "epoch": 2.3848838974047655, + "grad_norm": 0.7864022850990295, + "learning_rate": 5.232685297691373e-05, + "loss": 1.3465, + "step": 7857 + }, + { + "epoch": 2.385187433601457, + "grad_norm": 0.8680126070976257, + "learning_rate": 5.2320777642770355e-05, + "loss": 1.1205, + "step": 7858 + }, + { + "epoch": 2.3854909697981483, + "grad_norm": 0.743444561958313, + "learning_rate": 5.231470230862697e-05, + "loss": 1.3234, + "step": 7859 + }, + { + "epoch": 2.38579450599484, + "grad_norm": 0.6110712289810181, + "learning_rate": 5.2308626974483596e-05, + "loss": 1.425, + "step": 7860 + }, + { + "epoch": 2.386098042191531, + "grad_norm": 0.5643225908279419, + "learning_rate": 5.230255164034023e-05, + "loss": 1.4562, + "step": 7861 + }, + { + "epoch": 2.386401578388223, + "grad_norm": 0.8390517830848694, + "learning_rate": 5.229647630619684e-05, + "loss": 1.0968, + "step": 7862 + }, + { + "epoch": 2.386705114584914, + "grad_norm": 0.7999104261398315, + "learning_rate": 5.229040097205347e-05, + "loss": 1.3739, + "step": 7863 + }, + { + "epoch": 2.3870086507816057, + "grad_norm": 1.0680652856826782, + "learning_rate": 5.228432563791009e-05, + "loss": 1.6146, + "step": 7864 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.8049265742301941, + "learning_rate": 5.227825030376671e-05, + "loss": 1.0325, + "step": 7865 + }, + { + "epoch": 2.3876157231749886, + "grad_norm": 0.7554400563240051, + "learning_rate": 5.2272174969623334e-05, + "loss": 1.4346, + "step": 7866 + }, + { + "epoch": 2.3879192593716803, + "grad_norm": 0.7211205959320068, + "learning_rate": 5.226609963547996e-05, + "loss": 1.6123, + "step": 7867 + }, + { + "epoch": 2.3882227955683715, + "grad_norm": 0.7623486518859863, + "learning_rate": 5.2260024301336576e-05, + "loss": 1.4401, + "step": 7868 + }, + { + "epoch": 2.388526331765063, + "grad_norm": 0.8081662654876709, + "learning_rate": 5.22539489671932e-05, + "loss": 1.1524, + "step": 7869 + }, + { + "epoch": 2.3888298679617543, + "grad_norm": 0.8566677570343018, + "learning_rate": 5.224787363304982e-05, + "loss": 1.2491, + "step": 7870 + }, + { + "epoch": 2.389133404158446, + "grad_norm": 0.7269451022148132, + "learning_rate": 5.224179829890644e-05, + "loss": 1.4801, + "step": 7871 + }, + { + "epoch": 2.389436940355137, + "grad_norm": 0.7573784589767456, + "learning_rate": 5.2235722964763065e-05, + "loss": 1.3765, + "step": 7872 + }, + { + "epoch": 2.389740476551829, + "grad_norm": 1.0910553932189941, + "learning_rate": 5.222964763061968e-05, + "loss": 1.007, + "step": 7873 + }, + { + "epoch": 2.39004401274852, + "grad_norm": 0.8029559254646301, + "learning_rate": 5.2223572296476306e-05, + "loss": 1.6888, + "step": 7874 + }, + { + "epoch": 2.3903475489452117, + "grad_norm": 0.7115080952644348, + "learning_rate": 5.221749696233294e-05, + "loss": 1.0651, + "step": 7875 + }, + { + "epoch": 2.3906510851419034, + "grad_norm": 0.7667282223701477, + "learning_rate": 5.221142162818955e-05, + "loss": 1.5417, + "step": 7876 + }, + { + "epoch": 2.3909546213385946, + "grad_norm": 0.7072556018829346, + "learning_rate": 5.220534629404618e-05, + "loss": 1.6521, + "step": 7877 + }, + { + "epoch": 2.3912581575352863, + "grad_norm": 0.7661739587783813, + "learning_rate": 5.21992709599028e-05, + "loss": 1.4645, + "step": 7878 + }, + { + "epoch": 2.3915616937319775, + "grad_norm": 0.8855286836624146, + "learning_rate": 5.219319562575942e-05, + "loss": 1.3801, + "step": 7879 + }, + { + "epoch": 2.391865229928669, + "grad_norm": 0.6786907315254211, + "learning_rate": 5.2187120291616044e-05, + "loss": 1.5246, + "step": 7880 + }, + { + "epoch": 2.3921687661253603, + "grad_norm": 0.771236002445221, + "learning_rate": 5.218104495747267e-05, + "loss": 1.3782, + "step": 7881 + }, + { + "epoch": 2.392472302322052, + "grad_norm": 0.6905536651611328, + "learning_rate": 5.2174969623329286e-05, + "loss": 1.4765, + "step": 7882 + }, + { + "epoch": 2.392775838518743, + "grad_norm": 0.8602134585380554, + "learning_rate": 5.216889428918591e-05, + "loss": 1.3996, + "step": 7883 + }, + { + "epoch": 2.393079374715435, + "grad_norm": 0.6106351613998413, + "learning_rate": 5.216281895504253e-05, + "loss": 1.6691, + "step": 7884 + }, + { + "epoch": 2.393382910912126, + "grad_norm": 0.6688586473464966, + "learning_rate": 5.215674362089915e-05, + "loss": 1.1784, + "step": 7885 + }, + { + "epoch": 2.3936864471088177, + "grad_norm": 0.8014105558395386, + "learning_rate": 5.2150668286755775e-05, + "loss": 1.3896, + "step": 7886 + }, + { + "epoch": 2.3939899833055094, + "grad_norm": 0.720625638961792, + "learning_rate": 5.214459295261239e-05, + "loss": 1.2415, + "step": 7887 + }, + { + "epoch": 2.3942935195022006, + "grad_norm": 0.7473063468933105, + "learning_rate": 5.2138517618469017e-05, + "loss": 0.6578, + "step": 7888 + }, + { + "epoch": 2.3945970556988923, + "grad_norm": 0.6142640709877014, + "learning_rate": 5.213244228432565e-05, + "loss": 1.1646, + "step": 7889 + }, + { + "epoch": 2.3949005918955835, + "grad_norm": 0.9030197858810425, + "learning_rate": 5.212636695018226e-05, + "loss": 1.3181, + "step": 7890 + }, + { + "epoch": 2.395204128092275, + "grad_norm": 0.715584397315979, + "learning_rate": 5.212029161603889e-05, + "loss": 1.5966, + "step": 7891 + }, + { + "epoch": 2.3955076642889663, + "grad_norm": 0.7543222308158875, + "learning_rate": 5.211421628189551e-05, + "loss": 1.6381, + "step": 7892 + }, + { + "epoch": 2.395811200485658, + "grad_norm": 0.8002287149429321, + "learning_rate": 5.210814094775212e-05, + "loss": 1.4268, + "step": 7893 + }, + { + "epoch": 2.396114736682349, + "grad_norm": 0.5649712085723877, + "learning_rate": 5.2102065613608754e-05, + "loss": 1.626, + "step": 7894 + }, + { + "epoch": 2.396418272879041, + "grad_norm": 0.7349185943603516, + "learning_rate": 5.2095990279465365e-05, + "loss": 1.4225, + "step": 7895 + }, + { + "epoch": 2.396721809075732, + "grad_norm": 0.8495616912841797, + "learning_rate": 5.2089914945321996e-05, + "loss": 1.5571, + "step": 7896 + }, + { + "epoch": 2.3970253452724237, + "grad_norm": 0.7431391477584839, + "learning_rate": 5.208383961117862e-05, + "loss": 1.2231, + "step": 7897 + }, + { + "epoch": 2.3973288814691154, + "grad_norm": 0.6172650456428528, + "learning_rate": 5.207776427703524e-05, + "loss": 1.5317, + "step": 7898 + }, + { + "epoch": 2.3976324176658066, + "grad_norm": 0.7999909520149231, + "learning_rate": 5.207168894289186e-05, + "loss": 1.0488, + "step": 7899 + }, + { + "epoch": 2.3979359538624982, + "grad_norm": 0.6296140551567078, + "learning_rate": 5.2065613608748485e-05, + "loss": 0.961, + "step": 7900 + }, + { + "epoch": 2.3982394900591895, + "grad_norm": 0.7614960074424744, + "learning_rate": 5.20595382746051e-05, + "loss": 1.4641, + "step": 7901 + }, + { + "epoch": 2.398543026255881, + "grad_norm": 0.6893866658210754, + "learning_rate": 5.2053462940461727e-05, + "loss": 1.625, + "step": 7902 + }, + { + "epoch": 2.3988465624525723, + "grad_norm": 0.9098206162452698, + "learning_rate": 5.204738760631836e-05, + "loss": 1.4526, + "step": 7903 + }, + { + "epoch": 2.399150098649264, + "grad_norm": 0.8129755258560181, + "learning_rate": 5.204131227217497e-05, + "loss": 1.4076, + "step": 7904 + }, + { + "epoch": 2.399453634845955, + "grad_norm": 0.8761286735534668, + "learning_rate": 5.203523693803159e-05, + "loss": 1.2927, + "step": 7905 + }, + { + "epoch": 2.399757171042647, + "grad_norm": 0.8439755439758301, + "learning_rate": 5.202916160388822e-05, + "loss": 1.3755, + "step": 7906 + }, + { + "epoch": 2.400060707239338, + "grad_norm": 0.7161287665367126, + "learning_rate": 5.2023086269744833e-05, + "loss": 1.2992, + "step": 7907 + }, + { + "epoch": 2.4003642434360297, + "grad_norm": 0.7279097437858582, + "learning_rate": 5.2017010935601464e-05, + "loss": 1.3541, + "step": 7908 + }, + { + "epoch": 2.4006677796327214, + "grad_norm": 0.8380422592163086, + "learning_rate": 5.2010935601458075e-05, + "loss": 1.4317, + "step": 7909 + }, + { + "epoch": 2.4009713158294126, + "grad_norm": 0.680642306804657, + "learning_rate": 5.2004860267314706e-05, + "loss": 0.9812, + "step": 7910 + }, + { + "epoch": 2.4012748520261042, + "grad_norm": 0.6640725135803223, + "learning_rate": 5.199878493317133e-05, + "loss": 0.8245, + "step": 7911 + }, + { + "epoch": 2.4015783882227955, + "grad_norm": 0.5655060410499573, + "learning_rate": 5.199270959902795e-05, + "loss": 2.0355, + "step": 7912 + }, + { + "epoch": 2.401881924419487, + "grad_norm": 0.8419114947319031, + "learning_rate": 5.198663426488457e-05, + "loss": 0.9828, + "step": 7913 + }, + { + "epoch": 2.4021854606161783, + "grad_norm": 0.6423788070678711, + "learning_rate": 5.1980558930741195e-05, + "loss": 1.6292, + "step": 7914 + }, + { + "epoch": 2.40248899681287, + "grad_norm": 0.8200101256370544, + "learning_rate": 5.197448359659781e-05, + "loss": 1.3373, + "step": 7915 + }, + { + "epoch": 2.4027925330095616, + "grad_norm": 0.595747709274292, + "learning_rate": 5.1968408262454437e-05, + "loss": 1.0328, + "step": 7916 + }, + { + "epoch": 2.403096069206253, + "grad_norm": 0.5122062563896179, + "learning_rate": 5.196233292831106e-05, + "loss": 1.1858, + "step": 7917 + }, + { + "epoch": 2.403399605402944, + "grad_norm": 0.7064041495323181, + "learning_rate": 5.195625759416768e-05, + "loss": 1.6892, + "step": 7918 + }, + { + "epoch": 2.4037031415996357, + "grad_norm": 0.7920200228691101, + "learning_rate": 5.19501822600243e-05, + "loss": 1.367, + "step": 7919 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.7359050512313843, + "learning_rate": 5.194410692588093e-05, + "loss": 1.0948, + "step": 7920 + }, + { + "epoch": 2.4043102139930186, + "grad_norm": 0.9729425311088562, + "learning_rate": 5.1938031591737543e-05, + "loss": 0.5859, + "step": 7921 + }, + { + "epoch": 2.4046137501897102, + "grad_norm": 0.8038437962532043, + "learning_rate": 5.1931956257594174e-05, + "loss": 1.3406, + "step": 7922 + }, + { + "epoch": 2.4049172863864015, + "grad_norm": 0.8591334819793701, + "learning_rate": 5.1925880923450785e-05, + "loss": 1.4519, + "step": 7923 + }, + { + "epoch": 2.405220822583093, + "grad_norm": 0.9573879837989807, + "learning_rate": 5.1919805589307416e-05, + "loss": 0.9611, + "step": 7924 + }, + { + "epoch": 2.4055243587797843, + "grad_norm": 0.7993837594985962, + "learning_rate": 5.191373025516404e-05, + "loss": 1.3623, + "step": 7925 + }, + { + "epoch": 2.405827894976476, + "grad_norm": 0.8858467936515808, + "learning_rate": 5.190765492102066e-05, + "loss": 1.5171, + "step": 7926 + }, + { + "epoch": 2.4061314311731676, + "grad_norm": 0.9811488389968872, + "learning_rate": 5.190157958687728e-05, + "loss": 1.2935, + "step": 7927 + }, + { + "epoch": 2.406434967369859, + "grad_norm": 0.6521837115287781, + "learning_rate": 5.1895504252733905e-05, + "loss": 1.4562, + "step": 7928 + }, + { + "epoch": 2.4067385035665505, + "grad_norm": 0.9483540058135986, + "learning_rate": 5.188942891859052e-05, + "loss": 0.8764, + "step": 7929 + }, + { + "epoch": 2.4070420397632417, + "grad_norm": 0.5991268754005432, + "learning_rate": 5.188335358444715e-05, + "loss": 1.5877, + "step": 7930 + }, + { + "epoch": 2.4073455759599334, + "grad_norm": 0.9723407626152039, + "learning_rate": 5.187727825030377e-05, + "loss": 1.1474, + "step": 7931 + }, + { + "epoch": 2.4076491121566246, + "grad_norm": 0.8698570132255554, + "learning_rate": 5.187120291616039e-05, + "loss": 1.5058, + "step": 7932 + }, + { + "epoch": 2.4079526483533162, + "grad_norm": 0.8418928980827332, + "learning_rate": 5.186512758201701e-05, + "loss": 1.1602, + "step": 7933 + }, + { + "epoch": 2.4082561845500075, + "grad_norm": 0.7727174758911133, + "learning_rate": 5.185905224787364e-05, + "loss": 1.5246, + "step": 7934 + }, + { + "epoch": 2.408559720746699, + "grad_norm": 0.6789799928665161, + "learning_rate": 5.1852976913730254e-05, + "loss": 1.3529, + "step": 7935 + }, + { + "epoch": 2.4088632569433903, + "grad_norm": 0.7298761010169983, + "learning_rate": 5.1846901579586884e-05, + "loss": 0.882, + "step": 7936 + }, + { + "epoch": 2.409166793140082, + "grad_norm": 0.7938309907913208, + "learning_rate": 5.1840826245443495e-05, + "loss": 1.2523, + "step": 7937 + }, + { + "epoch": 2.4094703293367736, + "grad_norm": 0.8047483563423157, + "learning_rate": 5.1834750911300126e-05, + "loss": 1.6818, + "step": 7938 + }, + { + "epoch": 2.409773865533465, + "grad_norm": 0.8456572890281677, + "learning_rate": 5.182867557715675e-05, + "loss": 1.2316, + "step": 7939 + }, + { + "epoch": 2.4100774017301565, + "grad_norm": 0.776390552520752, + "learning_rate": 5.182260024301337e-05, + "loss": 1.1516, + "step": 7940 + }, + { + "epoch": 2.4103809379268477, + "grad_norm": 0.775544285774231, + "learning_rate": 5.181652490886999e-05, + "loss": 0.9377, + "step": 7941 + }, + { + "epoch": 2.4106844741235394, + "grad_norm": 0.7695592641830444, + "learning_rate": 5.1810449574726615e-05, + "loss": 1.4437, + "step": 7942 + }, + { + "epoch": 2.4109880103202306, + "grad_norm": 0.928779125213623, + "learning_rate": 5.180437424058323e-05, + "loss": 0.8951, + "step": 7943 + }, + { + "epoch": 2.4112915465169222, + "grad_norm": 0.757379949092865, + "learning_rate": 5.179829890643986e-05, + "loss": 1.3946, + "step": 7944 + }, + { + "epoch": 2.4115950827136134, + "grad_norm": 0.6092455983161926, + "learning_rate": 5.179222357229648e-05, + "loss": 1.8418, + "step": 7945 + }, + { + "epoch": 2.411898618910305, + "grad_norm": 0.7635046243667603, + "learning_rate": 5.17861482381531e-05, + "loss": 1.5335, + "step": 7946 + }, + { + "epoch": 2.4122021551069963, + "grad_norm": 0.6753832101821899, + "learning_rate": 5.178007290400972e-05, + "loss": 0.8429, + "step": 7947 + }, + { + "epoch": 2.412505691303688, + "grad_norm": 0.737553596496582, + "learning_rate": 5.177399756986635e-05, + "loss": 1.4511, + "step": 7948 + }, + { + "epoch": 2.4128092275003796, + "grad_norm": 0.9160948991775513, + "learning_rate": 5.1767922235722964e-05, + "loss": 1.3461, + "step": 7949 + }, + { + "epoch": 2.413112763697071, + "grad_norm": 0.7199264168739319, + "learning_rate": 5.1761846901579594e-05, + "loss": 1.3522, + "step": 7950 + }, + { + "epoch": 2.4134162998937625, + "grad_norm": 1.0194474458694458, + "learning_rate": 5.1755771567436205e-05, + "loss": 1.0272, + "step": 7951 + }, + { + "epoch": 2.4137198360904537, + "grad_norm": 0.6661416888237, + "learning_rate": 5.1749696233292836e-05, + "loss": 1.5689, + "step": 7952 + }, + { + "epoch": 2.4140233722871454, + "grad_norm": 0.67854905128479, + "learning_rate": 5.174362089914946e-05, + "loss": 1.2744, + "step": 7953 + }, + { + "epoch": 2.4143269084838366, + "grad_norm": 0.7923321723937988, + "learning_rate": 5.173754556500607e-05, + "loss": 1.1285, + "step": 7954 + }, + { + "epoch": 2.4146304446805282, + "grad_norm": 0.5897708535194397, + "learning_rate": 5.17314702308627e-05, + "loss": 1.3658, + "step": 7955 + }, + { + "epoch": 2.4149339808772194, + "grad_norm": 0.7650224566459656, + "learning_rate": 5.1725394896719325e-05, + "loss": 1.4415, + "step": 7956 + }, + { + "epoch": 2.415237517073911, + "grad_norm": 1.342687726020813, + "learning_rate": 5.171931956257594e-05, + "loss": 1.0132, + "step": 7957 + }, + { + "epoch": 2.4155410532706023, + "grad_norm": 0.8072525858879089, + "learning_rate": 5.171324422843257e-05, + "loss": 1.5729, + "step": 7958 + }, + { + "epoch": 2.415844589467294, + "grad_norm": 0.640731930732727, + "learning_rate": 5.170716889428919e-05, + "loss": 1.5518, + "step": 7959 + }, + { + "epoch": 2.4161481256639856, + "grad_norm": 0.718439519405365, + "learning_rate": 5.170109356014581e-05, + "loss": 1.4649, + "step": 7960 + }, + { + "epoch": 2.416451661860677, + "grad_norm": 0.5635302662849426, + "learning_rate": 5.169501822600243e-05, + "loss": 1.2203, + "step": 7961 + }, + { + "epoch": 2.4167551980573685, + "grad_norm": 0.7853577136993408, + "learning_rate": 5.168894289185906e-05, + "loss": 0.93, + "step": 7962 + }, + { + "epoch": 2.4170587342540597, + "grad_norm": 0.9242436289787292, + "learning_rate": 5.1682867557715674e-05, + "loss": 1.4183, + "step": 7963 + }, + { + "epoch": 2.4173622704507514, + "grad_norm": 0.679762065410614, + "learning_rate": 5.1676792223572304e-05, + "loss": 0.8421, + "step": 7964 + }, + { + "epoch": 2.4176658066474426, + "grad_norm": 0.7319661974906921, + "learning_rate": 5.1670716889428915e-05, + "loss": 1.4754, + "step": 7965 + }, + { + "epoch": 2.4179693428441342, + "grad_norm": 0.8363327980041504, + "learning_rate": 5.166464155528554e-05, + "loss": 1.3688, + "step": 7966 + }, + { + "epoch": 2.4182728790408254, + "grad_norm": 0.7800347805023193, + "learning_rate": 5.165856622114217e-05, + "loss": 1.5161, + "step": 7967 + }, + { + "epoch": 2.418576415237517, + "grad_norm": 0.6953141093254089, + "learning_rate": 5.165249088699878e-05, + "loss": 1.4556, + "step": 7968 + }, + { + "epoch": 2.4188799514342083, + "grad_norm": 0.7389452457427979, + "learning_rate": 5.164641555285541e-05, + "loss": 0.9218, + "step": 7969 + }, + { + "epoch": 2.4191834876309, + "grad_norm": 0.583696722984314, + "learning_rate": 5.1640340218712035e-05, + "loss": 0.7905, + "step": 7970 + }, + { + "epoch": 2.4194870238275916, + "grad_norm": 0.7632086873054504, + "learning_rate": 5.163426488456865e-05, + "loss": 1.0743, + "step": 7971 + }, + { + "epoch": 2.419790560024283, + "grad_norm": 0.6496031284332275, + "learning_rate": 5.162818955042528e-05, + "loss": 1.4744, + "step": 7972 + }, + { + "epoch": 2.4200940962209745, + "grad_norm": 0.9461754560470581, + "learning_rate": 5.16221142162819e-05, + "loss": 1.1325, + "step": 7973 + }, + { + "epoch": 2.4203976324176657, + "grad_norm": 0.7598881721496582, + "learning_rate": 5.161603888213852e-05, + "loss": 1.4004, + "step": 7974 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.7698111534118652, + "learning_rate": 5.160996354799514e-05, + "loss": 1.2941, + "step": 7975 + }, + { + "epoch": 2.4210047048110486, + "grad_norm": 0.8465235829353333, + "learning_rate": 5.160388821385177e-05, + "loss": 1.0867, + "step": 7976 + }, + { + "epoch": 2.4213082410077402, + "grad_norm": 0.8211492896080017, + "learning_rate": 5.1597812879708384e-05, + "loss": 1.1229, + "step": 7977 + }, + { + "epoch": 2.4216117772044314, + "grad_norm": 0.7429980039596558, + "learning_rate": 5.159173754556501e-05, + "loss": 1.2347, + "step": 7978 + }, + { + "epoch": 2.421915313401123, + "grad_norm": 0.7130765318870544, + "learning_rate": 5.1585662211421625e-05, + "loss": 1.5815, + "step": 7979 + }, + { + "epoch": 2.4222188495978143, + "grad_norm": 0.8220276236534119, + "learning_rate": 5.157958687727825e-05, + "loss": 0.9547, + "step": 7980 + }, + { + "epoch": 2.422522385794506, + "grad_norm": 0.690679132938385, + "learning_rate": 5.157351154313488e-05, + "loss": 1.6903, + "step": 7981 + }, + { + "epoch": 2.4228259219911976, + "grad_norm": 0.5744288563728333, + "learning_rate": 5.156743620899149e-05, + "loss": 1.3488, + "step": 7982 + }, + { + "epoch": 2.423129458187889, + "grad_norm": 0.7903429269790649, + "learning_rate": 5.156136087484812e-05, + "loss": 1.1118, + "step": 7983 + }, + { + "epoch": 2.4234329943845805, + "grad_norm": 0.7011483311653137, + "learning_rate": 5.1555285540704745e-05, + "loss": 0.7834, + "step": 7984 + }, + { + "epoch": 2.4237365305812717, + "grad_norm": 0.7828824520111084, + "learning_rate": 5.154921020656136e-05, + "loss": 1.1493, + "step": 7985 + }, + { + "epoch": 2.4240400667779634, + "grad_norm": 0.7918079495429993, + "learning_rate": 5.154313487241799e-05, + "loss": 1.1872, + "step": 7986 + }, + { + "epoch": 2.4243436029746546, + "grad_norm": 0.7867520451545715, + "learning_rate": 5.153705953827461e-05, + "loss": 1.6563, + "step": 7987 + }, + { + "epoch": 2.424647139171346, + "grad_norm": 0.8857144117355347, + "learning_rate": 5.153098420413123e-05, + "loss": 0.9088, + "step": 7988 + }, + { + "epoch": 2.424950675368038, + "grad_norm": 0.9001855254173279, + "learning_rate": 5.152490886998785e-05, + "loss": 1.4135, + "step": 7989 + }, + { + "epoch": 2.425254211564729, + "grad_norm": 0.5790506601333618, + "learning_rate": 5.151883353584447e-05, + "loss": 0.921, + "step": 7990 + }, + { + "epoch": 2.4255577477614203, + "grad_norm": 0.6665115356445312, + "learning_rate": 5.1512758201701094e-05, + "loss": 1.0874, + "step": 7991 + }, + { + "epoch": 2.425861283958112, + "grad_norm": 0.8199989795684814, + "learning_rate": 5.150668286755772e-05, + "loss": 1.6912, + "step": 7992 + }, + { + "epoch": 2.4261648201548036, + "grad_norm": 1.0102654695510864, + "learning_rate": 5.1500607533414335e-05, + "loss": 1.1574, + "step": 7993 + }, + { + "epoch": 2.426468356351495, + "grad_norm": 1.0226120948791504, + "learning_rate": 5.149453219927096e-05, + "loss": 1.4441, + "step": 7994 + }, + { + "epoch": 2.4267718925481865, + "grad_norm": 0.9459152221679688, + "learning_rate": 5.148845686512759e-05, + "loss": 1.4011, + "step": 7995 + }, + { + "epoch": 2.4270754287448777, + "grad_norm": 0.6363176107406616, + "learning_rate": 5.14823815309842e-05, + "loss": 1.1087, + "step": 7996 + }, + { + "epoch": 2.4273789649415694, + "grad_norm": 0.7099559307098389, + "learning_rate": 5.147630619684083e-05, + "loss": 1.0363, + "step": 7997 + }, + { + "epoch": 2.4276825011382606, + "grad_norm": 0.7245994806289673, + "learning_rate": 5.1470230862697456e-05, + "loss": 1.5554, + "step": 7998 + }, + { + "epoch": 2.427986037334952, + "grad_norm": 0.6965836882591248, + "learning_rate": 5.146415552855407e-05, + "loss": 1.094, + "step": 7999 + }, + { + "epoch": 2.428289573531644, + "grad_norm": 0.6804750561714172, + "learning_rate": 5.14580801944107e-05, + "loss": 1.1956, + "step": 8000 + } + ], + "logging_steps": 1, + "max_steps": 16470, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.193666367442141e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}