{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9978030025631637, "eval_steps": 500, "global_step": 1364, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014646649578908824, "grad_norm": 11.437485320992703, "learning_rate": 1.4598540145985402e-07, "loss": 1.3349, "step": 1 }, { "epoch": 0.0029293299157817647, "grad_norm": 11.178701580362661, "learning_rate": 2.9197080291970804e-07, "loss": 1.338, "step": 2 }, { "epoch": 0.004393994873672647, "grad_norm": 11.35884244830137, "learning_rate": 4.379562043795621e-07, "loss": 1.3216, "step": 3 }, { "epoch": 0.005858659831563529, "grad_norm": 10.417628767122284, "learning_rate": 5.839416058394161e-07, "loss": 1.3553, "step": 4 }, { "epoch": 0.007323324789454412, "grad_norm": 11.211576033283285, "learning_rate": 7.299270072992701e-07, "loss": 1.3584, "step": 5 }, { "epoch": 0.008787989747345295, "grad_norm": 10.861271761101971, "learning_rate": 8.759124087591242e-07, "loss": 1.3293, "step": 6 }, { "epoch": 0.010252654705236177, "grad_norm": 9.89480118782974, "learning_rate": 1.0218978102189781e-06, "loss": 1.3379, "step": 7 }, { "epoch": 0.011717319663127059, "grad_norm": 9.16199354912927, "learning_rate": 1.1678832116788322e-06, "loss": 1.304, "step": 8 }, { "epoch": 0.013181984621017943, "grad_norm": 7.8824339227925, "learning_rate": 1.3138686131386864e-06, "loss": 1.2671, "step": 9 }, { "epoch": 0.014646649578908825, "grad_norm": 8.125898486086502, "learning_rate": 1.4598540145985402e-06, "loss": 1.2187, "step": 10 }, { "epoch": 0.016111314536799707, "grad_norm": 10.83717739533965, "learning_rate": 1.6058394160583942e-06, "loss": 1.2298, "step": 11 }, { "epoch": 0.01757597949469059, "grad_norm": 20.701868436023478, "learning_rate": 1.7518248175182485e-06, "loss": 1.1312, "step": 12 }, { "epoch": 0.01904064445258147, "grad_norm": 20.601367187466792, "learning_rate": 1.8978102189781023e-06, "loss": 1.0933, "step": 13 }, { "epoch": 0.020505309410472353, "grad_norm": 23.59333380509113, "learning_rate": 2.0437956204379563e-06, "loss": 1.0839, "step": 14 }, { "epoch": 0.021969974368363236, "grad_norm": 4.122191454831178, "learning_rate": 2.1897810218978103e-06, "loss": 1.0648, "step": 15 }, { "epoch": 0.023434639326254118, "grad_norm": 9.470990626696512, "learning_rate": 2.3357664233576643e-06, "loss": 1.0762, "step": 16 }, { "epoch": 0.024899304284145003, "grad_norm": 5.597117534177985, "learning_rate": 2.4817518248175183e-06, "loss": 0.9529, "step": 17 }, { "epoch": 0.026363969242035885, "grad_norm": 2.581862089288807, "learning_rate": 2.627737226277373e-06, "loss": 0.8969, "step": 18 }, { "epoch": 0.027828634199926768, "grad_norm": 5.742679255827132, "learning_rate": 2.7737226277372264e-06, "loss": 0.899, "step": 19 }, { "epoch": 0.02929329915781765, "grad_norm": 4.2881369714318796, "learning_rate": 2.9197080291970804e-06, "loss": 0.8479, "step": 20 }, { "epoch": 0.030757964115708532, "grad_norm": 8.043168797174141, "learning_rate": 3.065693430656935e-06, "loss": 0.8139, "step": 21 }, { "epoch": 0.032222629073599414, "grad_norm": 8.109008525384814, "learning_rate": 3.2116788321167884e-06, "loss": 0.832, "step": 22 }, { "epoch": 0.033687294031490296, "grad_norm": 3.7973925320214708, "learning_rate": 3.3576642335766425e-06, "loss": 0.793, "step": 23 }, { "epoch": 0.03515195898938118, "grad_norm": 3.0028103120588856, "learning_rate": 3.503649635036497e-06, "loss": 0.7458, "step": 24 }, { "epoch": 0.03661662394727206, "grad_norm": 2.9364448534773038, "learning_rate": 3.6496350364963505e-06, "loss": 0.7061, "step": 25 }, { "epoch": 0.03808128890516294, "grad_norm": 0.9806974298270025, "learning_rate": 3.7956204379562045e-06, "loss": 0.7023, "step": 26 }, { "epoch": 0.039545953863053825, "grad_norm": 1.064375799560964, "learning_rate": 3.9416058394160585e-06, "loss": 0.6876, "step": 27 }, { "epoch": 0.04101061882094471, "grad_norm": 0.7759784247428909, "learning_rate": 4.0875912408759126e-06, "loss": 0.6773, "step": 28 }, { "epoch": 0.04247528377883559, "grad_norm": 0.6478179875434958, "learning_rate": 4.233576642335767e-06, "loss": 0.6657, "step": 29 }, { "epoch": 0.04393994873672647, "grad_norm": 1.3039225652221051, "learning_rate": 4.379562043795621e-06, "loss": 0.6358, "step": 30 }, { "epoch": 0.04540461369461735, "grad_norm": 0.6416252297147181, "learning_rate": 4.525547445255475e-06, "loss": 0.5966, "step": 31 }, { "epoch": 0.046869278652508235, "grad_norm": 0.5640382481221872, "learning_rate": 4.671532846715329e-06, "loss": 0.5848, "step": 32 }, { "epoch": 0.048333943610399124, "grad_norm": 0.581441699738184, "learning_rate": 4.8175182481751835e-06, "loss": 0.5963, "step": 33 }, { "epoch": 0.049798608568290006, "grad_norm": 0.5919465771312318, "learning_rate": 4.963503649635037e-06, "loss": 0.5921, "step": 34 }, { "epoch": 0.05126327352618089, "grad_norm": 0.6061068767118333, "learning_rate": 5.1094890510948916e-06, "loss": 0.5766, "step": 35 }, { "epoch": 0.05272793848407177, "grad_norm": 0.5757173997249402, "learning_rate": 5.255474452554746e-06, "loss": 0.5234, "step": 36 }, { "epoch": 0.05419260344196265, "grad_norm": 0.5148147313648633, "learning_rate": 5.401459854014599e-06, "loss": 0.5428, "step": 37 }, { "epoch": 0.055657268399853535, "grad_norm": 0.5378426318034144, "learning_rate": 5.547445255474453e-06, "loss": 0.5421, "step": 38 }, { "epoch": 0.05712193335774442, "grad_norm": 0.5096082401040436, "learning_rate": 5.693430656934307e-06, "loss": 0.5447, "step": 39 }, { "epoch": 0.0585865983156353, "grad_norm": 0.48460009463464304, "learning_rate": 5.839416058394161e-06, "loss": 0.5396, "step": 40 }, { "epoch": 0.06005126327352618, "grad_norm": 0.49057265665249916, "learning_rate": 5.985401459854016e-06, "loss": 0.5393, "step": 41 }, { "epoch": 0.061515928231417064, "grad_norm": 0.5380540290567165, "learning_rate": 6.13138686131387e-06, "loss": 0.544, "step": 42 }, { "epoch": 0.06298059318930795, "grad_norm": 0.4836947048784, "learning_rate": 6.277372262773723e-06, "loss": 0.5291, "step": 43 }, { "epoch": 0.06444525814719883, "grad_norm": 0.4975286083516887, "learning_rate": 6.423357664233577e-06, "loss": 0.5006, "step": 44 }, { "epoch": 0.06590992310508971, "grad_norm": 0.4801869620568488, "learning_rate": 6.569343065693431e-06, "loss": 0.5177, "step": 45 }, { "epoch": 0.06737458806298059, "grad_norm": 0.3973492869700473, "learning_rate": 6.715328467153285e-06, "loss": 0.4788, "step": 46 }, { "epoch": 0.06883925302087147, "grad_norm": 0.363722763875419, "learning_rate": 6.86131386861314e-06, "loss": 0.5129, "step": 47 }, { "epoch": 0.07030391797876236, "grad_norm": 0.381635129408091, "learning_rate": 7.007299270072994e-06, "loss": 0.4867, "step": 48 }, { "epoch": 0.07176858293665324, "grad_norm": 0.3591035550780032, "learning_rate": 7.153284671532848e-06, "loss": 0.4963, "step": 49 }, { "epoch": 0.07323324789454412, "grad_norm": 0.4186636869053589, "learning_rate": 7.299270072992701e-06, "loss": 0.4986, "step": 50 }, { "epoch": 0.074697912852435, "grad_norm": 0.41597006530339986, "learning_rate": 7.445255474452555e-06, "loss": 0.4765, "step": 51 }, { "epoch": 0.07616257781032588, "grad_norm": 0.6427961759723377, "learning_rate": 7.591240875912409e-06, "loss": 0.4694, "step": 52 }, { "epoch": 0.07762724276821677, "grad_norm": 0.4874587482608775, "learning_rate": 7.737226277372264e-06, "loss": 0.4634, "step": 53 }, { "epoch": 0.07909190772610765, "grad_norm": 0.3334959408832527, "learning_rate": 7.883211678832117e-06, "loss": 0.4636, "step": 54 }, { "epoch": 0.08055657268399853, "grad_norm": 0.381153104725579, "learning_rate": 8.029197080291972e-06, "loss": 0.459, "step": 55 }, { "epoch": 0.08202123764188941, "grad_norm": 0.3340733041790047, "learning_rate": 8.175182481751825e-06, "loss": 0.4851, "step": 56 }, { "epoch": 0.0834859025997803, "grad_norm": 0.3517427711639141, "learning_rate": 8.32116788321168e-06, "loss": 0.4236, "step": 57 }, { "epoch": 0.08495056755767118, "grad_norm": 0.3108043883771063, "learning_rate": 8.467153284671533e-06, "loss": 0.4336, "step": 58 }, { "epoch": 0.08641523251556206, "grad_norm": 0.3334733312721485, "learning_rate": 8.613138686131386e-06, "loss": 0.4365, "step": 59 }, { "epoch": 0.08787989747345294, "grad_norm": 0.352845389781815, "learning_rate": 8.759124087591241e-06, "loss": 0.4671, "step": 60 }, { "epoch": 0.08934456243134382, "grad_norm": 0.3423521983704048, "learning_rate": 8.905109489051096e-06, "loss": 0.4405, "step": 61 }, { "epoch": 0.0908092273892347, "grad_norm": 0.32680792151395816, "learning_rate": 9.05109489051095e-06, "loss": 0.4244, "step": 62 }, { "epoch": 0.09227389234712559, "grad_norm": 0.35932668840566334, "learning_rate": 9.197080291970804e-06, "loss": 0.4316, "step": 63 }, { "epoch": 0.09373855730501647, "grad_norm": 0.30409497564005455, "learning_rate": 9.343065693430657e-06, "loss": 0.4404, "step": 64 }, { "epoch": 0.09520322226290737, "grad_norm": 0.34431142299821804, "learning_rate": 9.48905109489051e-06, "loss": 0.4538, "step": 65 }, { "epoch": 0.09666788722079825, "grad_norm": 0.33318967367640623, "learning_rate": 9.635036496350367e-06, "loss": 0.4246, "step": 66 }, { "epoch": 0.09813255217868913, "grad_norm": 0.32969822743310584, "learning_rate": 9.78102189781022e-06, "loss": 0.4185, "step": 67 }, { "epoch": 0.09959721713658001, "grad_norm": 0.3203953785691397, "learning_rate": 9.927007299270073e-06, "loss": 0.3801, "step": 68 }, { "epoch": 0.1010618820944709, "grad_norm": 0.32486117695619926, "learning_rate": 1.0072992700729928e-05, "loss": 0.4072, "step": 69 }, { "epoch": 0.10252654705236178, "grad_norm": 0.35349454533419244, "learning_rate": 1.0218978102189783e-05, "loss": 0.4105, "step": 70 }, { "epoch": 0.10399121201025266, "grad_norm": 0.32856345829840156, "learning_rate": 1.0364963503649636e-05, "loss": 0.3967, "step": 71 }, { "epoch": 0.10545587696814354, "grad_norm": 0.29148929214591157, "learning_rate": 1.0510948905109491e-05, "loss": 0.3824, "step": 72 }, { "epoch": 0.10692054192603442, "grad_norm": 0.35704567954771227, "learning_rate": 1.0656934306569344e-05, "loss": 0.3953, "step": 73 }, { "epoch": 0.1083852068839253, "grad_norm": 0.31980320150253144, "learning_rate": 1.0802919708029198e-05, "loss": 0.3896, "step": 74 }, { "epoch": 0.10984987184181619, "grad_norm": 0.34589383334602003, "learning_rate": 1.0948905109489052e-05, "loss": 0.3995, "step": 75 }, { "epoch": 0.11131453679970707, "grad_norm": 0.3272293236162762, "learning_rate": 1.1094890510948906e-05, "loss": 0.3901, "step": 76 }, { "epoch": 0.11277920175759795, "grad_norm": 0.3579751175217903, "learning_rate": 1.124087591240876e-05, "loss": 0.3921, "step": 77 }, { "epoch": 0.11424386671548883, "grad_norm": 0.3239414165797523, "learning_rate": 1.1386861313868614e-05, "loss": 0.3989, "step": 78 }, { "epoch": 0.11570853167337972, "grad_norm": 0.354305255693745, "learning_rate": 1.1532846715328467e-05, "loss": 0.3983, "step": 79 }, { "epoch": 0.1171731966312706, "grad_norm": 0.3440015850313245, "learning_rate": 1.1678832116788322e-05, "loss": 0.3994, "step": 80 }, { "epoch": 0.11863786158916148, "grad_norm": 0.36417507763366563, "learning_rate": 1.1824817518248176e-05, "loss": 0.368, "step": 81 }, { "epoch": 0.12010252654705236, "grad_norm": 0.3568328603972373, "learning_rate": 1.1970802919708031e-05, "loss": 0.3694, "step": 82 }, { "epoch": 0.12156719150494324, "grad_norm": 0.33678321301690173, "learning_rate": 1.2116788321167885e-05, "loss": 0.3688, "step": 83 }, { "epoch": 0.12303185646283413, "grad_norm": 0.36497790299914806, "learning_rate": 1.226277372262774e-05, "loss": 0.3998, "step": 84 }, { "epoch": 0.12449652142072501, "grad_norm": 0.32377149566716346, "learning_rate": 1.2408759124087593e-05, "loss": 0.3586, "step": 85 }, { "epoch": 0.1259611863786159, "grad_norm": 0.3633002134475903, "learning_rate": 1.2554744525547446e-05, "loss": 0.3749, "step": 86 }, { "epoch": 0.12742585133650677, "grad_norm": 0.3719628223211349, "learning_rate": 1.27007299270073e-05, "loss": 0.3811, "step": 87 }, { "epoch": 0.12889051629439766, "grad_norm": 0.344394951638129, "learning_rate": 1.2846715328467154e-05, "loss": 0.377, "step": 88 }, { "epoch": 0.13035518125228854, "grad_norm": 0.369488047348081, "learning_rate": 1.2992700729927009e-05, "loss": 0.3663, "step": 89 }, { "epoch": 0.13181984621017942, "grad_norm": 0.3123562848371242, "learning_rate": 1.3138686131386862e-05, "loss": 0.3561, "step": 90 }, { "epoch": 0.1332845111680703, "grad_norm": 0.3743377566954952, "learning_rate": 1.3284671532846715e-05, "loss": 0.3488, "step": 91 }, { "epoch": 0.13474917612596118, "grad_norm": 0.3633285055316245, "learning_rate": 1.343065693430657e-05, "loss": 0.3566, "step": 92 }, { "epoch": 0.13621384108385207, "grad_norm": 0.31139891879793924, "learning_rate": 1.3576642335766423e-05, "loss": 0.3464, "step": 93 }, { "epoch": 0.13767850604174295, "grad_norm": 0.3898557118255509, "learning_rate": 1.372262773722628e-05, "loss": 0.3559, "step": 94 }, { "epoch": 0.13914317099963383, "grad_norm": 0.3497149782228266, "learning_rate": 1.3868613138686133e-05, "loss": 0.3426, "step": 95 }, { "epoch": 0.1406078359575247, "grad_norm": 0.3646043020546925, "learning_rate": 1.4014598540145988e-05, "loss": 0.3641, "step": 96 }, { "epoch": 0.1420725009154156, "grad_norm": 0.33631572980864205, "learning_rate": 1.416058394160584e-05, "loss": 0.3325, "step": 97 }, { "epoch": 0.14353716587330648, "grad_norm": 0.37775436608527085, "learning_rate": 1.4306569343065696e-05, "loss": 0.3615, "step": 98 }, { "epoch": 0.14500183083119736, "grad_norm": 0.3575269321533561, "learning_rate": 1.4452554744525549e-05, "loss": 0.3404, "step": 99 }, { "epoch": 0.14646649578908824, "grad_norm": 0.34707886852032405, "learning_rate": 1.4598540145985402e-05, "loss": 0.33, "step": 100 }, { "epoch": 0.14793116074697912, "grad_norm": 0.36563790351605774, "learning_rate": 1.4744525547445257e-05, "loss": 0.3173, "step": 101 }, { "epoch": 0.14939582570487, "grad_norm": 0.3723711813860172, "learning_rate": 1.489051094890511e-05, "loss": 0.3216, "step": 102 }, { "epoch": 0.1508604906627609, "grad_norm": 0.3847201221658523, "learning_rate": 1.5036496350364965e-05, "loss": 0.3344, "step": 103 }, { "epoch": 0.15232515562065177, "grad_norm": 0.35970372266040174, "learning_rate": 1.5182481751824818e-05, "loss": 0.3289, "step": 104 }, { "epoch": 0.15378982057854265, "grad_norm": 0.3915138161843807, "learning_rate": 1.5328467153284673e-05, "loss": 0.3361, "step": 105 }, { "epoch": 0.15525448553643353, "grad_norm": 0.3332723731808281, "learning_rate": 1.5474452554744528e-05, "loss": 0.3147, "step": 106 }, { "epoch": 0.15671915049432442, "grad_norm": 0.3613636145975632, "learning_rate": 1.5620437956204383e-05, "loss": 0.3432, "step": 107 }, { "epoch": 0.1581838154522153, "grad_norm": 0.360953722994529, "learning_rate": 1.5766423357664234e-05, "loss": 0.2994, "step": 108 }, { "epoch": 0.15964848041010618, "grad_norm": 0.44388991308142534, "learning_rate": 1.591240875912409e-05, "loss": 0.3456, "step": 109 }, { "epoch": 0.16111314536799706, "grad_norm": 0.3537174118941569, "learning_rate": 1.6058394160583944e-05, "loss": 0.3091, "step": 110 }, { "epoch": 0.16257781032588794, "grad_norm": 0.3713881906195737, "learning_rate": 1.62043795620438e-05, "loss": 0.347, "step": 111 }, { "epoch": 0.16404247528377883, "grad_norm": 0.35286710247968334, "learning_rate": 1.635036496350365e-05, "loss": 0.3202, "step": 112 }, { "epoch": 0.1655071402416697, "grad_norm": 0.34725613487477996, "learning_rate": 1.6496350364963505e-05, "loss": 0.3049, "step": 113 }, { "epoch": 0.1669718051995606, "grad_norm": 0.3362671204409347, "learning_rate": 1.664233576642336e-05, "loss": 0.3197, "step": 114 }, { "epoch": 0.16843647015745147, "grad_norm": 0.3464409043696399, "learning_rate": 1.678832116788321e-05, "loss": 0.3218, "step": 115 }, { "epoch": 0.16990113511534236, "grad_norm": 0.3480542874556784, "learning_rate": 1.6934306569343066e-05, "loss": 0.3239, "step": 116 }, { "epoch": 0.17136580007323324, "grad_norm": 0.3748150806007165, "learning_rate": 1.708029197080292e-05, "loss": 0.3157, "step": 117 }, { "epoch": 0.17283046503112412, "grad_norm": 0.34345916023717576, "learning_rate": 1.7226277372262773e-05, "loss": 0.3051, "step": 118 }, { "epoch": 0.174295129989015, "grad_norm": 0.34387241825105747, "learning_rate": 1.737226277372263e-05, "loss": 0.3179, "step": 119 }, { "epoch": 0.17575979494690588, "grad_norm": 0.33782864040586463, "learning_rate": 1.7518248175182482e-05, "loss": 0.2856, "step": 120 }, { "epoch": 0.17722445990479677, "grad_norm": 0.37444408915373373, "learning_rate": 1.7664233576642337e-05, "loss": 0.3251, "step": 121 }, { "epoch": 0.17868912486268765, "grad_norm": 0.4116316036207847, "learning_rate": 1.7810218978102192e-05, "loss": 0.3048, "step": 122 }, { "epoch": 0.18015378982057853, "grad_norm": 0.3781332494432965, "learning_rate": 1.7956204379562047e-05, "loss": 0.3008, "step": 123 }, { "epoch": 0.1816184547784694, "grad_norm": 0.4297088346511556, "learning_rate": 1.81021897810219e-05, "loss": 0.3094, "step": 124 }, { "epoch": 0.1830831197363603, "grad_norm": 0.4029065646398177, "learning_rate": 1.8248175182481753e-05, "loss": 0.3113, "step": 125 }, { "epoch": 0.18454778469425118, "grad_norm": 0.3992462478484584, "learning_rate": 1.8394160583941608e-05, "loss": 0.2854, "step": 126 }, { "epoch": 0.18601244965214206, "grad_norm": 0.3604804381477147, "learning_rate": 1.854014598540146e-05, "loss": 0.2981, "step": 127 }, { "epoch": 0.18747711461003294, "grad_norm": 0.37186861704413154, "learning_rate": 1.8686131386861315e-05, "loss": 0.3258, "step": 128 }, { "epoch": 0.18894177956792385, "grad_norm": 0.3441534820222307, "learning_rate": 1.883211678832117e-05, "loss": 0.2956, "step": 129 }, { "epoch": 0.19040644452581473, "grad_norm": 0.4360804547373249, "learning_rate": 1.897810218978102e-05, "loss": 0.3091, "step": 130 }, { "epoch": 0.19187110948370562, "grad_norm": 0.36928340495254985, "learning_rate": 1.912408759124088e-05, "loss": 0.2982, "step": 131 }, { "epoch": 0.1933357744415965, "grad_norm": 0.36064269917913877, "learning_rate": 1.9270072992700734e-05, "loss": 0.301, "step": 132 }, { "epoch": 0.19480043939948738, "grad_norm": 0.3364056638775238, "learning_rate": 1.9416058394160586e-05, "loss": 0.2643, "step": 133 }, { "epoch": 0.19626510435737826, "grad_norm": 0.385357913235113, "learning_rate": 1.956204379562044e-05, "loss": 0.3151, "step": 134 }, { "epoch": 0.19772976931526914, "grad_norm": 0.4013408692909421, "learning_rate": 1.9708029197080295e-05, "loss": 0.2937, "step": 135 }, { "epoch": 0.19919443427316003, "grad_norm": 0.36182554518094284, "learning_rate": 1.9854014598540147e-05, "loss": 0.3009, "step": 136 }, { "epoch": 0.2006590992310509, "grad_norm": 0.3494640989877029, "learning_rate": 2e-05, "loss": 0.281, "step": 137 }, { "epoch": 0.2021237641889418, "grad_norm": 0.3725320500681877, "learning_rate": 1.999996722215577e-05, "loss": 0.3091, "step": 138 }, { "epoch": 0.20358842914683267, "grad_norm": 0.3623692853258256, "learning_rate": 1.9999868888837957e-05, "loss": 0.2911, "step": 139 }, { "epoch": 0.20505309410472355, "grad_norm": 0.36761059973259275, "learning_rate": 1.999970500069119e-05, "loss": 0.2773, "step": 140 }, { "epoch": 0.20651775906261444, "grad_norm": 0.34372760731316254, "learning_rate": 1.999947555878985e-05, "loss": 0.2524, "step": 141 }, { "epoch": 0.20798242402050532, "grad_norm": 0.34586143693014154, "learning_rate": 1.9999180564638056e-05, "loss": 0.3144, "step": 142 }, { "epoch": 0.2094470889783962, "grad_norm": 0.3715249380161417, "learning_rate": 1.9998820020169668e-05, "loss": 0.2698, "step": 143 }, { "epoch": 0.21091175393628708, "grad_norm": 0.35728560877179666, "learning_rate": 1.9998393927748257e-05, "loss": 0.2877, "step": 144 }, { "epoch": 0.21237641889417797, "grad_norm": 0.3457017556464188, "learning_rate": 1.9997902290167104e-05, "loss": 0.2634, "step": 145 }, { "epoch": 0.21384108385206885, "grad_norm": 0.32677024848818564, "learning_rate": 1.999734511064917e-05, "loss": 0.2962, "step": 146 }, { "epoch": 0.21530574880995973, "grad_norm": 0.38425582050655327, "learning_rate": 1.9996722392847082e-05, "loss": 0.2769, "step": 147 }, { "epoch": 0.2167704137678506, "grad_norm": 0.34979074920621583, "learning_rate": 1.9996034140843113e-05, "loss": 0.281, "step": 148 }, { "epoch": 0.2182350787257415, "grad_norm": 0.32730300075566926, "learning_rate": 1.999528035914915e-05, "loss": 0.3143, "step": 149 }, { "epoch": 0.21969974368363238, "grad_norm": 0.35690312453916395, "learning_rate": 1.9994461052706652e-05, "loss": 0.2508, "step": 150 }, { "epoch": 0.22116440864152326, "grad_norm": 0.34613822211772644, "learning_rate": 1.9993576226886644e-05, "loss": 0.2793, "step": 151 }, { "epoch": 0.22262907359941414, "grad_norm": 0.3922751418648848, "learning_rate": 1.999262588748966e-05, "loss": 0.2875, "step": 152 }, { "epoch": 0.22409373855730502, "grad_norm": 0.3521535383506413, "learning_rate": 1.9991610040745718e-05, "loss": 0.2673, "step": 153 }, { "epoch": 0.2255584035151959, "grad_norm": 0.33944461757575506, "learning_rate": 1.9990528693314273e-05, "loss": 0.2553, "step": 154 }, { "epoch": 0.2270230684730868, "grad_norm": 0.3428799092949669, "learning_rate": 1.9989381852284165e-05, "loss": 0.274, "step": 155 }, { "epoch": 0.22848773343097767, "grad_norm": 0.3511079662901022, "learning_rate": 1.99881695251736e-05, "loss": 0.2612, "step": 156 }, { "epoch": 0.22995239838886855, "grad_norm": 0.33067647835038166, "learning_rate": 1.998689171993006e-05, "loss": 0.2653, "step": 157 }, { "epoch": 0.23141706334675943, "grad_norm": 0.3487260974580051, "learning_rate": 1.9985548444930295e-05, "loss": 0.3049, "step": 158 }, { "epoch": 0.23288172830465032, "grad_norm": 0.32680975853473054, "learning_rate": 1.9984139708980228e-05, "loss": 0.2641, "step": 159 }, { "epoch": 0.2343463932625412, "grad_norm": 0.34722059123958804, "learning_rate": 1.9982665521314934e-05, "loss": 0.2713, "step": 160 }, { "epoch": 0.23581105822043208, "grad_norm": 0.34801179824934036, "learning_rate": 1.9981125891598545e-05, "loss": 0.267, "step": 161 }, { "epoch": 0.23727572317832296, "grad_norm": 0.3624289342633928, "learning_rate": 1.9979520829924212e-05, "loss": 0.263, "step": 162 }, { "epoch": 0.23874038813621384, "grad_norm": 0.3299205118170384, "learning_rate": 1.9977850346814026e-05, "loss": 0.2919, "step": 163 }, { "epoch": 0.24020505309410473, "grad_norm": 0.3595355800684274, "learning_rate": 1.997611445321896e-05, "loss": 0.2934, "step": 164 }, { "epoch": 0.2416697180519956, "grad_norm": 0.3544383226618447, "learning_rate": 1.9974313160518776e-05, "loss": 0.2505, "step": 165 }, { "epoch": 0.2431343830098865, "grad_norm": 0.34482857147345436, "learning_rate": 1.9972446480521972e-05, "loss": 0.2858, "step": 166 }, { "epoch": 0.24459904796777737, "grad_norm": 0.36217560659010756, "learning_rate": 1.9970514425465706e-05, "loss": 0.2478, "step": 167 }, { "epoch": 0.24606371292566825, "grad_norm": 0.37793259918304806, "learning_rate": 1.996851700801569e-05, "loss": 0.2557, "step": 168 }, { "epoch": 0.24752837788355914, "grad_norm": 0.34823612797091924, "learning_rate": 1.996645424126613e-05, "loss": 0.2449, "step": 169 }, { "epoch": 0.24899304284145002, "grad_norm": 0.3456664229365294, "learning_rate": 1.9964326138739645e-05, "loss": 0.2593, "step": 170 }, { "epoch": 0.2504577077993409, "grad_norm": 0.36365236173385046, "learning_rate": 1.996213271438715e-05, "loss": 0.26, "step": 171 }, { "epoch": 0.2519223727572318, "grad_norm": 0.3807231884615767, "learning_rate": 1.9959873982587795e-05, "loss": 0.2976, "step": 172 }, { "epoch": 0.25338703771512266, "grad_norm": 0.373069506679343, "learning_rate": 1.9957549958148844e-05, "loss": 0.2895, "step": 173 }, { "epoch": 0.25485170267301355, "grad_norm": 0.3531411189874793, "learning_rate": 1.9955160656305606e-05, "loss": 0.2483, "step": 174 }, { "epoch": 0.25631636763090443, "grad_norm": 0.35346645678091215, "learning_rate": 1.995270609272131e-05, "loss": 0.2649, "step": 175 }, { "epoch": 0.2577810325887953, "grad_norm": 0.3415190871733395, "learning_rate": 1.995018628348702e-05, "loss": 0.2523, "step": 176 }, { "epoch": 0.2592456975466862, "grad_norm": 0.3654772088675571, "learning_rate": 1.9947601245121514e-05, "loss": 0.3083, "step": 177 }, { "epoch": 0.2607103625045771, "grad_norm": 0.3460461457455375, "learning_rate": 1.9944950994571192e-05, "loss": 0.2792, "step": 178 }, { "epoch": 0.26217502746246796, "grad_norm": 0.3110444104590562, "learning_rate": 1.9942235549209955e-05, "loss": 0.2688, "step": 179 }, { "epoch": 0.26363969242035884, "grad_norm": 0.3272499534539241, "learning_rate": 1.993945492683909e-05, "loss": 0.2792, "step": 180 }, { "epoch": 0.2651043573782497, "grad_norm": 0.2878269183852618, "learning_rate": 1.993660914568716e-05, "loss": 0.2424, "step": 181 }, { "epoch": 0.2665690223361406, "grad_norm": 0.33774279808544694, "learning_rate": 1.9933698224409876e-05, "loss": 0.2496, "step": 182 }, { "epoch": 0.2680336872940315, "grad_norm": 0.3363925870755136, "learning_rate": 1.993072218208999e-05, "loss": 0.2613, "step": 183 }, { "epoch": 0.26949835225192237, "grad_norm": 0.3304059597878355, "learning_rate": 1.992768103823714e-05, "loss": 0.2523, "step": 184 }, { "epoch": 0.27096301720981325, "grad_norm": 0.3556147552320389, "learning_rate": 1.9924574812787766e-05, "loss": 0.2528, "step": 185 }, { "epoch": 0.27242768216770413, "grad_norm": 0.33358383808877295, "learning_rate": 1.992140352610494e-05, "loss": 0.2371, "step": 186 }, { "epoch": 0.273892347125595, "grad_norm": 0.32338465251509535, "learning_rate": 1.9918167198978246e-05, "loss": 0.2328, "step": 187 }, { "epoch": 0.2753570120834859, "grad_norm": 0.34675833301901965, "learning_rate": 1.991486585262365e-05, "loss": 0.2441, "step": 188 }, { "epoch": 0.2768216770413768, "grad_norm": 0.32681910042540224, "learning_rate": 1.991149950868336e-05, "loss": 0.2244, "step": 189 }, { "epoch": 0.27828634199926766, "grad_norm": 0.33323736986242397, "learning_rate": 1.9908068189225672e-05, "loss": 0.2375, "step": 190 }, { "epoch": 0.27975100695715854, "grad_norm": 0.30868253450774236, "learning_rate": 1.9904571916744836e-05, "loss": 0.2354, "step": 191 }, { "epoch": 0.2812156719150494, "grad_norm": 0.3404026835089613, "learning_rate": 1.990101071416091e-05, "loss": 0.244, "step": 192 }, { "epoch": 0.2826803368729403, "grad_norm": 0.3250658440023371, "learning_rate": 1.98973846048196e-05, "loss": 0.2436, "step": 193 }, { "epoch": 0.2841450018308312, "grad_norm": 0.3727507902802708, "learning_rate": 1.9893693612492116e-05, "loss": 0.2605, "step": 194 }, { "epoch": 0.28560966678872207, "grad_norm": 0.32087796418849673, "learning_rate": 1.9889937761375015e-05, "loss": 0.2473, "step": 195 }, { "epoch": 0.28707433174661295, "grad_norm": 0.3017096010090274, "learning_rate": 1.9886117076090033e-05, "loss": 0.2333, "step": 196 }, { "epoch": 0.28853899670450384, "grad_norm": 0.32081985018575454, "learning_rate": 1.9882231581683938e-05, "loss": 0.2401, "step": 197 }, { "epoch": 0.2900036616623947, "grad_norm": 0.31230329800532286, "learning_rate": 1.9878281303628352e-05, "loss": 0.2362, "step": 198 }, { "epoch": 0.2914683266202856, "grad_norm": 0.3218942752729413, "learning_rate": 1.9874266267819604e-05, "loss": 0.248, "step": 199 }, { "epoch": 0.2929329915781765, "grad_norm": 0.34876467385745175, "learning_rate": 1.987018650057853e-05, "loss": 0.2487, "step": 200 }, { "epoch": 0.29439765653606736, "grad_norm": 0.31745025914965086, "learning_rate": 1.986604202865033e-05, "loss": 0.2394, "step": 201 }, { "epoch": 0.29586232149395825, "grad_norm": 0.32627016177493307, "learning_rate": 1.986183287920437e-05, "loss": 0.2624, "step": 202 }, { "epoch": 0.29732698645184913, "grad_norm": 0.32599240801438883, "learning_rate": 1.9857559079834023e-05, "loss": 0.2404, "step": 203 }, { "epoch": 0.29879165140974, "grad_norm": 0.3110886547328817, "learning_rate": 1.9853220658556474e-05, "loss": 0.246, "step": 204 }, { "epoch": 0.3002563163676309, "grad_norm": 0.3439412203840107, "learning_rate": 1.984881764381254e-05, "loss": 0.2245, "step": 205 }, { "epoch": 0.3017209813255218, "grad_norm": 0.32093087763294337, "learning_rate": 1.9844350064466488e-05, "loss": 0.2351, "step": 206 }, { "epoch": 0.30318564628341266, "grad_norm": 0.3080777182901835, "learning_rate": 1.9839817949805843e-05, "loss": 0.2239, "step": 207 }, { "epoch": 0.30465031124130354, "grad_norm": 0.33465999975388977, "learning_rate": 1.9835221329541197e-05, "loss": 0.2362, "step": 208 }, { "epoch": 0.3061149761991944, "grad_norm": 0.2916494897812547, "learning_rate": 1.9830560233806006e-05, "loss": 0.2138, "step": 209 }, { "epoch": 0.3075796411570853, "grad_norm": 0.31069474576355127, "learning_rate": 1.9825834693156408e-05, "loss": 0.2381, "step": 210 }, { "epoch": 0.3090443061149762, "grad_norm": 0.30904277970849947, "learning_rate": 1.9821044738571008e-05, "loss": 0.2336, "step": 211 }, { "epoch": 0.31050897107286707, "grad_norm": 0.30754570168689266, "learning_rate": 1.981619040145068e-05, "loss": 0.2209, "step": 212 }, { "epoch": 0.31197363603075795, "grad_norm": 0.2976312068720902, "learning_rate": 1.9811271713618372e-05, "loss": 0.2558, "step": 213 }, { "epoch": 0.31343830098864883, "grad_norm": 0.3039100122516498, "learning_rate": 1.980628870731888e-05, "loss": 0.2265, "step": 214 }, { "epoch": 0.3149029659465397, "grad_norm": 0.3136540735451798, "learning_rate": 1.9801241415218636e-05, "loss": 0.2551, "step": 215 }, { "epoch": 0.3163676309044306, "grad_norm": 0.30585301656209735, "learning_rate": 1.979612987040552e-05, "loss": 0.2258, "step": 216 }, { "epoch": 0.3178322958623215, "grad_norm": 0.3334394299522592, "learning_rate": 1.9790954106388614e-05, "loss": 0.2374, "step": 217 }, { "epoch": 0.31929696082021236, "grad_norm": 0.3385575692037135, "learning_rate": 1.9785714157097992e-05, "loss": 0.2331, "step": 218 }, { "epoch": 0.32076162577810324, "grad_norm": 0.3550097472590768, "learning_rate": 1.9780410056884505e-05, "loss": 0.2316, "step": 219 }, { "epoch": 0.3222262907359941, "grad_norm": 0.3212031876917821, "learning_rate": 1.9775041840519547e-05, "loss": 0.264, "step": 220 }, { "epoch": 0.323690955693885, "grad_norm": 0.3417161392653498, "learning_rate": 1.976960954319483e-05, "loss": 0.2366, "step": 221 }, { "epoch": 0.3251556206517759, "grad_norm": 0.31739055359370316, "learning_rate": 1.9764113200522153e-05, "loss": 0.2193, "step": 222 }, { "epoch": 0.32662028560966677, "grad_norm": 0.3475167845756447, "learning_rate": 1.9758552848533168e-05, "loss": 0.2381, "step": 223 }, { "epoch": 0.32808495056755765, "grad_norm": 0.3227407213456651, "learning_rate": 1.9752928523679145e-05, "loss": 0.2403, "step": 224 }, { "epoch": 0.32954961552544854, "grad_norm": 0.32134472119602675, "learning_rate": 1.9747240262830734e-05, "loss": 0.244, "step": 225 }, { "epoch": 0.3310142804833394, "grad_norm": 0.3060769394485448, "learning_rate": 1.9741488103277722e-05, "loss": 0.2029, "step": 226 }, { "epoch": 0.3324789454412303, "grad_norm": 0.3422387817778384, "learning_rate": 1.9735672082728785e-05, "loss": 0.2166, "step": 227 }, { "epoch": 0.3339436103991212, "grad_norm": 0.29907859948451515, "learning_rate": 1.9729792239311243e-05, "loss": 0.223, "step": 228 }, { "epoch": 0.33540827535701206, "grad_norm": 0.27950206671923955, "learning_rate": 1.972384861157082e-05, "loss": 0.1952, "step": 229 }, { "epoch": 0.33687294031490295, "grad_norm": 0.3083999590796091, "learning_rate": 1.9717841238471377e-05, "loss": 0.2188, "step": 230 }, { "epoch": 0.33833760527279383, "grad_norm": 0.295065591964525, "learning_rate": 1.9711770159394654e-05, "loss": 0.2091, "step": 231 }, { "epoch": 0.3398022702306847, "grad_norm": 0.30797729325093864, "learning_rate": 1.9705635414140035e-05, "loss": 0.2106, "step": 232 }, { "epoch": 0.3412669351885756, "grad_norm": 0.31169542447907694, "learning_rate": 1.9699437042924266e-05, "loss": 0.2398, "step": 233 }, { "epoch": 0.3427316001464665, "grad_norm": 0.340358999491338, "learning_rate": 1.969317508638119e-05, "loss": 0.2564, "step": 234 }, { "epoch": 0.34419626510435736, "grad_norm": 0.3017727587765966, "learning_rate": 1.96868495855615e-05, "loss": 0.2024, "step": 235 }, { "epoch": 0.34566093006224824, "grad_norm": 0.33465091974213557, "learning_rate": 1.9680460581932448e-05, "loss": 0.2418, "step": 236 }, { "epoch": 0.3471255950201391, "grad_norm": 0.29191015974637435, "learning_rate": 1.967400811737759e-05, "loss": 0.2018, "step": 237 }, { "epoch": 0.34859025997803, "grad_norm": 0.28852204171964263, "learning_rate": 1.96674922341965e-05, "loss": 0.2341, "step": 238 }, { "epoch": 0.3500549249359209, "grad_norm": 0.32713188846613395, "learning_rate": 1.96609129751045e-05, "loss": 0.2304, "step": 239 }, { "epoch": 0.35151958989381177, "grad_norm": 0.2960166330763219, "learning_rate": 1.9654270383232377e-05, "loss": 0.218, "step": 240 }, { "epoch": 0.35298425485170265, "grad_norm": 0.3129431801548695, "learning_rate": 1.9647564502126094e-05, "loss": 0.2117, "step": 241 }, { "epoch": 0.35444891980959353, "grad_norm": 0.294963922960848, "learning_rate": 1.964079537574652e-05, "loss": 0.2203, "step": 242 }, { "epoch": 0.3559135847674844, "grad_norm": 0.3164361378068273, "learning_rate": 1.963396304846913e-05, "loss": 0.2199, "step": 243 }, { "epoch": 0.3573782497253753, "grad_norm": 0.3207780914062909, "learning_rate": 1.9627067565083716e-05, "loss": 0.2094, "step": 244 }, { "epoch": 0.3588429146832662, "grad_norm": 0.31070213874769786, "learning_rate": 1.962010897079409e-05, "loss": 0.1875, "step": 245 }, { "epoch": 0.36030757964115706, "grad_norm": 0.30122482811704177, "learning_rate": 1.96130873112178e-05, "loss": 0.2377, "step": 246 }, { "epoch": 0.36177224459904794, "grad_norm": 0.3070889454659401, "learning_rate": 1.9606002632385817e-05, "loss": 0.1948, "step": 247 }, { "epoch": 0.3632369095569388, "grad_norm": 0.30277497066736986, "learning_rate": 1.959885498074224e-05, "loss": 0.2122, "step": 248 }, { "epoch": 0.3647015745148297, "grad_norm": 0.33010916657115674, "learning_rate": 1.9591644403143997e-05, "loss": 0.1988, "step": 249 }, { "epoch": 0.3661662394727206, "grad_norm": 0.3187016737580306, "learning_rate": 1.958437094686052e-05, "loss": 0.1954, "step": 250 }, { "epoch": 0.36763090443061147, "grad_norm": 0.29492570210704677, "learning_rate": 1.9577034659573452e-05, "loss": 0.1932, "step": 251 }, { "epoch": 0.36909556938850235, "grad_norm": 0.31163033056152234, "learning_rate": 1.956963558937633e-05, "loss": 0.234, "step": 252 }, { "epoch": 0.37056023434639324, "grad_norm": 0.29081575214116195, "learning_rate": 1.9562173784774274e-05, "loss": 0.2256, "step": 253 }, { "epoch": 0.3720248993042841, "grad_norm": 0.2859981212044829, "learning_rate": 1.955464929468365e-05, "loss": 0.1863, "step": 254 }, { "epoch": 0.373489564262175, "grad_norm": 0.3362313373130634, "learning_rate": 1.9547062168431777e-05, "loss": 0.2191, "step": 255 }, { "epoch": 0.3749542292200659, "grad_norm": 0.31580606455125804, "learning_rate": 1.9539412455756578e-05, "loss": 0.2219, "step": 256 }, { "epoch": 0.3764188941779568, "grad_norm": 0.3138360531715862, "learning_rate": 1.9531700206806274e-05, "loss": 0.2216, "step": 257 }, { "epoch": 0.3778835591358477, "grad_norm": 0.30577065715195095, "learning_rate": 1.952392547213904e-05, "loss": 0.2143, "step": 258 }, { "epoch": 0.3793482240937386, "grad_norm": 0.3030869019075986, "learning_rate": 1.9516088302722696e-05, "loss": 0.2155, "step": 259 }, { "epoch": 0.38081288905162947, "grad_norm": 0.33061172813654255, "learning_rate": 1.9508188749934333e-05, "loss": 0.2375, "step": 260 }, { "epoch": 0.38227755400952035, "grad_norm": 0.31425985710724597, "learning_rate": 1.9500226865560015e-05, "loss": 0.197, "step": 261 }, { "epoch": 0.38374221896741123, "grad_norm": 0.3084208020631583, "learning_rate": 1.9492202701794432e-05, "loss": 0.2062, "step": 262 }, { "epoch": 0.3852068839253021, "grad_norm": 0.31591794037568377, "learning_rate": 1.9484116311240534e-05, "loss": 0.2012, "step": 263 }, { "epoch": 0.386671548883193, "grad_norm": 0.3243347143178351, "learning_rate": 1.9475967746909212e-05, "loss": 0.2157, "step": 264 }, { "epoch": 0.3881362138410839, "grad_norm": 0.3126285934402634, "learning_rate": 1.946775706221894e-05, "loss": 0.2015, "step": 265 }, { "epoch": 0.38960087879897476, "grad_norm": 0.29362391198343335, "learning_rate": 1.945948431099543e-05, "loss": 0.2082, "step": 266 }, { "epoch": 0.39106554375686564, "grad_norm": 0.3306599758257121, "learning_rate": 1.945114954747127e-05, "loss": 0.2217, "step": 267 }, { "epoch": 0.3925302087147565, "grad_norm": 0.31104138128230796, "learning_rate": 1.9442752826285578e-05, "loss": 0.2347, "step": 268 }, { "epoch": 0.3939948736726474, "grad_norm": 0.2882519240767416, "learning_rate": 1.9434294202483634e-05, "loss": 0.189, "step": 269 }, { "epoch": 0.3954595386305383, "grad_norm": 0.2993561393461098, "learning_rate": 1.9425773731516534e-05, "loss": 0.197, "step": 270 }, { "epoch": 0.39692420358842917, "grad_norm": 0.31696676157406817, "learning_rate": 1.9417191469240806e-05, "loss": 0.1853, "step": 271 }, { "epoch": 0.39838886854632005, "grad_norm": 0.2976550371764201, "learning_rate": 1.940854747191806e-05, "loss": 0.2062, "step": 272 }, { "epoch": 0.39985353350421093, "grad_norm": 0.3102511410212597, "learning_rate": 1.9399841796214625e-05, "loss": 0.2118, "step": 273 }, { "epoch": 0.4013181984621018, "grad_norm": 0.29711750834500367, "learning_rate": 1.9391074499201155e-05, "loss": 0.1736, "step": 274 }, { "epoch": 0.4027828634199927, "grad_norm": 0.33461146425534555, "learning_rate": 1.938224563835226e-05, "loss": 0.2023, "step": 275 }, { "epoch": 0.4042475283778836, "grad_norm": 0.2969695409049403, "learning_rate": 1.9373355271546156e-05, "loss": 0.2067, "step": 276 }, { "epoch": 0.40571219333577446, "grad_norm": 0.32397668137142743, "learning_rate": 1.9364403457064252e-05, "loss": 0.2266, "step": 277 }, { "epoch": 0.40717685829366534, "grad_norm": 0.35852753225097855, "learning_rate": 1.9355390253590775e-05, "loss": 0.2582, "step": 278 }, { "epoch": 0.4086415232515562, "grad_norm": 0.2936115798624861, "learning_rate": 1.9346315720212416e-05, "loss": 0.1899, "step": 279 }, { "epoch": 0.4101061882094471, "grad_norm": 0.3229854576353249, "learning_rate": 1.933717991641789e-05, "loss": 0.2103, "step": 280 }, { "epoch": 0.411570853167338, "grad_norm": 0.29206997385224825, "learning_rate": 1.9327982902097596e-05, "loss": 0.2157, "step": 281 }, { "epoch": 0.4130355181252289, "grad_norm": 0.30505940530286574, "learning_rate": 1.931872473754319e-05, "loss": 0.2077, "step": 282 }, { "epoch": 0.41450018308311976, "grad_norm": 0.2973043577148213, "learning_rate": 1.9309405483447208e-05, "loss": 0.1964, "step": 283 }, { "epoch": 0.41596484804101064, "grad_norm": 0.29785754593184066, "learning_rate": 1.9300025200902666e-05, "loss": 0.1913, "step": 284 }, { "epoch": 0.4174295129989015, "grad_norm": 0.33220986093953964, "learning_rate": 1.9290583951402648e-05, "loss": 0.1997, "step": 285 }, { "epoch": 0.4188941779567924, "grad_norm": 0.3165354043864197, "learning_rate": 1.9281081796839915e-05, "loss": 0.2028, "step": 286 }, { "epoch": 0.4203588429146833, "grad_norm": 0.30361046171154804, "learning_rate": 1.9271518799506494e-05, "loss": 0.2004, "step": 287 }, { "epoch": 0.42182350787257417, "grad_norm": 0.3125634361106797, "learning_rate": 1.9261895022093275e-05, "loss": 0.1981, "step": 288 }, { "epoch": 0.42328817283046505, "grad_norm": 0.30844567580026055, "learning_rate": 1.9252210527689596e-05, "loss": 0.2073, "step": 289 }, { "epoch": 0.42475283778835593, "grad_norm": 0.3155331823234916, "learning_rate": 1.9242465379782823e-05, "loss": 0.1854, "step": 290 }, { "epoch": 0.4262175027462468, "grad_norm": 0.34604055990819416, "learning_rate": 1.9232659642257942e-05, "loss": 0.2627, "step": 291 }, { "epoch": 0.4276821677041377, "grad_norm": 0.31441965292621443, "learning_rate": 1.9222793379397146e-05, "loss": 0.2214, "step": 292 }, { "epoch": 0.4291468326620286, "grad_norm": 0.3341564227129849, "learning_rate": 1.9212866655879397e-05, "loss": 0.2002, "step": 293 }, { "epoch": 0.43061149761991946, "grad_norm": 0.314168791046398, "learning_rate": 1.9202879536780013e-05, "loss": 0.199, "step": 294 }, { "epoch": 0.43207616257781034, "grad_norm": 0.3415225885868485, "learning_rate": 1.919283208757025e-05, "loss": 0.1915, "step": 295 }, { "epoch": 0.4335408275357012, "grad_norm": 0.34338764313976977, "learning_rate": 1.918272437411684e-05, "loss": 0.2134, "step": 296 }, { "epoch": 0.4350054924935921, "grad_norm": 0.30091408846543016, "learning_rate": 1.91725564626816e-05, "loss": 0.1956, "step": 297 }, { "epoch": 0.436470157451483, "grad_norm": 0.30066058971128345, "learning_rate": 1.9162328419920976e-05, "loss": 0.1901, "step": 298 }, { "epoch": 0.43793482240937387, "grad_norm": 0.30021693344518857, "learning_rate": 1.9152040312885604e-05, "loss": 0.195, "step": 299 }, { "epoch": 0.43939948736726475, "grad_norm": 0.30589418789890954, "learning_rate": 1.914169220901988e-05, "loss": 0.1904, "step": 300 }, { "epoch": 0.44086415232515563, "grad_norm": 0.2916943622706973, "learning_rate": 1.9131284176161505e-05, "loss": 0.1862, "step": 301 }, { "epoch": 0.4423288172830465, "grad_norm": 0.3111182978529323, "learning_rate": 1.9120816282541062e-05, "loss": 0.1903, "step": 302 }, { "epoch": 0.4437934822409374, "grad_norm": 0.296170352579387, "learning_rate": 1.911028859678155e-05, "loss": 0.1859, "step": 303 }, { "epoch": 0.4452581471988283, "grad_norm": 0.3144098925860641, "learning_rate": 1.9099701187897927e-05, "loss": 0.2115, "step": 304 }, { "epoch": 0.44672281215671916, "grad_norm": 0.28420551119731563, "learning_rate": 1.9089054125296692e-05, "loss": 0.1786, "step": 305 }, { "epoch": 0.44818747711461004, "grad_norm": 0.31553201198823727, "learning_rate": 1.907834747877539e-05, "loss": 0.1828, "step": 306 }, { "epoch": 0.4496521420725009, "grad_norm": 0.3045234900369304, "learning_rate": 1.906758131852218e-05, "loss": 0.1969, "step": 307 }, { "epoch": 0.4511168070303918, "grad_norm": 0.3172827302837682, "learning_rate": 1.9056755715115372e-05, "loss": 0.1961, "step": 308 }, { "epoch": 0.4525814719882827, "grad_norm": 0.31871947429630393, "learning_rate": 1.9045870739522953e-05, "loss": 0.1833, "step": 309 }, { "epoch": 0.4540461369461736, "grad_norm": 0.29640983142077165, "learning_rate": 1.9034926463102122e-05, "loss": 0.1735, "step": 310 }, { "epoch": 0.45551080190406446, "grad_norm": 0.2838973203329574, "learning_rate": 1.9023922957598847e-05, "loss": 0.1879, "step": 311 }, { "epoch": 0.45697546686195534, "grad_norm": 0.31619551941395607, "learning_rate": 1.901286029514736e-05, "loss": 0.1981, "step": 312 }, { "epoch": 0.4584401318198462, "grad_norm": 0.28916971260480706, "learning_rate": 1.9001738548269707e-05, "loss": 0.1681, "step": 313 }, { "epoch": 0.4599047967777371, "grad_norm": 0.29652343709617623, "learning_rate": 1.8990557789875265e-05, "loss": 0.1752, "step": 314 }, { "epoch": 0.461369461735628, "grad_norm": 0.2952557530849967, "learning_rate": 1.8979318093260268e-05, "loss": 0.1731, "step": 315 }, { "epoch": 0.46283412669351887, "grad_norm": 0.30085307847746645, "learning_rate": 1.8968019532107318e-05, "loss": 0.1975, "step": 316 }, { "epoch": 0.46429879165140975, "grad_norm": 0.3187121453479262, "learning_rate": 1.8956662180484913e-05, "loss": 0.198, "step": 317 }, { "epoch": 0.46576345660930063, "grad_norm": 0.30663390384150385, "learning_rate": 1.8945246112846952e-05, "loss": 0.1835, "step": 318 }, { "epoch": 0.4672281215671915, "grad_norm": 0.30881852574686636, "learning_rate": 1.893377140403225e-05, "loss": 0.1828, "step": 319 }, { "epoch": 0.4686927865250824, "grad_norm": 0.300908778514595, "learning_rate": 1.892223812926406e-05, "loss": 0.2214, "step": 320 }, { "epoch": 0.4701574514829733, "grad_norm": 0.2860561871686056, "learning_rate": 1.8910646364149548e-05, "loss": 0.1938, "step": 321 }, { "epoch": 0.47162211644086416, "grad_norm": 0.2772670520674923, "learning_rate": 1.889899618467933e-05, "loss": 0.1668, "step": 322 }, { "epoch": 0.47308678139875504, "grad_norm": 0.3062971263411581, "learning_rate": 1.8887287667226964e-05, "loss": 0.199, "step": 323 }, { "epoch": 0.4745514463566459, "grad_norm": 0.32124027529820043, "learning_rate": 1.887552088854844e-05, "loss": 0.2083, "step": 324 }, { "epoch": 0.4760161113145368, "grad_norm": 0.295114219753382, "learning_rate": 1.8863695925781685e-05, "loss": 0.1887, "step": 325 }, { "epoch": 0.4774807762724277, "grad_norm": 0.3194350024464684, "learning_rate": 1.8851812856446062e-05, "loss": 0.2086, "step": 326 }, { "epoch": 0.47894544123031857, "grad_norm": 0.29449304384305286, "learning_rate": 1.8839871758441842e-05, "loss": 0.1898, "step": 327 }, { "epoch": 0.48041010618820945, "grad_norm": 0.28550779650922725, "learning_rate": 1.882787271004972e-05, "loss": 0.1751, "step": 328 }, { "epoch": 0.48187477114610033, "grad_norm": 0.28143493349397003, "learning_rate": 1.8815815789930277e-05, "loss": 0.1853, "step": 329 }, { "epoch": 0.4833394361039912, "grad_norm": 0.3088998091001307, "learning_rate": 1.8803701077123492e-05, "loss": 0.1901, "step": 330 }, { "epoch": 0.4848041010618821, "grad_norm": 0.31201486962963987, "learning_rate": 1.8791528651048193e-05, "loss": 0.1956, "step": 331 }, { "epoch": 0.486268766019773, "grad_norm": 0.30000644267131227, "learning_rate": 1.8779298591501565e-05, "loss": 0.1852, "step": 332 }, { "epoch": 0.48773343097766386, "grad_norm": 0.279989644276303, "learning_rate": 1.8767010978658597e-05, "loss": 0.1564, "step": 333 }, { "epoch": 0.48919809593555474, "grad_norm": 0.31244506310168246, "learning_rate": 1.8754665893071583e-05, "loss": 0.2094, "step": 334 }, { "epoch": 0.4906627608934456, "grad_norm": 0.32128122069078735, "learning_rate": 1.874226341566958e-05, "loss": 0.2013, "step": 335 }, { "epoch": 0.4921274258513365, "grad_norm": 0.30462090670077413, "learning_rate": 1.872980362775789e-05, "loss": 0.1779, "step": 336 }, { "epoch": 0.4935920908092274, "grad_norm": 0.30874094182185663, "learning_rate": 1.87172866110175e-05, "loss": 0.1976, "step": 337 }, { "epoch": 0.4950567557671183, "grad_norm": 0.2978419876068271, "learning_rate": 1.870471244750458e-05, "loss": 0.1718, "step": 338 }, { "epoch": 0.49652142072500915, "grad_norm": 0.30085229404822755, "learning_rate": 1.8692081219649926e-05, "loss": 0.1729, "step": 339 }, { "epoch": 0.49798608568290004, "grad_norm": 0.309384694321714, "learning_rate": 1.867939301025842e-05, "loss": 0.1882, "step": 340 }, { "epoch": 0.4994507506407909, "grad_norm": 0.28838820512419444, "learning_rate": 1.8666647902508493e-05, "loss": 0.1789, "step": 341 }, { "epoch": 0.5009154155986818, "grad_norm": 0.32424956980873243, "learning_rate": 1.8653845979951577e-05, "loss": 0.1663, "step": 342 }, { "epoch": 0.5023800805565727, "grad_norm": 0.31499571469715115, "learning_rate": 1.864098732651155e-05, "loss": 0.1991, "step": 343 }, { "epoch": 0.5038447455144636, "grad_norm": 0.3158502822606953, "learning_rate": 1.8628072026484215e-05, "loss": 0.2042, "step": 344 }, { "epoch": 0.5053094104723544, "grad_norm": 0.3006662825507403, "learning_rate": 1.8615100164536696e-05, "loss": 0.1799, "step": 345 }, { "epoch": 0.5067740754302453, "grad_norm": 0.3197735718886773, "learning_rate": 1.8602071825706928e-05, "loss": 0.2037, "step": 346 }, { "epoch": 0.5082387403881362, "grad_norm": 0.29431082437854705, "learning_rate": 1.858898709540309e-05, "loss": 0.1624, "step": 347 }, { "epoch": 0.5097034053460271, "grad_norm": 0.3319207275942643, "learning_rate": 1.8575846059403036e-05, "loss": 0.1781, "step": 348 }, { "epoch": 0.511168070303918, "grad_norm": 0.3548295674736163, "learning_rate": 1.856264880385372e-05, "loss": 0.1777, "step": 349 }, { "epoch": 0.5126327352618089, "grad_norm": 0.3418284407905513, "learning_rate": 1.8549395415270664e-05, "loss": 0.1658, "step": 350 }, { "epoch": 0.5140974002196997, "grad_norm": 0.28587381220460634, "learning_rate": 1.8536085980537374e-05, "loss": 0.1521, "step": 351 }, { "epoch": 0.5155620651775906, "grad_norm": 0.28655840819303974, "learning_rate": 1.8522720586904758e-05, "loss": 0.177, "step": 352 }, { "epoch": 0.5170267301354815, "grad_norm": 0.3101035018861679, "learning_rate": 1.8509299321990583e-05, "loss": 0.1728, "step": 353 }, { "epoch": 0.5184913950933724, "grad_norm": 0.309153206811744, "learning_rate": 1.8495822273778867e-05, "loss": 0.1853, "step": 354 }, { "epoch": 0.5199560600512633, "grad_norm": 0.3072944032463391, "learning_rate": 1.8482289530619332e-05, "loss": 0.1994, "step": 355 }, { "epoch": 0.5214207250091542, "grad_norm": 0.2882455459923482, "learning_rate": 1.8468701181226803e-05, "loss": 0.1719, "step": 356 }, { "epoch": 0.522885389967045, "grad_norm": 0.3009687596348506, "learning_rate": 1.8455057314680646e-05, "loss": 0.1762, "step": 357 }, { "epoch": 0.5243500549249359, "grad_norm": 0.28709132379458197, "learning_rate": 1.8441358020424168e-05, "loss": 0.1872, "step": 358 }, { "epoch": 0.5258147198828268, "grad_norm": 0.3252839006016281, "learning_rate": 1.8427603388264027e-05, "loss": 0.1658, "step": 359 }, { "epoch": 0.5272793848407177, "grad_norm": 0.2794496573065983, "learning_rate": 1.8413793508369667e-05, "loss": 0.1536, "step": 360 }, { "epoch": 0.5287440497986086, "grad_norm": 0.295826215328857, "learning_rate": 1.839992847127271e-05, "loss": 0.1728, "step": 361 }, { "epoch": 0.5302087147564994, "grad_norm": 0.3057004431965132, "learning_rate": 1.838600836786635e-05, "loss": 0.1737, "step": 362 }, { "epoch": 0.5316733797143903, "grad_norm": 0.3032546345594625, "learning_rate": 1.8372033289404795e-05, "loss": 0.1804, "step": 363 }, { "epoch": 0.5331380446722812, "grad_norm": 0.31826939348019884, "learning_rate": 1.835800332750263e-05, "loss": 0.1567, "step": 364 }, { "epoch": 0.5346027096301721, "grad_norm": 0.3382986306853968, "learning_rate": 1.834391857413423e-05, "loss": 0.1857, "step": 365 }, { "epoch": 0.536067374588063, "grad_norm": 0.29281271654639596, "learning_rate": 1.8329779121633177e-05, "loss": 0.1755, "step": 366 }, { "epoch": 0.5375320395459539, "grad_norm": 0.32331356405290573, "learning_rate": 1.8315585062691616e-05, "loss": 0.174, "step": 367 }, { "epoch": 0.5389967045038447, "grad_norm": 0.3058216598921902, "learning_rate": 1.830133649035968e-05, "loss": 0.1743, "step": 368 }, { "epoch": 0.5404613694617356, "grad_norm": 0.3067354134944423, "learning_rate": 1.828703349804487e-05, "loss": 0.1715, "step": 369 }, { "epoch": 0.5419260344196265, "grad_norm": 0.36661193734735914, "learning_rate": 1.8272676179511428e-05, "loss": 0.1738, "step": 370 }, { "epoch": 0.5433906993775174, "grad_norm": 0.30007784767963736, "learning_rate": 1.8258264628879753e-05, "loss": 0.1552, "step": 371 }, { "epoch": 0.5448553643354083, "grad_norm": 0.3058776353752914, "learning_rate": 1.8243798940625752e-05, "loss": 0.1656, "step": 372 }, { "epoch": 0.5463200292932991, "grad_norm": 0.2906329929557275, "learning_rate": 1.8229279209580245e-05, "loss": 0.1724, "step": 373 }, { "epoch": 0.54778469425119, "grad_norm": 0.2764957936178043, "learning_rate": 1.8214705530928322e-05, "loss": 0.1569, "step": 374 }, { "epoch": 0.5492493592090809, "grad_norm": 0.31882196921773603, "learning_rate": 1.8200078000208745e-05, "loss": 0.1618, "step": 375 }, { "epoch": 0.5507140241669718, "grad_norm": 0.3031929350492432, "learning_rate": 1.818539671331329e-05, "loss": 0.1821, "step": 376 }, { "epoch": 0.5521786891248627, "grad_norm": 0.3285177453767559, "learning_rate": 1.8170661766486147e-05, "loss": 0.1708, "step": 377 }, { "epoch": 0.5536433540827536, "grad_norm": 0.291362505718777, "learning_rate": 1.815587325632328e-05, "loss": 0.1678, "step": 378 }, { "epoch": 0.5551080190406444, "grad_norm": 0.3154222441804908, "learning_rate": 1.8141031279771777e-05, "loss": 0.1712, "step": 379 }, { "epoch": 0.5565726839985353, "grad_norm": 0.2924737445300942, "learning_rate": 1.812613593412924e-05, "loss": 0.1623, "step": 380 }, { "epoch": 0.5580373489564262, "grad_norm": 0.3327921961518248, "learning_rate": 1.8111187317043136e-05, "loss": 0.168, "step": 381 }, { "epoch": 0.5595020139143171, "grad_norm": 0.3100694105568342, "learning_rate": 1.8096185526510154e-05, "loss": 0.1457, "step": 382 }, { "epoch": 0.560966678872208, "grad_norm": 0.31380150144375096, "learning_rate": 1.8081130660875557e-05, "loss": 0.1712, "step": 383 }, { "epoch": 0.5624313438300989, "grad_norm": 0.29763863880909497, "learning_rate": 1.8066022818832564e-05, "loss": 0.1712, "step": 384 }, { "epoch": 0.5638960087879897, "grad_norm": 0.31673229457527147, "learning_rate": 1.805086209942166e-05, "loss": 0.1642, "step": 385 }, { "epoch": 0.5653606737458806, "grad_norm": 0.28416394082082747, "learning_rate": 1.8035648602029997e-05, "loss": 0.1568, "step": 386 }, { "epoch": 0.5668253387037715, "grad_norm": 0.3118223455455188, "learning_rate": 1.80203824263907e-05, "loss": 0.2036, "step": 387 }, { "epoch": 0.5682900036616624, "grad_norm": 0.31436408212659767, "learning_rate": 1.8005063672582236e-05, "loss": 0.1761, "step": 388 }, { "epoch": 0.5697546686195533, "grad_norm": 0.30744846641916707, "learning_rate": 1.7989692441027744e-05, "loss": 0.153, "step": 389 }, { "epoch": 0.5712193335774441, "grad_norm": 0.29045516723158216, "learning_rate": 1.7974268832494397e-05, "loss": 0.1449, "step": 390 }, { "epoch": 0.572683998535335, "grad_norm": 0.2752010942760226, "learning_rate": 1.7958792948092726e-05, "loss": 0.181, "step": 391 }, { "epoch": 0.5741486634932259, "grad_norm": 0.3004642528909644, "learning_rate": 1.7943264889275944e-05, "loss": 0.167, "step": 392 }, { "epoch": 0.5756133284511168, "grad_norm": 0.28533925990840686, "learning_rate": 1.792768475783932e-05, "loss": 0.1508, "step": 393 }, { "epoch": 0.5770779934090077, "grad_norm": 0.3074271645217073, "learning_rate": 1.7912052655919478e-05, "loss": 0.1614, "step": 394 }, { "epoch": 0.5785426583668986, "grad_norm": 0.2860515134927835, "learning_rate": 1.7896368685993738e-05, "loss": 0.1467, "step": 395 }, { "epoch": 0.5800073233247894, "grad_norm": 0.2826797056983111, "learning_rate": 1.7880632950879438e-05, "loss": 0.1647, "step": 396 }, { "epoch": 0.5814719882826803, "grad_norm": 0.28758395090779254, "learning_rate": 1.7864845553733276e-05, "loss": 0.1643, "step": 397 }, { "epoch": 0.5829366532405712, "grad_norm": 0.29461983196821356, "learning_rate": 1.7849006598050626e-05, "loss": 0.1762, "step": 398 }, { "epoch": 0.5844013181984621, "grad_norm": 0.2777593396108342, "learning_rate": 1.7833116187664846e-05, "loss": 0.1561, "step": 399 }, { "epoch": 0.585865983156353, "grad_norm": 0.30600321149552173, "learning_rate": 1.781717442674662e-05, "loss": 0.1777, "step": 400 }, { "epoch": 0.5873306481142438, "grad_norm": 0.2758705111537229, "learning_rate": 1.7801181419803257e-05, "loss": 0.1556, "step": 401 }, { "epoch": 0.5887953130721347, "grad_norm": 0.2625670019142076, "learning_rate": 1.7785137271678013e-05, "loss": 0.1468, "step": 402 }, { "epoch": 0.5902599780300256, "grad_norm": 0.29391559042563303, "learning_rate": 1.776904208754941e-05, "loss": 0.1823, "step": 403 }, { "epoch": 0.5917246429879165, "grad_norm": 0.2824926653515285, "learning_rate": 1.7752895972930538e-05, "loss": 0.1638, "step": 404 }, { "epoch": 0.5931893079458074, "grad_norm": 0.31289373581775576, "learning_rate": 1.7736699033668353e-05, "loss": 0.1969, "step": 405 }, { "epoch": 0.5946539729036983, "grad_norm": 0.28541795717364044, "learning_rate": 1.772045137594301e-05, "loss": 0.1445, "step": 406 }, { "epoch": 0.5961186378615891, "grad_norm": 0.30005077280176484, "learning_rate": 1.770415310626715e-05, "loss": 0.188, "step": 407 }, { "epoch": 0.59758330281948, "grad_norm": 0.27829833597465525, "learning_rate": 1.7687804331485203e-05, "loss": 0.196, "step": 408 }, { "epoch": 0.5990479677773709, "grad_norm": 0.28631318068873735, "learning_rate": 1.7671405158772686e-05, "loss": 0.1624, "step": 409 }, { "epoch": 0.6005126327352618, "grad_norm": 0.2867469013067658, "learning_rate": 1.7654955695635498e-05, "loss": 0.1839, "step": 410 }, { "epoch": 0.6019772976931527, "grad_norm": 0.2737552258396289, "learning_rate": 1.7638456049909238e-05, "loss": 0.1553, "step": 411 }, { "epoch": 0.6034419626510436, "grad_norm": 0.3136051529335448, "learning_rate": 1.7621906329758466e-05, "loss": 0.175, "step": 412 }, { "epoch": 0.6049066276089344, "grad_norm": 0.2773613881722616, "learning_rate": 1.760530664367601e-05, "loss": 0.1696, "step": 413 }, { "epoch": 0.6063712925668253, "grad_norm": 0.29022770109259144, "learning_rate": 1.758865710048225e-05, "loss": 0.163, "step": 414 }, { "epoch": 0.6078359575247162, "grad_norm": 0.28813354195965973, "learning_rate": 1.7571957809324422e-05, "loss": 0.1649, "step": 415 }, { "epoch": 0.6093006224826071, "grad_norm": 0.2985551015333967, "learning_rate": 1.7555208879675875e-05, "loss": 0.1737, "step": 416 }, { "epoch": 0.610765287440498, "grad_norm": 0.31633838685259236, "learning_rate": 1.7538410421335373e-05, "loss": 0.1606, "step": 417 }, { "epoch": 0.6122299523983888, "grad_norm": 0.29552853603848234, "learning_rate": 1.752156254442636e-05, "loss": 0.1824, "step": 418 }, { "epoch": 0.6136946173562797, "grad_norm": 0.31144800745719076, "learning_rate": 1.7504665359396255e-05, "loss": 0.1501, "step": 419 }, { "epoch": 0.6151592823141706, "grad_norm": 0.30007771034473985, "learning_rate": 1.748771897701572e-05, "loss": 0.1756, "step": 420 }, { "epoch": 0.6166239472720615, "grad_norm": 0.3153299684246117, "learning_rate": 1.7470723508377935e-05, "loss": 0.1727, "step": 421 }, { "epoch": 0.6180886122299524, "grad_norm": 0.29874905779465544, "learning_rate": 1.745367906489786e-05, "loss": 0.1583, "step": 422 }, { "epoch": 0.6195532771878433, "grad_norm": 0.313158860779259, "learning_rate": 1.7436585758311512e-05, "loss": 0.1776, "step": 423 }, { "epoch": 0.6210179421457341, "grad_norm": 0.29027418449570486, "learning_rate": 1.7419443700675248e-05, "loss": 0.1615, "step": 424 }, { "epoch": 0.622482607103625, "grad_norm": 0.27091112505454484, "learning_rate": 1.7402253004365007e-05, "loss": 0.1471, "step": 425 }, { "epoch": 0.6239472720615159, "grad_norm": 0.31079312606497544, "learning_rate": 1.7385013782075575e-05, "loss": 0.1642, "step": 426 }, { "epoch": 0.6254119370194068, "grad_norm": 0.3191231075261632, "learning_rate": 1.736772614681987e-05, "loss": 0.1799, "step": 427 }, { "epoch": 0.6268766019772977, "grad_norm": 0.29107955830633764, "learning_rate": 1.7350390211928167e-05, "loss": 0.1364, "step": 428 }, { "epoch": 0.6283412669351885, "grad_norm": 0.28799608478930533, "learning_rate": 1.7333006091047386e-05, "loss": 0.1446, "step": 429 }, { "epoch": 0.6298059318930794, "grad_norm": 0.2805703556245509, "learning_rate": 1.7315573898140324e-05, "loss": 0.1578, "step": 430 }, { "epoch": 0.6312705968509703, "grad_norm": 0.3022531763710714, "learning_rate": 1.7298093747484923e-05, "loss": 0.1454, "step": 431 }, { "epoch": 0.6327352618088612, "grad_norm": 0.29056765252531186, "learning_rate": 1.7280565753673517e-05, "loss": 0.1409, "step": 432 }, { "epoch": 0.6341999267667521, "grad_norm": 0.28348330072419187, "learning_rate": 1.7262990031612072e-05, "loss": 0.1632, "step": 433 }, { "epoch": 0.635664591724643, "grad_norm": 0.27849254820160685, "learning_rate": 1.7245366696519448e-05, "loss": 0.1382, "step": 434 }, { "epoch": 0.6371292566825338, "grad_norm": 0.2849173212539919, "learning_rate": 1.7227695863926627e-05, "loss": 0.1404, "step": 435 }, { "epoch": 0.6385939216404247, "grad_norm": 0.29763893191664237, "learning_rate": 1.7209977649675975e-05, "loss": 0.1699, "step": 436 }, { "epoch": 0.6400585865983156, "grad_norm": 0.28500529876174013, "learning_rate": 1.7192212169920458e-05, "loss": 0.154, "step": 437 }, { "epoch": 0.6415232515562065, "grad_norm": 0.29341182785229564, "learning_rate": 1.717439954112291e-05, "loss": 0.1629, "step": 438 }, { "epoch": 0.6429879165140974, "grad_norm": 0.2833467699130664, "learning_rate": 1.7156539880055236e-05, "loss": 0.1479, "step": 439 }, { "epoch": 0.6444525814719883, "grad_norm": 0.29432333457804116, "learning_rate": 1.7138633303797676e-05, "loss": 0.1447, "step": 440 }, { "epoch": 0.6459172464298791, "grad_norm": 0.2899857698207261, "learning_rate": 1.712067992973803e-05, "loss": 0.1707, "step": 441 }, { "epoch": 0.64738191138777, "grad_norm": 0.300112032266628, "learning_rate": 1.710267987557087e-05, "loss": 0.1441, "step": 442 }, { "epoch": 0.6488465763456609, "grad_norm": 0.28676504299001276, "learning_rate": 1.7084633259296798e-05, "loss": 0.1418, "step": 443 }, { "epoch": 0.6503112413035518, "grad_norm": 0.27688806374870634, "learning_rate": 1.706654019922164e-05, "loss": 0.1478, "step": 444 }, { "epoch": 0.6517759062614427, "grad_norm": 0.2853894260456318, "learning_rate": 1.704840081395571e-05, "loss": 0.1524, "step": 445 }, { "epoch": 0.6532405712193335, "grad_norm": 0.2946623888589574, "learning_rate": 1.703021522241298e-05, "loss": 0.1471, "step": 446 }, { "epoch": 0.6547052361772244, "grad_norm": 0.2726573492109173, "learning_rate": 1.701198354381036e-05, "loss": 0.1312, "step": 447 }, { "epoch": 0.6561699011351153, "grad_norm": 0.28410262500836964, "learning_rate": 1.6993705897666873e-05, "loss": 0.1444, "step": 448 }, { "epoch": 0.6576345660930062, "grad_norm": 0.2833743474032485, "learning_rate": 1.697538240380288e-05, "loss": 0.1596, "step": 449 }, { "epoch": 0.6590992310508971, "grad_norm": 0.28744248608886525, "learning_rate": 1.695701318233931e-05, "loss": 0.1696, "step": 450 }, { "epoch": 0.660563896008788, "grad_norm": 0.2922684857852237, "learning_rate": 1.6938598353696864e-05, "loss": 0.1822, "step": 451 }, { "epoch": 0.6620285609666788, "grad_norm": 0.2824755616354114, "learning_rate": 1.6920138038595214e-05, "loss": 0.16, "step": 452 }, { "epoch": 0.6634932259245697, "grad_norm": 0.2881779281986939, "learning_rate": 1.6901632358052226e-05, "loss": 0.1306, "step": 453 }, { "epoch": 0.6649578908824606, "grad_norm": 0.3076676560083059, "learning_rate": 1.6883081433383163e-05, "loss": 0.1693, "step": 454 }, { "epoch": 0.6664225558403515, "grad_norm": 0.28360460469529464, "learning_rate": 1.6864485386199895e-05, "loss": 0.1347, "step": 455 }, { "epoch": 0.6678872207982424, "grad_norm": 0.2792514596107041, "learning_rate": 1.6845844338410077e-05, "loss": 0.1584, "step": 456 }, { "epoch": 0.6693518857561332, "grad_norm": 0.27734844058934455, "learning_rate": 1.6827158412216396e-05, "loss": 0.1489, "step": 457 }, { "epoch": 0.6708165507140241, "grad_norm": 0.2868505259889296, "learning_rate": 1.6808427730115716e-05, "loss": 0.1556, "step": 458 }, { "epoch": 0.672281215671915, "grad_norm": 0.27438002988710974, "learning_rate": 1.6789652414898315e-05, "loss": 0.1594, "step": 459 }, { "epoch": 0.6737458806298059, "grad_norm": 0.26963685143388444, "learning_rate": 1.677083258964707e-05, "loss": 0.1544, "step": 460 }, { "epoch": 0.6752105455876968, "grad_norm": 0.27489622544391323, "learning_rate": 1.675196837773664e-05, "loss": 0.1779, "step": 461 }, { "epoch": 0.6766752105455877, "grad_norm": 0.28666936527106884, "learning_rate": 1.673305990283266e-05, "loss": 0.1389, "step": 462 }, { "epoch": 0.6781398755034785, "grad_norm": 0.2854502743190489, "learning_rate": 1.6714107288890943e-05, "loss": 0.1518, "step": 463 }, { "epoch": 0.6796045404613694, "grad_norm": 0.28486649873799424, "learning_rate": 1.6695110660156652e-05, "loss": 0.1356, "step": 464 }, { "epoch": 0.6810692054192603, "grad_norm": 0.3095642301969388, "learning_rate": 1.6676070141163498e-05, "loss": 0.1676, "step": 465 }, { "epoch": 0.6825338703771512, "grad_norm": 0.3135455510235421, "learning_rate": 1.665698585673291e-05, "loss": 0.1606, "step": 466 }, { "epoch": 0.6839985353350421, "grad_norm": 0.28266851700362705, "learning_rate": 1.6637857931973233e-05, "loss": 0.1345, "step": 467 }, { "epoch": 0.685463200292933, "grad_norm": 0.27779394936697266, "learning_rate": 1.6618686492278892e-05, "loss": 0.1682, "step": 468 }, { "epoch": 0.6869278652508238, "grad_norm": 0.27567920609962066, "learning_rate": 1.6599471663329577e-05, "loss": 0.1334, "step": 469 }, { "epoch": 0.6883925302087147, "grad_norm": 0.2754295653847063, "learning_rate": 1.6580213571089427e-05, "loss": 0.1573, "step": 470 }, { "epoch": 0.6898571951666056, "grad_norm": 0.2750050531478153, "learning_rate": 1.656091234180619e-05, "loss": 0.1773, "step": 471 }, { "epoch": 0.6913218601244965, "grad_norm": 0.27965457420033163, "learning_rate": 1.65415681020104e-05, "loss": 0.1369, "step": 472 }, { "epoch": 0.6927865250823874, "grad_norm": 0.27090231974844964, "learning_rate": 1.6522180978514556e-05, "loss": 0.1375, "step": 473 }, { "epoch": 0.6942511900402782, "grad_norm": 0.2824639859361883, "learning_rate": 1.6502751098412282e-05, "loss": 0.1246, "step": 474 }, { "epoch": 0.6957158549981691, "grad_norm": 0.28655833648300955, "learning_rate": 1.648327858907749e-05, "loss": 0.1648, "step": 475 }, { "epoch": 0.69718051995606, "grad_norm": 0.27502131490113785, "learning_rate": 1.6463763578163563e-05, "loss": 0.1501, "step": 476 }, { "epoch": 0.6986451849139509, "grad_norm": 0.26898509925311315, "learning_rate": 1.6444206193602493e-05, "loss": 0.1499, "step": 477 }, { "epoch": 0.7001098498718418, "grad_norm": 0.3350314192757342, "learning_rate": 1.642460656360406e-05, "loss": 0.1572, "step": 478 }, { "epoch": 0.7015745148297327, "grad_norm": 0.2773978701354445, "learning_rate": 1.6404964816654993e-05, "loss": 0.1272, "step": 479 }, { "epoch": 0.7030391797876235, "grad_norm": 0.2901786822045925, "learning_rate": 1.638528108151811e-05, "loss": 0.1892, "step": 480 }, { "epoch": 0.7045038447455144, "grad_norm": 0.2847317225297176, "learning_rate": 1.63655554872315e-05, "loss": 0.1579, "step": 481 }, { "epoch": 0.7059685097034053, "grad_norm": 0.2760852156995962, "learning_rate": 1.6345788163107645e-05, "loss": 0.1411, "step": 482 }, { "epoch": 0.7074331746612962, "grad_norm": 0.2812021526710764, "learning_rate": 1.6325979238732606e-05, "loss": 0.1416, "step": 483 }, { "epoch": 0.7088978396191871, "grad_norm": 0.27495150899008586, "learning_rate": 1.630612884396515e-05, "loss": 0.1457, "step": 484 }, { "epoch": 0.710362504577078, "grad_norm": 0.27821852042543493, "learning_rate": 1.62862371089359e-05, "loss": 0.1409, "step": 485 }, { "epoch": 0.7118271695349688, "grad_norm": 0.26154961742164873, "learning_rate": 1.6266304164046505e-05, "loss": 0.1392, "step": 486 }, { "epoch": 0.7132918344928597, "grad_norm": 0.3344656299415514, "learning_rate": 1.6246330139968748e-05, "loss": 0.1395, "step": 487 }, { "epoch": 0.7147564994507506, "grad_norm": 0.2889353353792984, "learning_rate": 1.6226315167643723e-05, "loss": 0.1377, "step": 488 }, { "epoch": 0.7162211644086415, "grad_norm": 0.2866205322545156, "learning_rate": 1.6206259378280956e-05, "loss": 0.1339, "step": 489 }, { "epoch": 0.7176858293665324, "grad_norm": 0.2635137582733423, "learning_rate": 1.6186162903357562e-05, "loss": 0.1384, "step": 490 }, { "epoch": 0.7191504943244232, "grad_norm": 0.264894099077168, "learning_rate": 1.616602587461736e-05, "loss": 0.1415, "step": 491 }, { "epoch": 0.7206151592823141, "grad_norm": 0.2912590760990707, "learning_rate": 1.6145848424070032e-05, "loss": 0.1377, "step": 492 }, { "epoch": 0.722079824240205, "grad_norm": 0.28322985703286563, "learning_rate": 1.612563068399024e-05, "loss": 0.1381, "step": 493 }, { "epoch": 0.7235444891980959, "grad_norm": 0.2761592201460229, "learning_rate": 1.6105372786916776e-05, "loss": 0.1205, "step": 494 }, { "epoch": 0.7250091541559868, "grad_norm": 0.2593621751501278, "learning_rate": 1.6085074865651672e-05, "loss": 0.1396, "step": 495 }, { "epoch": 0.7264738191138777, "grad_norm": 0.2698751624804496, "learning_rate": 1.6064737053259355e-05, "loss": 0.1549, "step": 496 }, { "epoch": 0.7279384840717685, "grad_norm": 0.2784921841300199, "learning_rate": 1.604435948306575e-05, "loss": 0.142, "step": 497 }, { "epoch": 0.7294031490296594, "grad_norm": 0.282990933381995, "learning_rate": 1.6023942288657423e-05, "loss": 0.1532, "step": 498 }, { "epoch": 0.7308678139875503, "grad_norm": 0.2810338678318928, "learning_rate": 1.60034856038807e-05, "loss": 0.1379, "step": 499 }, { "epoch": 0.7323324789454412, "grad_norm": 0.2964732663844032, "learning_rate": 1.5982989562840785e-05, "loss": 0.1324, "step": 500 }, { "epoch": 0.7337971439033321, "grad_norm": 0.2943783008884028, "learning_rate": 1.596245429990088e-05, "loss": 0.143, "step": 501 }, { "epoch": 0.7352618088612229, "grad_norm": 0.292096480725846, "learning_rate": 1.5941879949681323e-05, "loss": 0.1466, "step": 502 }, { "epoch": 0.7367264738191138, "grad_norm": 0.28475675120953237, "learning_rate": 1.5921266647058683e-05, "loss": 0.1573, "step": 503 }, { "epoch": 0.7381911387770047, "grad_norm": 0.2741769477206498, "learning_rate": 1.5900614527164876e-05, "loss": 0.1485, "step": 504 }, { "epoch": 0.7396558037348956, "grad_norm": 0.3040355479807483, "learning_rate": 1.5879923725386307e-05, "loss": 0.1642, "step": 505 }, { "epoch": 0.7411204686927865, "grad_norm": 0.3037335266933942, "learning_rate": 1.5859194377362942e-05, "loss": 0.1497, "step": 506 }, { "epoch": 0.7425851336506774, "grad_norm": 0.2791249180827828, "learning_rate": 1.5838426618987455e-05, "loss": 0.1521, "step": 507 }, { "epoch": 0.7440497986085682, "grad_norm": 0.281309610689647, "learning_rate": 1.5817620586404315e-05, "loss": 0.1346, "step": 508 }, { "epoch": 0.7455144635664591, "grad_norm": 0.28553330164537244, "learning_rate": 1.5796776416008897e-05, "loss": 0.1446, "step": 509 }, { "epoch": 0.74697912852435, "grad_norm": 0.2844881306714679, "learning_rate": 1.5775894244446603e-05, "loss": 0.1414, "step": 510 }, { "epoch": 0.7484437934822409, "grad_norm": 0.268988666507586, "learning_rate": 1.575497420861194e-05, "loss": 0.1288, "step": 511 }, { "epoch": 0.7499084584401318, "grad_norm": 0.27447838457593043, "learning_rate": 1.573401644564764e-05, "loss": 0.1572, "step": 512 }, { "epoch": 0.7513731233980228, "grad_norm": 0.2973833246231482, "learning_rate": 1.571302109294377e-05, "loss": 0.1406, "step": 513 }, { "epoch": 0.7528377883559136, "grad_norm": 0.27289540026887815, "learning_rate": 1.569198828813681e-05, "loss": 0.1392, "step": 514 }, { "epoch": 0.7543024533138045, "grad_norm": 0.2870617946282251, "learning_rate": 1.567091816910875e-05, "loss": 0.1543, "step": 515 }, { "epoch": 0.7557671182716954, "grad_norm": 0.27283398029853356, "learning_rate": 1.5649810873986214e-05, "loss": 0.1276, "step": 516 }, { "epoch": 0.7572317832295863, "grad_norm": 0.2637139762799822, "learning_rate": 1.5628666541139523e-05, "loss": 0.1355, "step": 517 }, { "epoch": 0.7586964481874772, "grad_norm": 0.27854483215025555, "learning_rate": 1.5607485309181813e-05, "loss": 0.1483, "step": 518 }, { "epoch": 0.760161113145368, "grad_norm": 0.28494692420422013, "learning_rate": 1.55862673169681e-05, "loss": 0.1356, "step": 519 }, { "epoch": 0.7616257781032589, "grad_norm": 0.2898114191567579, "learning_rate": 1.5565012703594403e-05, "loss": 0.141, "step": 520 }, { "epoch": 0.7630904430611498, "grad_norm": 0.2822142264481914, "learning_rate": 1.55437216083968e-05, "loss": 0.1311, "step": 521 }, { "epoch": 0.7645551080190407, "grad_norm": 0.25757582269462975, "learning_rate": 1.552239417095052e-05, "loss": 0.1361, "step": 522 }, { "epoch": 0.7660197729769316, "grad_norm": 0.2694585038356422, "learning_rate": 1.5501030531069066e-05, "loss": 0.1405, "step": 523 }, { "epoch": 0.7674844379348225, "grad_norm": 0.27818748796170395, "learning_rate": 1.5479630828803235e-05, "loss": 0.1414, "step": 524 }, { "epoch": 0.7689491028927133, "grad_norm": 0.2892622203178171, "learning_rate": 1.5458195204440255e-05, "loss": 0.1387, "step": 525 }, { "epoch": 0.7704137678506042, "grad_norm": 0.2765799688074329, "learning_rate": 1.5436723798502842e-05, "loss": 0.1453, "step": 526 }, { "epoch": 0.7718784328084951, "grad_norm": 0.27743642550569964, "learning_rate": 1.5415216751748264e-05, "loss": 0.1326, "step": 527 }, { "epoch": 0.773343097766386, "grad_norm": 0.2602739542105575, "learning_rate": 1.5393674205167453e-05, "loss": 0.1329, "step": 528 }, { "epoch": 0.7748077627242769, "grad_norm": 0.2673531017304055, "learning_rate": 1.5372096299984064e-05, "loss": 0.1343, "step": 529 }, { "epoch": 0.7762724276821678, "grad_norm": 0.29845215607643555, "learning_rate": 1.5350483177653528e-05, "loss": 0.1522, "step": 530 }, { "epoch": 0.7777370926400586, "grad_norm": 0.2725973305117268, "learning_rate": 1.5328834979862158e-05, "loss": 0.13, "step": 531 }, { "epoch": 0.7792017575979495, "grad_norm": 0.263213840818661, "learning_rate": 1.5307151848526213e-05, "loss": 0.1246, "step": 532 }, { "epoch": 0.7806664225558404, "grad_norm": 0.2551375025943304, "learning_rate": 1.5285433925790946e-05, "loss": 0.1111, "step": 533 }, { "epoch": 0.7821310875137313, "grad_norm": 0.2622331657629095, "learning_rate": 1.5263681354029694e-05, "loss": 0.1351, "step": 534 }, { "epoch": 0.7835957524716222, "grad_norm": 0.3124931282937266, "learning_rate": 1.5241894275842946e-05, "loss": 0.1417, "step": 535 }, { "epoch": 0.785060417429513, "grad_norm": 0.30036829759264483, "learning_rate": 1.5220072834057387e-05, "loss": 0.1363, "step": 536 }, { "epoch": 0.7865250823874039, "grad_norm": 0.2596328178928325, "learning_rate": 1.5198217171724982e-05, "loss": 0.1214, "step": 537 }, { "epoch": 0.7879897473452948, "grad_norm": 0.2805919354886981, "learning_rate": 1.5176327432122028e-05, "loss": 0.1268, "step": 538 }, { "epoch": 0.7894544123031857, "grad_norm": 0.2771437475611675, "learning_rate": 1.5154403758748228e-05, "loss": 0.1308, "step": 539 }, { "epoch": 0.7909190772610766, "grad_norm": 0.2731861706982954, "learning_rate": 1.5132446295325722e-05, "loss": 0.128, "step": 540 }, { "epoch": 0.7923837422189675, "grad_norm": 0.3430389886450956, "learning_rate": 1.511045518579818e-05, "loss": 0.1374, "step": 541 }, { "epoch": 0.7938484071768583, "grad_norm": 0.2905683813596674, "learning_rate": 1.5088430574329836e-05, "loss": 0.1332, "step": 542 }, { "epoch": 0.7953130721347492, "grad_norm": 0.28971874507270196, "learning_rate": 1.5066372605304537e-05, "loss": 0.1358, "step": 543 }, { "epoch": 0.7967777370926401, "grad_norm": 0.2629369085736157, "learning_rate": 1.5044281423324826e-05, "loss": 0.1305, "step": 544 }, { "epoch": 0.798242402050531, "grad_norm": 0.2624966808667796, "learning_rate": 1.5022157173210969e-05, "loss": 0.1386, "step": 545 }, { "epoch": 0.7997070670084219, "grad_norm": 0.27495723120059107, "learning_rate": 1.5000000000000002e-05, "loss": 0.1261, "step": 546 }, { "epoch": 0.8011717319663128, "grad_norm": 0.2968841863976358, "learning_rate": 1.4977810048944806e-05, "loss": 0.1438, "step": 547 }, { "epoch": 0.8026363969242036, "grad_norm": 0.26619846720879786, "learning_rate": 1.495558746551313e-05, "loss": 0.139, "step": 548 }, { "epoch": 0.8041010618820945, "grad_norm": 0.2733899438656217, "learning_rate": 1.4933332395386652e-05, "loss": 0.1394, "step": 549 }, { "epoch": 0.8055657268399854, "grad_norm": 0.27528673995767655, "learning_rate": 1.4911044984460015e-05, "loss": 0.1235, "step": 550 }, { "epoch": 0.8070303917978763, "grad_norm": 0.27120553519443596, "learning_rate": 1.4888725378839877e-05, "loss": 0.1389, "step": 551 }, { "epoch": 0.8084950567557672, "grad_norm": 0.26970265074149985, "learning_rate": 1.4866373724843945e-05, "loss": 0.1381, "step": 552 }, { "epoch": 0.809959721713658, "grad_norm": 0.29150434298475913, "learning_rate": 1.484399016900003e-05, "loss": 0.1345, "step": 553 }, { "epoch": 0.8114243866715489, "grad_norm": 0.29587722131690225, "learning_rate": 1.4821574858045073e-05, "loss": 0.1356, "step": 554 }, { "epoch": 0.8128890516294398, "grad_norm": 0.26978671672533217, "learning_rate": 1.479912793892419e-05, "loss": 0.1363, "step": 555 }, { "epoch": 0.8143537165873307, "grad_norm": 0.26428779798336755, "learning_rate": 1.4776649558789698e-05, "loss": 0.1545, "step": 556 }, { "epoch": 0.8158183815452216, "grad_norm": 0.2715765644749667, "learning_rate": 1.475413986500017e-05, "loss": 0.1437, "step": 557 }, { "epoch": 0.8172830465031125, "grad_norm": 0.2626982755104111, "learning_rate": 1.4731599005119454e-05, "loss": 0.1277, "step": 558 }, { "epoch": 0.8187477114610033, "grad_norm": 0.27082844640160836, "learning_rate": 1.470902712691571e-05, "loss": 0.1111, "step": 559 }, { "epoch": 0.8202123764188942, "grad_norm": 0.2668734287792547, "learning_rate": 1.4686424378360434e-05, "loss": 0.1312, "step": 560 }, { "epoch": 0.8216770413767851, "grad_norm": 0.2869401318531047, "learning_rate": 1.4663790907627502e-05, "loss": 0.1509, "step": 561 }, { "epoch": 0.823141706334676, "grad_norm": 0.2773427427864892, "learning_rate": 1.4641126863092194e-05, "loss": 0.1577, "step": 562 }, { "epoch": 0.8246063712925669, "grad_norm": 0.2717284080506168, "learning_rate": 1.4618432393330211e-05, "loss": 0.1254, "step": 563 }, { "epoch": 0.8260710362504577, "grad_norm": 0.2919980547507266, "learning_rate": 1.4595707647116713e-05, "loss": 0.158, "step": 564 }, { "epoch": 0.8275357012083486, "grad_norm": 0.27833252973367867, "learning_rate": 1.4572952773425335e-05, "loss": 0.1252, "step": 565 }, { "epoch": 0.8290003661662395, "grad_norm": 0.2850694697710808, "learning_rate": 1.455016792142722e-05, "loss": 0.1333, "step": 566 }, { "epoch": 0.8304650311241304, "grad_norm": 0.26558195129076256, "learning_rate": 1.4527353240490039e-05, "loss": 0.1349, "step": 567 }, { "epoch": 0.8319296960820213, "grad_norm": 0.2944649491485909, "learning_rate": 1.4504508880176996e-05, "loss": 0.1528, "step": 568 }, { "epoch": 0.8333943610399122, "grad_norm": 0.2578991107569846, "learning_rate": 1.4481634990245871e-05, "loss": 0.1225, "step": 569 }, { "epoch": 0.834859025997803, "grad_norm": 0.33398387092007636, "learning_rate": 1.4458731720648024e-05, "loss": 0.1459, "step": 570 }, { "epoch": 0.8363236909556939, "grad_norm": 0.29274576860822454, "learning_rate": 1.4435799221527417e-05, "loss": 0.1377, "step": 571 }, { "epoch": 0.8377883559135848, "grad_norm": 0.2865214436536306, "learning_rate": 1.4412837643219625e-05, "loss": 0.1416, "step": 572 }, { "epoch": 0.8392530208714757, "grad_norm": 0.2803403830745175, "learning_rate": 1.4389847136250858e-05, "loss": 0.1291, "step": 573 }, { "epoch": 0.8407176858293666, "grad_norm": 0.2692461377236704, "learning_rate": 1.4366827851336964e-05, "loss": 0.1251, "step": 574 }, { "epoch": 0.8421823507872574, "grad_norm": 0.2626470317555678, "learning_rate": 1.4343779939382451e-05, "loss": 0.1111, "step": 575 }, { "epoch": 0.8436470157451483, "grad_norm": 0.28937945369153595, "learning_rate": 1.4320703551479494e-05, "loss": 0.1305, "step": 576 }, { "epoch": 0.8451116807030392, "grad_norm": 0.30541326375044037, "learning_rate": 1.4297598838906938e-05, "loss": 0.1502, "step": 577 }, { "epoch": 0.8465763456609301, "grad_norm": 0.2796697681217199, "learning_rate": 1.4274465953129326e-05, "loss": 0.1236, "step": 578 }, { "epoch": 0.848041010618821, "grad_norm": 0.2614920930808128, "learning_rate": 1.4251305045795874e-05, "loss": 0.1126, "step": 579 }, { "epoch": 0.8495056755767119, "grad_norm": 0.28511419468497884, "learning_rate": 1.422811626873951e-05, "loss": 0.1285, "step": 580 }, { "epoch": 0.8509703405346027, "grad_norm": 0.2707984473087975, "learning_rate": 1.4204899773975855e-05, "loss": 0.1436, "step": 581 }, { "epoch": 0.8524350054924936, "grad_norm": 0.281582631209825, "learning_rate": 1.4181655713702242e-05, "loss": 0.1297, "step": 582 }, { "epoch": 0.8538996704503845, "grad_norm": 0.27592043627026036, "learning_rate": 1.4158384240296707e-05, "loss": 0.1255, "step": 583 }, { "epoch": 0.8553643354082754, "grad_norm": 0.28772169132389885, "learning_rate": 1.4135085506316997e-05, "loss": 0.1504, "step": 584 }, { "epoch": 0.8568290003661663, "grad_norm": 0.26245407957464945, "learning_rate": 1.4111759664499562e-05, "loss": 0.1131, "step": 585 }, { "epoch": 0.8582936653240572, "grad_norm": 0.25257520518661797, "learning_rate": 1.4088406867758573e-05, "loss": 0.1235, "step": 586 }, { "epoch": 0.859758330281948, "grad_norm": 0.26981349513085906, "learning_rate": 1.4065027269184888e-05, "loss": 0.1429, "step": 587 }, { "epoch": 0.8612229952398389, "grad_norm": 0.2734637683112845, "learning_rate": 1.404162102204508e-05, "loss": 0.129, "step": 588 }, { "epoch": 0.8626876601977298, "grad_norm": 0.30246081365181193, "learning_rate": 1.4018188279780412e-05, "loss": 0.1314, "step": 589 }, { "epoch": 0.8641523251556207, "grad_norm": 0.287487018814624, "learning_rate": 1.3994729196005839e-05, "loss": 0.1259, "step": 590 }, { "epoch": 0.8656169901135116, "grad_norm": 0.27210880904989876, "learning_rate": 1.3971243924508996e-05, "loss": 0.1178, "step": 591 }, { "epoch": 0.8670816550714024, "grad_norm": 0.29342064785517297, "learning_rate": 1.3947732619249206e-05, "loss": 0.1341, "step": 592 }, { "epoch": 0.8685463200292933, "grad_norm": 0.27972993450917555, "learning_rate": 1.3924195434356443e-05, "loss": 0.1313, "step": 593 }, { "epoch": 0.8700109849871842, "grad_norm": 0.24692549028529914, "learning_rate": 1.3900632524130343e-05, "loss": 0.1233, "step": 594 }, { "epoch": 0.8714756499450751, "grad_norm": 0.32938798959658994, "learning_rate": 1.3877044043039189e-05, "loss": 0.1561, "step": 595 }, { "epoch": 0.872940314902966, "grad_norm": 0.27405616789978654, "learning_rate": 1.3853430145718892e-05, "loss": 0.1361, "step": 596 }, { "epoch": 0.8744049798608569, "grad_norm": 0.2846698580131875, "learning_rate": 1.382979098697198e-05, "loss": 0.1505, "step": 597 }, { "epoch": 0.8758696448187477, "grad_norm": 0.2886024048200631, "learning_rate": 1.3806126721766586e-05, "loss": 0.1421, "step": 598 }, { "epoch": 0.8773343097766386, "grad_norm": 0.24843406819575536, "learning_rate": 1.378243750523543e-05, "loss": 0.1166, "step": 599 }, { "epoch": 0.8787989747345295, "grad_norm": 0.29136988530850255, "learning_rate": 1.3758723492674803e-05, "loss": 0.163, "step": 600 }, { "epoch": 0.8802636396924204, "grad_norm": 0.27987479624274275, "learning_rate": 1.3734984839543547e-05, "loss": 0.1302, "step": 601 }, { "epoch": 0.8817283046503113, "grad_norm": 0.2766713098966634, "learning_rate": 1.3711221701462037e-05, "loss": 0.1264, "step": 602 }, { "epoch": 0.8831929696082021, "grad_norm": 0.281327006141709, "learning_rate": 1.368743423421116e-05, "loss": 0.1433, "step": 603 }, { "epoch": 0.884657634566093, "grad_norm": 0.2819498194158381, "learning_rate": 1.3663622593731294e-05, "loss": 0.1315, "step": 604 }, { "epoch": 0.8861222995239839, "grad_norm": 0.2869700943965007, "learning_rate": 1.3639786936121287e-05, "loss": 0.1313, "step": 605 }, { "epoch": 0.8875869644818748, "grad_norm": 0.28257828430613197, "learning_rate": 1.3615927417637435e-05, "loss": 0.1327, "step": 606 }, { "epoch": 0.8890516294397657, "grad_norm": 0.2983987956601424, "learning_rate": 1.3592044194692456e-05, "loss": 0.1435, "step": 607 }, { "epoch": 0.8905162943976566, "grad_norm": 0.2900543426098001, "learning_rate": 1.356813742385446e-05, "loss": 0.1366, "step": 608 }, { "epoch": 0.8919809593555474, "grad_norm": 0.27572519984679683, "learning_rate": 1.3544207261845928e-05, "loss": 0.1373, "step": 609 }, { "epoch": 0.8934456243134383, "grad_norm": 0.2721425773906768, "learning_rate": 1.3520253865542687e-05, "loss": 0.1223, "step": 610 }, { "epoch": 0.8949102892713292, "grad_norm": 0.282049356782574, "learning_rate": 1.3496277391972874e-05, "loss": 0.1192, "step": 611 }, { "epoch": 0.8963749542292201, "grad_norm": 0.2681081671045421, "learning_rate": 1.3472277998315915e-05, "loss": 0.1279, "step": 612 }, { "epoch": 0.897839619187111, "grad_norm": 0.276619227889388, "learning_rate": 1.3448255841901481e-05, "loss": 0.1327, "step": 613 }, { "epoch": 0.8993042841450019, "grad_norm": 0.28855198625509537, "learning_rate": 1.3424211080208478e-05, "loss": 0.1249, "step": 614 }, { "epoch": 0.9007689491028927, "grad_norm": 0.25545241022812626, "learning_rate": 1.3400143870863997e-05, "loss": 0.1053, "step": 615 }, { "epoch": 0.9022336140607836, "grad_norm": 0.24939455623227066, "learning_rate": 1.3376054371642282e-05, "loss": 0.1172, "step": 616 }, { "epoch": 0.9036982790186745, "grad_norm": 0.2890615502578014, "learning_rate": 1.3351942740463705e-05, "loss": 0.1368, "step": 617 }, { "epoch": 0.9051629439765654, "grad_norm": 0.2566294230704182, "learning_rate": 1.3327809135393728e-05, "loss": 0.1337, "step": 618 }, { "epoch": 0.9066276089344563, "grad_norm": 0.2902103194088055, "learning_rate": 1.3303653714641853e-05, "loss": 0.1251, "step": 619 }, { "epoch": 0.9080922738923471, "grad_norm": 0.2666865578103702, "learning_rate": 1.3279476636560608e-05, "loss": 0.1201, "step": 620 }, { "epoch": 0.909556938850238, "grad_norm": 0.2618945176801082, "learning_rate": 1.3255278059644496e-05, "loss": 0.1145, "step": 621 }, { "epoch": 0.9110216038081289, "grad_norm": 0.31710586966811777, "learning_rate": 1.323105814252895e-05, "loss": 0.1497, "step": 622 }, { "epoch": 0.9124862687660198, "grad_norm": 0.29714914241544244, "learning_rate": 1.3206817043989301e-05, "loss": 0.1297, "step": 623 }, { "epoch": 0.9139509337239107, "grad_norm": 0.26025416086413616, "learning_rate": 1.3182554922939748e-05, "loss": 0.1069, "step": 624 }, { "epoch": 0.9154155986818016, "grad_norm": 0.2983087250370599, "learning_rate": 1.3158271938432288e-05, "loss": 0.1285, "step": 625 }, { "epoch": 0.9168802636396924, "grad_norm": 0.27517063473066206, "learning_rate": 1.3133968249655701e-05, "loss": 0.1375, "step": 626 }, { "epoch": 0.9183449285975833, "grad_norm": 0.2659448809334693, "learning_rate": 1.3109644015934493e-05, "loss": 0.1144, "step": 627 }, { "epoch": 0.9198095935554742, "grad_norm": 0.31062215771722046, "learning_rate": 1.3085299396727851e-05, "loss": 0.1285, "step": 628 }, { "epoch": 0.9212742585133651, "grad_norm": 0.2555958267399944, "learning_rate": 1.3060934551628603e-05, "loss": 0.1111, "step": 629 }, { "epoch": 0.922738923471256, "grad_norm": 0.25825461588582355, "learning_rate": 1.3036549640362169e-05, "loss": 0.1153, "step": 630 }, { "epoch": 0.9242035884291468, "grad_norm": 0.270581810873964, "learning_rate": 1.301214482278551e-05, "loss": 0.1243, "step": 631 }, { "epoch": 0.9256682533870377, "grad_norm": 0.2632208259144816, "learning_rate": 1.2987720258886094e-05, "loss": 0.1098, "step": 632 }, { "epoch": 0.9271329183449286, "grad_norm": 0.2680331255251313, "learning_rate": 1.2963276108780829e-05, "loss": 0.1286, "step": 633 }, { "epoch": 0.9285975833028195, "grad_norm": 0.293030113117047, "learning_rate": 1.293881253271502e-05, "loss": 0.1559, "step": 634 }, { "epoch": 0.9300622482607104, "grad_norm": 0.27474296914093777, "learning_rate": 1.2914329691061327e-05, "loss": 0.1144, "step": 635 }, { "epoch": 0.9315269132186013, "grad_norm": 0.2738691898038605, "learning_rate": 1.2889827744318705e-05, "loss": 0.1159, "step": 636 }, { "epoch": 0.9329915781764921, "grad_norm": 0.2926627981538906, "learning_rate": 1.286530685311135e-05, "loss": 0.1476, "step": 637 }, { "epoch": 0.934456243134383, "grad_norm": 0.2542806182987207, "learning_rate": 1.2840767178187657e-05, "loss": 0.1076, "step": 638 }, { "epoch": 0.9359209080922739, "grad_norm": 0.2752287662506346, "learning_rate": 1.281620888041915e-05, "loss": 0.1369, "step": 639 }, { "epoch": 0.9373855730501648, "grad_norm": 0.251869065832337, "learning_rate": 1.279163212079944e-05, "loss": 0.116, "step": 640 }, { "epoch": 0.9388502380080557, "grad_norm": 0.2609971662177529, "learning_rate": 1.2767037060443173e-05, "loss": 0.1058, "step": 641 }, { "epoch": 0.9403149029659466, "grad_norm": 0.26794778938799796, "learning_rate": 1.2742423860584954e-05, "loss": 0.1212, "step": 642 }, { "epoch": 0.9417795679238374, "grad_norm": 0.3047657017773294, "learning_rate": 1.271779268257831e-05, "loss": 0.1339, "step": 643 }, { "epoch": 0.9432442328817283, "grad_norm": 0.2897849310784945, "learning_rate": 1.269314368789463e-05, "loss": 0.1084, "step": 644 }, { "epoch": 0.9447088978396192, "grad_norm": 0.29786222980952776, "learning_rate": 1.266847703812209e-05, "loss": 0.1106, "step": 645 }, { "epoch": 0.9461735627975101, "grad_norm": 0.2642203247056678, "learning_rate": 1.2643792894964611e-05, "loss": 0.1053, "step": 646 }, { "epoch": 0.947638227755401, "grad_norm": 0.28156653824521066, "learning_rate": 1.2619091420240795e-05, "loss": 0.1458, "step": 647 }, { "epoch": 0.9491028927132918, "grad_norm": 0.2557860588752611, "learning_rate": 1.2594372775882862e-05, "loss": 0.1086, "step": 648 }, { "epoch": 0.9505675576711827, "grad_norm": 0.26613179341981313, "learning_rate": 1.2569637123935581e-05, "loss": 0.1081, "step": 649 }, { "epoch": 0.9520322226290736, "grad_norm": 0.27208135899789015, "learning_rate": 1.2544884626555225e-05, "loss": 0.1309, "step": 650 }, { "epoch": 0.9534968875869645, "grad_norm": 0.2800087048695156, "learning_rate": 1.2520115446008493e-05, "loss": 0.122, "step": 651 }, { "epoch": 0.9549615525448554, "grad_norm": 0.265808555990337, "learning_rate": 1.2495329744671457e-05, "loss": 0.1143, "step": 652 }, { "epoch": 0.9564262175027463, "grad_norm": 0.2832651191459517, "learning_rate": 1.2470527685028485e-05, "loss": 0.1119, "step": 653 }, { "epoch": 0.9578908824606371, "grad_norm": 0.27691932316711165, "learning_rate": 1.2445709429671184e-05, "loss": 0.1196, "step": 654 }, { "epoch": 0.959355547418528, "grad_norm": 0.24468827419730876, "learning_rate": 1.2420875141297344e-05, "loss": 0.1004, "step": 655 }, { "epoch": 0.9608202123764189, "grad_norm": 0.28166326250407386, "learning_rate": 1.2396024982709845e-05, "loss": 0.1384, "step": 656 }, { "epoch": 0.9622848773343098, "grad_norm": 0.2864348243903678, "learning_rate": 1.2371159116815614e-05, "loss": 0.1314, "step": 657 }, { "epoch": 0.9637495422922007, "grad_norm": 0.29171091081890826, "learning_rate": 1.234627770662455e-05, "loss": 0.1348, "step": 658 }, { "epoch": 0.9652142072500915, "grad_norm": 0.26640633088569415, "learning_rate": 1.2321380915248446e-05, "loss": 0.1215, "step": 659 }, { "epoch": 0.9666788722079824, "grad_norm": 0.2719229168980855, "learning_rate": 1.2296468905899937e-05, "loss": 0.1163, "step": 660 }, { "epoch": 0.9681435371658733, "grad_norm": 0.28951810957646756, "learning_rate": 1.227154184189141e-05, "loss": 0.1321, "step": 661 }, { "epoch": 0.9696082021237642, "grad_norm": 0.30219688645041665, "learning_rate": 1.2246599886633951e-05, "loss": 0.1295, "step": 662 }, { "epoch": 0.9710728670816551, "grad_norm": 0.29588778808501925, "learning_rate": 1.222164320363627e-05, "loss": 0.1373, "step": 663 }, { "epoch": 0.972537532039546, "grad_norm": 0.26451637010025986, "learning_rate": 1.2196671956503611e-05, "loss": 0.1187, "step": 664 }, { "epoch": 0.9740021969974368, "grad_norm": 0.2670469003783616, "learning_rate": 1.217168630893671e-05, "loss": 0.1223, "step": 665 }, { "epoch": 0.9754668619553277, "grad_norm": 0.3201350019334305, "learning_rate": 1.2146686424730699e-05, "loss": 0.1238, "step": 666 }, { "epoch": 0.9769315269132186, "grad_norm": 0.25030723292519363, "learning_rate": 1.212167246777404e-05, "loss": 0.1087, "step": 667 }, { "epoch": 0.9783961918711095, "grad_norm": 0.30670242040276763, "learning_rate": 1.2096644602047447e-05, "loss": 0.1417, "step": 668 }, { "epoch": 0.9798608568290004, "grad_norm": 0.3387650718944703, "learning_rate": 1.2071602991622822e-05, "loss": 0.1244, "step": 669 }, { "epoch": 0.9813255217868913, "grad_norm": 0.31738790684917234, "learning_rate": 1.2046547800662163e-05, "loss": 0.1379, "step": 670 }, { "epoch": 0.9827901867447821, "grad_norm": 0.253861086544748, "learning_rate": 1.2021479193416502e-05, "loss": 0.0972, "step": 671 }, { "epoch": 0.984254851702673, "grad_norm": 0.27857395289374787, "learning_rate": 1.1996397334224814e-05, "loss": 0.1453, "step": 672 }, { "epoch": 0.9857195166605639, "grad_norm": 0.2649436923127358, "learning_rate": 1.1971302387512958e-05, "loss": 0.1338, "step": 673 }, { "epoch": 0.9871841816184548, "grad_norm": 0.27979461023670804, "learning_rate": 1.1946194517792584e-05, "loss": 0.1245, "step": 674 }, { "epoch": 0.9886488465763457, "grad_norm": 0.28275198179954225, "learning_rate": 1.1921073889660061e-05, "loss": 0.1316, "step": 675 }, { "epoch": 0.9901135115342365, "grad_norm": 0.26979156442079305, "learning_rate": 1.1895940667795395e-05, "loss": 0.1219, "step": 676 }, { "epoch": 0.9915781764921274, "grad_norm": 0.25593923579587835, "learning_rate": 1.1870795016961157e-05, "loss": 0.1048, "step": 677 }, { "epoch": 0.9930428414500183, "grad_norm": 0.2542007022812753, "learning_rate": 1.1845637102001383e-05, "loss": 0.1147, "step": 678 }, { "epoch": 0.9945075064079092, "grad_norm": 0.2763537672373692, "learning_rate": 1.1820467087840526e-05, "loss": 0.1228, "step": 679 }, { "epoch": 0.9959721713658001, "grad_norm": 0.2750343262145036, "learning_rate": 1.1795285139482341e-05, "loss": 0.1294, "step": 680 }, { "epoch": 0.997436836323691, "grad_norm": 0.2774530874594845, "learning_rate": 1.1770091422008824e-05, "loss": 0.1252, "step": 681 }, { "epoch": 0.9989015012815818, "grad_norm": 0.2607903166254994, "learning_rate": 1.174488610057913e-05, "loss": 0.119, "step": 682 }, { "epoch": 1.0003661662394727, "grad_norm": 0.2645760004291393, "learning_rate": 1.1719669340428472e-05, "loss": 0.1128, "step": 683 }, { "epoch": 1.0018308311973636, "grad_norm": 0.2552221963588916, "learning_rate": 1.1694441306867062e-05, "loss": 0.0897, "step": 684 }, { "epoch": 1.0032954961552545, "grad_norm": 0.24512146436759236, "learning_rate": 1.1669202165279009e-05, "loss": 0.0966, "step": 685 }, { "epoch": 1.0047601611131454, "grad_norm": 0.2926151907338783, "learning_rate": 1.164395208112124e-05, "loss": 0.0988, "step": 686 }, { "epoch": 1.0062248260710362, "grad_norm": 0.30856359947434714, "learning_rate": 1.1618691219922426e-05, "loss": 0.1138, "step": 687 }, { "epoch": 1.0076894910289271, "grad_norm": 0.26943247650215035, "learning_rate": 1.159341974728188e-05, "loss": 0.1157, "step": 688 }, { "epoch": 1.009154155986818, "grad_norm": 0.28468116825059964, "learning_rate": 1.1568137828868478e-05, "loss": 0.0881, "step": 689 }, { "epoch": 1.010618820944709, "grad_norm": 0.2399332416399268, "learning_rate": 1.1542845630419579e-05, "loss": 0.0893, "step": 690 }, { "epoch": 1.0120834859025998, "grad_norm": 0.3288588984625157, "learning_rate": 1.1517543317739931e-05, "loss": 0.1076, "step": 691 }, { "epoch": 1.0135481508604907, "grad_norm": 0.26604448567697603, "learning_rate": 1.1492231056700592e-05, "loss": 0.0932, "step": 692 }, { "epoch": 1.0150128158183815, "grad_norm": 0.24018805379412664, "learning_rate": 1.1466909013237819e-05, "loss": 0.1001, "step": 693 }, { "epoch": 1.0164774807762724, "grad_norm": 0.2651475188976319, "learning_rate": 1.1441577353352023e-05, "loss": 0.1035, "step": 694 }, { "epoch": 1.0179421457341633, "grad_norm": 0.31536104086551264, "learning_rate": 1.1416236243106638e-05, "loss": 0.1081, "step": 695 }, { "epoch": 1.0194068106920542, "grad_norm": 0.2613280520512752, "learning_rate": 1.1390885848627058e-05, "loss": 0.0845, "step": 696 }, { "epoch": 1.020871475649945, "grad_norm": 0.2837125123903259, "learning_rate": 1.1365526336099542e-05, "loss": 0.1328, "step": 697 }, { "epoch": 1.022336140607836, "grad_norm": 0.259222134905861, "learning_rate": 1.1340157871770117e-05, "loss": 0.0886, "step": 698 }, { "epoch": 1.0238008055657268, "grad_norm": 0.2865990464407519, "learning_rate": 1.13147806219435e-05, "loss": 0.1079, "step": 699 }, { "epoch": 1.0252654705236177, "grad_norm": 0.2794232474113488, "learning_rate": 1.1289394752982e-05, "loss": 0.106, "step": 700 }, { "epoch": 1.0267301354815086, "grad_norm": 0.2606486568133289, "learning_rate": 1.1264000431304422e-05, "loss": 0.0902, "step": 701 }, { "epoch": 1.0281948004393995, "grad_norm": 0.27767121394939476, "learning_rate": 1.1238597823385e-05, "loss": 0.0915, "step": 702 }, { "epoch": 1.0296594653972904, "grad_norm": 0.2865112658979935, "learning_rate": 1.1213187095752271e-05, "loss": 0.0994, "step": 703 }, { "epoch": 1.0311241303551812, "grad_norm": 0.28382542458351684, "learning_rate": 1.1187768414988015e-05, "loss": 0.0922, "step": 704 }, { "epoch": 1.0325887953130721, "grad_norm": 0.27487686960098845, "learning_rate": 1.1162341947726139e-05, "loss": 0.1187, "step": 705 }, { "epoch": 1.034053460270963, "grad_norm": 0.2629377900389775, "learning_rate": 1.1136907860651603e-05, "loss": 0.0894, "step": 706 }, { "epoch": 1.035518125228854, "grad_norm": 0.2613378798506544, "learning_rate": 1.1111466320499318e-05, "loss": 0.0921, "step": 707 }, { "epoch": 1.0369827901867448, "grad_norm": 0.255691486058076, "learning_rate": 1.1086017494053046e-05, "loss": 0.0937, "step": 708 }, { "epoch": 1.0384474551446357, "grad_norm": 0.2595524157003849, "learning_rate": 1.1060561548144321e-05, "loss": 0.0898, "step": 709 }, { "epoch": 1.0399121201025265, "grad_norm": 0.27529763982095773, "learning_rate": 1.1035098649651355e-05, "loss": 0.0966, "step": 710 }, { "epoch": 1.0413767850604174, "grad_norm": 0.2518906472665152, "learning_rate": 1.1009628965497927e-05, "loss": 0.0842, "step": 711 }, { "epoch": 1.0428414500183083, "grad_norm": 0.27904396918210833, "learning_rate": 1.0984152662652307e-05, "loss": 0.1148, "step": 712 }, { "epoch": 1.0443061149761992, "grad_norm": 0.2651133079255485, "learning_rate": 1.0958669908126151e-05, "loss": 0.087, "step": 713 }, { "epoch": 1.04577077993409, "grad_norm": 0.2767992985128494, "learning_rate": 1.0933180868973414e-05, "loss": 0.1033, "step": 714 }, { "epoch": 1.047235444891981, "grad_norm": 0.2527734956067706, "learning_rate": 1.0907685712289244e-05, "loss": 0.0912, "step": 715 }, { "epoch": 1.0487001098498718, "grad_norm": 0.26205384399328313, "learning_rate": 1.0882184605208895e-05, "loss": 0.089, "step": 716 }, { "epoch": 1.0501647748077627, "grad_norm": 0.2680315695418657, "learning_rate": 1.0856677714906632e-05, "loss": 0.1001, "step": 717 }, { "epoch": 1.0516294397656536, "grad_norm": 0.2265048738569077, "learning_rate": 1.083116520859463e-05, "loss": 0.0897, "step": 718 }, { "epoch": 1.0530941047235445, "grad_norm": 0.2507364042609683, "learning_rate": 1.080564725352188e-05, "loss": 0.0867, "step": 719 }, { "epoch": 1.0545587696814354, "grad_norm": 0.27387442999591954, "learning_rate": 1.0780124016973095e-05, "loss": 0.0998, "step": 720 }, { "epoch": 1.0560234346393262, "grad_norm": 0.24270388287807182, "learning_rate": 1.0754595666267609e-05, "loss": 0.0817, "step": 721 }, { "epoch": 1.0574880995972171, "grad_norm": 0.26311541978879793, "learning_rate": 1.0729062368758278e-05, "loss": 0.0988, "step": 722 }, { "epoch": 1.058952764555108, "grad_norm": 0.2704618854801208, "learning_rate": 1.0703524291830398e-05, "loss": 0.1082, "step": 723 }, { "epoch": 1.0604174295129989, "grad_norm": 0.28368113431359887, "learning_rate": 1.067798160290059e-05, "loss": 0.1009, "step": 724 }, { "epoch": 1.0618820944708898, "grad_norm": 0.26885585300497, "learning_rate": 1.0652434469415705e-05, "loss": 0.0986, "step": 725 }, { "epoch": 1.0633467594287807, "grad_norm": 0.24916401223915322, "learning_rate": 1.0626883058851737e-05, "loss": 0.0859, "step": 726 }, { "epoch": 1.0648114243866715, "grad_norm": 0.25631003685203035, "learning_rate": 1.0601327538712723e-05, "loss": 0.0923, "step": 727 }, { "epoch": 1.0662760893445624, "grad_norm": 0.2552784210398989, "learning_rate": 1.0575768076529627e-05, "loss": 0.0761, "step": 728 }, { "epoch": 1.0677407543024533, "grad_norm": 0.24165749739319964, "learning_rate": 1.0550204839859265e-05, "loss": 0.0752, "step": 729 }, { "epoch": 1.0692054192603442, "grad_norm": 0.2675621328921695, "learning_rate": 1.0524637996283195e-05, "loss": 0.1019, "step": 730 }, { "epoch": 1.070670084218235, "grad_norm": 0.2569008765612613, "learning_rate": 1.0499067713406622e-05, "loss": 0.0964, "step": 731 }, { "epoch": 1.072134749176126, "grad_norm": 0.24678690292067554, "learning_rate": 1.0473494158857298e-05, "loss": 0.0891, "step": 732 }, { "epoch": 1.0735994141340168, "grad_norm": 0.2549990776594623, "learning_rate": 1.0447917500284415e-05, "loss": 0.0925, "step": 733 }, { "epoch": 1.0750640790919077, "grad_norm": 0.28063287681936966, "learning_rate": 1.0422337905357523e-05, "loss": 0.1053, "step": 734 }, { "epoch": 1.0765287440497986, "grad_norm": 0.27072473670857133, "learning_rate": 1.0396755541765413e-05, "loss": 0.0951, "step": 735 }, { "epoch": 1.0779934090076895, "grad_norm": 0.2804354889898814, "learning_rate": 1.0371170577215036e-05, "loss": 0.1044, "step": 736 }, { "epoch": 1.0794580739655804, "grad_norm": 0.2707408971145425, "learning_rate": 1.0345583179430387e-05, "loss": 0.087, "step": 737 }, { "epoch": 1.0809227389234712, "grad_norm": 0.27265959106298226, "learning_rate": 1.0319993516151412e-05, "loss": 0.0957, "step": 738 }, { "epoch": 1.0823874038813621, "grad_norm": 0.2664867481183081, "learning_rate": 1.0294401755132912e-05, "loss": 0.0838, "step": 739 }, { "epoch": 1.083852068839253, "grad_norm": 0.2827928379341626, "learning_rate": 1.0268808064143438e-05, "loss": 0.0981, "step": 740 }, { "epoch": 1.0853167337971439, "grad_norm": 0.2516210381548982, "learning_rate": 1.0243212610964192e-05, "loss": 0.085, "step": 741 }, { "epoch": 1.0867813987550348, "grad_norm": 0.2609165490224745, "learning_rate": 1.0217615563387932e-05, "loss": 0.1047, "step": 742 }, { "epoch": 1.0882460637129256, "grad_norm": 0.2522780231083236, "learning_rate": 1.0192017089217863e-05, "loss": 0.1029, "step": 743 }, { "epoch": 1.0897107286708165, "grad_norm": 0.2640632238176228, "learning_rate": 1.0166417356266546e-05, "loss": 0.1053, "step": 744 }, { "epoch": 1.0911753936287074, "grad_norm": 0.266832288851606, "learning_rate": 1.0140816532354793e-05, "loss": 0.1068, "step": 745 }, { "epoch": 1.0926400585865983, "grad_norm": 0.24267415544535567, "learning_rate": 1.0115214785310567e-05, "loss": 0.0723, "step": 746 }, { "epoch": 1.0941047235444892, "grad_norm": 0.280429753176534, "learning_rate": 1.0089612282967884e-05, "loss": 0.1071, "step": 747 }, { "epoch": 1.09556938850238, "grad_norm": 0.26916094567137633, "learning_rate": 1.0064009193165713e-05, "loss": 0.0858, "step": 748 }, { "epoch": 1.097034053460271, "grad_norm": 0.274415937658407, "learning_rate": 1.0038405683746868e-05, "loss": 0.0869, "step": 749 }, { "epoch": 1.0984987184181618, "grad_norm": 0.2792545952852677, "learning_rate": 1.0012801922556918e-05, "loss": 0.0893, "step": 750 }, { "epoch": 1.0999633833760527, "grad_norm": 0.2875272278765476, "learning_rate": 9.987198077443085e-06, "loss": 0.1099, "step": 751 }, { "epoch": 1.1014280483339436, "grad_norm": 0.260518607411576, "learning_rate": 9.961594316253134e-06, "loss": 0.0828, "step": 752 }, { "epoch": 1.1028927132918345, "grad_norm": 0.2633760525239851, "learning_rate": 9.93599080683429e-06, "loss": 0.0969, "step": 753 }, { "epoch": 1.1043573782497254, "grad_norm": 0.25909573953326753, "learning_rate": 9.910387717032115e-06, "loss": 0.0756, "step": 754 }, { "epoch": 1.1058220432076162, "grad_norm": 0.27253806701721073, "learning_rate": 9.884785214689435e-06, "loss": 0.0856, "step": 755 }, { "epoch": 1.1072867081655071, "grad_norm": 0.2597508988107494, "learning_rate": 9.859183467645207e-06, "loss": 0.0855, "step": 756 }, { "epoch": 1.108751373123398, "grad_norm": 0.23868371653633474, "learning_rate": 9.833582643733457e-06, "loss": 0.0826, "step": 757 }, { "epoch": 1.1102160380812889, "grad_norm": 0.29179266600473996, "learning_rate": 9.807982910782142e-06, "loss": 0.1156, "step": 758 }, { "epoch": 1.1116807030391798, "grad_norm": 0.2522539204555841, "learning_rate": 9.782384436612072e-06, "loss": 0.0908, "step": 759 }, { "epoch": 1.1131453679970706, "grad_norm": 0.23700004230402077, "learning_rate": 9.756787389035813e-06, "loss": 0.0832, "step": 760 }, { "epoch": 1.1146100329549615, "grad_norm": 0.2829087070060208, "learning_rate": 9.731191935856566e-06, "loss": 0.1116, "step": 761 }, { "epoch": 1.1160746979128524, "grad_norm": 0.26523704277336946, "learning_rate": 9.705598244867093e-06, "loss": 0.0939, "step": 762 }, { "epoch": 1.1175393628707433, "grad_norm": 0.24086964841423913, "learning_rate": 9.68000648384859e-06, "loss": 0.0873, "step": 763 }, { "epoch": 1.1190040278286342, "grad_norm": 0.2654475048077318, "learning_rate": 9.654416820569618e-06, "loss": 0.1006, "step": 764 }, { "epoch": 1.120468692786525, "grad_norm": 0.2601239282279833, "learning_rate": 9.628829422784965e-06, "loss": 0.0773, "step": 765 }, { "epoch": 1.121933357744416, "grad_norm": 0.2569521050103609, "learning_rate": 9.603244458234589e-06, "loss": 0.0793, "step": 766 }, { "epoch": 1.1233980227023068, "grad_norm": 0.26593809509671273, "learning_rate": 9.577662094642478e-06, "loss": 0.0801, "step": 767 }, { "epoch": 1.1248626876601977, "grad_norm": 0.29040791441938224, "learning_rate": 9.552082499715588e-06, "loss": 0.0933, "step": 768 }, { "epoch": 1.1263273526180886, "grad_norm": 0.2943106175306918, "learning_rate": 9.526505841142702e-06, "loss": 0.0861, "step": 769 }, { "epoch": 1.1277920175759795, "grad_norm": 0.2601745466903212, "learning_rate": 9.50093228659338e-06, "loss": 0.0956, "step": 770 }, { "epoch": 1.1292566825338703, "grad_norm": 0.26049784872083914, "learning_rate": 9.475362003716804e-06, "loss": 0.0826, "step": 771 }, { "epoch": 1.1307213474917612, "grad_norm": 0.2629400244245209, "learning_rate": 9.449795160140737e-06, "loss": 0.0867, "step": 772 }, { "epoch": 1.1321860124496521, "grad_norm": 0.2621080280544426, "learning_rate": 9.424231923470378e-06, "loss": 0.1065, "step": 773 }, { "epoch": 1.133650677407543, "grad_norm": 0.26858890551499126, "learning_rate": 9.39867246128728e-06, "loss": 0.0941, "step": 774 }, { "epoch": 1.1351153423654339, "grad_norm": 0.2576626727357875, "learning_rate": 9.373116941148264e-06, "loss": 0.0964, "step": 775 }, { "epoch": 1.1365800073233248, "grad_norm": 0.26215360922586484, "learning_rate": 9.347565530584299e-06, "loss": 0.093, "step": 776 }, { "epoch": 1.1380446722812156, "grad_norm": 0.25775670982713683, "learning_rate": 9.322018397099414e-06, "loss": 0.0921, "step": 777 }, { "epoch": 1.1395093372391065, "grad_norm": 0.26195192691999436, "learning_rate": 9.296475708169603e-06, "loss": 0.0944, "step": 778 }, { "epoch": 1.1409740021969974, "grad_norm": 0.24607830549799634, "learning_rate": 9.270937631241723e-06, "loss": 0.0789, "step": 779 }, { "epoch": 1.1424386671548883, "grad_norm": 0.2504436025347225, "learning_rate": 9.245404333732395e-06, "loss": 0.0844, "step": 780 }, { "epoch": 1.1439033321127792, "grad_norm": 0.2577203090564361, "learning_rate": 9.219875983026909e-06, "loss": 0.0834, "step": 781 }, { "epoch": 1.14536799707067, "grad_norm": 0.25652775031593833, "learning_rate": 9.194352746478122e-06, "loss": 0.0806, "step": 782 }, { "epoch": 1.146832662028561, "grad_norm": 0.27571423516211513, "learning_rate": 9.168834791405374e-06, "loss": 0.1015, "step": 783 }, { "epoch": 1.1482973269864518, "grad_norm": 0.25833280528124036, "learning_rate": 9.143322285093371e-06, "loss": 0.0879, "step": 784 }, { "epoch": 1.1497619919443427, "grad_norm": 0.27768929864428427, "learning_rate": 9.117815394791107e-06, "loss": 0.0916, "step": 785 }, { "epoch": 1.1512266569022336, "grad_norm": 0.27218967256543264, "learning_rate": 9.092314287710757e-06, "loss": 0.0791, "step": 786 }, { "epoch": 1.1526913218601245, "grad_norm": 0.290779647815006, "learning_rate": 9.066819131026588e-06, "loss": 0.1023, "step": 787 }, { "epoch": 1.1541559868180153, "grad_norm": 0.27792973768255624, "learning_rate": 9.041330091873852e-06, "loss": 0.0979, "step": 788 }, { "epoch": 1.1556206517759062, "grad_norm": 0.2373738656510741, "learning_rate": 9.015847337347695e-06, "loss": 0.0806, "step": 789 }, { "epoch": 1.157085316733797, "grad_norm": 0.25696429168971635, "learning_rate": 8.990371034502078e-06, "loss": 0.0918, "step": 790 }, { "epoch": 1.158549981691688, "grad_norm": 0.280392566025079, "learning_rate": 8.964901350348648e-06, "loss": 0.0949, "step": 791 }, { "epoch": 1.1600146466495789, "grad_norm": 0.2535223646372367, "learning_rate": 8.939438451855684e-06, "loss": 0.0816, "step": 792 }, { "epoch": 1.1614793116074698, "grad_norm": 0.24735896623108233, "learning_rate": 8.913982505946958e-06, "loss": 0.0876, "step": 793 }, { "epoch": 1.1629439765653606, "grad_norm": 0.2543060334798377, "learning_rate": 8.888533679500688e-06, "loss": 0.0866, "step": 794 }, { "epoch": 1.1644086415232515, "grad_norm": 0.2648636536906661, "learning_rate": 8.863092139348397e-06, "loss": 0.1009, "step": 795 }, { "epoch": 1.1658733064811424, "grad_norm": 0.24166274223060005, "learning_rate": 8.837658052273863e-06, "loss": 0.0744, "step": 796 }, { "epoch": 1.1673379714390333, "grad_norm": 0.2593529998079471, "learning_rate": 8.812231585011987e-06, "loss": 0.1102, "step": 797 }, { "epoch": 1.1688026363969242, "grad_norm": 0.2547937310328503, "learning_rate": 8.78681290424773e-06, "loss": 0.083, "step": 798 }, { "epoch": 1.170267301354815, "grad_norm": 0.2678047431199921, "learning_rate": 8.761402176615002e-06, "loss": 0.1004, "step": 799 }, { "epoch": 1.171731966312706, "grad_norm": 0.2881247911795614, "learning_rate": 8.735999568695581e-06, "loss": 0.0941, "step": 800 }, { "epoch": 1.1731966312705968, "grad_norm": 0.306665541632391, "learning_rate": 8.710605247018002e-06, "loss": 0.1016, "step": 801 }, { "epoch": 1.1746612962284877, "grad_norm": 0.26549998092914273, "learning_rate": 8.685219378056503e-06, "loss": 0.1025, "step": 802 }, { "epoch": 1.1761259611863786, "grad_norm": 0.25884918535779855, "learning_rate": 8.659842128229886e-06, "loss": 0.0796, "step": 803 }, { "epoch": 1.1775906261442695, "grad_norm": 0.29483213665665864, "learning_rate": 8.634473663900461e-06, "loss": 0.0936, "step": 804 }, { "epoch": 1.1790552911021603, "grad_norm": 0.2827884978675775, "learning_rate": 8.609114151372947e-06, "loss": 0.0877, "step": 805 }, { "epoch": 1.1805199560600512, "grad_norm": 0.2929302563222184, "learning_rate": 8.583763756893366e-06, "loss": 0.0955, "step": 806 }, { "epoch": 1.181984621017942, "grad_norm": 0.2538641882723923, "learning_rate": 8.558422646647984e-06, "loss": 0.0808, "step": 807 }, { "epoch": 1.183449285975833, "grad_norm": 0.26631517555229367, "learning_rate": 8.533090986762183e-06, "loss": 0.0834, "step": 808 }, { "epoch": 1.1849139509337239, "grad_norm": 0.2788403763341659, "learning_rate": 8.507768943299415e-06, "loss": 0.0969, "step": 809 }, { "epoch": 1.1863786158916148, "grad_norm": 0.2781094675108167, "learning_rate": 8.482456682260069e-06, "loss": 0.0963, "step": 810 }, { "epoch": 1.1878432808495056, "grad_norm": 0.2473923042956088, "learning_rate": 8.457154369580424e-06, "loss": 0.0843, "step": 811 }, { "epoch": 1.1893079458073965, "grad_norm": 0.23006340181208546, "learning_rate": 8.431862171131524e-06, "loss": 0.0695, "step": 812 }, { "epoch": 1.1907726107652874, "grad_norm": 0.2556855544794983, "learning_rate": 8.406580252718125e-06, "loss": 0.0773, "step": 813 }, { "epoch": 1.1922372757231783, "grad_norm": 0.26970894596044315, "learning_rate": 8.381308780077575e-06, "loss": 0.0862, "step": 814 }, { "epoch": 1.1937019406810692, "grad_norm": 0.25936009464878035, "learning_rate": 8.356047918878762e-06, "loss": 0.0909, "step": 815 }, { "epoch": 1.19516660563896, "grad_norm": 0.26608540904879535, "learning_rate": 8.330797834720993e-06, "loss": 0.0833, "step": 816 }, { "epoch": 1.196631270596851, "grad_norm": 0.2591055743281483, "learning_rate": 8.305558693132943e-06, "loss": 0.0798, "step": 817 }, { "epoch": 1.1980959355547418, "grad_norm": 0.2607615757724153, "learning_rate": 8.280330659571532e-06, "loss": 0.0891, "step": 818 }, { "epoch": 1.1995606005126327, "grad_norm": 0.27689441116344976, "learning_rate": 8.255113899420873e-06, "loss": 0.1042, "step": 819 }, { "epoch": 1.2010252654705236, "grad_norm": 0.2634058051717049, "learning_rate": 8.229908577991177e-06, "loss": 0.0794, "step": 820 }, { "epoch": 1.2024899304284145, "grad_norm": 0.2712603230168793, "learning_rate": 8.204714860517662e-06, "loss": 0.0872, "step": 821 }, { "epoch": 1.2039545953863053, "grad_norm": 0.26002461903187646, "learning_rate": 8.179532912159477e-06, "loss": 0.0806, "step": 822 }, { "epoch": 1.2054192603441962, "grad_norm": 0.2778681441466659, "learning_rate": 8.154362897998619e-06, "loss": 0.0954, "step": 823 }, { "epoch": 1.206883925302087, "grad_norm": 0.2597653658409811, "learning_rate": 8.129204983038847e-06, "loss": 0.0837, "step": 824 }, { "epoch": 1.208348590259978, "grad_norm": 0.2409553005949464, "learning_rate": 8.104059332204606e-06, "loss": 0.0797, "step": 825 }, { "epoch": 1.2098132552178689, "grad_norm": 0.26099557745468915, "learning_rate": 8.07892611033994e-06, "loss": 0.0856, "step": 826 }, { "epoch": 1.2112779201757597, "grad_norm": 0.2670243095090208, "learning_rate": 8.053805482207418e-06, "loss": 0.0981, "step": 827 }, { "epoch": 1.2127425851336506, "grad_norm": 0.25384500506062874, "learning_rate": 8.028697612487046e-06, "loss": 0.0855, "step": 828 }, { "epoch": 1.2142072500915415, "grad_norm": 0.27259004018505645, "learning_rate": 8.003602665775189e-06, "loss": 0.0895, "step": 829 }, { "epoch": 1.2156719150494324, "grad_norm": 0.24539935533491744, "learning_rate": 7.978520806583503e-06, "loss": 0.075, "step": 830 }, { "epoch": 1.2171365800073233, "grad_norm": 0.28931365047548946, "learning_rate": 7.95345219933784e-06, "loss": 0.0939, "step": 831 }, { "epoch": 1.2186012449652142, "grad_norm": 0.24781333667809402, "learning_rate": 7.92839700837718e-06, "loss": 0.0847, "step": 832 }, { "epoch": 1.220065909923105, "grad_norm": 0.2597660565925631, "learning_rate": 7.903355397952557e-06, "loss": 0.0847, "step": 833 }, { "epoch": 1.221530574880996, "grad_norm": 0.25108362024888203, "learning_rate": 7.878327532225964e-06, "loss": 0.0729, "step": 834 }, { "epoch": 1.2229952398388868, "grad_norm": 0.2879443475536739, "learning_rate": 7.853313575269306e-06, "loss": 0.0926, "step": 835 }, { "epoch": 1.2244599047967777, "grad_norm": 0.24985250820254382, "learning_rate": 7.828313691063294e-06, "loss": 0.0729, "step": 836 }, { "epoch": 1.2259245697546686, "grad_norm": 0.2528963890297066, "learning_rate": 7.803328043496394e-06, "loss": 0.069, "step": 837 }, { "epoch": 1.2273892347125595, "grad_norm": 0.2470937539451322, "learning_rate": 7.778356796363734e-06, "loss": 0.0829, "step": 838 }, { "epoch": 1.2288538996704503, "grad_norm": 0.2521378561935959, "learning_rate": 7.753400113366052e-06, "loss": 0.0791, "step": 839 }, { "epoch": 1.2303185646283412, "grad_norm": 0.2631772936605158, "learning_rate": 7.728458158108592e-06, "loss": 0.0905, "step": 840 }, { "epoch": 1.231783229586232, "grad_norm": 0.2444159483676253, "learning_rate": 7.703531094100068e-06, "loss": 0.0741, "step": 841 }, { "epoch": 1.233247894544123, "grad_norm": 0.281794329109641, "learning_rate": 7.678619084751554e-06, "loss": 0.088, "step": 842 }, { "epoch": 1.2347125595020139, "grad_norm": 0.27262976373807446, "learning_rate": 7.653722293375453e-06, "loss": 0.1, "step": 843 }, { "epoch": 1.2361772244599047, "grad_norm": 0.25816315241450866, "learning_rate": 7.628840883184385e-06, "loss": 0.0906, "step": 844 }, { "epoch": 1.2376418894177956, "grad_norm": 0.2537369926820909, "learning_rate": 7.603975017290159e-06, "loss": 0.0777, "step": 845 }, { "epoch": 1.2391065543756865, "grad_norm": 0.2916357929163886, "learning_rate": 7.579124858702658e-06, "loss": 0.1011, "step": 846 }, { "epoch": 1.2405712193335774, "grad_norm": 0.27484734516089476, "learning_rate": 7.5542905703288175e-06, "loss": 0.0954, "step": 847 }, { "epoch": 1.2420358842914683, "grad_norm": 0.2704547563666006, "learning_rate": 7.529472314971522e-06, "loss": 0.0929, "step": 848 }, { "epoch": 1.2435005492493592, "grad_norm": 0.24275302504305166, "learning_rate": 7.504670255328548e-06, "loss": 0.0684, "step": 849 }, { "epoch": 1.24496521420725, "grad_norm": 0.2535609358202711, "learning_rate": 7.4798845539915126e-06, "loss": 0.071, "step": 850 }, { "epoch": 1.246429879165141, "grad_norm": 0.25442354937020867, "learning_rate": 7.455115373444779e-06, "loss": 0.0808, "step": 851 }, { "epoch": 1.2478945441230318, "grad_norm": 0.25458587191704724, "learning_rate": 7.430362876064424e-06, "loss": 0.0806, "step": 852 }, { "epoch": 1.2493592090809227, "grad_norm": 0.2636459021351091, "learning_rate": 7.4056272241171425e-06, "loss": 0.0975, "step": 853 }, { "epoch": 1.2508238740388136, "grad_norm": 0.26588179392313827, "learning_rate": 7.380908579759207e-06, "loss": 0.0908, "step": 854 }, { "epoch": 1.2522885389967044, "grad_norm": 0.25802097833264015, "learning_rate": 7.356207105035389e-06, "loss": 0.0727, "step": 855 }, { "epoch": 1.2537532039545953, "grad_norm": 0.2525429271439887, "learning_rate": 7.331522961877914e-06, "loss": 0.0852, "step": 856 }, { "epoch": 1.2552178689124862, "grad_norm": 0.24515558724346895, "learning_rate": 7.30685631210537e-06, "loss": 0.0716, "step": 857 }, { "epoch": 1.256682533870377, "grad_norm": 0.25450544202115294, "learning_rate": 7.282207317421691e-06, "loss": 0.0851, "step": 858 }, { "epoch": 1.258147198828268, "grad_norm": 0.26104307630858015, "learning_rate": 7.2575761394150476e-06, "loss": 0.0824, "step": 859 }, { "epoch": 1.2596118637861589, "grad_norm": 0.26362892269219795, "learning_rate": 7.232962939556831e-06, "loss": 0.0754, "step": 860 }, { "epoch": 1.2610765287440497, "grad_norm": 0.26960831313830425, "learning_rate": 7.20836787920056e-06, "loss": 0.088, "step": 861 }, { "epoch": 1.2625411937019406, "grad_norm": 0.2884650032848352, "learning_rate": 7.183791119580854e-06, "loss": 0.0844, "step": 862 }, { "epoch": 1.2640058586598315, "grad_norm": 0.2673739678569061, "learning_rate": 7.159232821812348e-06, "loss": 0.0917, "step": 863 }, { "epoch": 1.2654705236177224, "grad_norm": 0.2647717036142495, "learning_rate": 7.134693146888652e-06, "loss": 0.074, "step": 864 }, { "epoch": 1.2669351885756133, "grad_norm": 0.24828043520372733, "learning_rate": 7.1101722556813e-06, "loss": 0.0731, "step": 865 }, { "epoch": 1.2683998535335042, "grad_norm": 0.29021631743576276, "learning_rate": 7.085670308938674e-06, "loss": 0.0943, "step": 866 }, { "epoch": 1.269864518491395, "grad_norm": 0.25757971293739884, "learning_rate": 7.061187467284985e-06, "loss": 0.0762, "step": 867 }, { "epoch": 1.271329183449286, "grad_norm": 0.2707115051349976, "learning_rate": 7.0367238912191734e-06, "loss": 0.086, "step": 868 }, { "epoch": 1.2727938484071768, "grad_norm": 0.2416159349328539, "learning_rate": 7.012279741113909e-06, "loss": 0.0731, "step": 869 }, { "epoch": 1.2742585133650677, "grad_norm": 0.25001307113728843, "learning_rate": 6.987855177214489e-06, "loss": 0.0793, "step": 870 }, { "epoch": 1.2757231783229586, "grad_norm": 0.267697774653143, "learning_rate": 6.963450359637835e-06, "loss": 0.0881, "step": 871 }, { "epoch": 1.2771878432808494, "grad_norm": 0.2656180743033591, "learning_rate": 6.939065448371398e-06, "loss": 0.0805, "step": 872 }, { "epoch": 1.2786525082387403, "grad_norm": 0.24388348217463868, "learning_rate": 6.914700603272151e-06, "loss": 0.0739, "step": 873 }, { "epoch": 1.2801171731966312, "grad_norm": 0.27460701489232753, "learning_rate": 6.8903559840655075e-06, "loss": 0.079, "step": 874 }, { "epoch": 1.281581838154522, "grad_norm": 0.25551151990608106, "learning_rate": 6.866031750344302e-06, "loss": 0.0767, "step": 875 }, { "epoch": 1.283046503112413, "grad_norm": 0.25705060856070827, "learning_rate": 6.841728061567713e-06, "loss": 0.084, "step": 876 }, { "epoch": 1.2845111680703039, "grad_norm": 0.2535751605501693, "learning_rate": 6.817445077060256e-06, "loss": 0.0859, "step": 877 }, { "epoch": 1.2859758330281947, "grad_norm": 0.26041563534508294, "learning_rate": 6.7931829560107e-06, "loss": 0.0841, "step": 878 }, { "epoch": 1.2874404979860856, "grad_norm": 0.2537135436483843, "learning_rate": 6.768941857471054e-06, "loss": 0.0775, "step": 879 }, { "epoch": 1.2889051629439765, "grad_norm": 0.2659246611826128, "learning_rate": 6.744721940355508e-06, "loss": 0.0807, "step": 880 }, { "epoch": 1.2903698279018674, "grad_norm": 0.2540825283987494, "learning_rate": 6.720523363439393e-06, "loss": 0.0896, "step": 881 }, { "epoch": 1.2918344928597583, "grad_norm": 0.2575386822036422, "learning_rate": 6.69634628535815e-06, "loss": 0.0797, "step": 882 }, { "epoch": 1.2932991578176491, "grad_norm": 0.2649389460574201, "learning_rate": 6.672190864606276e-06, "loss": 0.0878, "step": 883 }, { "epoch": 1.29476382277554, "grad_norm": 0.2638636466333762, "learning_rate": 6.648057259536297e-06, "loss": 0.0831, "step": 884 }, { "epoch": 1.296228487733431, "grad_norm": 0.26545840639974744, "learning_rate": 6.623945628357718e-06, "loss": 0.081, "step": 885 }, { "epoch": 1.2976931526913218, "grad_norm": 0.27477957230836636, "learning_rate": 6.599856129136006e-06, "loss": 0.0787, "step": 886 }, { "epoch": 1.2991578176492127, "grad_norm": 0.2584584388897469, "learning_rate": 6.575788919791522e-06, "loss": 0.0888, "step": 887 }, { "epoch": 1.3006224826071036, "grad_norm": 0.2553038074640935, "learning_rate": 6.551744158098521e-06, "loss": 0.0804, "step": 888 }, { "epoch": 1.3020871475649944, "grad_norm": 0.2614338959940162, "learning_rate": 6.527722001684087e-06, "loss": 0.0842, "step": 889 }, { "epoch": 1.3035518125228853, "grad_norm": 0.27266050110549406, "learning_rate": 6.503722608027129e-06, "loss": 0.0953, "step": 890 }, { "epoch": 1.3050164774807762, "grad_norm": 0.26498428407714436, "learning_rate": 6.479746134457319e-06, "loss": 0.0948, "step": 891 }, { "epoch": 1.306481142438667, "grad_norm": 0.2425181307556485, "learning_rate": 6.455792738154074e-06, "loss": 0.0771, "step": 892 }, { "epoch": 1.307945807396558, "grad_norm": 0.3031551149501075, "learning_rate": 6.431862576145546e-06, "loss": 0.1127, "step": 893 }, { "epoch": 1.3094104723544489, "grad_norm": 0.2605793802598279, "learning_rate": 6.407955805307547e-06, "loss": 0.0838, "step": 894 }, { "epoch": 1.3108751373123397, "grad_norm": 0.2857025614498841, "learning_rate": 6.3840725823625685e-06, "loss": 0.0958, "step": 895 }, { "epoch": 1.3123398022702306, "grad_norm": 0.26266676335382666, "learning_rate": 6.3602130638787155e-06, "loss": 0.0919, "step": 896 }, { "epoch": 1.3138044672281215, "grad_norm": 0.2705157836138465, "learning_rate": 6.336377406268712e-06, "loss": 0.0843, "step": 897 }, { "epoch": 1.3152691321860124, "grad_norm": 0.2442729301782896, "learning_rate": 6.312565765788843e-06, "loss": 0.0696, "step": 898 }, { "epoch": 1.3167337971439033, "grad_norm": 0.23540342455420937, "learning_rate": 6.288778298537966e-06, "loss": 0.0736, "step": 899 }, { "epoch": 1.3181984621017941, "grad_norm": 0.2772712225095351, "learning_rate": 6.2650151604564534e-06, "loss": 0.0938, "step": 900 }, { "epoch": 1.319663127059685, "grad_norm": 0.25309744480957075, "learning_rate": 6.241276507325198e-06, "loss": 0.0758, "step": 901 }, { "epoch": 1.321127792017576, "grad_norm": 0.25863367628675915, "learning_rate": 6.217562494764569e-06, "loss": 0.0833, "step": 902 }, { "epoch": 1.3225924569754668, "grad_norm": 0.26530513213868256, "learning_rate": 6.193873278233417e-06, "loss": 0.0841, "step": 903 }, { "epoch": 1.3240571219333577, "grad_norm": 0.24979685098439777, "learning_rate": 6.170209013028021e-06, "loss": 0.0876, "step": 904 }, { "epoch": 1.3255217868912486, "grad_norm": 0.2523647371099479, "learning_rate": 6.146569854281111e-06, "loss": 0.082, "step": 905 }, { "epoch": 1.3269864518491394, "grad_norm": 0.2666393886883001, "learning_rate": 6.1229559569608144e-06, "loss": 0.0982, "step": 906 }, { "epoch": 1.3284511168070303, "grad_norm": 0.25240352584777037, "learning_rate": 6.099367475869658e-06, "loss": 0.0815, "step": 907 }, { "epoch": 1.3299157817649212, "grad_norm": 0.24034010865003308, "learning_rate": 6.075804565643562e-06, "loss": 0.0732, "step": 908 }, { "epoch": 1.331380446722812, "grad_norm": 0.2601774368496189, "learning_rate": 6.052267380750796e-06, "loss": 0.0804, "step": 909 }, { "epoch": 1.332845111680703, "grad_norm": 0.2416658354239181, "learning_rate": 6.028756075491007e-06, "loss": 0.0671, "step": 910 }, { "epoch": 1.3343097766385938, "grad_norm": 0.23420758822988802, "learning_rate": 6.005270803994165e-06, "loss": 0.0694, "step": 911 }, { "epoch": 1.3357744415964847, "grad_norm": 0.23323493701217127, "learning_rate": 5.981811720219593e-06, "loss": 0.0677, "step": 912 }, { "epoch": 1.3372391065543756, "grad_norm": 0.26506406252750964, "learning_rate": 5.958378977954922e-06, "loss": 0.0741, "step": 913 }, { "epoch": 1.3387037715122665, "grad_norm": 0.255574393142094, "learning_rate": 5.934972730815115e-06, "loss": 0.0811, "step": 914 }, { "epoch": 1.3401684364701574, "grad_norm": 0.2550215436110789, "learning_rate": 5.9115931322414285e-06, "loss": 0.0862, "step": 915 }, { "epoch": 1.3416331014280483, "grad_norm": 0.25209588662794513, "learning_rate": 5.888240335500439e-06, "loss": 0.0721, "step": 916 }, { "epoch": 1.3430977663859391, "grad_norm": 0.26335632891089883, "learning_rate": 5.8649144936830045e-06, "loss": 0.0749, "step": 917 }, { "epoch": 1.34456243134383, "grad_norm": 0.26611291985369345, "learning_rate": 5.841615759703296e-06, "loss": 0.0856, "step": 918 }, { "epoch": 1.346027096301721, "grad_norm": 0.3180398065874917, "learning_rate": 5.818344286297756e-06, "loss": 0.1111, "step": 919 }, { "epoch": 1.3474917612596118, "grad_norm": 0.2470882409448809, "learning_rate": 5.795100226024145e-06, "loss": 0.0842, "step": 920 }, { "epoch": 1.3489564262175027, "grad_norm": 0.24747051474580573, "learning_rate": 5.771883731260492e-06, "loss": 0.0849, "step": 921 }, { "epoch": 1.3504210911753936, "grad_norm": 0.2689238693234873, "learning_rate": 5.748694954204126e-06, "loss": 0.09, "step": 922 }, { "epoch": 1.3518857561332844, "grad_norm": 0.24131459800140173, "learning_rate": 5.7255340468706776e-06, "loss": 0.0788, "step": 923 }, { "epoch": 1.3533504210911753, "grad_norm": 0.23703584143200643, "learning_rate": 5.702401161093061e-06, "loss": 0.07, "step": 924 }, { "epoch": 1.3548150860490662, "grad_norm": 0.24911481329061927, "learning_rate": 5.679296448520509e-06, "loss": 0.0766, "step": 925 }, { "epoch": 1.356279751006957, "grad_norm": 0.2493171773305748, "learning_rate": 5.65622006061755e-06, "loss": 0.0743, "step": 926 }, { "epoch": 1.357744415964848, "grad_norm": 0.25543638955248743, "learning_rate": 5.633172148663039e-06, "loss": 0.0869, "step": 927 }, { "epoch": 1.3592090809227388, "grad_norm": 0.24551184220415498, "learning_rate": 5.610152863749143e-06, "loss": 0.069, "step": 928 }, { "epoch": 1.3606737458806297, "grad_norm": 0.26501893240626057, "learning_rate": 5.5871623567803756e-06, "loss": 0.0696, "step": 929 }, { "epoch": 1.3621384108385206, "grad_norm": 0.27208453202862887, "learning_rate": 5.564200778472583e-06, "loss": 0.0795, "step": 930 }, { "epoch": 1.3636030757964115, "grad_norm": 0.27329376137848915, "learning_rate": 5.5412682793519765e-06, "loss": 0.0772, "step": 931 }, { "epoch": 1.3650677407543024, "grad_norm": 0.2631020886292811, "learning_rate": 5.5183650097541295e-06, "loss": 0.0801, "step": 932 }, { "epoch": 1.3665324057121933, "grad_norm": 0.24801113517701595, "learning_rate": 5.495491119823007e-06, "loss": 0.0722, "step": 933 }, { "epoch": 1.3679970706700841, "grad_norm": 0.2650739437868264, "learning_rate": 5.472646759509963e-06, "loss": 0.0936, "step": 934 }, { "epoch": 1.369461735627975, "grad_norm": 0.2605363396379517, "learning_rate": 5.449832078572781e-06, "loss": 0.0878, "step": 935 }, { "epoch": 1.370926400585866, "grad_norm": 0.2646339088927977, "learning_rate": 5.427047226574671e-06, "loss": 0.0759, "step": 936 }, { "epoch": 1.3723910655437568, "grad_norm": 0.2601636184080717, "learning_rate": 5.404292352883291e-06, "loss": 0.0944, "step": 937 }, { "epoch": 1.3738557305016477, "grad_norm": 0.2837985033272464, "learning_rate": 5.3815676066697946e-06, "loss": 0.1013, "step": 938 }, { "epoch": 1.3753203954595388, "grad_norm": 0.284455217744833, "learning_rate": 5.358873136907808e-06, "loss": 0.0788, "step": 939 }, { "epoch": 1.3767850604174297, "grad_norm": 0.26876984673679605, "learning_rate": 5.336209092372502e-06, "loss": 0.0902, "step": 940 }, { "epoch": 1.3782497253753205, "grad_norm": 0.27988318639906634, "learning_rate": 5.313575621639568e-06, "loss": 0.0857, "step": 941 }, { "epoch": 1.3797143903332114, "grad_norm": 0.29898311525846655, "learning_rate": 5.290972873084296e-06, "loss": 0.0876, "step": 942 }, { "epoch": 1.3811790552911023, "grad_norm": 0.2392250599528443, "learning_rate": 5.268400994880547e-06, "loss": 0.0688, "step": 943 }, { "epoch": 1.3826437202489932, "grad_norm": 0.24702025812837353, "learning_rate": 5.245860134999831e-06, "loss": 0.0763, "step": 944 }, { "epoch": 1.384108385206884, "grad_norm": 0.25177865615717077, "learning_rate": 5.223350441210303e-06, "loss": 0.0804, "step": 945 }, { "epoch": 1.385573050164775, "grad_norm": 0.27025384321008283, "learning_rate": 5.200872061075814e-06, "loss": 0.0865, "step": 946 }, { "epoch": 1.3870377151226658, "grad_norm": 0.3196326325850145, "learning_rate": 5.178425141954926e-06, "loss": 0.1001, "step": 947 }, { "epoch": 1.3885023800805567, "grad_norm": 0.2659528533890264, "learning_rate": 5.15600983099997e-06, "loss": 0.0757, "step": 948 }, { "epoch": 1.3899670450384476, "grad_norm": 0.25677841458575923, "learning_rate": 5.133626275156055e-06, "loss": 0.0655, "step": 949 }, { "epoch": 1.3914317099963385, "grad_norm": 0.2847121257417236, "learning_rate": 5.111274621160127e-06, "loss": 0.0866, "step": 950 }, { "epoch": 1.3928963749542294, "grad_norm": 0.27638736754038373, "learning_rate": 5.088955015539989e-06, "loss": 0.1017, "step": 951 }, { "epoch": 1.3943610399121202, "grad_norm": 0.24594995873570738, "learning_rate": 5.06666760461335e-06, "loss": 0.0816, "step": 952 }, { "epoch": 1.3958257048700111, "grad_norm": 0.24663161576513942, "learning_rate": 5.044412534486873e-06, "loss": 0.0721, "step": 953 }, { "epoch": 1.397290369827902, "grad_norm": 0.24164833719808607, "learning_rate": 5.0221899510551965e-06, "loss": 0.0617, "step": 954 }, { "epoch": 1.3987550347857929, "grad_norm": 0.2772777632729084, "learning_rate": 5.000000000000003e-06, "loss": 0.0837, "step": 955 }, { "epoch": 1.4002196997436838, "grad_norm": 0.25527189944389456, "learning_rate": 4.977842826789034e-06, "loss": 0.0812, "step": 956 }, { "epoch": 1.4016843647015746, "grad_norm": 0.26722398889539783, "learning_rate": 4.955718576675176e-06, "loss": 0.0762, "step": 957 }, { "epoch": 1.4031490296594655, "grad_norm": 0.2575445402991656, "learning_rate": 4.933627394695464e-06, "loss": 0.0761, "step": 958 }, { "epoch": 1.4046136946173564, "grad_norm": 0.26548423508114666, "learning_rate": 4.911569425670168e-06, "loss": 0.0797, "step": 959 }, { "epoch": 1.4060783595752473, "grad_norm": 0.23644665405104945, "learning_rate": 4.88954481420182e-06, "loss": 0.073, "step": 960 }, { "epoch": 1.4075430245331382, "grad_norm": 0.25187795846556027, "learning_rate": 4.867553704674279e-06, "loss": 0.0872, "step": 961 }, { "epoch": 1.409007689491029, "grad_norm": 0.2698865546073546, "learning_rate": 4.845596241251773e-06, "loss": 0.0895, "step": 962 }, { "epoch": 1.41047235444892, "grad_norm": 0.24104521653041425, "learning_rate": 4.823672567877973e-06, "loss": 0.0758, "step": 963 }, { "epoch": 1.4119370194068108, "grad_norm": 0.2442657706715064, "learning_rate": 4.801782828275019e-06, "loss": 0.0702, "step": 964 }, { "epoch": 1.4134016843647017, "grad_norm": 0.2592721496347663, "learning_rate": 4.779927165942616e-06, "loss": 0.0702, "step": 965 }, { "epoch": 1.4148663493225926, "grad_norm": 0.249769758106348, "learning_rate": 4.758105724157058e-06, "loss": 0.0717, "step": 966 }, { "epoch": 1.4163310142804835, "grad_norm": 0.30631601628088784, "learning_rate": 4.7363186459703055e-06, "loss": 0.0855, "step": 967 }, { "epoch": 1.4177956792383744, "grad_norm": 0.2580744961558955, "learning_rate": 4.714566074209058e-06, "loss": 0.0815, "step": 968 }, { "epoch": 1.4192603441962652, "grad_norm": 0.27175417696412074, "learning_rate": 4.692848151473789e-06, "loss": 0.0884, "step": 969 }, { "epoch": 1.4207250091541561, "grad_norm": 0.26107195316664245, "learning_rate": 4.671165020137844e-06, "loss": 0.0914, "step": 970 }, { "epoch": 1.422189674112047, "grad_norm": 0.2659897433575913, "learning_rate": 4.649516822346474e-06, "loss": 0.0841, "step": 971 }, { "epoch": 1.4236543390699379, "grad_norm": 0.24447135185611163, "learning_rate": 4.62790370001594e-06, "loss": 0.0761, "step": 972 }, { "epoch": 1.4251190040278288, "grad_norm": 0.27972979502307804, "learning_rate": 4.606325794832545e-06, "loss": 0.0771, "step": 973 }, { "epoch": 1.4265836689857196, "grad_norm": 0.2736878498962296, "learning_rate": 4.584783248251738e-06, "loss": 0.0842, "step": 974 }, { "epoch": 1.4280483339436105, "grad_norm": 0.2837151699593587, "learning_rate": 4.563276201497161e-06, "loss": 0.0929, "step": 975 }, { "epoch": 1.4295129989015014, "grad_norm": 0.2517370953490891, "learning_rate": 4.5418047955597465e-06, "loss": 0.0702, "step": 976 }, { "epoch": 1.4309776638593923, "grad_norm": 0.28908876741636197, "learning_rate": 4.520369171196766e-06, "loss": 0.0928, "step": 977 }, { "epoch": 1.4324423288172832, "grad_norm": 0.3619682932427526, "learning_rate": 4.4989694689309394e-06, "loss": 0.086, "step": 978 }, { "epoch": 1.433906993775174, "grad_norm": 0.28663475644140607, "learning_rate": 4.477605829049479e-06, "loss": 0.0811, "step": 979 }, { "epoch": 1.435371658733065, "grad_norm": 0.2698934408611363, "learning_rate": 4.456278391603207e-06, "loss": 0.0811, "step": 980 }, { "epoch": 1.4368363236909558, "grad_norm": 0.2663708566419197, "learning_rate": 4.434987296405602e-06, "loss": 0.106, "step": 981 }, { "epoch": 1.4383009886488467, "grad_norm": 0.24655631319842156, "learning_rate": 4.413732683031901e-06, "loss": 0.078, "step": 982 }, { "epoch": 1.4397656536067376, "grad_norm": 0.24077565145998617, "learning_rate": 4.392514690818194e-06, "loss": 0.0695, "step": 983 }, { "epoch": 1.4412303185646285, "grad_norm": 0.2717847069075042, "learning_rate": 4.37133345886048e-06, "loss": 0.0902, "step": 984 }, { "epoch": 1.4426949835225193, "grad_norm": 0.2318112119726766, "learning_rate": 4.350189126013793e-06, "loss": 0.0641, "step": 985 }, { "epoch": 1.4441596484804102, "grad_norm": 0.26503499304444195, "learning_rate": 4.329081830891253e-06, "loss": 0.0967, "step": 986 }, { "epoch": 1.4456243134383011, "grad_norm": 0.2691482856135504, "learning_rate": 4.308011711863196e-06, "loss": 0.0911, "step": 987 }, { "epoch": 1.447088978396192, "grad_norm": 0.27532174581408686, "learning_rate": 4.28697890705623e-06, "loss": 0.0885, "step": 988 }, { "epoch": 1.4485536433540829, "grad_norm": 0.27243518360307484, "learning_rate": 4.265983554352361e-06, "loss": 0.0815, "step": 989 }, { "epoch": 1.4500183083119738, "grad_norm": 0.25834681516095914, "learning_rate": 4.245025791388063e-06, "loss": 0.0736, "step": 990 }, { "epoch": 1.4514829732698646, "grad_norm": 0.2850395862995923, "learning_rate": 4.224105755553402e-06, "loss": 0.0851, "step": 991 }, { "epoch": 1.4529476382277555, "grad_norm": 0.3092877439958859, "learning_rate": 4.203223583991103e-06, "loss": 0.1023, "step": 992 }, { "epoch": 1.4544123031856464, "grad_norm": 0.26774401365459694, "learning_rate": 4.18237941359569e-06, "loss": 0.0829, "step": 993 }, { "epoch": 1.4558769681435373, "grad_norm": 0.23800118678712484, "learning_rate": 4.161573381012547e-06, "loss": 0.0676, "step": 994 }, { "epoch": 1.4573416331014282, "grad_norm": 0.2666429232022083, "learning_rate": 4.140805622637062e-06, "loss": 0.0859, "step": 995 }, { "epoch": 1.458806298059319, "grad_norm": 0.24644352364865663, "learning_rate": 4.1200762746137e-06, "loss": 0.0761, "step": 996 }, { "epoch": 1.46027096301721, "grad_norm": 0.2860182896303882, "learning_rate": 4.099385472835128e-06, "loss": 0.1061, "step": 997 }, { "epoch": 1.4617356279751008, "grad_norm": 0.24601895494668852, "learning_rate": 4.078733352941322e-06, "loss": 0.0889, "step": 998 }, { "epoch": 1.4632002929329917, "grad_norm": 0.2766785303442865, "learning_rate": 4.05812005031868e-06, "loss": 0.0904, "step": 999 }, { "epoch": 1.4646649578908826, "grad_norm": 0.2652781699314485, "learning_rate": 4.0375457000991216e-06, "loss": 0.0816, "step": 1000 }, { "epoch": 1.4661296228487735, "grad_norm": 0.25088853578582626, "learning_rate": 4.01701043715922e-06, "loss": 0.0762, "step": 1001 }, { "epoch": 1.4675942878066643, "grad_norm": 0.2818469333221271, "learning_rate": 3.996514396119301e-06, "loss": 0.0774, "step": 1002 }, { "epoch": 1.4690589527645552, "grad_norm": 0.26036857004758346, "learning_rate": 3.976057711342578e-06, "loss": 0.0795, "step": 1003 }, { "epoch": 1.470523617722446, "grad_norm": 0.27442281645232014, "learning_rate": 3.95564051693425e-06, "loss": 0.07, "step": 1004 }, { "epoch": 1.471988282680337, "grad_norm": 0.24267887450363582, "learning_rate": 3.935262946740648e-06, "loss": 0.0756, "step": 1005 }, { "epoch": 1.4734529476382279, "grad_norm": 0.2511350381009027, "learning_rate": 3.914925134348328e-06, "loss": 0.069, "step": 1006 }, { "epoch": 1.4749176125961188, "grad_norm": 0.2342240326378387, "learning_rate": 3.894627213083227e-06, "loss": 0.0688, "step": 1007 }, { "epoch": 1.4763822775540096, "grad_norm": 0.23553774309732706, "learning_rate": 3.874369316009759e-06, "loss": 0.0744, "step": 1008 }, { "epoch": 1.4778469425119005, "grad_norm": 0.23926474659226465, "learning_rate": 3.85415157592997e-06, "loss": 0.0734, "step": 1009 }, { "epoch": 1.4793116074697914, "grad_norm": 0.268715175607369, "learning_rate": 3.833974125382639e-06, "loss": 0.0723, "step": 1010 }, { "epoch": 1.4807762724276823, "grad_norm": 0.2662279770589081, "learning_rate": 3.8138370966424386e-06, "loss": 0.0717, "step": 1011 }, { "epoch": 1.4822409373855732, "grad_norm": 0.2737509205481138, "learning_rate": 3.793740621719042e-06, "loss": 0.0814, "step": 1012 }, { "epoch": 1.483705602343464, "grad_norm": 0.2639004103415663, "learning_rate": 3.7736848323562803e-06, "loss": 0.091, "step": 1013 }, { "epoch": 1.485170267301355, "grad_norm": 0.24888682745157728, "learning_rate": 3.753669860031254e-06, "loss": 0.0791, "step": 1014 }, { "epoch": 1.4866349322592458, "grad_norm": 0.2768808484904024, "learning_rate": 3.7336958359534992e-06, "loss": 0.1019, "step": 1015 }, { "epoch": 1.4880995972171367, "grad_norm": 0.24635968007971593, "learning_rate": 3.7137628910640997e-06, "loss": 0.0656, "step": 1016 }, { "epoch": 1.4895642621750276, "grad_norm": 0.2769850878713628, "learning_rate": 3.693871156034854e-06, "loss": 0.0824, "step": 1017 }, { "epoch": 1.4910289271329185, "grad_norm": 0.2618910230484893, "learning_rate": 3.674020761267394e-06, "loss": 0.0749, "step": 1018 }, { "epoch": 1.4924935920908093, "grad_norm": 0.25597642357105027, "learning_rate": 3.6542118368923562e-06, "loss": 0.0698, "step": 1019 }, { "epoch": 1.4939582570487002, "grad_norm": 0.2717269361679958, "learning_rate": 3.634444512768501e-06, "loss": 0.083, "step": 1020 }, { "epoch": 1.495422922006591, "grad_norm": 0.2992590236586871, "learning_rate": 3.61471891848189e-06, "loss": 0.0886, "step": 1021 }, { "epoch": 1.496887586964482, "grad_norm": 0.2455530601341986, "learning_rate": 3.595035183345007e-06, "loss": 0.0666, "step": 1022 }, { "epoch": 1.4983522519223729, "grad_norm": 0.2811055696814055, "learning_rate": 3.575393436395941e-06, "loss": 0.0837, "step": 1023 }, { "epoch": 1.4998169168802638, "grad_norm": 0.25019620839932355, "learning_rate": 3.5557938063975105e-06, "loss": 0.0755, "step": 1024 }, { "epoch": 1.5012815818381546, "grad_norm": 0.24368791106013798, "learning_rate": 3.5362364218364387e-06, "loss": 0.0731, "step": 1025 }, { "epoch": 1.5027462467960455, "grad_norm": 0.26573828045849746, "learning_rate": 3.5167214109225113e-06, "loss": 0.0791, "step": 1026 }, { "epoch": 1.5042109117539364, "grad_norm": 0.2550833556385589, "learning_rate": 3.497248901587721e-06, "loss": 0.08, "step": 1027 }, { "epoch": 1.5056755767118273, "grad_norm": 0.2639518823077813, "learning_rate": 3.477819021485448e-06, "loss": 0.0782, "step": 1028 }, { "epoch": 1.5071402416697182, "grad_norm": 0.2646189162558523, "learning_rate": 3.4584318979896028e-06, "loss": 0.0878, "step": 1029 }, { "epoch": 1.508604906627609, "grad_norm": 0.2641718505669378, "learning_rate": 3.439087658193816e-06, "loss": 0.0936, "step": 1030 }, { "epoch": 1.5100695715855, "grad_norm": 0.24594190472472738, "learning_rate": 3.4197864289105763e-06, "loss": 0.0778, "step": 1031 }, { "epoch": 1.5115342365433908, "grad_norm": 0.28815729095628295, "learning_rate": 3.4005283366704268e-06, "loss": 0.0761, "step": 1032 }, { "epoch": 1.5129989015012817, "grad_norm": 0.2528110175883712, "learning_rate": 3.381313507721111e-06, "loss": 0.0747, "step": 1033 }, { "epoch": 1.5144635664591726, "grad_norm": 0.2533306883004603, "learning_rate": 3.36214206802677e-06, "loss": 0.0691, "step": 1034 }, { "epoch": 1.5159282314170635, "grad_norm": 0.25930598769235536, "learning_rate": 3.343014143267089e-06, "loss": 0.0692, "step": 1035 }, { "epoch": 1.5173928963749543, "grad_norm": 0.25924131927849636, "learning_rate": 3.3239298588365045e-06, "loss": 0.0775, "step": 1036 }, { "epoch": 1.5188575613328452, "grad_norm": 0.2491634194032525, "learning_rate": 3.304889339843347e-06, "loss": 0.0723, "step": 1037 }, { "epoch": 1.520322226290736, "grad_norm": 0.26529983131571094, "learning_rate": 3.285892711109059e-06, "loss": 0.0914, "step": 1038 }, { "epoch": 1.521786891248627, "grad_norm": 0.22521005677274164, "learning_rate": 3.2669400971673425e-06, "loss": 0.0616, "step": 1039 }, { "epoch": 1.5232515562065179, "grad_norm": 0.24331418307789873, "learning_rate": 3.2480316222633614e-06, "loss": 0.0652, "step": 1040 }, { "epoch": 1.5247162211644087, "grad_norm": 0.2488037942280336, "learning_rate": 3.229167410352931e-06, "loss": 0.0715, "step": 1041 }, { "epoch": 1.5261808861222996, "grad_norm": 0.26897121918679934, "learning_rate": 3.210347585101684e-06, "loss": 0.0793, "step": 1042 }, { "epoch": 1.5276455510801905, "grad_norm": 0.25348059670947104, "learning_rate": 3.1915722698842877e-06, "loss": 0.078, "step": 1043 }, { "epoch": 1.5291102160380814, "grad_norm": 0.233264660760853, "learning_rate": 3.1728415877836072e-06, "loss": 0.0761, "step": 1044 }, { "epoch": 1.5305748809959723, "grad_norm": 0.26375032434001777, "learning_rate": 3.154155661589924e-06, "loss": 0.0819, "step": 1045 }, { "epoch": 1.5320395459538632, "grad_norm": 0.26419620695615686, "learning_rate": 3.135514613800108e-06, "loss": 0.0765, "step": 1046 }, { "epoch": 1.533504210911754, "grad_norm": 0.23976923188526442, "learning_rate": 3.1169185666168377e-06, "loss": 0.0692, "step": 1047 }, { "epoch": 1.534968875869645, "grad_norm": 0.2493779944554121, "learning_rate": 3.098367641947775e-06, "loss": 0.0845, "step": 1048 }, { "epoch": 1.5364335408275358, "grad_norm": 0.29326865343508624, "learning_rate": 3.0798619614047885e-06, "loss": 0.0998, "step": 1049 }, { "epoch": 1.5378982057854267, "grad_norm": 0.2789752607489951, "learning_rate": 3.061401646303136e-06, "loss": 0.0886, "step": 1050 }, { "epoch": 1.5393628707433176, "grad_norm": 0.2737662371091596, "learning_rate": 3.04298681766069e-06, "loss": 0.0684, "step": 1051 }, { "epoch": 1.5408275357012085, "grad_norm": 0.2637024271799521, "learning_rate": 3.024617596197121e-06, "loss": 0.0859, "step": 1052 }, { "epoch": 1.5422922006590993, "grad_norm": 0.23552670812452456, "learning_rate": 3.0062941023331316e-06, "loss": 0.0639, "step": 1053 }, { "epoch": 1.5437568656169902, "grad_norm": 0.2442542376702699, "learning_rate": 2.988016456189644e-06, "loss": 0.0721, "step": 1054 }, { "epoch": 1.545221530574881, "grad_norm": 0.2681121581501204, "learning_rate": 2.9697847775870227e-06, "loss": 0.0781, "step": 1055 }, { "epoch": 1.546686195532772, "grad_norm": 0.3009772111270036, "learning_rate": 2.9515991860442973e-06, "loss": 0.0887, "step": 1056 }, { "epoch": 1.5481508604906629, "grad_norm": 0.2438680927400026, "learning_rate": 2.933459800778361e-06, "loss": 0.0708, "step": 1057 }, { "epoch": 1.5496155254485537, "grad_norm": 0.24192854414225795, "learning_rate": 2.9153667407032073e-06, "loss": 0.0651, "step": 1058 }, { "epoch": 1.5510801904064446, "grad_norm": 0.25560089457881163, "learning_rate": 2.8973201244291305e-06, "loss": 0.0748, "step": 1059 }, { "epoch": 1.5525448553643355, "grad_norm": 0.24635545740478473, "learning_rate": 2.879320070261974e-06, "loss": 0.0638, "step": 1060 }, { "epoch": 1.5540095203222264, "grad_norm": 0.3187176598509699, "learning_rate": 2.861366696202326e-06, "loss": 0.1081, "step": 1061 }, { "epoch": 1.5554741852801173, "grad_norm": 0.25629735568872725, "learning_rate": 2.8434601199447698e-06, "loss": 0.0769, "step": 1062 }, { "epoch": 1.5569388502380082, "grad_norm": 0.24990001099936748, "learning_rate": 2.825600458877095e-06, "loss": 0.0898, "step": 1063 }, { "epoch": 1.558403515195899, "grad_norm": 0.250013809003241, "learning_rate": 2.8077878300795446e-06, "loss": 0.0746, "step": 1064 }, { "epoch": 1.55986818015379, "grad_norm": 0.25934456236335596, "learning_rate": 2.7900223503240265e-06, "loss": 0.0958, "step": 1065 }, { "epoch": 1.5613328451116808, "grad_norm": 0.24098659840069045, "learning_rate": 2.7723041360733737e-06, "loss": 0.0695, "step": 1066 }, { "epoch": 1.5627975100695717, "grad_norm": 0.26301653549643544, "learning_rate": 2.7546333034805528e-06, "loss": 0.0741, "step": 1067 }, { "epoch": 1.5642621750274626, "grad_norm": 0.2391640316419889, "learning_rate": 2.737009968387929e-06, "loss": 0.0605, "step": 1068 }, { "epoch": 1.5657268399853534, "grad_norm": 0.25060691825740045, "learning_rate": 2.719434246326487e-06, "loss": 0.0643, "step": 1069 }, { "epoch": 1.5671915049432443, "grad_norm": 0.274283165127303, "learning_rate": 2.7019062525150783e-06, "loss": 0.0874, "step": 1070 }, { "epoch": 1.5686561699011352, "grad_norm": 0.2978642107033507, "learning_rate": 2.6844261018596806e-06, "loss": 0.0823, "step": 1071 }, { "epoch": 1.570120834859026, "grad_norm": 0.2690424055706032, "learning_rate": 2.6669939089526177e-06, "loss": 0.0856, "step": 1072 }, { "epoch": 1.571585499816917, "grad_norm": 0.28386560171341124, "learning_rate": 2.6496097880718364e-06, "loss": 0.079, "step": 1073 }, { "epoch": 1.5730501647748079, "grad_norm": 0.2898252725065379, "learning_rate": 2.632273853180132e-06, "loss": 0.0801, "step": 1074 }, { "epoch": 1.5745148297326987, "grad_norm": 0.2510043284519159, "learning_rate": 2.6149862179244257e-06, "loss": 0.0692, "step": 1075 }, { "epoch": 1.5759794946905896, "grad_norm": 0.2723382092456178, "learning_rate": 2.5977469956349956e-06, "loss": 0.0956, "step": 1076 }, { "epoch": 1.5774441596484805, "grad_norm": 0.26220486838825635, "learning_rate": 2.5805562993247536e-06, "loss": 0.0705, "step": 1077 }, { "epoch": 1.5789088246063714, "grad_norm": 0.2645372898294826, "learning_rate": 2.563414241688489e-06, "loss": 0.0691, "step": 1078 }, { "epoch": 1.5803734895642623, "grad_norm": 0.25156658959890105, "learning_rate": 2.5463209351021457e-06, "loss": 0.0796, "step": 1079 }, { "epoch": 1.5818381545221532, "grad_norm": 0.25334520784266373, "learning_rate": 2.529276491622067e-06, "loss": 0.0737, "step": 1080 }, { "epoch": 1.583302819480044, "grad_norm": 0.23790674931444958, "learning_rate": 2.5122810229842807e-06, "loss": 0.0629, "step": 1081 }, { "epoch": 1.584767484437935, "grad_norm": 0.2303091761767703, "learning_rate": 2.495334640603746e-06, "loss": 0.0767, "step": 1082 }, { "epoch": 1.5862321493958258, "grad_norm": 0.2510575661559066, "learning_rate": 2.4784374555736445e-06, "loss": 0.0649, "step": 1083 }, { "epoch": 1.5876968143537167, "grad_norm": 0.26143738599858646, "learning_rate": 2.4615895786646337e-06, "loss": 0.0831, "step": 1084 }, { "epoch": 1.5891614793116076, "grad_norm": 0.2567282259841764, "learning_rate": 2.444791120324127e-06, "loss": 0.0754, "step": 1085 }, { "epoch": 1.5906261442694984, "grad_norm": 0.2730764216157527, "learning_rate": 2.4280421906755814e-06, "loss": 0.0803, "step": 1086 }, { "epoch": 1.5920908092273893, "grad_norm": 0.25081318067201624, "learning_rate": 2.4113428995177522e-06, "loss": 0.0778, "step": 1087 }, { "epoch": 1.5935554741852802, "grad_norm": 0.24769756992277367, "learning_rate": 2.394693356323997e-06, "loss": 0.0775, "step": 1088 }, { "epoch": 1.595020139143171, "grad_norm": 0.2546886460741762, "learning_rate": 2.378093670241538e-06, "loss": 0.0688, "step": 1089 }, { "epoch": 1.596484804101062, "grad_norm": 0.2563027758513368, "learning_rate": 2.3615439500907657e-06, "loss": 0.0617, "step": 1090 }, { "epoch": 1.5979494690589529, "grad_norm": 0.25885408016228334, "learning_rate": 2.3450443043645035e-06, "loss": 0.0697, "step": 1091 }, { "epoch": 1.5994141340168437, "grad_norm": 0.26381952528868263, "learning_rate": 2.3285948412273198e-06, "loss": 0.0732, "step": 1092 }, { "epoch": 1.6008787989747346, "grad_norm": 0.22722995241678426, "learning_rate": 2.3121956685147995e-06, "loss": 0.0571, "step": 1093 }, { "epoch": 1.6023434639326255, "grad_norm": 0.2828387588961395, "learning_rate": 2.2958468937328528e-06, "loss": 0.0749, "step": 1094 }, { "epoch": 1.6038081288905164, "grad_norm": 0.2558084885710104, "learning_rate": 2.279548624056992e-06, "loss": 0.0733, "step": 1095 }, { "epoch": 1.6052727938484073, "grad_norm": 0.27462713945439543, "learning_rate": 2.263300966331652e-06, "loss": 0.0934, "step": 1096 }, { "epoch": 1.6067374588062981, "grad_norm": 0.2904102780660436, "learning_rate": 2.247104027069467e-06, "loss": 0.0901, "step": 1097 }, { "epoch": 1.608202123764189, "grad_norm": 0.2771702213465073, "learning_rate": 2.230957912450592e-06, "loss": 0.08, "step": 1098 }, { "epoch": 1.60966678872208, "grad_norm": 0.2528112186314324, "learning_rate": 2.214862728321987e-06, "loss": 0.0755, "step": 1099 }, { "epoch": 1.6111314536799708, "grad_norm": 0.23629870958401772, "learning_rate": 2.1988185801967464e-06, "loss": 0.0687, "step": 1100 }, { "epoch": 1.6125961186378617, "grad_norm": 0.24410925927191227, "learning_rate": 2.182825573253382e-06, "loss": 0.0727, "step": 1101 }, { "epoch": 1.6140607835957526, "grad_norm": 0.2561472124227788, "learning_rate": 2.1668838123351566e-06, "loss": 0.0909, "step": 1102 }, { "epoch": 1.6155254485536434, "grad_norm": 0.2635995106202067, "learning_rate": 2.150993401949376e-06, "loss": 0.076, "step": 1103 }, { "epoch": 1.6169901135115343, "grad_norm": 0.24617657014500907, "learning_rate": 2.135154446266726e-06, "loss": 0.068, "step": 1104 }, { "epoch": 1.6184547784694252, "grad_norm": 0.24053480589468268, "learning_rate": 2.119367049120565e-06, "loss": 0.0635, "step": 1105 }, { "epoch": 1.619919443427316, "grad_norm": 0.24204380335717454, "learning_rate": 2.103631314006267e-06, "loss": 0.0678, "step": 1106 }, { "epoch": 1.621384108385207, "grad_norm": 0.22690779315880516, "learning_rate": 2.087947344080522e-06, "loss": 0.057, "step": 1107 }, { "epoch": 1.6228487733430979, "grad_norm": 0.28433243723669543, "learning_rate": 2.0723152421606805e-06, "loss": 0.0951, "step": 1108 }, { "epoch": 1.6243134383009887, "grad_norm": 0.2599968195648375, "learning_rate": 2.0567351107240563e-06, "loss": 0.0695, "step": 1109 }, { "epoch": 1.6257781032588796, "grad_norm": 0.26139586892133587, "learning_rate": 2.041207051907279e-06, "loss": 0.078, "step": 1110 }, { "epoch": 1.6272427682167705, "grad_norm": 0.2885911977842118, "learning_rate": 2.0257311675056025e-06, "loss": 0.0893, "step": 1111 }, { "epoch": 1.6287074331746614, "grad_norm": 0.23726597176087105, "learning_rate": 2.0103075589722576e-06, "loss": 0.0614, "step": 1112 }, { "epoch": 1.6301720981325523, "grad_norm": 0.26578035267907674, "learning_rate": 1.9949363274177667e-06, "loss": 0.0921, "step": 1113 }, { "epoch": 1.6316367630904431, "grad_norm": 0.2749652618636836, "learning_rate": 1.9796175736093027e-06, "loss": 0.0835, "step": 1114 }, { "epoch": 1.633101428048334, "grad_norm": 0.24658373064000075, "learning_rate": 1.9643513979700035e-06, "loss": 0.0777, "step": 1115 }, { "epoch": 1.634566093006225, "grad_norm": 0.22932017498786317, "learning_rate": 1.9491379005783405e-06, "loss": 0.0723, "step": 1116 }, { "epoch": 1.6360307579641158, "grad_norm": 0.2545440028300663, "learning_rate": 1.933977181167439e-06, "loss": 0.076, "step": 1117 }, { "epoch": 1.6374954229220067, "grad_norm": 0.2552570068416127, "learning_rate": 1.9188693391244438e-06, "loss": 0.0754, "step": 1118 }, { "epoch": 1.6389600878798976, "grad_norm": 0.2478371358109087, "learning_rate": 1.9038144734898478e-06, "loss": 0.0729, "step": 1119 }, { "epoch": 1.6404247528377884, "grad_norm": 0.2291241183631655, "learning_rate": 1.8888126829568642e-06, "loss": 0.0634, "step": 1120 }, { "epoch": 1.6418894177956793, "grad_norm": 0.24193265605324504, "learning_rate": 1.8738640658707585e-06, "loss": 0.0671, "step": 1121 }, { "epoch": 1.6433540827535702, "grad_norm": 0.2576873945419135, "learning_rate": 1.8589687202282247e-06, "loss": 0.0723, "step": 1122 }, { "epoch": 1.644818747711461, "grad_norm": 0.24676250477181633, "learning_rate": 1.844126743676722e-06, "loss": 0.0748, "step": 1123 }, { "epoch": 1.646283412669352, "grad_norm": 0.263090468630097, "learning_rate": 1.8293382335138533e-06, "loss": 0.0752, "step": 1124 }, { "epoch": 1.6477480776272428, "grad_norm": 0.2540498525445381, "learning_rate": 1.8146032866867114e-06, "loss": 0.0708, "step": 1125 }, { "epoch": 1.6492127425851337, "grad_norm": 0.2690977824033387, "learning_rate": 1.7999219997912575e-06, "loss": 0.0718, "step": 1126 }, { "epoch": 1.6506774075430246, "grad_norm": 0.2565788082278319, "learning_rate": 1.7852944690716766e-06, "loss": 0.0834, "step": 1127 }, { "epoch": 1.6521420725009155, "grad_norm": 0.250344072055428, "learning_rate": 1.7707207904197566e-06, "loss": 0.0648, "step": 1128 }, { "epoch": 1.6536067374588064, "grad_norm": 0.2531519839830438, "learning_rate": 1.7562010593742496e-06, "loss": 0.0761, "step": 1129 }, { "epoch": 1.6550714024166973, "grad_norm": 0.2671783705252202, "learning_rate": 1.7417353711202478e-06, "loss": 0.0806, "step": 1130 }, { "epoch": 1.6565360673745881, "grad_norm": 0.27608727196744226, "learning_rate": 1.7273238204885734e-06, "loss": 0.0776, "step": 1131 }, { "epoch": 1.658000732332479, "grad_norm": 0.21272809051359043, "learning_rate": 1.7129665019551333e-06, "loss": 0.0543, "step": 1132 }, { "epoch": 1.65946539729037, "grad_norm": 0.2981100085240527, "learning_rate": 1.6986635096403213e-06, "loss": 0.0917, "step": 1133 }, { "epoch": 1.6609300622482608, "grad_norm": 0.2283996510993873, "learning_rate": 1.6844149373083852e-06, "loss": 0.0689, "step": 1134 }, { "epoch": 1.6623947272061517, "grad_norm": 0.2932656999891174, "learning_rate": 1.670220878366826e-06, "loss": 0.0913, "step": 1135 }, { "epoch": 1.6638593921640425, "grad_norm": 0.24998740552484003, "learning_rate": 1.6560814258657687e-06, "loss": 0.0842, "step": 1136 }, { "epoch": 1.6653240571219334, "grad_norm": 0.24855056763964203, "learning_rate": 1.6419966724973734e-06, "loss": 0.0734, "step": 1137 }, { "epoch": 1.6667887220798243, "grad_norm": 0.24369157654216977, "learning_rate": 1.6279667105952057e-06, "loss": 0.0911, "step": 1138 }, { "epoch": 1.6682533870377152, "grad_norm": 0.26041234841753214, "learning_rate": 1.6139916321336513e-06, "loss": 0.0704, "step": 1139 }, { "epoch": 1.669718051995606, "grad_norm": 0.2654682154981918, "learning_rate": 1.6000715287272938e-06, "loss": 0.0762, "step": 1140 }, { "epoch": 1.671182716953497, "grad_norm": 0.2379683222765605, "learning_rate": 1.5862064916303343e-06, "loss": 0.0734, "step": 1141 }, { "epoch": 1.6726473819113878, "grad_norm": 0.26506248533884014, "learning_rate": 1.5723966117359745e-06, "loss": 0.0673, "step": 1142 }, { "epoch": 1.6741120468692787, "grad_norm": 0.2380435720292988, "learning_rate": 1.5586419795758356e-06, "loss": 0.0726, "step": 1143 }, { "epoch": 1.6755767118271696, "grad_norm": 0.23717677968711492, "learning_rate": 1.5449426853193549e-06, "loss": 0.0796, "step": 1144 }, { "epoch": 1.6770413767850605, "grad_norm": 0.27958169485281625, "learning_rate": 1.5312988187731969e-06, "loss": 0.0855, "step": 1145 }, { "epoch": 1.6785060417429514, "grad_norm": 0.2568879441056742, "learning_rate": 1.5177104693806721e-06, "loss": 0.0712, "step": 1146 }, { "epoch": 1.6799707067008423, "grad_norm": 0.22152990710129233, "learning_rate": 1.5041777262211355e-06, "loss": 0.0584, "step": 1147 }, { "epoch": 1.6814353716587331, "grad_norm": 0.2652525501551631, "learning_rate": 1.4907006780094212e-06, "loss": 0.0697, "step": 1148 }, { "epoch": 1.682900036616624, "grad_norm": 0.23706028774243557, "learning_rate": 1.4772794130952416e-06, "loss": 0.0556, "step": 1149 }, { "epoch": 1.684364701574515, "grad_norm": 0.232103427190539, "learning_rate": 1.4639140194626289e-06, "loss": 0.0591, "step": 1150 }, { "epoch": 1.6858293665324058, "grad_norm": 0.24774373170203481, "learning_rate": 1.450604584729336e-06, "loss": 0.061, "step": 1151 }, { "epoch": 1.6872940314902967, "grad_norm": 0.24757349813419924, "learning_rate": 1.4373511961462828e-06, "loss": 0.0859, "step": 1152 }, { "epoch": 1.6887586964481875, "grad_norm": 0.239803460293539, "learning_rate": 1.4241539405969662e-06, "loss": 0.0694, "step": 1153 }, { "epoch": 1.6902233614060784, "grad_norm": 0.24607947040534622, "learning_rate": 1.411012904596909e-06, "loss": 0.0726, "step": 1154 }, { "epoch": 1.6916880263639693, "grad_norm": 0.24298503192914125, "learning_rate": 1.3979281742930706e-06, "loss": 0.071, "step": 1155 }, { "epoch": 1.6931526913218602, "grad_norm": 0.25982515681290586, "learning_rate": 1.3848998354633082e-06, "loss": 0.0663, "step": 1156 }, { "epoch": 1.694617356279751, "grad_norm": 0.2824274775862737, "learning_rate": 1.3719279735157875e-06, "loss": 0.0815, "step": 1157 }, { "epoch": 1.696082021237642, "grad_norm": 0.25503279831214637, "learning_rate": 1.359012673488449e-06, "loss": 0.084, "step": 1158 }, { "epoch": 1.6975466861955328, "grad_norm": 0.2572621551136547, "learning_rate": 1.346154020048428e-06, "loss": 0.0647, "step": 1159 }, { "epoch": 1.6990113511534237, "grad_norm": 0.26401952613949853, "learning_rate": 1.3333520974915093e-06, "loss": 0.0611, "step": 1160 }, { "epoch": 1.7004760161113146, "grad_norm": 0.27287208188302337, "learning_rate": 1.320606989741583e-06, "loss": 0.0887, "step": 1161 }, { "epoch": 1.7019406810692055, "grad_norm": 0.25110794107433626, "learning_rate": 1.307918780350077e-06, "loss": 0.0726, "step": 1162 }, { "epoch": 1.7034053460270964, "grad_norm": 0.26427102366363486, "learning_rate": 1.2952875524954233e-06, "loss": 0.0744, "step": 1163 }, { "epoch": 1.7048700109849872, "grad_norm": 0.2577483404898282, "learning_rate": 1.2827133889825039e-06, "loss": 0.0799, "step": 1164 }, { "epoch": 1.7063346759428781, "grad_norm": 0.2379395103991794, "learning_rate": 1.2701963722421162e-06, "loss": 0.0668, "step": 1165 }, { "epoch": 1.707799340900769, "grad_norm": 0.24586889905164777, "learning_rate": 1.2577365843304212e-06, "loss": 0.0888, "step": 1166 }, { "epoch": 1.70926400585866, "grad_norm": 0.23659079747711304, "learning_rate": 1.245334106928422e-06, "loss": 0.06, "step": 1167 }, { "epoch": 1.7107286708165508, "grad_norm": 0.23685869500201995, "learning_rate": 1.2329890213414063e-06, "loss": 0.0639, "step": 1168 }, { "epoch": 1.7121933357744417, "grad_norm": 0.2375875883124259, "learning_rate": 1.220701408498438e-06, "loss": 0.0669, "step": 1169 }, { "epoch": 1.7136580007323325, "grad_norm": 0.26520434490127226, "learning_rate": 1.2084713489518063e-06, "loss": 0.0737, "step": 1170 }, { "epoch": 1.7151226656902234, "grad_norm": 0.2629329649647797, "learning_rate": 1.19629892287651e-06, "loss": 0.0942, "step": 1171 }, { "epoch": 1.7165873306481143, "grad_norm": 0.23516934988436172, "learning_rate": 1.1841842100697253e-06, "loss": 0.0551, "step": 1172 }, { "epoch": 1.7180519956060052, "grad_norm": 0.27697429411981783, "learning_rate": 1.1721272899502856e-06, "loss": 0.0762, "step": 1173 }, { "epoch": 1.719516660563896, "grad_norm": 0.25426135353111506, "learning_rate": 1.1601282415581627e-06, "loss": 0.0664, "step": 1174 }, { "epoch": 1.720981325521787, "grad_norm": 0.24626607935465855, "learning_rate": 1.1481871435539415e-06, "loss": 0.0619, "step": 1175 }, { "epoch": 1.7224459904796778, "grad_norm": 0.25729901031239943, "learning_rate": 1.1363040742183162e-06, "loss": 0.073, "step": 1176 }, { "epoch": 1.7239106554375687, "grad_norm": 0.2567425743663647, "learning_rate": 1.1244791114515608e-06, "loss": 0.0578, "step": 1177 }, { "epoch": 1.7253753203954596, "grad_norm": 0.2624890331507839, "learning_rate": 1.1127123327730383e-06, "loss": 0.0695, "step": 1178 }, { "epoch": 1.7268399853533505, "grad_norm": 0.25775906362459766, "learning_rate": 1.1010038153206703e-06, "loss": 0.0671, "step": 1179 }, { "epoch": 1.7283046503112414, "grad_norm": 0.2357579800372649, "learning_rate": 1.0893536358504553e-06, "loss": 0.0626, "step": 1180 }, { "epoch": 1.7297693152691322, "grad_norm": 0.27772530636880377, "learning_rate": 1.0777618707359427e-06, "loss": 0.0768, "step": 1181 }, { "epoch": 1.7312339802270231, "grad_norm": 0.2383877132505535, "learning_rate": 1.0662285959677499e-06, "loss": 0.0667, "step": 1182 }, { "epoch": 1.732698645184914, "grad_norm": 0.23742698011306232, "learning_rate": 1.0547538871530482e-06, "loss": 0.0599, "step": 1183 }, { "epoch": 1.734163310142805, "grad_norm": 0.2587588714854726, "learning_rate": 1.0433378195150889e-06, "loss": 0.1056, "step": 1184 }, { "epoch": 1.7356279751006958, "grad_norm": 0.2623510431514894, "learning_rate": 1.0319804678926825e-06, "loss": 0.0724, "step": 1185 }, { "epoch": 1.7370926400585867, "grad_norm": 0.23000218942084621, "learning_rate": 1.0206819067397345e-06, "loss": 0.0631, "step": 1186 }, { "epoch": 1.7385573050164775, "grad_norm": 0.23524204735507692, "learning_rate": 1.009442210124737e-06, "loss": 0.0642, "step": 1187 }, { "epoch": 1.7400219699743684, "grad_norm": 0.26310108373605007, "learning_rate": 9.982614517302958e-07, "loss": 0.0732, "step": 1188 }, { "epoch": 1.7414866349322593, "grad_norm": 0.2822237389406422, "learning_rate": 9.871397048526431e-07, "loss": 0.09, "step": 1189 }, { "epoch": 1.7429512998901502, "grad_norm": 0.27062484433619666, "learning_rate": 9.760770424011557e-07, "loss": 0.0742, "step": 1190 }, { "epoch": 1.744415964848041, "grad_norm": 0.2942597554218074, "learning_rate": 9.650735368978793e-07, "loss": 0.0893, "step": 1191 }, { "epoch": 1.745880629805932, "grad_norm": 0.2595099599479516, "learning_rate": 9.541292604770502e-07, "loss": 0.0801, "step": 1192 }, { "epoch": 1.7473452947638228, "grad_norm": 0.25996832033844564, "learning_rate": 9.432442848846291e-07, "loss": 0.0802, "step": 1193 }, { "epoch": 1.7488099597217137, "grad_norm": 0.25099023362939665, "learning_rate": 9.324186814778202e-07, "loss": 0.0759, "step": 1194 }, { "epoch": 1.7502746246796046, "grad_norm": 0.261784151531945, "learning_rate": 9.216525212246131e-07, "loss": 0.068, "step": 1195 }, { "epoch": 1.7517392896374955, "grad_norm": 0.2676041434167344, "learning_rate": 9.109458747033106e-07, "loss": 0.0758, "step": 1196 }, { "epoch": 1.7532039545953864, "grad_norm": 0.27324413990185265, "learning_rate": 9.00298812102075e-07, "loss": 0.0781, "step": 1197 }, { "epoch": 1.7546686195532772, "grad_norm": 0.23917286810904692, "learning_rate": 8.897114032184539e-07, "loss": 0.0604, "step": 1198 }, { "epoch": 1.7561332845111681, "grad_norm": 0.2632087773397572, "learning_rate": 8.791837174589401e-07, "loss": 0.0835, "step": 1199 }, { "epoch": 1.757597949469059, "grad_norm": 0.2745654342512245, "learning_rate": 8.687158238384963e-07, "loss": 0.0688, "step": 1200 }, { "epoch": 1.7590626144269499, "grad_norm": 0.255009900836695, "learning_rate": 8.583077909801252e-07, "loss": 0.0729, "step": 1201 }, { "epoch": 1.7605272793848408, "grad_norm": 0.24279405877132665, "learning_rate": 8.47959687114398e-07, "loss": 0.064, "step": 1202 }, { "epoch": 1.7619919443427317, "grad_norm": 0.2671484719871084, "learning_rate": 8.37671580079027e-07, "loss": 0.0689, "step": 1203 }, { "epoch": 1.7634566093006225, "grad_norm": 0.22793656038154106, "learning_rate": 8.274435373184009e-07, "loss": 0.0609, "step": 1204 }, { "epoch": 1.7649212742585134, "grad_norm": 0.25888704046474315, "learning_rate": 8.172756258831638e-07, "loss": 0.0663, "step": 1205 }, { "epoch": 1.7663859392164043, "grad_norm": 0.2694033818200205, "learning_rate": 8.071679124297537e-07, "loss": 0.077, "step": 1206 }, { "epoch": 1.7678506041742952, "grad_norm": 0.25795707216950153, "learning_rate": 7.971204632199869e-07, "loss": 0.0738, "step": 1207 }, { "epoch": 1.769315269132186, "grad_norm": 0.25028412383513904, "learning_rate": 7.871333441206053e-07, "loss": 0.0677, "step": 1208 }, { "epoch": 1.770779934090077, "grad_norm": 0.23004892259257617, "learning_rate": 7.772066206028572e-07, "loss": 0.0612, "step": 1209 }, { "epoch": 1.7722445990479678, "grad_norm": 0.25383482467849333, "learning_rate": 7.673403577420591e-07, "loss": 0.0685, "step": 1210 }, { "epoch": 1.7737092640058587, "grad_norm": 0.2519042969852701, "learning_rate": 7.575346202171819e-07, "loss": 0.0647, "step": 1211 }, { "epoch": 1.7751739289637496, "grad_norm": 0.26600143625920347, "learning_rate": 7.477894723104073e-07, "loss": 0.0807, "step": 1212 }, { "epoch": 1.7766385939216405, "grad_norm": 0.24525033390296022, "learning_rate": 7.381049779067273e-07, "loss": 0.0616, "step": 1213 }, { "epoch": 1.7781032588795314, "grad_norm": 0.24140459393603353, "learning_rate": 7.284812004935083e-07, "loss": 0.0648, "step": 1214 }, { "epoch": 1.7795679238374222, "grad_norm": 0.27804404244360953, "learning_rate": 7.189182031600906e-07, "loss": 0.0886, "step": 1215 }, { "epoch": 1.7810325887953131, "grad_norm": 0.2619152806722488, "learning_rate": 7.094160485973567e-07, "loss": 0.0826, "step": 1216 }, { "epoch": 1.782497253753204, "grad_norm": 0.2204653484419197, "learning_rate": 6.999747990973382e-07, "loss": 0.0555, "step": 1217 }, { "epoch": 1.7839619187110949, "grad_norm": 0.26650987169853924, "learning_rate": 6.905945165527928e-07, "loss": 0.0699, "step": 1218 }, { "epoch": 1.7854265836689858, "grad_norm": 0.2451107677206505, "learning_rate": 6.812752624568131e-07, "loss": 0.0739, "step": 1219 }, { "epoch": 1.7868912486268766, "grad_norm": 0.24387464427507713, "learning_rate": 6.720170979024065e-07, "loss": 0.066, "step": 1220 }, { "epoch": 1.7883559135847675, "grad_norm": 0.24790390060583067, "learning_rate": 6.628200835821119e-07, "loss": 0.0568, "step": 1221 }, { "epoch": 1.7898205785426584, "grad_norm": 0.26645550894398845, "learning_rate": 6.536842797875876e-07, "loss": 0.0864, "step": 1222 }, { "epoch": 1.7912852435005493, "grad_norm": 0.2719628682812989, "learning_rate": 6.446097464092249e-07, "loss": 0.0822, "step": 1223 }, { "epoch": 1.7927499084584402, "grad_norm": 0.25379643552394104, "learning_rate": 6.355965429357513e-07, "loss": 0.0737, "step": 1224 }, { "epoch": 1.794214573416331, "grad_norm": 0.22752210074212567, "learning_rate": 6.266447284538446e-07, "loss": 0.0541, "step": 1225 }, { "epoch": 1.795679238374222, "grad_norm": 0.2665704509120899, "learning_rate": 6.177543616477377e-07, "loss": 0.0759, "step": 1226 }, { "epoch": 1.7971439033321128, "grad_norm": 0.2500751657498049, "learning_rate": 6.08925500798847e-07, "loss": 0.0631, "step": 1227 }, { "epoch": 1.7986085682900037, "grad_norm": 0.259144080680451, "learning_rate": 6.001582037853726e-07, "loss": 0.0777, "step": 1228 }, { "epoch": 1.8000732332478946, "grad_norm": 0.2897130385154202, "learning_rate": 5.914525280819383e-07, "loss": 0.0802, "step": 1229 }, { "epoch": 1.8015378982057855, "grad_norm": 0.2553061949427675, "learning_rate": 5.828085307591969e-07, "loss": 0.0692, "step": 1230 }, { "epoch": 1.8030025631636764, "grad_norm": 0.2572539562396507, "learning_rate": 5.742262684834698e-07, "loss": 0.0833, "step": 1231 }, { "epoch": 1.8044672281215672, "grad_norm": 0.24772988825308792, "learning_rate": 5.657057975163682e-07, "loss": 0.0673, "step": 1232 }, { "epoch": 1.8059318930794581, "grad_norm": 0.2717711132164481, "learning_rate": 5.572471737144247e-07, "loss": 0.0778, "step": 1233 }, { "epoch": 1.807396558037349, "grad_norm": 0.2539596096042241, "learning_rate": 5.488504525287319e-07, "loss": 0.0727, "step": 1234 }, { "epoch": 1.8088612229952399, "grad_norm": 0.2345694037301944, "learning_rate": 5.405156890045704e-07, "loss": 0.0616, "step": 1235 }, { "epoch": 1.8103258879531308, "grad_norm": 0.2609506722100195, "learning_rate": 5.322429377810612e-07, "loss": 0.0853, "step": 1236 }, { "epoch": 1.8117905529110216, "grad_norm": 0.25093904412233514, "learning_rate": 5.240322530907893e-07, "loss": 0.0823, "step": 1237 }, { "epoch": 1.8132552178689125, "grad_norm": 0.2544084467508812, "learning_rate": 5.158836887594687e-07, "loss": 0.063, "step": 1238 }, { "epoch": 1.8147198828268034, "grad_norm": 0.26650992359632614, "learning_rate": 5.07797298205569e-07, "loss": 0.0789, "step": 1239 }, { "epoch": 1.8161845477846943, "grad_norm": 0.23630880168422222, "learning_rate": 4.997731344399837e-07, "loss": 0.0606, "step": 1240 }, { "epoch": 1.8176492127425852, "grad_norm": 0.24553703591038675, "learning_rate": 4.91811250065668e-07, "loss": 0.0628, "step": 1241 }, { "epoch": 1.819113877700476, "grad_norm": 0.2265227129455893, "learning_rate": 4.839116972773061e-07, "loss": 0.0583, "step": 1242 }, { "epoch": 1.820578542658367, "grad_norm": 0.25312671344061183, "learning_rate": 4.7607452786095686e-07, "loss": 0.0608, "step": 1243 }, { "epoch": 1.8220432076162578, "grad_norm": 0.23959774438699266, "learning_rate": 4.682997931937283e-07, "loss": 0.0628, "step": 1244 }, { "epoch": 1.8235078725741487, "grad_norm": 0.31949707497166124, "learning_rate": 4.605875442434238e-07, "loss": 0.0964, "step": 1245 }, { "epoch": 1.8249725375320396, "grad_norm": 0.2500895954628304, "learning_rate": 4.5293783156822533e-07, "loss": 0.0679, "step": 1246 }, { "epoch": 1.8264372024899305, "grad_norm": 0.24693595115617872, "learning_rate": 4.4535070531635195e-07, "loss": 0.0722, "step": 1247 }, { "epoch": 1.8279018674478213, "grad_norm": 0.25545938072704727, "learning_rate": 4.378262152257273e-07, "loss": 0.0683, "step": 1248 }, { "epoch": 1.8293665324057122, "grad_norm": 0.25876832202455974, "learning_rate": 4.303644106236704e-07, "loss": 0.0662, "step": 1249 }, { "epoch": 1.8308311973636031, "grad_norm": 0.26720989490367125, "learning_rate": 4.2296534042654993e-07, "loss": 0.0682, "step": 1250 }, { "epoch": 1.832295862321494, "grad_norm": 0.26258372219745035, "learning_rate": 4.1562905313948354e-07, "loss": 0.0666, "step": 1251 }, { "epoch": 1.8337605272793849, "grad_norm": 0.26711183739660005, "learning_rate": 4.083555968560049e-07, "loss": 0.0801, "step": 1252 }, { "epoch": 1.8352251922372758, "grad_norm": 0.25571699526490943, "learning_rate": 4.0114501925775927e-07, "loss": 0.0765, "step": 1253 }, { "epoch": 1.8366898571951666, "grad_norm": 0.25954869259656904, "learning_rate": 3.9399736761418395e-07, "loss": 0.0876, "step": 1254 }, { "epoch": 1.8381545221530575, "grad_norm": 0.25678722587829267, "learning_rate": 3.8691268878220165e-07, "loss": 0.069, "step": 1255 }, { "epoch": 1.8396191871109484, "grad_norm": 0.2589134232675844, "learning_rate": 3.7989102920591103e-07, "loss": 0.069, "step": 1256 }, { "epoch": 1.8410838520688393, "grad_norm": 0.2450077649829702, "learning_rate": 3.729324349162866e-07, "loss": 0.0684, "step": 1257 }, { "epoch": 1.8425485170267302, "grad_norm": 0.2853806694841532, "learning_rate": 3.660369515308715e-07, "loss": 0.0634, "step": 1258 }, { "epoch": 1.844013181984621, "grad_norm": 0.2621846237984564, "learning_rate": 3.592046242534819e-07, "loss": 0.0714, "step": 1259 }, { "epoch": 1.845477846942512, "grad_norm": 0.23779005991008018, "learning_rate": 3.524354978739075e-07, "loss": 0.0583, "step": 1260 }, { "epoch": 1.8469425119004028, "grad_norm": 0.25039681136747866, "learning_rate": 3.4572961676762715e-07, "loss": 0.0738, "step": 1261 }, { "epoch": 1.8484071768582937, "grad_norm": 0.26098923170285054, "learning_rate": 3.390870248955025e-07, "loss": 0.0807, "step": 1262 }, { "epoch": 1.8498718418161846, "grad_norm": 0.26327776179903706, "learning_rate": 3.3250776580350143e-07, "loss": 0.0866, "step": 1263 }, { "epoch": 1.8513365067740755, "grad_norm": 0.23667264720127792, "learning_rate": 3.259918826224118e-07, "loss": 0.0649, "step": 1264 }, { "epoch": 1.8528011717319663, "grad_norm": 0.23761172772297415, "learning_rate": 3.1953941806755265e-07, "loss": 0.0845, "step": 1265 }, { "epoch": 1.8542658366898572, "grad_norm": 0.24024069607921544, "learning_rate": 3.131504144385023e-07, "loss": 0.0703, "step": 1266 }, { "epoch": 1.855730501647748, "grad_norm": 0.2420868970268621, "learning_rate": 3.0682491361881064e-07, "loss": 0.0748, "step": 1267 }, { "epoch": 1.857195166605639, "grad_norm": 0.2577949839616261, "learning_rate": 3.0056295707573736e-07, "loss": 0.0665, "step": 1268 }, { "epoch": 1.8586598315635299, "grad_norm": 0.2685695682554958, "learning_rate": 2.943645858599653e-07, "loss": 0.0792, "step": 1269 }, { "epoch": 1.8601244965214208, "grad_norm": 0.25460210036235115, "learning_rate": 2.8822984060534854e-07, "loss": 0.0738, "step": 1270 }, { "epoch": 1.8615891614793116, "grad_norm": 0.23461840979365672, "learning_rate": 2.82158761528627e-07, "loss": 0.0754, "step": 1271 }, { "epoch": 1.8630538264372025, "grad_norm": 0.22557458232043867, "learning_rate": 2.761513884291822e-07, "loss": 0.0612, "step": 1272 }, { "epoch": 1.8645184913950934, "grad_norm": 0.2470496535205524, "learning_rate": 2.7020776068875876e-07, "loss": 0.0711, "step": 1273 }, { "epoch": 1.8659831563529843, "grad_norm": 0.2329397076753076, "learning_rate": 2.6432791727121984e-07, "loss": 0.0704, "step": 1274 }, { "epoch": 1.8674478213108752, "grad_norm": 0.25477498790129094, "learning_rate": 2.5851189672228103e-07, "loss": 0.0773, "step": 1275 }, { "epoch": 1.868912486268766, "grad_norm": 0.23446595512868088, "learning_rate": 2.5275973716926804e-07, "loss": 0.0709, "step": 1276 }, { "epoch": 1.870377151226657, "grad_norm": 0.2605236921574388, "learning_rate": 2.4707147632085815e-07, "loss": 0.0758, "step": 1277 }, { "epoch": 1.8718418161845478, "grad_norm": 0.24531947955587904, "learning_rate": 2.414471514668348e-07, "loss": 0.0778, "step": 1278 }, { "epoch": 1.8733064811424387, "grad_norm": 0.2625806435047024, "learning_rate": 2.358867994778502e-07, "loss": 0.0742, "step": 1279 }, { "epoch": 1.8747711461003296, "grad_norm": 0.26172740430959623, "learning_rate": 2.3039045680517292e-07, "loss": 0.071, "step": 1280 }, { "epoch": 1.8762358110582205, "grad_norm": 0.25812348656180795, "learning_rate": 2.249581594804562e-07, "loss": 0.0702, "step": 1281 }, { "epoch": 1.8777004760161113, "grad_norm": 0.22412143150895958, "learning_rate": 2.1958994311549797e-07, "loss": 0.0633, "step": 1282 }, { "epoch": 1.8791651409740022, "grad_norm": 0.25437645779594675, "learning_rate": 2.1428584290201116e-07, "loss": 0.0697, "step": 1283 }, { "epoch": 1.880629805931893, "grad_norm": 0.2728173599791171, "learning_rate": 2.0904589361138927e-07, "loss": 0.0813, "step": 1284 }, { "epoch": 1.882094470889784, "grad_norm": 0.24620604884353783, "learning_rate": 2.0387012959448227e-07, "loss": 0.0812, "step": 1285 }, { "epoch": 1.8835591358476749, "grad_norm": 0.2246638728164293, "learning_rate": 1.9875858478136557e-07, "loss": 0.0591, "step": 1286 }, { "epoch": 1.8850238008055658, "grad_norm": 0.22425127272955642, "learning_rate": 1.9371129268112466e-07, "loss": 0.0615, "step": 1287 }, { "epoch": 1.8864884657634566, "grad_norm": 0.47369574027243294, "learning_rate": 1.8872828638162866e-07, "loss": 0.0678, "step": 1288 }, { "epoch": 1.8879531307213475, "grad_norm": 0.24744976308618485, "learning_rate": 1.8380959854932045e-07, "loss": 0.0745, "step": 1289 }, { "epoch": 1.8894177956792384, "grad_norm": 0.2619097810479823, "learning_rate": 1.7895526142899466e-07, "loss": 0.0705, "step": 1290 }, { "epoch": 1.8908824606371293, "grad_norm": 0.23871395237352025, "learning_rate": 1.7416530684359444e-07, "loss": 0.0551, "step": 1291 }, { "epoch": 1.8923471255950202, "grad_norm": 0.24628533422104648, "learning_rate": 1.6943976619399615e-07, "loss": 0.0677, "step": 1292 }, { "epoch": 1.893811790552911, "grad_norm": 0.23856368422966648, "learning_rate": 1.6477867045880613e-07, "loss": 0.0815, "step": 1293 }, { "epoch": 1.895276455510802, "grad_norm": 0.2555025186995139, "learning_rate": 1.6018205019415866e-07, "loss": 0.0708, "step": 1294 }, { "epoch": 1.8967411204686928, "grad_norm": 0.2528265155563436, "learning_rate": 1.5564993553351394e-07, "loss": 0.0738, "step": 1295 }, { "epoch": 1.8982057854265837, "grad_norm": 0.26162054123344747, "learning_rate": 1.511823561874637e-07, "loss": 0.07, "step": 1296 }, { "epoch": 1.8996704503844746, "grad_norm": 0.24902092413650095, "learning_rate": 1.4677934144352923e-07, "loss": 0.0668, "step": 1297 }, { "epoch": 1.9011351153423655, "grad_norm": 0.2635425216052296, "learning_rate": 1.4244092016597933e-07, "loss": 0.0797, "step": 1298 }, { "epoch": 1.9025997803002563, "grad_norm": 0.24273285605827147, "learning_rate": 1.3816712079563034e-07, "loss": 0.0738, "step": 1299 }, { "epoch": 1.9040644452581472, "grad_norm": 0.23589111930612008, "learning_rate": 1.3395797134967192e-07, "loss": 0.0633, "step": 1300 }, { "epoch": 1.905529110216038, "grad_norm": 0.2886312181672797, "learning_rate": 1.2981349942146947e-07, "loss": 0.0801, "step": 1301 }, { "epoch": 1.906993775173929, "grad_norm": 0.2504691073271145, "learning_rate": 1.257337321803964e-07, "loss": 0.0719, "step": 1302 }, { "epoch": 1.9084584401318199, "grad_norm": 0.2455439126340956, "learning_rate": 1.2171869637164769e-07, "loss": 0.0694, "step": 1303 }, { "epoch": 1.9099231050897107, "grad_norm": 0.2443358735504617, "learning_rate": 1.1776841831606544e-07, "loss": 0.0629, "step": 1304 }, { "epoch": 1.9113877700476016, "grad_norm": 0.2375928891798295, "learning_rate": 1.1388292390997035e-07, "loss": 0.0636, "step": 1305 }, { "epoch": 1.9128524350054925, "grad_norm": 0.2430807124576779, "learning_rate": 1.1006223862498944e-07, "loss": 0.0654, "step": 1306 }, { "epoch": 1.9143170999633834, "grad_norm": 0.2582360186234949, "learning_rate": 1.0630638750788625e-07, "loss": 0.0772, "step": 1307 }, { "epoch": 1.9157817649212743, "grad_norm": 0.22936545545360856, "learning_rate": 1.026153951804032e-07, "loss": 0.0555, "step": 1308 }, { "epoch": 1.9172464298791652, "grad_norm": 0.24362803135706435, "learning_rate": 9.898928583909284e-08, "loss": 0.0643, "step": 1309 }, { "epoch": 1.918711094837056, "grad_norm": 0.23451079860288676, "learning_rate": 9.542808325516573e-08, "loss": 0.0562, "step": 1310 }, { "epoch": 1.920175759794947, "grad_norm": 0.23945075253871337, "learning_rate": 9.193181077433055e-08, "loss": 0.07, "step": 1311 }, { "epoch": 1.9216404247528378, "grad_norm": 0.2586122862421169, "learning_rate": 8.850049131664206e-08, "loss": 0.0793, "step": 1312 }, { "epoch": 1.9231050897107287, "grad_norm": 0.25189426121665187, "learning_rate": 8.513414737635006e-08, "loss": 0.0665, "step": 1313 }, { "epoch": 1.9245697546686196, "grad_norm": 0.2331012828544825, "learning_rate": 8.183280102175617e-08, "loss": 0.0626, "step": 1314 }, { "epoch": 1.9260344196265105, "grad_norm": 0.24925536553110555, "learning_rate": 7.859647389506176e-08, "loss": 0.0688, "step": 1315 }, { "epoch": 1.9274990845844013, "grad_norm": 0.25467958607825186, "learning_rate": 7.542518721223469e-08, "loss": 0.0894, "step": 1316 }, { "epoch": 1.9289637495422922, "grad_norm": 0.25285918104772465, "learning_rate": 7.231896176285946e-08, "loss": 0.0773, "step": 1317 }, { "epoch": 1.930428414500183, "grad_norm": 0.23450711664911186, "learning_rate": 6.927781791001398e-08, "loss": 0.0639, "step": 1318 }, { "epoch": 1.931893079458074, "grad_norm": 0.23097872990832516, "learning_rate": 6.630177559012518e-08, "loss": 0.068, "step": 1319 }, { "epoch": 1.9333577444159649, "grad_norm": 0.2379637592228772, "learning_rate": 6.339085431284253e-08, "loss": 0.0632, "step": 1320 }, { "epoch": 1.9348224093738557, "grad_norm": 0.24051632358031436, "learning_rate": 6.054507316091141e-08, "loss": 0.0677, "step": 1321 }, { "epoch": 1.9362870743317466, "grad_norm": 0.28404424390350286, "learning_rate": 5.7764450790046554e-08, "loss": 0.0783, "step": 1322 }, { "epoch": 1.9377517392896375, "grad_norm": 0.24006965033366337, "learning_rate": 5.5049005428808865e-08, "loss": 0.0696, "step": 1323 }, { "epoch": 1.9392164042475284, "grad_norm": 0.2496157646264579, "learning_rate": 5.239875487848878e-08, "loss": 0.0635, "step": 1324 }, { "epoch": 1.9406810692054193, "grad_norm": 0.26403747456776216, "learning_rate": 4.981371651298306e-08, "loss": 0.0652, "step": 1325 }, { "epoch": 1.9421457341633102, "grad_norm": 0.25920069106169014, "learning_rate": 4.729390727869154e-08, "loss": 0.0799, "step": 1326 }, { "epoch": 1.943610399121201, "grad_norm": 0.26170514428715264, "learning_rate": 4.483934369439613e-08, "loss": 0.0755, "step": 1327 }, { "epoch": 1.945075064079092, "grad_norm": 0.24542483964564413, "learning_rate": 4.245004185115753e-08, "loss": 0.0615, "step": 1328 }, { "epoch": 1.9465397290369828, "grad_norm": 0.24837578672571922, "learning_rate": 4.0126017412207565e-08, "loss": 0.071, "step": 1329 }, { "epoch": 1.9480043939948737, "grad_norm": 0.23824898115769536, "learning_rate": 3.786728561285036e-08, "loss": 0.0594, "step": 1330 }, { "epoch": 1.9494690589527646, "grad_norm": 0.25242932208373725, "learning_rate": 3.567386126035577e-08, "loss": 0.0807, "step": 1331 }, { "epoch": 1.9509337239106554, "grad_norm": 0.26364658800502216, "learning_rate": 3.354575873386945e-08, "loss": 0.0701, "step": 1332 }, { "epoch": 1.9523983888685463, "grad_norm": 0.22064989501595925, "learning_rate": 3.1482991984312926e-08, "loss": 0.0601, "step": 1333 }, { "epoch": 1.9538630538264372, "grad_norm": 0.25926143810431157, "learning_rate": 2.948557453429701e-08, "loss": 0.0671, "step": 1334 }, { "epoch": 1.955327718784328, "grad_norm": 0.23624107516760934, "learning_rate": 2.7553519478028535e-08, "loss": 0.0665, "step": 1335 }, { "epoch": 1.956792383742219, "grad_norm": 0.2705488604754587, "learning_rate": 2.5686839481227077e-08, "loss": 0.0773, "step": 1336 }, { "epoch": 1.9582570487001099, "grad_norm": 0.2473961524857793, "learning_rate": 2.3885546781042824e-08, "loss": 0.073, "step": 1337 }, { "epoch": 1.9597217136580007, "grad_norm": 0.2441090901112598, "learning_rate": 2.2149653185973285e-08, "loss": 0.0769, "step": 1338 }, { "epoch": 1.9611863786158916, "grad_norm": 0.27046514605231436, "learning_rate": 2.0479170075788924e-08, "loss": 0.0716, "step": 1339 }, { "epoch": 1.9626510435737825, "grad_norm": 0.22983632777466814, "learning_rate": 1.8874108401456538e-08, "loss": 0.0575, "step": 1340 }, { "epoch": 1.9641157085316734, "grad_norm": 0.24070147293086863, "learning_rate": 1.7334478685068212e-08, "loss": 0.1044, "step": 1341 }, { "epoch": 1.9655803734895643, "grad_norm": 0.24856888752891837, "learning_rate": 1.586029101977249e-08, "loss": 0.0585, "step": 1342 }, { "epoch": 1.9670450384474552, "grad_norm": 0.25551924386712754, "learning_rate": 1.4451555069708856e-08, "loss": 0.0823, "step": 1343 }, { "epoch": 1.968509703405346, "grad_norm": 0.24403209467826953, "learning_rate": 1.3108280069941137e-08, "loss": 0.0723, "step": 1344 }, { "epoch": 1.969974368363237, "grad_norm": 0.25587914011988994, "learning_rate": 1.1830474826404204e-08, "loss": 0.0775, "step": 1345 }, { "epoch": 1.9714390333211278, "grad_norm": 0.2481616027718323, "learning_rate": 1.0618147715835137e-08, "loss": 0.073, "step": 1346 }, { "epoch": 1.9729036982790187, "grad_norm": 0.2503297758869869, "learning_rate": 9.471306685728821e-09, "loss": 0.0841, "step": 1347 }, { "epoch": 1.9743683632369096, "grad_norm": 0.25325687071902725, "learning_rate": 8.389959254281322e-09, "loss": 0.0674, "step": 1348 }, { "epoch": 1.9758330281948004, "grad_norm": 0.2413650911828191, "learning_rate": 7.3741125103399254e-09, "loss": 0.0737, "step": 1349 }, { "epoch": 1.9772976931526913, "grad_norm": 0.23526516349129203, "learning_rate": 6.423773113357623e-09, "loss": 0.0631, "step": 1350 }, { "epoch": 1.9787623581105822, "grad_norm": 0.25457045216657825, "learning_rate": 5.538947293349806e-09, "loss": 0.0765, "step": 1351 }, { "epoch": 1.980227023068473, "grad_norm": 0.23325185099514448, "learning_rate": 4.719640850852081e-09, "loss": 0.0612, "step": 1352 }, { "epoch": 1.981691688026364, "grad_norm": 0.26527379982321414, "learning_rate": 3.965859156885854e-09, "loss": 0.0775, "step": 1353 }, { "epoch": 1.9831563529842549, "grad_norm": 0.24201253323675176, "learning_rate": 3.2776071529183608e-09, "loss": 0.0685, "step": 1354 }, { "epoch": 1.9846210179421457, "grad_norm": 0.23684069181793793, "learning_rate": 2.65488935083158e-09, "loss": 0.0686, "step": 1355 }, { "epoch": 1.9860856829000366, "grad_norm": 0.25231448957519526, "learning_rate": 2.0977098328978098e-09, "loss": 0.0726, "step": 1356 }, { "epoch": 1.9875503478579275, "grad_norm": 0.2774834947544314, "learning_rate": 1.6060722517430293e-09, "loss": 0.0787, "step": 1357 }, { "epoch": 1.9890150128158184, "grad_norm": 0.2503245321475594, "learning_rate": 1.1799798303335775e-09, "loss": 0.0683, "step": 1358 }, { "epoch": 1.9904796777737093, "grad_norm": 0.2423085553528186, "learning_rate": 8.194353619450646e-10, "loss": 0.0731, "step": 1359 }, { "epoch": 1.9919443427316001, "grad_norm": 0.24083594462983027, "learning_rate": 5.244412101534924e-10, "loss": 0.066, "step": 1360 }, { "epoch": 1.993409007689491, "grad_norm": 0.23578471416137511, "learning_rate": 2.949993088130487e-10, "loss": 0.0585, "step": 1361 }, { "epoch": 1.994873672647382, "grad_norm": 0.24433039086610767, "learning_rate": 1.3111116204500562e-10, "loss": 0.0602, "step": 1362 }, { "epoch": 1.9963383376052728, "grad_norm": 0.2719863457397181, "learning_rate": 3.277784423105779e-11, "loss": 0.0863, "step": 1363 }, { "epoch": 1.9978030025631637, "grad_norm": 0.261652070533884, "learning_rate": 0.0, "loss": 0.0707, "step": 1364 }, { "epoch": 1.9978030025631637, "step": 1364, "total_flos": 612552856354816.0, "train_loss": 0.16333763612996885, "train_runtime": 7219.9243, "train_samples_per_second": 24.207, "train_steps_per_second": 0.189 } ], "logging_steps": 1, "max_steps": 1364, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 612552856354816.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }