diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9590 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9978030025631637, + "eval_steps": 500, + "global_step": 1364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014646649578908824, + "grad_norm": 11.437485320992703, + "learning_rate": 1.4598540145985402e-07, + "loss": 1.3349, + "step": 1 + }, + { + "epoch": 0.0029293299157817647, + "grad_norm": 11.178701580362661, + "learning_rate": 2.9197080291970804e-07, + "loss": 1.338, + "step": 2 + }, + { + "epoch": 0.004393994873672647, + "grad_norm": 11.35884244830137, + "learning_rate": 4.379562043795621e-07, + "loss": 1.3216, + "step": 3 + }, + { + "epoch": 0.005858659831563529, + "grad_norm": 10.417628767122284, + "learning_rate": 5.839416058394161e-07, + "loss": 1.3553, + "step": 4 + }, + { + "epoch": 0.007323324789454412, + "grad_norm": 11.211576033283285, + "learning_rate": 7.299270072992701e-07, + "loss": 1.3584, + "step": 5 + }, + { + "epoch": 0.008787989747345295, + "grad_norm": 10.861271761101971, + "learning_rate": 8.759124087591242e-07, + "loss": 1.3293, + "step": 6 + }, + { + "epoch": 0.010252654705236177, + "grad_norm": 9.89480118782974, + "learning_rate": 1.0218978102189781e-06, + "loss": 1.3379, + "step": 7 + }, + { + "epoch": 0.011717319663127059, + "grad_norm": 9.16199354912927, + "learning_rate": 1.1678832116788322e-06, + "loss": 1.304, + "step": 8 + }, + { + "epoch": 0.013181984621017943, + "grad_norm": 7.8824339227925, + "learning_rate": 1.3138686131386864e-06, + "loss": 1.2671, + "step": 9 + }, + { + "epoch": 0.014646649578908825, + "grad_norm": 8.125898486086502, + "learning_rate": 1.4598540145985402e-06, + "loss": 1.2187, + "step": 10 + }, + { + "epoch": 0.016111314536799707, + "grad_norm": 10.83717739533965, + "learning_rate": 1.6058394160583942e-06, + "loss": 1.2298, + "step": 11 + }, + { + "epoch": 0.01757597949469059, + "grad_norm": 20.701868436023478, + "learning_rate": 1.7518248175182485e-06, + "loss": 1.1312, + "step": 12 + }, + { + "epoch": 0.01904064445258147, + "grad_norm": 20.601367187466792, + "learning_rate": 1.8978102189781023e-06, + "loss": 1.0933, + "step": 13 + }, + { + "epoch": 0.020505309410472353, + "grad_norm": 23.59333380509113, + "learning_rate": 2.0437956204379563e-06, + "loss": 1.0839, + "step": 14 + }, + { + "epoch": 0.021969974368363236, + "grad_norm": 4.122191454831178, + "learning_rate": 2.1897810218978103e-06, + "loss": 1.0648, + "step": 15 + }, + { + "epoch": 0.023434639326254118, + "grad_norm": 9.470990626696512, + "learning_rate": 2.3357664233576643e-06, + "loss": 1.0762, + "step": 16 + }, + { + "epoch": 0.024899304284145003, + "grad_norm": 5.597117534177985, + "learning_rate": 2.4817518248175183e-06, + "loss": 0.9529, + "step": 17 + }, + { + "epoch": 0.026363969242035885, + "grad_norm": 2.581862089288807, + "learning_rate": 2.627737226277373e-06, + "loss": 0.8969, + "step": 18 + }, + { + "epoch": 0.027828634199926768, + "grad_norm": 5.742679255827132, + "learning_rate": 2.7737226277372264e-06, + "loss": 0.899, + "step": 19 + }, + { + "epoch": 0.02929329915781765, + "grad_norm": 4.2881369714318796, + "learning_rate": 2.9197080291970804e-06, + "loss": 0.8479, + "step": 20 + }, + { + "epoch": 0.030757964115708532, + "grad_norm": 8.043168797174141, + "learning_rate": 3.065693430656935e-06, + "loss": 0.8139, + "step": 21 + }, + { + "epoch": 0.032222629073599414, + "grad_norm": 8.109008525384814, + "learning_rate": 3.2116788321167884e-06, + "loss": 0.832, + "step": 22 + }, + { + "epoch": 0.033687294031490296, + "grad_norm": 3.7973925320214708, + "learning_rate": 3.3576642335766425e-06, + "loss": 0.793, + "step": 23 + }, + { + "epoch": 0.03515195898938118, + "grad_norm": 3.0028103120588856, + "learning_rate": 3.503649635036497e-06, + "loss": 0.7458, + "step": 24 + }, + { + "epoch": 0.03661662394727206, + "grad_norm": 2.9364448534773038, + "learning_rate": 3.6496350364963505e-06, + "loss": 0.7061, + "step": 25 + }, + { + "epoch": 0.03808128890516294, + "grad_norm": 0.9806974298270025, + "learning_rate": 3.7956204379562045e-06, + "loss": 0.7023, + "step": 26 + }, + { + "epoch": 0.039545953863053825, + "grad_norm": 1.064375799560964, + "learning_rate": 3.9416058394160585e-06, + "loss": 0.6876, + "step": 27 + }, + { + "epoch": 0.04101061882094471, + "grad_norm": 0.7759784247428909, + "learning_rate": 4.0875912408759126e-06, + "loss": 0.6773, + "step": 28 + }, + { + "epoch": 0.04247528377883559, + "grad_norm": 0.6478179875434958, + "learning_rate": 4.233576642335767e-06, + "loss": 0.6657, + "step": 29 + }, + { + "epoch": 0.04393994873672647, + "grad_norm": 1.3039225652221051, + "learning_rate": 4.379562043795621e-06, + "loss": 0.6358, + "step": 30 + }, + { + "epoch": 0.04540461369461735, + "grad_norm": 0.6416252297147181, + "learning_rate": 4.525547445255475e-06, + "loss": 0.5966, + "step": 31 + }, + { + "epoch": 0.046869278652508235, + "grad_norm": 0.5640382481221872, + "learning_rate": 4.671532846715329e-06, + "loss": 0.5848, + "step": 32 + }, + { + "epoch": 0.048333943610399124, + "grad_norm": 0.581441699738184, + "learning_rate": 4.8175182481751835e-06, + "loss": 0.5963, + "step": 33 + }, + { + "epoch": 0.049798608568290006, + "grad_norm": 0.5919465771312318, + "learning_rate": 4.963503649635037e-06, + "loss": 0.5921, + "step": 34 + }, + { + "epoch": 0.05126327352618089, + "grad_norm": 0.6061068767118333, + "learning_rate": 5.1094890510948916e-06, + "loss": 0.5766, + "step": 35 + }, + { + "epoch": 0.05272793848407177, + "grad_norm": 0.5757173997249402, + "learning_rate": 5.255474452554746e-06, + "loss": 0.5234, + "step": 36 + }, + { + "epoch": 0.05419260344196265, + "grad_norm": 0.5148147313648633, + "learning_rate": 5.401459854014599e-06, + "loss": 0.5428, + "step": 37 + }, + { + "epoch": 0.055657268399853535, + "grad_norm": 0.5378426318034144, + "learning_rate": 5.547445255474453e-06, + "loss": 0.5421, + "step": 38 + }, + { + "epoch": 0.05712193335774442, + "grad_norm": 0.5096082401040436, + "learning_rate": 5.693430656934307e-06, + "loss": 0.5447, + "step": 39 + }, + { + "epoch": 0.0585865983156353, + "grad_norm": 0.48460009463464304, + "learning_rate": 5.839416058394161e-06, + "loss": 0.5396, + "step": 40 + }, + { + "epoch": 0.06005126327352618, + "grad_norm": 0.49057265665249916, + "learning_rate": 5.985401459854016e-06, + "loss": 0.5393, + "step": 41 + }, + { + "epoch": 0.061515928231417064, + "grad_norm": 0.5380540290567165, + "learning_rate": 6.13138686131387e-06, + "loss": 0.544, + "step": 42 + }, + { + "epoch": 0.06298059318930795, + "grad_norm": 0.4836947048784, + "learning_rate": 6.277372262773723e-06, + "loss": 0.5291, + "step": 43 + }, + { + "epoch": 0.06444525814719883, + "grad_norm": 0.4975286083516887, + "learning_rate": 6.423357664233577e-06, + "loss": 0.5006, + "step": 44 + }, + { + "epoch": 0.06590992310508971, + "grad_norm": 0.4801869620568488, + "learning_rate": 6.569343065693431e-06, + "loss": 0.5177, + "step": 45 + }, + { + "epoch": 0.06737458806298059, + "grad_norm": 0.3973492869700473, + "learning_rate": 6.715328467153285e-06, + "loss": 0.4788, + "step": 46 + }, + { + "epoch": 0.06883925302087147, + "grad_norm": 0.363722763875419, + "learning_rate": 6.86131386861314e-06, + "loss": 0.5129, + "step": 47 + }, + { + "epoch": 0.07030391797876236, + "grad_norm": 0.381635129408091, + "learning_rate": 7.007299270072994e-06, + "loss": 0.4867, + "step": 48 + }, + { + "epoch": 0.07176858293665324, + "grad_norm": 0.3591035550780032, + "learning_rate": 7.153284671532848e-06, + "loss": 0.4963, + "step": 49 + }, + { + "epoch": 0.07323324789454412, + "grad_norm": 0.4186636869053589, + "learning_rate": 7.299270072992701e-06, + "loss": 0.4986, + "step": 50 + }, + { + "epoch": 0.074697912852435, + "grad_norm": 0.41597006530339986, + "learning_rate": 7.445255474452555e-06, + "loss": 0.4765, + "step": 51 + }, + { + "epoch": 0.07616257781032588, + "grad_norm": 0.6427961759723377, + "learning_rate": 7.591240875912409e-06, + "loss": 0.4694, + "step": 52 + }, + { + "epoch": 0.07762724276821677, + "grad_norm": 0.4874587482608775, + "learning_rate": 7.737226277372264e-06, + "loss": 0.4634, + "step": 53 + }, + { + "epoch": 0.07909190772610765, + "grad_norm": 0.3334959408832527, + "learning_rate": 7.883211678832117e-06, + "loss": 0.4636, + "step": 54 + }, + { + "epoch": 0.08055657268399853, + "grad_norm": 0.381153104725579, + "learning_rate": 8.029197080291972e-06, + "loss": 0.459, + "step": 55 + }, + { + "epoch": 0.08202123764188941, + "grad_norm": 0.3340733041790047, + "learning_rate": 8.175182481751825e-06, + "loss": 0.4851, + "step": 56 + }, + { + "epoch": 0.0834859025997803, + "grad_norm": 0.3517427711639141, + "learning_rate": 8.32116788321168e-06, + "loss": 0.4236, + "step": 57 + }, + { + "epoch": 0.08495056755767118, + "grad_norm": 0.3108043883771063, + "learning_rate": 8.467153284671533e-06, + "loss": 0.4336, + "step": 58 + }, + { + "epoch": 0.08641523251556206, + "grad_norm": 0.3334733312721485, + "learning_rate": 8.613138686131386e-06, + "loss": 0.4365, + "step": 59 + }, + { + "epoch": 0.08787989747345294, + "grad_norm": 0.352845389781815, + "learning_rate": 8.759124087591241e-06, + "loss": 0.4671, + "step": 60 + }, + { + "epoch": 0.08934456243134382, + "grad_norm": 0.3423521983704048, + "learning_rate": 8.905109489051096e-06, + "loss": 0.4405, + "step": 61 + }, + { + "epoch": 0.0908092273892347, + "grad_norm": 0.32680792151395816, + "learning_rate": 9.05109489051095e-06, + "loss": 0.4244, + "step": 62 + }, + { + "epoch": 0.09227389234712559, + "grad_norm": 0.35932668840566334, + "learning_rate": 9.197080291970804e-06, + "loss": 0.4316, + "step": 63 + }, + { + "epoch": 0.09373855730501647, + "grad_norm": 0.30409497564005455, + "learning_rate": 9.343065693430657e-06, + "loss": 0.4404, + "step": 64 + }, + { + "epoch": 0.09520322226290737, + "grad_norm": 0.34431142299821804, + "learning_rate": 9.48905109489051e-06, + "loss": 0.4538, + "step": 65 + }, + { + "epoch": 0.09666788722079825, + "grad_norm": 0.33318967367640623, + "learning_rate": 9.635036496350367e-06, + "loss": 0.4246, + "step": 66 + }, + { + "epoch": 0.09813255217868913, + "grad_norm": 0.32969822743310584, + "learning_rate": 9.78102189781022e-06, + "loss": 0.4185, + "step": 67 + }, + { + "epoch": 0.09959721713658001, + "grad_norm": 0.3203953785691397, + "learning_rate": 9.927007299270073e-06, + "loss": 0.3801, + "step": 68 + }, + { + "epoch": 0.1010618820944709, + "grad_norm": 0.32486117695619926, + "learning_rate": 1.0072992700729928e-05, + "loss": 0.4072, + "step": 69 + }, + { + "epoch": 0.10252654705236178, + "grad_norm": 0.35349454533419244, + "learning_rate": 1.0218978102189783e-05, + "loss": 0.4105, + "step": 70 + }, + { + "epoch": 0.10399121201025266, + "grad_norm": 0.32856345829840156, + "learning_rate": 1.0364963503649636e-05, + "loss": 0.3967, + "step": 71 + }, + { + "epoch": 0.10545587696814354, + "grad_norm": 0.29148929214591157, + "learning_rate": 1.0510948905109491e-05, + "loss": 0.3824, + "step": 72 + }, + { + "epoch": 0.10692054192603442, + "grad_norm": 0.35704567954771227, + "learning_rate": 1.0656934306569344e-05, + "loss": 0.3953, + "step": 73 + }, + { + "epoch": 0.1083852068839253, + "grad_norm": 0.31980320150253144, + "learning_rate": 1.0802919708029198e-05, + "loss": 0.3896, + "step": 74 + }, + { + "epoch": 0.10984987184181619, + "grad_norm": 0.34589383334602003, + "learning_rate": 1.0948905109489052e-05, + "loss": 0.3995, + "step": 75 + }, + { + "epoch": 0.11131453679970707, + "grad_norm": 0.3272293236162762, + "learning_rate": 1.1094890510948906e-05, + "loss": 0.3901, + "step": 76 + }, + { + "epoch": 0.11277920175759795, + "grad_norm": 0.3579751175217903, + "learning_rate": 1.124087591240876e-05, + "loss": 0.3921, + "step": 77 + }, + { + "epoch": 0.11424386671548883, + "grad_norm": 0.3239414165797523, + "learning_rate": 1.1386861313868614e-05, + "loss": 0.3989, + "step": 78 + }, + { + "epoch": 0.11570853167337972, + "grad_norm": 0.354305255693745, + "learning_rate": 1.1532846715328467e-05, + "loss": 0.3983, + "step": 79 + }, + { + "epoch": 0.1171731966312706, + "grad_norm": 0.3440015850313245, + "learning_rate": 1.1678832116788322e-05, + "loss": 0.3994, + "step": 80 + }, + { + "epoch": 0.11863786158916148, + "grad_norm": 0.36417507763366563, + "learning_rate": 1.1824817518248176e-05, + "loss": 0.368, + "step": 81 + }, + { + "epoch": 0.12010252654705236, + "grad_norm": 0.3568328603972373, + "learning_rate": 1.1970802919708031e-05, + "loss": 0.3694, + "step": 82 + }, + { + "epoch": 0.12156719150494324, + "grad_norm": 0.33678321301690173, + "learning_rate": 1.2116788321167885e-05, + "loss": 0.3688, + "step": 83 + }, + { + "epoch": 0.12303185646283413, + "grad_norm": 0.36497790299914806, + "learning_rate": 1.226277372262774e-05, + "loss": 0.3998, + "step": 84 + }, + { + "epoch": 0.12449652142072501, + "grad_norm": 0.32377149566716346, + "learning_rate": 1.2408759124087593e-05, + "loss": 0.3586, + "step": 85 + }, + { + "epoch": 0.1259611863786159, + "grad_norm": 0.3633002134475903, + "learning_rate": 1.2554744525547446e-05, + "loss": 0.3749, + "step": 86 + }, + { + "epoch": 0.12742585133650677, + "grad_norm": 0.3719628223211349, + "learning_rate": 1.27007299270073e-05, + "loss": 0.3811, + "step": 87 + }, + { + "epoch": 0.12889051629439766, + "grad_norm": 0.344394951638129, + "learning_rate": 1.2846715328467154e-05, + "loss": 0.377, + "step": 88 + }, + { + "epoch": 0.13035518125228854, + "grad_norm": 0.369488047348081, + "learning_rate": 1.2992700729927009e-05, + "loss": 0.3663, + "step": 89 + }, + { + "epoch": 0.13181984621017942, + "grad_norm": 0.3123562848371242, + "learning_rate": 1.3138686131386862e-05, + "loss": 0.3561, + "step": 90 + }, + { + "epoch": 0.1332845111680703, + "grad_norm": 0.3743377566954952, + "learning_rate": 1.3284671532846715e-05, + "loss": 0.3488, + "step": 91 + }, + { + "epoch": 0.13474917612596118, + "grad_norm": 0.3633285055316245, + "learning_rate": 1.343065693430657e-05, + "loss": 0.3566, + "step": 92 + }, + { + "epoch": 0.13621384108385207, + "grad_norm": 0.31139891879793924, + "learning_rate": 1.3576642335766423e-05, + "loss": 0.3464, + "step": 93 + }, + { + "epoch": 0.13767850604174295, + "grad_norm": 0.3898557118255509, + "learning_rate": 1.372262773722628e-05, + "loss": 0.3559, + "step": 94 + }, + { + "epoch": 0.13914317099963383, + "grad_norm": 0.3497149782228266, + "learning_rate": 1.3868613138686133e-05, + "loss": 0.3426, + "step": 95 + }, + { + "epoch": 0.1406078359575247, + "grad_norm": 0.3646043020546925, + "learning_rate": 1.4014598540145988e-05, + "loss": 0.3641, + "step": 96 + }, + { + "epoch": 0.1420725009154156, + "grad_norm": 0.33631572980864205, + "learning_rate": 1.416058394160584e-05, + "loss": 0.3325, + "step": 97 + }, + { + "epoch": 0.14353716587330648, + "grad_norm": 0.37775436608527085, + "learning_rate": 1.4306569343065696e-05, + "loss": 0.3615, + "step": 98 + }, + { + "epoch": 0.14500183083119736, + "grad_norm": 0.3575269321533561, + "learning_rate": 1.4452554744525549e-05, + "loss": 0.3404, + "step": 99 + }, + { + "epoch": 0.14646649578908824, + "grad_norm": 0.34707886852032405, + "learning_rate": 1.4598540145985402e-05, + "loss": 0.33, + "step": 100 + }, + { + "epoch": 0.14793116074697912, + "grad_norm": 0.36563790351605774, + "learning_rate": 1.4744525547445257e-05, + "loss": 0.3173, + "step": 101 + }, + { + "epoch": 0.14939582570487, + "grad_norm": 0.3723711813860172, + "learning_rate": 1.489051094890511e-05, + "loss": 0.3216, + "step": 102 + }, + { + "epoch": 0.1508604906627609, + "grad_norm": 0.3847201221658523, + "learning_rate": 1.5036496350364965e-05, + "loss": 0.3344, + "step": 103 + }, + { + "epoch": 0.15232515562065177, + "grad_norm": 0.35970372266040174, + "learning_rate": 1.5182481751824818e-05, + "loss": 0.3289, + "step": 104 + }, + { + "epoch": 0.15378982057854265, + "grad_norm": 0.3915138161843807, + "learning_rate": 1.5328467153284673e-05, + "loss": 0.3361, + "step": 105 + }, + { + "epoch": 0.15525448553643353, + "grad_norm": 0.3332723731808281, + "learning_rate": 1.5474452554744528e-05, + "loss": 0.3147, + "step": 106 + }, + { + "epoch": 0.15671915049432442, + "grad_norm": 0.3613636145975632, + "learning_rate": 1.5620437956204383e-05, + "loss": 0.3432, + "step": 107 + }, + { + "epoch": 0.1581838154522153, + "grad_norm": 0.360953722994529, + "learning_rate": 1.5766423357664234e-05, + "loss": 0.2994, + "step": 108 + }, + { + "epoch": 0.15964848041010618, + "grad_norm": 0.44388991308142534, + "learning_rate": 1.591240875912409e-05, + "loss": 0.3456, + "step": 109 + }, + { + "epoch": 0.16111314536799706, + "grad_norm": 0.3537174118941569, + "learning_rate": 1.6058394160583944e-05, + "loss": 0.3091, + "step": 110 + }, + { + "epoch": 0.16257781032588794, + "grad_norm": 0.3713881906195737, + "learning_rate": 1.62043795620438e-05, + "loss": 0.347, + "step": 111 + }, + { + "epoch": 0.16404247528377883, + "grad_norm": 0.35286710247968334, + "learning_rate": 1.635036496350365e-05, + "loss": 0.3202, + "step": 112 + }, + { + "epoch": 0.1655071402416697, + "grad_norm": 0.34725613487477996, + "learning_rate": 1.6496350364963505e-05, + "loss": 0.3049, + "step": 113 + }, + { + "epoch": 0.1669718051995606, + "grad_norm": 0.3362671204409347, + "learning_rate": 1.664233576642336e-05, + "loss": 0.3197, + "step": 114 + }, + { + "epoch": 0.16843647015745147, + "grad_norm": 0.3464409043696399, + "learning_rate": 1.678832116788321e-05, + "loss": 0.3218, + "step": 115 + }, + { + "epoch": 0.16990113511534236, + "grad_norm": 0.3480542874556784, + "learning_rate": 1.6934306569343066e-05, + "loss": 0.3239, + "step": 116 + }, + { + "epoch": 0.17136580007323324, + "grad_norm": 0.3748150806007165, + "learning_rate": 1.708029197080292e-05, + "loss": 0.3157, + "step": 117 + }, + { + "epoch": 0.17283046503112412, + "grad_norm": 0.34345916023717576, + "learning_rate": 1.7226277372262773e-05, + "loss": 0.3051, + "step": 118 + }, + { + "epoch": 0.174295129989015, + "grad_norm": 0.34387241825105747, + "learning_rate": 1.737226277372263e-05, + "loss": 0.3179, + "step": 119 + }, + { + "epoch": 0.17575979494690588, + "grad_norm": 0.33782864040586463, + "learning_rate": 1.7518248175182482e-05, + "loss": 0.2856, + "step": 120 + }, + { + "epoch": 0.17722445990479677, + "grad_norm": 0.37444408915373373, + "learning_rate": 1.7664233576642337e-05, + "loss": 0.3251, + "step": 121 + }, + { + "epoch": 0.17868912486268765, + "grad_norm": 0.4116316036207847, + "learning_rate": 1.7810218978102192e-05, + "loss": 0.3048, + "step": 122 + }, + { + "epoch": 0.18015378982057853, + "grad_norm": 0.3781332494432965, + "learning_rate": 1.7956204379562047e-05, + "loss": 0.3008, + "step": 123 + }, + { + "epoch": 0.1816184547784694, + "grad_norm": 0.4297088346511556, + "learning_rate": 1.81021897810219e-05, + "loss": 0.3094, + "step": 124 + }, + { + "epoch": 0.1830831197363603, + "grad_norm": 0.4029065646398177, + "learning_rate": 1.8248175182481753e-05, + "loss": 0.3113, + "step": 125 + }, + { + "epoch": 0.18454778469425118, + "grad_norm": 0.3992462478484584, + "learning_rate": 1.8394160583941608e-05, + "loss": 0.2854, + "step": 126 + }, + { + "epoch": 0.18601244965214206, + "grad_norm": 0.3604804381477147, + "learning_rate": 1.854014598540146e-05, + "loss": 0.2981, + "step": 127 + }, + { + "epoch": 0.18747711461003294, + "grad_norm": 0.37186861704413154, + "learning_rate": 1.8686131386861315e-05, + "loss": 0.3258, + "step": 128 + }, + { + "epoch": 0.18894177956792385, + "grad_norm": 0.3441534820222307, + "learning_rate": 1.883211678832117e-05, + "loss": 0.2956, + "step": 129 + }, + { + "epoch": 0.19040644452581473, + "grad_norm": 0.4360804547373249, + "learning_rate": 1.897810218978102e-05, + "loss": 0.3091, + "step": 130 + }, + { + "epoch": 0.19187110948370562, + "grad_norm": 0.36928340495254985, + "learning_rate": 1.912408759124088e-05, + "loss": 0.2982, + "step": 131 + }, + { + "epoch": 0.1933357744415965, + "grad_norm": 0.36064269917913877, + "learning_rate": 1.9270072992700734e-05, + "loss": 0.301, + "step": 132 + }, + { + "epoch": 0.19480043939948738, + "grad_norm": 0.3364056638775238, + "learning_rate": 1.9416058394160586e-05, + "loss": 0.2643, + "step": 133 + }, + { + "epoch": 0.19626510435737826, + "grad_norm": 0.385357913235113, + "learning_rate": 1.956204379562044e-05, + "loss": 0.3151, + "step": 134 + }, + { + "epoch": 0.19772976931526914, + "grad_norm": 0.4013408692909421, + "learning_rate": 1.9708029197080295e-05, + "loss": 0.2937, + "step": 135 + }, + { + "epoch": 0.19919443427316003, + "grad_norm": 0.36182554518094284, + "learning_rate": 1.9854014598540147e-05, + "loss": 0.3009, + "step": 136 + }, + { + "epoch": 0.2006590992310509, + "grad_norm": 0.3494640989877029, + "learning_rate": 2e-05, + "loss": 0.281, + "step": 137 + }, + { + "epoch": 0.2021237641889418, + "grad_norm": 0.3725320500681877, + "learning_rate": 1.999996722215577e-05, + "loss": 0.3091, + "step": 138 + }, + { + "epoch": 0.20358842914683267, + "grad_norm": 0.3623692853258256, + "learning_rate": 1.9999868888837957e-05, + "loss": 0.2911, + "step": 139 + }, + { + "epoch": 0.20505309410472355, + "grad_norm": 0.36761059973259275, + "learning_rate": 1.999970500069119e-05, + "loss": 0.2773, + "step": 140 + }, + { + "epoch": 0.20651775906261444, + "grad_norm": 0.34372760731316254, + "learning_rate": 1.999947555878985e-05, + "loss": 0.2524, + "step": 141 + }, + { + "epoch": 0.20798242402050532, + "grad_norm": 0.34586143693014154, + "learning_rate": 1.9999180564638056e-05, + "loss": 0.3144, + "step": 142 + }, + { + "epoch": 0.2094470889783962, + "grad_norm": 0.3715249380161417, + "learning_rate": 1.9998820020169668e-05, + "loss": 0.2698, + "step": 143 + }, + { + "epoch": 0.21091175393628708, + "grad_norm": 0.35728560877179666, + "learning_rate": 1.9998393927748257e-05, + "loss": 0.2877, + "step": 144 + }, + { + "epoch": 0.21237641889417797, + "grad_norm": 0.3457017556464188, + "learning_rate": 1.9997902290167104e-05, + "loss": 0.2634, + "step": 145 + }, + { + "epoch": 0.21384108385206885, + "grad_norm": 0.32677024848818564, + "learning_rate": 1.999734511064917e-05, + "loss": 0.2962, + "step": 146 + }, + { + "epoch": 0.21530574880995973, + "grad_norm": 0.38425582050655327, + "learning_rate": 1.9996722392847082e-05, + "loss": 0.2769, + "step": 147 + }, + { + "epoch": 0.2167704137678506, + "grad_norm": 0.34979074920621583, + "learning_rate": 1.9996034140843113e-05, + "loss": 0.281, + "step": 148 + }, + { + "epoch": 0.2182350787257415, + "grad_norm": 0.32730300075566926, + "learning_rate": 1.999528035914915e-05, + "loss": 0.3143, + "step": 149 + }, + { + "epoch": 0.21969974368363238, + "grad_norm": 0.35690312453916395, + "learning_rate": 1.9994461052706652e-05, + "loss": 0.2508, + "step": 150 + }, + { + "epoch": 0.22116440864152326, + "grad_norm": 0.34613822211772644, + "learning_rate": 1.9993576226886644e-05, + "loss": 0.2793, + "step": 151 + }, + { + "epoch": 0.22262907359941414, + "grad_norm": 0.3922751418648848, + "learning_rate": 1.999262588748966e-05, + "loss": 0.2875, + "step": 152 + }, + { + "epoch": 0.22409373855730502, + "grad_norm": 0.3521535383506413, + "learning_rate": 1.9991610040745718e-05, + "loss": 0.2673, + "step": 153 + }, + { + "epoch": 0.2255584035151959, + "grad_norm": 0.33944461757575506, + "learning_rate": 1.9990528693314273e-05, + "loss": 0.2553, + "step": 154 + }, + { + "epoch": 0.2270230684730868, + "grad_norm": 0.3428799092949669, + "learning_rate": 1.9989381852284165e-05, + "loss": 0.274, + "step": 155 + }, + { + "epoch": 0.22848773343097767, + "grad_norm": 0.3511079662901022, + "learning_rate": 1.99881695251736e-05, + "loss": 0.2612, + "step": 156 + }, + { + "epoch": 0.22995239838886855, + "grad_norm": 0.33067647835038166, + "learning_rate": 1.998689171993006e-05, + "loss": 0.2653, + "step": 157 + }, + { + "epoch": 0.23141706334675943, + "grad_norm": 0.3487260974580051, + "learning_rate": 1.9985548444930295e-05, + "loss": 0.3049, + "step": 158 + }, + { + "epoch": 0.23288172830465032, + "grad_norm": 0.32680975853473054, + "learning_rate": 1.9984139708980228e-05, + "loss": 0.2641, + "step": 159 + }, + { + "epoch": 0.2343463932625412, + "grad_norm": 0.34722059123958804, + "learning_rate": 1.9982665521314934e-05, + "loss": 0.2713, + "step": 160 + }, + { + "epoch": 0.23581105822043208, + "grad_norm": 0.34801179824934036, + "learning_rate": 1.9981125891598545e-05, + "loss": 0.267, + "step": 161 + }, + { + "epoch": 0.23727572317832296, + "grad_norm": 0.3624289342633928, + "learning_rate": 1.9979520829924212e-05, + "loss": 0.263, + "step": 162 + }, + { + "epoch": 0.23874038813621384, + "grad_norm": 0.3299205118170384, + "learning_rate": 1.9977850346814026e-05, + "loss": 0.2919, + "step": 163 + }, + { + "epoch": 0.24020505309410473, + "grad_norm": 0.3595355800684274, + "learning_rate": 1.997611445321896e-05, + "loss": 0.2934, + "step": 164 + }, + { + "epoch": 0.2416697180519956, + "grad_norm": 0.3544383226618447, + "learning_rate": 1.9974313160518776e-05, + "loss": 0.2505, + "step": 165 + }, + { + "epoch": 0.2431343830098865, + "grad_norm": 0.34482857147345436, + "learning_rate": 1.9972446480521972e-05, + "loss": 0.2858, + "step": 166 + }, + { + "epoch": 0.24459904796777737, + "grad_norm": 0.36217560659010756, + "learning_rate": 1.9970514425465706e-05, + "loss": 0.2478, + "step": 167 + }, + { + "epoch": 0.24606371292566825, + "grad_norm": 0.37793259918304806, + "learning_rate": 1.996851700801569e-05, + "loss": 0.2557, + "step": 168 + }, + { + "epoch": 0.24752837788355914, + "grad_norm": 0.34823612797091924, + "learning_rate": 1.996645424126613e-05, + "loss": 0.2449, + "step": 169 + }, + { + "epoch": 0.24899304284145002, + "grad_norm": 0.3456664229365294, + "learning_rate": 1.9964326138739645e-05, + "loss": 0.2593, + "step": 170 + }, + { + "epoch": 0.2504577077993409, + "grad_norm": 0.36365236173385046, + "learning_rate": 1.996213271438715e-05, + "loss": 0.26, + "step": 171 + }, + { + "epoch": 0.2519223727572318, + "grad_norm": 0.3807231884615767, + "learning_rate": 1.9959873982587795e-05, + "loss": 0.2976, + "step": 172 + }, + { + "epoch": 0.25338703771512266, + "grad_norm": 0.373069506679343, + "learning_rate": 1.9957549958148844e-05, + "loss": 0.2895, + "step": 173 + }, + { + "epoch": 0.25485170267301355, + "grad_norm": 0.3531411189874793, + "learning_rate": 1.9955160656305606e-05, + "loss": 0.2483, + "step": 174 + }, + { + "epoch": 0.25631636763090443, + "grad_norm": 0.35346645678091215, + "learning_rate": 1.995270609272131e-05, + "loss": 0.2649, + "step": 175 + }, + { + "epoch": 0.2577810325887953, + "grad_norm": 0.3415190871733395, + "learning_rate": 1.995018628348702e-05, + "loss": 0.2523, + "step": 176 + }, + { + "epoch": 0.2592456975466862, + "grad_norm": 0.3654772088675571, + "learning_rate": 1.9947601245121514e-05, + "loss": 0.3083, + "step": 177 + }, + { + "epoch": 0.2607103625045771, + "grad_norm": 0.3460461457455375, + "learning_rate": 1.9944950994571192e-05, + "loss": 0.2792, + "step": 178 + }, + { + "epoch": 0.26217502746246796, + "grad_norm": 0.3110444104590562, + "learning_rate": 1.9942235549209955e-05, + "loss": 0.2688, + "step": 179 + }, + { + "epoch": 0.26363969242035884, + "grad_norm": 0.3272499534539241, + "learning_rate": 1.993945492683909e-05, + "loss": 0.2792, + "step": 180 + }, + { + "epoch": 0.2651043573782497, + "grad_norm": 0.2878269183852618, + "learning_rate": 1.993660914568716e-05, + "loss": 0.2424, + "step": 181 + }, + { + "epoch": 0.2665690223361406, + "grad_norm": 0.33774279808544694, + "learning_rate": 1.9933698224409876e-05, + "loss": 0.2496, + "step": 182 + }, + { + "epoch": 0.2680336872940315, + "grad_norm": 0.3363925870755136, + "learning_rate": 1.993072218208999e-05, + "loss": 0.2613, + "step": 183 + }, + { + "epoch": 0.26949835225192237, + "grad_norm": 0.3304059597878355, + "learning_rate": 1.992768103823714e-05, + "loss": 0.2523, + "step": 184 + }, + { + "epoch": 0.27096301720981325, + "grad_norm": 0.3556147552320389, + "learning_rate": 1.9924574812787766e-05, + "loss": 0.2528, + "step": 185 + }, + { + "epoch": 0.27242768216770413, + "grad_norm": 0.33358383808877295, + "learning_rate": 1.992140352610494e-05, + "loss": 0.2371, + "step": 186 + }, + { + "epoch": 0.273892347125595, + "grad_norm": 0.32338465251509535, + "learning_rate": 1.9918167198978246e-05, + "loss": 0.2328, + "step": 187 + }, + { + "epoch": 0.2753570120834859, + "grad_norm": 0.34675833301901965, + "learning_rate": 1.991486585262365e-05, + "loss": 0.2441, + "step": 188 + }, + { + "epoch": 0.2768216770413768, + "grad_norm": 0.32681910042540224, + "learning_rate": 1.991149950868336e-05, + "loss": 0.2244, + "step": 189 + }, + { + "epoch": 0.27828634199926766, + "grad_norm": 0.33323736986242397, + "learning_rate": 1.9908068189225672e-05, + "loss": 0.2375, + "step": 190 + }, + { + "epoch": 0.27975100695715854, + "grad_norm": 0.30868253450774236, + "learning_rate": 1.9904571916744836e-05, + "loss": 0.2354, + "step": 191 + }, + { + "epoch": 0.2812156719150494, + "grad_norm": 0.3404026835089613, + "learning_rate": 1.990101071416091e-05, + "loss": 0.244, + "step": 192 + }, + { + "epoch": 0.2826803368729403, + "grad_norm": 0.3250658440023371, + "learning_rate": 1.98973846048196e-05, + "loss": 0.2436, + "step": 193 + }, + { + "epoch": 0.2841450018308312, + "grad_norm": 0.3727507902802708, + "learning_rate": 1.9893693612492116e-05, + "loss": 0.2605, + "step": 194 + }, + { + "epoch": 0.28560966678872207, + "grad_norm": 0.32087796418849673, + "learning_rate": 1.9889937761375015e-05, + "loss": 0.2473, + "step": 195 + }, + { + "epoch": 0.28707433174661295, + "grad_norm": 0.3017096010090274, + "learning_rate": 1.9886117076090033e-05, + "loss": 0.2333, + "step": 196 + }, + { + "epoch": 0.28853899670450384, + "grad_norm": 0.32081985018575454, + "learning_rate": 1.9882231581683938e-05, + "loss": 0.2401, + "step": 197 + }, + { + "epoch": 0.2900036616623947, + "grad_norm": 0.31230329800532286, + "learning_rate": 1.9878281303628352e-05, + "loss": 0.2362, + "step": 198 + }, + { + "epoch": 0.2914683266202856, + "grad_norm": 0.3218942752729413, + "learning_rate": 1.9874266267819604e-05, + "loss": 0.248, + "step": 199 + }, + { + "epoch": 0.2929329915781765, + "grad_norm": 0.34876467385745175, + "learning_rate": 1.987018650057853e-05, + "loss": 0.2487, + "step": 200 + }, + { + "epoch": 0.29439765653606736, + "grad_norm": 0.31745025914965086, + "learning_rate": 1.986604202865033e-05, + "loss": 0.2394, + "step": 201 + }, + { + "epoch": 0.29586232149395825, + "grad_norm": 0.32627016177493307, + "learning_rate": 1.986183287920437e-05, + "loss": 0.2624, + "step": 202 + }, + { + "epoch": 0.29732698645184913, + "grad_norm": 0.32599240801438883, + "learning_rate": 1.9857559079834023e-05, + "loss": 0.2404, + "step": 203 + }, + { + "epoch": 0.29879165140974, + "grad_norm": 0.3110886547328817, + "learning_rate": 1.9853220658556474e-05, + "loss": 0.246, + "step": 204 + }, + { + "epoch": 0.3002563163676309, + "grad_norm": 0.3439412203840107, + "learning_rate": 1.984881764381254e-05, + "loss": 0.2245, + "step": 205 + }, + { + "epoch": 0.3017209813255218, + "grad_norm": 0.32093087763294337, + "learning_rate": 1.9844350064466488e-05, + "loss": 0.2351, + "step": 206 + }, + { + "epoch": 0.30318564628341266, + "grad_norm": 0.3080777182901835, + "learning_rate": 1.9839817949805843e-05, + "loss": 0.2239, + "step": 207 + }, + { + "epoch": 0.30465031124130354, + "grad_norm": 0.33465999975388977, + "learning_rate": 1.9835221329541197e-05, + "loss": 0.2362, + "step": 208 + }, + { + "epoch": 0.3061149761991944, + "grad_norm": 0.2916494897812547, + "learning_rate": 1.9830560233806006e-05, + "loss": 0.2138, + "step": 209 + }, + { + "epoch": 0.3075796411570853, + "grad_norm": 0.31069474576355127, + "learning_rate": 1.9825834693156408e-05, + "loss": 0.2381, + "step": 210 + }, + { + "epoch": 0.3090443061149762, + "grad_norm": 0.30904277970849947, + "learning_rate": 1.9821044738571008e-05, + "loss": 0.2336, + "step": 211 + }, + { + "epoch": 0.31050897107286707, + "grad_norm": 0.30754570168689266, + "learning_rate": 1.981619040145068e-05, + "loss": 0.2209, + "step": 212 + }, + { + "epoch": 0.31197363603075795, + "grad_norm": 0.2976312068720902, + "learning_rate": 1.9811271713618372e-05, + "loss": 0.2558, + "step": 213 + }, + { + "epoch": 0.31343830098864883, + "grad_norm": 0.3039100122516498, + "learning_rate": 1.980628870731888e-05, + "loss": 0.2265, + "step": 214 + }, + { + "epoch": 0.3149029659465397, + "grad_norm": 0.3136540735451798, + "learning_rate": 1.9801241415218636e-05, + "loss": 0.2551, + "step": 215 + }, + { + "epoch": 0.3163676309044306, + "grad_norm": 0.30585301656209735, + "learning_rate": 1.979612987040552e-05, + "loss": 0.2258, + "step": 216 + }, + { + "epoch": 0.3178322958623215, + "grad_norm": 0.3334394299522592, + "learning_rate": 1.9790954106388614e-05, + "loss": 0.2374, + "step": 217 + }, + { + "epoch": 0.31929696082021236, + "grad_norm": 0.3385575692037135, + "learning_rate": 1.9785714157097992e-05, + "loss": 0.2331, + "step": 218 + }, + { + "epoch": 0.32076162577810324, + "grad_norm": 0.3550097472590768, + "learning_rate": 1.9780410056884505e-05, + "loss": 0.2316, + "step": 219 + }, + { + "epoch": 0.3222262907359941, + "grad_norm": 0.3212031876917821, + "learning_rate": 1.9775041840519547e-05, + "loss": 0.264, + "step": 220 + }, + { + "epoch": 0.323690955693885, + "grad_norm": 0.3417161392653498, + "learning_rate": 1.976960954319483e-05, + "loss": 0.2366, + "step": 221 + }, + { + "epoch": 0.3251556206517759, + "grad_norm": 0.31739055359370316, + "learning_rate": 1.9764113200522153e-05, + "loss": 0.2193, + "step": 222 + }, + { + "epoch": 0.32662028560966677, + "grad_norm": 0.3475167845756447, + "learning_rate": 1.9758552848533168e-05, + "loss": 0.2381, + "step": 223 + }, + { + "epoch": 0.32808495056755765, + "grad_norm": 0.3227407213456651, + "learning_rate": 1.9752928523679145e-05, + "loss": 0.2403, + "step": 224 + }, + { + "epoch": 0.32954961552544854, + "grad_norm": 0.32134472119602675, + "learning_rate": 1.9747240262830734e-05, + "loss": 0.244, + "step": 225 + }, + { + "epoch": 0.3310142804833394, + "grad_norm": 0.3060769394485448, + "learning_rate": 1.9741488103277722e-05, + "loss": 0.2029, + "step": 226 + }, + { + "epoch": 0.3324789454412303, + "grad_norm": 0.3422387817778384, + "learning_rate": 1.9735672082728785e-05, + "loss": 0.2166, + "step": 227 + }, + { + "epoch": 0.3339436103991212, + "grad_norm": 0.29907859948451515, + "learning_rate": 1.9729792239311243e-05, + "loss": 0.223, + "step": 228 + }, + { + "epoch": 0.33540827535701206, + "grad_norm": 0.27950206671923955, + "learning_rate": 1.972384861157082e-05, + "loss": 0.1952, + "step": 229 + }, + { + "epoch": 0.33687294031490295, + "grad_norm": 0.3083999590796091, + "learning_rate": 1.9717841238471377e-05, + "loss": 0.2188, + "step": 230 + }, + { + "epoch": 0.33833760527279383, + "grad_norm": 0.295065591964525, + "learning_rate": 1.9711770159394654e-05, + "loss": 0.2091, + "step": 231 + }, + { + "epoch": 0.3398022702306847, + "grad_norm": 0.30797729325093864, + "learning_rate": 1.9705635414140035e-05, + "loss": 0.2106, + "step": 232 + }, + { + "epoch": 0.3412669351885756, + "grad_norm": 0.31169542447907694, + "learning_rate": 1.9699437042924266e-05, + "loss": 0.2398, + "step": 233 + }, + { + "epoch": 0.3427316001464665, + "grad_norm": 0.340358999491338, + "learning_rate": 1.969317508638119e-05, + "loss": 0.2564, + "step": 234 + }, + { + "epoch": 0.34419626510435736, + "grad_norm": 0.3017727587765966, + "learning_rate": 1.96868495855615e-05, + "loss": 0.2024, + "step": 235 + }, + { + "epoch": 0.34566093006224824, + "grad_norm": 0.33465091974213557, + "learning_rate": 1.9680460581932448e-05, + "loss": 0.2418, + "step": 236 + }, + { + "epoch": 0.3471255950201391, + "grad_norm": 0.29191015974637435, + "learning_rate": 1.967400811737759e-05, + "loss": 0.2018, + "step": 237 + }, + { + "epoch": 0.34859025997803, + "grad_norm": 0.28852204171964263, + "learning_rate": 1.96674922341965e-05, + "loss": 0.2341, + "step": 238 + }, + { + "epoch": 0.3500549249359209, + "grad_norm": 0.32713188846613395, + "learning_rate": 1.96609129751045e-05, + "loss": 0.2304, + "step": 239 + }, + { + "epoch": 0.35151958989381177, + "grad_norm": 0.2960166330763219, + "learning_rate": 1.9654270383232377e-05, + "loss": 0.218, + "step": 240 + }, + { + "epoch": 0.35298425485170265, + "grad_norm": 0.3129431801548695, + "learning_rate": 1.9647564502126094e-05, + "loss": 0.2117, + "step": 241 + }, + { + "epoch": 0.35444891980959353, + "grad_norm": 0.294963922960848, + "learning_rate": 1.964079537574652e-05, + "loss": 0.2203, + "step": 242 + }, + { + "epoch": 0.3559135847674844, + "grad_norm": 0.3164361378068273, + "learning_rate": 1.963396304846913e-05, + "loss": 0.2199, + "step": 243 + }, + { + "epoch": 0.3573782497253753, + "grad_norm": 0.3207780914062909, + "learning_rate": 1.9627067565083716e-05, + "loss": 0.2094, + "step": 244 + }, + { + "epoch": 0.3588429146832662, + "grad_norm": 0.31070213874769786, + "learning_rate": 1.962010897079409e-05, + "loss": 0.1875, + "step": 245 + }, + { + "epoch": 0.36030757964115706, + "grad_norm": 0.30122482811704177, + "learning_rate": 1.96130873112178e-05, + "loss": 0.2377, + "step": 246 + }, + { + "epoch": 0.36177224459904794, + "grad_norm": 0.3070889454659401, + "learning_rate": 1.9606002632385817e-05, + "loss": 0.1948, + "step": 247 + }, + { + "epoch": 0.3632369095569388, + "grad_norm": 0.30277497066736986, + "learning_rate": 1.959885498074224e-05, + "loss": 0.2122, + "step": 248 + }, + { + "epoch": 0.3647015745148297, + "grad_norm": 0.33010916657115674, + "learning_rate": 1.9591644403143997e-05, + "loss": 0.1988, + "step": 249 + }, + { + "epoch": 0.3661662394727206, + "grad_norm": 0.3187016737580306, + "learning_rate": 1.958437094686052e-05, + "loss": 0.1954, + "step": 250 + }, + { + "epoch": 0.36763090443061147, + "grad_norm": 0.29492570210704677, + "learning_rate": 1.9577034659573452e-05, + "loss": 0.1932, + "step": 251 + }, + { + "epoch": 0.36909556938850235, + "grad_norm": 0.31163033056152234, + "learning_rate": 1.956963558937633e-05, + "loss": 0.234, + "step": 252 + }, + { + "epoch": 0.37056023434639324, + "grad_norm": 0.29081575214116195, + "learning_rate": 1.9562173784774274e-05, + "loss": 0.2256, + "step": 253 + }, + { + "epoch": 0.3720248993042841, + "grad_norm": 0.2859981212044829, + "learning_rate": 1.955464929468365e-05, + "loss": 0.1863, + "step": 254 + }, + { + "epoch": 0.373489564262175, + "grad_norm": 0.3362313373130634, + "learning_rate": 1.9547062168431777e-05, + "loss": 0.2191, + "step": 255 + }, + { + "epoch": 0.3749542292200659, + "grad_norm": 0.31580606455125804, + "learning_rate": 1.9539412455756578e-05, + "loss": 0.2219, + "step": 256 + }, + { + "epoch": 0.3764188941779568, + "grad_norm": 0.3138360531715862, + "learning_rate": 1.9531700206806274e-05, + "loss": 0.2216, + "step": 257 + }, + { + "epoch": 0.3778835591358477, + "grad_norm": 0.30577065715195095, + "learning_rate": 1.952392547213904e-05, + "loss": 0.2143, + "step": 258 + }, + { + "epoch": 0.3793482240937386, + "grad_norm": 0.3030869019075986, + "learning_rate": 1.9516088302722696e-05, + "loss": 0.2155, + "step": 259 + }, + { + "epoch": 0.38081288905162947, + "grad_norm": 0.33061172813654255, + "learning_rate": 1.9508188749934333e-05, + "loss": 0.2375, + "step": 260 + }, + { + "epoch": 0.38227755400952035, + "grad_norm": 0.31425985710724597, + "learning_rate": 1.9500226865560015e-05, + "loss": 0.197, + "step": 261 + }, + { + "epoch": 0.38374221896741123, + "grad_norm": 0.3084208020631583, + "learning_rate": 1.9492202701794432e-05, + "loss": 0.2062, + "step": 262 + }, + { + "epoch": 0.3852068839253021, + "grad_norm": 0.31591794037568377, + "learning_rate": 1.9484116311240534e-05, + "loss": 0.2012, + "step": 263 + }, + { + "epoch": 0.386671548883193, + "grad_norm": 0.3243347143178351, + "learning_rate": 1.9475967746909212e-05, + "loss": 0.2157, + "step": 264 + }, + { + "epoch": 0.3881362138410839, + "grad_norm": 0.3126285934402634, + "learning_rate": 1.946775706221894e-05, + "loss": 0.2015, + "step": 265 + }, + { + "epoch": 0.38960087879897476, + "grad_norm": 0.29362391198343335, + "learning_rate": 1.945948431099543e-05, + "loss": 0.2082, + "step": 266 + }, + { + "epoch": 0.39106554375686564, + "grad_norm": 0.3306599758257121, + "learning_rate": 1.945114954747127e-05, + "loss": 0.2217, + "step": 267 + }, + { + "epoch": 0.3925302087147565, + "grad_norm": 0.31104138128230796, + "learning_rate": 1.9442752826285578e-05, + "loss": 0.2347, + "step": 268 + }, + { + "epoch": 0.3939948736726474, + "grad_norm": 0.2882519240767416, + "learning_rate": 1.9434294202483634e-05, + "loss": 0.189, + "step": 269 + }, + { + "epoch": 0.3954595386305383, + "grad_norm": 0.2993561393461098, + "learning_rate": 1.9425773731516534e-05, + "loss": 0.197, + "step": 270 + }, + { + "epoch": 0.39692420358842917, + "grad_norm": 0.31696676157406817, + "learning_rate": 1.9417191469240806e-05, + "loss": 0.1853, + "step": 271 + }, + { + "epoch": 0.39838886854632005, + "grad_norm": 0.2976550371764201, + "learning_rate": 1.940854747191806e-05, + "loss": 0.2062, + "step": 272 + }, + { + "epoch": 0.39985353350421093, + "grad_norm": 0.3102511410212597, + "learning_rate": 1.9399841796214625e-05, + "loss": 0.2118, + "step": 273 + }, + { + "epoch": 0.4013181984621018, + "grad_norm": 0.29711750834500367, + "learning_rate": 1.9391074499201155e-05, + "loss": 0.1736, + "step": 274 + }, + { + "epoch": 0.4027828634199927, + "grad_norm": 0.33461146425534555, + "learning_rate": 1.938224563835226e-05, + "loss": 0.2023, + "step": 275 + }, + { + "epoch": 0.4042475283778836, + "grad_norm": 0.2969695409049403, + "learning_rate": 1.9373355271546156e-05, + "loss": 0.2067, + "step": 276 + }, + { + "epoch": 0.40571219333577446, + "grad_norm": 0.32397668137142743, + "learning_rate": 1.9364403457064252e-05, + "loss": 0.2266, + "step": 277 + }, + { + "epoch": 0.40717685829366534, + "grad_norm": 0.35852753225097855, + "learning_rate": 1.9355390253590775e-05, + "loss": 0.2582, + "step": 278 + }, + { + "epoch": 0.4086415232515562, + "grad_norm": 0.2936115798624861, + "learning_rate": 1.9346315720212416e-05, + "loss": 0.1899, + "step": 279 + }, + { + "epoch": 0.4101061882094471, + "grad_norm": 0.3229854576353249, + "learning_rate": 1.933717991641789e-05, + "loss": 0.2103, + "step": 280 + }, + { + "epoch": 0.411570853167338, + "grad_norm": 0.29206997385224825, + "learning_rate": 1.9327982902097596e-05, + "loss": 0.2157, + "step": 281 + }, + { + "epoch": 0.4130355181252289, + "grad_norm": 0.30505940530286574, + "learning_rate": 1.931872473754319e-05, + "loss": 0.2077, + "step": 282 + }, + { + "epoch": 0.41450018308311976, + "grad_norm": 0.2973043577148213, + "learning_rate": 1.9309405483447208e-05, + "loss": 0.1964, + "step": 283 + }, + { + "epoch": 0.41596484804101064, + "grad_norm": 0.29785754593184066, + "learning_rate": 1.9300025200902666e-05, + "loss": 0.1913, + "step": 284 + }, + { + "epoch": 0.4174295129989015, + "grad_norm": 0.33220986093953964, + "learning_rate": 1.9290583951402648e-05, + "loss": 0.1997, + "step": 285 + }, + { + "epoch": 0.4188941779567924, + "grad_norm": 0.3165354043864197, + "learning_rate": 1.9281081796839915e-05, + "loss": 0.2028, + "step": 286 + }, + { + "epoch": 0.4203588429146833, + "grad_norm": 0.30361046171154804, + "learning_rate": 1.9271518799506494e-05, + "loss": 0.2004, + "step": 287 + }, + { + "epoch": 0.42182350787257417, + "grad_norm": 0.3125634361106797, + "learning_rate": 1.9261895022093275e-05, + "loss": 0.1981, + "step": 288 + }, + { + "epoch": 0.42328817283046505, + "grad_norm": 0.30844567580026055, + "learning_rate": 1.9252210527689596e-05, + "loss": 0.2073, + "step": 289 + }, + { + "epoch": 0.42475283778835593, + "grad_norm": 0.3155331823234916, + "learning_rate": 1.9242465379782823e-05, + "loss": 0.1854, + "step": 290 + }, + { + "epoch": 0.4262175027462468, + "grad_norm": 0.34604055990819416, + "learning_rate": 1.9232659642257942e-05, + "loss": 0.2627, + "step": 291 + }, + { + "epoch": 0.4276821677041377, + "grad_norm": 0.31441965292621443, + "learning_rate": 1.9222793379397146e-05, + "loss": 0.2214, + "step": 292 + }, + { + "epoch": 0.4291468326620286, + "grad_norm": 0.3341564227129849, + "learning_rate": 1.9212866655879397e-05, + "loss": 0.2002, + "step": 293 + }, + { + "epoch": 0.43061149761991946, + "grad_norm": 0.314168791046398, + "learning_rate": 1.9202879536780013e-05, + "loss": 0.199, + "step": 294 + }, + { + "epoch": 0.43207616257781034, + "grad_norm": 0.3415225885868485, + "learning_rate": 1.919283208757025e-05, + "loss": 0.1915, + "step": 295 + }, + { + "epoch": 0.4335408275357012, + "grad_norm": 0.34338764313976977, + "learning_rate": 1.918272437411684e-05, + "loss": 0.2134, + "step": 296 + }, + { + "epoch": 0.4350054924935921, + "grad_norm": 0.30091408846543016, + "learning_rate": 1.91725564626816e-05, + "loss": 0.1956, + "step": 297 + }, + { + "epoch": 0.436470157451483, + "grad_norm": 0.30066058971128345, + "learning_rate": 1.9162328419920976e-05, + "loss": 0.1901, + "step": 298 + }, + { + "epoch": 0.43793482240937387, + "grad_norm": 0.30021693344518857, + "learning_rate": 1.9152040312885604e-05, + "loss": 0.195, + "step": 299 + }, + { + "epoch": 0.43939948736726475, + "grad_norm": 0.30589418789890954, + "learning_rate": 1.914169220901988e-05, + "loss": 0.1904, + "step": 300 + }, + { + "epoch": 0.44086415232515563, + "grad_norm": 0.2916943622706973, + "learning_rate": 1.9131284176161505e-05, + "loss": 0.1862, + "step": 301 + }, + { + "epoch": 0.4423288172830465, + "grad_norm": 0.3111182978529323, + "learning_rate": 1.9120816282541062e-05, + "loss": 0.1903, + "step": 302 + }, + { + "epoch": 0.4437934822409374, + "grad_norm": 0.296170352579387, + "learning_rate": 1.911028859678155e-05, + "loss": 0.1859, + "step": 303 + }, + { + "epoch": 0.4452581471988283, + "grad_norm": 0.3144098925860641, + "learning_rate": 1.9099701187897927e-05, + "loss": 0.2115, + "step": 304 + }, + { + "epoch": 0.44672281215671916, + "grad_norm": 0.28420551119731563, + "learning_rate": 1.9089054125296692e-05, + "loss": 0.1786, + "step": 305 + }, + { + "epoch": 0.44818747711461004, + "grad_norm": 0.31553201198823727, + "learning_rate": 1.907834747877539e-05, + "loss": 0.1828, + "step": 306 + }, + { + "epoch": 0.4496521420725009, + "grad_norm": 0.3045234900369304, + "learning_rate": 1.906758131852218e-05, + "loss": 0.1969, + "step": 307 + }, + { + "epoch": 0.4511168070303918, + "grad_norm": 0.3172827302837682, + "learning_rate": 1.9056755715115372e-05, + "loss": 0.1961, + "step": 308 + }, + { + "epoch": 0.4525814719882827, + "grad_norm": 0.31871947429630393, + "learning_rate": 1.9045870739522953e-05, + "loss": 0.1833, + "step": 309 + }, + { + "epoch": 0.4540461369461736, + "grad_norm": 0.29640983142077165, + "learning_rate": 1.9034926463102122e-05, + "loss": 0.1735, + "step": 310 + }, + { + "epoch": 0.45551080190406446, + "grad_norm": 0.2838973203329574, + "learning_rate": 1.9023922957598847e-05, + "loss": 0.1879, + "step": 311 + }, + { + "epoch": 0.45697546686195534, + "grad_norm": 0.31619551941395607, + "learning_rate": 1.901286029514736e-05, + "loss": 0.1981, + "step": 312 + }, + { + "epoch": 0.4584401318198462, + "grad_norm": 0.28916971260480706, + "learning_rate": 1.9001738548269707e-05, + "loss": 0.1681, + "step": 313 + }, + { + "epoch": 0.4599047967777371, + "grad_norm": 0.29652343709617623, + "learning_rate": 1.8990557789875265e-05, + "loss": 0.1752, + "step": 314 + }, + { + "epoch": 0.461369461735628, + "grad_norm": 0.2952557530849967, + "learning_rate": 1.8979318093260268e-05, + "loss": 0.1731, + "step": 315 + }, + { + "epoch": 0.46283412669351887, + "grad_norm": 0.30085307847746645, + "learning_rate": 1.8968019532107318e-05, + "loss": 0.1975, + "step": 316 + }, + { + "epoch": 0.46429879165140975, + "grad_norm": 0.3187121453479262, + "learning_rate": 1.8956662180484913e-05, + "loss": 0.198, + "step": 317 + }, + { + "epoch": 0.46576345660930063, + "grad_norm": 0.30663390384150385, + "learning_rate": 1.8945246112846952e-05, + "loss": 0.1835, + "step": 318 + }, + { + "epoch": 0.4672281215671915, + "grad_norm": 0.30881852574686636, + "learning_rate": 1.893377140403225e-05, + "loss": 0.1828, + "step": 319 + }, + { + "epoch": 0.4686927865250824, + "grad_norm": 0.300908778514595, + "learning_rate": 1.892223812926406e-05, + "loss": 0.2214, + "step": 320 + }, + { + "epoch": 0.4701574514829733, + "grad_norm": 0.2860561871686056, + "learning_rate": 1.8910646364149548e-05, + "loss": 0.1938, + "step": 321 + }, + { + "epoch": 0.47162211644086416, + "grad_norm": 0.2772670520674923, + "learning_rate": 1.889899618467933e-05, + "loss": 0.1668, + "step": 322 + }, + { + "epoch": 0.47308678139875504, + "grad_norm": 0.3062971263411581, + "learning_rate": 1.8887287667226964e-05, + "loss": 0.199, + "step": 323 + }, + { + "epoch": 0.4745514463566459, + "grad_norm": 0.32124027529820043, + "learning_rate": 1.887552088854844e-05, + "loss": 0.2083, + "step": 324 + }, + { + "epoch": 0.4760161113145368, + "grad_norm": 0.295114219753382, + "learning_rate": 1.8863695925781685e-05, + "loss": 0.1887, + "step": 325 + }, + { + "epoch": 0.4774807762724277, + "grad_norm": 0.3194350024464684, + "learning_rate": 1.8851812856446062e-05, + "loss": 0.2086, + "step": 326 + }, + { + "epoch": 0.47894544123031857, + "grad_norm": 0.29449304384305286, + "learning_rate": 1.8839871758441842e-05, + "loss": 0.1898, + "step": 327 + }, + { + "epoch": 0.48041010618820945, + "grad_norm": 0.28550779650922725, + "learning_rate": 1.882787271004972e-05, + "loss": 0.1751, + "step": 328 + }, + { + "epoch": 0.48187477114610033, + "grad_norm": 0.28143493349397003, + "learning_rate": 1.8815815789930277e-05, + "loss": 0.1853, + "step": 329 + }, + { + "epoch": 0.4833394361039912, + "grad_norm": 0.3088998091001307, + "learning_rate": 1.8803701077123492e-05, + "loss": 0.1901, + "step": 330 + }, + { + "epoch": 0.4848041010618821, + "grad_norm": 0.31201486962963987, + "learning_rate": 1.8791528651048193e-05, + "loss": 0.1956, + "step": 331 + }, + { + "epoch": 0.486268766019773, + "grad_norm": 0.30000644267131227, + "learning_rate": 1.8779298591501565e-05, + "loss": 0.1852, + "step": 332 + }, + { + "epoch": 0.48773343097766386, + "grad_norm": 0.279989644276303, + "learning_rate": 1.8767010978658597e-05, + "loss": 0.1564, + "step": 333 + }, + { + "epoch": 0.48919809593555474, + "grad_norm": 0.31244506310168246, + "learning_rate": 1.8754665893071583e-05, + "loss": 0.2094, + "step": 334 + }, + { + "epoch": 0.4906627608934456, + "grad_norm": 0.32128122069078735, + "learning_rate": 1.874226341566958e-05, + "loss": 0.2013, + "step": 335 + }, + { + "epoch": 0.4921274258513365, + "grad_norm": 0.30462090670077413, + "learning_rate": 1.872980362775789e-05, + "loss": 0.1779, + "step": 336 + }, + { + "epoch": 0.4935920908092274, + "grad_norm": 0.30874094182185663, + "learning_rate": 1.87172866110175e-05, + "loss": 0.1976, + "step": 337 + }, + { + "epoch": 0.4950567557671183, + "grad_norm": 0.2978419876068271, + "learning_rate": 1.870471244750458e-05, + "loss": 0.1718, + "step": 338 + }, + { + "epoch": 0.49652142072500915, + "grad_norm": 0.30085229404822755, + "learning_rate": 1.8692081219649926e-05, + "loss": 0.1729, + "step": 339 + }, + { + "epoch": 0.49798608568290004, + "grad_norm": 0.309384694321714, + "learning_rate": 1.867939301025842e-05, + "loss": 0.1882, + "step": 340 + }, + { + "epoch": 0.4994507506407909, + "grad_norm": 0.28838820512419444, + "learning_rate": 1.8666647902508493e-05, + "loss": 0.1789, + "step": 341 + }, + { + "epoch": 0.5009154155986818, + "grad_norm": 0.32424956980873243, + "learning_rate": 1.8653845979951577e-05, + "loss": 0.1663, + "step": 342 + }, + { + "epoch": 0.5023800805565727, + "grad_norm": 0.31499571469715115, + "learning_rate": 1.864098732651155e-05, + "loss": 0.1991, + "step": 343 + }, + { + "epoch": 0.5038447455144636, + "grad_norm": 0.3158502822606953, + "learning_rate": 1.8628072026484215e-05, + "loss": 0.2042, + "step": 344 + }, + { + "epoch": 0.5053094104723544, + "grad_norm": 0.3006662825507403, + "learning_rate": 1.8615100164536696e-05, + "loss": 0.1799, + "step": 345 + }, + { + "epoch": 0.5067740754302453, + "grad_norm": 0.3197735718886773, + "learning_rate": 1.8602071825706928e-05, + "loss": 0.2037, + "step": 346 + }, + { + "epoch": 0.5082387403881362, + "grad_norm": 0.29431082437854705, + "learning_rate": 1.858898709540309e-05, + "loss": 0.1624, + "step": 347 + }, + { + "epoch": 0.5097034053460271, + "grad_norm": 0.3319207275942643, + "learning_rate": 1.8575846059403036e-05, + "loss": 0.1781, + "step": 348 + }, + { + "epoch": 0.511168070303918, + "grad_norm": 0.3548295674736163, + "learning_rate": 1.856264880385372e-05, + "loss": 0.1777, + "step": 349 + }, + { + "epoch": 0.5126327352618089, + "grad_norm": 0.3418284407905513, + "learning_rate": 1.8549395415270664e-05, + "loss": 0.1658, + "step": 350 + }, + { + "epoch": 0.5140974002196997, + "grad_norm": 0.28587381220460634, + "learning_rate": 1.8536085980537374e-05, + "loss": 0.1521, + "step": 351 + }, + { + "epoch": 0.5155620651775906, + "grad_norm": 0.28655840819303974, + "learning_rate": 1.8522720586904758e-05, + "loss": 0.177, + "step": 352 + }, + { + "epoch": 0.5170267301354815, + "grad_norm": 0.3101035018861679, + "learning_rate": 1.8509299321990583e-05, + "loss": 0.1728, + "step": 353 + }, + { + "epoch": 0.5184913950933724, + "grad_norm": 0.309153206811744, + "learning_rate": 1.8495822273778867e-05, + "loss": 0.1853, + "step": 354 + }, + { + "epoch": 0.5199560600512633, + "grad_norm": 0.3072944032463391, + "learning_rate": 1.8482289530619332e-05, + "loss": 0.1994, + "step": 355 + }, + { + "epoch": 0.5214207250091542, + "grad_norm": 0.2882455459923482, + "learning_rate": 1.8468701181226803e-05, + "loss": 0.1719, + "step": 356 + }, + { + "epoch": 0.522885389967045, + "grad_norm": 0.3009687596348506, + "learning_rate": 1.8455057314680646e-05, + "loss": 0.1762, + "step": 357 + }, + { + "epoch": 0.5243500549249359, + "grad_norm": 0.28709132379458197, + "learning_rate": 1.8441358020424168e-05, + "loss": 0.1872, + "step": 358 + }, + { + "epoch": 0.5258147198828268, + "grad_norm": 0.3252839006016281, + "learning_rate": 1.8427603388264027e-05, + "loss": 0.1658, + "step": 359 + }, + { + "epoch": 0.5272793848407177, + "grad_norm": 0.2794496573065983, + "learning_rate": 1.8413793508369667e-05, + "loss": 0.1536, + "step": 360 + }, + { + "epoch": 0.5287440497986086, + "grad_norm": 0.295826215328857, + "learning_rate": 1.839992847127271e-05, + "loss": 0.1728, + "step": 361 + }, + { + "epoch": 0.5302087147564994, + "grad_norm": 0.3057004431965132, + "learning_rate": 1.838600836786635e-05, + "loss": 0.1737, + "step": 362 + }, + { + "epoch": 0.5316733797143903, + "grad_norm": 0.3032546345594625, + "learning_rate": 1.8372033289404795e-05, + "loss": 0.1804, + "step": 363 + }, + { + "epoch": 0.5331380446722812, + "grad_norm": 0.31826939348019884, + "learning_rate": 1.835800332750263e-05, + "loss": 0.1567, + "step": 364 + }, + { + "epoch": 0.5346027096301721, + "grad_norm": 0.3382986306853968, + "learning_rate": 1.834391857413423e-05, + "loss": 0.1857, + "step": 365 + }, + { + "epoch": 0.536067374588063, + "grad_norm": 0.29281271654639596, + "learning_rate": 1.8329779121633177e-05, + "loss": 0.1755, + "step": 366 + }, + { + "epoch": 0.5375320395459539, + "grad_norm": 0.32331356405290573, + "learning_rate": 1.8315585062691616e-05, + "loss": 0.174, + "step": 367 + }, + { + "epoch": 0.5389967045038447, + "grad_norm": 0.3058216598921902, + "learning_rate": 1.830133649035968e-05, + "loss": 0.1743, + "step": 368 + }, + { + "epoch": 0.5404613694617356, + "grad_norm": 0.3067354134944423, + "learning_rate": 1.828703349804487e-05, + "loss": 0.1715, + "step": 369 + }, + { + "epoch": 0.5419260344196265, + "grad_norm": 0.36661193734735914, + "learning_rate": 1.8272676179511428e-05, + "loss": 0.1738, + "step": 370 + }, + { + "epoch": 0.5433906993775174, + "grad_norm": 0.30007784767963736, + "learning_rate": 1.8258264628879753e-05, + "loss": 0.1552, + "step": 371 + }, + { + "epoch": 0.5448553643354083, + "grad_norm": 0.3058776353752914, + "learning_rate": 1.8243798940625752e-05, + "loss": 0.1656, + "step": 372 + }, + { + "epoch": 0.5463200292932991, + "grad_norm": 0.2906329929557275, + "learning_rate": 1.8229279209580245e-05, + "loss": 0.1724, + "step": 373 + }, + { + "epoch": 0.54778469425119, + "grad_norm": 0.2764957936178043, + "learning_rate": 1.8214705530928322e-05, + "loss": 0.1569, + "step": 374 + }, + { + "epoch": 0.5492493592090809, + "grad_norm": 0.31882196921773603, + "learning_rate": 1.8200078000208745e-05, + "loss": 0.1618, + "step": 375 + }, + { + "epoch": 0.5507140241669718, + "grad_norm": 0.3031929350492432, + "learning_rate": 1.818539671331329e-05, + "loss": 0.1821, + "step": 376 + }, + { + "epoch": 0.5521786891248627, + "grad_norm": 0.3285177453767559, + "learning_rate": 1.8170661766486147e-05, + "loss": 0.1708, + "step": 377 + }, + { + "epoch": 0.5536433540827536, + "grad_norm": 0.291362505718777, + "learning_rate": 1.815587325632328e-05, + "loss": 0.1678, + "step": 378 + }, + { + "epoch": 0.5551080190406444, + "grad_norm": 0.3154222441804908, + "learning_rate": 1.8141031279771777e-05, + "loss": 0.1712, + "step": 379 + }, + { + "epoch": 0.5565726839985353, + "grad_norm": 0.2924737445300942, + "learning_rate": 1.812613593412924e-05, + "loss": 0.1623, + "step": 380 + }, + { + "epoch": 0.5580373489564262, + "grad_norm": 0.3327921961518248, + "learning_rate": 1.8111187317043136e-05, + "loss": 0.168, + "step": 381 + }, + { + "epoch": 0.5595020139143171, + "grad_norm": 0.3100694105568342, + "learning_rate": 1.8096185526510154e-05, + "loss": 0.1457, + "step": 382 + }, + { + "epoch": 0.560966678872208, + "grad_norm": 0.31380150144375096, + "learning_rate": 1.8081130660875557e-05, + "loss": 0.1712, + "step": 383 + }, + { + "epoch": 0.5624313438300989, + "grad_norm": 0.29763863880909497, + "learning_rate": 1.8066022818832564e-05, + "loss": 0.1712, + "step": 384 + }, + { + "epoch": 0.5638960087879897, + "grad_norm": 0.31673229457527147, + "learning_rate": 1.805086209942166e-05, + "loss": 0.1642, + "step": 385 + }, + { + "epoch": 0.5653606737458806, + "grad_norm": 0.28416394082082747, + "learning_rate": 1.8035648602029997e-05, + "loss": 0.1568, + "step": 386 + }, + { + "epoch": 0.5668253387037715, + "grad_norm": 0.3118223455455188, + "learning_rate": 1.80203824263907e-05, + "loss": 0.2036, + "step": 387 + }, + { + "epoch": 0.5682900036616624, + "grad_norm": 0.31436408212659767, + "learning_rate": 1.8005063672582236e-05, + "loss": 0.1761, + "step": 388 + }, + { + "epoch": 0.5697546686195533, + "grad_norm": 0.30744846641916707, + "learning_rate": 1.7989692441027744e-05, + "loss": 0.153, + "step": 389 + }, + { + "epoch": 0.5712193335774441, + "grad_norm": 0.29045516723158216, + "learning_rate": 1.7974268832494397e-05, + "loss": 0.1449, + "step": 390 + }, + { + "epoch": 0.572683998535335, + "grad_norm": 0.2752010942760226, + "learning_rate": 1.7958792948092726e-05, + "loss": 0.181, + "step": 391 + }, + { + "epoch": 0.5741486634932259, + "grad_norm": 0.3004642528909644, + "learning_rate": 1.7943264889275944e-05, + "loss": 0.167, + "step": 392 + }, + { + "epoch": 0.5756133284511168, + "grad_norm": 0.28533925990840686, + "learning_rate": 1.792768475783932e-05, + "loss": 0.1508, + "step": 393 + }, + { + "epoch": 0.5770779934090077, + "grad_norm": 0.3074271645217073, + "learning_rate": 1.7912052655919478e-05, + "loss": 0.1614, + "step": 394 + }, + { + "epoch": 0.5785426583668986, + "grad_norm": 0.2860515134927835, + "learning_rate": 1.7896368685993738e-05, + "loss": 0.1467, + "step": 395 + }, + { + "epoch": 0.5800073233247894, + "grad_norm": 0.2826797056983111, + "learning_rate": 1.7880632950879438e-05, + "loss": 0.1647, + "step": 396 + }, + { + "epoch": 0.5814719882826803, + "grad_norm": 0.28758395090779254, + "learning_rate": 1.7864845553733276e-05, + "loss": 0.1643, + "step": 397 + }, + { + "epoch": 0.5829366532405712, + "grad_norm": 0.29461983196821356, + "learning_rate": 1.7849006598050626e-05, + "loss": 0.1762, + "step": 398 + }, + { + "epoch": 0.5844013181984621, + "grad_norm": 0.2777593396108342, + "learning_rate": 1.7833116187664846e-05, + "loss": 0.1561, + "step": 399 + }, + { + "epoch": 0.585865983156353, + "grad_norm": 0.30600321149552173, + "learning_rate": 1.781717442674662e-05, + "loss": 0.1777, + "step": 400 + }, + { + "epoch": 0.5873306481142438, + "grad_norm": 0.2758705111537229, + "learning_rate": 1.7801181419803257e-05, + "loss": 0.1556, + "step": 401 + }, + { + "epoch": 0.5887953130721347, + "grad_norm": 0.2625670019142076, + "learning_rate": 1.7785137271678013e-05, + "loss": 0.1468, + "step": 402 + }, + { + "epoch": 0.5902599780300256, + "grad_norm": 0.29391559042563303, + "learning_rate": 1.776904208754941e-05, + "loss": 0.1823, + "step": 403 + }, + { + "epoch": 0.5917246429879165, + "grad_norm": 0.2824926653515285, + "learning_rate": 1.7752895972930538e-05, + "loss": 0.1638, + "step": 404 + }, + { + "epoch": 0.5931893079458074, + "grad_norm": 0.31289373581775576, + "learning_rate": 1.7736699033668353e-05, + "loss": 0.1969, + "step": 405 + }, + { + "epoch": 0.5946539729036983, + "grad_norm": 0.28541795717364044, + "learning_rate": 1.772045137594301e-05, + "loss": 0.1445, + "step": 406 + }, + { + "epoch": 0.5961186378615891, + "grad_norm": 0.30005077280176484, + "learning_rate": 1.770415310626715e-05, + "loss": 0.188, + "step": 407 + }, + { + "epoch": 0.59758330281948, + "grad_norm": 0.27829833597465525, + "learning_rate": 1.7687804331485203e-05, + "loss": 0.196, + "step": 408 + }, + { + "epoch": 0.5990479677773709, + "grad_norm": 0.28631318068873735, + "learning_rate": 1.7671405158772686e-05, + "loss": 0.1624, + "step": 409 + }, + { + "epoch": 0.6005126327352618, + "grad_norm": 0.2867469013067658, + "learning_rate": 1.7654955695635498e-05, + "loss": 0.1839, + "step": 410 + }, + { + "epoch": 0.6019772976931527, + "grad_norm": 0.2737552258396289, + "learning_rate": 1.7638456049909238e-05, + "loss": 0.1553, + "step": 411 + }, + { + "epoch": 0.6034419626510436, + "grad_norm": 0.3136051529335448, + "learning_rate": 1.7621906329758466e-05, + "loss": 0.175, + "step": 412 + }, + { + "epoch": 0.6049066276089344, + "grad_norm": 0.2773613881722616, + "learning_rate": 1.760530664367601e-05, + "loss": 0.1696, + "step": 413 + }, + { + "epoch": 0.6063712925668253, + "grad_norm": 0.29022770109259144, + "learning_rate": 1.758865710048225e-05, + "loss": 0.163, + "step": 414 + }, + { + "epoch": 0.6078359575247162, + "grad_norm": 0.28813354195965973, + "learning_rate": 1.7571957809324422e-05, + "loss": 0.1649, + "step": 415 + }, + { + "epoch": 0.6093006224826071, + "grad_norm": 0.2985551015333967, + "learning_rate": 1.7555208879675875e-05, + "loss": 0.1737, + "step": 416 + }, + { + "epoch": 0.610765287440498, + "grad_norm": 0.31633838685259236, + "learning_rate": 1.7538410421335373e-05, + "loss": 0.1606, + "step": 417 + }, + { + "epoch": 0.6122299523983888, + "grad_norm": 0.29552853603848234, + "learning_rate": 1.752156254442636e-05, + "loss": 0.1824, + "step": 418 + }, + { + "epoch": 0.6136946173562797, + "grad_norm": 0.31144800745719076, + "learning_rate": 1.7504665359396255e-05, + "loss": 0.1501, + "step": 419 + }, + { + "epoch": 0.6151592823141706, + "grad_norm": 0.30007771034473985, + "learning_rate": 1.748771897701572e-05, + "loss": 0.1756, + "step": 420 + }, + { + "epoch": 0.6166239472720615, + "grad_norm": 0.3153299684246117, + "learning_rate": 1.7470723508377935e-05, + "loss": 0.1727, + "step": 421 + }, + { + "epoch": 0.6180886122299524, + "grad_norm": 0.29874905779465544, + "learning_rate": 1.745367906489786e-05, + "loss": 0.1583, + "step": 422 + }, + { + "epoch": 0.6195532771878433, + "grad_norm": 0.313158860779259, + "learning_rate": 1.7436585758311512e-05, + "loss": 0.1776, + "step": 423 + }, + { + "epoch": 0.6210179421457341, + "grad_norm": 0.29027418449570486, + "learning_rate": 1.7419443700675248e-05, + "loss": 0.1615, + "step": 424 + }, + { + "epoch": 0.622482607103625, + "grad_norm": 0.27091112505454484, + "learning_rate": 1.7402253004365007e-05, + "loss": 0.1471, + "step": 425 + }, + { + "epoch": 0.6239472720615159, + "grad_norm": 0.31079312606497544, + "learning_rate": 1.7385013782075575e-05, + "loss": 0.1642, + "step": 426 + }, + { + "epoch": 0.6254119370194068, + "grad_norm": 0.3191231075261632, + "learning_rate": 1.736772614681987e-05, + "loss": 0.1799, + "step": 427 + }, + { + "epoch": 0.6268766019772977, + "grad_norm": 0.29107955830633764, + "learning_rate": 1.7350390211928167e-05, + "loss": 0.1364, + "step": 428 + }, + { + "epoch": 0.6283412669351885, + "grad_norm": 0.28799608478930533, + "learning_rate": 1.7333006091047386e-05, + "loss": 0.1446, + "step": 429 + }, + { + "epoch": 0.6298059318930794, + "grad_norm": 0.2805703556245509, + "learning_rate": 1.7315573898140324e-05, + "loss": 0.1578, + "step": 430 + }, + { + "epoch": 0.6312705968509703, + "grad_norm": 0.3022531763710714, + "learning_rate": 1.7298093747484923e-05, + "loss": 0.1454, + "step": 431 + }, + { + "epoch": 0.6327352618088612, + "grad_norm": 0.29056765252531186, + "learning_rate": 1.7280565753673517e-05, + "loss": 0.1409, + "step": 432 + }, + { + "epoch": 0.6341999267667521, + "grad_norm": 0.28348330072419187, + "learning_rate": 1.7262990031612072e-05, + "loss": 0.1632, + "step": 433 + }, + { + "epoch": 0.635664591724643, + "grad_norm": 0.27849254820160685, + "learning_rate": 1.7245366696519448e-05, + "loss": 0.1382, + "step": 434 + }, + { + "epoch": 0.6371292566825338, + "grad_norm": 0.2849173212539919, + "learning_rate": 1.7227695863926627e-05, + "loss": 0.1404, + "step": 435 + }, + { + "epoch": 0.6385939216404247, + "grad_norm": 0.29763893191664237, + "learning_rate": 1.7209977649675975e-05, + "loss": 0.1699, + "step": 436 + }, + { + "epoch": 0.6400585865983156, + "grad_norm": 0.28500529876174013, + "learning_rate": 1.7192212169920458e-05, + "loss": 0.154, + "step": 437 + }, + { + "epoch": 0.6415232515562065, + "grad_norm": 0.29341182785229564, + "learning_rate": 1.717439954112291e-05, + "loss": 0.1629, + "step": 438 + }, + { + "epoch": 0.6429879165140974, + "grad_norm": 0.2833467699130664, + "learning_rate": 1.7156539880055236e-05, + "loss": 0.1479, + "step": 439 + }, + { + "epoch": 0.6444525814719883, + "grad_norm": 0.29432333457804116, + "learning_rate": 1.7138633303797676e-05, + "loss": 0.1447, + "step": 440 + }, + { + "epoch": 0.6459172464298791, + "grad_norm": 0.2899857698207261, + "learning_rate": 1.712067992973803e-05, + "loss": 0.1707, + "step": 441 + }, + { + "epoch": 0.64738191138777, + "grad_norm": 0.300112032266628, + "learning_rate": 1.710267987557087e-05, + "loss": 0.1441, + "step": 442 + }, + { + "epoch": 0.6488465763456609, + "grad_norm": 0.28676504299001276, + "learning_rate": 1.7084633259296798e-05, + "loss": 0.1418, + "step": 443 + }, + { + "epoch": 0.6503112413035518, + "grad_norm": 0.27688806374870634, + "learning_rate": 1.706654019922164e-05, + "loss": 0.1478, + "step": 444 + }, + { + "epoch": 0.6517759062614427, + "grad_norm": 0.2853894260456318, + "learning_rate": 1.704840081395571e-05, + "loss": 0.1524, + "step": 445 + }, + { + "epoch": 0.6532405712193335, + "grad_norm": 0.2946623888589574, + "learning_rate": 1.703021522241298e-05, + "loss": 0.1471, + "step": 446 + }, + { + "epoch": 0.6547052361772244, + "grad_norm": 0.2726573492109173, + "learning_rate": 1.701198354381036e-05, + "loss": 0.1312, + "step": 447 + }, + { + "epoch": 0.6561699011351153, + "grad_norm": 0.28410262500836964, + "learning_rate": 1.6993705897666873e-05, + "loss": 0.1444, + "step": 448 + }, + { + "epoch": 0.6576345660930062, + "grad_norm": 0.2833743474032485, + "learning_rate": 1.697538240380288e-05, + "loss": 0.1596, + "step": 449 + }, + { + "epoch": 0.6590992310508971, + "grad_norm": 0.28744248608886525, + "learning_rate": 1.695701318233931e-05, + "loss": 0.1696, + "step": 450 + }, + { + "epoch": 0.660563896008788, + "grad_norm": 0.2922684857852237, + "learning_rate": 1.6938598353696864e-05, + "loss": 0.1822, + "step": 451 + }, + { + "epoch": 0.6620285609666788, + "grad_norm": 0.2824755616354114, + "learning_rate": 1.6920138038595214e-05, + "loss": 0.16, + "step": 452 + }, + { + "epoch": 0.6634932259245697, + "grad_norm": 0.2881779281986939, + "learning_rate": 1.6901632358052226e-05, + "loss": 0.1306, + "step": 453 + }, + { + "epoch": 0.6649578908824606, + "grad_norm": 0.3076676560083059, + "learning_rate": 1.6883081433383163e-05, + "loss": 0.1693, + "step": 454 + }, + { + "epoch": 0.6664225558403515, + "grad_norm": 0.28360460469529464, + "learning_rate": 1.6864485386199895e-05, + "loss": 0.1347, + "step": 455 + }, + { + "epoch": 0.6678872207982424, + "grad_norm": 0.2792514596107041, + "learning_rate": 1.6845844338410077e-05, + "loss": 0.1584, + "step": 456 + }, + { + "epoch": 0.6693518857561332, + "grad_norm": 0.27734844058934455, + "learning_rate": 1.6827158412216396e-05, + "loss": 0.1489, + "step": 457 + }, + { + "epoch": 0.6708165507140241, + "grad_norm": 0.2868505259889296, + "learning_rate": 1.6808427730115716e-05, + "loss": 0.1556, + "step": 458 + }, + { + "epoch": 0.672281215671915, + "grad_norm": 0.27438002988710974, + "learning_rate": 1.6789652414898315e-05, + "loss": 0.1594, + "step": 459 + }, + { + "epoch": 0.6737458806298059, + "grad_norm": 0.26963685143388444, + "learning_rate": 1.677083258964707e-05, + "loss": 0.1544, + "step": 460 + }, + { + "epoch": 0.6752105455876968, + "grad_norm": 0.27489622544391323, + "learning_rate": 1.675196837773664e-05, + "loss": 0.1779, + "step": 461 + }, + { + "epoch": 0.6766752105455877, + "grad_norm": 0.28666936527106884, + "learning_rate": 1.673305990283266e-05, + "loss": 0.1389, + "step": 462 + }, + { + "epoch": 0.6781398755034785, + "grad_norm": 0.2854502743190489, + "learning_rate": 1.6714107288890943e-05, + "loss": 0.1518, + "step": 463 + }, + { + "epoch": 0.6796045404613694, + "grad_norm": 0.28486649873799424, + "learning_rate": 1.6695110660156652e-05, + "loss": 0.1356, + "step": 464 + }, + { + "epoch": 0.6810692054192603, + "grad_norm": 0.3095642301969388, + "learning_rate": 1.6676070141163498e-05, + "loss": 0.1676, + "step": 465 + }, + { + "epoch": 0.6825338703771512, + "grad_norm": 0.3135455510235421, + "learning_rate": 1.665698585673291e-05, + "loss": 0.1606, + "step": 466 + }, + { + "epoch": 0.6839985353350421, + "grad_norm": 0.28266851700362705, + "learning_rate": 1.6637857931973233e-05, + "loss": 0.1345, + "step": 467 + }, + { + "epoch": 0.685463200292933, + "grad_norm": 0.27779394936697266, + "learning_rate": 1.6618686492278892e-05, + "loss": 0.1682, + "step": 468 + }, + { + "epoch": 0.6869278652508238, + "grad_norm": 0.27567920609962066, + "learning_rate": 1.6599471663329577e-05, + "loss": 0.1334, + "step": 469 + }, + { + "epoch": 0.6883925302087147, + "grad_norm": 0.2754295653847063, + "learning_rate": 1.6580213571089427e-05, + "loss": 0.1573, + "step": 470 + }, + { + "epoch": 0.6898571951666056, + "grad_norm": 0.2750050531478153, + "learning_rate": 1.656091234180619e-05, + "loss": 0.1773, + "step": 471 + }, + { + "epoch": 0.6913218601244965, + "grad_norm": 0.27965457420033163, + "learning_rate": 1.65415681020104e-05, + "loss": 0.1369, + "step": 472 + }, + { + "epoch": 0.6927865250823874, + "grad_norm": 0.27090231974844964, + "learning_rate": 1.6522180978514556e-05, + "loss": 0.1375, + "step": 473 + }, + { + "epoch": 0.6942511900402782, + "grad_norm": 0.2824639859361883, + "learning_rate": 1.6502751098412282e-05, + "loss": 0.1246, + "step": 474 + }, + { + "epoch": 0.6957158549981691, + "grad_norm": 0.28655833648300955, + "learning_rate": 1.648327858907749e-05, + "loss": 0.1648, + "step": 475 + }, + { + "epoch": 0.69718051995606, + "grad_norm": 0.27502131490113785, + "learning_rate": 1.6463763578163563e-05, + "loss": 0.1501, + "step": 476 + }, + { + "epoch": 0.6986451849139509, + "grad_norm": 0.26898509925311315, + "learning_rate": 1.6444206193602493e-05, + "loss": 0.1499, + "step": 477 + }, + { + "epoch": 0.7001098498718418, + "grad_norm": 0.3350314192757342, + "learning_rate": 1.642460656360406e-05, + "loss": 0.1572, + "step": 478 + }, + { + "epoch": 0.7015745148297327, + "grad_norm": 0.2773978701354445, + "learning_rate": 1.6404964816654993e-05, + "loss": 0.1272, + "step": 479 + }, + { + "epoch": 0.7030391797876235, + "grad_norm": 0.2901786822045925, + "learning_rate": 1.638528108151811e-05, + "loss": 0.1892, + "step": 480 + }, + { + "epoch": 0.7045038447455144, + "grad_norm": 0.2847317225297176, + "learning_rate": 1.63655554872315e-05, + "loss": 0.1579, + "step": 481 + }, + { + "epoch": 0.7059685097034053, + "grad_norm": 0.2760852156995962, + "learning_rate": 1.6345788163107645e-05, + "loss": 0.1411, + "step": 482 + }, + { + "epoch": 0.7074331746612962, + "grad_norm": 0.2812021526710764, + "learning_rate": 1.6325979238732606e-05, + "loss": 0.1416, + "step": 483 + }, + { + "epoch": 0.7088978396191871, + "grad_norm": 0.27495150899008586, + "learning_rate": 1.630612884396515e-05, + "loss": 0.1457, + "step": 484 + }, + { + "epoch": 0.710362504577078, + "grad_norm": 0.27821852042543493, + "learning_rate": 1.62862371089359e-05, + "loss": 0.1409, + "step": 485 + }, + { + "epoch": 0.7118271695349688, + "grad_norm": 0.26154961742164873, + "learning_rate": 1.6266304164046505e-05, + "loss": 0.1392, + "step": 486 + }, + { + "epoch": 0.7132918344928597, + "grad_norm": 0.3344656299415514, + "learning_rate": 1.6246330139968748e-05, + "loss": 0.1395, + "step": 487 + }, + { + "epoch": 0.7147564994507506, + "grad_norm": 0.2889353353792984, + "learning_rate": 1.6226315167643723e-05, + "loss": 0.1377, + "step": 488 + }, + { + "epoch": 0.7162211644086415, + "grad_norm": 0.2866205322545156, + "learning_rate": 1.6206259378280956e-05, + "loss": 0.1339, + "step": 489 + }, + { + "epoch": 0.7176858293665324, + "grad_norm": 0.2635137582733423, + "learning_rate": 1.6186162903357562e-05, + "loss": 0.1384, + "step": 490 + }, + { + "epoch": 0.7191504943244232, + "grad_norm": 0.264894099077168, + "learning_rate": 1.616602587461736e-05, + "loss": 0.1415, + "step": 491 + }, + { + "epoch": 0.7206151592823141, + "grad_norm": 0.2912590760990707, + "learning_rate": 1.6145848424070032e-05, + "loss": 0.1377, + "step": 492 + }, + { + "epoch": 0.722079824240205, + "grad_norm": 0.28322985703286563, + "learning_rate": 1.612563068399024e-05, + "loss": 0.1381, + "step": 493 + }, + { + "epoch": 0.7235444891980959, + "grad_norm": 0.2761592201460229, + "learning_rate": 1.6105372786916776e-05, + "loss": 0.1205, + "step": 494 + }, + { + "epoch": 0.7250091541559868, + "grad_norm": 0.2593621751501278, + "learning_rate": 1.6085074865651672e-05, + "loss": 0.1396, + "step": 495 + }, + { + "epoch": 0.7264738191138777, + "grad_norm": 0.2698751624804496, + "learning_rate": 1.6064737053259355e-05, + "loss": 0.1549, + "step": 496 + }, + { + "epoch": 0.7279384840717685, + "grad_norm": 0.2784921841300199, + "learning_rate": 1.604435948306575e-05, + "loss": 0.142, + "step": 497 + }, + { + "epoch": 0.7294031490296594, + "grad_norm": 0.282990933381995, + "learning_rate": 1.6023942288657423e-05, + "loss": 0.1532, + "step": 498 + }, + { + "epoch": 0.7308678139875503, + "grad_norm": 0.2810338678318928, + "learning_rate": 1.60034856038807e-05, + "loss": 0.1379, + "step": 499 + }, + { + "epoch": 0.7323324789454412, + "grad_norm": 0.2964732663844032, + "learning_rate": 1.5982989562840785e-05, + "loss": 0.1324, + "step": 500 + }, + { + "epoch": 0.7337971439033321, + "grad_norm": 0.2943783008884028, + "learning_rate": 1.596245429990088e-05, + "loss": 0.143, + "step": 501 + }, + { + "epoch": 0.7352618088612229, + "grad_norm": 0.292096480725846, + "learning_rate": 1.5941879949681323e-05, + "loss": 0.1466, + "step": 502 + }, + { + "epoch": 0.7367264738191138, + "grad_norm": 0.28475675120953237, + "learning_rate": 1.5921266647058683e-05, + "loss": 0.1573, + "step": 503 + }, + { + "epoch": 0.7381911387770047, + "grad_norm": 0.2741769477206498, + "learning_rate": 1.5900614527164876e-05, + "loss": 0.1485, + "step": 504 + }, + { + "epoch": 0.7396558037348956, + "grad_norm": 0.3040355479807483, + "learning_rate": 1.5879923725386307e-05, + "loss": 0.1642, + "step": 505 + }, + { + "epoch": 0.7411204686927865, + "grad_norm": 0.3037335266933942, + "learning_rate": 1.5859194377362942e-05, + "loss": 0.1497, + "step": 506 + }, + { + "epoch": 0.7425851336506774, + "grad_norm": 0.2791249180827828, + "learning_rate": 1.5838426618987455e-05, + "loss": 0.1521, + "step": 507 + }, + { + "epoch": 0.7440497986085682, + "grad_norm": 0.281309610689647, + "learning_rate": 1.5817620586404315e-05, + "loss": 0.1346, + "step": 508 + }, + { + "epoch": 0.7455144635664591, + "grad_norm": 0.28553330164537244, + "learning_rate": 1.5796776416008897e-05, + "loss": 0.1446, + "step": 509 + }, + { + "epoch": 0.74697912852435, + "grad_norm": 0.2844881306714679, + "learning_rate": 1.5775894244446603e-05, + "loss": 0.1414, + "step": 510 + }, + { + "epoch": 0.7484437934822409, + "grad_norm": 0.268988666507586, + "learning_rate": 1.575497420861194e-05, + "loss": 0.1288, + "step": 511 + }, + { + "epoch": 0.7499084584401318, + "grad_norm": 0.27447838457593043, + "learning_rate": 1.573401644564764e-05, + "loss": 0.1572, + "step": 512 + }, + { + "epoch": 0.7513731233980228, + "grad_norm": 0.2973833246231482, + "learning_rate": 1.571302109294377e-05, + "loss": 0.1406, + "step": 513 + }, + { + "epoch": 0.7528377883559136, + "grad_norm": 0.27289540026887815, + "learning_rate": 1.569198828813681e-05, + "loss": 0.1392, + "step": 514 + }, + { + "epoch": 0.7543024533138045, + "grad_norm": 0.2870617946282251, + "learning_rate": 1.567091816910875e-05, + "loss": 0.1543, + "step": 515 + }, + { + "epoch": 0.7557671182716954, + "grad_norm": 0.27283398029853356, + "learning_rate": 1.5649810873986214e-05, + "loss": 0.1276, + "step": 516 + }, + { + "epoch": 0.7572317832295863, + "grad_norm": 0.2637139762799822, + "learning_rate": 1.5628666541139523e-05, + "loss": 0.1355, + "step": 517 + }, + { + "epoch": 0.7586964481874772, + "grad_norm": 0.27854483215025555, + "learning_rate": 1.5607485309181813e-05, + "loss": 0.1483, + "step": 518 + }, + { + "epoch": 0.760161113145368, + "grad_norm": 0.28494692420422013, + "learning_rate": 1.55862673169681e-05, + "loss": 0.1356, + "step": 519 + }, + { + "epoch": 0.7616257781032589, + "grad_norm": 0.2898114191567579, + "learning_rate": 1.5565012703594403e-05, + "loss": 0.141, + "step": 520 + }, + { + "epoch": 0.7630904430611498, + "grad_norm": 0.2822142264481914, + "learning_rate": 1.55437216083968e-05, + "loss": 0.1311, + "step": 521 + }, + { + "epoch": 0.7645551080190407, + "grad_norm": 0.25757582269462975, + "learning_rate": 1.552239417095052e-05, + "loss": 0.1361, + "step": 522 + }, + { + "epoch": 0.7660197729769316, + "grad_norm": 0.2694585038356422, + "learning_rate": 1.5501030531069066e-05, + "loss": 0.1405, + "step": 523 + }, + { + "epoch": 0.7674844379348225, + "grad_norm": 0.27818748796170395, + "learning_rate": 1.5479630828803235e-05, + "loss": 0.1414, + "step": 524 + }, + { + "epoch": 0.7689491028927133, + "grad_norm": 0.2892622203178171, + "learning_rate": 1.5458195204440255e-05, + "loss": 0.1387, + "step": 525 + }, + { + "epoch": 0.7704137678506042, + "grad_norm": 0.2765799688074329, + "learning_rate": 1.5436723798502842e-05, + "loss": 0.1453, + "step": 526 + }, + { + "epoch": 0.7718784328084951, + "grad_norm": 0.27743642550569964, + "learning_rate": 1.5415216751748264e-05, + "loss": 0.1326, + "step": 527 + }, + { + "epoch": 0.773343097766386, + "grad_norm": 0.2602739542105575, + "learning_rate": 1.5393674205167453e-05, + "loss": 0.1329, + "step": 528 + }, + { + "epoch": 0.7748077627242769, + "grad_norm": 0.2673531017304055, + "learning_rate": 1.5372096299984064e-05, + "loss": 0.1343, + "step": 529 + }, + { + "epoch": 0.7762724276821678, + "grad_norm": 0.29845215607643555, + "learning_rate": 1.5350483177653528e-05, + "loss": 0.1522, + "step": 530 + }, + { + "epoch": 0.7777370926400586, + "grad_norm": 0.2725973305117268, + "learning_rate": 1.5328834979862158e-05, + "loss": 0.13, + "step": 531 + }, + { + "epoch": 0.7792017575979495, + "grad_norm": 0.263213840818661, + "learning_rate": 1.5307151848526213e-05, + "loss": 0.1246, + "step": 532 + }, + { + "epoch": 0.7806664225558404, + "grad_norm": 0.2551375025943304, + "learning_rate": 1.5285433925790946e-05, + "loss": 0.1111, + "step": 533 + }, + { + "epoch": 0.7821310875137313, + "grad_norm": 0.2622331657629095, + "learning_rate": 1.5263681354029694e-05, + "loss": 0.1351, + "step": 534 + }, + { + "epoch": 0.7835957524716222, + "grad_norm": 0.3124931282937266, + "learning_rate": 1.5241894275842946e-05, + "loss": 0.1417, + "step": 535 + }, + { + "epoch": 0.785060417429513, + "grad_norm": 0.30036829759264483, + "learning_rate": 1.5220072834057387e-05, + "loss": 0.1363, + "step": 536 + }, + { + "epoch": 0.7865250823874039, + "grad_norm": 0.2596328178928325, + "learning_rate": 1.5198217171724982e-05, + "loss": 0.1214, + "step": 537 + }, + { + "epoch": 0.7879897473452948, + "grad_norm": 0.2805919354886981, + "learning_rate": 1.5176327432122028e-05, + "loss": 0.1268, + "step": 538 + }, + { + "epoch": 0.7894544123031857, + "grad_norm": 0.2771437475611675, + "learning_rate": 1.5154403758748228e-05, + "loss": 0.1308, + "step": 539 + }, + { + "epoch": 0.7909190772610766, + "grad_norm": 0.2731861706982954, + "learning_rate": 1.5132446295325722e-05, + "loss": 0.128, + "step": 540 + }, + { + "epoch": 0.7923837422189675, + "grad_norm": 0.3430389886450956, + "learning_rate": 1.511045518579818e-05, + "loss": 0.1374, + "step": 541 + }, + { + "epoch": 0.7938484071768583, + "grad_norm": 0.2905683813596674, + "learning_rate": 1.5088430574329836e-05, + "loss": 0.1332, + "step": 542 + }, + { + "epoch": 0.7953130721347492, + "grad_norm": 0.28971874507270196, + "learning_rate": 1.5066372605304537e-05, + "loss": 0.1358, + "step": 543 + }, + { + "epoch": 0.7967777370926401, + "grad_norm": 0.2629369085736157, + "learning_rate": 1.5044281423324826e-05, + "loss": 0.1305, + "step": 544 + }, + { + "epoch": 0.798242402050531, + "grad_norm": 0.2624966808667796, + "learning_rate": 1.5022157173210969e-05, + "loss": 0.1386, + "step": 545 + }, + { + "epoch": 0.7997070670084219, + "grad_norm": 0.27495723120059107, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.1261, + "step": 546 + }, + { + "epoch": 0.8011717319663128, + "grad_norm": 0.2968841863976358, + "learning_rate": 1.4977810048944806e-05, + "loss": 0.1438, + "step": 547 + }, + { + "epoch": 0.8026363969242036, + "grad_norm": 0.26619846720879786, + "learning_rate": 1.495558746551313e-05, + "loss": 0.139, + "step": 548 + }, + { + "epoch": 0.8041010618820945, + "grad_norm": 0.2733899438656217, + "learning_rate": 1.4933332395386652e-05, + "loss": 0.1394, + "step": 549 + }, + { + "epoch": 0.8055657268399854, + "grad_norm": 0.27528673995767655, + "learning_rate": 1.4911044984460015e-05, + "loss": 0.1235, + "step": 550 + }, + { + "epoch": 0.8070303917978763, + "grad_norm": 0.27120553519443596, + "learning_rate": 1.4888725378839877e-05, + "loss": 0.1389, + "step": 551 + }, + { + "epoch": 0.8084950567557672, + "grad_norm": 0.26970265074149985, + "learning_rate": 1.4866373724843945e-05, + "loss": 0.1381, + "step": 552 + }, + { + "epoch": 0.809959721713658, + "grad_norm": 0.29150434298475913, + "learning_rate": 1.484399016900003e-05, + "loss": 0.1345, + "step": 553 + }, + { + "epoch": 0.8114243866715489, + "grad_norm": 0.29587722131690225, + "learning_rate": 1.4821574858045073e-05, + "loss": 0.1356, + "step": 554 + }, + { + "epoch": 0.8128890516294398, + "grad_norm": 0.26978671672533217, + "learning_rate": 1.479912793892419e-05, + "loss": 0.1363, + "step": 555 + }, + { + "epoch": 0.8143537165873307, + "grad_norm": 0.26428779798336755, + "learning_rate": 1.4776649558789698e-05, + "loss": 0.1545, + "step": 556 + }, + { + "epoch": 0.8158183815452216, + "grad_norm": 0.2715765644749667, + "learning_rate": 1.475413986500017e-05, + "loss": 0.1437, + "step": 557 + }, + { + "epoch": 0.8172830465031125, + "grad_norm": 0.2626982755104111, + "learning_rate": 1.4731599005119454e-05, + "loss": 0.1277, + "step": 558 + }, + { + "epoch": 0.8187477114610033, + "grad_norm": 0.27082844640160836, + "learning_rate": 1.470902712691571e-05, + "loss": 0.1111, + "step": 559 + }, + { + "epoch": 0.8202123764188942, + "grad_norm": 0.2668734287792547, + "learning_rate": 1.4686424378360434e-05, + "loss": 0.1312, + "step": 560 + }, + { + "epoch": 0.8216770413767851, + "grad_norm": 0.2869401318531047, + "learning_rate": 1.4663790907627502e-05, + "loss": 0.1509, + "step": 561 + }, + { + "epoch": 0.823141706334676, + "grad_norm": 0.2773427427864892, + "learning_rate": 1.4641126863092194e-05, + "loss": 0.1577, + "step": 562 + }, + { + "epoch": 0.8246063712925669, + "grad_norm": 0.2717284080506168, + "learning_rate": 1.4618432393330211e-05, + "loss": 0.1254, + "step": 563 + }, + { + "epoch": 0.8260710362504577, + "grad_norm": 0.2919980547507266, + "learning_rate": 1.4595707647116713e-05, + "loss": 0.158, + "step": 564 + }, + { + "epoch": 0.8275357012083486, + "grad_norm": 0.27833252973367867, + "learning_rate": 1.4572952773425335e-05, + "loss": 0.1252, + "step": 565 + }, + { + "epoch": 0.8290003661662395, + "grad_norm": 0.2850694697710808, + "learning_rate": 1.455016792142722e-05, + "loss": 0.1333, + "step": 566 + }, + { + "epoch": 0.8304650311241304, + "grad_norm": 0.26558195129076256, + "learning_rate": 1.4527353240490039e-05, + "loss": 0.1349, + "step": 567 + }, + { + "epoch": 0.8319296960820213, + "grad_norm": 0.2944649491485909, + "learning_rate": 1.4504508880176996e-05, + "loss": 0.1528, + "step": 568 + }, + { + "epoch": 0.8333943610399122, + "grad_norm": 0.2578991107569846, + "learning_rate": 1.4481634990245871e-05, + "loss": 0.1225, + "step": 569 + }, + { + "epoch": 0.834859025997803, + "grad_norm": 0.33398387092007636, + "learning_rate": 1.4458731720648024e-05, + "loss": 0.1459, + "step": 570 + }, + { + "epoch": 0.8363236909556939, + "grad_norm": 0.29274576860822454, + "learning_rate": 1.4435799221527417e-05, + "loss": 0.1377, + "step": 571 + }, + { + "epoch": 0.8377883559135848, + "grad_norm": 0.2865214436536306, + "learning_rate": 1.4412837643219625e-05, + "loss": 0.1416, + "step": 572 + }, + { + "epoch": 0.8392530208714757, + "grad_norm": 0.2803403830745175, + "learning_rate": 1.4389847136250858e-05, + "loss": 0.1291, + "step": 573 + }, + { + "epoch": 0.8407176858293666, + "grad_norm": 0.2692461377236704, + "learning_rate": 1.4366827851336964e-05, + "loss": 0.1251, + "step": 574 + }, + { + "epoch": 0.8421823507872574, + "grad_norm": 0.2626470317555678, + "learning_rate": 1.4343779939382451e-05, + "loss": 0.1111, + "step": 575 + }, + { + "epoch": 0.8436470157451483, + "grad_norm": 0.28937945369153595, + "learning_rate": 1.4320703551479494e-05, + "loss": 0.1305, + "step": 576 + }, + { + "epoch": 0.8451116807030392, + "grad_norm": 0.30541326375044037, + "learning_rate": 1.4297598838906938e-05, + "loss": 0.1502, + "step": 577 + }, + { + "epoch": 0.8465763456609301, + "grad_norm": 0.2796697681217199, + "learning_rate": 1.4274465953129326e-05, + "loss": 0.1236, + "step": 578 + }, + { + "epoch": 0.848041010618821, + "grad_norm": 0.2614920930808128, + "learning_rate": 1.4251305045795874e-05, + "loss": 0.1126, + "step": 579 + }, + { + "epoch": 0.8495056755767119, + "grad_norm": 0.28511419468497884, + "learning_rate": 1.422811626873951e-05, + "loss": 0.1285, + "step": 580 + }, + { + "epoch": 0.8509703405346027, + "grad_norm": 0.2707984473087975, + "learning_rate": 1.4204899773975855e-05, + "loss": 0.1436, + "step": 581 + }, + { + "epoch": 0.8524350054924936, + "grad_norm": 0.281582631209825, + "learning_rate": 1.4181655713702242e-05, + "loss": 0.1297, + "step": 582 + }, + { + "epoch": 0.8538996704503845, + "grad_norm": 0.27592043627026036, + "learning_rate": 1.4158384240296707e-05, + "loss": 0.1255, + "step": 583 + }, + { + "epoch": 0.8553643354082754, + "grad_norm": 0.28772169132389885, + "learning_rate": 1.4135085506316997e-05, + "loss": 0.1504, + "step": 584 + }, + { + "epoch": 0.8568290003661663, + "grad_norm": 0.26245407957464945, + "learning_rate": 1.4111759664499562e-05, + "loss": 0.1131, + "step": 585 + }, + { + "epoch": 0.8582936653240572, + "grad_norm": 0.25257520518661797, + "learning_rate": 1.4088406867758573e-05, + "loss": 0.1235, + "step": 586 + }, + { + "epoch": 0.859758330281948, + "grad_norm": 0.26981349513085906, + "learning_rate": 1.4065027269184888e-05, + "loss": 0.1429, + "step": 587 + }, + { + "epoch": 0.8612229952398389, + "grad_norm": 0.2734637683112845, + "learning_rate": 1.404162102204508e-05, + "loss": 0.129, + "step": 588 + }, + { + "epoch": 0.8626876601977298, + "grad_norm": 0.30246081365181193, + "learning_rate": 1.4018188279780412e-05, + "loss": 0.1314, + "step": 589 + }, + { + "epoch": 0.8641523251556207, + "grad_norm": 0.287487018814624, + "learning_rate": 1.3994729196005839e-05, + "loss": 0.1259, + "step": 590 + }, + { + "epoch": 0.8656169901135116, + "grad_norm": 0.27210880904989876, + "learning_rate": 1.3971243924508996e-05, + "loss": 0.1178, + "step": 591 + }, + { + "epoch": 0.8670816550714024, + "grad_norm": 0.29342064785517297, + "learning_rate": 1.3947732619249206e-05, + "loss": 0.1341, + "step": 592 + }, + { + "epoch": 0.8685463200292933, + "grad_norm": 0.27972993450917555, + "learning_rate": 1.3924195434356443e-05, + "loss": 0.1313, + "step": 593 + }, + { + "epoch": 0.8700109849871842, + "grad_norm": 0.24692549028529914, + "learning_rate": 1.3900632524130343e-05, + "loss": 0.1233, + "step": 594 + }, + { + "epoch": 0.8714756499450751, + "grad_norm": 0.32938798959658994, + "learning_rate": 1.3877044043039189e-05, + "loss": 0.1561, + "step": 595 + }, + { + "epoch": 0.872940314902966, + "grad_norm": 0.27405616789978654, + "learning_rate": 1.3853430145718892e-05, + "loss": 0.1361, + "step": 596 + }, + { + "epoch": 0.8744049798608569, + "grad_norm": 0.2846698580131875, + "learning_rate": 1.382979098697198e-05, + "loss": 0.1505, + "step": 597 + }, + { + "epoch": 0.8758696448187477, + "grad_norm": 0.2886024048200631, + "learning_rate": 1.3806126721766586e-05, + "loss": 0.1421, + "step": 598 + }, + { + "epoch": 0.8773343097766386, + "grad_norm": 0.24843406819575536, + "learning_rate": 1.378243750523543e-05, + "loss": 0.1166, + "step": 599 + }, + { + "epoch": 0.8787989747345295, + "grad_norm": 0.29136988530850255, + "learning_rate": 1.3758723492674803e-05, + "loss": 0.163, + "step": 600 + }, + { + "epoch": 0.8802636396924204, + "grad_norm": 0.27987479624274275, + "learning_rate": 1.3734984839543547e-05, + "loss": 0.1302, + "step": 601 + }, + { + "epoch": 0.8817283046503113, + "grad_norm": 0.2766713098966634, + "learning_rate": 1.3711221701462037e-05, + "loss": 0.1264, + "step": 602 + }, + { + "epoch": 0.8831929696082021, + "grad_norm": 0.281327006141709, + "learning_rate": 1.368743423421116e-05, + "loss": 0.1433, + "step": 603 + }, + { + "epoch": 0.884657634566093, + "grad_norm": 0.2819498194158381, + "learning_rate": 1.3663622593731294e-05, + "loss": 0.1315, + "step": 604 + }, + { + "epoch": 0.8861222995239839, + "grad_norm": 0.2869700943965007, + "learning_rate": 1.3639786936121287e-05, + "loss": 0.1313, + "step": 605 + }, + { + "epoch": 0.8875869644818748, + "grad_norm": 0.28257828430613197, + "learning_rate": 1.3615927417637435e-05, + "loss": 0.1327, + "step": 606 + }, + { + "epoch": 0.8890516294397657, + "grad_norm": 0.2983987956601424, + "learning_rate": 1.3592044194692456e-05, + "loss": 0.1435, + "step": 607 + }, + { + "epoch": 0.8905162943976566, + "grad_norm": 0.2900543426098001, + "learning_rate": 1.356813742385446e-05, + "loss": 0.1366, + "step": 608 + }, + { + "epoch": 0.8919809593555474, + "grad_norm": 0.27572519984679683, + "learning_rate": 1.3544207261845928e-05, + "loss": 0.1373, + "step": 609 + }, + { + "epoch": 0.8934456243134383, + "grad_norm": 0.2721425773906768, + "learning_rate": 1.3520253865542687e-05, + "loss": 0.1223, + "step": 610 + }, + { + "epoch": 0.8949102892713292, + "grad_norm": 0.282049356782574, + "learning_rate": 1.3496277391972874e-05, + "loss": 0.1192, + "step": 611 + }, + { + "epoch": 0.8963749542292201, + "grad_norm": 0.2681081671045421, + "learning_rate": 1.3472277998315915e-05, + "loss": 0.1279, + "step": 612 + }, + { + "epoch": 0.897839619187111, + "grad_norm": 0.276619227889388, + "learning_rate": 1.3448255841901481e-05, + "loss": 0.1327, + "step": 613 + }, + { + "epoch": 0.8993042841450019, + "grad_norm": 0.28855198625509537, + "learning_rate": 1.3424211080208478e-05, + "loss": 0.1249, + "step": 614 + }, + { + "epoch": 0.9007689491028927, + "grad_norm": 0.25545241022812626, + "learning_rate": 1.3400143870863997e-05, + "loss": 0.1053, + "step": 615 + }, + { + "epoch": 0.9022336140607836, + "grad_norm": 0.24939455623227066, + "learning_rate": 1.3376054371642282e-05, + "loss": 0.1172, + "step": 616 + }, + { + "epoch": 0.9036982790186745, + "grad_norm": 0.2890615502578014, + "learning_rate": 1.3351942740463705e-05, + "loss": 0.1368, + "step": 617 + }, + { + "epoch": 0.9051629439765654, + "grad_norm": 0.2566294230704182, + "learning_rate": 1.3327809135393728e-05, + "loss": 0.1337, + "step": 618 + }, + { + "epoch": 0.9066276089344563, + "grad_norm": 0.2902103194088055, + "learning_rate": 1.3303653714641853e-05, + "loss": 0.1251, + "step": 619 + }, + { + "epoch": 0.9080922738923471, + "grad_norm": 0.2666865578103702, + "learning_rate": 1.3279476636560608e-05, + "loss": 0.1201, + "step": 620 + }, + { + "epoch": 0.909556938850238, + "grad_norm": 0.2618945176801082, + "learning_rate": 1.3255278059644496e-05, + "loss": 0.1145, + "step": 621 + }, + { + "epoch": 0.9110216038081289, + "grad_norm": 0.31710586966811777, + "learning_rate": 1.323105814252895e-05, + "loss": 0.1497, + "step": 622 + }, + { + "epoch": 0.9124862687660198, + "grad_norm": 0.29714914241544244, + "learning_rate": 1.3206817043989301e-05, + "loss": 0.1297, + "step": 623 + }, + { + "epoch": 0.9139509337239107, + "grad_norm": 0.26025416086413616, + "learning_rate": 1.3182554922939748e-05, + "loss": 0.1069, + "step": 624 + }, + { + "epoch": 0.9154155986818016, + "grad_norm": 0.2983087250370599, + "learning_rate": 1.3158271938432288e-05, + "loss": 0.1285, + "step": 625 + }, + { + "epoch": 0.9168802636396924, + "grad_norm": 0.27517063473066206, + "learning_rate": 1.3133968249655701e-05, + "loss": 0.1375, + "step": 626 + }, + { + "epoch": 0.9183449285975833, + "grad_norm": 0.2659448809334693, + "learning_rate": 1.3109644015934493e-05, + "loss": 0.1144, + "step": 627 + }, + { + "epoch": 0.9198095935554742, + "grad_norm": 0.31062215771722046, + "learning_rate": 1.3085299396727851e-05, + "loss": 0.1285, + "step": 628 + }, + { + "epoch": 0.9212742585133651, + "grad_norm": 0.2555958267399944, + "learning_rate": 1.3060934551628603e-05, + "loss": 0.1111, + "step": 629 + }, + { + "epoch": 0.922738923471256, + "grad_norm": 0.25825461588582355, + "learning_rate": 1.3036549640362169e-05, + "loss": 0.1153, + "step": 630 + }, + { + "epoch": 0.9242035884291468, + "grad_norm": 0.270581810873964, + "learning_rate": 1.301214482278551e-05, + "loss": 0.1243, + "step": 631 + }, + { + "epoch": 0.9256682533870377, + "grad_norm": 0.2632208259144816, + "learning_rate": 1.2987720258886094e-05, + "loss": 0.1098, + "step": 632 + }, + { + "epoch": 0.9271329183449286, + "grad_norm": 0.2680331255251313, + "learning_rate": 1.2963276108780829e-05, + "loss": 0.1286, + "step": 633 + }, + { + "epoch": 0.9285975833028195, + "grad_norm": 0.293030113117047, + "learning_rate": 1.293881253271502e-05, + "loss": 0.1559, + "step": 634 + }, + { + "epoch": 0.9300622482607104, + "grad_norm": 0.27474296914093777, + "learning_rate": 1.2914329691061327e-05, + "loss": 0.1144, + "step": 635 + }, + { + "epoch": 0.9315269132186013, + "grad_norm": 0.2738691898038605, + "learning_rate": 1.2889827744318705e-05, + "loss": 0.1159, + "step": 636 + }, + { + "epoch": 0.9329915781764921, + "grad_norm": 0.2926627981538906, + "learning_rate": 1.286530685311135e-05, + "loss": 0.1476, + "step": 637 + }, + { + "epoch": 0.934456243134383, + "grad_norm": 0.2542806182987207, + "learning_rate": 1.2840767178187657e-05, + "loss": 0.1076, + "step": 638 + }, + { + "epoch": 0.9359209080922739, + "grad_norm": 0.2752287662506346, + "learning_rate": 1.281620888041915e-05, + "loss": 0.1369, + "step": 639 + }, + { + "epoch": 0.9373855730501648, + "grad_norm": 0.251869065832337, + "learning_rate": 1.279163212079944e-05, + "loss": 0.116, + "step": 640 + }, + { + "epoch": 0.9388502380080557, + "grad_norm": 0.2609971662177529, + "learning_rate": 1.2767037060443173e-05, + "loss": 0.1058, + "step": 641 + }, + { + "epoch": 0.9403149029659466, + "grad_norm": 0.26794778938799796, + "learning_rate": 1.2742423860584954e-05, + "loss": 0.1212, + "step": 642 + }, + { + "epoch": 0.9417795679238374, + "grad_norm": 0.3047657017773294, + "learning_rate": 1.271779268257831e-05, + "loss": 0.1339, + "step": 643 + }, + { + "epoch": 0.9432442328817283, + "grad_norm": 0.2897849310784945, + "learning_rate": 1.269314368789463e-05, + "loss": 0.1084, + "step": 644 + }, + { + "epoch": 0.9447088978396192, + "grad_norm": 0.29786222980952776, + "learning_rate": 1.266847703812209e-05, + "loss": 0.1106, + "step": 645 + }, + { + "epoch": 0.9461735627975101, + "grad_norm": 0.2642203247056678, + "learning_rate": 1.2643792894964611e-05, + "loss": 0.1053, + "step": 646 + }, + { + "epoch": 0.947638227755401, + "grad_norm": 0.28156653824521066, + "learning_rate": 1.2619091420240795e-05, + "loss": 0.1458, + "step": 647 + }, + { + "epoch": 0.9491028927132918, + "grad_norm": 0.2557860588752611, + "learning_rate": 1.2594372775882862e-05, + "loss": 0.1086, + "step": 648 + }, + { + "epoch": 0.9505675576711827, + "grad_norm": 0.26613179341981313, + "learning_rate": 1.2569637123935581e-05, + "loss": 0.1081, + "step": 649 + }, + { + "epoch": 0.9520322226290736, + "grad_norm": 0.27208135899789015, + "learning_rate": 1.2544884626555225e-05, + "loss": 0.1309, + "step": 650 + }, + { + "epoch": 0.9534968875869645, + "grad_norm": 0.2800087048695156, + "learning_rate": 1.2520115446008493e-05, + "loss": 0.122, + "step": 651 + }, + { + "epoch": 0.9549615525448554, + "grad_norm": 0.265808555990337, + "learning_rate": 1.2495329744671457e-05, + "loss": 0.1143, + "step": 652 + }, + { + "epoch": 0.9564262175027463, + "grad_norm": 0.2832651191459517, + "learning_rate": 1.2470527685028485e-05, + "loss": 0.1119, + "step": 653 + }, + { + "epoch": 0.9578908824606371, + "grad_norm": 0.27691932316711165, + "learning_rate": 1.2445709429671184e-05, + "loss": 0.1196, + "step": 654 + }, + { + "epoch": 0.959355547418528, + "grad_norm": 0.24468827419730876, + "learning_rate": 1.2420875141297344e-05, + "loss": 0.1004, + "step": 655 + }, + { + "epoch": 0.9608202123764189, + "grad_norm": 0.28166326250407386, + "learning_rate": 1.2396024982709845e-05, + "loss": 0.1384, + "step": 656 + }, + { + "epoch": 0.9622848773343098, + "grad_norm": 0.2864348243903678, + "learning_rate": 1.2371159116815614e-05, + "loss": 0.1314, + "step": 657 + }, + { + "epoch": 0.9637495422922007, + "grad_norm": 0.29171091081890826, + "learning_rate": 1.234627770662455e-05, + "loss": 0.1348, + "step": 658 + }, + { + "epoch": 0.9652142072500915, + "grad_norm": 0.26640633088569415, + "learning_rate": 1.2321380915248446e-05, + "loss": 0.1215, + "step": 659 + }, + { + "epoch": 0.9666788722079824, + "grad_norm": 0.2719229168980855, + "learning_rate": 1.2296468905899937e-05, + "loss": 0.1163, + "step": 660 + }, + { + "epoch": 0.9681435371658733, + "grad_norm": 0.28951810957646756, + "learning_rate": 1.227154184189141e-05, + "loss": 0.1321, + "step": 661 + }, + { + "epoch": 0.9696082021237642, + "grad_norm": 0.30219688645041665, + "learning_rate": 1.2246599886633951e-05, + "loss": 0.1295, + "step": 662 + }, + { + "epoch": 0.9710728670816551, + "grad_norm": 0.29588778808501925, + "learning_rate": 1.222164320363627e-05, + "loss": 0.1373, + "step": 663 + }, + { + "epoch": 0.972537532039546, + "grad_norm": 0.26451637010025986, + "learning_rate": 1.2196671956503611e-05, + "loss": 0.1187, + "step": 664 + }, + { + "epoch": 0.9740021969974368, + "grad_norm": 0.2670469003783616, + "learning_rate": 1.217168630893671e-05, + "loss": 0.1223, + "step": 665 + }, + { + "epoch": 0.9754668619553277, + "grad_norm": 0.3201350019334305, + "learning_rate": 1.2146686424730699e-05, + "loss": 0.1238, + "step": 666 + }, + { + "epoch": 0.9769315269132186, + "grad_norm": 0.25030723292519363, + "learning_rate": 1.212167246777404e-05, + "loss": 0.1087, + "step": 667 + }, + { + "epoch": 0.9783961918711095, + "grad_norm": 0.30670242040276763, + "learning_rate": 1.2096644602047447e-05, + "loss": 0.1417, + "step": 668 + }, + { + "epoch": 0.9798608568290004, + "grad_norm": 0.3387650718944703, + "learning_rate": 1.2071602991622822e-05, + "loss": 0.1244, + "step": 669 + }, + { + "epoch": 0.9813255217868913, + "grad_norm": 0.31738790684917234, + "learning_rate": 1.2046547800662163e-05, + "loss": 0.1379, + "step": 670 + }, + { + "epoch": 0.9827901867447821, + "grad_norm": 0.253861086544748, + "learning_rate": 1.2021479193416502e-05, + "loss": 0.0972, + "step": 671 + }, + { + "epoch": 0.984254851702673, + "grad_norm": 0.27857395289374787, + "learning_rate": 1.1996397334224814e-05, + "loss": 0.1453, + "step": 672 + }, + { + "epoch": 0.9857195166605639, + "grad_norm": 0.2649436923127358, + "learning_rate": 1.1971302387512958e-05, + "loss": 0.1338, + "step": 673 + }, + { + "epoch": 0.9871841816184548, + "grad_norm": 0.27979461023670804, + "learning_rate": 1.1946194517792584e-05, + "loss": 0.1245, + "step": 674 + }, + { + "epoch": 0.9886488465763457, + "grad_norm": 0.28275198179954225, + "learning_rate": 1.1921073889660061e-05, + "loss": 0.1316, + "step": 675 + }, + { + "epoch": 0.9901135115342365, + "grad_norm": 0.26979156442079305, + "learning_rate": 1.1895940667795395e-05, + "loss": 0.1219, + "step": 676 + }, + { + "epoch": 0.9915781764921274, + "grad_norm": 0.25593923579587835, + "learning_rate": 1.1870795016961157e-05, + "loss": 0.1048, + "step": 677 + }, + { + "epoch": 0.9930428414500183, + "grad_norm": 0.2542007022812753, + "learning_rate": 1.1845637102001383e-05, + "loss": 0.1147, + "step": 678 + }, + { + "epoch": 0.9945075064079092, + "grad_norm": 0.2763537672373692, + "learning_rate": 1.1820467087840526e-05, + "loss": 0.1228, + "step": 679 + }, + { + "epoch": 0.9959721713658001, + "grad_norm": 0.2750343262145036, + "learning_rate": 1.1795285139482341e-05, + "loss": 0.1294, + "step": 680 + }, + { + "epoch": 0.997436836323691, + "grad_norm": 0.2774530874594845, + "learning_rate": 1.1770091422008824e-05, + "loss": 0.1252, + "step": 681 + }, + { + "epoch": 0.9989015012815818, + "grad_norm": 0.2607903166254994, + "learning_rate": 1.174488610057913e-05, + "loss": 0.119, + "step": 682 + }, + { + "epoch": 1.0003661662394727, + "grad_norm": 0.2645760004291393, + "learning_rate": 1.1719669340428472e-05, + "loss": 0.1128, + "step": 683 + }, + { + "epoch": 1.0018308311973636, + "grad_norm": 0.2552221963588916, + "learning_rate": 1.1694441306867062e-05, + "loss": 0.0897, + "step": 684 + }, + { + "epoch": 1.0032954961552545, + "grad_norm": 0.24512146436759236, + "learning_rate": 1.1669202165279009e-05, + "loss": 0.0966, + "step": 685 + }, + { + "epoch": 1.0047601611131454, + "grad_norm": 0.2926151907338783, + "learning_rate": 1.164395208112124e-05, + "loss": 0.0988, + "step": 686 + }, + { + "epoch": 1.0062248260710362, + "grad_norm": 0.30856359947434714, + "learning_rate": 1.1618691219922426e-05, + "loss": 0.1138, + "step": 687 + }, + { + "epoch": 1.0076894910289271, + "grad_norm": 0.26943247650215035, + "learning_rate": 1.159341974728188e-05, + "loss": 0.1157, + "step": 688 + }, + { + "epoch": 1.009154155986818, + "grad_norm": 0.28468116825059964, + "learning_rate": 1.1568137828868478e-05, + "loss": 0.0881, + "step": 689 + }, + { + "epoch": 1.010618820944709, + "grad_norm": 0.2399332416399268, + "learning_rate": 1.1542845630419579e-05, + "loss": 0.0893, + "step": 690 + }, + { + "epoch": 1.0120834859025998, + "grad_norm": 0.3288588984625157, + "learning_rate": 1.1517543317739931e-05, + "loss": 0.1076, + "step": 691 + }, + { + "epoch": 1.0135481508604907, + "grad_norm": 0.26604448567697603, + "learning_rate": 1.1492231056700592e-05, + "loss": 0.0932, + "step": 692 + }, + { + "epoch": 1.0150128158183815, + "grad_norm": 0.24018805379412664, + "learning_rate": 1.1466909013237819e-05, + "loss": 0.1001, + "step": 693 + }, + { + "epoch": 1.0164774807762724, + "grad_norm": 0.2651475188976319, + "learning_rate": 1.1441577353352023e-05, + "loss": 0.1035, + "step": 694 + }, + { + "epoch": 1.0179421457341633, + "grad_norm": 0.31536104086551264, + "learning_rate": 1.1416236243106638e-05, + "loss": 0.1081, + "step": 695 + }, + { + "epoch": 1.0194068106920542, + "grad_norm": 0.2613280520512752, + "learning_rate": 1.1390885848627058e-05, + "loss": 0.0845, + "step": 696 + }, + { + "epoch": 1.020871475649945, + "grad_norm": 0.2837125123903259, + "learning_rate": 1.1365526336099542e-05, + "loss": 0.1328, + "step": 697 + }, + { + "epoch": 1.022336140607836, + "grad_norm": 0.259222134905861, + "learning_rate": 1.1340157871770117e-05, + "loss": 0.0886, + "step": 698 + }, + { + "epoch": 1.0238008055657268, + "grad_norm": 0.2865990464407519, + "learning_rate": 1.13147806219435e-05, + "loss": 0.1079, + "step": 699 + }, + { + "epoch": 1.0252654705236177, + "grad_norm": 0.2794232474113488, + "learning_rate": 1.1289394752982e-05, + "loss": 0.106, + "step": 700 + }, + { + "epoch": 1.0267301354815086, + "grad_norm": 0.2606486568133289, + "learning_rate": 1.1264000431304422e-05, + "loss": 0.0902, + "step": 701 + }, + { + "epoch": 1.0281948004393995, + "grad_norm": 0.27767121394939476, + "learning_rate": 1.1238597823385e-05, + "loss": 0.0915, + "step": 702 + }, + { + "epoch": 1.0296594653972904, + "grad_norm": 0.2865112658979935, + "learning_rate": 1.1213187095752271e-05, + "loss": 0.0994, + "step": 703 + }, + { + "epoch": 1.0311241303551812, + "grad_norm": 0.28382542458351684, + "learning_rate": 1.1187768414988015e-05, + "loss": 0.0922, + "step": 704 + }, + { + "epoch": 1.0325887953130721, + "grad_norm": 0.27487686960098845, + "learning_rate": 1.1162341947726139e-05, + "loss": 0.1187, + "step": 705 + }, + { + "epoch": 1.034053460270963, + "grad_norm": 0.2629377900389775, + "learning_rate": 1.1136907860651603e-05, + "loss": 0.0894, + "step": 706 + }, + { + "epoch": 1.035518125228854, + "grad_norm": 0.2613378798506544, + "learning_rate": 1.1111466320499318e-05, + "loss": 0.0921, + "step": 707 + }, + { + "epoch": 1.0369827901867448, + "grad_norm": 0.255691486058076, + "learning_rate": 1.1086017494053046e-05, + "loss": 0.0937, + "step": 708 + }, + { + "epoch": 1.0384474551446357, + "grad_norm": 0.2595524157003849, + "learning_rate": 1.1060561548144321e-05, + "loss": 0.0898, + "step": 709 + }, + { + "epoch": 1.0399121201025265, + "grad_norm": 0.27529763982095773, + "learning_rate": 1.1035098649651355e-05, + "loss": 0.0966, + "step": 710 + }, + { + "epoch": 1.0413767850604174, + "grad_norm": 0.2518906472665152, + "learning_rate": 1.1009628965497927e-05, + "loss": 0.0842, + "step": 711 + }, + { + "epoch": 1.0428414500183083, + "grad_norm": 0.27904396918210833, + "learning_rate": 1.0984152662652307e-05, + "loss": 0.1148, + "step": 712 + }, + { + "epoch": 1.0443061149761992, + "grad_norm": 0.2651133079255485, + "learning_rate": 1.0958669908126151e-05, + "loss": 0.087, + "step": 713 + }, + { + "epoch": 1.04577077993409, + "grad_norm": 0.2767992985128494, + "learning_rate": 1.0933180868973414e-05, + "loss": 0.1033, + "step": 714 + }, + { + "epoch": 1.047235444891981, + "grad_norm": 0.2527734956067706, + "learning_rate": 1.0907685712289244e-05, + "loss": 0.0912, + "step": 715 + }, + { + "epoch": 1.0487001098498718, + "grad_norm": 0.26205384399328313, + "learning_rate": 1.0882184605208895e-05, + "loss": 0.089, + "step": 716 + }, + { + "epoch": 1.0501647748077627, + "grad_norm": 0.2680315695418657, + "learning_rate": 1.0856677714906632e-05, + "loss": 0.1001, + "step": 717 + }, + { + "epoch": 1.0516294397656536, + "grad_norm": 0.2265048738569077, + "learning_rate": 1.083116520859463e-05, + "loss": 0.0897, + "step": 718 + }, + { + "epoch": 1.0530941047235445, + "grad_norm": 0.2507364042609683, + "learning_rate": 1.080564725352188e-05, + "loss": 0.0867, + "step": 719 + }, + { + "epoch": 1.0545587696814354, + "grad_norm": 0.27387442999591954, + "learning_rate": 1.0780124016973095e-05, + "loss": 0.0998, + "step": 720 + }, + { + "epoch": 1.0560234346393262, + "grad_norm": 0.24270388287807182, + "learning_rate": 1.0754595666267609e-05, + "loss": 0.0817, + "step": 721 + }, + { + "epoch": 1.0574880995972171, + "grad_norm": 0.26311541978879793, + "learning_rate": 1.0729062368758278e-05, + "loss": 0.0988, + "step": 722 + }, + { + "epoch": 1.058952764555108, + "grad_norm": 0.2704618854801208, + "learning_rate": 1.0703524291830398e-05, + "loss": 0.1082, + "step": 723 + }, + { + "epoch": 1.0604174295129989, + "grad_norm": 0.28368113431359887, + "learning_rate": 1.067798160290059e-05, + "loss": 0.1009, + "step": 724 + }, + { + "epoch": 1.0618820944708898, + "grad_norm": 0.26885585300497, + "learning_rate": 1.0652434469415705e-05, + "loss": 0.0986, + "step": 725 + }, + { + "epoch": 1.0633467594287807, + "grad_norm": 0.24916401223915322, + "learning_rate": 1.0626883058851737e-05, + "loss": 0.0859, + "step": 726 + }, + { + "epoch": 1.0648114243866715, + "grad_norm": 0.25631003685203035, + "learning_rate": 1.0601327538712723e-05, + "loss": 0.0923, + "step": 727 + }, + { + "epoch": 1.0662760893445624, + "grad_norm": 0.2552784210398989, + "learning_rate": 1.0575768076529627e-05, + "loss": 0.0761, + "step": 728 + }, + { + "epoch": 1.0677407543024533, + "grad_norm": 0.24165749739319964, + "learning_rate": 1.0550204839859265e-05, + "loss": 0.0752, + "step": 729 + }, + { + "epoch": 1.0692054192603442, + "grad_norm": 0.2675621328921695, + "learning_rate": 1.0524637996283195e-05, + "loss": 0.1019, + "step": 730 + }, + { + "epoch": 1.070670084218235, + "grad_norm": 0.2569008765612613, + "learning_rate": 1.0499067713406622e-05, + "loss": 0.0964, + "step": 731 + }, + { + "epoch": 1.072134749176126, + "grad_norm": 0.24678690292067554, + "learning_rate": 1.0473494158857298e-05, + "loss": 0.0891, + "step": 732 + }, + { + "epoch": 1.0735994141340168, + "grad_norm": 0.2549990776594623, + "learning_rate": 1.0447917500284415e-05, + "loss": 0.0925, + "step": 733 + }, + { + "epoch": 1.0750640790919077, + "grad_norm": 0.28063287681936966, + "learning_rate": 1.0422337905357523e-05, + "loss": 0.1053, + "step": 734 + }, + { + "epoch": 1.0765287440497986, + "grad_norm": 0.27072473670857133, + "learning_rate": 1.0396755541765413e-05, + "loss": 0.0951, + "step": 735 + }, + { + "epoch": 1.0779934090076895, + "grad_norm": 0.2804354889898814, + "learning_rate": 1.0371170577215036e-05, + "loss": 0.1044, + "step": 736 + }, + { + "epoch": 1.0794580739655804, + "grad_norm": 0.2707408971145425, + "learning_rate": 1.0345583179430387e-05, + "loss": 0.087, + "step": 737 + }, + { + "epoch": 1.0809227389234712, + "grad_norm": 0.27265959106298226, + "learning_rate": 1.0319993516151412e-05, + "loss": 0.0957, + "step": 738 + }, + { + "epoch": 1.0823874038813621, + "grad_norm": 0.2664867481183081, + "learning_rate": 1.0294401755132912e-05, + "loss": 0.0838, + "step": 739 + }, + { + "epoch": 1.083852068839253, + "grad_norm": 0.2827928379341626, + "learning_rate": 1.0268808064143438e-05, + "loss": 0.0981, + "step": 740 + }, + { + "epoch": 1.0853167337971439, + "grad_norm": 0.2516210381548982, + "learning_rate": 1.0243212610964192e-05, + "loss": 0.085, + "step": 741 + }, + { + "epoch": 1.0867813987550348, + "grad_norm": 0.2609165490224745, + "learning_rate": 1.0217615563387932e-05, + "loss": 0.1047, + "step": 742 + }, + { + "epoch": 1.0882460637129256, + "grad_norm": 0.2522780231083236, + "learning_rate": 1.0192017089217863e-05, + "loss": 0.1029, + "step": 743 + }, + { + "epoch": 1.0897107286708165, + "grad_norm": 0.2640632238176228, + "learning_rate": 1.0166417356266546e-05, + "loss": 0.1053, + "step": 744 + }, + { + "epoch": 1.0911753936287074, + "grad_norm": 0.266832288851606, + "learning_rate": 1.0140816532354793e-05, + "loss": 0.1068, + "step": 745 + }, + { + "epoch": 1.0926400585865983, + "grad_norm": 0.24267415544535567, + "learning_rate": 1.0115214785310567e-05, + "loss": 0.0723, + "step": 746 + }, + { + "epoch": 1.0941047235444892, + "grad_norm": 0.280429753176534, + "learning_rate": 1.0089612282967884e-05, + "loss": 0.1071, + "step": 747 + }, + { + "epoch": 1.09556938850238, + "grad_norm": 0.26916094567137633, + "learning_rate": 1.0064009193165713e-05, + "loss": 0.0858, + "step": 748 + }, + { + "epoch": 1.097034053460271, + "grad_norm": 0.274415937658407, + "learning_rate": 1.0038405683746868e-05, + "loss": 0.0869, + "step": 749 + }, + { + "epoch": 1.0984987184181618, + "grad_norm": 0.2792545952852677, + "learning_rate": 1.0012801922556918e-05, + "loss": 0.0893, + "step": 750 + }, + { + "epoch": 1.0999633833760527, + "grad_norm": 0.2875272278765476, + "learning_rate": 9.987198077443085e-06, + "loss": 0.1099, + "step": 751 + }, + { + "epoch": 1.1014280483339436, + "grad_norm": 0.260518607411576, + "learning_rate": 9.961594316253134e-06, + "loss": 0.0828, + "step": 752 + }, + { + "epoch": 1.1028927132918345, + "grad_norm": 0.2633760525239851, + "learning_rate": 9.93599080683429e-06, + "loss": 0.0969, + "step": 753 + }, + { + "epoch": 1.1043573782497254, + "grad_norm": 0.25909573953326753, + "learning_rate": 9.910387717032115e-06, + "loss": 0.0756, + "step": 754 + }, + { + "epoch": 1.1058220432076162, + "grad_norm": 0.27253806701721073, + "learning_rate": 9.884785214689435e-06, + "loss": 0.0856, + "step": 755 + }, + { + "epoch": 1.1072867081655071, + "grad_norm": 0.2597508988107494, + "learning_rate": 9.859183467645207e-06, + "loss": 0.0855, + "step": 756 + }, + { + "epoch": 1.108751373123398, + "grad_norm": 0.23868371653633474, + "learning_rate": 9.833582643733457e-06, + "loss": 0.0826, + "step": 757 + }, + { + "epoch": 1.1102160380812889, + "grad_norm": 0.29179266600473996, + "learning_rate": 9.807982910782142e-06, + "loss": 0.1156, + "step": 758 + }, + { + "epoch": 1.1116807030391798, + "grad_norm": 0.2522539204555841, + "learning_rate": 9.782384436612072e-06, + "loss": 0.0908, + "step": 759 + }, + { + "epoch": 1.1131453679970706, + "grad_norm": 0.23700004230402077, + "learning_rate": 9.756787389035813e-06, + "loss": 0.0832, + "step": 760 + }, + { + "epoch": 1.1146100329549615, + "grad_norm": 0.2829087070060208, + "learning_rate": 9.731191935856566e-06, + "loss": 0.1116, + "step": 761 + }, + { + "epoch": 1.1160746979128524, + "grad_norm": 0.26523704277336946, + "learning_rate": 9.705598244867093e-06, + "loss": 0.0939, + "step": 762 + }, + { + "epoch": 1.1175393628707433, + "grad_norm": 0.24086964841423913, + "learning_rate": 9.68000648384859e-06, + "loss": 0.0873, + "step": 763 + }, + { + "epoch": 1.1190040278286342, + "grad_norm": 0.2654475048077318, + "learning_rate": 9.654416820569618e-06, + "loss": 0.1006, + "step": 764 + }, + { + "epoch": 1.120468692786525, + "grad_norm": 0.2601239282279833, + "learning_rate": 9.628829422784965e-06, + "loss": 0.0773, + "step": 765 + }, + { + "epoch": 1.121933357744416, + "grad_norm": 0.2569521050103609, + "learning_rate": 9.603244458234589e-06, + "loss": 0.0793, + "step": 766 + }, + { + "epoch": 1.1233980227023068, + "grad_norm": 0.26593809509671273, + "learning_rate": 9.577662094642478e-06, + "loss": 0.0801, + "step": 767 + }, + { + "epoch": 1.1248626876601977, + "grad_norm": 0.29040791441938224, + "learning_rate": 9.552082499715588e-06, + "loss": 0.0933, + "step": 768 + }, + { + "epoch": 1.1263273526180886, + "grad_norm": 0.2943106175306918, + "learning_rate": 9.526505841142702e-06, + "loss": 0.0861, + "step": 769 + }, + { + "epoch": 1.1277920175759795, + "grad_norm": 0.2601745466903212, + "learning_rate": 9.50093228659338e-06, + "loss": 0.0956, + "step": 770 + }, + { + "epoch": 1.1292566825338703, + "grad_norm": 0.26049784872083914, + "learning_rate": 9.475362003716804e-06, + "loss": 0.0826, + "step": 771 + }, + { + "epoch": 1.1307213474917612, + "grad_norm": 0.2629400244245209, + "learning_rate": 9.449795160140737e-06, + "loss": 0.0867, + "step": 772 + }, + { + "epoch": 1.1321860124496521, + "grad_norm": 0.2621080280544426, + "learning_rate": 9.424231923470378e-06, + "loss": 0.1065, + "step": 773 + }, + { + "epoch": 1.133650677407543, + "grad_norm": 0.26858890551499126, + "learning_rate": 9.39867246128728e-06, + "loss": 0.0941, + "step": 774 + }, + { + "epoch": 1.1351153423654339, + "grad_norm": 0.2576626727357875, + "learning_rate": 9.373116941148264e-06, + "loss": 0.0964, + "step": 775 + }, + { + "epoch": 1.1365800073233248, + "grad_norm": 0.26215360922586484, + "learning_rate": 9.347565530584299e-06, + "loss": 0.093, + "step": 776 + }, + { + "epoch": 1.1380446722812156, + "grad_norm": 0.25775670982713683, + "learning_rate": 9.322018397099414e-06, + "loss": 0.0921, + "step": 777 + }, + { + "epoch": 1.1395093372391065, + "grad_norm": 0.26195192691999436, + "learning_rate": 9.296475708169603e-06, + "loss": 0.0944, + "step": 778 + }, + { + "epoch": 1.1409740021969974, + "grad_norm": 0.24607830549799634, + "learning_rate": 9.270937631241723e-06, + "loss": 0.0789, + "step": 779 + }, + { + "epoch": 1.1424386671548883, + "grad_norm": 0.2504436025347225, + "learning_rate": 9.245404333732395e-06, + "loss": 0.0844, + "step": 780 + }, + { + "epoch": 1.1439033321127792, + "grad_norm": 0.2577203090564361, + "learning_rate": 9.219875983026909e-06, + "loss": 0.0834, + "step": 781 + }, + { + "epoch": 1.14536799707067, + "grad_norm": 0.25652775031593833, + "learning_rate": 9.194352746478122e-06, + "loss": 0.0806, + "step": 782 + }, + { + "epoch": 1.146832662028561, + "grad_norm": 0.27571423516211513, + "learning_rate": 9.168834791405374e-06, + "loss": 0.1015, + "step": 783 + }, + { + "epoch": 1.1482973269864518, + "grad_norm": 0.25833280528124036, + "learning_rate": 9.143322285093371e-06, + "loss": 0.0879, + "step": 784 + }, + { + "epoch": 1.1497619919443427, + "grad_norm": 0.27768929864428427, + "learning_rate": 9.117815394791107e-06, + "loss": 0.0916, + "step": 785 + }, + { + "epoch": 1.1512266569022336, + "grad_norm": 0.27218967256543264, + "learning_rate": 9.092314287710757e-06, + "loss": 0.0791, + "step": 786 + }, + { + "epoch": 1.1526913218601245, + "grad_norm": 0.290779647815006, + "learning_rate": 9.066819131026588e-06, + "loss": 0.1023, + "step": 787 + }, + { + "epoch": 1.1541559868180153, + "grad_norm": 0.27792973768255624, + "learning_rate": 9.041330091873852e-06, + "loss": 0.0979, + "step": 788 + }, + { + "epoch": 1.1556206517759062, + "grad_norm": 0.2373738656510741, + "learning_rate": 9.015847337347695e-06, + "loss": 0.0806, + "step": 789 + }, + { + "epoch": 1.157085316733797, + "grad_norm": 0.25696429168971635, + "learning_rate": 8.990371034502078e-06, + "loss": 0.0918, + "step": 790 + }, + { + "epoch": 1.158549981691688, + "grad_norm": 0.280392566025079, + "learning_rate": 8.964901350348648e-06, + "loss": 0.0949, + "step": 791 + }, + { + "epoch": 1.1600146466495789, + "grad_norm": 0.2535223646372367, + "learning_rate": 8.939438451855684e-06, + "loss": 0.0816, + "step": 792 + }, + { + "epoch": 1.1614793116074698, + "grad_norm": 0.24735896623108233, + "learning_rate": 8.913982505946958e-06, + "loss": 0.0876, + "step": 793 + }, + { + "epoch": 1.1629439765653606, + "grad_norm": 0.2543060334798377, + "learning_rate": 8.888533679500688e-06, + "loss": 0.0866, + "step": 794 + }, + { + "epoch": 1.1644086415232515, + "grad_norm": 0.2648636536906661, + "learning_rate": 8.863092139348397e-06, + "loss": 0.1009, + "step": 795 + }, + { + "epoch": 1.1658733064811424, + "grad_norm": 0.24166274223060005, + "learning_rate": 8.837658052273863e-06, + "loss": 0.0744, + "step": 796 + }, + { + "epoch": 1.1673379714390333, + "grad_norm": 0.2593529998079471, + "learning_rate": 8.812231585011987e-06, + "loss": 0.1102, + "step": 797 + }, + { + "epoch": 1.1688026363969242, + "grad_norm": 0.2547937310328503, + "learning_rate": 8.78681290424773e-06, + "loss": 0.083, + "step": 798 + }, + { + "epoch": 1.170267301354815, + "grad_norm": 0.2678047431199921, + "learning_rate": 8.761402176615002e-06, + "loss": 0.1004, + "step": 799 + }, + { + "epoch": 1.171731966312706, + "grad_norm": 0.2881247911795614, + "learning_rate": 8.735999568695581e-06, + "loss": 0.0941, + "step": 800 + }, + { + "epoch": 1.1731966312705968, + "grad_norm": 0.306665541632391, + "learning_rate": 8.710605247018002e-06, + "loss": 0.1016, + "step": 801 + }, + { + "epoch": 1.1746612962284877, + "grad_norm": 0.26549998092914273, + "learning_rate": 8.685219378056503e-06, + "loss": 0.1025, + "step": 802 + }, + { + "epoch": 1.1761259611863786, + "grad_norm": 0.25884918535779855, + "learning_rate": 8.659842128229886e-06, + "loss": 0.0796, + "step": 803 + }, + { + "epoch": 1.1775906261442695, + "grad_norm": 0.29483213665665864, + "learning_rate": 8.634473663900461e-06, + "loss": 0.0936, + "step": 804 + }, + { + "epoch": 1.1790552911021603, + "grad_norm": 0.2827884978675775, + "learning_rate": 8.609114151372947e-06, + "loss": 0.0877, + "step": 805 + }, + { + "epoch": 1.1805199560600512, + "grad_norm": 0.2929302563222184, + "learning_rate": 8.583763756893366e-06, + "loss": 0.0955, + "step": 806 + }, + { + "epoch": 1.181984621017942, + "grad_norm": 0.2538641882723923, + "learning_rate": 8.558422646647984e-06, + "loss": 0.0808, + "step": 807 + }, + { + "epoch": 1.183449285975833, + "grad_norm": 0.26631517555229367, + "learning_rate": 8.533090986762183e-06, + "loss": 0.0834, + "step": 808 + }, + { + "epoch": 1.1849139509337239, + "grad_norm": 0.2788403763341659, + "learning_rate": 8.507768943299415e-06, + "loss": 0.0969, + "step": 809 + }, + { + "epoch": 1.1863786158916148, + "grad_norm": 0.2781094675108167, + "learning_rate": 8.482456682260069e-06, + "loss": 0.0963, + "step": 810 + }, + { + "epoch": 1.1878432808495056, + "grad_norm": 0.2473923042956088, + "learning_rate": 8.457154369580424e-06, + "loss": 0.0843, + "step": 811 + }, + { + "epoch": 1.1893079458073965, + "grad_norm": 0.23006340181208546, + "learning_rate": 8.431862171131524e-06, + "loss": 0.0695, + "step": 812 + }, + { + "epoch": 1.1907726107652874, + "grad_norm": 0.2556855544794983, + "learning_rate": 8.406580252718125e-06, + "loss": 0.0773, + "step": 813 + }, + { + "epoch": 1.1922372757231783, + "grad_norm": 0.26970894596044315, + "learning_rate": 8.381308780077575e-06, + "loss": 0.0862, + "step": 814 + }, + { + "epoch": 1.1937019406810692, + "grad_norm": 0.25936009464878035, + "learning_rate": 8.356047918878762e-06, + "loss": 0.0909, + "step": 815 + }, + { + "epoch": 1.19516660563896, + "grad_norm": 0.26608540904879535, + "learning_rate": 8.330797834720993e-06, + "loss": 0.0833, + "step": 816 + }, + { + "epoch": 1.196631270596851, + "grad_norm": 0.2591055743281483, + "learning_rate": 8.305558693132943e-06, + "loss": 0.0798, + "step": 817 + }, + { + "epoch": 1.1980959355547418, + "grad_norm": 0.2607615757724153, + "learning_rate": 8.280330659571532e-06, + "loss": 0.0891, + "step": 818 + }, + { + "epoch": 1.1995606005126327, + "grad_norm": 0.27689441116344976, + "learning_rate": 8.255113899420873e-06, + "loss": 0.1042, + "step": 819 + }, + { + "epoch": 1.2010252654705236, + "grad_norm": 0.2634058051717049, + "learning_rate": 8.229908577991177e-06, + "loss": 0.0794, + "step": 820 + }, + { + "epoch": 1.2024899304284145, + "grad_norm": 0.2712603230168793, + "learning_rate": 8.204714860517662e-06, + "loss": 0.0872, + "step": 821 + }, + { + "epoch": 1.2039545953863053, + "grad_norm": 0.26002461903187646, + "learning_rate": 8.179532912159477e-06, + "loss": 0.0806, + "step": 822 + }, + { + "epoch": 1.2054192603441962, + "grad_norm": 0.2778681441466659, + "learning_rate": 8.154362897998619e-06, + "loss": 0.0954, + "step": 823 + }, + { + "epoch": 1.206883925302087, + "grad_norm": 0.2597653658409811, + "learning_rate": 8.129204983038847e-06, + "loss": 0.0837, + "step": 824 + }, + { + "epoch": 1.208348590259978, + "grad_norm": 0.2409553005949464, + "learning_rate": 8.104059332204606e-06, + "loss": 0.0797, + "step": 825 + }, + { + "epoch": 1.2098132552178689, + "grad_norm": 0.26099557745468915, + "learning_rate": 8.07892611033994e-06, + "loss": 0.0856, + "step": 826 + }, + { + "epoch": 1.2112779201757597, + "grad_norm": 0.2670243095090208, + "learning_rate": 8.053805482207418e-06, + "loss": 0.0981, + "step": 827 + }, + { + "epoch": 1.2127425851336506, + "grad_norm": 0.25384500506062874, + "learning_rate": 8.028697612487046e-06, + "loss": 0.0855, + "step": 828 + }, + { + "epoch": 1.2142072500915415, + "grad_norm": 0.27259004018505645, + "learning_rate": 8.003602665775189e-06, + "loss": 0.0895, + "step": 829 + }, + { + "epoch": 1.2156719150494324, + "grad_norm": 0.24539935533491744, + "learning_rate": 7.978520806583503e-06, + "loss": 0.075, + "step": 830 + }, + { + "epoch": 1.2171365800073233, + "grad_norm": 0.28931365047548946, + "learning_rate": 7.95345219933784e-06, + "loss": 0.0939, + "step": 831 + }, + { + "epoch": 1.2186012449652142, + "grad_norm": 0.24781333667809402, + "learning_rate": 7.92839700837718e-06, + "loss": 0.0847, + "step": 832 + }, + { + "epoch": 1.220065909923105, + "grad_norm": 0.2597660565925631, + "learning_rate": 7.903355397952557e-06, + "loss": 0.0847, + "step": 833 + }, + { + "epoch": 1.221530574880996, + "grad_norm": 0.25108362024888203, + "learning_rate": 7.878327532225964e-06, + "loss": 0.0729, + "step": 834 + }, + { + "epoch": 1.2229952398388868, + "grad_norm": 0.2879443475536739, + "learning_rate": 7.853313575269306e-06, + "loss": 0.0926, + "step": 835 + }, + { + "epoch": 1.2244599047967777, + "grad_norm": 0.24985250820254382, + "learning_rate": 7.828313691063294e-06, + "loss": 0.0729, + "step": 836 + }, + { + "epoch": 1.2259245697546686, + "grad_norm": 0.2528963890297066, + "learning_rate": 7.803328043496394e-06, + "loss": 0.069, + "step": 837 + }, + { + "epoch": 1.2273892347125595, + "grad_norm": 0.2470937539451322, + "learning_rate": 7.778356796363734e-06, + "loss": 0.0829, + "step": 838 + }, + { + "epoch": 1.2288538996704503, + "grad_norm": 0.2521378561935959, + "learning_rate": 7.753400113366052e-06, + "loss": 0.0791, + "step": 839 + }, + { + "epoch": 1.2303185646283412, + "grad_norm": 0.2631772936605158, + "learning_rate": 7.728458158108592e-06, + "loss": 0.0905, + "step": 840 + }, + { + "epoch": 1.231783229586232, + "grad_norm": 0.2444159483676253, + "learning_rate": 7.703531094100068e-06, + "loss": 0.0741, + "step": 841 + }, + { + "epoch": 1.233247894544123, + "grad_norm": 0.281794329109641, + "learning_rate": 7.678619084751554e-06, + "loss": 0.088, + "step": 842 + }, + { + "epoch": 1.2347125595020139, + "grad_norm": 0.27262976373807446, + "learning_rate": 7.653722293375453e-06, + "loss": 0.1, + "step": 843 + }, + { + "epoch": 1.2361772244599047, + "grad_norm": 0.25816315241450866, + "learning_rate": 7.628840883184385e-06, + "loss": 0.0906, + "step": 844 + }, + { + "epoch": 1.2376418894177956, + "grad_norm": 0.2537369926820909, + "learning_rate": 7.603975017290159e-06, + "loss": 0.0777, + "step": 845 + }, + { + "epoch": 1.2391065543756865, + "grad_norm": 0.2916357929163886, + "learning_rate": 7.579124858702658e-06, + "loss": 0.1011, + "step": 846 + }, + { + "epoch": 1.2405712193335774, + "grad_norm": 0.27484734516089476, + "learning_rate": 7.5542905703288175e-06, + "loss": 0.0954, + "step": 847 + }, + { + "epoch": 1.2420358842914683, + "grad_norm": 0.2704547563666006, + "learning_rate": 7.529472314971522e-06, + "loss": 0.0929, + "step": 848 + }, + { + "epoch": 1.2435005492493592, + "grad_norm": 0.24275302504305166, + "learning_rate": 7.504670255328548e-06, + "loss": 0.0684, + "step": 849 + }, + { + "epoch": 1.24496521420725, + "grad_norm": 0.2535609358202711, + "learning_rate": 7.4798845539915126e-06, + "loss": 0.071, + "step": 850 + }, + { + "epoch": 1.246429879165141, + "grad_norm": 0.25442354937020867, + "learning_rate": 7.455115373444779e-06, + "loss": 0.0808, + "step": 851 + }, + { + "epoch": 1.2478945441230318, + "grad_norm": 0.25458587191704724, + "learning_rate": 7.430362876064424e-06, + "loss": 0.0806, + "step": 852 + }, + { + "epoch": 1.2493592090809227, + "grad_norm": 0.2636459021351091, + "learning_rate": 7.4056272241171425e-06, + "loss": 0.0975, + "step": 853 + }, + { + "epoch": 1.2508238740388136, + "grad_norm": 0.26588179392313827, + "learning_rate": 7.380908579759207e-06, + "loss": 0.0908, + "step": 854 + }, + { + "epoch": 1.2522885389967044, + "grad_norm": 0.25802097833264015, + "learning_rate": 7.356207105035389e-06, + "loss": 0.0727, + "step": 855 + }, + { + "epoch": 1.2537532039545953, + "grad_norm": 0.2525429271439887, + "learning_rate": 7.331522961877914e-06, + "loss": 0.0852, + "step": 856 + }, + { + "epoch": 1.2552178689124862, + "grad_norm": 0.24515558724346895, + "learning_rate": 7.30685631210537e-06, + "loss": 0.0716, + "step": 857 + }, + { + "epoch": 1.256682533870377, + "grad_norm": 0.25450544202115294, + "learning_rate": 7.282207317421691e-06, + "loss": 0.0851, + "step": 858 + }, + { + "epoch": 1.258147198828268, + "grad_norm": 0.26104307630858015, + "learning_rate": 7.2575761394150476e-06, + "loss": 0.0824, + "step": 859 + }, + { + "epoch": 1.2596118637861589, + "grad_norm": 0.26362892269219795, + "learning_rate": 7.232962939556831e-06, + "loss": 0.0754, + "step": 860 + }, + { + "epoch": 1.2610765287440497, + "grad_norm": 0.26960831313830425, + "learning_rate": 7.20836787920056e-06, + "loss": 0.088, + "step": 861 + }, + { + "epoch": 1.2625411937019406, + "grad_norm": 0.2884650032848352, + "learning_rate": 7.183791119580854e-06, + "loss": 0.0844, + "step": 862 + }, + { + "epoch": 1.2640058586598315, + "grad_norm": 0.2673739678569061, + "learning_rate": 7.159232821812348e-06, + "loss": 0.0917, + "step": 863 + }, + { + "epoch": 1.2654705236177224, + "grad_norm": 0.2647717036142495, + "learning_rate": 7.134693146888652e-06, + "loss": 0.074, + "step": 864 + }, + { + "epoch": 1.2669351885756133, + "grad_norm": 0.24828043520372733, + "learning_rate": 7.1101722556813e-06, + "loss": 0.0731, + "step": 865 + }, + { + "epoch": 1.2683998535335042, + "grad_norm": 0.29021631743576276, + "learning_rate": 7.085670308938674e-06, + "loss": 0.0943, + "step": 866 + }, + { + "epoch": 1.269864518491395, + "grad_norm": 0.25757971293739884, + "learning_rate": 7.061187467284985e-06, + "loss": 0.0762, + "step": 867 + }, + { + "epoch": 1.271329183449286, + "grad_norm": 0.2707115051349976, + "learning_rate": 7.0367238912191734e-06, + "loss": 0.086, + "step": 868 + }, + { + "epoch": 1.2727938484071768, + "grad_norm": 0.2416159349328539, + "learning_rate": 7.012279741113909e-06, + "loss": 0.0731, + "step": 869 + }, + { + "epoch": 1.2742585133650677, + "grad_norm": 0.25001307113728843, + "learning_rate": 6.987855177214489e-06, + "loss": 0.0793, + "step": 870 + }, + { + "epoch": 1.2757231783229586, + "grad_norm": 0.267697774653143, + "learning_rate": 6.963450359637835e-06, + "loss": 0.0881, + "step": 871 + }, + { + "epoch": 1.2771878432808494, + "grad_norm": 0.2656180743033591, + "learning_rate": 6.939065448371398e-06, + "loss": 0.0805, + "step": 872 + }, + { + "epoch": 1.2786525082387403, + "grad_norm": 0.24388348217463868, + "learning_rate": 6.914700603272151e-06, + "loss": 0.0739, + "step": 873 + }, + { + "epoch": 1.2801171731966312, + "grad_norm": 0.27460701489232753, + "learning_rate": 6.8903559840655075e-06, + "loss": 0.079, + "step": 874 + }, + { + "epoch": 1.281581838154522, + "grad_norm": 0.25551151990608106, + "learning_rate": 6.866031750344302e-06, + "loss": 0.0767, + "step": 875 + }, + { + "epoch": 1.283046503112413, + "grad_norm": 0.25705060856070827, + "learning_rate": 6.841728061567713e-06, + "loss": 0.084, + "step": 876 + }, + { + "epoch": 1.2845111680703039, + "grad_norm": 0.2535751605501693, + "learning_rate": 6.817445077060256e-06, + "loss": 0.0859, + "step": 877 + }, + { + "epoch": 1.2859758330281947, + "grad_norm": 0.26041563534508294, + "learning_rate": 6.7931829560107e-06, + "loss": 0.0841, + "step": 878 + }, + { + "epoch": 1.2874404979860856, + "grad_norm": 0.2537135436483843, + "learning_rate": 6.768941857471054e-06, + "loss": 0.0775, + "step": 879 + }, + { + "epoch": 1.2889051629439765, + "grad_norm": 0.2659246611826128, + "learning_rate": 6.744721940355508e-06, + "loss": 0.0807, + "step": 880 + }, + { + "epoch": 1.2903698279018674, + "grad_norm": 0.2540825283987494, + "learning_rate": 6.720523363439393e-06, + "loss": 0.0896, + "step": 881 + }, + { + "epoch": 1.2918344928597583, + "grad_norm": 0.2575386822036422, + "learning_rate": 6.69634628535815e-06, + "loss": 0.0797, + "step": 882 + }, + { + "epoch": 1.2932991578176491, + "grad_norm": 0.2649389460574201, + "learning_rate": 6.672190864606276e-06, + "loss": 0.0878, + "step": 883 + }, + { + "epoch": 1.29476382277554, + "grad_norm": 0.2638636466333762, + "learning_rate": 6.648057259536297e-06, + "loss": 0.0831, + "step": 884 + }, + { + "epoch": 1.296228487733431, + "grad_norm": 0.26545840639974744, + "learning_rate": 6.623945628357718e-06, + "loss": 0.081, + "step": 885 + }, + { + "epoch": 1.2976931526913218, + "grad_norm": 0.27477957230836636, + "learning_rate": 6.599856129136006e-06, + "loss": 0.0787, + "step": 886 + }, + { + "epoch": 1.2991578176492127, + "grad_norm": 0.2584584388897469, + "learning_rate": 6.575788919791522e-06, + "loss": 0.0888, + "step": 887 + }, + { + "epoch": 1.3006224826071036, + "grad_norm": 0.2553038074640935, + "learning_rate": 6.551744158098521e-06, + "loss": 0.0804, + "step": 888 + }, + { + "epoch": 1.3020871475649944, + "grad_norm": 0.2614338959940162, + "learning_rate": 6.527722001684087e-06, + "loss": 0.0842, + "step": 889 + }, + { + "epoch": 1.3035518125228853, + "grad_norm": 0.27266050110549406, + "learning_rate": 6.503722608027129e-06, + "loss": 0.0953, + "step": 890 + }, + { + "epoch": 1.3050164774807762, + "grad_norm": 0.26498428407714436, + "learning_rate": 6.479746134457319e-06, + "loss": 0.0948, + "step": 891 + }, + { + "epoch": 1.306481142438667, + "grad_norm": 0.2425181307556485, + "learning_rate": 6.455792738154074e-06, + "loss": 0.0771, + "step": 892 + }, + { + "epoch": 1.307945807396558, + "grad_norm": 0.3031551149501075, + "learning_rate": 6.431862576145546e-06, + "loss": 0.1127, + "step": 893 + }, + { + "epoch": 1.3094104723544489, + "grad_norm": 0.2605793802598279, + "learning_rate": 6.407955805307547e-06, + "loss": 0.0838, + "step": 894 + }, + { + "epoch": 1.3108751373123397, + "grad_norm": 0.2857025614498841, + "learning_rate": 6.3840725823625685e-06, + "loss": 0.0958, + "step": 895 + }, + { + "epoch": 1.3123398022702306, + "grad_norm": 0.26266676335382666, + "learning_rate": 6.3602130638787155e-06, + "loss": 0.0919, + "step": 896 + }, + { + "epoch": 1.3138044672281215, + "grad_norm": 0.2705157836138465, + "learning_rate": 6.336377406268712e-06, + "loss": 0.0843, + "step": 897 + }, + { + "epoch": 1.3152691321860124, + "grad_norm": 0.2442729301782896, + "learning_rate": 6.312565765788843e-06, + "loss": 0.0696, + "step": 898 + }, + { + "epoch": 1.3167337971439033, + "grad_norm": 0.23540342455420937, + "learning_rate": 6.288778298537966e-06, + "loss": 0.0736, + "step": 899 + }, + { + "epoch": 1.3181984621017941, + "grad_norm": 0.2772712225095351, + "learning_rate": 6.2650151604564534e-06, + "loss": 0.0938, + "step": 900 + }, + { + "epoch": 1.319663127059685, + "grad_norm": 0.25309744480957075, + "learning_rate": 6.241276507325198e-06, + "loss": 0.0758, + "step": 901 + }, + { + "epoch": 1.321127792017576, + "grad_norm": 0.25863367628675915, + "learning_rate": 6.217562494764569e-06, + "loss": 0.0833, + "step": 902 + }, + { + "epoch": 1.3225924569754668, + "grad_norm": 0.26530513213868256, + "learning_rate": 6.193873278233417e-06, + "loss": 0.0841, + "step": 903 + }, + { + "epoch": 1.3240571219333577, + "grad_norm": 0.24979685098439777, + "learning_rate": 6.170209013028021e-06, + "loss": 0.0876, + "step": 904 + }, + { + "epoch": 1.3255217868912486, + "grad_norm": 0.2523647371099479, + "learning_rate": 6.146569854281111e-06, + "loss": 0.082, + "step": 905 + }, + { + "epoch": 1.3269864518491394, + "grad_norm": 0.2666393886883001, + "learning_rate": 6.1229559569608144e-06, + "loss": 0.0982, + "step": 906 + }, + { + "epoch": 1.3284511168070303, + "grad_norm": 0.25240352584777037, + "learning_rate": 6.099367475869658e-06, + "loss": 0.0815, + "step": 907 + }, + { + "epoch": 1.3299157817649212, + "grad_norm": 0.24034010865003308, + "learning_rate": 6.075804565643562e-06, + "loss": 0.0732, + "step": 908 + }, + { + "epoch": 1.331380446722812, + "grad_norm": 0.2601774368496189, + "learning_rate": 6.052267380750796e-06, + "loss": 0.0804, + "step": 909 + }, + { + "epoch": 1.332845111680703, + "grad_norm": 0.2416658354239181, + "learning_rate": 6.028756075491007e-06, + "loss": 0.0671, + "step": 910 + }, + { + "epoch": 1.3343097766385938, + "grad_norm": 0.23420758822988802, + "learning_rate": 6.005270803994165e-06, + "loss": 0.0694, + "step": 911 + }, + { + "epoch": 1.3357744415964847, + "grad_norm": 0.23323493701217127, + "learning_rate": 5.981811720219593e-06, + "loss": 0.0677, + "step": 912 + }, + { + "epoch": 1.3372391065543756, + "grad_norm": 0.26506406252750964, + "learning_rate": 5.958378977954922e-06, + "loss": 0.0741, + "step": 913 + }, + { + "epoch": 1.3387037715122665, + "grad_norm": 0.255574393142094, + "learning_rate": 5.934972730815115e-06, + "loss": 0.0811, + "step": 914 + }, + { + "epoch": 1.3401684364701574, + "grad_norm": 0.2550215436110789, + "learning_rate": 5.9115931322414285e-06, + "loss": 0.0862, + "step": 915 + }, + { + "epoch": 1.3416331014280483, + "grad_norm": 0.25209588662794513, + "learning_rate": 5.888240335500439e-06, + "loss": 0.0721, + "step": 916 + }, + { + "epoch": 1.3430977663859391, + "grad_norm": 0.26335632891089883, + "learning_rate": 5.8649144936830045e-06, + "loss": 0.0749, + "step": 917 + }, + { + "epoch": 1.34456243134383, + "grad_norm": 0.26611291985369345, + "learning_rate": 5.841615759703296e-06, + "loss": 0.0856, + "step": 918 + }, + { + "epoch": 1.346027096301721, + "grad_norm": 0.3180398065874917, + "learning_rate": 5.818344286297756e-06, + "loss": 0.1111, + "step": 919 + }, + { + "epoch": 1.3474917612596118, + "grad_norm": 0.2470882409448809, + "learning_rate": 5.795100226024145e-06, + "loss": 0.0842, + "step": 920 + }, + { + "epoch": 1.3489564262175027, + "grad_norm": 0.24747051474580573, + "learning_rate": 5.771883731260492e-06, + "loss": 0.0849, + "step": 921 + }, + { + "epoch": 1.3504210911753936, + "grad_norm": 0.2689238693234873, + "learning_rate": 5.748694954204126e-06, + "loss": 0.09, + "step": 922 + }, + { + "epoch": 1.3518857561332844, + "grad_norm": 0.24131459800140173, + "learning_rate": 5.7255340468706776e-06, + "loss": 0.0788, + "step": 923 + }, + { + "epoch": 1.3533504210911753, + "grad_norm": 0.23703584143200643, + "learning_rate": 5.702401161093061e-06, + "loss": 0.07, + "step": 924 + }, + { + "epoch": 1.3548150860490662, + "grad_norm": 0.24911481329061927, + "learning_rate": 5.679296448520509e-06, + "loss": 0.0766, + "step": 925 + }, + { + "epoch": 1.356279751006957, + "grad_norm": 0.2493171773305748, + "learning_rate": 5.65622006061755e-06, + "loss": 0.0743, + "step": 926 + }, + { + "epoch": 1.357744415964848, + "grad_norm": 0.25543638955248743, + "learning_rate": 5.633172148663039e-06, + "loss": 0.0869, + "step": 927 + }, + { + "epoch": 1.3592090809227388, + "grad_norm": 0.24551184220415498, + "learning_rate": 5.610152863749143e-06, + "loss": 0.069, + "step": 928 + }, + { + "epoch": 1.3606737458806297, + "grad_norm": 0.26501893240626057, + "learning_rate": 5.5871623567803756e-06, + "loss": 0.0696, + "step": 929 + }, + { + "epoch": 1.3621384108385206, + "grad_norm": 0.27208453202862887, + "learning_rate": 5.564200778472583e-06, + "loss": 0.0795, + "step": 930 + }, + { + "epoch": 1.3636030757964115, + "grad_norm": 0.27329376137848915, + "learning_rate": 5.5412682793519765e-06, + "loss": 0.0772, + "step": 931 + }, + { + "epoch": 1.3650677407543024, + "grad_norm": 0.2631020886292811, + "learning_rate": 5.5183650097541295e-06, + "loss": 0.0801, + "step": 932 + }, + { + "epoch": 1.3665324057121933, + "grad_norm": 0.24801113517701595, + "learning_rate": 5.495491119823007e-06, + "loss": 0.0722, + "step": 933 + }, + { + "epoch": 1.3679970706700841, + "grad_norm": 0.2650739437868264, + "learning_rate": 5.472646759509963e-06, + "loss": 0.0936, + "step": 934 + }, + { + "epoch": 1.369461735627975, + "grad_norm": 0.2605363396379517, + "learning_rate": 5.449832078572781e-06, + "loss": 0.0878, + "step": 935 + }, + { + "epoch": 1.370926400585866, + "grad_norm": 0.2646339088927977, + "learning_rate": 5.427047226574671e-06, + "loss": 0.0759, + "step": 936 + }, + { + "epoch": 1.3723910655437568, + "grad_norm": 0.2601636184080717, + "learning_rate": 5.404292352883291e-06, + "loss": 0.0944, + "step": 937 + }, + { + "epoch": 1.3738557305016477, + "grad_norm": 0.2837985033272464, + "learning_rate": 5.3815676066697946e-06, + "loss": 0.1013, + "step": 938 + }, + { + "epoch": 1.3753203954595388, + "grad_norm": 0.284455217744833, + "learning_rate": 5.358873136907808e-06, + "loss": 0.0788, + "step": 939 + }, + { + "epoch": 1.3767850604174297, + "grad_norm": 0.26876984673679605, + "learning_rate": 5.336209092372502e-06, + "loss": 0.0902, + "step": 940 + }, + { + "epoch": 1.3782497253753205, + "grad_norm": 0.27988318639906634, + "learning_rate": 5.313575621639568e-06, + "loss": 0.0857, + "step": 941 + }, + { + "epoch": 1.3797143903332114, + "grad_norm": 0.29898311525846655, + "learning_rate": 5.290972873084296e-06, + "loss": 0.0876, + "step": 942 + }, + { + "epoch": 1.3811790552911023, + "grad_norm": 0.2392250599528443, + "learning_rate": 5.268400994880547e-06, + "loss": 0.0688, + "step": 943 + }, + { + "epoch": 1.3826437202489932, + "grad_norm": 0.24702025812837353, + "learning_rate": 5.245860134999831e-06, + "loss": 0.0763, + "step": 944 + }, + { + "epoch": 1.384108385206884, + "grad_norm": 0.25177865615717077, + "learning_rate": 5.223350441210303e-06, + "loss": 0.0804, + "step": 945 + }, + { + "epoch": 1.385573050164775, + "grad_norm": 0.27025384321008283, + "learning_rate": 5.200872061075814e-06, + "loss": 0.0865, + "step": 946 + }, + { + "epoch": 1.3870377151226658, + "grad_norm": 0.3196326325850145, + "learning_rate": 5.178425141954926e-06, + "loss": 0.1001, + "step": 947 + }, + { + "epoch": 1.3885023800805567, + "grad_norm": 0.2659528533890264, + "learning_rate": 5.15600983099997e-06, + "loss": 0.0757, + "step": 948 + }, + { + "epoch": 1.3899670450384476, + "grad_norm": 0.25677841458575923, + "learning_rate": 5.133626275156055e-06, + "loss": 0.0655, + "step": 949 + }, + { + "epoch": 1.3914317099963385, + "grad_norm": 0.2847121257417236, + "learning_rate": 5.111274621160127e-06, + "loss": 0.0866, + "step": 950 + }, + { + "epoch": 1.3928963749542294, + "grad_norm": 0.27638736754038373, + "learning_rate": 5.088955015539989e-06, + "loss": 0.1017, + "step": 951 + }, + { + "epoch": 1.3943610399121202, + "grad_norm": 0.24594995873570738, + "learning_rate": 5.06666760461335e-06, + "loss": 0.0816, + "step": 952 + }, + { + "epoch": 1.3958257048700111, + "grad_norm": 0.24663161576513942, + "learning_rate": 5.044412534486873e-06, + "loss": 0.0721, + "step": 953 + }, + { + "epoch": 1.397290369827902, + "grad_norm": 0.24164833719808607, + "learning_rate": 5.0221899510551965e-06, + "loss": 0.0617, + "step": 954 + }, + { + "epoch": 1.3987550347857929, + "grad_norm": 0.2772777632729084, + "learning_rate": 5.000000000000003e-06, + "loss": 0.0837, + "step": 955 + }, + { + "epoch": 1.4002196997436838, + "grad_norm": 0.25527189944389456, + "learning_rate": 4.977842826789034e-06, + "loss": 0.0812, + "step": 956 + }, + { + "epoch": 1.4016843647015746, + "grad_norm": 0.26722398889539783, + "learning_rate": 4.955718576675176e-06, + "loss": 0.0762, + "step": 957 + }, + { + "epoch": 1.4031490296594655, + "grad_norm": 0.2575445402991656, + "learning_rate": 4.933627394695464e-06, + "loss": 0.0761, + "step": 958 + }, + { + "epoch": 1.4046136946173564, + "grad_norm": 0.26548423508114666, + "learning_rate": 4.911569425670168e-06, + "loss": 0.0797, + "step": 959 + }, + { + "epoch": 1.4060783595752473, + "grad_norm": 0.23644665405104945, + "learning_rate": 4.88954481420182e-06, + "loss": 0.073, + "step": 960 + }, + { + "epoch": 1.4075430245331382, + "grad_norm": 0.25187795846556027, + "learning_rate": 4.867553704674279e-06, + "loss": 0.0872, + "step": 961 + }, + { + "epoch": 1.409007689491029, + "grad_norm": 0.2698865546073546, + "learning_rate": 4.845596241251773e-06, + "loss": 0.0895, + "step": 962 + }, + { + "epoch": 1.41047235444892, + "grad_norm": 0.24104521653041425, + "learning_rate": 4.823672567877973e-06, + "loss": 0.0758, + "step": 963 + }, + { + "epoch": 1.4119370194068108, + "grad_norm": 0.2442657706715064, + "learning_rate": 4.801782828275019e-06, + "loss": 0.0702, + "step": 964 + }, + { + "epoch": 1.4134016843647017, + "grad_norm": 0.2592721496347663, + "learning_rate": 4.779927165942616e-06, + "loss": 0.0702, + "step": 965 + }, + { + "epoch": 1.4148663493225926, + "grad_norm": 0.249769758106348, + "learning_rate": 4.758105724157058e-06, + "loss": 0.0717, + "step": 966 + }, + { + "epoch": 1.4163310142804835, + "grad_norm": 0.30631601628088784, + "learning_rate": 4.7363186459703055e-06, + "loss": 0.0855, + "step": 967 + }, + { + "epoch": 1.4177956792383744, + "grad_norm": 0.2580744961558955, + "learning_rate": 4.714566074209058e-06, + "loss": 0.0815, + "step": 968 + }, + { + "epoch": 1.4192603441962652, + "grad_norm": 0.27175417696412074, + "learning_rate": 4.692848151473789e-06, + "loss": 0.0884, + "step": 969 + }, + { + "epoch": 1.4207250091541561, + "grad_norm": 0.26107195316664245, + "learning_rate": 4.671165020137844e-06, + "loss": 0.0914, + "step": 970 + }, + { + "epoch": 1.422189674112047, + "grad_norm": 0.2659897433575913, + "learning_rate": 4.649516822346474e-06, + "loss": 0.0841, + "step": 971 + }, + { + "epoch": 1.4236543390699379, + "grad_norm": 0.24447135185611163, + "learning_rate": 4.62790370001594e-06, + "loss": 0.0761, + "step": 972 + }, + { + "epoch": 1.4251190040278288, + "grad_norm": 0.27972979502307804, + "learning_rate": 4.606325794832545e-06, + "loss": 0.0771, + "step": 973 + }, + { + "epoch": 1.4265836689857196, + "grad_norm": 0.2736878498962296, + "learning_rate": 4.584783248251738e-06, + "loss": 0.0842, + "step": 974 + }, + { + "epoch": 1.4280483339436105, + "grad_norm": 0.2837151699593587, + "learning_rate": 4.563276201497161e-06, + "loss": 0.0929, + "step": 975 + }, + { + "epoch": 1.4295129989015014, + "grad_norm": 0.2517370953490891, + "learning_rate": 4.5418047955597465e-06, + "loss": 0.0702, + "step": 976 + }, + { + "epoch": 1.4309776638593923, + "grad_norm": 0.28908876741636197, + "learning_rate": 4.520369171196766e-06, + "loss": 0.0928, + "step": 977 + }, + { + "epoch": 1.4324423288172832, + "grad_norm": 0.3619682932427526, + "learning_rate": 4.4989694689309394e-06, + "loss": 0.086, + "step": 978 + }, + { + "epoch": 1.433906993775174, + "grad_norm": 0.28663475644140607, + "learning_rate": 4.477605829049479e-06, + "loss": 0.0811, + "step": 979 + }, + { + "epoch": 1.435371658733065, + "grad_norm": 0.2698934408611363, + "learning_rate": 4.456278391603207e-06, + "loss": 0.0811, + "step": 980 + }, + { + "epoch": 1.4368363236909558, + "grad_norm": 0.2663708566419197, + "learning_rate": 4.434987296405602e-06, + "loss": 0.106, + "step": 981 + }, + { + "epoch": 1.4383009886488467, + "grad_norm": 0.24655631319842156, + "learning_rate": 4.413732683031901e-06, + "loss": 0.078, + "step": 982 + }, + { + "epoch": 1.4397656536067376, + "grad_norm": 0.24077565145998617, + "learning_rate": 4.392514690818194e-06, + "loss": 0.0695, + "step": 983 + }, + { + "epoch": 1.4412303185646285, + "grad_norm": 0.2717847069075042, + "learning_rate": 4.37133345886048e-06, + "loss": 0.0902, + "step": 984 + }, + { + "epoch": 1.4426949835225193, + "grad_norm": 0.2318112119726766, + "learning_rate": 4.350189126013793e-06, + "loss": 0.0641, + "step": 985 + }, + { + "epoch": 1.4441596484804102, + "grad_norm": 0.26503499304444195, + "learning_rate": 4.329081830891253e-06, + "loss": 0.0967, + "step": 986 + }, + { + "epoch": 1.4456243134383011, + "grad_norm": 0.2691482856135504, + "learning_rate": 4.308011711863196e-06, + "loss": 0.0911, + "step": 987 + }, + { + "epoch": 1.447088978396192, + "grad_norm": 0.27532174581408686, + "learning_rate": 4.28697890705623e-06, + "loss": 0.0885, + "step": 988 + }, + { + "epoch": 1.4485536433540829, + "grad_norm": 0.27243518360307484, + "learning_rate": 4.265983554352361e-06, + "loss": 0.0815, + "step": 989 + }, + { + "epoch": 1.4500183083119738, + "grad_norm": 0.25834681516095914, + "learning_rate": 4.245025791388063e-06, + "loss": 0.0736, + "step": 990 + }, + { + "epoch": 1.4514829732698646, + "grad_norm": 0.2850395862995923, + "learning_rate": 4.224105755553402e-06, + "loss": 0.0851, + "step": 991 + }, + { + "epoch": 1.4529476382277555, + "grad_norm": 0.3092877439958859, + "learning_rate": 4.203223583991103e-06, + "loss": 0.1023, + "step": 992 + }, + { + "epoch": 1.4544123031856464, + "grad_norm": 0.26774401365459694, + "learning_rate": 4.18237941359569e-06, + "loss": 0.0829, + "step": 993 + }, + { + "epoch": 1.4558769681435373, + "grad_norm": 0.23800118678712484, + "learning_rate": 4.161573381012547e-06, + "loss": 0.0676, + "step": 994 + }, + { + "epoch": 1.4573416331014282, + "grad_norm": 0.2666429232022083, + "learning_rate": 4.140805622637062e-06, + "loss": 0.0859, + "step": 995 + }, + { + "epoch": 1.458806298059319, + "grad_norm": 0.24644352364865663, + "learning_rate": 4.1200762746137e-06, + "loss": 0.0761, + "step": 996 + }, + { + "epoch": 1.46027096301721, + "grad_norm": 0.2860182896303882, + "learning_rate": 4.099385472835128e-06, + "loss": 0.1061, + "step": 997 + }, + { + "epoch": 1.4617356279751008, + "grad_norm": 0.24601895494668852, + "learning_rate": 4.078733352941322e-06, + "loss": 0.0889, + "step": 998 + }, + { + "epoch": 1.4632002929329917, + "grad_norm": 0.2766785303442865, + "learning_rate": 4.05812005031868e-06, + "loss": 0.0904, + "step": 999 + }, + { + "epoch": 1.4646649578908826, + "grad_norm": 0.2652781699314485, + "learning_rate": 4.0375457000991216e-06, + "loss": 0.0816, + "step": 1000 + }, + { + "epoch": 1.4661296228487735, + "grad_norm": 0.25088853578582626, + "learning_rate": 4.01701043715922e-06, + "loss": 0.0762, + "step": 1001 + }, + { + "epoch": 1.4675942878066643, + "grad_norm": 0.2818469333221271, + "learning_rate": 3.996514396119301e-06, + "loss": 0.0774, + "step": 1002 + }, + { + "epoch": 1.4690589527645552, + "grad_norm": 0.26036857004758346, + "learning_rate": 3.976057711342578e-06, + "loss": 0.0795, + "step": 1003 + }, + { + "epoch": 1.470523617722446, + "grad_norm": 0.27442281645232014, + "learning_rate": 3.95564051693425e-06, + "loss": 0.07, + "step": 1004 + }, + { + "epoch": 1.471988282680337, + "grad_norm": 0.24267887450363582, + "learning_rate": 3.935262946740648e-06, + "loss": 0.0756, + "step": 1005 + }, + { + "epoch": 1.4734529476382279, + "grad_norm": 0.2511350381009027, + "learning_rate": 3.914925134348328e-06, + "loss": 0.069, + "step": 1006 + }, + { + "epoch": 1.4749176125961188, + "grad_norm": 0.2342240326378387, + "learning_rate": 3.894627213083227e-06, + "loss": 0.0688, + "step": 1007 + }, + { + "epoch": 1.4763822775540096, + "grad_norm": 0.23553774309732706, + "learning_rate": 3.874369316009759e-06, + "loss": 0.0744, + "step": 1008 + }, + { + "epoch": 1.4778469425119005, + "grad_norm": 0.23926474659226465, + "learning_rate": 3.85415157592997e-06, + "loss": 0.0734, + "step": 1009 + }, + { + "epoch": 1.4793116074697914, + "grad_norm": 0.268715175607369, + "learning_rate": 3.833974125382639e-06, + "loss": 0.0723, + "step": 1010 + }, + { + "epoch": 1.4807762724276823, + "grad_norm": 0.2662279770589081, + "learning_rate": 3.8138370966424386e-06, + "loss": 0.0717, + "step": 1011 + }, + { + "epoch": 1.4822409373855732, + "grad_norm": 0.2737509205481138, + "learning_rate": 3.793740621719042e-06, + "loss": 0.0814, + "step": 1012 + }, + { + "epoch": 1.483705602343464, + "grad_norm": 0.2639004103415663, + "learning_rate": 3.7736848323562803e-06, + "loss": 0.091, + "step": 1013 + }, + { + "epoch": 1.485170267301355, + "grad_norm": 0.24888682745157728, + "learning_rate": 3.753669860031254e-06, + "loss": 0.0791, + "step": 1014 + }, + { + "epoch": 1.4866349322592458, + "grad_norm": 0.2768808484904024, + "learning_rate": 3.7336958359534992e-06, + "loss": 0.1019, + "step": 1015 + }, + { + "epoch": 1.4880995972171367, + "grad_norm": 0.24635968007971593, + "learning_rate": 3.7137628910640997e-06, + "loss": 0.0656, + "step": 1016 + }, + { + "epoch": 1.4895642621750276, + "grad_norm": 0.2769850878713628, + "learning_rate": 3.693871156034854e-06, + "loss": 0.0824, + "step": 1017 + }, + { + "epoch": 1.4910289271329185, + "grad_norm": 0.2618910230484893, + "learning_rate": 3.674020761267394e-06, + "loss": 0.0749, + "step": 1018 + }, + { + "epoch": 1.4924935920908093, + "grad_norm": 0.25597642357105027, + "learning_rate": 3.6542118368923562e-06, + "loss": 0.0698, + "step": 1019 + }, + { + "epoch": 1.4939582570487002, + "grad_norm": 0.2717269361679958, + "learning_rate": 3.634444512768501e-06, + "loss": 0.083, + "step": 1020 + }, + { + "epoch": 1.495422922006591, + "grad_norm": 0.2992590236586871, + "learning_rate": 3.61471891848189e-06, + "loss": 0.0886, + "step": 1021 + }, + { + "epoch": 1.496887586964482, + "grad_norm": 0.2455530601341986, + "learning_rate": 3.595035183345007e-06, + "loss": 0.0666, + "step": 1022 + }, + { + "epoch": 1.4983522519223729, + "grad_norm": 0.2811055696814055, + "learning_rate": 3.575393436395941e-06, + "loss": 0.0837, + "step": 1023 + }, + { + "epoch": 1.4998169168802638, + "grad_norm": 0.25019620839932355, + "learning_rate": 3.5557938063975105e-06, + "loss": 0.0755, + "step": 1024 + }, + { + "epoch": 1.5012815818381546, + "grad_norm": 0.24368791106013798, + "learning_rate": 3.5362364218364387e-06, + "loss": 0.0731, + "step": 1025 + }, + { + "epoch": 1.5027462467960455, + "grad_norm": 0.26573828045849746, + "learning_rate": 3.5167214109225113e-06, + "loss": 0.0791, + "step": 1026 + }, + { + "epoch": 1.5042109117539364, + "grad_norm": 0.2550833556385589, + "learning_rate": 3.497248901587721e-06, + "loss": 0.08, + "step": 1027 + }, + { + "epoch": 1.5056755767118273, + "grad_norm": 0.2639518823077813, + "learning_rate": 3.477819021485448e-06, + "loss": 0.0782, + "step": 1028 + }, + { + "epoch": 1.5071402416697182, + "grad_norm": 0.2646189162558523, + "learning_rate": 3.4584318979896028e-06, + "loss": 0.0878, + "step": 1029 + }, + { + "epoch": 1.508604906627609, + "grad_norm": 0.2641718505669378, + "learning_rate": 3.439087658193816e-06, + "loss": 0.0936, + "step": 1030 + }, + { + "epoch": 1.5100695715855, + "grad_norm": 0.24594190472472738, + "learning_rate": 3.4197864289105763e-06, + "loss": 0.0778, + "step": 1031 + }, + { + "epoch": 1.5115342365433908, + "grad_norm": 0.28815729095628295, + "learning_rate": 3.4005283366704268e-06, + "loss": 0.0761, + "step": 1032 + }, + { + "epoch": 1.5129989015012817, + "grad_norm": 0.2528110175883712, + "learning_rate": 3.381313507721111e-06, + "loss": 0.0747, + "step": 1033 + }, + { + "epoch": 1.5144635664591726, + "grad_norm": 0.2533306883004603, + "learning_rate": 3.36214206802677e-06, + "loss": 0.0691, + "step": 1034 + }, + { + "epoch": 1.5159282314170635, + "grad_norm": 0.25930598769235536, + "learning_rate": 3.343014143267089e-06, + "loss": 0.0692, + "step": 1035 + }, + { + "epoch": 1.5173928963749543, + "grad_norm": 0.25924131927849636, + "learning_rate": 3.3239298588365045e-06, + "loss": 0.0775, + "step": 1036 + }, + { + "epoch": 1.5188575613328452, + "grad_norm": 0.2491634194032525, + "learning_rate": 3.304889339843347e-06, + "loss": 0.0723, + "step": 1037 + }, + { + "epoch": 1.520322226290736, + "grad_norm": 0.26529983131571094, + "learning_rate": 3.285892711109059e-06, + "loss": 0.0914, + "step": 1038 + }, + { + "epoch": 1.521786891248627, + "grad_norm": 0.22521005677274164, + "learning_rate": 3.2669400971673425e-06, + "loss": 0.0616, + "step": 1039 + }, + { + "epoch": 1.5232515562065179, + "grad_norm": 0.24331418307789873, + "learning_rate": 3.2480316222633614e-06, + "loss": 0.0652, + "step": 1040 + }, + { + "epoch": 1.5247162211644087, + "grad_norm": 0.2488037942280336, + "learning_rate": 3.229167410352931e-06, + "loss": 0.0715, + "step": 1041 + }, + { + "epoch": 1.5261808861222996, + "grad_norm": 0.26897121918679934, + "learning_rate": 3.210347585101684e-06, + "loss": 0.0793, + "step": 1042 + }, + { + "epoch": 1.5276455510801905, + "grad_norm": 0.25348059670947104, + "learning_rate": 3.1915722698842877e-06, + "loss": 0.078, + "step": 1043 + }, + { + "epoch": 1.5291102160380814, + "grad_norm": 0.233264660760853, + "learning_rate": 3.1728415877836072e-06, + "loss": 0.0761, + "step": 1044 + }, + { + "epoch": 1.5305748809959723, + "grad_norm": 0.26375032434001777, + "learning_rate": 3.154155661589924e-06, + "loss": 0.0819, + "step": 1045 + }, + { + "epoch": 1.5320395459538632, + "grad_norm": 0.26419620695615686, + "learning_rate": 3.135514613800108e-06, + "loss": 0.0765, + "step": 1046 + }, + { + "epoch": 1.533504210911754, + "grad_norm": 0.23976923188526442, + "learning_rate": 3.1169185666168377e-06, + "loss": 0.0692, + "step": 1047 + }, + { + "epoch": 1.534968875869645, + "grad_norm": 0.2493779944554121, + "learning_rate": 3.098367641947775e-06, + "loss": 0.0845, + "step": 1048 + }, + { + "epoch": 1.5364335408275358, + "grad_norm": 0.29326865343508624, + "learning_rate": 3.0798619614047885e-06, + "loss": 0.0998, + "step": 1049 + }, + { + "epoch": 1.5378982057854267, + "grad_norm": 0.2789752607489951, + "learning_rate": 3.061401646303136e-06, + "loss": 0.0886, + "step": 1050 + }, + { + "epoch": 1.5393628707433176, + "grad_norm": 0.2737662371091596, + "learning_rate": 3.04298681766069e-06, + "loss": 0.0684, + "step": 1051 + }, + { + "epoch": 1.5408275357012085, + "grad_norm": 0.2637024271799521, + "learning_rate": 3.024617596197121e-06, + "loss": 0.0859, + "step": 1052 + }, + { + "epoch": 1.5422922006590993, + "grad_norm": 0.23552670812452456, + "learning_rate": 3.0062941023331316e-06, + "loss": 0.0639, + "step": 1053 + }, + { + "epoch": 1.5437568656169902, + "grad_norm": 0.2442542376702699, + "learning_rate": 2.988016456189644e-06, + "loss": 0.0721, + "step": 1054 + }, + { + "epoch": 1.545221530574881, + "grad_norm": 0.2681121581501204, + "learning_rate": 2.9697847775870227e-06, + "loss": 0.0781, + "step": 1055 + }, + { + "epoch": 1.546686195532772, + "grad_norm": 0.3009772111270036, + "learning_rate": 2.9515991860442973e-06, + "loss": 0.0887, + "step": 1056 + }, + { + "epoch": 1.5481508604906629, + "grad_norm": 0.2438680927400026, + "learning_rate": 2.933459800778361e-06, + "loss": 0.0708, + "step": 1057 + }, + { + "epoch": 1.5496155254485537, + "grad_norm": 0.24192854414225795, + "learning_rate": 2.9153667407032073e-06, + "loss": 0.0651, + "step": 1058 + }, + { + "epoch": 1.5510801904064446, + "grad_norm": 0.25560089457881163, + "learning_rate": 2.8973201244291305e-06, + "loss": 0.0748, + "step": 1059 + }, + { + "epoch": 1.5525448553643355, + "grad_norm": 0.24635545740478473, + "learning_rate": 2.879320070261974e-06, + "loss": 0.0638, + "step": 1060 + }, + { + "epoch": 1.5540095203222264, + "grad_norm": 0.3187176598509699, + "learning_rate": 2.861366696202326e-06, + "loss": 0.1081, + "step": 1061 + }, + { + "epoch": 1.5554741852801173, + "grad_norm": 0.25629735568872725, + "learning_rate": 2.8434601199447698e-06, + "loss": 0.0769, + "step": 1062 + }, + { + "epoch": 1.5569388502380082, + "grad_norm": 0.24990001099936748, + "learning_rate": 2.825600458877095e-06, + "loss": 0.0898, + "step": 1063 + }, + { + "epoch": 1.558403515195899, + "grad_norm": 0.250013809003241, + "learning_rate": 2.8077878300795446e-06, + "loss": 0.0746, + "step": 1064 + }, + { + "epoch": 1.55986818015379, + "grad_norm": 0.25934456236335596, + "learning_rate": 2.7900223503240265e-06, + "loss": 0.0958, + "step": 1065 + }, + { + "epoch": 1.5613328451116808, + "grad_norm": 0.24098659840069045, + "learning_rate": 2.7723041360733737e-06, + "loss": 0.0695, + "step": 1066 + }, + { + "epoch": 1.5627975100695717, + "grad_norm": 0.26301653549643544, + "learning_rate": 2.7546333034805528e-06, + "loss": 0.0741, + "step": 1067 + }, + { + "epoch": 1.5642621750274626, + "grad_norm": 0.2391640316419889, + "learning_rate": 2.737009968387929e-06, + "loss": 0.0605, + "step": 1068 + }, + { + "epoch": 1.5657268399853534, + "grad_norm": 0.25060691825740045, + "learning_rate": 2.719434246326487e-06, + "loss": 0.0643, + "step": 1069 + }, + { + "epoch": 1.5671915049432443, + "grad_norm": 0.274283165127303, + "learning_rate": 2.7019062525150783e-06, + "loss": 0.0874, + "step": 1070 + }, + { + "epoch": 1.5686561699011352, + "grad_norm": 0.2978642107033507, + "learning_rate": 2.6844261018596806e-06, + "loss": 0.0823, + "step": 1071 + }, + { + "epoch": 1.570120834859026, + "grad_norm": 0.2690424055706032, + "learning_rate": 2.6669939089526177e-06, + "loss": 0.0856, + "step": 1072 + }, + { + "epoch": 1.571585499816917, + "grad_norm": 0.28386560171341124, + "learning_rate": 2.6496097880718364e-06, + "loss": 0.079, + "step": 1073 + }, + { + "epoch": 1.5730501647748079, + "grad_norm": 0.2898252725065379, + "learning_rate": 2.632273853180132e-06, + "loss": 0.0801, + "step": 1074 + }, + { + "epoch": 1.5745148297326987, + "grad_norm": 0.2510043284519159, + "learning_rate": 2.6149862179244257e-06, + "loss": 0.0692, + "step": 1075 + }, + { + "epoch": 1.5759794946905896, + "grad_norm": 0.2723382092456178, + "learning_rate": 2.5977469956349956e-06, + "loss": 0.0956, + "step": 1076 + }, + { + "epoch": 1.5774441596484805, + "grad_norm": 0.26220486838825635, + "learning_rate": 2.5805562993247536e-06, + "loss": 0.0705, + "step": 1077 + }, + { + "epoch": 1.5789088246063714, + "grad_norm": 0.2645372898294826, + "learning_rate": 2.563414241688489e-06, + "loss": 0.0691, + "step": 1078 + }, + { + "epoch": 1.5803734895642623, + "grad_norm": 0.25156658959890105, + "learning_rate": 2.5463209351021457e-06, + "loss": 0.0796, + "step": 1079 + }, + { + "epoch": 1.5818381545221532, + "grad_norm": 0.25334520784266373, + "learning_rate": 2.529276491622067e-06, + "loss": 0.0737, + "step": 1080 + }, + { + "epoch": 1.583302819480044, + "grad_norm": 0.23790674931444958, + "learning_rate": 2.5122810229842807e-06, + "loss": 0.0629, + "step": 1081 + }, + { + "epoch": 1.584767484437935, + "grad_norm": 0.2303091761767703, + "learning_rate": 2.495334640603746e-06, + "loss": 0.0767, + "step": 1082 + }, + { + "epoch": 1.5862321493958258, + "grad_norm": 0.2510575661559066, + "learning_rate": 2.4784374555736445e-06, + "loss": 0.0649, + "step": 1083 + }, + { + "epoch": 1.5876968143537167, + "grad_norm": 0.26143738599858646, + "learning_rate": 2.4615895786646337e-06, + "loss": 0.0831, + "step": 1084 + }, + { + "epoch": 1.5891614793116076, + "grad_norm": 0.2567282259841764, + "learning_rate": 2.444791120324127e-06, + "loss": 0.0754, + "step": 1085 + }, + { + "epoch": 1.5906261442694984, + "grad_norm": 0.2730764216157527, + "learning_rate": 2.4280421906755814e-06, + "loss": 0.0803, + "step": 1086 + }, + { + "epoch": 1.5920908092273893, + "grad_norm": 0.25081318067201624, + "learning_rate": 2.4113428995177522e-06, + "loss": 0.0778, + "step": 1087 + }, + { + "epoch": 1.5935554741852802, + "grad_norm": 0.24769756992277367, + "learning_rate": 2.394693356323997e-06, + "loss": 0.0775, + "step": 1088 + }, + { + "epoch": 1.595020139143171, + "grad_norm": 0.2546886460741762, + "learning_rate": 2.378093670241538e-06, + "loss": 0.0688, + "step": 1089 + }, + { + "epoch": 1.596484804101062, + "grad_norm": 0.2563027758513368, + "learning_rate": 2.3615439500907657e-06, + "loss": 0.0617, + "step": 1090 + }, + { + "epoch": 1.5979494690589529, + "grad_norm": 0.25885408016228334, + "learning_rate": 2.3450443043645035e-06, + "loss": 0.0697, + "step": 1091 + }, + { + "epoch": 1.5994141340168437, + "grad_norm": 0.26381952528868263, + "learning_rate": 2.3285948412273198e-06, + "loss": 0.0732, + "step": 1092 + }, + { + "epoch": 1.6008787989747346, + "grad_norm": 0.22722995241678426, + "learning_rate": 2.3121956685147995e-06, + "loss": 0.0571, + "step": 1093 + }, + { + "epoch": 1.6023434639326255, + "grad_norm": 0.2828387588961395, + "learning_rate": 2.2958468937328528e-06, + "loss": 0.0749, + "step": 1094 + }, + { + "epoch": 1.6038081288905164, + "grad_norm": 0.2558084885710104, + "learning_rate": 2.279548624056992e-06, + "loss": 0.0733, + "step": 1095 + }, + { + "epoch": 1.6052727938484073, + "grad_norm": 0.27462713945439543, + "learning_rate": 2.263300966331652e-06, + "loss": 0.0934, + "step": 1096 + }, + { + "epoch": 1.6067374588062981, + "grad_norm": 0.2904102780660436, + "learning_rate": 2.247104027069467e-06, + "loss": 0.0901, + "step": 1097 + }, + { + "epoch": 1.608202123764189, + "grad_norm": 0.2771702213465073, + "learning_rate": 2.230957912450592e-06, + "loss": 0.08, + "step": 1098 + }, + { + "epoch": 1.60966678872208, + "grad_norm": 0.2528112186314324, + "learning_rate": 2.214862728321987e-06, + "loss": 0.0755, + "step": 1099 + }, + { + "epoch": 1.6111314536799708, + "grad_norm": 0.23629870958401772, + "learning_rate": 2.1988185801967464e-06, + "loss": 0.0687, + "step": 1100 + }, + { + "epoch": 1.6125961186378617, + "grad_norm": 0.24410925927191227, + "learning_rate": 2.182825573253382e-06, + "loss": 0.0727, + "step": 1101 + }, + { + "epoch": 1.6140607835957526, + "grad_norm": 0.2561472124227788, + "learning_rate": 2.1668838123351566e-06, + "loss": 0.0909, + "step": 1102 + }, + { + "epoch": 1.6155254485536434, + "grad_norm": 0.2635995106202067, + "learning_rate": 2.150993401949376e-06, + "loss": 0.076, + "step": 1103 + }, + { + "epoch": 1.6169901135115343, + "grad_norm": 0.24617657014500907, + "learning_rate": 2.135154446266726e-06, + "loss": 0.068, + "step": 1104 + }, + { + "epoch": 1.6184547784694252, + "grad_norm": 0.24053480589468268, + "learning_rate": 2.119367049120565e-06, + "loss": 0.0635, + "step": 1105 + }, + { + "epoch": 1.619919443427316, + "grad_norm": 0.24204380335717454, + "learning_rate": 2.103631314006267e-06, + "loss": 0.0678, + "step": 1106 + }, + { + "epoch": 1.621384108385207, + "grad_norm": 0.22690779315880516, + "learning_rate": 2.087947344080522e-06, + "loss": 0.057, + "step": 1107 + }, + { + "epoch": 1.6228487733430979, + "grad_norm": 0.28433243723669543, + "learning_rate": 2.0723152421606805e-06, + "loss": 0.0951, + "step": 1108 + }, + { + "epoch": 1.6243134383009887, + "grad_norm": 0.2599968195648375, + "learning_rate": 2.0567351107240563e-06, + "loss": 0.0695, + "step": 1109 + }, + { + "epoch": 1.6257781032588796, + "grad_norm": 0.26139586892133587, + "learning_rate": 2.041207051907279e-06, + "loss": 0.078, + "step": 1110 + }, + { + "epoch": 1.6272427682167705, + "grad_norm": 0.2885911977842118, + "learning_rate": 2.0257311675056025e-06, + "loss": 0.0893, + "step": 1111 + }, + { + "epoch": 1.6287074331746614, + "grad_norm": 0.23726597176087105, + "learning_rate": 2.0103075589722576e-06, + "loss": 0.0614, + "step": 1112 + }, + { + "epoch": 1.6301720981325523, + "grad_norm": 0.26578035267907674, + "learning_rate": 1.9949363274177667e-06, + "loss": 0.0921, + "step": 1113 + }, + { + "epoch": 1.6316367630904431, + "grad_norm": 0.2749652618636836, + "learning_rate": 1.9796175736093027e-06, + "loss": 0.0835, + "step": 1114 + }, + { + "epoch": 1.633101428048334, + "grad_norm": 0.24658373064000075, + "learning_rate": 1.9643513979700035e-06, + "loss": 0.0777, + "step": 1115 + }, + { + "epoch": 1.634566093006225, + "grad_norm": 0.22932017498786317, + "learning_rate": 1.9491379005783405e-06, + "loss": 0.0723, + "step": 1116 + }, + { + "epoch": 1.6360307579641158, + "grad_norm": 0.2545440028300663, + "learning_rate": 1.933977181167439e-06, + "loss": 0.076, + "step": 1117 + }, + { + "epoch": 1.6374954229220067, + "grad_norm": 0.2552570068416127, + "learning_rate": 1.9188693391244438e-06, + "loss": 0.0754, + "step": 1118 + }, + { + "epoch": 1.6389600878798976, + "grad_norm": 0.2478371358109087, + "learning_rate": 1.9038144734898478e-06, + "loss": 0.0729, + "step": 1119 + }, + { + "epoch": 1.6404247528377884, + "grad_norm": 0.2291241183631655, + "learning_rate": 1.8888126829568642e-06, + "loss": 0.0634, + "step": 1120 + }, + { + "epoch": 1.6418894177956793, + "grad_norm": 0.24193265605324504, + "learning_rate": 1.8738640658707585e-06, + "loss": 0.0671, + "step": 1121 + }, + { + "epoch": 1.6433540827535702, + "grad_norm": 0.2576873945419135, + "learning_rate": 1.8589687202282247e-06, + "loss": 0.0723, + "step": 1122 + }, + { + "epoch": 1.644818747711461, + "grad_norm": 0.24676250477181633, + "learning_rate": 1.844126743676722e-06, + "loss": 0.0748, + "step": 1123 + }, + { + "epoch": 1.646283412669352, + "grad_norm": 0.263090468630097, + "learning_rate": 1.8293382335138533e-06, + "loss": 0.0752, + "step": 1124 + }, + { + "epoch": 1.6477480776272428, + "grad_norm": 0.2540498525445381, + "learning_rate": 1.8146032866867114e-06, + "loss": 0.0708, + "step": 1125 + }, + { + "epoch": 1.6492127425851337, + "grad_norm": 0.2690977824033387, + "learning_rate": 1.7999219997912575e-06, + "loss": 0.0718, + "step": 1126 + }, + { + "epoch": 1.6506774075430246, + "grad_norm": 0.2565788082278319, + "learning_rate": 1.7852944690716766e-06, + "loss": 0.0834, + "step": 1127 + }, + { + "epoch": 1.6521420725009155, + "grad_norm": 0.250344072055428, + "learning_rate": 1.7707207904197566e-06, + "loss": 0.0648, + "step": 1128 + }, + { + "epoch": 1.6536067374588064, + "grad_norm": 0.2531519839830438, + "learning_rate": 1.7562010593742496e-06, + "loss": 0.0761, + "step": 1129 + }, + { + "epoch": 1.6550714024166973, + "grad_norm": 0.2671783705252202, + "learning_rate": 1.7417353711202478e-06, + "loss": 0.0806, + "step": 1130 + }, + { + "epoch": 1.6565360673745881, + "grad_norm": 0.27608727196744226, + "learning_rate": 1.7273238204885734e-06, + "loss": 0.0776, + "step": 1131 + }, + { + "epoch": 1.658000732332479, + "grad_norm": 0.21272809051359043, + "learning_rate": 1.7129665019551333e-06, + "loss": 0.0543, + "step": 1132 + }, + { + "epoch": 1.65946539729037, + "grad_norm": 0.2981100085240527, + "learning_rate": 1.6986635096403213e-06, + "loss": 0.0917, + "step": 1133 + }, + { + "epoch": 1.6609300622482608, + "grad_norm": 0.2283996510993873, + "learning_rate": 1.6844149373083852e-06, + "loss": 0.0689, + "step": 1134 + }, + { + "epoch": 1.6623947272061517, + "grad_norm": 0.2932656999891174, + "learning_rate": 1.670220878366826e-06, + "loss": 0.0913, + "step": 1135 + }, + { + "epoch": 1.6638593921640425, + "grad_norm": 0.24998740552484003, + "learning_rate": 1.6560814258657687e-06, + "loss": 0.0842, + "step": 1136 + }, + { + "epoch": 1.6653240571219334, + "grad_norm": 0.24855056763964203, + "learning_rate": 1.6419966724973734e-06, + "loss": 0.0734, + "step": 1137 + }, + { + "epoch": 1.6667887220798243, + "grad_norm": 0.24369157654216977, + "learning_rate": 1.6279667105952057e-06, + "loss": 0.0911, + "step": 1138 + }, + { + "epoch": 1.6682533870377152, + "grad_norm": 0.26041234841753214, + "learning_rate": 1.6139916321336513e-06, + "loss": 0.0704, + "step": 1139 + }, + { + "epoch": 1.669718051995606, + "grad_norm": 0.2654682154981918, + "learning_rate": 1.6000715287272938e-06, + "loss": 0.0762, + "step": 1140 + }, + { + "epoch": 1.671182716953497, + "grad_norm": 0.2379683222765605, + "learning_rate": 1.5862064916303343e-06, + "loss": 0.0734, + "step": 1141 + }, + { + "epoch": 1.6726473819113878, + "grad_norm": 0.26506248533884014, + "learning_rate": 1.5723966117359745e-06, + "loss": 0.0673, + "step": 1142 + }, + { + "epoch": 1.6741120468692787, + "grad_norm": 0.2380435720292988, + "learning_rate": 1.5586419795758356e-06, + "loss": 0.0726, + "step": 1143 + }, + { + "epoch": 1.6755767118271696, + "grad_norm": 0.23717677968711492, + "learning_rate": 1.5449426853193549e-06, + "loss": 0.0796, + "step": 1144 + }, + { + "epoch": 1.6770413767850605, + "grad_norm": 0.27958169485281625, + "learning_rate": 1.5312988187731969e-06, + "loss": 0.0855, + "step": 1145 + }, + { + "epoch": 1.6785060417429514, + "grad_norm": 0.2568879441056742, + "learning_rate": 1.5177104693806721e-06, + "loss": 0.0712, + "step": 1146 + }, + { + "epoch": 1.6799707067008423, + "grad_norm": 0.22152990710129233, + "learning_rate": 1.5041777262211355e-06, + "loss": 0.0584, + "step": 1147 + }, + { + "epoch": 1.6814353716587331, + "grad_norm": 0.2652525501551631, + "learning_rate": 1.4907006780094212e-06, + "loss": 0.0697, + "step": 1148 + }, + { + "epoch": 1.682900036616624, + "grad_norm": 0.23706028774243557, + "learning_rate": 1.4772794130952416e-06, + "loss": 0.0556, + "step": 1149 + }, + { + "epoch": 1.684364701574515, + "grad_norm": 0.232103427190539, + "learning_rate": 1.4639140194626289e-06, + "loss": 0.0591, + "step": 1150 + }, + { + "epoch": 1.6858293665324058, + "grad_norm": 0.24774373170203481, + "learning_rate": 1.450604584729336e-06, + "loss": 0.061, + "step": 1151 + }, + { + "epoch": 1.6872940314902967, + "grad_norm": 0.24757349813419924, + "learning_rate": 1.4373511961462828e-06, + "loss": 0.0859, + "step": 1152 + }, + { + "epoch": 1.6887586964481875, + "grad_norm": 0.239803460293539, + "learning_rate": 1.4241539405969662e-06, + "loss": 0.0694, + "step": 1153 + }, + { + "epoch": 1.6902233614060784, + "grad_norm": 0.24607947040534622, + "learning_rate": 1.411012904596909e-06, + "loss": 0.0726, + "step": 1154 + }, + { + "epoch": 1.6916880263639693, + "grad_norm": 0.24298503192914125, + "learning_rate": 1.3979281742930706e-06, + "loss": 0.071, + "step": 1155 + }, + { + "epoch": 1.6931526913218602, + "grad_norm": 0.25982515681290586, + "learning_rate": 1.3848998354633082e-06, + "loss": 0.0663, + "step": 1156 + }, + { + "epoch": 1.694617356279751, + "grad_norm": 0.2824274775862737, + "learning_rate": 1.3719279735157875e-06, + "loss": 0.0815, + "step": 1157 + }, + { + "epoch": 1.696082021237642, + "grad_norm": 0.25503279831214637, + "learning_rate": 1.359012673488449e-06, + "loss": 0.084, + "step": 1158 + }, + { + "epoch": 1.6975466861955328, + "grad_norm": 0.2572621551136547, + "learning_rate": 1.346154020048428e-06, + "loss": 0.0647, + "step": 1159 + }, + { + "epoch": 1.6990113511534237, + "grad_norm": 0.26401952613949853, + "learning_rate": 1.3333520974915093e-06, + "loss": 0.0611, + "step": 1160 + }, + { + "epoch": 1.7004760161113146, + "grad_norm": 0.27287208188302337, + "learning_rate": 1.320606989741583e-06, + "loss": 0.0887, + "step": 1161 + }, + { + "epoch": 1.7019406810692055, + "grad_norm": 0.25110794107433626, + "learning_rate": 1.307918780350077e-06, + "loss": 0.0726, + "step": 1162 + }, + { + "epoch": 1.7034053460270964, + "grad_norm": 0.26427102366363486, + "learning_rate": 1.2952875524954233e-06, + "loss": 0.0744, + "step": 1163 + }, + { + "epoch": 1.7048700109849872, + "grad_norm": 0.2577483404898282, + "learning_rate": 1.2827133889825039e-06, + "loss": 0.0799, + "step": 1164 + }, + { + "epoch": 1.7063346759428781, + "grad_norm": 0.2379395103991794, + "learning_rate": 1.2701963722421162e-06, + "loss": 0.0668, + "step": 1165 + }, + { + "epoch": 1.707799340900769, + "grad_norm": 0.24586889905164777, + "learning_rate": 1.2577365843304212e-06, + "loss": 0.0888, + "step": 1166 + }, + { + "epoch": 1.70926400585866, + "grad_norm": 0.23659079747711304, + "learning_rate": 1.245334106928422e-06, + "loss": 0.06, + "step": 1167 + }, + { + "epoch": 1.7107286708165508, + "grad_norm": 0.23685869500201995, + "learning_rate": 1.2329890213414063e-06, + "loss": 0.0639, + "step": 1168 + }, + { + "epoch": 1.7121933357744417, + "grad_norm": 0.2375875883124259, + "learning_rate": 1.220701408498438e-06, + "loss": 0.0669, + "step": 1169 + }, + { + "epoch": 1.7136580007323325, + "grad_norm": 0.26520434490127226, + "learning_rate": 1.2084713489518063e-06, + "loss": 0.0737, + "step": 1170 + }, + { + "epoch": 1.7151226656902234, + "grad_norm": 0.2629329649647797, + "learning_rate": 1.19629892287651e-06, + "loss": 0.0942, + "step": 1171 + }, + { + "epoch": 1.7165873306481143, + "grad_norm": 0.23516934988436172, + "learning_rate": 1.1841842100697253e-06, + "loss": 0.0551, + "step": 1172 + }, + { + "epoch": 1.7180519956060052, + "grad_norm": 0.27697429411981783, + "learning_rate": 1.1721272899502856e-06, + "loss": 0.0762, + "step": 1173 + }, + { + "epoch": 1.719516660563896, + "grad_norm": 0.25426135353111506, + "learning_rate": 1.1601282415581627e-06, + "loss": 0.0664, + "step": 1174 + }, + { + "epoch": 1.720981325521787, + "grad_norm": 0.24626607935465855, + "learning_rate": 1.1481871435539415e-06, + "loss": 0.0619, + "step": 1175 + }, + { + "epoch": 1.7224459904796778, + "grad_norm": 0.25729901031239943, + "learning_rate": 1.1363040742183162e-06, + "loss": 0.073, + "step": 1176 + }, + { + "epoch": 1.7239106554375687, + "grad_norm": 0.2567425743663647, + "learning_rate": 1.1244791114515608e-06, + "loss": 0.0578, + "step": 1177 + }, + { + "epoch": 1.7253753203954596, + "grad_norm": 0.2624890331507839, + "learning_rate": 1.1127123327730383e-06, + "loss": 0.0695, + "step": 1178 + }, + { + "epoch": 1.7268399853533505, + "grad_norm": 0.25775906362459766, + "learning_rate": 1.1010038153206703e-06, + "loss": 0.0671, + "step": 1179 + }, + { + "epoch": 1.7283046503112414, + "grad_norm": 0.2357579800372649, + "learning_rate": 1.0893536358504553e-06, + "loss": 0.0626, + "step": 1180 + }, + { + "epoch": 1.7297693152691322, + "grad_norm": 0.27772530636880377, + "learning_rate": 1.0777618707359427e-06, + "loss": 0.0768, + "step": 1181 + }, + { + "epoch": 1.7312339802270231, + "grad_norm": 0.2383877132505535, + "learning_rate": 1.0662285959677499e-06, + "loss": 0.0667, + "step": 1182 + }, + { + "epoch": 1.732698645184914, + "grad_norm": 0.23742698011306232, + "learning_rate": 1.0547538871530482e-06, + "loss": 0.0599, + "step": 1183 + }, + { + "epoch": 1.734163310142805, + "grad_norm": 0.2587588714854726, + "learning_rate": 1.0433378195150889e-06, + "loss": 0.1056, + "step": 1184 + }, + { + "epoch": 1.7356279751006958, + "grad_norm": 0.2623510431514894, + "learning_rate": 1.0319804678926825e-06, + "loss": 0.0724, + "step": 1185 + }, + { + "epoch": 1.7370926400585867, + "grad_norm": 0.23000218942084621, + "learning_rate": 1.0206819067397345e-06, + "loss": 0.0631, + "step": 1186 + }, + { + "epoch": 1.7385573050164775, + "grad_norm": 0.23524204735507692, + "learning_rate": 1.009442210124737e-06, + "loss": 0.0642, + "step": 1187 + }, + { + "epoch": 1.7400219699743684, + "grad_norm": 0.26310108373605007, + "learning_rate": 9.982614517302958e-07, + "loss": 0.0732, + "step": 1188 + }, + { + "epoch": 1.7414866349322593, + "grad_norm": 0.2822237389406422, + "learning_rate": 9.871397048526431e-07, + "loss": 0.09, + "step": 1189 + }, + { + "epoch": 1.7429512998901502, + "grad_norm": 0.27062484433619666, + "learning_rate": 9.760770424011557e-07, + "loss": 0.0742, + "step": 1190 + }, + { + "epoch": 1.744415964848041, + "grad_norm": 0.2942597554218074, + "learning_rate": 9.650735368978793e-07, + "loss": 0.0893, + "step": 1191 + }, + { + "epoch": 1.745880629805932, + "grad_norm": 0.2595099599479516, + "learning_rate": 9.541292604770502e-07, + "loss": 0.0801, + "step": 1192 + }, + { + "epoch": 1.7473452947638228, + "grad_norm": 0.25996832033844564, + "learning_rate": 9.432442848846291e-07, + "loss": 0.0802, + "step": 1193 + }, + { + "epoch": 1.7488099597217137, + "grad_norm": 0.25099023362939665, + "learning_rate": 9.324186814778202e-07, + "loss": 0.0759, + "step": 1194 + }, + { + "epoch": 1.7502746246796046, + "grad_norm": 0.261784151531945, + "learning_rate": 9.216525212246131e-07, + "loss": 0.068, + "step": 1195 + }, + { + "epoch": 1.7517392896374955, + "grad_norm": 0.2676041434167344, + "learning_rate": 9.109458747033106e-07, + "loss": 0.0758, + "step": 1196 + }, + { + "epoch": 1.7532039545953864, + "grad_norm": 0.27324413990185265, + "learning_rate": 9.00298812102075e-07, + "loss": 0.0781, + "step": 1197 + }, + { + "epoch": 1.7546686195532772, + "grad_norm": 0.23917286810904692, + "learning_rate": 8.897114032184539e-07, + "loss": 0.0604, + "step": 1198 + }, + { + "epoch": 1.7561332845111681, + "grad_norm": 0.2632087773397572, + "learning_rate": 8.791837174589401e-07, + "loss": 0.0835, + "step": 1199 + }, + { + "epoch": 1.757597949469059, + "grad_norm": 0.2745654342512245, + "learning_rate": 8.687158238384963e-07, + "loss": 0.0688, + "step": 1200 + }, + { + "epoch": 1.7590626144269499, + "grad_norm": 0.255009900836695, + "learning_rate": 8.583077909801252e-07, + "loss": 0.0729, + "step": 1201 + }, + { + "epoch": 1.7605272793848408, + "grad_norm": 0.24279405877132665, + "learning_rate": 8.47959687114398e-07, + "loss": 0.064, + "step": 1202 + }, + { + "epoch": 1.7619919443427317, + "grad_norm": 0.2671484719871084, + "learning_rate": 8.37671580079027e-07, + "loss": 0.0689, + "step": 1203 + }, + { + "epoch": 1.7634566093006225, + "grad_norm": 0.22793656038154106, + "learning_rate": 8.274435373184009e-07, + "loss": 0.0609, + "step": 1204 + }, + { + "epoch": 1.7649212742585134, + "grad_norm": 0.25888704046474315, + "learning_rate": 8.172756258831638e-07, + "loss": 0.0663, + "step": 1205 + }, + { + "epoch": 1.7663859392164043, + "grad_norm": 0.2694033818200205, + "learning_rate": 8.071679124297537e-07, + "loss": 0.077, + "step": 1206 + }, + { + "epoch": 1.7678506041742952, + "grad_norm": 0.25795707216950153, + "learning_rate": 7.971204632199869e-07, + "loss": 0.0738, + "step": 1207 + }, + { + "epoch": 1.769315269132186, + "grad_norm": 0.25028412383513904, + "learning_rate": 7.871333441206053e-07, + "loss": 0.0677, + "step": 1208 + }, + { + "epoch": 1.770779934090077, + "grad_norm": 0.23004892259257617, + "learning_rate": 7.772066206028572e-07, + "loss": 0.0612, + "step": 1209 + }, + { + "epoch": 1.7722445990479678, + "grad_norm": 0.25383482467849333, + "learning_rate": 7.673403577420591e-07, + "loss": 0.0685, + "step": 1210 + }, + { + "epoch": 1.7737092640058587, + "grad_norm": 0.2519042969852701, + "learning_rate": 7.575346202171819e-07, + "loss": 0.0647, + "step": 1211 + }, + { + "epoch": 1.7751739289637496, + "grad_norm": 0.26600143625920347, + "learning_rate": 7.477894723104073e-07, + "loss": 0.0807, + "step": 1212 + }, + { + "epoch": 1.7766385939216405, + "grad_norm": 0.24525033390296022, + "learning_rate": 7.381049779067273e-07, + "loss": 0.0616, + "step": 1213 + }, + { + "epoch": 1.7781032588795314, + "grad_norm": 0.24140459393603353, + "learning_rate": 7.284812004935083e-07, + "loss": 0.0648, + "step": 1214 + }, + { + "epoch": 1.7795679238374222, + "grad_norm": 0.27804404244360953, + "learning_rate": 7.189182031600906e-07, + "loss": 0.0886, + "step": 1215 + }, + { + "epoch": 1.7810325887953131, + "grad_norm": 0.2619152806722488, + "learning_rate": 7.094160485973567e-07, + "loss": 0.0826, + "step": 1216 + }, + { + "epoch": 1.782497253753204, + "grad_norm": 0.2204653484419197, + "learning_rate": 6.999747990973382e-07, + "loss": 0.0555, + "step": 1217 + }, + { + "epoch": 1.7839619187110949, + "grad_norm": 0.26650987169853924, + "learning_rate": 6.905945165527928e-07, + "loss": 0.0699, + "step": 1218 + }, + { + "epoch": 1.7854265836689858, + "grad_norm": 0.2451107677206505, + "learning_rate": 6.812752624568131e-07, + "loss": 0.0739, + "step": 1219 + }, + { + "epoch": 1.7868912486268766, + "grad_norm": 0.24387464427507713, + "learning_rate": 6.720170979024065e-07, + "loss": 0.066, + "step": 1220 + }, + { + "epoch": 1.7883559135847675, + "grad_norm": 0.24790390060583067, + "learning_rate": 6.628200835821119e-07, + "loss": 0.0568, + "step": 1221 + }, + { + "epoch": 1.7898205785426584, + "grad_norm": 0.26645550894398845, + "learning_rate": 6.536842797875876e-07, + "loss": 0.0864, + "step": 1222 + }, + { + "epoch": 1.7912852435005493, + "grad_norm": 0.2719628682812989, + "learning_rate": 6.446097464092249e-07, + "loss": 0.0822, + "step": 1223 + }, + { + "epoch": 1.7927499084584402, + "grad_norm": 0.25379643552394104, + "learning_rate": 6.355965429357513e-07, + "loss": 0.0737, + "step": 1224 + }, + { + "epoch": 1.794214573416331, + "grad_norm": 0.22752210074212567, + "learning_rate": 6.266447284538446e-07, + "loss": 0.0541, + "step": 1225 + }, + { + "epoch": 1.795679238374222, + "grad_norm": 0.2665704509120899, + "learning_rate": 6.177543616477377e-07, + "loss": 0.0759, + "step": 1226 + }, + { + "epoch": 1.7971439033321128, + "grad_norm": 0.2500751657498049, + "learning_rate": 6.08925500798847e-07, + "loss": 0.0631, + "step": 1227 + }, + { + "epoch": 1.7986085682900037, + "grad_norm": 0.259144080680451, + "learning_rate": 6.001582037853726e-07, + "loss": 0.0777, + "step": 1228 + }, + { + "epoch": 1.8000732332478946, + "grad_norm": 0.2897130385154202, + "learning_rate": 5.914525280819383e-07, + "loss": 0.0802, + "step": 1229 + }, + { + "epoch": 1.8015378982057855, + "grad_norm": 0.2553061949427675, + "learning_rate": 5.828085307591969e-07, + "loss": 0.0692, + "step": 1230 + }, + { + "epoch": 1.8030025631636764, + "grad_norm": 0.2572539562396507, + "learning_rate": 5.742262684834698e-07, + "loss": 0.0833, + "step": 1231 + }, + { + "epoch": 1.8044672281215672, + "grad_norm": 0.24772988825308792, + "learning_rate": 5.657057975163682e-07, + "loss": 0.0673, + "step": 1232 + }, + { + "epoch": 1.8059318930794581, + "grad_norm": 0.2717711132164481, + "learning_rate": 5.572471737144247e-07, + "loss": 0.0778, + "step": 1233 + }, + { + "epoch": 1.807396558037349, + "grad_norm": 0.2539596096042241, + "learning_rate": 5.488504525287319e-07, + "loss": 0.0727, + "step": 1234 + }, + { + "epoch": 1.8088612229952399, + "grad_norm": 0.2345694037301944, + "learning_rate": 5.405156890045704e-07, + "loss": 0.0616, + "step": 1235 + }, + { + "epoch": 1.8103258879531308, + "grad_norm": 0.2609506722100195, + "learning_rate": 5.322429377810612e-07, + "loss": 0.0853, + "step": 1236 + }, + { + "epoch": 1.8117905529110216, + "grad_norm": 0.25093904412233514, + "learning_rate": 5.240322530907893e-07, + "loss": 0.0823, + "step": 1237 + }, + { + "epoch": 1.8132552178689125, + "grad_norm": 0.2544084467508812, + "learning_rate": 5.158836887594687e-07, + "loss": 0.063, + "step": 1238 + }, + { + "epoch": 1.8147198828268034, + "grad_norm": 0.26650992359632614, + "learning_rate": 5.07797298205569e-07, + "loss": 0.0789, + "step": 1239 + }, + { + "epoch": 1.8161845477846943, + "grad_norm": 0.23630880168422222, + "learning_rate": 4.997731344399837e-07, + "loss": 0.0606, + "step": 1240 + }, + { + "epoch": 1.8176492127425852, + "grad_norm": 0.24553703591038675, + "learning_rate": 4.91811250065668e-07, + "loss": 0.0628, + "step": 1241 + }, + { + "epoch": 1.819113877700476, + "grad_norm": 0.2265227129455893, + "learning_rate": 4.839116972773061e-07, + "loss": 0.0583, + "step": 1242 + }, + { + "epoch": 1.820578542658367, + "grad_norm": 0.25312671344061183, + "learning_rate": 4.7607452786095686e-07, + "loss": 0.0608, + "step": 1243 + }, + { + "epoch": 1.8220432076162578, + "grad_norm": 0.23959774438699266, + "learning_rate": 4.682997931937283e-07, + "loss": 0.0628, + "step": 1244 + }, + { + "epoch": 1.8235078725741487, + "grad_norm": 0.31949707497166124, + "learning_rate": 4.605875442434238e-07, + "loss": 0.0964, + "step": 1245 + }, + { + "epoch": 1.8249725375320396, + "grad_norm": 0.2500895954628304, + "learning_rate": 4.5293783156822533e-07, + "loss": 0.0679, + "step": 1246 + }, + { + "epoch": 1.8264372024899305, + "grad_norm": 0.24693595115617872, + "learning_rate": 4.4535070531635195e-07, + "loss": 0.0722, + "step": 1247 + }, + { + "epoch": 1.8279018674478213, + "grad_norm": 0.25545938072704727, + "learning_rate": 4.378262152257273e-07, + "loss": 0.0683, + "step": 1248 + }, + { + "epoch": 1.8293665324057122, + "grad_norm": 0.25876832202455974, + "learning_rate": 4.303644106236704e-07, + "loss": 0.0662, + "step": 1249 + }, + { + "epoch": 1.8308311973636031, + "grad_norm": 0.26720989490367125, + "learning_rate": 4.2296534042654993e-07, + "loss": 0.0682, + "step": 1250 + }, + { + "epoch": 1.832295862321494, + "grad_norm": 0.26258372219745035, + "learning_rate": 4.1562905313948354e-07, + "loss": 0.0666, + "step": 1251 + }, + { + "epoch": 1.8337605272793849, + "grad_norm": 0.26711183739660005, + "learning_rate": 4.083555968560049e-07, + "loss": 0.0801, + "step": 1252 + }, + { + "epoch": 1.8352251922372758, + "grad_norm": 0.25571699526490943, + "learning_rate": 4.0114501925775927e-07, + "loss": 0.0765, + "step": 1253 + }, + { + "epoch": 1.8366898571951666, + "grad_norm": 0.25954869259656904, + "learning_rate": 3.9399736761418395e-07, + "loss": 0.0876, + "step": 1254 + }, + { + "epoch": 1.8381545221530575, + "grad_norm": 0.25678722587829267, + "learning_rate": 3.8691268878220165e-07, + "loss": 0.069, + "step": 1255 + }, + { + "epoch": 1.8396191871109484, + "grad_norm": 0.2589134232675844, + "learning_rate": 3.7989102920591103e-07, + "loss": 0.069, + "step": 1256 + }, + { + "epoch": 1.8410838520688393, + "grad_norm": 0.2450077649829702, + "learning_rate": 3.729324349162866e-07, + "loss": 0.0684, + "step": 1257 + }, + { + "epoch": 1.8425485170267302, + "grad_norm": 0.2853806694841532, + "learning_rate": 3.660369515308715e-07, + "loss": 0.0634, + "step": 1258 + }, + { + "epoch": 1.844013181984621, + "grad_norm": 0.2621846237984564, + "learning_rate": 3.592046242534819e-07, + "loss": 0.0714, + "step": 1259 + }, + { + "epoch": 1.845477846942512, + "grad_norm": 0.23779005991008018, + "learning_rate": 3.524354978739075e-07, + "loss": 0.0583, + "step": 1260 + }, + { + "epoch": 1.8469425119004028, + "grad_norm": 0.25039681136747866, + "learning_rate": 3.4572961676762715e-07, + "loss": 0.0738, + "step": 1261 + }, + { + "epoch": 1.8484071768582937, + "grad_norm": 0.26098923170285054, + "learning_rate": 3.390870248955025e-07, + "loss": 0.0807, + "step": 1262 + }, + { + "epoch": 1.8498718418161846, + "grad_norm": 0.26327776179903706, + "learning_rate": 3.3250776580350143e-07, + "loss": 0.0866, + "step": 1263 + }, + { + "epoch": 1.8513365067740755, + "grad_norm": 0.23667264720127792, + "learning_rate": 3.259918826224118e-07, + "loss": 0.0649, + "step": 1264 + }, + { + "epoch": 1.8528011717319663, + "grad_norm": 0.23761172772297415, + "learning_rate": 3.1953941806755265e-07, + "loss": 0.0845, + "step": 1265 + }, + { + "epoch": 1.8542658366898572, + "grad_norm": 0.24024069607921544, + "learning_rate": 3.131504144385023e-07, + "loss": 0.0703, + "step": 1266 + }, + { + "epoch": 1.855730501647748, + "grad_norm": 0.2420868970268621, + "learning_rate": 3.0682491361881064e-07, + "loss": 0.0748, + "step": 1267 + }, + { + "epoch": 1.857195166605639, + "grad_norm": 0.2577949839616261, + "learning_rate": 3.0056295707573736e-07, + "loss": 0.0665, + "step": 1268 + }, + { + "epoch": 1.8586598315635299, + "grad_norm": 0.2685695682554958, + "learning_rate": 2.943645858599653e-07, + "loss": 0.0792, + "step": 1269 + }, + { + "epoch": 1.8601244965214208, + "grad_norm": 0.25460210036235115, + "learning_rate": 2.8822984060534854e-07, + "loss": 0.0738, + "step": 1270 + }, + { + "epoch": 1.8615891614793116, + "grad_norm": 0.23461840979365672, + "learning_rate": 2.82158761528627e-07, + "loss": 0.0754, + "step": 1271 + }, + { + "epoch": 1.8630538264372025, + "grad_norm": 0.22557458232043867, + "learning_rate": 2.761513884291822e-07, + "loss": 0.0612, + "step": 1272 + }, + { + "epoch": 1.8645184913950934, + "grad_norm": 0.2470496535205524, + "learning_rate": 2.7020776068875876e-07, + "loss": 0.0711, + "step": 1273 + }, + { + "epoch": 1.8659831563529843, + "grad_norm": 0.2329397076753076, + "learning_rate": 2.6432791727121984e-07, + "loss": 0.0704, + "step": 1274 + }, + { + "epoch": 1.8674478213108752, + "grad_norm": 0.25477498790129094, + "learning_rate": 2.5851189672228103e-07, + "loss": 0.0773, + "step": 1275 + }, + { + "epoch": 1.868912486268766, + "grad_norm": 0.23446595512868088, + "learning_rate": 2.5275973716926804e-07, + "loss": 0.0709, + "step": 1276 + }, + { + "epoch": 1.870377151226657, + "grad_norm": 0.2605236921574388, + "learning_rate": 2.4707147632085815e-07, + "loss": 0.0758, + "step": 1277 + }, + { + "epoch": 1.8718418161845478, + "grad_norm": 0.24531947955587904, + "learning_rate": 2.414471514668348e-07, + "loss": 0.0778, + "step": 1278 + }, + { + "epoch": 1.8733064811424387, + "grad_norm": 0.2625806435047024, + "learning_rate": 2.358867994778502e-07, + "loss": 0.0742, + "step": 1279 + }, + { + "epoch": 1.8747711461003296, + "grad_norm": 0.26172740430959623, + "learning_rate": 2.3039045680517292e-07, + "loss": 0.071, + "step": 1280 + }, + { + "epoch": 1.8762358110582205, + "grad_norm": 0.25812348656180795, + "learning_rate": 2.249581594804562e-07, + "loss": 0.0702, + "step": 1281 + }, + { + "epoch": 1.8777004760161113, + "grad_norm": 0.22412143150895958, + "learning_rate": 2.1958994311549797e-07, + "loss": 0.0633, + "step": 1282 + }, + { + "epoch": 1.8791651409740022, + "grad_norm": 0.25437645779594675, + "learning_rate": 2.1428584290201116e-07, + "loss": 0.0697, + "step": 1283 + }, + { + "epoch": 1.880629805931893, + "grad_norm": 0.2728173599791171, + "learning_rate": 2.0904589361138927e-07, + "loss": 0.0813, + "step": 1284 + }, + { + "epoch": 1.882094470889784, + "grad_norm": 0.24620604884353783, + "learning_rate": 2.0387012959448227e-07, + "loss": 0.0812, + "step": 1285 + }, + { + "epoch": 1.8835591358476749, + "grad_norm": 0.2246638728164293, + "learning_rate": 1.9875858478136557e-07, + "loss": 0.0591, + "step": 1286 + }, + { + "epoch": 1.8850238008055658, + "grad_norm": 0.22425127272955642, + "learning_rate": 1.9371129268112466e-07, + "loss": 0.0615, + "step": 1287 + }, + { + "epoch": 1.8864884657634566, + "grad_norm": 0.47369574027243294, + "learning_rate": 1.8872828638162866e-07, + "loss": 0.0678, + "step": 1288 + }, + { + "epoch": 1.8879531307213475, + "grad_norm": 0.24744976308618485, + "learning_rate": 1.8380959854932045e-07, + "loss": 0.0745, + "step": 1289 + }, + { + "epoch": 1.8894177956792384, + "grad_norm": 0.2619097810479823, + "learning_rate": 1.7895526142899466e-07, + "loss": 0.0705, + "step": 1290 + }, + { + "epoch": 1.8908824606371293, + "grad_norm": 0.23871395237352025, + "learning_rate": 1.7416530684359444e-07, + "loss": 0.0551, + "step": 1291 + }, + { + "epoch": 1.8923471255950202, + "grad_norm": 0.24628533422104648, + "learning_rate": 1.6943976619399615e-07, + "loss": 0.0677, + "step": 1292 + }, + { + "epoch": 1.893811790552911, + "grad_norm": 0.23856368422966648, + "learning_rate": 1.6477867045880613e-07, + "loss": 0.0815, + "step": 1293 + }, + { + "epoch": 1.895276455510802, + "grad_norm": 0.2555025186995139, + "learning_rate": 1.6018205019415866e-07, + "loss": 0.0708, + "step": 1294 + }, + { + "epoch": 1.8967411204686928, + "grad_norm": 0.2528265155563436, + "learning_rate": 1.5564993553351394e-07, + "loss": 0.0738, + "step": 1295 + }, + { + "epoch": 1.8982057854265837, + "grad_norm": 0.26162054123344747, + "learning_rate": 1.511823561874637e-07, + "loss": 0.07, + "step": 1296 + }, + { + "epoch": 1.8996704503844746, + "grad_norm": 0.24902092413650095, + "learning_rate": 1.4677934144352923e-07, + "loss": 0.0668, + "step": 1297 + }, + { + "epoch": 1.9011351153423655, + "grad_norm": 0.2635425216052296, + "learning_rate": 1.4244092016597933e-07, + "loss": 0.0797, + "step": 1298 + }, + { + "epoch": 1.9025997803002563, + "grad_norm": 0.24273285605827147, + "learning_rate": 1.3816712079563034e-07, + "loss": 0.0738, + "step": 1299 + }, + { + "epoch": 1.9040644452581472, + "grad_norm": 0.23589111930612008, + "learning_rate": 1.3395797134967192e-07, + "loss": 0.0633, + "step": 1300 + }, + { + "epoch": 1.905529110216038, + "grad_norm": 0.2886312181672797, + "learning_rate": 1.2981349942146947e-07, + "loss": 0.0801, + "step": 1301 + }, + { + "epoch": 1.906993775173929, + "grad_norm": 0.2504691073271145, + "learning_rate": 1.257337321803964e-07, + "loss": 0.0719, + "step": 1302 + }, + { + "epoch": 1.9084584401318199, + "grad_norm": 0.2455439126340956, + "learning_rate": 1.2171869637164769e-07, + "loss": 0.0694, + "step": 1303 + }, + { + "epoch": 1.9099231050897107, + "grad_norm": 0.2443358735504617, + "learning_rate": 1.1776841831606544e-07, + "loss": 0.0629, + "step": 1304 + }, + { + "epoch": 1.9113877700476016, + "grad_norm": 0.2375928891798295, + "learning_rate": 1.1388292390997035e-07, + "loss": 0.0636, + "step": 1305 + }, + { + "epoch": 1.9128524350054925, + "grad_norm": 0.2430807124576779, + "learning_rate": 1.1006223862498944e-07, + "loss": 0.0654, + "step": 1306 + }, + { + "epoch": 1.9143170999633834, + "grad_norm": 0.2582360186234949, + "learning_rate": 1.0630638750788625e-07, + "loss": 0.0772, + "step": 1307 + }, + { + "epoch": 1.9157817649212743, + "grad_norm": 0.22936545545360856, + "learning_rate": 1.026153951804032e-07, + "loss": 0.0555, + "step": 1308 + }, + { + "epoch": 1.9172464298791652, + "grad_norm": 0.24362803135706435, + "learning_rate": 9.898928583909284e-08, + "loss": 0.0643, + "step": 1309 + }, + { + "epoch": 1.918711094837056, + "grad_norm": 0.23451079860288676, + "learning_rate": 9.542808325516573e-08, + "loss": 0.0562, + "step": 1310 + }, + { + "epoch": 1.920175759794947, + "grad_norm": 0.23945075253871337, + "learning_rate": 9.193181077433055e-08, + "loss": 0.07, + "step": 1311 + }, + { + "epoch": 1.9216404247528378, + "grad_norm": 0.2586122862421169, + "learning_rate": 8.850049131664206e-08, + "loss": 0.0793, + "step": 1312 + }, + { + "epoch": 1.9231050897107287, + "grad_norm": 0.25189426121665187, + "learning_rate": 8.513414737635006e-08, + "loss": 0.0665, + "step": 1313 + }, + { + "epoch": 1.9245697546686196, + "grad_norm": 0.2331012828544825, + "learning_rate": 8.183280102175617e-08, + "loss": 0.0626, + "step": 1314 + }, + { + "epoch": 1.9260344196265105, + "grad_norm": 0.24925536553110555, + "learning_rate": 7.859647389506176e-08, + "loss": 0.0688, + "step": 1315 + }, + { + "epoch": 1.9274990845844013, + "grad_norm": 0.25467958607825186, + "learning_rate": 7.542518721223469e-08, + "loss": 0.0894, + "step": 1316 + }, + { + "epoch": 1.9289637495422922, + "grad_norm": 0.25285918104772465, + "learning_rate": 7.231896176285946e-08, + "loss": 0.0773, + "step": 1317 + }, + { + "epoch": 1.930428414500183, + "grad_norm": 0.23450711664911186, + "learning_rate": 6.927781791001398e-08, + "loss": 0.0639, + "step": 1318 + }, + { + "epoch": 1.931893079458074, + "grad_norm": 0.23097872990832516, + "learning_rate": 6.630177559012518e-08, + "loss": 0.068, + "step": 1319 + }, + { + "epoch": 1.9333577444159649, + "grad_norm": 0.2379637592228772, + "learning_rate": 6.339085431284253e-08, + "loss": 0.0632, + "step": 1320 + }, + { + "epoch": 1.9348224093738557, + "grad_norm": 0.24051632358031436, + "learning_rate": 6.054507316091141e-08, + "loss": 0.0677, + "step": 1321 + }, + { + "epoch": 1.9362870743317466, + "grad_norm": 0.28404424390350286, + "learning_rate": 5.7764450790046554e-08, + "loss": 0.0783, + "step": 1322 + }, + { + "epoch": 1.9377517392896375, + "grad_norm": 0.24006965033366337, + "learning_rate": 5.5049005428808865e-08, + "loss": 0.0696, + "step": 1323 + }, + { + "epoch": 1.9392164042475284, + "grad_norm": 0.2496157646264579, + "learning_rate": 5.239875487848878e-08, + "loss": 0.0635, + "step": 1324 + }, + { + "epoch": 1.9406810692054193, + "grad_norm": 0.26403747456776216, + "learning_rate": 4.981371651298306e-08, + "loss": 0.0652, + "step": 1325 + }, + { + "epoch": 1.9421457341633102, + "grad_norm": 0.25920069106169014, + "learning_rate": 4.729390727869154e-08, + "loss": 0.0799, + "step": 1326 + }, + { + "epoch": 1.943610399121201, + "grad_norm": 0.26170514428715264, + "learning_rate": 4.483934369439613e-08, + "loss": 0.0755, + "step": 1327 + }, + { + "epoch": 1.945075064079092, + "grad_norm": 0.24542483964564413, + "learning_rate": 4.245004185115753e-08, + "loss": 0.0615, + "step": 1328 + }, + { + "epoch": 1.9465397290369828, + "grad_norm": 0.24837578672571922, + "learning_rate": 4.0126017412207565e-08, + "loss": 0.071, + "step": 1329 + }, + { + "epoch": 1.9480043939948737, + "grad_norm": 0.23824898115769536, + "learning_rate": 3.786728561285036e-08, + "loss": 0.0594, + "step": 1330 + }, + { + "epoch": 1.9494690589527646, + "grad_norm": 0.25242932208373725, + "learning_rate": 3.567386126035577e-08, + "loss": 0.0807, + "step": 1331 + }, + { + "epoch": 1.9509337239106554, + "grad_norm": 0.26364658800502216, + "learning_rate": 3.354575873386945e-08, + "loss": 0.0701, + "step": 1332 + }, + { + "epoch": 1.9523983888685463, + "grad_norm": 0.22064989501595925, + "learning_rate": 3.1482991984312926e-08, + "loss": 0.0601, + "step": 1333 + }, + { + "epoch": 1.9538630538264372, + "grad_norm": 0.25926143810431157, + "learning_rate": 2.948557453429701e-08, + "loss": 0.0671, + "step": 1334 + }, + { + "epoch": 1.955327718784328, + "grad_norm": 0.23624107516760934, + "learning_rate": 2.7553519478028535e-08, + "loss": 0.0665, + "step": 1335 + }, + { + "epoch": 1.956792383742219, + "grad_norm": 0.2705488604754587, + "learning_rate": 2.5686839481227077e-08, + "loss": 0.0773, + "step": 1336 + }, + { + "epoch": 1.9582570487001099, + "grad_norm": 0.2473961524857793, + "learning_rate": 2.3885546781042824e-08, + "loss": 0.073, + "step": 1337 + }, + { + "epoch": 1.9597217136580007, + "grad_norm": 0.2441090901112598, + "learning_rate": 2.2149653185973285e-08, + "loss": 0.0769, + "step": 1338 + }, + { + "epoch": 1.9611863786158916, + "grad_norm": 0.27046514605231436, + "learning_rate": 2.0479170075788924e-08, + "loss": 0.0716, + "step": 1339 + }, + { + "epoch": 1.9626510435737825, + "grad_norm": 0.22983632777466814, + "learning_rate": 1.8874108401456538e-08, + "loss": 0.0575, + "step": 1340 + }, + { + "epoch": 1.9641157085316734, + "grad_norm": 0.24070147293086863, + "learning_rate": 1.7334478685068212e-08, + "loss": 0.1044, + "step": 1341 + }, + { + "epoch": 1.9655803734895643, + "grad_norm": 0.24856888752891837, + "learning_rate": 1.586029101977249e-08, + "loss": 0.0585, + "step": 1342 + }, + { + "epoch": 1.9670450384474552, + "grad_norm": 0.25551924386712754, + "learning_rate": 1.4451555069708856e-08, + "loss": 0.0823, + "step": 1343 + }, + { + "epoch": 1.968509703405346, + "grad_norm": 0.24403209467826953, + "learning_rate": 1.3108280069941137e-08, + "loss": 0.0723, + "step": 1344 + }, + { + "epoch": 1.969974368363237, + "grad_norm": 0.25587914011988994, + "learning_rate": 1.1830474826404204e-08, + "loss": 0.0775, + "step": 1345 + }, + { + "epoch": 1.9714390333211278, + "grad_norm": 0.2481616027718323, + "learning_rate": 1.0618147715835137e-08, + "loss": 0.073, + "step": 1346 + }, + { + "epoch": 1.9729036982790187, + "grad_norm": 0.2503297758869869, + "learning_rate": 9.471306685728821e-09, + "loss": 0.0841, + "step": 1347 + }, + { + "epoch": 1.9743683632369096, + "grad_norm": 0.25325687071902725, + "learning_rate": 8.389959254281322e-09, + "loss": 0.0674, + "step": 1348 + }, + { + "epoch": 1.9758330281948004, + "grad_norm": 0.2413650911828191, + "learning_rate": 7.3741125103399254e-09, + "loss": 0.0737, + "step": 1349 + }, + { + "epoch": 1.9772976931526913, + "grad_norm": 0.23526516349129203, + "learning_rate": 6.423773113357623e-09, + "loss": 0.0631, + "step": 1350 + }, + { + "epoch": 1.9787623581105822, + "grad_norm": 0.25457045216657825, + "learning_rate": 5.538947293349806e-09, + "loss": 0.0765, + "step": 1351 + }, + { + "epoch": 1.980227023068473, + "grad_norm": 0.23325185099514448, + "learning_rate": 4.719640850852081e-09, + "loss": 0.0612, + "step": 1352 + }, + { + "epoch": 1.981691688026364, + "grad_norm": 0.26527379982321414, + "learning_rate": 3.965859156885854e-09, + "loss": 0.0775, + "step": 1353 + }, + { + "epoch": 1.9831563529842549, + "grad_norm": 0.24201253323675176, + "learning_rate": 3.2776071529183608e-09, + "loss": 0.0685, + "step": 1354 + }, + { + "epoch": 1.9846210179421457, + "grad_norm": 0.23684069181793793, + "learning_rate": 2.65488935083158e-09, + "loss": 0.0686, + "step": 1355 + }, + { + "epoch": 1.9860856829000366, + "grad_norm": 0.25231448957519526, + "learning_rate": 2.0977098328978098e-09, + "loss": 0.0726, + "step": 1356 + }, + { + "epoch": 1.9875503478579275, + "grad_norm": 0.2774834947544314, + "learning_rate": 1.6060722517430293e-09, + "loss": 0.0787, + "step": 1357 + }, + { + "epoch": 1.9890150128158184, + "grad_norm": 0.2503245321475594, + "learning_rate": 1.1799798303335775e-09, + "loss": 0.0683, + "step": 1358 + }, + { + "epoch": 1.9904796777737093, + "grad_norm": 0.2423085553528186, + "learning_rate": 8.194353619450646e-10, + "loss": 0.0731, + "step": 1359 + }, + { + "epoch": 1.9919443427316001, + "grad_norm": 0.24083594462983027, + "learning_rate": 5.244412101534924e-10, + "loss": 0.066, + "step": 1360 + }, + { + "epoch": 1.993409007689491, + "grad_norm": 0.23578471416137511, + "learning_rate": 2.949993088130487e-10, + "loss": 0.0585, + "step": 1361 + }, + { + "epoch": 1.994873672647382, + "grad_norm": 0.24433039086610767, + "learning_rate": 1.3111116204500562e-10, + "loss": 0.0602, + "step": 1362 + }, + { + "epoch": 1.9963383376052728, + "grad_norm": 0.2719863457397181, + "learning_rate": 3.277784423105779e-11, + "loss": 0.0863, + "step": 1363 + }, + { + "epoch": 1.9978030025631637, + "grad_norm": 0.261652070533884, + "learning_rate": 0.0, + "loss": 0.0707, + "step": 1364 + }, + { + "epoch": 1.9978030025631637, + "step": 1364, + "total_flos": 612552856354816.0, + "train_loss": 0.16333763612996885, + "train_runtime": 7219.9243, + "train_samples_per_second": 24.207, + "train_steps_per_second": 0.189 + } + ], + "logging_steps": 1, + "max_steps": 1364, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 612552856354816.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}