diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25852485716501644, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010340994286600657, + "grad_norm": 5.403522491455078, + "learning_rate": 2e-05, + "loss": 2.3818, + "step": 1 + }, + { + "epoch": 0.00020681988573201314, + "grad_norm": 8.3225679397583, + "learning_rate": 4e-05, + "loss": 2.5235, + "step": 2 + }, + { + "epoch": 0.0003102298285980197, + "grad_norm": 3.45357608795166, + "learning_rate": 6e-05, + "loss": 2.1258, + "step": 3 + }, + { + "epoch": 0.0004136397714640263, + "grad_norm": 10.966958045959473, + "learning_rate": 8e-05, + "loss": 3.0596, + "step": 4 + }, + { + "epoch": 0.0005170497143300328, + "grad_norm": 1.818668246269226, + "learning_rate": 0.0001, + "loss": 2.5631, + "step": 5 + }, + { + "epoch": 0.0006204596571960394, + "grad_norm": 1.0356301069259644, + "learning_rate": 0.00012, + "loss": 1.683, + "step": 6 + }, + { + "epoch": 0.000723869600062046, + "grad_norm": 1.3918240070343018, + "learning_rate": 0.00014, + "loss": 2.5425, + "step": 7 + }, + { + "epoch": 0.0008272795429280526, + "grad_norm": 3.50264835357666, + "learning_rate": 0.00016, + "loss": 2.705, + "step": 8 + }, + { + "epoch": 0.0009306894857940591, + "grad_norm": 2.2157135009765625, + "learning_rate": 0.00018, + "loss": 2.5137, + "step": 9 + }, + { + "epoch": 0.0010340994286600657, + "grad_norm": 1.1943459510803223, + "learning_rate": 0.0002, + "loss": 2.2908, + "step": 10 + }, + { + "epoch": 0.0011375093715260721, + "grad_norm": 0.9670085906982422, + "learning_rate": 0.00019998965338851528, + "loss": 1.8234, + "step": 11 + }, + { + "epoch": 0.0012409193143920788, + "grad_norm": 0.866587221622467, + "learning_rate": 0.00019997930677703054, + "loss": 1.9754, + "step": 12 + }, + { + "epoch": 0.0013443292572580853, + "grad_norm": 0.941832423210144, + "learning_rate": 0.00019996896016554578, + "loss": 1.8104, + "step": 13 + }, + { + "epoch": 0.001447739200124092, + "grad_norm": 1.8255549669265747, + "learning_rate": 0.00019995861355406104, + "loss": 1.9344, + "step": 14 + }, + { + "epoch": 0.0015511491429900984, + "grad_norm": 2.044438600540161, + "learning_rate": 0.0001999482669425763, + "loss": 1.8897, + "step": 15 + }, + { + "epoch": 0.001654559085856105, + "grad_norm": 1.520342469215393, + "learning_rate": 0.00019993792033109158, + "loss": 1.8936, + "step": 16 + }, + { + "epoch": 0.0017579690287221116, + "grad_norm": 1.0684723854064941, + "learning_rate": 0.00019992757371960684, + "loss": 2.1489, + "step": 17 + }, + { + "epoch": 0.0018613789715881182, + "grad_norm": 0.8357504606246948, + "learning_rate": 0.0001999172271081221, + "loss": 1.9924, + "step": 18 + }, + { + "epoch": 0.0019647889144541247, + "grad_norm": 0.7619360089302063, + "learning_rate": 0.00019990688049663735, + "loss": 2.19, + "step": 19 + }, + { + "epoch": 0.0020681988573201314, + "grad_norm": 0.9019061923027039, + "learning_rate": 0.0001998965338851526, + "loss": 2.0112, + "step": 20 + }, + { + "epoch": 0.002171608800186138, + "grad_norm": 0.8690137267112732, + "learning_rate": 0.00019988618727366788, + "loss": 2.0031, + "step": 21 + }, + { + "epoch": 0.0022750187430521443, + "grad_norm": 1.88344407081604, + "learning_rate": 0.00019987584066218314, + "loss": 2.6006, + "step": 22 + }, + { + "epoch": 0.002378428685918151, + "grad_norm": 0.9320490956306458, + "learning_rate": 0.0001998654940506984, + "loss": 1.8907, + "step": 23 + }, + { + "epoch": 0.0024818386287841577, + "grad_norm": 1.128214955329895, + "learning_rate": 0.00019985514743921367, + "loss": 1.6415, + "step": 24 + }, + { + "epoch": 0.0025852485716501643, + "grad_norm": 1.0375274419784546, + "learning_rate": 0.0001998448008277289, + "loss": 1.9145, + "step": 25 + }, + { + "epoch": 0.0026886585145161706, + "grad_norm": 4.119718551635742, + "learning_rate": 0.00019983445421624418, + "loss": 1.6368, + "step": 26 + }, + { + "epoch": 0.0027920684573821773, + "grad_norm": 0.958248496055603, + "learning_rate": 0.00019982410760475944, + "loss": 1.632, + "step": 27 + }, + { + "epoch": 0.002895478400248184, + "grad_norm": 2.7749078273773193, + "learning_rate": 0.0001998137609932747, + "loss": 2.4571, + "step": 28 + }, + { + "epoch": 0.0029988883431141906, + "grad_norm": 1.2027294635772705, + "learning_rate": 0.00019980341438178997, + "loss": 1.8688, + "step": 29 + }, + { + "epoch": 0.003102298285980197, + "grad_norm": 1.0721759796142578, + "learning_rate": 0.00019979306777030524, + "loss": 1.9768, + "step": 30 + }, + { + "epoch": 0.0032057082288462035, + "grad_norm": 1.3779335021972656, + "learning_rate": 0.00019978272115882048, + "loss": 1.7229, + "step": 31 + }, + { + "epoch": 0.00330911817171221, + "grad_norm": 1.5010607242584229, + "learning_rate": 0.00019977237454733574, + "loss": 1.9824, + "step": 32 + }, + { + "epoch": 0.003412528114578217, + "grad_norm": 0.9950757026672363, + "learning_rate": 0.000199762027935851, + "loss": 1.1941, + "step": 33 + }, + { + "epoch": 0.003515938057444223, + "grad_norm": 0.9842276573181152, + "learning_rate": 0.00019975168132436627, + "loss": 1.6324, + "step": 34 + }, + { + "epoch": 0.00361934800031023, + "grad_norm": 1.6210508346557617, + "learning_rate": 0.00019974133471288154, + "loss": 2.2909, + "step": 35 + }, + { + "epoch": 0.0037227579431762365, + "grad_norm": 1.318981647491455, + "learning_rate": 0.0001997309881013968, + "loss": 2.4224, + "step": 36 + }, + { + "epoch": 0.003826167886042243, + "grad_norm": 0.8251323103904724, + "learning_rate": 0.00019972064148991204, + "loss": 2.1714, + "step": 37 + }, + { + "epoch": 0.003929577828908249, + "grad_norm": 0.9951919317245483, + "learning_rate": 0.0001997102948784273, + "loss": 1.819, + "step": 38 + }, + { + "epoch": 0.0040329877717742565, + "grad_norm": 0.8593564629554749, + "learning_rate": 0.00019969994826694258, + "loss": 1.8302, + "step": 39 + }, + { + "epoch": 0.004136397714640263, + "grad_norm": 1.3132660388946533, + "learning_rate": 0.00019968960165545784, + "loss": 2.0428, + "step": 40 + }, + { + "epoch": 0.004239807657506269, + "grad_norm": 0.806970477104187, + "learning_rate": 0.0001996792550439731, + "loss": 2.0075, + "step": 41 + }, + { + "epoch": 0.004343217600372276, + "grad_norm": 0.8618819713592529, + "learning_rate": 0.00019966890843248837, + "loss": 1.3391, + "step": 42 + }, + { + "epoch": 0.004446627543238282, + "grad_norm": 0.966143786907196, + "learning_rate": 0.0001996585618210036, + "loss": 1.6868, + "step": 43 + }, + { + "epoch": 0.004550037486104289, + "grad_norm": 1.2480261325836182, + "learning_rate": 0.00019964821520951888, + "loss": 2.3074, + "step": 44 + }, + { + "epoch": 0.004653447428970296, + "grad_norm": 0.9969954490661621, + "learning_rate": 0.00019963786859803414, + "loss": 2.0236, + "step": 45 + }, + { + "epoch": 0.004756857371836302, + "grad_norm": 1.7677679061889648, + "learning_rate": 0.0001996275219865494, + "loss": 2.1244, + "step": 46 + }, + { + "epoch": 0.004860267314702308, + "grad_norm": 0.9171494841575623, + "learning_rate": 0.00019961717537506467, + "loss": 1.6648, + "step": 47 + }, + { + "epoch": 0.004963677257568315, + "grad_norm": 1.048912763595581, + "learning_rate": 0.00019960682876357994, + "loss": 2.1855, + "step": 48 + }, + { + "epoch": 0.0050670872004343216, + "grad_norm": 1.4205477237701416, + "learning_rate": 0.00019959648215209518, + "loss": 2.0607, + "step": 49 + }, + { + "epoch": 0.005170497143300329, + "grad_norm": 1.180935025215149, + "learning_rate": 0.00019958613554061044, + "loss": 2.405, + "step": 50 + }, + { + "epoch": 0.005273907086166335, + "grad_norm": 2.3712823390960693, + "learning_rate": 0.0001995757889291257, + "loss": 1.7894, + "step": 51 + }, + { + "epoch": 0.005377317029032341, + "grad_norm": 0.8639737367630005, + "learning_rate": 0.00019956544231764097, + "loss": 1.488, + "step": 52 + }, + { + "epoch": 0.005480726971898348, + "grad_norm": 1.143019199371338, + "learning_rate": 0.00019955509570615624, + "loss": 2.0396, + "step": 53 + }, + { + "epoch": 0.0055841369147643545, + "grad_norm": 0.9509019255638123, + "learning_rate": 0.0001995447490946715, + "loss": 2.1223, + "step": 54 + }, + { + "epoch": 0.005687546857630361, + "grad_norm": 1.8030970096588135, + "learning_rate": 0.00019953440248318674, + "loss": 2.2948, + "step": 55 + }, + { + "epoch": 0.005790956800496368, + "grad_norm": 1.2299774885177612, + "learning_rate": 0.000199524055871702, + "loss": 2.2414, + "step": 56 + }, + { + "epoch": 0.005894366743362374, + "grad_norm": 0.9413872957229614, + "learning_rate": 0.00019951370926021727, + "loss": 2.2899, + "step": 57 + }, + { + "epoch": 0.005997776686228381, + "grad_norm": 1.6003267765045166, + "learning_rate": 0.00019950336264873254, + "loss": 1.8357, + "step": 58 + }, + { + "epoch": 0.0061011866290943875, + "grad_norm": 1.1218072175979614, + "learning_rate": 0.0001994930160372478, + "loss": 1.8433, + "step": 59 + }, + { + "epoch": 0.006204596571960394, + "grad_norm": 0.9314822554588318, + "learning_rate": 0.00019948266942576307, + "loss": 2.0523, + "step": 60 + }, + { + "epoch": 0.006308006514826401, + "grad_norm": 1.321779489517212, + "learning_rate": 0.0001994723228142783, + "loss": 1.6102, + "step": 61 + }, + { + "epoch": 0.006411416457692407, + "grad_norm": 0.9747542142868042, + "learning_rate": 0.00019946197620279358, + "loss": 2.0, + "step": 62 + }, + { + "epoch": 0.006514826400558413, + "grad_norm": 0.7083417177200317, + "learning_rate": 0.00019945162959130884, + "loss": 2.023, + "step": 63 + }, + { + "epoch": 0.00661823634342442, + "grad_norm": 0.7545279860496521, + "learning_rate": 0.0001994412829798241, + "loss": 1.8848, + "step": 64 + }, + { + "epoch": 0.006721646286290427, + "grad_norm": 1.7224853038787842, + "learning_rate": 0.00019943093636833937, + "loss": 2.3156, + "step": 65 + }, + { + "epoch": 0.006825056229156434, + "grad_norm": 1.3995798826217651, + "learning_rate": 0.00019942058975685464, + "loss": 2.1194, + "step": 66 + }, + { + "epoch": 0.00692846617202244, + "grad_norm": 0.8144084811210632, + "learning_rate": 0.0001994102431453699, + "loss": 1.8109, + "step": 67 + }, + { + "epoch": 0.007031876114888446, + "grad_norm": 1.196885585784912, + "learning_rate": 0.00019939989653388517, + "loss": 2.022, + "step": 68 + }, + { + "epoch": 0.007135286057754453, + "grad_norm": 0.7056652903556824, + "learning_rate": 0.0001993895499224004, + "loss": 2.086, + "step": 69 + }, + { + "epoch": 0.00723869600062046, + "grad_norm": 1.2433514595031738, + "learning_rate": 0.00019937920331091567, + "loss": 2.0888, + "step": 70 + }, + { + "epoch": 0.007342105943486466, + "grad_norm": 0.6289485692977905, + "learning_rate": 0.00019936885669943094, + "loss": 2.3105, + "step": 71 + }, + { + "epoch": 0.007445515886352473, + "grad_norm": 0.8752896189689636, + "learning_rate": 0.0001993585100879462, + "loss": 1.7598, + "step": 72 + }, + { + "epoch": 0.007548925829218479, + "grad_norm": 0.6081548929214478, + "learning_rate": 0.00019934816347646147, + "loss": 1.8297, + "step": 73 + }, + { + "epoch": 0.007652335772084486, + "grad_norm": 0.6561211347579956, + "learning_rate": 0.00019933781686497674, + "loss": 2.0692, + "step": 74 + }, + { + "epoch": 0.0077557457149504926, + "grad_norm": 0.7748151421546936, + "learning_rate": 0.000199327470253492, + "loss": 1.6976, + "step": 75 + }, + { + "epoch": 0.007859155657816499, + "grad_norm": 0.9392269849777222, + "learning_rate": 0.00019931712364200727, + "loss": 1.657, + "step": 76 + }, + { + "epoch": 0.007962565600682505, + "grad_norm": 1.0892373323440552, + "learning_rate": 0.0001993067770305225, + "loss": 2.0677, + "step": 77 + }, + { + "epoch": 0.008065975543548513, + "grad_norm": 1.633618950843811, + "learning_rate": 0.00019929643041903777, + "loss": 2.1268, + "step": 78 + }, + { + "epoch": 0.00816938548641452, + "grad_norm": 1.1079933643341064, + "learning_rate": 0.00019928608380755304, + "loss": 2.4634, + "step": 79 + }, + { + "epoch": 0.008272795429280526, + "grad_norm": 0.7377402782440186, + "learning_rate": 0.0001992757371960683, + "loss": 2.0904, + "step": 80 + }, + { + "epoch": 0.008376205372146532, + "grad_norm": 1.0623070001602173, + "learning_rate": 0.00019926539058458357, + "loss": 2.0502, + "step": 81 + }, + { + "epoch": 0.008479615315012538, + "grad_norm": 1.7848681211471558, + "learning_rate": 0.00019925504397309883, + "loss": 2.0149, + "step": 82 + }, + { + "epoch": 0.008583025257878544, + "grad_norm": 1.0376352071762085, + "learning_rate": 0.0001992446973616141, + "loss": 1.8225, + "step": 83 + }, + { + "epoch": 0.008686435200744552, + "grad_norm": 1.0023243427276611, + "learning_rate": 0.00019923435075012936, + "loss": 2.3898, + "step": 84 + }, + { + "epoch": 0.008789845143610558, + "grad_norm": 0.7046536803245544, + "learning_rate": 0.0001992240041386446, + "loss": 2.2428, + "step": 85 + }, + { + "epoch": 0.008893255086476565, + "grad_norm": 0.605064332485199, + "learning_rate": 0.00019921365752715987, + "loss": 2.0025, + "step": 86 + }, + { + "epoch": 0.008996665029342571, + "grad_norm": 0.8229315876960754, + "learning_rate": 0.00019920331091567513, + "loss": 1.5395, + "step": 87 + }, + { + "epoch": 0.009100074972208577, + "grad_norm": 0.5166494250297546, + "learning_rate": 0.0001991929643041904, + "loss": 2.0568, + "step": 88 + }, + { + "epoch": 0.009203484915074585, + "grad_norm": 1.8681402206420898, + "learning_rate": 0.00019918261769270566, + "loss": 2.483, + "step": 89 + }, + { + "epoch": 0.009306894857940591, + "grad_norm": 0.937924861907959, + "learning_rate": 0.00019917227108122093, + "loss": 2.0965, + "step": 90 + }, + { + "epoch": 0.009410304800806598, + "grad_norm": 1.0499122142791748, + "learning_rate": 0.00019916192446973617, + "loss": 1.8826, + "step": 91 + }, + { + "epoch": 0.009513714743672604, + "grad_norm": 0.8156949877738953, + "learning_rate": 0.00019915157785825143, + "loss": 2.261, + "step": 92 + }, + { + "epoch": 0.00961712468653861, + "grad_norm": 1.9210288524627686, + "learning_rate": 0.0001991412312467667, + "loss": 2.4876, + "step": 93 + }, + { + "epoch": 0.009720534629404616, + "grad_norm": 1.3630799055099487, + "learning_rate": 0.00019913088463528197, + "loss": 1.7244, + "step": 94 + }, + { + "epoch": 0.009823944572270624, + "grad_norm": 1.2075848579406738, + "learning_rate": 0.00019912053802379723, + "loss": 1.4987, + "step": 95 + }, + { + "epoch": 0.00992735451513663, + "grad_norm": 1.182338833808899, + "learning_rate": 0.0001991101914123125, + "loss": 2.0565, + "step": 96 + }, + { + "epoch": 0.010030764458002637, + "grad_norm": 1.4979480504989624, + "learning_rate": 0.00019909984480082773, + "loss": 2.0842, + "step": 97 + }, + { + "epoch": 0.010134174400868643, + "grad_norm": 0.6272477507591248, + "learning_rate": 0.000199089498189343, + "loss": 2.2594, + "step": 98 + }, + { + "epoch": 0.01023758434373465, + "grad_norm": 1.4160089492797852, + "learning_rate": 0.00019907915157785827, + "loss": 2.2122, + "step": 99 + }, + { + "epoch": 0.010340994286600657, + "grad_norm": 0.9657629132270813, + "learning_rate": 0.00019906880496637353, + "loss": 1.6975, + "step": 100 + }, + { + "epoch": 0.010444404229466664, + "grad_norm": 0.8677767515182495, + "learning_rate": 0.0001990584583548888, + "loss": 2.3514, + "step": 101 + }, + { + "epoch": 0.01054781417233267, + "grad_norm": 1.6429847478866577, + "learning_rate": 0.00019904811174340406, + "loss": 1.798, + "step": 102 + }, + { + "epoch": 0.010651224115198676, + "grad_norm": 0.6432476043701172, + "learning_rate": 0.0001990377651319193, + "loss": 1.7637, + "step": 103 + }, + { + "epoch": 0.010754634058064682, + "grad_norm": 0.9115468263626099, + "learning_rate": 0.00019902741852043457, + "loss": 1.8993, + "step": 104 + }, + { + "epoch": 0.01085804400093069, + "grad_norm": 1.1197313070297241, + "learning_rate": 0.00019901707190894983, + "loss": 1.5482, + "step": 105 + }, + { + "epoch": 0.010961453943796697, + "grad_norm": 0.8248776197433472, + "learning_rate": 0.0001990067252974651, + "loss": 2.2015, + "step": 106 + }, + { + "epoch": 0.011064863886662703, + "grad_norm": 0.7735565304756165, + "learning_rate": 0.00019899637868598036, + "loss": 1.4849, + "step": 107 + }, + { + "epoch": 0.011168273829528709, + "grad_norm": 1.956572413444519, + "learning_rate": 0.00019898603207449563, + "loss": 2.1429, + "step": 108 + }, + { + "epoch": 0.011271683772394715, + "grad_norm": 0.8068015575408936, + "learning_rate": 0.00019897568546301087, + "loss": 1.4472, + "step": 109 + }, + { + "epoch": 0.011375093715260721, + "grad_norm": 1.3157227039337158, + "learning_rate": 0.00019896533885152613, + "loss": 1.7154, + "step": 110 + }, + { + "epoch": 0.01147850365812673, + "grad_norm": 0.8677852153778076, + "learning_rate": 0.0001989549922400414, + "loss": 1.5613, + "step": 111 + }, + { + "epoch": 0.011581913600992736, + "grad_norm": 1.2042537927627563, + "learning_rate": 0.00019894464562855666, + "loss": 1.9679, + "step": 112 + }, + { + "epoch": 0.011685323543858742, + "grad_norm": 0.7814430594444275, + "learning_rate": 0.00019893429901707193, + "loss": 1.6496, + "step": 113 + }, + { + "epoch": 0.011788733486724748, + "grad_norm": 0.8422583341598511, + "learning_rate": 0.0001989239524055872, + "loss": 1.672, + "step": 114 + }, + { + "epoch": 0.011892143429590754, + "grad_norm": 0.829519510269165, + "learning_rate": 0.00019891360579410243, + "loss": 1.9504, + "step": 115 + }, + { + "epoch": 0.011995553372456762, + "grad_norm": 0.7655951976776123, + "learning_rate": 0.0001989032591826177, + "loss": 1.779, + "step": 116 + }, + { + "epoch": 0.012098963315322769, + "grad_norm": 1.0722001791000366, + "learning_rate": 0.00019889291257113296, + "loss": 1.948, + "step": 117 + }, + { + "epoch": 0.012202373258188775, + "grad_norm": 0.9230548143386841, + "learning_rate": 0.00019888256595964823, + "loss": 1.0467, + "step": 118 + }, + { + "epoch": 0.012305783201054781, + "grad_norm": 1.7951265573501587, + "learning_rate": 0.0001988722193481635, + "loss": 1.9929, + "step": 119 + }, + { + "epoch": 0.012409193143920787, + "grad_norm": 1.0556249618530273, + "learning_rate": 0.00019886187273667876, + "loss": 1.8817, + "step": 120 + }, + { + "epoch": 0.012512603086786795, + "grad_norm": 0.8999210000038147, + "learning_rate": 0.000198851526125194, + "loss": 2.1797, + "step": 121 + }, + { + "epoch": 0.012616013029652802, + "grad_norm": 0.6396268606185913, + "learning_rate": 0.00019884117951370927, + "loss": 1.8188, + "step": 122 + }, + { + "epoch": 0.012719422972518808, + "grad_norm": 1.6357016563415527, + "learning_rate": 0.00019883083290222453, + "loss": 2.4962, + "step": 123 + }, + { + "epoch": 0.012822832915384814, + "grad_norm": 1.6592292785644531, + "learning_rate": 0.0001988204862907398, + "loss": 1.768, + "step": 124 + }, + { + "epoch": 0.01292624285825082, + "grad_norm": 1.0383470058441162, + "learning_rate": 0.00019881013967925506, + "loss": 1.5453, + "step": 125 + }, + { + "epoch": 0.013029652801116827, + "grad_norm": 0.6433761119842529, + "learning_rate": 0.00019879979306777033, + "loss": 1.7572, + "step": 126 + }, + { + "epoch": 0.013133062743982835, + "grad_norm": 0.8064449429512024, + "learning_rate": 0.00019878944645628557, + "loss": 1.6774, + "step": 127 + }, + { + "epoch": 0.01323647268684884, + "grad_norm": 1.010185956954956, + "learning_rate": 0.00019877909984480083, + "loss": 1.9185, + "step": 128 + }, + { + "epoch": 0.013339882629714847, + "grad_norm": 0.8461198806762695, + "learning_rate": 0.0001987687532333161, + "loss": 1.7703, + "step": 129 + }, + { + "epoch": 0.013443292572580853, + "grad_norm": 1.4564467668533325, + "learning_rate": 0.00019875840662183136, + "loss": 2.3164, + "step": 130 + }, + { + "epoch": 0.01354670251544686, + "grad_norm": 0.7336029410362244, + "learning_rate": 0.00019874806001034663, + "loss": 1.8283, + "step": 131 + }, + { + "epoch": 0.013650112458312868, + "grad_norm": 2.1804862022399902, + "learning_rate": 0.0001987377133988619, + "loss": 1.9357, + "step": 132 + }, + { + "epoch": 0.013753522401178874, + "grad_norm": 1.1863375902175903, + "learning_rate": 0.00019872736678737713, + "loss": 1.8582, + "step": 133 + }, + { + "epoch": 0.01385693234404488, + "grad_norm": 1.062116265296936, + "learning_rate": 0.0001987170201758924, + "loss": 2.0296, + "step": 134 + }, + { + "epoch": 0.013960342286910886, + "grad_norm": 1.7113271951675415, + "learning_rate": 0.00019870667356440766, + "loss": 1.8241, + "step": 135 + }, + { + "epoch": 0.014063752229776893, + "grad_norm": 1.9041301012039185, + "learning_rate": 0.00019869632695292293, + "loss": 2.0608, + "step": 136 + }, + { + "epoch": 0.0141671621726429, + "grad_norm": 1.0523148775100708, + "learning_rate": 0.0001986859803414382, + "loss": 1.9497, + "step": 137 + }, + { + "epoch": 0.014270572115508907, + "grad_norm": 0.8391201496124268, + "learning_rate": 0.00019867563372995346, + "loss": 1.5612, + "step": 138 + }, + { + "epoch": 0.014373982058374913, + "grad_norm": 1.2530022859573364, + "learning_rate": 0.0001986652871184687, + "loss": 1.9835, + "step": 139 + }, + { + "epoch": 0.01447739200124092, + "grad_norm": 0.7818213701248169, + "learning_rate": 0.00019865494050698396, + "loss": 1.7641, + "step": 140 + }, + { + "epoch": 0.014580801944106925, + "grad_norm": 1.1031138896942139, + "learning_rate": 0.00019864459389549923, + "loss": 2.028, + "step": 141 + }, + { + "epoch": 0.014684211886972932, + "grad_norm": 0.6917052268981934, + "learning_rate": 0.0001986342472840145, + "loss": 1.7597, + "step": 142 + }, + { + "epoch": 0.01478762182983894, + "grad_norm": 1.2647300958633423, + "learning_rate": 0.00019862390067252976, + "loss": 2.0946, + "step": 143 + }, + { + "epoch": 0.014891031772704946, + "grad_norm": 0.819813072681427, + "learning_rate": 0.00019861355406104503, + "loss": 1.786, + "step": 144 + }, + { + "epoch": 0.014994441715570952, + "grad_norm": 2.5735700130462646, + "learning_rate": 0.00019860320744956027, + "loss": 1.8916, + "step": 145 + }, + { + "epoch": 0.015097851658436958, + "grad_norm": 1.1350979804992676, + "learning_rate": 0.00019859286083807553, + "loss": 2.223, + "step": 146 + }, + { + "epoch": 0.015201261601302965, + "grad_norm": 0.9886088967323303, + "learning_rate": 0.0001985825142265908, + "loss": 1.3458, + "step": 147 + }, + { + "epoch": 0.015304671544168973, + "grad_norm": 0.7377306222915649, + "learning_rate": 0.00019857216761510606, + "loss": 1.5796, + "step": 148 + }, + { + "epoch": 0.015408081487034979, + "grad_norm": 1.4361933469772339, + "learning_rate": 0.00019856182100362133, + "loss": 2.2587, + "step": 149 + }, + { + "epoch": 0.015511491429900985, + "grad_norm": 1.8182250261306763, + "learning_rate": 0.0001985514743921366, + "loss": 2.4222, + "step": 150 + }, + { + "epoch": 0.015614901372766991, + "grad_norm": 1.7325825691223145, + "learning_rate": 0.00019854112778065183, + "loss": 1.5374, + "step": 151 + }, + { + "epoch": 0.015718311315632998, + "grad_norm": 0.7336366176605225, + "learning_rate": 0.0001985307811691671, + "loss": 1.9757, + "step": 152 + }, + { + "epoch": 0.015821721258499004, + "grad_norm": 1.2688993215560913, + "learning_rate": 0.00019852043455768236, + "loss": 2.3533, + "step": 153 + }, + { + "epoch": 0.01592513120136501, + "grad_norm": 1.637938141822815, + "learning_rate": 0.00019851008794619763, + "loss": 2.4375, + "step": 154 + }, + { + "epoch": 0.016028541144231016, + "grad_norm": 0.6742689609527588, + "learning_rate": 0.0001984997413347129, + "loss": 2.0748, + "step": 155 + }, + { + "epoch": 0.016131951087097026, + "grad_norm": 1.2148876190185547, + "learning_rate": 0.00019848939472322816, + "loss": 1.8236, + "step": 156 + }, + { + "epoch": 0.016235361029963032, + "grad_norm": 2.3418567180633545, + "learning_rate": 0.0001984790481117434, + "loss": 2.413, + "step": 157 + }, + { + "epoch": 0.01633877097282904, + "grad_norm": 0.7256979942321777, + "learning_rate": 0.00019846870150025866, + "loss": 1.6911, + "step": 158 + }, + { + "epoch": 0.016442180915695045, + "grad_norm": 1.3450243473052979, + "learning_rate": 0.00019845835488877393, + "loss": 1.7432, + "step": 159 + }, + { + "epoch": 0.01654559085856105, + "grad_norm": 1.6073719263076782, + "learning_rate": 0.0001984480082772892, + "loss": 2.2007, + "step": 160 + }, + { + "epoch": 0.016649000801427057, + "grad_norm": 1.153110146522522, + "learning_rate": 0.00019843766166580446, + "loss": 1.715, + "step": 161 + }, + { + "epoch": 0.016752410744293064, + "grad_norm": 0.807807445526123, + "learning_rate": 0.00019842731505431973, + "loss": 1.5903, + "step": 162 + }, + { + "epoch": 0.01685582068715907, + "grad_norm": 1.3365882635116577, + "learning_rate": 0.00019841696844283496, + "loss": 1.978, + "step": 163 + }, + { + "epoch": 0.016959230630025076, + "grad_norm": 0.7147388458251953, + "learning_rate": 0.00019840662183135023, + "loss": 1.6717, + "step": 164 + }, + { + "epoch": 0.017062640572891082, + "grad_norm": 0.6310922503471375, + "learning_rate": 0.0001983962752198655, + "loss": 1.9129, + "step": 165 + }, + { + "epoch": 0.01716605051575709, + "grad_norm": 0.8110767602920532, + "learning_rate": 0.00019838592860838076, + "loss": 1.6107, + "step": 166 + }, + { + "epoch": 0.017269460458623098, + "grad_norm": 0.9394704103469849, + "learning_rate": 0.00019837558199689603, + "loss": 2.2022, + "step": 167 + }, + { + "epoch": 0.017372870401489104, + "grad_norm": 0.6974437236785889, + "learning_rate": 0.0001983652353854113, + "loss": 2.0294, + "step": 168 + }, + { + "epoch": 0.01747628034435511, + "grad_norm": 0.9328587055206299, + "learning_rate": 0.00019835488877392653, + "loss": 1.9777, + "step": 169 + }, + { + "epoch": 0.017579690287221117, + "grad_norm": 1.6323524713516235, + "learning_rate": 0.0001983445421624418, + "loss": 2.2248, + "step": 170 + }, + { + "epoch": 0.017683100230087123, + "grad_norm": 1.306799292564392, + "learning_rate": 0.00019833419555095706, + "loss": 1.5457, + "step": 171 + }, + { + "epoch": 0.01778651017295313, + "grad_norm": 1.1161526441574097, + "learning_rate": 0.00019832384893947233, + "loss": 1.9586, + "step": 172 + }, + { + "epoch": 0.017889920115819136, + "grad_norm": 0.6718397736549377, + "learning_rate": 0.0001983135023279876, + "loss": 1.901, + "step": 173 + }, + { + "epoch": 0.017993330058685142, + "grad_norm": 0.7336934208869934, + "learning_rate": 0.00019830315571650286, + "loss": 1.7648, + "step": 174 + }, + { + "epoch": 0.018096740001551148, + "grad_norm": 0.5712825655937195, + "learning_rate": 0.0001982928091050181, + "loss": 1.8274, + "step": 175 + }, + { + "epoch": 0.018200149944417154, + "grad_norm": 0.7899829149246216, + "learning_rate": 0.00019828246249353336, + "loss": 1.8218, + "step": 176 + }, + { + "epoch": 0.01830355988728316, + "grad_norm": 2.2078018188476562, + "learning_rate": 0.00019827211588204863, + "loss": 2.0636, + "step": 177 + }, + { + "epoch": 0.01840696983014917, + "grad_norm": 1.9544669389724731, + "learning_rate": 0.0001982617692705639, + "loss": 1.9228, + "step": 178 + }, + { + "epoch": 0.018510379773015177, + "grad_norm": 1.212548017501831, + "learning_rate": 0.00019825142265907916, + "loss": 1.407, + "step": 179 + }, + { + "epoch": 0.018613789715881183, + "grad_norm": 1.0474966764450073, + "learning_rate": 0.00019824107604759442, + "loss": 1.8511, + "step": 180 + }, + { + "epoch": 0.01871719965874719, + "grad_norm": 0.9905005097389221, + "learning_rate": 0.00019823072943610966, + "loss": 1.8197, + "step": 181 + }, + { + "epoch": 0.018820609601613195, + "grad_norm": 1.660206913948059, + "learning_rate": 0.00019822038282462493, + "loss": 2.2478, + "step": 182 + }, + { + "epoch": 0.0189240195444792, + "grad_norm": 0.8422425985336304, + "learning_rate": 0.0001982100362131402, + "loss": 1.6032, + "step": 183 + }, + { + "epoch": 0.019027429487345208, + "grad_norm": 0.9838465452194214, + "learning_rate": 0.00019819968960165546, + "loss": 2.0711, + "step": 184 + }, + { + "epoch": 0.019130839430211214, + "grad_norm": 1.9396435022354126, + "learning_rate": 0.00019818934299017073, + "loss": 2.2734, + "step": 185 + }, + { + "epoch": 0.01923424937307722, + "grad_norm": 1.4743622541427612, + "learning_rate": 0.000198178996378686, + "loss": 1.7227, + "step": 186 + }, + { + "epoch": 0.019337659315943227, + "grad_norm": 1.6646116971969604, + "learning_rate": 0.00019816864976720123, + "loss": 2.3045, + "step": 187 + }, + { + "epoch": 0.019441069258809233, + "grad_norm": 0.8518794178962708, + "learning_rate": 0.0001981583031557165, + "loss": 1.9421, + "step": 188 + }, + { + "epoch": 0.019544479201675242, + "grad_norm": 1.869166374206543, + "learning_rate": 0.00019814795654423176, + "loss": 1.6767, + "step": 189 + }, + { + "epoch": 0.01964788914454125, + "grad_norm": 0.9898916482925415, + "learning_rate": 0.00019813760993274703, + "loss": 1.7501, + "step": 190 + }, + { + "epoch": 0.019751299087407255, + "grad_norm": 0.7842496037483215, + "learning_rate": 0.0001981272633212623, + "loss": 2.2322, + "step": 191 + }, + { + "epoch": 0.01985470903027326, + "grad_norm": 1.4266057014465332, + "learning_rate": 0.00019811691670977756, + "loss": 1.9491, + "step": 192 + }, + { + "epoch": 0.019958118973139267, + "grad_norm": 0.8259785771369934, + "learning_rate": 0.00019810657009829282, + "loss": 1.6481, + "step": 193 + }, + { + "epoch": 0.020061528916005274, + "grad_norm": 0.8917863965034485, + "learning_rate": 0.00019809622348680806, + "loss": 1.7006, + "step": 194 + }, + { + "epoch": 0.02016493885887128, + "grad_norm": 1.5615893602371216, + "learning_rate": 0.00019808587687532333, + "loss": 1.8371, + "step": 195 + }, + { + "epoch": 0.020268348801737286, + "grad_norm": 0.999208927154541, + "learning_rate": 0.0001980755302638386, + "loss": 1.9719, + "step": 196 + }, + { + "epoch": 0.020371758744603292, + "grad_norm": 1.0224575996398926, + "learning_rate": 0.00019806518365235386, + "loss": 1.9708, + "step": 197 + }, + { + "epoch": 0.0204751686874693, + "grad_norm": 0.6324958205223083, + "learning_rate": 0.00019805483704086912, + "loss": 2.2463, + "step": 198 + }, + { + "epoch": 0.02057857863033531, + "grad_norm": 1.042873501777649, + "learning_rate": 0.0001980444904293844, + "loss": 1.1634, + "step": 199 + }, + { + "epoch": 0.020681988573201315, + "grad_norm": 1.5669771432876587, + "learning_rate": 0.00019803414381789966, + "loss": 1.8985, + "step": 200 + }, + { + "epoch": 0.02078539851606732, + "grad_norm": 1.6116214990615845, + "learning_rate": 0.00019802379720641492, + "loss": 1.9126, + "step": 201 + }, + { + "epoch": 0.020888808458933327, + "grad_norm": 1.1320672035217285, + "learning_rate": 0.00019801345059493016, + "loss": 1.96, + "step": 202 + }, + { + "epoch": 0.020992218401799333, + "grad_norm": 1.2705937623977661, + "learning_rate": 0.00019800310398344542, + "loss": 1.8843, + "step": 203 + }, + { + "epoch": 0.02109562834466534, + "grad_norm": 1.1148872375488281, + "learning_rate": 0.0001979927573719607, + "loss": 1.5164, + "step": 204 + }, + { + "epoch": 0.021199038287531346, + "grad_norm": 2.2498512268066406, + "learning_rate": 0.00019798241076047596, + "loss": 2.4194, + "step": 205 + }, + { + "epoch": 0.021302448230397352, + "grad_norm": 2.049044609069824, + "learning_rate": 0.00019797206414899122, + "loss": 1.8792, + "step": 206 + }, + { + "epoch": 0.02140585817326336, + "grad_norm": 0.7835369110107422, + "learning_rate": 0.0001979617175375065, + "loss": 1.7925, + "step": 207 + }, + { + "epoch": 0.021509268116129365, + "grad_norm": 0.814444363117218, + "learning_rate": 0.00019795137092602175, + "loss": 2.0929, + "step": 208 + }, + { + "epoch": 0.02161267805899537, + "grad_norm": 0.6900445222854614, + "learning_rate": 0.00019794102431453702, + "loss": 1.6241, + "step": 209 + }, + { + "epoch": 0.02171608800186138, + "grad_norm": 0.9963400959968567, + "learning_rate": 0.00019793067770305226, + "loss": 1.5765, + "step": 210 + }, + { + "epoch": 0.021819497944727387, + "grad_norm": 0.8830106258392334, + "learning_rate": 0.00019792033109156752, + "loss": 2.2472, + "step": 211 + }, + { + "epoch": 0.021922907887593393, + "grad_norm": 1.1107029914855957, + "learning_rate": 0.0001979099844800828, + "loss": 1.9912, + "step": 212 + }, + { + "epoch": 0.0220263178304594, + "grad_norm": 1.394497036933899, + "learning_rate": 0.00019789963786859805, + "loss": 2.049, + "step": 213 + }, + { + "epoch": 0.022129727773325406, + "grad_norm": 1.3322755098342896, + "learning_rate": 0.00019788929125711332, + "loss": 1.4923, + "step": 214 + }, + { + "epoch": 0.022233137716191412, + "grad_norm": 1.0636125802993774, + "learning_rate": 0.00019787894464562858, + "loss": 2.1262, + "step": 215 + }, + { + "epoch": 0.022336547659057418, + "grad_norm": 0.9886292219161987, + "learning_rate": 0.00019786859803414385, + "loss": 1.6698, + "step": 216 + }, + { + "epoch": 0.022439957601923424, + "grad_norm": 0.9913386106491089, + "learning_rate": 0.0001978582514226591, + "loss": 1.9735, + "step": 217 + }, + { + "epoch": 0.02254336754478943, + "grad_norm": 0.4542105197906494, + "learning_rate": 0.00019784790481117435, + "loss": 1.9013, + "step": 218 + }, + { + "epoch": 0.022646777487655437, + "grad_norm": 0.6689864993095398, + "learning_rate": 0.00019783755819968962, + "loss": 1.9576, + "step": 219 + }, + { + "epoch": 0.022750187430521443, + "grad_norm": 0.8507350087165833, + "learning_rate": 0.00019782721158820489, + "loss": 1.8583, + "step": 220 + }, + { + "epoch": 0.022853597373387453, + "grad_norm": 1.187589168548584, + "learning_rate": 0.00019781686497672015, + "loss": 1.6836, + "step": 221 + }, + { + "epoch": 0.02295700731625346, + "grad_norm": 1.2565590143203735, + "learning_rate": 0.00019780651836523542, + "loss": 2.155, + "step": 222 + }, + { + "epoch": 0.023060417259119465, + "grad_norm": 0.8139156699180603, + "learning_rate": 0.00019779617175375065, + "loss": 1.9202, + "step": 223 + }, + { + "epoch": 0.02316382720198547, + "grad_norm": 0.6143559217453003, + "learning_rate": 0.00019778582514226592, + "loss": 2.0392, + "step": 224 + }, + { + "epoch": 0.023267237144851478, + "grad_norm": 0.992732584476471, + "learning_rate": 0.00019777547853078119, + "loss": 1.8389, + "step": 225 + }, + { + "epoch": 0.023370647087717484, + "grad_norm": 0.8133312463760376, + "learning_rate": 0.00019776513191929645, + "loss": 2.2321, + "step": 226 + }, + { + "epoch": 0.02347405703058349, + "grad_norm": 0.5798576474189758, + "learning_rate": 0.00019775478530781172, + "loss": 1.905, + "step": 227 + }, + { + "epoch": 0.023577466973449496, + "grad_norm": 0.9979951977729797, + "learning_rate": 0.00019774443869632698, + "loss": 1.7011, + "step": 228 + }, + { + "epoch": 0.023680876916315503, + "grad_norm": 1.0430858135223389, + "learning_rate": 0.00019773409208484222, + "loss": 1.8414, + "step": 229 + }, + { + "epoch": 0.02378428685918151, + "grad_norm": 0.9151175022125244, + "learning_rate": 0.0001977237454733575, + "loss": 1.711, + "step": 230 + }, + { + "epoch": 0.023887696802047515, + "grad_norm": 1.0341556072235107, + "learning_rate": 0.00019771339886187275, + "loss": 1.5063, + "step": 231 + }, + { + "epoch": 0.023991106744913525, + "grad_norm": 1.1679956912994385, + "learning_rate": 0.00019770305225038802, + "loss": 2.314, + "step": 232 + }, + { + "epoch": 0.02409451668777953, + "grad_norm": 0.9437854886054993, + "learning_rate": 0.00019769270563890328, + "loss": 1.9386, + "step": 233 + }, + { + "epoch": 0.024197926630645537, + "grad_norm": 0.7488498687744141, + "learning_rate": 0.00019768235902741855, + "loss": 2.3507, + "step": 234 + }, + { + "epoch": 0.024301336573511544, + "grad_norm": 1.0003606081008911, + "learning_rate": 0.0001976720124159338, + "loss": 1.7557, + "step": 235 + }, + { + "epoch": 0.02440474651637755, + "grad_norm": 1.38227117061615, + "learning_rate": 0.00019766166580444905, + "loss": 1.7559, + "step": 236 + }, + { + "epoch": 0.024508156459243556, + "grad_norm": 0.732622504234314, + "learning_rate": 0.00019765131919296432, + "loss": 2.0015, + "step": 237 + }, + { + "epoch": 0.024611566402109562, + "grad_norm": 0.7456156015396118, + "learning_rate": 0.00019764097258147958, + "loss": 2.6102, + "step": 238 + }, + { + "epoch": 0.02471497634497557, + "grad_norm": 1.0653510093688965, + "learning_rate": 0.00019763062596999485, + "loss": 1.9302, + "step": 239 + }, + { + "epoch": 0.024818386287841575, + "grad_norm": 1.5663673877716064, + "learning_rate": 0.00019762027935851012, + "loss": 1.9772, + "step": 240 + }, + { + "epoch": 0.02492179623070758, + "grad_norm": 7.048519134521484, + "learning_rate": 0.00019760993274702535, + "loss": 2.0148, + "step": 241 + }, + { + "epoch": 0.02502520617357359, + "grad_norm": 1.1092578172683716, + "learning_rate": 0.00019759958613554062, + "loss": 1.4961, + "step": 242 + }, + { + "epoch": 0.025128616116439597, + "grad_norm": 1.3526990413665771, + "learning_rate": 0.00019758923952405588, + "loss": 2.0189, + "step": 243 + }, + { + "epoch": 0.025232026059305603, + "grad_norm": 0.9486410021781921, + "learning_rate": 0.00019757889291257115, + "loss": 2.1304, + "step": 244 + }, + { + "epoch": 0.02533543600217161, + "grad_norm": 1.422751545906067, + "learning_rate": 0.00019756854630108642, + "loss": 2.103, + "step": 245 + }, + { + "epoch": 0.025438845945037616, + "grad_norm": 1.5062353610992432, + "learning_rate": 0.00019755819968960168, + "loss": 1.7464, + "step": 246 + }, + { + "epoch": 0.025542255887903622, + "grad_norm": 1.2952933311462402, + "learning_rate": 0.00019754785307811692, + "loss": 1.6458, + "step": 247 + }, + { + "epoch": 0.025645665830769628, + "grad_norm": 0.8431225419044495, + "learning_rate": 0.00019753750646663219, + "loss": 1.6088, + "step": 248 + }, + { + "epoch": 0.025749075773635634, + "grad_norm": 0.7888919711112976, + "learning_rate": 0.00019752715985514745, + "loss": 1.9032, + "step": 249 + }, + { + "epoch": 0.02585248571650164, + "grad_norm": 0.90091872215271, + "learning_rate": 0.00019751681324366272, + "loss": 1.3846, + "step": 250 + }, + { + "epoch": 0.025955895659367647, + "grad_norm": 1.8097118139266968, + "learning_rate": 0.00019750646663217798, + "loss": 2.2661, + "step": 251 + }, + { + "epoch": 0.026059305602233653, + "grad_norm": 0.6663314700126648, + "learning_rate": 0.00019749612002069325, + "loss": 1.7202, + "step": 252 + }, + { + "epoch": 0.026162715545099663, + "grad_norm": 5.989753246307373, + "learning_rate": 0.00019748577340920849, + "loss": 1.879, + "step": 253 + }, + { + "epoch": 0.02626612548796567, + "grad_norm": 1.0955287218093872, + "learning_rate": 0.00019747542679772375, + "loss": 1.9042, + "step": 254 + }, + { + "epoch": 0.026369535430831675, + "grad_norm": 1.4397646188735962, + "learning_rate": 0.00019746508018623902, + "loss": 2.2163, + "step": 255 + }, + { + "epoch": 0.02647294537369768, + "grad_norm": 1.1022082567214966, + "learning_rate": 0.00019745473357475428, + "loss": 1.7654, + "step": 256 + }, + { + "epoch": 0.026576355316563688, + "grad_norm": 0.7697210907936096, + "learning_rate": 0.00019744438696326955, + "loss": 1.4869, + "step": 257 + }, + { + "epoch": 0.026679765259429694, + "grad_norm": 1.1588715314865112, + "learning_rate": 0.00019743404035178481, + "loss": 2.1267, + "step": 258 + }, + { + "epoch": 0.0267831752022957, + "grad_norm": 0.8320289254188538, + "learning_rate": 0.00019742369374030005, + "loss": 1.6688, + "step": 259 + }, + { + "epoch": 0.026886585145161707, + "grad_norm": 1.0717252492904663, + "learning_rate": 0.00019741334712881532, + "loss": 2.0394, + "step": 260 + }, + { + "epoch": 0.026989995088027713, + "grad_norm": 0.7327308654785156, + "learning_rate": 0.00019740300051733058, + "loss": 1.8524, + "step": 261 + }, + { + "epoch": 0.02709340503089372, + "grad_norm": 0.9403550624847412, + "learning_rate": 0.00019739265390584585, + "loss": 2.0664, + "step": 262 + }, + { + "epoch": 0.027196814973759725, + "grad_norm": 1.1287577152252197, + "learning_rate": 0.00019738230729436112, + "loss": 1.9978, + "step": 263 + }, + { + "epoch": 0.027300224916625735, + "grad_norm": 3.3969926834106445, + "learning_rate": 0.00019737196068287638, + "loss": 2.55, + "step": 264 + }, + { + "epoch": 0.02740363485949174, + "grad_norm": 1.8311293125152588, + "learning_rate": 0.00019736161407139162, + "loss": 2.335, + "step": 265 + }, + { + "epoch": 0.027507044802357748, + "grad_norm": 0.8647951483726501, + "learning_rate": 0.00019735126745990688, + "loss": 1.8155, + "step": 266 + }, + { + "epoch": 0.027610454745223754, + "grad_norm": 2.12625789642334, + "learning_rate": 0.00019734092084842215, + "loss": 1.7278, + "step": 267 + }, + { + "epoch": 0.02771386468808976, + "grad_norm": 1.0068492889404297, + "learning_rate": 0.00019733057423693742, + "loss": 1.8327, + "step": 268 + }, + { + "epoch": 0.027817274630955766, + "grad_norm": 1.791537880897522, + "learning_rate": 0.00019732022762545268, + "loss": 2.2064, + "step": 269 + }, + { + "epoch": 0.027920684573821773, + "grad_norm": 0.6931219100952148, + "learning_rate": 0.00019730988101396795, + "loss": 1.6676, + "step": 270 + }, + { + "epoch": 0.02802409451668778, + "grad_norm": 1.1089061498641968, + "learning_rate": 0.00019729953440248319, + "loss": 1.6938, + "step": 271 + }, + { + "epoch": 0.028127504459553785, + "grad_norm": 0.8395723104476929, + "learning_rate": 0.00019728918779099845, + "loss": 2.0785, + "step": 272 + }, + { + "epoch": 0.02823091440241979, + "grad_norm": 0.8383250832557678, + "learning_rate": 0.00019727884117951372, + "loss": 1.9188, + "step": 273 + }, + { + "epoch": 0.0283343243452858, + "grad_norm": 1.6984490156173706, + "learning_rate": 0.00019726849456802898, + "loss": 1.8758, + "step": 274 + }, + { + "epoch": 0.028437734288151807, + "grad_norm": 1.8110450506210327, + "learning_rate": 0.00019725814795654425, + "loss": 2.1544, + "step": 275 + }, + { + "epoch": 0.028541144231017813, + "grad_norm": 1.1028351783752441, + "learning_rate": 0.0001972478013450595, + "loss": 1.8011, + "step": 276 + }, + { + "epoch": 0.02864455417388382, + "grad_norm": 1.2571910619735718, + "learning_rate": 0.00019723745473357475, + "loss": 1.754, + "step": 277 + }, + { + "epoch": 0.028747964116749826, + "grad_norm": 0.7787155508995056, + "learning_rate": 0.00019722710812209002, + "loss": 1.847, + "step": 278 + }, + { + "epoch": 0.028851374059615832, + "grad_norm": 1.1525181531906128, + "learning_rate": 0.00019721676151060528, + "loss": 2.0515, + "step": 279 + }, + { + "epoch": 0.02895478400248184, + "grad_norm": 1.8915187120437622, + "learning_rate": 0.00019720641489912055, + "loss": 2.2906, + "step": 280 + }, + { + "epoch": 0.029058193945347845, + "grad_norm": 0.9054998755455017, + "learning_rate": 0.00019719606828763581, + "loss": 1.8102, + "step": 281 + }, + { + "epoch": 0.02916160388821385, + "grad_norm": 1.0367076396942139, + "learning_rate": 0.00019718572167615108, + "loss": 2.0181, + "step": 282 + }, + { + "epoch": 0.029265013831079857, + "grad_norm": 1.2361838817596436, + "learning_rate": 0.00019717537506466632, + "loss": 1.879, + "step": 283 + }, + { + "epoch": 0.029368423773945863, + "grad_norm": 0.7413283586502075, + "learning_rate": 0.00019716502845318158, + "loss": 1.7919, + "step": 284 + }, + { + "epoch": 0.029471833716811873, + "grad_norm": 0.993869960308075, + "learning_rate": 0.00019715468184169685, + "loss": 1.3424, + "step": 285 + }, + { + "epoch": 0.02957524365967788, + "grad_norm": 1.8364124298095703, + "learning_rate": 0.00019714433523021211, + "loss": 1.9326, + "step": 286 + }, + { + "epoch": 0.029678653602543886, + "grad_norm": 1.173508882522583, + "learning_rate": 0.00019713398861872738, + "loss": 2.6364, + "step": 287 + }, + { + "epoch": 0.029782063545409892, + "grad_norm": 0.8887481689453125, + "learning_rate": 0.00019712364200724265, + "loss": 1.5257, + "step": 288 + }, + { + "epoch": 0.029885473488275898, + "grad_norm": 1.2317752838134766, + "learning_rate": 0.00019711329539575788, + "loss": 2.1392, + "step": 289 + }, + { + "epoch": 0.029988883431141904, + "grad_norm": 1.7063419818878174, + "learning_rate": 0.00019710294878427315, + "loss": 2.3708, + "step": 290 + }, + { + "epoch": 0.03009229337400791, + "grad_norm": 0.9618234634399414, + "learning_rate": 0.00019709260217278842, + "loss": 1.2293, + "step": 291 + }, + { + "epoch": 0.030195703316873917, + "grad_norm": 1.4276773929595947, + "learning_rate": 0.00019708225556130368, + "loss": 1.6762, + "step": 292 + }, + { + "epoch": 0.030299113259739923, + "grad_norm": 1.2723143100738525, + "learning_rate": 0.00019707190894981895, + "loss": 2.0648, + "step": 293 + }, + { + "epoch": 0.03040252320260593, + "grad_norm": 3.1224710941314697, + "learning_rate": 0.0001970615623383342, + "loss": 2.1218, + "step": 294 + }, + { + "epoch": 0.030505933145471936, + "grad_norm": 1.720165729522705, + "learning_rate": 0.00019705121572684945, + "loss": 1.4688, + "step": 295 + }, + { + "epoch": 0.030609343088337945, + "grad_norm": 1.38371741771698, + "learning_rate": 0.00019704086911536472, + "loss": 1.7212, + "step": 296 + }, + { + "epoch": 0.03071275303120395, + "grad_norm": 0.995150625705719, + "learning_rate": 0.00019703052250387998, + "loss": 1.8055, + "step": 297 + }, + { + "epoch": 0.030816162974069958, + "grad_norm": 1.070515513420105, + "learning_rate": 0.00019702017589239525, + "loss": 1.5934, + "step": 298 + }, + { + "epoch": 0.030919572916935964, + "grad_norm": 1.0754715204238892, + "learning_rate": 0.0001970098292809105, + "loss": 1.4899, + "step": 299 + }, + { + "epoch": 0.03102298285980197, + "grad_norm": 1.0355829000473022, + "learning_rate": 0.00019699948266942578, + "loss": 2.0269, + "step": 300 + }, + { + "epoch": 0.031126392802667976, + "grad_norm": 1.7430510520935059, + "learning_rate": 0.00019698913605794102, + "loss": 1.6499, + "step": 301 + }, + { + "epoch": 0.031229802745533983, + "grad_norm": 1.3326469659805298, + "learning_rate": 0.00019697878944645628, + "loss": 2.3119, + "step": 302 + }, + { + "epoch": 0.03133321268839999, + "grad_norm": 1.1865754127502441, + "learning_rate": 0.00019696844283497155, + "loss": 1.9132, + "step": 303 + }, + { + "epoch": 0.031436622631265995, + "grad_norm": 0.8858489990234375, + "learning_rate": 0.00019695809622348681, + "loss": 1.7969, + "step": 304 + }, + { + "epoch": 0.031540032574132, + "grad_norm": 1.064029335975647, + "learning_rate": 0.00019694774961200208, + "loss": 2.0591, + "step": 305 + }, + { + "epoch": 0.03164344251699801, + "grad_norm": 0.7958260774612427, + "learning_rate": 0.00019693740300051734, + "loss": 1.7113, + "step": 306 + }, + { + "epoch": 0.031746852459864014, + "grad_norm": 0.9576128125190735, + "learning_rate": 0.00019692705638903258, + "loss": 1.4719, + "step": 307 + }, + { + "epoch": 0.03185026240273002, + "grad_norm": 0.8087986707687378, + "learning_rate": 0.00019691670977754785, + "loss": 2.1864, + "step": 308 + }, + { + "epoch": 0.031953672345596026, + "grad_norm": 1.0684764385223389, + "learning_rate": 0.00019690636316606311, + "loss": 2.1109, + "step": 309 + }, + { + "epoch": 0.03205708228846203, + "grad_norm": 1.1162570714950562, + "learning_rate": 0.00019689601655457838, + "loss": 1.6932, + "step": 310 + }, + { + "epoch": 0.03216049223132804, + "grad_norm": 0.9443684220314026, + "learning_rate": 0.00019688566994309365, + "loss": 1.9788, + "step": 311 + }, + { + "epoch": 0.03226390217419405, + "grad_norm": 0.8843461871147156, + "learning_rate": 0.0001968753233316089, + "loss": 1.5086, + "step": 312 + }, + { + "epoch": 0.03236731211706006, + "grad_norm": 1.1329318284988403, + "learning_rate": 0.00019686497672012415, + "loss": 1.5243, + "step": 313 + }, + { + "epoch": 0.032470722059926065, + "grad_norm": 0.6813123226165771, + "learning_rate": 0.00019685463010863942, + "loss": 2.0232, + "step": 314 + }, + { + "epoch": 0.03257413200279207, + "grad_norm": 1.022520661354065, + "learning_rate": 0.00019684428349715468, + "loss": 1.3452, + "step": 315 + }, + { + "epoch": 0.03267754194565808, + "grad_norm": 1.8104302883148193, + "learning_rate": 0.00019683393688566995, + "loss": 1.9257, + "step": 316 + }, + { + "epoch": 0.03278095188852408, + "grad_norm": 1.073581337928772, + "learning_rate": 0.0001968235902741852, + "loss": 1.5704, + "step": 317 + }, + { + "epoch": 0.03288436183139009, + "grad_norm": 2.10227632522583, + "learning_rate": 0.00019681324366270048, + "loss": 1.4086, + "step": 318 + }, + { + "epoch": 0.032987771774256096, + "grad_norm": 0.888225793838501, + "learning_rate": 0.00019680289705121572, + "loss": 1.8722, + "step": 319 + }, + { + "epoch": 0.0330911817171221, + "grad_norm": 0.8909750580787659, + "learning_rate": 0.00019679255043973098, + "loss": 1.8109, + "step": 320 + }, + { + "epoch": 0.03319459165998811, + "grad_norm": 1.5270005464553833, + "learning_rate": 0.00019678220382824625, + "loss": 2.4184, + "step": 321 + }, + { + "epoch": 0.033298001602854115, + "grad_norm": 1.501200795173645, + "learning_rate": 0.0001967718572167615, + "loss": 2.5458, + "step": 322 + }, + { + "epoch": 0.03340141154572012, + "grad_norm": 2.178788661956787, + "learning_rate": 0.00019676151060527678, + "loss": 1.6343, + "step": 323 + }, + { + "epoch": 0.03350482148858613, + "grad_norm": 2.2387502193450928, + "learning_rate": 0.00019675116399379204, + "loss": 2.2075, + "step": 324 + }, + { + "epoch": 0.03360823143145213, + "grad_norm": 0.9987953901290894, + "learning_rate": 0.0001967408173823073, + "loss": 1.8006, + "step": 325 + }, + { + "epoch": 0.03371164137431814, + "grad_norm": 1.5084731578826904, + "learning_rate": 0.00019673047077082255, + "loss": 1.9028, + "step": 326 + }, + { + "epoch": 0.033815051317184146, + "grad_norm": 0.7339261770248413, + "learning_rate": 0.0001967201241593378, + "loss": 1.8989, + "step": 327 + }, + { + "epoch": 0.03391846126005015, + "grad_norm": 0.7608749866485596, + "learning_rate": 0.00019670977754785308, + "loss": 1.4195, + "step": 328 + }, + { + "epoch": 0.03402187120291616, + "grad_norm": 1.1834888458251953, + "learning_rate": 0.00019669943093636834, + "loss": 1.5853, + "step": 329 + }, + { + "epoch": 0.034125281145782164, + "grad_norm": 0.7872140407562256, + "learning_rate": 0.0001966890843248836, + "loss": 1.6897, + "step": 330 + }, + { + "epoch": 0.03422869108864817, + "grad_norm": 1.3614797592163086, + "learning_rate": 0.00019667873771339888, + "loss": 1.6085, + "step": 331 + }, + { + "epoch": 0.03433210103151418, + "grad_norm": 1.9641176462173462, + "learning_rate": 0.00019666839110191414, + "loss": 2.2778, + "step": 332 + }, + { + "epoch": 0.03443551097438018, + "grad_norm": 1.391445517539978, + "learning_rate": 0.0001966580444904294, + "loss": 2.1415, + "step": 333 + }, + { + "epoch": 0.034538920917246196, + "grad_norm": 1.4932453632354736, + "learning_rate": 0.00019664769787894465, + "loss": 2.2698, + "step": 334 + }, + { + "epoch": 0.0346423308601122, + "grad_norm": 1.4438022375106812, + "learning_rate": 0.0001966373512674599, + "loss": 2.1187, + "step": 335 + }, + { + "epoch": 0.03474574080297821, + "grad_norm": 0.9921761751174927, + "learning_rate": 0.00019662700465597518, + "loss": 1.8119, + "step": 336 + }, + { + "epoch": 0.034849150745844215, + "grad_norm": 1.183098316192627, + "learning_rate": 0.00019661665804449044, + "loss": 1.7168, + "step": 337 + }, + { + "epoch": 0.03495256068871022, + "grad_norm": 0.990986168384552, + "learning_rate": 0.0001966063114330057, + "loss": 1.7015, + "step": 338 + }, + { + "epoch": 0.03505597063157623, + "grad_norm": 0.5842035412788391, + "learning_rate": 0.00019659596482152097, + "loss": 2.0659, + "step": 339 + }, + { + "epoch": 0.035159380574442234, + "grad_norm": 1.477404236793518, + "learning_rate": 0.00019658561821003624, + "loss": 1.9748, + "step": 340 + }, + { + "epoch": 0.03526279051730824, + "grad_norm": 0.8818874359130859, + "learning_rate": 0.0001965752715985515, + "loss": 1.9903, + "step": 341 + }, + { + "epoch": 0.035366200460174246, + "grad_norm": 1.2804322242736816, + "learning_rate": 0.00019656492498706674, + "loss": 1.9245, + "step": 342 + }, + { + "epoch": 0.03546961040304025, + "grad_norm": 1.5887545347213745, + "learning_rate": 0.000196554578375582, + "loss": 2.2518, + "step": 343 + }, + { + "epoch": 0.03557302034590626, + "grad_norm": 0.8990143537521362, + "learning_rate": 0.00019654423176409727, + "loss": 1.4269, + "step": 344 + }, + { + "epoch": 0.035676430288772265, + "grad_norm": 1.9320247173309326, + "learning_rate": 0.00019653388515261254, + "loss": 2.0075, + "step": 345 + }, + { + "epoch": 0.03577984023163827, + "grad_norm": 1.5792721509933472, + "learning_rate": 0.0001965235385411278, + "loss": 2.1097, + "step": 346 + }, + { + "epoch": 0.03588325017450428, + "grad_norm": 3.98584246635437, + "learning_rate": 0.00019651319192964307, + "loss": 1.951, + "step": 347 + }, + { + "epoch": 0.035986660117370284, + "grad_norm": 1.516032099723816, + "learning_rate": 0.0001965028453181583, + "loss": 2.3663, + "step": 348 + }, + { + "epoch": 0.03609007006023629, + "grad_norm": 2.0040533542633057, + "learning_rate": 0.00019649249870667357, + "loss": 1.8742, + "step": 349 + }, + { + "epoch": 0.036193480003102296, + "grad_norm": 1.3640230894088745, + "learning_rate": 0.00019648215209518884, + "loss": 2.1976, + "step": 350 + }, + { + "epoch": 0.0362968899459683, + "grad_norm": 1.2644320726394653, + "learning_rate": 0.0001964718054837041, + "loss": 1.9636, + "step": 351 + }, + { + "epoch": 0.03640029988883431, + "grad_norm": 1.351493239402771, + "learning_rate": 0.00019646145887221937, + "loss": 2.0332, + "step": 352 + }, + { + "epoch": 0.036503709831700315, + "grad_norm": 1.6324365139007568, + "learning_rate": 0.00019645111226073464, + "loss": 2.2028, + "step": 353 + }, + { + "epoch": 0.03660711977456632, + "grad_norm": 0.8964999318122864, + "learning_rate": 0.00019644076564924988, + "loss": 1.6669, + "step": 354 + }, + { + "epoch": 0.036710529717432334, + "grad_norm": 0.6752513647079468, + "learning_rate": 0.00019643041903776514, + "loss": 1.9048, + "step": 355 + }, + { + "epoch": 0.03681393966029834, + "grad_norm": 0.8167389631271362, + "learning_rate": 0.0001964200724262804, + "loss": 1.9142, + "step": 356 + }, + { + "epoch": 0.03691734960316435, + "grad_norm": 0.5024054646492004, + "learning_rate": 0.00019640972581479567, + "loss": 2.1693, + "step": 357 + }, + { + "epoch": 0.03702075954603035, + "grad_norm": 0.8729010224342346, + "learning_rate": 0.00019639937920331094, + "loss": 1.7099, + "step": 358 + }, + { + "epoch": 0.03712416948889636, + "grad_norm": 0.8799291849136353, + "learning_rate": 0.0001963890325918262, + "loss": 1.5546, + "step": 359 + }, + { + "epoch": 0.037227579431762366, + "grad_norm": 1.0670166015625, + "learning_rate": 0.00019637868598034144, + "loss": 1.6354, + "step": 360 + }, + { + "epoch": 0.03733098937462837, + "grad_norm": 0.9233925342559814, + "learning_rate": 0.0001963683393688567, + "loss": 2.1515, + "step": 361 + }, + { + "epoch": 0.03743439931749438, + "grad_norm": 1.1841235160827637, + "learning_rate": 0.00019635799275737197, + "loss": 1.7396, + "step": 362 + }, + { + "epoch": 0.037537809260360384, + "grad_norm": 0.8963435292243958, + "learning_rate": 0.00019634764614588724, + "loss": 2.1037, + "step": 363 + }, + { + "epoch": 0.03764121920322639, + "grad_norm": 1.3393969535827637, + "learning_rate": 0.0001963372995344025, + "loss": 2.0611, + "step": 364 + }, + { + "epoch": 0.0377446291460924, + "grad_norm": 1.2839521169662476, + "learning_rate": 0.00019632695292291777, + "loss": 1.2457, + "step": 365 + }, + { + "epoch": 0.0378480390889584, + "grad_norm": 0.8691089153289795, + "learning_rate": 0.000196316606311433, + "loss": 1.4168, + "step": 366 + }, + { + "epoch": 0.03795144903182441, + "grad_norm": 0.8430613279342651, + "learning_rate": 0.00019630625969994827, + "loss": 2.0188, + "step": 367 + }, + { + "epoch": 0.038054858974690416, + "grad_norm": 1.3562883138656616, + "learning_rate": 0.00019629591308846354, + "loss": 1.773, + "step": 368 + }, + { + "epoch": 0.03815826891755642, + "grad_norm": 1.4203057289123535, + "learning_rate": 0.0001962855664769788, + "loss": 2.0644, + "step": 369 + }, + { + "epoch": 0.03826167886042243, + "grad_norm": 1.5780055522918701, + "learning_rate": 0.00019627521986549407, + "loss": 1.8363, + "step": 370 + }, + { + "epoch": 0.038365088803288434, + "grad_norm": 1.713783621788025, + "learning_rate": 0.00019626487325400934, + "loss": 1.6578, + "step": 371 + }, + { + "epoch": 0.03846849874615444, + "grad_norm": 1.316636323928833, + "learning_rate": 0.00019625452664252457, + "loss": 1.571, + "step": 372 + }, + { + "epoch": 0.03857190868902045, + "grad_norm": 0.8482599854469299, + "learning_rate": 0.00019624418003103984, + "loss": 1.993, + "step": 373 + }, + { + "epoch": 0.03867531863188645, + "grad_norm": 0.9783570766448975, + "learning_rate": 0.0001962338334195551, + "loss": 1.7741, + "step": 374 + }, + { + "epoch": 0.03877872857475246, + "grad_norm": 1.0668089389801025, + "learning_rate": 0.00019622348680807037, + "loss": 2.0415, + "step": 375 + }, + { + "epoch": 0.038882138517618466, + "grad_norm": 0.7089089155197144, + "learning_rate": 0.00019621314019658564, + "loss": 1.9086, + "step": 376 + }, + { + "epoch": 0.03898554846048448, + "grad_norm": 1.2400895357131958, + "learning_rate": 0.0001962027935851009, + "loss": 1.9245, + "step": 377 + }, + { + "epoch": 0.039088958403350485, + "grad_norm": 0.638532280921936, + "learning_rate": 0.00019619244697361614, + "loss": 1.6457, + "step": 378 + }, + { + "epoch": 0.03919236834621649, + "grad_norm": 0.9810022711753845, + "learning_rate": 0.0001961821003621314, + "loss": 1.8889, + "step": 379 + }, + { + "epoch": 0.0392957782890825, + "grad_norm": 1.2179443836212158, + "learning_rate": 0.00019617175375064667, + "loss": 1.9645, + "step": 380 + }, + { + "epoch": 0.039399188231948504, + "grad_norm": 1.3626303672790527, + "learning_rate": 0.00019616140713916194, + "loss": 2.1616, + "step": 381 + }, + { + "epoch": 0.03950259817481451, + "grad_norm": 1.0680832862854004, + "learning_rate": 0.0001961510605276772, + "loss": 1.8425, + "step": 382 + }, + { + "epoch": 0.039606008117680516, + "grad_norm": 1.3585445880889893, + "learning_rate": 0.00019614071391619247, + "loss": 1.659, + "step": 383 + }, + { + "epoch": 0.03970941806054652, + "grad_norm": 0.6698444485664368, + "learning_rate": 0.0001961303673047077, + "loss": 1.8558, + "step": 384 + }, + { + "epoch": 0.03981282800341253, + "grad_norm": 1.7466821670532227, + "learning_rate": 0.00019612002069322297, + "loss": 1.7544, + "step": 385 + }, + { + "epoch": 0.039916237946278535, + "grad_norm": 1.762367844581604, + "learning_rate": 0.00019610967408173824, + "loss": 1.9057, + "step": 386 + }, + { + "epoch": 0.04001964788914454, + "grad_norm": 0.8794074654579163, + "learning_rate": 0.0001960993274702535, + "loss": 1.7722, + "step": 387 + }, + { + "epoch": 0.04012305783201055, + "grad_norm": 0.8490718007087708, + "learning_rate": 0.00019608898085876877, + "loss": 1.8479, + "step": 388 + }, + { + "epoch": 0.040226467774876554, + "grad_norm": 1.8691964149475098, + "learning_rate": 0.00019607863424728403, + "loss": 2.4129, + "step": 389 + }, + { + "epoch": 0.04032987771774256, + "grad_norm": 0.9891068339347839, + "learning_rate": 0.00019606828763579927, + "loss": 2.2297, + "step": 390 + }, + { + "epoch": 0.040433287660608566, + "grad_norm": 0.9857831001281738, + "learning_rate": 0.00019605794102431454, + "loss": 1.8453, + "step": 391 + }, + { + "epoch": 0.04053669760347457, + "grad_norm": 0.8795828223228455, + "learning_rate": 0.0001960475944128298, + "loss": 1.5699, + "step": 392 + }, + { + "epoch": 0.04064010754634058, + "grad_norm": 0.8345451951026917, + "learning_rate": 0.00019603724780134507, + "loss": 1.3905, + "step": 393 + }, + { + "epoch": 0.040743517489206585, + "grad_norm": 1.587632179260254, + "learning_rate": 0.00019602690118986034, + "loss": 1.6993, + "step": 394 + }, + { + "epoch": 0.04084692743207259, + "grad_norm": 0.9896488189697266, + "learning_rate": 0.0001960165545783756, + "loss": 1.9135, + "step": 395 + }, + { + "epoch": 0.0409503373749386, + "grad_norm": 2.5469861030578613, + "learning_rate": 0.00019600620796689084, + "loss": 2.1052, + "step": 396 + }, + { + "epoch": 0.041053747317804604, + "grad_norm": 3.350372314453125, + "learning_rate": 0.0001959958613554061, + "loss": 2.3579, + "step": 397 + }, + { + "epoch": 0.04115715726067062, + "grad_norm": 1.0747287273406982, + "learning_rate": 0.00019598551474392137, + "loss": 2.0841, + "step": 398 + }, + { + "epoch": 0.04126056720353662, + "grad_norm": 0.9288704991340637, + "learning_rate": 0.00019597516813243664, + "loss": 2.2219, + "step": 399 + }, + { + "epoch": 0.04136397714640263, + "grad_norm": 1.3250536918640137, + "learning_rate": 0.0001959648215209519, + "loss": 1.4469, + "step": 400 + }, + { + "epoch": 0.041467387089268636, + "grad_norm": 0.8420027494430542, + "learning_rate": 0.00019595447490946717, + "loss": 1.7364, + "step": 401 + }, + { + "epoch": 0.04157079703213464, + "grad_norm": 1.9357109069824219, + "learning_rate": 0.0001959441282979824, + "loss": 1.9886, + "step": 402 + }, + { + "epoch": 0.04167420697500065, + "grad_norm": 2.0010147094726562, + "learning_rate": 0.00019593378168649767, + "loss": 2.2205, + "step": 403 + }, + { + "epoch": 0.041777616917866654, + "grad_norm": 0.8120896220207214, + "learning_rate": 0.00019592343507501294, + "loss": 2.0961, + "step": 404 + }, + { + "epoch": 0.04188102686073266, + "grad_norm": 0.7679799795150757, + "learning_rate": 0.0001959130884635282, + "loss": 1.9252, + "step": 405 + }, + { + "epoch": 0.04198443680359867, + "grad_norm": 0.8940830826759338, + "learning_rate": 0.00019590274185204347, + "loss": 2.0255, + "step": 406 + }, + { + "epoch": 0.04208784674646467, + "grad_norm": 0.9653421640396118, + "learning_rate": 0.00019589239524055873, + "loss": 1.3163, + "step": 407 + }, + { + "epoch": 0.04219125668933068, + "grad_norm": 1.0259383916854858, + "learning_rate": 0.00019588204862907397, + "loss": 2.0588, + "step": 408 + }, + { + "epoch": 0.042294666632196685, + "grad_norm": 0.9519892930984497, + "learning_rate": 0.00019587170201758924, + "loss": 1.9092, + "step": 409 + }, + { + "epoch": 0.04239807657506269, + "grad_norm": 1.7076958417892456, + "learning_rate": 0.0001958613554061045, + "loss": 1.9263, + "step": 410 + }, + { + "epoch": 0.0425014865179287, + "grad_norm": 1.0932697057724, + "learning_rate": 0.00019585100879461977, + "loss": 1.796, + "step": 411 + }, + { + "epoch": 0.042604896460794704, + "grad_norm": 1.7663601636886597, + "learning_rate": 0.00019584066218313503, + "loss": 2.0681, + "step": 412 + }, + { + "epoch": 0.04270830640366071, + "grad_norm": 1.7543317079544067, + "learning_rate": 0.0001958303155716503, + "loss": 1.7257, + "step": 413 + }, + { + "epoch": 0.04281171634652672, + "grad_norm": 2.176375389099121, + "learning_rate": 0.00019581996896016554, + "loss": 2.515, + "step": 414 + }, + { + "epoch": 0.04291512628939272, + "grad_norm": 1.0832066535949707, + "learning_rate": 0.0001958096223486808, + "loss": 1.9952, + "step": 415 + }, + { + "epoch": 0.04301853623225873, + "grad_norm": 1.238050937652588, + "learning_rate": 0.00019579927573719607, + "loss": 1.8734, + "step": 416 + }, + { + "epoch": 0.043121946175124735, + "grad_norm": 1.0513511896133423, + "learning_rate": 0.00019578892912571134, + "loss": 2.0816, + "step": 417 + }, + { + "epoch": 0.04322535611799074, + "grad_norm": 0.7336549758911133, + "learning_rate": 0.0001957785825142266, + "loss": 1.6495, + "step": 418 + }, + { + "epoch": 0.04332876606085675, + "grad_norm": 1.76078200340271, + "learning_rate": 0.00019576823590274187, + "loss": 2.0322, + "step": 419 + }, + { + "epoch": 0.04343217600372276, + "grad_norm": 1.6764607429504395, + "learning_rate": 0.0001957578892912571, + "loss": 1.7698, + "step": 420 + }, + { + "epoch": 0.04353558594658877, + "grad_norm": 1.1513508558273315, + "learning_rate": 0.00019574754267977237, + "loss": 1.8382, + "step": 421 + }, + { + "epoch": 0.043638995889454774, + "grad_norm": 0.7220249176025391, + "learning_rate": 0.00019573719606828764, + "loss": 1.7605, + "step": 422 + }, + { + "epoch": 0.04374240583232078, + "grad_norm": 1.9611780643463135, + "learning_rate": 0.0001957268494568029, + "loss": 1.3147, + "step": 423 + }, + { + "epoch": 0.043845815775186786, + "grad_norm": 1.3807041645050049, + "learning_rate": 0.00019571650284531817, + "loss": 1.8702, + "step": 424 + }, + { + "epoch": 0.04394922571805279, + "grad_norm": 1.157392144203186, + "learning_rate": 0.00019570615623383343, + "loss": 1.9506, + "step": 425 + }, + { + "epoch": 0.0440526356609188, + "grad_norm": 2.0201995372772217, + "learning_rate": 0.00019569580962234867, + "loss": 2.2078, + "step": 426 + }, + { + "epoch": 0.044156045603784805, + "grad_norm": 1.2479861974716187, + "learning_rate": 0.00019568546301086394, + "loss": 2.5931, + "step": 427 + }, + { + "epoch": 0.04425945554665081, + "grad_norm": 1.2408305406570435, + "learning_rate": 0.0001956751163993792, + "loss": 1.7142, + "step": 428 + }, + { + "epoch": 0.04436286548951682, + "grad_norm": 1.2568728923797607, + "learning_rate": 0.00019566476978789447, + "loss": 1.6175, + "step": 429 + }, + { + "epoch": 0.044466275432382824, + "grad_norm": 0.9081295728683472, + "learning_rate": 0.00019565442317640973, + "loss": 1.6577, + "step": 430 + }, + { + "epoch": 0.04456968537524883, + "grad_norm": 2.198873281478882, + "learning_rate": 0.000195644076564925, + "loss": 1.851, + "step": 431 + }, + { + "epoch": 0.044673095318114836, + "grad_norm": 1.4093550443649292, + "learning_rate": 0.00019563372995344024, + "loss": 2.0104, + "step": 432 + }, + { + "epoch": 0.04477650526098084, + "grad_norm": 1.2647321224212646, + "learning_rate": 0.0001956233833419555, + "loss": 1.9424, + "step": 433 + }, + { + "epoch": 0.04487991520384685, + "grad_norm": 2.1517534255981445, + "learning_rate": 0.00019561303673047077, + "loss": 2.1392, + "step": 434 + }, + { + "epoch": 0.044983325146712855, + "grad_norm": 0.9314191937446594, + "learning_rate": 0.00019560269011898603, + "loss": 2.0857, + "step": 435 + }, + { + "epoch": 0.04508673508957886, + "grad_norm": 0.8166632056236267, + "learning_rate": 0.0001955923435075013, + "loss": 1.7981, + "step": 436 + }, + { + "epoch": 0.04519014503244487, + "grad_norm": 0.8604589104652405, + "learning_rate": 0.00019558199689601657, + "loss": 1.6313, + "step": 437 + }, + { + "epoch": 0.045293554975310873, + "grad_norm": 1.1065396070480347, + "learning_rate": 0.0001955716502845318, + "loss": 1.8895, + "step": 438 + }, + { + "epoch": 0.04539696491817688, + "grad_norm": 0.9663698077201843, + "learning_rate": 0.00019556130367304707, + "loss": 1.6672, + "step": 439 + }, + { + "epoch": 0.045500374861042886, + "grad_norm": 0.92950838804245, + "learning_rate": 0.00019555095706156234, + "loss": 1.8932, + "step": 440 + }, + { + "epoch": 0.0456037848039089, + "grad_norm": 1.708186388015747, + "learning_rate": 0.0001955406104500776, + "loss": 2.0815, + "step": 441 + }, + { + "epoch": 0.045707194746774905, + "grad_norm": 1.041289210319519, + "learning_rate": 0.00019553026383859287, + "loss": 1.8329, + "step": 442 + }, + { + "epoch": 0.04581060468964091, + "grad_norm": 2.087454080581665, + "learning_rate": 0.00019551991722710813, + "loss": 2.5975, + "step": 443 + }, + { + "epoch": 0.04591401463250692, + "grad_norm": 0.8211826682090759, + "learning_rate": 0.00019550957061562337, + "loss": 1.5402, + "step": 444 + }, + { + "epoch": 0.046017424575372924, + "grad_norm": 0.9692025184631348, + "learning_rate": 0.00019549922400413864, + "loss": 1.5424, + "step": 445 + }, + { + "epoch": 0.04612083451823893, + "grad_norm": 1.282361626625061, + "learning_rate": 0.0001954888773926539, + "loss": 1.1477, + "step": 446 + }, + { + "epoch": 0.04622424446110494, + "grad_norm": 1.3725165128707886, + "learning_rate": 0.00019547853078116917, + "loss": 1.7384, + "step": 447 + }, + { + "epoch": 0.04632765440397094, + "grad_norm": 1.2803922891616821, + "learning_rate": 0.00019546818416968443, + "loss": 1.5037, + "step": 448 + }, + { + "epoch": 0.04643106434683695, + "grad_norm": 1.339898943901062, + "learning_rate": 0.0001954578375581997, + "loss": 1.6536, + "step": 449 + }, + { + "epoch": 0.046534474289702955, + "grad_norm": 1.1587685346603394, + "learning_rate": 0.00019544749094671496, + "loss": 1.8648, + "step": 450 + }, + { + "epoch": 0.04663788423256896, + "grad_norm": 1.1297105550765991, + "learning_rate": 0.0001954371443352302, + "loss": 1.6633, + "step": 451 + }, + { + "epoch": 0.04674129417543497, + "grad_norm": 1.044912576675415, + "learning_rate": 0.00019542679772374547, + "loss": 1.6992, + "step": 452 + }, + { + "epoch": 0.046844704118300974, + "grad_norm": 1.1585153341293335, + "learning_rate": 0.00019541645111226073, + "loss": 1.5027, + "step": 453 + }, + { + "epoch": 0.04694811406116698, + "grad_norm": 2.7175047397613525, + "learning_rate": 0.000195406104500776, + "loss": 1.9774, + "step": 454 + }, + { + "epoch": 0.04705152400403299, + "grad_norm": 1.172574758529663, + "learning_rate": 0.00019539575788929126, + "loss": 1.8882, + "step": 455 + }, + { + "epoch": 0.04715493394689899, + "grad_norm": 1.0482293367385864, + "learning_rate": 0.00019538541127780653, + "loss": 1.6194, + "step": 456 + }, + { + "epoch": 0.047258343889765, + "grad_norm": 1.8754322528839111, + "learning_rate": 0.0001953750646663218, + "loss": 2.1531, + "step": 457 + }, + { + "epoch": 0.047361753832631005, + "grad_norm": 1.1492758989334106, + "learning_rate": 0.00019536471805483706, + "loss": 1.7342, + "step": 458 + }, + { + "epoch": 0.04746516377549701, + "grad_norm": 1.121586561203003, + "learning_rate": 0.0001953543714433523, + "loss": 1.3099, + "step": 459 + }, + { + "epoch": 0.04756857371836302, + "grad_norm": 0.9962121844291687, + "learning_rate": 0.00019534402483186757, + "loss": 1.738, + "step": 460 + }, + { + "epoch": 0.047671983661229024, + "grad_norm": 1.8688263893127441, + "learning_rate": 0.00019533367822038283, + "loss": 2.1001, + "step": 461 + }, + { + "epoch": 0.04777539360409503, + "grad_norm": 0.9260441660881042, + "learning_rate": 0.0001953233316088981, + "loss": 1.5776, + "step": 462 + }, + { + "epoch": 0.04787880354696104, + "grad_norm": 0.962735116481781, + "learning_rate": 0.00019531298499741336, + "loss": 1.3518, + "step": 463 + }, + { + "epoch": 0.04798221348982705, + "grad_norm": 3.1183736324310303, + "learning_rate": 0.00019530263838592863, + "loss": 2.299, + "step": 464 + }, + { + "epoch": 0.048085623432693056, + "grad_norm": 0.9161269664764404, + "learning_rate": 0.0001952922917744439, + "loss": 2.2219, + "step": 465 + }, + { + "epoch": 0.04818903337555906, + "grad_norm": 1.8039288520812988, + "learning_rate": 0.00019528194516295916, + "loss": 2.5586, + "step": 466 + }, + { + "epoch": 0.04829244331842507, + "grad_norm": 1.0817803144454956, + "learning_rate": 0.0001952715985514744, + "loss": 1.8248, + "step": 467 + }, + { + "epoch": 0.048395853261291075, + "grad_norm": 1.4744598865509033, + "learning_rate": 0.00019526125193998966, + "loss": 1.8484, + "step": 468 + }, + { + "epoch": 0.04849926320415708, + "grad_norm": 0.6107484698295593, + "learning_rate": 0.00019525090532850493, + "loss": 1.8709, + "step": 469 + }, + { + "epoch": 0.04860267314702309, + "grad_norm": 1.2721505165100098, + "learning_rate": 0.0001952405587170202, + "loss": 1.9078, + "step": 470 + }, + { + "epoch": 0.04870608308988909, + "grad_norm": 1.4992153644561768, + "learning_rate": 0.00019523021210553546, + "loss": 2.1692, + "step": 471 + }, + { + "epoch": 0.0488094930327551, + "grad_norm": 0.6918119788169861, + "learning_rate": 0.00019521986549405073, + "loss": 2.0205, + "step": 472 + }, + { + "epoch": 0.048912902975621106, + "grad_norm": 1.3048973083496094, + "learning_rate": 0.000195209518882566, + "loss": 1.5603, + "step": 473 + }, + { + "epoch": 0.04901631291848711, + "grad_norm": 2.52955961227417, + "learning_rate": 0.00019519917227108123, + "loss": 2.3796, + "step": 474 + }, + { + "epoch": 0.04911972286135312, + "grad_norm": 1.8823479413986206, + "learning_rate": 0.0001951888256595965, + "loss": 2.4266, + "step": 475 + }, + { + "epoch": 0.049223132804219125, + "grad_norm": 1.1469167470932007, + "learning_rate": 0.00019517847904811176, + "loss": 2.0518, + "step": 476 + }, + { + "epoch": 0.04932654274708513, + "grad_norm": 1.1344563961029053, + "learning_rate": 0.00019516813243662703, + "loss": 1.707, + "step": 477 + }, + { + "epoch": 0.04942995268995114, + "grad_norm": 2.04890513420105, + "learning_rate": 0.0001951577858251423, + "loss": 2.219, + "step": 478 + }, + { + "epoch": 0.04953336263281714, + "grad_norm": 1.6981679201126099, + "learning_rate": 0.00019514743921365756, + "loss": 2.0902, + "step": 479 + }, + { + "epoch": 0.04963677257568315, + "grad_norm": 0.7248592972755432, + "learning_rate": 0.0001951370926021728, + "loss": 1.8263, + "step": 480 + }, + { + "epoch": 0.049740182518549156, + "grad_norm": 1.1791532039642334, + "learning_rate": 0.00019512674599068806, + "loss": 1.7427, + "step": 481 + }, + { + "epoch": 0.04984359246141516, + "grad_norm": 1.0736463069915771, + "learning_rate": 0.00019511639937920333, + "loss": 1.7577, + "step": 482 + }, + { + "epoch": 0.04994700240428117, + "grad_norm": 0.7646867632865906, + "learning_rate": 0.0001951060527677186, + "loss": 1.9615, + "step": 483 + }, + { + "epoch": 0.05005041234714718, + "grad_norm": 2.399254083633423, + "learning_rate": 0.00019509570615623386, + "loss": 2.0014, + "step": 484 + }, + { + "epoch": 0.05015382229001319, + "grad_norm": 1.1391059160232544, + "learning_rate": 0.00019508535954474912, + "loss": 1.5579, + "step": 485 + }, + { + "epoch": 0.050257232232879194, + "grad_norm": 1.318179965019226, + "learning_rate": 0.00019507501293326436, + "loss": 1.7957, + "step": 486 + }, + { + "epoch": 0.0503606421757452, + "grad_norm": 3.170398473739624, + "learning_rate": 0.00019506466632177963, + "loss": 1.9489, + "step": 487 + }, + { + "epoch": 0.050464052118611206, + "grad_norm": 2.3561599254608154, + "learning_rate": 0.0001950543197102949, + "loss": 2.0789, + "step": 488 + }, + { + "epoch": 0.05056746206147721, + "grad_norm": 0.9455686807632446, + "learning_rate": 0.00019504397309881016, + "loss": 1.7533, + "step": 489 + }, + { + "epoch": 0.05067087200434322, + "grad_norm": 0.999107837677002, + "learning_rate": 0.00019503362648732542, + "loss": 1.8654, + "step": 490 + }, + { + "epoch": 0.050774281947209225, + "grad_norm": 1.1541388034820557, + "learning_rate": 0.0001950232798758407, + "loss": 1.9729, + "step": 491 + }, + { + "epoch": 0.05087769189007523, + "grad_norm": 1.5254406929016113, + "learning_rate": 0.00019501293326435593, + "loss": 2.4427, + "step": 492 + }, + { + "epoch": 0.05098110183294124, + "grad_norm": 4.286779403686523, + "learning_rate": 0.0001950025866528712, + "loss": 2.4384, + "step": 493 + }, + { + "epoch": 0.051084511775807244, + "grad_norm": 1.545657753944397, + "learning_rate": 0.00019499224004138646, + "loss": 1.6825, + "step": 494 + }, + { + "epoch": 0.05118792171867325, + "grad_norm": 1.4681485891342163, + "learning_rate": 0.00019498189342990172, + "loss": 1.9014, + "step": 495 + }, + { + "epoch": 0.051291331661539256, + "grad_norm": 1.6862431764602661, + "learning_rate": 0.000194971546818417, + "loss": 2.1929, + "step": 496 + }, + { + "epoch": 0.05139474160440526, + "grad_norm": 1.032334804534912, + "learning_rate": 0.00019496120020693226, + "loss": 1.8015, + "step": 497 + }, + { + "epoch": 0.05149815154727127, + "grad_norm": 1.258537769317627, + "learning_rate": 0.0001949508535954475, + "loss": 1.6544, + "step": 498 + }, + { + "epoch": 0.051601561490137275, + "grad_norm": 1.245613694190979, + "learning_rate": 0.00019494050698396276, + "loss": 1.8314, + "step": 499 + }, + { + "epoch": 0.05170497143300328, + "grad_norm": 1.1958906650543213, + "learning_rate": 0.00019493016037247803, + "loss": 2.0709, + "step": 500 + }, + { + "epoch": 0.05180838137586929, + "grad_norm": 1.1290735006332397, + "learning_rate": 0.0001949198137609933, + "loss": 1.6583, + "step": 501 + }, + { + "epoch": 0.051911791318735294, + "grad_norm": 0.966052234172821, + "learning_rate": 0.00019490946714950856, + "loss": 1.5282, + "step": 502 + }, + { + "epoch": 0.0520152012616013, + "grad_norm": 1.0488630533218384, + "learning_rate": 0.00019489912053802382, + "loss": 1.6947, + "step": 503 + }, + { + "epoch": 0.052118611204467306, + "grad_norm": 2.5863540172576904, + "learning_rate": 0.00019488877392653906, + "loss": 1.9402, + "step": 504 + }, + { + "epoch": 0.05222202114733331, + "grad_norm": 1.042441964149475, + "learning_rate": 0.00019487842731505433, + "loss": 1.7202, + "step": 505 + }, + { + "epoch": 0.052325431090199326, + "grad_norm": 1.457277774810791, + "learning_rate": 0.0001948680807035696, + "loss": 2.5209, + "step": 506 + }, + { + "epoch": 0.05242884103306533, + "grad_norm": 1.354345679283142, + "learning_rate": 0.00019485773409208486, + "loss": 2.0281, + "step": 507 + }, + { + "epoch": 0.05253225097593134, + "grad_norm": 1.181408166885376, + "learning_rate": 0.00019484738748060012, + "loss": 2.1902, + "step": 508 + }, + { + "epoch": 0.052635660918797345, + "grad_norm": 1.5439062118530273, + "learning_rate": 0.0001948370408691154, + "loss": 2.0531, + "step": 509 + }, + { + "epoch": 0.05273907086166335, + "grad_norm": 0.7987815141677856, + "learning_rate": 0.00019482669425763063, + "loss": 1.9365, + "step": 510 + }, + { + "epoch": 0.05284248080452936, + "grad_norm": 1.217344880104065, + "learning_rate": 0.0001948163476461459, + "loss": 1.5093, + "step": 511 + }, + { + "epoch": 0.05294589074739536, + "grad_norm": 1.0402098894119263, + "learning_rate": 0.00019480600103466116, + "loss": 1.6761, + "step": 512 + }, + { + "epoch": 0.05304930069026137, + "grad_norm": 0.9624907374382019, + "learning_rate": 0.00019479565442317642, + "loss": 2.379, + "step": 513 + }, + { + "epoch": 0.053152710633127376, + "grad_norm": 1.2226768732070923, + "learning_rate": 0.0001947853078116917, + "loss": 1.7264, + "step": 514 + }, + { + "epoch": 0.05325612057599338, + "grad_norm": 4.221198558807373, + "learning_rate": 0.00019477496120020695, + "loss": 1.6815, + "step": 515 + }, + { + "epoch": 0.05335953051885939, + "grad_norm": 1.2019705772399902, + "learning_rate": 0.0001947646145887222, + "loss": 1.5503, + "step": 516 + }, + { + "epoch": 0.053462940461725394, + "grad_norm": 1.3328227996826172, + "learning_rate": 0.00019475426797723746, + "loss": 1.5755, + "step": 517 + }, + { + "epoch": 0.0535663504045914, + "grad_norm": 1.030809760093689, + "learning_rate": 0.00019474392136575272, + "loss": 1.2846, + "step": 518 + }, + { + "epoch": 0.05366976034745741, + "grad_norm": 0.912787675857544, + "learning_rate": 0.000194733574754268, + "loss": 1.4728, + "step": 519 + }, + { + "epoch": 0.05377317029032341, + "grad_norm": 0.8509954214096069, + "learning_rate": 0.00019472322814278326, + "loss": 2.0389, + "step": 520 + }, + { + "epoch": 0.05387658023318942, + "grad_norm": 2.391146421432495, + "learning_rate": 0.00019471288153129852, + "loss": 1.894, + "step": 521 + }, + { + "epoch": 0.053979990176055426, + "grad_norm": 0.9386624693870544, + "learning_rate": 0.00019470253491981376, + "loss": 1.6739, + "step": 522 + }, + { + "epoch": 0.05408340011892143, + "grad_norm": 1.2840310335159302, + "learning_rate": 0.00019469218830832903, + "loss": 1.6585, + "step": 523 + }, + { + "epoch": 0.05418681006178744, + "grad_norm": 1.0667188167572021, + "learning_rate": 0.0001946818416968443, + "loss": 1.8078, + "step": 524 + }, + { + "epoch": 0.054290220004653444, + "grad_norm": 1.4864556789398193, + "learning_rate": 0.00019467149508535956, + "loss": 1.4433, + "step": 525 + }, + { + "epoch": 0.05439362994751945, + "grad_norm": 2.4991726875305176, + "learning_rate": 0.00019466114847387482, + "loss": 1.8232, + "step": 526 + }, + { + "epoch": 0.054497039890385464, + "grad_norm": 1.7041082382202148, + "learning_rate": 0.0001946508018623901, + "loss": 2.0924, + "step": 527 + }, + { + "epoch": 0.05460044983325147, + "grad_norm": 1.8495500087738037, + "learning_rate": 0.00019464045525090533, + "loss": 1.7738, + "step": 528 + }, + { + "epoch": 0.054703859776117476, + "grad_norm": 1.5319288969039917, + "learning_rate": 0.0001946301086394206, + "loss": 1.6553, + "step": 529 + }, + { + "epoch": 0.05480726971898348, + "grad_norm": 1.266295313835144, + "learning_rate": 0.00019461976202793586, + "loss": 2.0879, + "step": 530 + }, + { + "epoch": 0.05491067966184949, + "grad_norm": 1.0957818031311035, + "learning_rate": 0.00019460941541645112, + "loss": 1.8462, + "step": 531 + }, + { + "epoch": 0.055014089604715495, + "grad_norm": 1.0180859565734863, + "learning_rate": 0.0001945990688049664, + "loss": 1.6908, + "step": 532 + }, + { + "epoch": 0.0551174995475815, + "grad_norm": 3.8691227436065674, + "learning_rate": 0.00019458872219348165, + "loss": 1.7842, + "step": 533 + }, + { + "epoch": 0.05522090949044751, + "grad_norm": 1.1823201179504395, + "learning_rate": 0.0001945783755819969, + "loss": 1.9832, + "step": 534 + }, + { + "epoch": 0.055324319433313514, + "grad_norm": 1.3128019571304321, + "learning_rate": 0.00019456802897051216, + "loss": 2.1566, + "step": 535 + }, + { + "epoch": 0.05542772937617952, + "grad_norm": 1.2830859422683716, + "learning_rate": 0.00019455768235902742, + "loss": 1.817, + "step": 536 + }, + { + "epoch": 0.055531139319045526, + "grad_norm": 1.5610332489013672, + "learning_rate": 0.0001945473357475427, + "loss": 2.0753, + "step": 537 + }, + { + "epoch": 0.05563454926191153, + "grad_norm": 1.1201059818267822, + "learning_rate": 0.00019453698913605795, + "loss": 1.6957, + "step": 538 + }, + { + "epoch": 0.05573795920477754, + "grad_norm": 1.3312937021255493, + "learning_rate": 0.00019452664252457322, + "loss": 1.8485, + "step": 539 + }, + { + "epoch": 0.055841369147643545, + "grad_norm": 1.670798659324646, + "learning_rate": 0.00019451629591308846, + "loss": 1.5113, + "step": 540 + }, + { + "epoch": 0.05594477909050955, + "grad_norm": 1.1942532062530518, + "learning_rate": 0.00019450594930160372, + "loss": 1.7411, + "step": 541 + }, + { + "epoch": 0.05604818903337556, + "grad_norm": 0.8941847681999207, + "learning_rate": 0.000194495602690119, + "loss": 2.2823, + "step": 542 + }, + { + "epoch": 0.056151598976241564, + "grad_norm": 1.014508605003357, + "learning_rate": 0.00019448525607863426, + "loss": 1.1381, + "step": 543 + }, + { + "epoch": 0.05625500891910757, + "grad_norm": 1.2505890130996704, + "learning_rate": 0.00019447490946714952, + "loss": 1.8727, + "step": 544 + }, + { + "epoch": 0.056358418861973576, + "grad_norm": 1.0327749252319336, + "learning_rate": 0.0001944645628556648, + "loss": 1.4399, + "step": 545 + }, + { + "epoch": 0.05646182880483958, + "grad_norm": 1.232862949371338, + "learning_rate": 0.00019445421624418002, + "loss": 1.7911, + "step": 546 + }, + { + "epoch": 0.05656523874770559, + "grad_norm": 1.9683369398117065, + "learning_rate": 0.0001944438696326953, + "loss": 2.122, + "step": 547 + }, + { + "epoch": 0.0566686486905716, + "grad_norm": 1.4104269742965698, + "learning_rate": 0.00019443352302121056, + "loss": 1.4098, + "step": 548 + }, + { + "epoch": 0.05677205863343761, + "grad_norm": 1.673534631729126, + "learning_rate": 0.00019442317640972582, + "loss": 1.9461, + "step": 549 + }, + { + "epoch": 0.056875468576303614, + "grad_norm": 0.748426616191864, + "learning_rate": 0.0001944128297982411, + "loss": 1.6687, + "step": 550 + }, + { + "epoch": 0.05697887851916962, + "grad_norm": 1.689274549484253, + "learning_rate": 0.00019440248318675635, + "loss": 1.8469, + "step": 551 + }, + { + "epoch": 0.05708228846203563, + "grad_norm": 1.210861086845398, + "learning_rate": 0.0001943921365752716, + "loss": 1.7772, + "step": 552 + }, + { + "epoch": 0.05718569840490163, + "grad_norm": 1.6195064783096313, + "learning_rate": 0.00019438178996378686, + "loss": 1.6484, + "step": 553 + }, + { + "epoch": 0.05728910834776764, + "grad_norm": 2.027785062789917, + "learning_rate": 0.00019437144335230212, + "loss": 2.0626, + "step": 554 + }, + { + "epoch": 0.057392518290633646, + "grad_norm": 2.145552158355713, + "learning_rate": 0.0001943610967408174, + "loss": 1.6507, + "step": 555 + }, + { + "epoch": 0.05749592823349965, + "grad_norm": 0.8149853348731995, + "learning_rate": 0.00019435075012933265, + "loss": 2.1143, + "step": 556 + }, + { + "epoch": 0.05759933817636566, + "grad_norm": 1.6035088300704956, + "learning_rate": 0.00019434040351784792, + "loss": 2.1081, + "step": 557 + }, + { + "epoch": 0.057702748119231664, + "grad_norm": 0.7236506938934326, + "learning_rate": 0.00019433005690636316, + "loss": 1.7452, + "step": 558 + }, + { + "epoch": 0.05780615806209767, + "grad_norm": 1.3249518871307373, + "learning_rate": 0.00019431971029487842, + "loss": 1.7061, + "step": 559 + }, + { + "epoch": 0.05790956800496368, + "grad_norm": 0.9372283220291138, + "learning_rate": 0.0001943093636833937, + "loss": 1.8231, + "step": 560 + }, + { + "epoch": 0.05801297794782968, + "grad_norm": 1.004478096961975, + "learning_rate": 0.00019429901707190895, + "loss": 1.4882, + "step": 561 + }, + { + "epoch": 0.05811638789069569, + "grad_norm": 1.0969711542129517, + "learning_rate": 0.00019428867046042422, + "loss": 1.8174, + "step": 562 + }, + { + "epoch": 0.058219797833561696, + "grad_norm": 1.0373283624649048, + "learning_rate": 0.00019427832384893949, + "loss": 1.5651, + "step": 563 + }, + { + "epoch": 0.0583232077764277, + "grad_norm": 0.7008000612258911, + "learning_rate": 0.00019426797723745472, + "loss": 2.0102, + "step": 564 + }, + { + "epoch": 0.05842661771929371, + "grad_norm": 1.4896759986877441, + "learning_rate": 0.00019425763062597, + "loss": 1.4347, + "step": 565 + }, + { + "epoch": 0.058530027662159714, + "grad_norm": 1.116275668144226, + "learning_rate": 0.00019424728401448526, + "loss": 2.1501, + "step": 566 + }, + { + "epoch": 0.05863343760502572, + "grad_norm": 1.1160902976989746, + "learning_rate": 0.00019423693740300052, + "loss": 1.7055, + "step": 567 + }, + { + "epoch": 0.05873684754789173, + "grad_norm": 1.161346673965454, + "learning_rate": 0.00019422659079151579, + "loss": 1.7059, + "step": 568 + }, + { + "epoch": 0.05884025749075773, + "grad_norm": 1.6276146173477173, + "learning_rate": 0.00019421624418003105, + "loss": 2.172, + "step": 569 + }, + { + "epoch": 0.058943667433623746, + "grad_norm": 1.095842957496643, + "learning_rate": 0.0001942058975685463, + "loss": 1.8135, + "step": 570 + }, + { + "epoch": 0.05904707737648975, + "grad_norm": 1.5054162740707397, + "learning_rate": 0.00019419555095706156, + "loss": 1.5956, + "step": 571 + }, + { + "epoch": 0.05915048731935576, + "grad_norm": 1.1078232526779175, + "learning_rate": 0.00019418520434557682, + "loss": 2.1434, + "step": 572 + }, + { + "epoch": 0.059253897262221765, + "grad_norm": 1.3417181968688965, + "learning_rate": 0.0001941748577340921, + "loss": 2.0869, + "step": 573 + }, + { + "epoch": 0.05935730720508777, + "grad_norm": 1.2181316614151, + "learning_rate": 0.00019416451112260735, + "loss": 1.8027, + "step": 574 + }, + { + "epoch": 0.05946071714795378, + "grad_norm": 1.519131064414978, + "learning_rate": 0.00019415416451112262, + "loss": 1.9721, + "step": 575 + }, + { + "epoch": 0.059564127090819784, + "grad_norm": 1.165762186050415, + "learning_rate": 0.00019414381789963786, + "loss": 1.0573, + "step": 576 + }, + { + "epoch": 0.05966753703368579, + "grad_norm": 0.9382355213165283, + "learning_rate": 0.00019413347128815312, + "loss": 1.8539, + "step": 577 + }, + { + "epoch": 0.059770946976551796, + "grad_norm": 2.0592143535614014, + "learning_rate": 0.0001941231246766684, + "loss": 1.8897, + "step": 578 + }, + { + "epoch": 0.0598743569194178, + "grad_norm": 1.1367623805999756, + "learning_rate": 0.00019411277806518365, + "loss": 2.2578, + "step": 579 + }, + { + "epoch": 0.05997776686228381, + "grad_norm": 1.5440205335617065, + "learning_rate": 0.00019410243145369892, + "loss": 1.8463, + "step": 580 + }, + { + "epoch": 0.060081176805149815, + "grad_norm": 1.3517295122146606, + "learning_rate": 0.00019409208484221418, + "loss": 1.7255, + "step": 581 + }, + { + "epoch": 0.06018458674801582, + "grad_norm": 1.735840916633606, + "learning_rate": 0.00019408173823072945, + "loss": 1.8489, + "step": 582 + }, + { + "epoch": 0.06028799669088183, + "grad_norm": 1.2492977380752563, + "learning_rate": 0.00019407139161924472, + "loss": 1.679, + "step": 583 + }, + { + "epoch": 0.060391406633747834, + "grad_norm": 1.0322548151016235, + "learning_rate": 0.00019406104500775995, + "loss": 1.7664, + "step": 584 + }, + { + "epoch": 0.06049481657661384, + "grad_norm": 1.5941745042800903, + "learning_rate": 0.00019405069839627522, + "loss": 2.0438, + "step": 585 + }, + { + "epoch": 0.060598226519479846, + "grad_norm": 0.8804157376289368, + "learning_rate": 0.00019404035178479049, + "loss": 1.7586, + "step": 586 + }, + { + "epoch": 0.06070163646234585, + "grad_norm": 1.5512956380844116, + "learning_rate": 0.00019403000517330575, + "loss": 2.1695, + "step": 587 + }, + { + "epoch": 0.06080504640521186, + "grad_norm": 1.4347621202468872, + "learning_rate": 0.00019401965856182102, + "loss": 1.8846, + "step": 588 + }, + { + "epoch": 0.060908456348077865, + "grad_norm": 1.970655083656311, + "learning_rate": 0.00019400931195033628, + "loss": 2.0841, + "step": 589 + }, + { + "epoch": 0.06101186629094387, + "grad_norm": 0.9423110485076904, + "learning_rate": 0.00019399896533885155, + "loss": 1.6569, + "step": 590 + }, + { + "epoch": 0.061115276233809884, + "grad_norm": 0.7332543730735779, + "learning_rate": 0.0001939886187273668, + "loss": 1.9821, + "step": 591 + }, + { + "epoch": 0.06121868617667589, + "grad_norm": 1.3857728242874146, + "learning_rate": 0.00019397827211588205, + "loss": 2.016, + "step": 592 + }, + { + "epoch": 0.0613220961195419, + "grad_norm": 1.6231861114501953, + "learning_rate": 0.00019396792550439732, + "loss": 1.6667, + "step": 593 + }, + { + "epoch": 0.0614255060624079, + "grad_norm": 2.417095422744751, + "learning_rate": 0.00019395757889291258, + "loss": 2.3043, + "step": 594 + }, + { + "epoch": 0.06152891600527391, + "grad_norm": 0.9848974943161011, + "learning_rate": 0.00019394723228142785, + "loss": 1.7858, + "step": 595 + }, + { + "epoch": 0.061632325948139915, + "grad_norm": 1.6868712902069092, + "learning_rate": 0.00019393688566994311, + "loss": 1.6796, + "step": 596 + }, + { + "epoch": 0.06173573589100592, + "grad_norm": 1.9725900888442993, + "learning_rate": 0.00019392653905845838, + "loss": 1.6825, + "step": 597 + }, + { + "epoch": 0.06183914583387193, + "grad_norm": 1.056667447090149, + "learning_rate": 0.00019391619244697364, + "loss": 1.9901, + "step": 598 + }, + { + "epoch": 0.061942555776737934, + "grad_norm": 1.6670137643814087, + "learning_rate": 0.0001939058458354889, + "loss": 1.517, + "step": 599 + }, + { + "epoch": 0.06204596571960394, + "grad_norm": 1.2356425523757935, + "learning_rate": 0.00019389549922400415, + "loss": 1.9377, + "step": 600 + }, + { + "epoch": 0.06214937566246995, + "grad_norm": 1.196425199508667, + "learning_rate": 0.00019388515261251941, + "loss": 1.6639, + "step": 601 + }, + { + "epoch": 0.06225278560533595, + "grad_norm": 0.9325123429298401, + "learning_rate": 0.00019387480600103468, + "loss": 1.6499, + "step": 602 + }, + { + "epoch": 0.06235619554820196, + "grad_norm": 1.1611047983169556, + "learning_rate": 0.00019386445938954995, + "loss": 2.0172, + "step": 603 + }, + { + "epoch": 0.062459605491067965, + "grad_norm": 2.512784957885742, + "learning_rate": 0.0001938541127780652, + "loss": 2.5452, + "step": 604 + }, + { + "epoch": 0.06256301543393397, + "grad_norm": 1.321507215499878, + "learning_rate": 0.00019384376616658048, + "loss": 1.3509, + "step": 605 + }, + { + "epoch": 0.06266642537679998, + "grad_norm": 0.9709866046905518, + "learning_rate": 0.00019383341955509572, + "loss": 1.9421, + "step": 606 + }, + { + "epoch": 0.06276983531966598, + "grad_norm": 1.2269256114959717, + "learning_rate": 0.00019382307294361098, + "loss": 1.7247, + "step": 607 + }, + { + "epoch": 0.06287324526253199, + "grad_norm": 1.642680287361145, + "learning_rate": 0.00019381272633212625, + "loss": 1.9324, + "step": 608 + }, + { + "epoch": 0.062976655205398, + "grad_norm": 1.0991445779800415, + "learning_rate": 0.0001938023797206415, + "loss": 1.5484, + "step": 609 + }, + { + "epoch": 0.063080065148264, + "grad_norm": 1.4630918502807617, + "learning_rate": 0.00019379203310915678, + "loss": 1.7564, + "step": 610 + }, + { + "epoch": 0.06318347509113001, + "grad_norm": 1.593808889389038, + "learning_rate": 0.00019378168649767204, + "loss": 1.6729, + "step": 611 + }, + { + "epoch": 0.06328688503399602, + "grad_norm": 2.05412220954895, + "learning_rate": 0.00019377133988618728, + "loss": 1.9342, + "step": 612 + }, + { + "epoch": 0.06339029497686202, + "grad_norm": 2.5476601123809814, + "learning_rate": 0.00019376099327470255, + "loss": 2.4104, + "step": 613 + }, + { + "epoch": 0.06349370491972803, + "grad_norm": 1.3254003524780273, + "learning_rate": 0.0001937506466632178, + "loss": 1.7209, + "step": 614 + }, + { + "epoch": 0.06359711486259403, + "grad_norm": 1.5243037939071655, + "learning_rate": 0.00019374030005173308, + "loss": 1.8476, + "step": 615 + }, + { + "epoch": 0.06370052480546004, + "grad_norm": 1.1326062679290771, + "learning_rate": 0.00019372995344024834, + "loss": 1.3442, + "step": 616 + }, + { + "epoch": 0.06380393474832605, + "grad_norm": 1.375542402267456, + "learning_rate": 0.0001937196068287636, + "loss": 1.6413, + "step": 617 + }, + { + "epoch": 0.06390734469119205, + "grad_norm": 1.1514317989349365, + "learning_rate": 0.00019370926021727885, + "loss": 1.2693, + "step": 618 + }, + { + "epoch": 0.06401075463405806, + "grad_norm": 1.629643201828003, + "learning_rate": 0.00019369891360579411, + "loss": 1.8193, + "step": 619 + }, + { + "epoch": 0.06411416457692407, + "grad_norm": 9.200542449951172, + "learning_rate": 0.00019368856699430938, + "loss": 1.8048, + "step": 620 + }, + { + "epoch": 0.06421757451979007, + "grad_norm": 1.2734607458114624, + "learning_rate": 0.00019367822038282464, + "loss": 2.2, + "step": 621 + }, + { + "epoch": 0.06432098446265608, + "grad_norm": 1.6576577425003052, + "learning_rate": 0.0001936678737713399, + "loss": 2.0564, + "step": 622 + }, + { + "epoch": 0.06442439440552208, + "grad_norm": 1.359603762626648, + "learning_rate": 0.00019365752715985518, + "loss": 1.9281, + "step": 623 + }, + { + "epoch": 0.0645278043483881, + "grad_norm": 0.9970663785934448, + "learning_rate": 0.00019364718054837041, + "loss": 1.6042, + "step": 624 + }, + { + "epoch": 0.06463121429125411, + "grad_norm": 1.8795320987701416, + "learning_rate": 0.00019363683393688568, + "loss": 2.0144, + "step": 625 + }, + { + "epoch": 0.06473462423412012, + "grad_norm": 1.8463833332061768, + "learning_rate": 0.00019362648732540095, + "loss": 1.9506, + "step": 626 + }, + { + "epoch": 0.06483803417698612, + "grad_norm": 1.0363043546676636, + "learning_rate": 0.0001936161407139162, + "loss": 2.2462, + "step": 627 + }, + { + "epoch": 0.06494144411985213, + "grad_norm": 1.2393428087234497, + "learning_rate": 0.00019360579410243148, + "loss": 1.2618, + "step": 628 + }, + { + "epoch": 0.06504485406271814, + "grad_norm": 1.0203514099121094, + "learning_rate": 0.00019359544749094674, + "loss": 1.9836, + "step": 629 + }, + { + "epoch": 0.06514826400558414, + "grad_norm": 1.408217191696167, + "learning_rate": 0.00019358510087946198, + "loss": 1.9312, + "step": 630 + }, + { + "epoch": 0.06525167394845015, + "grad_norm": 1.4327986240386963, + "learning_rate": 0.00019357475426797725, + "loss": 1.9366, + "step": 631 + }, + { + "epoch": 0.06535508389131615, + "grad_norm": 0.8558879494667053, + "learning_rate": 0.0001935644076564925, + "loss": 1.7253, + "step": 632 + }, + { + "epoch": 0.06545849383418216, + "grad_norm": 1.3604400157928467, + "learning_rate": 0.00019355406104500778, + "loss": 2.0347, + "step": 633 + }, + { + "epoch": 0.06556190377704817, + "grad_norm": 1.0031954050064087, + "learning_rate": 0.00019354371443352304, + "loss": 1.8156, + "step": 634 + }, + { + "epoch": 0.06566531371991417, + "grad_norm": 1.2598495483398438, + "learning_rate": 0.00019353336782203828, + "loss": 1.5517, + "step": 635 + }, + { + "epoch": 0.06576872366278018, + "grad_norm": 0.7164933681488037, + "learning_rate": 0.00019352302121055355, + "loss": 1.7041, + "step": 636 + }, + { + "epoch": 0.06587213360564619, + "grad_norm": 1.795189380645752, + "learning_rate": 0.0001935126745990688, + "loss": 1.7281, + "step": 637 + }, + { + "epoch": 0.06597554354851219, + "grad_norm": 1.1797122955322266, + "learning_rate": 0.00019350232798758408, + "loss": 1.6707, + "step": 638 + }, + { + "epoch": 0.0660789534913782, + "grad_norm": 1.0083849430084229, + "learning_rate": 0.00019349198137609934, + "loss": 1.6134, + "step": 639 + }, + { + "epoch": 0.0661823634342442, + "grad_norm": 0.823398768901825, + "learning_rate": 0.0001934816347646146, + "loss": 2.1252, + "step": 640 + }, + { + "epoch": 0.06628577337711021, + "grad_norm": 1.0543327331542969, + "learning_rate": 0.00019347128815312985, + "loss": 1.4388, + "step": 641 + }, + { + "epoch": 0.06638918331997622, + "grad_norm": 1.2253812551498413, + "learning_rate": 0.0001934609415416451, + "loss": 1.6264, + "step": 642 + }, + { + "epoch": 0.06649259326284222, + "grad_norm": 0.5971275568008423, + "learning_rate": 0.00019345059493016038, + "loss": 2.0363, + "step": 643 + }, + { + "epoch": 0.06659600320570823, + "grad_norm": 1.0794010162353516, + "learning_rate": 0.00019344024831867564, + "loss": 1.8093, + "step": 644 + }, + { + "epoch": 0.06669941314857424, + "grad_norm": 0.8961673378944397, + "learning_rate": 0.0001934299017071909, + "loss": 1.8438, + "step": 645 + }, + { + "epoch": 0.06680282309144024, + "grad_norm": 0.8820412158966064, + "learning_rate": 0.00019341955509570618, + "loss": 2.2632, + "step": 646 + }, + { + "epoch": 0.06690623303430625, + "grad_norm": 1.2584704160690308, + "learning_rate": 0.00019340920848422141, + "loss": 1.7135, + "step": 647 + }, + { + "epoch": 0.06700964297717225, + "grad_norm": 1.5364105701446533, + "learning_rate": 0.00019339886187273668, + "loss": 2.0992, + "step": 648 + }, + { + "epoch": 0.06711305292003826, + "grad_norm": 1.1961735486984253, + "learning_rate": 0.00019338851526125195, + "loss": 1.7492, + "step": 649 + }, + { + "epoch": 0.06721646286290427, + "grad_norm": 1.824004888534546, + "learning_rate": 0.0001933781686497672, + "loss": 2.0148, + "step": 650 + }, + { + "epoch": 0.06731987280577027, + "grad_norm": 1.5567946434020996, + "learning_rate": 0.00019336782203828248, + "loss": 1.4838, + "step": 651 + }, + { + "epoch": 0.06742328274863628, + "grad_norm": 1.0475163459777832, + "learning_rate": 0.00019335747542679774, + "loss": 1.7052, + "step": 652 + }, + { + "epoch": 0.06752669269150229, + "grad_norm": 1.7999539375305176, + "learning_rate": 0.00019334712881531298, + "loss": 1.8598, + "step": 653 + }, + { + "epoch": 0.06763010263436829, + "grad_norm": 1.4463770389556885, + "learning_rate": 0.00019333678220382825, + "loss": 1.6957, + "step": 654 + }, + { + "epoch": 0.0677335125772343, + "grad_norm": 1.0363894701004028, + "learning_rate": 0.0001933264355923435, + "loss": 1.9333, + "step": 655 + }, + { + "epoch": 0.0678369225201003, + "grad_norm": 0.8181893825531006, + "learning_rate": 0.00019331608898085878, + "loss": 1.7909, + "step": 656 + }, + { + "epoch": 0.06794033246296631, + "grad_norm": 1.2460615634918213, + "learning_rate": 0.00019330574236937404, + "loss": 2.0109, + "step": 657 + }, + { + "epoch": 0.06804374240583232, + "grad_norm": 1.011409878730774, + "learning_rate": 0.0001932953957578893, + "loss": 2.0692, + "step": 658 + }, + { + "epoch": 0.06814715234869832, + "grad_norm": 1.0723906755447388, + "learning_rate": 0.00019328504914640455, + "loss": 2.0749, + "step": 659 + }, + { + "epoch": 0.06825056229156433, + "grad_norm": 1.5507792234420776, + "learning_rate": 0.0001932747025349198, + "loss": 2.0177, + "step": 660 + }, + { + "epoch": 0.06835397223443034, + "grad_norm": 1.6804261207580566, + "learning_rate": 0.00019326435592343508, + "loss": 1.7871, + "step": 661 + }, + { + "epoch": 0.06845738217729634, + "grad_norm": 1.0971132516860962, + "learning_rate": 0.00019325400931195034, + "loss": 1.7657, + "step": 662 + }, + { + "epoch": 0.06856079212016235, + "grad_norm": 0.9800976514816284, + "learning_rate": 0.0001932436627004656, + "loss": 1.6668, + "step": 663 + }, + { + "epoch": 0.06866420206302835, + "grad_norm": 1.3540741205215454, + "learning_rate": 0.00019323331608898087, + "loss": 1.5822, + "step": 664 + }, + { + "epoch": 0.06876761200589436, + "grad_norm": 1.2192633152008057, + "learning_rate": 0.0001932229694774961, + "loss": 1.8653, + "step": 665 + }, + { + "epoch": 0.06887102194876037, + "grad_norm": 1.444132924079895, + "learning_rate": 0.00019321262286601138, + "loss": 2.2671, + "step": 666 + }, + { + "epoch": 0.06897443189162639, + "grad_norm": 1.5947513580322266, + "learning_rate": 0.00019320227625452664, + "loss": 1.4868, + "step": 667 + }, + { + "epoch": 0.06907784183449239, + "grad_norm": 1.0117969512939453, + "learning_rate": 0.0001931919296430419, + "loss": 1.2683, + "step": 668 + }, + { + "epoch": 0.0691812517773584, + "grad_norm": 1.3698185682296753, + "learning_rate": 0.00019318158303155718, + "loss": 1.8173, + "step": 669 + }, + { + "epoch": 0.0692846617202244, + "grad_norm": 2.321216344833374, + "learning_rate": 0.00019317123642007244, + "loss": 1.4577, + "step": 670 + }, + { + "epoch": 0.06938807166309041, + "grad_norm": 1.6404632329940796, + "learning_rate": 0.00019316088980858768, + "loss": 1.4561, + "step": 671 + }, + { + "epoch": 0.06949148160595642, + "grad_norm": 1.2985002994537354, + "learning_rate": 0.00019315054319710294, + "loss": 2.0832, + "step": 672 + }, + { + "epoch": 0.06959489154882242, + "grad_norm": 1.020167589187622, + "learning_rate": 0.0001931401965856182, + "loss": 2.0277, + "step": 673 + }, + { + "epoch": 0.06969830149168843, + "grad_norm": 1.8142248392105103, + "learning_rate": 0.00019312984997413348, + "loss": 1.7957, + "step": 674 + }, + { + "epoch": 0.06980171143455444, + "grad_norm": 2.099975824356079, + "learning_rate": 0.00019311950336264874, + "loss": 1.9989, + "step": 675 + }, + { + "epoch": 0.06990512137742044, + "grad_norm": 2.1778149604797363, + "learning_rate": 0.000193109156751164, + "loss": 1.7527, + "step": 676 + }, + { + "epoch": 0.07000853132028645, + "grad_norm": 2.6961352825164795, + "learning_rate": 0.00019309881013967925, + "loss": 2.3416, + "step": 677 + }, + { + "epoch": 0.07011194126315246, + "grad_norm": 1.3259974718093872, + "learning_rate": 0.0001930884635281945, + "loss": 1.9032, + "step": 678 + }, + { + "epoch": 0.07021535120601846, + "grad_norm": 1.0762745141983032, + "learning_rate": 0.00019307811691670978, + "loss": 1.2353, + "step": 679 + }, + { + "epoch": 0.07031876114888447, + "grad_norm": 0.88735431432724, + "learning_rate": 0.00019306777030522504, + "loss": 1.756, + "step": 680 + }, + { + "epoch": 0.07042217109175047, + "grad_norm": 1.5669922828674316, + "learning_rate": 0.0001930574236937403, + "loss": 1.9952, + "step": 681 + }, + { + "epoch": 0.07052558103461648, + "grad_norm": 1.8280024528503418, + "learning_rate": 0.00019304707708225557, + "loss": 2.0567, + "step": 682 + }, + { + "epoch": 0.07062899097748249, + "grad_norm": 1.9424211978912354, + "learning_rate": 0.0001930367304707708, + "loss": 1.873, + "step": 683 + }, + { + "epoch": 0.07073240092034849, + "grad_norm": 1.0285444259643555, + "learning_rate": 0.00019302638385928608, + "loss": 1.4914, + "step": 684 + }, + { + "epoch": 0.0708358108632145, + "grad_norm": 1.020849347114563, + "learning_rate": 0.00019301603724780134, + "loss": 1.7896, + "step": 685 + }, + { + "epoch": 0.0709392208060805, + "grad_norm": 1.274097204208374, + "learning_rate": 0.0001930056906363166, + "loss": 1.6604, + "step": 686 + }, + { + "epoch": 0.07104263074894651, + "grad_norm": 2.1440742015838623, + "learning_rate": 0.00019299534402483187, + "loss": 1.6177, + "step": 687 + }, + { + "epoch": 0.07114604069181252, + "grad_norm": 1.0610333681106567, + "learning_rate": 0.00019298499741334714, + "loss": 1.6835, + "step": 688 + }, + { + "epoch": 0.07124945063467852, + "grad_norm": 1.0177770853042603, + "learning_rate": 0.00019297465080186238, + "loss": 1.5577, + "step": 689 + }, + { + "epoch": 0.07135286057754453, + "grad_norm": 1.0054686069488525, + "learning_rate": 0.00019296430419037764, + "loss": 1.4735, + "step": 690 + }, + { + "epoch": 0.07145627052041054, + "grad_norm": 0.7136942148208618, + "learning_rate": 0.0001929539575788929, + "loss": 2.0558, + "step": 691 + }, + { + "epoch": 0.07155968046327654, + "grad_norm": 2.260354995727539, + "learning_rate": 0.00019294361096740818, + "loss": 1.7685, + "step": 692 + }, + { + "epoch": 0.07166309040614255, + "grad_norm": 1.2539150714874268, + "learning_rate": 0.00019293326435592344, + "loss": 1.8324, + "step": 693 + }, + { + "epoch": 0.07176650034900856, + "grad_norm": 1.8409538269042969, + "learning_rate": 0.0001929229177444387, + "loss": 1.9525, + "step": 694 + }, + { + "epoch": 0.07186991029187456, + "grad_norm": 1.0681891441345215, + "learning_rate": 0.00019291257113295394, + "loss": 1.5554, + "step": 695 + }, + { + "epoch": 0.07197332023474057, + "grad_norm": 2.5881447792053223, + "learning_rate": 0.0001929022245214692, + "loss": 2.1174, + "step": 696 + }, + { + "epoch": 0.07207673017760657, + "grad_norm": 0.9745953679084778, + "learning_rate": 0.00019289187790998448, + "loss": 1.3149, + "step": 697 + }, + { + "epoch": 0.07218014012047258, + "grad_norm": 1.1194660663604736, + "learning_rate": 0.00019288153129849974, + "loss": 1.3463, + "step": 698 + }, + { + "epoch": 0.07228355006333859, + "grad_norm": 1.7407701015472412, + "learning_rate": 0.000192871184687015, + "loss": 2.2298, + "step": 699 + }, + { + "epoch": 0.07238696000620459, + "grad_norm": 1.0828900337219238, + "learning_rate": 0.00019286083807553027, + "loss": 1.7538, + "step": 700 + }, + { + "epoch": 0.0724903699490706, + "grad_norm": 2.003937005996704, + "learning_rate": 0.0001928504914640455, + "loss": 1.5031, + "step": 701 + }, + { + "epoch": 0.0725937798919366, + "grad_norm": 1.4628642797470093, + "learning_rate": 0.00019284014485256078, + "loss": 1.7986, + "step": 702 + }, + { + "epoch": 0.07269718983480261, + "grad_norm": 1.0137124061584473, + "learning_rate": 0.00019282979824107604, + "loss": 1.5951, + "step": 703 + }, + { + "epoch": 0.07280059977766862, + "grad_norm": 1.588923692703247, + "learning_rate": 0.0001928194516295913, + "loss": 1.8639, + "step": 704 + }, + { + "epoch": 0.07290400972053462, + "grad_norm": 0.8961623311042786, + "learning_rate": 0.00019280910501810657, + "loss": 1.5859, + "step": 705 + }, + { + "epoch": 0.07300741966340063, + "grad_norm": 0.8769466280937195, + "learning_rate": 0.00019279875840662184, + "loss": 1.3753, + "step": 706 + }, + { + "epoch": 0.07311082960626664, + "grad_norm": 0.658501148223877, + "learning_rate": 0.0001927884117951371, + "loss": 2.1286, + "step": 707 + }, + { + "epoch": 0.07321423954913264, + "grad_norm": 1.503748893737793, + "learning_rate": 0.00019277806518365237, + "loss": 2.365, + "step": 708 + }, + { + "epoch": 0.07331764949199865, + "grad_norm": 1.359349250793457, + "learning_rate": 0.0001927677185721676, + "loss": 1.9966, + "step": 709 + }, + { + "epoch": 0.07342105943486467, + "grad_norm": 1.5836228132247925, + "learning_rate": 0.00019275737196068287, + "loss": 1.2079, + "step": 710 + }, + { + "epoch": 0.07352446937773068, + "grad_norm": 1.0415390729904175, + "learning_rate": 0.00019274702534919814, + "loss": 1.8506, + "step": 711 + }, + { + "epoch": 0.07362787932059668, + "grad_norm": 1.3095402717590332, + "learning_rate": 0.0001927366787377134, + "loss": 1.2082, + "step": 712 + }, + { + "epoch": 0.07373128926346269, + "grad_norm": 1.356926441192627, + "learning_rate": 0.00019272633212622867, + "loss": 1.341, + "step": 713 + }, + { + "epoch": 0.0738346992063287, + "grad_norm": 1.0612878799438477, + "learning_rate": 0.00019271598551474394, + "loss": 1.7926, + "step": 714 + }, + { + "epoch": 0.0739381091491947, + "grad_norm": 1.9951964616775513, + "learning_rate": 0.0001927056389032592, + "loss": 1.6938, + "step": 715 + }, + { + "epoch": 0.0740415190920607, + "grad_norm": 1.6049528121948242, + "learning_rate": 0.00019269529229177447, + "loss": 1.602, + "step": 716 + }, + { + "epoch": 0.07414492903492671, + "grad_norm": 1.646397590637207, + "learning_rate": 0.0001926849456802897, + "loss": 1.9062, + "step": 717 + }, + { + "epoch": 0.07424833897779272, + "grad_norm": 1.7734266519546509, + "learning_rate": 0.00019267459906880497, + "loss": 1.9716, + "step": 718 + }, + { + "epoch": 0.07435174892065873, + "grad_norm": 3.2258822917938232, + "learning_rate": 0.00019266425245732024, + "loss": 2.0379, + "step": 719 + }, + { + "epoch": 0.07445515886352473, + "grad_norm": 1.35995614528656, + "learning_rate": 0.0001926539058458355, + "loss": 1.5753, + "step": 720 + }, + { + "epoch": 0.07455856880639074, + "grad_norm": 1.1354846954345703, + "learning_rate": 0.00019264355923435077, + "loss": 1.5182, + "step": 721 + }, + { + "epoch": 0.07466197874925674, + "grad_norm": 0.9102991223335266, + "learning_rate": 0.00019263321262286603, + "loss": 1.3167, + "step": 722 + }, + { + "epoch": 0.07476538869212275, + "grad_norm": 1.0951752662658691, + "learning_rate": 0.0001926228660113813, + "loss": 2.0027, + "step": 723 + }, + { + "epoch": 0.07486879863498876, + "grad_norm": 1.243021845817566, + "learning_rate": 0.00019261251939989656, + "loss": 1.8353, + "step": 724 + }, + { + "epoch": 0.07497220857785476, + "grad_norm": 1.185826063156128, + "learning_rate": 0.0001926021727884118, + "loss": 1.4974, + "step": 725 + }, + { + "epoch": 0.07507561852072077, + "grad_norm": 1.537313461303711, + "learning_rate": 0.00019259182617692707, + "loss": 1.7462, + "step": 726 + }, + { + "epoch": 0.07517902846358678, + "grad_norm": 1.8373098373413086, + "learning_rate": 0.00019258147956544233, + "loss": 2.0801, + "step": 727 + }, + { + "epoch": 0.07528243840645278, + "grad_norm": 1.0397356748580933, + "learning_rate": 0.0001925711329539576, + "loss": 1.3511, + "step": 728 + }, + { + "epoch": 0.07538584834931879, + "grad_norm": 1.2884058952331543, + "learning_rate": 0.00019256078634247287, + "loss": 2.1643, + "step": 729 + }, + { + "epoch": 0.0754892582921848, + "grad_norm": 1.4546620845794678, + "learning_rate": 0.00019255043973098813, + "loss": 2.0311, + "step": 730 + }, + { + "epoch": 0.0755926682350508, + "grad_norm": 1.3401422500610352, + "learning_rate": 0.00019254009311950337, + "loss": 2.1767, + "step": 731 + }, + { + "epoch": 0.0756960781779168, + "grad_norm": 2.024405002593994, + "learning_rate": 0.00019252974650801864, + "loss": 1.8924, + "step": 732 + }, + { + "epoch": 0.07579948812078281, + "grad_norm": 1.3837287425994873, + "learning_rate": 0.0001925193998965339, + "loss": 1.6319, + "step": 733 + }, + { + "epoch": 0.07590289806364882, + "grad_norm": 1.0432416200637817, + "learning_rate": 0.00019250905328504917, + "loss": 1.3704, + "step": 734 + }, + { + "epoch": 0.07600630800651482, + "grad_norm": 2.283970594406128, + "learning_rate": 0.00019249870667356443, + "loss": 2.1359, + "step": 735 + }, + { + "epoch": 0.07610971794938083, + "grad_norm": 1.2128592729568481, + "learning_rate": 0.0001924883600620797, + "loss": 1.5521, + "step": 736 + }, + { + "epoch": 0.07621312789224684, + "grad_norm": 1.692131757736206, + "learning_rate": 0.00019247801345059494, + "loss": 2.5598, + "step": 737 + }, + { + "epoch": 0.07631653783511284, + "grad_norm": 1.76670241355896, + "learning_rate": 0.0001924676668391102, + "loss": 1.6109, + "step": 738 + }, + { + "epoch": 0.07641994777797885, + "grad_norm": 2.0876705646514893, + "learning_rate": 0.00019245732022762547, + "loss": 1.9295, + "step": 739 + }, + { + "epoch": 0.07652335772084486, + "grad_norm": 1.60186767578125, + "learning_rate": 0.00019244697361614073, + "loss": 1.5051, + "step": 740 + }, + { + "epoch": 0.07662676766371086, + "grad_norm": 0.9031509160995483, + "learning_rate": 0.000192436627004656, + "loss": 1.8877, + "step": 741 + }, + { + "epoch": 0.07673017760657687, + "grad_norm": 2.985736131668091, + "learning_rate": 0.00019242628039317126, + "loss": 2.0977, + "step": 742 + }, + { + "epoch": 0.07683358754944287, + "grad_norm": 1.0996371507644653, + "learning_rate": 0.0001924159337816865, + "loss": 1.7847, + "step": 743 + }, + { + "epoch": 0.07693699749230888, + "grad_norm": 2.194692611694336, + "learning_rate": 0.00019240558717020177, + "loss": 2.3167, + "step": 744 + }, + { + "epoch": 0.07704040743517489, + "grad_norm": 0.8932716846466064, + "learning_rate": 0.00019239524055871703, + "loss": 1.9315, + "step": 745 + }, + { + "epoch": 0.0771438173780409, + "grad_norm": 1.4136930704116821, + "learning_rate": 0.0001923848939472323, + "loss": 1.7161, + "step": 746 + }, + { + "epoch": 0.0772472273209069, + "grad_norm": 1.594767689704895, + "learning_rate": 0.00019237454733574756, + "loss": 2.0131, + "step": 747 + }, + { + "epoch": 0.0773506372637729, + "grad_norm": 1.9824858903884888, + "learning_rate": 0.00019236420072426283, + "loss": 2.0987, + "step": 748 + }, + { + "epoch": 0.07745404720663891, + "grad_norm": 1.2606532573699951, + "learning_rate": 0.00019235385411277807, + "loss": 1.3413, + "step": 749 + }, + { + "epoch": 0.07755745714950492, + "grad_norm": 2.217545509338379, + "learning_rate": 0.00019234350750129333, + "loss": 1.7555, + "step": 750 + }, + { + "epoch": 0.07766086709237092, + "grad_norm": 1.3210011720657349, + "learning_rate": 0.0001923331608898086, + "loss": 2.0739, + "step": 751 + }, + { + "epoch": 0.07776427703523693, + "grad_norm": 1.381223440170288, + "learning_rate": 0.00019232281427832387, + "loss": 1.3535, + "step": 752 + }, + { + "epoch": 0.07786768697810295, + "grad_norm": 0.9292250275611877, + "learning_rate": 0.00019231246766683913, + "loss": 1.1796, + "step": 753 + }, + { + "epoch": 0.07797109692096896, + "grad_norm": 2.1304092407226562, + "learning_rate": 0.0001923021210553544, + "loss": 2.2769, + "step": 754 + }, + { + "epoch": 0.07807450686383496, + "grad_norm": 1.3840636014938354, + "learning_rate": 0.00019229177444386964, + "loss": 1.3881, + "step": 755 + }, + { + "epoch": 0.07817791680670097, + "grad_norm": 1.8769537210464478, + "learning_rate": 0.0001922814278323849, + "loss": 1.5471, + "step": 756 + }, + { + "epoch": 0.07828132674956698, + "grad_norm": 1.668870210647583, + "learning_rate": 0.00019227108122090017, + "loss": 2.2528, + "step": 757 + }, + { + "epoch": 0.07838473669243298, + "grad_norm": 1.6583913564682007, + "learning_rate": 0.00019226073460941543, + "loss": 1.835, + "step": 758 + }, + { + "epoch": 0.07848814663529899, + "grad_norm": 2.5908656120300293, + "learning_rate": 0.0001922503879979307, + "loss": 2.1578, + "step": 759 + }, + { + "epoch": 0.078591556578165, + "grad_norm": 0.833453357219696, + "learning_rate": 0.00019224004138644596, + "loss": 1.8684, + "step": 760 + }, + { + "epoch": 0.078694966521031, + "grad_norm": 0.9646596908569336, + "learning_rate": 0.0001922296947749612, + "loss": 1.2843, + "step": 761 + }, + { + "epoch": 0.07879837646389701, + "grad_norm": 1.4253886938095093, + "learning_rate": 0.00019221934816347647, + "loss": 1.8455, + "step": 762 + }, + { + "epoch": 0.07890178640676301, + "grad_norm": 1.565880537033081, + "learning_rate": 0.00019220900155199173, + "loss": 2.1664, + "step": 763 + }, + { + "epoch": 0.07900519634962902, + "grad_norm": 2.0248756408691406, + "learning_rate": 0.000192198654940507, + "loss": 2.2098, + "step": 764 + }, + { + "epoch": 0.07910860629249503, + "grad_norm": 1.5047415494918823, + "learning_rate": 0.00019218830832902226, + "loss": 1.9411, + "step": 765 + }, + { + "epoch": 0.07921201623536103, + "grad_norm": 3.2473227977752686, + "learning_rate": 0.00019217796171753753, + "loss": 2.1491, + "step": 766 + }, + { + "epoch": 0.07931542617822704, + "grad_norm": 1.174277424812317, + "learning_rate": 0.00019216761510605277, + "loss": 1.403, + "step": 767 + }, + { + "epoch": 0.07941883612109304, + "grad_norm": 1.26174795627594, + "learning_rate": 0.00019215726849456803, + "loss": 1.6089, + "step": 768 + }, + { + "epoch": 0.07952224606395905, + "grad_norm": 1.7186594009399414, + "learning_rate": 0.0001921469218830833, + "loss": 1.7814, + "step": 769 + }, + { + "epoch": 0.07962565600682506, + "grad_norm": 1.6304243803024292, + "learning_rate": 0.00019213657527159856, + "loss": 1.458, + "step": 770 + }, + { + "epoch": 0.07972906594969106, + "grad_norm": 1.331419825553894, + "learning_rate": 0.00019212622866011383, + "loss": 2.0749, + "step": 771 + }, + { + "epoch": 0.07983247589255707, + "grad_norm": 1.0822752714157104, + "learning_rate": 0.0001921158820486291, + "loss": 1.8906, + "step": 772 + }, + { + "epoch": 0.07993588583542308, + "grad_norm": 1.3944919109344482, + "learning_rate": 0.00019210553543714433, + "loss": 1.9101, + "step": 773 + }, + { + "epoch": 0.08003929577828908, + "grad_norm": 1.3132551908493042, + "learning_rate": 0.0001920951888256596, + "loss": 1.4764, + "step": 774 + }, + { + "epoch": 0.08014270572115509, + "grad_norm": 1.2457085847854614, + "learning_rate": 0.00019208484221417487, + "loss": 2.1376, + "step": 775 + }, + { + "epoch": 0.0802461156640211, + "grad_norm": 1.6716758012771606, + "learning_rate": 0.00019207449560269013, + "loss": 1.9652, + "step": 776 + }, + { + "epoch": 0.0803495256068871, + "grad_norm": 2.1182303428649902, + "learning_rate": 0.0001920641489912054, + "loss": 2.3112, + "step": 777 + }, + { + "epoch": 0.08045293554975311, + "grad_norm": 1.169825553894043, + "learning_rate": 0.00019205380237972066, + "loss": 1.7178, + "step": 778 + }, + { + "epoch": 0.08055634549261911, + "grad_norm": 1.0207164287567139, + "learning_rate": 0.0001920434557682359, + "loss": 1.3879, + "step": 779 + }, + { + "epoch": 0.08065975543548512, + "grad_norm": 0.6265839338302612, + "learning_rate": 0.00019203310915675117, + "loss": 1.9532, + "step": 780 + }, + { + "epoch": 0.08076316537835113, + "grad_norm": 1.588606834411621, + "learning_rate": 0.00019202276254526643, + "loss": 1.8934, + "step": 781 + }, + { + "epoch": 0.08086657532121713, + "grad_norm": 1.160733699798584, + "learning_rate": 0.0001920124159337817, + "loss": 1.7211, + "step": 782 + }, + { + "epoch": 0.08096998526408314, + "grad_norm": 0.9673109650611877, + "learning_rate": 0.00019200206932229696, + "loss": 1.7659, + "step": 783 + }, + { + "epoch": 0.08107339520694914, + "grad_norm": 1.6592562198638916, + "learning_rate": 0.00019199172271081223, + "loss": 2.2091, + "step": 784 + }, + { + "epoch": 0.08117680514981515, + "grad_norm": 1.3717565536499023, + "learning_rate": 0.00019198137609932747, + "loss": 1.7133, + "step": 785 + }, + { + "epoch": 0.08128021509268116, + "grad_norm": 1.01557457447052, + "learning_rate": 0.00019197102948784273, + "loss": 1.6405, + "step": 786 + }, + { + "epoch": 0.08138362503554716, + "grad_norm": 1.504614233970642, + "learning_rate": 0.000191960682876358, + "loss": 2.049, + "step": 787 + }, + { + "epoch": 0.08148703497841317, + "grad_norm": 1.606438159942627, + "learning_rate": 0.00019195033626487326, + "loss": 1.5679, + "step": 788 + }, + { + "epoch": 0.08159044492127918, + "grad_norm": 1.1667381525039673, + "learning_rate": 0.00019193998965338853, + "loss": 1.507, + "step": 789 + }, + { + "epoch": 0.08169385486414518, + "grad_norm": 1.2914198637008667, + "learning_rate": 0.0001919296430419038, + "loss": 1.9261, + "step": 790 + }, + { + "epoch": 0.08179726480701119, + "grad_norm": 1.4520224332809448, + "learning_rate": 0.00019191929643041903, + "loss": 1.7537, + "step": 791 + }, + { + "epoch": 0.0819006747498772, + "grad_norm": 0.614142894744873, + "learning_rate": 0.0001919089498189343, + "loss": 2.1076, + "step": 792 + }, + { + "epoch": 0.0820040846927432, + "grad_norm": 1.1694809198379517, + "learning_rate": 0.00019189860320744956, + "loss": 1.7364, + "step": 793 + }, + { + "epoch": 0.08210749463560921, + "grad_norm": 1.0019803047180176, + "learning_rate": 0.00019188825659596483, + "loss": 1.8743, + "step": 794 + }, + { + "epoch": 0.08221090457847521, + "grad_norm": 1.2464485168457031, + "learning_rate": 0.0001918779099844801, + "loss": 1.9216, + "step": 795 + }, + { + "epoch": 0.08231431452134123, + "grad_norm": 1.84528648853302, + "learning_rate": 0.00019186756337299536, + "loss": 2.1066, + "step": 796 + }, + { + "epoch": 0.08241772446420724, + "grad_norm": 1.9971226453781128, + "learning_rate": 0.0001918572167615106, + "loss": 1.7721, + "step": 797 + }, + { + "epoch": 0.08252113440707325, + "grad_norm": 1.5557864904403687, + "learning_rate": 0.00019184687015002586, + "loss": 1.8283, + "step": 798 + }, + { + "epoch": 0.08262454434993925, + "grad_norm": 1.3879731893539429, + "learning_rate": 0.00019183652353854113, + "loss": 1.9447, + "step": 799 + }, + { + "epoch": 0.08272795429280526, + "grad_norm": 1.3920087814331055, + "learning_rate": 0.0001918261769270564, + "loss": 1.9993, + "step": 800 + }, + { + "epoch": 0.08283136423567126, + "grad_norm": 1.2388873100280762, + "learning_rate": 0.00019181583031557166, + "loss": 1.4231, + "step": 801 + }, + { + "epoch": 0.08293477417853727, + "grad_norm": 0.550944983959198, + "learning_rate": 0.00019180548370408693, + "loss": 1.907, + "step": 802 + }, + { + "epoch": 0.08303818412140328, + "grad_norm": 1.2141319513320923, + "learning_rate": 0.00019179513709260217, + "loss": 1.5638, + "step": 803 + }, + { + "epoch": 0.08314159406426928, + "grad_norm": 1.085733413696289, + "learning_rate": 0.00019178479048111743, + "loss": 1.3835, + "step": 804 + }, + { + "epoch": 0.08324500400713529, + "grad_norm": 1.7833837270736694, + "learning_rate": 0.0001917744438696327, + "loss": 1.62, + "step": 805 + }, + { + "epoch": 0.0833484139500013, + "grad_norm": 1.7468349933624268, + "learning_rate": 0.00019176409725814796, + "loss": 2.0823, + "step": 806 + }, + { + "epoch": 0.0834518238928673, + "grad_norm": 2.2203943729400635, + "learning_rate": 0.00019175375064666323, + "loss": 2.3832, + "step": 807 + }, + { + "epoch": 0.08355523383573331, + "grad_norm": 1.2291004657745361, + "learning_rate": 0.0001917434040351785, + "loss": 1.3023, + "step": 808 + }, + { + "epoch": 0.08365864377859931, + "grad_norm": 0.8754696249961853, + "learning_rate": 0.00019173305742369373, + "loss": 1.7436, + "step": 809 + }, + { + "epoch": 0.08376205372146532, + "grad_norm": 1.8157202005386353, + "learning_rate": 0.000191722710812209, + "loss": 1.6954, + "step": 810 + }, + { + "epoch": 0.08386546366433133, + "grad_norm": 1.9625682830810547, + "learning_rate": 0.00019171236420072426, + "loss": 2.0076, + "step": 811 + }, + { + "epoch": 0.08396887360719733, + "grad_norm": 1.1999704837799072, + "learning_rate": 0.00019170201758923953, + "loss": 1.3628, + "step": 812 + }, + { + "epoch": 0.08407228355006334, + "grad_norm": 0.9686172008514404, + "learning_rate": 0.0001916916709777548, + "loss": 1.5073, + "step": 813 + }, + { + "epoch": 0.08417569349292935, + "grad_norm": 1.4953769445419312, + "learning_rate": 0.00019168132436627006, + "loss": 2.0297, + "step": 814 + }, + { + "epoch": 0.08427910343579535, + "grad_norm": 1.7770730257034302, + "learning_rate": 0.0001916709777547853, + "loss": 1.9318, + "step": 815 + }, + { + "epoch": 0.08438251337866136, + "grad_norm": 1.8911982774734497, + "learning_rate": 0.00019166063114330056, + "loss": 1.8299, + "step": 816 + }, + { + "epoch": 0.08448592332152736, + "grad_norm": 1.0817290544509888, + "learning_rate": 0.00019165028453181583, + "loss": 1.9069, + "step": 817 + }, + { + "epoch": 0.08458933326439337, + "grad_norm": 1.6773991584777832, + "learning_rate": 0.0001916399379203311, + "loss": 1.5428, + "step": 818 + }, + { + "epoch": 0.08469274320725938, + "grad_norm": 2.850167989730835, + "learning_rate": 0.00019162959130884636, + "loss": 1.9894, + "step": 819 + }, + { + "epoch": 0.08479615315012538, + "grad_norm": 1.3555854558944702, + "learning_rate": 0.00019161924469736163, + "loss": 2.0496, + "step": 820 + }, + { + "epoch": 0.08489956309299139, + "grad_norm": 1.4591540098190308, + "learning_rate": 0.00019160889808587686, + "loss": 1.6269, + "step": 821 + }, + { + "epoch": 0.0850029730358574, + "grad_norm": 1.1350617408752441, + "learning_rate": 0.00019159855147439213, + "loss": 1.5491, + "step": 822 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 1.1935083866119385, + "learning_rate": 0.0001915882048629074, + "loss": 1.8258, + "step": 823 + }, + { + "epoch": 0.08520979292158941, + "grad_norm": 1.8028544187545776, + "learning_rate": 0.00019157785825142266, + "loss": 1.374, + "step": 824 + }, + { + "epoch": 0.08531320286445541, + "grad_norm": 1.5393927097320557, + "learning_rate": 0.00019156751163993793, + "loss": 1.7656, + "step": 825 + }, + { + "epoch": 0.08541661280732142, + "grad_norm": 1.5210120677947998, + "learning_rate": 0.0001915571650284532, + "loss": 1.6707, + "step": 826 + }, + { + "epoch": 0.08552002275018743, + "grad_norm": 1.7695602178573608, + "learning_rate": 0.00019154681841696843, + "loss": 1.8571, + "step": 827 + }, + { + "epoch": 0.08562343269305343, + "grad_norm": 2.1058201789855957, + "learning_rate": 0.0001915364718054837, + "loss": 1.6645, + "step": 828 + }, + { + "epoch": 0.08572684263591944, + "grad_norm": 2.380413055419922, + "learning_rate": 0.00019152612519399896, + "loss": 2.0669, + "step": 829 + }, + { + "epoch": 0.08583025257878545, + "grad_norm": 1.2920933961868286, + "learning_rate": 0.00019151577858251423, + "loss": 1.6571, + "step": 830 + }, + { + "epoch": 0.08593366252165145, + "grad_norm": 1.4352431297302246, + "learning_rate": 0.0001915054319710295, + "loss": 1.7653, + "step": 831 + }, + { + "epoch": 0.08603707246451746, + "grad_norm": 1.5805858373641968, + "learning_rate": 0.00019149508535954476, + "loss": 1.684, + "step": 832 + }, + { + "epoch": 0.08614048240738346, + "grad_norm": 2.331017255783081, + "learning_rate": 0.00019148473874806002, + "loss": 2.1492, + "step": 833 + }, + { + "epoch": 0.08624389235024947, + "grad_norm": 1.6652169227600098, + "learning_rate": 0.00019147439213657526, + "loss": 1.5581, + "step": 834 + }, + { + "epoch": 0.08634730229311548, + "grad_norm": 0.9818973541259766, + "learning_rate": 0.00019146404552509053, + "loss": 1.4011, + "step": 835 + }, + { + "epoch": 0.08645071223598148, + "grad_norm": 1.396761178970337, + "learning_rate": 0.0001914536989136058, + "loss": 1.8978, + "step": 836 + }, + { + "epoch": 0.08655412217884749, + "grad_norm": 1.8270074129104614, + "learning_rate": 0.00019144335230212106, + "loss": 2.036, + "step": 837 + }, + { + "epoch": 0.0866575321217135, + "grad_norm": 1.182669997215271, + "learning_rate": 0.00019143300569063633, + "loss": 1.4941, + "step": 838 + }, + { + "epoch": 0.08676094206457952, + "grad_norm": 1.0010217428207397, + "learning_rate": 0.0001914226590791516, + "loss": 1.8623, + "step": 839 + }, + { + "epoch": 0.08686435200744552, + "grad_norm": 1.0710371732711792, + "learning_rate": 0.00019141231246766686, + "loss": 2.0832, + "step": 840 + }, + { + "epoch": 0.08696776195031153, + "grad_norm": 1.0800421237945557, + "learning_rate": 0.00019140196585618212, + "loss": 1.6501, + "step": 841 + }, + { + "epoch": 0.08707117189317753, + "grad_norm": 1.3268680572509766, + "learning_rate": 0.00019139161924469736, + "loss": 2.1682, + "step": 842 + }, + { + "epoch": 0.08717458183604354, + "grad_norm": 1.0593661069869995, + "learning_rate": 0.00019138127263321263, + "loss": 1.5648, + "step": 843 + }, + { + "epoch": 0.08727799177890955, + "grad_norm": 7.249431610107422, + "learning_rate": 0.0001913709260217279, + "loss": 2.1704, + "step": 844 + }, + { + "epoch": 0.08738140172177555, + "grad_norm": 1.202315092086792, + "learning_rate": 0.00019136057941024316, + "loss": 1.5081, + "step": 845 + }, + { + "epoch": 0.08748481166464156, + "grad_norm": 3.4483230113983154, + "learning_rate": 0.00019135023279875842, + "loss": 1.5247, + "step": 846 + }, + { + "epoch": 0.08758822160750757, + "grad_norm": 2.4174540042877197, + "learning_rate": 0.0001913398861872737, + "loss": 1.9511, + "step": 847 + }, + { + "epoch": 0.08769163155037357, + "grad_norm": 1.351784586906433, + "learning_rate": 0.00019132953957578895, + "loss": 1.4813, + "step": 848 + }, + { + "epoch": 0.08779504149323958, + "grad_norm": 1.2987369298934937, + "learning_rate": 0.00019131919296430422, + "loss": 1.7586, + "step": 849 + }, + { + "epoch": 0.08789845143610558, + "grad_norm": 3.5988855361938477, + "learning_rate": 0.00019130884635281946, + "loss": 2.0485, + "step": 850 + }, + { + "epoch": 0.08800186137897159, + "grad_norm": 0.9274793863296509, + "learning_rate": 0.00019129849974133472, + "loss": 1.5124, + "step": 851 + }, + { + "epoch": 0.0881052713218376, + "grad_norm": 2.8831684589385986, + "learning_rate": 0.00019128815312985, + "loss": 1.9516, + "step": 852 + }, + { + "epoch": 0.0882086812647036, + "grad_norm": 1.241784691810608, + "learning_rate": 0.00019127780651836525, + "loss": 1.998, + "step": 853 + }, + { + "epoch": 0.08831209120756961, + "grad_norm": 1.2777564525604248, + "learning_rate": 0.00019126745990688052, + "loss": 1.6914, + "step": 854 + }, + { + "epoch": 0.08841550115043562, + "grad_norm": 1.3454850912094116, + "learning_rate": 0.00019125711329539579, + "loss": 2.0849, + "step": 855 + }, + { + "epoch": 0.08851891109330162, + "grad_norm": 1.694756031036377, + "learning_rate": 0.00019124676668391105, + "loss": 1.4139, + "step": 856 + }, + { + "epoch": 0.08862232103616763, + "grad_norm": 1.0123131275177002, + "learning_rate": 0.0001912364200724263, + "loss": 1.4719, + "step": 857 + }, + { + "epoch": 0.08872573097903363, + "grad_norm": 2.1002085208892822, + "learning_rate": 0.00019122607346094156, + "loss": 1.4789, + "step": 858 + }, + { + "epoch": 0.08882914092189964, + "grad_norm": 1.3821834325790405, + "learning_rate": 0.00019121572684945682, + "loss": 1.8582, + "step": 859 + }, + { + "epoch": 0.08893255086476565, + "grad_norm": 0.9540427923202515, + "learning_rate": 0.00019120538023797209, + "loss": 1.2874, + "step": 860 + }, + { + "epoch": 0.08903596080763165, + "grad_norm": 1.8231464624404907, + "learning_rate": 0.00019119503362648735, + "loss": 1.5624, + "step": 861 + }, + { + "epoch": 0.08913937075049766, + "grad_norm": 1.7630882263183594, + "learning_rate": 0.00019118468701500262, + "loss": 2.1599, + "step": 862 + }, + { + "epoch": 0.08924278069336367, + "grad_norm": 2.143895387649536, + "learning_rate": 0.00019117434040351786, + "loss": 2.201, + "step": 863 + }, + { + "epoch": 0.08934619063622967, + "grad_norm": 1.8420048952102661, + "learning_rate": 0.00019116399379203312, + "loss": 1.4954, + "step": 864 + }, + { + "epoch": 0.08944960057909568, + "grad_norm": 0.9784311056137085, + "learning_rate": 0.0001911536471805484, + "loss": 1.9278, + "step": 865 + }, + { + "epoch": 0.08955301052196168, + "grad_norm": 1.2139637470245361, + "learning_rate": 0.00019114330056906365, + "loss": 1.5343, + "step": 866 + }, + { + "epoch": 0.08965642046482769, + "grad_norm": 1.4579898118972778, + "learning_rate": 0.00019113295395757892, + "loss": 1.7, + "step": 867 + }, + { + "epoch": 0.0897598304076937, + "grad_norm": 2.5946152210235596, + "learning_rate": 0.00019112260734609418, + "loss": 1.2391, + "step": 868 + }, + { + "epoch": 0.0898632403505597, + "grad_norm": 1.2225054502487183, + "learning_rate": 0.00019111226073460942, + "loss": 1.558, + "step": 869 + }, + { + "epoch": 0.08996665029342571, + "grad_norm": 1.6341155767440796, + "learning_rate": 0.0001911019141231247, + "loss": 1.4779, + "step": 870 + }, + { + "epoch": 0.09007006023629172, + "grad_norm": 0.8429521918296814, + "learning_rate": 0.00019109156751163995, + "loss": 2.0981, + "step": 871 + }, + { + "epoch": 0.09017347017915772, + "grad_norm": 1.4018882513046265, + "learning_rate": 0.00019108122090015522, + "loss": 1.6935, + "step": 872 + }, + { + "epoch": 0.09027688012202373, + "grad_norm": 1.764603853225708, + "learning_rate": 0.00019107087428867048, + "loss": 1.6275, + "step": 873 + }, + { + "epoch": 0.09038029006488973, + "grad_norm": 2.3118224143981934, + "learning_rate": 0.00019106052767718575, + "loss": 1.8666, + "step": 874 + }, + { + "epoch": 0.09048370000775574, + "grad_norm": 1.2632291316986084, + "learning_rate": 0.000191050181065701, + "loss": 1.6197, + "step": 875 + }, + { + "epoch": 0.09058710995062175, + "grad_norm": 1.9582924842834473, + "learning_rate": 0.00019103983445421625, + "loss": 1.5178, + "step": 876 + }, + { + "epoch": 0.09069051989348775, + "grad_norm": 1.8235286474227905, + "learning_rate": 0.00019102948784273152, + "loss": 1.7107, + "step": 877 + }, + { + "epoch": 0.09079392983635376, + "grad_norm": 0.9742588400840759, + "learning_rate": 0.00019101914123124679, + "loss": 1.3714, + "step": 878 + }, + { + "epoch": 0.09089733977921977, + "grad_norm": 1.9299086332321167, + "learning_rate": 0.00019100879461976205, + "loss": 2.1482, + "step": 879 + }, + { + "epoch": 0.09100074972208577, + "grad_norm": 1.1086747646331787, + "learning_rate": 0.00019099844800827732, + "loss": 1.5429, + "step": 880 + }, + { + "epoch": 0.09110415966495178, + "grad_norm": 0.9190935492515564, + "learning_rate": 0.00019098810139679255, + "loss": 1.5422, + "step": 881 + }, + { + "epoch": 0.0912075696078178, + "grad_norm": 1.3607590198516846, + "learning_rate": 0.00019097775478530782, + "loss": 1.5797, + "step": 882 + }, + { + "epoch": 0.0913109795506838, + "grad_norm": 1.1328152418136597, + "learning_rate": 0.00019096740817382309, + "loss": 1.6915, + "step": 883 + }, + { + "epoch": 0.09141438949354981, + "grad_norm": 2.0909512042999268, + "learning_rate": 0.00019095706156233835, + "loss": 1.6088, + "step": 884 + }, + { + "epoch": 0.09151779943641582, + "grad_norm": 5.501047134399414, + "learning_rate": 0.00019094671495085362, + "loss": 1.5214, + "step": 885 + }, + { + "epoch": 0.09162120937928182, + "grad_norm": 1.4709688425064087, + "learning_rate": 0.00019093636833936888, + "loss": 1.8307, + "step": 886 + }, + { + "epoch": 0.09172461932214783, + "grad_norm": 1.181794285774231, + "learning_rate": 0.00019092602172788412, + "loss": 2.4404, + "step": 887 + }, + { + "epoch": 0.09182802926501384, + "grad_norm": 1.3347965478897095, + "learning_rate": 0.0001909156751163994, + "loss": 1.6549, + "step": 888 + }, + { + "epoch": 0.09193143920787984, + "grad_norm": 1.237500786781311, + "learning_rate": 0.00019090532850491465, + "loss": 1.4831, + "step": 889 + }, + { + "epoch": 0.09203484915074585, + "grad_norm": 3.037428140640259, + "learning_rate": 0.00019089498189342992, + "loss": 2.5507, + "step": 890 + }, + { + "epoch": 0.09213825909361185, + "grad_norm": 1.1561611890792847, + "learning_rate": 0.00019088463528194518, + "loss": 1.5395, + "step": 891 + }, + { + "epoch": 0.09224166903647786, + "grad_norm": 0.9404453039169312, + "learning_rate": 0.00019087428867046045, + "loss": 1.3764, + "step": 892 + }, + { + "epoch": 0.09234507897934387, + "grad_norm": 0.9672577381134033, + "learning_rate": 0.0001908639420589757, + "loss": 1.4578, + "step": 893 + }, + { + "epoch": 0.09244848892220987, + "grad_norm": 1.6051990985870361, + "learning_rate": 0.00019085359544749095, + "loss": 1.9744, + "step": 894 + }, + { + "epoch": 0.09255189886507588, + "grad_norm": 1.4234753847122192, + "learning_rate": 0.00019084324883600622, + "loss": 1.904, + "step": 895 + }, + { + "epoch": 0.09265530880794189, + "grad_norm": 1.7959076166152954, + "learning_rate": 0.00019083290222452148, + "loss": 1.6987, + "step": 896 + }, + { + "epoch": 0.09275871875080789, + "grad_norm": 2.001089572906494, + "learning_rate": 0.00019082255561303675, + "loss": 1.8458, + "step": 897 + }, + { + "epoch": 0.0928621286936739, + "grad_norm": 1.192054271697998, + "learning_rate": 0.00019081220900155202, + "loss": 1.8139, + "step": 898 + }, + { + "epoch": 0.0929655386365399, + "grad_norm": 1.8499089479446411, + "learning_rate": 0.00019080186239006725, + "loss": 1.6258, + "step": 899 + }, + { + "epoch": 0.09306894857940591, + "grad_norm": 1.2967066764831543, + "learning_rate": 0.00019079151577858252, + "loss": 1.8121, + "step": 900 + }, + { + "epoch": 0.09317235852227192, + "grad_norm": 0.9628750681877136, + "learning_rate": 0.00019078116916709779, + "loss": 1.9695, + "step": 901 + }, + { + "epoch": 0.09327576846513792, + "grad_norm": 1.2634750604629517, + "learning_rate": 0.00019077082255561305, + "loss": 2.051, + "step": 902 + }, + { + "epoch": 0.09337917840800393, + "grad_norm": 0.887198805809021, + "learning_rate": 0.00019076047594412832, + "loss": 1.6507, + "step": 903 + }, + { + "epoch": 0.09348258835086994, + "grad_norm": 1.443992257118225, + "learning_rate": 0.00019075012933264358, + "loss": 1.5173, + "step": 904 + }, + { + "epoch": 0.09358599829373594, + "grad_norm": 2.711820125579834, + "learning_rate": 0.00019073978272115882, + "loss": 2.4954, + "step": 905 + }, + { + "epoch": 0.09368940823660195, + "grad_norm": 1.4171301126480103, + "learning_rate": 0.00019072943610967409, + "loss": 1.5614, + "step": 906 + }, + { + "epoch": 0.09379281817946795, + "grad_norm": 1.58612859249115, + "learning_rate": 0.00019071908949818935, + "loss": 1.9968, + "step": 907 + }, + { + "epoch": 0.09389622812233396, + "grad_norm": 2.6168100833892822, + "learning_rate": 0.00019070874288670462, + "loss": 1.9727, + "step": 908 + }, + { + "epoch": 0.09399963806519997, + "grad_norm": 1.3814314603805542, + "learning_rate": 0.00019069839627521988, + "loss": 1.4318, + "step": 909 + }, + { + "epoch": 0.09410304800806597, + "grad_norm": 1.9172375202178955, + "learning_rate": 0.00019068804966373515, + "loss": 1.8765, + "step": 910 + }, + { + "epoch": 0.09420645795093198, + "grad_norm": 0.6962313055992126, + "learning_rate": 0.0001906777030522504, + "loss": 1.9879, + "step": 911 + }, + { + "epoch": 0.09430986789379799, + "grad_norm": 2.2384886741638184, + "learning_rate": 0.00019066735644076565, + "loss": 2.0699, + "step": 912 + }, + { + "epoch": 0.09441327783666399, + "grad_norm": 1.6673318147659302, + "learning_rate": 0.00019065700982928092, + "loss": 1.8016, + "step": 913 + }, + { + "epoch": 0.09451668777953, + "grad_norm": 1.3593833446502686, + "learning_rate": 0.00019064666321779618, + "loss": 1.5257, + "step": 914 + }, + { + "epoch": 0.094620097722396, + "grad_norm": 1.5814733505249023, + "learning_rate": 0.00019063631660631145, + "loss": 1.796, + "step": 915 + }, + { + "epoch": 0.09472350766526201, + "grad_norm": 2.525911569595337, + "learning_rate": 0.00019062596999482671, + "loss": 2.0192, + "step": 916 + }, + { + "epoch": 0.09482691760812802, + "grad_norm": 1.1196669340133667, + "learning_rate": 0.00019061562338334195, + "loss": 1.7643, + "step": 917 + }, + { + "epoch": 0.09493032755099402, + "grad_norm": 1.3181729316711426, + "learning_rate": 0.00019060527677185722, + "loss": 1.4779, + "step": 918 + }, + { + "epoch": 0.09503373749386003, + "grad_norm": 1.9003279209136963, + "learning_rate": 0.00019059493016037248, + "loss": 1.8914, + "step": 919 + }, + { + "epoch": 0.09513714743672604, + "grad_norm": 1.2940086126327515, + "learning_rate": 0.00019058458354888775, + "loss": 1.5011, + "step": 920 + }, + { + "epoch": 0.09524055737959204, + "grad_norm": 1.673016905784607, + "learning_rate": 0.00019057423693740302, + "loss": 1.9298, + "step": 921 + }, + { + "epoch": 0.09534396732245805, + "grad_norm": 2.4916605949401855, + "learning_rate": 0.00019056389032591828, + "loss": 1.4196, + "step": 922 + }, + { + "epoch": 0.09544737726532405, + "grad_norm": 0.787038266658783, + "learning_rate": 0.00019055354371443352, + "loss": 1.9657, + "step": 923 + }, + { + "epoch": 0.09555078720819006, + "grad_norm": 2.5524959564208984, + "learning_rate": 0.00019054319710294878, + "loss": 1.543, + "step": 924 + }, + { + "epoch": 0.09565419715105608, + "grad_norm": 1.0950359106063843, + "learning_rate": 0.00019053285049146405, + "loss": 1.0407, + "step": 925 + }, + { + "epoch": 0.09575760709392209, + "grad_norm": 1.2391451597213745, + "learning_rate": 0.00019052250387997932, + "loss": 1.2096, + "step": 926 + }, + { + "epoch": 0.0958610170367881, + "grad_norm": 0.8148314952850342, + "learning_rate": 0.00019051215726849458, + "loss": 1.675, + "step": 927 + }, + { + "epoch": 0.0959644269796541, + "grad_norm": 2.42737078666687, + "learning_rate": 0.00019050181065700985, + "loss": 1.4995, + "step": 928 + }, + { + "epoch": 0.0960678369225201, + "grad_norm": 1.2046351432800293, + "learning_rate": 0.00019049146404552509, + "loss": 1.5453, + "step": 929 + }, + { + "epoch": 0.09617124686538611, + "grad_norm": 1.1635297536849976, + "learning_rate": 0.00019048111743404035, + "loss": 1.6753, + "step": 930 + }, + { + "epoch": 0.09627465680825212, + "grad_norm": 0.8624559640884399, + "learning_rate": 0.00019047077082255562, + "loss": 1.9709, + "step": 931 + }, + { + "epoch": 0.09637806675111812, + "grad_norm": 2.0648558139801025, + "learning_rate": 0.00019046042421107088, + "loss": 1.93, + "step": 932 + }, + { + "epoch": 0.09648147669398413, + "grad_norm": 1.3905847072601318, + "learning_rate": 0.00019045007759958615, + "loss": 1.309, + "step": 933 + }, + { + "epoch": 0.09658488663685014, + "grad_norm": 0.8313616514205933, + "learning_rate": 0.0001904397309881014, + "loss": 1.8303, + "step": 934 + }, + { + "epoch": 0.09668829657971614, + "grad_norm": 1.6961098909378052, + "learning_rate": 0.00019042938437661665, + "loss": 1.5906, + "step": 935 + }, + { + "epoch": 0.09679170652258215, + "grad_norm": 2.953028917312622, + "learning_rate": 0.00019041903776513192, + "loss": 1.5128, + "step": 936 + }, + { + "epoch": 0.09689511646544816, + "grad_norm": 1.1944273710250854, + "learning_rate": 0.00019040869115364718, + "loss": 1.2601, + "step": 937 + }, + { + "epoch": 0.09699852640831416, + "grad_norm": 1.3880339860916138, + "learning_rate": 0.00019039834454216245, + "loss": 1.3335, + "step": 938 + }, + { + "epoch": 0.09710193635118017, + "grad_norm": 3.0873427391052246, + "learning_rate": 0.00019038799793067771, + "loss": 2.2271, + "step": 939 + }, + { + "epoch": 0.09720534629404617, + "grad_norm": 1.1165976524353027, + "learning_rate": 0.00019037765131919298, + "loss": 0.925, + "step": 940 + }, + { + "epoch": 0.09730875623691218, + "grad_norm": 1.3360462188720703, + "learning_rate": 0.00019036730470770822, + "loss": 1.9376, + "step": 941 + }, + { + "epoch": 0.09741216617977819, + "grad_norm": 1.3079710006713867, + "learning_rate": 0.00019035695809622348, + "loss": 2.2023, + "step": 942 + }, + { + "epoch": 0.09751557612264419, + "grad_norm": 3.9090816974639893, + "learning_rate": 0.00019034661148473875, + "loss": 1.9705, + "step": 943 + }, + { + "epoch": 0.0976189860655102, + "grad_norm": 1.266180396080017, + "learning_rate": 0.00019033626487325401, + "loss": 2.0189, + "step": 944 + }, + { + "epoch": 0.0977223960083762, + "grad_norm": 1.4128590822219849, + "learning_rate": 0.00019032591826176928, + "loss": 1.9973, + "step": 945 + }, + { + "epoch": 0.09782580595124221, + "grad_norm": 1.5954068899154663, + "learning_rate": 0.00019031557165028452, + "loss": 1.6339, + "step": 946 + }, + { + "epoch": 0.09792921589410822, + "grad_norm": 2.1788735389709473, + "learning_rate": 0.00019030522503879978, + "loss": 1.2789, + "step": 947 + }, + { + "epoch": 0.09803262583697422, + "grad_norm": 2.4323537349700928, + "learning_rate": 0.00019029487842731505, + "loss": 1.3597, + "step": 948 + }, + { + "epoch": 0.09813603577984023, + "grad_norm": 1.620073676109314, + "learning_rate": 0.00019028453181583032, + "loss": 1.7057, + "step": 949 + }, + { + "epoch": 0.09823944572270624, + "grad_norm": 1.2310841083526611, + "learning_rate": 0.00019027418520434558, + "loss": 1.7842, + "step": 950 + }, + { + "epoch": 0.09834285566557224, + "grad_norm": 2.3200294971466064, + "learning_rate": 0.00019026383859286085, + "loss": 1.3738, + "step": 951 + }, + { + "epoch": 0.09844626560843825, + "grad_norm": 1.0328644514083862, + "learning_rate": 0.00019025349198137609, + "loss": 1.1602, + "step": 952 + }, + { + "epoch": 0.09854967555130426, + "grad_norm": 3.512075662612915, + "learning_rate": 0.00019024314536989135, + "loss": 1.6666, + "step": 953 + }, + { + "epoch": 0.09865308549417026, + "grad_norm": 2.2993133068084717, + "learning_rate": 0.00019023279875840662, + "loss": 2.9702, + "step": 954 + }, + { + "epoch": 0.09875649543703627, + "grad_norm": 1.472238302230835, + "learning_rate": 0.00019022245214692188, + "loss": 1.7172, + "step": 955 + }, + { + "epoch": 0.09885990537990227, + "grad_norm": 1.6245614290237427, + "learning_rate": 0.00019021210553543715, + "loss": 2.1302, + "step": 956 + }, + { + "epoch": 0.09896331532276828, + "grad_norm": 2.2646124362945557, + "learning_rate": 0.0001902017589239524, + "loss": 1.2524, + "step": 957 + }, + { + "epoch": 0.09906672526563429, + "grad_norm": 1.8233280181884766, + "learning_rate": 0.00019019141231246765, + "loss": 1.6103, + "step": 958 + }, + { + "epoch": 0.09917013520850029, + "grad_norm": 1.894910216331482, + "learning_rate": 0.00019018106570098292, + "loss": 1.8498, + "step": 959 + }, + { + "epoch": 0.0992735451513663, + "grad_norm": 2.561936378479004, + "learning_rate": 0.00019017071908949818, + "loss": 2.3839, + "step": 960 + }, + { + "epoch": 0.0993769550942323, + "grad_norm": 1.281097650527954, + "learning_rate": 0.00019016037247801345, + "loss": 1.4926, + "step": 961 + }, + { + "epoch": 0.09948036503709831, + "grad_norm": 1.704836130142212, + "learning_rate": 0.00019015002586652871, + "loss": 1.9475, + "step": 962 + }, + { + "epoch": 0.09958377497996432, + "grad_norm": 1.2036210298538208, + "learning_rate": 0.00019013967925504398, + "loss": 1.5725, + "step": 963 + }, + { + "epoch": 0.09968718492283032, + "grad_norm": 0.9029375314712524, + "learning_rate": 0.00019012933264355925, + "loss": 1.3477, + "step": 964 + }, + { + "epoch": 0.09979059486569633, + "grad_norm": 1.436606526374817, + "learning_rate": 0.0001901189860320745, + "loss": 1.7976, + "step": 965 + }, + { + "epoch": 0.09989400480856234, + "grad_norm": 1.4428085088729858, + "learning_rate": 0.00019010863942058975, + "loss": 1.8221, + "step": 966 + }, + { + "epoch": 0.09999741475142834, + "grad_norm": 1.1217411756515503, + "learning_rate": 0.00019009829280910501, + "loss": 1.3546, + "step": 967 + }, + { + "epoch": 0.10010082469429436, + "grad_norm": 1.180086612701416, + "learning_rate": 0.00019008794619762028, + "loss": 2.22, + "step": 968 + }, + { + "epoch": 0.10020423463716037, + "grad_norm": 2.1352503299713135, + "learning_rate": 0.00019007759958613555, + "loss": 2.0902, + "step": 969 + }, + { + "epoch": 0.10030764458002638, + "grad_norm": 0.9205876588821411, + "learning_rate": 0.0001900672529746508, + "loss": 1.7533, + "step": 970 + }, + { + "epoch": 0.10041105452289238, + "grad_norm": 1.8197256326675415, + "learning_rate": 0.00019005690636316608, + "loss": 1.2994, + "step": 971 + }, + { + "epoch": 0.10051446446575839, + "grad_norm": 2.683286428451538, + "learning_rate": 0.00019004655975168134, + "loss": 2.3546, + "step": 972 + }, + { + "epoch": 0.1006178744086244, + "grad_norm": 1.520284652709961, + "learning_rate": 0.0001900362131401966, + "loss": 1.4579, + "step": 973 + }, + { + "epoch": 0.1007212843514904, + "grad_norm": 1.7554701566696167, + "learning_rate": 0.00019002586652871185, + "loss": 1.664, + "step": 974 + }, + { + "epoch": 0.1008246942943564, + "grad_norm": 0.9988136887550354, + "learning_rate": 0.0001900155199172271, + "loss": 1.6349, + "step": 975 + }, + { + "epoch": 0.10092810423722241, + "grad_norm": 1.7587372064590454, + "learning_rate": 0.00019000517330574238, + "loss": 1.3191, + "step": 976 + }, + { + "epoch": 0.10103151418008842, + "grad_norm": 0.7632730603218079, + "learning_rate": 0.00018999482669425764, + "loss": 1.8607, + "step": 977 + }, + { + "epoch": 0.10113492412295443, + "grad_norm": 1.414077639579773, + "learning_rate": 0.0001899844800827729, + "loss": 1.6789, + "step": 978 + }, + { + "epoch": 0.10123833406582043, + "grad_norm": 0.9060714840888977, + "learning_rate": 0.00018997413347128817, + "loss": 1.9993, + "step": 979 + }, + { + "epoch": 0.10134174400868644, + "grad_norm": 1.177000641822815, + "learning_rate": 0.00018996378685980344, + "loss": 1.6802, + "step": 980 + }, + { + "epoch": 0.10144515395155244, + "grad_norm": 0.9905523061752319, + "learning_rate": 0.0001899534402483187, + "loss": 1.7459, + "step": 981 + }, + { + "epoch": 0.10154856389441845, + "grad_norm": 1.1813868284225464, + "learning_rate": 0.00018994309363683394, + "loss": 1.4899, + "step": 982 + }, + { + "epoch": 0.10165197383728446, + "grad_norm": 1.2635287046432495, + "learning_rate": 0.0001899327470253492, + "loss": 1.53, + "step": 983 + }, + { + "epoch": 0.10175538378015046, + "grad_norm": 0.9415909051895142, + "learning_rate": 0.00018992240041386448, + "loss": 1.7562, + "step": 984 + }, + { + "epoch": 0.10185879372301647, + "grad_norm": 1.4737257957458496, + "learning_rate": 0.00018991205380237974, + "loss": 1.5612, + "step": 985 + }, + { + "epoch": 0.10196220366588248, + "grad_norm": 1.445164680480957, + "learning_rate": 0.000189901707190895, + "loss": 1.6512, + "step": 986 + }, + { + "epoch": 0.10206561360874848, + "grad_norm": 1.29860520362854, + "learning_rate": 0.00018989136057941027, + "loss": 1.903, + "step": 987 + }, + { + "epoch": 0.10216902355161449, + "grad_norm": 1.0900709629058838, + "learning_rate": 0.0001898810139679255, + "loss": 1.4451, + "step": 988 + }, + { + "epoch": 0.1022724334944805, + "grad_norm": 1.3902963399887085, + "learning_rate": 0.00018987066735644078, + "loss": 1.8913, + "step": 989 + }, + { + "epoch": 0.1023758434373465, + "grad_norm": 2.240596055984497, + "learning_rate": 0.00018986032074495604, + "loss": 1.9735, + "step": 990 + }, + { + "epoch": 0.1024792533802125, + "grad_norm": 3.0811617374420166, + "learning_rate": 0.0001898499741334713, + "loss": 1.7042, + "step": 991 + }, + { + "epoch": 0.10258266332307851, + "grad_norm": 1.270492434501648, + "learning_rate": 0.00018983962752198657, + "loss": 1.541, + "step": 992 + }, + { + "epoch": 0.10268607326594452, + "grad_norm": 1.7597604990005493, + "learning_rate": 0.00018982928091050184, + "loss": 2.216, + "step": 993 + }, + { + "epoch": 0.10278948320881053, + "grad_norm": 1.8272831439971924, + "learning_rate": 0.00018981893429901708, + "loss": 1.5311, + "step": 994 + }, + { + "epoch": 0.10289289315167653, + "grad_norm": 1.4568592309951782, + "learning_rate": 0.00018980858768753234, + "loss": 1.3844, + "step": 995 + }, + { + "epoch": 0.10299630309454254, + "grad_norm": 1.969649076461792, + "learning_rate": 0.0001897982410760476, + "loss": 1.8859, + "step": 996 + }, + { + "epoch": 0.10309971303740854, + "grad_norm": 1.2497482299804688, + "learning_rate": 0.00018978789446456287, + "loss": 1.455, + "step": 997 + }, + { + "epoch": 0.10320312298027455, + "grad_norm": 2.641781806945801, + "learning_rate": 0.00018977754785307814, + "loss": 2.2136, + "step": 998 + }, + { + "epoch": 0.10330653292314056, + "grad_norm": 0.9556031227111816, + "learning_rate": 0.0001897672012415934, + "loss": 1.3249, + "step": 999 + }, + { + "epoch": 0.10340994286600656, + "grad_norm": 1.0540037155151367, + "learning_rate": 0.00018975685463010864, + "loss": 1.1152, + "step": 1000 + }, + { + "epoch": 0.10351335280887257, + "grad_norm": 1.8548526763916016, + "learning_rate": 0.0001897465080186239, + "loss": 1.6326, + "step": 1001 + }, + { + "epoch": 0.10361676275173858, + "grad_norm": 1.2354981899261475, + "learning_rate": 0.00018973616140713917, + "loss": 1.9958, + "step": 1002 + }, + { + "epoch": 0.10372017269460458, + "grad_norm": 1.3923211097717285, + "learning_rate": 0.00018972581479565444, + "loss": 1.601, + "step": 1003 + }, + { + "epoch": 0.10382358263747059, + "grad_norm": 0.9945183396339417, + "learning_rate": 0.0001897154681841697, + "loss": 1.5537, + "step": 1004 + }, + { + "epoch": 0.1039269925803366, + "grad_norm": 2.5964150428771973, + "learning_rate": 0.00018970512157268497, + "loss": 2.0066, + "step": 1005 + }, + { + "epoch": 0.1040304025232026, + "grad_norm": 1.4605211019515991, + "learning_rate": 0.0001896947749612002, + "loss": 1.6974, + "step": 1006 + }, + { + "epoch": 0.1041338124660686, + "grad_norm": 0.8577373623847961, + "learning_rate": 0.00018968442834971547, + "loss": 2.3723, + "step": 1007 + }, + { + "epoch": 0.10423722240893461, + "grad_norm": 1.6100918054580688, + "learning_rate": 0.00018967408173823074, + "loss": 1.8351, + "step": 1008 + }, + { + "epoch": 0.10434063235180062, + "grad_norm": 1.1363835334777832, + "learning_rate": 0.000189663735126746, + "loss": 2.0254, + "step": 1009 + }, + { + "epoch": 0.10444404229466663, + "grad_norm": 1.3265516757965088, + "learning_rate": 0.00018965338851526127, + "loss": 1.8254, + "step": 1010 + }, + { + "epoch": 0.10454745223753265, + "grad_norm": 1.4067211151123047, + "learning_rate": 0.00018964304190377654, + "loss": 1.2216, + "step": 1011 + }, + { + "epoch": 0.10465086218039865, + "grad_norm": 1.7004269361495972, + "learning_rate": 0.00018963269529229178, + "loss": 1.3757, + "step": 1012 + }, + { + "epoch": 0.10475427212326466, + "grad_norm": 1.0695922374725342, + "learning_rate": 0.00018962234868080704, + "loss": 1.2822, + "step": 1013 + }, + { + "epoch": 0.10485768206613066, + "grad_norm": 1.6438508033752441, + "learning_rate": 0.0001896120020693223, + "loss": 1.7428, + "step": 1014 + }, + { + "epoch": 0.10496109200899667, + "grad_norm": 1.1417630910873413, + "learning_rate": 0.00018960165545783757, + "loss": 2.2032, + "step": 1015 + }, + { + "epoch": 0.10506450195186268, + "grad_norm": 4.685209274291992, + "learning_rate": 0.00018959130884635284, + "loss": 1.7533, + "step": 1016 + }, + { + "epoch": 0.10516791189472868, + "grad_norm": 1.016170859336853, + "learning_rate": 0.0001895809622348681, + "loss": 2.2786, + "step": 1017 + }, + { + "epoch": 0.10527132183759469, + "grad_norm": 2.0337867736816406, + "learning_rate": 0.00018957061562338334, + "loss": 1.9138, + "step": 1018 + }, + { + "epoch": 0.1053747317804607, + "grad_norm": 1.1036525964736938, + "learning_rate": 0.0001895602690118986, + "loss": 1.5866, + "step": 1019 + }, + { + "epoch": 0.1054781417233267, + "grad_norm": 1.8529484272003174, + "learning_rate": 0.00018954992240041387, + "loss": 2.0264, + "step": 1020 + }, + { + "epoch": 0.10558155166619271, + "grad_norm": 1.2001843452453613, + "learning_rate": 0.00018953957578892914, + "loss": 1.5934, + "step": 1021 + }, + { + "epoch": 0.10568496160905871, + "grad_norm": 1.453425407409668, + "learning_rate": 0.0001895292291774444, + "loss": 1.9904, + "step": 1022 + }, + { + "epoch": 0.10578837155192472, + "grad_norm": 1.6289454698562622, + "learning_rate": 0.00018951888256595967, + "loss": 1.8033, + "step": 1023 + }, + { + "epoch": 0.10589178149479073, + "grad_norm": 1.375352144241333, + "learning_rate": 0.0001895085359544749, + "loss": 1.0417, + "step": 1024 + }, + { + "epoch": 0.10599519143765673, + "grad_norm": 1.941213846206665, + "learning_rate": 0.00018949818934299017, + "loss": 2.1872, + "step": 1025 + }, + { + "epoch": 0.10609860138052274, + "grad_norm": 1.7311123609542847, + "learning_rate": 0.00018948784273150544, + "loss": 1.9615, + "step": 1026 + }, + { + "epoch": 0.10620201132338875, + "grad_norm": 2.0823657512664795, + "learning_rate": 0.0001894774961200207, + "loss": 1.4303, + "step": 1027 + }, + { + "epoch": 0.10630542126625475, + "grad_norm": 2.074422597885132, + "learning_rate": 0.00018946714950853597, + "loss": 2.1897, + "step": 1028 + }, + { + "epoch": 0.10640883120912076, + "grad_norm": 2.0139143466949463, + "learning_rate": 0.00018945680289705124, + "loss": 1.3624, + "step": 1029 + }, + { + "epoch": 0.10651224115198676, + "grad_norm": 1.2934083938598633, + "learning_rate": 0.00018944645628556647, + "loss": 1.5068, + "step": 1030 + }, + { + "epoch": 0.10661565109485277, + "grad_norm": 1.888208270072937, + "learning_rate": 0.00018943610967408174, + "loss": 2.3472, + "step": 1031 + }, + { + "epoch": 0.10671906103771878, + "grad_norm": 1.3515228033065796, + "learning_rate": 0.000189425763062597, + "loss": 1.7646, + "step": 1032 + }, + { + "epoch": 0.10682247098058478, + "grad_norm": 1.068301796913147, + "learning_rate": 0.00018941541645111227, + "loss": 1.8304, + "step": 1033 + }, + { + "epoch": 0.10692588092345079, + "grad_norm": 0.8828536868095398, + "learning_rate": 0.00018940506983962754, + "loss": 1.6585, + "step": 1034 + }, + { + "epoch": 0.1070292908663168, + "grad_norm": 1.5016223192214966, + "learning_rate": 0.0001893947232281428, + "loss": 1.8102, + "step": 1035 + }, + { + "epoch": 0.1071327008091828, + "grad_norm": 1.7466976642608643, + "learning_rate": 0.00018938437661665804, + "loss": 1.6282, + "step": 1036 + }, + { + "epoch": 0.10723611075204881, + "grad_norm": 1.166925072669983, + "learning_rate": 0.0001893740300051733, + "loss": 1.5768, + "step": 1037 + }, + { + "epoch": 0.10733952069491481, + "grad_norm": 1.8839147090911865, + "learning_rate": 0.00018936368339368857, + "loss": 2.4981, + "step": 1038 + }, + { + "epoch": 0.10744293063778082, + "grad_norm": 1.049673080444336, + "learning_rate": 0.00018935333678220384, + "loss": 1.8435, + "step": 1039 + }, + { + "epoch": 0.10754634058064683, + "grad_norm": 2.3705356121063232, + "learning_rate": 0.0001893429901707191, + "loss": 1.9116, + "step": 1040 + }, + { + "epoch": 0.10764975052351283, + "grad_norm": 1.645258903503418, + "learning_rate": 0.00018933264355923437, + "loss": 1.6528, + "step": 1041 + }, + { + "epoch": 0.10775316046637884, + "grad_norm": 1.6526823043823242, + "learning_rate": 0.0001893222969477496, + "loss": 2.0118, + "step": 1042 + }, + { + "epoch": 0.10785657040924485, + "grad_norm": 1.2598350048065186, + "learning_rate": 0.00018931195033626487, + "loss": 2.2473, + "step": 1043 + }, + { + "epoch": 0.10795998035211085, + "grad_norm": 1.955881953239441, + "learning_rate": 0.00018930160372478014, + "loss": 1.8537, + "step": 1044 + }, + { + "epoch": 0.10806339029497686, + "grad_norm": 2.739224433898926, + "learning_rate": 0.0001892912571132954, + "loss": 2.3352, + "step": 1045 + }, + { + "epoch": 0.10816680023784286, + "grad_norm": 2.8407015800476074, + "learning_rate": 0.00018928091050181067, + "loss": 2.006, + "step": 1046 + }, + { + "epoch": 0.10827021018070887, + "grad_norm": 1.3017852306365967, + "learning_rate": 0.00018927056389032594, + "loss": 1.8562, + "step": 1047 + }, + { + "epoch": 0.10837362012357488, + "grad_norm": 1.8522440195083618, + "learning_rate": 0.00018926021727884117, + "loss": 1.3757, + "step": 1048 + }, + { + "epoch": 0.10847703006644088, + "grad_norm": 1.512387990951538, + "learning_rate": 0.00018924987066735644, + "loss": 1.7554, + "step": 1049 + }, + { + "epoch": 0.10858044000930689, + "grad_norm": 1.8739479780197144, + "learning_rate": 0.0001892395240558717, + "loss": 2.3464, + "step": 1050 + }, + { + "epoch": 0.1086838499521729, + "grad_norm": 1.4831976890563965, + "learning_rate": 0.00018922917744438697, + "loss": 1.9555, + "step": 1051 + }, + { + "epoch": 0.1087872598950389, + "grad_norm": 1.5891178846359253, + "learning_rate": 0.00018921883083290224, + "loss": 1.3746, + "step": 1052 + }, + { + "epoch": 0.10889066983790492, + "grad_norm": 2.351069688796997, + "learning_rate": 0.0001892084842214175, + "loss": 1.3839, + "step": 1053 + }, + { + "epoch": 0.10899407978077093, + "grad_norm": 2.2983150482177734, + "learning_rate": 0.00018919813760993274, + "loss": 2.013, + "step": 1054 + }, + { + "epoch": 0.10909748972363693, + "grad_norm": 0.9400889277458191, + "learning_rate": 0.000189187790998448, + "loss": 1.8208, + "step": 1055 + }, + { + "epoch": 0.10920089966650294, + "grad_norm": 1.7346934080123901, + "learning_rate": 0.00018917744438696327, + "loss": 1.6008, + "step": 1056 + }, + { + "epoch": 0.10930430960936895, + "grad_norm": 2.0144057273864746, + "learning_rate": 0.00018916709777547854, + "loss": 1.5488, + "step": 1057 + }, + { + "epoch": 0.10940771955223495, + "grad_norm": 1.8916040658950806, + "learning_rate": 0.0001891567511639938, + "loss": 1.9123, + "step": 1058 + }, + { + "epoch": 0.10951112949510096, + "grad_norm": 0.820121169090271, + "learning_rate": 0.00018914640455250907, + "loss": 1.5304, + "step": 1059 + }, + { + "epoch": 0.10961453943796697, + "grad_norm": 1.2862852811813354, + "learning_rate": 0.0001891360579410243, + "loss": 1.7054, + "step": 1060 + }, + { + "epoch": 0.10971794938083297, + "grad_norm": 1.6166162490844727, + "learning_rate": 0.00018912571132953957, + "loss": 1.4308, + "step": 1061 + }, + { + "epoch": 0.10982135932369898, + "grad_norm": 1.2072175741195679, + "learning_rate": 0.00018911536471805484, + "loss": 1.937, + "step": 1062 + }, + { + "epoch": 0.10992476926656498, + "grad_norm": 1.044589638710022, + "learning_rate": 0.0001891050181065701, + "loss": 1.6156, + "step": 1063 + }, + { + "epoch": 0.11002817920943099, + "grad_norm": 1.2547587156295776, + "learning_rate": 0.00018909467149508537, + "loss": 1.9699, + "step": 1064 + }, + { + "epoch": 0.110131589152297, + "grad_norm": 0.9746593832969666, + "learning_rate": 0.00018908432488360063, + "loss": 1.4966, + "step": 1065 + }, + { + "epoch": 0.110234999095163, + "grad_norm": 1.631894826889038, + "learning_rate": 0.00018907397827211587, + "loss": 1.586, + "step": 1066 + }, + { + "epoch": 0.11033840903802901, + "grad_norm": 1.65902578830719, + "learning_rate": 0.00018906363166063114, + "loss": 1.7788, + "step": 1067 + }, + { + "epoch": 0.11044181898089502, + "grad_norm": 0.8059775829315186, + "learning_rate": 0.0001890532850491464, + "loss": 1.7275, + "step": 1068 + }, + { + "epoch": 0.11054522892376102, + "grad_norm": 1.8470535278320312, + "learning_rate": 0.00018904293843766167, + "loss": 1.1985, + "step": 1069 + }, + { + "epoch": 0.11064863886662703, + "grad_norm": 1.7774137258529663, + "learning_rate": 0.00018903259182617693, + "loss": 2.0377, + "step": 1070 + }, + { + "epoch": 0.11075204880949303, + "grad_norm": 1.8821941614151, + "learning_rate": 0.0001890222452146922, + "loss": 1.8849, + "step": 1071 + }, + { + "epoch": 0.11085545875235904, + "grad_norm": 1.5304746627807617, + "learning_rate": 0.00018901189860320744, + "loss": 2.0822, + "step": 1072 + }, + { + "epoch": 0.11095886869522505, + "grad_norm": 1.7353805303573608, + "learning_rate": 0.0001890015519917227, + "loss": 1.7504, + "step": 1073 + }, + { + "epoch": 0.11106227863809105, + "grad_norm": 1.6357505321502686, + "learning_rate": 0.00018899120538023797, + "loss": 1.99, + "step": 1074 + }, + { + "epoch": 0.11116568858095706, + "grad_norm": 1.29119074344635, + "learning_rate": 0.00018898085876875324, + "loss": 1.5753, + "step": 1075 + }, + { + "epoch": 0.11126909852382307, + "grad_norm": 0.8210546374320984, + "learning_rate": 0.0001889705121572685, + "loss": 1.4492, + "step": 1076 + }, + { + "epoch": 0.11137250846668907, + "grad_norm": 1.6336994171142578, + "learning_rate": 0.00018896016554578377, + "loss": 2.0778, + "step": 1077 + }, + { + "epoch": 0.11147591840955508, + "grad_norm": 1.7355693578720093, + "learning_rate": 0.000188949818934299, + "loss": 1.7414, + "step": 1078 + }, + { + "epoch": 0.11157932835242108, + "grad_norm": 0.9598165154457092, + "learning_rate": 0.00018893947232281427, + "loss": 1.8316, + "step": 1079 + }, + { + "epoch": 0.11168273829528709, + "grad_norm": 1.4467240571975708, + "learning_rate": 0.00018892912571132954, + "loss": 1.3253, + "step": 1080 + }, + { + "epoch": 0.1117861482381531, + "grad_norm": 1.258633017539978, + "learning_rate": 0.0001889187790998448, + "loss": 2.1484, + "step": 1081 + }, + { + "epoch": 0.1118895581810191, + "grad_norm": 3.974945545196533, + "learning_rate": 0.00018890843248836007, + "loss": 2.4456, + "step": 1082 + }, + { + "epoch": 0.11199296812388511, + "grad_norm": 3.577840805053711, + "learning_rate": 0.00018889808587687533, + "loss": 2.3568, + "step": 1083 + }, + { + "epoch": 0.11209637806675112, + "grad_norm": 2.9973061084747314, + "learning_rate": 0.00018888773926539057, + "loss": 1.9609, + "step": 1084 + }, + { + "epoch": 0.11219978800961712, + "grad_norm": 2.482928514480591, + "learning_rate": 0.00018887739265390584, + "loss": 2.2426, + "step": 1085 + }, + { + "epoch": 0.11230319795248313, + "grad_norm": 1.4588831663131714, + "learning_rate": 0.0001888670460424211, + "loss": 1.3213, + "step": 1086 + }, + { + "epoch": 0.11240660789534913, + "grad_norm": 1.1756937503814697, + "learning_rate": 0.00018885669943093637, + "loss": 1.6379, + "step": 1087 + }, + { + "epoch": 0.11251001783821514, + "grad_norm": 1.3494646549224854, + "learning_rate": 0.00018884635281945163, + "loss": 1.4225, + "step": 1088 + }, + { + "epoch": 0.11261342778108115, + "grad_norm": 1.6202143430709839, + "learning_rate": 0.0001888360062079669, + "loss": 1.8345, + "step": 1089 + }, + { + "epoch": 0.11271683772394715, + "grad_norm": 1.5296990871429443, + "learning_rate": 0.00018882565959648216, + "loss": 2.0233, + "step": 1090 + }, + { + "epoch": 0.11282024766681316, + "grad_norm": 1.0927106142044067, + "learning_rate": 0.0001888153129849974, + "loss": 1.2746, + "step": 1091 + }, + { + "epoch": 0.11292365760967916, + "grad_norm": 1.003934383392334, + "learning_rate": 0.00018880496637351267, + "loss": 1.5895, + "step": 1092 + }, + { + "epoch": 0.11302706755254517, + "grad_norm": 1.3455843925476074, + "learning_rate": 0.00018879461976202793, + "loss": 2.2522, + "step": 1093 + }, + { + "epoch": 0.11313047749541118, + "grad_norm": 1.3665963411331177, + "learning_rate": 0.0001887842731505432, + "loss": 1.6567, + "step": 1094 + }, + { + "epoch": 0.11323388743827718, + "grad_norm": 3.7127087116241455, + "learning_rate": 0.00018877392653905847, + "loss": 1.7663, + "step": 1095 + }, + { + "epoch": 0.1133372973811432, + "grad_norm": 1.8103179931640625, + "learning_rate": 0.00018876357992757373, + "loss": 1.8739, + "step": 1096 + }, + { + "epoch": 0.11344070732400921, + "grad_norm": 2.501490592956543, + "learning_rate": 0.000188753233316089, + "loss": 2.0688, + "step": 1097 + }, + { + "epoch": 0.11354411726687522, + "grad_norm": 2.178652763366699, + "learning_rate": 0.00018874288670460426, + "loss": 1.7537, + "step": 1098 + }, + { + "epoch": 0.11364752720974122, + "grad_norm": 1.7944008111953735, + "learning_rate": 0.0001887325400931195, + "loss": 1.1038, + "step": 1099 + }, + { + "epoch": 0.11375093715260723, + "grad_norm": 1.4743938446044922, + "learning_rate": 0.00018872219348163477, + "loss": 1.8314, + "step": 1100 + }, + { + "epoch": 0.11385434709547324, + "grad_norm": 2.165440082550049, + "learning_rate": 0.00018871184687015003, + "loss": 1.7616, + "step": 1101 + }, + { + "epoch": 0.11395775703833924, + "grad_norm": 2.110471487045288, + "learning_rate": 0.0001887015002586653, + "loss": 2.0106, + "step": 1102 + }, + { + "epoch": 0.11406116698120525, + "grad_norm": 1.5662785768508911, + "learning_rate": 0.00018869115364718056, + "loss": 1.5418, + "step": 1103 + }, + { + "epoch": 0.11416457692407125, + "grad_norm": 2.1364023685455322, + "learning_rate": 0.00018868080703569583, + "loss": 2.4005, + "step": 1104 + }, + { + "epoch": 0.11426798686693726, + "grad_norm": 1.4032834768295288, + "learning_rate": 0.0001886704604242111, + "loss": 1.6216, + "step": 1105 + }, + { + "epoch": 0.11437139680980327, + "grad_norm": 1.2782148122787476, + "learning_rate": 0.00018866011381272636, + "loss": 1.3291, + "step": 1106 + }, + { + "epoch": 0.11447480675266927, + "grad_norm": 0.9738740921020508, + "learning_rate": 0.0001886497672012416, + "loss": 1.3762, + "step": 1107 + }, + { + "epoch": 0.11457821669553528, + "grad_norm": 2.0070621967315674, + "learning_rate": 0.00018863942058975686, + "loss": 1.9043, + "step": 1108 + }, + { + "epoch": 0.11468162663840129, + "grad_norm": 2.14795184135437, + "learning_rate": 0.00018862907397827213, + "loss": 1.5564, + "step": 1109 + }, + { + "epoch": 0.11478503658126729, + "grad_norm": 2.192723512649536, + "learning_rate": 0.0001886187273667874, + "loss": 2.1081, + "step": 1110 + }, + { + "epoch": 0.1148884465241333, + "grad_norm": 1.234890103340149, + "learning_rate": 0.00018860838075530266, + "loss": 1.4527, + "step": 1111 + }, + { + "epoch": 0.1149918564669993, + "grad_norm": 1.0031839609146118, + "learning_rate": 0.00018859803414381793, + "loss": 1.5052, + "step": 1112 + }, + { + "epoch": 0.11509526640986531, + "grad_norm": 1.5561741590499878, + "learning_rate": 0.0001885876875323332, + "loss": 1.9241, + "step": 1113 + }, + { + "epoch": 0.11519867635273132, + "grad_norm": 2.3440773487091064, + "learning_rate": 0.00018857734092084843, + "loss": 1.9541, + "step": 1114 + }, + { + "epoch": 0.11530208629559732, + "grad_norm": 1.510392427444458, + "learning_rate": 0.0001885669943093637, + "loss": 2.0317, + "step": 1115 + }, + { + "epoch": 0.11540549623846333, + "grad_norm": 1.3775566816329956, + "learning_rate": 0.00018855664769787896, + "loss": 1.7841, + "step": 1116 + }, + { + "epoch": 0.11550890618132933, + "grad_norm": 0.9024312496185303, + "learning_rate": 0.00018854630108639423, + "loss": 1.974, + "step": 1117 + }, + { + "epoch": 0.11561231612419534, + "grad_norm": 1.6050846576690674, + "learning_rate": 0.0001885359544749095, + "loss": 1.9041, + "step": 1118 + }, + { + "epoch": 0.11571572606706135, + "grad_norm": 1.6221837997436523, + "learning_rate": 0.00018852560786342476, + "loss": 1.8008, + "step": 1119 + }, + { + "epoch": 0.11581913600992735, + "grad_norm": 1.4674919843673706, + "learning_rate": 0.00018851526125194, + "loss": 1.0687, + "step": 1120 + }, + { + "epoch": 0.11592254595279336, + "grad_norm": 2.9665956497192383, + "learning_rate": 0.00018850491464045526, + "loss": 1.9465, + "step": 1121 + }, + { + "epoch": 0.11602595589565937, + "grad_norm": 2.3202719688415527, + "learning_rate": 0.00018849456802897053, + "loss": 2.2519, + "step": 1122 + }, + { + "epoch": 0.11612936583852537, + "grad_norm": 1.2720879316329956, + "learning_rate": 0.0001884842214174858, + "loss": 2.0585, + "step": 1123 + }, + { + "epoch": 0.11623277578139138, + "grad_norm": 1.8549882173538208, + "learning_rate": 0.00018847387480600106, + "loss": 1.8807, + "step": 1124 + }, + { + "epoch": 0.11633618572425738, + "grad_norm": 1.7395340204238892, + "learning_rate": 0.00018846352819451632, + "loss": 1.7122, + "step": 1125 + }, + { + "epoch": 0.11643959566712339, + "grad_norm": 1.8677852153778076, + "learning_rate": 0.00018845318158303156, + "loss": 1.8792, + "step": 1126 + }, + { + "epoch": 0.1165430056099894, + "grad_norm": 1.4431513547897339, + "learning_rate": 0.00018844283497154683, + "loss": 2.1237, + "step": 1127 + }, + { + "epoch": 0.1166464155528554, + "grad_norm": 1.1699776649475098, + "learning_rate": 0.0001884324883600621, + "loss": 1.2709, + "step": 1128 + }, + { + "epoch": 0.11674982549572141, + "grad_norm": 1.6420997381210327, + "learning_rate": 0.00018842214174857736, + "loss": 1.7907, + "step": 1129 + }, + { + "epoch": 0.11685323543858742, + "grad_norm": 1.390235424041748, + "learning_rate": 0.00018841179513709263, + "loss": 1.7772, + "step": 1130 + }, + { + "epoch": 0.11695664538145342, + "grad_norm": 1.4697620868682861, + "learning_rate": 0.0001884014485256079, + "loss": 1.5639, + "step": 1131 + }, + { + "epoch": 0.11706005532431943, + "grad_norm": 1.740312933921814, + "learning_rate": 0.00018839110191412313, + "loss": 1.7756, + "step": 1132 + }, + { + "epoch": 0.11716346526718543, + "grad_norm": 2.991058111190796, + "learning_rate": 0.0001883807553026384, + "loss": 1.7446, + "step": 1133 + }, + { + "epoch": 0.11726687521005144, + "grad_norm": 2.671280860900879, + "learning_rate": 0.00018837040869115366, + "loss": 1.9445, + "step": 1134 + }, + { + "epoch": 0.11737028515291745, + "grad_norm": 2.767537832260132, + "learning_rate": 0.00018836006207966893, + "loss": 2.0817, + "step": 1135 + }, + { + "epoch": 0.11747369509578345, + "grad_norm": 2.4302000999450684, + "learning_rate": 0.0001883497154681842, + "loss": 2.3611, + "step": 1136 + }, + { + "epoch": 0.11757710503864946, + "grad_norm": 5.114750862121582, + "learning_rate": 0.00018833936885669946, + "loss": 2.3867, + "step": 1137 + }, + { + "epoch": 0.11768051498151547, + "grad_norm": 1.0005320310592651, + "learning_rate": 0.0001883290222452147, + "loss": 2.0295, + "step": 1138 + }, + { + "epoch": 0.11778392492438149, + "grad_norm": 1.4860533475875854, + "learning_rate": 0.00018831867563372996, + "loss": 2.0879, + "step": 1139 + }, + { + "epoch": 0.11788733486724749, + "grad_norm": 0.8643640279769897, + "learning_rate": 0.00018830832902224523, + "loss": 2.0423, + "step": 1140 + }, + { + "epoch": 0.1179907448101135, + "grad_norm": 2.4749903678894043, + "learning_rate": 0.0001882979824107605, + "loss": 1.5508, + "step": 1141 + }, + { + "epoch": 0.1180941547529795, + "grad_norm": 1.414445400238037, + "learning_rate": 0.00018828763579927576, + "loss": 2.1075, + "step": 1142 + }, + { + "epoch": 0.11819756469584551, + "grad_norm": 2.0021872520446777, + "learning_rate": 0.00018827728918779102, + "loss": 1.0144, + "step": 1143 + }, + { + "epoch": 0.11830097463871152, + "grad_norm": 1.3872566223144531, + "learning_rate": 0.00018826694257630626, + "loss": 1.3635, + "step": 1144 + }, + { + "epoch": 0.11840438458157752, + "grad_norm": 1.5357730388641357, + "learning_rate": 0.00018825659596482153, + "loss": 1.7582, + "step": 1145 + }, + { + "epoch": 0.11850779452444353, + "grad_norm": 1.2004647254943848, + "learning_rate": 0.0001882462493533368, + "loss": 1.5143, + "step": 1146 + }, + { + "epoch": 0.11861120446730954, + "grad_norm": 1.6172764301300049, + "learning_rate": 0.00018823590274185206, + "loss": 1.2178, + "step": 1147 + }, + { + "epoch": 0.11871461441017554, + "grad_norm": 1.283896803855896, + "learning_rate": 0.00018822555613036732, + "loss": 2.0604, + "step": 1148 + }, + { + "epoch": 0.11881802435304155, + "grad_norm": 1.100882887840271, + "learning_rate": 0.0001882152095188826, + "loss": 1.5466, + "step": 1149 + }, + { + "epoch": 0.11892143429590755, + "grad_norm": 3.3976354598999023, + "learning_rate": 0.00018820486290739783, + "loss": 1.6791, + "step": 1150 + }, + { + "epoch": 0.11902484423877356, + "grad_norm": 1.0436819791793823, + "learning_rate": 0.0001881945162959131, + "loss": 1.5231, + "step": 1151 + }, + { + "epoch": 0.11912825418163957, + "grad_norm": 1.857042670249939, + "learning_rate": 0.00018818416968442836, + "loss": 2.3335, + "step": 1152 + }, + { + "epoch": 0.11923166412450557, + "grad_norm": 1.8413466215133667, + "learning_rate": 0.00018817382307294362, + "loss": 1.6003, + "step": 1153 + }, + { + "epoch": 0.11933507406737158, + "grad_norm": 1.1375494003295898, + "learning_rate": 0.0001881634764614589, + "loss": 2.1043, + "step": 1154 + }, + { + "epoch": 0.11943848401023759, + "grad_norm": 1.7668977975845337, + "learning_rate": 0.00018815312984997416, + "loss": 2.2028, + "step": 1155 + }, + { + "epoch": 0.11954189395310359, + "grad_norm": 1.776978611946106, + "learning_rate": 0.0001881427832384894, + "loss": 1.7239, + "step": 1156 + }, + { + "epoch": 0.1196453038959696, + "grad_norm": 2.0679118633270264, + "learning_rate": 0.00018813243662700466, + "loss": 1.4295, + "step": 1157 + }, + { + "epoch": 0.1197487138388356, + "grad_norm": 2.2436676025390625, + "learning_rate": 0.00018812209001551993, + "loss": 2.1156, + "step": 1158 + }, + { + "epoch": 0.11985212378170161, + "grad_norm": 1.7714333534240723, + "learning_rate": 0.0001881117434040352, + "loss": 1.8211, + "step": 1159 + }, + { + "epoch": 0.11995553372456762, + "grad_norm": 1.1076422929763794, + "learning_rate": 0.00018810139679255046, + "loss": 1.384, + "step": 1160 + }, + { + "epoch": 0.12005894366743362, + "grad_norm": 3.0645408630371094, + "learning_rate": 0.00018809105018106572, + "loss": 1.8257, + "step": 1161 + }, + { + "epoch": 0.12016235361029963, + "grad_norm": 1.4861654043197632, + "learning_rate": 0.00018808070356958096, + "loss": 2.0568, + "step": 1162 + }, + { + "epoch": 0.12026576355316564, + "grad_norm": 1.040371298789978, + "learning_rate": 0.00018807035695809623, + "loss": 1.6845, + "step": 1163 + }, + { + "epoch": 0.12036917349603164, + "grad_norm": 1.0902824401855469, + "learning_rate": 0.0001880600103466115, + "loss": 1.8769, + "step": 1164 + }, + { + "epoch": 0.12047258343889765, + "grad_norm": 1.154840111732483, + "learning_rate": 0.00018804966373512676, + "loss": 1.6885, + "step": 1165 + }, + { + "epoch": 0.12057599338176365, + "grad_norm": 1.073218584060669, + "learning_rate": 0.00018803931712364202, + "loss": 1.7502, + "step": 1166 + }, + { + "epoch": 0.12067940332462966, + "grad_norm": 0.9243725538253784, + "learning_rate": 0.0001880289705121573, + "loss": 1.4066, + "step": 1167 + }, + { + "epoch": 0.12078281326749567, + "grad_norm": 1.23936927318573, + "learning_rate": 0.00018801862390067253, + "loss": 2.0428, + "step": 1168 + }, + { + "epoch": 0.12088622321036167, + "grad_norm": 1.1509627103805542, + "learning_rate": 0.0001880082772891878, + "loss": 1.6855, + "step": 1169 + }, + { + "epoch": 0.12098963315322768, + "grad_norm": 1.6797055006027222, + "learning_rate": 0.00018799793067770306, + "loss": 2.1733, + "step": 1170 + }, + { + "epoch": 0.12109304309609369, + "grad_norm": 1.2919670343399048, + "learning_rate": 0.00018798758406621832, + "loss": 1.8092, + "step": 1171 + }, + { + "epoch": 0.12119645303895969, + "grad_norm": 0.9914805293083191, + "learning_rate": 0.0001879772374547336, + "loss": 1.621, + "step": 1172 + }, + { + "epoch": 0.1212998629818257, + "grad_norm": 1.9691003561019897, + "learning_rate": 0.00018796689084324886, + "loss": 2.0663, + "step": 1173 + }, + { + "epoch": 0.1214032729246917, + "grad_norm": 1.3074166774749756, + "learning_rate": 0.0001879565442317641, + "loss": 2.0947, + "step": 1174 + }, + { + "epoch": 0.12150668286755771, + "grad_norm": 1.9575916528701782, + "learning_rate": 0.00018794619762027936, + "loss": 1.2152, + "step": 1175 + }, + { + "epoch": 0.12161009281042372, + "grad_norm": 1.3250031471252441, + "learning_rate": 0.00018793585100879462, + "loss": 1.8787, + "step": 1176 + }, + { + "epoch": 0.12171350275328972, + "grad_norm": 1.2709016799926758, + "learning_rate": 0.0001879255043973099, + "loss": 1.8267, + "step": 1177 + }, + { + "epoch": 0.12181691269615573, + "grad_norm": 2.2685484886169434, + "learning_rate": 0.00018791515778582516, + "loss": 2.0463, + "step": 1178 + }, + { + "epoch": 0.12192032263902174, + "grad_norm": 2.0677502155303955, + "learning_rate": 0.00018790481117434042, + "loss": 1.9328, + "step": 1179 + }, + { + "epoch": 0.12202373258188774, + "grad_norm": 2.4347801208496094, + "learning_rate": 0.00018789446456285566, + "loss": 2.56, + "step": 1180 + }, + { + "epoch": 0.12212714252475375, + "grad_norm": 3.401825428009033, + "learning_rate": 0.00018788411795137093, + "loss": 1.718, + "step": 1181 + }, + { + "epoch": 0.12223055246761977, + "grad_norm": 2.031280040740967, + "learning_rate": 0.0001878737713398862, + "loss": 2.2049, + "step": 1182 + }, + { + "epoch": 0.12233396241048577, + "grad_norm": 1.5570838451385498, + "learning_rate": 0.00018786342472840146, + "loss": 1.5248, + "step": 1183 + }, + { + "epoch": 0.12243737235335178, + "grad_norm": 1.4443638324737549, + "learning_rate": 0.00018785307811691672, + "loss": 1.7989, + "step": 1184 + }, + { + "epoch": 0.12254078229621779, + "grad_norm": 2.5985703468322754, + "learning_rate": 0.000187842731505432, + "loss": 1.7115, + "step": 1185 + }, + { + "epoch": 0.1226441922390838, + "grad_norm": 1.325608730316162, + "learning_rate": 0.00018783238489394723, + "loss": 1.4347, + "step": 1186 + }, + { + "epoch": 0.1227476021819498, + "grad_norm": 1.1276994943618774, + "learning_rate": 0.0001878220382824625, + "loss": 1.1806, + "step": 1187 + }, + { + "epoch": 0.1228510121248158, + "grad_norm": 2.02768874168396, + "learning_rate": 0.00018781169167097776, + "loss": 1.9614, + "step": 1188 + }, + { + "epoch": 0.12295442206768181, + "grad_norm": 1.9223341941833496, + "learning_rate": 0.00018780134505949302, + "loss": 2.0055, + "step": 1189 + }, + { + "epoch": 0.12305783201054782, + "grad_norm": 1.4801243543624878, + "learning_rate": 0.0001877909984480083, + "loss": 1.5568, + "step": 1190 + }, + { + "epoch": 0.12316124195341382, + "grad_norm": 1.6072427034378052, + "learning_rate": 0.00018778065183652355, + "loss": 1.6918, + "step": 1191 + }, + { + "epoch": 0.12326465189627983, + "grad_norm": 1.8966267108917236, + "learning_rate": 0.0001877703052250388, + "loss": 1.7092, + "step": 1192 + }, + { + "epoch": 0.12336806183914584, + "grad_norm": 1.2562758922576904, + "learning_rate": 0.00018775995861355406, + "loss": 1.3349, + "step": 1193 + }, + { + "epoch": 0.12347147178201184, + "grad_norm": 1.6847071647644043, + "learning_rate": 0.00018774961200206932, + "loss": 1.2752, + "step": 1194 + }, + { + "epoch": 0.12357488172487785, + "grad_norm": 2.11639666557312, + "learning_rate": 0.0001877392653905846, + "loss": 2.1486, + "step": 1195 + }, + { + "epoch": 0.12367829166774386, + "grad_norm": 1.4768255949020386, + "learning_rate": 0.00018772891877909985, + "loss": 1.9611, + "step": 1196 + }, + { + "epoch": 0.12378170161060986, + "grad_norm": 0.8098143935203552, + "learning_rate": 0.00018771857216761512, + "loss": 1.5275, + "step": 1197 + }, + { + "epoch": 0.12388511155347587, + "grad_norm": 1.0442907810211182, + "learning_rate": 0.00018770822555613036, + "loss": 1.3282, + "step": 1198 + }, + { + "epoch": 0.12398852149634187, + "grad_norm": 2.2555179595947266, + "learning_rate": 0.00018769787894464562, + "loss": 2.0941, + "step": 1199 + }, + { + "epoch": 0.12409193143920788, + "grad_norm": 1.3613994121551514, + "learning_rate": 0.0001876875323331609, + "loss": 1.7308, + "step": 1200 + }, + { + "epoch": 0.12419534138207389, + "grad_norm": 2.940253496170044, + "learning_rate": 0.00018767718572167616, + "loss": 1.9223, + "step": 1201 + }, + { + "epoch": 0.1242987513249399, + "grad_norm": 2.203312873840332, + "learning_rate": 0.00018766683911019142, + "loss": 2.1979, + "step": 1202 + }, + { + "epoch": 0.1244021612678059, + "grad_norm": 1.6097736358642578, + "learning_rate": 0.0001876564924987067, + "loss": 1.169, + "step": 1203 + }, + { + "epoch": 0.1245055712106719, + "grad_norm": 2.578017234802246, + "learning_rate": 0.00018764614588722193, + "loss": 1.5018, + "step": 1204 + }, + { + "epoch": 0.12460898115353791, + "grad_norm": 1.4888848066329956, + "learning_rate": 0.0001876357992757372, + "loss": 1.8575, + "step": 1205 + }, + { + "epoch": 0.12471239109640392, + "grad_norm": 1.513687252998352, + "learning_rate": 0.00018762545266425246, + "loss": 1.5326, + "step": 1206 + }, + { + "epoch": 0.12481580103926992, + "grad_norm": 1.3281147480010986, + "learning_rate": 0.00018761510605276772, + "loss": 2.1136, + "step": 1207 + }, + { + "epoch": 0.12491921098213593, + "grad_norm": 1.496387004852295, + "learning_rate": 0.000187604759441283, + "loss": 1.8321, + "step": 1208 + }, + { + "epoch": 0.12502262092500194, + "grad_norm": 1.2002882957458496, + "learning_rate": 0.00018759441282979825, + "loss": 2.0025, + "step": 1209 + }, + { + "epoch": 0.12512603086786794, + "grad_norm": 1.6012659072875977, + "learning_rate": 0.0001875840662183135, + "loss": 1.3442, + "step": 1210 + }, + { + "epoch": 0.12522944081073395, + "grad_norm": 1.4702413082122803, + "learning_rate": 0.00018757371960682876, + "loss": 2.041, + "step": 1211 + }, + { + "epoch": 0.12533285075359996, + "grad_norm": 1.0756279230117798, + "learning_rate": 0.00018756337299534402, + "loss": 1.7976, + "step": 1212 + }, + { + "epoch": 0.12543626069646596, + "grad_norm": 4.028149604797363, + "learning_rate": 0.0001875530263838593, + "loss": 1.5223, + "step": 1213 + }, + { + "epoch": 0.12553967063933197, + "grad_norm": 2.3542799949645996, + "learning_rate": 0.00018754267977237455, + "loss": 1.5814, + "step": 1214 + }, + { + "epoch": 0.12564308058219797, + "grad_norm": 1.941663384437561, + "learning_rate": 0.00018753233316088982, + "loss": 2.1115, + "step": 1215 + }, + { + "epoch": 0.12574649052506398, + "grad_norm": 2.03517484664917, + "learning_rate": 0.00018752198654940506, + "loss": 1.8151, + "step": 1216 + }, + { + "epoch": 0.12584990046793, + "grad_norm": 1.52018141746521, + "learning_rate": 0.00018751163993792032, + "loss": 1.7026, + "step": 1217 + }, + { + "epoch": 0.125953310410796, + "grad_norm": 2.1755878925323486, + "learning_rate": 0.0001875012933264356, + "loss": 2.7846, + "step": 1218 + }, + { + "epoch": 0.126056720353662, + "grad_norm": 0.7671854496002197, + "learning_rate": 0.00018749094671495085, + "loss": 1.8867, + "step": 1219 + }, + { + "epoch": 0.126160130296528, + "grad_norm": 1.6932541131973267, + "learning_rate": 0.00018748060010346612, + "loss": 2.2631, + "step": 1220 + }, + { + "epoch": 0.126263540239394, + "grad_norm": 1.4602147340774536, + "learning_rate": 0.00018747025349198139, + "loss": 1.7261, + "step": 1221 + }, + { + "epoch": 0.12636695018226002, + "grad_norm": 1.7497947216033936, + "learning_rate": 0.00018745990688049665, + "loss": 2.4626, + "step": 1222 + }, + { + "epoch": 0.12647036012512602, + "grad_norm": 1.4249109029769897, + "learning_rate": 0.00018744956026901192, + "loss": 1.7258, + "step": 1223 + }, + { + "epoch": 0.12657377006799203, + "grad_norm": 1.2902969121932983, + "learning_rate": 0.00018743921365752716, + "loss": 1.2497, + "step": 1224 + }, + { + "epoch": 0.12667718001085804, + "grad_norm": 1.8637944459915161, + "learning_rate": 0.00018742886704604242, + "loss": 1.686, + "step": 1225 + }, + { + "epoch": 0.12678058995372404, + "grad_norm": 1.095707893371582, + "learning_rate": 0.00018741852043455769, + "loss": 2.192, + "step": 1226 + }, + { + "epoch": 0.12688399989659005, + "grad_norm": 1.8274673223495483, + "learning_rate": 0.00018740817382307295, + "loss": 1.5955, + "step": 1227 + }, + { + "epoch": 0.12698740983945606, + "grad_norm": 1.9945489168167114, + "learning_rate": 0.00018739782721158822, + "loss": 2.1984, + "step": 1228 + }, + { + "epoch": 0.12709081978232206, + "grad_norm": 1.4088261127471924, + "learning_rate": 0.00018738748060010348, + "loss": 2.0431, + "step": 1229 + }, + { + "epoch": 0.12719422972518807, + "grad_norm": 1.128542423248291, + "learning_rate": 0.00018737713398861875, + "loss": 2.0949, + "step": 1230 + }, + { + "epoch": 0.12729763966805407, + "grad_norm": 1.2845412492752075, + "learning_rate": 0.00018736678737713401, + "loss": 1.2357, + "step": 1231 + }, + { + "epoch": 0.12740104961092008, + "grad_norm": 1.4940705299377441, + "learning_rate": 0.00018735644076564925, + "loss": 1.8014, + "step": 1232 + }, + { + "epoch": 0.1275044595537861, + "grad_norm": 1.725649118423462, + "learning_rate": 0.00018734609415416452, + "loss": 1.8969, + "step": 1233 + }, + { + "epoch": 0.1276078694966521, + "grad_norm": 1.2723456621170044, + "learning_rate": 0.00018733574754267978, + "loss": 1.4834, + "step": 1234 + }, + { + "epoch": 0.1277112794395181, + "grad_norm": 0.8685710430145264, + "learning_rate": 0.00018732540093119505, + "loss": 1.5972, + "step": 1235 + }, + { + "epoch": 0.1278146893823841, + "grad_norm": 1.1668968200683594, + "learning_rate": 0.00018731505431971032, + "loss": 1.6169, + "step": 1236 + }, + { + "epoch": 0.1279180993252501, + "grad_norm": 1.3985564708709717, + "learning_rate": 0.00018730470770822558, + "loss": 1.5132, + "step": 1237 + }, + { + "epoch": 0.12802150926811612, + "grad_norm": 1.8650096654891968, + "learning_rate": 0.00018729436109674085, + "loss": 1.5906, + "step": 1238 + }, + { + "epoch": 0.12812491921098212, + "grad_norm": 1.2535521984100342, + "learning_rate": 0.0001872840144852561, + "loss": 1.6443, + "step": 1239 + }, + { + "epoch": 0.12822832915384813, + "grad_norm": 1.6224900484085083, + "learning_rate": 0.00018727366787377135, + "loss": 1.8258, + "step": 1240 + }, + { + "epoch": 0.12833173909671414, + "grad_norm": 1.5536644458770752, + "learning_rate": 0.00018726332126228662, + "loss": 1.8278, + "step": 1241 + }, + { + "epoch": 0.12843514903958014, + "grad_norm": 1.2493494749069214, + "learning_rate": 0.00018725297465080188, + "loss": 1.9223, + "step": 1242 + }, + { + "epoch": 0.12853855898244615, + "grad_norm": 1.945225715637207, + "learning_rate": 0.00018724262803931715, + "loss": 2.0077, + "step": 1243 + }, + { + "epoch": 0.12864196892531216, + "grad_norm": 0.7477271556854248, + "learning_rate": 0.0001872322814278324, + "loss": 1.8014, + "step": 1244 + }, + { + "epoch": 0.12874537886817816, + "grad_norm": 1.8180761337280273, + "learning_rate": 0.00018722193481634768, + "loss": 1.4772, + "step": 1245 + }, + { + "epoch": 0.12884878881104417, + "grad_norm": 1.7646000385284424, + "learning_rate": 0.00018721158820486292, + "loss": 1.598, + "step": 1246 + }, + { + "epoch": 0.1289521987539102, + "grad_norm": 1.0384913682937622, + "learning_rate": 0.00018720124159337818, + "loss": 1.5885, + "step": 1247 + }, + { + "epoch": 0.1290556086967762, + "grad_norm": 1.9679780006408691, + "learning_rate": 0.00018719089498189345, + "loss": 1.2981, + "step": 1248 + }, + { + "epoch": 0.12915901863964221, + "grad_norm": 1.365814447402954, + "learning_rate": 0.0001871805483704087, + "loss": 1.7245, + "step": 1249 + }, + { + "epoch": 0.12926242858250822, + "grad_norm": 1.7487189769744873, + "learning_rate": 0.00018717020175892398, + "loss": 1.6709, + "step": 1250 + }, + { + "epoch": 0.12936583852537423, + "grad_norm": 0.9565590620040894, + "learning_rate": 0.00018715985514743924, + "loss": 1.6439, + "step": 1251 + }, + { + "epoch": 0.12946924846824023, + "grad_norm": 2.015857696533203, + "learning_rate": 0.00018714950853595448, + "loss": 1.8146, + "step": 1252 + }, + { + "epoch": 0.12957265841110624, + "grad_norm": 0.9637671113014221, + "learning_rate": 0.00018713916192446975, + "loss": 1.7245, + "step": 1253 + }, + { + "epoch": 0.12967606835397225, + "grad_norm": 1.3832422494888306, + "learning_rate": 0.00018712881531298501, + "loss": 1.3993, + "step": 1254 + }, + { + "epoch": 0.12977947829683825, + "grad_norm": 2.1820902824401855, + "learning_rate": 0.00018711846870150028, + "loss": 1.7568, + "step": 1255 + }, + { + "epoch": 0.12988288823970426, + "grad_norm": 1.573229432106018, + "learning_rate": 0.00018710812209001555, + "loss": 1.415, + "step": 1256 + }, + { + "epoch": 0.12998629818257026, + "grad_norm": 1.5266555547714233, + "learning_rate": 0.0001870977754785308, + "loss": 1.5005, + "step": 1257 + }, + { + "epoch": 0.13008970812543627, + "grad_norm": 0.9026231169700623, + "learning_rate": 0.00018708742886704605, + "loss": 2.0375, + "step": 1258 + }, + { + "epoch": 0.13019311806830228, + "grad_norm": 1.4228705167770386, + "learning_rate": 0.00018707708225556131, + "loss": 1.8817, + "step": 1259 + }, + { + "epoch": 0.13029652801116828, + "grad_norm": 1.961610198020935, + "learning_rate": 0.00018706673564407658, + "loss": 1.7003, + "step": 1260 + }, + { + "epoch": 0.1303999379540343, + "grad_norm": 1.7642508745193481, + "learning_rate": 0.00018705638903259185, + "loss": 2.0717, + "step": 1261 + }, + { + "epoch": 0.1305033478969003, + "grad_norm": 2.054243564605713, + "learning_rate": 0.0001870460424211071, + "loss": 2.1404, + "step": 1262 + }, + { + "epoch": 0.1306067578397663, + "grad_norm": 2.2590620517730713, + "learning_rate": 0.00018703569580962235, + "loss": 2.1784, + "step": 1263 + }, + { + "epoch": 0.1307101677826323, + "grad_norm": 1.1175493001937866, + "learning_rate": 0.00018702534919813762, + "loss": 1.8119, + "step": 1264 + }, + { + "epoch": 0.13081357772549831, + "grad_norm": 2.2815165519714355, + "learning_rate": 0.00018701500258665288, + "loss": 2.1407, + "step": 1265 + }, + { + "epoch": 0.13091698766836432, + "grad_norm": 1.36077082157135, + "learning_rate": 0.00018700465597516815, + "loss": 2.338, + "step": 1266 + }, + { + "epoch": 0.13102039761123033, + "grad_norm": 1.8549292087554932, + "learning_rate": 0.0001869943093636834, + "loss": 2.447, + "step": 1267 + }, + { + "epoch": 0.13112380755409633, + "grad_norm": 2.007368564605713, + "learning_rate": 0.00018698396275219868, + "loss": 1.9328, + "step": 1268 + }, + { + "epoch": 0.13122721749696234, + "grad_norm": 1.925146222114563, + "learning_rate": 0.00018697361614071392, + "loss": 1.6689, + "step": 1269 + }, + { + "epoch": 0.13133062743982835, + "grad_norm": 1.3448445796966553, + "learning_rate": 0.00018696326952922918, + "loss": 1.6307, + "step": 1270 + }, + { + "epoch": 0.13143403738269435, + "grad_norm": 1.1631834506988525, + "learning_rate": 0.00018695292291774445, + "loss": 1.3101, + "step": 1271 + }, + { + "epoch": 0.13153744732556036, + "grad_norm": 1.2478338479995728, + "learning_rate": 0.0001869425763062597, + "loss": 2.0214, + "step": 1272 + }, + { + "epoch": 0.13164085726842636, + "grad_norm": 1.4405758380889893, + "learning_rate": 0.00018693222969477498, + "loss": 1.9482, + "step": 1273 + }, + { + "epoch": 0.13174426721129237, + "grad_norm": 1.3760428428649902, + "learning_rate": 0.00018692188308329024, + "loss": 1.3526, + "step": 1274 + }, + { + "epoch": 0.13184767715415838, + "grad_norm": 2.5238633155822754, + "learning_rate": 0.00018691153647180548, + "loss": 1.8157, + "step": 1275 + }, + { + "epoch": 0.13195108709702438, + "grad_norm": 0.7784034013748169, + "learning_rate": 0.00018690118986032075, + "loss": 1.7506, + "step": 1276 + }, + { + "epoch": 0.1320544970398904, + "grad_norm": 2.70515513420105, + "learning_rate": 0.00018689084324883601, + "loss": 1.3656, + "step": 1277 + }, + { + "epoch": 0.1321579069827564, + "grad_norm": 1.7337223291397095, + "learning_rate": 0.00018688049663735128, + "loss": 1.8144, + "step": 1278 + }, + { + "epoch": 0.1322613169256224, + "grad_norm": 1.2779606580734253, + "learning_rate": 0.00018687015002586654, + "loss": 1.5246, + "step": 1279 + }, + { + "epoch": 0.1323647268684884, + "grad_norm": 1.8382599353790283, + "learning_rate": 0.0001868598034143818, + "loss": 1.9191, + "step": 1280 + }, + { + "epoch": 0.13246813681135441, + "grad_norm": 2.009275197982788, + "learning_rate": 0.00018684945680289705, + "loss": 1.9812, + "step": 1281 + }, + { + "epoch": 0.13257154675422042, + "grad_norm": 1.8259612321853638, + "learning_rate": 0.00018683911019141231, + "loss": 1.6244, + "step": 1282 + }, + { + "epoch": 0.13267495669708643, + "grad_norm": 1.3301429748535156, + "learning_rate": 0.00018682876357992758, + "loss": 1.5117, + "step": 1283 + }, + { + "epoch": 0.13277836663995243, + "grad_norm": 1.6860365867614746, + "learning_rate": 0.00018681841696844285, + "loss": 1.6663, + "step": 1284 + }, + { + "epoch": 0.13288177658281844, + "grad_norm": 2.0984020233154297, + "learning_rate": 0.0001868080703569581, + "loss": 1.8559, + "step": 1285 + }, + { + "epoch": 0.13298518652568445, + "grad_norm": 1.4987295866012573, + "learning_rate": 0.00018679772374547338, + "loss": 1.8326, + "step": 1286 + }, + { + "epoch": 0.13308859646855045, + "grad_norm": 1.5269889831542969, + "learning_rate": 0.00018678737713398862, + "loss": 1.4678, + "step": 1287 + }, + { + "epoch": 0.13319200641141646, + "grad_norm": 1.3404724597930908, + "learning_rate": 0.00018677703052250388, + "loss": 1.4516, + "step": 1288 + }, + { + "epoch": 0.13329541635428246, + "grad_norm": 2.1874942779541016, + "learning_rate": 0.00018676668391101915, + "loss": 1.8405, + "step": 1289 + }, + { + "epoch": 0.13339882629714847, + "grad_norm": 1.327823519706726, + "learning_rate": 0.0001867563372995344, + "loss": 1.7545, + "step": 1290 + }, + { + "epoch": 0.13350223624001448, + "grad_norm": 1.8989559412002563, + "learning_rate": 0.00018674599068804968, + "loss": 2.4508, + "step": 1291 + }, + { + "epoch": 0.13360564618288048, + "grad_norm": 1.538395643234253, + "learning_rate": 0.00018673564407656494, + "loss": 1.9593, + "step": 1292 + }, + { + "epoch": 0.1337090561257465, + "grad_norm": 2.1572530269622803, + "learning_rate": 0.00018672529746508018, + "loss": 1.7497, + "step": 1293 + }, + { + "epoch": 0.1338124660686125, + "grad_norm": 1.3159223794937134, + "learning_rate": 0.00018671495085359545, + "loss": 2.1502, + "step": 1294 + }, + { + "epoch": 0.1339158760114785, + "grad_norm": 2.51725697517395, + "learning_rate": 0.0001867046042421107, + "loss": 1.6545, + "step": 1295 + }, + { + "epoch": 0.1340192859543445, + "grad_norm": 1.1315113306045532, + "learning_rate": 0.00018669425763062598, + "loss": 1.2736, + "step": 1296 + }, + { + "epoch": 0.13412269589721051, + "grad_norm": 1.8445188999176025, + "learning_rate": 0.00018668391101914124, + "loss": 2.1056, + "step": 1297 + }, + { + "epoch": 0.13422610584007652, + "grad_norm": 1.004447340965271, + "learning_rate": 0.0001866735644076565, + "loss": 1.9898, + "step": 1298 + }, + { + "epoch": 0.13432951578294253, + "grad_norm": 1.5629618167877197, + "learning_rate": 0.00018666321779617175, + "loss": 1.9748, + "step": 1299 + }, + { + "epoch": 0.13443292572580853, + "grad_norm": 1.6147910356521606, + "learning_rate": 0.000186652871184687, + "loss": 2.041, + "step": 1300 + }, + { + "epoch": 0.13453633566867454, + "grad_norm": 1.707336187362671, + "learning_rate": 0.00018664252457320228, + "loss": 1.4538, + "step": 1301 + }, + { + "epoch": 0.13463974561154055, + "grad_norm": 2.5680580139160156, + "learning_rate": 0.00018663217796171754, + "loss": 2.3289, + "step": 1302 + }, + { + "epoch": 0.13474315555440655, + "grad_norm": 2.430309772491455, + "learning_rate": 0.0001866218313502328, + "loss": 1.6057, + "step": 1303 + }, + { + "epoch": 0.13484656549727256, + "grad_norm": 1.7052457332611084, + "learning_rate": 0.00018661148473874808, + "loss": 2.0138, + "step": 1304 + }, + { + "epoch": 0.13494997544013856, + "grad_norm": 1.389034390449524, + "learning_rate": 0.00018660113812726331, + "loss": 1.3975, + "step": 1305 + }, + { + "epoch": 0.13505338538300457, + "grad_norm": 1.011025309562683, + "learning_rate": 0.00018659079151577858, + "loss": 1.7537, + "step": 1306 + }, + { + "epoch": 0.13515679532587058, + "grad_norm": 1.0741398334503174, + "learning_rate": 0.00018658044490429385, + "loss": 1.3626, + "step": 1307 + }, + { + "epoch": 0.13526020526873658, + "grad_norm": 1.2746872901916504, + "learning_rate": 0.0001865700982928091, + "loss": 1.0874, + "step": 1308 + }, + { + "epoch": 0.1353636152116026, + "grad_norm": 3.1592459678649902, + "learning_rate": 0.00018655975168132438, + "loss": 1.8081, + "step": 1309 + }, + { + "epoch": 0.1354670251544686, + "grad_norm": 1.210808515548706, + "learning_rate": 0.00018654940506983964, + "loss": 2.2929, + "step": 1310 + }, + { + "epoch": 0.1355704350973346, + "grad_norm": 1.5508512258529663, + "learning_rate": 0.00018653905845835488, + "loss": 1.7335, + "step": 1311 + }, + { + "epoch": 0.1356738450402006, + "grad_norm": 1.1455914974212646, + "learning_rate": 0.00018652871184687015, + "loss": 1.6225, + "step": 1312 + }, + { + "epoch": 0.13577725498306661, + "grad_norm": 0.8236886262893677, + "learning_rate": 0.0001865183652353854, + "loss": 1.6923, + "step": 1313 + }, + { + "epoch": 0.13588066492593262, + "grad_norm": 0.9368153214454651, + "learning_rate": 0.00018650801862390068, + "loss": 1.3802, + "step": 1314 + }, + { + "epoch": 0.13598407486879863, + "grad_norm": 2.053213357925415, + "learning_rate": 0.00018649767201241594, + "loss": 1.6705, + "step": 1315 + }, + { + "epoch": 0.13608748481166463, + "grad_norm": 2.63618803024292, + "learning_rate": 0.0001864873254009312, + "loss": 1.4974, + "step": 1316 + }, + { + "epoch": 0.13619089475453064, + "grad_norm": 0.9730865359306335, + "learning_rate": 0.00018647697878944645, + "loss": 1.6048, + "step": 1317 + }, + { + "epoch": 0.13629430469739665, + "grad_norm": 2.2292282581329346, + "learning_rate": 0.0001864666321779617, + "loss": 1.6644, + "step": 1318 + }, + { + "epoch": 0.13639771464026265, + "grad_norm": 1.701492428779602, + "learning_rate": 0.00018645628556647698, + "loss": 1.4162, + "step": 1319 + }, + { + "epoch": 0.13650112458312866, + "grad_norm": 2.6111085414886475, + "learning_rate": 0.00018644593895499224, + "loss": 1.8539, + "step": 1320 + }, + { + "epoch": 0.13660453452599466, + "grad_norm": 0.8146743178367615, + "learning_rate": 0.0001864355923435075, + "loss": 1.8789, + "step": 1321 + }, + { + "epoch": 0.13670794446886067, + "grad_norm": 1.8751240968704224, + "learning_rate": 0.00018642524573202277, + "loss": 2.0231, + "step": 1322 + }, + { + "epoch": 0.13681135441172668, + "grad_norm": 2.0674078464508057, + "learning_rate": 0.000186414899120538, + "loss": 1.5609, + "step": 1323 + }, + { + "epoch": 0.13691476435459268, + "grad_norm": 1.0525996685028076, + "learning_rate": 0.00018640455250905328, + "loss": 1.4097, + "step": 1324 + }, + { + "epoch": 0.1370181742974587, + "grad_norm": 4.645424842834473, + "learning_rate": 0.00018639420589756854, + "loss": 1.2942, + "step": 1325 + }, + { + "epoch": 0.1371215842403247, + "grad_norm": 2.035285472869873, + "learning_rate": 0.0001863838592860838, + "loss": 1.3918, + "step": 1326 + }, + { + "epoch": 0.1372249941831907, + "grad_norm": 1.863338828086853, + "learning_rate": 0.00018637351267459908, + "loss": 1.7489, + "step": 1327 + }, + { + "epoch": 0.1373284041260567, + "grad_norm": 1.5353502035140991, + "learning_rate": 0.00018636316606311434, + "loss": 1.4828, + "step": 1328 + }, + { + "epoch": 0.13743181406892271, + "grad_norm": 1.3041024208068848, + "learning_rate": 0.00018635281945162958, + "loss": 1.3859, + "step": 1329 + }, + { + "epoch": 0.13753522401178872, + "grad_norm": 1.15968918800354, + "learning_rate": 0.00018634247284014485, + "loss": 1.9243, + "step": 1330 + }, + { + "epoch": 0.13763863395465473, + "grad_norm": 1.0997211933135986, + "learning_rate": 0.0001863321262286601, + "loss": 1.5236, + "step": 1331 + }, + { + "epoch": 0.13774204389752073, + "grad_norm": 1.8653374910354614, + "learning_rate": 0.00018632177961717538, + "loss": 1.4485, + "step": 1332 + }, + { + "epoch": 0.13784545384038677, + "grad_norm": 2.3222272396087646, + "learning_rate": 0.00018631143300569064, + "loss": 1.9068, + "step": 1333 + }, + { + "epoch": 0.13794886378325277, + "grad_norm": 2.070629596710205, + "learning_rate": 0.0001863010863942059, + "loss": 1.1053, + "step": 1334 + }, + { + "epoch": 0.13805227372611878, + "grad_norm": 0.7526354789733887, + "learning_rate": 0.00018629073978272115, + "loss": 2.0752, + "step": 1335 + }, + { + "epoch": 0.13815568366898479, + "grad_norm": 1.8768320083618164, + "learning_rate": 0.0001862803931712364, + "loss": 1.8098, + "step": 1336 + }, + { + "epoch": 0.1382590936118508, + "grad_norm": 2.4717142581939697, + "learning_rate": 0.00018627004655975168, + "loss": 1.967, + "step": 1337 + }, + { + "epoch": 0.1383625035547168, + "grad_norm": 0.8968499302864075, + "learning_rate": 0.00018625969994826694, + "loss": 1.4093, + "step": 1338 + }, + { + "epoch": 0.1384659134975828, + "grad_norm": 1.5168501138687134, + "learning_rate": 0.0001862493533367822, + "loss": 0.7085, + "step": 1339 + }, + { + "epoch": 0.1385693234404488, + "grad_norm": 1.7330024242401123, + "learning_rate": 0.00018623900672529747, + "loss": 2.2376, + "step": 1340 + }, + { + "epoch": 0.13867273338331482, + "grad_norm": 2.234142780303955, + "learning_rate": 0.0001862286601138127, + "loss": 1.507, + "step": 1341 + }, + { + "epoch": 0.13877614332618082, + "grad_norm": 1.6087912321090698, + "learning_rate": 0.00018621831350232798, + "loss": 1.6281, + "step": 1342 + }, + { + "epoch": 0.13887955326904683, + "grad_norm": 1.7449696063995361, + "learning_rate": 0.00018620796689084324, + "loss": 1.8542, + "step": 1343 + }, + { + "epoch": 0.13898296321191284, + "grad_norm": 1.2030277252197266, + "learning_rate": 0.0001861976202793585, + "loss": 1.485, + "step": 1344 + }, + { + "epoch": 0.13908637315477884, + "grad_norm": 1.4781368970870972, + "learning_rate": 0.00018618727366787377, + "loss": 1.742, + "step": 1345 + }, + { + "epoch": 0.13918978309764485, + "grad_norm": 1.9535484313964844, + "learning_rate": 0.00018617692705638904, + "loss": 1.876, + "step": 1346 + }, + { + "epoch": 0.13929319304051085, + "grad_norm": 1.9578993320465088, + "learning_rate": 0.0001861665804449043, + "loss": 1.7654, + "step": 1347 + }, + { + "epoch": 0.13939660298337686, + "grad_norm": 1.1052072048187256, + "learning_rate": 0.00018615623383341957, + "loss": 1.5072, + "step": 1348 + }, + { + "epoch": 0.13950001292624287, + "grad_norm": 1.2735376358032227, + "learning_rate": 0.0001861458872219348, + "loss": 1.9359, + "step": 1349 + }, + { + "epoch": 0.13960342286910887, + "grad_norm": 1.1834639310836792, + "learning_rate": 0.00018613554061045008, + "loss": 1.4385, + "step": 1350 + }, + { + "epoch": 0.13970683281197488, + "grad_norm": 0.7228855490684509, + "learning_rate": 0.00018612519399896534, + "loss": 1.8304, + "step": 1351 + }, + { + "epoch": 0.13981024275484089, + "grad_norm": 1.2559309005737305, + "learning_rate": 0.0001861148473874806, + "loss": 1.6177, + "step": 1352 + }, + { + "epoch": 0.1399136526977069, + "grad_norm": 2.237501621246338, + "learning_rate": 0.00018610450077599587, + "loss": 1.9076, + "step": 1353 + }, + { + "epoch": 0.1400170626405729, + "grad_norm": 3.862377643585205, + "learning_rate": 0.00018609415416451114, + "loss": 1.7243, + "step": 1354 + }, + { + "epoch": 0.1401204725834389, + "grad_norm": 1.3240326642990112, + "learning_rate": 0.0001860838075530264, + "loss": 1.5081, + "step": 1355 + }, + { + "epoch": 0.1402238825263049, + "grad_norm": 1.5742530822753906, + "learning_rate": 0.00018607346094154167, + "loss": 1.422, + "step": 1356 + }, + { + "epoch": 0.14032729246917092, + "grad_norm": 1.4533076286315918, + "learning_rate": 0.0001860631143300569, + "loss": 1.9615, + "step": 1357 + }, + { + "epoch": 0.14043070241203692, + "grad_norm": 3.0016930103302, + "learning_rate": 0.00018605276771857217, + "loss": 1.9092, + "step": 1358 + }, + { + "epoch": 0.14053411235490293, + "grad_norm": 1.1997783184051514, + "learning_rate": 0.00018604242110708744, + "loss": 1.2373, + "step": 1359 + }, + { + "epoch": 0.14063752229776894, + "grad_norm": 0.9452020525932312, + "learning_rate": 0.0001860320744956027, + "loss": 1.9416, + "step": 1360 + }, + { + "epoch": 0.14074093224063494, + "grad_norm": 1.4949263334274292, + "learning_rate": 0.00018602172788411797, + "loss": 1.9632, + "step": 1361 + }, + { + "epoch": 0.14084434218350095, + "grad_norm": 1.2022387981414795, + "learning_rate": 0.00018601138127263323, + "loss": 1.3986, + "step": 1362 + }, + { + "epoch": 0.14094775212636695, + "grad_norm": 1.3742767572402954, + "learning_rate": 0.0001860010346611485, + "loss": 1.4181, + "step": 1363 + }, + { + "epoch": 0.14105116206923296, + "grad_norm": 1.5255154371261597, + "learning_rate": 0.00018599068804966377, + "loss": 2.0134, + "step": 1364 + }, + { + "epoch": 0.14115457201209897, + "grad_norm": 1.4387857913970947, + "learning_rate": 0.000185980341438179, + "loss": 1.4114, + "step": 1365 + }, + { + "epoch": 0.14125798195496497, + "grad_norm": 2.9835379123687744, + "learning_rate": 0.00018596999482669427, + "loss": 2.0213, + "step": 1366 + }, + { + "epoch": 0.14136139189783098, + "grad_norm": 1.301589846611023, + "learning_rate": 0.00018595964821520954, + "loss": 1.7207, + "step": 1367 + }, + { + "epoch": 0.14146480184069699, + "grad_norm": 1.2321486473083496, + "learning_rate": 0.0001859493016037248, + "loss": 1.6375, + "step": 1368 + }, + { + "epoch": 0.141568211783563, + "grad_norm": 2.0843982696533203, + "learning_rate": 0.00018593895499224007, + "loss": 2.2411, + "step": 1369 + }, + { + "epoch": 0.141671621726429, + "grad_norm": 1.382814645767212, + "learning_rate": 0.00018592860838075533, + "loss": 1.7965, + "step": 1370 + }, + { + "epoch": 0.141775031669295, + "grad_norm": 1.7248510122299194, + "learning_rate": 0.00018591826176927057, + "loss": 1.5689, + "step": 1371 + }, + { + "epoch": 0.141878441612161, + "grad_norm": 1.425204873085022, + "learning_rate": 0.00018590791515778584, + "loss": 1.7564, + "step": 1372 + }, + { + "epoch": 0.14198185155502702, + "grad_norm": 1.2911546230316162, + "learning_rate": 0.0001858975685463011, + "loss": 1.2647, + "step": 1373 + }, + { + "epoch": 0.14208526149789302, + "grad_norm": 1.1772571802139282, + "learning_rate": 0.00018588722193481637, + "loss": 1.8914, + "step": 1374 + }, + { + "epoch": 0.14218867144075903, + "grad_norm": 1.4799343347549438, + "learning_rate": 0.00018587687532333163, + "loss": 1.0355, + "step": 1375 + }, + { + "epoch": 0.14229208138362504, + "grad_norm": 2.084376811981201, + "learning_rate": 0.0001858665287118469, + "loss": 1.063, + "step": 1376 + }, + { + "epoch": 0.14239549132649104, + "grad_norm": 1.7417645454406738, + "learning_rate": 0.00018585618210036214, + "loss": 1.7731, + "step": 1377 + }, + { + "epoch": 0.14249890126935705, + "grad_norm": 1.8002780675888062, + "learning_rate": 0.0001858458354888774, + "loss": 1.9528, + "step": 1378 + }, + { + "epoch": 0.14260231121222305, + "grad_norm": 3.58180570602417, + "learning_rate": 0.00018583548887739267, + "loss": 1.6168, + "step": 1379 + }, + { + "epoch": 0.14270572115508906, + "grad_norm": 1.7094149589538574, + "learning_rate": 0.00018582514226590793, + "loss": 1.0873, + "step": 1380 + }, + { + "epoch": 0.14280913109795507, + "grad_norm": 2.5685203075408936, + "learning_rate": 0.0001858147956544232, + "loss": 1.7104, + "step": 1381 + }, + { + "epoch": 0.14291254104082107, + "grad_norm": 3.272225856781006, + "learning_rate": 0.00018580444904293847, + "loss": 1.8178, + "step": 1382 + }, + { + "epoch": 0.14301595098368708, + "grad_norm": 3.2546474933624268, + "learning_rate": 0.0001857941024314537, + "loss": 2.2033, + "step": 1383 + }, + { + "epoch": 0.14311936092655309, + "grad_norm": 1.2203083038330078, + "learning_rate": 0.00018578375581996897, + "loss": 1.7435, + "step": 1384 + }, + { + "epoch": 0.1432227708694191, + "grad_norm": 1.4671968221664429, + "learning_rate": 0.00018577340920848423, + "loss": 1.6157, + "step": 1385 + }, + { + "epoch": 0.1433261808122851, + "grad_norm": 0.9329012036323547, + "learning_rate": 0.0001857630625969995, + "loss": 1.9851, + "step": 1386 + }, + { + "epoch": 0.1434295907551511, + "grad_norm": 1.7417888641357422, + "learning_rate": 0.00018575271598551477, + "loss": 2.3805, + "step": 1387 + }, + { + "epoch": 0.1435330006980171, + "grad_norm": 1.4650931358337402, + "learning_rate": 0.00018574236937403003, + "loss": 1.7815, + "step": 1388 + }, + { + "epoch": 0.14363641064088312, + "grad_norm": 2.144329071044922, + "learning_rate": 0.00018573202276254527, + "loss": 1.751, + "step": 1389 + }, + { + "epoch": 0.14373982058374912, + "grad_norm": 1.9252647161483765, + "learning_rate": 0.00018572167615106054, + "loss": 2.0645, + "step": 1390 + }, + { + "epoch": 0.14384323052661513, + "grad_norm": 0.8894973397254944, + "learning_rate": 0.0001857113295395758, + "loss": 1.8464, + "step": 1391 + }, + { + "epoch": 0.14394664046948114, + "grad_norm": 3.1202118396759033, + "learning_rate": 0.00018570098292809107, + "loss": 2.0249, + "step": 1392 + }, + { + "epoch": 0.14405005041234714, + "grad_norm": 1.7900038957595825, + "learning_rate": 0.00018569063631660633, + "loss": 1.4751, + "step": 1393 + }, + { + "epoch": 0.14415346035521315, + "grad_norm": 1.6321499347686768, + "learning_rate": 0.0001856802897051216, + "loss": 1.7489, + "step": 1394 + }, + { + "epoch": 0.14425687029807915, + "grad_norm": 2.1138758659362793, + "learning_rate": 0.00018566994309363684, + "loss": 1.6695, + "step": 1395 + }, + { + "epoch": 0.14436028024094516, + "grad_norm": 1.5029677152633667, + "learning_rate": 0.0001856595964821521, + "loss": 1.4675, + "step": 1396 + }, + { + "epoch": 0.14446369018381117, + "grad_norm": 4.420119762420654, + "learning_rate": 0.00018564924987066737, + "loss": 2.0966, + "step": 1397 + }, + { + "epoch": 0.14456710012667717, + "grad_norm": 0.8471847772598267, + "learning_rate": 0.00018563890325918263, + "loss": 1.5981, + "step": 1398 + }, + { + "epoch": 0.14467051006954318, + "grad_norm": 1.1944760084152222, + "learning_rate": 0.0001856285566476979, + "loss": 1.6597, + "step": 1399 + }, + { + "epoch": 0.14477392001240919, + "grad_norm": 2.2949111461639404, + "learning_rate": 0.00018561821003621316, + "loss": 1.5731, + "step": 1400 + }, + { + "epoch": 0.1448773299552752, + "grad_norm": 3.328265428543091, + "learning_rate": 0.0001856078634247284, + "loss": 1.9217, + "step": 1401 + }, + { + "epoch": 0.1449807398981412, + "grad_norm": 0.9115211963653564, + "learning_rate": 0.00018559751681324367, + "loss": 1.5536, + "step": 1402 + }, + { + "epoch": 0.1450841498410072, + "grad_norm": 1.1472818851470947, + "learning_rate": 0.00018558717020175893, + "loss": 1.9001, + "step": 1403 + }, + { + "epoch": 0.1451875597838732, + "grad_norm": 1.5385702848434448, + "learning_rate": 0.0001855768235902742, + "loss": 1.187, + "step": 1404 + }, + { + "epoch": 0.14529096972673922, + "grad_norm": 1.0141092538833618, + "learning_rate": 0.00018556647697878946, + "loss": 1.1727, + "step": 1405 + }, + { + "epoch": 0.14539437966960522, + "grad_norm": 1.044488549232483, + "learning_rate": 0.00018555613036730473, + "loss": 1.6773, + "step": 1406 + }, + { + "epoch": 0.14549778961247123, + "grad_norm": 1.4678056240081787, + "learning_rate": 0.00018554578375581997, + "loss": 1.9119, + "step": 1407 + }, + { + "epoch": 0.14560119955533724, + "grad_norm": 4.0386738777160645, + "learning_rate": 0.00018553543714433523, + "loss": 2.031, + "step": 1408 + }, + { + "epoch": 0.14570460949820324, + "grad_norm": 1.2234376668930054, + "learning_rate": 0.0001855250905328505, + "loss": 1.4436, + "step": 1409 + }, + { + "epoch": 0.14580801944106925, + "grad_norm": 1.4218946695327759, + "learning_rate": 0.00018551474392136577, + "loss": 1.9956, + "step": 1410 + }, + { + "epoch": 0.14591142938393525, + "grad_norm": 1.7964671850204468, + "learning_rate": 0.00018550439730988103, + "loss": 1.362, + "step": 1411 + }, + { + "epoch": 0.14601483932680126, + "grad_norm": 0.8776262998580933, + "learning_rate": 0.0001854940506983963, + "loss": 1.718, + "step": 1412 + }, + { + "epoch": 0.14611824926966727, + "grad_norm": 0.9666465520858765, + "learning_rate": 0.00018548370408691154, + "loss": 1.6304, + "step": 1413 + }, + { + "epoch": 0.14622165921253327, + "grad_norm": 1.7246460914611816, + "learning_rate": 0.0001854733574754268, + "loss": 1.8842, + "step": 1414 + }, + { + "epoch": 0.14632506915539928, + "grad_norm": 1.5267548561096191, + "learning_rate": 0.00018546301086394207, + "loss": 1.8565, + "step": 1415 + }, + { + "epoch": 0.14642847909826529, + "grad_norm": 2.6059603691101074, + "learning_rate": 0.00018545266425245733, + "loss": 1.7332, + "step": 1416 + }, + { + "epoch": 0.1465318890411313, + "grad_norm": 1.2317285537719727, + "learning_rate": 0.0001854423176409726, + "loss": 1.2607, + "step": 1417 + }, + { + "epoch": 0.1466352989839973, + "grad_norm": 1.1376893520355225, + "learning_rate": 0.00018543197102948786, + "loss": 1.7747, + "step": 1418 + }, + { + "epoch": 0.14673870892686333, + "grad_norm": 1.2814754247665405, + "learning_rate": 0.0001854216244180031, + "loss": 1.6338, + "step": 1419 + }, + { + "epoch": 0.14684211886972934, + "grad_norm": 1.9180951118469238, + "learning_rate": 0.00018541127780651837, + "loss": 1.7912, + "step": 1420 + }, + { + "epoch": 0.14694552881259534, + "grad_norm": 1.478571891784668, + "learning_rate": 0.00018540093119503363, + "loss": 1.771, + "step": 1421 + }, + { + "epoch": 0.14704893875546135, + "grad_norm": 1.6875447034835815, + "learning_rate": 0.0001853905845835489, + "loss": 0.9275, + "step": 1422 + }, + { + "epoch": 0.14715234869832736, + "grad_norm": 1.1770519018173218, + "learning_rate": 0.00018538023797206416, + "loss": 1.1857, + "step": 1423 + }, + { + "epoch": 0.14725575864119336, + "grad_norm": 1.1245090961456299, + "learning_rate": 0.00018536989136057943, + "loss": 1.6672, + "step": 1424 + }, + { + "epoch": 0.14735916858405937, + "grad_norm": 2.2437310218811035, + "learning_rate": 0.00018535954474909467, + "loss": 2.0515, + "step": 1425 + }, + { + "epoch": 0.14746257852692538, + "grad_norm": 2.4972667694091797, + "learning_rate": 0.00018534919813760993, + "loss": 1.848, + "step": 1426 + }, + { + "epoch": 0.14756598846979138, + "grad_norm": 1.7464969158172607, + "learning_rate": 0.0001853388515261252, + "loss": 2.007, + "step": 1427 + }, + { + "epoch": 0.1476693984126574, + "grad_norm": 1.0928596258163452, + "learning_rate": 0.00018532850491464046, + "loss": 1.1851, + "step": 1428 + }, + { + "epoch": 0.1477728083555234, + "grad_norm": 1.876115322113037, + "learning_rate": 0.00018531815830315573, + "loss": 1.2047, + "step": 1429 + }, + { + "epoch": 0.1478762182983894, + "grad_norm": 2.2480833530426025, + "learning_rate": 0.000185307811691671, + "loss": 1.8043, + "step": 1430 + }, + { + "epoch": 0.1479796282412554, + "grad_norm": 1.9176514148712158, + "learning_rate": 0.00018529746508018623, + "loss": 2.0757, + "step": 1431 + }, + { + "epoch": 0.1480830381841214, + "grad_norm": 1.2820544242858887, + "learning_rate": 0.0001852871184687015, + "loss": 1.7068, + "step": 1432 + }, + { + "epoch": 0.14818644812698742, + "grad_norm": 1.16887366771698, + "learning_rate": 0.00018527677185721677, + "loss": 0.8879, + "step": 1433 + }, + { + "epoch": 0.14828985806985343, + "grad_norm": 1.29765784740448, + "learning_rate": 0.00018526642524573203, + "loss": 1.4978, + "step": 1434 + }, + { + "epoch": 0.14839326801271943, + "grad_norm": 1.439942479133606, + "learning_rate": 0.0001852560786342473, + "loss": 1.332, + "step": 1435 + }, + { + "epoch": 0.14849667795558544, + "grad_norm": 1.7839429378509521, + "learning_rate": 0.00018524573202276256, + "loss": 1.9976, + "step": 1436 + }, + { + "epoch": 0.14860008789845144, + "grad_norm": 1.2119537591934204, + "learning_rate": 0.0001852353854112778, + "loss": 2.2104, + "step": 1437 + }, + { + "epoch": 0.14870349784131745, + "grad_norm": 1.0513869524002075, + "learning_rate": 0.00018522503879979307, + "loss": 1.8778, + "step": 1438 + }, + { + "epoch": 0.14880690778418346, + "grad_norm": 2.588869571685791, + "learning_rate": 0.00018521469218830833, + "loss": 1.7518, + "step": 1439 + }, + { + "epoch": 0.14891031772704946, + "grad_norm": 2.166912794113159, + "learning_rate": 0.0001852043455768236, + "loss": 2.1437, + "step": 1440 + }, + { + "epoch": 0.14901372766991547, + "grad_norm": 1.8580693006515503, + "learning_rate": 0.00018519399896533886, + "loss": 1.5795, + "step": 1441 + }, + { + "epoch": 0.14911713761278148, + "grad_norm": 1.4996910095214844, + "learning_rate": 0.00018518365235385413, + "loss": 1.4934, + "step": 1442 + }, + { + "epoch": 0.14922054755564748, + "grad_norm": 2.2671191692352295, + "learning_rate": 0.00018517330574236937, + "loss": 2.2631, + "step": 1443 + }, + { + "epoch": 0.1493239574985135, + "grad_norm": 2.4487438201904297, + "learning_rate": 0.00018516295913088463, + "loss": 2.6226, + "step": 1444 + }, + { + "epoch": 0.1494273674413795, + "grad_norm": 0.7984973788261414, + "learning_rate": 0.0001851526125193999, + "loss": 2.0354, + "step": 1445 + }, + { + "epoch": 0.1495307773842455, + "grad_norm": 2.1399385929107666, + "learning_rate": 0.00018514226590791516, + "loss": 1.4461, + "step": 1446 + }, + { + "epoch": 0.1496341873271115, + "grad_norm": 1.6950798034667969, + "learning_rate": 0.00018513191929643043, + "loss": 1.6589, + "step": 1447 + }, + { + "epoch": 0.1497375972699775, + "grad_norm": 1.0480849742889404, + "learning_rate": 0.0001851215726849457, + "loss": 1.2511, + "step": 1448 + }, + { + "epoch": 0.14984100721284352, + "grad_norm": 1.5871855020523071, + "learning_rate": 0.00018511122607346093, + "loss": 1.4684, + "step": 1449 + }, + { + "epoch": 0.14994441715570953, + "grad_norm": 1.3230794668197632, + "learning_rate": 0.0001851008794619762, + "loss": 1.5079, + "step": 1450 + }, + { + "epoch": 0.15004782709857553, + "grad_norm": 2.08754825592041, + "learning_rate": 0.00018509053285049146, + "loss": 2.3247, + "step": 1451 + }, + { + "epoch": 0.15015123704144154, + "grad_norm": 3.3206236362457275, + "learning_rate": 0.00018508018623900673, + "loss": 1.5526, + "step": 1452 + }, + { + "epoch": 0.15025464698430754, + "grad_norm": 1.5044193267822266, + "learning_rate": 0.000185069839627522, + "loss": 1.8548, + "step": 1453 + }, + { + "epoch": 0.15035805692717355, + "grad_norm": 1.271353006362915, + "learning_rate": 0.00018505949301603726, + "loss": 1.7336, + "step": 1454 + }, + { + "epoch": 0.15046146687003956, + "grad_norm": 1.4992772340774536, + "learning_rate": 0.0001850491464045525, + "loss": 2.1556, + "step": 1455 + }, + { + "epoch": 0.15056487681290556, + "grad_norm": 1.797611951828003, + "learning_rate": 0.00018503879979306777, + "loss": 2.1368, + "step": 1456 + }, + { + "epoch": 0.15066828675577157, + "grad_norm": 2.4767348766326904, + "learning_rate": 0.00018502845318158303, + "loss": 1.3575, + "step": 1457 + }, + { + "epoch": 0.15077169669863758, + "grad_norm": 1.9797945022583008, + "learning_rate": 0.0001850181065700983, + "loss": 1.968, + "step": 1458 + }, + { + "epoch": 0.15087510664150358, + "grad_norm": 1.4756824970245361, + "learning_rate": 0.00018500775995861356, + "loss": 0.9718, + "step": 1459 + }, + { + "epoch": 0.1509785165843696, + "grad_norm": 1.1938824653625488, + "learning_rate": 0.00018499741334712883, + "loss": 1.5162, + "step": 1460 + }, + { + "epoch": 0.1510819265272356, + "grad_norm": 2.0596578121185303, + "learning_rate": 0.00018498706673564407, + "loss": 1.5841, + "step": 1461 + }, + { + "epoch": 0.1511853364701016, + "grad_norm": 0.9470666646957397, + "learning_rate": 0.00018497672012415933, + "loss": 1.5052, + "step": 1462 + }, + { + "epoch": 0.1512887464129676, + "grad_norm": 1.30280339717865, + "learning_rate": 0.0001849663735126746, + "loss": 1.0616, + "step": 1463 + }, + { + "epoch": 0.1513921563558336, + "grad_norm": 1.7117403745651245, + "learning_rate": 0.00018495602690118986, + "loss": 1.6116, + "step": 1464 + }, + { + "epoch": 0.15149556629869962, + "grad_norm": 1.2089602947235107, + "learning_rate": 0.00018494568028970513, + "loss": 1.7289, + "step": 1465 + }, + { + "epoch": 0.15159897624156563, + "grad_norm": 2.146265983581543, + "learning_rate": 0.0001849353336782204, + "loss": 2.1897, + "step": 1466 + }, + { + "epoch": 0.15170238618443163, + "grad_norm": 1.0412538051605225, + "learning_rate": 0.00018492498706673563, + "loss": 1.3649, + "step": 1467 + }, + { + "epoch": 0.15180579612729764, + "grad_norm": 1.6170674562454224, + "learning_rate": 0.0001849146404552509, + "loss": 1.5992, + "step": 1468 + }, + { + "epoch": 0.15190920607016364, + "grad_norm": 2.0406131744384766, + "learning_rate": 0.00018490429384376616, + "loss": 1.7176, + "step": 1469 + }, + { + "epoch": 0.15201261601302965, + "grad_norm": 1.5884300470352173, + "learning_rate": 0.00018489394723228143, + "loss": 1.7171, + "step": 1470 + }, + { + "epoch": 0.15211602595589566, + "grad_norm": 1.7351653575897217, + "learning_rate": 0.0001848836006207967, + "loss": 2.1659, + "step": 1471 + }, + { + "epoch": 0.15221943589876166, + "grad_norm": 1.464855670928955, + "learning_rate": 0.00018487325400931196, + "loss": 1.8101, + "step": 1472 + }, + { + "epoch": 0.15232284584162767, + "grad_norm": 1.5113881826400757, + "learning_rate": 0.0001848629073978272, + "loss": 1.5864, + "step": 1473 + }, + { + "epoch": 0.15242625578449367, + "grad_norm": 1.0496164560317993, + "learning_rate": 0.00018485256078634246, + "loss": 1.3613, + "step": 1474 + }, + { + "epoch": 0.15252966572735968, + "grad_norm": 1.4251512289047241, + "learning_rate": 0.00018484221417485773, + "loss": 0.8508, + "step": 1475 + }, + { + "epoch": 0.1526330756702257, + "grad_norm": 1.001116394996643, + "learning_rate": 0.000184831867563373, + "loss": 1.3093, + "step": 1476 + }, + { + "epoch": 0.1527364856130917, + "grad_norm": 2.355113983154297, + "learning_rate": 0.00018482152095188826, + "loss": 1.7889, + "step": 1477 + }, + { + "epoch": 0.1528398955559577, + "grad_norm": 0.9703680276870728, + "learning_rate": 0.00018481117434040353, + "loss": 2.0384, + "step": 1478 + }, + { + "epoch": 0.1529433054988237, + "grad_norm": 1.182063102722168, + "learning_rate": 0.0001848008277289188, + "loss": 1.3248, + "step": 1479 + }, + { + "epoch": 0.1530467154416897, + "grad_norm": 2.8416783809661865, + "learning_rate": 0.00018479048111743406, + "loss": 2.1734, + "step": 1480 + }, + { + "epoch": 0.15315012538455572, + "grad_norm": 1.6548497676849365, + "learning_rate": 0.0001847801345059493, + "loss": 1.9716, + "step": 1481 + }, + { + "epoch": 0.15325353532742172, + "grad_norm": 1.6484485864639282, + "learning_rate": 0.00018476978789446456, + "loss": 0.5344, + "step": 1482 + }, + { + "epoch": 0.15335694527028773, + "grad_norm": 1.1464232206344604, + "learning_rate": 0.00018475944128297983, + "loss": 1.4424, + "step": 1483 + }, + { + "epoch": 0.15346035521315374, + "grad_norm": 1.261054515838623, + "learning_rate": 0.0001847490946714951, + "loss": 1.8385, + "step": 1484 + }, + { + "epoch": 0.15356376515601974, + "grad_norm": 2.0900843143463135, + "learning_rate": 0.00018473874806001036, + "loss": 2.3219, + "step": 1485 + }, + { + "epoch": 0.15366717509888575, + "grad_norm": 2.0020716190338135, + "learning_rate": 0.00018472840144852562, + "loss": 1.9556, + "step": 1486 + }, + { + "epoch": 0.15377058504175176, + "grad_norm": 0.9590654969215393, + "learning_rate": 0.0001847180548370409, + "loss": 2.0801, + "step": 1487 + }, + { + "epoch": 0.15387399498461776, + "grad_norm": 1.3336544036865234, + "learning_rate": 0.00018470770822555615, + "loss": 1.3413, + "step": 1488 + }, + { + "epoch": 0.15397740492748377, + "grad_norm": 1.676986813545227, + "learning_rate": 0.00018469736161407142, + "loss": 1.1669, + "step": 1489 + }, + { + "epoch": 0.15408081487034977, + "grad_norm": 1.1323169469833374, + "learning_rate": 0.00018468701500258666, + "loss": 1.5916, + "step": 1490 + }, + { + "epoch": 0.15418422481321578, + "grad_norm": 1.1335134506225586, + "learning_rate": 0.00018467666839110192, + "loss": 1.3671, + "step": 1491 + }, + { + "epoch": 0.1542876347560818, + "grad_norm": 1.59001886844635, + "learning_rate": 0.0001846663217796172, + "loss": 1.6888, + "step": 1492 + }, + { + "epoch": 0.1543910446989478, + "grad_norm": 0.6160492300987244, + "learning_rate": 0.00018465597516813246, + "loss": 1.6795, + "step": 1493 + }, + { + "epoch": 0.1544944546418138, + "grad_norm": 1.1683189868927002, + "learning_rate": 0.00018464562855664772, + "loss": 1.5374, + "step": 1494 + }, + { + "epoch": 0.1545978645846798, + "grad_norm": 1.4470138549804688, + "learning_rate": 0.000184635281945163, + "loss": 1.7904, + "step": 1495 + }, + { + "epoch": 0.1547012745275458, + "grad_norm": 1.729058027267456, + "learning_rate": 0.00018462493533367825, + "loss": 1.8934, + "step": 1496 + }, + { + "epoch": 0.15480468447041182, + "grad_norm": 1.35962975025177, + "learning_rate": 0.0001846145887221935, + "loss": 1.4041, + "step": 1497 + }, + { + "epoch": 0.15490809441327782, + "grad_norm": 3.6339924335479736, + "learning_rate": 0.00018460424211070876, + "loss": 1.7157, + "step": 1498 + }, + { + "epoch": 0.15501150435614383, + "grad_norm": 1.2886037826538086, + "learning_rate": 0.00018459389549922402, + "loss": 1.6121, + "step": 1499 + }, + { + "epoch": 0.15511491429900984, + "grad_norm": 1.1662096977233887, + "learning_rate": 0.0001845835488877393, + "loss": 1.1437, + "step": 1500 + }, + { + "epoch": 0.15521832424187584, + "grad_norm": 1.9506018161773682, + "learning_rate": 0.00018457320227625455, + "loss": 2.239, + "step": 1501 + }, + { + "epoch": 0.15532173418474185, + "grad_norm": 3.311880111694336, + "learning_rate": 0.00018456285566476982, + "loss": 1.9514, + "step": 1502 + }, + { + "epoch": 0.15542514412760786, + "grad_norm": 0.8730196952819824, + "learning_rate": 0.00018455250905328506, + "loss": 1.7644, + "step": 1503 + }, + { + "epoch": 0.15552855407047386, + "grad_norm": 1.311930775642395, + "learning_rate": 0.00018454216244180032, + "loss": 1.6487, + "step": 1504 + }, + { + "epoch": 0.1556319640133399, + "grad_norm": 1.7224714756011963, + "learning_rate": 0.0001845318158303156, + "loss": 1.1888, + "step": 1505 + }, + { + "epoch": 0.1557353739562059, + "grad_norm": 1.223656415939331, + "learning_rate": 0.00018452146921883085, + "loss": 1.8018, + "step": 1506 + }, + { + "epoch": 0.1558387838990719, + "grad_norm": 1.7321665287017822, + "learning_rate": 0.00018451112260734612, + "loss": 1.5763, + "step": 1507 + }, + { + "epoch": 0.15594219384193791, + "grad_norm": 1.9610759019851685, + "learning_rate": 0.00018450077599586138, + "loss": 2.2778, + "step": 1508 + }, + { + "epoch": 0.15604560378480392, + "grad_norm": 2.585524797439575, + "learning_rate": 0.00018449042938437662, + "loss": 1.9698, + "step": 1509 + }, + { + "epoch": 0.15614901372766993, + "grad_norm": 0.9229252338409424, + "learning_rate": 0.0001844800827728919, + "loss": 1.4744, + "step": 1510 + }, + { + "epoch": 0.15625242367053593, + "grad_norm": 1.3276410102844238, + "learning_rate": 0.00018446973616140715, + "loss": 1.2333, + "step": 1511 + }, + { + "epoch": 0.15635583361340194, + "grad_norm": 1.4182887077331543, + "learning_rate": 0.00018445938954992242, + "loss": 1.314, + "step": 1512 + }, + { + "epoch": 0.15645924355626795, + "grad_norm": 1.714599609375, + "learning_rate": 0.00018444904293843769, + "loss": 1.7817, + "step": 1513 + }, + { + "epoch": 0.15656265349913395, + "grad_norm": 3.0684444904327393, + "learning_rate": 0.00018443869632695295, + "loss": 1.7853, + "step": 1514 + }, + { + "epoch": 0.15666606344199996, + "grad_norm": 0.8052710890769958, + "learning_rate": 0.0001844283497154682, + "loss": 1.972, + "step": 1515 + }, + { + "epoch": 0.15676947338486596, + "grad_norm": 1.2668997049331665, + "learning_rate": 0.00018441800310398346, + "loss": 2.2012, + "step": 1516 + }, + { + "epoch": 0.15687288332773197, + "grad_norm": 2.7253763675689697, + "learning_rate": 0.00018440765649249872, + "loss": 2.2218, + "step": 1517 + }, + { + "epoch": 0.15697629327059798, + "grad_norm": 1.9879379272460938, + "learning_rate": 0.000184397309881014, + "loss": 1.9497, + "step": 1518 + }, + { + "epoch": 0.15707970321346398, + "grad_norm": 1.4316390752792358, + "learning_rate": 0.00018438696326952925, + "loss": 1.9449, + "step": 1519 + }, + { + "epoch": 0.15718311315633, + "grad_norm": 0.9651163816452026, + "learning_rate": 0.00018437661665804452, + "loss": 1.857, + "step": 1520 + }, + { + "epoch": 0.157286523099196, + "grad_norm": 2.6256558895111084, + "learning_rate": 0.00018436627004655976, + "loss": 1.7782, + "step": 1521 + }, + { + "epoch": 0.157389933042062, + "grad_norm": 1.571499228477478, + "learning_rate": 0.00018435592343507502, + "loss": 1.5711, + "step": 1522 + }, + { + "epoch": 0.157493342984928, + "grad_norm": 1.6658003330230713, + "learning_rate": 0.0001843455768235903, + "loss": 2.0324, + "step": 1523 + }, + { + "epoch": 0.15759675292779401, + "grad_norm": 1.4340386390686035, + "learning_rate": 0.00018433523021210555, + "loss": 1.9085, + "step": 1524 + }, + { + "epoch": 0.15770016287066002, + "grad_norm": 1.508362054824829, + "learning_rate": 0.00018432488360062082, + "loss": 1.6622, + "step": 1525 + }, + { + "epoch": 0.15780357281352603, + "grad_norm": 2.414968729019165, + "learning_rate": 0.00018431453698913608, + "loss": 1.7528, + "step": 1526 + }, + { + "epoch": 0.15790698275639203, + "grad_norm": 1.766019344329834, + "learning_rate": 0.00018430419037765132, + "loss": 1.4326, + "step": 1527 + }, + { + "epoch": 0.15801039269925804, + "grad_norm": 1.2096456289291382, + "learning_rate": 0.0001842938437661666, + "loss": 1.7347, + "step": 1528 + }, + { + "epoch": 0.15811380264212405, + "grad_norm": 0.8431787490844727, + "learning_rate": 0.00018428349715468185, + "loss": 1.3484, + "step": 1529 + }, + { + "epoch": 0.15821721258499005, + "grad_norm": 1.123042106628418, + "learning_rate": 0.00018427315054319712, + "loss": 1.6327, + "step": 1530 + }, + { + "epoch": 0.15832062252785606, + "grad_norm": 1.5339807271957397, + "learning_rate": 0.00018426280393171238, + "loss": 2.3622, + "step": 1531 + }, + { + "epoch": 0.15842403247072206, + "grad_norm": 1.0583611726760864, + "learning_rate": 0.00018425245732022765, + "loss": 1.2064, + "step": 1532 + }, + { + "epoch": 0.15852744241358807, + "grad_norm": 1.5183857679367065, + "learning_rate": 0.0001842421107087429, + "loss": 1.8776, + "step": 1533 + }, + { + "epoch": 0.15863085235645408, + "grad_norm": 1.4887058734893799, + "learning_rate": 0.00018423176409725815, + "loss": 1.762, + "step": 1534 + }, + { + "epoch": 0.15873426229932008, + "grad_norm": 1.5032808780670166, + "learning_rate": 0.00018422141748577342, + "loss": 1.5944, + "step": 1535 + }, + { + "epoch": 0.1588376722421861, + "grad_norm": 1.552122950553894, + "learning_rate": 0.00018421107087428869, + "loss": 2.2603, + "step": 1536 + }, + { + "epoch": 0.1589410821850521, + "grad_norm": 1.0997596979141235, + "learning_rate": 0.00018420072426280395, + "loss": 1.7179, + "step": 1537 + }, + { + "epoch": 0.1590444921279181, + "grad_norm": 2.1846163272857666, + "learning_rate": 0.00018419037765131922, + "loss": 1.8186, + "step": 1538 + }, + { + "epoch": 0.1591479020707841, + "grad_norm": 1.4159355163574219, + "learning_rate": 0.00018418003103983446, + "loss": 1.9963, + "step": 1539 + }, + { + "epoch": 0.15925131201365011, + "grad_norm": 1.7774709463119507, + "learning_rate": 0.00018416968442834972, + "loss": 2.39, + "step": 1540 + }, + { + "epoch": 0.15935472195651612, + "grad_norm": 1.3332961797714233, + "learning_rate": 0.00018415933781686499, + "loss": 1.4559, + "step": 1541 + }, + { + "epoch": 0.15945813189938213, + "grad_norm": 1.3671313524246216, + "learning_rate": 0.00018414899120538025, + "loss": 1.9792, + "step": 1542 + }, + { + "epoch": 0.15956154184224813, + "grad_norm": 0.9749310612678528, + "learning_rate": 0.00018413864459389552, + "loss": 1.8855, + "step": 1543 + }, + { + "epoch": 0.15966495178511414, + "grad_norm": 1.3629378080368042, + "learning_rate": 0.00018412829798241078, + "loss": 2.0187, + "step": 1544 + }, + { + "epoch": 0.15976836172798015, + "grad_norm": 2.236741304397583, + "learning_rate": 0.00018411795137092602, + "loss": 1.6343, + "step": 1545 + }, + { + "epoch": 0.15987177167084615, + "grad_norm": 1.9610300064086914, + "learning_rate": 0.0001841076047594413, + "loss": 1.6565, + "step": 1546 + }, + { + "epoch": 0.15997518161371216, + "grad_norm": 1.3427401781082153, + "learning_rate": 0.00018409725814795655, + "loss": 1.7781, + "step": 1547 + }, + { + "epoch": 0.16007859155657816, + "grad_norm": 1.5893080234527588, + "learning_rate": 0.00018408691153647182, + "loss": 1.6186, + "step": 1548 + }, + { + "epoch": 0.16018200149944417, + "grad_norm": 1.7143442630767822, + "learning_rate": 0.00018407656492498708, + "loss": 1.726, + "step": 1549 + }, + { + "epoch": 0.16028541144231018, + "grad_norm": 1.6941561698913574, + "learning_rate": 0.00018406621831350235, + "loss": 1.4204, + "step": 1550 + }, + { + "epoch": 0.16038882138517618, + "grad_norm": 2.1100339889526367, + "learning_rate": 0.0001840558717020176, + "loss": 2.1381, + "step": 1551 + }, + { + "epoch": 0.1604922313280422, + "grad_norm": 1.1537351608276367, + "learning_rate": 0.00018404552509053285, + "loss": 1.1759, + "step": 1552 + }, + { + "epoch": 0.1605956412709082, + "grad_norm": 1.5180721282958984, + "learning_rate": 0.00018403517847904812, + "loss": 1.6175, + "step": 1553 + }, + { + "epoch": 0.1606990512137742, + "grad_norm": 2.0618677139282227, + "learning_rate": 0.00018402483186756338, + "loss": 1.5854, + "step": 1554 + }, + { + "epoch": 0.1608024611566402, + "grad_norm": 1.102594017982483, + "learning_rate": 0.00018401448525607865, + "loss": 1.3946, + "step": 1555 + }, + { + "epoch": 0.16090587109950621, + "grad_norm": 1.2164007425308228, + "learning_rate": 0.00018400413864459392, + "loss": 1.449, + "step": 1556 + }, + { + "epoch": 0.16100928104237222, + "grad_norm": 1.3186836242675781, + "learning_rate": 0.00018399379203310915, + "loss": 1.5946, + "step": 1557 + }, + { + "epoch": 0.16111269098523823, + "grad_norm": 2.05452036857605, + "learning_rate": 0.00018398344542162442, + "loss": 1.7512, + "step": 1558 + }, + { + "epoch": 0.16121610092810423, + "grad_norm": 1.482884168624878, + "learning_rate": 0.00018397309881013969, + "loss": 1.5238, + "step": 1559 + }, + { + "epoch": 0.16131951087097024, + "grad_norm": 1.144921898841858, + "learning_rate": 0.00018396275219865495, + "loss": 1.598, + "step": 1560 + }, + { + "epoch": 0.16142292081383625, + "grad_norm": 0.9143129587173462, + "learning_rate": 0.00018395240558717022, + "loss": 1.3763, + "step": 1561 + }, + { + "epoch": 0.16152633075670225, + "grad_norm": 1.4671123027801514, + "learning_rate": 0.00018394205897568548, + "loss": 1.8797, + "step": 1562 + }, + { + "epoch": 0.16162974069956826, + "grad_norm": 0.8517533540725708, + "learning_rate": 0.00018393171236420072, + "loss": 1.8004, + "step": 1563 + }, + { + "epoch": 0.16173315064243426, + "grad_norm": 1.7459747791290283, + "learning_rate": 0.00018392136575271599, + "loss": 1.5426, + "step": 1564 + }, + { + "epoch": 0.16183656058530027, + "grad_norm": 1.2846120595932007, + "learning_rate": 0.00018391101914123125, + "loss": 1.2563, + "step": 1565 + }, + { + "epoch": 0.16193997052816628, + "grad_norm": 1.0815187692642212, + "learning_rate": 0.00018390067252974652, + "loss": 1.9735, + "step": 1566 + }, + { + "epoch": 0.16204338047103228, + "grad_norm": 1.4935349225997925, + "learning_rate": 0.00018389032591826178, + "loss": 1.9829, + "step": 1567 + }, + { + "epoch": 0.1621467904138983, + "grad_norm": 0.7376902103424072, + "learning_rate": 0.00018387997930677705, + "loss": 2.0292, + "step": 1568 + }, + { + "epoch": 0.1622502003567643, + "grad_norm": 0.8843541741371155, + "learning_rate": 0.0001838696326952923, + "loss": 1.3294, + "step": 1569 + }, + { + "epoch": 0.1623536102996303, + "grad_norm": 0.8407658934593201, + "learning_rate": 0.00018385928608380755, + "loss": 1.499, + "step": 1570 + }, + { + "epoch": 0.1624570202424963, + "grad_norm": 1.1754120588302612, + "learning_rate": 0.00018384893947232282, + "loss": 1.832, + "step": 1571 + }, + { + "epoch": 0.16256043018536231, + "grad_norm": 2.4817240238189697, + "learning_rate": 0.00018383859286083808, + "loss": 2.1077, + "step": 1572 + }, + { + "epoch": 0.16266384012822832, + "grad_norm": 1.4242192506790161, + "learning_rate": 0.00018382824624935335, + "loss": 1.9378, + "step": 1573 + }, + { + "epoch": 0.16276725007109433, + "grad_norm": 1.4254143238067627, + "learning_rate": 0.0001838178996378686, + "loss": 1.5059, + "step": 1574 + }, + { + "epoch": 0.16287066001396033, + "grad_norm": 1.4265427589416504, + "learning_rate": 0.00018380755302638385, + "loss": 1.9646, + "step": 1575 + }, + { + "epoch": 0.16297406995682634, + "grad_norm": 1.5007380247116089, + "learning_rate": 0.00018379720641489912, + "loss": 1.1932, + "step": 1576 + }, + { + "epoch": 0.16307747989969235, + "grad_norm": 1.8024120330810547, + "learning_rate": 0.00018378685980341438, + "loss": 1.8475, + "step": 1577 + }, + { + "epoch": 0.16318088984255835, + "grad_norm": 1.2183353900909424, + "learning_rate": 0.00018377651319192965, + "loss": 1.9732, + "step": 1578 + }, + { + "epoch": 0.16328429978542436, + "grad_norm": 1.5071296691894531, + "learning_rate": 0.00018376616658044492, + "loss": 2.4146, + "step": 1579 + }, + { + "epoch": 0.16338770972829036, + "grad_norm": 1.172339916229248, + "learning_rate": 0.00018375581996896015, + "loss": 1.4546, + "step": 1580 + }, + { + "epoch": 0.16349111967115637, + "grad_norm": 2.6015713214874268, + "learning_rate": 0.00018374547335747542, + "loss": 2.5725, + "step": 1581 + }, + { + "epoch": 0.16359452961402238, + "grad_norm": 1.1628609895706177, + "learning_rate": 0.00018373512674599068, + "loss": 1.3454, + "step": 1582 + }, + { + "epoch": 0.16369793955688838, + "grad_norm": 1.3187514543533325, + "learning_rate": 0.00018372478013450595, + "loss": 1.5801, + "step": 1583 + }, + { + "epoch": 0.1638013494997544, + "grad_norm": 1.2284473180770874, + "learning_rate": 0.00018371443352302122, + "loss": 1.7379, + "step": 1584 + }, + { + "epoch": 0.1639047594426204, + "grad_norm": 1.388885259628296, + "learning_rate": 0.00018370408691153648, + "loss": 1.3364, + "step": 1585 + }, + { + "epoch": 0.1640081693854864, + "grad_norm": 1.2158303260803223, + "learning_rate": 0.00018369374030005172, + "loss": 1.6606, + "step": 1586 + }, + { + "epoch": 0.1641115793283524, + "grad_norm": 1.6560564041137695, + "learning_rate": 0.00018368339368856699, + "loss": 1.8817, + "step": 1587 + }, + { + "epoch": 0.16421498927121841, + "grad_norm": 1.2663594484329224, + "learning_rate": 0.00018367304707708225, + "loss": 1.6547, + "step": 1588 + }, + { + "epoch": 0.16431839921408442, + "grad_norm": 1.790147304534912, + "learning_rate": 0.00018366270046559752, + "loss": 1.5503, + "step": 1589 + }, + { + "epoch": 0.16442180915695043, + "grad_norm": 1.1513880491256714, + "learning_rate": 0.00018365235385411278, + "loss": 1.6339, + "step": 1590 + }, + { + "epoch": 0.16452521909981646, + "grad_norm": 1.9630792140960693, + "learning_rate": 0.00018364200724262805, + "loss": 2.0134, + "step": 1591 + }, + { + "epoch": 0.16462862904268247, + "grad_norm": 1.1871670484542847, + "learning_rate": 0.00018363166063114329, + "loss": 1.5574, + "step": 1592 + }, + { + "epoch": 0.16473203898554847, + "grad_norm": 1.6941906213760376, + "learning_rate": 0.00018362131401965855, + "loss": 1.566, + "step": 1593 + }, + { + "epoch": 0.16483544892841448, + "grad_norm": 1.7763041257858276, + "learning_rate": 0.00018361096740817382, + "loss": 1.933, + "step": 1594 + }, + { + "epoch": 0.16493885887128049, + "grad_norm": 1.50661039352417, + "learning_rate": 0.00018360062079668908, + "loss": 1.3962, + "step": 1595 + }, + { + "epoch": 0.1650422688141465, + "grad_norm": 1.118114709854126, + "learning_rate": 0.00018359027418520435, + "loss": 1.6802, + "step": 1596 + }, + { + "epoch": 0.1651456787570125, + "grad_norm": 1.108808159828186, + "learning_rate": 0.00018357992757371961, + "loss": 1.2169, + "step": 1597 + }, + { + "epoch": 0.1652490886998785, + "grad_norm": 1.5891281366348267, + "learning_rate": 0.00018356958096223485, + "loss": 1.9502, + "step": 1598 + }, + { + "epoch": 0.1653524986427445, + "grad_norm": 0.7753489017486572, + "learning_rate": 0.00018355923435075012, + "loss": 1.577, + "step": 1599 + }, + { + "epoch": 0.16545590858561052, + "grad_norm": 1.6518728733062744, + "learning_rate": 0.00018354888773926538, + "loss": 1.9462, + "step": 1600 + }, + { + "epoch": 0.16555931852847652, + "grad_norm": 2.548527240753174, + "learning_rate": 0.00018353854112778065, + "loss": 1.8724, + "step": 1601 + }, + { + "epoch": 0.16566272847134253, + "grad_norm": 1.5010498762130737, + "learning_rate": 0.00018352819451629592, + "loss": 1.8285, + "step": 1602 + }, + { + "epoch": 0.16576613841420854, + "grad_norm": 1.1613878011703491, + "learning_rate": 0.00018351784790481118, + "loss": 2.121, + "step": 1603 + }, + { + "epoch": 0.16586954835707454, + "grad_norm": 1.3867918252944946, + "learning_rate": 0.00018350750129332645, + "loss": 1.1178, + "step": 1604 + }, + { + "epoch": 0.16597295829994055, + "grad_norm": 2.6823570728302, + "learning_rate": 0.0001834971546818417, + "loss": 2.086, + "step": 1605 + }, + { + "epoch": 0.16607636824280655, + "grad_norm": 1.298323392868042, + "learning_rate": 0.00018348680807035695, + "loss": 1.8797, + "step": 1606 + }, + { + "epoch": 0.16617977818567256, + "grad_norm": 1.6981557607650757, + "learning_rate": 0.00018347646145887222, + "loss": 1.7928, + "step": 1607 + }, + { + "epoch": 0.16628318812853857, + "grad_norm": 1.8808917999267578, + "learning_rate": 0.00018346611484738748, + "loss": 1.7693, + "step": 1608 + }, + { + "epoch": 0.16638659807140457, + "grad_norm": 1.246436595916748, + "learning_rate": 0.00018345576823590275, + "loss": 1.6371, + "step": 1609 + }, + { + "epoch": 0.16649000801427058, + "grad_norm": 1.5189836025238037, + "learning_rate": 0.000183445421624418, + "loss": 1.8056, + "step": 1610 + }, + { + "epoch": 0.16659341795713659, + "grad_norm": 1.1304303407669067, + "learning_rate": 0.00018343507501293328, + "loss": 1.7153, + "step": 1611 + }, + { + "epoch": 0.1666968279000026, + "grad_norm": 1.4506385326385498, + "learning_rate": 0.00018342472840144854, + "loss": 2.0781, + "step": 1612 + }, + { + "epoch": 0.1668002378428686, + "grad_norm": 2.3718414306640625, + "learning_rate": 0.0001834143817899638, + "loss": 2.0175, + "step": 1613 + }, + { + "epoch": 0.1669036477857346, + "grad_norm": 1.657726526260376, + "learning_rate": 0.00018340403517847905, + "loss": 1.8869, + "step": 1614 + }, + { + "epoch": 0.1670070577286006, + "grad_norm": 1.561111569404602, + "learning_rate": 0.0001833936885669943, + "loss": 1.9702, + "step": 1615 + }, + { + "epoch": 0.16711046767146662, + "grad_norm": 1.083419919013977, + "learning_rate": 0.00018338334195550958, + "loss": 1.4907, + "step": 1616 + }, + { + "epoch": 0.16721387761433262, + "grad_norm": 1.3286840915679932, + "learning_rate": 0.00018337299534402484, + "loss": 1.436, + "step": 1617 + }, + { + "epoch": 0.16731728755719863, + "grad_norm": 1.2173808813095093, + "learning_rate": 0.0001833626487325401, + "loss": 1.8981, + "step": 1618 + }, + { + "epoch": 0.16742069750006464, + "grad_norm": 1.6411387920379639, + "learning_rate": 0.00018335230212105538, + "loss": 1.9403, + "step": 1619 + }, + { + "epoch": 0.16752410744293064, + "grad_norm": 1.6110321283340454, + "learning_rate": 0.00018334195550957064, + "loss": 2.2896, + "step": 1620 + }, + { + "epoch": 0.16762751738579665, + "grad_norm": 2.330904245376587, + "learning_rate": 0.0001833316088980859, + "loss": 1.8244, + "step": 1621 + }, + { + "epoch": 0.16773092732866265, + "grad_norm": 0.8903806209564209, + "learning_rate": 0.00018332126228660115, + "loss": 1.7901, + "step": 1622 + }, + { + "epoch": 0.16783433727152866, + "grad_norm": 1.515162467956543, + "learning_rate": 0.0001833109156751164, + "loss": 1.739, + "step": 1623 + }, + { + "epoch": 0.16793774721439467, + "grad_norm": 1.395464539527893, + "learning_rate": 0.00018330056906363168, + "loss": 1.5509, + "step": 1624 + }, + { + "epoch": 0.16804115715726067, + "grad_norm": 1.7974237203598022, + "learning_rate": 0.00018329022245214694, + "loss": 1.8329, + "step": 1625 + }, + { + "epoch": 0.16814456710012668, + "grad_norm": 1.6956497430801392, + "learning_rate": 0.0001832798758406622, + "loss": 1.5083, + "step": 1626 + }, + { + "epoch": 0.16824797704299269, + "grad_norm": 2.477332592010498, + "learning_rate": 0.00018326952922917747, + "loss": 2.1227, + "step": 1627 + }, + { + "epoch": 0.1683513869858587, + "grad_norm": 1.1041282415390015, + "learning_rate": 0.0001832591826176927, + "loss": 2.3492, + "step": 1628 + }, + { + "epoch": 0.1684547969287247, + "grad_norm": 2.6464221477508545, + "learning_rate": 0.00018324883600620798, + "loss": 2.2591, + "step": 1629 + }, + { + "epoch": 0.1685582068715907, + "grad_norm": 1.3996168375015259, + "learning_rate": 0.00018323848939472324, + "loss": 0.9682, + "step": 1630 + }, + { + "epoch": 0.1686616168144567, + "grad_norm": 1.6426746845245361, + "learning_rate": 0.0001832281427832385, + "loss": 1.9295, + "step": 1631 + }, + { + "epoch": 0.16876502675732272, + "grad_norm": 1.202686071395874, + "learning_rate": 0.00018321779617175377, + "loss": 1.3054, + "step": 1632 + }, + { + "epoch": 0.16886843670018872, + "grad_norm": 1.330463171005249, + "learning_rate": 0.00018320744956026904, + "loss": 1.2563, + "step": 1633 + }, + { + "epoch": 0.16897184664305473, + "grad_norm": 1.0541915893554688, + "learning_rate": 0.00018319710294878428, + "loss": 1.7607, + "step": 1634 + }, + { + "epoch": 0.16907525658592074, + "grad_norm": 0.961632251739502, + "learning_rate": 0.00018318675633729954, + "loss": 1.555, + "step": 1635 + }, + { + "epoch": 0.16917866652878674, + "grad_norm": 1.850738286972046, + "learning_rate": 0.0001831764097258148, + "loss": 1.6219, + "step": 1636 + }, + { + "epoch": 0.16928207647165275, + "grad_norm": 1.7717586755752563, + "learning_rate": 0.00018316606311433007, + "loss": 2.3512, + "step": 1637 + }, + { + "epoch": 0.16938548641451875, + "grad_norm": 0.7864888310432434, + "learning_rate": 0.00018315571650284534, + "loss": 1.3779, + "step": 1638 + }, + { + "epoch": 0.16948889635738476, + "grad_norm": 1.4093188047409058, + "learning_rate": 0.0001831453698913606, + "loss": 1.5036, + "step": 1639 + }, + { + "epoch": 0.16959230630025077, + "grad_norm": 1.5282201766967773, + "learning_rate": 0.00018313502327987584, + "loss": 1.4186, + "step": 1640 + }, + { + "epoch": 0.16969571624311677, + "grad_norm": 1.574284553527832, + "learning_rate": 0.0001831246766683911, + "loss": 1.483, + "step": 1641 + }, + { + "epoch": 0.16979912618598278, + "grad_norm": 2.263618230819702, + "learning_rate": 0.00018311433005690638, + "loss": 2.1512, + "step": 1642 + }, + { + "epoch": 0.16990253612884879, + "grad_norm": 1.4073622226715088, + "learning_rate": 0.00018310398344542164, + "loss": 2.0069, + "step": 1643 + }, + { + "epoch": 0.1700059460717148, + "grad_norm": 2.1453514099121094, + "learning_rate": 0.0001830936368339369, + "loss": 2.1053, + "step": 1644 + }, + { + "epoch": 0.1701093560145808, + "grad_norm": 2.447608709335327, + "learning_rate": 0.00018308329022245217, + "loss": 1.8623, + "step": 1645 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 1.4036715030670166, + "learning_rate": 0.0001830729436109674, + "loss": 1.3103, + "step": 1646 + }, + { + "epoch": 0.1703161759003128, + "grad_norm": 1.4812512397766113, + "learning_rate": 0.00018306259699948268, + "loss": 1.4172, + "step": 1647 + }, + { + "epoch": 0.17041958584317882, + "grad_norm": 1.730870008468628, + "learning_rate": 0.00018305225038799794, + "loss": 1.5538, + "step": 1648 + }, + { + "epoch": 0.17052299578604482, + "grad_norm": 1.4224426746368408, + "learning_rate": 0.0001830419037765132, + "loss": 2.0328, + "step": 1649 + }, + { + "epoch": 0.17062640572891083, + "grad_norm": 1.2195708751678467, + "learning_rate": 0.00018303155716502847, + "loss": 1.5262, + "step": 1650 + }, + { + "epoch": 0.17072981567177684, + "grad_norm": 1.5029513835906982, + "learning_rate": 0.00018302121055354374, + "loss": 1.6855, + "step": 1651 + }, + { + "epoch": 0.17083322561464284, + "grad_norm": 1.6222515106201172, + "learning_rate": 0.00018301086394205898, + "loss": 1.6238, + "step": 1652 + }, + { + "epoch": 0.17093663555750885, + "grad_norm": 1.0669522285461426, + "learning_rate": 0.00018300051733057424, + "loss": 1.3903, + "step": 1653 + }, + { + "epoch": 0.17104004550037485, + "grad_norm": 2.3513834476470947, + "learning_rate": 0.0001829901707190895, + "loss": 1.9285, + "step": 1654 + }, + { + "epoch": 0.17114345544324086, + "grad_norm": 1.5260512828826904, + "learning_rate": 0.00018297982410760477, + "loss": 1.1363, + "step": 1655 + }, + { + "epoch": 0.17124686538610687, + "grad_norm": 2.7969038486480713, + "learning_rate": 0.00018296947749612004, + "loss": 2.721, + "step": 1656 + }, + { + "epoch": 0.17135027532897287, + "grad_norm": 0.9958081245422363, + "learning_rate": 0.0001829591308846353, + "loss": 1.535, + "step": 1657 + }, + { + "epoch": 0.17145368527183888, + "grad_norm": 2.141313314437866, + "learning_rate": 0.00018294878427315054, + "loss": 2.1867, + "step": 1658 + }, + { + "epoch": 0.17155709521470489, + "grad_norm": 1.7324568033218384, + "learning_rate": 0.0001829384376616658, + "loss": 2.0558, + "step": 1659 + }, + { + "epoch": 0.1716605051575709, + "grad_norm": 2.449826240539551, + "learning_rate": 0.00018292809105018107, + "loss": 1.8396, + "step": 1660 + }, + { + "epoch": 0.1717639151004369, + "grad_norm": 1.0371530055999756, + "learning_rate": 0.00018291774443869634, + "loss": 1.5168, + "step": 1661 + }, + { + "epoch": 0.1718673250433029, + "grad_norm": 2.267475128173828, + "learning_rate": 0.0001829073978272116, + "loss": 1.443, + "step": 1662 + }, + { + "epoch": 0.1719707349861689, + "grad_norm": 1.4712553024291992, + "learning_rate": 0.00018289705121572687, + "loss": 1.5452, + "step": 1663 + }, + { + "epoch": 0.17207414492903492, + "grad_norm": 2.187161445617676, + "learning_rate": 0.0001828867046042421, + "loss": 2.2943, + "step": 1664 + }, + { + "epoch": 0.17217755487190092, + "grad_norm": 1.8305400609970093, + "learning_rate": 0.00018287635799275738, + "loss": 1.5445, + "step": 1665 + }, + { + "epoch": 0.17228096481476693, + "grad_norm": 2.1179354190826416, + "learning_rate": 0.00018286601138127264, + "loss": 1.921, + "step": 1666 + }, + { + "epoch": 0.17238437475763294, + "grad_norm": 1.11764395236969, + "learning_rate": 0.0001828556647697879, + "loss": 1.6441, + "step": 1667 + }, + { + "epoch": 0.17248778470049894, + "grad_norm": 1.8130611181259155, + "learning_rate": 0.00018284531815830317, + "loss": 1.5732, + "step": 1668 + }, + { + "epoch": 0.17259119464336495, + "grad_norm": 1.2046823501586914, + "learning_rate": 0.00018283497154681844, + "loss": 1.3871, + "step": 1669 + }, + { + "epoch": 0.17269460458623095, + "grad_norm": 1.1269457340240479, + "learning_rate": 0.00018282462493533368, + "loss": 1.7667, + "step": 1670 + }, + { + "epoch": 0.17279801452909696, + "grad_norm": 0.9750357270240784, + "learning_rate": 0.00018281427832384894, + "loss": 1.4863, + "step": 1671 + }, + { + "epoch": 0.17290142447196297, + "grad_norm": 2.1386210918426514, + "learning_rate": 0.0001828039317123642, + "loss": 1.6579, + "step": 1672 + }, + { + "epoch": 0.17300483441482897, + "grad_norm": 1.1148815155029297, + "learning_rate": 0.00018279358510087947, + "loss": 0.9662, + "step": 1673 + }, + { + "epoch": 0.17310824435769498, + "grad_norm": 1.1114896535873413, + "learning_rate": 0.00018278323848939474, + "loss": 1.2727, + "step": 1674 + }, + { + "epoch": 0.17321165430056099, + "grad_norm": 1.411158561706543, + "learning_rate": 0.00018277289187791, + "loss": 1.8358, + "step": 1675 + }, + { + "epoch": 0.173315064243427, + "grad_norm": 1.026279091835022, + "learning_rate": 0.00018276254526642524, + "loss": 1.4396, + "step": 1676 + }, + { + "epoch": 0.17341847418629303, + "grad_norm": 1.8113138675689697, + "learning_rate": 0.0001827521986549405, + "loss": 1.0483, + "step": 1677 + }, + { + "epoch": 0.17352188412915903, + "grad_norm": 1.5766301155090332, + "learning_rate": 0.00018274185204345577, + "loss": 1.8646, + "step": 1678 + }, + { + "epoch": 0.17362529407202504, + "grad_norm": 1.361986517906189, + "learning_rate": 0.00018273150543197104, + "loss": 1.3236, + "step": 1679 + }, + { + "epoch": 0.17372870401489104, + "grad_norm": 2.4059231281280518, + "learning_rate": 0.0001827211588204863, + "loss": 2.188, + "step": 1680 + }, + { + "epoch": 0.17383211395775705, + "grad_norm": 5.3925886154174805, + "learning_rate": 0.00018271081220900157, + "loss": 1.6441, + "step": 1681 + }, + { + "epoch": 0.17393552390062306, + "grad_norm": 1.831649661064148, + "learning_rate": 0.0001827004655975168, + "loss": 1.9166, + "step": 1682 + }, + { + "epoch": 0.17403893384348906, + "grad_norm": 1.7696832418441772, + "learning_rate": 0.00018269011898603207, + "loss": 1.6002, + "step": 1683 + }, + { + "epoch": 0.17414234378635507, + "grad_norm": 1.0492238998413086, + "learning_rate": 0.00018267977237454734, + "loss": 1.6034, + "step": 1684 + }, + { + "epoch": 0.17424575372922108, + "grad_norm": 2.082101821899414, + "learning_rate": 0.0001826694257630626, + "loss": 1.5458, + "step": 1685 + }, + { + "epoch": 0.17434916367208708, + "grad_norm": 1.129961609840393, + "learning_rate": 0.00018265907915157787, + "loss": 0.9186, + "step": 1686 + }, + { + "epoch": 0.1744525736149531, + "grad_norm": 1.0595530271530151, + "learning_rate": 0.00018264873254009314, + "loss": 1.5479, + "step": 1687 + }, + { + "epoch": 0.1745559835578191, + "grad_norm": 0.6668352484703064, + "learning_rate": 0.00018263838592860837, + "loss": 1.4539, + "step": 1688 + }, + { + "epoch": 0.1746593935006851, + "grad_norm": 1.6207376718521118, + "learning_rate": 0.00018262803931712364, + "loss": 1.7107, + "step": 1689 + }, + { + "epoch": 0.1747628034435511, + "grad_norm": 1.0871083736419678, + "learning_rate": 0.0001826176927056389, + "loss": 1.9602, + "step": 1690 + }, + { + "epoch": 0.1748662133864171, + "grad_norm": 1.131425380706787, + "learning_rate": 0.00018260734609415417, + "loss": 1.2928, + "step": 1691 + }, + { + "epoch": 0.17496962332928312, + "grad_norm": 1.9855254888534546, + "learning_rate": 0.00018259699948266944, + "loss": 1.5173, + "step": 1692 + }, + { + "epoch": 0.17507303327214913, + "grad_norm": 1.17534601688385, + "learning_rate": 0.0001825866528711847, + "loss": 1.185, + "step": 1693 + }, + { + "epoch": 0.17517644321501513, + "grad_norm": 1.2976510524749756, + "learning_rate": 0.00018257630625969994, + "loss": 1.7015, + "step": 1694 + }, + { + "epoch": 0.17527985315788114, + "grad_norm": 1.7299031019210815, + "learning_rate": 0.0001825659596482152, + "loss": 1.9024, + "step": 1695 + }, + { + "epoch": 0.17538326310074714, + "grad_norm": 1.3706202507019043, + "learning_rate": 0.00018255561303673047, + "loss": 0.9326, + "step": 1696 + }, + { + "epoch": 0.17548667304361315, + "grad_norm": 1.2045725584030151, + "learning_rate": 0.00018254526642524574, + "loss": 1.2952, + "step": 1697 + }, + { + "epoch": 0.17559008298647916, + "grad_norm": 1.2185454368591309, + "learning_rate": 0.000182534919813761, + "loss": 1.9565, + "step": 1698 + }, + { + "epoch": 0.17569349292934516, + "grad_norm": 2.578016996383667, + "learning_rate": 0.00018252457320227627, + "loss": 1.8957, + "step": 1699 + }, + { + "epoch": 0.17579690287221117, + "grad_norm": 1.152272343635559, + "learning_rate": 0.0001825142265907915, + "loss": 1.6591, + "step": 1700 + }, + { + "epoch": 0.17590031281507718, + "grad_norm": 2.588287591934204, + "learning_rate": 0.00018250387997930677, + "loss": 1.9043, + "step": 1701 + }, + { + "epoch": 0.17600372275794318, + "grad_norm": 1.1727057695388794, + "learning_rate": 0.00018249353336782204, + "loss": 1.464, + "step": 1702 + }, + { + "epoch": 0.1761071327008092, + "grad_norm": 1.9140554666519165, + "learning_rate": 0.0001824831867563373, + "loss": 2.1905, + "step": 1703 + }, + { + "epoch": 0.1762105426436752, + "grad_norm": 1.4966853857040405, + "learning_rate": 0.00018247284014485257, + "loss": 1.7903, + "step": 1704 + }, + { + "epoch": 0.1763139525865412, + "grad_norm": 1.9793884754180908, + "learning_rate": 0.00018246249353336784, + "loss": 0.965, + "step": 1705 + }, + { + "epoch": 0.1764173625294072, + "grad_norm": 1.5424928665161133, + "learning_rate": 0.00018245214692188307, + "loss": 2.1276, + "step": 1706 + }, + { + "epoch": 0.1765207724722732, + "grad_norm": 2.2728068828582764, + "learning_rate": 0.00018244180031039834, + "loss": 2.4757, + "step": 1707 + }, + { + "epoch": 0.17662418241513922, + "grad_norm": 1.7877531051635742, + "learning_rate": 0.0001824314536989136, + "loss": 1.708, + "step": 1708 + }, + { + "epoch": 0.17672759235800523, + "grad_norm": 1.9923186302185059, + "learning_rate": 0.00018242110708742887, + "loss": 1.5047, + "step": 1709 + }, + { + "epoch": 0.17683100230087123, + "grad_norm": 2.096897840499878, + "learning_rate": 0.00018241076047594414, + "loss": 1.7008, + "step": 1710 + }, + { + "epoch": 0.17693441224373724, + "grad_norm": 0.9570733904838562, + "learning_rate": 0.0001824004138644594, + "loss": 1.6933, + "step": 1711 + }, + { + "epoch": 0.17703782218660324, + "grad_norm": 1.9453891515731812, + "learning_rate": 0.00018239006725297464, + "loss": 1.769, + "step": 1712 + }, + { + "epoch": 0.17714123212946925, + "grad_norm": 0.9500687718391418, + "learning_rate": 0.0001823797206414899, + "loss": 1.6701, + "step": 1713 + }, + { + "epoch": 0.17724464207233526, + "grad_norm": 1.359328031539917, + "learning_rate": 0.00018236937403000517, + "loss": 1.742, + "step": 1714 + }, + { + "epoch": 0.17734805201520126, + "grad_norm": 1.8505967855453491, + "learning_rate": 0.00018235902741852044, + "loss": 1.4796, + "step": 1715 + }, + { + "epoch": 0.17745146195806727, + "grad_norm": 2.4523441791534424, + "learning_rate": 0.0001823486808070357, + "loss": 2.1651, + "step": 1716 + }, + { + "epoch": 0.17755487190093328, + "grad_norm": 1.6013096570968628, + "learning_rate": 0.00018233833419555097, + "loss": 1.6705, + "step": 1717 + }, + { + "epoch": 0.17765828184379928, + "grad_norm": 1.9895267486572266, + "learning_rate": 0.0001823279875840662, + "loss": 1.3338, + "step": 1718 + }, + { + "epoch": 0.1777616917866653, + "grad_norm": 1.7524160146713257, + "learning_rate": 0.00018231764097258147, + "loss": 2.2723, + "step": 1719 + }, + { + "epoch": 0.1778651017295313, + "grad_norm": 2.6437292098999023, + "learning_rate": 0.00018230729436109674, + "loss": 1.9029, + "step": 1720 + }, + { + "epoch": 0.1779685116723973, + "grad_norm": 1.6161471605300903, + "learning_rate": 0.000182296947749612, + "loss": 1.7033, + "step": 1721 + }, + { + "epoch": 0.1780719216152633, + "grad_norm": 1.4976915121078491, + "learning_rate": 0.00018228660113812727, + "loss": 1.7068, + "step": 1722 + }, + { + "epoch": 0.1781753315581293, + "grad_norm": 1.0355263948440552, + "learning_rate": 0.00018227625452664253, + "loss": 1.9508, + "step": 1723 + }, + { + "epoch": 0.17827874150099532, + "grad_norm": 0.9397589564323425, + "learning_rate": 0.00018226590791515777, + "loss": 1.8944, + "step": 1724 + }, + { + "epoch": 0.17838215144386133, + "grad_norm": 1.7493468523025513, + "learning_rate": 0.00018225556130367304, + "loss": 1.7938, + "step": 1725 + }, + { + "epoch": 0.17848556138672733, + "grad_norm": 2.3740999698638916, + "learning_rate": 0.0001822452146921883, + "loss": 1.9897, + "step": 1726 + }, + { + "epoch": 0.17858897132959334, + "grad_norm": 1.8972642421722412, + "learning_rate": 0.00018223486808070357, + "loss": 2.481, + "step": 1727 + }, + { + "epoch": 0.17869238127245934, + "grad_norm": 1.6285275220870972, + "learning_rate": 0.00018222452146921883, + "loss": 1.9144, + "step": 1728 + }, + { + "epoch": 0.17879579121532535, + "grad_norm": 3.3694674968719482, + "learning_rate": 0.0001822141748577341, + "loss": 2.0067, + "step": 1729 + }, + { + "epoch": 0.17889920115819136, + "grad_norm": 1.9680372476577759, + "learning_rate": 0.00018220382824624937, + "loss": 2.1982, + "step": 1730 + }, + { + "epoch": 0.17900261110105736, + "grad_norm": 1.4508297443389893, + "learning_rate": 0.0001821934816347646, + "loss": 1.5265, + "step": 1731 + }, + { + "epoch": 0.17910602104392337, + "grad_norm": 1.2594949007034302, + "learning_rate": 0.00018218313502327987, + "loss": 1.5713, + "step": 1732 + }, + { + "epoch": 0.17920943098678938, + "grad_norm": 1.9360672235488892, + "learning_rate": 0.00018217278841179514, + "loss": 1.9253, + "step": 1733 + }, + { + "epoch": 0.17931284092965538, + "grad_norm": 3.0524022579193115, + "learning_rate": 0.0001821624418003104, + "loss": 2.1621, + "step": 1734 + }, + { + "epoch": 0.1794162508725214, + "grad_norm": 1.465190052986145, + "learning_rate": 0.00018215209518882567, + "loss": 1.3045, + "step": 1735 + }, + { + "epoch": 0.1795196608153874, + "grad_norm": 1.1517199277877808, + "learning_rate": 0.00018214174857734093, + "loss": 1.381, + "step": 1736 + }, + { + "epoch": 0.1796230707582534, + "grad_norm": 3.004659414291382, + "learning_rate": 0.0001821314019658562, + "loss": 2.0144, + "step": 1737 + }, + { + "epoch": 0.1797264807011194, + "grad_norm": 1.5032134056091309, + "learning_rate": 0.00018212105535437146, + "loss": 1.3108, + "step": 1738 + }, + { + "epoch": 0.1798298906439854, + "grad_norm": 1.3385944366455078, + "learning_rate": 0.0001821107087428867, + "loss": 1.4403, + "step": 1739 + }, + { + "epoch": 0.17993330058685142, + "grad_norm": 1.471063494682312, + "learning_rate": 0.00018210036213140197, + "loss": 1.6935, + "step": 1740 + }, + { + "epoch": 0.18003671052971743, + "grad_norm": 2.064852714538574, + "learning_rate": 0.00018209001551991723, + "loss": 1.8696, + "step": 1741 + }, + { + "epoch": 0.18014012047258343, + "grad_norm": 2.8570311069488525, + "learning_rate": 0.0001820796689084325, + "loss": 1.6901, + "step": 1742 + }, + { + "epoch": 0.18024353041544944, + "grad_norm": 1.3590514659881592, + "learning_rate": 0.00018206932229694776, + "loss": 1.61, + "step": 1743 + }, + { + "epoch": 0.18034694035831544, + "grad_norm": 1.277525544166565, + "learning_rate": 0.00018205897568546303, + "loss": 1.8969, + "step": 1744 + }, + { + "epoch": 0.18045035030118145, + "grad_norm": 1.2931078672409058, + "learning_rate": 0.0001820486290739783, + "loss": 1.4851, + "step": 1745 + }, + { + "epoch": 0.18055376024404746, + "grad_norm": 1.664851188659668, + "learning_rate": 0.00018203828246249356, + "loss": 1.8105, + "step": 1746 + }, + { + "epoch": 0.18065717018691346, + "grad_norm": 1.2162920236587524, + "learning_rate": 0.0001820279358510088, + "loss": 1.3794, + "step": 1747 + }, + { + "epoch": 0.18076058012977947, + "grad_norm": 0.8630446791648865, + "learning_rate": 0.00018201758923952407, + "loss": 1.7145, + "step": 1748 + }, + { + "epoch": 0.18086399007264548, + "grad_norm": 2.054414749145508, + "learning_rate": 0.00018200724262803933, + "loss": 1.4697, + "step": 1749 + }, + { + "epoch": 0.18096740001551148, + "grad_norm": 1.0530321598052979, + "learning_rate": 0.0001819968960165546, + "loss": 1.249, + "step": 1750 + }, + { + "epoch": 0.1810708099583775, + "grad_norm": 1.1370590925216675, + "learning_rate": 0.00018198654940506986, + "loss": 1.5279, + "step": 1751 + }, + { + "epoch": 0.1811742199012435, + "grad_norm": 1.3252718448638916, + "learning_rate": 0.00018197620279358513, + "loss": 1.8875, + "step": 1752 + }, + { + "epoch": 0.1812776298441095, + "grad_norm": 1.1777899265289307, + "learning_rate": 0.0001819658561821004, + "loss": 1.4416, + "step": 1753 + }, + { + "epoch": 0.1813810397869755, + "grad_norm": 1.79104745388031, + "learning_rate": 0.00018195550957061563, + "loss": 1.3529, + "step": 1754 + }, + { + "epoch": 0.1814844497298415, + "grad_norm": 1.3128929138183594, + "learning_rate": 0.0001819451629591309, + "loss": 1.2693, + "step": 1755 + }, + { + "epoch": 0.18158785967270752, + "grad_norm": 1.3561937808990479, + "learning_rate": 0.00018193481634764616, + "loss": 1.1964, + "step": 1756 + }, + { + "epoch": 0.18169126961557353, + "grad_norm": 3.990429162979126, + "learning_rate": 0.00018192446973616143, + "loss": 1.7025, + "step": 1757 + }, + { + "epoch": 0.18179467955843953, + "grad_norm": 2.079883575439453, + "learning_rate": 0.0001819141231246767, + "loss": 2.1624, + "step": 1758 + }, + { + "epoch": 0.18189808950130554, + "grad_norm": 3.3075170516967773, + "learning_rate": 0.00018190377651319196, + "loss": 2.1052, + "step": 1759 + }, + { + "epoch": 0.18200149944417154, + "grad_norm": 1.6594444513320923, + "learning_rate": 0.0001818934299017072, + "loss": 1.8497, + "step": 1760 + }, + { + "epoch": 0.18210490938703755, + "grad_norm": 1.8769367933273315, + "learning_rate": 0.00018188308329022246, + "loss": 1.6261, + "step": 1761 + }, + { + "epoch": 0.18220831932990356, + "grad_norm": 1.0080485343933105, + "learning_rate": 0.00018187273667873773, + "loss": 1.7013, + "step": 1762 + }, + { + "epoch": 0.1823117292727696, + "grad_norm": 1.650251030921936, + "learning_rate": 0.000181862390067253, + "loss": 1.2848, + "step": 1763 + }, + { + "epoch": 0.1824151392156356, + "grad_norm": 1.2735741138458252, + "learning_rate": 0.00018185204345576826, + "loss": 1.7479, + "step": 1764 + }, + { + "epoch": 0.1825185491585016, + "grad_norm": 1.7841858863830566, + "learning_rate": 0.00018184169684428353, + "loss": 2.2005, + "step": 1765 + }, + { + "epoch": 0.1826219591013676, + "grad_norm": 1.6187489032745361, + "learning_rate": 0.00018183135023279876, + "loss": 2.1096, + "step": 1766 + }, + { + "epoch": 0.18272536904423362, + "grad_norm": 1.2000303268432617, + "learning_rate": 0.00018182100362131403, + "loss": 1.3292, + "step": 1767 + }, + { + "epoch": 0.18282877898709962, + "grad_norm": 1.7545541524887085, + "learning_rate": 0.0001818106570098293, + "loss": 1.621, + "step": 1768 + }, + { + "epoch": 0.18293218892996563, + "grad_norm": 1.455883502960205, + "learning_rate": 0.00018180031039834456, + "loss": 1.5392, + "step": 1769 + }, + { + "epoch": 0.18303559887283163, + "grad_norm": 1.1145455837249756, + "learning_rate": 0.00018178996378685983, + "loss": 1.4463, + "step": 1770 + }, + { + "epoch": 0.18313900881569764, + "grad_norm": 3.161123514175415, + "learning_rate": 0.0001817796171753751, + "loss": 1.4145, + "step": 1771 + }, + { + "epoch": 0.18324241875856365, + "grad_norm": 1.0690654516220093, + "learning_rate": 0.00018176927056389033, + "loss": 1.2464, + "step": 1772 + }, + { + "epoch": 0.18334582870142965, + "grad_norm": 2.340733528137207, + "learning_rate": 0.0001817589239524056, + "loss": 1.3952, + "step": 1773 + }, + { + "epoch": 0.18344923864429566, + "grad_norm": 1.527890920639038, + "learning_rate": 0.00018174857734092086, + "loss": 1.7334, + "step": 1774 + }, + { + "epoch": 0.18355264858716167, + "grad_norm": 2.002925157546997, + "learning_rate": 0.00018173823072943613, + "loss": 2.08, + "step": 1775 + }, + { + "epoch": 0.18365605853002767, + "grad_norm": 1.2468239068984985, + "learning_rate": 0.0001817278841179514, + "loss": 1.8638, + "step": 1776 + }, + { + "epoch": 0.18375946847289368, + "grad_norm": 1.9202920198440552, + "learning_rate": 0.00018171753750646666, + "loss": 2.5098, + "step": 1777 + }, + { + "epoch": 0.18386287841575968, + "grad_norm": 1.4701899290084839, + "learning_rate": 0.0001817071908949819, + "loss": 1.783, + "step": 1778 + }, + { + "epoch": 0.1839662883586257, + "grad_norm": 1.1845182180404663, + "learning_rate": 0.00018169684428349716, + "loss": 2.0948, + "step": 1779 + }, + { + "epoch": 0.1840696983014917, + "grad_norm": 2.3378961086273193, + "learning_rate": 0.00018168649767201243, + "loss": 1.5978, + "step": 1780 + }, + { + "epoch": 0.1841731082443577, + "grad_norm": 1.632500171661377, + "learning_rate": 0.0001816761510605277, + "loss": 1.7837, + "step": 1781 + }, + { + "epoch": 0.1842765181872237, + "grad_norm": 1.8515764474868774, + "learning_rate": 0.00018166580444904296, + "loss": 2.0129, + "step": 1782 + }, + { + "epoch": 0.18437992813008972, + "grad_norm": 1.1611318588256836, + "learning_rate": 0.00018165545783755822, + "loss": 2.0427, + "step": 1783 + }, + { + "epoch": 0.18448333807295572, + "grad_norm": 0.8150402307510376, + "learning_rate": 0.00018164511122607346, + "loss": 1.9761, + "step": 1784 + }, + { + "epoch": 0.18458674801582173, + "grad_norm": 2.1012911796569824, + "learning_rate": 0.00018163476461458873, + "loss": 1.3257, + "step": 1785 + }, + { + "epoch": 0.18469015795868773, + "grad_norm": 1.231393575668335, + "learning_rate": 0.000181624418003104, + "loss": 1.8023, + "step": 1786 + }, + { + "epoch": 0.18479356790155374, + "grad_norm": 1.4408806562423706, + "learning_rate": 0.00018161407139161926, + "loss": 1.6601, + "step": 1787 + }, + { + "epoch": 0.18489697784441975, + "grad_norm": 1.223767876625061, + "learning_rate": 0.00018160372478013453, + "loss": 1.972, + "step": 1788 + }, + { + "epoch": 0.18500038778728575, + "grad_norm": 1.9878640174865723, + "learning_rate": 0.0001815933781686498, + "loss": 2.1535, + "step": 1789 + }, + { + "epoch": 0.18510379773015176, + "grad_norm": 1.076229214668274, + "learning_rate": 0.00018158303155716503, + "loss": 1.533, + "step": 1790 + }, + { + "epoch": 0.18520720767301777, + "grad_norm": 2.2140488624572754, + "learning_rate": 0.0001815726849456803, + "loss": 1.9387, + "step": 1791 + }, + { + "epoch": 0.18531061761588377, + "grad_norm": 0.9972267150878906, + "learning_rate": 0.00018156233833419556, + "loss": 1.4952, + "step": 1792 + }, + { + "epoch": 0.18541402755874978, + "grad_norm": 2.2801876068115234, + "learning_rate": 0.00018155199172271083, + "loss": 1.9597, + "step": 1793 + }, + { + "epoch": 0.18551743750161578, + "grad_norm": 1.1939889192581177, + "learning_rate": 0.0001815416451112261, + "loss": 1.3651, + "step": 1794 + }, + { + "epoch": 0.1856208474444818, + "grad_norm": 1.5104225873947144, + "learning_rate": 0.00018153129849974136, + "loss": 1.9471, + "step": 1795 + }, + { + "epoch": 0.1857242573873478, + "grad_norm": 1.1902467012405396, + "learning_rate": 0.0001815209518882566, + "loss": 1.4968, + "step": 1796 + }, + { + "epoch": 0.1858276673302138, + "grad_norm": 1.8906666040420532, + "learning_rate": 0.00018151060527677186, + "loss": 1.0471, + "step": 1797 + }, + { + "epoch": 0.1859310772730798, + "grad_norm": 1.378791093826294, + "learning_rate": 0.00018150025866528713, + "loss": 1.6083, + "step": 1798 + }, + { + "epoch": 0.18603448721594582, + "grad_norm": 1.4541735649108887, + "learning_rate": 0.0001814899120538024, + "loss": 2.0237, + "step": 1799 + }, + { + "epoch": 0.18613789715881182, + "grad_norm": 1.8095440864562988, + "learning_rate": 0.00018147956544231766, + "loss": 1.9967, + "step": 1800 + }, + { + "epoch": 0.18624130710167783, + "grad_norm": 1.2104191780090332, + "learning_rate": 0.00018146921883083292, + "loss": 1.5496, + "step": 1801 + }, + { + "epoch": 0.18634471704454383, + "grad_norm": 2.2273075580596924, + "learning_rate": 0.00018145887221934816, + "loss": 2.3056, + "step": 1802 + }, + { + "epoch": 0.18644812698740984, + "grad_norm": 2.3855462074279785, + "learning_rate": 0.00018144852560786343, + "loss": 2.286, + "step": 1803 + }, + { + "epoch": 0.18655153693027585, + "grad_norm": 1.2031198740005493, + "learning_rate": 0.0001814381789963787, + "loss": 2.1857, + "step": 1804 + }, + { + "epoch": 0.18665494687314185, + "grad_norm": 0.9996086955070496, + "learning_rate": 0.00018142783238489396, + "loss": 2.0501, + "step": 1805 + }, + { + "epoch": 0.18675835681600786, + "grad_norm": 0.9292736053466797, + "learning_rate": 0.00018141748577340922, + "loss": 1.7131, + "step": 1806 + }, + { + "epoch": 0.18686176675887387, + "grad_norm": 1.5405597686767578, + "learning_rate": 0.0001814071391619245, + "loss": 2.1909, + "step": 1807 + }, + { + "epoch": 0.18696517670173987, + "grad_norm": 2.568983793258667, + "learning_rate": 0.00018139679255043973, + "loss": 1.8599, + "step": 1808 + }, + { + "epoch": 0.18706858664460588, + "grad_norm": 1.7339938879013062, + "learning_rate": 0.000181386445938955, + "loss": 1.1257, + "step": 1809 + }, + { + "epoch": 0.18717199658747188, + "grad_norm": 1.8022384643554688, + "learning_rate": 0.00018137609932747026, + "loss": 1.4831, + "step": 1810 + }, + { + "epoch": 0.1872754065303379, + "grad_norm": 0.9867877960205078, + "learning_rate": 0.00018136575271598553, + "loss": 1.0952, + "step": 1811 + }, + { + "epoch": 0.1873788164732039, + "grad_norm": 1.9051878452301025, + "learning_rate": 0.0001813554061045008, + "loss": 1.9558, + "step": 1812 + }, + { + "epoch": 0.1874822264160699, + "grad_norm": 1.5351530313491821, + "learning_rate": 0.00018134505949301606, + "loss": 1.4955, + "step": 1813 + }, + { + "epoch": 0.1875856363589359, + "grad_norm": 1.1694519519805908, + "learning_rate": 0.0001813347128815313, + "loss": 1.533, + "step": 1814 + }, + { + "epoch": 0.18768904630180192, + "grad_norm": 0.9428067803382874, + "learning_rate": 0.00018132436627004656, + "loss": 1.5396, + "step": 1815 + }, + { + "epoch": 0.18779245624466792, + "grad_norm": 1.915429949760437, + "learning_rate": 0.00018131401965856183, + "loss": 1.621, + "step": 1816 + }, + { + "epoch": 0.18789586618753393, + "grad_norm": 2.1339144706726074, + "learning_rate": 0.0001813036730470771, + "loss": 2.2716, + "step": 1817 + }, + { + "epoch": 0.18799927613039993, + "grad_norm": 1.354615569114685, + "learning_rate": 0.00018129332643559236, + "loss": 1.3368, + "step": 1818 + }, + { + "epoch": 0.18810268607326594, + "grad_norm": 1.710594892501831, + "learning_rate": 0.00018128297982410762, + "loss": 1.6227, + "step": 1819 + }, + { + "epoch": 0.18820609601613195, + "grad_norm": 0.7719640135765076, + "learning_rate": 0.00018127263321262286, + "loss": 1.7036, + "step": 1820 + }, + { + "epoch": 0.18830950595899795, + "grad_norm": 1.9698446989059448, + "learning_rate": 0.00018126228660113813, + "loss": 1.8552, + "step": 1821 + }, + { + "epoch": 0.18841291590186396, + "grad_norm": 1.904042363166809, + "learning_rate": 0.0001812519399896534, + "loss": 1.9086, + "step": 1822 + }, + { + "epoch": 0.18851632584472997, + "grad_norm": 1.8710474967956543, + "learning_rate": 0.00018124159337816866, + "loss": 1.9492, + "step": 1823 + }, + { + "epoch": 0.18861973578759597, + "grad_norm": 1.190084457397461, + "learning_rate": 0.00018123124676668392, + "loss": 1.8335, + "step": 1824 + }, + { + "epoch": 0.18872314573046198, + "grad_norm": 4.177672386169434, + "learning_rate": 0.0001812209001551992, + "loss": 2.4607, + "step": 1825 + }, + { + "epoch": 0.18882655567332798, + "grad_norm": 3.2717580795288086, + "learning_rate": 0.00018121055354371443, + "loss": 1.9554, + "step": 1826 + }, + { + "epoch": 0.188929965616194, + "grad_norm": 1.0546072721481323, + "learning_rate": 0.0001812002069322297, + "loss": 1.3306, + "step": 1827 + }, + { + "epoch": 0.18903337555906, + "grad_norm": 1.9088751077651978, + "learning_rate": 0.00018118986032074496, + "loss": 2.0673, + "step": 1828 + }, + { + "epoch": 0.189136785501926, + "grad_norm": 1.713178277015686, + "learning_rate": 0.00018117951370926022, + "loss": 1.864, + "step": 1829 + }, + { + "epoch": 0.189240195444792, + "grad_norm": 1.981321930885315, + "learning_rate": 0.0001811691670977755, + "loss": 1.7863, + "step": 1830 + }, + { + "epoch": 0.18934360538765801, + "grad_norm": 1.5771139860153198, + "learning_rate": 0.00018115882048629076, + "loss": 2.094, + "step": 1831 + }, + { + "epoch": 0.18944701533052402, + "grad_norm": 1.2328580617904663, + "learning_rate": 0.000181148473874806, + "loss": 1.9157, + "step": 1832 + }, + { + "epoch": 0.18955042527339003, + "grad_norm": 1.098201870918274, + "learning_rate": 0.00018113812726332126, + "loss": 1.3232, + "step": 1833 + }, + { + "epoch": 0.18965383521625603, + "grad_norm": 1.778517246246338, + "learning_rate": 0.00018112778065183652, + "loss": 1.9743, + "step": 1834 + }, + { + "epoch": 0.18975724515912204, + "grad_norm": 1.304283618927002, + "learning_rate": 0.0001811174340403518, + "loss": 1.9143, + "step": 1835 + }, + { + "epoch": 0.18986065510198805, + "grad_norm": 1.3429310321807861, + "learning_rate": 0.00018110708742886706, + "loss": 1.7206, + "step": 1836 + }, + { + "epoch": 0.18996406504485405, + "grad_norm": 1.1537504196166992, + "learning_rate": 0.00018109674081738232, + "loss": 1.7866, + "step": 1837 + }, + { + "epoch": 0.19006747498772006, + "grad_norm": 1.7021723985671997, + "learning_rate": 0.00018108639420589756, + "loss": 1.9366, + "step": 1838 + }, + { + "epoch": 0.19017088493058606, + "grad_norm": 1.0984216928482056, + "learning_rate": 0.00018107604759441283, + "loss": 1.344, + "step": 1839 + }, + { + "epoch": 0.19027429487345207, + "grad_norm": 1.413723111152649, + "learning_rate": 0.0001810657009829281, + "loss": 1.7328, + "step": 1840 + }, + { + "epoch": 0.19037770481631808, + "grad_norm": 1.7750047445297241, + "learning_rate": 0.00018105535437144336, + "loss": 1.584, + "step": 1841 + }, + { + "epoch": 0.19048111475918408, + "grad_norm": 1.1813859939575195, + "learning_rate": 0.00018104500775995862, + "loss": 2.1179, + "step": 1842 + }, + { + "epoch": 0.1905845247020501, + "grad_norm": 1.3422218561172485, + "learning_rate": 0.0001810346611484739, + "loss": 1.5151, + "step": 1843 + }, + { + "epoch": 0.1906879346449161, + "grad_norm": 1.4470585584640503, + "learning_rate": 0.00018102431453698913, + "loss": 2.2249, + "step": 1844 + }, + { + "epoch": 0.1907913445877821, + "grad_norm": 1.856718897819519, + "learning_rate": 0.0001810139679255044, + "loss": 1.36, + "step": 1845 + }, + { + "epoch": 0.1908947545306481, + "grad_norm": 1.5910100936889648, + "learning_rate": 0.00018100362131401966, + "loss": 1.3646, + "step": 1846 + }, + { + "epoch": 0.19099816447351411, + "grad_norm": 1.5017138719558716, + "learning_rate": 0.00018099327470253492, + "loss": 1.3302, + "step": 1847 + }, + { + "epoch": 0.19110157441638012, + "grad_norm": 1.379167079925537, + "learning_rate": 0.0001809829280910502, + "loss": 1.4517, + "step": 1848 + }, + { + "epoch": 0.19120498435924616, + "grad_norm": 2.3777546882629395, + "learning_rate": 0.00018097258147956545, + "loss": 1.7635, + "step": 1849 + }, + { + "epoch": 0.19130839430211216, + "grad_norm": 1.7774989604949951, + "learning_rate": 0.0001809622348680807, + "loss": 1.8686, + "step": 1850 + }, + { + "epoch": 0.19141180424497817, + "grad_norm": 1.2162162065505981, + "learning_rate": 0.00018095188825659596, + "loss": 1.3562, + "step": 1851 + }, + { + "epoch": 0.19151521418784417, + "grad_norm": 1.2358673810958862, + "learning_rate": 0.00018094154164511122, + "loss": 1.38, + "step": 1852 + }, + { + "epoch": 0.19161862413071018, + "grad_norm": 1.128541350364685, + "learning_rate": 0.0001809311950336265, + "loss": 1.9068, + "step": 1853 + }, + { + "epoch": 0.1917220340735762, + "grad_norm": 1.5324431657791138, + "learning_rate": 0.00018092084842214175, + "loss": 1.5886, + "step": 1854 + }, + { + "epoch": 0.1918254440164422, + "grad_norm": 1.675620436668396, + "learning_rate": 0.00018091050181065702, + "loss": 1.8084, + "step": 1855 + }, + { + "epoch": 0.1919288539593082, + "grad_norm": 2.1495304107666016, + "learning_rate": 0.00018090015519917226, + "loss": 1.8716, + "step": 1856 + }, + { + "epoch": 0.1920322639021742, + "grad_norm": 2.3184926509857178, + "learning_rate": 0.00018088980858768752, + "loss": 1.6612, + "step": 1857 + }, + { + "epoch": 0.1921356738450402, + "grad_norm": 1.2949163913726807, + "learning_rate": 0.0001808794619762028, + "loss": 1.3865, + "step": 1858 + }, + { + "epoch": 0.19223908378790622, + "grad_norm": 1.916689157485962, + "learning_rate": 0.00018086911536471806, + "loss": 1.3509, + "step": 1859 + }, + { + "epoch": 0.19234249373077222, + "grad_norm": 1.8200879096984863, + "learning_rate": 0.00018085876875323332, + "loss": 1.8744, + "step": 1860 + }, + { + "epoch": 0.19244590367363823, + "grad_norm": 1.343815565109253, + "learning_rate": 0.0001808484221417486, + "loss": 1.5256, + "step": 1861 + }, + { + "epoch": 0.19254931361650424, + "grad_norm": 2.418281078338623, + "learning_rate": 0.00018083807553026385, + "loss": 1.5507, + "step": 1862 + }, + { + "epoch": 0.19265272355937024, + "grad_norm": 2.052889585494995, + "learning_rate": 0.00018082772891877912, + "loss": 1.2509, + "step": 1863 + }, + { + "epoch": 0.19275613350223625, + "grad_norm": 2.2111313343048096, + "learning_rate": 0.00018081738230729436, + "loss": 1.5071, + "step": 1864 + }, + { + "epoch": 0.19285954344510225, + "grad_norm": 1.393479347229004, + "learning_rate": 0.00018080703569580962, + "loss": 1.3836, + "step": 1865 + }, + { + "epoch": 0.19296295338796826, + "grad_norm": 1.9624559879302979, + "learning_rate": 0.0001807966890843249, + "loss": 1.4573, + "step": 1866 + }, + { + "epoch": 0.19306636333083427, + "grad_norm": 1.0306349992752075, + "learning_rate": 0.00018078634247284015, + "loss": 1.514, + "step": 1867 + }, + { + "epoch": 0.19316977327370027, + "grad_norm": 1.0501223802566528, + "learning_rate": 0.00018077599586135542, + "loss": 1.702, + "step": 1868 + }, + { + "epoch": 0.19327318321656628, + "grad_norm": 1.1913334131240845, + "learning_rate": 0.00018076564924987068, + "loss": 1.5761, + "step": 1869 + }, + { + "epoch": 0.1933765931594323, + "grad_norm": 2.000929832458496, + "learning_rate": 0.00018075530263838595, + "loss": 2.0399, + "step": 1870 + }, + { + "epoch": 0.1934800031022983, + "grad_norm": 1.3432594537734985, + "learning_rate": 0.00018074495602690122, + "loss": 0.6957, + "step": 1871 + }, + { + "epoch": 0.1935834130451643, + "grad_norm": 1.617430329322815, + "learning_rate": 0.00018073460941541645, + "loss": 1.5554, + "step": 1872 + }, + { + "epoch": 0.1936868229880303, + "grad_norm": 1.2280590534210205, + "learning_rate": 0.00018072426280393172, + "loss": 1.4669, + "step": 1873 + }, + { + "epoch": 0.1937902329308963, + "grad_norm": 2.365945816040039, + "learning_rate": 0.00018071391619244699, + "loss": 1.5908, + "step": 1874 + }, + { + "epoch": 0.19389364287376232, + "grad_norm": 2.2847042083740234, + "learning_rate": 0.00018070356958096225, + "loss": 1.329, + "step": 1875 + }, + { + "epoch": 0.19399705281662832, + "grad_norm": 1.594077229499817, + "learning_rate": 0.00018069322296947752, + "loss": 2.0627, + "step": 1876 + }, + { + "epoch": 0.19410046275949433, + "grad_norm": 1.4474533796310425, + "learning_rate": 0.00018068287635799278, + "loss": 1.9983, + "step": 1877 + }, + { + "epoch": 0.19420387270236034, + "grad_norm": 1.8211748600006104, + "learning_rate": 0.00018067252974650805, + "loss": 1.3836, + "step": 1878 + }, + { + "epoch": 0.19430728264522634, + "grad_norm": 1.5337395668029785, + "learning_rate": 0.0001806621831350233, + "loss": 1.8984, + "step": 1879 + }, + { + "epoch": 0.19441069258809235, + "grad_norm": 1.8015854358673096, + "learning_rate": 0.00018065183652353855, + "loss": 1.5139, + "step": 1880 + }, + { + "epoch": 0.19451410253095835, + "grad_norm": 1.4770269393920898, + "learning_rate": 0.00018064148991205382, + "loss": 1.801, + "step": 1881 + }, + { + "epoch": 0.19461751247382436, + "grad_norm": 2.826849937438965, + "learning_rate": 0.00018063114330056908, + "loss": 1.9032, + "step": 1882 + }, + { + "epoch": 0.19472092241669037, + "grad_norm": 1.6297709941864014, + "learning_rate": 0.00018062079668908435, + "loss": 1.89, + "step": 1883 + }, + { + "epoch": 0.19482433235955637, + "grad_norm": 1.8109357357025146, + "learning_rate": 0.00018061045007759961, + "loss": 1.4969, + "step": 1884 + }, + { + "epoch": 0.19492774230242238, + "grad_norm": 1.7631341218948364, + "learning_rate": 0.00018060010346611485, + "loss": 2.0972, + "step": 1885 + }, + { + "epoch": 0.19503115224528839, + "grad_norm": 1.25094735622406, + "learning_rate": 0.00018058975685463012, + "loss": 1.1867, + "step": 1886 + }, + { + "epoch": 0.1951345621881544, + "grad_norm": 2.0920863151550293, + "learning_rate": 0.00018057941024314538, + "loss": 2.0022, + "step": 1887 + }, + { + "epoch": 0.1952379721310204, + "grad_norm": 1.4094507694244385, + "learning_rate": 0.00018056906363166065, + "loss": 1.168, + "step": 1888 + }, + { + "epoch": 0.1953413820738864, + "grad_norm": 1.4328402280807495, + "learning_rate": 0.00018055871702017591, + "loss": 1.7634, + "step": 1889 + }, + { + "epoch": 0.1954447920167524, + "grad_norm": 1.9557263851165771, + "learning_rate": 0.00018054837040869118, + "loss": 1.9204, + "step": 1890 + }, + { + "epoch": 0.19554820195961842, + "grad_norm": 1.4277377128601074, + "learning_rate": 0.00018053802379720642, + "loss": 1.5392, + "step": 1891 + }, + { + "epoch": 0.19565161190248442, + "grad_norm": 2.212749481201172, + "learning_rate": 0.00018052767718572168, + "loss": 1.9089, + "step": 1892 + }, + { + "epoch": 0.19575502184535043, + "grad_norm": 1.0590177774429321, + "learning_rate": 0.00018051733057423695, + "loss": 1.4042, + "step": 1893 + }, + { + "epoch": 0.19585843178821644, + "grad_norm": 1.6970980167388916, + "learning_rate": 0.00018050698396275222, + "loss": 1.638, + "step": 1894 + }, + { + "epoch": 0.19596184173108244, + "grad_norm": 1.8155356645584106, + "learning_rate": 0.00018049663735126748, + "loss": 1.5066, + "step": 1895 + }, + { + "epoch": 0.19606525167394845, + "grad_norm": 2.029043674468994, + "learning_rate": 0.00018048629073978275, + "loss": 2.2207, + "step": 1896 + }, + { + "epoch": 0.19616866161681445, + "grad_norm": 1.168373465538025, + "learning_rate": 0.00018047594412829798, + "loss": 1.3578, + "step": 1897 + }, + { + "epoch": 0.19627207155968046, + "grad_norm": 2.446021318435669, + "learning_rate": 0.00018046559751681325, + "loss": 1.6108, + "step": 1898 + }, + { + "epoch": 0.19637548150254647, + "grad_norm": 1.6077264547348022, + "learning_rate": 0.00018045525090532852, + "loss": 1.5486, + "step": 1899 + }, + { + "epoch": 0.19647889144541247, + "grad_norm": 1.242552638053894, + "learning_rate": 0.00018044490429384378, + "loss": 1.7297, + "step": 1900 + }, + { + "epoch": 0.19658230138827848, + "grad_norm": 1.8686028718948364, + "learning_rate": 0.00018043455768235905, + "loss": 1.758, + "step": 1901 + }, + { + "epoch": 0.19668571133114449, + "grad_norm": 1.9251303672790527, + "learning_rate": 0.0001804242110708743, + "loss": 1.4774, + "step": 1902 + }, + { + "epoch": 0.1967891212740105, + "grad_norm": 4.7904133796691895, + "learning_rate": 0.00018041386445938955, + "loss": 1.8559, + "step": 1903 + }, + { + "epoch": 0.1968925312168765, + "grad_norm": 0.9137360453605652, + "learning_rate": 0.00018040351784790482, + "loss": 1.5535, + "step": 1904 + }, + { + "epoch": 0.1969959411597425, + "grad_norm": 1.5711393356323242, + "learning_rate": 0.00018039317123642008, + "loss": 1.03, + "step": 1905 + }, + { + "epoch": 0.1970993511026085, + "grad_norm": 1.640053153038025, + "learning_rate": 0.00018038282462493535, + "loss": 1.836, + "step": 1906 + }, + { + "epoch": 0.19720276104547452, + "grad_norm": 0.7787024974822998, + "learning_rate": 0.0001803724780134506, + "loss": 1.9089, + "step": 1907 + }, + { + "epoch": 0.19730617098834052, + "grad_norm": 2.0896191596984863, + "learning_rate": 0.00018036213140196588, + "loss": 1.9978, + "step": 1908 + }, + { + "epoch": 0.19740958093120653, + "grad_norm": 2.179332971572876, + "learning_rate": 0.00018035178479048112, + "loss": 1.9776, + "step": 1909 + }, + { + "epoch": 0.19751299087407254, + "grad_norm": 1.585008978843689, + "learning_rate": 0.00018034143817899638, + "loss": 1.1899, + "step": 1910 + }, + { + "epoch": 0.19761640081693854, + "grad_norm": 1.303787350654602, + "learning_rate": 0.00018033109156751165, + "loss": 1.7906, + "step": 1911 + }, + { + "epoch": 0.19771981075980455, + "grad_norm": 1.9878695011138916, + "learning_rate": 0.00018032074495602691, + "loss": 1.8106, + "step": 1912 + }, + { + "epoch": 0.19782322070267055, + "grad_norm": 1.501775860786438, + "learning_rate": 0.00018031039834454218, + "loss": 1.4732, + "step": 1913 + }, + { + "epoch": 0.19792663064553656, + "grad_norm": 0.9017894864082336, + "learning_rate": 0.00018030005173305745, + "loss": 1.4282, + "step": 1914 + }, + { + "epoch": 0.19803004058840257, + "grad_norm": 2.1266887187957764, + "learning_rate": 0.00018028970512157268, + "loss": 2.2244, + "step": 1915 + }, + { + "epoch": 0.19813345053126857, + "grad_norm": 1.604373812675476, + "learning_rate": 0.00018027935851008795, + "loss": 1.6581, + "step": 1916 + }, + { + "epoch": 0.19823686047413458, + "grad_norm": 1.8775768280029297, + "learning_rate": 0.00018026901189860321, + "loss": 1.5546, + "step": 1917 + }, + { + "epoch": 0.19834027041700059, + "grad_norm": 1.487886905670166, + "learning_rate": 0.00018025866528711848, + "loss": 1.5156, + "step": 1918 + }, + { + "epoch": 0.1984436803598666, + "grad_norm": 3.0149381160736084, + "learning_rate": 0.00018024831867563375, + "loss": 1.7735, + "step": 1919 + }, + { + "epoch": 0.1985470903027326, + "grad_norm": 1.967848300933838, + "learning_rate": 0.000180237972064149, + "loss": 1.4067, + "step": 1920 + }, + { + "epoch": 0.1986505002455986, + "grad_norm": 1.7828800678253174, + "learning_rate": 0.00018022762545266425, + "loss": 1.3988, + "step": 1921 + }, + { + "epoch": 0.1987539101884646, + "grad_norm": 1.206902265548706, + "learning_rate": 0.00018021727884117952, + "loss": 1.9073, + "step": 1922 + }, + { + "epoch": 0.19885732013133062, + "grad_norm": 1.424898624420166, + "learning_rate": 0.00018020693222969478, + "loss": 1.6547, + "step": 1923 + }, + { + "epoch": 0.19896073007419662, + "grad_norm": 1.5171235799789429, + "learning_rate": 0.00018019658561821005, + "loss": 1.2257, + "step": 1924 + }, + { + "epoch": 0.19906414001706263, + "grad_norm": 2.3045217990875244, + "learning_rate": 0.0001801862390067253, + "loss": 1.5398, + "step": 1925 + }, + { + "epoch": 0.19916754995992864, + "grad_norm": 1.2358486652374268, + "learning_rate": 0.00018017589239524058, + "loss": 1.2342, + "step": 1926 + }, + { + "epoch": 0.19927095990279464, + "grad_norm": 3.0135512351989746, + "learning_rate": 0.00018016554578375582, + "loss": 2.1923, + "step": 1927 + }, + { + "epoch": 0.19937436984566065, + "grad_norm": 1.4758455753326416, + "learning_rate": 0.00018015519917227108, + "loss": 1.5922, + "step": 1928 + }, + { + "epoch": 0.19947777978852665, + "grad_norm": 1.570439100265503, + "learning_rate": 0.00018014485256078635, + "loss": 1.8605, + "step": 1929 + }, + { + "epoch": 0.19958118973139266, + "grad_norm": 1.461016058921814, + "learning_rate": 0.0001801345059493016, + "loss": 1.5143, + "step": 1930 + }, + { + "epoch": 0.19968459967425867, + "grad_norm": 0.7076051831245422, + "learning_rate": 0.00018012415933781688, + "loss": 1.5049, + "step": 1931 + }, + { + "epoch": 0.19978800961712467, + "grad_norm": 2.8588709831237793, + "learning_rate": 0.00018011381272633214, + "loss": 1.7324, + "step": 1932 + }, + { + "epoch": 0.19989141955999068, + "grad_norm": 0.7718111276626587, + "learning_rate": 0.00018010346611484738, + "loss": 1.6924, + "step": 1933 + }, + { + "epoch": 0.19999482950285669, + "grad_norm": 1.3416244983673096, + "learning_rate": 0.00018009311950336265, + "loss": 1.7367, + "step": 1934 + }, + { + "epoch": 0.20009823944572272, + "grad_norm": 1.6038373708724976, + "learning_rate": 0.00018008277289187791, + "loss": 1.4978, + "step": 1935 + }, + { + "epoch": 0.20020164938858873, + "grad_norm": 0.9637541770935059, + "learning_rate": 0.00018007242628039318, + "loss": 1.5899, + "step": 1936 + }, + { + "epoch": 0.20030505933145473, + "grad_norm": 2.398042678833008, + "learning_rate": 0.00018006207966890845, + "loss": 1.684, + "step": 1937 + }, + { + "epoch": 0.20040846927432074, + "grad_norm": 0.8711498379707336, + "learning_rate": 0.0001800517330574237, + "loss": 1.6272, + "step": 1938 + }, + { + "epoch": 0.20051187921718674, + "grad_norm": 1.1462258100509644, + "learning_rate": 0.00018004138644593895, + "loss": 1.5288, + "step": 1939 + }, + { + "epoch": 0.20061528916005275, + "grad_norm": 0.9858958125114441, + "learning_rate": 0.00018003103983445421, + "loss": 1.3255, + "step": 1940 + }, + { + "epoch": 0.20071869910291876, + "grad_norm": 2.7075014114379883, + "learning_rate": 0.00018002069322296948, + "loss": 1.7802, + "step": 1941 + }, + { + "epoch": 0.20082210904578476, + "grad_norm": 1.764085054397583, + "learning_rate": 0.00018001034661148475, + "loss": 1.7139, + "step": 1942 + }, + { + "epoch": 0.20092551898865077, + "grad_norm": 2.7296926975250244, + "learning_rate": 0.00018, + "loss": 1.9334, + "step": 1943 + }, + { + "epoch": 0.20102892893151678, + "grad_norm": 2.1738665103912354, + "learning_rate": 0.00017998965338851528, + "loss": 1.5643, + "step": 1944 + }, + { + "epoch": 0.20113233887438278, + "grad_norm": 1.6310564279556274, + "learning_rate": 0.00017997930677703052, + "loss": 1.8784, + "step": 1945 + }, + { + "epoch": 0.2012357488172488, + "grad_norm": 2.148175001144409, + "learning_rate": 0.00017996896016554578, + "loss": 2.0443, + "step": 1946 + }, + { + "epoch": 0.2013391587601148, + "grad_norm": 1.1791040897369385, + "learning_rate": 0.00017995861355406105, + "loss": 0.9135, + "step": 1947 + }, + { + "epoch": 0.2014425687029808, + "grad_norm": 0.9954102635383606, + "learning_rate": 0.0001799482669425763, + "loss": 1.2455, + "step": 1948 + }, + { + "epoch": 0.2015459786458468, + "grad_norm": 2.1388514041900635, + "learning_rate": 0.00017993792033109158, + "loss": 1.6639, + "step": 1949 + }, + { + "epoch": 0.2016493885887128, + "grad_norm": 1.2391835451126099, + "learning_rate": 0.00017992757371960684, + "loss": 1.5007, + "step": 1950 + }, + { + "epoch": 0.20175279853157882, + "grad_norm": 0.8275509476661682, + "learning_rate": 0.00017991722710812208, + "loss": 1.9589, + "step": 1951 + }, + { + "epoch": 0.20185620847444483, + "grad_norm": 1.2318191528320312, + "learning_rate": 0.00017990688049663735, + "loss": 1.4067, + "step": 1952 + }, + { + "epoch": 0.20195961841731083, + "grad_norm": 1.2213613986968994, + "learning_rate": 0.0001798965338851526, + "loss": 2.0376, + "step": 1953 + }, + { + "epoch": 0.20206302836017684, + "grad_norm": 1.7849206924438477, + "learning_rate": 0.00017988618727366788, + "loss": 2.1899, + "step": 1954 + }, + { + "epoch": 0.20216643830304284, + "grad_norm": 1.1603035926818848, + "learning_rate": 0.00017987584066218314, + "loss": 1.5716, + "step": 1955 + }, + { + "epoch": 0.20226984824590885, + "grad_norm": 1.1252952814102173, + "learning_rate": 0.0001798654940506984, + "loss": 1.9331, + "step": 1956 + }, + { + "epoch": 0.20237325818877486, + "grad_norm": 1.050102710723877, + "learning_rate": 0.00017985514743921365, + "loss": 2.0236, + "step": 1957 + }, + { + "epoch": 0.20247666813164086, + "grad_norm": 1.4389870166778564, + "learning_rate": 0.00017984480082772891, + "loss": 1.4933, + "step": 1958 + }, + { + "epoch": 0.20258007807450687, + "grad_norm": 1.7062374353408813, + "learning_rate": 0.00017983445421624418, + "loss": 1.843, + "step": 1959 + }, + { + "epoch": 0.20268348801737288, + "grad_norm": 1.5522654056549072, + "learning_rate": 0.00017982410760475944, + "loss": 1.8029, + "step": 1960 + }, + { + "epoch": 0.20278689796023888, + "grad_norm": 0.9937103390693665, + "learning_rate": 0.0001798137609932747, + "loss": 1.3794, + "step": 1961 + }, + { + "epoch": 0.2028903079031049, + "grad_norm": 2.6092488765716553, + "learning_rate": 0.00017980341438178998, + "loss": 1.7977, + "step": 1962 + }, + { + "epoch": 0.2029937178459709, + "grad_norm": 1.2122944593429565, + "learning_rate": 0.00017979306777030521, + "loss": 1.532, + "step": 1963 + }, + { + "epoch": 0.2030971277888369, + "grad_norm": 2.4709153175354004, + "learning_rate": 0.00017978272115882048, + "loss": 1.8433, + "step": 1964 + }, + { + "epoch": 0.2032005377317029, + "grad_norm": 1.01822829246521, + "learning_rate": 0.00017977237454733575, + "loss": 1.6264, + "step": 1965 + }, + { + "epoch": 0.2033039476745689, + "grad_norm": 1.2235360145568848, + "learning_rate": 0.000179762027935851, + "loss": 1.6075, + "step": 1966 + }, + { + "epoch": 0.20340735761743492, + "grad_norm": 1.8367936611175537, + "learning_rate": 0.00017975168132436628, + "loss": 1.5176, + "step": 1967 + }, + { + "epoch": 0.20351076756030093, + "grad_norm": 0.938755989074707, + "learning_rate": 0.00017974133471288154, + "loss": 1.5801, + "step": 1968 + }, + { + "epoch": 0.20361417750316693, + "grad_norm": 1.2774001359939575, + "learning_rate": 0.00017973098810139678, + "loss": 2.0368, + "step": 1969 + }, + { + "epoch": 0.20371758744603294, + "grad_norm": 1.1897248029708862, + "learning_rate": 0.00017972064148991205, + "loss": 1.599, + "step": 1970 + }, + { + "epoch": 0.20382099738889894, + "grad_norm": 2.377195119857788, + "learning_rate": 0.0001797102948784273, + "loss": 1.7758, + "step": 1971 + }, + { + "epoch": 0.20392440733176495, + "grad_norm": 0.9643241763114929, + "learning_rate": 0.00017969994826694258, + "loss": 1.4347, + "step": 1972 + }, + { + "epoch": 0.20402781727463096, + "grad_norm": 1.207589030265808, + "learning_rate": 0.00017968960165545784, + "loss": 1.8052, + "step": 1973 + }, + { + "epoch": 0.20413122721749696, + "grad_norm": 1.32759690284729, + "learning_rate": 0.0001796792550439731, + "loss": 1.6897, + "step": 1974 + }, + { + "epoch": 0.20423463716036297, + "grad_norm": 2.175830364227295, + "learning_rate": 0.00017966890843248835, + "loss": 1.6727, + "step": 1975 + }, + { + "epoch": 0.20433804710322898, + "grad_norm": 1.3569698333740234, + "learning_rate": 0.0001796585618210036, + "loss": 1.6497, + "step": 1976 + }, + { + "epoch": 0.20444145704609498, + "grad_norm": 1.3463072776794434, + "learning_rate": 0.00017964821520951888, + "loss": 1.4355, + "step": 1977 + }, + { + "epoch": 0.204544866988961, + "grad_norm": 1.4692943096160889, + "learning_rate": 0.00017963786859803414, + "loss": 1.7329, + "step": 1978 + }, + { + "epoch": 0.204648276931827, + "grad_norm": 2.3857581615448, + "learning_rate": 0.0001796275219865494, + "loss": 1.7336, + "step": 1979 + }, + { + "epoch": 0.204751686874693, + "grad_norm": 1.526208758354187, + "learning_rate": 0.00017961717537506467, + "loss": 1.3837, + "step": 1980 + }, + { + "epoch": 0.204855096817559, + "grad_norm": 3.072439193725586, + "learning_rate": 0.0001796068287635799, + "loss": 2.5553, + "step": 1981 + }, + { + "epoch": 0.204958506760425, + "grad_norm": 1.1713007688522339, + "learning_rate": 0.00017959648215209518, + "loss": 1.4069, + "step": 1982 + }, + { + "epoch": 0.20506191670329102, + "grad_norm": 1.4028162956237793, + "learning_rate": 0.00017958613554061044, + "loss": 1.4357, + "step": 1983 + }, + { + "epoch": 0.20516532664615703, + "grad_norm": 1.3035467863082886, + "learning_rate": 0.0001795757889291257, + "loss": 1.8673, + "step": 1984 + }, + { + "epoch": 0.20526873658902303, + "grad_norm": 1.277909755706787, + "learning_rate": 0.00017956544231764098, + "loss": 1.7968, + "step": 1985 + }, + { + "epoch": 0.20537214653188904, + "grad_norm": 2.3015072345733643, + "learning_rate": 0.00017955509570615624, + "loss": 1.6155, + "step": 1986 + }, + { + "epoch": 0.20547555647475504, + "grad_norm": 1.4320391416549683, + "learning_rate": 0.0001795447490946715, + "loss": 1.2945, + "step": 1987 + }, + { + "epoch": 0.20557896641762105, + "grad_norm": 1.892540454864502, + "learning_rate": 0.00017953440248318677, + "loss": 1.854, + "step": 1988 + }, + { + "epoch": 0.20568237636048706, + "grad_norm": 1.6273618936538696, + "learning_rate": 0.000179524055871702, + "loss": 1.8503, + "step": 1989 + }, + { + "epoch": 0.20578578630335306, + "grad_norm": 1.4557535648345947, + "learning_rate": 0.00017951370926021728, + "loss": 1.5845, + "step": 1990 + }, + { + "epoch": 0.20588919624621907, + "grad_norm": 1.505989909172058, + "learning_rate": 0.00017950336264873254, + "loss": 1.2067, + "step": 1991 + }, + { + "epoch": 0.20599260618908508, + "grad_norm": 0.9464778900146484, + "learning_rate": 0.0001794930160372478, + "loss": 1.1262, + "step": 1992 + }, + { + "epoch": 0.20609601613195108, + "grad_norm": 1.0865939855575562, + "learning_rate": 0.00017948266942576307, + "loss": 1.8715, + "step": 1993 + }, + { + "epoch": 0.2061994260748171, + "grad_norm": 2.5264039039611816, + "learning_rate": 0.00017947232281427834, + "loss": 1.3308, + "step": 1994 + }, + { + "epoch": 0.2063028360176831, + "grad_norm": 2.007601261138916, + "learning_rate": 0.0001794619762027936, + "loss": 1.3582, + "step": 1995 + }, + { + "epoch": 0.2064062459605491, + "grad_norm": 1.6429976224899292, + "learning_rate": 0.00017945162959130887, + "loss": 1.9568, + "step": 1996 + }, + { + "epoch": 0.2065096559034151, + "grad_norm": 1.2347596883773804, + "learning_rate": 0.0001794412829798241, + "loss": 0.9477, + "step": 1997 + }, + { + "epoch": 0.2066130658462811, + "grad_norm": 1.348817229270935, + "learning_rate": 0.00017943093636833937, + "loss": 1.5172, + "step": 1998 + }, + { + "epoch": 0.20671647578914712, + "grad_norm": 1.0015040636062622, + "learning_rate": 0.00017942058975685464, + "loss": 1.5398, + "step": 1999 + }, + { + "epoch": 0.20681988573201313, + "grad_norm": 1.2908525466918945, + "learning_rate": 0.0001794102431453699, + "loss": 1.6978, + "step": 2000 + }, + { + "epoch": 0.20692329567487913, + "grad_norm": 1.1875195503234863, + "learning_rate": 0.00017939989653388517, + "loss": 1.533, + "step": 2001 + }, + { + "epoch": 0.20702670561774514, + "grad_norm": 2.341791868209839, + "learning_rate": 0.00017938954992240044, + "loss": 1.9741, + "step": 2002 + }, + { + "epoch": 0.20713011556061114, + "grad_norm": 0.8359478116035461, + "learning_rate": 0.0001793792033109157, + "loss": 1.3734, + "step": 2003 + }, + { + "epoch": 0.20723352550347715, + "grad_norm": 0.9825917482376099, + "learning_rate": 0.00017936885669943097, + "loss": 1.5892, + "step": 2004 + }, + { + "epoch": 0.20733693544634316, + "grad_norm": 2.1322312355041504, + "learning_rate": 0.0001793585100879462, + "loss": 2.1437, + "step": 2005 + }, + { + "epoch": 0.20744034538920916, + "grad_norm": 2.004436731338501, + "learning_rate": 0.00017934816347646147, + "loss": 2.1666, + "step": 2006 + }, + { + "epoch": 0.20754375533207517, + "grad_norm": 2.5417864322662354, + "learning_rate": 0.00017933781686497674, + "loss": 1.3547, + "step": 2007 + }, + { + "epoch": 0.20764716527494118, + "grad_norm": 1.3734756708145142, + "learning_rate": 0.000179327470253492, + "loss": 2.0243, + "step": 2008 + }, + { + "epoch": 0.20775057521780718, + "grad_norm": 2.404106378555298, + "learning_rate": 0.00017931712364200727, + "loss": 1.8643, + "step": 2009 + }, + { + "epoch": 0.2078539851606732, + "grad_norm": 1.595017910003662, + "learning_rate": 0.00017930677703052253, + "loss": 1.7677, + "step": 2010 + }, + { + "epoch": 0.2079573951035392, + "grad_norm": 2.3860955238342285, + "learning_rate": 0.00017929643041903777, + "loss": 2.1223, + "step": 2011 + }, + { + "epoch": 0.2080608050464052, + "grad_norm": 1.7865846157073975, + "learning_rate": 0.00017928608380755304, + "loss": 1.5736, + "step": 2012 + }, + { + "epoch": 0.2081642149892712, + "grad_norm": 1.4908995628356934, + "learning_rate": 0.0001792757371960683, + "loss": 0.9999, + "step": 2013 + }, + { + "epoch": 0.2082676249321372, + "grad_norm": 1.2065404653549194, + "learning_rate": 0.00017926539058458357, + "loss": 1.3708, + "step": 2014 + }, + { + "epoch": 0.20837103487500322, + "grad_norm": 1.7153881788253784, + "learning_rate": 0.00017925504397309883, + "loss": 1.2165, + "step": 2015 + }, + { + "epoch": 0.20847444481786923, + "grad_norm": 2.4799675941467285, + "learning_rate": 0.0001792446973616141, + "loss": 2.138, + "step": 2016 + }, + { + "epoch": 0.20857785476073523, + "grad_norm": 2.0555872917175293, + "learning_rate": 0.00017923435075012934, + "loss": 1.4837, + "step": 2017 + }, + { + "epoch": 0.20868126470360124, + "grad_norm": 1.3154330253601074, + "learning_rate": 0.0001792240041386446, + "loss": 1.3167, + "step": 2018 + }, + { + "epoch": 0.20878467464646724, + "grad_norm": 1.519303798675537, + "learning_rate": 0.00017921365752715987, + "loss": 1.4666, + "step": 2019 + }, + { + "epoch": 0.20888808458933325, + "grad_norm": 1.393334984779358, + "learning_rate": 0.00017920331091567514, + "loss": 1.8231, + "step": 2020 + }, + { + "epoch": 0.20899149453219928, + "grad_norm": 2.9100759029388428, + "learning_rate": 0.0001791929643041904, + "loss": 2.0966, + "step": 2021 + }, + { + "epoch": 0.2090949044750653, + "grad_norm": 1.2485828399658203, + "learning_rate": 0.00017918261769270567, + "loss": 2.3454, + "step": 2022 + }, + { + "epoch": 0.2091983144179313, + "grad_norm": 1.3368488550186157, + "learning_rate": 0.0001791722710812209, + "loss": 1.5258, + "step": 2023 + }, + { + "epoch": 0.2093017243607973, + "grad_norm": 1.3732163906097412, + "learning_rate": 0.00017916192446973617, + "loss": 1.4365, + "step": 2024 + }, + { + "epoch": 0.2094051343036633, + "grad_norm": 2.5866189002990723, + "learning_rate": 0.00017915157785825144, + "loss": 2.0169, + "step": 2025 + }, + { + "epoch": 0.20950854424652932, + "grad_norm": 1.5932780504226685, + "learning_rate": 0.0001791412312467667, + "loss": 2.3346, + "step": 2026 + }, + { + "epoch": 0.20961195418939532, + "grad_norm": 1.9270954132080078, + "learning_rate": 0.00017913088463528197, + "loss": 1.405, + "step": 2027 + }, + { + "epoch": 0.20971536413226133, + "grad_norm": 1.3831976652145386, + "learning_rate": 0.00017912053802379723, + "loss": 1.543, + "step": 2028 + }, + { + "epoch": 0.20981877407512733, + "grad_norm": 2.5507442951202393, + "learning_rate": 0.00017911019141231247, + "loss": 1.992, + "step": 2029 + }, + { + "epoch": 0.20992218401799334, + "grad_norm": 2.1970303058624268, + "learning_rate": 0.00017909984480082774, + "loss": 1.2845, + "step": 2030 + }, + { + "epoch": 0.21002559396085935, + "grad_norm": 2.447885751724243, + "learning_rate": 0.000179089498189343, + "loss": 2.5404, + "step": 2031 + }, + { + "epoch": 0.21012900390372535, + "grad_norm": 2.6955151557922363, + "learning_rate": 0.00017907915157785827, + "loss": 2.1718, + "step": 2032 + }, + { + "epoch": 0.21023241384659136, + "grad_norm": 1.2139887809753418, + "learning_rate": 0.00017906880496637353, + "loss": 1.5857, + "step": 2033 + }, + { + "epoch": 0.21033582378945737, + "grad_norm": 3.4483728408813477, + "learning_rate": 0.0001790584583548888, + "loss": 1.2517, + "step": 2034 + }, + { + "epoch": 0.21043923373232337, + "grad_norm": 1.2948105335235596, + "learning_rate": 0.00017904811174340404, + "loss": 2.0194, + "step": 2035 + }, + { + "epoch": 0.21054264367518938, + "grad_norm": 1.39495050907135, + "learning_rate": 0.0001790377651319193, + "loss": 1.2252, + "step": 2036 + }, + { + "epoch": 0.21064605361805538, + "grad_norm": 1.9834434986114502, + "learning_rate": 0.00017902741852043457, + "loss": 1.8306, + "step": 2037 + }, + { + "epoch": 0.2107494635609214, + "grad_norm": 1.5110279321670532, + "learning_rate": 0.00017901707190894983, + "loss": 1.5356, + "step": 2038 + }, + { + "epoch": 0.2108528735037874, + "grad_norm": 1.726402997970581, + "learning_rate": 0.0001790067252974651, + "loss": 1.5534, + "step": 2039 + }, + { + "epoch": 0.2109562834466534, + "grad_norm": 1.5266774892807007, + "learning_rate": 0.00017899637868598037, + "loss": 1.9585, + "step": 2040 + }, + { + "epoch": 0.2110596933895194, + "grad_norm": 2.4746997356414795, + "learning_rate": 0.0001789860320744956, + "loss": 1.8594, + "step": 2041 + }, + { + "epoch": 0.21116310333238542, + "grad_norm": 1.94597589969635, + "learning_rate": 0.00017897568546301087, + "loss": 1.9834, + "step": 2042 + }, + { + "epoch": 0.21126651327525142, + "grad_norm": 1.2754744291305542, + "learning_rate": 0.00017896533885152613, + "loss": 1.2801, + "step": 2043 + }, + { + "epoch": 0.21136992321811743, + "grad_norm": 1.6256976127624512, + "learning_rate": 0.0001789549922400414, + "loss": 1.6023, + "step": 2044 + }, + { + "epoch": 0.21147333316098343, + "grad_norm": 1.609365463256836, + "learning_rate": 0.00017894464562855667, + "loss": 2.1025, + "step": 2045 + }, + { + "epoch": 0.21157674310384944, + "grad_norm": 1.7030891180038452, + "learning_rate": 0.00017893429901707193, + "loss": 1.7133, + "step": 2046 + }, + { + "epoch": 0.21168015304671545, + "grad_norm": 1.521166443824768, + "learning_rate": 0.00017892395240558717, + "loss": 1.7595, + "step": 2047 + }, + { + "epoch": 0.21178356298958145, + "grad_norm": 2.1347200870513916, + "learning_rate": 0.00017891360579410244, + "loss": 1.8925, + "step": 2048 + }, + { + "epoch": 0.21188697293244746, + "grad_norm": 1.0615814924240112, + "learning_rate": 0.0001789032591826177, + "loss": 1.2334, + "step": 2049 + }, + { + "epoch": 0.21199038287531347, + "grad_norm": 1.3307974338531494, + "learning_rate": 0.00017889291257113297, + "loss": 1.3501, + "step": 2050 + }, + { + "epoch": 0.21209379281817947, + "grad_norm": 2.896918773651123, + "learning_rate": 0.00017888256595964823, + "loss": 1.9585, + "step": 2051 + }, + { + "epoch": 0.21219720276104548, + "grad_norm": 1.7504158020019531, + "learning_rate": 0.0001788722193481635, + "loss": 1.6612, + "step": 2052 + }, + { + "epoch": 0.21230061270391148, + "grad_norm": 1.9615195989608765, + "learning_rate": 0.00017886187273667874, + "loss": 1.6103, + "step": 2053 + }, + { + "epoch": 0.2124040226467775, + "grad_norm": 1.6140780448913574, + "learning_rate": 0.000178851526125194, + "loss": 1.6658, + "step": 2054 + }, + { + "epoch": 0.2125074325896435, + "grad_norm": 2.134024143218994, + "learning_rate": 0.00017884117951370927, + "loss": 2.0824, + "step": 2055 + }, + { + "epoch": 0.2126108425325095, + "grad_norm": 1.1607773303985596, + "learning_rate": 0.00017883083290222453, + "loss": 1.5691, + "step": 2056 + }, + { + "epoch": 0.2127142524753755, + "grad_norm": 1.261562466621399, + "learning_rate": 0.0001788204862907398, + "loss": 1.8007, + "step": 2057 + }, + { + "epoch": 0.21281766241824152, + "grad_norm": 2.1017324924468994, + "learning_rate": 0.00017881013967925506, + "loss": 1.6821, + "step": 2058 + }, + { + "epoch": 0.21292107236110752, + "grad_norm": 1.7821030616760254, + "learning_rate": 0.0001787997930677703, + "loss": 1.5448, + "step": 2059 + }, + { + "epoch": 0.21302448230397353, + "grad_norm": 0.8674912452697754, + "learning_rate": 0.00017878944645628557, + "loss": 2.0453, + "step": 2060 + }, + { + "epoch": 0.21312789224683953, + "grad_norm": 1.2638546228408813, + "learning_rate": 0.00017877909984480083, + "loss": 1.4075, + "step": 2061 + }, + { + "epoch": 0.21323130218970554, + "grad_norm": 1.0837128162384033, + "learning_rate": 0.0001787687532333161, + "loss": 2.0548, + "step": 2062 + }, + { + "epoch": 0.21333471213257155, + "grad_norm": 3.9489150047302246, + "learning_rate": 0.00017875840662183136, + "loss": 2.059, + "step": 2063 + }, + { + "epoch": 0.21343812207543755, + "grad_norm": 3.7381279468536377, + "learning_rate": 0.00017874806001034663, + "loss": 2.097, + "step": 2064 + }, + { + "epoch": 0.21354153201830356, + "grad_norm": 1.6482309103012085, + "learning_rate": 0.00017873771339886187, + "loss": 1.8298, + "step": 2065 + }, + { + "epoch": 0.21364494196116957, + "grad_norm": 1.2123875617980957, + "learning_rate": 0.00017872736678737713, + "loss": 1.4125, + "step": 2066 + }, + { + "epoch": 0.21374835190403557, + "grad_norm": 2.2748680114746094, + "learning_rate": 0.0001787170201758924, + "loss": 1.8227, + "step": 2067 + }, + { + "epoch": 0.21385176184690158, + "grad_norm": 1.461757779121399, + "learning_rate": 0.00017870667356440767, + "loss": 1.7162, + "step": 2068 + }, + { + "epoch": 0.21395517178976758, + "grad_norm": 1.454840898513794, + "learning_rate": 0.00017869632695292293, + "loss": 1.9135, + "step": 2069 + }, + { + "epoch": 0.2140585817326336, + "grad_norm": 3.0303759574890137, + "learning_rate": 0.0001786859803414382, + "loss": 1.9702, + "step": 2070 + }, + { + "epoch": 0.2141619916754996, + "grad_norm": 1.0980781316757202, + "learning_rate": 0.00017867563372995344, + "loss": 1.3306, + "step": 2071 + }, + { + "epoch": 0.2142654016183656, + "grad_norm": 1.069142460823059, + "learning_rate": 0.0001786652871184687, + "loss": 0.7896, + "step": 2072 + }, + { + "epoch": 0.2143688115612316, + "grad_norm": 1.9988906383514404, + "learning_rate": 0.00017865494050698397, + "loss": 2.5048, + "step": 2073 + }, + { + "epoch": 0.21447222150409762, + "grad_norm": 1.0396260023117065, + "learning_rate": 0.00017864459389549923, + "loss": 1.173, + "step": 2074 + }, + { + "epoch": 0.21457563144696362, + "grad_norm": 1.9060814380645752, + "learning_rate": 0.0001786342472840145, + "loss": 2.4794, + "step": 2075 + }, + { + "epoch": 0.21467904138982963, + "grad_norm": 2.0201363563537598, + "learning_rate": 0.00017862390067252976, + "loss": 1.7278, + "step": 2076 + }, + { + "epoch": 0.21478245133269563, + "grad_norm": 1.4735513925552368, + "learning_rate": 0.000178613554061045, + "loss": 1.7763, + "step": 2077 + }, + { + "epoch": 0.21488586127556164, + "grad_norm": 1.1059455871582031, + "learning_rate": 0.00017860320744956027, + "loss": 1.5732, + "step": 2078 + }, + { + "epoch": 0.21498927121842765, + "grad_norm": 1.4942630529403687, + "learning_rate": 0.00017859286083807553, + "loss": 1.3362, + "step": 2079 + }, + { + "epoch": 0.21509268116129365, + "grad_norm": 1.017965316772461, + "learning_rate": 0.0001785825142265908, + "loss": 1.3988, + "step": 2080 + }, + { + "epoch": 0.21519609110415966, + "grad_norm": 1.3145697116851807, + "learning_rate": 0.00017857216761510606, + "loss": 1.2764, + "step": 2081 + }, + { + "epoch": 0.21529950104702567, + "grad_norm": 1.3202226161956787, + "learning_rate": 0.00017856182100362133, + "loss": 1.9009, + "step": 2082 + }, + { + "epoch": 0.21540291098989167, + "grad_norm": 2.01487398147583, + "learning_rate": 0.00017855147439213657, + "loss": 1.3367, + "step": 2083 + }, + { + "epoch": 0.21550632093275768, + "grad_norm": 0.839141845703125, + "learning_rate": 0.00017854112778065183, + "loss": 2.0631, + "step": 2084 + }, + { + "epoch": 0.21560973087562368, + "grad_norm": 1.0034396648406982, + "learning_rate": 0.0001785307811691671, + "loss": 1.7007, + "step": 2085 + }, + { + "epoch": 0.2157131408184897, + "grad_norm": 1.548252820968628, + "learning_rate": 0.00017852043455768236, + "loss": 1.1115, + "step": 2086 + }, + { + "epoch": 0.2158165507613557, + "grad_norm": 1.6088694334030151, + "learning_rate": 0.00017851008794619763, + "loss": 1.7482, + "step": 2087 + }, + { + "epoch": 0.2159199607042217, + "grad_norm": 1.0032658576965332, + "learning_rate": 0.0001784997413347129, + "loss": 1.952, + "step": 2088 + }, + { + "epoch": 0.2160233706470877, + "grad_norm": 1.0490684509277344, + "learning_rate": 0.00017848939472322813, + "loss": 2.0505, + "step": 2089 + }, + { + "epoch": 0.21612678058995372, + "grad_norm": 1.1849490404129028, + "learning_rate": 0.0001784790481117434, + "loss": 1.6461, + "step": 2090 + }, + { + "epoch": 0.21623019053281972, + "grad_norm": 1.3508158922195435, + "learning_rate": 0.00017846870150025867, + "loss": 0.9686, + "step": 2091 + }, + { + "epoch": 0.21633360047568573, + "grad_norm": 2.8345625400543213, + "learning_rate": 0.00017845835488877393, + "loss": 1.6479, + "step": 2092 + }, + { + "epoch": 0.21643701041855173, + "grad_norm": 1.3022091388702393, + "learning_rate": 0.0001784480082772892, + "loss": 1.609, + "step": 2093 + }, + { + "epoch": 0.21654042036141774, + "grad_norm": 1.6842771768569946, + "learning_rate": 0.00017843766166580446, + "loss": 1.1587, + "step": 2094 + }, + { + "epoch": 0.21664383030428375, + "grad_norm": 1.806605577468872, + "learning_rate": 0.0001784273150543197, + "loss": 1.8456, + "step": 2095 + }, + { + "epoch": 0.21674724024714975, + "grad_norm": 1.597494125366211, + "learning_rate": 0.00017841696844283497, + "loss": 1.3116, + "step": 2096 + }, + { + "epoch": 0.21685065019001576, + "grad_norm": 1.737967848777771, + "learning_rate": 0.00017840662183135023, + "loss": 1.2401, + "step": 2097 + }, + { + "epoch": 0.21695406013288177, + "grad_norm": 0.8340157270431519, + "learning_rate": 0.0001783962752198655, + "loss": 1.4509, + "step": 2098 + }, + { + "epoch": 0.21705747007574777, + "grad_norm": 1.295029640197754, + "learning_rate": 0.00017838592860838076, + "loss": 1.3269, + "step": 2099 + }, + { + "epoch": 0.21716088001861378, + "grad_norm": 1.8555551767349243, + "learning_rate": 0.00017837558199689603, + "loss": 1.8658, + "step": 2100 + }, + { + "epoch": 0.21726428996147978, + "grad_norm": 1.1895952224731445, + "learning_rate": 0.00017836523538541127, + "loss": 1.4318, + "step": 2101 + }, + { + "epoch": 0.2173676999043458, + "grad_norm": 1.3722859621047974, + "learning_rate": 0.00017835488877392653, + "loss": 1.5428, + "step": 2102 + }, + { + "epoch": 0.2174711098472118, + "grad_norm": 0.909134030342102, + "learning_rate": 0.0001783445421624418, + "loss": 1.7826, + "step": 2103 + }, + { + "epoch": 0.2175745197900778, + "grad_norm": 3.7859716415405273, + "learning_rate": 0.00017833419555095706, + "loss": 1.5052, + "step": 2104 + }, + { + "epoch": 0.2176779297329438, + "grad_norm": 2.5243780612945557, + "learning_rate": 0.00017832384893947233, + "loss": 2.3734, + "step": 2105 + }, + { + "epoch": 0.21778133967580984, + "grad_norm": 1.4078128337860107, + "learning_rate": 0.0001783135023279876, + "loss": 2.0086, + "step": 2106 + }, + { + "epoch": 0.21788474961867585, + "grad_norm": 1.2615593671798706, + "learning_rate": 0.00017830315571650283, + "loss": 1.6874, + "step": 2107 + }, + { + "epoch": 0.21798815956154186, + "grad_norm": 2.134047031402588, + "learning_rate": 0.0001782928091050181, + "loss": 1.9679, + "step": 2108 + }, + { + "epoch": 0.21809156950440786, + "grad_norm": 1.8929492235183716, + "learning_rate": 0.00017828246249353336, + "loss": 1.5872, + "step": 2109 + }, + { + "epoch": 0.21819497944727387, + "grad_norm": 1.081035852432251, + "learning_rate": 0.00017827211588204863, + "loss": 1.2067, + "step": 2110 + }, + { + "epoch": 0.21829838939013987, + "grad_norm": 1.5089231729507446, + "learning_rate": 0.0001782617692705639, + "loss": 1.6677, + "step": 2111 + }, + { + "epoch": 0.21840179933300588, + "grad_norm": 1.6253288984298706, + "learning_rate": 0.00017825142265907916, + "loss": 1.5697, + "step": 2112 + }, + { + "epoch": 0.2185052092758719, + "grad_norm": 1.9758878946304321, + "learning_rate": 0.0001782410760475944, + "loss": 1.7322, + "step": 2113 + }, + { + "epoch": 0.2186086192187379, + "grad_norm": 3.3685495853424072, + "learning_rate": 0.00017823072943610967, + "loss": 2.1612, + "step": 2114 + }, + { + "epoch": 0.2187120291616039, + "grad_norm": 0.7514423131942749, + "learning_rate": 0.00017822038282462493, + "loss": 1.8737, + "step": 2115 + }, + { + "epoch": 0.2188154391044699, + "grad_norm": 1.4034844636917114, + "learning_rate": 0.0001782100362131402, + "loss": 1.5761, + "step": 2116 + }, + { + "epoch": 0.2189188490473359, + "grad_norm": 2.4100217819213867, + "learning_rate": 0.00017819968960165546, + "loss": 1.6346, + "step": 2117 + }, + { + "epoch": 0.21902225899020192, + "grad_norm": 0.9915169477462769, + "learning_rate": 0.00017818934299017073, + "loss": 1.3795, + "step": 2118 + }, + { + "epoch": 0.21912566893306792, + "grad_norm": 1.9723891019821167, + "learning_rate": 0.000178178996378686, + "loss": 1.4034, + "step": 2119 + }, + { + "epoch": 0.21922907887593393, + "grad_norm": 1.6976430416107178, + "learning_rate": 0.00017816864976720126, + "loss": 1.9929, + "step": 2120 + }, + { + "epoch": 0.21933248881879994, + "grad_norm": 2.5415894985198975, + "learning_rate": 0.0001781583031557165, + "loss": 1.4351, + "step": 2121 + }, + { + "epoch": 0.21943589876166594, + "grad_norm": 2.1857051849365234, + "learning_rate": 0.00017814795654423176, + "loss": 1.3509, + "step": 2122 + }, + { + "epoch": 0.21953930870453195, + "grad_norm": 1.309126853942871, + "learning_rate": 0.00017813760993274703, + "loss": 2.0322, + "step": 2123 + }, + { + "epoch": 0.21964271864739796, + "grad_norm": 1.8605514764785767, + "learning_rate": 0.0001781272633212623, + "loss": 1.831, + "step": 2124 + }, + { + "epoch": 0.21974612859026396, + "grad_norm": 1.8593494892120361, + "learning_rate": 0.00017811691670977756, + "loss": 1.3086, + "step": 2125 + }, + { + "epoch": 0.21984953853312997, + "grad_norm": 1.4725764989852905, + "learning_rate": 0.00017810657009829282, + "loss": 2.0356, + "step": 2126 + }, + { + "epoch": 0.21995294847599597, + "grad_norm": 1.6571112871170044, + "learning_rate": 0.0001780962234868081, + "loss": 1.4547, + "step": 2127 + }, + { + "epoch": 0.22005635841886198, + "grad_norm": 1.592392086982727, + "learning_rate": 0.00017808587687532336, + "loss": 1.8363, + "step": 2128 + }, + { + "epoch": 0.220159768361728, + "grad_norm": 1.4015467166900635, + "learning_rate": 0.0001780755302638386, + "loss": 1.6763, + "step": 2129 + }, + { + "epoch": 0.220263178304594, + "grad_norm": 1.4670350551605225, + "learning_rate": 0.00017806518365235386, + "loss": 1.8672, + "step": 2130 + }, + { + "epoch": 0.22036658824746, + "grad_norm": 1.1072840690612793, + "learning_rate": 0.00017805483704086913, + "loss": 1.4605, + "step": 2131 + }, + { + "epoch": 0.220469998190326, + "grad_norm": 2.009456157684326, + "learning_rate": 0.0001780444904293844, + "loss": 2.1299, + "step": 2132 + }, + { + "epoch": 0.220573408133192, + "grad_norm": 2.815999746322632, + "learning_rate": 0.00017803414381789966, + "loss": 1.8397, + "step": 2133 + }, + { + "epoch": 0.22067681807605802, + "grad_norm": 2.340900182723999, + "learning_rate": 0.00017802379720641492, + "loss": 1.9299, + "step": 2134 + }, + { + "epoch": 0.22078022801892402, + "grad_norm": 1.3540948629379272, + "learning_rate": 0.0001780134505949302, + "loss": 1.8427, + "step": 2135 + }, + { + "epoch": 0.22088363796179003, + "grad_norm": 1.0881321430206299, + "learning_rate": 0.00017800310398344545, + "loss": 1.4275, + "step": 2136 + }, + { + "epoch": 0.22098704790465604, + "grad_norm": 1.0537045001983643, + "learning_rate": 0.0001779927573719607, + "loss": 1.662, + "step": 2137 + }, + { + "epoch": 0.22109045784752204, + "grad_norm": 2.121901273727417, + "learning_rate": 0.00017798241076047596, + "loss": 1.5767, + "step": 2138 + }, + { + "epoch": 0.22119386779038805, + "grad_norm": 1.318829894065857, + "learning_rate": 0.00017797206414899122, + "loss": 1.9675, + "step": 2139 + }, + { + "epoch": 0.22129727773325406, + "grad_norm": 2.5058493614196777, + "learning_rate": 0.0001779617175375065, + "loss": 1.6812, + "step": 2140 + }, + { + "epoch": 0.22140068767612006, + "grad_norm": 1.9833593368530273, + "learning_rate": 0.00017795137092602175, + "loss": 1.672, + "step": 2141 + }, + { + "epoch": 0.22150409761898607, + "grad_norm": 1.5669039487838745, + "learning_rate": 0.00017794102431453702, + "loss": 1.4147, + "step": 2142 + }, + { + "epoch": 0.22160750756185207, + "grad_norm": 1.6391706466674805, + "learning_rate": 0.00017793067770305226, + "loss": 0.935, + "step": 2143 + }, + { + "epoch": 0.22171091750471808, + "grad_norm": 2.4117825031280518, + "learning_rate": 0.00017792033109156752, + "loss": 1.5555, + "step": 2144 + }, + { + "epoch": 0.2218143274475841, + "grad_norm": 1.7879774570465088, + "learning_rate": 0.0001779099844800828, + "loss": 1.7854, + "step": 2145 + }, + { + "epoch": 0.2219177373904501, + "grad_norm": 1.3162338733673096, + "learning_rate": 0.00017789963786859806, + "loss": 1.9177, + "step": 2146 + }, + { + "epoch": 0.2220211473333161, + "grad_norm": 2.763244152069092, + "learning_rate": 0.00017788929125711332, + "loss": 2.4389, + "step": 2147 + }, + { + "epoch": 0.2221245572761821, + "grad_norm": 1.8798487186431885, + "learning_rate": 0.00017787894464562859, + "loss": 1.8481, + "step": 2148 + }, + { + "epoch": 0.2222279672190481, + "grad_norm": 1.089530348777771, + "learning_rate": 0.00017786859803414382, + "loss": 1.4647, + "step": 2149 + }, + { + "epoch": 0.22233137716191412, + "grad_norm": 1.5578677654266357, + "learning_rate": 0.0001778582514226591, + "loss": 2.0841, + "step": 2150 + }, + { + "epoch": 0.22243478710478012, + "grad_norm": 1.403699278831482, + "learning_rate": 0.00017784790481117436, + "loss": 2.2018, + "step": 2151 + }, + { + "epoch": 0.22253819704764613, + "grad_norm": 1.9366167783737183, + "learning_rate": 0.00017783755819968962, + "loss": 1.8533, + "step": 2152 + }, + { + "epoch": 0.22264160699051214, + "grad_norm": 1.0486398935317993, + "learning_rate": 0.0001778272115882049, + "loss": 2.0046, + "step": 2153 + }, + { + "epoch": 0.22274501693337814, + "grad_norm": 2.3693816661834717, + "learning_rate": 0.00017781686497672015, + "loss": 1.5874, + "step": 2154 + }, + { + "epoch": 0.22284842687624415, + "grad_norm": 1.4884207248687744, + "learning_rate": 0.0001778065183652354, + "loss": 1.4696, + "step": 2155 + }, + { + "epoch": 0.22295183681911016, + "grad_norm": 1.5521442890167236, + "learning_rate": 0.00017779617175375066, + "loss": 1.6104, + "step": 2156 + }, + { + "epoch": 0.22305524676197616, + "grad_norm": 2.393778085708618, + "learning_rate": 0.00017778582514226592, + "loss": 2.2676, + "step": 2157 + }, + { + "epoch": 0.22315865670484217, + "grad_norm": 1.3362990617752075, + "learning_rate": 0.0001777754785307812, + "loss": 1.5341, + "step": 2158 + }, + { + "epoch": 0.22326206664770817, + "grad_norm": 1.3599947690963745, + "learning_rate": 0.00017776513191929645, + "loss": 1.6264, + "step": 2159 + }, + { + "epoch": 0.22336547659057418, + "grad_norm": 1.904009222984314, + "learning_rate": 0.00017775478530781172, + "loss": 1.5065, + "step": 2160 + }, + { + "epoch": 0.2234688865334402, + "grad_norm": 1.6462085247039795, + "learning_rate": 0.00017774443869632696, + "loss": 1.5861, + "step": 2161 + }, + { + "epoch": 0.2235722964763062, + "grad_norm": 1.7757351398468018, + "learning_rate": 0.00017773409208484222, + "loss": 2.2307, + "step": 2162 + }, + { + "epoch": 0.2236757064191722, + "grad_norm": 1.2958214282989502, + "learning_rate": 0.0001777237454733575, + "loss": 1.764, + "step": 2163 + }, + { + "epoch": 0.2237791163620382, + "grad_norm": 1.368507981300354, + "learning_rate": 0.00017771339886187275, + "loss": 2.3469, + "step": 2164 + }, + { + "epoch": 0.2238825263049042, + "grad_norm": 1.3048007488250732, + "learning_rate": 0.00017770305225038802, + "loss": 1.8811, + "step": 2165 + }, + { + "epoch": 0.22398593624777022, + "grad_norm": 1.4118998050689697, + "learning_rate": 0.00017769270563890329, + "loss": 1.2182, + "step": 2166 + }, + { + "epoch": 0.22408934619063622, + "grad_norm": 2.8922696113586426, + "learning_rate": 0.00017768235902741852, + "loss": 1.9509, + "step": 2167 + }, + { + "epoch": 0.22419275613350223, + "grad_norm": 1.4795236587524414, + "learning_rate": 0.0001776720124159338, + "loss": 1.8401, + "step": 2168 + }, + { + "epoch": 0.22429616607636824, + "grad_norm": 2.1654906272888184, + "learning_rate": 0.00017766166580444905, + "loss": 1.8052, + "step": 2169 + }, + { + "epoch": 0.22439957601923424, + "grad_norm": 1.3528178930282593, + "learning_rate": 0.00017765131919296432, + "loss": 1.5025, + "step": 2170 + }, + { + "epoch": 0.22450298596210025, + "grad_norm": 1.5375784635543823, + "learning_rate": 0.00017764097258147959, + "loss": 2.1606, + "step": 2171 + }, + { + "epoch": 0.22460639590496626, + "grad_norm": 1.4318784475326538, + "learning_rate": 0.00017763062596999485, + "loss": 1.6414, + "step": 2172 + }, + { + "epoch": 0.22470980584783226, + "grad_norm": 2.0012366771698, + "learning_rate": 0.0001776202793585101, + "loss": 1.8316, + "step": 2173 + }, + { + "epoch": 0.22481321579069827, + "grad_norm": 1.7899354696273804, + "learning_rate": 0.00017760993274702536, + "loss": 1.3864, + "step": 2174 + }, + { + "epoch": 0.22491662573356427, + "grad_norm": 1.3995294570922852, + "learning_rate": 0.00017759958613554062, + "loss": 1.7188, + "step": 2175 + }, + { + "epoch": 0.22502003567643028, + "grad_norm": 2.419296979904175, + "learning_rate": 0.0001775892395240559, + "loss": 1.9595, + "step": 2176 + }, + { + "epoch": 0.2251234456192963, + "grad_norm": 2.1976547241210938, + "learning_rate": 0.00017757889291257115, + "loss": 1.7631, + "step": 2177 + }, + { + "epoch": 0.2252268555621623, + "grad_norm": 1.25783371925354, + "learning_rate": 0.00017756854630108642, + "loss": 1.0977, + "step": 2178 + }, + { + "epoch": 0.2253302655050283, + "grad_norm": 2.3799781799316406, + "learning_rate": 0.00017755819968960166, + "loss": 1.6176, + "step": 2179 + }, + { + "epoch": 0.2254336754478943, + "grad_norm": 1.4703243970870972, + "learning_rate": 0.00017754785307811692, + "loss": 1.1459, + "step": 2180 + }, + { + "epoch": 0.2255370853907603, + "grad_norm": 1.6233093738555908, + "learning_rate": 0.0001775375064666322, + "loss": 1.6363, + "step": 2181 + }, + { + "epoch": 0.22564049533362632, + "grad_norm": 1.394158959388733, + "learning_rate": 0.00017752715985514745, + "loss": 1.4521, + "step": 2182 + }, + { + "epoch": 0.22574390527649232, + "grad_norm": 2.117434501647949, + "learning_rate": 0.00017751681324366272, + "loss": 1.7419, + "step": 2183 + }, + { + "epoch": 0.22584731521935833, + "grad_norm": 1.5713629722595215, + "learning_rate": 0.00017750646663217798, + "loss": 1.6596, + "step": 2184 + }, + { + "epoch": 0.22595072516222434, + "grad_norm": 1.4215084314346313, + "learning_rate": 0.00017749612002069322, + "loss": 1.6273, + "step": 2185 + }, + { + "epoch": 0.22605413510509034, + "grad_norm": 2.092386484146118, + "learning_rate": 0.0001774857734092085, + "loss": 1.6127, + "step": 2186 + }, + { + "epoch": 0.22615754504795635, + "grad_norm": 1.4384335279464722, + "learning_rate": 0.00017747542679772375, + "loss": 2.0603, + "step": 2187 + }, + { + "epoch": 0.22626095499082235, + "grad_norm": 1.191169261932373, + "learning_rate": 0.00017746508018623902, + "loss": 0.8099, + "step": 2188 + }, + { + "epoch": 0.22636436493368836, + "grad_norm": 1.1086655855178833, + "learning_rate": 0.00017745473357475428, + "loss": 1.9708, + "step": 2189 + }, + { + "epoch": 0.22646777487655437, + "grad_norm": 1.1252591609954834, + "learning_rate": 0.00017744438696326955, + "loss": 1.4266, + "step": 2190 + }, + { + "epoch": 0.22657118481942037, + "grad_norm": 2.127028226852417, + "learning_rate": 0.0001774340403517848, + "loss": 1.4415, + "step": 2191 + }, + { + "epoch": 0.2266745947622864, + "grad_norm": 1.0664880275726318, + "learning_rate": 0.00017742369374030005, + "loss": 1.7211, + "step": 2192 + }, + { + "epoch": 0.22677800470515241, + "grad_norm": 1.391688585281372, + "learning_rate": 0.00017741334712881532, + "loss": 2.242, + "step": 2193 + }, + { + "epoch": 0.22688141464801842, + "grad_norm": 1.4060182571411133, + "learning_rate": 0.00017740300051733059, + "loss": 1.7529, + "step": 2194 + }, + { + "epoch": 0.22698482459088443, + "grad_norm": 1.3024177551269531, + "learning_rate": 0.00017739265390584585, + "loss": 1.2435, + "step": 2195 + }, + { + "epoch": 0.22708823453375043, + "grad_norm": 0.9886972308158875, + "learning_rate": 0.0001773823072943611, + "loss": 1.177, + "step": 2196 + }, + { + "epoch": 0.22719164447661644, + "grad_norm": 1.9852739572525024, + "learning_rate": 0.00017737196068287636, + "loss": 1.4258, + "step": 2197 + }, + { + "epoch": 0.22729505441948245, + "grad_norm": 1.706716537475586, + "learning_rate": 0.00017736161407139162, + "loss": 1.7284, + "step": 2198 + }, + { + "epoch": 0.22739846436234845, + "grad_norm": 2.0738205909729004, + "learning_rate": 0.00017735126745990689, + "loss": 2.6703, + "step": 2199 + }, + { + "epoch": 0.22750187430521446, + "grad_norm": 2.022379159927368, + "learning_rate": 0.00017734092084842215, + "loss": 1.1353, + "step": 2200 + }, + { + "epoch": 0.22760528424808046, + "grad_norm": 2.1549413204193115, + "learning_rate": 0.00017733057423693742, + "loss": 1.8087, + "step": 2201 + }, + { + "epoch": 0.22770869419094647, + "grad_norm": 1.0443583726882935, + "learning_rate": 0.00017732022762545266, + "loss": 1.1172, + "step": 2202 + }, + { + "epoch": 0.22781210413381248, + "grad_norm": 2.4402153491973877, + "learning_rate": 0.00017730988101396792, + "loss": 1.6493, + "step": 2203 + }, + { + "epoch": 0.22791551407667848, + "grad_norm": 1.2988420724868774, + "learning_rate": 0.0001772995344024832, + "loss": 1.4614, + "step": 2204 + }, + { + "epoch": 0.2280189240195445, + "grad_norm": 1.1931614875793457, + "learning_rate": 0.00017728918779099845, + "loss": 1.9354, + "step": 2205 + }, + { + "epoch": 0.2281223339624105, + "grad_norm": 1.1833059787750244, + "learning_rate": 0.00017727884117951372, + "loss": 1.4399, + "step": 2206 + }, + { + "epoch": 0.2282257439052765, + "grad_norm": 3.9384918212890625, + "learning_rate": 0.00017726849456802898, + "loss": 2.5865, + "step": 2207 + }, + { + "epoch": 0.2283291538481425, + "grad_norm": 1.3188074827194214, + "learning_rate": 0.00017725814795654422, + "loss": 1.9454, + "step": 2208 + }, + { + "epoch": 0.2284325637910085, + "grad_norm": 1.1684150695800781, + "learning_rate": 0.0001772478013450595, + "loss": 1.5983, + "step": 2209 + }, + { + "epoch": 0.22853597373387452, + "grad_norm": 1.6735270023345947, + "learning_rate": 0.00017723745473357475, + "loss": 1.4366, + "step": 2210 + }, + { + "epoch": 0.22863938367674053, + "grad_norm": 1.5910207033157349, + "learning_rate": 0.00017722710812209002, + "loss": 2.0415, + "step": 2211 + }, + { + "epoch": 0.22874279361960653, + "grad_norm": 1.5946006774902344, + "learning_rate": 0.00017721676151060528, + "loss": 1.7567, + "step": 2212 + }, + { + "epoch": 0.22884620356247254, + "grad_norm": 1.840343713760376, + "learning_rate": 0.00017720641489912055, + "loss": 1.3738, + "step": 2213 + }, + { + "epoch": 0.22894961350533855, + "grad_norm": 1.5357155799865723, + "learning_rate": 0.0001771960682876358, + "loss": 1.8191, + "step": 2214 + }, + { + "epoch": 0.22905302344820455, + "grad_norm": 1.179421067237854, + "learning_rate": 0.00017718572167615105, + "loss": 1.5523, + "step": 2215 + }, + { + "epoch": 0.22915643339107056, + "grad_norm": 1.764402151107788, + "learning_rate": 0.00017717537506466632, + "loss": 1.1171, + "step": 2216 + }, + { + "epoch": 0.22925984333393656, + "grad_norm": 1.2253751754760742, + "learning_rate": 0.00017716502845318159, + "loss": 1.751, + "step": 2217 + }, + { + "epoch": 0.22936325327680257, + "grad_norm": 2.0649373531341553, + "learning_rate": 0.00017715468184169685, + "loss": 1.865, + "step": 2218 + }, + { + "epoch": 0.22946666321966858, + "grad_norm": 1.24588143825531, + "learning_rate": 0.00017714433523021212, + "loss": 1.9165, + "step": 2219 + }, + { + "epoch": 0.22957007316253458, + "grad_norm": 1.8557204008102417, + "learning_rate": 0.00017713398861872735, + "loss": 1.5765, + "step": 2220 + }, + { + "epoch": 0.2296734831054006, + "grad_norm": 1.494800090789795, + "learning_rate": 0.00017712364200724262, + "loss": 1.8745, + "step": 2221 + }, + { + "epoch": 0.2297768930482666, + "grad_norm": 1.9378629922866821, + "learning_rate": 0.00017711329539575789, + "loss": 2.0202, + "step": 2222 + }, + { + "epoch": 0.2298803029911326, + "grad_norm": 2.8973958492279053, + "learning_rate": 0.00017710294878427315, + "loss": 1.1802, + "step": 2223 + }, + { + "epoch": 0.2299837129339986, + "grad_norm": 2.0168254375457764, + "learning_rate": 0.00017709260217278842, + "loss": 1.5332, + "step": 2224 + }, + { + "epoch": 0.2300871228768646, + "grad_norm": 0.7732300758361816, + "learning_rate": 0.00017708225556130368, + "loss": 0.937, + "step": 2225 + }, + { + "epoch": 0.23019053281973062, + "grad_norm": 2.062314987182617, + "learning_rate": 0.00017707190894981892, + "loss": 2.1582, + "step": 2226 + }, + { + "epoch": 0.23029394276259663, + "grad_norm": 1.475707769393921, + "learning_rate": 0.0001770615623383342, + "loss": 1.8616, + "step": 2227 + }, + { + "epoch": 0.23039735270546263, + "grad_norm": 1.7562530040740967, + "learning_rate": 0.00017705121572684945, + "loss": 1.6349, + "step": 2228 + }, + { + "epoch": 0.23050076264832864, + "grad_norm": 3.099172592163086, + "learning_rate": 0.00017704086911536472, + "loss": 2.1556, + "step": 2229 + }, + { + "epoch": 0.23060417259119464, + "grad_norm": 1.312379240989685, + "learning_rate": 0.00017703052250387998, + "loss": 1.5145, + "step": 2230 + }, + { + "epoch": 0.23070758253406065, + "grad_norm": 1.0672568082809448, + "learning_rate": 0.00017702017589239525, + "loss": 1.5323, + "step": 2231 + }, + { + "epoch": 0.23081099247692666, + "grad_norm": 2.077061891555786, + "learning_rate": 0.0001770098292809105, + "loss": 1.9191, + "step": 2232 + }, + { + "epoch": 0.23091440241979266, + "grad_norm": 3.2172346115112305, + "learning_rate": 0.00017699948266942575, + "loss": 1.3069, + "step": 2233 + }, + { + "epoch": 0.23101781236265867, + "grad_norm": 1.9069652557373047, + "learning_rate": 0.00017698913605794102, + "loss": 1.8306, + "step": 2234 + }, + { + "epoch": 0.23112122230552468, + "grad_norm": 1.4115225076675415, + "learning_rate": 0.00017697878944645628, + "loss": 1.1319, + "step": 2235 + }, + { + "epoch": 0.23122463224839068, + "grad_norm": 1.0676754713058472, + "learning_rate": 0.00017696844283497155, + "loss": 1.8715, + "step": 2236 + }, + { + "epoch": 0.2313280421912567, + "grad_norm": 1.7754651308059692, + "learning_rate": 0.00017695809622348682, + "loss": 1.6673, + "step": 2237 + }, + { + "epoch": 0.2314314521341227, + "grad_norm": 1.3493293523788452, + "learning_rate": 0.00017694774961200205, + "loss": 1.7102, + "step": 2238 + }, + { + "epoch": 0.2315348620769887, + "grad_norm": 1.344414472579956, + "learning_rate": 0.00017693740300051732, + "loss": 1.5833, + "step": 2239 + }, + { + "epoch": 0.2316382720198547, + "grad_norm": 2.3626956939697266, + "learning_rate": 0.00017692705638903259, + "loss": 1.6388, + "step": 2240 + }, + { + "epoch": 0.2317416819627207, + "grad_norm": 1.318768858909607, + "learning_rate": 0.00017691670977754785, + "loss": 1.4585, + "step": 2241 + }, + { + "epoch": 0.23184509190558672, + "grad_norm": 1.23952054977417, + "learning_rate": 0.00017690636316606312, + "loss": 1.211, + "step": 2242 + }, + { + "epoch": 0.23194850184845273, + "grad_norm": 3.935603380203247, + "learning_rate": 0.00017689601655457838, + "loss": 2.0167, + "step": 2243 + }, + { + "epoch": 0.23205191179131873, + "grad_norm": 1.5189486742019653, + "learning_rate": 0.00017688566994309365, + "loss": 2.0937, + "step": 2244 + }, + { + "epoch": 0.23215532173418474, + "grad_norm": 2.4079957008361816, + "learning_rate": 0.0001768753233316089, + "loss": 1.1639, + "step": 2245 + }, + { + "epoch": 0.23225873167705074, + "grad_norm": 1.9767324924468994, + "learning_rate": 0.00017686497672012415, + "loss": 1.7567, + "step": 2246 + }, + { + "epoch": 0.23236214161991675, + "grad_norm": 2.0366764068603516, + "learning_rate": 0.00017685463010863942, + "loss": 1.8011, + "step": 2247 + }, + { + "epoch": 0.23246555156278276, + "grad_norm": 1.6386555433273315, + "learning_rate": 0.00017684428349715468, + "loss": 1.8492, + "step": 2248 + }, + { + "epoch": 0.23256896150564876, + "grad_norm": 1.8099335432052612, + "learning_rate": 0.00017683393688566995, + "loss": 1.5167, + "step": 2249 + }, + { + "epoch": 0.23267237144851477, + "grad_norm": 1.8088555335998535, + "learning_rate": 0.00017682359027418521, + "loss": 1.8067, + "step": 2250 + }, + { + "epoch": 0.23277578139138078, + "grad_norm": 1.6619300842285156, + "learning_rate": 0.00017681324366270048, + "loss": 2.3162, + "step": 2251 + }, + { + "epoch": 0.23287919133424678, + "grad_norm": 1.1905879974365234, + "learning_rate": 0.00017680289705121574, + "loss": 1.6536, + "step": 2252 + }, + { + "epoch": 0.2329826012771128, + "grad_norm": 1.5096153020858765, + "learning_rate": 0.000176792550439731, + "loss": 2.1118, + "step": 2253 + }, + { + "epoch": 0.2330860112199788, + "grad_norm": 0.9325785040855408, + "learning_rate": 0.00017678220382824625, + "loss": 2.137, + "step": 2254 + }, + { + "epoch": 0.2331894211628448, + "grad_norm": 2.137453079223633, + "learning_rate": 0.00017677185721676151, + "loss": 2.3314, + "step": 2255 + }, + { + "epoch": 0.2332928311057108, + "grad_norm": 1.8949631452560425, + "learning_rate": 0.00017676151060527678, + "loss": 1.5151, + "step": 2256 + }, + { + "epoch": 0.2333962410485768, + "grad_norm": 1.6897765398025513, + "learning_rate": 0.00017675116399379205, + "loss": 1.5015, + "step": 2257 + }, + { + "epoch": 0.23349965099144282, + "grad_norm": 1.646003246307373, + "learning_rate": 0.0001767408173823073, + "loss": 1.7934, + "step": 2258 + }, + { + "epoch": 0.23360306093430883, + "grad_norm": 2.5955355167388916, + "learning_rate": 0.00017673047077082258, + "loss": 2.2967, + "step": 2259 + }, + { + "epoch": 0.23370647087717483, + "grad_norm": 1.4860060214996338, + "learning_rate": 0.00017672012415933784, + "loss": 1.8974, + "step": 2260 + }, + { + "epoch": 0.23380988082004084, + "grad_norm": 1.9003316164016724, + "learning_rate": 0.0001767097775478531, + "loss": 1.5562, + "step": 2261 + }, + { + "epoch": 0.23391329076290684, + "grad_norm": 2.3601090908050537, + "learning_rate": 0.00017669943093636835, + "loss": 2.0441, + "step": 2262 + }, + { + "epoch": 0.23401670070577285, + "grad_norm": 1.5946472883224487, + "learning_rate": 0.0001766890843248836, + "loss": 1.4005, + "step": 2263 + }, + { + "epoch": 0.23412011064863886, + "grad_norm": 2.192887783050537, + "learning_rate": 0.00017667873771339888, + "loss": 1.6484, + "step": 2264 + }, + { + "epoch": 0.23422352059150486, + "grad_norm": 2.673560619354248, + "learning_rate": 0.00017666839110191414, + "loss": 1.7329, + "step": 2265 + }, + { + "epoch": 0.23432693053437087, + "grad_norm": 1.700700044631958, + "learning_rate": 0.0001766580444904294, + "loss": 1.4666, + "step": 2266 + }, + { + "epoch": 0.23443034047723688, + "grad_norm": 2.216442346572876, + "learning_rate": 0.00017664769787894467, + "loss": 1.9816, + "step": 2267 + }, + { + "epoch": 0.23453375042010288, + "grad_norm": 2.568310022354126, + "learning_rate": 0.0001766373512674599, + "loss": 1.6597, + "step": 2268 + }, + { + "epoch": 0.2346371603629689, + "grad_norm": 2.262301206588745, + "learning_rate": 0.00017662700465597518, + "loss": 2.1794, + "step": 2269 + }, + { + "epoch": 0.2347405703058349, + "grad_norm": 1.755656361579895, + "learning_rate": 0.00017661665804449044, + "loss": 2.4144, + "step": 2270 + }, + { + "epoch": 0.2348439802487009, + "grad_norm": 1.864424705505371, + "learning_rate": 0.0001766063114330057, + "loss": 1.8568, + "step": 2271 + }, + { + "epoch": 0.2349473901915669, + "grad_norm": 0.9477588534355164, + "learning_rate": 0.00017659596482152097, + "loss": 1.8716, + "step": 2272 + }, + { + "epoch": 0.2350508001344329, + "grad_norm": 1.9028419256210327, + "learning_rate": 0.00017658561821003624, + "loss": 1.9195, + "step": 2273 + }, + { + "epoch": 0.23515421007729892, + "grad_norm": 1.5173331499099731, + "learning_rate": 0.00017657527159855148, + "loss": 1.8441, + "step": 2274 + }, + { + "epoch": 0.23525762002016493, + "grad_norm": 1.5247029066085815, + "learning_rate": 0.00017656492498706674, + "loss": 1.1869, + "step": 2275 + }, + { + "epoch": 0.23536102996303093, + "grad_norm": 1.830028772354126, + "learning_rate": 0.000176554578375582, + "loss": 1.7555, + "step": 2276 + }, + { + "epoch": 0.23546443990589694, + "grad_norm": 2.476260185241699, + "learning_rate": 0.00017654423176409728, + "loss": 1.7501, + "step": 2277 + }, + { + "epoch": 0.23556784984876297, + "grad_norm": 1.660536289215088, + "learning_rate": 0.00017653388515261254, + "loss": 1.5538, + "step": 2278 + }, + { + "epoch": 0.23567125979162898, + "grad_norm": 2.4655914306640625, + "learning_rate": 0.0001765235385411278, + "loss": 2.124, + "step": 2279 + }, + { + "epoch": 0.23577466973449498, + "grad_norm": 2.0129129886627197, + "learning_rate": 0.00017651319192964305, + "loss": 1.624, + "step": 2280 + }, + { + "epoch": 0.235878079677361, + "grad_norm": 1.2003356218338013, + "learning_rate": 0.0001765028453181583, + "loss": 1.4315, + "step": 2281 + }, + { + "epoch": 0.235981489620227, + "grad_norm": 1.2782875299453735, + "learning_rate": 0.00017649249870667358, + "loss": 1.7513, + "step": 2282 + }, + { + "epoch": 0.236084899563093, + "grad_norm": 2.0260257720947266, + "learning_rate": 0.00017648215209518884, + "loss": 1.9921, + "step": 2283 + }, + { + "epoch": 0.236188309505959, + "grad_norm": 1.1111077070236206, + "learning_rate": 0.0001764718054837041, + "loss": 1.8772, + "step": 2284 + }, + { + "epoch": 0.23629171944882502, + "grad_norm": 1.1391185522079468, + "learning_rate": 0.00017646145887221937, + "loss": 1.3482, + "step": 2285 + }, + { + "epoch": 0.23639512939169102, + "grad_norm": 1.0886589288711548, + "learning_rate": 0.0001764511122607346, + "loss": 1.0132, + "step": 2286 + }, + { + "epoch": 0.23649853933455703, + "grad_norm": 1.4947338104248047, + "learning_rate": 0.00017644076564924988, + "loss": 1.5809, + "step": 2287 + }, + { + "epoch": 0.23660194927742303, + "grad_norm": 1.7177505493164062, + "learning_rate": 0.00017643041903776514, + "loss": 1.2868, + "step": 2288 + }, + { + "epoch": 0.23670535922028904, + "grad_norm": 1.0610607862472534, + "learning_rate": 0.0001764200724262804, + "loss": 2.027, + "step": 2289 + }, + { + "epoch": 0.23680876916315505, + "grad_norm": 1.152755856513977, + "learning_rate": 0.00017640972581479567, + "loss": 1.4798, + "step": 2290 + }, + { + "epoch": 0.23691217910602105, + "grad_norm": 1.1068141460418701, + "learning_rate": 0.00017639937920331094, + "loss": 2.0155, + "step": 2291 + }, + { + "epoch": 0.23701558904888706, + "grad_norm": 1.3579416275024414, + "learning_rate": 0.00017638903259182618, + "loss": 1.878, + "step": 2292 + }, + { + "epoch": 0.23711899899175307, + "grad_norm": 1.3110994100570679, + "learning_rate": 0.00017637868598034144, + "loss": 1.5488, + "step": 2293 + }, + { + "epoch": 0.23722240893461907, + "grad_norm": 3.31453275680542, + "learning_rate": 0.0001763683393688567, + "loss": 2.1308, + "step": 2294 + }, + { + "epoch": 0.23732581887748508, + "grad_norm": 0.9859296679496765, + "learning_rate": 0.00017635799275737197, + "loss": 1.7665, + "step": 2295 + }, + { + "epoch": 0.23742922882035108, + "grad_norm": 1.7569581270217896, + "learning_rate": 0.00017634764614588724, + "loss": 1.7785, + "step": 2296 + }, + { + "epoch": 0.2375326387632171, + "grad_norm": 1.4133583307266235, + "learning_rate": 0.0001763372995344025, + "loss": 1.7305, + "step": 2297 + }, + { + "epoch": 0.2376360487060831, + "grad_norm": 3.925902843475342, + "learning_rate": 0.00017632695292291774, + "loss": 2.0507, + "step": 2298 + }, + { + "epoch": 0.2377394586489491, + "grad_norm": 1.5905970335006714, + "learning_rate": 0.000176316606311433, + "loss": 1.6149, + "step": 2299 + }, + { + "epoch": 0.2378428685918151, + "grad_norm": 1.6597696542739868, + "learning_rate": 0.00017630625969994828, + "loss": 1.4731, + "step": 2300 + }, + { + "epoch": 0.23794627853468112, + "grad_norm": 1.602620005607605, + "learning_rate": 0.00017629591308846354, + "loss": 1.8871, + "step": 2301 + }, + { + "epoch": 0.23804968847754712, + "grad_norm": 1.0693349838256836, + "learning_rate": 0.0001762855664769788, + "loss": 1.3862, + "step": 2302 + }, + { + "epoch": 0.23815309842041313, + "grad_norm": 1.631103277206421, + "learning_rate": 0.00017627521986549407, + "loss": 1.8944, + "step": 2303 + }, + { + "epoch": 0.23825650836327913, + "grad_norm": 2.4888055324554443, + "learning_rate": 0.0001762648732540093, + "loss": 2.5028, + "step": 2304 + }, + { + "epoch": 0.23835991830614514, + "grad_norm": 1.9068800210952759, + "learning_rate": 0.00017625452664252458, + "loss": 1.3553, + "step": 2305 + }, + { + "epoch": 0.23846332824901115, + "grad_norm": 2.230555295944214, + "learning_rate": 0.00017624418003103984, + "loss": 1.9119, + "step": 2306 + }, + { + "epoch": 0.23856673819187715, + "grad_norm": 1.1259522438049316, + "learning_rate": 0.0001762338334195551, + "loss": 1.7393, + "step": 2307 + }, + { + "epoch": 0.23867014813474316, + "grad_norm": 2.369147777557373, + "learning_rate": 0.00017622348680807037, + "loss": 1.8111, + "step": 2308 + }, + { + "epoch": 0.23877355807760917, + "grad_norm": 1.7751003503799438, + "learning_rate": 0.00017621314019658564, + "loss": 1.5761, + "step": 2309 + }, + { + "epoch": 0.23887696802047517, + "grad_norm": 2.357508420944214, + "learning_rate": 0.00017620279358510088, + "loss": 1.8912, + "step": 2310 + }, + { + "epoch": 0.23898037796334118, + "grad_norm": 0.9444859027862549, + "learning_rate": 0.00017619244697361614, + "loss": 2.177, + "step": 2311 + }, + { + "epoch": 0.23908378790620718, + "grad_norm": 1.4139807224273682, + "learning_rate": 0.0001761821003621314, + "loss": 1.5666, + "step": 2312 + }, + { + "epoch": 0.2391871978490732, + "grad_norm": 2.163088321685791, + "learning_rate": 0.00017617175375064667, + "loss": 1.5789, + "step": 2313 + }, + { + "epoch": 0.2392906077919392, + "grad_norm": 1.5765900611877441, + "learning_rate": 0.00017616140713916194, + "loss": 1.7379, + "step": 2314 + }, + { + "epoch": 0.2393940177348052, + "grad_norm": 2.842447280883789, + "learning_rate": 0.0001761510605276772, + "loss": 1.711, + "step": 2315 + }, + { + "epoch": 0.2394974276776712, + "grad_norm": 2.930110216140747, + "learning_rate": 0.00017614071391619244, + "loss": 2.089, + "step": 2316 + }, + { + "epoch": 0.23960083762053722, + "grad_norm": 1.4796249866485596, + "learning_rate": 0.0001761303673047077, + "loss": 1.9833, + "step": 2317 + }, + { + "epoch": 0.23970424756340322, + "grad_norm": 1.5853078365325928, + "learning_rate": 0.00017612002069322297, + "loss": 1.8592, + "step": 2318 + }, + { + "epoch": 0.23980765750626923, + "grad_norm": 0.9812987446784973, + "learning_rate": 0.00017610967408173824, + "loss": 1.7803, + "step": 2319 + }, + { + "epoch": 0.23991106744913523, + "grad_norm": 1.229555606842041, + "learning_rate": 0.0001760993274702535, + "loss": 1.0883, + "step": 2320 + }, + { + "epoch": 0.24001447739200124, + "grad_norm": 2.2885689735412598, + "learning_rate": 0.00017608898085876877, + "loss": 1.7889, + "step": 2321 + }, + { + "epoch": 0.24011788733486725, + "grad_norm": 1.1627824306488037, + "learning_rate": 0.000176078634247284, + "loss": 1.106, + "step": 2322 + }, + { + "epoch": 0.24022129727773325, + "grad_norm": 1.3515424728393555, + "learning_rate": 0.00017606828763579928, + "loss": 1.2491, + "step": 2323 + }, + { + "epoch": 0.24032470722059926, + "grad_norm": 0.9530225396156311, + "learning_rate": 0.00017605794102431454, + "loss": 1.7492, + "step": 2324 + }, + { + "epoch": 0.24042811716346527, + "grad_norm": 1.797919750213623, + "learning_rate": 0.0001760475944128298, + "loss": 1.4208, + "step": 2325 + }, + { + "epoch": 0.24053152710633127, + "grad_norm": 1.2609739303588867, + "learning_rate": 0.00017603724780134507, + "loss": 1.2555, + "step": 2326 + }, + { + "epoch": 0.24063493704919728, + "grad_norm": 1.9391148090362549, + "learning_rate": 0.00017602690118986034, + "loss": 1.0435, + "step": 2327 + }, + { + "epoch": 0.24073834699206328, + "grad_norm": 1.8738831281661987, + "learning_rate": 0.00017601655457837558, + "loss": 1.8572, + "step": 2328 + }, + { + "epoch": 0.2408417569349293, + "grad_norm": 2.168691396713257, + "learning_rate": 0.00017600620796689084, + "loss": 1.7667, + "step": 2329 + }, + { + "epoch": 0.2409451668777953, + "grad_norm": 2.458089828491211, + "learning_rate": 0.0001759958613554061, + "loss": 1.871, + "step": 2330 + }, + { + "epoch": 0.2410485768206613, + "grad_norm": 1.9956097602844238, + "learning_rate": 0.00017598551474392137, + "loss": 2.3792, + "step": 2331 + }, + { + "epoch": 0.2411519867635273, + "grad_norm": 1.3749737739562988, + "learning_rate": 0.00017597516813243664, + "loss": 1.6698, + "step": 2332 + }, + { + "epoch": 0.24125539670639332, + "grad_norm": 1.7299331426620483, + "learning_rate": 0.0001759648215209519, + "loss": 1.64, + "step": 2333 + }, + { + "epoch": 0.24135880664925932, + "grad_norm": 2.118608236312866, + "learning_rate": 0.00017595447490946714, + "loss": 1.3257, + "step": 2334 + }, + { + "epoch": 0.24146221659212533, + "grad_norm": 1.570468544960022, + "learning_rate": 0.0001759441282979824, + "loss": 1.5102, + "step": 2335 + }, + { + "epoch": 0.24156562653499133, + "grad_norm": 2.2476089000701904, + "learning_rate": 0.00017593378168649767, + "loss": 1.6563, + "step": 2336 + }, + { + "epoch": 0.24166903647785734, + "grad_norm": 2.5278682708740234, + "learning_rate": 0.00017592343507501294, + "loss": 1.7088, + "step": 2337 + }, + { + "epoch": 0.24177244642072335, + "grad_norm": 1.7787667512893677, + "learning_rate": 0.0001759130884635282, + "loss": 1.7681, + "step": 2338 + }, + { + "epoch": 0.24187585636358935, + "grad_norm": 1.1270442008972168, + "learning_rate": 0.00017590274185204347, + "loss": 1.477, + "step": 2339 + }, + { + "epoch": 0.24197926630645536, + "grad_norm": 1.9922478199005127, + "learning_rate": 0.0001758923952405587, + "loss": 2.2118, + "step": 2340 + }, + { + "epoch": 0.24208267624932137, + "grad_norm": 1.2730507850646973, + "learning_rate": 0.00017588204862907397, + "loss": 1.5687, + "step": 2341 + }, + { + "epoch": 0.24218608619218737, + "grad_norm": 2.0439341068267822, + "learning_rate": 0.00017587170201758924, + "loss": 1.6693, + "step": 2342 + }, + { + "epoch": 0.24228949613505338, + "grad_norm": 1.3062502145767212, + "learning_rate": 0.0001758613554061045, + "loss": 1.497, + "step": 2343 + }, + { + "epoch": 0.24239290607791938, + "grad_norm": 2.715073823928833, + "learning_rate": 0.00017585100879461977, + "loss": 1.6284, + "step": 2344 + }, + { + "epoch": 0.2424963160207854, + "grad_norm": 1.6569184064865112, + "learning_rate": 0.00017584066218313504, + "loss": 1.6966, + "step": 2345 + }, + { + "epoch": 0.2425997259636514, + "grad_norm": 1.6366918087005615, + "learning_rate": 0.00017583031557165027, + "loss": 1.8435, + "step": 2346 + }, + { + "epoch": 0.2427031359065174, + "grad_norm": 1.7843844890594482, + "learning_rate": 0.00017581996896016554, + "loss": 1.941, + "step": 2347 + }, + { + "epoch": 0.2428065458493834, + "grad_norm": 1.6880625486373901, + "learning_rate": 0.0001758096223486808, + "loss": 1.2436, + "step": 2348 + }, + { + "epoch": 0.24290995579224942, + "grad_norm": 1.39772629737854, + "learning_rate": 0.00017579927573719607, + "loss": 1.3784, + "step": 2349 + }, + { + "epoch": 0.24301336573511542, + "grad_norm": 1.4256283044815063, + "learning_rate": 0.00017578892912571134, + "loss": 1.4929, + "step": 2350 + }, + { + "epoch": 0.24311677567798143, + "grad_norm": 1.4295189380645752, + "learning_rate": 0.0001757785825142266, + "loss": 1.6985, + "step": 2351 + }, + { + "epoch": 0.24322018562084743, + "grad_norm": 2.4880120754241943, + "learning_rate": 0.00017576823590274184, + "loss": 1.9306, + "step": 2352 + }, + { + "epoch": 0.24332359556371344, + "grad_norm": 1.9826879501342773, + "learning_rate": 0.0001757578892912571, + "loss": 1.9971, + "step": 2353 + }, + { + "epoch": 0.24342700550657945, + "grad_norm": 1.8703138828277588, + "learning_rate": 0.00017574754267977237, + "loss": 1.2286, + "step": 2354 + }, + { + "epoch": 0.24353041544944545, + "grad_norm": 2.05326509475708, + "learning_rate": 0.00017573719606828764, + "loss": 1.7075, + "step": 2355 + }, + { + "epoch": 0.24363382539231146, + "grad_norm": 1.259153127670288, + "learning_rate": 0.0001757268494568029, + "loss": 1.7123, + "step": 2356 + }, + { + "epoch": 0.24373723533517747, + "grad_norm": 1.4366209506988525, + "learning_rate": 0.00017571650284531817, + "loss": 1.4503, + "step": 2357 + }, + { + "epoch": 0.24384064527804347, + "grad_norm": 1.6645400524139404, + "learning_rate": 0.0001757061562338334, + "loss": 1.1578, + "step": 2358 + }, + { + "epoch": 0.24394405522090948, + "grad_norm": 1.6528881788253784, + "learning_rate": 0.00017569580962234867, + "loss": 1.6437, + "step": 2359 + }, + { + "epoch": 0.24404746516377548, + "grad_norm": 1.262462854385376, + "learning_rate": 0.00017568546301086394, + "loss": 1.6392, + "step": 2360 + }, + { + "epoch": 0.2441508751066415, + "grad_norm": 1.7609543800354004, + "learning_rate": 0.0001756751163993792, + "loss": 2.3627, + "step": 2361 + }, + { + "epoch": 0.2442542850495075, + "grad_norm": 0.7221395969390869, + "learning_rate": 0.00017566476978789447, + "loss": 1.6682, + "step": 2362 + }, + { + "epoch": 0.2443576949923735, + "grad_norm": 2.4060263633728027, + "learning_rate": 0.00017565442317640974, + "loss": 1.7926, + "step": 2363 + }, + { + "epoch": 0.24446110493523954, + "grad_norm": 1.0417516231536865, + "learning_rate": 0.00017564407656492497, + "loss": 1.544, + "step": 2364 + }, + { + "epoch": 0.24456451487810554, + "grad_norm": 0.9361560344696045, + "learning_rate": 0.00017563372995344024, + "loss": 1.5395, + "step": 2365 + }, + { + "epoch": 0.24466792482097155, + "grad_norm": 1.880168080329895, + "learning_rate": 0.0001756233833419555, + "loss": 2.4219, + "step": 2366 + }, + { + "epoch": 0.24477133476383756, + "grad_norm": 1.9342352151870728, + "learning_rate": 0.00017561303673047077, + "loss": 1.3834, + "step": 2367 + }, + { + "epoch": 0.24487474470670356, + "grad_norm": 2.5511040687561035, + "learning_rate": 0.00017560269011898604, + "loss": 1.7103, + "step": 2368 + }, + { + "epoch": 0.24497815464956957, + "grad_norm": 0.9870395064353943, + "learning_rate": 0.0001755923435075013, + "loss": 1.3327, + "step": 2369 + }, + { + "epoch": 0.24508156459243557, + "grad_norm": 1.2001618146896362, + "learning_rate": 0.00017558199689601657, + "loss": 1.7844, + "step": 2370 + }, + { + "epoch": 0.24518497453530158, + "grad_norm": 1.2673696279525757, + "learning_rate": 0.0001755716502845318, + "loss": 1.6301, + "step": 2371 + }, + { + "epoch": 0.2452883844781676, + "grad_norm": 1.7478164434432983, + "learning_rate": 0.00017556130367304707, + "loss": 1.4385, + "step": 2372 + }, + { + "epoch": 0.2453917944210336, + "grad_norm": 1.680871844291687, + "learning_rate": 0.00017555095706156234, + "loss": 2.3595, + "step": 2373 + }, + { + "epoch": 0.2454952043638996, + "grad_norm": 1.445009469985962, + "learning_rate": 0.0001755406104500776, + "loss": 1.4895, + "step": 2374 + }, + { + "epoch": 0.2455986143067656, + "grad_norm": 1.6499723196029663, + "learning_rate": 0.00017553026383859287, + "loss": 1.0861, + "step": 2375 + }, + { + "epoch": 0.2457020242496316, + "grad_norm": 2.56699538230896, + "learning_rate": 0.00017551991722710813, + "loss": 1.8802, + "step": 2376 + }, + { + "epoch": 0.24580543419249762, + "grad_norm": 1.9932339191436768, + "learning_rate": 0.0001755095706156234, + "loss": 1.8787, + "step": 2377 + }, + { + "epoch": 0.24590884413536362, + "grad_norm": 1.4826245307922363, + "learning_rate": 0.00017549922400413866, + "loss": 1.3792, + "step": 2378 + }, + { + "epoch": 0.24601225407822963, + "grad_norm": 1.8881828784942627, + "learning_rate": 0.0001754888773926539, + "loss": 1.9625, + "step": 2379 + }, + { + "epoch": 0.24611566402109564, + "grad_norm": 3.217658519744873, + "learning_rate": 0.00017547853078116917, + "loss": 1.5442, + "step": 2380 + }, + { + "epoch": 0.24621907396396164, + "grad_norm": 1.5086668729782104, + "learning_rate": 0.00017546818416968443, + "loss": 1.3905, + "step": 2381 + }, + { + "epoch": 0.24632248390682765, + "grad_norm": 1.2646621465682983, + "learning_rate": 0.0001754578375581997, + "loss": 1.0604, + "step": 2382 + }, + { + "epoch": 0.24642589384969366, + "grad_norm": 1.091963529586792, + "learning_rate": 0.00017544749094671497, + "loss": 1.1739, + "step": 2383 + }, + { + "epoch": 0.24652930379255966, + "grad_norm": 1.5914099216461182, + "learning_rate": 0.00017543714433523023, + "loss": 2.022, + "step": 2384 + }, + { + "epoch": 0.24663271373542567, + "grad_norm": 1.4844694137573242, + "learning_rate": 0.0001754267977237455, + "loss": 2.1174, + "step": 2385 + }, + { + "epoch": 0.24673612367829167, + "grad_norm": 1.486586570739746, + "learning_rate": 0.00017541645111226076, + "loss": 1.4655, + "step": 2386 + }, + { + "epoch": 0.24683953362115768, + "grad_norm": 1.22221839427948, + "learning_rate": 0.000175406104500776, + "loss": 1.6983, + "step": 2387 + }, + { + "epoch": 0.2469429435640237, + "grad_norm": 1.0208945274353027, + "learning_rate": 0.00017539575788929127, + "loss": 1.2739, + "step": 2388 + }, + { + "epoch": 0.2470463535068897, + "grad_norm": 1.514829397201538, + "learning_rate": 0.00017538541127780653, + "loss": 0.8628, + "step": 2389 + }, + { + "epoch": 0.2471497634497557, + "grad_norm": 1.2276806831359863, + "learning_rate": 0.0001753750646663218, + "loss": 1.4324, + "step": 2390 + }, + { + "epoch": 0.2472531733926217, + "grad_norm": 1.2919999361038208, + "learning_rate": 0.00017536471805483706, + "loss": 1.4985, + "step": 2391 + }, + { + "epoch": 0.2473565833354877, + "grad_norm": 1.6556484699249268, + "learning_rate": 0.00017535437144335233, + "loss": 1.6762, + "step": 2392 + }, + { + "epoch": 0.24745999327835372, + "grad_norm": 1.1307460069656372, + "learning_rate": 0.0001753440248318676, + "loss": 1.6692, + "step": 2393 + }, + { + "epoch": 0.24756340322121972, + "grad_norm": 1.1884139776229858, + "learning_rate": 0.00017533367822038283, + "loss": 1.3586, + "step": 2394 + }, + { + "epoch": 0.24766681316408573, + "grad_norm": 1.539978265762329, + "learning_rate": 0.0001753233316088981, + "loss": 1.4159, + "step": 2395 + }, + { + "epoch": 0.24777022310695174, + "grad_norm": 2.753964424133301, + "learning_rate": 0.00017531298499741336, + "loss": 1.3736, + "step": 2396 + }, + { + "epoch": 0.24787363304981774, + "grad_norm": 1.3421005010604858, + "learning_rate": 0.00017530263838592863, + "loss": 1.8879, + "step": 2397 + }, + { + "epoch": 0.24797704299268375, + "grad_norm": 1.2711719274520874, + "learning_rate": 0.0001752922917744439, + "loss": 1.1862, + "step": 2398 + }, + { + "epoch": 0.24808045293554976, + "grad_norm": 1.870287537574768, + "learning_rate": 0.00017528194516295916, + "loss": 0.6339, + "step": 2399 + }, + { + "epoch": 0.24818386287841576, + "grad_norm": 1.4832231998443604, + "learning_rate": 0.0001752715985514744, + "loss": 1.7517, + "step": 2400 + }, + { + "epoch": 0.24828727282128177, + "grad_norm": 2.3762567043304443, + "learning_rate": 0.00017526125193998966, + "loss": 1.3174, + "step": 2401 + }, + { + "epoch": 0.24839068276414777, + "grad_norm": 1.2148253917694092, + "learning_rate": 0.00017525090532850493, + "loss": 1.4028, + "step": 2402 + }, + { + "epoch": 0.24849409270701378, + "grad_norm": 1.2104130983352661, + "learning_rate": 0.0001752405587170202, + "loss": 1.347, + "step": 2403 + }, + { + "epoch": 0.2485975026498798, + "grad_norm": 0.9523527026176453, + "learning_rate": 0.00017523021210553546, + "loss": 1.9917, + "step": 2404 + }, + { + "epoch": 0.2487009125927458, + "grad_norm": 1.6441329717636108, + "learning_rate": 0.00017521986549405073, + "loss": 1.6389, + "step": 2405 + }, + { + "epoch": 0.2488043225356118, + "grad_norm": 2.3193979263305664, + "learning_rate": 0.00017520951888256597, + "loss": 1.8484, + "step": 2406 + }, + { + "epoch": 0.2489077324784778, + "grad_norm": 1.6989390850067139, + "learning_rate": 0.00017519917227108123, + "loss": 0.9939, + "step": 2407 + }, + { + "epoch": 0.2490111424213438, + "grad_norm": 1.290727138519287, + "learning_rate": 0.0001751888256595965, + "loss": 1.7555, + "step": 2408 + }, + { + "epoch": 0.24911455236420982, + "grad_norm": 1.2122284173965454, + "learning_rate": 0.00017517847904811176, + "loss": 1.4804, + "step": 2409 + }, + { + "epoch": 0.24921796230707582, + "grad_norm": 1.3853946924209595, + "learning_rate": 0.00017516813243662703, + "loss": 1.897, + "step": 2410 + }, + { + "epoch": 0.24932137224994183, + "grad_norm": 2.634765148162842, + "learning_rate": 0.0001751577858251423, + "loss": 1.3514, + "step": 2411 + }, + { + "epoch": 0.24942478219280784, + "grad_norm": 2.0169286727905273, + "learning_rate": 0.00017514743921365753, + "loss": 2.0064, + "step": 2412 + }, + { + "epoch": 0.24952819213567384, + "grad_norm": 2.7714152336120605, + "learning_rate": 0.0001751370926021728, + "loss": 1.7758, + "step": 2413 + }, + { + "epoch": 0.24963160207853985, + "grad_norm": 1.5576125383377075, + "learning_rate": 0.00017512674599068806, + "loss": 1.4461, + "step": 2414 + }, + { + "epoch": 0.24973501202140586, + "grad_norm": 1.512837290763855, + "learning_rate": 0.00017511639937920333, + "loss": 1.7374, + "step": 2415 + }, + { + "epoch": 0.24983842196427186, + "grad_norm": 0.8817567825317383, + "learning_rate": 0.0001751060527677186, + "loss": 1.4742, + "step": 2416 + }, + { + "epoch": 0.24994183190713787, + "grad_norm": 0.9802753329277039, + "learning_rate": 0.00017509570615623386, + "loss": 1.0384, + "step": 2417 + }, + { + "epoch": 0.2500452418500039, + "grad_norm": 1.5185511112213135, + "learning_rate": 0.0001750853595447491, + "loss": 1.6683, + "step": 2418 + }, + { + "epoch": 0.2501486517928699, + "grad_norm": 1.5050675868988037, + "learning_rate": 0.00017507501293326436, + "loss": 1.8603, + "step": 2419 + }, + { + "epoch": 0.2502520617357359, + "grad_norm": 1.4089784622192383, + "learning_rate": 0.00017506466632177963, + "loss": 1.1532, + "step": 2420 + }, + { + "epoch": 0.2503554716786019, + "grad_norm": 1.0789377689361572, + "learning_rate": 0.0001750543197102949, + "loss": 1.1915, + "step": 2421 + }, + { + "epoch": 0.2504588816214679, + "grad_norm": 1.937451958656311, + "learning_rate": 0.00017504397309881016, + "loss": 1.8906, + "step": 2422 + }, + { + "epoch": 0.2505622915643339, + "grad_norm": 1.167373776435852, + "learning_rate": 0.00017503362648732543, + "loss": 2.0633, + "step": 2423 + }, + { + "epoch": 0.2506657015071999, + "grad_norm": 1.1927114725112915, + "learning_rate": 0.00017502327987584066, + "loss": 1.7121, + "step": 2424 + }, + { + "epoch": 0.2507691114500659, + "grad_norm": 1.8490036725997925, + "learning_rate": 0.00017501293326435593, + "loss": 1.7096, + "step": 2425 + }, + { + "epoch": 0.2508725213929319, + "grad_norm": 2.367034673690796, + "learning_rate": 0.0001750025866528712, + "loss": 2.011, + "step": 2426 + }, + { + "epoch": 0.25097593133579793, + "grad_norm": 1.748449444770813, + "learning_rate": 0.00017499224004138646, + "loss": 1.3284, + "step": 2427 + }, + { + "epoch": 0.25107934127866394, + "grad_norm": 1.6005412340164185, + "learning_rate": 0.00017498189342990173, + "loss": 1.866, + "step": 2428 + }, + { + "epoch": 0.25118275122152994, + "grad_norm": 1.6830029487609863, + "learning_rate": 0.000174971546818417, + "loss": 1.596, + "step": 2429 + }, + { + "epoch": 0.25128616116439595, + "grad_norm": 1.3933378458023071, + "learning_rate": 0.00017496120020693223, + "loss": 1.3813, + "step": 2430 + }, + { + "epoch": 0.25138957110726196, + "grad_norm": 2.1896424293518066, + "learning_rate": 0.0001749508535954475, + "loss": 1.8016, + "step": 2431 + }, + { + "epoch": 0.25149298105012796, + "grad_norm": 1.2955199480056763, + "learning_rate": 0.00017494050698396276, + "loss": 1.4218, + "step": 2432 + }, + { + "epoch": 0.25159639099299397, + "grad_norm": 1.6225368976593018, + "learning_rate": 0.00017493016037247803, + "loss": 1.5471, + "step": 2433 + }, + { + "epoch": 0.25169980093586, + "grad_norm": 1.3177118301391602, + "learning_rate": 0.0001749198137609933, + "loss": 1.3752, + "step": 2434 + }, + { + "epoch": 0.251803210878726, + "grad_norm": 2.3560526371002197, + "learning_rate": 0.00017490946714950856, + "loss": 1.2315, + "step": 2435 + }, + { + "epoch": 0.251906620821592, + "grad_norm": 2.910111665725708, + "learning_rate": 0.0001748991205380238, + "loss": 1.7354, + "step": 2436 + }, + { + "epoch": 0.252010030764458, + "grad_norm": 1.5888240337371826, + "learning_rate": 0.00017488877392653906, + "loss": 1.2371, + "step": 2437 + }, + { + "epoch": 0.252113440707324, + "grad_norm": 1.467577576637268, + "learning_rate": 0.00017487842731505433, + "loss": 1.2963, + "step": 2438 + }, + { + "epoch": 0.25221685065019, + "grad_norm": 0.8557147979736328, + "learning_rate": 0.0001748680807035696, + "loss": 1.392, + "step": 2439 + }, + { + "epoch": 0.252320260593056, + "grad_norm": 1.261374592781067, + "learning_rate": 0.00017485773409208486, + "loss": 1.3752, + "step": 2440 + }, + { + "epoch": 0.252423670535922, + "grad_norm": 1.7545433044433594, + "learning_rate": 0.00017484738748060012, + "loss": 1.375, + "step": 2441 + }, + { + "epoch": 0.252527080478788, + "grad_norm": 2.189995288848877, + "learning_rate": 0.00017483704086911536, + "loss": 1.9399, + "step": 2442 + }, + { + "epoch": 0.25263049042165403, + "grad_norm": 1.4490615129470825, + "learning_rate": 0.00017482669425763063, + "loss": 1.8315, + "step": 2443 + }, + { + "epoch": 0.25273390036452004, + "grad_norm": 1.2145003080368042, + "learning_rate": 0.0001748163476461459, + "loss": 1.0258, + "step": 2444 + }, + { + "epoch": 0.25283731030738604, + "grad_norm": 1.8237292766571045, + "learning_rate": 0.00017480600103466116, + "loss": 1.3875, + "step": 2445 + }, + { + "epoch": 0.25294072025025205, + "grad_norm": 1.7630832195281982, + "learning_rate": 0.00017479565442317643, + "loss": 1.7942, + "step": 2446 + }, + { + "epoch": 0.25304413019311806, + "grad_norm": 1.2417792081832886, + "learning_rate": 0.0001747853078116917, + "loss": 1.5995, + "step": 2447 + }, + { + "epoch": 0.25314754013598406, + "grad_norm": 0.8555414080619812, + "learning_rate": 0.00017477496120020693, + "loss": 1.4727, + "step": 2448 + }, + { + "epoch": 0.25325095007885007, + "grad_norm": 1.3805317878723145, + "learning_rate": 0.0001747646145887222, + "loss": 1.457, + "step": 2449 + }, + { + "epoch": 0.2533543600217161, + "grad_norm": 2.568117141723633, + "learning_rate": 0.00017475426797723746, + "loss": 1.5291, + "step": 2450 + }, + { + "epoch": 0.2534577699645821, + "grad_norm": 1.2383670806884766, + "learning_rate": 0.00017474392136575273, + "loss": 1.1544, + "step": 2451 + }, + { + "epoch": 0.2535611799074481, + "grad_norm": 1.880109429359436, + "learning_rate": 0.000174733574754268, + "loss": 1.7525, + "step": 2452 + }, + { + "epoch": 0.2536645898503141, + "grad_norm": 0.9020270109176636, + "learning_rate": 0.00017472322814278326, + "loss": 1.9656, + "step": 2453 + }, + { + "epoch": 0.2537679997931801, + "grad_norm": 0.9956245422363281, + "learning_rate": 0.0001747128815312985, + "loss": 1.5702, + "step": 2454 + }, + { + "epoch": 0.2538714097360461, + "grad_norm": 1.4423266649246216, + "learning_rate": 0.00017470253491981376, + "loss": 1.2459, + "step": 2455 + }, + { + "epoch": 0.2539748196789121, + "grad_norm": 1.7919795513153076, + "learning_rate": 0.00017469218830832903, + "loss": 1.6817, + "step": 2456 + }, + { + "epoch": 0.2540782296217781, + "grad_norm": 4.085237979888916, + "learning_rate": 0.0001746818416968443, + "loss": 1.8207, + "step": 2457 + }, + { + "epoch": 0.2541816395646441, + "grad_norm": 1.3783023357391357, + "learning_rate": 0.00017467149508535956, + "loss": 1.5989, + "step": 2458 + }, + { + "epoch": 0.25428504950751013, + "grad_norm": 1.3352357149124146, + "learning_rate": 0.00017466114847387482, + "loss": 1.5502, + "step": 2459 + }, + { + "epoch": 0.25438845945037614, + "grad_norm": 1.0194011926651, + "learning_rate": 0.00017465080186239006, + "loss": 1.7048, + "step": 2460 + }, + { + "epoch": 0.25449186939324214, + "grad_norm": 1.5900938510894775, + "learning_rate": 0.00017464045525090533, + "loss": 1.3563, + "step": 2461 + }, + { + "epoch": 0.25459527933610815, + "grad_norm": 2.041964530944824, + "learning_rate": 0.0001746301086394206, + "loss": 1.4161, + "step": 2462 + }, + { + "epoch": 0.25469868927897416, + "grad_norm": 3.3470609188079834, + "learning_rate": 0.00017461976202793586, + "loss": 2.2123, + "step": 2463 + }, + { + "epoch": 0.25480209922184016, + "grad_norm": 1.328002691268921, + "learning_rate": 0.00017460941541645112, + "loss": 1.4335, + "step": 2464 + }, + { + "epoch": 0.25490550916470617, + "grad_norm": 2.004028558731079, + "learning_rate": 0.0001745990688049664, + "loss": 1.5092, + "step": 2465 + }, + { + "epoch": 0.2550089191075722, + "grad_norm": 1.4580358266830444, + "learning_rate": 0.00017458872219348163, + "loss": 2.0, + "step": 2466 + }, + { + "epoch": 0.2551123290504382, + "grad_norm": 1.3243385553359985, + "learning_rate": 0.0001745783755819969, + "loss": 1.8759, + "step": 2467 + }, + { + "epoch": 0.2552157389933042, + "grad_norm": 2.5180490016937256, + "learning_rate": 0.00017456802897051216, + "loss": 1.5182, + "step": 2468 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 1.6826629638671875, + "learning_rate": 0.00017455768235902743, + "loss": 1.4656, + "step": 2469 + }, + { + "epoch": 0.2554225588790362, + "grad_norm": 2.267388105392456, + "learning_rate": 0.0001745473357475427, + "loss": 2.3572, + "step": 2470 + }, + { + "epoch": 0.2555259688219022, + "grad_norm": 2.7292869091033936, + "learning_rate": 0.00017453698913605796, + "loss": 2.0746, + "step": 2471 + }, + { + "epoch": 0.2556293787647682, + "grad_norm": 2.772287607192993, + "learning_rate": 0.0001745266425245732, + "loss": 1.9232, + "step": 2472 + }, + { + "epoch": 0.2557327887076342, + "grad_norm": 1.6850965023040771, + "learning_rate": 0.00017451629591308846, + "loss": 2.2237, + "step": 2473 + }, + { + "epoch": 0.2558361986505002, + "grad_norm": 2.2228617668151855, + "learning_rate": 0.00017450594930160373, + "loss": 2.0034, + "step": 2474 + }, + { + "epoch": 0.25593960859336623, + "grad_norm": 1.7392048835754395, + "learning_rate": 0.000174495602690119, + "loss": 1.2657, + "step": 2475 + }, + { + "epoch": 0.25604301853623224, + "grad_norm": 1.7491132020950317, + "learning_rate": 0.00017448525607863426, + "loss": 0.9487, + "step": 2476 + }, + { + "epoch": 0.25614642847909824, + "grad_norm": 1.7165073156356812, + "learning_rate": 0.00017447490946714952, + "loss": 1.463, + "step": 2477 + }, + { + "epoch": 0.25624983842196425, + "grad_norm": 1.6869577169418335, + "learning_rate": 0.00017446456285566476, + "loss": 1.8076, + "step": 2478 + }, + { + "epoch": 0.25635324836483026, + "grad_norm": 1.95469331741333, + "learning_rate": 0.00017445421624418003, + "loss": 1.8281, + "step": 2479 + }, + { + "epoch": 0.25645665830769626, + "grad_norm": 0.9262880086898804, + "learning_rate": 0.0001744438696326953, + "loss": 1.4683, + "step": 2480 + }, + { + "epoch": 0.25656006825056227, + "grad_norm": 1.4505789279937744, + "learning_rate": 0.00017443352302121056, + "loss": 1.425, + "step": 2481 + }, + { + "epoch": 0.2566634781934283, + "grad_norm": 1.750234842300415, + "learning_rate": 0.00017442317640972582, + "loss": 2.1092, + "step": 2482 + }, + { + "epoch": 0.2567668881362943, + "grad_norm": 1.320644497871399, + "learning_rate": 0.0001744128297982411, + "loss": 1.3916, + "step": 2483 + }, + { + "epoch": 0.2568702980791603, + "grad_norm": 0.9432196617126465, + "learning_rate": 0.00017440248318675633, + "loss": 1.3987, + "step": 2484 + }, + { + "epoch": 0.2569737080220263, + "grad_norm": 2.0192582607269287, + "learning_rate": 0.0001743921365752716, + "loss": 1.9502, + "step": 2485 + }, + { + "epoch": 0.2570771179648923, + "grad_norm": 1.798473834991455, + "learning_rate": 0.00017438178996378686, + "loss": 1.6392, + "step": 2486 + }, + { + "epoch": 0.2571805279077583, + "grad_norm": 1.092153549194336, + "learning_rate": 0.00017437144335230212, + "loss": 1.702, + "step": 2487 + }, + { + "epoch": 0.2572839378506243, + "grad_norm": 0.8232825398445129, + "learning_rate": 0.0001743610967408174, + "loss": 1.6369, + "step": 2488 + }, + { + "epoch": 0.2573873477934903, + "grad_norm": 2.7390313148498535, + "learning_rate": 0.00017435075012933266, + "loss": 2.2424, + "step": 2489 + }, + { + "epoch": 0.2574907577363563, + "grad_norm": 1.3584825992584229, + "learning_rate": 0.0001743404035178479, + "loss": 1.5879, + "step": 2490 + }, + { + "epoch": 0.25759416767922233, + "grad_norm": 1.8954455852508545, + "learning_rate": 0.00017433005690636316, + "loss": 2.0067, + "step": 2491 + }, + { + "epoch": 0.25769757762208834, + "grad_norm": 1.358513593673706, + "learning_rate": 0.00017431971029487842, + "loss": 1.8005, + "step": 2492 + }, + { + "epoch": 0.2578009875649544, + "grad_norm": 1.0788158178329468, + "learning_rate": 0.0001743093636833937, + "loss": 1.5718, + "step": 2493 + }, + { + "epoch": 0.2579043975078204, + "grad_norm": 1.8200621604919434, + "learning_rate": 0.00017429901707190896, + "loss": 0.9247, + "step": 2494 + }, + { + "epoch": 0.2580078074506864, + "grad_norm": 1.5555850267410278, + "learning_rate": 0.00017428867046042422, + "loss": 1.5324, + "step": 2495 + }, + { + "epoch": 0.2581112173935524, + "grad_norm": 1.2410125732421875, + "learning_rate": 0.00017427832384893946, + "loss": 0.8922, + "step": 2496 + }, + { + "epoch": 0.2582146273364184, + "grad_norm": 2.1035003662109375, + "learning_rate": 0.00017426797723745473, + "loss": 1.7946, + "step": 2497 + }, + { + "epoch": 0.25831803727928443, + "grad_norm": 1.2379070520401, + "learning_rate": 0.00017425763062597, + "loss": 1.4737, + "step": 2498 + }, + { + "epoch": 0.25842144722215044, + "grad_norm": 3.038174867630005, + "learning_rate": 0.00017424728401448526, + "loss": 1.7392, + "step": 2499 + }, + { + "epoch": 0.25852485716501644, + "grad_norm": 2.1067824363708496, + "learning_rate": 0.00017423693740300052, + "loss": 1.2049, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 19340, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "total_flos": 7.247812094145331e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}